Example 1
class DateIndex(UnIndex):
    """ Index for Dates """

    __implements__ = (PluggableIndex.PluggableIndexInterface,)

    meta_type = 'DateIndex'
    query_options = ['query', 'range']

    manage = manage_main = DTMLFile( 'dtml/manageDateIndex', globals() )
    manage_main._setName( 'manage_main' )
    manage_options = ( { 'label' : 'Settings'
                       , 'action' : 'manage_main'
                       },
                     )

    def clear( self ):
        """ Complete reset """
        self._index = IOBTree()
        self._unindex = OIBTree()


    def index_object( self, documentId, obj, threshold=None ):
        """index an object, normalizing the indexed value to an integer

           o Normalized value has granularity of one minute.

           o Objects which have 'None' as indexed value are *omitted*,
             by design.
        """
        returnStatus = 0

        try:
            date_attr = getattr( obj, self.id )
            if callable( date_attr ):
                date_attr = date_attr()

            ConvertedDate = self._convert( value=date_attr, default=_marker )
        except AttributeError:
            ConvertedDate = _marker

        oldConvertedDate = self._unindex.get( documentId, _marker )

        if ConvertedDate != oldConvertedDate:
            if oldConvertedDate is not _marker:
                self.removeForwardIndexEntry(oldConvertedDate, documentId)

            if ConvertedDate is not _marker:
                self.insertForwardIndexEntry( ConvertedDate, documentId )
                self._unindex[documentId] = ConvertedDate

            returnStatus = 1

        return returnStatus


    def _apply_index( self, request, cid='', type=type ):
Example 2
class DocumentMap(Persistent):
    """ A two-way map between addresses (e.g. location paths) and document ids.

    The map is a persistent object meant to live in a ZODB storage.

    Additionally, the map is capable of mapping 'metadata' to docids.
    """
    _v_nextid = None
    family = BTrees.family32
    _randrange = random.randrange
    docid_to_metadata = None  # latch for b/c

    def __init__(self):
        self.docid_to_address = IOBTree()
        self.address_to_docid = OIBTree()
        self.docid_to_metadata = IOBTree()

    def docid_for_address(self, address):
        """ Retrieve a document id for a given address.

        ``address`` is a string or other hashable object which represents
        a token known by the application.

        Return the integer document id corresponding to ``address``.

        If ``address`` doesn't exist in the document map, return None.
        """
        return self.address_to_docid.get(address)

    def address_for_docid(self, docid):
        """ Retrieve an address for a given document id.

        ``docid`` is an integer document id.

        Return the address corresponding to ``docid``.

        If ``docid`` doesn't exist in the document map, return None.
        """
        return self.docid_to_address.get(docid)

    def add(self, address, docid=_marker):
        """ Add a new document to the document map.

        ``address`` is a string or other hashable object which represents
        a token known by the application.

        ``docid``, if passed, must be an int.  In this case, remove
        any previous address stored for it before mapping it to the
        new address.  Passing an explicit ``docid`` also removes any
        metadata associated with that docid.
        
        If ``docid`` is not passed, generate a new docid.

        Return the integer document id mapped to ``address``.
        """
        if docid is _marker:
            docid = self.new_docid()

        self.remove_docid(docid)
        self.remove_address(address)

        self.docid_to_address[docid] = address
        self.address_to_docid[address] = docid
        return docid

    def remove_docid(self, docid):
        """ Remove a document from the document map for the given document ID.

        ``docid`` is an integer document id.

        Remove any corresponding metadata for ``docid`` as well.

        Return True if ``docid`` existed in the map, else return False.
        """
        # It should be an invariant that if one entry exists in
        # docid_to_address for a docid/address pair, exactly one
        # corresponding entry exists in address_to_docid for the same
        # docid/address pair.  However, versions of this code before
        # r.catalog 0.7.3 had a bug which, if this method was called
        # multiple times, each time with the same address but a
        # different docid, the ``docid_to_address`` mapping could
        # contain multiple entries for the same address each with a
        # different docid, causing this invariant to be violated.  The
        # symptom: in systems that used r.catalog 0.7.2 and lower,
        # there might be more entries in docid_to_address than there
        # are in address_to_docid.  The conditional fuzziness in the
        # code directly below is a runtime kindness to systems in that
        # state.  Technically, the administrator of a system in such a
        # state should normalize the two data structures by running a
        # script after upgrading to 0.7.3.  If we made the admin do
        # this, some of the code fuzziness below could go away,
        # replaced with something simpler.  But there's no sense in
        # breaking systems at runtime through being a hardass about
        # consistency if an unsuspecting upgrader has not yet run the
        # data fixer script. The "fix the data" mantra rings a
        # little hollow when you weren't the one who broke the data in
        # the first place ;-)

        self._check_metadata()

        address = self.docid_to_address.get(docid, _marker)
        if address is _marker:
            return False

        old_docid = self.address_to_docid.get(address, _marker)
        if (old_docid is not _marker) and (old_docid != docid):
            self.remove_docid(old_docid)

        if docid in self.docid_to_address:
            del self.docid_to_address[docid]
        if address in self.address_to_docid:
            del self.address_to_docid[address]
        if docid in self.docid_to_metadata:
            del self.docid_to_metadata[docid]

        return True

    def remove_address(self, address):
        """ Remove a document from the document map using an address.

        ``address`` is a string or other hashable object which represents
        a token known by the application.

        Remove any corresponding metadata for ``address`` as well.

        Return True if ``address`` existed in the map, else return False.
        """
        # See the comment in remove_docid for complexity rationalization

        self._check_metadata()

        docid = self.address_to_docid.get(address, _marker)
        if docid is _marker:
            return False

        old_address = self.docid_to_address.get(docid, _marker)
        if (old_address is not _marker) and (old_address != address):
            self.remove_address(old_address)

        if docid in self.docid_to_address:
            del self.docid_to_address[docid]
        if address in self.address_to_docid:
            del self.address_to_docid[address]
        if docid in self.docid_to_metadata:
            del self.docid_to_metadata[docid]

        return True

    def _check_metadata(self):
        # backwards compatibility
        if self.docid_to_metadata is None:
            self.docid_to_metadata = IOBTree()

    def add_metadata(self, docid, data):
        """ Add metadata related to a given document id.

        ``data`` must be a mapping, such as a dictionary.
        
        For each key/value pair in ``data`` insert a metadata key/value pair
        into the metadata stored for ``docid``.

        Overwrite any existing values for the keys in ``data``, leaving values
        unchanged for other existing keys.

        Raise a KeyError if ``docid`` doesn't relate to an address in the
        document map.
        """
        if docid not in self.docid_to_address:
            raise KeyError(docid)
        if not data:
            return
        self._check_metadata()
        meta = self.docid_to_metadata.setdefault(docid, OOBTree())
        for k in data:
            meta[k] = data[k]

    def remove_metadata(self, docid, *keys):
        """ Remove metadata related to a given document id.

        If ``docid`` doesn't exist in the metadata map, raise a KeyError.

        For each key in ``keys``, remove the metadata value for the
        docid related to that key.
        
        Do not raise any error if no value exists for a given key.

        If no keys are specified, remove all metadata related to the docid.
        """
        self._check_metadata()
        if keys:
            meta = self.docid_to_metadata.get(docid, _marker)
            if meta is _marker:
                raise KeyError(docid)
            for k in keys:
                if k in meta:
                    del meta[k]
            if not meta:
                del self.docid_to_metadata[docid]
        else:
            if not (docid in self.docid_to_metadata):
                raise KeyError(docid)
            del self.docid_to_metadata[docid]

    def get_metadata(self, docid):
        """ Return the metadata for ``docid``.

        Return a mapping of the keys and values set using ``add_metadata``.

        Raise a KeyError if metadata does not exist for ``docid``.
        """
        if self.docid_to_metadata is None:
            raise KeyError(docid)
        meta = self.docid_to_metadata[docid]
        return meta

    def new_docid(self):
        """ Return a new document id.

        The returned value is guaranteed not to be used already in this
        document map.
        """
        while True:
            if self._v_nextid is None:
                self._v_nextid = self._randrange(self.family.minint,
                                                 self.family.maxint)
            uid = self._v_nextid
            self._v_nextid += 1
            if uid not in self.docid_to_address:
                return uid
            self._v_nextid = None
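
The sketch below exercises the DocumentMap API shown above. The import path is an assumption (in repoze.catalog the class is defined in repoze.catalog.document); only methods defined in the example are called.

from repoze.catalog.document import DocumentMap

docmap = DocumentMap()
docid = docmap.add('/site/doc1')                  # a fresh docid is generated
assert docmap.docid_for_address('/site/doc1') == docid
assert docmap.address_for_docid(docid) == '/site/doc1'

docmap.add_metadata(docid, {'title': 'Doc 1'})    # per-docid key/value metadata
assert docmap.get_metadata(docid)['title'] == 'Doc 1'

# Re-adding with an explicit docid rebinds the address and, per the
# docstring above, discards the old metadata for that docid.
docmap.add('/site/doc1-moved', docid=docid)
assert docmap.docid_for_address('/site/doc1') is None
assert docmap.remove_docid(docid) is True         # True: the docid existed
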
Example 3
class DateIndex(UnIndex, PropertyManager):
    """Index for dates.
    """
    implements(IDateIndex)

    meta_type = 'DateIndex'
    query_options = ('query', 'range')

    index_naive_time_as_local = True  # False means index as UTC
    _properties = ({
        'id': 'index_naive_time_as_local',
        'type': 'boolean',
        'mode': 'w'
    }, )

    manage = manage_main = DTMLFile('dtml/manageDateIndex', globals())
    manage_browse = DTMLFile('../dtml/browseIndex', globals())

    manage_main._setName('manage_main')
    manage_options = (
        {
            'label': 'Settings',
            'action': 'manage_main'
        },
        {
            'label': 'Browse',
            'action': 'manage_browse',
        },
    ) + PropertyManager.manage_options

    def clear(self):
        """ Complete reset """
        self._index = IOBTree()
        self._unindex = OIBTree()
        self._length = Length()

    def index_object(self, documentId, obj, threshold=None):
        """index an object, normalizing the indexed value to an integer

           o Normalized value has granularity of one minute.

           o Objects which have 'None' as indexed value are *omitted*,
             by design.
        """
        returnStatus = 0

        try:
            date_attr = getattr(obj, self.id)
            if safe_callable(date_attr):
                date_attr = date_attr()

            ConvertedDate = self._convert(value=date_attr, default=_marker)
        except AttributeError:
            ConvertedDate = _marker

        oldConvertedDate = self._unindex.get(documentId, _marker)

        if ConvertedDate != oldConvertedDate:
            if oldConvertedDate is not _marker:
                self.removeForwardIndexEntry(oldConvertedDate, documentId)
                if ConvertedDate is _marker:
                    try:
                        del self._unindex[documentId]
                    except ConflictError:
                        raise
                    except:
                        LOG.error("Should not happen: ConvertedDate was there,"
                                  " now it's not, for document with id %s" %
                                  documentId)

            if ConvertedDate is not _marker:
                self.insertForwardIndexEntry(ConvertedDate, documentId)
                self._unindex[documentId] = ConvertedDate

            returnStatus = 1

        return returnStatus

    def _apply_index(self, request, resultset=None):
        """Apply the index to query parameters given in the argument

        Normalize the 'query' arguments into integer values at minute
        precision before querying.
        """
        record = parseIndexRequest(request, self.id, self.query_options)
        if record.keys is None:
            return None

        keys = [self._convert(k) for k in record.keys]

        index = self._index
        r = None
        opr = None

        # experimental code for specifying the operator
        operator = record.get('operator', self.useOperator)
        if operator not in self.operators:
            raise RuntimeError("operator not valid: %s" % operator)

        # depending on the operator we use intersection or union
        if operator == "or":
            set_func = union
        else:
            set_func = intersection

        # range parameter
        range_arg = record.get('range', None)
        if range_arg:
            opr = "range"
            opr_args = []
            if range_arg.find("min") > -1:
                opr_args.append("min")
            if range_arg.find("max") > -1:
                opr_args.append("max")

        if record.get('usage', None):
            # see if any usage params are sent to field
            opr = record.usage.lower().split(':')
            opr, opr_args = opr[0], opr[1:]

        if opr == "range":  # range search
            if 'min' in opr_args:
                lo = min(keys)
            else:
                lo = None

            if 'max' in opr_args:
                hi = max(keys)
            else:
                hi = None

            if hi:
                setlist = index.values(lo, hi)
            else:
                setlist = index.values(lo)

            r = multiunion(setlist)

        else:  # not a range search
            for key in keys:
                set = index.get(key, None)
                if set is not None:
                    if isinstance(set, int):
                        set = IISet((set, ))
                    else:
                        # set can't be bigger than resultset
                        set = intersection(set, resultset)
                    r = set_func(r, set)

        if isinstance(r, int):
            r = IISet((r, ))

        if r is None:
            return IISet(), (self.id, )
        else:
            return r, (self.id, )

    def _convert(self, value, default=None):
        """Convert Date/Time value to our internal representation"""
        # XXX: Code patched 20/May/2003 by Kiran Jonnalagadda to
        # convert dates to UTC first.
        if isinstance(value, DateTime):
            t_tup = value.toZone('UTC').parts()
        elif isinstance(value, (float, int)):
            t_tup = time.gmtime(value)
        elif isinstance(value, str) and value:
            t_obj = DateTime(value).toZone('UTC')
            t_tup = t_obj.parts()
        elif isinstance(value, datetime):
            if self.index_naive_time_as_local and value.tzinfo is None:
                value = value.replace(tzinfo=Local)
            # else if tzinfo is None, naive time interpreted as UTC
            t_tup = value.utctimetuple()
        elif isinstance(value, date):
            t_tup = value.timetuple()
        else:
            return default

        yr = t_tup[0]
        mo = t_tup[1]
        dy = t_tup[2]
        hr = t_tup[3]
        mn = t_tup[4]

        t_val = ((((yr * 12 + mo) * 31 + dy) * 24 + hr) * 60 + mn)

        if t_val > MAX32:
            # t_val must be integer fitting in the 32bit range
            raise OverflowError(
                "%s is not within the range of indexable dates (index: %s)" %
                (value, self.id))

        return t_val
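
A standalone sketch of the minute-granularity packing performed by _convert above. encode_minutes is a hypothetical helper written only for illustration; it repeats the method's arithmetic using the standard library.

from datetime import datetime, timezone

def encode_minutes(value):
    # Normalize to UTC, then fold year/month/day/hour/minute into one
    # integer, exactly as DateIndex._convert does.
    t = value.astimezone(timezone.utc).timetuple()
    yr, mo, dy, hr, mn = t[:5]
    return ((((yr * 12 + mo) * 31 + dy) * 24 + hr) * 60 + mn)

a = encode_minutes(datetime(2003, 5, 20, 10, 0, tzinfo=timezone.utc))
b = encode_minutes(datetime(2003, 5, 20, 10, 1, tzinfo=timezone.utc))
assert b - a == 1       # one minute maps to one unit
assert a < b            # chronological order survives the packing
assert a < 2 ** 31      # within the 32-bit bound checked against MAX32 above

Example 4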
class GlobbingLexicon(Lexicon):
    """Lexicon which supports basic globbing function ('*' and '?').

    This lexicon keeps several data structures around that are useful
    for searching. They are:

      '_lexicon' -- Contains the mapping from word => word_id

      '_inverseLex' -- Contains the mapping from word_id => word

      '_digrams' -- Contains a mapping from digram => word_id

    Before going further, it is necessary to understand what a digram is,
    as it is a core component of the structure of this lexicon.  A digram
    is a two-letter sequence in a word.  For example, the word 'zope'
    would be converted into the digrams::

      ['$z', 'zo', 'op', 'pe', 'e$']

    where the '$' is a word marker placed at the beginning and end of the
    word.  The markers make the first and last digrams distinct, so patterns
    anchored at a word boundary can use them to narrow candidate words.
    """

    multi_wc = '*'
    single_wc = '?'
    eow = '$'


    def __init__(self,useSplitter=None,extra=None):
        self.clear()
        self.useSplitter = useSplitter
        self.splitterParams = extra
        self.SplitterFunc = Splitter.getSplitter(self.useSplitter)

    def clear(self):
        self._lexicon = OIBTree()
        self._inverseLex = IOBTree()
        self._digrams = OOBTree()

    def _convertBTrees(self, threshold=200):
        Lexicon._convertBTrees(self, threshold)
        if type(self._digrams) is OOBTree: return

        from BTrees.convert import convert

        _digrams=self._digrams
        self._digrams=OOBTree()
        self._digrams._p_jar=self._p_jar
        convert(_digrams, self._digrams, threshold, IITreeSet)


    def createDigrams(self, word):
        """Returns a list with the set of digrams in the word."""

        word = '$'+word+'$'
        return [ word[i:i+2] for i in range(len(word)-1)]


    def getWordId(self, word):
        """Provided 'word', return the matching integer word id."""

        if self._lexicon.has_key(word):
            return self._lexicon[word]
        else:
            return self.assignWordId(word)

    set = getWordId                     # Kludge for old code

    def getWord(self, wid):
        return self._inverseLex.get(wid, None)

    def assignWordId(self, word):
        """Assigns a new word id to the provided word, and return it."""

        # Double check it's not in the lexicon already, and if it is, just
        # return it.
        if self._lexicon.has_key(word):
            return self._lexicon[word]


        # Get word id. BBB Backward compat pain.
        inverse=self._inverseLex
        try: insert=inverse.insert
        except AttributeError:
            # we have an "old" BTree object
            if inverse:
                wid=inverse.keys()[-1]+1
            else:
                self._inverseLex=IOBTree()
                wid=1
            inverse[wid] = word
        else:
            # we have a "new" IOBTree object
            wid=randid()
            while not inverse.insert(wid, word):
                wid=randid()

        self._lexicon[word] = wid

        # Now take all the digrams and insert them into the digram map.
        for digram in self.createDigrams(word):
            set = self._digrams.get(digram, None)
            if set is None:
                self._digrams[digram] = set = IISet()
            set.insert(wid)

        return wid


    def get(self, pattern):
        """ Query the lexicon for words matching a pattern."""

        # A single-character pattern would produce a slicing problem below.
        # Because the splitter throws away single characters we can
        # return an empty tuple here.

        if len(pattern) == 1:
            return ()

        wc_set = [self.multi_wc, self.single_wc]

        digrams = []
        globbing = 0
        for i in range(len(pattern)):
            if pattern[i] in wc_set:
                globbing = 1
                continue

            if i == 0:
                digrams.insert(i, (self.eow + pattern[i]) )
                digrams.append((pattern[i] + pattern[i+1]))
            else:
                try:
                    if pattern[i+1] not in wc_set:
                        digrams.append( pattern[i] + pattern[i+1] )

                except IndexError:
                    digrams.append( (pattern[i] + self.eow) )

        if not globbing:
            result =  self._lexicon.get(pattern, None)
            if result is None:
                return ()
            return (result, )

        ## now get all of the intsets that contain the result digrams
        result = None
        for digram in digrams:
            result=union(result, self._digrams.get(digram, None))

        if not result:
            return ()
        else:
            ## now we have narrowed the list of possible candidates
            ## down to those words which contain digrams.  However,
            ## some words may have been returned that match digrams,
            ## but do not match 'pattern'.  This is because some words
            ## may contain all matching digrams, but in the wrong
            ## order.

            expr = re.compile(self.createRegex(pattern))
            words = []
            hits = IISet()
            for x in result:
                if expr.match(self._inverseLex[x]):
                    hits.insert(x)
            return hits


    def __getitem__(self, word):
        """ """
        return self.get(word)


    def query_hook(self, q):
        """expand wildcards"""
        ListType = type([])
        i = len(q) - 1
        while i >= 0:
            e = q[i]
            if isinstance(e, ListType):
                self.query_hook(e)
            elif isinstance(e, Op):
                pass
            elif ( (self.multi_wc in e) or
                   (self.single_wc in e) ):
                wids = self.get(e)
                words = []
                for wid in wids:
                    if words:
                        words.append(Or)
                    words.append(wid)
                if not words:
                    # if words is empty, return something that will make
                    # textindex's __getitem__ return an empty result list
                    words.append('')
                q[i] = words
            i = i - 1

        return q

    def Splitter(self, astring, words=None, encoding="latin1"):
        """ wrap the splitter """

        ## Don't do anything special here; it's less efficient, but there's
        ## not much sense in stemming a globbing lexicon.

        try:
            return self.SplitterFunc(
                    astring,
                    words,
                    encoding=encoding,
                    singlechar=self.splitterParams.splitterSingleChars,
                    indexnumbers=self.splitterParams.splitterIndexNumbers,
                    casefolding=self.splitterParams.splitterCasefolding
                    )
        except:
            return self.SplitterFunc(astring, words)


    def createRegex(self, pat):
        """Translate a PATTERN to a regular expression.

        There is no way to quote meta-characters.
        """

        # Remove characters that are meaningful in a regex
        if not isinstance(pat, UnicodeType):
            transTable = string.maketrans("", "")
            result = string.translate(pat, transTable,
                                      r'()&|!@#$%^{}\<>.')
        else:
            transTable={}
            for ch in r'()&|!@#$%^{}\<>.':
                transTable[ord(ch)]=None
            result=pat.translate(transTable)

        # First, deal with multi-character globbing
        result = result.replace( '*', '.*')

        # Next, we need to deal with single-character globbing
        result = result.replace( '?', '.')

        return "%s$" % result
class UUIDIndex(UnIndex):
    """Index for uuid fields with an unique value per key.

    The internal structure is:

    self._index = {datum:documentId]}
    self._unindex = {documentId:datum}

    For each datum only one documentId can exist.
    """

    meta_type = "UUIDIndex"

    manage_options = (
        {'label': 'Settings', 'action': 'manage_main'},
        {'label': 'Browse', 'action': 'manage_browse'},
    )

    query_options = ["query", "range"]

    manage = manage_main = DTMLFile('dtml/manageUUIDIndex', globals())
    manage_main._setName('manage_main')
    manage_browse = DTMLFile('../dtml/browseIndex', globals())

    def clear(self):
        self._length = Length()
        self._index = OIBTree()
        self._unindex = IOBTree()
        self._counter = Length()

    def numObjects(self):
        """Return the number of indexed objects. Since we have a 1:1 mapping
        from documents to values, we can reuse the stored length.
        """
        return self.indexSize()

    def uniqueValues(self, name=None, withLengths=0):
        """returns the unique values for name

        if withLengths is true, returns a sequence of
        tuples of (value, length)
        """
        if name is None:
            name = self.id
        elif name != self.id:
            # Returning ends the generator (raising StopIteration inside a
            # generator is a RuntimeError under PEP 479).
            return

        if not withLengths:
            for key in self._index.keys():
                yield key
        else:
            # We know the length for each value is one
            for key in self._index.keys():
                yield (key, 1)

    def insertForwardIndexEntry(self, entry, documentId):
        """Take the entry provided and put it in the correct place
        in the forward index.
        """
        if entry is None:
            return

        old_docid = self._index.get(entry, _marker)
        if old_docid is _marker:
            self._index[entry] = documentId
            self._length.change(1)
        elif old_docid != documentId:
            logger.error("A different document with value '%s' already "
                "exists in the index.'" % entry)

    def removeForwardIndexEntry(self, entry, documentId):
        """Take the entry provided and remove any reference to documentId
        in its entry in the index.
        """
        old_docid = self._index.get(entry, _marker)
        if old_docid is not _marker:
            del self._index[entry]
            self._length.change(-1)

    def _get_object_datum(self, obj, attr):
        # for a uuid it never makes sense to acquire a parent value via
        # Acquisition
        has_attr = getattr(aq_base(obj), attr, _marker)
        if has_attr is _marker:
            return _marker
        return super(UUIDIndex, self)._get_object_datum(obj, attr)
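
A plain-dict sketch of the forward half of the 1:1 structure described in the UUIDIndex docstring above. insert_forward and remove_forward are illustrative stand-ins for insertForwardIndexEntry and removeForwardIndexEntry, minus the BTrees and Length bookkeeping; where the index only logs a duplicate value, this sketch raises.

_index = {}   # datum -> documentId: one docid per value

def insert_forward(entry, documentId):
    # A value may map to exactly one document; a second document with
    # the same value is rejected, as in insertForwardIndexEntry.
    old_docid = _index.get(entry)
    if old_docid is None:
        _index[entry] = documentId
    elif old_docid != documentId:
        raise ValueError('value %r already indexed for document %s'
                         % (entry, old_docid))

def remove_forward(entry, documentId):
    # Dropping the value removes the single entry outright.
    _index.pop(entry, None)

insert_forward('6ba7b810-uuid', 1)
assert _index['6ba7b810-uuid'] == 1
remove_forward('6ba7b810-uuid', 1)
assert '6ba7b810-uuid' not in _index
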
Example 6
class Catalog(Persistent, Acquisition.Implicit, ExtensionClass.Base):
    """ An Object Catalog

    An Object Catalog maintains a table of object metadata, and a
    series of manageable indexes to quickly search for objects
    (references in the metadata) that satisfy a search query.

    This class is not Zope specific, and can be used in any python
    program to build catalogs of objects.  Note that it does require
    the objects to be Persistent, and thus must be used with ZODB3.
    """

    _v_brains = NoBrainer

    def __init__(self, vocabulary=None, brains=None):
        # Catalogs no longer care about vocabularies and lexicons
        # so the vocabulary argument is ignored. (Casey)

        self.schema = {}  # mapping from attribute name to column number
        self.names = ()  # sequence of column names
        self.indexes = {}  # mapping from index name to index object

        # The catalog maintains a BTree of object meta_data for
        # convenient display on result pages.  meta_data attributes
        # are turned into brain objects and returned by
        # searchResults.  The indexing machinery indexes all records
        # by an integer id (rid). self.data is a mapping from the
        # integer id to the meta_data, self.uids is a mapping of the
        # object unique identifier to the rid, and self.paths is a
        # mapping of the rid to the unique identifier.

        self.clear()

        if brains is not None:
            self._v_brains = brains

        self.updateBrains()

    def __len__(self):
        return self._length()

    def clear(self):
        """ clear catalog """

        self.data = IOBTree()  # mapping of rid to meta_data
        self.uids = OIBTree()  # mapping of uid to rid
        self.paths = IOBTree()  # mapping of rid to uid
        self._length = BTrees.Length.Length()

        for index in self.indexes:
            self.getIndex(index).clear()

    def updateBrains(self):
        self.useBrains(self._v_brains)

    def __getitem__(self, index):
        """
        Returns instances of self._v_brains, or whatever is passed
        into self.useBrains.
        """
        if isinstance(index, tuple):
            # then it contains a score...
            normalized_score, score, key = index
        else:
            # otherwise no score, set all scores to 1
            normalized_score, score, key = (1, 1, index)
        return self.instantiate((key, self.data[key]),
                                score_data=(score, normalized_score))

    def __setstate__(self, state):
        """ initialize your brains.  This method is called when the
        catalog is first activated (from the persistent storage) """
        Persistent.__setstate__(self, state)
        self.updateBrains()

    def useBrains(self, brains):
        """ Sets up the Catalog to return an object (ala ZTables) that
        is created on the fly from the tuple stored in the self.data
        Btree.
        """
        class mybrains(AbstractCatalogBrain, brains):  # NOQA
            pass

        scopy = self.schema.copy()

        schema_len = len(self.schema.keys())
        scopy['data_record_id_'] = schema_len
        scopy['data_record_score_'] = schema_len + 1
        scopy['data_record_normalized_score_'] = schema_len + 2

        mybrains.__record_schema__ = scopy

        self._v_brains = brains
        self._v_result_class = mybrains

    def addColumn(self, name, default_value=None, threshold=10000):
        """Adds a row to the meta data schema"""
        schema = self.schema
        names = list(self.names)
        threshold = threshold if threshold is not None else 10000

        if name != name.strip():
            # Someone could have mistakenly added a space at the end
            # of the input field.
            LOG.warning('stripped space from new column %r -> %r', name,
                        name.strip())
            name = name.strip()

        if name in schema:
            raise CatalogError('The column %s already exists' % name)

        if name[0] == '_':
            raise CatalogError('Cannot cache fields beginning with "_"')

        values = schema.values()
        if values:
            schema[name] = max(values) + 1
        else:
            schema[name] = 0
        names.append(name)

        if default_value in (None, ''):
            default_value = MV

        if len(self):
            pghandler = ZLogHandler(threshold)
            pghandler.init('Adding %s column' % name, len(self))
            for i, (key, value) in enumerate(self.data.iteritems()):
                pghandler.report(i)
                self.data[key] = value + (default_value, )
            pghandler.finish()

        self.names = tuple(names)
        self.schema = schema

        # new column? update the brain
        self.updateBrains()

    def delColumn(self, name, threshold=10000):
        """Deletes a row from the meta data schema"""
        names = list(self.names)
        _index = names.index(name)
        threshold = threshold if threshold is not None else 10000

        if name not in self.schema:
            LOG.error(
                'delColumn attempted to delete nonexistent '
                'column %s.', str(name))
            return

        del names[_index]

        # rebuild the schema
        schema = {}
        for i, name in enumerate(names):
            schema[name] = i

        self.schema = schema
        self.names = tuple(names)

        # update the brain
        self.updateBrains()

        # remove the column value from each record
        if len(self):
            _next_index = _index + 1
            pghandler = ZLogHandler(threshold)
            pghandler.init('Deleting %s column' % name, len(self))
            for i, (key, value) in enumerate(self.data.iteritems()):
                pghandler.report(i)
                self.data[key] = value[:_index] + value[_next_index:]
            pghandler.finish()

    def addIndex(self, name, index_type):
        """Create a new index, given a name and a index_type.

        The old format, in which index_type was a string such as 'FieldIndex',
        'TextIndex' or 'KeywordIndex', is no longer valid; the actual index
        must be instantiated and passed in to addIndex.

        New format: index_type is the actual index object to be stored.
        """

        if name in self.indexes:
            raise CatalogError('The index %s already exists' % name)

        if name.startswith('_'):
            raise CatalogError('Cannot index fields beginning with "_"')

        if not name:
            raise CatalogError('Name of index is empty')

        if name != name.strip():
            # Someone could have mistakenly added a space at the end
            # of the input field.
            LOG.warning('stripped space from new index %r -> %r', name,
                        name.strip())
            name = name.strip()

        indexes = self.indexes

        if isinstance(index_type, str):
            raise TypeError("Catalog addIndex now requires the index type to"
                            "be resolved prior to adding; create the proper "
                            "index in the caller.")

        indexes[name] = index_type
        self.indexes = indexes

    def delIndex(self, name):
        """ deletes an index """

        if name not in self.indexes:
            raise CatalogError('The index %s does not exist' % name)

        indexes = self.indexes
        del indexes[name]
        self.indexes = indexes

    def getIndex(self, name):
        """ get an index wrapped in the catalog """
        return self.indexes[name].__of__(self)

    def updateMetadata(self, object, uid, index):
        """ Given an object and a uid, update the column data for the
        uid with the object data iff the object has changed """
        data = self.data
        newDataRecord = self.recordify(object)

        if index is None:
            index = getattr(self, '_v_nextid', 0)
            if index % 4000 == 0:
                index = randint(-2000000000, 2000000000)
            while not data.insert(index, newDataRecord):
                index = randint(-2000000000, 2000000000)

            # We want ids to be somewhat random, but there are
            # advantages for having some ids generated
            # sequentially when many catalog updates are done at
            # once, such as when reindexing or bulk indexing.
            # We allocate ids sequentially using a volatile base,
            # so different threads get different bases. This
            # further reduces conflict and reduces churn here
            # and in result sets when bulk indexing.
            self._v_nextid = index + 1
        else:
            if data.get(index, 0) != newDataRecord:
                data[index] = newDataRecord
        return index

    # the cataloging API

    def catalogObject(self,
                      object,
                      uid,
                      threshold=None,
                      idxs=None,
                      update_metadata=True):
        """
        Adds an object to the Catalog by iteratively applying it to
        all indexes.

        'object' is the object to be cataloged

        'uid' is the unique Catalog identifier for this object

        If 'idxs' is specified (as a sequence), apply the object only
        to the named indexes.

        If 'update_metadata' is true (the default), also update metadata for
        the object.  If the object is new to the catalog, this flag has
        no effect (metadata is always created for new objects).
        """
        if idxs is None:
            idxs = []

        index = self.uids.get(uid, None)

        if index is None:
            # we are inserting new data
            index = self.updateMetadata(object, uid, None)
            self._length.change(1)
            self.uids[uid] = index
            self.paths[index] = uid
        elif update_metadata:
            # we are updating and we need to update metadata
            self.updateMetadata(object, uid, index)

        # do indexing
        total = 0

        if idxs == []:
            use_indexes = self.indexes.keys()
        else:
            use_indexes = set(idxs)
            for iid in self.indexes:
                x = self.getIndex(iid)
                if ITransposeQuery.providedBy(x):
                    # supported index names for query optimization
                    names = x.getIndexNames()
                    intersec = use_indexes.intersection(names)
                    # add current index for indexing if supported index
                    # names are member of idxs
                    if intersec:
                        use_indexes.update([iid])

            use_indexes = list(use_indexes)

        for name in use_indexes:
            x = self.getIndex(name)
            if hasattr(x, 'index_object'):
                blah = x.index_object(index, object, threshold)
                total = total + blah
            else:
                LOG.error('catalogObject was passed bad index '
                          'object %s.', str(x))

        return total

    def uncatalogObject(self, uid):
        """
        Uncatalog an object from the Catalog.  'uid' is a unique
        Catalog identifier.

        Note, the uid must be the same as when the object was
        catalogued, otherwise it will not get removed from the catalog

        This method should not raise an exception if the uid cannot
        be found in the catalog.
        """
        data = self.data
        uids = self.uids
        paths = self.paths
        indexes = self.indexes.keys()
        rid = uids.get(uid, None)

        if rid is not None:
            for name in indexes:
                x = self.getIndex(name)
                if hasattr(x, 'unindex_object'):
                    x.unindex_object(rid)
            del data[rid]
            del paths[rid]
            del uids[uid]
            self._length.change(-1)

        else:
            LOG.error(
                'uncatalogObject unsuccessfully '
                'attempted to uncatalog an object '
                'with a uid of %s. ', str(uid))

    def uniqueValuesFor(self, name):
        """ return unique values for FieldIndex name """
        return tuple(self.getIndex(name).uniqueValues())

    def hasuid(self, uid):
        """ return the rid if catalog contains an object with uid """
        return self.uids.get(uid)

    def recordify(self, object):
        """ turns an object into a record tuple """
        record = []
        # the unique id is always the first element
        for x in self.names:
            attr = getattr(object, x, MV)
            if (attr is not MV and safe_callable(attr)):
                attr = attr()
            record.append(attr)
        return tuple(record)

    def _maintain_zodb_cache(self):
        parent = aq_parent(self)
        if hasattr(aq_base(parent), 'maintain_zodb_cache'):
            parent.maintain_zodb_cache()

    def instantiate(self, record, score_data=None):
        """ internal method: create and initialise search result object.
        record should be a tuple of (document RID, metadata columns tuple),
        score_data can be a tuple of (score, normalized score) or be omitted"""
        self._maintain_zodb_cache()
        key, data = record
        klass = self._v_result_class
        if score_data:
            score, normalized_score = score_data
            schema_len = len(klass.__record_schema__)
            if schema_len == len(data) + 3:
                # if we have complete data, create in a single pass
                data = tuple(data) + (key, score, normalized_score)
                return klass(data).__of__(aq_parent(self))
        r = klass(data)
        r.data_record_id_ = key
        if score_data:
            # preserved during refactoring for compatibility reasons:
            # can only be reached if score_data is present,
            # but schema length is not equal to len(data) + 3;
            # no known use cases.
            r.data_record_score_ = score
            r.data_record_normalized_score_ = normalized_score
            return r.__of__(aq_parent(self))
        return r.__of__(self)

    def getMetadataForRID(self, rid):
        record = self.data[rid]
        result = {}
        for (key, pos) in self.schema.items():
            result[key] = record[pos]
        return result

    def getIndexDataForRID(self, rid):
        result = {}
        for name in self.indexes:
            result[name] = self.getIndex(name).getEntryForObject(rid, "")
        return result

    def merge_query_args(self, query=None, **kw):
        if not kw and isinstance(query, dict):
            # Short cut for the best practice.
            return query

        merged_query = {}
        if isinstance(query, dict):
            merged_query.update(query)
        merged_query.update(kw)
        return merged_query

    def make_query(self, query):
        for iid in self.indexes:
            index = self.getIndex(iid)
            if ITransposeQuery.providedBy(index):
                query = index.make_query(query)

        # Canonicalize tuple/list query arguments.
        new_query = {}
        for key, value in query.items():
            if isinstance(value, (list, tuple)):
                new_query[key] = list(sorted(value))
            else:
                new_query[key] = value

        return new_query

    def _get_index_query_names(self, index):
        if hasattr(index, 'getIndexQueryNames'):
            return index.getIndexQueryNames()
        return (index.getId(), )

    def _sort_limit_arguments(self, query, sort_index, reverse, limit):
        b_start = int(query.get('b_start', 0))
        b_size = query.get('b_size', None)
        if b_size is not None:
            b_size = int(b_size)

        if b_size is not None:
            limit = b_start + b_size
        elif limit and b_size is None:
            b_size = limit

        if sort_index is None:
            sort_report_name = None
        else:
            if isinstance(sort_index, list):
                sort_name = '-'.join(i.getId() for i in sort_index)
            else:
                sort_name = sort_index.getId()
            if isinstance(reverse, list):
                reverse_name = '-'.join('desc' if r else 'asc'
                                        for r in reverse)
            else:
                reverse_name = 'desc' if reverse else 'asc'
            sort_report_name = 'sort_on#' + sort_name + '#' + reverse_name
            if limit is not None:
                sort_report_name += '#limit-%s' % limit
        return (b_start, b_size, limit, sort_report_name)

    def _sorted_search_indexes(self, query):
        # Simple implementation ordering only by limited result support
        query_keys = query.keys()
        order = []
        for name, index in self.indexes.items():
            for attr in self._get_index_query_names(index):
                if attr in query_keys:
                    order.append((ILimitedResultIndex.providedBy(index), name))
        order.sort()
        return [i[1] for i in order]

    def _limit_sequence(self,
                        sequence,
                        slen,
                        b_start=0,
                        b_size=None,
                        switched_reverse=False):
        if b_size is not None:
            sequence = sequence[b_start:b_start + b_size]
            if slen:
                slen = len(sequence)
        if switched_reverse:
            sequence.reverse()
        return (sequence, slen)

    def _search_index(self, cr, index_id, query, rs):
        cr.start_split(index_id)

        index_rs = None
        index = self.getIndex(index_id)
        limit_result = ILimitedResultIndex.providedBy(index)

        if IQueryIndex.providedBy(index):
            index_query = IndexQuery(query, index.id, index.query_options,
                                     index.operators, index.useOperator)
            if index_query.keys is not None:
                index_rs = index.query_index(index_query, rs)
        else:
            if limit_result:
                index_result = index._apply_index(query, rs)
            else:
                index_result = index._apply_index(query)

            # Parse (resultset, used_attributes) index return value.
            if index_result:
                index_rs, _ = index_result

        if not index_rs:
            # Short circuit if empty index result.
            rs = None
        else:
            # Provide detailed info about the pure intersection time.
            intersect_id = index_id + '#intersection'
            cr.start_split(intersect_id)
            # weightedIntersection preserves the values from any mappings
            # we get, as some indexes don't return simple sets.
            if hasattr(rs, 'items') or hasattr(index_rs, 'items'):
                _, rs = weightedIntersection(rs, index_rs)
            else:
                rs = intersection(rs, index_rs)

            cr.stop_split(intersect_id)

        # Consider the time it takes to intersect the index result
        # with the total result set to be part of the index time.
        cr.stop_split(index_id, result=index_rs, limit=limit_result)

        return rs

    def search(self,
               query,
               sort_index=None,
               reverse=False,
               limit=None,
               merge=True):
        """Iterate through the indexes, applying the query to each one. If
        merge is true then return a lazy result set (sorted if appropriate)
        otherwise return the raw (possibly scored) results for later merging.
        Limit is used in conjunction with sorting or scored results to inform
        the catalog how many results you are really interested in. The catalog
        can then use optimizations to save time and memory. The number of
        results is not guaranteed to fall within the limit however, you should
        still slice or batch the results as usual."""

        # Indexes fulfill a fairly large contract here. We hand each
        # index the query mapping we are given (which may be composed
        # of some combination of web request, kw mappings or plain old dicts)
        # and the index decides what to do with it. If the index finds work
        # for itself in the query, it returns the results and a tuple of
        # the attributes that were used. If the index finds nothing for it
        # to do then it returns None.

        # Canonicalize the request into a sensible query before passing it on
        query = self.make_query(query)

        cr = self.getCatalogPlan(query)
        cr.start()

        plan = cr.plan()
        if not plan:
            plan = self._sorted_search_indexes(query)

        rs = None  # result set
        for index_id in plan:
            # The actual core loop over all indices.
            if index_id not in self.indexes:
                # We can have bogus keys or the plan can contain index names
                # that have been removed in the meantime.
                continue

            rs = self._search_index(cr, index_id, query, rs)
            if not rs:
                break

        if not rs:
            # None of the indexes found anything to do with the query.
            result = LazyCat([])
            cr.stop()
            return result

        # Try to deduce the sort limit from batching arguments.
        b_start, b_size, limit, sort_report_name = self._sort_limit_arguments(
            query, sort_index, reverse, limit)

        # We got some results from the indexes, sort and convert to sequences.
        rlen = len(rs)
        if sort_index is None and hasattr(rs, 'items'):
            # Having an 'items' method means we have a data structure with
            # scores. Build a new result set, sort it by score, reverse
            # it, compute the normalized score, and Lazify it.

            if not merge:
                # Don't bother to sort here, return a list of
                # three tuples to be passed later to mergeResults.
                # Note that data_record_normalized_score_ cannot be
                # calculated and will always be 1 in this case.
                result = [(score, (1, score, rid), self.__getitem__)
                          for rid, score in rs.items()]
            else:
                cr.start_split('sort_on#score')

                # Sort it by score.
                rs = rs.byValue(0)
                max = float(rs[0][0])

                # Here we define our getter function inline so that
                # we can conveniently store the max value as a default arg
                # and make the normalized score computation lazy
                def getScoredResult(item, max=max, self=self):
                    """
                    Returns instances of self._v_brains, or whatever is
                    passed into self.useBrains.
                    """
                    score, key = item
                    norm_score = int(100.0 * score / max)
                    return self.instantiate((key, self.data[key]),
                                            score_data=(score, norm_score))

                sequence, slen = self._limit_sequence(rs, rlen, b_start,
                                                      b_size)
                result = LazyMap(getScoredResult,
                                 sequence,
                                 slen,
                                 actual_result_count=rlen)
                cr.stop_split('sort_on#score', None)

        elif sort_index is None and not hasattr(rs, 'values'):
            # no scores
            if hasattr(rs, 'keys'):
                rs = rs.keys()
            sequence, slen = self._limit_sequence(rs, rlen, b_start, b_size)
            result = LazyMap(self.__getitem__,
                             sequence,
                             slen,
                             actual_result_count=rlen)
        else:
            # Sort. If there are scores, then this block is not
            # reached, therefore 'sort-on' does not happen in the
            # context of a text index query.  This should probably
            # sort by relevance first, then the 'sort-on' attribute.
            cr.start_split(sort_report_name)
            result = self.sortResults(rs,
                                      sort_index,
                                      reverse,
                                      limit,
                                      merge,
                                      actual_result_count=rlen,
                                      b_start=b_start,
                                      b_size=b_size)
            cr.stop_split(sort_report_name, None)

        cr.stop()
        return result

    def _sort_iterate_index(self, actual_result_count, result, rs, limit,
                            merge, reverse, sort_index, sort_index_length,
                            sort_spec, second_indexes_key_map):
        # The result set is much larger than the sorted index,
        # so iterate over the sorted index for speed.
        # TODO: len(sort_index) isn't actually what we want for a keyword
        # index, as it's only the unique values, not the documents.
        # Don't use this case while using limit, as we return results of
        # non-flattened intsets, and would have to merge/unflatten those
        # before limiting.
        length = 0
        try:
            intersection(rs, IISet(()))
        except TypeError:
            # rs is not an object in the IIBTree family.
            # Try to turn rs into an IISet.
            rs = IISet(rs)

        if sort_index_length == 1:
            for k, intset in sort_index.items():
                # We have an index that has a set of values for
                # each sort key, so we intersect with each set and
                # get a sorted sequence of the intersections.
                intset = intersection(rs, intset)
                if intset:
                    keys = getattr(intset, 'keys', None)
                    if keys is not None:
                        # Is this ever true?
                        intset = keys()
                    length += len(intset)
                    result.append((k, intset, self.__getitem__))
            result.sort(reverse=reverse)
        else:
            for k, intset in sort_index.items():
                # We have an index that has a set of values for
                # each sort key, so we intersect with each set and
                # get a sorted sequence of the intersections.
                intset = intersection(rs, intset)
                if intset:
                    keys = getattr(intset, 'keys', None)
                    if keys is not None:
                        # Is this ever true?
                        intset = keys()
                    length += len(intset)
                    # sort on secondary index
                    keysets = defaultdict(list)
                    for i in intset:
                        full_key = (k, )
                        for km in second_indexes_key_map:
                            try:
                                full_key += (km[i], )
                            except KeyError:
                                pass
                        keysets[full_key].append(i)
                    for k2, v2 in keysets.items():
                        result.append((k2, v2, self.__getitem__))
            result = multisort(result, sort_spec)

        return (actual_result_count, length, result)

    def _sort_iterate_resultset(self, actual_result_count, result, rs, limit,
                                merge, reverse, sort_index, sort_index_length,
                                sort_spec, second_indexes_key_map):
        # Iterate over the result set getting sort keys from the index.
        # If we are interested in 25% or more of the result set,
        # the N-Best algorithm is slower, so we iterate over all.
        index_key_map = sort_index.documentToKeyMap()

        if sort_index_length == 1:
            for did in rs:
                try:
                    key = index_key_map[did]
                except KeyError:
                    # This document is not in the sort key index, skip it.
                    actual_result_count -= 1
                else:
                    # The reference back to __getitem__ is used in case
                    # we do not merge now and need to intermingle the
                    # results with those of other catalogs while avoiding
                    # the cost of instantiating a LazyMap per result
                    result.append((key, did, self.__getitem__))
            if merge:
                result = sorted(result,
                                key=lambda x: (0, ) if x[0] is None else x,
                                reverse=reverse)
        else:
            for did in rs:
                try:
                    full_key = (index_key_map[did], )
                    for km in second_indexes_key_map:
                        full_key += (km[did], )
                except KeyError:
                    # This document is not in the sort key index, skip it.
                    actual_result_count -= 1
                else:
                    result.append((full_key, did, self.__getitem__))
            if merge:
                result = multisort(result, sort_spec)

        if merge and limit is not None:
            result = result[:limit]

        return (actual_result_count, 0, result)

    def _sort_nbest(self, actual_result_count, result, rs, limit, merge,
                    reverse, sort_index, sort_index_length, sort_spec,
                    second_indexes_key_map):
        # Limit / sort results using N-Best algorithm
        # This is faster for large sets than a full sort
        # and uses far less memory.
        index_key_map = sort_index.documentToKeyMap()
        keys = []
        n = 0
        worst = None
        if sort_index_length == 1:
            for did in rs:
                try:
                    key = index_key_map[did]
                except KeyError:
                    # This document is not in the sort key index, skip it.
                    actual_result_count -= 1
                else:
                    if n >= limit and key <= worst:
                        continue
                    i = bisect(keys, key)
                    keys.insert(i, key)
                    result.insert(i, (key, did, self.__getitem__))
                    if n == limit:
                        del keys[0], result[0]
                    else:
                        n += 1
                    worst = keys[0]
            result.reverse()
        else:
            for did in rs:
                try:
                    key = index_key_map[did]
                    full_key = (key, )
                    for km in second_indexes_key_map:
                        full_key += (km[did], )
                except KeyError:
                    # This document is not in the sort key index, skip it.
                    actual_result_count -= 1
                else:
                    if n >= limit and key <= worst:
                        continue
                    i = bisect(keys, key)
                    keys.insert(i, key)
                    result.insert(i, (full_key, did, self.__getitem__))
                    if n == limit:
                        del keys[0], result[0]
                    else:
                        n += 1
                    worst = keys[0]
            result = multisort(result, sort_spec)

        return (actual_result_count, 0, result)
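        # Illustrative sketch (not part of the original module): the N-Best
        # selection keeps only the `limit` largest keys seen so far in a
        # bisect-maintained list; anything not better than the current worst
        # retained key is skipped.  With plain lists:
        #
        #     from bisect import bisect
        #
        #     def nbest(pairs, limit):          # pairs: iterable of (key, did)
        #         keys, result, n, worst = [], [], 0, None
        #         for key, did in pairs:
        #             if n >= limit and key <= worst:
        #                 continue
        #             i = bisect(keys, key)
        #             keys.insert(i, key)
        #             result.insert(i, (key, did))
        #             if n == limit:
        #                 del keys[0], result[0]
        #             else:
        #                 n += 1
        #             worst = keys[0]
        #         result.reverse()              # largest key first
        #         return result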

    def _sort_nbest_reverse(self, actual_result_count, result, rs, limit,
                            merge, reverse, sort_index, sort_index_length,
                            sort_spec, second_indexes_key_map):
        # Limit / sort results using N-Best algorithm in reverse (N-Worst?)
        index_key_map = sort_index.documentToKeyMap()
        keys = []
        n = 0
        best = None
        if sort_index_length == 1:
            for did in rs:
                try:
                    key = index_key_map[did]
                except KeyError:
                    # This document is not in the sort key index, skip it.
                    actual_result_count -= 1
                else:
                    if n >= limit and key >= best:
                        continue
                    i = bisect(keys, key)
                    keys.insert(i, key)
                    result.insert(i, (key, did, self.__getitem__))
                    if n == limit:
                        del keys[-1], result[-1]
                    else:
                        n += 1
                    best = keys[-1]
        else:
            for did in rs:
                try:
                    key = index_key_map[did]
                    full_key = (key, )
                    for km in second_indexes_key_map:
                        full_key += (km[did], )
                except KeyError:
                    # This document is not in the sort key index, skip it.
                    actual_result_count -= 1
                else:
                    if n >= limit and key >= best:
                        continue
                    i = bisect(keys, key)
                    keys.insert(i, key)
                    result.insert(i, (full_key, did, self.__getitem__))
                    if n == limit:
                        del keys[-1], result[-1]
                    else:
                        n += 1
                    best = keys[-1]
            result = multisort(result, sort_spec)

        return (actual_result_count, 0, result)

    def sortResults(self,
                    rs,
                    sort_index,
                    reverse=False,
                    limit=None,
                    merge=True,
                    actual_result_count=None,
                    b_start=0,
                    b_size=None):
        # Sort a result set using one or more sort indexes. Both sort_index
        # and reverse can be lists of indexes and reverse specifications.
        # Return a lazy result set in sorted order if merge is true,
        # otherwise return a list of (sortkey, uid, getter_function) tuples,
        # where sortkey can itself be a tuple.
        second_indexes = None
        second_indexes_key_map = None
        sort_index_length = 1
        if isinstance(sort_index, list):
            sort_index_length = len(sort_index)
            if sort_index_length > 1:
                second_indexes = sort_index[1:]
                second_indexes_key_map = []
                for si in second_indexes:
                    second_indexes_key_map.append(si.documentToKeyMap())
            sort_index = sort_index[0]

        result = []
        if hasattr(rs, 'keys'):
            rs = rs.keys()
        if actual_result_count is None:
            rlen = len(rs)
            actual_result_count = rlen
        else:
            rlen = actual_result_count

        # don't limit to more than what we have
        if limit is not None and limit >= rlen:
            limit = rlen

        # if we want a batch from the end of the result set, reverse sorting
        # order and limit it, then reverse the result set again
        switched_reverse = False
        if b_size and b_start and b_start > rlen / 2:
            if isinstance(reverse, list):
                reverse = [not r for r in reverse]
            else:
                reverse = not reverse
            switched_reverse = True
            b_end = b_start + b_size
            if b_end >= rlen:
                overrun = rlen - b_end
                if b_start >= rlen:
                    # bail out, we are outside the possible range
                    return LazyCat([], 0, actual_result_count)
                else:
                    b_size += overrun
                b_start = 0
            else:
                b_start = rlen - b_end
            limit = b_start + b_size

        # determine sort_spec
        if isinstance(reverse, list):
            sort_spec = [r and -1 or 1 for r in reverse]
            # limit to current maximum of sort indexes
            sort_spec = sort_spec[:sort_index_length]
            # use first sort order for choosing the algorithm
            first_reverse = reverse[0]
        else:
            sort_spec = []
            for i in range(sort_index_length):
                sort_spec.append(reverse and -1 or 1)
            first_reverse = reverse

        # Special first condition, as it changes post-processing.
        iterate_sort_index = (merge and limit is None
                              and (rlen >
                                   (len(sort_index) * (rlen / 100 + 1))))

        # Choose one of the sort algorithms.
        if iterate_sort_index:
            sort_func = self._sort_iterate_index
        elif limit is None or (limit * 4 > rlen):
            sort_func = self._sort_iterate_resultset
        elif first_reverse:
            sort_func = self._sort_nbest
        else:
            sort_func = self._sort_nbest_reverse

        actual_result_count, length, result = sort_func(
            actual_result_count, result, rs, limit, merge, reverse, sort_index,
            sort_index_length, sort_spec, second_indexes_key_map)

        sequence, slen = self._limit_sequence(result, length, b_start, b_size,
                                              switched_reverse)

        if iterate_sort_index:
            result = LazyCat(LazyValues(sequence), slen, actual_result_count)
        else:
            if not merge:
                return sequence

            result = LazyValues(sequence)
            result.actual_result_count = actual_result_count

        return LazyMap(self.__getitem__,
                       result,
                       len(result),
                       actual_result_count=actual_result_count)
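        # Illustrative usage (hedged sketch, not part of the original
        # module): sorting a result set `rs` of record ids on two
        # hypothetical indexes, newest first, then by title:
        #
        #     idx_date = self.indexes['modified']          # hypothetical indexes
        #     idx_title = self.indexes['sortable_title']
        #     lazy = self.sortResults(rs, [idx_date, idx_title],
        #                             reverse=[True, False], limit=50)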

    def _get_sort_attr(self, attr, kw):
        """Helper function to find sort-on or sort-order."""
        # There are three different ways to find the attribute:
        # 1. kw[sort-attr]
        # 2. self.sort-attr
        # 3. kw[sort_attr]
        # kw may be a dict or an ExtensionClass MultiMapping, which
        # differ in what get() returns with no default value.
        name = "sort-%s" % attr
        val = kw.get(name, None)
        if val is not None:
            return val
        val = getattr(self, name, None)
        if val is not None:
            return val
        return kw.get("sort_%s" % attr, None)

    def _getSortIndex(self, args):
        """Returns a list of search index objects or None."""
        sort_index_names = self._get_sort_attr("on", args)
        if sort_index_names is not None:
            # self.indexes is always a dict, so get() w/ 1 arg works
            sort_indexes = []
            if not isinstance(sort_index_names, (list, tuple)):
                sort_index_names = [sort_index_names]
            for name in sort_index_names:
                sort_index = self.indexes.get(name)
                if sort_index is None:
                    raise CatalogError('Unknown sort_on index: %s' %
                                       repr(name))
                else:
                    if not hasattr(sort_index, 'documentToKeyMap'):
                        raise CatalogError(
                            'The index chosen for sort_on is '
                            'not capable of being used as a sort index: '
                            '%s' % repr(name))
                sort_indexes.append(sort_index)
            if len(sort_indexes) == 1:
                # be nice and keep the old API intact for single sort_on's
                return sort_indexes[0]
            return sort_indexes
        return None

    def searchResults(self, query=None, _merge=True, **kw):
        # You should pass in a simple dictionary as the first argument,
        # which only contains the relevant query.
        query = self.merge_query_args(query, **kw)
        sort_indexes = self._getSortIndex(query)
        sort_limit = self._get_sort_attr('limit', query)
        reverse = False
        if sort_indexes is not None:
            order = self._get_sort_attr("order", query)
            reverse = []
            if order is None:
                order = ['']
            elif isinstance(order, str):
                order = [order]
            for o in order:
                reverse.append(o.lower() in ('reverse', 'descending'))
            if len(reverse) == 1:
                # be nice and keep the old API intact for single sort_order
                reverse = reverse[0]
        # Perform searches with indexes and sort_index
        return self.search(query, sort_indexes, reverse, sort_limit, _merge)

    __call__ = searchResults
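    # Illustrative usage (hedged sketch, not part of the original module):
    # assuming hypothetical 'portal_type' and 'modified' indexes, a sorted
    # and limited search could look like:
    #
    #     brains = catalog.searchResults({'portal_type': 'Document'},
    #                                    sort_on='modified',
    #                                    sort_order='descending',
    #                                    sort_limit=20)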

    def getCatalogPlan(self, query=None):
        """Query time reporting and planning.
        """
        parent = aq_base(aq_parent(self))
        threshold = getattr(parent, 'long_query_time', 0.1)
        return CatalogPlan(self, query, threshold)
Example n. 7
0
class Indexer(object):

    filestorage = database = connection = root = None

    def __init__(self, datafs, writable=0, trans=0, pack=0):
        self.trans_limit = trans
        self.pack_limit = pack
        self.trans_count = 0
        self.pack_count = 0
        self.stopdict = get_stopdict()
        self.mh = mhlib.MH()
        self.filestorage = FileStorage(datafs, read_only=(not writable))
        self.database = DB(self.filestorage)
        self.connection = self.database.open()
        self.root = self.connection.root()
        try:
            self.index = self.root["index"]
        except KeyError:
            self.index = self.root["index"] = TextIndexWrapper()
        try:
            self.docpaths = self.root["docpaths"]
        except KeyError:
            self.docpaths = self.root["docpaths"] = IOBTree()
        try:
            self.doctimes = self.root["doctimes"]
        except KeyError:
            self.doctimes = self.root["doctimes"] = IIBTree()
        try:
            self.watchfolders = self.root["watchfolders"]
        except KeyError:
            self.watchfolders = self.root["watchfolders"] = {}
        self.path2docid = OIBTree()
        for docid in self.docpaths.keys():
            path = self.docpaths[docid]
            self.path2docid[path] = docid
        try:
            self.maxdocid = max(self.docpaths.keys())
        except ValueError:
            self.maxdocid = 0
        print(len(self.docpaths), "Document ids")
        print(len(self.path2docid), "Pathnames")
        print(self.index.lexicon.length(), "Words")
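        # Illustrative usage (hedged sketch, not part of the original
        # script): opening the index database writable, committing every 20
        # indexed messages and packing after every 3 commits:
        #
        #     indexer = Indexer('/path/to/index.fs', writable=1, trans=20,
        #                       pack=3)
        #     indexer.update(['+inbox'])    # index the hypothetical +inbox folder
        #     indexer.close()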

    def dumpfreqs(self):
        lexicon = self.index.lexicon
        index = self.index.index
        assert isinstance(index, OkapiIndex)
        L = []
        for wid in lexicon.wids():
            freq = 0
            for f in index._wordinfo.get(wid, {}).values():
                freq += f
            L.append((freq, wid, lexicon.get_word(wid)))
        L.sort()
        L.reverse()
        for freq, wid, word in L:
            print("%10d %10d %s" % (wid, freq, word))

    def dumpwids(self):
        lexicon = self.index.lexicon
        index = self.index.index
        assert isinstance(index, OkapiIndex)
        for wid in lexicon.wids():
            freq = 0
            for f in index._wordinfo.get(wid, {}).values():
                freq += f
            print("%10d %10d %s" % (wid, freq, lexicon.get_word(wid)))

    def dumpwords(self):
        lexicon = self.index.lexicon
        index = self.index.index
        assert isinstance(index, OkapiIndex)
        for word in lexicon.words():
            wid = lexicon.get_wid(word)
            freq = 0
            for f in index._wordinfo.get(wid, {}).values():
                freq += f
            print("%10d %10d %s" % (wid, freq, word))

    def close(self):
        self.root = None
        if self.connection is not None:
            self.connection.close()
            self.connection = None
        if self.database is not None:
            self.database.close()
            self.database = None
        if self.filestorage is not None:
            self.filestorage.close()
            self.filestorage = None

    def interact(self, nbest=NBEST, maxlines=MAXLINES):
        try:
            import readline
        except ImportError:
            pass
        text = ""
        top = 0
        results = []
        while 1:
            try:
                line = raw_input("Query: ")
            except EOFError:
                print("\nBye.")
                break
            line = line.strip()
            if line.startswith("/"):
                self.specialcommand(line, results, top - nbest)
                continue
            if line:
                text = line
                top = 0
            else:
                if not text:
                    continue
            try:
                results, n = self.timequery(text, top + nbest)
            except KeyboardInterrupt:
                raise
            except:
                reportexc()
                text = ""
                continue
            if len(results) <= top:
                if not n:
                    print("No hits for %r." % text)
                else:
                    print("No more hits for %r." % text)
                text = ""
                continue
            print("[Results %d-%d from %d" % (top+1, min(n, top+nbest), n),
                  end=" ")
            print("for query %s]" % repr(text))
            self.formatresults(text, results, maxlines, top, top+nbest)
            top += nbest

    def specialcommand(self, line, results, first):
        assert line.startswith("/")
        line = line[1:]
        if not line:
            n = first
        else:
            try:
                n = int(line) - 1
            except:
                print("Huh?")
                return
        if n < 0 or n >= len(results):
            print("Out of range")
            return
        docid, score = results[n]
        path = self.docpaths[docid]
        i = path.rfind("/")
        assert i > 0
        folder = path[:i]
        n = path[i+1:]
        cmd = "show +%s %s" % (folder, n)
        if os.getenv("DISPLAY"):
            os.system("xterm -e  sh -c '%s | less' &" % cmd)
        else:
            os.system(cmd)

    def query(self, text, nbest=NBEST, maxlines=MAXLINES):
        results, n = self.timequery(text, nbest)
        if not n:
            print("No hits for %r." % text)
            return
        print("[Results 1-%d from %d]" % (len(results), n))
        self.formatresults(text, results, maxlines)

    def timequery(self, text, nbest):
        t0 = time.time()
        c0 = time.clock()
        results, n = self.index.query(text, 0, nbest)
        t1 = time.time()
        c1 = time.clock()
        print("[Query time: %.3f real, %.3f user]" % (t1-t0, c1-c0))
        return results, n

    def formatresults(self, text, results, maxlines=MAXLINES,
                      lo=0, hi=sys.maxint):
        stop = self.stopdict.has_key
        words = [w for w in re.findall(r"\w+\*?", text.lower()) if not stop(w)]
        pattern = r"\b(" + "|".join(words) + r")\b"
        pattern = pattern.replace("*", ".*") # glob -> re syntax
        prog = re.compile(pattern, re.IGNORECASE)
        print('='*70)
        rank = lo
        for docid, score in results[lo:hi]:
            rank += 1
            path = self.docpaths[docid]
            score *= 100.0
            print("Rank:    %d   Score: %d%%   File: %s" % (rank, score, path))
            path = os.path.join(self.mh.getpath(), path)
            try:
                fp = open(path)
            except (IOError, OSError) as msg:
                print("Can't open:", msg)
                continue
            msg = mhlib.Message("<folder>", 0, fp)
            for header in "From", "To", "Cc", "Bcc", "Subject", "Date":
                h = msg.getheader(header)
                if h:
                    print("%-8s %s" % (header+":", h))
            text = self.getmessagetext(msg)
            if text:
                print()
                nleft = maxlines
                for part in text:
                    for line in part.splitlines():
                        if prog.search(line):
                            print(line)
                            nleft -= 1
                            if nleft <= 0:
                                break
                    if nleft <= 0:
                        break
            print('-'*70)

    def update(self, args):
        folder = None
        seqs = []

        for arg in args:
            if arg.startswith("+"):
                if folder is None:
                    folder = arg[1:]
                else:
                    print("only one folder at a time")
                    return
            else:
                seqs.append(arg)

        if not folder:
            folder = self.mh.getcontext()
        if not seqs:
            seqs = ['all']

        try:
            f = self.mh.openfolder(folder)
        except mhlib.Error as msg:
            print(msg)
            return

        dict = {}
        for seq in seqs:
            try:
                nums = f.parsesequence(seq)
            except mhlib.Error as msg:
                print(msg or "unparsable message sequence: %s" % repr(seq))
                return
            for n in nums:
                dict[n] = n
        msgs = dict.keys()
        msgs.sort()

        self.updatefolder(f, msgs)
        self.commit()

    def optimize(self, args):
        uniqwords = {}
        for folder in args:
            if folder.startswith("+"):
                folder = folder[1:]
            print("\nOPTIMIZE FOLDER", folder)
            try:
                f = self.mh.openfolder(folder)
            except mhlib.Error as msg:
                print(msg)
                continue
            self.prescan(f, f.listmessages(), uniqwords)
        L = [(uniqwords[word], word) for word in uniqwords.keys()]
        L.sort()
        L.reverse()
        for i in range(100):
            print("%3d. %6d %s" % ((i+1,) + L[i]))
        self.index.lexicon.sourceToWordIds([word for (count, word) in L])

    def prescan(self, f, msgs, uniqwords):
        pipeline = [Splitter(), CaseNormalizer(), StopWordRemover()]
        for n in msgs:
            print("prescanning", n)
            m = f.openmessage(n)
            text = self.getmessagetext(m, f.name)
            for p in pipeline:
                text = p.process(text)
            for word in text:
                uniqwords[word] = uniqwords.get(word, 0) + 1

    def bulkupdate(self, args):
        if not args:
            print("No folders specified; use ALL to bulk-index all folders")
            return
        if "ALL" in args:
            i = args.index("ALL")
            args[i:i+1] = self.mh.listfolders()
        for folder in args:
            if folder.startswith("+"):
                folder = folder[1:]
            print("\nFOLDER", folder)
            try:
                f = self.mh.openfolder(folder)
            except mhlib.Error as msg:
                print(msg)
                continue
            self.updatefolder(f, f.listmessages())
            print("Total", len(self.docpaths))
        self.commit()
        print("Indexed", self.index.lexicon._nbytes, "bytes and",)
        print(self.index.lexicon._nwords, "words;",)
        print(len(self.index.lexicon._words), "unique words.")

    def updatefolder(self, f, msgs):
        self.watchfolders[f.name] = self.getmtime(f.name)
        for n in msgs:
            path = "%s/%s" % (f.name, n)
            docid = self.path2docid.get(path, 0)
            if docid and self.getmtime(path) == self.doctimes.get(docid, 0):
                print("unchanged", docid, path)
                continue
            docid = self.newdocid(path)
            try:
                m = f.openmessage(n)
            except IOError:
                print("disappeared", docid, path)
                self.unindexpath(path)
                continue
            text = self.getmessagetext(m, f.name)
            if not text:
                self.unindexpath(path)
                continue
            print("indexing", docid, path)
            self.index.index_doc(docid, text)
            self.maycommit()
        # Remove messages from the folder that no longer exist
        for path in list(self.path2docid.keys(f.name)):
            if not path.startswith(f.name + "/"):
                break
            if self.getmtime(path) == 0:
                self.unindexpath(path)
        print("done.")

    def unindexpath(self, path):
        if self.path2docid.has_key(path):
            docid = self.path2docid[path]
            print("unindexing", docid, path)
            del self.docpaths[docid]
            del self.doctimes[docid]
            del self.path2docid[path]
            try:
                self.index.unindex_doc(docid)
            except KeyError as msg:
                print("KeyError", msg)
            self.maycommit()

    def getmessagetext(self, m, name=None):
        L = []
        if name:
            L.append("_folder " + name) # To restrict search to a folder
            self.getheaders(m, L)
        try:
            self.getmsgparts(m, L, 0)
        except KeyboardInterrupt:
            raise
        except:
            print("(getmsgparts failed:)")
            reportexc()
        return L

    def getmsgparts(self, m, L, level):
        ctype = m.gettype()
        if level or ctype != "text/plain":
            print(". "*level + str(ctype))
        if ctype == "text/plain":
            L.append(m.getbodytext())
        elif ctype in ("multipart/alternative", "multipart/mixed"):
            for part in m.getbodyparts():
                self.getmsgparts(part, L, level+1)
        elif ctype == "message/rfc822":
            f = StringIO(m.getbodytext())
            m = mhlib.Message("<folder>", 0, f)
            self.getheaders(m, L)
            self.getmsgparts(m, L, level+1)

    def getheaders(self, m, L):
        H = []
        for key in "from", "to", "cc", "bcc", "subject":
            value = m.get(key)
            if value:
                H.append(value)
        if H:
            L.append("\n".join(H))

    def newdocid(self, path):
        docid = self.path2docid.get(path)
        if docid is not None:
            self.doctimes[docid] = self.getmtime(path)
            return docid
        docid = self.maxdocid + 1
        self.maxdocid = docid
        self.docpaths[docid] = path
        self.doctimes[docid] = self.getmtime(path)
        self.path2docid[path] = docid
        return docid

    def getmtime(self, path):
        path = os.path.join(self.mh.getpath(), path)
        try:
            st = os.stat(path)
        except os.error as msg:
            return 0
        return int(st[ST_MTIME])

    def maycommit(self):
        self.trans_count += 1
        if self.trans_count >= self.trans_limit > 0:
            self.commit()

    def commit(self):
        if self.trans_count > 0:
            print("committing...")
            transaction.commit()
            self.trans_count = 0
            self.pack_count += 1
            if self.pack_count >= self.pack_limit > 0:
                self.pack()

    def pack(self):
        if self.pack_count > 0:
            print("packing...")
            self.database.pack()
            self.pack_count = 0
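    # Illustrative note (hedged, not part of the original script): the
    # trans/pack limits batch the ZODB work.  For example, with trans=100 and
    # pack=5 the indexer commits once per 100 indexed documents and packs the
    # FileStorage after every 5 commits.  A limit of 0 disables the step,
    # because of the chained comparison used above:
    #
    #     trans_count, trans_limit = 100, 0
    #     trans_count >= trans_limit > 0    # False -> never auto-commit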
Example n. 8
0
class Catalog(Persistent, Acquisition.Implicit, ExtensionClass.Base):
    """ An Object Catalog 

    An Object Catalog maintains a table of object metadata, and a
    series of manageable indexes to quickly search for objects
    (references in the metadata) that satisfy a search query.

    This class is not Zope specific, and can be used in any python
    program to build catalogs of objects.  Note that it does require
    the objects to be Persistent, and thus must be used with ZODB3.
    """

    _v_brains = NoBrainer

    def __init__(self, vocabulary=None, brains=None):
        # Catalogs no longer care about vocabularies and lexicons
        # so the vocabulary argument is ignored. (Casey)

        self.schema = {}    # mapping from attribute name to column number
        self.names = ()     # sequence of column names
        self.indexes = {}   # mapping from index name to index object

        # The catalog maintains a BTree of object meta_data for
        # convenient display on result pages.  meta_data attributes
        # are turned into brain objects and returned by
        # searchResults.  The indexing machinery indexes all records
        # by an integer id (rid).  self.data is a mapping from the
        # integer id to the meta_data, self.uids is a mapping of the
        # object unique identifier to the rid, and self.paths is a
        # mapping of the rid to the unique identifier.

        self.clear()

        if brains is not None:
            self._v_brains = brains

        self.updateBrains()

    
    def __len__(self):
        return self._length()

    def migrate__len__(self):
        """ migration of old __len__ magic for Zope 2.8 """
        if not hasattr(self, '_length'):
            n = self.__dict__['__len__']()
            del self.__dict__['__len__'] 
            self._length = BTrees.Length.Length(n)

    def clear(self):
        """ clear catalog """

        self.data  = IOBTree()  # mapping of rid to meta_data
        self.uids  = OIBTree()  # mapping of uid to rid
        self.paths = IOBTree()  # mapping of rid to uid
        self._length = BTrees.Length.Length()

        for index in self.indexes.keys():
            self.getIndex(index).clear()

    def updateBrains(self):
        self.useBrains(self._v_brains)

    def __getitem__(self, index, ttype=type(())):
        """
        Returns instances of self._v_brains, or whatever is passed
        into self.useBrains.
        """
        if type(index) is ttype:
            # then it contains a score...
            normalized_score, score, key = index
            r=self._v_result_class(self.data[key]).__of__(self.aq_parent)
            r.data_record_id_ = key
            r.data_record_score_ = score
            r.data_record_normalized_score_ = normalized_score
        else:
            # otherwise no score, set all scores to 1
            r=self._v_result_class(self.data[index]).__of__(self.aq_parent)
            r.data_record_id_ = index
            r.data_record_score_ = 1
            r.data_record_normalized_score_ = 1
        return r

    def __setstate__(self, state):
        """ initialize your brains.  This method is called when the
        catalog is first activated (from the persistent storage) """
        Persistent.__setstate__(self, state)
        self.updateBrains()

    def useBrains(self, brains):
        """ Sets up the Catalog to return an object (ala ZTables) that
        is created on the fly from the tuple stored in the self.data
        Btree.
        """

        class mybrains(AbstractCatalogBrain, brains):
            pass

        scopy = self.schema.copy()

        scopy['data_record_id_']=len(self.schema.keys())
        scopy['data_record_score_']=len(self.schema.keys())+1
        scopy['data_record_normalized_score_']=len(self.schema.keys())+2

        mybrains.__record_schema__ = scopy

        self._v_brains = brains
        self._v_result_class = mybrains

    def addColumn(self, name, default_value=None):
        """
        adds a column to the metadata schema
        """

        schema = self.schema
        names = list(self.names)

        if schema.has_key(name):
            raise CatalogError, 'The column %s already exists' % name

        if name[0] == '_':
            raise CatalogError, \
                  'Cannot cache fields beginning with "_"'

        if not schema.has_key(name):
            if schema.values():
                schema[name] = max(schema.values())+1
            else:
                schema[name] = 0
            names.append(name)

        if default_value is None or default_value == '':
            default_value = MV

        for key in self.data.keys():
            rec = list(self.data[key])
            rec.append(default_value)
            self.data[key] = tuple(rec)

        self.names = tuple(names)
        self.schema = schema

        # new column? update the brain
        self.updateBrains()

        self._p_changed = 1 # why?

    def delColumn(self, name):
        """
        deletes a column from the metadata schema
        """
        names = list(self.names)
        _index = names.index(name)

        if not self.schema.has_key(name):
            LOG.error('delColumn attempted to delete nonexistent column %s.' % str(name))
            return

        del names[_index]

        # rebuild the schema
        i=0; schema = {}
        for name in names:
            schema[name] = i
            i = i + 1

        self.schema = schema
        self.names = tuple(names)

        # update the brain
        self.updateBrains()

        # remove the column value from each record
        for key in self.data.keys():
            rec = list(self.data[key])
            del rec[_index]
            self.data[key] = tuple(rec)

    def addIndex(self, name, index_type):
        """Create a new index, given a name and a index_type.

        Old format: index_type was a string such as 'FieldIndex', 'TextIndex'
        or 'KeywordIndex'. This is no longer valid; the actual index must be
        instantiated and passed in to addIndex.

        New format: index_type is the actual index object to be stored.

        """

        if self.indexes.has_key(name):
            raise CatalogError, 'The index %s already exists' % name

        if name.startswith('_'):
            raise CatalogError, 'Cannot index fields beginning with "_"'

        if not name:
            raise CatalogError, 'Name of index is empty'

        indexes = self.indexes

        if isinstance(index_type, str):
            raise TypeError,"""Catalog addIndex now requires the index type to
            be resolved prior to adding; create the proper index in the caller."""

        indexes[name] = index_type;

        self.indexes = indexes

    def delIndex(self, name):
        """ deletes an index """

        if not self.indexes.has_key(name):
            raise CatalogError, 'The index %s does not exist' % name

        indexes = self.indexes
        del indexes[name]
        self.indexes = indexes

    def getIndex(self, name):
        """ get an index wrapped in the catalog """
        return self.indexes[name].__of__(self)

    def updateMetadata(self, object, uid):
        """ Given an object and a uid, update the column data for the
        uid with the object data iff the object has changed """
        data = self.data
        index = self.uids.get(uid, None)
        newDataRecord = self.recordify(object)

        if index is None:
            if type(data) is IOBTree:
                # New style, get random id

                index=getattr(self, '_v_nextid', 0)
                if index % 4000 == 0: 
                    index = randint(-2000000000, 2000000000)
                while not data.insert(index, newDataRecord):
                    index = randint(-2000000000, 2000000000)

                # We want ids to be somewhat random, but there are
                # advantages to having some ids generated
                # sequentially when many catalog updates are done at
                # once, such as when reindexing or bulk indexing.
                # We allocate ids sequentially using a volatile base,
                # so different threads get different bases. This
                # further reduces conflicts and reduces churn both
                # here and in result sets when bulk indexing.
                self._v_nextid=index+1
            else:
                if data:
                    # find the next available unique id
                    index = data.keys()[-1] + 1
                else:
                    index=0
                # meta_data is stored as a tuple for efficiency
                data[index] = newDataRecord
        else:
            if data.get(index, 0) != newDataRecord:
                data[index] = newDataRecord
        return index
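        # Illustrative sketch (not part of the original module): the record
        # id allocation above, reduced to a plain function.  Ids count up from
        # a volatile per-thread base and are re-randomized every 4000 ids or
        # whenever a candidate id is already taken (the BTree insert() call
        # above returns 0 on a clash):
        #
        #     from random import randint
        #
        #     def next_rid(data, nextid=0):
        #         rid = nextid
        #         if rid % 4000 == 0:
        #             rid = randint(-2000000000, 2000000000)
        #         while rid in data:
        #             rid = randint(-2000000000, 2000000000)
        #         return rid       # caller stores rid + 1 as the new volatile base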
        
    # the cataloging API

    def catalogObject(self, object, uid, threshold=None, idxs=None,
                      update_metadata=1):
        """
        Adds an object to the Catalog by iteratively applying it to
        all indexes.

        'object' is the object to be cataloged

        'uid' is the unique Catalog identifier for this object

        If 'idxs' is specified (as a sequence), apply the object only
        to the named indexes.

        If 'update_metadata' is true (the default), also update metadata for
        the object.  If the object is new to the catalog, this flag has
        no effect (metadata is always created for new objects).

        """

        if idxs is None:
            idxs = []

        data = self.data
        index = self.uids.get(uid, None)

        if index is None:  # we are inserting new data
            index = self.updateMetadata(object, uid)

            if not hasattr(self, '_length'):
                self.migrate__len__()
            self._length.change(1)
            self.uids[uid] = index
            self.paths[index] = uid

        elif update_metadata:  # we are updating and we need to update metadata
            self.updateMetadata(object, uid)

        # do indexing

        total = 0

        if idxs==[]: use_indexes = self.indexes.keys()
        else:        use_indexes = idxs

        for name in use_indexes:
            x = self.getIndex(name)
            if hasattr(x, 'index_object'):
                blah = x.index_object(index, object, threshold)
                total = total + blah
            else:
                LOG.error('catalogObject was passed bad index object %s.' % str(x))

        return total

    def uncatalogObject(self, uid):
        """
        Uncatalog an object from the Catalog.  'uid' is a unique
        Catalog identifier.

        Note, the uid must be the same as when the object was
        catalogued, otherwise it will not get removed from the catalog

        This method should not raise an exception if the uid cannot
        be found in the catalog.

        """
        data = self.data
        uids = self.uids
        paths = self.paths
        indexes = self.indexes.keys()
        rid = uids.get(uid, None)

        if rid is not None:
            for name in indexes:
                x = self.getIndex(name)
                if hasattr(x, 'unindex_object'):
                    x.unindex_object(rid)
            del data[rid]
            del paths[rid]
            del uids[uid]
            if not hasattr(self, '_length'):
                self.migrate__len__()
            self._length.change(-1)
            
        else:
            LOG.error('uncatalogObject unsuccessfully '
                      'attempted to uncatalog an object '
                      'with a uid of %s. ' % str(uid))


    def uniqueValuesFor(self, name):
        """ return unique values for FieldIndex name """
        return self.getIndex(name).uniqueValues()

    def hasuid(self, uid):
        """ return the rid if catalog contains an object with uid """
        return self.uids.get(uid)

    def recordify(self, object):
        """ turns an object into a record tuple """
        record = []
        # the unique id is always the first element
        for x in self.names:
            attr=getattr(object, x, MV)
            if(attr is not MV and safe_callable(attr)): attr=attr()
            record.append(attr)
        return tuple(record)
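        # Illustrative sketch (not part of the original module): metadata
        # records are plain tuples whose positions are described by
        # self.schema, e.g.:
        #
        #     schema = {'Title': 0, 'modified': 1}      # hypothetical columns
        #     record = ('Front page', '2024-01-01')
        #     metadata = dict((name, record[pos]) for name, pos in schema.items())
        #     # metadata == {'Title': 'Front page', 'modified': '2024-01-01'}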

    def instantiate(self, record):
        r=self._v_result_class(record[1])
        r.data_record_id_ = record[0]
        return r.__of__(self)


    def getMetadataForRID(self, rid):
        record = self.data[rid]
        result = {}
        for (key, pos) in self.schema.items():
            result[key] = record[pos]
        return result

    def getIndexDataForRID(self, rid):
        result = {}
        for name in self.indexes.keys():
            result[name] = self.getIndex(name).getEntryForObject(rid, "")
        return result

## This is the Catalog search engine. Most of the heavy lifting happens below

    def search(self, request, sort_index=None, reverse=0, limit=None, merge=1):
        """Iterate through the indexes, applying the query to each one. If
        merge is true then return a lazy result set (sorted if appropriate)
        otherwise return the raw (possibly scored) results for later merging.
        Limit is used in conjunction with sorting or scored results to inform
        the catalog how many results you are really interested in. The catalog
        can then use optimizations to save time and memory. The number of
        results is not guaranteed to fall within the limit, however, so you
        should still slice or batch the results as usual."""

        rs = None # resultset

        # Indexes fulfill a fairly large contract here. We hand each
        # index the request mapping we are given (which may be composed
        # of some combination of web request, kw mappings or plain old dicts)
        # and the index decides what to do with it. If the index finds work
        # for itself in the request, it returns the results and a tuple of
        # the attributes that were used. If the index finds nothing for it
        # to do then it returns None.

        # For hysterical reasons, if all indexes return None for a given
        # request (and no attributes were used) then we append all results
        # in the Catalog. This generally happens when the search values
        # in request are all empty strings or do not correspond to any of
        # the indexes.

        # Note that if the indexes find query arguments, but the end result
        # is an empty sequence, we do nothing

        for i in self.indexes.keys():
            index = self.getIndex(i)
            _apply_index = getattr(index, "_apply_index", None)
            if _apply_index is None:
                continue
            r = _apply_index(request)

            if r is not None:
                r, u = r
                w, rs = weightedIntersection(rs, r)
        
        if rs is None:
            # None of the indexes found anything to do with the request
            # We take this to mean that the query was empty (an empty filter)
            # and so we return everything in the catalog
            if sort_index is None:
                return LazyMap(self.instantiate, self.data.items(), len(self))
            else:
                return self.sortResults(
                    self.data, sort_index, reverse,  limit, merge)
        elif rs:
            # We got some results from the indexes.
            # Sort and convert to sequences.
            # XXX: The check for 'values' is really stupid since we call
            # items() and *not* values()
            if sort_index is None and hasattr(rs, 'values'):
                # having a 'values' means we have a data structure with
                # scores.  Build a new result set, sort it by score, reverse
                # it, compute the normalized score, and Lazify it.
                                
                if not merge:
                    # Don't bother to sort here, return a list of 
                    # three tuples to be passed later to mergeResults
                    # note that data_record_normalized_score_ cannot be
                    # calculated and will always be 1 in this case
                    getitem = self.__getitem__
                    return [(score, (1, score, rid), getitem) 
                            for rid, score in rs.items()]
                
                rs = rs.byValue(0) # sort it by score
                max = float(rs[0][0])

                # Here we define our getter function inline so that
                # we can conveniently store the max value as a default arg
                # and make the normalized score computation lazy
                def getScoredResult(item, max=max, self=self):
                    """
                    Returns instances of self._v_brains, or whatever is passed
                    into self.useBrains.
                    """
                    score, key = item
                    r=self._v_result_class(self.data[key])\
                          .__of__(self.aq_parent)
                    r.data_record_id_ = key
                    r.data_record_score_ = score
                    r.data_record_normalized_score_ = int(100. * score / max)
                    return r
                
                return LazyMap(getScoredResult, rs, len(rs))

            elif sort_index is None and not hasattr(rs, 'values'):
                # no scores
                if hasattr(rs, 'keys'):
                    rs = rs.keys()
                return LazyMap(self.__getitem__, rs, len(rs))
            else:
                # sort.  If there are scores, then this block is not
                # reached, therefore 'sort-on' does not happen in the
                # context of a text index query.  This should probably
                # sort by relevance first, then the 'sort-on' attribute.
                return self.sortResults(rs, sort_index, reverse, limit, merge)
        else:
            # Empty result set
            return LazyCat([])

    def sortResults(self, rs, sort_index, reverse=0, limit=None, merge=1):
        # Sort a result set using a sort index. Return a lazy
        # result set in sorted order if merge is true otherwise
        # returns a list of (sortkey, uid, getter_function) tuples
        #
        # The two 'for' loops in here contribute a significant
        # proportion of the time to perform an indexed search.
        # Try to avoid all non-local attribute lookup inside
        # those loops.
        assert limit is None or limit > 0, 'Limit value must be 1 or greater'
        _lazymap = LazyMap
        _intersection = intersection
        _self__getitem__ = self.__getitem__
        index_key_map = sort_index.documentToKeyMap()
        _None = None
        _keyerror = KeyError
        result = []
        append = result.append
        if hasattr(rs, 'keys'):
            rs = rs.keys()
        rlen = len(rs)
        
        if merge and limit is None and (
            rlen > (len(sort_index) * (rlen / 100 + 1))):
            # The result set is much larger than the sorted index,
            # so iterate over the sorted index for speed.
            # This is rarely exercised in practice...
            
            length = 0

            try:
                intersection(rs, IISet(()))
            except TypeError:
                # rs is not an object in the IIBTree family.
                # Try to turn rs into an IISet.
                rs = IISet(rs)

            for k, intset in sort_index.items():
                # We have an index that has a set of values for
                # each sort key, so we intersect with each set and
                # get a sorted sequence of the intersections.
                intset = _intersection(rs, intset)
                if intset:
                    keys = getattr(intset, 'keys', _None)
                    if keys is not _None:
                        # Is this ever true?
                        intset = keys()
                    length += len(intset)
                    append((k, intset, _self__getitem__))
                    # Note that sort keys are unique.
            
            result.sort()
            if reverse:
                result.reverse()
            result = LazyCat(LazyValues(result), length)
        elif limit is None or (limit * 4 > rlen):
            # Iterate over the result set getting sort keys from the index
            for did in rs:
                try:
                    key = index_key_map[did]
                except _keyerror:
                    # This document is not in the sort key index, skip it.
                    pass
                else:
                    append((key, did, _self__getitem__))
                    # The reference back to __getitem__ is used in case
                    # we do not merge now and need to intermingle the
                    # results with those of other catalogs while avoiding
                    # the cost of instantiating a LazyMap per result
            if merge:
                result.sort()
                if reverse:
                    result.reverse()
                if limit is not None:
                    result = result[:limit]                    
                result = LazyValues(result)
            else:
                return result
        elif reverse: 
            # Limit/sort results using N-Best algorithm
            # This is faster for large sets than a full sort
            # and uses far less memory.
            keys = []
            n = 0
            worst = None
            for did in rs:
                try:
                    key = index_key_map[did]
                except _keyerror:
                    # This document is not in the sort key index, skip it.
                    pass
                else:
                    if n >= limit and key <= worst:
                        continue
                    i = bisect(keys, key)
                    keys.insert(i, key)
                    result.insert(i, (key, did, _self__getitem__))
                    if n == limit:
                        del keys[0], result[0]
                    else:
                        n += 1
                    worst = keys[0]
            result.reverse()
            if merge:
                result = LazyValues(result) 
            else:
                return result
        elif not reverse:
            # Limit/sort results using N-Best algorithm in reverse (N-Worst?)
            keys = []
            n = 0
            best = None
            for did in rs:
                try:
                    key = index_key_map[did]
                except _keyerror:
                    # This document is not in the sort key index, skip it.
                    pass
                else:
                    if n >= limit and key >= best:
                        continue
                    i = bisect(keys, key)
                    keys.insert(i, key)
                    result.insert(i, (key, did, _self__getitem__))
                    if n == limit:
                        del keys[-1], result[-1]
                    else:
                        n += 1
                    best = keys[-1]
            if merge:
                result = LazyValues(result) 
            else:
                return result
        
        result = LazyMap(self.__getitem__, result, len(result))
        result.actual_result_count = rlen
        return result

    def _get_sort_attr(self, attr, kw):
        """Helper function to find sort-on or sort-order."""
        # There are three different ways to find the attribute:
        # 1. kw[sort-attr]
        # 2. self.sort-attr
        # 3. kw[sort_attr]
        # kw may be a dict or an ExtensionClass MultiMapping, which
        # differ in what get() returns with no default value.
        name = "sort-%s" % attr
        val = kw.get(name, None)
        if val is not None:
            return val
        val = getattr(self, name, None)
        if val is not None:
            return val
        return kw.get("sort_%s" % attr, None)


    def _getSortIndex(self, args):
        """Returns a search index object or None."""
        sort_index_name = self._get_sort_attr("on", args)
        if sort_index_name is not None:
            # self.indexes is always a dict, so get() w/ 1 arg works
            sort_index = self.indexes.get(sort_index_name)
            if sort_index is None:
                raise CatalogError, 'Unknown sort_on index (%s)' % sort_index_name
            else:
                if not hasattr(sort_index, 'keyForDocument'):
                    raise CatalogError(
                        'The index chosen for sort_on (%s) is not capable of being'
                        ' used as a sort index.' % sort_index_name
                        )
            return sort_index
        else:
            return None

    def searchResults(self, REQUEST=None, used=None, _merge=1, **kw):
        # The used argument is deprecated and is ignored
        if REQUEST is None and not kw:
            # Try to acquire request if we get no args for bw compat
            REQUEST = getattr(self, 'REQUEST', None)
        args = CatalogSearchArgumentsMap(REQUEST, kw)
        sort_index = self._getSortIndex(args)
        sort_limit = self._get_sort_attr('limit', args)
        reverse = 0
        if sort_index is not None:
            order = self._get_sort_attr("order", args)
            if (isinstance(order, str) and
                order.lower() in ('reverse', 'descending')):
                reverse = 1
        # Perform searches with indexes and sort_index
        return self.search(args, sort_index, reverse, sort_limit, _merge)
                
    __call__ = searchResults
Example n. 9
0
class ExtendedPathIndex(PathIndex):
    """A path index stores all path components of the physical path of an
    object.

    Internal data structure (regular PathIndex):

    - a physical path of an object is split into its components

    - every component is kept as a key of an OOBTree in self._index

    - the value is a mapping 'level of the path component' to
      'all docids with this path component on this level'

    In addition

    - there is a terminator (None) signifying the last component in the path

    - 2 additional indexes map absolute path to either the doc id or doc ids of
      contained objects. This allows for rapid answering of common queries.
    """

    meta_type = "ExtendedPathIndex"

    manage_options = ({'label': 'Settings', 'action': 'manage_main'}, )

    indexed_attrs = None
    query_options = ("query", "level", "operator", "depth", "navtree",
                     "navtree_start")

    def __init__(self, id, extra=None, caller=None):
        """ ExtendedPathIndex supports indexed_attrs """
        PathIndex.__init__(self, id, caller)

        if isinstance(extra, dict):
            attrs = extra.get('indexed_attrs', None)
        else:
            attrs = getattr(extra, 'indexed_attrs', None)

        if attrs is None:
            return

        if isinstance(attrs, str):
            attrs = attrs.split(',')
        attrs = [a.strip() for a in attrs]
        attrs = [a for a in attrs if a]

        if attrs:
            # We only index the first attribute so snip off the rest
            self.indexed_attrs = tuple(attrs[:1])
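
        # Illustrative usage (hedged sketch, not part of the original
        # module): restricting the index to a single custom attribute,
        # e.g. a hypothetical 'getCustomPath' method:
        #
        #     index = ExtendedPathIndex(
        #         'path', extra={'indexed_attrs': 'getCustomPath'})
        #     index.indexed_attrs    # -> ('getCustomPath',)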

    def clear(self):
        PathIndex.clear(self)
        self._index_parents = OOBTree()
        self._index_items = OIBTree()

    def index_object(self, docid, obj, threshold=100):
        """ hook for (Z)Catalog """

        # PathIndex first checks for an attribute matching its id and
        # falls back to getPhysicalPath only when failing to get one.
        # If self.indexed_attrs is not None, its value overrides this behavior.

        attrs = self.indexed_attrs
        index = attrs is None and self.id or attrs[0]

        path = getattr(obj, index, None)
        if path is not None:
            if safe_callable(path):
                path = path()

            if not isinstance(path, (str, tuple)):
                raise TypeError('path value must be string or tuple '
                                'of strings: (%r, %s)' % (index, repr(path)))
        else:
            try:
                path = obj.getPhysicalPath()
            except AttributeError:
                return 0

        if isinstance(path, (list, tuple)):
            path = '/' + '/'.join(path[1:])
        comps = [p for p in path.split('/') if p]

        # Make sure we reindex properly when the path changes
        old_path = self._unindex.get(docid, _marker)
        if old_path is not _marker:
            if old_path != path:
                self.unindex_object(docid, _old=old_path)
                # unindex reduces length, we need to counter that
                self._length.change(1)
        else:
            # We only get a new entry if the value wasn't there before.
            # If it already existed the length is unchanged
            self._length.change(1)

        for i, comp in enumerate(comps):
            self.insertEntry(comp, docid, i)

        # Add terminator
        self.insertEntry(None, docid, len(comps) - 1)

        # Add full-path indexes, to optimize certain edge cases
        parent_path = '/' + '/'.join(comps[:-1])
        parents = self._index_parents.get(parent_path, _marker)
        if parents is _marker:
            self._index_parents[parent_path] = parents = IITreeSet()
        parents.insert(docid)
        self._index_items[path] = docid

        self._unindex[docid] = path
        return 1
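        # Illustrative sketch (not part of the original module): for a
        # document with physical path '/plone/folder/doc' and docid 42 the
        # index would roughly record:
        #
        #     comps = ['plone', 'folder', 'doc']
        #     # component entries: 'plone' at level 0, 'folder' at level 1,
        #     # 'doc' at level 2, plus the None terminator at level 2
        #     # _index_parents['/plone/folder']    -> IITreeSet([42])
        #     # _index_items['/plone/folder/doc']  -> 42
        #     # _unindex[42]                       -> '/plone/folder/doc'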

    def unindex_object(self, docid, _old=_marker):
        """ hook for (Z)Catalog """

        if _old is not _marker:
            old_value = _old
        else:
            old_value = self._unindex.get(docid, _marker)
            if old_value is _marker:
                logger.log(
                    logging.INFO,
                    'Attempt to unindex nonexistent object with id '
                    '%s' % docid)
                return

        # There is an assumption that paths start with /
        comps = [p for p in old_value.split('/') if p]

        def unindex(comp, level, docid=docid):
            index_comp = self._index[comp]
            index_comp[level].remove(docid)
            if not index_comp[level]:
                del index_comp[level]
            if not index_comp:
                del self._index[comp]

        try:
            for level, comp in enumerate(comps):
                unindex(comp, level)

            # Remove the terminator
            unindex(None, len(comps) - 1)

            # Remove full-path indexes
            parent_path = '/' + '/'.join(comps[:-1])
            parents = self._index_parents.get(parent_path, _marker)
            if parents is not _marker:
                parents.remove(docid)
                if not parents:
                    del self._index_parents[parent_path]
            del self._index_items['/'.join([parent_path, comps[-1]])]
        except KeyError:
            logger.log(
                logging.INFO, 'Attempt to unindex object with id '
                '%s failed' % docid)

        self._length.change(-1)
        del self._unindex[docid]

    def search(self,
               path,
               default_level=0,
               depth=-1,
               navtree=0,
               navtree_start=0,
               resultset=None):
        """
        path is either a string representing a relative URL or a part of a
        relative URL or a tuple (path, level).

        default_level specifies the level to use when no more specific level
        has been passed in with the path.

        level >= 0  starts searching at the given level
        level <  0  finds matches at *any* level

        depth lets you limit the results to items at most depth levels deeper
        than the matched path. depth == 0 means no subitems are included at
        all; with depth == 1 only direct children are included, etc.
        depth == -1, the default, returns all children at any depth.

        navtree is treated as a boolean; if it evaluates to True, not only the
        query match is returned, but also each container in the path. If depth
        is greater than 0, all siblings of those containers and of the match
        are included as well, plus *all* documents at the starting level.

        navtree_start limits what containers are included in a navtree search.
        If greater than 0, only containers (and possibly their siblings) at
        that level and up will be included in the resultset.

        """
        if isinstance(path, string_types):
            level = default_level
        else:
            level = int(path[1])
            path = path[0]

        if level < 0:
            # Search at every level, return the union of all results
            return multiunion([
                self.search(path, level, depth, navtree, navtree_start)
                for level in range(self._depth + 1)
            ])

        comps = [p for p in path.split('/') if p]

        if navtree and depth == -1:  # Navtree searches are not recursive
            depth = 1

        # Optimizations

        pathlength = level + len(comps) - 1
        if navtree and navtree_start > min(pathlength + depth, self._depth):
            # This navtree_start excludes all items that match the depth
            return IISet()

        if level == 0 and depth in (0, 1):
            # We have easy indexes for absolute paths where
            # we are looking for depth 0 or 1 result sets
            if navtree:
                # Optimized absolute path navtree and breadcrumbs cases
                result = []
                add = lambda x: x is not None and result.append(x)
                if depth == 1:
                    # Navtree case, all sibling elements along the path
                    convert = multiunion
                    index = self._index_parents
                else:
                    # Breadcrumbs case, all direct elements along the path
                    convert = IISet
                    index = self._index_items
                # Collect all results along the path
                for i in range(len(comps), navtree_start - 1, -1):
                    parent_path = '/' + '/'.join(comps[:i])
                    add(index.get(parent_path))
                return convert(result)

            if not path.startswith('/'):
                path = '/' + path
            if depth == 0:
                # Specific object search
                res = self._index_items.get(path)
                return res and IISet([res]) or IISet()
            else:
                # Single depth search
                return self._index_parents.get(path, IISet())

        # Avoid using the root set, as it is common to all objects
        # anyway and only adds overhead.
        # There is an assumption that all indexed values share the
        # same common base path.
        if level == 0:
            indexpath = [p for p in self.getPhysicalPath() if p]
            minlength = min(len(indexpath), len(comps))
            # Truncate path to first different element
            for i in range(minlength):
                if indexpath[i] != comps[i]:
                    break
                level += 1
            comps = comps[level:]

        if not comps and depth == -1:
            # Recursive search for everything
            return IISet(self._unindex)

        # Core application of the indexes
        pathset = None
        depthset = None  # For limiting depth

        if navtree and depth > 0:
            # Include the elements up to the matching path
            depthset = multiunion([
                self._index.get(None, {}).get(i, IISet())
                for i in range(min(navtree_start, level),
                               max(navtree_start, level) + 1)
            ])

        indexedcomps = enumerate(comps)
        if not navtree:
            # Optimize relative-path searches by starting with the
            # presumed smaller sets at the end of the path first
            # We can't do this for the navtree case because it needs
            # the bigger rootset to include siblings along the way.
            indexedcomps = list(indexedcomps)
            indexedcomps.reverse()

        for i, comp in indexedcomps:
            # Find all paths that have comp at the given level
            res = self._index.get(comp, {}).get(i + level)
            if res is None:
                # Non-existing path; navtree is inverse, keep going
                pathset = IISet()
                if not navtree:
                    return pathset
            pathset = intersection(pathset, res)

            if navtree and i + level >= navtree_start:
                depthset = union(
                    depthset,
                    intersection(pathset,
                                 self._index.get(None, {}).get(i + level)))

        if depth >= 0:
            # Limit results to those that terminate within depth levels
            start = len(comps) - 1
            if navtree:
                start = max(start, (navtree_start - level))
            depthset = [depthset] + [
                intersection(pathset,
                             self._index.get(None, {}).get(i + level))
                for i in range(start, start + depth + 1)
            ]
            depthset = multiunion([d for d in depthset if d])

        if navtree or depth >= 0:
            return depthset
        return pathset

    def _apply_index(self, request, resultset=None):
        """ hook for (Z)Catalog
            'request' -- mapping type (usually {"path": "..."});
             additionally a parameter "path_level" might be passed
             to specify the level (see search())
        """
        record = IndexQuery(request, self.id, self.query_options)
        if record.keys is None:
            return None
        return (self.query_index(record), (self.id, ))

    def query_index(self, record, resultset=None):
        level = record.get("level", 0)
        operator = record.get('operator', self.useOperator).lower()
        depth = getattr(record, 'depth', -1)  # use getattr to get 0 value
        navtree = record.get('navtree', 0)
        navtree_start = record.get('navtree_start', 0)

        # depending on the operator we use intersection or union
        if operator == "or":
            set_func = union
        else:
            set_func = intersection

        result = None
        for k in record.keys:
            rows = self.search(k,
                               level,
                               depth,
                               navtree,
                               navtree_start,
                               resultset=resultset)
            result = set_func(result, rows)

        if result:
            return result
        return IISet()

    def getIndexSourceNames(self):
        """ return names of indexed attributes """
        attrs = self.indexed_attrs or ('getPhysicalPath', )
        return tuple(attrs)
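
A quick usage sketch of the search() parameters documented above; the index
instance name idx, the paths and the document ids are invented for
illustration and are not part of the source:

# Hypothetical index content (docids are made up):
#   1 -> /plone/folder/doc
#   2 -> /plone/folder/sub/doc
idx.search('/plone/folder', depth=0)    # the folder itself, if indexed
idx.search('/plone/folder', depth=1)    # direct children only, e.g. IISet([1])
idx.search('/plone/folder', depth=-1)   # everything below, e.g. IISet([1, 2])
idx.search('folder', default_level=-1)  # match 'folder' at any level

# Through _apply_index()/query_index() the same options arrive as a catalog
# query, e.g. {'path': {'query': '/plone/folder', 'depth': 1, 'navtree': 1}}.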
Esempio n. 10
0
class UdbBtreeIndex(UdbIndex):
    is_prefixed = True
    is_ranged = True
    is_sorted_asc = True
    type = 'btree'

    def __init__(self, schema, name=None):
        from BTrees.OIBTree import OIBTree

        UdbIndex.__init__(self, schema, name)

        self._btree = OIBTree()

    def __len__(self):
        return len(self._btree)

    def clear(self):
        self._btree.clear()

        return self

    def delete(self, key, uid=None):
        self._btree.pop(key, EMPTY)

        return self

    def insert(self, key, uid):
        self._btree.insert(key, uid)

        return self

    def search_by_key(self, key):
        val = self._btree.get(key, EMPTY)

        if val != EMPTY:
            yield val

    def search_by_key_in(self, keys):
        for key in keys:
            val = self._btree.get(key, EMPTY)

            if val != EMPTY:
                yield val

    def search_by_key_prefix(self, key):
        for val in self._btree.values(key, key + CHAR255):
            yield val

    def search_by_key_prefix_in(self, keys):
        for key in keys:
            for val in self._btree.values(key, key + CHAR255):
                yield val

    def search_by_key_range(self,
                            gte=None,
                            lte=None,
                            gte_excluded=False,
                            lte_excluded=False):
        for val in self._btree.values(gte, lte, gte_excluded, lte_excluded):
            yield val

    def upsert(self, old, new, uid):
        if old != new:
            self._btree.pop(old)

        self._btree.insert(new, uid)

        return self
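
The prefix searches above rely on an OIBTree range scan from key up to
key + CHAR255. A self-contained sketch of the same trick, assuming CHAR255 is
a high-sorting sentinel string comparable to the constant used by the class:

from BTrees.OIBTree import OIBTree

# Assumed sentinel: a suffix that sorts after any realistic key continuation.
CHAR255 = '\xff' * 8

btree = OIBTree()
btree.update({'user:1': 10, 'user:2': 11, 'video:1': 20})

# Range scan over every key starting with 'user:' -- yields 10 and 11.
for uid in btree.values('user:', 'user:' + CHAR255):
    print(uid)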
Esempio n. 11
0
class DateIndex(UnIndex, PropertyManager):

    """Index for dates.
    """

    __implements__ = UnIndex.__implements__
    implements(IDateIndex)

    meta_type = 'DateIndex'
    query_options = ['query', 'range']

    index_naive_time_as_local = True # False means index as UTC
    _properties=({'id':'index_naive_time_as_local',
                  'type':'boolean',
                  'mode':'w'},)

    manage = manage_main = DTMLFile( 'dtml/manageDateIndex', globals() )
    manage_browse = DTMLFile('../dtml/browseIndex', globals())

    manage_main._setName( 'manage_main' )
    manage_options = ( { 'label' : 'Settings'
                       , 'action' : 'manage_main'
                       },
                       {'label': 'Browse',
                        'action': 'manage_browse',
                       },
                     ) + PropertyManager.manage_options

    def clear( self ):
        """ Complete reset """
        self._index = IOBTree()
        self._unindex = OIBTree()
        self._length = BTrees.Length.Length()

    def index_object( self, documentId, obj, threshold=None ):
        """index an object, normalizing the indexed value to an integer

           o Normalized value has granularity of one minute.

           o Objects which have 'None' as indexed value are *omitted*,
             by design.
        """
        returnStatus = 0

        try:
            date_attr = getattr( obj, self.id )
            if safe_callable( date_attr ):
                date_attr = date_attr()

            ConvertedDate = self._convert( value=date_attr, default=_marker )
        except AttributeError:
            ConvertedDate = _marker

        oldConvertedDate = self._unindex.get( documentId, _marker )

        if ConvertedDate != oldConvertedDate:
            if oldConvertedDate is not _marker:
                self.removeForwardIndexEntry(oldConvertedDate, documentId)
                if ConvertedDate is _marker:
                    try:
                        del self._unindex[documentId]
                    except ConflictError:
                        raise
                    except:
                        logger.error(
                            ("Should not happen: ConvertedDate was there,"
                             " now it's not, for document with id %s" %
                             documentId))

            if ConvertedDate is not _marker:
                self.insertForwardIndexEntry( ConvertedDate, documentId )
                self._unindex[documentId] = ConvertedDate

            returnStatus = 1

        return returnStatus

    def _apply_index( self, request, cid='', type=type ):
        """Apply the index to query parameters given in the argument

        Normalize the 'query' arguments into integer values at minute
        precision before querying.
        """
        record = parseIndexRequest( request, self.id, self.query_options )
        if record.keys == None:
            return None

        keys = map( self._convert, record.keys )

        index = self._index
        r = None
        opr = None

        # experimental code for specifying the operator
        operator = record.get( 'operator', self.useOperator )
        if operator not in self.operators:
            raise RuntimeError, "operator not valid: %s" % operator

        # depending on the operator we use intersection or union
        if operator=="or":
            set_func = union
        else:
            set_func = intersection

        # range parameter
        range_arg = record.get('range',None)
        if range_arg:
            opr = "range"
            opr_args = []
            if range_arg.find("min") > -1:
                opr_args.append("min")
            if range_arg.find("max") > -1:
                opr_args.append("max")

        if record.get('usage',None):
            # see if any usage params are sent to field
            opr = record.usage.lower().split(':')
            opr, opr_args = opr[0], opr[1:]

        if opr=="range":   # range search
            if 'min' in opr_args:
                lo = min(keys)
            else:
                lo = None

            if 'max' in opr_args:
                hi = max(keys)
            else:
                hi = None

            if hi:
                setlist = index.values(lo,hi)
            else:
                setlist = index.values(lo)

            # Combine the docid sets for the whole range in one go.
            r = multiunion(setlist)

        else: # not a range search
            for key in keys:
                set = index.get(key, None)
                if set is not None:
                    if type(set) is IntType:
                        set = IISet((set,))
                    r = set_func(r, set)

        if type(r) is IntType:
            r = IISet((r,))

        if r is None:
            return IISet(), (self.id,)
        else:
            return r, (self.id,)

    def _convert( self, value, default=None ):
        """Convert Date/Time value to our internal representation"""
        # XXX: Code patched 20/May/2003 by Kiran Jonnalagadda to
        # convert dates to UTC first.
        if isinstance( value, DateTime ):
            t_tup = value.toZone('UTC').parts()
        elif type( value ) in (FloatType, IntType):
            t_tup = time.gmtime( value )
        elif type( value ) is StringType and value:
            t_obj = DateTime( value ).toZone('UTC')
            t_tup = t_obj.parts()
        elif type( value ) is date:
            t_tup = value.timetuple()
        elif type( value ) is datetime:
            if self.index_naive_time_as_local and value.tzinfo is None:
                value = value.replace(tzinfo=Local)
            # else if tzinfo is None, naive time interpreted as UTC
            t_tup = value.utctimetuple()
        else:
            return default

        yr = t_tup[0]
        mo = t_tup[1]
        dy = t_tup[2]
        hr = t_tup[3]
        mn = t_tup[4]

        t_val = ( ( ( ( yr * 12 + mo ) * 31 + dy ) * 24 + hr ) * 60 + mn )

        if isinstance(t_val, long):
            # t_val must be IntType, not LongType
            raise OverflowError, (
                "%s is not within the range of indexable dates (index: %s)"
                % (value, self.id))

        return t_val
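
A worked example of the minute-granularity packing formula used by _convert()
above; the date is arbitrary:

# 2003-05-20 14:30 UTC
yr, mo, dy, hr, mn = 2003, 5, 20, 14, 30
t_val = ((((yr * 12 + mo) * 31 + dy) * 24 + hr) * 60 + mn)
# ((((2003 * 12 + 5) * 31 + 20) * 24 + 14) * 60 + 30) == 1073219910, which
# still fits in a 32-bit signed integer, as the OverflowError check requires.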
Esempio n. 12
0
    def index_object(self, documentId, obj, threshold=None):
        """ Index an object:
        'documentId' is the integer id of the document

        'obj' is the object to be indexed

        'threshold' is the number of words to process between
        committing subtransactions.  If 'None' subtransactions are
        disabled. """

        # sniff the object for our 'id', the 'document source' of the
        # index is this attribute.  If it smells callable, call it.
        try:
            source = getattr(obj, self.id)
            if safe_callable(source):
                source = source()

            if not isinstance(source, UnicodeType):
                source = str(source)

        except (AttributeError, TypeError):
            return 0

        # sniff the object for 'id'+'_encoding'

        try:
            encoding = getattr(obj, self.id + '_encoding')
            if safe_callable(encoding):
                encoding = str(encoding())
            else:
                encoding = str(encoding)
        except (AttributeError, TypeError):
            encoding = 'latin1'

        lexicon = self.getLexicon()

        splitter = lexicon.Splitter

        wordScores = OIBTree()
        last = None

        # Run through the words and score them

        for word in list(splitter(source, encoding=encoding)):
            if word[0] == '\"':
                last = self._subindex(word[1:-1], wordScores, last, splitter)
            else:
                if word == last: continue
                last = word
                wordScores[word] = wordScores.get(word, 0) + 1

        # Convert scores to use wids:
        widScores = IIBucket()
        getWid = lexicon.getWordId
        for word, score in wordScores.items():
            widScores[getWid(word)] = score

        del wordScores

        currentWids = IISet(self._unindex.get(documentId, []))

        # Get rid of document words that are no longer indexed
        self.unindex_objectWids(documentId, difference(currentWids, widScores))

        # Now index the words. Note that the new xIBTrees are clever
        # enough to do nothing when there isn't a change. Woo hoo.
        insert = self.insertForwardIndexEntry
        for wid, score in widScores.items():
            insert(wid, documentId, score)

        # Save the unindexing info if it's changed:
        wids = widScores.keys()
        if wids != currentWids.keys():
            self._unindex[documentId] = wids

        return len(wids)
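
A stripped-down sketch of the word-scoring loop above, with a plain whitespace
split standing in for the lexicon's Splitter (an assumption) and without the
quoted-phrase handling:

from BTrees.OIBTree import OIBTree

def splitter(text):
    # Stand-in for lexicon.Splitter: lowercase whitespace split.
    return text.lower().split()

wordScores = OIBTree()
last = None
for word in splitter("the quick quick brown fox"):
    if word == last:   # immediate repeats are skipped, as in the code above
        continue
    last = word
    wordScores[word] = wordScores.get(word, 0) + 1

print(dict(wordScores.items()))
# {'brown': 1, 'fox': 1, 'quick': 1, 'the': 1}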
Esempio n. 13
0
class Indexer(object):

    filestorage = database = connection = root = None

    def __init__(self, datafs, writable=0, trans=0, pack=0):
        self.trans_limit = trans
        self.pack_limit = pack
        self.trans_count = 0
        self.pack_count = 0
        self.stopdict = get_stopdict()
        self.mh = mhlib.MH()
        self.filestorage = FileStorage(datafs, read_only=(not writable))
        self.database = DB(self.filestorage)
        self.connection = self.database.open()
        self.root = self.connection.root()
        try:
            self.index = self.root["index"]
        except KeyError:
            self.index = self.root["index"] = TextIndexWrapper()
        try:
            self.docpaths = self.root["docpaths"]
        except KeyError:
            self.docpaths = self.root["docpaths"] = IOBTree()
        try:
            self.doctimes = self.root["doctimes"]
        except KeyError:
            self.doctimes = self.root["doctimes"] = IIBTree()
        try:
            self.watchfolders = self.root["watchfolders"]
        except KeyError:
            self.watchfolders = self.root["watchfolders"] = {}
        self.path2docid = OIBTree()
        for docid in self.docpaths.keys():
            path = self.docpaths[docid]
            self.path2docid[path] = docid
        try:
            self.maxdocid = max(self.docpaths.keys())
        except ValueError:
            self.maxdocid = 0
        print(len(self.docpaths), "Document ids")
        print(len(self.path2docid), "Pathnames")
        print(self.index.lexicon.length(), "Words")

    def dumpfreqs(self):
        lexicon = self.index.lexicon
        index = self.index.index
        assert isinstance(index, OkapiIndex)
        L = []
        for wid in lexicon.wids():
            freq = 0
            for f in index._wordinfo.get(wid, {}).values():
                freq += f
            L.append((freq, wid, lexicon.get_word(wid)))
        L.sort()
        L.reverse()
        for freq, wid, word in L:
            print("%10d %10d %s" % (wid, freq, word))

    def dumpwids(self):
        lexicon = self.index.lexicon
        index = self.index.index
        assert isinstance(index, OkapiIndex)
        for wid in lexicon.wids():
            freq = 0
            for f in index._wordinfo.get(wid, {}).values():
                freq += f
            print("%10d %10d %s" % (wid, freq, lexicon.get_word(wid)))

    def dumpwords(self):
        lexicon = self.index.lexicon
        index = self.index.index
        assert isinstance(index, OkapiIndex)
        for word in lexicon.words():
            wid = lexicon.get_wid(word)
            freq = 0
            for f in index._wordinfo.get(wid, {}).values():
                freq += f
            print("%10d %10d %s" % (wid, freq, word))

    def close(self):
        self.root = None
        if self.connection is not None:
            self.connection.close()
            self.connection = None
        if self.database is not None:
            self.database.close()
            self.database = None
        if self.filestorage is not None:
            self.filestorage.close()
            self.filestorage = None

    def interact(self, nbest=NBEST, maxlines=MAXLINES):
        try:
            import readline
        except ImportError:
            pass
        text = ""
        top = 0
        results = []
        while 1:
            try:
                line = raw_input("Query: ")
            except EOFError:
                print("\nBye.")
                break
            line = line.strip()
            if line.startswith("/"):
                self.specialcommand(line, results, top - nbest)
                continue
            if line:
                text = line
                top = 0
            else:
                if not text:
                    continue
            try:
                results, n = self.timequery(text, top + nbest)
            except KeyboardInterrupt:
                raise
            except:
                reportexc()
                text = ""
                continue
            if len(results) <= top:
                if not n:
                    print("No hits for %r." % text)
                else:
                    print("No more hits for %r." % text)
                text = ""
                continue
            print("[Results %d-%d from %d" % (top+1, min(n, top+nbest), n), end=' ')
            print("for query %s]" % repr(text))
            self.formatresults(text, results, maxlines, top, top+nbest)
            top += nbest

    def specialcommand(self, line, results, first):
        assert line.startswith("/")
        line = line[1:]
        if not line:
            n = first
        else:
            try:
                n = int(line) - 1
            except:
                print("Huh?")
                return
        if n < 0 or n >= len(results):
            print("Out of range")
            return
        docid, score = results[n]
        path = self.docpaths[docid]
        i = path.rfind("/")
        assert i > 0
        folder = path[:i]
        n = path[i+1:]
        cmd = "show +%s %s" % (folder, n)
        if os.getenv("DISPLAY"):
            os.system("xterm -e  sh -c '%s | less' &" % cmd)
        else:
            os.system(cmd)

    def query(self, text, nbest=NBEST, maxlines=MAXLINES):
        results, n = self.timequery(text, nbest)
        if not n:
            print("No hits for %r." % text)
            return
        print("[Results 1-%d from %d]" % (len(results), n))
        self.formatresults(text, results, maxlines)

    def timequery(self, text, nbest):
        t0 = time.time()
        c0 = time.clock()
        results, n = self.index.query(text, 0, nbest)
        t1 = time.time()
        c1 = time.clock()
        print("[Query time: %.3f real, %.3f user]" % (t1-t0, c1-c0))
        return results, n

    def formatresults(self, text, results, maxlines=MAXLINES,
                      lo=0, hi=sys.maxint):
        stop = self.stopdict.has_key
        words = [w for w in re.findall(r"\w+\*?", text.lower()) if not stop(w)]
        pattern = r"\b(" + "|".join(words) + r")\b"
        pattern = pattern.replace("*", ".*") # glob -> re syntax
        prog = re.compile(pattern, re.IGNORECASE)
        print('='*70)
        rank = lo
        for docid, score in results[lo:hi]:
            rank += 1
            path = self.docpaths[docid]
            score *= 100.0
            print("Rank:    %d   Score: %d%%   File: %s" % (rank, score, path))
            path = os.path.join(self.mh.getpath(), path)
            try:
                fp = open(path)
            except (IOError, OSError) as msg:
                print("Can't open:", msg)
                continue
            msg = mhlib.Message("<folder>", 0, fp)
            for header in "From", "To", "Cc", "Bcc", "Subject", "Date":
                h = msg.getheader(header)
                if h:
                    print("%-8s %s" % (header+":", h))
            text = self.getmessagetext(msg)
            if text:
                print()
                nleft = maxlines
                for part in text:
                    for line in part.splitlines():
                        if prog.search(line):
                            print(line)
                            nleft -= 1
                            if nleft <= 0:
                                break
                    if nleft <= 0:
                        break
            print('-'*70)

    def update(self, args):
        folder = None
        seqs = []

        for arg in args:
            if arg.startswith("+"):
                if folder is None:
                    folder = arg[1:]
                else:
                    print("only one folder at a time")
                    return
            else:
                seqs.append(arg)

        if not folder:
            folder = self.mh.getcontext()
        if not seqs:
            seqs = ['all']

        try:
            f = self.mh.openfolder(folder)
        except mhlib.Error as msg:
            print(msg)
            return

        dict = {}
        for seq in seqs:
            try:
                nums = f.parsesequence(seq)
            except mhlib.Error as msg:
                print(msg or "unparsable message sequence: %s" % repr(seq))
                return
            for n in nums:
                dict[n] = n
        msgs = dict.keys()
        msgs.sort()

        self.updatefolder(f, msgs)
        self.commit()

    def optimize(self, args):
        uniqwords = {}
        for folder in args:
            if folder.startswith("+"):
                folder = folder[1:]
            print("\nOPTIMIZE FOLDER", folder)
            try:
                f = self.mh.openfolder(folder)
            except mhlib.Error as msg:
                print(msg)
                continue
            self.prescan(f, f.listmessages(), uniqwords)
        L = [(uniqwords[word], word) for word in uniqwords.keys()]
        L.sort()
        L.reverse()
        for i in range(100):
            print("%3d. %6d %s" % ((i+1,) + L[i]))
        self.index.lexicon.sourceToWordIds([word for (count, word) in L])

    def prescan(self, f, msgs, uniqwords):
        pipeline = [Splitter(), CaseNormalizer(), StopWordRemover()]
        for n in msgs:
            print("prescanning", n)
            m = f.openmessage(n)
            text = self.getmessagetext(m, f.name)
            for p in pipeline:
                text = p.process(text)
            for word in text:
                uniqwords[word] = uniqwords.get(word, 0) + 1

    def bulkupdate(self, args):
        if not args:
            print("No folders specified; use ALL to bulk-index all folders")
            return
        if "ALL" in args:
            i = args.index("ALL")
            args[i:i+1] = self.mh.listfolders()
        for folder in args:
            if folder.startswith("+"):
                folder = folder[1:]
            print("\nFOLDER", folder)
            try:
                f = self.mh.openfolder(folder)
            except mhlib.Error as msg:
                print(msg)
                continue
            self.updatefolder(f, f.listmessages())
            print("Total", len(self.docpaths))
        self.commit()
        print("Indexed", self.index.lexicon._nbytes, "bytes and", end=' ')
        print(self.index.lexicon._nwords, "words;", end=' ')
        print(len(self.index.lexicon._words), "unique words.")

    def updatefolder(self, f, msgs):
        self.watchfolders[f.name] = self.getmtime(f.name)
        for n in msgs:
            path = "%s/%s" % (f.name, n)
            docid = self.path2docid.get(path, 0)
            if docid and self.getmtime(path) == self.doctimes.get(docid, 0):
                print("unchanged", docid, path)
                continue
            docid = self.newdocid(path)
            try:
                m = f.openmessage(n)
            except IOError:
                print("disappeared", docid, path)
                self.unindexpath(path)
                continue
            text = self.getmessagetext(m, f.name)
            if not text:
                self.unindexpath(path)
                continue
            print("indexing", docid, path)
            self.index.index_doc(docid, text)
            self.maycommit()
        # Remove messages from the folder that no longer exist
        for path in list(self.path2docid.keys(f.name)):
            if not path.startswith(f.name + "/"):
                break
            if self.getmtime(path) == 0:
                self.unindexpath(path)
        print("done.")

    def unindexpath(self, path):
        if path in self.path2docid:
            docid = self.path2docid[path]
            print("unindexing", docid, path)
            del self.docpaths[docid]
            del self.doctimes[docid]
            del self.path2docid[path]
            try:
                self.index.unindex_doc(docid)
            except KeyError as msg:
                print("KeyError", msg)
            self.maycommit()

    def getmessagetext(self, m, name=None):
        L = []
        if name:
            L.append("_folder " + name) # To restrict search to a folder
            self.getheaders(m, L)
        try:
            self.getmsgparts(m, L, 0)
        except KeyboardInterrupt:
            raise
        except:
            print("(getmsgparts failed:)")
            reportexc()
        return L

    def getmsgparts(self, m, L, level):
        ctype = m.gettype()
        if level or ctype != "text/plain":
            print(". "*level + str(ctype))
        if ctype == "text/plain":
            L.append(m.getbodytext())
        elif ctype in ("multipart/alternative", "multipart/mixed"):
            for part in m.getbodyparts():
                self.getmsgparts(part, L, level+1)
        elif ctype == "message/rfc822":
            f = StringIO(m.getbodytext())
            m = mhlib.Message("<folder>", 0, f)
            self.getheaders(m, L)
            self.getmsgparts(m, L, level+1)

    def getheaders(self, m, L):
        H = []
        for key in "from", "to", "cc", "bcc", "subject":
            value = m.get(key)
            if value:
                H.append(value)
        if H:
            L.append("\n".join(H))

    def newdocid(self, path):
        docid = self.path2docid.get(path)
        if docid is not None:
            self.doctimes[docid] = self.getmtime(path)
            return docid
        docid = self.maxdocid + 1
        self.maxdocid = docid
        self.docpaths[docid] = path
        self.doctimes[docid] = self.getmtime(path)
        self.path2docid[path] = docid
        return docid

    def getmtime(self, path):
        path = os.path.join(self.mh.getpath(), path)
        try:
            st = os.stat(path)
        except os.error as msg:
            return 0
        return int(st[ST_MTIME])

    def maycommit(self):
        self.trans_count += 1
        if self.trans_count >= self.trans_limit > 0:
            self.commit()

    def commit(self):
        if self.trans_count > 0:
            print("committing...")
            transaction.commit()
            self.trans_count = 0
            self.pack_count += 1
            if self.pack_count >= self.pack_limit > 0:
                self.pack()

    def pack(self):
        if self.pack_count > 0:
            print("packing...")
            self.database.pack()
            self.pack_count = 0
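
maycommit() and commit() batch work into transactions: the chained comparison
count >= limit > 0 only fires once a positive limit is configured and reached,
and every pack_limit commits the storage is packed. A minimal, ZODB-free
sketch of the same pattern; the class and method names are illustrative:

class BatchCommitter(object):
    def __init__(self, trans_limit=0):
        self.trans_limit = trans_limit   # 0 disables batched commits
        self.trans_count = 0

    def record_change(self):
        self.trans_count += 1
        # Chained comparison: commit only when a positive limit is reached.
        if self.trans_count >= self.trans_limit > 0:
            self.commit()

    def commit(self):
        print("committing after %d changes" % self.trans_count)
        self.trans_count = 0

b = BatchCommitter(trans_limit=3)
for _ in range(7):
    b.record_change()    # commits after the 3rd and 6th change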
Esempio n. 14
0
class LinkCheckTool(SimpleItem):
    security = ClassSecurityInfo()

    def __init__(self, id=None):
        super(LinkCheckTool, self).__init__(id)

        # This is the work queue; items in this queue are scheduled
        # for link validity check.
        self.queue = CompositeQueue()

        # Additional queue for internal crawler to revalidate the site
        self.crawl_queue = CompositeQueue()

        # This is the link database. It maps a hyperlink index to a
        # tuple (timestamp, status, referers).
        self.checked = IOBTree()

        # Indexes
        self.index = OIBTree()
        self.links = IOBTree()

        # This is a counter that allows us to add new hyperlinks and
        # provide an index quickly.
        self.counter = 0

    security.declarePrivate("is_available")

    def is_available(self):
        return hasattr(self, 'index') and \
               hasattr(self, 'checked') and \
               hasattr(self, 'queue') and \
               hasattr(self, 'counter')

    security.declarePrivate("clear")

    def clear(self):
        while True:
            try:
                self.queue.pull()
            except IndexError:
                break
        while True:
            try:
                self.crawl_queue.pull()
            except IndexError:
                break

        self.checked.clear()
        self.index.clear()
        self.links.clear()
        self.counter = 0

    security.declarePrivate("crawl")

    def crawl(self):
        self.clear()
        query = {}
        registry = getUtility(IRegistry)
        settings = registry.forInterface(ISettings)
        if settings.content_types:
            query['portal_type'] = settings.content_types

        if settings.workflow_states:
            query['review_state'] = settings.workflow_states

        catalog = api.portal.get_tool('portal_catalog')
        brains = catalog(query)
        for brain in brains:
            # asynchronous crawling not working yet
            # self.crawl_enqueue(brain.UID)

            obj = brain.getObject()
            obj.restrictedTraverse('@@linkcheck')()
            logger.info('Crawling: checked {0}'.format(brain.getURL()))

    security.declarePrivate("enqueue")

    def enqueue(self, url):
        index = self.index.get(url)
        if index is None:
            # a really new url
            index = self.store(url)
        else:
            entry = self.checked.get(index)
            if entry is not None and entry:
                entry = None, entry[1], entry[2]
                self.checked[index] = entry
            else:
                # reset empty entry
                self.remove(url)
                index = self.store(url)
        self.queue.put(index)
        return index

    security.declarePrivate("register")

    def register(self, hrefs, referer, timestamp):
        """Add or update link presence information.

        If a link has not been checked since the provided timestamp,
        or is not yet in the database, it will be added to the queue.
        """

        referer = self.index.get(referer) or self.store(referer)

        registry = getUtility(IRegistry, context=self.aq_parent)
        try:
            settings = registry.forInterface(ISettings)
        except KeyError as exc:
            logger.warn(exc)
            return

        limit = settings.referers

        for href in hrefs:
            if self.should_ignore(href, settings.ignore_list):
                continue

            # If the hyperlink is not already in the work queue,
            # compare the provided timestamp to our database to see if
            # we need to check its validity. Note that internal links
            # are exempt if we're not using the publisher.
            index = self.index.get(href)
            entry = self.checked.get(-1 if index is None else index)

            if index not in self.queue:
                if entry is None or entry[0] < timestamp:
                    if settings.use_publisher or not href.startswith('/'):
                        index = self.enqueue(href)
                    elif href not in self.index:
                        index = self.store(href)

            assert index is not None

            if entry is None:
                self.checked[index] = None, None, IISet((referer, ))
            else:
                # If the provided paths are a subset of the already
                # seen paths, and if there is no new referer, we don't
                # issue an update.
                referers = entry[2]
                if referer not in referers and len(referers) <= limit:
                    referers.add(referer)

    security.declarePrivate("store")

    def store(self, url):
        index = self.index[url] = self.counter
        self.links[index] = url
        self.counter += 1
        return index

    security.declarePrivate("remove")

    def remove(self, url):
        index = self.index.get(url)
        if url in self.index:
            del self.index[url]
        if index and index in self.checked:
            del self.checked[index]

    security.declarePrivate("update")

    def update(self, href, status):
        """Update link status."""
        now = datetime.datetime.now()
        timestamp = int(time.mktime(now.timetuple()))

        index = self.index.get(href)
        if index is None:
            return

        entry = self.checked.get(-1 if index is None else index)
        if entry is None:
            self.checked[index] = timestamp, status, IISet()

        # If the status changed, we update the entry.
        elif status != entry[1] or not entry[0]:

            # If the status was previously good, then we clear the
            # status. What this means is that we'll wait for the next
            # check to declare a bad status (it might be temporary).
            if entry[1] == 200:
                status = None

            self.checked[index] = timestamp, status, entry[2]

    @cache(lambda method, self, ignore_list: ignore_list)
    def get_matchers(self, ignore_list):
        matchers = []
        for expression in ignore_list:
            try:
                matcher = re.compile(expression).search
            except re.error:
                pass
            else:
                matchers.append(matcher)

        return matchers

    def should_ignore(self, href, ignore_list):
        for matcher in self.get_matchers(ignore_list):
            if matcher(href):
                return True

        return False

    def crawl_enqueue(self, obj):
        if not isinstance(obj, basestring):
            obj = obj.UID()
        self.crawl_queue.put(obj)

    def crawl_dequeue(self):
        if self.crawl_queue._data:
            return self.crawl_queue.pull()
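
A miniature sketch of the three structures the tool maintains: store() fills
the url/index maps, and update() later replaces the checked entry with a
timestamp and status. The URL, link index, referer index and timestamp are
invented for illustration:

from BTrees.IIBTree import IISet
from BTrees.IOBTree import IOBTree
from BTrees.OIBTree import OIBTree

index, links, checked = OIBTree(), IOBTree(), IOBTree()

counter = 0
url = 'http://example.org/page'
index[url] = counter                           # store(): url -> link index
links[counter] = url                           # ... and the reverse map
checked[counter] = (None, None, IISet((7,)))   # unchecked, one referer (7)

# After a check, update() stores the timestamp and HTTP status, keeping the
# referer set:
checked[counter] = (1455812345, 404, checked[counter][2])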
Esempio n. 15
0
class Table(Persistent):
    """Simple, generic relational table.
    """
    schema = None
    _v_record_class = None

    def __init__(self, schema=None):
        if schema is not None:
            self.schema = schema
        columns = schema.get_columns()
        self.col_info = []  # [(tuple position, column),]
        self.positions = {}
        for i in range(len(columns)):
            # Leave space for the record ID at position 0.
            position = i + 1
            self.col_info.append((position, columns[i]))
            self.positions[columns[i].name] = position
        self.proto_record = [None] * (len(columns) + 1)
        self.next_rid = 1
        self.clear()

    def clear(self):
        self.data = IOBTree()  # {rid -> record as tuple}
        self.indexes = {}  # {index_name -> OOBTree({value -> IITreeSet})}
        self.primary_index = OIBTree()  # {primary key -> rid}
        for position, column in self.col_info:
            if column.indexed:
                self.indexes[column.name] = OOBTree()

    def tuplify(self, params):
        """Accepts a mapping-like object and returns a tuple.
        """
        record = self.proto_record[:]
        positions = self.positions
        if hasattr(params, '__record_schema__'):
            for name in params.__record_schema__.keys():
                position = positions[name]
                record[position] = params[name]
        else:
            for name, value in params.items():
                position = positions[name]
                record[position] = value
        return tuple(record)

    def insert(self, params):
        record = self.tuplify(params)

        # Determine the primary key.
        primary_key = []
        for position, column in self.col_info:
            if column.primary:
                if record[position] is None:
                    raise ValueError, (
                        "No value provided for primary key column %s" %
                        repr(column.name))
                primary_key.append(record[position])
        if primary_key:
            primary_key = tuple(primary_key)
            if self.primary_index.has_key(primary_key):
                raise DuplicateError("Primary key %s in use" %
                                     repr(primary_key))

        # Add a record.
        rid = self.next_rid
        self.next_rid += 1  # XXX Hotspot!
        record = (rid, ) + record[1:]
        self.data[rid] = record
        if primary_key:
            self.primary_index[primary_key] = rid

        # Add to indexes.
        for position, column in self.col_info:
            name = column.name
            value = record[position]
            if value is not None:
                if self.indexes.has_key(name):
                    set = self.indexes[name].get(value)
                    if set is None:
                        set = IITreeSet()
                        self.indexes[name][value] = set
                    set.insert(rid)

        # Return the number of rows inserted.
        return 1

    def delete(self, filter):
        rids = self._select_rids(self.tuplify(filter))
        if rids is None:
            # Zap everything
            count = len(self.data)
            self.clear()
            return count
        elif not rids:
            # No rows selected
            return 0

        rids = tuple(rids)  # Make sure rids is a static sequence
        for rid in rids:
            old_r = self.data[rid]
            assert old_r[0] == rid
            primary_key = []
            for position, column in self.col_info:
                old_value = old_r[position]
                if old_value is not None:
                    if column.primary:
                        primary_key.append(old_value)
                    # Remove from indexes.
                    index = self.indexes.get(column.name)
                    if index is not None:
                        if index.has_key(old_value):
                            # Remove an index entry.
                            set = index[old_value]
                            set.remove(rid)
                            if not set:
                                del index[old_value]

            if primary_key:
                # Remove a primary key.
                primary_key = tuple(primary_key)
                assert self.primary_index[primary_key] == rid
                del self.primary_index[primary_key]

            # Remove the data.
            del self.data[rid]

        return len(rids)

    def update(self, filter, changes):
        rids = self._select_rids(self.tuplify(filter))
        if rids is None:
            rids = self.data.keys()
        elif not rids:
            # Nothing needs to be updated.
            return 0
        count = len(rids)

        # Identify changes.
        old_data = {}  # rid -> old tuple
        new_data = {}  # rid -> new tuple
        old_to_new = {}  # old primary key -> new primary key
        new_to_rid = {}  # new primary key -> rid

        record = self.tuplify(changes)
        for rid in rids:
            old_r = self.data[rid]
            old_data[rid] = old_r
            new_r = list(old_r)
            # new_r and old_r contain record tuples.
            for position, column in self.col_info:
                if record[position] is not None:
                    new_r[position] = record[position]
            new_data[rid] = tuple(new_r)
            # Hmm.  The code below allows an update to change the primary
            # key.  It might be better to prevent primary key columns from
            # being changed by an update() call.
            opk = []
            npk = []
            for position, column in self.col_info:
                if column.primary:
                    opk.append(old_r[position])
                    npk.append(new_r[position])
            if opk != npk:
                opk = tuple(opk)
                npk = tuple(npk)
                old_to_new[opk] = npk
                new_to_rid[npk] = rid

        # Look for primary key conflicts.  A primary key conflict can
        # occur when changing a record to a different primary key and
        # the new primary key is already in use.
        for pk in old_to_new.values():
            if (self.primary_index.has_key(pk) and not old_to_new.has_key(pk)):
                raise DuplicateError("Primary key %s in use" % repr(pk))

        # Update the data.
        self.data.update(new_data)

        # Remove old primary key indexes and insert new primary key indexes.
        for pk in old_to_new.keys():
            del self.primary_index[pk]
        self.primary_index.update(new_to_rid)

        # Update indexes.
        for rid, old_r in old_data.items():
            for position, column in self.col_info:
                index = self.indexes.get(column.name)
                if index is not None:
                    new_value = record[position]
                    old_value = old_r[position]
                    if new_value != old_value:
                        if old_value is not None and index.has_key(old_value):
                            # Remove an index entry.
                            set = index[old_value]
                            set.remove(rid)
                            if not set:
                                del index[old_value]
                        if new_value is not None:
                            # Add an index entry.
                            set = index.get(new_value)
                            if set is None:
                                set = IITreeSet()
                                index[new_value] = set
                            set.insert(rid)

        # Return the number of rows affected.
        return count

    def get_record_class(self):
        klass = self._v_record_class
        if klass is None:
            schema = {'rid': 0}
            for position, column in self.col_info:
                schema[column.name] = position

            class TableRecord(TableRecordMixin, Record):
                __record_schema__ = schema

            self._v_record_class = klass = TableRecord
        return klass

    def select(self, filter):
        rids = self._select_rids(self.tuplify(filter))
        if rids is None:
            # All
            klass = self.get_record_class()
            return [klass(rec) for rec in self.data.values()]
        elif rids:
            # Some
            klass = self.get_record_class()
            data = self.data
            return [klass(data[rid]) for rid in rids]
        else:
            # None
            return []

    def _select_rids(self, query):
        """Searches the table for matches, returning record ids.

        Returns a sequence of record ids, or None for all records.
        """
        primary_key = []
        params = 0  # The number of parameters specified
        primary_params = 0  # The number of primary params specified
        for position, column in self.col_info:
            value = query[position]
            if value is not None:
                params += 1
                if column.primary:
                    primary_params += 1
                    if primary_key is not None:
                        primary_key.append(value)
            elif column.primary:
                # Didn't fully specify the primary key.
                # Can't search by primary key.
                primary_key = None

        if not params:
            # No query.  Select all.
            return None

        # First strategy: try to satisfy the request by consulting
        # the primary key index.
        if primary_key:
            # The primary key is complete.  The result set will have
            # either zero rows or one row.
            primary_key = tuple(primary_key)
            rid = self.primary_index.get(primary_key)
            if rid is None:
                return ()
            # Possibly filter out the single item.
            if params > primary_params:
                cand = self.data[rid]
                for position, column in self.col_info:
                    if query[position] is not None:
                        if cand[position] != query[position]:
                            # Not a match.
                            return ()
            return (rid, )

        # Second strategy: try to satisfy the request by intersecting
        # indexes.
        rids = None
        iteration_filters = []
        for position, column in self.col_info:
            value = query[position]
            if value is not None:
                index = self.indexes.get(column.name)
                if index is None:
                    iteration_filters.append((position, value))
                else:
                    set = index.get(value)
                    if set is None:
                        # No rows satisfy this criterion.
                        return ()
                    if rids is None:
                        rids = set
                    else:
                        rids = intersection(rids, set)
                    if not rids:
                        # No rows satisfy all criteria.
                        return ()
        if rids is not None:
            rids = rids.keys()

        if not iteration_filters:
            # Indexes did all the work.  No need to search each record.
            return rids

        # Fallback strategy: Eliminate items one by one.
        if rids is None:
            # Use the whole data set.
            candidates = self.data.values()
        else:
            # Use the specified records.
            candidates = [self.data[rid] for rid in rids]

        rids = []
        append = rids.append
        for cand in candidates:
            for position, value in iteration_filters:
                if cand[position] != value:
                    # Not a match.
                    break
            else:
                # A match.
                append(cand[0])
        return rids

    def __repr__(self):
        return "<%s(schema=%s)>" % (self.__class__.__name__, repr(self.schema))
Esempio n. 16
0
class Lexicon(Persistent, Implicit):
    """Maps words to word ids and then some

    The Lexicon object is an attempt to abstract vocabularies out of
    Text indexes.  This abstraction is not totally cooked yet; this
    module still includes the parser for the 'Text Index Query
    Language' and a few other hacks.

    """

    # default for older objects
    stop_syn={}

    def __init__(self, stop_syn=None,useSplitter=None,extra=None):

        self.clear()
        if stop_syn is None:
            self.stop_syn = {}
        else:
            self.stop_syn = stop_syn

        self.useSplitter = Splitter.splitterNames[0]
        if useSplitter: self.useSplitter=useSplitter
        self.splitterParams = extra
        self.SplitterFunc = Splitter.getSplitter(self.useSplitter)


    def clear(self):
        self._lexicon = OIBTree()
        self._inverseLex = IOBTree()

    def _convertBTrees(self, threshold=200):
        if (type(self._lexicon) is OIBTree and
            type(getattr(self, '_inverseLex', None)) is IOBTree):
            return

        from BTrees.convert import convert

        lexicon=self._lexicon
        self._lexicon=OIBTree()
        self._lexicon._p_jar=self._p_jar
        convert(lexicon, self._lexicon, threshold)

        try:
            inverseLex=self._inverseLex
            self._inverseLex=IOBTree()
        except AttributeError:
            # older lexicons didn't have an inverse lexicon
            self._inverseLex=IOBTree()
            inverseLex=self._inverseLex

        self._inverseLex._p_jar=self._p_jar
        convert(inverseLex, self._inverseLex, threshold)

    def set_stop_syn(self, stop_syn):
        """ pass in a mapping of stopwords and synonyms.  Format is:

        {'word' : [syn1, syn2, ..., synx]}

        Vocabularies do not necessarily need to implement this if their
        splitters do not support stemming or stopping.

        """
        self.stop_syn = stop_syn


    def getWordId(self, word):
        """ return the word id of 'word' """

        wid=self._lexicon.get(word, None)
        if wid is None:
            wid=self.assignWordId(word)
        return wid

    set = getWordId

    def getWord(self, wid):
        """ post-2.3.1b2 method, will not work with unconverted lexicons """
        return self._inverseLex.get(wid, None)

    def assignWordId(self, word):
        """Assigns a new word id to the provided word and returns it."""
        # First make sure it's not already in there
        if self._lexicon.has_key(word):
            return self._lexicon[word]


        try: inverse=self._inverseLex
        except AttributeError:
            # whoops, old lexicon without wids
            inverse=self._inverseLex=IOBTree()
            for word, wid in self._lexicon.items():
                inverse[wid]=word

        wid=randid()
        while not inverse.insert(wid, word):
            wid=randid()

        if isinstance(word,StringType):
            self._lexicon[intern(word)] = wid
        else:
            self._lexicon[word] = wid


        return wid


    def get(self, key, default=None):
        """Return the matched word against the key."""
        r=IISet()
        wid=self._lexicon.get(key, default)
        if wid is not None: r.insert(wid)
        return r

    def __getitem__(self, key):
        return self.get(key)


    def __len__(self):
        return len(self._lexicon)


    def Splitter(self, astring, words=None, encoding = "latin1"):
        """ wrap the splitter """
        if words is None: words = self.stop_syn

        try:
            return self.SplitterFunc(
                    astring,
                    words,
                    encoding=encoding,
                    singlechar=self.splitterParams.splitterSingleChars,
                    indexnumbers=self.splitterParams.splitterIndexNumbers,
                    casefolding=self.splitterParams.splitterCasefolding
                    )
        except:
            return self.SplitterFunc(astring, words)


    def query_hook(self, q):
        """ we don't want to modify the query cuz we're dumb """
        return q
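
getWordId() resolves a word through the forward OIBTree and, on a miss,
assigns a fresh random id while keeping the inverse IOBTree in sync. A
self-contained sketch of that two-way mapping; randid() below is only a
stand-in for the helper the Lexicon imports:

import random

from BTrees.IOBTree import IOBTree
from BTrees.OIBTree import OIBTree

def randid():
    # Stand-in for the module-level randid() helper (assumption).
    return random.randint(1, 2000000000)

word_to_wid = OIBTree()   # forward map, like _lexicon
wid_to_word = IOBTree()   # inverse map, like _inverseLex

def get_word_id(word):
    wid = word_to_wid.get(word)
    if wid is None:
        wid = randid()
        # insert() returns 0 when the wid is already taken, so retry.
        while not wid_to_word.insert(wid, word):
            wid = randid()
        word_to_wid[word] = wid
    return wid

assert get_word_id('zope') == get_word_id('zope')   # same wid both times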
Esempio n. 17
0
class LinkCheckTool(SimpleItem):
    security = ClassSecurityInfo()

    def __init__(self, id=None):
        super(LinkCheckTool, self).__init__(id)

        # This is the work queue; items in this queue are scheduled
        # for link validity check.
        self.queue = CompositeQueue()

        # Additional queue for internal crawler to revalidate the site
        self.crawl_queue = CompositeQueue()

        # This is the link database. It maps a hyperlink index to a
        # tuple (timestamp, status, referers).
        self.checked = IOBTree()

        # Indexes
        self.index = OIBTree()
        self.links = IOBTree()

        # This is a counter that allows us to add new hyperlinks and
        # provide an index quickly.
        self.counter = 0

    security.declarePrivate("is_available")
    def is_available(self):
        return hasattr(self, 'index') and \
               hasattr(self, 'checked') and \
               hasattr(self, 'queue') and \
               hasattr(self, 'counter')

    security.declarePrivate("clear")
    def clear(self):
        while True:
            try:
                self.queue.pull()
            except IndexError:
                break
        while True:
            try:
                self.crawl_queue.pull()
            except IndexError:
                break

        self.checked.clear()
        self.index.clear()
        self.links.clear()
        self.counter = 0

    security.declarePrivate("crawl")
    def crawl(self):
        self.clear()
        query = {}
        registry = getUtility(IRegistry)
        settings = registry.forInterface(ISettings)
        if settings.content_types:
            query['portal_type'] = settings.content_types

        if settings.workflow_states:
            query['review_state'] = settings.workflow_states

        catalog = api.portal.get_tool('portal_catalog')
        brains = catalog(query)
        for brain in brains:
            # asynchronous crawling not working yet
            # self.crawl_enqueue(brain.UID)

            obj = brain.getObject()
            obj.restrictedTraverse('@@linkcheck')()
            logger.info('Crawling: checked {0}'.format(brain.getURL()))

    security.declarePrivate("enqueue")
    def enqueue(self, url):
        index = self.index.get(url)
        if index is None:
            # a really new url
            index = self.store(url)
        else:
            entry = self.checked.get(index)
            if entry is not None and entry:
                entry = None, entry[1], entry[2]
                self.checked[index] = entry
            else:
                # reset empty entry
                self.remove(url)
                index = self.store(url)
        self.queue.put(index)
        return index

    security.declarePrivate("register")
    def register(self, hrefs, referer, timestamp):
        """Add or update link presence information.

        If a link has not been checked since the provided timestamp,
        or is not in the database at all, it will be added to the
        queue.
        """

        referer = self.index.get(referer) or self.store(referer)

        registry = getUtility(IRegistry, context=self.aq_parent)
        try:
            settings = registry.forInterface(ISettings)
        except KeyError as exc:
            logger.warn(exc)
            return

        limit = settings.referers

        for href in hrefs:
            if self.should_ignore(href, settings.ignore_list):
                continue

            # If the hyperlink is not already in the work queue,
            # compare the provided timestamp to our database to see if
            # we need to check its validity. Note that internal links
            # are exempt if we're not using the publisher.
            index = self.index.get(href)
            entry = self.checked.get(-1 if index is None else index)

            if index not in self.queue:
                if entry is None or entry[0] < timestamp:
                    if settings.use_publisher or not href.startswith('/'):
                        index = self.enqueue(href)
                    elif href not in self.index:
                        index = self.store(href)

            assert index is not None

            if entry is None:
                self.checked[index] = None, None, IISet((referer,))
            else:
                # If the provided paths are a subset of the already
                # seen paths, and if there is no new referer, we don't
                # issue an update.
                referers = entry[2]
                if referer not in referers and len(referers) <= limit:
                    referers.add(referer)

    security.declarePrivate("store")
    def store(self, url):
        index = self.index[url] = self.counter
        self.links[index] = url
        self.counter += 1
        return index

    security.declarePrivate("remove")
    def remove(self, url):
        index = self.index.get(url)
        if url in self.index:
            del self.index[url]
        if index and index in self.checked:
            del self.checked[index]

    security.declarePrivate("update")
    def update(self, href, status):
        """Update link status."""
        now = datetime.datetime.now()
        timestamp = int(time.mktime(now.timetuple()))

        index = self.index.get(href)
        if index is None:
            return

        entry = self.checked.get(-1 if index is None else index)
        if entry is None:
            self.checked[index] = timestamp, status, IISet()

        # If the status changed, we update the entry.
        elif status != entry[1] or not entry[0]:

            # If the status was previously good, then we clear the
            # status. What this means is that we'll wait for the next
            # check to declare a bad status (it might be temporary).
            if entry[1] == 200:
                status = None

            self.checked[index] = timestamp, status, entry[2]

    @cache(lambda method, self, ignore_list: ignore_list)
    def get_matchers(self, ignore_list):
        matchers = []
        for expression in ignore_list:
            try:
                matcher = re.compile(expression).search
            except re.error:
                pass
            else:
                matchers.append(matcher)

        return matchers

    def should_ignore(self, href, ignore_list):
        for matcher in self.get_matchers(ignore_list):
            if matcher(href):
                return True

        return False

    def crawl_enqueue(self, obj):
        if not isinstance(obj, basestring):
            obj = obj.UID()
        self.crawl_queue.put(obj)

    def crawl_dequeue(self):
        if self.crawl_queue._data:
            return self.crawl_queue.pull()
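
A minimal usage sketch, not part of the example above: it assumes a Plone
site where this tool is installed and reachable through plone.api (the tool
id 'portal_linkcheck' is an assumption, not taken from the code above).
enqueue() schedules a URL for validation and update() records the observed
HTTP status against it.

from plone import api


def record_link_status(url, status):
    # Hypothetical helper showing the typical call sequence.
    tool = api.portal.get_tool('portal_linkcheck')  # assumed tool id
    if not tool.is_available():
        return None
    index = tool.enqueue(url)       # queue the link for (re)validation
    tool.update(url, status)        # store the HTTP status observed for it
    # tool.checked maps the integer index to (timestamp, status, referers)
    return tool.checked.get(index)
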
Esempio n. 18
0
class Lexicon(Persistent):

    _v_nextid = None
    _wid_length_based = True  # Flag to distinguish new and old lexica

    def __init__(self, *pipeline):
        self.clear()
        self._pipeline = pipeline

    def clear(self):
        """Empty the lexicon.
        """
        self.length = Length()
        self._wid_length_based = False
        self._wids = OIBTree()  # word -> wid
        self._words = IOBTree()  # wid -> word
        # wid 0 is reserved for words that aren't in the lexicon (OOV -- out
        # of vocabulary).  This can happen, e.g., if a query contains a word
        # we never saw before, and that isn't a known stopword (or otherwise
        # filtered out).  Returning a special wid value for OOV words is a
        # way to let clients know when an OOV word appears.

    def length(self):
        """Return the number of unique terms in the lexicon.
        """
        # Overridden in instances with a BTrees.Length.Length
        raise NotImplementedError

    def words(self):
        return self._wids.keys()

    def wids(self):
        return self._words.keys()

    def items(self):
        return self._wids.items()

    def sourceToWordIds(self, text):
        last = _text2list(text)
        for element in self._pipeline:
            last = element.process(last)
        return list(map(self._getWordIdCreate, last))

    def termToWordIds(self, text):
        last = _text2list(text)
        for element in self._pipeline:
            process = getattr(element, "process_post_glob", element.process)
            last = process(last)
        wids = []
        for word in last:
            wids.append(self._wids.get(word, 0))
        return wids

    def parseTerms(self, text):
        last = _text2list(text)
        for element in self._pipeline:
            process = getattr(element, "processGlob", element.process)
            last = process(last)
        return last

    def isGlob(self, word):
        return "*" in word or "?" in word

    def get_word(self, wid):
        return self._words[wid]

    def get_wid(self, word):
        return self._wids.get(word, 0)

    def globToWordIds(self, pattern):
        # Implement * and ? just as in the shell, except the pattern
        # must not start with either of these
        prefix = ""
        while pattern and pattern[0] not in "*?":
            prefix += pattern[0]
            pattern = pattern[1:]
        if not pattern:
            # There were no globbing characters in the pattern
            wid = self._wids.get(prefix, 0)
            if wid:
                return [wid]
            else:
                return []
        if not prefix:
            # The pattern starts with a globbing character.
            # This is too inefficient, so we raise an exception.
            raise QueryError(
                "pattern %r shouldn't start with glob character" % pattern)
        pat = prefix
        for c in pattern:
            if c == "*":
                pat += ".*"
            elif c == "?":
                pat += "."
            else:
                pat += re.escape(c)
        pat += "$"
        prog = re.compile(pat)
        keys = self._wids.keys(prefix)  # Keys starting at prefix
        wids = []
        for key in keys:
            if not key.startswith(prefix):
                break
            if prog.match(key):
                wids.append(self._wids[key])
        return wids

    def _getWordIdCreate(self, word):
        wid = self._wids.get(word)
        if wid is None:
            # WidCode requires us to use at least 0x4000 as a base number.
            # The algorithm in versions before 2.13 used the length as a base
            # number. So we don't even try to generate numbers below the
            # length as they are likely all taken
            minimum = 0x4000
            if self._wid_length_based:
                minimum = max(self.length(), 0x4000)

            while True:
                if self._v_nextid is None:
                    self._v_nextid = randrange(minimum, 0x10000000)

                wid = self._v_nextid
                self._v_nextid += 1

                if wid not in self._words:
                    break

                self._v_nextid = None

            self.length.change(1)
            self._wids[word] = wid
            self._words[wid] = word
        return wid
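
A minimal usage sketch for the lexicon above, assuming the module-level
helpers it relies on (_text2list, BTrees' Length and randrange) are
importable as in zope.index.text / Products.ZCTextIndex.  A pipeline
element only needs a process() method that maps a list of words to a list
of words; the splitter below is a toy stand-in, not the real one.

class ToySplitter(object):
    """Toy pipeline element: split on whitespace and lowercase."""

    def process(self, seq):
        result = []
        for text in seq:
            result.extend(word.lower() for word in text.split())
        return result


lexicon = Lexicon(ToySplitter())
wids = lexicon.sourceToWordIds('The quick brown Fox')     # assigns new word ids
assert lexicon.get_word(wids[0]) == 'the'
assert lexicon.termToWordIds('never-seen-before') == [0]  # 0 == out of vocabulary
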
Esempio n. 19
0
class IntegerRangesIndex(SimpleItem):
    """ Index a set of integer ranges:
        [(1,2), (12,23), (12, 22)]
    """

    implements(IPluggableIndex)
    meta_type = 'IntegerRangesIndex'

    def __init__(self, id, caller=None, extra=None):
        self.id = id
        self.caller = caller
        self.clear()
        self.__genid = 0

    def __len__(self):
        return self._length()

    def getId(self):
        """Return Id of index."""
        return self.id

    def clear(self):
        """Empty the index"""
        
        IOBTree = BTrees.family64.IO.BTree

        self._index = IOBTree() # {rangeid: [document_id, ...]}
        self._unindex = IOBTree() # {document_id: [rangeid, ...]}
        self._range_mapping = IOBTree() # {rangeid: range}
        self._reverse_range_mapping = OIBTree() # {range: rangeid}
        self._since_index = IOBTree() # {since: [rangeid,...]}
        self._until_index = IOBTree() # {until: [rangeid,...]}
        self._length = BTrees.Length.Length()
        self._unique_values_length = BTrees.Length.Length()

    def __get_range_id(self, range_):
        return self._reverse_range_mapping.get(range_, None)

    def __get_range(self, range_id):
        return self._range_mapping.get(range_id, None)

    def __index_range(self, range_):
        """ index range if needed and return the rangeid
        """
        range_id = self.__get_range_id(range_)
        if range_id is None:
            range_id = self.genid()
            # index range
            self._unique_values_length.change(1)
            self._range_mapping[range_id] = range_
            self._reverse_range_mapping[range_] = range_id
            # index range boundaries
            since, until = range_
            self.__insert_in_index_set(self._since_index, since, range_id)
            self.__insert_in_index_set(self._until_index, until, range_id)
        return range_id

    def __unindex_range(self, range_id):
        range_ = self.__get_range(range_id)
        if range_ is None:
            return None
        since, until = range_
        self.__remove_in_index_set(self._since_index, since, range_id)
        self.__remove_in_index_set(self._until_index, until, range_id)
        self._unique_values_length.change(-1)
        del self._range_mapping[range_id]
        del self._reverse_range_mapping[range_]
        return range_

    def genid(self):
        self.__genid += 1
        return self.__genid

    def getEntryForObject(self, document_id, default=_marker):
        """Get all information contained for 'document_id'."""
        if default is _marker:
            return self._unindex.get(document_id)
        else:
            return self._unindex.get(document_id, default)

    def getIndexSourceNames(self):
        """Get a sequence of attribute names that are indexed by the index.
        """
        return [self.id]

    def index_object(self, document_id, obj, threshold=None):
        """Index an object.

        'document_id' is the integer ID of the document.
        'obj' is the object to be indexed.
        'threshold' is the number of words to process between committing
        subtransactions.  If None, subtransactions are disabled.
        """
        new_ranges = self._get_object_data(obj, self.id)
        if new_ranges:
            new_set = IISet(map(self.__index_range, new_ranges))
        else:
            new_set = IISet()

        old_set = self._unindex.get(document_id, IISet())

        new_entries = difference(new_set, old_set)
        expired_entries = difference(old_set, new_set)

        if not (new_entries or expired_entries):
            # nothing to do, bail out !
            return 0
        for expired_entry in expired_entries:
            self.__remove_in_index_set(self._unindex, document_id,
                expired_entry)
            if self.__remove_in_index_set(self._index, expired_entry, \
                    document_id):
                # range is not used anymore, retire it
                self.__unindex_range(expired_entry)

        for new_entry in new_entries:
            if self.__insert_in_index_set(self._unindex, document_id,
                    new_entry):
                self._length.change(1)
            self.__insert_in_index_set(self._index, new_entry, document_id)

        return 1

    def unindex_object(self, document_id):
        """Remove the document_id from the index."""
        entries = self._unindex.get(document_id, _marker)
        if entries is _marker:
            return
        if isinstance(entries, int):
            entries = [entries]
        for expired_entry in entries:
            if self.__remove_in_index_set(self._index, expired_entry, \
                    document_id):
                # range is not used anymore, retire it
                self.__unindex_range(expired_entry)
        self._length.change(-1)
        del self._unindex[document_id]

    def __insert_in_index_set(self, index, key, value, set_type=IISet):
        """ Insert value in the index. If the key was not present and
        the index row was created it returns True
        """
        index_row = index.get(key, _marker)
        if index_row is _marker:
            index[key] = value
            return True
        if isinstance(index_row, set_type):
            index_row.insert(value)
            return False
        # it was an int
        index[key] = set_type((index_row, value,))
        return False

    def __remove_in_index_set(self, index, key, value, set_type=IISet):
        """ remove the value in the index, index row is a Set
        It returns true if the index row as been removed (The set was empty)
        """
        index_row = index.get(key, _marker)
        if index_row is _marker:
            return True
        if isinstance(index_row, IISet):
            index_row.remove(value)
            if len(index_row) == 0:
                del index[key]
                return True
            if len(index_row) == 1:
                index[key] = index_row[0]
            return False
        del index[key]
        return True

    def _apply_index(self, request):
        record = parseIndexRequest(request, self.id)
        try:
            qstart, qend = record.keys
        except TypeError:
            return None

        minint = BTrees.family64.minint
        maxint = BTrees.family64.maxint

        qstart = min(maxint, max(minint, qstart))
        qend = max(minint, min(maxint, qend))

        # start in inside range
        start = multiunion(self._since_index.values(max=qstart))
        end = multiunion(self._until_index.values(min=qstart))
        start_into = intersection(start, end)

        # end inside range
        start = multiunion(self._since_index.values(max=qend))
        end = multiunion(self._until_index.values(min=qend))
        end_into = intersection(start, end)

        # start before range and end after range
        start = multiunion(self._since_index.values(min=qstart))
        end = multiunion(self._until_index.values(max=qend))
        start_before_end_after = intersection(start, end)

        result = union(start_into, end_into)
        result = union(result, start_before_end_after)

        return multiunion(map(self._index.__getitem__, result)), (self.id,)

    def numObjects(self):
        """Return the number of indexed objects"""
        return self._length()

    def indexSize(self):
        """Return the size of the index in terms of distinct values"""
        return self._unique_values_length()

    def _get_object_data(self, obj, attr):
        # self.id is the name of the index, which is also the name of the
        # attribute we're interested in.  If the attribute is callable,
        # we call it.
        try:
            datum = getattr(obj, attr)
            if safe_callable(datum):
                datum = datum()
        except AttributeError:
            datum = _marker
        return datum
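
The three unions built in _apply_index above implement a standard
interval-overlap test: a stored range matches the query when it contains
the query start, contains the query end, or lies entirely inside the
query.  A small pure-Python illustration of the same predicate (only an
illustration, not part of the index):

def range_matches(since, until, qstart, qend):
    # Mirrors start_into, end_into and start_before_end_after above.
    contains_query_start = since <= qstart <= until
    contains_query_end = since <= qend <= until
    inside_query = qstart <= since and until <= qend
    return contains_query_start or contains_query_end or inside_query


# Together the three cases are equivalent to the usual overlap condition
# `since <= qend and until >= qstart`.
assert range_matches(12, 23, 20, 30)      # overlaps the query on one side
assert range_matches(12, 23, 1, 100)      # lies entirely inside the query
assert not range_matches(1, 2, 5, 9)      # disjoint
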
Esempio n. 20
0
class Lexicon(Persistent, Implicit):
    """Maps words to word ids and then some

    The Lexicon object is an attempt to abstract vocabularies out of
    Text indexes.  This abstraction is not totally cooked yet, this
    module still includes the parser for the 'Text Index Query
    Language' and a few other hacks.

    """

    # default for older objects
    stop_syn={}

    def __init__(self, stop_syn=None,useSplitter=None,extra=None):

        self.clear()
        if stop_syn is None:
            self.stop_syn = {}
        else:
            self.stop_syn = stop_syn

        self.useSplitter = Splitter.splitterNames[0]
        if useSplitter: self.useSplitter=useSplitter
        self.splitterParams = extra
        self.SplitterFunc = Splitter.getSplitter(self.useSplitter)


    def clear(self):
        self._lexicon = OIBTree()
        self._inverseLex = IOBTree()

    def _convertBTrees(self, threshold=200):
        if (type(self._lexicon) is OIBTree and
            type(getattr(self, '_inverseLex', None)) is IOBTree):
            return

        from BTrees.convert import convert

        lexicon=self._lexicon
        self._lexicon=OIBTree()
        self._lexicon._p_jar=self._p_jar
        convert(lexicon, self._lexicon, threshold)

        try:
            inverseLex=self._inverseLex
            self._inverseLex=IOBTree()
        except AttributeError:
            # older lexicons didn't have an inverse lexicon
            self._inverseLex=IOBTree()
            inverseLex=self._inverseLex

        self._inverseLex._p_jar=self._p_jar
        convert(inverseLex, self._inverseLex, threshold)

    def set_stop_syn(self, stop_syn):
        """ pass in a mapping of stopwords and synonyms.  Format is:

        {'word' : [syn1, syn2, ..., synx]}

        Vocabularies do not necessarily need to implement this if their
        splitters do not support stemming or stop word removal.

        """
        self.stop_syn = stop_syn


    def getWordId(self, word):
        """ return the word id of 'word' """

        wid=self._lexicon.get(word, None)
        if wid is None:
            wid=self.assignWordId(word)
        return wid

    set = getWordId

    def getWord(self, wid):
        """ post-2.3.1b2 method, will not work with unconverted lexicons """
        return self._inverseLex.get(wid, None)

    def assignWordId(self, word):
        """Assigns a new word id to the provided word and returns it."""
        # First make sure it's not already in there
        if self._lexicon.has_key(word):
            return self._lexicon[word]


        try: inverse=self._inverseLex
        except AttributeError:
            # whoops, old lexicon without wids
            inverse=self._inverseLex=IOBTree()
            for word, wid in self._lexicon.items():
                inverse[wid]=word

        wid=randid()
        while not inverse.insert(wid, word):
            wid=randid()

        if isinstance(word,StringType):
            self._lexicon[intern(word)] = wid
        else:
            self._lexicon[word] = wid


        return wid


    def get(self, key, default=None):
        """Return the matched word against the key."""
        r=IISet()
        wid=self._lexicon.get(key, default)
        if wid is not None: r.insert(wid)
        return r

    def __getitem__(self, key):
        return self.get(key)


    def __len__(self):
        return len(self._lexicon)


    def Splitter(self, astring, words=None, encoding = "latin1"):
        """ wrap the splitter """
        if words is None: words = self.stop_syn

        try:
            return self.SplitterFunc(
                    astring,
                    words,
                    encoding=encoding,
                    singlechar=self.splitterParams.splitterSingleChars,
                    indexnumbers=self.splitterParams.splitterIndexNumbers,
                    casefolding=self.splitterParams.splitterCasefolding
                    )
        except:
            return self.SplitterFunc(astring, words)


    def query_hook(self, q):
        """ we don't want to modify the query cuz we're dumb """
        return q
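
A minimal usage sketch for the lexicon above, assuming the module-level
pieces it uses (the Splitter registry, randid and the BTrees types) are
importable as in the old Products.PluginIndexes TextIndex package:

lexicon = Lexicon()

wid = lexicon.getWordId('zope')            # assigns a random word id on first use
assert lexicon.getWordId('zope') == wid    # and returns the same id afterwards
assert lexicon.getWord(wid) == 'zope'      # inverse mapping (post-2.3.1b2)

assert list(lexicon.get('zope')) == [wid]  # get() wraps the wid in an IISet
assert list(lexicon.get('missing')) == []  # unknown words give an empty set
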
Esempio n. 21
0
class PreferenceTool(BaseTool):
    """
    PreferenceTool manages User Preferences / User profiles.

    TODO:
      - make the preference tool an action provider (templates)
  """
    id = 'portal_preferences'
    meta_type = 'ERP5 Preference Tool'
    portal_type = 'Preference Tool'
    title = 'Preferences'
    allowed_types = ('ERP5 Preference', )
    security = ClassSecurityInfo()

    aq_preference_generated = False

    security.declareProtected(Permissions.ManagePortal, 'manage_overview')
    manage_overview = DTMLFile('explainPreferenceTool', _dtmldir)

    security.declarePrivate('manage_afterAdd')

    def manage_afterAdd(self, item, container):
        """ init the permissions right after creation """
        item.manage_permission(Permissions.AddPortalContent,
                               ['Member', 'Author', 'Manager'])
        item.manage_permission(Permissions.AddPortalFolders,
                               ['Member', 'Author', 'Manager'])
        item.manage_permission(Permissions.View,
                               ['Member', 'Auditor', 'Manager'])
        item.manage_permission(Permissions.CopyOrMove,
                               ['Member', 'Auditor', 'Manager'])
        item.manage_permission(Permissions.ManageProperties, ['Manager'],
                               acquire=0)
        item.manage_permission(Permissions.SetOwnPassword,
                               ['Member', 'Author', 'Manager'])
        BaseTool.inheritedAttribute('manage_afterAdd')(self, item, container)

    security.declarePublic('getPreference')

    def getPreference(self, pref_name, default=_marker):
        """ get the preference on the most appopriate Preference object. """
        method = getattr(self, 'get%s' % convertToUpperCase(pref_name), None)
        if method is not None:
            return method(default)
        if default is _marker:
            return None
        return default

    security.declareProtected(Permissions.ModifyPortalContent, "setPreference")

    def setPreference(self, pref_name, value):
        """ set the preference on the active Preference object"""
        self.getActivePreference()._edit(**{pref_name: value})

    def _getSortedPreferenceList(self, sql_catalog_id=None):
        """ return the most appropriate preferences objects,
        sorted so that the first in the list should be applied first
    """
        tv = getTransactionalVariable()
        security_manager = getSecurityManager()
        user = security_manager.getUser()
        acl_users = self.getPortalObject().acl_users
        try:
            # reset a security manager without any proxy role or unrestricted method,
            # which affects the catalog search that we do to find applicable
            # preferences.
            actual_user = acl_users.getUserById(user.getId())
            if actual_user is not None:
                newSecurityManager(None, actual_user.__of__(acl_users))
            tv_key = 'PreferenceTool._getSortedPreferenceList/%s/%s' % (
                user.getId(), sql_catalog_id)
            if tv.get(tv_key, None) is None:
                prefs = []
                # XXX will also cause problems with Manager (too long)
                # XXX For manager, create a manager specific preference
                #                  or better solution
                user_is_manager = 'Manager' in user.getRolesInContext(self)
                for pref in self.searchFolder(portal_type='Preference',
                                              sql_catalog_id=sql_catalog_id):
                    pref = pref.getObject()
                    # XXX quick workaround so that managers only see user preference
                    #     they actually own.
                    if pref is not None and (
                            not user_is_manager
                            or pref.getPriority() != Priority.USER
                            or pref.getOwnerTuple()[1] == user.getId()):
                        if pref.getProperty('preference_state',
                                            'broken') in ('enabled', 'global'):
                            prefs.append(pref)
                prefs.sort(key=lambda x: x.getPriority(), reverse=True)
                # add system preferences before user preferences
                sys_prefs = [x.getObject() for x in self.searchFolder(portal_type='System Preference', sql_catalog_id=sql_catalog_id) \
                             if x.getObject().getProperty('preference_state', 'broken') in ('enabled', 'global')]
                sys_prefs.sort(key=lambda x: x.getPriority(), reverse=True)
                preference_list = sys_prefs + prefs
                tv[tv_key] = preference_list
            return tv[tv_key]
        finally:
            setSecurityManager(security_manager)

    def _getActivePreferenceByPortalType(self, portal_type):
        enabled_prefs = self._getSortedPreferenceList()
        if len(enabled_prefs) > 0:
            try:
                return [
                    x for x in enabled_prefs
                    if x.getPortalType() == portal_type
                ][0]
            except IndexError:
                pass
        return None

    security.declareProtected(Permissions.View, 'getActivePreference')

    def getActivePreference(self):
        """ returns the current preference for the user.
       Note that this preference may be read only. """
        return self._getActivePreferenceByPortalType('Preference')

    security.declareProtected(Permissions.View, 'clearCache')

    def clearCache(self, preference):
        """ clear cache when a preference is modified.
    This is called by an interaction workflow on preferences.
    """
        self._getCacheId()  # initialize _preference_cache if needed.
        if preference.getPriority() == Priority.USER:
            user_id = getSecurityManager().getUser().getId()
            self._preference_cache[user_id] = \
                self._preference_cache.get(user_id, 0) + 1
        self._preference_cache[None] = self._preference_cache.get(None, 0) + 1

    def _getCacheId(self):
        """Return a cache id for preferences.

    We use:
     - user_id: because preferences are always different by user
     - self._preference_cache[user_id] which is increased every time a user
       preference is modified
     - self._preference_cache[None] which is increased every time a global
       preference is modified
    """
        user_id = getSecurityManager().getUser().getId()
        try:
            self._preference_cache
        except AttributeError:
            self._preference_cache = OIBTree()
        return self._preference_cache.get(None), self._preference_cache.get(
            user_id), user_id

    security.declareProtected(Permissions.View, 'getActiveUserPreference')

    def getActiveUserPreference(self):
        """ returns the current user preference for the user.
    If no preference exists, then try to create one with `createUserPreference`
    type based method.

    This method returns a preference that the user will be able to edit or
    None, if `createUserPreference` refused to create a preference.

    It is intended for "click here to edit your preferences" actions.
    """
        active_preference = self.getActivePreference()
        if active_preference is None or active_preference.getPriority(
        ) != Priority.USER:
            # If user does not have a preference, let's try to create one
            user = self.getPortalObject(
            ).portal_membership.getAuthenticatedMember().getUserValue()
            if user is not None:
                createUserPreference = user.getTypeBasedMethod(
                    'createUserPreference')
                if createUserPreference is not None:
                    active_preference = createUserPreference()
        return active_preference

    security.declareProtected(Permissions.View, 'getActiveSystemPreference')

    def getActiveSystemPreference(self):
        """ returns the current system preference for the user.
       Note that this preference may be read only. """
        return self._getActivePreferenceByPortalType('System Preference')

    security.declareProtected(Permissions.View, 'getDocumentTemplateList')

    def getDocumentTemplateList(self, folder=None):
        """ returns all document templates that are in acceptable Preferences
        based on different criteria such as folder, portal_type, etc.
    """
        if folder is None:
            # as the preference tool is also a Folder, this method is called by
            # page templates to get the list of document templates for self.
            folder = self

        # We must set the user_id as a parameter to make sure each
        # user can get a different cache
        def _getDocumentTemplateList(user_id, portal_type=None):
            acceptable_template_list = []
            for pref in self._getSortedPreferenceList():
                for doc in pref.contentValues(portal_type=portal_type):
                    acceptable_template_list.append(doc.getRelativeUrl())
            return acceptable_template_list

        _getDocumentTemplateList = CachingMethod(
            _getDocumentTemplateList,
            'portal_preferences.getDocumentTemplateList.{}'.format(
                self._getCacheId()),
            cache_factory='erp5_ui_long')

        allowed_content_types = [
            pti.id for pti in folder.allowedContentTypes()
        ]
        user_id = getToolByName(
            self, 'portal_membership').getAuthenticatedMember().getId()
        template_list = []
        for portal_type in allowed_content_types:
            for template_url in _getDocumentTemplateList(
                    user_id, portal_type=portal_type):
                template = self.restrictedTraverse(template_url, None)
                if template is not None:
                    template_list.append(template)
        return template_list

    security.declareProtected(Permissions.ManagePortal,
                              'createActiveSystemPreference')

    def createActiveSystemPreference(self):
        """ Create a System Preference and enable it if there is no other
        enabled System Preference in present.
    """
        if self.getActiveSystemPreference() is not None:
            raise ValueError("Another Active Preference already exists.")
        system_preference = self.newContent(portal_type='System Preference')
        system_preference.enable()

    security.declareProtected(Permissions.ManagePortal,
                              'createPreferenceForUser')

    def createPreferenceForUser(self, user_id, enable=True):
        """Creates a preference for a given user, and optionnally enable the
    preference.
    """
        user_folder = self.acl_users
        user = user_folder.getUserById(user_id)
        if user is None:
            raise ValueError("User %r not found" % (user_id, ))
        security_manager = getSecurityManager()
        try:
            newSecurityManager(None, user.__of__(user_folder))
            preference = self.newContent(portal_type='Preference')
            if enable:
                preference.enable()
            return preference
        finally:
            setSecurityManager(security_manager)

    security.declarePublic('isAuthenticationPolicyEnabled')

    def isAuthenticationPolicyEnabled(self):
        """
    Return True if authentication policy is enabled.
    This method exists here due to bootstrap issues.
    It should work even if erp5_authentication_policy bt5 is not installed.
    """
        # isPreferredAuthenticationPolicyEnabled exists if property sheets from
        # erp5_authentication_policy are installed.
        method = getattr(self, 'isPreferredAuthenticationPolicyEnabled', None)
        if method is not None and method():
            return True
        # if it does not exist, for sure authentication policy is not enabled.
        return False
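
A minimal usage sketch, assuming an ERP5 site where the tool is available
as portal_preferences (the id declared above); the preference name is only
an illustration of the naming convention, since getPreference() turns
'some_name' into a getSomeName() accessor generated from the Preference
property sheets.

portal = context.getPortalObject()         # `context` is any ERP5 document
preference_tool = portal.portal_preferences

# Read: dispatched to getPreferredDateOrder() on the sorted preference list.
date_order = preference_tool.getPreference('preferred_date_order', default=None)

# Write: stored on the user's active Preference document via _edit().
preference_tool.setPreference('preferred_date_order', 'ymd')
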
    def index_object(self, documentId, obj, threshold=None):
        """ Index an object:
        'documentId' is the integer id of the document

        'obj' is the object to be indexed

        'threshold' is the number of words to process between
        committing subtransactions.  If 'None' subtransactions are
        disabled. """

        # sniff the object for our 'id', the 'document source' of the
        # index is this attribute.  If it smells callable, call it.
        try:
            source = getattr(obj, self.id)
            if safe_callable(source):
                source = source()

            if not isinstance(source, UnicodeType):
                source = str(source)

        except (AttributeError, TypeError):
            return 0

        # sniff the object for 'id'+'_encoding'

        try:
            encoding = getattr(obj, self.id+'_encoding')
            if safe_callable(encoding):
                encoding = str(encoding())
            else:
                encoding = str(encoding)
        except (AttributeError, TypeError):
            encoding = 'latin1'

        lexicon = self.getLexicon()

        splitter = lexicon.Splitter

        wordScores = OIBTree()
        last = None

        # Run through the words and score them

        for word in list(splitter(source,encoding=encoding)):
            if word[0] == '\"':
                last = self._subindex(word[1:-1], wordScores, last, splitter)
            else:
                if word==last: continue
                last=word
                wordScores[word]=wordScores.get(word,0)+1

        # Convert scores to use wids:
        widScores=IIBucket()
        getWid=lexicon.getWordId
        for word, score in wordScores.items():
            widScores[getWid(word)]=score

        del wordScores

        currentWids=IISet(self._unindex.get(documentId, []))

        # Get rid of document words that are no longer indexed
        self.unindex_objectWids(documentId, difference(currentWids, widScores))

        # Now index the words. Note that the new xIBTrees are clever
        # enough to do nothing when there isn't a change. Woo hoo.
        insert=self.insertForwardIndexEntry
        for wid, score in widScores.items():
            insert(wid, documentId, score)

        # Save the unindexing info if it's changed:
        wids=widScores.keys()
        if wids != currentWids.keys():
            self._unindex[documentId]=wids

        return len(wids)
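
The scoring loop above counts how often each word occurs in the split
source, skipping a word that immediately repeats itself (quoted phrases
are delegated to _subindex).  A pure-Python illustration of just that
counting step, independent of the splitter and the lexicon:

def score_words(words):
    """Count occurrences the way the loop above does, ignoring a word that
    immediately repeats itself in the token stream."""
    scores = {}
    last = None
    for word in words:
        if word == last:
            continue
        last = word
        scores[word] = scores.get(word, 0) + 1
    return scores


assert score_words(['to', 'be', 'or', 'not', 'to', 'be']) == \
    {'to': 2, 'be': 2, 'or': 1, 'not': 1}
assert score_words(['la', 'la', 'la']) == {'la': 1}
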
Esempio n. 23
0
class Path(String):

    root = None     # root as passed to Catalog()
    path2rid = None # OIBTree mapping path to rid (one:one)
    rid2path = None # IOBTree mapping rid to path (one:one)
    parts = None    # OOBTree mapping (level, part) to rids (one:many)
    levels = None   # IOBTree mapping level to a list of rids (one:many)
    case_sensitive = None

    sorted = None   # OOBTree for sorting; inherited from Path


    def __init__(self, root, case_sensitive=None):

        # Root
        # ====

        if not isinstance(root, basestring):
            raise TypeError("root is not a string: '%s'" % root)
        elif not isdir(root):
            raise ValueError("root doesn't point to a directory: '%s'" % root)
        self.root = root.rstrip(os.sep)


        # Case Sensitivity
        # ================

        if case_sensitive is None:
            if 'win' in sys.platform:
                case_sensitive = False
            else:
                case_sensitive = True
        if case_sensitive not in (False, True, 0, 1):
            raise TypeError( "case_sensitive isn't a boolean: "
                           + "'%s'" % case_sensitive
                            )
        self.case_sensitive = bool(case_sensitive)

        self.reset()


    # Index contract
    # ==============

    __name__ = 'Path' # used in command-line interface


    def reset(self):
        """Forget everything; usually called from __init__.
        """
        String.reset(self)

        self.path2rid = OIBTree()   # {path:rid}
        self.rid2path = IOBTree()   # {rid:path}
        self.parts = OOBTree()      # {(level,part):rids}
        self.rids = IOBTree()       # {rid:(level,part)s}
        self.levels = IOBTree()     # {level:rids}


    def learn(self, rid, value):
        """Given an rid and a value, associate them.
        """
        String.learn(self, rid, value)


        # Parse and validate.
        # ===================
        # Value is an absolute path, rooted in self.root.

        if not isinstance(value, basestring):
            raise TypeError("string expected")
        elif value and not value.startswith(os.sep):
            raise ValueError("path not specified absolutely: '%s'" % value)
        if self.case_sensitive:
            path = value
        else:
            path = value.lower()
        path = path.rstrip(os.sep) # safety net; should never need this
        parts = value.split(os.sep)
        #parts = value.split(os.sep)[1:]


        # Add to simple identity indices.
        # ===============================

        self.path2rid[path] = rid
        self.rid2path[rid] = path


        # Add to complex level/part indices.
        # ==================================

        for level in range(len(parts)):
            token_ = (level, parts[level])


            # Add to (one:many) mapping of (level,part) to [rids].
            # ====================================================

            if token_ not in self.parts:
                self.parts[token_] = IITreeSet([rid])
            else:
                self.parts[token_].insert(rid)


            # Add to the (one:many) mapping of rid to (level,part)s.
            # ======================================================
            # This exists so we know how to forget about this rid when the time
            # comes.

            if rid not in self.rids:
                self.rids[rid] = OOSet([token_])
            else:
                self.rids[rid].insert(token_)


        # Add to (one:many) mapping of levels to rids.
        # ============================================
        # This is used to implement level limits.

        if level not in self.levels:
            self.levels[level] = IITreeSet([rid])
        else:
            self.levels[level].insert(rid)


    def forget(self, rid):
        """Given an rid, remove it from all indices.
        """
        String.forget(self, rid)


        # Remove from the (one:many) mapping of (level, part) to rids.
        # ============================================================
        # We also track the level here and remove the rid from the (one:many)
        # mapping of levels to rids.

        level = -1
        for token_ in self.rids[rid]:
            if token_[0] > level:
                level = token_[0]
            self.parts[token_].remove(rid)
            if len(self.parts[token_]) == 0:
                del self.parts[token_]
        self.levels[level].remove(rid)
        if len(self.levels[level]) == 0:
            del self.levels[level]


        # Remove from the (one:many) mapping of rid to tokens.
        # ====================================================

        del self.rids[rid]


        # Remove from simple identity indices.
        # ====================================
        path = self.rid2path[rid]
        del self.path2rid[path]
        del self.rid2path[rid]


    # Searches
    # ========

    def above(self, arg):
        """Find all resources at or above path, within the limits given.

        Here we actually call below() on <path> and all of its ancestors,
        passing the limits straight through, with the exception that limits
        default to 0:1 rather than None:None. Use '0:' for the latter.

        """

        # Parse and validate.
        # ===================

        path, upper, lower = self._path_and_limits(arg)
        rid = self.path2rid.get(path, None)
        if rid is None:
            return


        # Build
        # =====

        tmpl = "%s "
        if (upper, lower) == (None, None):
            tmpl += '0:1' # default: breadcrumbs
        else:
            if upper is not None:
                tmpl += str(upper)
            tmpl += ":"
            if lower is not None:
                tmpl += str(lower)

        parts = path.split(os.sep)
        rids = []
        for level in range(len(parts)):
            ancestor = os.sep.join(parts[:level+1])
            ancestor = ancestor and ancestor or '/'
            rids.append(self.below(tmpl % ancestor))
        rids = multiunion(rids)

        return rids


    def below(self, arg):
        """Find all resources at or below path, within the limits given.
        """

        # Parse and validate.
        # ===================

        path, upper, lower = self._path_and_limits(arg)
        rid = self.path2rid.get(path, None)
        if rid is None:
            return


        # Build
        # =====

        parts = path.split(os.sep)
        rids = None
        for level in range(len(parts)):
            rids = intersection(rids, self.parts[(level, parts[level])])
        if rids is None:
            return IISet() # short-cut


        # Limits
        # ======
        # Remove rids that are above any upper limit, and then only include rids
        # that are above any lower limit. Limits are relative to the level of
        # the requested path.

        if upper is not None:
            upper += level
            for i in range(level, upper):
                if i not in self.levels:
                    break
                rids = difference(rids, self.levels[i])
        if lower is not None:
            lower += level
            _rids = []
            for i in range(level, lower):
                if i not in self.levels:
                    break
                _rids.append(self.levels[i])
            rids = intersection(rids, multiunion(_rids))

        return rids


    def is_(self, arg):
        """Return the rid corresponding to a single path. Root is special-cased.
        """
        path, foo, bar = self._path_and_limits(arg)
        return self.path2rid.get(path, None)


    # Parser
    # ======

    def _path_and_limits(self, arg):
        """Given an argument from a Collection constraint, return three params.

        Arg is of the form:

           /some/path 0:4

        The first token is the path, the second is a limits specification. The
        path must not contain a space (@@: really should support that). The
        limits spec is optional; if given, it must have a colon and at least one
        end specified. To the left of the colon is the upper bound; to the right
        is the lower bound. These bounds specify the tree levels that the path
        filter should apply to, but the specifics of how it applies depend on
        the searches above.

        (Yes this nomenclature is all wacky. The root is conceptually 'higher'
        for some reason, even though the root is 0 and a real tree's roots are
        lower than its branches. Go figure.)

        """

        path = ''
        upper = None
        lower = None

        parts = arg.split()
        nparts = len(parts)
        assert nparts in (1, 2), "either need path or path and limits"


        # Path
        # ====

        if nparts == 1:
            path = parts[0]
        elif nparts == 2:
            path = parts[0]


            # Limits
            # ======

            limits = parts[1]
            if not limits.count(':') == 1:
                raise ValueError("malformed limits (no colon): '%s'" % limits)
            upper, lower = limits.split(':')
            #if not (upper + lower):
            #    raise ValueError("no limits given: '%s'" % limits)

            if not upper:
                upper = None
            else:
                if not upper.isdigit():
                    raise ValueError("bad upper limit: '%s'" % upper)
                upper = int(upper)

            if not lower:
                lower = None
            else:
                if not lower.isdigit():
                    raise ValueError("bad lower limit: '%s'" % lower)
                lower = int(lower)

            if None not in (upper, lower):
                if upper > lower:
                    raise ValueError( "upper limit greater than lower: "
                                    + "%d > %d" % (upper, lower)
                                     )

        if path == os.sep:
            path = ''
        if not self.case_sensitive:
            path = path.lower()
        return path, upper, lower
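
A minimal usage sketch for the Path index above, assuming its String base
class and the BTrees it uses are importable from the surrounding package
and that the chosen root directory exists; the paths and rids are made up.
The optional second token of a query argument is the upper:lower level
limit parsed by _path_and_limits.

index = Path('/tmp')                       # root must be an existing directory

index.learn(1, '/docs')
index.learn(2, '/docs/guide')
index.learn(3, '/docs/guide/intro.txt')

assert index.is_('/docs/guide/intro.txt') == 3        # exact-match lookup
assert sorted(index.below('/docs')) == [1, 2, 3]       # at or below /docs
# index.above('/docs/guide 0:') walks the ancestors for breadcrumb queries
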
class ExtendedPathIndex(PathIndex):
    """A path index stores all path components of the physical path of an
    object.

    Internal data structure (regular path index):

    - a physical path of an object is split into its components

    - every component is kept as a key of an OOBTree in self._index

    - the value is a mapping 'level of the path component' to
      'all docids with this path component on this level'

    In addition

    - there is a terminator (None) signifying the last component in the path

    - 2 additional indexes map absolute path to either the doc id or doc ids of
      contained objects. This allows for rapid answering of common queries.
    """

    meta_type = "ExtendedPathIndex"

    manage_options = (
        {'label': 'Settings', 'action': 'manage_main'},
    )

    indexed_attrs = None
    multi_valued = False
    query_options = ("query", "level", "operator",
                     "depth", "navtree", "navtree_start")

    def __init__(self, id, extra=None, caller=None):
        """ ExtendedPathIndex supports indexed_attrs """
        PathIndex.__init__(self, id, caller)

        if isinstance(extra, dict):
            attrs = extra.get('indexed_attrs', None)
            self.multi_valued = extra.get('multi_valued', False)
        else:
            attrs = getattr(extra, 'indexed_attrs', None)
            self.multi_valued = getattr(extra, 'multi_valued', False)

        if attrs is None:
            return

        if isinstance(attrs, str):
            attrs = attrs.split(',')
        attrs = [a.strip() for a in attrs]
        attrs = [a for a in attrs if a]

        if attrs:
            # We only index the first attribute so snip off the rest
            self.indexed_attrs = tuple(attrs[:1])

    def clear(self):
        PathIndex.clear(self)
        self._index_parents = OOBTree()
        self._index_items = OIBTree()

    def index_object(self, docid, obj, threshold=100):
        """ hook for (Z)Catalog """

        # PathIndex first checks for an attribute matching its id and
        # falls back to getPhysicalPath only when failing to get one.
        # If self.indexed_attrs is not None, its value overrides this behavior

        attrs = self.indexed_attrs
        index = attrs is None and self.id or attrs[0]

        path = getattr(obj, index, None)
        if path is not None:
            if safe_callable(path):
                path = path()

            if not isinstance(path, (str, tuple)):
                raise TypeError('path value must be string or tuple '
                                'of strings: (%r, %s)' % (index, repr(path)))
        else:
            try:
                path = obj.getPhysicalPath()
            except AttributeError:
                return 0

        if isinstance(path, (list, tuple)):
            path = '/' + '/'.join(path[1:])
        comps = [p for p in path.split('/') if p]

        # Make sure we reindex properly when the path changes
        old_path = self._unindex.get(docid, _marker)
        if old_path is not _marker:
            if old_path != path:
                self.unindex_object(docid, _old=old_path)
                # unindex reduces length, we need to counter that
                self._length.change(1)
        else:
            # We only get a new entry if the value wasn't there before.
            # If it already existed the length is unchanged
            self._length.change(1)

        for i, comp in enumerate(comps):
            self.insertEntry(comp, docid, i)

        # Add terminator
        self.insertEntry(None, docid, len(comps) - 1)

        # Add full-path indexes, to optimize certain edge cases
        parent_path = '/' + '/'.join(comps[:-1])
        parents = self._index_parents.get(parent_path, _marker)
        if parents is _marker:
            self._index_parents[parent_path] = parents = IITreeSet()
        parents.insert(docid)
        self._index_items[path] = docid

        self._unindex[docid] = path
        return 1

    def unindex_object(self, docid, _old=_marker):
        """ hook for (Z)Catalog """

        if _old is not _marker:
            old_value = _old
        else:
            old_value = self._unindex.get(docid, _marker)
            if old_value is _marker:
                logger.log(logging.INFO,
                           'Attempt to unindex nonexistent object with id '
                           '%s' % docid)
                return

        # There is an assumption that paths start with /
        comps = [p for p in old_value.split('/') if p]

        def unindex(comp, level, docid=docid):
            index_comp = self._index[comp]
            index_comp[level].remove(docid)
            if not index_comp[level]:
                del index_comp[level]
            if not index_comp:
                del self._index[comp]

        try:
            for level, comp in enumerate(comps):
                unindex(comp, level)

            # Remove the terminator
            unindex(None, len(comps) - 1)

            # Remove full-path indexes
            parent_path = '/' + '/'.join(comps[:-1])
            parents = self._index_parents.get(parent_path, _marker)
            if parents is not _marker:
                parents.remove(docid)
                if not parents:
                    del self._index_parents[parent_path]
            del self._index_items['/'.join([parent_path, comps[-1]])]
        except KeyError:
            logger.log(logging.INFO,
                       'Attempt to unindex object with id '
                       '%s failed' % docid)

        self._length.change(-1)
        del self._unindex[docid]

    def search(self, path, default_level=0, depth=-1, navtree=0,
               navtree_start=0, resultset=None):
        """
        path is either a string representing a relative URL or a part of a
        relative URL or a tuple (path, level).

        default_level specifies the level to use when no more specific level
        has been passed in with the path.

        level >= 0  starts searching at the given level
        level <  0  finds matches at *any* level

        depth lets you limit the results to items at most depth levels deeper
        than the matched path. depth == 0 means no subitems are included at
        all, with depth == 1 only direct children are included, etc.
        depth == -1, the default, returns all children at any depth.

        navtree is treated as a boolean; if it evaluates to True, not only the
        query match is returned, but also each container in the path. If depth
        is greater than 0, all siblings of those containers and the siblings
        of the match are included as well, plus *all* documents at
        the starting level.

        navtree_start limits what containers are included in a navtree search.
        If greater than 0, only containers (and possibly their siblings) at
        that level and up will be included in the resultset.

        """
        if isinstance(path, string_types):
            level = default_level
        else:
            level = int(path[1])
            path = path[0]

        if level < 0:
            # Search at every level, return the union of all results
            return multiunion(
                [self.search(path, level, depth, navtree, navtree_start)
                 for level in range(self._depth + 1)])

        comps = [p for p in path.split('/') if p]

        if navtree and depth == -1:  # Navtrees don't do recursive
            depth = 1

        # Optimizations

        pathlength = level + len(comps) - 1
        if navtree and navtree_start > min(pathlength + depth, self._depth):
            # This navtree_start excludes all items that match the depth
            return IISet()

        if level == 0 and depth in (0, 1):
            # We have easy indexes for absolute paths where
            # we are looking for depth 0 or 1 result sets
            if navtree:
                # Optimized absolute path navtree and breadcrumbs cases
                result = []
                add = lambda x: x is not None and result.append(x)
                if depth == 1 and not self.multi_valued:
                    # Navtree case, all sibling elements along the path
                    convert = multiunion
                    index = self._index_parents
                else:
                    # Breadcrumbs case, all direct elements along the path
                    convert = IISet
                    index = self._index_items
                # Collect all results along the path
                for i in range(len(comps), navtree_start - 1, -1):
                    parent_path = '/' + '/'.join(comps[:i])
                    add(index.get(parent_path))
                return convert(result)

            if not path.startswith('/'):
                path = '/' + path
            if depth == 0 and not self.multi_valued:
                # Specific object search
                res = self._index_items.get(path)
                return res and IISet([res]) or IISet()
            else:
                # Single depth search
                return self._index_parents.get(path, IISet())

        # Avoid using the root set
        # as it is common for all objects anyway and adds overhead
        # There is an assumption about all indexed values having the
        # same common base path
        if level == 0:
            indexpath = [p for p in self.getPhysicalPath() if p]
            minlength = min(len(indexpath), len(comps))
            # Truncate path to first different element
            for i in range(minlength):
                if indexpath[i] != comps[i]:
                    break
                level += 1
            comps = comps[level:]

        if not comps and depth == -1:
            # Recursive search for everything
            return IISet(self._unindex)

        # Core application of the indexes
        pathset = None
        depthset = None  # For limiting depth

        if navtree and depth > 0:
            # Include the elements up to the matching path
            depthset = multiunion([
                self._index.get(None, {}).get(i, IISet())
                for i in range(min(navtree_start, level),
                               max(navtree_start, level) + 1)])

        indexedcomps = enumerate(comps)
        if not navtree:
            # Optimize relative-path searches by starting with the
            # presumed smaller sets at the end of the path first
            # We can't do this for the navtree case because it needs
            # the bigger rootset to include siblings along the way.
            indexedcomps = list(indexedcomps)
            indexedcomps.reverse()

        for i, comp in indexedcomps:
            # Find all paths that have comp at the given level
            res = self._index.get(comp, {}).get(i + level)
            if res is None:
                # Non-existing path; navtree is inverse, keep going
                pathset = IISet()
                if not navtree:
                    return pathset
            pathset = intersection(pathset, res)

            if navtree and i + level >= navtree_start:
                depthset = union(depthset, intersection(pathset,
                    self._index.get(None, {}).get(i + level)))

        if depth >= 0:
            # Limit results to those that terminate within depth levels
            start = len(comps) - 1
            if navtree:
                start = max(start, (navtree_start - level))
            depthset = [depthset] + [
                intersection(pathset, self._index.get(None, {}).get(i + level))
                for i in range(start, start + depth + 1)
            ]
            depthset = multiunion([d for d in depthset if d])

        if navtree or depth >= 0:
            return depthset
        return pathset

    def _apply_index(self, request, resultset=None):
        """ hook for (Z)Catalog
            'request' -- mapping type (usually {"path": "..."})
             additionally a parameter "path_level" might be passed
             to specify the level (see search())
        """
        record = IndexQuery(request, self.id, self.query_options)
        if record.keys is None:
            return None
        return (self.query_index(record), (self.id, ))

    def query_index(self, record, resultset=None):
        level = record.get("level", 0)
        operator = record.get('operator', self.useOperator).lower()
        depth = getattr(record, 'depth', -1)  # use getattr to get 0 value
        navtree = record.get('navtree', 0)
        navtree_start = record.get('navtree_start', 0)

        # depending on the operator we use intersection or union
        if operator == "or":
            set_func = union
        else:
            set_func = intersection

        result = None
        for k in record.keys:
            rows = self.search(k, level, depth, navtree, navtree_start,
                               resultset=resultset)
            result = set_func(result, rows)

        if result:
            return result
        return IISet()

    def getIndexSourceNames(self):
        """ return names of indexed attributes """
        attrs = self.indexed_attrs or ('getPhysicalPath', )
        return tuple(attrs)
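
A minimal sketch (not part of the example above) of the component/level
structure that search() intersects: for every indexed path, each component
is recorded under its level, and a query is the intersection of the
per-component sets. Plain dicts and sets stand in for the BTrees; the paths
and docids are invented for illustration.

paths = {1: '/site/folder/doc', 2: '/site/folder/other', 3: '/site/news/doc'}

index = {}  # comp -> {level -> set(docids)}, mirroring self._index above
for docid, path in paths.items():
    comps = [p for p in path.split('/') if p]
    for level, comp in enumerate(comps):
        index.setdefault(comp, {}).setdefault(level, set()).add(docid)

def sketch_search(path, level=0):
    # Intersect the per-component sets, as the loop over indexedcomps does
    comps = [p for p in path.split('/') if p]
    result = None
    for i, comp in enumerate(comps):
        res = index.get(comp, {}).get(i + level, set())
        result = res if result is None else (result & res)
    return result or set()

print(sketch_search('/site/folder'))         # {1, 2}: everything under /site/folder
print(sketch_search('folder/doc', level=1))  # {1}: relative path starting at level 1
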
Esempio n. 25
0
class Lexicon(Persistent):
    """
    Implementation of :class:`zope.index.text.interfaces.ILexicon`.
    """

    def __init__(self, *pipeline):
        self._wids = OIBTree()  # word -> wid
        self._words = IOBTree() # wid -> word
        # wid 0 is reserved for words that aren't in the lexicon (OOV -- out
        # of vocabulary).  This can happen, e.g., if a query contains a word
        # we never saw before, and that isn't a known stopword (or otherwise
        # filtered out).  Returning a special wid value for OOV words is a
        # way to let clients know when an OOV word appears.
        self.wordCount = Length()
        self._pipeline = pipeline

    def wordCount(self):
        """Return the number of unique terms in the lexicon."""
        # overridden per instance
        return len(self._wids)

    def words(self):
        return self._wids.keys()

    def wids(self):
        return self._words.keys()

    def items(self):
        return self._wids.items()

    def sourceToWordIds(self, text):
        if text is None:
            text = ''
        last = _text2list(text)
        for element in self._pipeline:
            last = element.process(last)
        if not isinstance(self.wordCount, Length):
            # Make sure wordCount is overridden with a BTrees.Length.Length
            self.wordCount = Length(self.wordCount())
        # Strategically unload the length value so that we get the most
        # recent value written to the database to minimize conflicting wids
        # Because length is independent, this will load the most
        # recent value stored, regardless of whether MVCC is enabled
        self.wordCount._p_deactivate()
        return list(map(self._getWordIdCreate, last))

    def termToWordIds(self, text):
        last = _text2list(text)
        for element in self._pipeline:
            last = element.process(last)
        wids = []
        for word in last:
            wids.append(self._wids.get(word, 0))
        return wids

    def parseTerms(self, text):
        last = _text2list(text)
        for element in self._pipeline:
            process = getattr(element, "processGlob", element.process)
            last = process(last)
        return last

    def isGlob(self, word):
        return "*" in word or "?" in word

    def get_word(self, wid):
        return self._words[wid]

    def get_wid(self, word):
        return self._wids.get(word, 0)

    def globToWordIds(self, pattern):
        # Implement * and ? just as in the shell, except the pattern
        # must not start with either of these
        prefix = ""
        while pattern and pattern[0] not in "*?":
            prefix += pattern[0]
            pattern = pattern[1:]
        if not pattern:
            # There were no globbing characters in the pattern
            wid = self._wids.get(prefix, 0)
            if wid:
                return [wid]
            else:
                return []
        if not prefix:
            # The pattern starts with a globbing character.
            # This would be too inefficient (it would require scanning the
            # whole lexicon), so we raise an exception.
            raise QueryError(
                "pattern %r shouldn't start with glob character" % pattern)
        pat = prefix
        for c in pattern:
            if c == "*":
                pat += ".*"
            elif c == "?":
                pat += "."
            else:
                pat += re.escape(c)
        pat += "$"
        prog = re.compile(pat)
        keys = self._wids.keys(prefix) # Keys starting at prefix
        wids = []
        for key in keys:
            if not key.startswith(prefix):
                break
            if prog.match(key):
                wids.append(self._wids[key])
        return wids

    def _getWordIdCreate(self, word):
        wid = self._wids.get(word)
        if wid is None:
            wid = self._new_wid()
            self._wids[word] = wid
            self._words[wid] = word
        return wid

    def _new_wid(self):
        count = self.wordCount
        count.change(1)
        while count() in self._words:
            # just to be safe
            count.change(1)
        return count()
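
A minimal usage sketch of the Lexicon above. It assumes the pipeline
elements Splitter and CaseNormalizer from zope.index.text.lexicon are
available, and that the module-level helpers (_text2list, Length) used by
the class are importable; the sample strings are invented.

lexicon = Lexicon(Splitter(), CaseNormalizer())
wids = lexicon.sourceToWordIds('The quick brown Fox')  # indexes words, assigns wids
print(lexicon.termToWordIds('fox'))       # known word -> [its wid]
print(lexicon.termToWordIds('unicorn'))   # unknown word -> [0], the OOV marker
print(lexicon.globToWordIds('fo*'))       # wids of all indexed words starting with 'fo'
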
Esempio n. 26
0
class DocumentMap(Persistent):
    """ A two-way map between addresses (e.g. location paths) and document ids.

    The map is a persistent object meant to live in a ZODB storage.

    Additionally, the map is capable of mapping 'metadata' to docids.
    """
    _v_nextid = None
    family = BTrees.family32
    _randrange = random.randrange
    docid_to_metadata = None # latch for b/c

    def __init__(self):
        self.docid_to_address = IOBTree()
        self.address_to_docid = OIBTree()
        self.docid_to_metadata = IOBTree()

    def docid_for_address(self, address):
        """ Retrieve a document id for a given address.

        ``address`` is a string or other hashable object which represents
        a token known by the application.

        Return the integer document id corresponding to ``address``.

        If ``address`` doesn't exist in the document map, return None.
        """
        return self.address_to_docid.get(address)

    def address_for_docid(self, docid):
        """ Retrieve an address for a given document id.

        ``docid`` is an integer document id.

        Return the address corresponding to ``docid``.

        If ``docid`` doesn't exist in the document map, return None.
        """
        return self.docid_to_address.get(docid)

    def add(self, address, docid=_marker):
        """ Add a new document to the document map.

        ``address`` is a string or other hashable object which represents
        a token known by the application.

        ``docid``, if passed, must be an int.  In this case, remove
        any previous address stored for it before mapping it to the
        new address.  Passing an explicit ``docid`` also removes any
        metadata associated with that docid.
        
        If ``docid`` is not passed, generate a new docid.

        Return the integer document id mapped to ``address``.
        """
        if docid is _marker:
            docid = self.new_docid()

        self.remove_docid(docid)
        self.remove_address(address)

        self.docid_to_address[docid] = address
        self.address_to_docid[address] = docid
        return docid

    def remove_docid(self, docid):
        """ Remove a document from the document map for the given document ID.

        ``docid`` is an integer document id.

        Remove any corresponding metadata for ``docid`` as well.

        Return True if ``docid`` existed in the map, else return False.
        """
        # It should be an invariant that if one entry exists in
        # docid_to_address for a docid/address pair, exactly one
        # corresponding entry exists in address_to_docid for the same
        # docid/address pair.  However, versions of this code before
        # r.catalog 0.7.3 had a bug which, if this method was called
        # multiple times, each time with the same address but a
        # different docid, the ``docid_to_address`` mapping could
        # contain multiple entries for the same address each with a
        # different docid, causing this invariant to be violated.  The
        # symptom: in systems that used r.catalog 0.7.2 and lower,
        # there might be more entries in docid_to_address than there
        # are in address_to_docid.  The conditional fuzziness in the
        # code directly below is a runtime kindness to systems in that
        # state.  Technically, the administrator of a system in such a
        # state should normalize the two data structures by running a
        # script after upgrading to 0.7.3.  If we made the admin do
        # this, some of the code fuzziness below could go away,
        # replaced with something simpler.  But there's no sense in
        # breaking systems at runtime through being a hardass about
        # consistency if an unsuspecting upgrader has not yet run the
        # data fixer script. The "fix the data" mantra rings a
        # little hollow when you weren't the one who broke the data in
        # the first place ;-)

        self._check_metadata()

        address = self.docid_to_address.get(docid, _marker)
        if address is _marker:
            return False
        
        old_docid = self.address_to_docid.get(address, _marker)
        if (old_docid is not _marker) and (old_docid != docid):
            self.remove_docid(old_docid)

        if docid in self.docid_to_address:
            del self.docid_to_address[docid]
        if address in self.address_to_docid:
            del self.address_to_docid[address]
        if docid in self.docid_to_metadata:
            del self.docid_to_metadata[docid]

        return True


    def remove_address(self, address):
        """ Remove a document from the document map using an address.

        ``address`` is a string or other hashable object which represents
        a token known by the application.

        Remove any corresponding metadata for ``address`` as well.

        Return True if ``address`` existed in the map, else return False.
        """
        # See the comment in remove_docid for complexity rationalization
        
        self._check_metadata()

        docid = self.address_to_docid.get(address, _marker)
        if docid is _marker:
            return False
        
        old_address = self.docid_to_address.get(docid, _marker)
        if (old_address is not _marker) and (old_address != address):
            self.remove_address(old_address)

        if docid in self.docid_to_address:
            del self.docid_to_address[docid]
        if address in self.address_to_docid:
            del self.address_to_docid[address]
        if docid in self.docid_to_metadata:
            del self.docid_to_metadata[docid]

        return True

    def _check_metadata(self):
        # backwards compatibility
        if self.docid_to_metadata is None:
            self.docid_to_metadata = IOBTree()

    def add_metadata(self, docid, data):
        """ Add metadata related to a given document id.

        ``data`` must be a mapping, such as a dictionary.
        
        For each key/value pair in ``data`` insert a metadata key/value pair
        into the metadata stored for ``docid``.

        Overwrite any existing values for the keys in ``data``, leaving values
        unchanged for other existing keys.

        Raise a KeyError if ``docid`` doesn't relate to an address in the
        document map.
        """
        if not docid in self.docid_to_address:
            raise KeyError(docid)
        if len(data.keys()) == 0:
            return
        self._check_metadata()
        meta = self.docid_to_metadata.setdefault(docid, OOBTree())
        for k in data:
            meta[k] = data[k]

    def remove_metadata(self, docid, *keys):
        """ Remove metadata related to a given document id.

        If ``docid`` doesn't exist in the metadata map, raise a KeyError.

        For each key in ``keys``, remove the metadata value for the
        docid related to that key.
        
        Do not raise any error if no value exists for a given key.

        If no keys are specified, remove all metadata related to the docid.
        """
        self._check_metadata()
        if keys:
            meta = self.docid_to_metadata.get(docid, _marker)
            if meta is _marker:
                raise KeyError(docid)
            for k in keys:
                if k in meta:
                    del meta[k]
            if not meta:
                del self.docid_to_metadata[docid]
        else:
            if not (docid in self.docid_to_metadata):
                raise KeyError(docid)
            del self.docid_to_metadata[docid]

    def get_metadata(self, docid):
        """ Return the metadata for ``docid``.

        Return a mapping of the keys and values set using ``add_metadata``.

        Raise a KeyError if metadata does not exist for ``docid``.
        """
        if self.docid_to_metadata is None:
            raise KeyError(docid)
        meta = self.docid_to_metadata[docid]
        return meta

    def new_docid(self):
        """ Return a new document id.

        The returned value is guaranteed not to be used already in this
        document map.
        """
        while True:
            if self._v_nextid is None:
                self._v_nextid = self._randrange(self.family.minint,
                                                 self.family.maxint)
            uid = self._v_nextid
            self._v_nextid += 1
            if uid not in self.docid_to_address:
                return uid
            self._v_nextid = None
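
A minimal usage sketch of the DocumentMap above (it only requires the
BTrees and persistent packages the class already imports); the address
'/site/doc1' and the metadata are invented for illustration.

docmap = DocumentMap()
docid = docmap.add('/site/doc1')                 # let the map generate a docid
docmap.add_metadata(docid, {'title': 'Doc One'})

assert docmap.docid_for_address('/site/doc1') == docid
assert docmap.address_for_docid(docid) == '/site/doc1'
assert docmap.get_metadata(docid)['title'] == 'Doc One'

docmap.remove_docid(docid)                       # also discards the metadata
assert docmap.docid_for_address('/site/doc1') is None
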
Esempio n. 27
0
class UUIDIndex(UnIndex):
    """Index for uuid fields with an unique value per key.

    The internal structure is:

    self._index = {datum: documentId}
    self._unindex = {documentId: datum}

    For each datum only one documentId can exist.
    """

    meta_type = "UUIDIndex"

    manage_options = (
        {
            'label': 'Settings',
            'action': 'manage_main'
        },
        {
            'label': 'Browse',
            'action': 'manage_browse'
        },
    )

    query_options = ["query", "range"]

    manage = manage_main = DTMLFile('dtml/manageUUIDIndex', globals())
    manage_main._setName('manage_main')
    manage_browse = DTMLFile('../dtml/browseIndex', globals())

    def clear(self):
        self._length = Length()
        self._index = OIBTree()
        self._unindex = IOBTree()

    def numObjects(self):
        """Return the number of indexed objects. Since we have a 1:1 mapping
        from documents to values, we can reuse the stored length.
        """
        return self.indexSize()

    def uniqueValues(self, name=None, withLengths=0):
        """returns the unique values for name

        if withLengths is true, returns a sequence of
        tuples of (value, length)
        """
        if name is None:
            name = self.id
        elif name != self.id:
            return []

        if not withLengths:
            return tuple(self._index.keys())
        # We know the length for each value is one
        return [(k, 1) for k in self._index.keys()]

    def insertForwardIndexEntry(self, entry, documentId):
        """Take the entry provided and put it in the correct place
        in the forward index.
        """
        if entry is None:
            return

        old_docid = self._index.get(entry, _marker)
        if old_docid is _marker:
            self._index[entry] = documentId
            self._length.change(1)
        elif old_docid != documentId:
            logger.error("A different document with value '%s' already "
                         "exists in the index.'" % entry)

    def removeForwardIndexEntry(self, entry, documentId):
        """Take the entry provided and remove any reference to documentId
        in its entry in the index.
        """
        old_docid = self._index.get(entry, _marker)
        if old_docid is not _marker:
            del self._index[entry]
            self._length.change(-1)

    def _get_object_datum(self, obj, attr):
        # for a uuid it never makes sense to acquire a parent value via
        # Acquisition
        has_attr = getattr(aq_base(obj), attr, _marker)
        if has_attr is _marker:
            return _marker
        return super(UUIDIndex, self)._get_object_datum(obj, attr)
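
A minimal sketch (plain dicts instead of the OIBTree/IOBTree pair) of the
one-to-one mapping the UUIDIndex maintains; the docids and UUID strings are
invented. The real index logs an error on a duplicate value instead of
raising, as insertForwardIndexEntry above shows.

index, unindex = {}, {}   # datum -> documentId, documentId -> datum

def index_doc(docid, uuid):
    old = unindex.get(docid)
    if old is not None:
        index.pop(old, None)          # drop the previous forward entry first
    if uuid in index and index[uuid] != docid:
        raise ValueError('value %r already indexed for another document' % uuid)
    index[uuid] = docid
    unindex[docid] = uuid

index_doc(1, 'abc-123')
index_doc(2, 'def-456')
assert index['abc-123'] == 1 and unindex[2] == 'def-456'
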
Esempio n. 28
0
class Lexicon(Persistent):

    implements(ILexicon)

    def __init__(self, *pipeline):
        self._wids = OIBTree()  # word -> wid
        self._words = IOBTree() # wid -> word
        # wid 0 is reserved for words that aren't in the lexicon (OOV -- out
        # of vocabulary).  This can happen, e.g., if a query contains a word
        # we never saw before, and that isn't a known stopword (or otherwise
        # filtered out).  Returning a special wid value for OOV words is a
        # way to let clients know when an OOV word appears.
        self._nextwid = 1
        self._pipeline = pipeline

        # Keep some statistics about indexing
        self._nbytes = 0 # Number of bytes indexed (at start of pipeline)
        self._nwords = 0 # Number of words indexed (after pipeline)

    def wordCount(self):
        """Return the number of unique terms in the lexicon."""
        return self._nextwid - 1

    def words(self):
        return self._wids.keys()

    def wids(self):
        return self._words.keys()

    def items(self):
        return self._wids.items()

    def sourceToWordIds(self, text):
        last = _text2list(text)
        for t in last:
            self._nbytes += len(t)
        for element in self._pipeline:
            last = element.process(last)
        self._nwords += len(last)
        return map(self._getWordIdCreate, last)

    def termToWordIds(self, text):
        last = _text2list(text)
        for element in self._pipeline:
            last = element.process(last)
        wids = []
        for word in last:
            wids.append(self._wids.get(word, 0))
        return wids

    def parseTerms(self, text):
        last = _text2list(text)
        for element in self._pipeline:
            process = getattr(element, "processGlob", element.process)
            last = process(last)
        return last

    def isGlob(self, word):
        return "*" in word or "?" in word

    def get_word(self, wid):
        return self._words[wid]

    def get_wid(self, word):
        return self._wids.get(word, 0)

    def globToWordIds(self, pattern):
        # Implement * and ? just as in the shell, except the pattern
        # must not start with either of these
        prefix = ""
        while pattern and pattern[0] not in "*?":
            prefix += pattern[0]
            pattern = pattern[1:]
        if not pattern:
            # There were no globbing characters in the pattern
            wid = self._wids.get(prefix, 0)
            if wid:
                return [wid]
            else:
                return []
        if not prefix:
            # The pattern starts with a globbing character.
            # This would be too inefficient (it would require scanning the
            # whole lexicon), so we raise an exception.
            raise QueryError(
                "pattern %r shouldn't start with glob character" % pattern)
        pat = prefix
        for c in pattern:
            if c == "*":
                pat += ".*"
            elif c == "?":
                pat += "."
            else:
                pat += re.escape(c)
        pat += "$"
        prog = re.compile(pat)
        keys = self._wids.keys(prefix) # Keys starting at prefix
        wids = []
        for key in keys:
            if not key.startswith(prefix):
                break
            if prog.match(key):
                wids.append(self._wids[key])
        return wids

    def _getWordIdCreate(self, word):
        wid = self._wids.get(word)
        if wid is None:
            wid = self._new_wid()
            self._wids[word] = wid
            self._words[wid] = word
        return wid

    def _new_wid(self):
        wid = self._nextwid
        self._nextwid += 1
        return wid
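
A minimal sketch of the glob-to-regex translation performed by
globToWordIds above: '*' becomes '.*', '?' becomes '.', other characters
are escaped, and the pattern is anchored at the end. The sample words are
invented.

import re

def glob_to_regex(prefix, rest):
    pat = prefix
    for c in rest:
        if c == '*':
            pat += '.*'
        elif c == '?':
            pat += '.'
        else:
            pat += re.escape(c)
    return re.compile(pat + '$')

prog = glob_to_regex('zop', '*')          # from the glob 'zop*'
assert prog.match('zope') and prog.match('zoplex')
assert not prog.match('zebra')
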
Esempio n. 29
0
class Catalog(Persistent, Acquisition.Implicit, ExtensionClass.Base):
    """ An Object Catalog

    An Object Catalog maintains a table of object metadata, and a
    series of manageable indexes to quickly search for objects
    (references in the metadata) that satisfy a search query.

    This class is not Zope specific, and can be used in any python
    program to build catalogs of objects.  Note that it does require
    the objects to be Persistent, and thus must be used with ZODB3.
    """

    _v_brains = NoBrainer

    def __init__(self, vocabulary=None, brains=None):
        # Catalogs no longer care about vocabularies and lexicons
        # so the vocabulary argument is ignored. (Casey)

        self.schema = {}  # mapping from attribute name to column number
        self.names = ()  # sequence of column names
        self.indexes = {}  # mapping from index name to index object

        # The catalog maintains a BTree of object meta_data for
        # convenient display on result pages.  meta_data attributes
        # are turned into brain objects and returned by
        # searchResults.  The indexing machinery indexes all records
        # by an integer id (rid). self.data is a mapping from the
        # integer id to the meta_data, self.uids is a mapping of the
        # object unique identifier to the rid, and self.paths is a
        # mapping of the rid to the unique identifier.

        self.clear()

        if brains is not None:
            self._v_brains = brains

        self.updateBrains()

    def __len__(self):
        return self._length()

    def clear(self):
        """ clear catalog """

        self.data = IOBTree()  # mapping of rid to meta_data
        self.uids = OIBTree()  # mapping of uid to rid
        self.paths = IOBTree()  # mapping of rid to uid
        self._length = BTrees.Length.Length()

        for index in self.indexes.keys():
            self.getIndex(index).clear()

    def updateBrains(self):
        self.useBrains(self._v_brains)

    def __getitem__(self, index, ttype=type(())):
        """
        Returns instances of self._v_brains, or whatever is passed
        into self.useBrains.
        """
        if type(index) is ttype:
            # then it contains a score...
            normalized_score, score, key = index
            r = self._v_result_class(self.data[key]).__of__(aq_parent(self))
            r.data_record_id_ = key
            r.data_record_score_ = score
            r.data_record_normalized_score_ = normalized_score
        else:
            # otherwise no score, set all scores to 1
            r = self._v_result_class(self.data[index]).__of__(aq_parent(self))
            r.data_record_id_ = index
            r.data_record_score_ = 1
            r.data_record_normalized_score_ = 1
        return r

    def __setstate__(self, state):
        """ initialize your brains.  This method is called when the
        catalog is first activated (from the persistent storage) """
        Persistent.__setstate__(self, state)
        self.updateBrains()

    def useBrains(self, brains):
        """ Sets up the Catalog to return an object (ala ZTables) that
        is created on the fly from the tuple stored in the self.data
        Btree.
        """
        class mybrains(AbstractCatalogBrain, brains):
            pass

        scopy = self.schema.copy()

        scopy['data_record_id_'] = len(self.schema.keys())
        scopy['data_record_score_'] = len(self.schema.keys()) + 1
        scopy['data_record_normalized_score_'] = len(self.schema.keys()) + 2

        mybrains.__record_schema__ = scopy

        self._v_brains = brains
        self._v_result_class = mybrains

    def addColumn(self, name, default_value=None):
        """
        adds a column to the meta data schema
        """

        schema = self.schema
        names = list(self.names)

        if name in schema:
            raise CatalogError('The column %s already exists' % name)

        if name[0] == '_':
            raise CatalogError('Cannot cache fields beginning with "_"')

        values = schema.values()
        if values:
            schema[name] = max(values) + 1
        else:
            schema[name] = 0
        names.append(name)

        if default_value in (None, ''):
            default_value = MV

        for key, value in self.data.items():
            rec = list(value)
            rec.append(default_value)
            self.data[key] = tuple(rec)

        self.names = tuple(names)
        self.schema = schema

        # new column? update the brain
        self.updateBrains()

        self._p_changed = 1  # why?

    def delColumn(self, name):
        """
        deletes a column from the meta data schema
        """
        names = list(self.names)
        _index = names.index(name)

        if not name in self.schema:
            LOG.error('delColumn attempted to delete nonexistent '
                      'column %s.' % str(name))
            return

        del names[_index]

        # rebuild the schema
        i = 0
        schema = {}
        for name in names:
            schema[name] = i
            i = i + 1

        self.schema = schema
        self.names = tuple(names)

        # update the brain
        self.updateBrains()

        # remove the column value from each record
        for key, value in self.data.items():
            rec = list(value)
            del rec[_index]
            self.data[key] = tuple(rec)

    def addIndex(self, name, index_type):
        """Create a new index, given a name and a index_type.

        Old format: index_type was a string such as 'FieldIndex', 'TextIndex'
        or 'KeywordIndex'; this is no longer valid. The actual index must be
        instantiated and passed in to addIndex.

        New format: index_type is the actual index object to be stored.
        """

        if name in self.indexes:
            raise CatalogError('The index %s already exists' % name)

        if name.startswith('_'):
            raise CatalogError('Cannot index fields beginning with "_"')

        if not name:
            raise CatalogError('Name of index is empty')

        indexes = self.indexes

        if isinstance(index_type, str):
            raise TypeError("Catalog addIndex now requires the index type to"
                            "be resolved prior to adding; create the proper "
                            "index in the caller.")

        indexes[name] = index_type
        self.indexes = indexes

    def delIndex(self, name):
        """ deletes an index """

        if not name in self.indexes:
            raise CatalogError('The index %s does not exist' % name)

        indexes = self.indexes
        del indexes[name]
        self.indexes = indexes

    def getIndex(self, name):
        """ get an index wrapped in the catalog """
        return self.indexes[name].__of__(self)

    def updateMetadata(self, object, uid):
        """ Given an object and a uid, update the column data for the
        uid with the object data iff the object has changed """
        data = self.data
        index = self.uids.get(uid, None)
        newDataRecord = self.recordify(object)

        if index is None:
            if type(data) is IOBTree:
                # New style, get random id

                index = getattr(self, '_v_nextid', 0)
                if index % 4000 == 0:
                    index = randint(-2000000000, 2000000000)
                while not data.insert(index, newDataRecord):
                    index = randint(-2000000000, 2000000000)

                # We want ids to be somewhat random, but there are
                # advantages for having some ids generated
                # sequentially when many catalog updates are done at
                # once, such as when reindexing or bulk indexing.
                # We allocate ids sequentially using a volatile base,
                # so different threads get different bases. This
                # further reduces conflict and reduces churn in
                # here and in result sets when bulk indexing.
                self._v_nextid = index + 1
            else:
                if data:
                    # find the next available unique id
                    index = data.keys()[-1] + 1
                else:
                    index = 0
                # meta_data is stored as a tuple for efficiency
                data[index] = newDataRecord
        else:
            if data.get(index, 0) != newDataRecord:
                data[index] = newDataRecord
        return index

    # the cataloging API

    def catalogObject(self,
                      object,
                      uid,
                      threshold=None,
                      idxs=None,
                      update_metadata=1):
        """
        Adds an object to the Catalog by iteratively applying it to
        all indexes.

        'object' is the object to be cataloged

        'uid' is the unique Catalog identifier for this object

        If 'idxs' is specified (as a sequence), apply the object only
        to the named indexes.

        If 'update_metadata' is true (the default), also update metadata for
        the object.  If the object is new to the catalog, this flag has
        no effect (metadata is always created for new objects).

        """

        if idxs is None:
            idxs = []

        index = self.uids.get(uid, None)

        if index is None:  # we are inserting new data
            index = self.updateMetadata(object, uid)
            self._length.change(1)
            self.uids[uid] = index
            self.paths[index] = uid

        elif update_metadata:  # we are updating and we need to update metadata
            self.updateMetadata(object, uid)

        # do indexing
        total = 0

        if idxs == []:
            use_indexes = self.indexes.keys()
        else:
            use_indexes = idxs

        for name in use_indexes:
            x = self.getIndex(name)
            if hasattr(x, 'index_object'):
                blah = x.index_object(index, object, threshold)
                total = total + blah
            else:
                LOG.error('catalogObject was passed bad index '
                          'object %s.' % str(x))

        return total

    def uncatalogObject(self, uid):
        """
        Uncatalog an object from the Catalog.  'uid' is a unique
        Catalog identifier

        Note, the uid must be the same as when the object was
        catalogued, otherwise it will not get removed from the catalog

        This method should not raise an exception if the uid cannot
        be found in the catalog.

        """
        data = self.data
        uids = self.uids
        paths = self.paths
        indexes = self.indexes.keys()
        rid = uids.get(uid, None)

        if rid is not None:
            for name in indexes:
                x = self.getIndex(name)
                if hasattr(x, 'unindex_object'):
                    x.unindex_object(rid)
            del data[rid]
            del paths[rid]
            del uids[uid]
            self._length.change(-1)

        else:
            LOG.error('uncatalogObject unsuccessfully '
                      'attempted to uncatalog an object '
                      'with a uid of %s. ' % str(uid))

    def uniqueValuesFor(self, name):
        """ return unique values for FieldIndex name """
        return self.getIndex(name).uniqueValues()

    def hasuid(self, uid):
        """ return the rid if catalog contains an object with uid """
        return self.uids.get(uid)

    def recordify(self, object):
        """ turns an object into a record tuple """
        record = []
        # the unique id is always the first element
        for x in self.names:
            attr = getattr(object, x, MV)
            if (attr is not MV and safe_callable(attr)):
                attr = attr()
            record.append(attr)
        return tuple(record)

    def instantiate(self, record):
        r = self._v_result_class(record[1])
        r.data_record_id_ = record[0]
        return r.__of__(self)

    def getMetadataForRID(self, rid):
        record = self.data[rid]
        result = {}
        for (key, pos) in self.schema.items():
            result[key] = record[pos]
        return result

    def getIndexDataForRID(self, rid):
        result = {}
        for name in self.indexes.keys():
            result[name] = self.getIndex(name).getEntryForObject(rid, "")
        return result

    # This is the Catalog search engine. Most of the heavy lifting happens
    # below.

    def make_query(self, request):
        # This is a bit of a mess, but the ZCatalog API has traditionally
        # supported passing in query restrictions in almost arbitrary ways
        real_req = None
        if isinstance(request, dict):
            query = request.copy()
        elif isinstance(request, CatalogSearchArgumentsMap):
            query = {}
            query.update(request.keywords)
            real_req = request.request
            if isinstance(real_req, dict):
                query.update(real_req)
                real_req = None
        else:
            real_req = request

        if real_req:
            warnings.warn('You have specified a query using either a request '
                          'object or a mixture of a query dict and keyword '
                          'arguments. Please use only a simple query dict. '
                          'Your query contained "%s". This support is '
                          'deprecated and will be removed in Zope 2.14.' %
                          repr(real_req),
                          DeprecationWarning,
                          stacklevel=4)

            known_keys = query.keys()
            # The request has too many places where an index restriction
            # might be specified. Putting all of request.form,
            # request.other, ... into the query isn't what we want.
            # So we iterate over all known indexes instead and see if they
            # are in the request.
            for iid in self.indexes.keys():
                if iid in known_keys:
                    continue
                value = real_req.get(iid)
                if value:
                    query[iid] = value
        return query

    def _sorted_search_indexes(self, query):
        # Simple implementation doing no ordering.
        query_keys = query.keys()
        order = []
        for name, index in self.indexes.items():
            if name not in query_keys:
                continue
            order.append((ILimitedResultIndex.providedBy(index), name))
        order.sort()
        return [i[1] for i in order]

    def _limit_sequence(self,
                        sequence,
                        slen,
                        b_start=0,
                        b_size=None,
                        switched_reverse=False):
        if b_size is not None:
            sequence = sequence[b_start:b_start + b_size]
            if slen:
                slen = len(sequence)
        if switched_reverse:
            sequence.reverse()
        return (sequence, slen)

    def search(self, query, sort_index=None, reverse=0, limit=None, merge=1):
        """Iterate through the indexes, applying the query to each one. If
        merge is true then return a lazy result set (sorted if appropriate)
        otherwise return the raw (possibly scored) results for later merging.
        Limit is used in conjunction with sorting or scored results to inform
        the catalog how many results you are really interested in. The catalog
        can then use optimizations to save time and memory. The number of
        results is not guaranteed to fall within the limit, however; you
        should still slice or batch the results as usual."""

        rs = None  # resultset

        # Indexes fulfill a fairly large contract here. We hand each
        # index the query mapping we are given (which may be composed
        # of some combination of web request, kw mappings or plain old dicts)
        # and the index decides what to do with it. If the index finds work
        # for itself in the query, it returns the results and a tuple of
        # the attributes that were used. If the index finds nothing for it
        # to do then it returns None.

        # Canonicalize the request into a sensible query before passing it on
        query = self.make_query(query)

        cr = self.getCatalogPlan(query)
        cr.start()

        plan = cr.plan()
        if not plan:
            plan = self._sorted_search_indexes(query)

        indexes = self.indexes.keys()
        for i in plan:
            if i not in indexes:
                # We can have bogus keys or the plan can contain index names
                # that have been removed in the meantime
                continue

            index = self.getIndex(i)
            _apply_index = getattr(index, "_apply_index", None)
            if _apply_index is None:
                continue

            cr.start_split(i)
            limit_result = ILimitedResultIndex.providedBy(index)
            if limit_result:
                r = _apply_index(query, rs)
            else:
                r = _apply_index(query)

            if r is not None:
                r, u = r
                # Short circuit if empty result
                # BBB: We can remove the "r is not None" check in Zope 2.14
                # once we don't need to support the "return everything" case
                # anymore
                if r is not None and not r:
                    cr.stop_split(i, result=None, limit=limit_result)
                    return LazyCat([])

                # provide detailed info about the pure intersection time
                intersect_id = i + '#intersection'
                cr.start_split(intersect_id)
                w, rs = weightedIntersection(rs, r)
                cr.stop_split(intersect_id)

                # consider the time it takes to intersect the index result with
                # the total resultset to be part of the index time
                cr.stop_split(i, result=r, limit=limit_result)
                if not rs:
                    break
            else:
                cr.stop_split(i, result=None, limit=limit_result)

        # Try to deduce the sort limit from batching arguments
        b_start = int(query.get('b_start', 0))
        b_size = query.get('b_size', None)
        if b_size is not None:
            b_size = int(b_size)

        if b_size is not None:
            limit = b_start + b_size
        elif limit and b_size is None:
            b_size = limit

        if rs is None:
            # None of the indexes found anything to do with the query
            # We take this to mean that the query was empty (an empty filter)
            # and so we return everything in the catalog
            warnings.warn('Your query %s produced no query restriction. '
                          'Currently the entire catalog content is returned. '
                          'In Zope 2.14 this will result in an empty LazyCat '
                          'to be returned.' % repr(make_key(self, query)),
                          DeprecationWarning,
                          stacklevel=3)

            rlen = len(self)
            if sort_index is None:
                sequence, slen = self._limit_sequence(self.data.items(), rlen,
                                                      b_start, b_size)
                result = LazyMap(self.instantiate,
                                 sequence,
                                 slen,
                                 actual_result_count=rlen)
            else:
                cr.start_split('sort_on')
                result = self.sortResults(self.data,
                                          sort_index,
                                          reverse,
                                          limit,
                                          merge,
                                          actual_result_count=rlen,
                                          b_start=b_start,
                                          b_size=b_size)
                cr.stop_split('sort_on', None)
        elif rs:
            # We got some results from the indexes.
            # Sort and convert to sequences.
            # XXX: The check for 'values' is really stupid since we call
            # items() and *not* values()
            rlen = len(rs)
            if sort_index is None and hasattr(rs, 'values'):
                # having a 'values' means we have a data structure with
                # scores.  Build a new result set, sort it by score, reverse
                # it, compute the normalized score, and Lazify it.

                if not merge:
                    # Don't bother to sort here, return a list of
                    # three tuples to be passed later to mergeResults
                    # note that data_record_normalized_score_ cannot be
                    # calculated and will always be 1 in this case
                    getitem = self.__getitem__
                    result = [(score, (1, score, rid), getitem)
                              for rid, score in rs.items()]
                else:
                    cr.start_split('sort_on')

                    rs = rs.byValue(0)  # sort it by score
                    max = float(rs[0][0])

                    # Here we define our getter function inline so that
                    # we can conveniently store the max value as a default arg
                    # and make the normalized score computation lazy
                    def getScoredResult(item, max=max, self=self):
                        """
                        Returns instances of self._v_brains, or whatever is
                        passed into self.useBrains.
                        """
                        score, key = item
                        r=self._v_result_class(self.data[key])\
                              .__of__(aq_parent(self))
                        r.data_record_id_ = key
                        r.data_record_score_ = score
                        r.data_record_normalized_score_ = int(100. * score /
                                                              max)
                        return r

                    sequence, slen = self._limit_sequence(
                        rs, rlen, b_start, b_size)
                    result = LazyMap(getScoredResult,
                                     sequence,
                                     slen,
                                     actual_result_count=rlen)
                    cr.stop_split('sort_on', None)

            elif sort_index is None and not hasattr(rs, 'values'):
                # no scores
                if hasattr(rs, 'keys'):
                    rs = rs.keys()
                sequence, slen = self._limit_sequence(rs, rlen, b_start,
                                                      b_size)
                result = LazyMap(self.__getitem__,
                                 sequence,
                                 slen,
                                 actual_result_count=rlen)
            else:
                # sort.  If there are scores, then this block is not
                # reached, therefore 'sort-on' does not happen in the
                # context of a text index query.  This should probably
                # sort by relevance first, then the 'sort-on' attribute.
                cr.start_split('sort_on')
                result = self.sortResults(rs,
                                          sort_index,
                                          reverse,
                                          limit,
                                          merge,
                                          actual_result_count=rlen,
                                          b_start=b_start,
                                          b_size=b_size)
                cr.stop_split('sort_on', None)
        else:
            # Empty result set
            result = LazyCat([])
        cr.stop()
        return result

    def sortResults(self,
                    rs,
                    sort_index,
                    reverse=0,
                    limit=None,
                    merge=1,
                    actual_result_count=None,
                    b_start=0,
                    b_size=None):
        # Sort a result set using a sort index. Return a lazy
        # result set in sorted order if merge is true otherwise
        # returns a list of (sortkey, uid, getter_function) tuples
        #
        # The two 'for' loops in here contribute a significant
        # proportion of the time to perform an indexed search.
        # Try to avoid all non-local attribute lookup inside
        # those loops.
        _intersection = intersection
        _self__getitem__ = self.__getitem__
        index_key_map = sort_index.documentToKeyMap()
        _None = None
        _keyerror = KeyError
        result = []
        append = result.append
        if hasattr(rs, 'keys'):
            rs = rs.keys()
        if actual_result_count is None:
            rlen = len(rs)
            actual_result_count = rlen
        else:
            rlen = actual_result_count

        # don't limit to more than what we have
        if limit is not None and limit >= rlen:
            limit = rlen

        # if we want a batch from the end of the resultset, reverse sorting
        # order and limit it, then reverse the resultset again
        switched_reverse = False
        if b_size and b_start and b_start > rlen / 2:
            reverse = not reverse
            switched_reverse = True
            b_end = b_start + b_size
            if b_end >= rlen:
                overrun = rlen - b_end
                if b_start >= rlen:
                    # bail out, we are outside the possible range
                    return LazyCat([], 0, actual_result_count)
                else:
                    b_size += overrun
                b_start = 0
            else:
                b_start = rlen - b_end
            limit = b_start + b_size

        if merge and limit is None and (rlen >
                                        (len(sort_index) * (rlen / 100 + 1))):
            # The result set is much larger than the sorted index,
            # so iterate over the sorted index for speed.
            # This is rarely exercised in practice...

            length = 0

            try:
                intersection(rs, IISet(()))
            except TypeError:
                # rs is not an object in the IIBTree family.
                # Try to turn rs into an IISet.
                rs = IISet(rs)

            for k, intset in sort_index.items():
                # We have an index that has a set of values for
                # each sort key, so we intersect with each set and
                # get a sorted sequence of the intersections.
                intset = _intersection(rs, intset)
                if intset:
                    keys = getattr(intset, 'keys', _None)
                    if keys is not _None:
                        # Is this ever true?
                        intset = keys()
                    length += len(intset)
                    append((k, intset, _self__getitem__))
                    # Note that sort keys are unique.

            if reverse:
                result.sort(reverse=True)
            else:
                result.sort()
            sequence, slen = self._limit_sequence(result, length, b_start,
                                                  b_size, switched_reverse)
            result = LazyCat(LazyValues(sequence), slen, actual_result_count)
        elif limit is None or (limit * 4 > rlen):
            # Iterate over the result set getting sort keys from the index
            for did in rs:
                try:
                    key = index_key_map[did]
                except _keyerror:
                    # This document is not in the sort key index, skip it.
                    pass
                else:
                    append((key, did, _self__getitem__))
                    # The reference back to __getitem__ is used in case
                    # we do not merge now and need to intermingle the
                    # results with those of other catalogs while avoiding
                    # the cost of instantiating a LazyMap per result
            if merge:
                if reverse:
                    result.sort(reverse=True)
                else:
                    result.sort()
                if limit is not None:
                    result = result[:limit]
                sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
                                                   switched_reverse)
                result = LazyValues(sequence)
                result.actual_result_count = actual_result_count
            else:
                sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
                                                   switched_reverse)
                return sequence
        elif reverse:
            # Limit/sort results using N-Best algorithm
            # This is faster for large sets than a full sort
            # and uses far less memory
            keys = []
            n = 0
            worst = None
            for did in rs:
                try:
                    key = index_key_map[did]
                except _keyerror:
                    # This document is not in the sort key index, skip it.
                    pass
                else:
                    if n >= limit and key <= worst:
                        continue
                    i = bisect(keys, key)
                    keys.insert(i, key)
                    result.insert(i, (key, did, _self__getitem__))
                    if n == limit:
                        del keys[0], result[0]
                    else:
                        n += 1
                    worst = keys[0]
            result.reverse()
            if merge:
                sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
                                                   switched_reverse)
                result = LazyValues(sequence)
                result.actual_result_count = actual_result_count
            else:
                sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
                                                   switched_reverse)
                return sequence
        elif not reverse:
            # Limit/sort results using N-Best algorithm in reverse (N-Worst?)
            keys = []
            n = 0
            best = None
            for did in rs:
                try:
                    key = index_key_map[did]
                except _keyerror:
                    # This document is not in the sort key index, skip it.
                    pass
                else:
                    if n >= limit and key >= best:
                        continue
                    i = bisect(keys, key)
                    keys.insert(i, key)
                    result.insert(i, (key, did, _self__getitem__))
                    if n == limit:
                        del keys[-1], result[-1]
                    else:
                        n += 1
                    best = keys[-1]
            if merge:
                sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
                                                   switched_reverse)
                result = LazyValues(sequence)
                result.actual_result_count = actual_result_count
            else:
                sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
                                                   switched_reverse)
                return sequence

        return LazyMap(self.__getitem__,
                       result,
                       len(result),
                       actual_result_count=actual_result_count)

    def _get_sort_attr(self, attr, kw):
        """Helper function to find sort-on or sort-order."""
        # There are three different ways to find the attribute:
        # 1. kw[sort-attr]
        # 2. self.sort-attr
        # 3. kw[sort_attr]
        # kw may be a dict or an ExtensionClass MultiMapping, which
        # differ in what get() returns with no default value.
        name = "sort-%s" % attr
        val = kw.get(name, None)
        if val is not None:
            return val
        val = getattr(self, name, None)
        if val is not None:
            return val
        return kw.get("sort_%s" % attr, None)

    def _getSortIndex(self, args):
        """Returns a search index object or None."""
        sort_index_name = self._get_sort_attr("on", args)
        if sort_index_name is not None:
            # self.indexes is always a dict, so get() w/ 1 arg works
            sort_index = self.indexes.get(sort_index_name)
            if sort_index is None:
                raise CatalogError('Unknown sort_on index (%s)' %
                                   sort_index_name)
            else:
                if not hasattr(sort_index, 'documentToKeyMap'):
                    raise CatalogError(
                        'The index chosen for sort_on (%s) is not capable of '
                        'being used as a sort index.' % sort_index_name)
            return sort_index
        else:
            return None

    def searchResults(self, REQUEST=None, used=None, _merge=1, **kw):
        # You should pass in a simple dictionary as the request argument,
        # which only contains the relevant query.
        # The used argument is deprecated and is ignored
        if REQUEST is None and not kw:
            # Try to acquire request if we get no args for bw compat
            warnings.warn(
                'Calling searchResults without a query argument nor '
                'keyword arguments is deprecated. In Zope 2.14 the '
                'query will no longer be automatically taken from '
                'the acquired request.',
                DeprecationWarning,
                stacklevel=3)
            REQUEST = getattr(self, 'REQUEST', None)
        if isinstance(REQUEST, dict) and not kw:
            # short cut for the best practice
            args = REQUEST
        else:
            args = CatalogSearchArgumentsMap(REQUEST, kw)
        sort_index = self._getSortIndex(args)
        sort_limit = self._get_sort_attr('limit', args)
        reverse = 0
        if sort_index is not None:
            order = self._get_sort_attr("order", args)
            if (isinstance(order, str)
                    and order.lower() in ('reverse', 'descending')):
                reverse = 1
        # Perform searches with indexes and sort_index
        return self.search(args, sort_index, reverse, sort_limit, _merge)

    __call__ = searchResults

    def getCatalogPlan(self, query=None):
        """Query time reporting and planning.
        """
        parent = aq_base(aq_parent(self))
        threshold = getattr(parent, 'long_query_time', 0.1)
        return CatalogPlan(self, query, threshold)
Example n. 30
class GlobbingLexicon(Lexicon):
    """Lexicon which supports basic globbing function ('*' and '?').

    This lexicon keeps several data structures around that are useful
    for searching. They are:

      '_lexicon' -- Contains the mapping from word => word_id

      '_inverseLex' -- Contains the mapping from word_id => word

      '_digrams' -- Contains a mapping from digram => word_id

    Before going further, it is necessary to understand what a digram is,
    as it is a core component of the structure of this lexicon.  A digram
    is a two-letter sequence in a word.  For example, the word 'zope'
    would be converted into the digrams::

      ['$z', 'zo', 'op', 'pe', 'e$']

    where the '$' is a word marker.  It is used at the beginning and end
    of the words.  Those digrams are significant.
    """

    multi_wc = '*'
    single_wc = '?'
    eow = '$'

    def __init__(self, useSplitter=None, extra=None):
        self.clear()
        self.useSplitter = useSplitter
        self.splitterParams = extra
        self.SplitterFunc = Splitter.getSplitter(self.useSplitter)

    def clear(self):
        self._lexicon = OIBTree()
        self._inverseLex = IOBTree()
        self._digrams = OOBTree()

    def _convertBTrees(self, threshold=200):
        Lexicon._convertBTrees(self, threshold)
        if type(self._digrams) is OOBTree: return

        from BTrees.convert import convert

        _digrams = self._digrams
        self._digrams = OOBTree()
        self._digrams._p_jar = self._p_jar
        convert(_digrams, self._digrams, threshold, IITreeSet)

    def createDigrams(self, word):
        """Returns a list with the set of digrams in the word."""

        word = '$' + word + '$'
        return [word[i:i + 2] for i in range(len(word) - 1)]
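
    # Hedged illustration: for the word 'zope' the method above returns
    # ['$z', 'zo', 'op', 'pe', 'e$'], matching the digram layout described
    # in the class docstring.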

    def getWordId(self, word):
        """Provided 'word', return the matching integer word id."""

        if self._lexicon.has_key(word):
            return self._lexicon[word]
        else:
            return self.assignWordId(word)

    set = getWordId  # Kludge for old code

    def getWord(self, wid):
        return self._inverseLex.get(wid, None)

    def assignWordId(self, word):
        """Assigns a new word id to the provided word, and return it."""

        # Double check it's not in the lexicon already, and if it is, just
        # return it.
        if self._lexicon.has_key(word):
            return self._lexicon[word]

        # Get word id. BBB Backward compat pain.
        inverse = self._inverseLex
        try:
            insert = inverse.insert
        except AttributeError:
            # we have an "old" BTree object
            if inverse:
                wid = inverse.keys()[-1] + 1
            else:
                self._inverseLex = IOBTree()
                wid = 1
            inverse[wid] = word
        else:
            # we have a "new" IOBTree object
            wid = randid()
            while not inverse.insert(wid, word):
                wid = randid()

        self._lexicon[word] = wid

        # Now take all the digrams and insert them into the digram map.
        for digram in self.createDigrams(word):
            set = self._digrams.get(digram, None)
            if set is None:
                self._digrams[digram] = set = IISet()
            set.insert(wid)

        return wid

    def get(self, pattern):
        """ Query the lexicon for words matching a pattern."""

        # A single-character pattern would produce a slicing problem below.
        # Because the splitter throws away single characters we can
        # return an empty tuple here.

        if len(pattern) == 1: return ()

        wc_set = [self.multi_wc, self.single_wc]

        digrams = []
        globbing = 0
        for i in range(len(pattern)):
            if pattern[i] in wc_set:
                globbing = 1
                continue

            if i == 0:
                digrams.insert(i, (self.eow + pattern[i]))
                digrams.append((pattern[i] + pattern[i + 1]))
            else:
                try:
                    if pattern[i + 1] not in wc_set:
                        digrams.append(pattern[i] + pattern[i + 1])

                except IndexError:
                    digrams.append((pattern[i] + self.eow))

        if not globbing:
            result = self._lexicon.get(pattern, None)
            if result is None:
                return ()
            return (result, )

        ## now get all of the intsets that contain the result digrams
        result = None
        for digram in digrams:
            result = union(result, self._digrams.get(digram, None))

        if not result:
            return ()
        else:
            ## now we have narrowed the list of possible candidates
            ## down to those words which contain digrams.  However,
            ## some words may have been returned that match digrams,
            ## but do not match 'pattern'.  This is because some words
            ## may contain all matching digrams, but in the wrong
            ## order.

            expr = re.compile(self.createRegex(pattern))
            words = []
            hits = IISet()
            for x in result:
                if expr.match(self._inverseLex[x]):
                    hits.insert(x)
            return hits

    def __getitem__(self, word):
        """ """
        return self.get(word)

    def query_hook(self, q):
        """expand wildcards"""
        ListType = type([])
        i = len(q) - 1
        while i >= 0:
            e = q[i]
            if isinstance(e, ListType):
                self.query_hook(e)
            elif isinstance(e, Op):
                pass
            elif ((self.multi_wc in e) or (self.single_wc in e)):
                wids = self.get(e)
                words = []
                for wid in wids:
                    if words:
                        words.append(Or)
                    words.append(wid)
                if not words:
                    # if words is empty, return something that will make
                    # textindex's __getitem__ return an empty result list
                    words.append('')
                q[i] = words
            i = i - 1

        return q

    def Splitter(self, astring, words=None, encoding="latin1"):
        """ wrap the splitter """

        ## Don't do anything fancy here; it is less efficient, but there
        ## is not much sense in stemming a globbing lexicon.

        try:
            return self.SplitterFunc(
                astring,
                words,
                encoding=encoding,
                singlechar=self.splitterParams.splitterSingleChars,
                indexnumbers=self.splitterParams.splitterIndexNumbers,
                casefolding=self.splitterParams.splitterCasefolding)
        except:
            return self.SplitterFunc(astring, words)

    def createRegex(self, pat):
        """Translate a PATTERN to a regular expression.

        There is no way to quote meta-characters.
        """

        # Remove characters that are meaningful in a regex
        if not isinstance(pat, UnicodeType):
            transTable = string.maketrans("", "")
            result = string.translate(pat, transTable, r'()&|!@#$%^{}\<>.')
        else:
            transTable = {}
            for ch in r'()&|!@#$%^{}\<>.':
                transTable[ord(ch)] = None
            result = pat.translate(transTable)

        # First, deal with multi-character globbing
        result = result.replace('*', '.*')

        # Next, we need to deal with single-character globbing
        result = result.replace('?', '.')

        return "%s$" % result
Example n. 31
class Catalog(Persistent, Acquisition.Implicit, ExtensionClass.Base):
    """ An Object Catalog

    An Object Catalog maintains a table of object metadata, and a
    series of manageable indexes to quickly search for objects
    (references in the metadata) that satisfy a search query.

    This class is not Zope specific, and can be used in any python
    program to build catalogs of objects.  Note that it does require
    the objects to be Persistent, and thus must be used with ZODB3.
    """

    _v_brains = NoBrainer

    def __init__(self, vocabulary=None, brains=None):
        # Catalogs no longer care about vocabularies and lexicons
        # so the vocabulary argument is ignored. (Casey)

        self.schema = {}   # mapping from attribute name to column number
        self.names = ()    # sequence of column names
        self.indexes = {}  # mapping from index name to index object

        # The catalog maintains a BTree of object meta_data for
        # convenient display on result pages.  meta_data attributes
        # are turned into brain objects and returned by
        # searchResults.  The indexing machinery indexes all records
        # by an integer id (rid). self.data is a mapping from the
        # integer id to the meta_data, self.uids is a mapping of the
        # object unique identifier to the rid, and self.paths is a
        # mapping of the rid to the unique identifier.

        self.clear()

        if brains is not None:
            self._v_brains = brains

        self.updateBrains()
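
    # Hedged illustration (hypothetical values): after cataloguing an object
    # under uid '/site/doc' with rid 42, the three mappings hold
    #   self.data[42]            -> tuple of metadata column values
    #   self.uids['/site/doc']   -> 42
    #   self.paths[42]           -> '/site/doc'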

    def __len__(self):
        return self._length()

    def clear(self):
        """ clear catalog """

        self.data = IOBTree()  # mapping of rid to meta_data
        self.uids = OIBTree()  # mapping of uid to rid
        self.paths = IOBTree()  # mapping of rid to uid
        self._length = BTrees.Length.Length()

        for index in self.indexes.keys():
            self.getIndex(index).clear()

    def updateBrains(self):
        self.useBrains(self._v_brains)

    def __getitem__(self, index):
        """
        Returns instances of self._v_brains, or whatever is passed
        into self.useBrains.
        """
        if isinstance(index, tuple):
            # then it contains a score...
            normalized_score, score, key = index
        else:
            # otherwise no score, set all scores to 1
            normalized_score, score, key = (1, 1, index)

        data = self.data[key]
        klass = self._v_result_class
        schema_len = len(klass.__record_schema__)
        if schema_len == len(data) + 3:
            # if we have complete data, create in a single pass
            r = klass(tuple(data) + (key, score, normalized_score))
        else:
            r = klass(data)
            r.data_record_id_ = key
            r.data_record_score_ = score
            r.data_record_normalized_score_ = normalized_score
        r = r.__of__(aq_parent(self))
        return r

    def __setstate__(self, state):
        """ initialize your brains.  This method is called when the
        catalog is first activated (from the persistent storage) """
        Persistent.__setstate__(self, state)
        self.updateBrains()

    def useBrains(self, brains):
        """ Sets up the Catalog to return an object (ala ZTables) that
        is created on the fly from the tuple stored in the self.data
        Btree.
        """

        class mybrains(AbstractCatalogBrain, brains):
            pass

        scopy = self.schema.copy()

        schema_len = len(self.schema.keys())
        scopy['data_record_id_'] = schema_len
        scopy['data_record_score_'] = schema_len + 1
        scopy['data_record_normalized_score_'] = schema_len + 2

        mybrains.__record_schema__ = scopy

        self._v_brains = brains
        self._v_result_class = mybrains

    def addColumn(self, name, default_value=None, threshold=10000):
        """Adds a row to the meta data schema"""
        schema = self.schema
        names = list(self.names)

        if name != name.strip():
            # Someone could have mistakenly added a space at the end
            # of the input field.
            LOG.warn("stripped space from new column %r -> %r", name,
                     name.strip())
            name = name.strip()

        if name in schema:
            raise CatalogError('The column %s already exists' % name)

        if name[0] == '_':
            raise CatalogError('Cannot cache fields beginning with "_"')

        values = schema.values()
        if values:
            schema[name] = max(values) + 1
        else:
            schema[name] = 0
        names.append(name)

        if default_value in (None, ''):
            default_value = MV

        if len(self):
            pghandler = ZLogHandler(threshold)
            pghandler.init('Adding %s column' % name, len(self))
            for i, (key, value) in enumerate(self.data.iteritems()):
                pghandler.report(i)
                self.data[key] = value + (default_value, )
            pghandler.finish()

        self.names = tuple(names)
        self.schema = schema

        # new column? update the brain
        self.updateBrains()

    def delColumn(self, name, threshold=10000):
        """Deletes a column from the meta data schema"""
        if name not in self.schema:
            LOG.error('delColumn attempted to delete nonexistent '
                      'column %s.' % str(name))
            return

        names = list(self.names)
        _index = names.index(name)
        del names[_index]

        # rebuild the schema
        schema = {}
        for i, name in enumerate(names):
            schema[name] = i

        self.schema = schema
        self.names = tuple(names)

        # update the brain
        self.updateBrains()

        # remove the column value from each record
        if len(self):
            _next_index = _index + 1
            pghandler = ZLogHandler(threshold)
            pghandler.init('Deleting %s column' % name, len(self))
            for i, (key, value) in enumerate(self.data.iteritems()):
                pghandler.report(i)
                self.data[key] = value[:_index] + value[_next_index:]
            pghandler.finish()

    def addIndex(self, name, index_type):
        """Create a new index, given a name and a index_type.

        Old format: index_type was a string such as 'FieldIndex', 'TextIndex'
        or 'KeywordIndex'. This is no longer valid; the actual index must be
        instantiated and passed in to addIndex.

        New format: index_type is the actual index object to be stored.
        """

        if name in self.indexes:
            raise CatalogError('The index %s already exists' % name)

        if name.startswith('_'):
            raise CatalogError('Cannot index fields beginning with "_"')

        if not name:
            raise CatalogError('Name of index is empty')

        if name != name.strip():
            # Someone could have mistakenly added a space at the end
            # of the input field.
            LOG.warn("stripped space from new index %r -> %r", name,
                     name.strip())
            name = name.strip()

        indexes = self.indexes

        if isinstance(index_type, str):
            raise TypeError("Catalog addIndex now requires the index type to"
                            "be resolved prior to adding; create the proper "
                            "index in the caller.")

        indexes[name] = index_type
        self.indexes = indexes

    def delIndex(self, name):
        """ deletes an index """

        if not name in self.indexes:
            raise CatalogError('The index %s does not exist' % name)

        indexes = self.indexes
        del indexes[name]
        self.indexes = indexes

    def getIndex(self, name):
        """ get an index wrapped in the catalog """
        return self.indexes[name].__of__(self)

    def updateMetadata(self, object, uid, index):
        """ Given an object and a uid, update the column data for the
        uid with the object data iff the object has changed """
        data = self.data
        newDataRecord = self.recordify(object)

        if index is None:
            index = getattr(self, '_v_nextid', 0)
            if index % 4000 == 0:
                index = randint(-2000000000, 2000000000)
            while not data.insert(index, newDataRecord):
                index = randint(-2000000000, 2000000000)

            # We want ids to be somewhat random, but there are
            # advantages for having some ids generated
            # sequentially when many catalog updates are done at
            # once, such as when reindexing or bulk indexing.
            # We allocate ids sequentially using a volatile base,
            # so different threads get different bases. This
            # further reduces conflict and reduces churn both here
            # and in result sets when bulk indexing.
            self._v_nextid = index + 1
        else:
            if data.get(index, 0) != newDataRecord:
                data[index] = newDataRecord
        return index

    # the cataloging API

    def catalogObject(self, object, uid, threshold=None, idxs=None,
                      update_metadata=True):
        """
        Adds an object to the Catalog by iteratively applying it to
        all indexes.

        'object' is the object to be cataloged

        'uid' is the unique Catalog identifier for this object

        If 'idxs' is specified (as a sequence), apply the object only
        to the named indexes.

        If 'update_metadata' is true (the default), also update metadata for
        the object.  If the object is new to the catalog, this flag has
        no effect (metadata is always created for new objects).
        """
        if idxs is None:
            idxs = []

        index = self.uids.get(uid, None)

        if index is None:
            # we are inserting new data
            index = self.updateMetadata(object, uid, None)
            self._length.change(1)
            self.uids[uid] = index
            self.paths[index] = uid
        elif update_metadata:
            # we are updating and we need to update metadata
            self.updateMetadata(object, uid, index)

        # do indexing
        total = 0

        if idxs == []:
            use_indexes = self.indexes.keys()
        else:
            use_indexes = idxs

        for name in use_indexes:
            x = self.getIndex(name)
            if hasattr(x, 'index_object'):
                blah = x.index_object(index, object, threshold)
                total = total + blah
            else:
                LOG.error('catalogObject was passed bad index '
                          'object %s.' % str(x))

        return total
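
    # Hedged usage sketch (the path is hypothetical):
    #   total = catalog.catalogObject(obj, '/site/folder/doc')
    # 'total' aggregates whatever each index's index_object() reported.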

    def uncatalogObject(self, uid):
        """
        Uncatalog an object from the Catalog.  'uid' is a unique
        Catalog identifier.

        Note that the uid must be the same as when the object was
        catalogued; otherwise it will not be removed from the catalog.

        This method should not raise an exception if the uid cannot
        be found in the catalog.
        """
        data = self.data
        uids = self.uids
        paths = self.paths
        indexes = self.indexes.keys()
        rid = uids.get(uid, None)

        if rid is not None:
            for name in indexes:
                x = self.getIndex(name)
                if hasattr(x, 'unindex_object'):
                    x.unindex_object(rid)
            del data[rid]
            del paths[rid]
            del uids[uid]
            self._length.change(-1)

        else:
            LOG.error('uncatalogObject unsuccessfully '
                      'attempted to uncatalog an object '
                      'with a uid of %s. ' % str(uid))

    def uniqueValuesFor(self, name):
        """ return unique values for FieldIndex name """
        return tuple(self.getIndex(name).uniqueValues())

    def hasuid(self, uid):
        """ return the rid if catalog contains an object with uid """
        return self.uids.get(uid)

    def recordify(self, object):
        """ turns an object into a record tuple """
        record = []
        # the unique id is always the first element
        for x in self.names:
            attr = getattr(object, x, MV)
            if (attr is not MV and safe_callable(attr)):
                attr = attr()
            record.append(attr)
        return tuple(record)

    def instantiate(self, record):
        r = self._v_result_class(record[1])
        r.data_record_id_ = record[0]
        return r.__of__(self)

    def getMetadataForRID(self, rid):
        record = self.data[rid]
        result = {}
        for (key, pos) in self.schema.items():
            result[key] = record[pos]
        return result

    def getIndexDataForRID(self, rid):
        result = {}
        for name in self.indexes.keys():
            result[name] = self.getIndex(name).getEntryForObject(rid, "")
        return result

    # This is the Catalog search engine. Most of the heavy lifting happens
    # below

    def make_query(self, request):
        # This is a bit of a mess, but the ZCatalog API has traditionally
        # supported passing in query restrictions in almost arbitrary ways
        real_req = None
        if isinstance(request, dict):
            query = request.copy()
        elif isinstance(request, CatalogSearchArgumentsMap):
            query = {}
            query.update(request.keywords)
            real_req = request.request
            if isinstance(real_req, dict):
                query.update(real_req)
                real_req = None
        else:
            real_req = request

        if real_req:
            warnings.warn('You have specified a query using either a request '
                          'object or a mixture of a query dict and keyword '
                          'arguments. Please use only a simple query dict. '
                          'Your query contained "%s". This support is '
                          'deprecated and will be removed in Zope 4.' %
                          repr(real_req), DeprecationWarning, stacklevel=4)

            known_keys = query.keys()
            # The request has too many places where an index restriction
            # might be specified. Putting all of request.form,
            # request.other, ... into the query isn't what we want.
            # So we iterate over all known indexes instead and see if they
            # are in the request.
            for iid in self.indexes.keys():
                if iid in known_keys:
                    continue
                value = real_req.get(iid)
                if value:
                    query[iid] = value
        return query
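
    # Hedged illustration: a plain dict passes through unchanged, while a
    # CatalogSearchArgumentsMap built from a dict request plus keywords is
    # flattened into one query dict (hypothetical values):
    #   make_query({'title': 'foo'})  ->  {'title': 'foo'}
    #   make_query(CatalogSearchArgumentsMap({'b_start': 0}, {'title': 'foo'}))
    #       ->  {'title': 'foo', 'b_start': 0}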

    def _get_index_query_names(self, index):
        if hasattr(index, 'getIndexQueryNames'):
            return index.getIndexQueryNames()
        return (index.getId(),)

    def _sorted_search_indexes(self, query):
        # Simple implementation ordering only by limited result support
        query_keys = query.keys()
        order = []
        for name, index in self.indexes.items():
            for attr in self._get_index_query_names(index):
                if attr in query_keys:
                    order.append((ILimitedResultIndex.providedBy(index), name))
        order.sort()
        return [i[1] for i in order]

    def _limit_sequence(self, sequence, slen, b_start=0, b_size=None,
                        switched_reverse=False):
        if b_size is not None:
            sequence = sequence[b_start:b_start + b_size]
            if slen:
                slen = len(sequence)
        if switched_reverse:
            sequence.reverse()
        return (sequence, slen)
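
    # Hedged illustration:
    #   _limit_sequence(list(range(20)), 20, b_start=10, b_size=5)
    # returns ([10, 11, 12, 13, 14], 5); with switched_reverse=True the same
    # batch comes back reversed.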

    def search(self,
            query, sort_index=None, reverse=False, limit=None, merge=True):
        """Iterate through the indexes, applying the query to each one. If
        merge is true then return a lazy result set (sorted if appropriate)
        otherwise return the raw (possibly scored) results for later merging.
        Limit is used in conjuntion with sorting or scored results to inform
        the catalog how many results you are really interested in. The catalog
        can then use optimizations to save time and memory. The number of
        results is not guaranteed to fall within the limit however, you should
        still slice or batch the results as usual."""

        # Indexes fulfill a fairly large contract here. We hand each
        # index the query mapping we are given (which may be composed
        # of some combination of web request, kw mappings or plain old dicts)
        # and the index decides what to do with it. If the index finds work
        # for itself in the query, it returns the results and a tuple of
        # the attributes that were used. If the index finds nothing for it
        # to do then it returns None.

        # Canonicalize the request into a sensible query before passing it on
        query = self.make_query(query)

        cr = self.getCatalogPlan(query)
        cr.start()

        plan = cr.plan()
        if not plan:
            plan = self._sorted_search_indexes(query)

        rs = None  # result set
        indexes = self.indexes.keys()
        for i in plan:
            if i not in indexes:
                # We can have bogus keys or the plan can contain index names
                # that have been removed in the meantime
                continue

            index = self.getIndex(i)
            _apply_index = getattr(index, "_apply_index", None)
            if _apply_index is None:
                continue

            cr.start_split(i)
            limit_result = ILimitedResultIndex.providedBy(index)
            if limit_result:
                r = _apply_index(query, rs)
            else:
                r = _apply_index(query)

            if r is not None:
                r, u = r
                # Short circuit if empty result
                # BBB: We can remove the "r is not None" check in Zope 4
                # once we don't need to support the "return everything" case
                # anymore
                if r is not None and not r:
                    cr.stop_split(i, result=None, limit=limit_result)
                    return LazyCat([])

                # provide detailed info about the pure intersection time
                intersect_id = i + '#intersection'
                cr.start_split(intersect_id)
                # weightedIntersection preserves the values from any mappings
                # we get, as some indexes don't return simple sets
                if hasattr(rs, 'items') or hasattr(r, 'items'):
                    _, rs = weightedIntersection(rs, r)
                else:
                    rs = intersection(rs, r)

                cr.stop_split(intersect_id)

                # consider the time it takes to intersect the index result
                # with the total result set to be part of the index time
                cr.stop_split(i, result=r, limit=limit_result)
                if not rs:
                    break
            else:
                cr.stop_split(i, result=None, limit=limit_result)

        # Try to deduce the sort limit from batching arguments
        b_start = int(query.get('b_start', 0))
        b_size = query.get('b_size', None)
        if b_size is not None:
            b_size = int(b_size)

        if b_size is not None:
            limit = b_start + b_size
        elif limit and b_size is None:
            b_size = limit
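        # Hedged example: b_start=20 and b_size=10 in the query yield an
        # effective sort limit of 30; an explicit limit of 50 with no b_size
        # implies b_size=50.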

        if sort_index is None:
            sort_report_name = None
        else:
            if isinstance(sort_index, list):
                sort_name = '-'.join(i.getId() for i in sort_index)
            else:
                sort_name = sort_index.getId()
            if isinstance(reverse, list):
                reverse_name = '-'.join(
                    'desc' if r else 'asc' for r in reverse)
            else:
                reverse_name = 'desc' if reverse else 'asc'
            sort_report_name = 'sort_on#' + sort_name + '#' + reverse_name
            if limit is not None:
                sort_report_name += '#limit-%s' % limit

        if rs is None:
            # None of the indexes found anything to do with the query
            # We take this to mean that the query was empty (an empty filter)
            # and so we return everything in the catalog
            warnings.warn('Your query %s produced no query restriction. '
                          'Currently the entire catalog content is returned. '
                          'In Zope 4 this will result in an empty LazyCat '
                          'to be returned.' % repr(cr.make_key(query)),
                          DeprecationWarning, stacklevel=3)

            rlen = len(self)
            if sort_index is None:
                sequence, slen = self._limit_sequence(self.data.items(), rlen,
                    b_start, b_size)
                result = LazyMap(self.instantiate, sequence, slen,
                    actual_result_count=rlen)
            else:
                cr.start_split(sort_report_name)
                result = self.sortResults(
                    self.data, sort_index, reverse, limit, merge,
                        actual_result_count=rlen, b_start=b_start,
                        b_size=b_size)
                cr.stop_split(sort_report_name, None)
        elif rs:
            # We got some results from the indexes.
            # Sort and convert to sequences.
            # XXX: The check for 'values' is really stupid since we call
            # items() and *not* values()
            rlen = len(rs)
            if sort_index is None and hasattr(rs, 'items'):
                # having a 'items' means we have a data structure with
                # scores.  Build a new result set, sort it by score, reverse
                # it, compute the normalized score, and Lazify it.

                if not merge:
                    # Don't bother to sort here, return a list of
                    # three tuples to be passed later to mergeResults
                    # note that data_record_normalized_score_ cannot be
                    # calculated and will always be 1 in this case
                    getitem = self.__getitem__
                    result = [(score, (1, score, rid), getitem)
                            for rid, score in rs.items()]
                else:
                    cr.start_split('sort_on#score')

                    # sort it by score
                    rs = rs.byValue(0)
                    max = float(rs[0][0])

                    # Here we define our getter function inline so that
                    # we can conveniently store the max value as a default arg
                    # and make the normalized score computation lazy
                    def getScoredResult(item, max=max, self=self):
                        """
                        Returns instances of self._v_brains, or whatever is
                        passed into self.useBrains.
                        """
                        score, key = item
                        data = self.data[key]
                        klass = self._v_result_class
                        schema_len = len(klass.__record_schema__)
                        norm_score = int(100.0 * score / max)
                        if schema_len == len(data) + 3:
                            r = klass(tuple(data) + (key, score, norm_score))
                        else:
                            r = klass(data)
                            r.data_record_id_ = key
                            r.data_record_score_ = score
                            r.data_record_normalized_score_ = norm_score
                        r = r.__of__(aq_parent(self))
                        return r

                    sequence, slen = self._limit_sequence(rs, rlen, b_start,
                        b_size)
                    result = LazyMap(getScoredResult, sequence, slen,
                        actual_result_count=rlen)
                    cr.stop_split('sort_on#score', None)

            elif sort_index is None and not hasattr(rs, 'values'):
                # no scores
                if hasattr(rs, 'keys'):
                    rs = rs.keys()
                sequence, slen = self._limit_sequence(rs, rlen, b_start,
                    b_size)
                result = LazyMap(self.__getitem__, sequence, slen,
                    actual_result_count=rlen)
            else:
                # sort.  If there are scores, then this block is not
                # reached, therefore 'sort-on' does not happen in the
                # context of a text index query.  This should probably
                # sort by relevance first, then the 'sort-on' attribute.
                cr.start_split(sort_report_name)
                result = self.sortResults(rs, sort_index, reverse, limit,
                    merge, actual_result_count=rlen, b_start=b_start,
                    b_size=b_size)
                cr.stop_split(sort_report_name, None)
        else:
            # Empty result set
            result = LazyCat([])
        cr.stop()
        return result

    def sortResults(self, rs, sort_index, reverse=False, limit=None,
            merge=True, actual_result_count=None, b_start=0, b_size=None):
        # Sort a result set using one or more sort indexes. Both sort_index
        # and reverse can be lists of indexes and reverse specifications.
        # Return a lazy result set in sorted order if merge is true;
        # otherwise return a list of (sortkey, uid, getter_function) tuples,
        # where sortkey can itself be a tuple.
        second_indexes = None
        second_indexes_key_map = None
        sort_index_length = 1
        if isinstance(sort_index, list):
            sort_index_length = len(sort_index)
            if sort_index_length > 1:
                second_indexes = sort_index[1:]
                second_indexes_key_map = []
                for si in second_indexes:
                    second_indexes_key_map.append(si.documentToKeyMap())
            sort_index = sort_index[0]
        _self__getitem__ = self.__getitem__
        index_key_map = sort_index.documentToKeyMap()
        result = []
        r_append = result.append
        r_insert = result.insert
        if hasattr(rs, 'keys'):
            rs = rs.keys()
        if actual_result_count is None:
            rlen = len(rs)
            actual_result_count = rlen
        else:
            rlen = actual_result_count

        # don't limit to more than what we have
        if limit is not None and limit >= rlen:
            limit = rlen

        # if we want a batch from the end of the result set, reverse sorting
        # order and limit it, then reverse the result set again
        switched_reverse = False
        if b_size and b_start and b_start > rlen / 2:
            if isinstance(reverse, list):
                reverse = [not r for r in reverse]
            else:
                reverse = not reverse
            switched_reverse = True
            b_end = b_start + b_size
            if b_end >= rlen:
                overrun = rlen - b_end
                if b_start >= rlen:
                    # bail out, we are outside the possible range
                    return LazyCat([], 0, actual_result_count)
                else:
                    b_size += overrun
                b_start = 0
            else:
                b_start = rlen - b_end
            limit = b_start + b_size
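        # Hedged worked example: with rlen=100, b_start=90, b_size=10 the
        # sort order is flipped and the batch becomes b_start=0, b_size=10,
        # limit=10; switched_reverse flips the final sequence back.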

        # determine sort_spec
        if isinstance(reverse, list):
            sort_spec = [r and -1 or 1 for r in reverse]
            # limit to current maximum of sort indexes
            sort_spec = sort_spec[:sort_index_length]
            # use first sort order for choosing the algorithm
            first_reverse = reverse[0]
        else:
            sort_spec = []
            for i in xrange(sort_index_length):
                sort_spec.append(reverse and -1 or 1)
            first_reverse = reverse

        if merge and limit is None and (
           rlen > (len(sort_index) * (rlen / 100 + 1))):
            # The result set is much larger than the sorted index,
            # so iterate over the sorted index for speed.
            # TODO: len(sort_index) isn't actually what we want for a keyword
            # index, as it's only the unique values, not the documents.
            # Don't use this case while using limit, as we return results of
            # non-flattened intsets, and would have to merge/unflattened those
            # before limiting.
            length = 0
            try:
                intersection(rs, IISet(()))
            except TypeError:
                # rs is not an object in the IIBTree family.
                # Try to turn rs into an IISet.
                rs = IISet(rs)

            if sort_index_length == 1:
                for k, intset in sort_index.items():
                    # We have an index that has a set of values for
                    # each sort key, so we intersect with each set and
                    # get a sorted sequence of the intersections.
                    intset = intersection(rs, intset)
                    if intset:
                        keys = getattr(intset, 'keys', None)
                        if keys is not None:
                            # Is this ever true?
                            intset = keys()
                        length += len(intset)
                        r_append((k, intset, _self__getitem__))
                result.sort(reverse=reverse)
            else:
                for k, intset in sort_index.items():
                    # We have an index that has a set of values for
                    # each sort key, so we intersect with each set and
                    # get a sorted sequence of the intersections.
                    intset = intersection(rs, intset)
                    if intset:
                        keys = getattr(intset, 'keys', None)
                        if keys is not None:
                            # Is this ever true?
                            intset = keys()
                        length += len(intset)
                        # sort on secondary index
                        keysets = defaultdict(list)
                        for i in intset:
                            full_key = (k, )
                            for km in second_indexes_key_map:
                                try:
                                    full_key += (km[i], )
                                except KeyError:
                                    pass
                            keysets[full_key].append(i)
                        for k2, v2 in keysets.items():
                            r_append((k2, v2, _self__getitem__))
                result = multisort(result, sort_spec)
            sequence, slen = self._limit_sequence(result, length, b_start,
                b_size, switched_reverse)
            result = LazyCat(LazyValues(sequence), slen, actual_result_count)
        elif limit is None or (limit * 4 > rlen):
            # Iterate over the result set getting sort keys from the index.
            # If we are interested in at least 25% or more of the result set,
            # the N-Best algorithm is slower, so we iterate over all.
            if sort_index_length == 1:
                for did in rs:
                    try:
                        key = index_key_map[did]
                    except KeyError:
                        # This document is not in the sort key index, skip it.
                        actual_result_count -= 1
                    else:
                        # The reference back to __getitem__ is used in case
                        # we do not merge now and need to intermingle the
                        # results with those of other catalogs while avoiding
                        # the cost of instantiating a LazyMap per result
                        r_append((key, did, _self__getitem__))
                if merge:
                    result.sort(reverse=reverse)
            else:
                for did in rs:
                    try:
                        full_key = (index_key_map[did], )
                        for km in second_indexes_key_map:
                            full_key += (km[did], )
                    except KeyError:
                        # This document is not in the sort key index, skip it.
                        actual_result_count -= 1
                    else:
                        r_append((full_key, did, _self__getitem__))
                if merge:
                    result = multisort(result, sort_spec)
            if merge:
                if limit is not None:
                    result = result[:limit]
                sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
                    switched_reverse)
                result = LazyValues(sequence)
                result.actual_result_count = actual_result_count
            else:
                sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
                    switched_reverse)
                return sequence
        elif first_reverse:
            # Limit / sort results using the N-Best algorithm. This is
            # faster for large sets than a full sort and uses far less
            # memory (a standalone sketch of this selection pattern
            # follows this class).
            keys = []
            k_insert = keys.insert
            n = 0
            worst = None
            if sort_index_length == 1:
                for did in rs:
                    try:
                        key = index_key_map[did]
                    except KeyError:
                        # This document is not in the sort key index, skip it.
                        actual_result_count -= 1
                    else:
                        if n >= limit and key <= worst:
                            continue
                        i = bisect(keys, key)
                        k_insert(i, key)
                        r_insert(i, (key, did, _self__getitem__))
                        if n == limit:
                            del keys[0], result[0]
                        else:
                            n += 1
                        worst = keys[0]
                result.reverse()
            else:
                for did in rs:
                    try:
                        key = index_key_map[did]
                        full_key = (key, )
                        for km in second_indexes_key_map:
                            full_key += (km[did], )
                    except KeyError:
                        # This document is not in the sort key index, skip it.
                        actual_result_count -= 1
                    else:
                        if n >= limit and key <= worst:
                            continue
                        i = bisect(keys, key)
                        k_insert(i, key)
                        r_insert(i, (full_key, did, _self__getitem__))
                        if n == limit:
                            del keys[0], result[0]
                        else:
                            n += 1
                        worst = keys[0]
                result = multisort(result, sort_spec)
            sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
                switched_reverse)
            if merge:
                result = LazyValues(sequence)
                result.actual_result_count = actual_result_count
            else:
                return sequence
        elif not first_reverse:
            # Limit / sort results using N-Best algorithm in reverse (N-Worst?)
            keys = []
            k_insert = keys.insert
            n = 0
            best = None
            if sort_index_length == 1:
                for did in rs:
                    try:
                        key = index_key_map[did]
                    except KeyError:
                        # This document is not in the sort key index, skip it.
                        actual_result_count -= 1
                    else:
                        if n >= limit and key >= best:
                            continue
                        i = bisect(keys, key)
                        k_insert(i, key)
                        r_insert(i, (key, did, _self__getitem__))
                        if n == limit:
                            del keys[-1], result[-1]
                        else:
                            n += 1
                        best = keys[-1]
            else:
                for did in rs:
                    try:
                        key = index_key_map[did]
                        full_key = (key, )
                        for km in second_indexes_key_map:
                            full_key += (km[did], )
                    except KeyError:
                        # This document is not in the sort key index, skip it.
                        actual_result_count -= 1
                    else:
                        if n >= limit and key >= best:
                            continue
                        i = bisect(keys, key)
                        k_insert(i, key)
                        r_insert(i, (full_key, did, _self__getitem__))
                        if n == limit:
                            del keys[-1], result[-1]
                        else:
                            n += 1
                        best = keys[-1]
                result = multisort(result, sort_spec)
            sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
                switched_reverse)
            if merge:
                result = LazyValues(sequence)
                result.actual_result_count = actual_result_count
            else:
                return sequence

        return LazyMap(self.__getitem__, result, len(result),
            actual_result_count=actual_result_count)

    def _get_sort_attr(self, attr, kw):
        """Helper function to find sort-on or sort-order."""
        # There are three different ways to find the attribute:
        # 1. kw[sort-attr]
        # 2. self.sort-attr
        # 3. kw[sort_attr]
        # kw may be a dict or an ExtensionClass MultiMapping, which
        # differ in what get() returns with no default value.
        name = "sort-%s" % attr
        val = kw.get(name, None)
        if val is not None:
            return val
        val = getattr(self, name, None)
        if val is not None:
            return val
        return kw.get("sort_%s" % attr, None)

    def _getSortIndex(self, args):
        """Returns a list of search index objects or None."""
        sort_index_names = self._get_sort_attr("on", args)
        if sort_index_names is not None:
            # self.indexes is always a dict, so get() w/ 1 arg works
            sort_indexes = []
            if not isinstance(sort_index_names, (list, tuple)):
                sort_index_names = [sort_index_names]
            for name in sort_index_names:
                sort_index = self.indexes.get(name)
                if sort_index is None:
                    raise CatalogError('Unknown sort_on index: %s' %
                                       repr(name))
                else:
                    if not hasattr(sort_index, 'documentToKeyMap'):
                        raise CatalogError('The index chosen for sort_on is '
                            'not capable of being used as a sort index: '
                            '%s' % repr(name))
                sort_indexes.append(sort_index)
            if len(sort_indexes) == 1:
                # be nice and keep the old API intact for single sort_on's
                return sort_indexes[0]
            return sort_indexes
        return None

    def searchResults(self, REQUEST=None, used=None, _merge=True, **kw):
        # You should pass in a simple dictionary as the request argument,
        # which only contains the relevant query.
        # The used argument is deprecated and is ignored
        if REQUEST is None and not kw:
            # Try to acquire request if we get no args for bw compat
            warnings.warn('Calling searchResults without a query argument nor '
                          'keyword arguments is deprecated. In Zope 4 the '
                          'query will no longer be automatically taken from '
                          'the acquired request.',
                          DeprecationWarning, stacklevel=3)
            REQUEST = getattr(self, 'REQUEST', None)
        if isinstance(REQUEST, dict) and not kw:
            # short cut for the best practice
            args = REQUEST
        else:
            args = CatalogSearchArgumentsMap(REQUEST, kw)
        sort_indexes = self._getSortIndex(args)
        sort_limit = self._get_sort_attr('limit', args)
        reverse = False
        if sort_indexes is not None:
            order = self._get_sort_attr("order", args)
            reverse = []
            if order is None:
                order = ['']
            elif isinstance(order, str):
                order = [order]
            for o in order:
                reverse.append(o.lower() in ('reverse', 'descending'))
            if len(reverse) == 1:
                # be nice and keep the old API intact for single sort_order
                reverse = reverse[0]
        # Perform searches with indexes and sort_index
        return self.search(args, sort_indexes, reverse, sort_limit, _merge)

    __call__ = searchResults

    def getCatalogPlan(self, query=None):
        """Query time reporting and planning.
        """
        parent = aq_base(aq_parent(self))
        threshold = getattr(parent, 'long_query_time', 0.1)
        return CatalogPlan(self, query, threshold)
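
# --- Hedged standalone sketch (not part of the original example) ----------
# The N-Best branches of Catalog.sortResults above keep only the 'limit'
# best sort keys while scanning the result set, instead of sorting
# everything.  A plain-Python sketch of that selection pattern (the helper
# name is hypothetical):
from bisect import insort

def n_best(keys, limit):
    """Return the 'limit' largest keys seen, in ascending order."""
    best = []
    for key in keys:
        if len(best) >= limit and key <= best[0]:
            continue            # cannot beat the current worst survivor
        insort(best, key)
        if len(best) > limit:
            del best[0]         # drop the worst survivor
    return best

assert n_best([5, 1, 9, 3, 7], 3) == [5, 7, 9]
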
Example n. 32
class Lexicon(Persistent):
    """Maps words to word ids """

    __implements__ = LexiconInterface

    def __init__(self, truncate_left=0):
        self.truncate_left = truncate_left
        self.clear()

    def clear(self):
        self._nextid      = BTrees.Length.Length()
        self._forward_idx = OIBTree()
        self._inverse_idx = IOBTree()
        if self.truncate_left:
            self._lforward_idx = OIBTree()
        else:
            self._lforward_idx = None

    def getWordIdList(self, words):
        """ return a list of wordIds for a list of words """
    
        fw_idx = self._forward_idx
        fw_idx_get = fw_idx.get
        rev_idx = self._inverse_idx
        if self.truncate_left: lfw_idx = self._lforward_idx
        nextid = self._nextid

        wids = []
        append = wids.append

        for word in words:
            wid = fw_idx_get(word)
            if not wid:         
                nextid.change(1)
                wid = nextid()
                fw_idx[word] = wid
                rev_idx[wid] = word
                if self.truncate_left:
                    lfw_idx[word[::-1]] = wid
            append(wid)
        return wids

    def getWordId(self, word, default=None):
        """Return the matched word against the key."""
        return  self._forward_idx.get(word, default)

    def getWord(self, wid):
        """ return a word by its wid"""
        return self._inverse_idx[wid]

    def deleteWord(self, word):
        wid = self._forward_idx[word]
        del self._inverse_idx[wid]
        del self._forward_idx[word]

    def deleteWordId(self, wid):
        word = self._inverse_idx[wid]
        del self._forward_idx[word]
        del self._inverse_idx[wid]

    def getWordsForRightTruncation(self, prefix):
        """ Return a list for wordIds that match against prefix.
            We use the BTrees range search to perform the search
        """
        assert isinstance(prefix, unicode)
        return  self._forward_idx.keys(prefix, prefix + u'\uffff') 

    def getWordsForLeftTruncation(self, suffix):
        """ Return a sequence of word ids for a common suffix """
        suffix = suffix[::-1]
        assert isinstance(suffix, unicode)
        return  [w[::-1] for w in  self._lforward_idx.keys(suffix, suffix + u'\uffff') ] 

    def createRegex(self, pattern):
        """Translate a PATTERN to a regular expression """
        return '%s$' % pattern.replace( '*', '.*').replace( '?', '.')

    def getSimiliarWords(self, term, threshold=0.75):
        """ Return a list of (word, ratio) tuples for words whose Levenshtein
            ratio against 'term' exceeds 'threshold' """
        return [(w, ratio(w, term))
                for w in self._forward_idx.keys() if ratio(w, term) > threshold]

    def getWordsForPattern(self, pattern):
        """ perform full pattern matching """

        # search for prefix in word
        mo = re.search('([\?\*])', pattern)
        if mo is None:
            return [ pattern ] 

        pos = mo.start(1)
        if pos == 0:
            raise QueryParserError(
                'word "%s" should not start with a globbing character'
                % pattern)

        prefix = pattern[:pos]
        words = self._forward_idx.keys(prefix, prefix + u'\uffff')
        regex = re.compile( self.createRegex(pattern) )
        return [word  for word in words if regex.match(word) ] 

    def getWordsInRange(self, w1, w2):
        """ return all words within w1...w2 """
        return self._forward_idx.keys(w1, w2)

    def getWordsForSubstring(self, sub):
        """ return all words that match *sub* """
        return [word for word in self._forward_idx.keys() if sub in word]

    def getWordIds(self):
        """ return all wids """
        return self._inverse_idx.keys()

    def removeWordId(self, wid):
        """ remove word id 'wid' """
        word = self._inverse_idx[wid]
        del self._inverse_idx[wid]
        del self._forward_idx[word]

    def __len__(self):
        return len(self._inverse_idx.keys())
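
# --- Hedged standalone sketch (not part of the original example) ----------
# getWordsForRightTruncation above relies on the BTrees range search: every
# key between 'prefix' and prefix + u'\uffff' shares that prefix.  A small
# self-contained illustration (requires the BTrees package; the words and
# ids are hypothetical):
from BTrees.OIBTree import OIBTree

idx = OIBTree()
for wid, word in enumerate([u'apple', u'zebra', u'zope', u'zopish']):
    idx[word] = wid

prefix = u'zop'
print(list(idx.keys(prefix, prefix + u'\uffff')))   # -> [u'zope', u'zopish']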