Example #1
 def clear(self):
     self.data = IOBTree()  # {rid -> record as tuple}
     self.indexes = {}  # {index_name -> OOBTree({value -> IITreeSet})}
     self.primary_index = OIBTree()  # {primary key -> rid}
     for position, column in self.col_info:
         if column.indexed:
             self.indexes[column.name] = OOBTree()
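A minimal sketch of how the three mappings above typically cooperate when a record is stored; `add_record` and its arguments are illustrative names, not part of the project:

from BTrees.IOBTree import IOBTree
from BTrees.IIBTree import IITreeSet
from BTrees.OIBTree import OIBTree
from BTrees.OOBTree import OOBTree

data = IOBTree()           # rid -> record as tuple
primary_index = OIBTree()  # primary key -> rid
name_index = OOBTree()     # indexed value -> IITreeSet of rids

def add_record(rid, key, record, value):
    # Store the record, register its primary key, and index it.
    data[rid] = record
    primary_index[key] = rid
    rids = name_index.get(value)
    if rids is None:
        rids = name_index[value] = IITreeSet()
    rids.insert(rid)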
Example #2
 def clear(self):
     """Empty the lexicon.
     """
     self.length = Length()
     self._wid_length_based = False
     self._wids = OIBTree()  # word -> wid
     self._words = IOBTree()  # wid -> word
Example #3
 def clear(self):
     self._length = Length()
     self._index = OIBTree()
     self._unindex = IOBTree()
     if self._counter is None:
         self._counter = Length()
     else:
         self._increment_counter()
Example #4
 def clear(self):
     self._nextid      = BTrees.Length.Length()
     self._forward_idx = OIBTree()
     self._inverse_idx = IOBTree()
     if self.truncate_left:
         self._lforward_idx = OIBTree()
     else:
         self._lforward_idx = None
Example #6
    def __init__(self):
        # These keep track of symbolic label and branch names that
        # have been used to ensure that they don't collide.
        self._branches = OIBTree()
        self._branches['mainline'] = 1
        self._labels = OIBTree()

        self._histories = OOBTree()
        self._created = time.time()
Example #7
    def clear(self):
        """ clear catalog """

        self.data = IOBTree()  # mapping of rid to meta_data
        self.uids = OIBTree()  # mapping of uid to rid
        self.paths = IOBTree()  # mapping of rid to uid
        self._length = BTrees.Length.Length()

        for index in self.indexes.keys():
            self.getIndex(index).clear()
Example #8
 def __init__(self, *pipeline):
     self._wids = OIBTree()  # word -> wid
     self._words = IOBTree() # wid -> word
     # wid 0 is reserved for words that aren't in the lexicon (OOV -- out
     # of vocabulary).  This can happen, e.g., if a query contains a word
     # we never saw before, and that isn't a known stopword (or otherwise
     # filtered out).  Returning a special wid value for OOV words is a
     # way to let clients know when an OOV word appears.
     self.wordCount = Length()
     self._pipeline = pipeline
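A short sketch of the OOV convention described in the comment above: looking a word up with a default of 0 signals out-of-vocabulary to the caller (the helper name is an assumption):

def wid_for(lexicon, word):
    # wid 0 is never assigned to a real word, so it marks OOV terms
    return lexicon._wids.get(word, 0)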
Example #9
def setupAnnotations(context):
    """
    set up the annotations if they haven't been set up
    already. The rest of the functions in here assume that
    this has already been set up
    """
    annotations = IAnnotations(context)

    if yays not in annotations:
        annotations[yays] = OIBTree()

    if nays not in annotations:
        annotations[nays] = OIBTree()

    return annotations
Example #10
def convert_to_uuidindex(catalog, index):
    if isinstance(index, UUIDIndex):
        return
    logger.info('Converting index `%s` to UUIDIndex.' % index.getId())
    index.__class__ = UUIDIndex
    index._p_changed = True
    catalog._catalog._p_changed = True
    # convert from OOBTree to OIBTree
    old_index = index._index
    if not isinstance(old_index, OIBTree):
        index._index = _index = OIBTree()
        for k, v in old_index.items():
            if k is None:
                continue
            if isinstance(v, int):
                _index[k] = v
            else:
                if isinstance(v, (IISet, IITreeSet)):
                    # inconsistent data, one uid with multiple docids
                    paths = dict((tuple(catalog.getpath(k).split('/')), k)
                                 for k in v.keys())
                    shortest = min(paths, key=len)
                    for path, key in paths.iteritems():
                        if path[:len(shortest)] != shortest:
                            raise ValueError(
                                'Inconsistent UID index, UID %s is associated '
                                'with multiple docids: %r' % (k, paths))

                    # All other docids are sub-paths of another
                    # indicating the UID was just acquired,
                    # choose the shortest
                    _index[k] = paths[shortest]
        del old_index
        transaction.savepoint(optimistic=True)
    logger.info('Finished conversion.')
Example #11
    def __init__(self):
        super(ContentTypeScopeManager, self).__init__()
        self._mappings = IOBTree()

        # Methods permitted to access this mapping.  Originally I
        # wanted to provide alternative sets of mappings on a per
        # mapping_id basis, but this proved too complex because of the
        # extra relationships involved.
        self._methods = IOBTree()

        # For metadata related to the above.
        self._mappings_metadata = IOBTree()

        # To ease the use of scopes, the mappings are referenced by
        # name and are called profiles, which add a few useful fields
        # for slightly easier usage.  This separates the name from the
        # already active tokens, so that once a token is instantiated
        # with a scope, the mapping is fixed until the token is revoked.
        self._named_mappings = OIBTree()  # name to id.

        # To not overburden the named mappings with work-in-progress
        # profiles, instantiate one here also.
        self._edit_mappings = OOBTree()

        self.default_mapping_id = self.addMapping({})
Example #12
 def storage(self):
     """ get the counter storage
     """
     annotation = get_portal_annotation()
     if annotation.get(NUMBER_STORAGE) is None:
         annotation[NUMBER_STORAGE] = OIBTree()
     return annotation[NUMBER_STORAGE]
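A hedged usage sketch for the counter storage above; the key name and the `next_number` helper are illustrative, not part of the source package:

def next_number(self, key='invoice'):
    storage = self.storage()
    value = storage.get(key, 0) + 1
    storage[key] = value  # OIBTree maps an arbitrary key to an int counter
    return value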
Example #13
def resetOrdering(ordering):
    annotations = IAnnotations(ordering.context)
    order = PersistentList()
    annotations.__setitem__(ordering.ORDER_KEY, order)
    pos = OIBTree()
    annotations.__setitem__(ordering.POS_KEY, pos)
    return order, pos
Example #14
    def _getCacheId(self):
        """Return a cache id for preferences.

    We use:
     - user_id: because preferences are always different by user
     - self._preference_cache[user_id] which is increased everytime a user
       preference is modified
     - self._preference_cache[None] which is increased everytime a global
       preference is modified
    """
        user_id = getSecurityManager().getUser().getId()
        try:
            self._preference_cache
        except AttributeError:
            self._preference_cache = OIBTree()
        return self._preference_cache.get(None), self._preference_cache.get(
            user_id), user_id
Example #15
    def _clear_and_rebuild(self, ids=[]):
        """
        """
        self._positionId = IOBTree()
        self._idPosition = OIBTree()

        for id in ids:
            self.addObject(id)
Example #17
    def _cleanup(self):
        """Cleans up errors in the BTrees.

        Certain ZODB bugs have caused BTrees to become slightly insane.
        Fortunately, there is a way to clean up damaged BTrees that
        always seems to work: make a new BTree containing the items()
        of the old one.

        Returns 1 if no damage was detected, or 0 if damage was
        detected and fixed.
        """
        from BTrees.check import check
        path = '/'.join(self.getPhysicalPath())
        try:
            check(self._tree)
            for key in self._tree.keys():
                if key not in self._tree:
                    raise AssertionError(
                        "Missing value for key: %s" % repr(key))
            check(self._mt_index)
            keys = set(self._tree.keys())
            for key, value in self._mt_index.items():
                if (key not in self._mt_index
                    or self._mt_index[key] is not value):
                    raise AssertionError(
                        "Missing or incorrect meta_type index: %s"
                        % repr(key))
                check(value)
                for k in value.keys():
                    if k not in value or k not in keys:
                        raise AssertionError(
                            "Missing values for meta_type index: %s"
                            % repr(key))
            return 1
        except AssertionError:
            LOG.warn('Detected damage to %s. Fixing now.' % path,
                     exc_info=sys.exc_info())
            try:
                self._tree = OOBTree(self._tree)
                keys = set(self._tree.keys())
                mt_index = OOBTree()
                for key, value in self._mt_index.items():
                    for name in tuple(value.keys()):
                        if name not in keys:
                            del value[name]
                    mt_index[key] = OIBTree(value)
                self._mt_index = mt_index
                new = len(keys)
                if self._count() != new:
                    self._count.set(new)
            except:
                LOG.error('Failed to fix %s.' % path,
                    exc_info=sys.exc_info())
                raise
            else:
                LOG.info('Fixed %s.' % path)
            return 0
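The repair idiom named in the docstring, in isolation: BTree constructors accept a mapping, so copying a damaged tree's items into a fresh tree rebuilds its internal bucket structure. A minimal sketch:

from BTrees.OOBTree import OOBTree

def rebuild(damaged_tree):
    # Reconstructs the bucket structure from the old tree's items
    return OOBTree(damaged_tree)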
Example #18
def initializeAnnotations(obj, event):
    """Ensure that we don't delegate certain annotations by setting them 
    from the beginning.
    """
    annotations = IAnnotations(obj)
    annotations.setdefault(DefaultOrdering.ORDER_KEY, PersistentList())
    annotations.setdefault(DefaultOrdering.POS_KEY, OIBTree())
    annotations.setdefault(CONTENTRULES_KEY, None)
    annotations.setdefault(CONTEXT_ASSIGNMENT_KEY, OOBTree())
Example #19
class DateIndex(UnIndex):
    """ Index for Dates """

    __implements__ = (PluggableIndex.PluggableIndexInterface,)

    meta_type = 'DateIndex'
    query_options = ['query', 'range']

    manage = manage_main = DTMLFile( 'dtml/manageDateIndex', globals() )
    manage_main._setName( 'manage_main' )
    manage_options = ( { 'label' : 'Settings'
                       , 'action' : 'manage_main'
                       },
                     )

    def clear( self ):
        """ Complete reset """
        self._index = IOBTree()
        self._unindex = OIBTree()


    def index_object( self, documentId, obj, threshold=None ):
        """index an object, normalizing the indexed value to an integer

           o Normalized value has granularity of one minute.

           o Objects which have 'None' as indexed value are *omitted*,
             by design.
        """
        returnStatus = 0

        try:
            date_attr = getattr( obj, self.id )
            if callable( date_attr ):
                date_attr = date_attr()

            ConvertedDate = self._convert( value=date_attr, default=_marker )
        except AttributeError:
            ConvertedDate = _marker

        oldConvertedDate = self._unindex.get( documentId, _marker )

        if ConvertedDate != oldConvertedDate:
            if oldConvertedDate is not _marker:
                self.removeForwardIndexEntry(oldConvertedDate, documentId)

            if ConvertedDate is not _marker:
                self.insertForwardIndexEntry( ConvertedDate, documentId )
                self._unindex[documentId] = ConvertedDate

            returnStatus = 1

        return returnStatus


    def _apply_index( self, request, cid='', type=type ):
Example #21
 def noteLock(self, obj, user_id):
     mapping = getattr(self, '_locks', None)
     if mapping is None:
         mapping = self._locks = OOBTree()
     path = '/'.join(obj.getPhysicalPath())
     items = mapping.get(user_id, None)
     if items is None:
         items = OIBTree()
         mapping[user_id] = items
     items[path] = 1
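A hypothetical read-side companion to noteLock above, showing the inner OIBTree used as a set of paths (the value 1 is a dummy); the method name `hasLock` is an assumption:

def hasLock(self, obj, user_id):
    mapping = getattr(self, '_locks', None)
    if mapping is None:
        return False
    items = mapping.get(user_id)
    if items is None:
        return False
    path = '/'.join(obj.getPhysicalPath())
    return path in items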
Example #22
    def reset(self):
        """Forget everything; usually called from __init__.
        """
        String.reset(self)

        self.path2rid = OIBTree()   # {path:rid}
        self.rid2path = IOBTree()   # {rid:path}
        self.parts = OOBTree()      # {(level,part):rids}
        self.rids = IOBTree()       # {rid:(level,part)s}
        self.levels = IOBTree()     # {level:rids}
Example #23
def load_model():
    data = DataFS()
    keys, label = data.gen_train_data()
    btree = OIBTree()
    rubbish_k = np.load('6k.npy')
    rubbish_v = np.load('6v.npy')
    w = np.load('w.npy', allow_pickle=True)
    b = np.load('b.npy', allow_pickle=True)
    for i in range(len(rubbish_k)):
        btree[str(rubbish_k[i])] = int(rubbish_v[i])
    return keys, label, w, b, btree
Example #24
    def __init__(self, id=None):
        super(LinkCheckTool, self).__init__(id)

        # This is the work queue; items in this queue are scheduled
        # for link validity check.
        self.queue = CompositeQueue()

        # Additional queue for internal crawler to revalidate the site
        self.crawl_queue = CompositeQueue()

        # This is the link database. It maps a hyperlink index to a
        # tuple (timestamp, status, referers).
        self.checked = IOBTree()

        # Indexes
        self.index = OIBTree()
        self.links = IOBTree()

        # This is a counter that allows us to add new hyperlinks and
        # provide an index quickly.
        self.counter = 0
Example #25
    def __init__(self, id="++conversation++default"):
        self.id = id

        # username -> count of comments; key is removed when count reaches 0
        self._commentators = OIBTree()

        # id -> comment - find comment by id
        self._comments = LOBTree()

        # id -> LLSet (children) - find all children for a given comment.
        # 0 signifies root.
        self._children = LOBTree()
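A minimal sketch of maintaining the `_commentators` counter above as comments come and go; `change_count` is illustrative, not the package's API:

def change_count(commentators, name, delta):
    count = commentators.get(name, 0) + delta
    if count > 0:
        commentators[name] = count
    elif name in commentators:
        del commentators[name]  # key is removed when the count reaches 0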
Example #26
def setupAnnotations(context):
    """
    set up the annotations if they haven't been set up
    already. The rest of the functions in here assume that
    this has already been set up
    """
    annotations = IAnnotations(context)
    changed = False
    if yays not in annotations:
        annotations[yays] = OIBTree()
        changed = True

    if nays not in annotations:
        annotations[nays] = OIBTree()
        changed = True

    if changed:
        request = getRequest()
        alsoProvides(request, IDisableCSRFProtection)

    return annotations
Example #27
    def _convertBTrees(self, threshold=200):
        if (type(self._lexicon) is OIBTree and
            type(getattr(self, '_inverseLex', None)) is IOBTree):
            return

        from BTrees.convert import convert

        lexicon=self._lexicon
        self._lexicon=OIBTree()
        self._lexicon._p_jar=self._p_jar
        convert(lexicon, self._lexicon, threshold)

        try:
            inverseLex=self._inverseLex
            self._inverseLex=IOBTree()
        except AttributeError:
            # older lexicons didn't have an inverse lexicon
            self._inverseLex=IOBTree()
            inverseLex=self._inverseLex

        self._inverseLex._p_jar=self._p_jar
        convert(inverseLex, self._inverseLex, threshold)
Example #28
    def clear(self):
        """ clear catalog """

        self.data  = IOBTree()  # mapping of rid to meta_data
        self.uids  = OIBTree()  # mapping of uid to rid
        self.paths = IOBTree()  # mapping of rid to uid

        # convert old-style Catalog object to new in-place
        try: self.__len__.set(0)
        except AttributeError: self.__len__=BTrees.Length.Length()

        for index in self.indexes.keys():
            self.getIndex(index).clear()
Example #29
    def clear(self):
        """Empty the index"""
        
        IOBTree = BTrees.family64.IO.BTree

        self._index = IOBTree() # {rangeid: [document_id, ...]}
        self._unindex = IOBTree() # {document_id: [rangeid, ...]}
        self._range_mapping = IOBTree() # {rangeid: range}
        self._reverse_range_mapping = OIBTree() # {range: rangeid}
        self._since_index = IOBTree() # {since: [rangeid,...]}
        self._until_index = IOBTree() # {until: [rangeid,...]}
        self._length = BTrees.Length.Length()
        self._unique_values_length = BTrees.Length.Length()
Example #30
 def __init__(self, datafs, writable=0, trans=0, pack=0):
     self.trans_limit = trans
     self.pack_limit = pack
     self.trans_count = 0
     self.pack_count = 0
     self.stopdict = get_stopdict()
     self.mh = mhlib.MH()
     self.filestorage = FileStorage(datafs, read_only=(not writable))
     self.database = DB(self.filestorage)
     self.connection = self.database.open()
     self.root = self.connection.root()
     try:
         self.index = self.root["index"]
     except KeyError:
         self.index = self.root["index"] = TextIndexWrapper()
     try:
         self.docpaths = self.root["docpaths"]
     except KeyError:
         self.docpaths = self.root["docpaths"] = IOBTree()
     try:
         self.doctimes = self.root["doctimes"]
     except KeyError:
         self.doctimes = self.root["doctimes"] = IIBTree()
     try:
         self.watchfolders = self.root["watchfolders"]
     except KeyError:
         self.watchfolders = self.root["watchfolders"] = {}
     self.path2docid = OIBTree()
     for docid in self.docpaths.keys():
         path = self.docpaths[docid]
         self.path2docid[path] = docid
     try:
         self.maxdocid = max(self.docpaths.keys())
     except ValueError:
         self.maxdocid = 0
     print(len(self.docpaths), "Document ids")
     print(len(self.path2docid), "Pathnames")
     print(self.index.lexicon.length(), "Words")
Example #31
    def __init__(self, *pipeline):
        self._wids = OIBTree()  # word -> wid
        self._words = IOBTree() # wid -> word
        # wid 0 is reserved for words that aren't in the lexicon (OOV -- out
        # of vocabulary).  This can happen, e.g., if a query contains a word
        # we never saw before, and that isn't a known stopword (or otherwise
        # filtered out).  Returning a special wid value for OOV words is a
        # way to let clients know when an OOV word appears.
        self._nextwid = 1
        self._pipeline = pipeline

        # Keep some statistics about indexing
        self._nbytes = 0 # Number of bytes indexed (at start of pipeline)
        self._nwords = 0 # Number of words indexed (after pipeline)
Example #32
 def importNode(self, node, mode=PURGE):
     """Import the object from the DOM node.
     """
     pipeline = []
     for child in node.childNodes:
         if child.nodeName == 'element':
             element = element_factory.instantiate(
                 child.getAttribute('group'), child.getAttribute('name'))
             pipeline.append(element)
     self.context._pipeline = tuple(pipeline)
     #clear lexicon
     self.context._wids = OIBTree()
     self.context._words = IOBTree()
     self.context.length = Length()
Example #33
class MessageService(Persistent, Location):
    interface.implements(IMessageService)

    def __init__(self, storage):
        self.__parent__ = storage

        self.index = OIBTree()
        self.unread = Length(0)

    def __len__(self):
        return len(self.index)

    def __iter__(self):
        return iter(self.index.values())

    def __contains__(self, key):
        msg = self.__parent__.getMessage(key)
        if msg is not None:
            return True
        else:
            return False

    def get(self, msgId, default=None):
        msg = self.__parent__.getMessage(msgId)
        if msg is not None:
            if msg.__date__ in self.index:
                return msg

        return default

    def append(self, message):
        message.__parent__ = self

        if self.__parent__.readStatus(message):
            self.unread.change(1)

        self.index[message.__date__] = message.__id__

    def remove(self, message):
        id = message.__date__

        if id in self.index:
            del self.index[id]

            if self.__parent__.readStatus(message) and self.unread() > 0:
                self.unread.change(-1)

    def create(self, **data):
        raise NotImplementedError('create')
Example #34
 def _setOb(self, id, object):
     """Store the named object in the folder.
     """
     tree = self._tree
     if tree.has_key(id):
         raise KeyError('There is already an item named "%s".' % id)
     tree[id] = object
     self._count.change(1)
     # Update the meta type index.
     mti = self._mt_index
     meta_type = getattr(object, 'meta_type', None)
     if meta_type is not None:
         ids = mti.get(meta_type, None)
         if ids is None:
             ids = OIBTree()
             mti[meta_type] = ids
         ids[id] = 1
Example #35
 def create_token(self, userid, timeout=None, data=None):
     payload = {}
     payload['sub'] = userid
     if timeout is None:
         timeout = self.token_timeout
     if timeout:
         payload['exp'] = datetime.utcnow() + timedelta(seconds=timeout)
     if data is not None:
         payload.update(data)
     token = jwt.encode(payload, self._signing_secret(), algorithm='HS256')
     if self.store_tokens:
         if self._tokens is None:
             self._tokens = OOBTree()
         if userid not in self._tokens:
             self._tokens[userid] = OIBTree()
         self._tokens[userid][token] = int(time.time())
     return token
Example #36
    def __init__(self, id=None):
        super(LinkCheckTool, self).__init__(id)

        # This is the work queue; items in this queue are scheduled
        # for link validity check.
        self.queue = CompositeQueue()

        # This is the link database. It maps a hyperlink index to a
        # tuple (timestamp, status, referers).
        self.checked = IOBTree()

        # Indexes
        self.index = OIBTree()
        self.links = IOBTree()

        # This is a counter that allows us to add new hyperlinks and
        # provide an index quickly.
        self.counter = 0
Example #37
class Ballots(object):
    """ Simple object to help counting votes. It's not addable anywhere.
        Should be treated as an internal object for polls.
    """
    def __init__(self):
        """ Ballots attr is an OIBTree, since they can have any object as key.
        """
        self.ballots = OIBTree()

    def result(self):
        """ Return a tuple with sorted ballot items. """
        return tuple(sorted(self.ballots.iteritems()))

    def add(self, value):
        """ Add a dict of results - a ballot - to the pool. Append and increase counter. """
        if value in self.ballots:
            self.ballots[value] += 1
        else:
            self.ballots[value] = 1
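An illustrative round trip through the Ballots helper above (simple strings stand in for the ballot dicts):

ballots = Ballots()
ballots.add('yes')
ballots.add('yes')
ballots.add('no')
assert ballots.result() == (('no', 1), ('yes', 2))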
Example #40
    def testCleanup(self):
        self.assert_(self.f._cleanup())
        key = TrojanKey('a')
        self.f._tree[key] = 'b'
        self.assert_(self.f._cleanup())
        key.value = 'z'

        # With a key in the wrong place, there should now be damage.
        self.assert_(not self.f._cleanup())
        # Now it's fixed.
        self.assert_(self.f._cleanup())

        from BTrees.OIBTree import OIBTree
        tree = self.f._mt_index['d'] = OIBTree()
        tree['e'] = 1
        self.assert_(not self.f._cleanup())

        # Verify the management interface also works,
        # but don't test return values.
        self.f.manage_cleanup()
        key.value = 'a'
        self.f.manage_cleanup()
Example #42
def fixupPloneLexicon(context):
    """Updates the plone_lexicon pipeline with the new splitter
       and case normalizer.
    """
    catalog = getToolByName(context, 'portal_catalog', None)
    if catalog is not None:
        if 'plone_lexicon' in catalog.objectIds():
            lexicon = catalog.plone_lexicon
            pipeline = list(lexicon._pipeline)
            if len(pipeline) >= 2:
                if (not isinstance(pipeline[0], Splitter)
                        or not isinstance(pipeline[1], CaseNormalizer)):
                    pipeline[0] = Splitter()
                    pipeline[1] = CaseNormalizer()
                    lexicon._pipeline = tuple(pipeline)
                    # Clear the lexicon
                    from BTrees.OIBTree import OIBTree
                    from BTrees.IOBTree import IOBTree
                    from BTrees.Length import Length
                    lexicon._wids = OIBTree()
                    lexicon._words = IOBTree()
                    lexicon.length = Length()
                    logger.info('Updated plone_lexicon pipeline.')
Example #43
    def _convertBTrees(self, threshold=200):

        from BTrees.convert import convert

        if type(self.data) is not IOBTree:
            data=self.data
            self.data=IOBTree()
            convert(data, self.data, threshold)

            self.__len__=BTrees.Length.Length(len(data))

            uids=self.uids
            self.uids=OIBTree()
            convert(uids, self.uids, threshold)

            paths=self.paths
            self.paths=IOBTree()
            convert(paths, self.paths, threshold)


        for index in self.indexes.values():
            if hasattr(index, '__of__'): index=index.__of__(self)
            index._convertBTrees(threshold)
Example #44
 def create_token(self, userid, timeout=None, data=None):
     payload = {}
     payload["sub"] = userid
     if timeout is None:
         timeout = self.token_timeout
     if timeout:
         payload["exp"] = datetime.utcnow() + timedelta(seconds=timeout)
     if data is not None:
         payload.update(data)
     algorithm = "HS256"
     if self.haveRSAKeys():
         algorithm = "RS256"
     token = jwt.encode(payload,
                        self._signing_secret(),
                        algorithm=algorithm)
     if not six.PY2:
         token = token.decode("utf-8")
     if self.store_tokens:
         if self._tokens is None:
             self._tokens = OOBTree()
         if userid not in self._tokens:
             self._tokens[userid] = OIBTree()
         self._tokens[userid][token] = int(time.time())
     return token
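A hedged sketch of reading back the nested token store built above, where `_tokens` is an OOBTree of userid -> OIBTree of token -> issue time; the helper name is an assumption:

def token_issued_at(self, userid, token):
    if self._tokens is None:
        return None
    per_user = self._tokens.get(userid)
    if per_user is None:
        return None
    return per_user.get(token)  # int timestamp, or None if unknown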
Example #45
    def initialize_storage(self):
        ann = IAnnotations(self.context)
        if self.ANNOTATIONS_KEY not in ann:
            ann[self.ANNOTATIONS_KEY] = OOBTree()

        self._storage = ann[self.ANNOTATIONS_KEY]

        # Actual list of actions
        if self.STORAGE_ACTIONS_KEY not in self._storage:
            self._storage[self.STORAGE_ACTIONS_KEY] = IOBTree()
        self._actions = self._storage[self.STORAGE_ACTIONS_KEY]

        # Indexes needed for fast lookups
        if self.STORAGE_INDEXES_KEY not in self._storage:
            self._storage[self.STORAGE_INDEXES_KEY] = OOBTree()
        self._indexes = self._storage[self.STORAGE_INDEXES_KEY]

        # Index: unique_name -> action_id
        if self.IDX_UNIQUE_NAME not in self._indexes:
            self._indexes[self.IDX_UNIQUE_NAME] = OIBTree()

        # Counter for the next 'action_id'
        if self.STORAGE_NEXT_ID_KEY not in self._storage:
            self._storage[self.STORAGE_NEXT_ID_KEY] = 0
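A minimal lookup sketch through the structures initialized above: the unique-name OIBTree resolves a name to an integer action_id, which then keys the IOBTree of actions (the helper name is illustrative):

def get_action_by_unique_name(self, name):
    action_id = self._indexes[self.IDX_UNIQUE_NAME].get(name)
    if action_id is None:
        return None
    return self._actions.get(action_id)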
Example #46
class UUIDIndex(UnIndex):
    """Index for uuid fields with an unique value per key.

    The internal structure is:

    self._index = {datum: documentId}
    self._unindex = {documentId:datum}

    For each datum only one documentId can exist.
    """

    meta_type = "UUIDIndex"

    manage_options = (
        {'label': 'Settings', 'action': 'manage_main'},
        {'label': 'Browse', 'action': 'manage_browse'},
    )

    query_options = ["query", "range"]

    manage = manage_main = DTMLFile('dtml/manageUUIDIndex', globals())
    manage_main._setName('manage_main')
    manage_browse = DTMLFile('../dtml/browseIndex', globals())

    def clear(self):
        self._length = Length()
        self._index = OIBTree()
        self._unindex = IOBTree()
        self._counter = Length()

    def numObjects(self):
        """Return the number of indexed objects. Since we have a 1:1 mapping
        from documents to values, we can reuse the stored length.
        """
        return self.indexSize()

    def uniqueValues(self, name=None, withLengths=0):
        """returns the unique values for name

        if withLengths is true, returns a sequence of
        tuples of (value, length)
        """
        if name is None:
            name = self.id
        elif name != self.id:
            return

        if not withLengths:
            for key in self._index.keys():
                yield key
        else:
            # We know the length for each value is one
            for key in self._index.keys():
                yield (key, 1)

    def insertForwardIndexEntry(self, entry, documentId):
        """Take the entry provided and put it in the correct place
        in the forward index.
        """
        if entry is None:
            return

        old_docid = self._index.get(entry, _marker)
        if old_docid is _marker:
            self._index[entry] = documentId
            self._length.change(1)
        elif old_docid != documentId:
            logger.error("A different document with value '%s' already "
                         "exists in the index." % entry)

    def removeForwardIndexEntry(self, entry, documentId):
        """Take the entry provided and remove any reference to documentId
        in its entry in the index.
        """
        old_docid = self._index.get(entry, _marker)
        if old_docid is not _marker:
            del self._index[entry]
            self._length.change(-1)

    def _get_object_datum(self, obj, attr):
        # for a uuid it never makes sense to acquire a parent value via
        # Acquisition
        has_attr = getattr(aq_base(obj), attr, _marker)
        if has_attr is _marker:
            return _marker
        return super(UUIDIndex, self)._get_object_datum(obj, attr)
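A small sketch of the 1:1 mappings documented in the class docstring; both helpers are illustrative, not part of UUIDIndex:

def docid_for_uuid(index, uuid):
    return index._index.get(uuid)     # OIBTree: uuid -> docid

def uuid_for_docid(index, docid):
    return index._unindex.get(docid)  # IOBTree: docid -> uuid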
Example #47
class Indexer(object):

    filestorage = database = connection = root = None

    def __init__(self, datafs, writable=0, trans=0, pack=0):
        self.trans_limit = trans
        self.pack_limit = pack
        self.trans_count = 0
        self.pack_count = 0
        self.stopdict = get_stopdict()
        self.mh = mhlib.MH()
        self.filestorage = FileStorage(datafs, read_only=(not writable))
        self.database = DB(self.filestorage)
        self.connection = self.database.open()
        self.root = self.connection.root()
        try:
            self.index = self.root["index"]
        except KeyError:
            self.index = self.root["index"] = TextIndexWrapper()
        try:
            self.docpaths = self.root["docpaths"]
        except KeyError:
            self.docpaths = self.root["docpaths"] = IOBTree()
        try:
            self.doctimes = self.root["doctimes"]
        except KeyError:
            self.doctimes = self.root["doctimes"] = IIBTree()
        try:
            self.watchfolders = self.root["watchfolders"]
        except KeyError:
            self.watchfolders = self.root["watchfolders"] = {}
        self.path2docid = OIBTree()
        for docid in self.docpaths.keys():
            path = self.docpaths[docid]
            self.path2docid[path] = docid
        try:
            self.maxdocid = max(self.docpaths.keys())
        except ValueError:
            self.maxdocid = 0
        print(len(self.docpaths), "Document ids")
        print(len(self.path2docid), "Pathnames")
        print(self.index.lexicon.length(), "Words")

    def dumpfreqs(self):
        lexicon = self.index.lexicon
        index = self.index.index
        assert isinstance(index, OkapiIndex)
        L = []
        for wid in lexicon.wids():
            freq = 0
            for f in index._wordinfo.get(wid, {}).values():
                freq += f
            L.append((freq, wid, lexicon.get_word(wid)))
        L.sort()
        L.reverse()
        for freq, wid, word in L:
            print("%10d %10d %s" % (wid, freq, word))

    def dumpwids(self):
        lexicon = self.index.lexicon
        index = self.index.index
        assert isinstance(index, OkapiIndex)
        for wid in lexicon.wids():
            freq = 0
            for f in index._wordinfo.get(wid, {}).values():
                freq += f
            print("%10d %10d %s" % (wid, freq, lexicon.get_word(wid)))

    def dumpwords(self):
        lexicon = self.index.lexicon
        index = self.index.index
        assert isinstance(index, OkapiIndex)
        for word in lexicon.words():
            wid = lexicon.get_wid(word)
            freq = 0
            for f in index._wordinfo.get(wid, {}).values():
                freq += f
            print("%10d %10d %s" % (wid, freq, word))

    def close(self):
        self.root = None
        if self.connection is not None:
            self.connection.close()
            self.connection = None
        if self.database is not None:
            self.database.close()
            self.database = None
        if self.filestorage is not None:
            self.filestorage.close()
            self.filestorage = None

    def interact(self, nbest=NBEST, maxlines=MAXLINES):
        try:
            import readline
        except ImportError:
            pass
        text = ""
        top = 0
        results = []
        while 1:
            try:
                line = raw_input("Query: ")
            except EOFError:
                print("\nBye.")
                break
            line = line.strip()
            if line.startswith("/"):
                self.specialcommand(line, results, top - nbest)
                continue
            if line:
                text = line
                top = 0
            else:
                if not text:
                    continue
            try:
                results, n = self.timequery(text, top + nbest)
            except KeyboardInterrupt:
                raise
            except:
                reportexc()
                text = ""
                continue
            if len(results) <= top:
                if not n:
                    print("No hits for %r." % text)
                else:
                    print("No more hits for %r." % text)
                text = ""
                continue
            print("[Results %d-%d from %d" % (top+1, min(n, top+nbest), n),
                  end=" ")
            print("for query %s]" % repr(text))
            self.formatresults(text, results, maxlines, top, top+nbest)
            top += nbest

    def specialcommand(self, line, results, first):
        assert line.startswith("/")
        line = line[1:]
        if not line:
            n = first
        else:
            try:
                n = int(line) - 1
            except:
                print("Huh?")
                return
        if n < 0 or n >= len(results):
            print("Out of range")
            return
        docid, score = results[n]
        path = self.docpaths[docid]
        i = path.rfind("/")
        assert i > 0
        folder = path[:i]
        n = path[i+1:]
        cmd = "show +%s %s" % (folder, n)
        if os.getenv("DISPLAY"):
            os.system("xterm -e  sh -c '%s | less' &" % cmd)
        else:
            os.system(cmd)

    def query(self, text, nbest=NBEST, maxlines=MAXLINES):
        results, n = self.timequery(text, nbest)
        if not n:
            print("No hits for %r." % text)
            return
        print("[Results 1-%d from %d]" % (len(results), n))
        self.formatresults(text, results, maxlines)

    def timequery(self, text, nbest):
        t0 = time.time()
        c0 = time.clock()
        results, n = self.index.query(text, 0, nbest)
        t1 = time.time()
        c1 = time.clock()
        print("[Query time: %.3f real, %.3f user]" % (t1-t0, c1-c0))
        return results, n

    def formatresults(self, text, results, maxlines=MAXLINES,
                      lo=0, hi=sys.maxint):
        stop = self.stopdict.has_key
        words = [w for w in re.findall(r"\w+\*?", text.lower()) if not stop(w)]
        pattern = r"\b(" + "|".join(words) + r")\b"
        pattern = pattern.replace("*", ".*") # glob -> re syntax
        prog = re.compile(pattern, re.IGNORECASE)
        print('='*70)
        rank = lo
        for docid, score in results[lo:hi]:
            rank += 1
            path = self.docpaths[docid]
            score *= 100.0
            print("Rank:    %d   Score: %d%%   File: %s" % (rank, score, path))
            path = os.path.join(self.mh.getpath(), path)
            try:
                fp = open(path)
            except (IOError, OSError) as msg:
                print("Can't open:", msg)
                continue
            msg = mhlib.Message("<folder>", 0, fp)
            for header in "From", "To", "Cc", "Bcc", "Subject", "Date":
                h = msg.getheader(header)
                if h:
                    print("%-8s %s" % (header+":", h))
            text = self.getmessagetext(msg)
            if text:
                print()
                nleft = maxlines
                for part in text:
                    for line in part.splitlines():
                        if prog.search(line):
                            print(line)
                            nleft -= 1
                            if nleft <= 0:
                                break
                    if nleft <= 0:
                        break
            print('-'*70)

    def update(self, args):
        folder = None
        seqs = []

        for arg in args:
            if arg.startswith("+"):
                if folder is None:
                    folder = arg[1:]
                else:
                    print("only one folder at a time")
                    return
            else:
                seqs.append(arg)

        if not folder:
            folder = self.mh.getcontext()
        if not seqs:
            seqs = ['all']

        try:
            f = self.mh.openfolder(folder)
        except mhlib.Error as msg:
            print(msg)
            return

        dict = {}
        for seq in seqs:
            try:
                nums = f.parsesequence(seq)
            except mhlib.Error as msg:
                print(msg or "unparsable message sequence: %s" % repr(seq))
                return
            for n in nums:
                dict[n] = n
        msgs = dict.keys()
        msgs.sort()

        self.updatefolder(f, msgs)
        self.commit()

    def optimize(self, args):
        uniqwords = {}
        for folder in args:
            if folder.startswith("+"):
                folder = folder[1:]
            print("\nOPTIMIZE FOLDER", folder)
            try:
                f = self.mh.openfolder(folder)
            except mhlib.Error as msg:
                print(msg)
                continue
            self.prescan(f, f.listmessages(), uniqwords)
        L = [(uniqwords[word], word) for word in uniqwords.keys()]
        L.sort()
        L.reverse()
        for i in range(100):
            print("%3d. %6d %s" % ((i+1,) + L[i]))
        self.index.lexicon.sourceToWordIds([word for (count, word) in L])

    def prescan(self, f, msgs, uniqwords):
        pipeline = [Splitter(), CaseNormalizer(), StopWordRemover()]
        for n in msgs:
            print("prescanning", n)
            m = f.openmessage(n)
            text = self.getmessagetext(m, f.name)
            for p in pipeline:
                text = p.process(text)
            for word in text:
                uniqwords[word] = uniqwords.get(word, 0) + 1

    def bulkupdate(self, args):
        if not args:
            print("No folders specified; use ALL to bulk-index all folders")
            return
        if "ALL" in args:
            i = args.index("ALL")
            args[i:i+1] = self.mh.listfolders()
        for folder in args:
            if folder.startswith("+"):
                folder = folder[1:]
            print("\nFOLDER", folder)
            try:
                f = self.mh.openfolder(folder)
            except mhlib.Error as msg:
                print(msg)
                continue
            self.updatefolder(f, f.listmessages())
            print("Total", len(self.docpaths))
        self.commit()
        print("Indexed", self.index.lexicon._nbytes, "bytes and",)
        print(self.index.lexicon._nwords, "words;",)
        print(len(self.index.lexicon._words), "unique words.")

    def updatefolder(self, f, msgs):
        self.watchfolders[f.name] = self.getmtime(f.name)
        for n in msgs:
            path = "%s/%s" % (f.name, n)
            docid = self.path2docid.get(path, 0)
            if docid and self.getmtime(path) == self.doctimes.get(docid, 0):
                print("unchanged", docid, path)
                continue
            docid = self.newdocid(path)
            try:
                m = f.openmessage(n)
            except IOError:
                print("disappeared", docid, path)
                self.unindexpath(path)
                continue
            text = self.getmessagetext(m, f.name)
            if not text:
                self.unindexpath(path)
                continue
            print("indexing", docid, path)
            self.index.index_doc(docid, text)
            self.maycommit()
        # Remove messages from the folder that no longer exist
        for path in list(self.path2docid.keys(f.name)):
            if not path.startswith(f.name + "/"):
                break
            if self.getmtime(path) == 0:
                self.unindexpath(path)
        print("done.")

    def unindexpath(self, path):
        if self.path2docid.has_key(path):
            docid = self.path2docid[path]
            print("unindexing", docid, path)
            del self.docpaths[docid]
            del self.doctimes[docid]
            del self.path2docid[path]
            try:
                self.index.unindex_doc(docid)
            except KeyError as msg:
                print("KeyError", msg)
            self.maycommit()

    def getmessagetext(self, m, name=None):
        L = []
        if name:
            L.append("_folder " + name) # To restrict search to a folder
            self.getheaders(m, L)
        try:
            self.getmsgparts(m, L, 0)
        except KeyboardInterrupt:
            raise
        except:
            print("(getmsgparts failed:)")
            reportexc()
        return L

    def getmsgparts(self, m, L, level):
        ctype = m.gettype()
        if level or ctype != "text/plain":
            print(". "*level + str(ctype))
        if ctype == "text/plain":
            L.append(m.getbodytext())
        elif ctype in ("multipart/alternative", "multipart/mixed"):
            for part in m.getbodyparts():
                self.getmsgparts(part, L, level+1)
        elif ctype == "message/rfc822":
            f = StringIO(m.getbodytext())
            m = mhlib.Message("<folder>", 0, f)
            self.getheaders(m, L)
            self.getmsgparts(m, L, level+1)

    def getheaders(self, m, L):
        H = []
        for key in "from", "to", "cc", "bcc", "subject":
            value = m.get(key)
            if value:
                H.append(value)
        if H:
            L.append("\n".join(H))

    def newdocid(self, path):
        docid = self.path2docid.get(path)
        if docid is not None:
            self.doctimes[docid] = self.getmtime(path)
            return docid
        docid = self.maxdocid + 1
        self.maxdocid = docid
        self.docpaths[docid] = path
        self.doctimes[docid] = self.getmtime(path)
        self.path2docid[path] = docid
        return docid

    def getmtime(self, path):
        path = os.path.join(self.mh.getpath(), path)
        try:
            st = os.stat(path)
        except os.error as msg:
            return 0
        return int(st[ST_MTIME])

    def maycommit(self):
        self.trans_count += 1
        if self.trans_count >= self.trans_limit > 0:
            self.commit()

    def commit(self):
        if self.trans_count > 0:
            print("committing...")
            transaction.commit()
            self.trans_count = 0
            self.pack_count += 1
            if self.pack_count >= self.pack_limit > 0:
                self.pack()

    def pack(self):
        if self.pack_count > 0:
            print("packing...")
            self.database.pack()
            self.pack_count = 0
Example #49
class Path(String):

    root = None     # root as passed to Catalog()
    path2rid = None # OIBTree mapping path to rid (one:one)
    rid2path = None # IOBTree mapping rid to path (one:one)
    parts = None    # OOBTree mapping (level, part) to rids (one:many)
    levels = None   # IOBTree mapping level to a list of rids (one:many)
    case_sensitive = None

    sorted = None   # OOBTree for sorting; inherited from Path


    def __init__(self, root, case_sensitive=None):

        # Root
        # ====

        if not isinstance(root, basestring):
            raise TypeError("root is not a string: '%s'" % root)
        elif not isdir(root):
            raise ValueError("root doesn't point to a directory: '%s'" % root)
        self.root = root.rstrip(os.sep)


        # Case Sensitivity
        # ================

        if case_sensitive is None:
            if 'win' in sys.platform:
                case_sensitive = False
            else:
                case_sensitive = True
        if case_sensitive not in (False, True, 0, 1):
            raise TypeError( "case_sensitive isn't a boolean: "
                           + "'%s'" % case_sensitive
                            )
        self.case_sensitive = bool(case_sensitive)

        self.reset()


    # Index contract
    # ==============

    __name__ = 'Path' # used in command-line interface


    def reset(self):
        """Forget everything; usually called from __init__.
        """
        String.reset(self)

        self.path2rid = OIBTree()   # {path:rid}
        self.rid2path = IOBTree()   # {rid:path}
        self.parts = OOBTree()      # {(level,part):rids}
        self.rids = IOBTree()       # {rid:(level,part)s}
        self.levels = IOBTree()     # {level:rids}


    def learn(self, rid, value):
        """Given an rid and a value, associate them.
        """
        String.learn(self, rid, value)


        # Parse and validate.
        # ===================
        # Value is an absolute path, rooted in self.root.

        if not isinstance(value, basestring):
            raise TypeError("string expected")
        elif value and not value.startswith(os.sep):
            raise ValueError("path not specified absolutely: '%s'" % value)
        if self.case_sensitive:
            path = value
        else:
            path = value.lower()
        path = path.rstrip(os.sep) # safety net; should never need this
        parts = value.split(os.sep)
        #parts = value.split(os.sep)[1:]


        # Add to simple identity indices.
        # ===============================

        self.path2rid[path] = rid
        self.rid2path[rid] = path


        # Add to complex level/part indices.
        # ==================================

        for level in range(len(parts)):
            token_ = (level, parts[level])


            # Add to (one:many) mapping of (level,part) to [rids].
            # ====================================================

            if token_ not in self.parts:
                self.parts[token_] = IITreeSet([rid])
            else:
                self.parts[token_].insert(rid)


            # Add to the (one:many) mapping of rid to (level,part)s.
            # ======================================================
            # This exists so we know how to forget about this rid when the time
            # comes.

            if rid not in self.rids:
                self.rids[rid] = OOSet([token_])
            else:
                self.rids[rid].insert(token_)


        # Add to (one:many) mapping of levels to rids.
        # ============================================
        # This is used to implement level limits.

        if level not in self.levels:
            self.levels[level] = IITreeSet([rid])
        else:
            self.levels[level].insert(rid)


    def forget(self, rid):
        """Given an rid, remove it from all indices.
        """
        String.forget(self, rid)


        # Remove from the (one:many) mapping of (level, part) to rids.
        # ============================================================
        # We also track the level here and remove the rid from the (one:many)
        # mapping of levels to rids.

        level = -1
        for token_ in self.rids[rid]:
            if token_[0] > level:
                level = token_[0]
            self.parts[token_].remove(rid)
            if len(self.parts[token_]) == 0:
                del self.parts[token_]
        self.levels[level].remove(rid)
        if len(self.levels[level]) == 0:
            del self.levels[level]


        # Remove from the (one:many) mapping of rid to tokens.
        # ====================================================

        del self.rids[rid]


        # Remove from simple identity indices.
        # ====================================
        path = self.rid2path[rid]
        del self.path2rid[path]
        del self.rid2path[rid]


    # Searches
    # ========

    def above(self, arg):
        """Find all resources at or above path, within the limits given.

        Here we actually call below() on <path> and all of its ancestors,
        passing the limits straight through, with the exception that limits
        default to 0:1 rather than None:None. Use '0:' for the latter.

        """

        # Parse and validate.
        # ===================

        path, upper, lower = self._path_and_limits(arg)
        rid = self.path2rid.get(path, None)
        if rid is None:
            return


        # Build
        # =====

        tmpl = "%s "
        if (upper, lower) == (None, None):
            tmpl += '0:1' # default: breadcrumbs
        else:
            if upper is not None:
                tmpl += str(upper)
            tmpl += ":"
            if lower is not None:
                tmpl += str(lower)

        parts = path.split(os.sep)
        rids = []
        for level in range(len(parts)):
            ancestor = os.sep.join(parts[:level+1])
            ancestor = ancestor and ancestor or '/'
            rids.append(self.below(tmpl % ancestor))
        rids = multiunion(rids)
        return rids


    def below(self, arg):
        """Find all resources at or below path, within the limits given.
        """

        # Parse and validate.
        # ===================

        path, upper, lower = self._path_and_limits(arg)
        rid = self.path2rid.get(path, None)
        if rid is None:
            return


        # Build
        # =====

        parts = path.split(os.sep)
        rids = None
        for level in range(len(parts)):
            rids = intersection(rids, self.parts[(level, parts[level])])
        if rids is None:
            return IISet() # short-cut


        # Limits
        # ======
        # Remove rids that are above any upper limit, and then only include rids
        # that are above any lower limit. Limits are relative to the level of
        # the requested path.

        if upper is not None:
            upper += level
            for i in range(level, upper):
                if i not in self.levels:
                    break
                rids = difference(rids, self.levels[i])
        if lower is not None:
            lower += level
            _rids = []
            for i in range(level, lower):
                if i not in self.levels:
                    break
                _rids.append(self.levels[i])
            rids = intersection(rids, multiunion(_rids))

        return rids


    def is_(self, arg):
        """Return the rid corresponding to a single path. Root is special-cased.
        """
        path, foo, bar = self._path_and_limits(arg)
        return self.path2rid.get(path, None)


    # Parser
    # ======

    def _path_and_limits(self, arg):
        """Given an argument from a Collection constraint, return three params.

        Arg is of the form:

           /some/path 0:4

        The first token is the path, the second is a limits specification. The
        path must not contain a space (@@: really should support that). The
        limits spec is optional; if given, it must have a colon and at least one
        end specified. To the left of the colon is the upper bound; to the right
        is the lower bound. These bounds specify the tree levels that the path
        filter should apply to, but the specifics of how it applies depend on
        the searches above.

        (Yes this nomenclature is all wacky. The root is conceptually 'higher'
        for some reason, even though the root is 0 and a real tree's roots are
        lower than its branches. Go figure.)

        """

        path = ''
        upper = None
        lower = None

        parts = arg.split()
        nparts = len(parts)
        assert nparts in (1, 2), "either need path or path and limits"


        # Path
        # ====

        if nparts == 1:
            path = parts[0]
        elif nparts == 2:
            path = parts[0]


            # Limits
            # ======

            limits = parts[1]
            if not limits.count(':') == 1:
                raise ValueError("malformed limits (no colon): '%s'" % limits)
            upper, lower = limits.split(':')
            #if not (upper + lower):
            #    raise ValueError("no limits given: '%s'" % limits)

            if not upper:
                upper = None
            else:
                if not upper.isdigit():
                    raise ValueError("bad upper limit: '%s'" % upper)
                upper = int(upper)

            if not lower:
                lower = None
            else:
                if not lower.isdigit():
                    raise ValueError("bad lower limit: '%s'" % lower)
                lower = int(lower)

            if None not in (upper, lower):
                if upper > lower:
                    raise ValueError( "upper limit greater than lower: "
                                    + "%d > %d" % (upper, lower)
                                     )

        if path == os.sep:
            path = ''
        if not self.case_sensitive:
            path = path.lower()
        return path, upper, lower
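
    # Hedged comment sketch of the argument grammar parsed above (assumes a
    # POSIX os.sep of '/' and a case-sensitive index):
    #
    #   '/some/path 0:4'  ->  ('/some/path', 0, 4)
    #   '/some/path :2'   ->  ('/some/path', None, 2)
    #   '/'               ->  ('', None, None)    # root special case
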
class Repository(Implicit, Persistent):
    """The repository implementation manages the actual data of versions
       and version histories. It does not handle user interface issues."""

    def __init__(self):
        # These keep track of symbolic label and branch names that
        # have been used to ensure that they don't collide.
        self._branches = OIBTree()
        self._branches['mainline'] = 1
        self._labels = OIBTree()

        self._histories = OOBTree()
        self._created = time.time()

    security = ClassSecurityInfo()

    security.declarePrivate('createVersionHistory')
    def createVersionHistory(self, object):
        """Internal: create a new version history for a resource."""
        # When one creates the first version in a version history, neither
        # the version or version history yet have a _p_jar, which causes
        # copy operations to fail. To work around that, we share our _p_jar.
        history_id = None
        while history_id is None or self._histories.has_key(history_id):
            history_id = str(randint(1, 9999999999))
        history = ZopeVersionHistory(history_id, object)
        self._histories[history_id] = history
        return history.__of__(self)

    security.declarePrivate('getVersionHistory')
    def getVersionHistory(self, history_id):
        """Internal: return a version history given a version history id."""
        return self._histories[history_id].__of__(self)

    security.declarePrivate('replaceState')
    def replaceState(self, obj, new_state):
        """Internal: replace the state of a persistent object.
        """
        non_versioned = getNonVersionedData(obj)
        # XXX There ought to be some way to do this more cleanly.
        # This fills the __dict__ of the old object with new state.
        # The other way to achieve the desired effect is to replace
        # the object in its container, but this method preserves the
        # identity of the object.
        if obj.__class__ is not new_state.__class__:
            raise VersionControlError(
                "The class of the versioned object has changed. %s != %s"
                % (repr(obj.__class__, new_state.__class__)))
        obj._p_changed = 1
        for key in obj.__dict__.keys():
            if not new_state.__dict__.has_key(key):
                del obj.__dict__[key]
        for key, value in new_state.__dict__.items():
            obj.__dict__[key] = value
        if non_versioned:
            # Restore the non-versioned data into the new state.
            restoreNonVersionedData(obj, non_versioned)
        return obj

    #####################################################################
    # This is the implementation of the public version control interface.
    #####################################################################

    security.declarePublic('isAVersionableResource')
    def isAVersionableResource(self, obj):
        # For now, an object must be persistent (have its own db record)
        # in order to be considered a versionable resource.
        return isAVersionableResource(obj)

    security.declarePublic('isUnderVersionControl')
    def isUnderVersionControl(self, object):
        return hasattr(object, '__vc_info__')

    security.declarePublic('isResourceUpToDate')
    def isResourceUpToDate(self, object, require_branch=0):
        info = self.getVersionInfo(object)
        history = self.getVersionHistory(info.history_id)
        branch = 'mainline'
        if info.sticky:
            if info.sticky[0] == 'B':
                branch = info.sticky[1]
            elif require_branch:
                # The object is updated to a particular version
                # rather than a branch.  The caller
                # requires a branch.
                return 0
        return history.isLatestVersion(info.version_id, branch)

    security.declarePublic('isResourceChanged')
    def isResourceChanged(self, object):
        # Return true if the state of a resource has changed in a transaction
        # *after* the version bookkeeping was saved. Note that this method is
        # not appropriate for detecting changes within a transaction!
        info = self.getVersionInfo(object)
        itime = getattr(info, '_p_mtime', None)
        if itime is None:
            return 0
        mtime = Utility._findModificationTime(object)
        if mtime is None:
            return 0
        return mtime > itime

    security.declarePublic('getVersionInfo')
    def getVersionInfo(self, object):
        info = getattr(object, '__vc_info__', None)
        if info is not None:
            return info
        raise VersionControlError(
            'The specified resource is not under version control.'
            )

    security.declareProtected(use_vc_permission, 'applyVersionControl')
    def applyVersionControl(self, object, message=None):
        if self.isUnderVersionControl(object):
            raise VersionControlError(
                'The resource is already under version control.'
                )
        if not self.isAVersionableResource(object):
            raise VersionControlError(
                'This resource cannot be put under version control.'
                )

        # Need to check the parent to see if the container of the object
        # being put under version control is itself a version-controlled
        # object. If so, we need to use the branch id of the container.
        branch = 'mainline'
        parent = aq_parent(aq_inner(object))
        p_info = getattr(parent, '__vc_info__', None)
        if p_info is not None:
            sticky = p_info.sticky
            if sticky and sticky[0] == 'B':
                branch = sticky[1]

        # Create a new version history and initial version object.
        history = self.createVersionHistory(object)
        version = history.createVersion(object, branch)

        history_id = history.getId()
        version_id = version.getId()

        # Add bookkeeping information to the version controlled object.
        info = VersionInfo(history_id, version_id, VersionInfo.CHECKED_IN)
        if branch != 'mainline':
            info.sticky = ('B', branch)
        object.__vc_info__ = info

        # Save an audit record of the action being performed.
        history.addLogEntry(version_id,
                            LogEntry.ACTION_CHECKIN,
                            _findPath(object),
                            message is None and 'Initial checkin.' or message
                            )
        return object

    security.declareProtected(use_vc_permission, 'checkoutResource')
    def checkoutResource(self, object):
        info = self.getVersionInfo(object)
        if info.status != info.CHECKED_IN:
            raise VersionControlError(
                'The selected resource is already checked out.'
                )

        if info.sticky and info.sticky[0] != 'B':
            raise VersionControlError(
                'The selected resource has been updated to a particular '
                'version, label or date. The resource must be updated to '
                'the mainline or a branch before it may be checked out.'
                )

        if not self.isResourceUpToDate(object):
            raise VersionControlError(
                'The selected resource is not up to date!'
                )

        history = self.getVersionHistory(info.history_id)
        ob_path = _findPath(object)

        # Save an audit record of the action being performed.
        history.addLogEntry(info.version_id,
                            LogEntry.ACTION_CHECKOUT,
                            ob_path
                            )

        # Update bookkeeping information.
        newinfo = info.clone()
        newinfo.status = newinfo.CHECKED_OUT
        object.__vc_info__ = newinfo
        return object

    security.declareProtected(use_vc_permission, 'checkinResource')
    def checkinResource(self, object, message=''):
        info = self.getVersionInfo(object)
        if info.status != info.CHECKED_OUT:
            raise VersionControlError(
                'The selected resource is not checked out.'
                )

        if info.sticky and info.sticky[0] != 'B':
            raise VersionControlError(
                'The selected resource has been updated to a particular '
                'version, label or date. The resource must be updated to '
                'the mainline or a branch before it may be checked in.'
                )

        if not self.isResourceUpToDate(object):
            raise VersionControlError(
                'The selected resource is not up to date!'
                )

        history = self.getVersionHistory(info.history_id)
        ob_path = _findPath(object)

        branch = 'mainline'
        if info.sticky is not None and info.sticky[0] == 'B':
            branch = info.sticky[1]

        version = history.createVersion(object, branch)

        # Save an audit record of the action being performed.
        history.addLogEntry(version.getId(),
                            LogEntry.ACTION_CHECKIN,
                            ob_path,
                            message
                            )

        # Update bookkeeping information.
        newinfo = info.clone()
        newinfo.version_id = version.getId()
        newinfo.status = newinfo.CHECKED_IN
        object.__vc_info__ = newinfo
        return object

    security.declareProtected(use_vc_permission, 'uncheckoutResource')
    def uncheckoutResource(self, object):
        info = self.getVersionInfo(object)
        if info.status != info.CHECKED_OUT:
            raise VersionControlError(
                'The selected resource is not checked out.'
                )

        history = self.getVersionHistory(info.history_id)
        ob_path = _findPath(object)

        version = history.getVersionById(info.version_id)
        new_obj = version.copyState()

        # Save an audit record of the action being performed.
        history.addLogEntry(info.version_id,
                            LogEntry.ACTION_UNCHECKOUT,
                            ob_path
                            )

        # Replace the state of the object with a reverted state.
        new_obj = self.replaceState(object, new_obj)

        # Update bookkeeping information.
        newinfo = info.clone()
        newinfo.version_id = version.getId()
        newinfo.status = newinfo.CHECKED_IN
        new_obj.__vc_info__ = newinfo
        return new_obj

    security.declareProtected(use_vc_permission, 'updateResource')
    def updateResource(self, object, selector=None):
        info = self.getVersionInfo(object)
        if info.status != info.CHECKED_IN:
            raise VersionControlError(
                'The selected resource must be checked in to be updated.'
                )

        history = self.getVersionHistory(info.history_id)
        version = None
        sticky = info.sticky

        if not selector:
            # If selector is null, update to the latest version taking any
            # sticky attrs into account (branch, date). Note that the sticky
            # tag could also be a date or version id. We don't bother checking
            # for those, since in both cases we do nothing (because we'll
            # always be up to date until the sticky tag changes).
            if sticky and sticky[0] == 'L':
                # A label sticky tag, so update to that label (since it is
                # possible, but unlikely, that the label has been moved).
                version = history.getVersionByLabel(sticky[1])
            elif sticky and sticky[0] == 'B':
                # A branch sticky tag. Update to latest version on branch.
                version = history.getLatestVersion(sticky[1])
            else:
                # Update to mainline, forgetting any date or version id
                # sticky tag that was previously associated with the object.
                version = history.getLatestVersion('mainline')
                sticky = None
        else:
            # If the selector is non-null, we find the version specified
            # and update the sticky tag. Later we'll check the version we
            # found and decide whether we really need to update the object.
            if history.hasVersionId(selector):
                version = history.getVersionById(selector)
                sticky = ('V', selector)

            elif self._labels.has_key(selector):
                version = history.getVersionByLabel(selector)
                sticky = ('L', selector)

            elif self._branches.has_key(selector):
                version = history.getLatestVersion(selector)
                if selector == 'mainline':
                    sticky = None
                else:
                    sticky = ('B', selector)
            else:
                try:    date = DateTime(selector)
                except:
                    raise VersionControlError(
                        'Invalid version selector: %s' % selector
                        )
                else:
                    timestamp = date.timeTime()
                    sticky = ('D', timestamp)
                    # Fix!
                    branch = history.findBranchId(info.version_id)
                    version = history.getVersionByDate(branch, timestamp)

        # If the state of the resource really needs to be changed, do the
        # update and make a log entry for the update.
        version_id = version and version.getId() or info.version_id
        new_object = object
        if version and (version_id != info.version_id):
            new_object = version.copyState()
            new_object = self.replaceState(object, new_object)

            history.addLogEntry(version_id,
                                LogEntry.ACTION_UPDATE,
                                _findPath(new_object)
                                )

        # Update bookkeeping information.
        newinfo = info.clone(1)
        newinfo.version_id = version_id
        newinfo.status = newinfo.CHECKED_IN
        if sticky is not None:
            newinfo.sticky = sticky
        new_object.__vc_info__ = newinfo
        return new_object

    security.declareProtected(use_vc_permission, 'labelResource')
    def labelResource(self, object, label, force=0):
        info = self.getVersionInfo(object)
        if info.status != info.CHECKED_IN:
            raise VersionControlError(
                'The selected resource must be checked in to be labeled.'
                )

        # Make sure that labels and branch ids do not collide.
        if self._branches.has_key(label) or label == 'mainline':
            raise VersionControlError(
                'The label value given is already in use as an activity id.'
                )
        if not self._labels.has_key(label):
            self._labels[label] = 1

        history = self.getVersionHistory(info.history_id)
        history.labelVersion(info.version_id, label, force)
        return object

    security.declareProtected(use_vc_permission, 'makeActivity')
    def makeActivity(self, object, branch_id):
        # Note - this is not part of the official version control API yet.
        # It is here to allow unit testing of the architectural aspects
        # that are already in place to support activities in the future.

        info = self.getVersionInfo(object)
        if info.status != info.CHECKED_IN:
            raise VersionControlError(
                'The selected resource must be checked in.'
                )

        branch_id = branch_id or None

        # Make sure that activity ids and labels do not collide.
        if self._labels.has_key(branch_id) or branch_id == 'mainline':
            raise VersionControlError(
                'The value given is already in use as a version label.'
                )

        if not self._branches.has_key(branch_id):
            self._branches[branch_id] = 1

        history = self.getVersionHistory(info.history_id)

        if history._branches.has_key(branch_id):
            raise VersionControlError(
                'The resource is already associated with the given activity.'
                )

        history.createBranch(branch_id, info.version_id)
        return object

    security.declareProtected(use_vc_permission, 'getVersionOfResource')
    def getVersionOfResource(self, history_id, selector):
        history = self.getVersionHistory(history_id)
        sticky = None

        if not selector or selector == 'mainline':
            version = history.getLatestVersion('mainline')
        else:
            if history.hasVersionId(selector):
                version = history.getVersionById(selector)
                sticky = ('V', selector)

            elif self._labels.has_key(selector):
                version = history.getVersionByLabel(selector)
                sticky = ('L', selector)

            elif self._branches.has_key(selector):
                version = history.getLatestVersion(selector)
                sticky = ('B', selector)
            else:
                try: date = DateTime(selector)
                except:
                    raise VersionControlError(
                        'Invalid version selector: %s' % selector
                        )
                else:
                    timestamp = date.timeTime()
                    sticky = ('D', timestamp)
                    version = history.getVersionByDate('mainline', timestamp)

        object = version.copyState()

        info = VersionInfo(history_id, version.getId(), VersionInfo.CHECKED_IN)
        if sticky is not None:
            info.sticky = sticky
        object.__vc_info__ = info
        return object

    security.declareProtected(use_vc_permission, 'getVersionIds')
    def getVersionIds(self, object):
        info = self.getVersionInfo(object)
        history = self.getVersionHistory(info.history_id)
        return history.getVersionIds()

    security.declareProtected(use_vc_permission, 'getLabelsForResource')
    def getLabelsForResource(self, object):
        info = self.getVersionInfo(object)
        history = self.getVersionHistory(info.history_id)
        return history.getLabels()

    security.declareProtected(use_vc_permission, 'getLogEntries')
    def getLogEntries(self, object):
        info = self.getVersionInfo(object)
        history = self.getVersionHistory(info.history_id)
        return history.getLogEntries()
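
# Hedged usage sketch of the public Repository workflow above ('repo' is a
# Repository instance and 'doc' a versionable persistent object; both names
# are illustrative):
#
#   doc = repo.applyVersionControl(doc)        # initial checkin
#   doc = repo.checkoutResource(doc)
#   # ... mutate doc ...
#   doc = repo.checkinResource(doc, 'Second version.')
#   doc = repo.updateResource(doc, 'mainline')
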
 def clear(self):
     self._lexicon = OIBTree()
     self._inverseLex = IOBTree()
     self._digrams = OOBTree()
class GlobbingLexicon(Lexicon):
    """Lexicon which supports basic globbing function ('*' and '?').

    This lexicon keeps several data structures around that are useful
    for searching. They are:

      '_lexicon' -- Contains the mapping from word => word_id

      '_inverseLex' -- Contains the mapping from word_id => word

      '_digrams' -- Contains a mapping from digram => word_id

    Before going further, it is necessary to understand what a digram is,
    as it is a core component of the structure of this lexicon.  A digram
    is a two-letter sequence in a word.  For example, the word 'zope'
    would be converted into the digrams::

      ['$z', 'zo', 'op', 'pe', 'e$']

    where the '$' is a word marker.  It is used at the beginning and end
    of the words.  Those digrams are significant.
    """

    multi_wc = '*'
    single_wc = '?'
    eow = '$'


    def __init__(self,useSplitter=None,extra=None):
        self.clear()
        self.useSplitter = useSplitter
        self.splitterParams = extra
        self.SplitterFunc = Splitter.getSplitter(self.useSplitter)

    def clear(self):
        self._lexicon = OIBTree()
        self._inverseLex = IOBTree()
        self._digrams = OOBTree()

    def _convertBTrees(self, threshold=200):
        Lexicon._convertBTrees(self, threshold)
        if type(self._digrams) is OOBTree: return

        from BTrees.convert import convert

        _digrams=self._digrams
        self._digrams=OOBTree()
        self._digrams._p_jar=self._p_jar
        convert(_digrams, self._digrams, threshold, IITreeSet)


    def createDigrams(self, word):
        """Returns a list with the set of digrams in the word."""

        word = '$'+word+'$'
        return [ word[i:i+2] for i in range(len(word)-1)]


    def getWordId(self, word):
        """Provided 'word', return the matching integer word id."""

        if self._lexicon.has_key(word):
            return self._lexicon[word]
        else:
            return self.assignWordId(word)

    set = getWordId                     # Kludge for old code

    def getWord(self, wid):
        return self._inverseLex.get(wid, None)

    def assignWordId(self, word):
        """Assigns a new word id to the provided word, and return it."""

        # Double check it's not in the lexicon already, and if it is, just
        # return it.
        if self._lexicon.has_key(word):
            return self._lexicon[word]


        # Get word id. BBB Backward compat pain.
        inverse=self._inverseLex
        try: insert=inverse.insert
        except AttributeError:
            # we have an "old" BTree object
            if inverse:
                wid=inverse.keys()[-1]+1
            else:
                self._inverseLex=IOBTree()
                wid=1
            inverse[wid] = word
        else:
            # we have a "new" IOBTree object
            wid=randid()
            while not inverse.insert(wid, word):
                wid=randid()

        self._lexicon[word] = wid

        # Now take all the digrams and insert them into the digram map.
        for digram in self.createDigrams(word):
            set = self._digrams.get(digram, None)
            if set is None:
                self._digrams[digram] = set = IISet()
            set.insert(wid)

        return wid


    def get(self, pattern):
        """ Query the lexicon for words matching a pattern."""

        # A single-character pattern produces a slicing problem below.
        # Because the splitter throws away single characters we can
        # return an empty tuple here.

        if len(pattern) == 1:
            return ()

        wc_set = [self.multi_wc, self.single_wc]

        digrams = []
        globbing = 0
        for i in range(len(pattern)):
            if pattern[i] in wc_set:
                globbing = 1
                continue

            if i == 0:
                digrams.insert(i, (self.eow + pattern[i]) )
                digrams.append((pattern[i] + pattern[i+1]))
            else:
                try:
                    if pattern[i+1] not in wc_set:
                        digrams.append( pattern[i] + pattern[i+1] )

                except IndexError:
                    digrams.append( (pattern[i] + self.eow) )

        if not globbing:
            result = self._lexicon.get(pattern, None)
            if result is None:
                return ()
            return (result, )

        ## now get all of the intsets that contain the result digrams
        result = None
        for digram in digrams:
            result=union(result, self._digrams.get(digram, None))

        if not result:
            return ()
        else:
            ## now we have narrowed the list of possible candidates
            ## down to those words which contain digrams.  However,
            ## some words may have been returned that match digrams,
            ## but do not match 'pattern'.  This is because some words
            ## may contain all matching digrams, but in the wrong
            ## order.

            expr = re.compile(self.createRegex(pattern))
            words = []
            hits = IISet()
            for x in result:
                if expr.match(self._inverseLex[x]):
                    hits.insert(x)
            return hits


    def __getitem__(self, word):
        """ """
        return self.get(word)


    def query_hook(self, q):
        """expand wildcards"""
        ListType = type([])
        i = len(q) - 1
        while i >= 0:
            e = q[i]
            if isinstance(e, ListType):
                self.query_hook(e)
            elif isinstance(e, Op):
                pass
            elif ( (self.multi_wc in e) or
                   (self.single_wc in e) ):
                wids = self.get(e)
                words = []
                for wid in wids:
                    if words:
                        words.append(Or)
                    words.append(wid)
                if not words:
                    # if words is empty, return something that will make
                    # textindex's __getitem__ return an empty result list
                    words.append('')
                q[i] = words
            i = i - 1

        return q
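
    # Hedged comment sketch: query_hook expands a glob term in place into an
    # Or-joined list of matching word ids, e.g.
    #
    #   ['zo*']  ->  [[wid_of('zope'), Or, wid_of('zone'), ...]]
    #
    # (wid_of is illustrative; when nothing matches, the term becomes ['']
    # so the text index returns an empty result.)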

    def Splitter(self, astring, words=None, encoding="latin1"):
        """ wrap the splitter """

        ## We don't do any stemming here; it's less efficient, but there's
        ## not much sense in stemming a globbing lexicon.

        try:
            return self.SplitterFunc(
                    astring,
                    words,
                    encoding=encoding,
                    singlechar=self.splitterParams.splitterSingleChars,
                    indexnumbers=self.splitterParams.splitterIndexNumbers,
                    casefolding=self.splitterParams.splitterCasefolding
                    )
        except:
            return self.SplitterFunc(astring, words)


    def createRegex(self, pat):
        """Translate a PATTERN to a regular expression.

        There is no way to quote meta-characters.
        """

        # Remove characters that are meaningful in a regex
        if not isinstance(pat, UnicodeType):
            transTable = string.maketrans("", "")
            result = string.translate(pat, transTable,
                                      r'()&|!@#$%^{}\<>.')
        else:
            transTable={}
            for ch in r'()&|!@#$%^{}\<>.':
                transTable[ord(ch)]=None
            result=pat.translate(transTable)

        # First, deal with multi-character globbing
        result = result.replace( '*', '.*')

        # Next, we need to deal with single-character globbing
        result = result.replace( '?', '.')

        return "%s$" % result
Example #53
0
class Lexicon(Persistent):
    """
    Implementation of :class:`zope.index.text.interfaces.ILexicon`.
    """

    def __init__(self, *pipeline):
        self._wids = OIBTree()  # word -> wid
        self._words = IOBTree() # wid -> word
        # wid 0 is reserved for words that aren't in the lexicon (OOV -- out
        # of vocabulary).  This can happen, e.g., if a query contains a word
        # we never saw before, and that isn't a known stopword (or otherwise
        # filtered out).  Returning a special wid value for OOV words is a
        # way to let clients know when an OOV word appears.
        self.wordCount = Length()
        self._pipeline = pipeline

    def wordCount(self):
        """Return the number of unique terms in the lexicon."""
        # overridden per instance
        return len(self._wids)

    def words(self):
        return self._wids.keys()

    def wids(self):
        return self._words.keys()

    def items(self):
        return self._wids.items()

    def sourceToWordIds(self, text):
        if text is None:
            text = ''
        last = _text2list(text)
        for element in self._pipeline:
            last = element.process(last)
        if not isinstance(self.wordCount, Length):
            # Make sure wordCount is overridden with a BTrees.Length.Length
            self.wordCount = Length(self.wordCount())
        # Strategically unload the length value so that we get the most
        # recent value written to the database to minimize conflicting wids
        # Because length is independent, this will load the most
        # recent value stored, regardless of whether MVCC is enabled
        self.wordCount._p_deactivate()
        return list(map(self._getWordIdCreate, last))

    def termToWordIds(self, text):
        last = _text2list(text)
        for element in self._pipeline:
            last = element.process(last)
        wids = []
        for word in last:
            wids.append(self._wids.get(word, 0))
        return wids

    def parseTerms(self, text):
        last = _text2list(text)
        for element in self._pipeline:
            process = getattr(element, "processGlob", element.process)
            last = process(last)
        return last

    def isGlob(self, word):
        return "*" in word or "?" in word

    def get_word(self, wid):
        return self._words[wid]

    def get_wid(self, word):
        return self._wids.get(word, 0)

    def globToWordIds(self, pattern):
        # Implement * and ? just as in the shell, except the pattern
        # must not start with either of these
        prefix = ""
        while pattern and pattern[0] not in "*?":
            prefix += pattern[0]
            pattern = pattern[1:]
        if not pattern:
            # There were no globbing characters in the pattern
            wid = self._wids.get(prefix, 0)
            if wid:
                return [wid]
            else:
                return []
        if not prefix:
            # The pattern starts with a globbing character.
            # Matching that would be too inefficient, so we raise an
            # exception.
            raise QueryError(
                "pattern %r shouldn't start with glob character" % pattern)
        pat = prefix
        for c in pattern:
            if c == "*":
                pat += ".*"
            elif c == "?":
                pat += "."
            else:
                pat += re.escape(c)
        pat += "$"
        prog = re.compile(pat)
        keys = self._wids.keys(prefix) # Keys starting at prefix
        wids = []
        for key in keys:
            if not key.startswith(prefix):
                break
            if prog.match(key):
                wids.append(self._wids[key])
        return wids

    def _getWordIdCreate(self, word):
        wid = self._wids.get(word)
        if wid is None:
            wid = self._new_wid()
            self._wids[word] = wid
            self._words[wid] = word
        return wid

    def _new_wid(self):
        count = self.wordCount
        count.change(1)
        while count() in self._words:
            # just to be safe
            count.change(1)
        return count()
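
# Hedged usage sketch for the Lexicon above (Splitter and CaseNormalizer are
# pipeline elements from zope.index.text.lexicon; treating them as available
# here is an assumption):
#
#   lex = Lexicon(Splitter(), CaseNormalizer())
#   lex.sourceToWordIds('cats and dogs')   # assigns fresh wids, e.g. [1, 2, 3]
#   lex.termToWordIds('unseen-word')       # -> [0], the reserved OOV wid
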
Example #54
0
class DocumentMap(Persistent):
    """ A two-way map between addresses (e.g. location paths) and document ids.

    The map is a persistent object meant to live in a ZODB storage.

    Additionally, the map is capable of mapping 'metadata' to docids.
    """
    _v_nextid = None
    family = BTrees.family32
    _randrange = random.randrange
    docid_to_metadata = None  # latch for b/c

    def __init__(self):
        self.docid_to_address = IOBTree()
        self.address_to_docid = OIBTree()
        self.docid_to_metadata = IOBTree()

    def docid_for_address(self, address):
        """ Retrieve a document id for a given address.

        ``address`` is a string or other hashable object which represents
        a token known by the application.

        Return the integer document id corresponding to ``address``.

        If ``address`` doesn't exist in the document map, return None.
        """
        return self.address_to_docid.get(address)

    def address_for_docid(self, docid):
        """ Retrieve an address for a given document id.

        ``docid`` is an integer document id.

        Return the address corresponding to ``docid``.

        If ``docid`` doesn't exist in the document map, return None.
        """
        return self.docid_to_address.get(docid)

    def add(self, address, docid=_marker):
        """ Add a new document to the document map.

        ``address`` is a string or other hashable object which represents
        a token known by the application.

        ``docid``, if passed, must be an int.  In this case, remove
        any previous address stored for it before mapping it to the
        new address.  Passing an explicit ``docid`` also removes any
        metadata associated with that docid.
        
        If ``docid`` is not passed, generate a new docid.

        Return the integer document id mapped to ``address``.
        """
        if docid is _marker:
            docid = self.new_docid()

        self.remove_docid(docid)
        self.remove_address(address)

        self.docid_to_address[docid] = address
        self.address_to_docid[address] = docid
        return docid

    def remove_docid(self, docid):
        """ Remove a document from the document map for the given document ID.

        ``docid`` is an integer document id.

        Remove any corresponding metadata for ``docid`` as well.

        Return True if ``docid`` existed in the map, else return False.
        """
        # It should be an invariant that if one entry exists in
        # docid_to_address for a docid/address pair, exactly one
        # corresponding entry exists in address_to_docid for the same
        # docid/address pair.  However, versions of this code before
        # r.catalog 0.7.3 had a bug which, if this method was called
        # multiple times, each time with the same address but a
        # different docid, the ``docid_to_address`` mapping could
        # contain multiple entries for the same address each with a
        # different docid, causing this invariant to be violated.  The
        # symptom: in systems that used r.catalog 0.7.2 and lower,
        # there might be more entries in docid_to_address than there
        # are in address_to_docid.  The conditional fuzziness in the
        # code directly below is a runtime kindness to systems in that
        # state.  Technically, the administrator of a system in such a
        # state should normalize the two data structures by running a
        # script after upgrading to 0.7.3.  If we made the admin do
        # this, some of the code fuzziness below could go away,
        # replaced with something simpler.  But there's no sense in
        # breaking systems at runtime through being a hardass about
        # consistency if an unsuspecting upgrader has not yet run the
        # data fixer script. The "fix the data" mantra rings a
        # little hollow when you weren't the one who broke the data in
        # the first place ;-)

        self._check_metadata()

        address = self.docid_to_address.get(docid, _marker)
        if address is _marker:
            return False

        old_docid = self.address_to_docid.get(address, _marker)
        if (old_docid is not _marker) and (old_docid != docid):
            self.remove_docid(old_docid)

        if docid in self.docid_to_address:
            del self.docid_to_address[docid]
        if address in self.address_to_docid:
            del self.address_to_docid[address]
        if docid in self.docid_to_metadata:
            del self.docid_to_metadata[docid]

        return True

    def remove_address(self, address):
        """ Remove a document from the document map using an address.

        ``address`` is a string or other hashable object which represents
        a token known by the application.

        Remove any corresponding metadata for ``address`` as well.

        Return True if ``address`` existed in the map, else return False.
        """
        # See the comment in remove_docid for complexity rationalization

        self._check_metadata()

        docid = self.address_to_docid.get(address, _marker)
        if docid is _marker:
            return False

        old_address = self.docid_to_address.get(docid, _marker)
        if (old_address is not _marker) and (old_address != address):
            self.remove_address(old_address)

        if docid in self.docid_to_address:
            del self.docid_to_address[docid]
        if address in self.address_to_docid:
            del self.address_to_docid[address]
        if docid in self.docid_to_metadata:
            del self.docid_to_metadata[docid]

        return True

    def _check_metadata(self):
        # backwards compatibility
        if self.docid_to_metadata is None:
            self.docid_to_metadata = IOBTree()

    def add_metadata(self, docid, data):
        """ Add metadata related to a given document id.

        ``data`` must be a mapping, such as a dictionary.
        
        For each key/value pair in ``data`` insert a metadata key/value pair
        into the metadata stored for ``docid``.

        Overwrite any existing values for the keys in ``data``, leaving values
        unchanged for other existing keys.

        Raise a KeyError if ``docid`` doesn't relate to an address in the
        document map.
        """
        if not docid in self.docid_to_address:
            raise KeyError(docid)
        if len(list(data.keys())) == 0:
            return
        self._check_metadata()
        meta = self.docid_to_metadata.setdefault(docid, OOBTree())
        for k in data:
            meta[k] = data[k]

    def remove_metadata(self, docid, *keys):
        """ Remove metadata related to a given document id.

        If ``docid`` doesn't exist in the metadata map, raise a KeyError.

        For each key in ``keys``, remove the metadata value for the
        docid related to that key.
        
        Do not raise any error if no value exists for a given key.

        If no keys are specified, remove all metadata related to the docid.
        """
        self._check_metadata()
        if keys:
            meta = self.docid_to_metadata.get(docid, _marker)
            if meta is _marker:
                raise KeyError(docid)
            for k in keys:
                if k in meta:
                    del meta[k]
            if not meta:
                del self.docid_to_metadata[docid]
        else:
            if not (docid in self.docid_to_metadata):
                raise KeyError(docid)
            del self.docid_to_metadata[docid]

    def get_metadata(self, docid):
        """ Return the metadata for ``docid``.

        Return a mapping of the keys and values set using ``add_metadata``.

        Raise a KeyError if metadata does not exist for ``docid``.
        """
        if self.docid_to_metadata is None:
            raise KeyError(docid)
        meta = self.docid_to_metadata[docid]
        return meta

    def new_docid(self):
        """ Return a new document id.

        The returned value is guaranteed not to be used already in this
        document map.
        """
        while True:
            if self._v_nextid is None:
                self._v_nextid = self._randrange(self.family.minint,
                                                 self.family.maxint)
            uid = self._v_nextid
            self._v_nextid += 1
            if uid not in self.docid_to_address:
                return uid
            self._v_nextid = None
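
# Hedged usage sketch for the DocumentMap above (no names assumed beyond the
# class itself):
#
#   dm = DocumentMap()
#   docid = dm.add('/some/path')
#   dm.docid_for_address('/some/path')     # -> docid
#   dm.add_metadata(docid, {'title': 'Front page'})
#   dm.remove_docid(docid)                 # True; the metadata is dropped too
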
Example #55
0
 def __init__(self):
     self.docid_to_address = IOBTree()
     self.address_to_docid = OIBTree()
     self.docid_to_metadata = IOBTree()
Example #56
0
 def clear(self):
     self._length = Length()
     self._index = OIBTree()
     self._unindex = IOBTree()
     self._counter = Length()
Example #57
0
class Catalog(Persistent, Acquisition.Implicit, ExtensionClass.Base):
    """ An Object Catalog

    An Object Catalog maintains a table of object metadata, and a
    series of manageable indexes to quickly search for objects
    (references in the metadata) that satisfy a search query.

    This class is not Zope specific, and can be used in any python
    program to build catalogs of objects.  Note that it does require
    the objects to be Persistent, and thus must be used with ZODB3.
    """

    _v_brains = NoBrainer

    def __init__(self, vocabulary=None, brains=None):
        # Catalogs no longer care about vocabularies and lexicons
        # so the vocabulary argument is ignored. (Casey)

        self.schema = {}   # mapping from attribute name to column number
        self.names = ()    # sequence of column names
        self.indexes = {}  # mapping from index name to index object

        # The catalog maintains a BTree of object meta_data for
        # convenient display on result pages.  meta_data attributes
        # are turned into brain objects and returned by
        # searchResults.  The indexing machinery indexes all records
        # by an integer id (rid). self.data is a mapping from the
        # integer id to the meta_data, self.uids is a mapping of the
        # object unique identifier to the rid, and self.paths is a
        # mapping of the rid to the unique identifier.

        self.clear()

        if brains is not None:
            self._v_brains = brains

        self.updateBrains()

    def __len__(self):
        return self._length()

    def clear(self):
        """ clear catalog """

        self.data = IOBTree()  # mapping of rid to meta_data
        self.uids = OIBTree()  # mapping of uid to rid
        self.paths = IOBTree()  # mapping of rid to uid
        self._length = BTrees.Length.Length()

        for index in self.indexes.keys():
            self.getIndex(index).clear()
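
    # Hedged comment sketch of the bookkeeping invariant maintained above:
    #
    #   rid = catalog.uids[uid]        # uid -> rid
    #   catalog.paths[rid] == uid      # rid -> uid (the inverse mapping)
    #   catalog.data[rid]              # rid -> metadata record tuple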

    def updateBrains(self):
        self.useBrains(self._v_brains)

    def __getitem__(self, index):
        """
        Returns instances of self._v_brains, or whatever is passed
        into self.useBrains.
        """
        if isinstance(index, tuple):
            # then it contains a score...
            normalized_score, score, key = index
        else:
            # otherwise no score, set all scores to 1
            normalized_score, score, key = (1, 1, index)

        data = self.data[key]
        klass = self._v_result_class
        schema_len = len(klass.__record_schema__)
        if schema_len == len(data) + 3:
            # if we have complete data, create in a single pass
            r = klass(tuple(data) + (key, score, normalized_score))
        else:
            r = klass(data)
            r.data_record_id_ = key
            r.data_record_score_ = score
            r.data_record_normalized_score_ = normalized_score
        r = r.__of__(aq_parent(self))
        return r
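
    # Hedged comment sketch of the two access forms handled above:
    #
    #   catalog[rid]                   # plain access; both scores default to 1
    #   catalog[(0.75, 150, rid)]      # scored tuple, e.g. from a text query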

    def __setstate__(self, state):
        """ initialize your brains.  This method is called when the
        catalog is first activated (from the persistent storage) """
        Persistent.__setstate__(self, state)
        self.updateBrains()

    def useBrains(self, brains):
        """ Sets up the Catalog to return an object (ala ZTables) that
        is created on the fly from the tuple stored in the self.data
        Btree.
        """

        class mybrains(AbstractCatalogBrain, brains):
            pass

        scopy = self.schema.copy()

        schema_len = len(self.schema.keys())
        scopy['data_record_id_'] = schema_len
        scopy['data_record_score_'] = schema_len + 1
        scopy['data_record_normalized_score_'] = schema_len + 2

        mybrains.__record_schema__ = scopy

        self._v_brains = brains
        self._v_result_class = mybrains

    def addColumn(self, name, default_value=None, threshold=10000):
        """Adds a row to the meta data schema"""
        schema = self.schema
        names = list(self.names)

        if name != name.strip():
            # Someone could have mistakenly added a space at the end
            # of the input field.
            LOG.warn("stripped space from new column %r -> %r", name,
                     name.strip())
            name = name.strip()

        if name in schema:
            raise CatalogError('The column %s already exists' % name)

        if name[0] == '_':
            raise CatalogError('Cannot cache fields beginning with "_"')

        values = schema.values()
        if values:
            schema[name] = max(values) + 1
        else:
            schema[name] = 0
        names.append(name)

        if default_value in (None, ''):
            default_value = MV

        if len(self):
            pghandler = ZLogHandler(threshold)
            pghandler.init('Adding %s column' % name, len(self))
            for i, (key, value) in enumerate(self.data.iteritems()):
                pghandler.report(i)
                self.data[key] = value + (default_value, )
            pghandler.finish()

        self.names = tuple(names)
        self.schema = schema

        # new column? update the brain
        self.updateBrains()

    def delColumn(self, name, threshold=10000):
        """Deletes a row from the meta data schema"""
        names = list(self.names)
        _index = names.index(name)

        if not name in self.schema:
            LOG.error('delColumn attempted to delete nonexistent '
                      'column %s.' % str(name))
            return

        del names[_index]

        # rebuild the schema
        schema = {}
        for i, name in enumerate(names):
            schema[name] = i

        self.schema = schema
        self.names = tuple(names)

        # update the brain
        self.updateBrains()

        # remove the column value from each record
        if len(self):
            _next_index = _index + 1
            pghandler = ZLogHandler(threshold)
            pghandler.init('Deleting %s column' % name, len(self))
            for i, (key, value) in enumerate(self.data.iteritems()):
                pghandler.report(i)
                self.data[key] = value[:_index] + value[_next_index:]
            pghandler.finish()
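
    # Hedged comment sketch of the schema-maintenance calls above:
    #
    #   catalog.addColumn('Title')     # every stored record grows a slot
    #   catalog.delColumn('Title')     # ... and every record shrinks again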

    def addIndex(self, name, index_type):
        """Create a new index, given a name and a index_type.

        Old format: passing index_type as a string ('FieldIndex', 'TextIndex'
        or 'KeywordIndex') is no longer valid; the actual index must be
        instantiated and passed in to addIndex.

        New format: index_type is the actual index object to be stored.
        """

        if name in self.indexes:
            raise CatalogError('The index %s already exists' % name)

        if name.startswith('_'):
            raise CatalogError('Cannot index fields beginning with "_"')

        if not name:
            raise CatalogError('Name of index is empty')

        if name != name.strip():
            # Someone could have mistakenly added a space at the end
            # of the input field.
            LOG.warn("stripped space from new index %r -> %r", name,
                     name.strip())
            name = name.strip()

        indexes = self.indexes

        if isinstance(index_type, str):
            raise TypeError("Catalog addIndex now requires the index type to"
                            "be resolved prior to adding; create the proper "
                            "index in the caller.")

        indexes[name] = index_type
        self.indexes = indexes

    def delIndex(self, name):
        """ deletes an index """

        if not name in self.indexes:
            raise CatalogError('The index %s does not exist' % name)

        indexes = self.indexes
        del indexes[name]
        self.indexes = indexes

    def getIndex(self, name):
        """ get an index wrapped in the catalog """
        return self.indexes[name].__of__(self)

    def updateMetadata(self, object, uid, index):
        """ Given an object and a uid, update the column data for the
        uid with the object data iff the object has changed """
        data = self.data
        newDataRecord = self.recordify(object)

        if index is None:
            index = getattr(self, '_v_nextid', 0)
            if index % 4000 == 0:
                index = randint(-2000000000, 2000000000)
            while not data.insert(index, newDataRecord):
                index = randint(-2000000000, 2000000000)

            # We want ids to be somewhat random, but there are
            # advantages for having some ids generated
            # sequentially when many catalog updates are done at
            # once, such as when reindexing or bulk indexing.
            # We allocate ids sequentially using a volatile base,
            # so different threads get different bases. This
            # further reduces conflict and reduces churn in
            # here and in result sets when bulk indexing.
            self._v_nextid = index + 1
        else:
            if data.get(index, 0) != newDataRecord:
                data[index] = newDataRecord
        return index
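
    # Hedged comment sketch of the rid allocation above: each thread draws a
    # random base and hands out sequential rids from it, re-randomizing on
    # collision and every 4000 ids:
    #
    #   thread A: r, r+1, r+2, ...     thread B: r', r'+1, r'+2, ...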

    # the cataloging API

    def catalogObject(self, object, uid, threshold=None, idxs=None,
                      update_metadata=True):
        """
        Adds an object to the Catalog by iteratively applying it to
        all indexes.

        'object' is the object to be cataloged

        'uid' is the unique Catalog identifier for this object

        If 'idxs' is specified (as a sequence), apply the object only
        to the named indexes.

        If 'update_metadata' is true (the default), also update metadata for
        the object.  If the object is new to the catalog, this flag has
        no effect (metadata is always created for new objects).
        """
        if idxs is None:
            idxs = []

        index = self.uids.get(uid, None)

        if index is None:
            # we are inserting new data
            index = self.updateMetadata(object, uid, None)
            self._length.change(1)
            self.uids[uid] = index
            self.paths[index] = uid
        elif update_metadata:
            # we are updating and we need to update metadata
            self.updateMetadata(object, uid, index)

        # do indexing
        total = 0

        if idxs == []:
            use_indexes = self.indexes.keys()
        else:
            use_indexes = idxs

        for name in use_indexes:
            x = self.getIndex(name)
            if hasattr(x, 'index_object'):
                blah = x.index_object(index, object, threshold)
                total = total + blah
            else:
                LOG.error('catalogObject was passed bad index '
                          'object %s.' % str(x))

        return total
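
    # Hedged usage sketch (using the object's path as the uid is a common
    # convention, assumed here):
    #
    #   catalog.catalogObject(obj, '/plone/front-page')   # index + metadata
    #   catalog.uncatalogObject('/plone/front-page')      # uid must match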

    def uncatalogObject(self, uid):
        """
        Uncatalog an object from the Catalog. 'uid' is the unique
        Catalog identifier for the object.

        Note, the uid must be the same as when the object was
        catalogued, otherwise it will not get removed from the catalog

        This method should not raise an exception if the uid cannot
        be found in the catalog.
        """
        data = self.data
        uids = self.uids
        paths = self.paths
        indexes = self.indexes.keys()
        rid = uids.get(uid, None)

        if rid is not None:
            for name in indexes:
                x = self.getIndex(name)
                if hasattr(x, 'unindex_object'):
                    x.unindex_object(rid)
            del data[rid]
            del paths[rid]
            del uids[uid]
            self._length.change(-1)

        else:
            LOG.error('uncatalogObject unsuccessfully '
                      'attempted to uncatalog an object '
                      'with a uid of %s. ' % str(uid))

    def uniqueValuesFor(self, name):
        """ return unique values for FieldIndex name """
        return tuple(self.getIndex(name).uniqueValues())

    def hasuid(self, uid):
        """ return the rid if catalog contains an object with uid """
        return self.uids.get(uid)

    def recordify(self, object):
        """ turns an object into a record tuple """
        record = []
        # the unique id is always the first element
        for x in self.names:
            attr = getattr(object, x, MV)
            if (attr is not MV and safe_callable(attr)):
                attr = attr()
            record.append(attr)
        return tuple(record)

    def instantiate(self, record):
        r = self._v_result_class(record[1])
        r.data_record_id_ = record[0]
        return r.__of__(self)

    def getMetadataForRID(self, rid):
        record = self.data[rid]
        result = {}
        for (key, pos) in self.schema.items():
            result[key] = record[pos]
        return result

    def getIndexDataForRID(self, rid):
        result = {}
        for name in self.indexes.keys():
            result[name] = self.getIndex(name).getEntryForObject(rid, "")
        return result

    # This is the Catalog search engine. Most of the heavy lifting happens
    # below

    def make_query(self, request):
        # This is a bit of a mess, but the ZCatalog API has traditionally
        # supported passing in query restrictions in almost arbitrary ways
        real_req = None
        if isinstance(request, dict):
            query = request.copy()
        elif isinstance(request, CatalogSearchArgumentsMap):
            query = {}
            query.update(request.keywords)
            real_req = request.request
            if isinstance(real_req, dict):
                query.update(real_req)
                real_req = None
        else:
            real_req = request

        if real_req:
            warnings.warn('You have specified a query using either a request '
                          'object or a mixture of a query dict and keyword '
                          'arguments. Please use only a simple query dict. '
                          'Your query contained "%s". This support is '
                          'deprecated and will be removed in Zope 4.' %
                          repr(real_req), DeprecationWarning, stacklevel=4)

            known_keys = query.keys()
            # The request has too many places where an index restriction
            # might be specified. Putting all of request.form,
            # request.other, ... into the query isn't what we want.
            # So we iterate over all known indexes instead and see if they
            # are in the request.
            for iid in self.indexes.keys():
                if iid in known_keys:
                    continue
                value = real_req.get(iid)
                if value:
                    query[iid] = value
        return query
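
    # Hedged comment sketch: each of these is canonicalized to a plain dict,
    # but only the first form avoids the deprecation warning above
    # (the CatalogSearchArgumentsMap usage shown is an assumption):
    #
    #   catalog.make_query({'getId': 'foo', 'b_start': 0})
    #   catalog.make_query(CatalogSearchArgumentsMap(REQUEST, {'getId': 'foo'}))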

    def _get_index_query_names(self, index):
        if hasattr(index, 'getIndexQueryNames'):
            return index.getIndexQueryNames()
        return (index.getId(),)

    def _sorted_search_indexes(self, query):
        # Simple implementation ordering only by limited result support
        query_keys = query.keys()
        order = []
        for name, index in self.indexes.items():
            for attr in self._get_index_query_names(index):
                if attr in query_keys:
                    order.append((ILimitedResultIndex.providedBy(index), name))
        order.sort()
        return [i[1] for i in order]

    def _limit_sequence(self, sequence, slen, b_start=0, b_size=None,
                        switched_reverse=False):
        if b_size is not None:
            sequence = sequence[b_start:b_start + b_size]
            if slen:
                slen = len(sequence)
        if switched_reverse:
            sequence.reverse()
        return (sequence, slen)

    def search(self,
            query, sort_index=None, reverse=False, limit=None, merge=True):
        """Iterate through the indexes, applying the query to each one. If
        merge is true then return a lazy result set (sorted if appropriate)
        otherwise return the raw (possibly scored) results for later merging.
        Limit is used in conjunction with sorting or scored results to inform
        the catalog how many results you are really interested in. The catalog
        can then use optimizations to save time and memory. The number of
        results is not guaranteed to fall within the limit, however, so you
        should still slice or batch the results as usual."""

        # Indexes fulfill a fairly large contract here. We hand each
        # index the query mapping we are given (which may be composed
        # of some combination of web request, kw mappings or plain old dicts)
        # and the index decides what to do with it. If the index finds work
        # for itself in the query, it returns the results and a tuple of
        # the attributes that were used. If the index finds nothing for it
        # to do then it returns None.

        # Canonicalize the request into a sensible query before passing it on
        query = self.make_query(query)

        cr = self.getCatalogPlan(query)
        cr.start()

        plan = cr.plan()
        if not plan:
            plan = self._sorted_search_indexes(query)

        rs = None  # result set
        indexes = self.indexes.keys()
        for i in plan:
            if i not in indexes:
                # We can have bogus keys or the plan can contain index names
                # that have been removed in the meantime
                continue

            index = self.getIndex(i)
            _apply_index = getattr(index, "_apply_index", None)
            if _apply_index is None:
                continue

            cr.start_split(i)
            limit_result = ILimitedResultIndex.providedBy(index)
            if limit_result:
                r = _apply_index(query, rs)
            else:
                r = _apply_index(query)

            if r is not None:
                r, u = r
                # Short circuit if empty result
                # BBB: We can remove the "r is not None" check in Zope 4
                # once we don't need to support the "return everything" case
                # anymore
                if r is not None and not r:
                    cr.stop_split(i, result=None, limit=limit_result)
                    return LazyCat([])

                # provide detailed info about the pure intersection time
                intersect_id = i + '#intersection'
                cr.start_split(intersect_id)
                # weightedIntersection preserves the values from any mappings
                # we get, as some indexes don't return simple sets
                if hasattr(rs, 'items') or hasattr(r, 'items'):
                    _, rs = weightedIntersection(rs, r)
                else:
                    rs = intersection(rs, r)

                cr.stop_split(intersect_id)

                # consider the time it takes to intersect the index result
                # with the total result set to be part of the index time
                cr.stop_split(i, result=r, limit=limit_result)
                if not rs:
                    break
            else:
                cr.stop_split(i, result=None, limit=limit_result)

        # Try to deduce the sort limit from batching arguments
        b_start = int(query.get('b_start', 0))
        b_size = query.get('b_size', None)
        if b_size is not None:
            b_size = int(b_size)

        if b_size is not None:
            limit = b_start + b_size
        elif limit and b_size is None:
            b_size = limit

        if sort_index is None:
            sort_report_name = None
        else:
            if isinstance(sort_index, list):
                sort_name = '-'.join(i.getId() for i in sort_index)
            else:
                sort_name = sort_index.getId()
            if isinstance(reverse, list):
                reverse_name = '-'.join(
                    'desc' if r else 'asc' for r in reverse)
            else:
                reverse_name = 'desc' if reverse else 'asc'
            sort_report_name = 'sort_on#' + sort_name + '#' + reverse_name
            if limit is not None:
                sort_report_name += '#limit-%s' % limit

        if rs is None:
            # None of the indexes found anything to do with the query
            # We take this to mean that the query was empty (an empty filter)
            # and so we return everything in the catalog
            warnings.warn('Your query %s produced no query restriction. '
                          'Currently the entire catalog content is returned. '
                          'In Zope 4 this will result in an empty LazyCat '
                          'being returned.' % repr(cr.make_key(query)),
                          DeprecationWarning, stacklevel=3)

            rlen = len(self)
            if sort_index is None:
                sequence, slen = self._limit_sequence(self.data.items(), rlen,
                    b_start, b_size)
                result = LazyMap(self.instantiate, sequence, slen,
                    actual_result_count=rlen)
            else:
                cr.start_split(sort_report_name)
                result = self.sortResults(
                    self.data, sort_index, reverse, limit, merge,
                        actual_result_count=rlen, b_start=b_start,
                        b_size=b_size)
                cr.stop_split(sort_report_name, None)
        elif rs:
            # We got some results from the indexes.
            # Sort and convert to sequences.
            # XXX: The check for 'values' is really stupid since we call
            # items() and *not* values()
            rlen = len(rs)
            if sort_index is None and hasattr(rs, 'items'):
                # Having an 'items' method means we have a data structure
                # with scores. Build a new result set, sort it by score,
                # reverse it, compute the normalized score, and lazify it.

                if not merge:
                    # Don't bother to sort here, return a list of
                    # three tuples to be passed later to mergeResults
                    # note that data_record_normalized_score_ cannot be
                    # calculated and will always be 1 in this case
                    getitem = self.__getitem__
                    result = [(score, (1, score, rid), getitem)
                            for rid, score in rs.items()]
                else:
                    cr.start_split('sort_on#score')

                    # sort it by score
                    rs = rs.byValue(0)
                    max = float(rs[0][0])

                    # Here we define our getter function inline so that
                    # we can conveniently store the max value as a default arg
                    # and make the normalized score computation lazy
                    def getScoredResult(item, max=max, self=self):
                        """
                        Returns instances of self._v_brains, or whatever is
                        passed into self.useBrains.
                        """
                        score, key = item
                        data = self.data[key]
                        klass = self._v_result_class
                        schema_len = len(klass.__record_schema__)
                        norm_score = int(100.0 * score / max)
                        if schema_len == len(data) + 3:
                            r = klass(tuple(data) + (key, score, norm_score))
                        else:
                            r = klass(data)
                            r.data_record_id_ = key
                            r.data_record_score_ = score
                            r.data_record_normalized_score_ = norm_score
                        r = r.__of__(aq_parent(self))
                        return r

                    sequence, slen = self._limit_sequence(rs, rlen, b_start,
                        b_size)
                    result = LazyMap(getScoredResult, sequence, slen,
                        actual_result_count=rlen)
                    cr.stop_split('sort_on#score', None)

            elif sort_index is None and not hasattr(rs, 'values'):
                # no scores
                if hasattr(rs, 'keys'):
                    rs = rs.keys()
                sequence, slen = self._limit_sequence(rs, rlen, b_start,
                    b_size)
                result = LazyMap(self.__getitem__, sequence, slen,
                    actual_result_count=rlen)
            else:
                # sort.  If there are scores, then this block is not
                # reached, therefore 'sort-on' does not happen in the
                # context of a text index query.  This should probably
                # sort by relevance first, then the 'sort-on' attribute.
                cr.start_split(sort_report_name)
                result = self.sortResults(rs, sort_index, reverse, limit,
                    merge, actual_result_count=rlen, b_start=b_start,
                    b_size=b_size)
                cr.stop_split(sort_report_name, None)
        else:
            # Empty result set
            result = LazyCat([])
        cr.stop()
        return result
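
A small sketch of the sort-limit deduction used in search() above: batching
arguments imply how many sorted results are actually needed, and an explicit
limit is kept only when no batch size is given (values are illustrative).

def deduce_limit(b_start=0, b_size=None, limit=None):
    # mirrors the b_start/b_size handling near the top of search()
    if b_size is not None:
        return b_start + b_size
    return limit

assert deduce_limit(b_start=40, b_size=20) == 60  # batch wins
assert deduce_limit(limit=100) == 100             # explicit limit kept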

    def sortResults(self, rs, sort_index, reverse=False, limit=None,
            merge=True, actual_result_count=None, b_start=0, b_size=None):
        # Sort a result set using one or more sort indexes. Both sort_index
        # and reverse can be lists of indexes and reverse specifications.
        # Return a lazy result set in sorted order if merge is true otherwise
        # returns a list of (sortkey, uid, getter_function) tuples, where
        # sortkey can be a tuple on its own.
        second_indexes = None
        second_indexes_key_map = None
        sort_index_length = 1
        if isinstance(sort_index, list):
            sort_index_length = len(sort_index)
            if sort_index_length > 1:
                second_indexes = sort_index[1:]
                second_indexes_key_map = []
                for si in second_indexes:
                    second_indexes_key_map.append(si.documentToKeyMap())
            sort_index = sort_index[0]
        _self__getitem__ = self.__getitem__
        index_key_map = sort_index.documentToKeyMap()
        result = []
        r_append = result.append
        r_insert = result.insert
        if hasattr(rs, 'keys'):
            rs = rs.keys()
        if actual_result_count is None:
            rlen = len(rs)
            actual_result_count = rlen
        else:
            rlen = actual_result_count

        # don't limit to more than what we have
        if limit is not None and limit >= rlen:
            limit = rlen

        # if we want a batch from the end of the result set, reverse sorting
        # order and limit it, then reverse the result set again
        switched_reverse = False
        if b_size and b_start and b_start > rlen / 2:
            if isinstance(reverse, list):
                reverse = [not r for r in reverse]
            else:
                reverse = not reverse
            switched_reverse = True
            b_end = b_start + b_size
            if b_end >= rlen:
                overrun = rlen - b_end
                if b_start >= rlen:
                    # bail out, we are outside the possible range
                    return LazyCat([], 0, actual_result_count)
                else:
                    b_size += overrun
                b_start = 0
            else:
                b_start = rlen - b_end
            limit = b_start + b_size

        # determine sort_spec
        if isinstance(reverse, list):
            sort_spec = [r and -1 or 1 for r in reverse]
            # limit to current maximum of sort indexes
            sort_spec = sort_spec[:sort_index_length]
            # use first sort order for choosing the algorithm
            first_reverse = reverse[0]
        else:
            sort_spec = []
            for i in xrange(sort_index_length):
                sort_spec.append(reverse and -1 or 1)
            first_reverse = reverse

        if merge and limit is None and (
           rlen > (len(sort_index) * (rlen / 100 + 1))):
            # The result set is much larger than the sorted index,
            # so iterate over the sorted index for speed.
            # TODO: len(sort_index) isn't actually what we want for a keyword
            # index, as it's only the unique values, not the documents.
            # Don't use this case when a limit is given, as we return results
            # as non-flattened intsets and would have to merge/unflatten those
            # before limiting.
            length = 0
            try:
                intersection(rs, IISet(()))
            except TypeError:
                # rs is not an object in the IIBTree family.
                # Try to turn rs into an IISet.
                rs = IISet(rs)

            if sort_index_length == 1:
                for k, intset in sort_index.items():
                    # We have an index that has a set of values for
                    # each sort key, so we intersect with each set and
                    # get a sorted sequence of the intersections.
                    intset = intersection(rs, intset)
                    if intset:
                        keys = getattr(intset, 'keys', None)
                        if keys is not None:
                            # Is this ever true?
                            intset = keys()
                        length += len(intset)
                        r_append((k, intset, _self__getitem__))
                result.sort(reverse=reverse)
            else:
                for k, intset in sort_index.items():
                    # We have an index that has a set of values for
                    # each sort key, so we intersect with each set and
                    # get a sorted sequence of the intersections.
                    intset = intersection(rs, intset)
                    if intset:
                        keys = getattr(intset, 'keys', None)
                        if keys is not None:
                            # Is this ever true?
                            intset = keys()
                        length += len(intset)
                        # sort on secondary index
                        keysets = defaultdict(list)
                        for i in intset:
                            full_key = (k, )
                            for km in second_indexes_key_map:
                                try:
                                    full_key += (km[i], )
                                except KeyError:
                                    pass
                            keysets[full_key].append(i)
                        for k2, v2 in keysets.items():
                            r_append((k2, v2, _self__getitem__))
                result = multisort(result, sort_spec)
            sequence, slen = self._limit_sequence(result, length, b_start,
                b_size, switched_reverse)
            result = LazyCat(LazyValues(sequence), slen, actual_result_count)
        elif limit is None or (limit * 4 > rlen):
            # Iterate over the result set getting sort keys from the index.
            # If we are interested in 25% or more of the result set,
            # the N-Best algorithm is slower, so we iterate over all of it.
            if sort_index_length == 1:
                for did in rs:
                    try:
                        key = index_key_map[did]
                    except KeyError:
                        # This document is not in the sort key index, skip it.
                        actual_result_count -= 1
                    else:
                        # The reference back to __getitem__ is used in case
                        # we do not merge now and need to intermingle the
                        # results with those of other catalogs while avoiding
                        # the cost of instantiating a LazyMap per result
                        r_append((key, did, _self__getitem__))
                if merge:
                    result.sort(reverse=reverse)
            else:
                for did in rs:
                    try:
                        full_key = (index_key_map[did], )
                        for km in second_indexes_key_map:
                            full_key += (km[did], )
                    except KeyError:
                        # This document is not in the sort key index, skip it.
                        actual_result_count -= 1
                    else:
                        r_append((full_key, did, _self__getitem__))
                if merge:
                    result = multisort(result, sort_spec)
            if merge:
                if limit is not None:
                    result = result[:limit]
                sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
                    switched_reverse)
                result = LazyValues(sequence)
                result.actual_result_count = actual_result_count
            else:
                sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
                    switched_reverse)
                return sequence
        elif first_reverse:
            # Limit / sort results using the N-Best algorithm. This is
            # faster for large sets than a full sort and uses far less
            # memory (a standalone sketch follows this method).
            keys = []
            k_insert = keys.insert
            n = 0
            worst = None
            if sort_index_length == 1:
                for did in rs:
                    try:
                        key = index_key_map[did]
                    except KeyError:
                        # This document is not in the sort key index, skip it.
                        actual_result_count -= 1
                    else:
                        if n >= limit and key <= worst:
                            continue
                        i = bisect(keys, key)
                        k_insert(i, key)
                        r_insert(i, (key, did, _self__getitem__))
                        if n == limit:
                            del keys[0], result[0]
                        else:
                            n += 1
                        worst = keys[0]
                result.reverse()
            else:
                for did in rs:
                    try:
                        key = index_key_map[did]
                        full_key = (key, )
                        for km in second_indexes_key_map:
                            full_key += (km[did], )
                    except KeyError:
                        # This document is not in the sort key index, skip it.
                        actual_result_count -= 1
                    else:
                        if n >= limit and key <= worst:
                            continue
                        i = bisect(keys, key)
                        k_insert(i, key)
                        r_insert(i, (full_key, did, _self__getitem__))
                        if n == limit:
                            del keys[0], result[0]
                        else:
                            n += 1
                        worst = keys[0]
                result = multisort(result, sort_spec)
            sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
                switched_reverse)
            if merge:
                result = LazyValues(sequence)
                result.actual_result_count = actual_result_count
            else:
                return sequence
        elif not first_reverse:
            # Limit / sort results using N-Best algorithm in reverse (N-Worst?)
            keys = []
            k_insert = keys.insert
            n = 0
            best = None
            if sort_index_length == 1:
                for did in rs:
                    try:
                        key = index_key_map[did]
                    except KeyError:
                        # This document is not in the sort key index, skip it.
                        actual_result_count -= 1
                    else:
                        if n >= limit and key >= best:
                            continue
                        i = bisect(keys, key)
                        k_insert(i, key)
                        r_insert(i, (key, did, _self__getitem__))
                        if n == limit:
                            del keys[-1], result[-1]
                        else:
                            n += 1
                        best = keys[-1]
            else:
                for did in rs:
                    try:
                        key = index_key_map[did]
                        full_key = (key, )
                        for km in second_indexes_key_map:
                            full_key += (km[did], )
                    except KeyError:
                        # This document is not in the sort key index, skip it.
                        actual_result_count -= 1
                    else:
                        if n >= limit and key >= best:
                            continue
                        i = bisect(keys, key)
                        k_insert(i, key)
                        r_insert(i, (full_key, did, _self__getitem__))
                        if n == limit:
                            del keys[-1], result[-1]
                        else:
                            n += 1
                        best = keys[-1]
                result = multisort(result, sort_spec)
            sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
                switched_reverse)
            if merge:
                result = LazyValues(sequence)
                result.actual_result_count = actual_result_count
            else:
                return sequence

        return LazyMap(self.__getitem__, result, len(result),
            actual_result_count=actual_result_count)
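
The standalone N-Best sketch referenced above (pure Python, no catalog
required): keep only the limit largest keys while scanning, maintaining a
small sorted buffer with bisect instead of sorting the full result set.

from bisect import bisect

def n_best(items, limit):
    # items: iterable of (sort_key, document_id) pairs
    keys, best = [], []
    for key, did in items:
        if len(keys) >= limit and key <= keys[0]:
            continue  # cannot beat the current worst retained entry
        i = bisect(keys, key)
        keys.insert(i, key)
        best.insert(i, (key, did))
        if len(keys) > limit:
            del keys[0], best[0]  # drop the worst entry
    best.reverse()  # largest key first, as with reverse sorting
    return best

assert n_best([(3, 'a'), (9, 'b'), (1, 'c'), (7, 'd')], 2) == [(9, 'b'), (7, 'd')]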

    def _get_sort_attr(self, attr, kw):
        """Helper function to find sort-on or sort-order."""
        # There are three different ways to find the attribute:
        # 1. kw[sort-attr]
        # 2. self.sort-attr
        # 3. kw[sort_attr]
        # kw may be a dict or an ExtensionClass MultiMapping, which
        # differ in what get() returns with no default value.
        name = "sort-%s" % attr
        val = kw.get(name, None)
        if val is not None:
            return val
        val = getattr(self, name, None)
        if val is not None:
            return val
        return kw.get("sort_%s" % attr, None)

    def _getSortIndex(self, args):
        """Returns a list of search index objects or None."""
        sort_index_names = self._get_sort_attr("on", args)
        if sort_index_names is not None:
            # self.indexes is always a dict, so get() w/ 1 arg works
            sort_indexes = []
            if not isinstance(sort_index_names, (list, tuple)):
                sort_index_names = [sort_index_names]
            for name in sort_index_names:
                sort_index = self.indexes.get(name)
                if sort_index is None:
                    raise CatalogError('Unknown sort_on index: %s' %
                                       repr(name))
                else:
                    if not hasattr(sort_index, 'documentToKeyMap'):
                        raise CatalogError('The index chosen for sort_on is '
                            'not capable of being used as a sort index: '
                            '%s' % repr(name))
                sort_indexes.append(sort_index)
            if len(sort_indexes) == 1:
                # be nice and keep the old API intact for single sort_on's
                return sort_indexes[0]
            return sort_indexes
        return None

    def searchResults(self, REQUEST=None, used=None, _merge=True, **kw):
        # You should pass in a simple dictionary as the request argument,
        # which only contains the relevant query.
        # The used argument is deprecated and is ignored
        if REQUEST is None and not kw:
            # Try to acquire request if we get no args for bw compat
            warnings.warn('Calling searchResults without a query argument or '
                          'keyword arguments is deprecated. In Zope 4 the '
                          'query will no longer be automatically taken from '
                          'the acquired request.',
                          DeprecationWarning, stacklevel=3)
            REQUEST = getattr(self, 'REQUEST', None)
        if isinstance(REQUEST, dict) and not kw:
            # short cut for the best practice
            args = REQUEST
        else:
            args = CatalogSearchArgumentsMap(REQUEST, kw)
        sort_indexes = self._getSortIndex(args)
        sort_limit = self._get_sort_attr('limit', args)
        reverse = False
        if sort_indexes is not None:
            order = self._get_sort_attr("order", args)
            reverse = []
            if order is None:
                order = ['']
            elif isinstance(order, str):
                order = [order]
            for o in order:
                reverse.append(o.lower() in ('reverse', 'descending'))
            if len(reverse) == 1:
                # be nice and keep the old API intact for single sort_order
                reverse = reverse[0]
        # Perform searches with indexes and sort_index
        return self.search(args, sort_indexes, reverse, sort_limit, _merge)

    __call__ = searchResults
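
A hypothetical call showing how the sort arguments are picked up by the
helpers above (assumes a catalog instance with 'portal_type' and 'modified'
indexes; both names are made up for illustration).

def ten_newest_documents(catalog):
    return catalog.searchResults(
        {'portal_type': 'Document'},
        sort_on='modified',
        sort_order='descending',
        sort_limit=10,
    )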

    def getCatalogPlan(self, query=None):
        """Query time reporting and planning.
        """
        parent = aq_base(aq_parent(self))
        threshold = getattr(parent, 'long_query_time', 0.1)
        return CatalogPlan(self, query, threshold)
Example #58
0
class IntegerRangesIndex(SimpleItem):
    """ Index a set of integer ranges:
        [(1,2), (12,23), (12, 22)]
    """

    implements(IPluggableIndex)
    meta_type = 'IntegerRangesIndex'

    def __init__(self, id, caller=None, extra=None):
        self.id = id
        self.caller = caller
        self.clear()
        self.__genid = 0

    def __len__(self):
        return self._length()

    def getId(self):
        """Return Id of index."""
        return self.id

    def clear(self):
        """Empty the index"""
        
        IOBTree = BTrees.family64.IO.BTree

        self._index = IOBTree() # {rangeid: [document_id, ...]}
        self._unindex = IOBTree() # {document_id: [rangeid, ...]}
        self._range_mapping = IOBTree() # {rangeid: range}
        self._reverse_range_mapping = OIBTree() # {range: rangeid}
        self._since_index = IOBTree() # {since: [rangeid,...]}
        self._until_index = IOBTree() # {until: [rangeid,...]}
        self._length = BTrees.Length.Length()
        self._unique_values_length = BTrees.Length.Length()
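
A short aside on the family64 alias used above, assuming the BTrees package
is importable: the 64-bit family provides integer-keyed trees whose keys may
exceed the 32-bit range of the default IOBTree.

import BTrees

IOBTree64 = BTrees.family64.IO.BTree
tree = IOBTree64()
tree[2 ** 40] = 'a key beyond 32 bits'
assert tree[2 ** 40] == 'a key beyond 32 bits'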

    def __get_range_id(self, range_):
        return self._reverse_range_mapping.get(range_, None)

    def __get_range(self, range_id):
        return self._range_mapping.get(range_id, None)

    def __index_range(self, range_):
        """ index range if needed and return the rangeid
        """
        range_id = self.__get_range_id(range_)
        if range_id is None:
            range_id = self.genid()
            # index range
            self._unique_values_length.change(1)
            self._range_mapping[range_id] = range_
            self._reverse_range_mapping[range_] = range_id
            # index range boundaries
            since, until = range_
            self.__insert_in_index_set(self._since_index, since, range_id)
            self.__insert_in_index_set(self._until_index, until, range_id)
        return range_id

    def __unindex_range(self, range_id):
        range_ = self.__get_range(range_id)
        if range_ is None:
            return None
        since, until = range_
        self.__remove_in_index_set(self._since_index, since, range_id)
        self.__remove_in_index_set(self._until_index, until, range_id)
        self._unique_values_length.change(-1)
        del self._range_mapping[range_id]
        del self._reverse_range_mapping[range_]
        return range_

    def genid(self):
        self.__genid += 1
        return self.__genid

    def getEntryForObject(self, document_id, default=_marker):
        """Get all information contained for 'document_id'."""
        if default is _marker:
            return self._unindex.get(document_id)
        else:
            return self._unindex.get(document_id, default)

    def getIndexSourceNames(self):
        """Get a sequence of attribute names that are indexed by the index.
        """
        return [self.id]

    def index_object(self, document_id, obj, threshold=None):
        """Index an object.

        'document_id' is the integer ID of the document.
        'obj' is the object to be indexed.
        'threshold' is the number of words to process between committing
        subtransactions.  If None, subtransactions are disabled.
        """
        new_ranges = self._get_object_data(obj, self.id)
        if new_ranges:
            new_set = IISet(map(self.__index_range, new_ranges))
        else:
            new_set = IISet()

        old_set = self._unindex.get(document_id, IISet())

        new_entries = difference(new_set, old_set)
        expired_entries = difference(old_set, new_set)

        if not (new_entries or expired_entries):
            # nothing to do, bail out!
            return 0
        for expired_entry in expired_entries:
            self.__remove_in_index_set(self._unindex, document_id,
                expired_entry)
            if self.__remove_in_index_set(self._index, expired_entry, \
                    document_id):
                # range is not used anymore, retire it
                self.__unindex_range(expired_entry)

        for new_entry in new_entries:
            if self.__insert_in_index_set(self._unindex, document_id,
                    new_entry):
                self._length.change(1)
            self.__insert_in_index_set(self._index, new_entry, document_id)

        return 1
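
A pure-Python sketch of the diff-based reindex above: only entries that
actually changed are touched, so reindexing an unchanged document is a no-op
(ordinary sets stand in for IISet and difference()).

old_set = {1, 2, 3}
new_set = {2, 3, 4}
new_entries = new_set - old_set      # difference(new_set, old_set)
expired_entries = old_set - new_set  # difference(old_set, new_set)
assert new_entries == {4}
assert expired_entries == {1}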

    def unindex_object(self, document_id):
        """Remove the document_id from the index."""
        entries = self._unindex.get(document_id, _marker)
        if entries is _marker:
            return
        if isinstance(entries, int):
            entries = [entries]
        for expired_entry in entries:
            if self.__remove_in_index_set(self._index, expired_entry, \
                    document_id):
                # range is not used anymore, retire it
                self.__unindex_range(expired_entry)
        self._length.change(-1)
        del self._unindex[document_id]

    def __insert_in_index_set(self, index, key, value, set_type=IISet):
        """ Insert value in the index. If the key was not present and
        the index row was created it returns True
        """
        index_row = index.get(key, _marker)
        if index_row is _marker:
            index[key] = value
            return True
        if isinstance(index_row, set_type):
            index_row.insert(value)
            return False
        # it was an int
        index[key] = set_type((index_row, value,))
        return False

    def __remove_in_index_set(self, index, key, value, set_type=IISet):
        """ remove the value in the index, index row is a Set
        It returns true if the index row as been removed (The set was empty)
        """
        index_row = index.get(key, _marker)
        if index_row is _marker:
            return True
        if isinstance(index_row, set_type):
            index_row.remove(value)
            if len(index_row) == 0:
                del index[key]
                return True
            if len(index_row) == 1:
                index[key] = index_row[0]
            return False
        del index[key]
        return True
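
A pure-Python sketch of the single-value row optimization shared by the two
helpers above: a row starts life as a bare int and is only promoted to a set
when a second value arrives (an ordinary Python set stands in for IISet).

def insert_in_index_set(index, key, value):
    row = index.get(key)
    if row is None:
        index[key] = value           # first value: store the int directly
        return True                  # a new row was created
    if isinstance(row, set):
        row.add(value)
    else:
        index[key] = {row, value}    # second value: promote int to a set
    return False

index = {}
assert insert_in_index_set(index, 42, 1) is True
assert insert_in_index_set(index, 42, 2) is False
assert index[42] == {1, 2}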

    def _apply_index(self, request):
        record = parseIndexRequest(request, self.id)
        try:
            qstart, qend = record.keys
        except TypeError:
            return None

        minint = BTrees.family64.minint
        maxint = BTrees.family64.maxint

        qstart = min(maxint, max(minint, qstart))
        qend = max(minint, min(maxint, qend))

        # query start falls inside a stored range
        start = multiunion(self._since_index.values(max=qstart))
        end = multiunion(self._until_index.values(min=qstart))
        start_into = intersection(start, end)

        # query end falls inside a stored range
        start = multiunion(self._since_index.values(max=qend))
        end = multiunion(self._until_index.values(min=qend))
        end_into = intersection(start, end)

        # stored range lies entirely inside the query range
        start = multiunion(self._since_index.values(min=qstart))
        end = multiunion(self._until_index.values(max=qend))
        start_before_end_after = intersection(start, end)

        result = union(start_into, end_into)
        result = union(result, start_before_end_after)

        return multiunion(map(self._index.__getitem__, result)), (self.id,)
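
A pure-Python sketch of the overlap test computed above with three BTree
range searches: a stored range (since, until) matches the query
(qstart, qend) when the query start or end falls inside it, or when the
stored range lies entirely inside the query.

def overlaps(since, until, qstart, qend):
    start_inside = since <= qstart <= until        # query start inside range
    end_inside = since <= qend <= until            # query end inside range
    contained = since >= qstart and until <= qend  # range inside query
    return start_inside or end_inside or contained

assert overlaps(5, 10, 8, 20) is True    # query starts inside (5, 10)
assert overlaps(5, 10, 1, 3) is False    # disjoint ranges do not match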

    def numObjects(self):
        """Return the number of indexed objects"""
        return self._length()

    def indexSize(self):
        """Return the size of the index in terms of distinct values"""
        return self._unique_values_length()

    def _get_object_data(self, obj, attr):
        # self.id is the name of the index, which is also the name of the
        # attribute we're interested in.  If the attribute is callable,
        # we call it and use the return value.
        try:
            datum = getattr(obj, attr)
            if safe_callable(datum):
                datum = datum()
        except AttributeError:
            datum = _marker
        return datum