def clear(self):
    self.data = IOBTree()           # {rid -> record as tuple}
    self.indexes = {}               # {index_name -> OOBTree({value -> IITreeSet})}
    self.primary_index = OIBTree()  # {primary key -> rid}
    for position, column in self.col_info:
        if column.indexed:
            self.indexes[column.name] = OOBTree()
def clear(self): """Empty the lexicon. """ self.length = Length() self._wid_length_based = False self._wids = OIBTree() # word -> wid self._words = IOBTree() # wid -> word
def clear(self):
    self._length = Length()
    self._index = OIBTree()
    self._unindex = IOBTree()
    if self._counter is None:
        self._counter = Length()
    else:
        self._increment_counter()
def clear(self):
    self._nextid = BTrees.Length.Length()
    self._forward_idx = OIBTree()
    self._inverse_idx = IOBTree()
    if self.truncate_left:
        self._lforward_idx = OIBTree()
    else:
        self._lforward_idx = None
def clear(self): """ clear catalog """ self.data = IOBTree() # mapping of rid to meta_data self.uids = OIBTree() # mapping of uid to rid self.paths = IOBTree() # mapping of rid to uid self._length = BTrees.Length.Length() for index in self.indexes.keys(): self.getIndex(index).clear()
def setupAnnotations(context):
    """
    set up the annotations if they haven't been set up
    already. The rest of the functions in here assume that
    this has already been set up
    """
    annotations = IAnnotations(context)

    if yays not in annotations:
        annotations[yays] = OIBTree()

    if nays not in annotations:
        annotations[nays] = OIBTree()

    return annotations
def convert_to_uuidindex(catalog, index):
    if isinstance(index, UUIDIndex):
        return
    logger.info('Converting index `%s` to UUIDIndex.' % index.getId())
    index.__class__ = UUIDIndex
    index._p_changed = True
    catalog._catalog._p_changed = True
    # convert from OOBTree to OIBTree
    old_index = index._index
    if not isinstance(old_index, OIBTree):
        index._index = _index = OIBTree()
        for k, v in old_index.items():
            if k is None:
                continue
            if isinstance(v, int):
                _index[k] = v
            else:
                if isinstance(v, (IISet, IITreeSet)):
                    # inconsistent data, one uid with multiple docids
                    paths = dict((tuple(catalog.getpath(k).split('/')), k)
                                 for k in v.keys())
                    shortest = min(paths, key=len)
                    for path, key in paths.iteritems():
                        if path[:len(shortest)] != shortest:
                            raise ValueError(
                                'Inconsistent UID index, UID %s is associated '
                                'with multiple docids: %r' % (k, paths))
                    # All other docids are sub-paths of another
                    # indicating the UID was just acquired,
                    # choose the shortest
                    _index[k] = paths[shortest]
        del old_index
        transaction.savepoint(optimistic=True)
    logger.info('Finished conversion.')
def __init__(self):
    super(ContentTypeScopeManager, self).__init__()
    self._mappings = IOBTree()

    # Methods permitted to access this mapping with. Originally
    # I wanted to provide alternative sets of mapping on a per
    # mapping_id basis, however this proved to be complex and
    # complicated due to extra relationships involved.
    self._methods = IOBTree()

    # For metadata related to the above.
    self._mappings_metadata = IOBTree()

    # To ease the usage of scopes, the mappings are referenced by
    # names and are called profiles which add a few useful fields to
    # allow slightly easier usage. This separates the name from the
    # already active tokens such that once a token is instantiated
    # with a scope, the mapping is stuck until the token is revoked.
    self._named_mappings = OIBTree()  # name to id.

    # To not overburden the named mappings with work-in-progress
    # profiles, instantiate one here also.
    self._edit_mappings = OOBTree()

    self.default_mapping_id = self.addMapping({})
def storage(self):
    """ get the counter storage """
    annotation = get_portal_annotation()
    if annotation.get(NUMBER_STORAGE) is None:
        annotation[NUMBER_STORAGE] = OIBTree()
    return annotation[NUMBER_STORAGE]
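# A minimal usage sketch for the counter storage above (the key and the
# next_number helper are hypothetical, and the bare OIBTree stands in for
# the annotation value): OIBTree values must be ints, so a read-increment-
# write cycle is all a per-key sequence counter needs.
from BTrees.OIBTree import OIBTree

storage = OIBTree()

def next_number(storage, key):
    # get/increment/store; each key advances independently
    number = storage.get(key, 0) + 1
    storage[key] = number
    return number

print(next_number(storage, 'invoice'))  # 1
print(next_number(storage, 'invoice'))  # 2
print(next_number(storage, 'offer'))    # 1, independent counter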
def resetOrdering(ordering):
    annotations = IAnnotations(ordering.context)
    order = PersistentList()
    annotations.__setitem__(ordering.ORDER_KEY, order)
    pos = OIBTree()
    annotations.__setitem__(ordering.POS_KEY, pos)
    return order, pos
def _getCacheId(self):
    """Return a cache id for preferences.

    We use:

    - user_id: because preferences are always different by user

    - self._preference_cache[user_id], which is increased every time a
      user preference is modified

    - self._preference_cache[None], which is increased every time a
      global preference is modified
    """
    user_id = getSecurityManager().getUser().getId()
    try:
        self._preference_cache
    except AttributeError:
        self._preference_cache = OIBTree()
    return self._preference_cache.get(None), self._preference_cache.get(
        user_id), user_id
def _clear_and_rebuild(self, ids=[]):
    """ """
    self._positionId = IOBTree()
    self._idPosition = OIBTree()
    for id in ids:
        self.addObject(id)
def _cleanup(self):
    """Cleans up errors in the BTrees.

    Certain ZODB bugs have caused BTrees to become slightly insane.
    Fortunately, there is a way to clean up damaged BTrees that
    always seems to work: make a new BTree containing the items()
    of the old one.

    Returns 1 if no damage was detected, or 0 if damage was
    detected and fixed.
    """
    from BTrees.check import check
    path = '/'.join(self.getPhysicalPath())
    try:
        check(self._tree)
        for key in self._tree.keys():
            if key not in self._tree:
                raise AssertionError(
                    "Missing value for key: %s" % repr(key))
        check(self._mt_index)
        keys = set(self._tree.keys())
        for key, value in self._mt_index.items():
            if (key not in self._mt_index
                    or self._mt_index[key] is not value):
                raise AssertionError(
                    "Missing or incorrect meta_type index: %s" % repr(key))
            check(value)
            for k in value.keys():
                if k not in value or k not in keys:
                    raise AssertionError(
                        "Missing values for meta_type index: %s" % repr(key))
        return 1
    except AssertionError:
        LOG.warn('Detected damage to %s. Fixing now.' % path,
                 exc_info=sys.exc_info())
        try:
            self._tree = OOBTree(self._tree)
            keys = set(self._tree.keys())
            mt_index = OOBTree()
            for key, value in self._mt_index.items():
                for name in tuple(value.keys()):
                    if name not in keys:
                        del value[name]
                mt_index[key] = OIBTree(value)
            self._mt_index = mt_index
            new = len(keys)
            if self._count() != new:
                self._count.set(new)
        except:
            LOG.error('Failed to fix %s.' % path,
                      exc_info=sys.exc_info())
            raise
        else:
            LOG.info('Fixed %s.' % path)
        return 0
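# The repair trick _cleanup relies on, shown in isolation: rebuilding a
# BTree from its items() yields a structurally sound copy. A sketch with
# a throwaway tree (no actual damage here, just the idiom):
from BTrees.OOBTree import OOBTree
from BTrees.OIBTree import OIBTree
from BTrees.check import check

mt_index = OOBTree()
mt_index['Folder'] = OIBTree({'a': 1, 'b': 1})

rebuilt = OOBTree()
for key, value in mt_index.items():
    rebuilt[key] = OIBTree(value)  # copy-construct each inner tree too
check(rebuilt)                     # raises AssertionError if insane
mt_index = rebuilt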
def initializeAnnotations(obj, event):
    """Ensure that we don't delegate certain annotations by setting
    them from the beginning.
    """
    annotations = IAnnotations(obj)
    annotations.setdefault(DefaultOrdering.ORDER_KEY, PersistentList())
    annotations.setdefault(DefaultOrdering.POS_KEY, OIBTree())
    annotations.setdefault(CONTENTRULES_KEY, None)
    annotations.setdefault(CONTEXT_ASSIGNMENT_KEY, OOBTree())
class DateIndex(UnIndex):
    """ Index for Dates """

    __implements__ = (PluggableIndex.PluggableIndexInterface,)

    meta_type = 'DateIndex'
    query_options = ['query', 'range']

    manage = manage_main = DTMLFile('dtml/manageDateIndex', globals())
    manage_main._setName('manage_main')
    manage_options = (
        {'label': 'Settings', 'action': 'manage_main'},
    )

    def clear(self):
        """ Complete reset """
        self._index = IOBTree()
        self._unindex = OIBTree()

    def index_object(self, documentId, obj, threshold=None):
        """index an object, normalizing the indexed value to an integer

        o Normalized value has granularity of one minute.

        o Objects which have 'None' as indexed value are *omitted*,
          by design.
        """
        returnStatus = 0

        try:
            date_attr = getattr(obj, self.id)
            if callable(date_attr):
                date_attr = date_attr()
            ConvertedDate = self._convert(value=date_attr, default=_marker)
        except AttributeError:
            ConvertedDate = _marker

        oldConvertedDate = self._unindex.get(documentId, _marker)

        if ConvertedDate != oldConvertedDate:
            if oldConvertedDate is not _marker:
                self.removeForwardIndexEntry(oldConvertedDate, documentId)
            if ConvertedDate is not _marker:
                self.insertForwardIndexEntry(ConvertedDate, documentId)
            self._unindex[documentId] = ConvertedDate
            returnStatus = 1

        return returnStatus

    def _apply_index(self, request, cid='', type=type):
        ...
def noteLock(self, obj, user_id):
    mapping = getattr(self, '_locks', None)
    if mapping is None:
        mapping = self._locks = OOBTree()
    path = '/'.join(obj.getPhysicalPath())
    items = mapping.get(user_id, None)
    if items is None:
        items = OIBTree()
        mapping[user_id] = items
    items[path] = 1
def load_model():
    data = DataFS()
    keys, label = data.gen_train_data()
    btree = OIBTree()
    rubbish_k = np.load('6k.npy')
    rubbish_v = np.load('6v.npy')
    w = np.load('w.npy', allow_pickle=True)
    b = np.load('b.npy', allow_pickle=True)
    for i in range(len(rubbish_k)):
        btree[str(rubbish_k[i])] = int(rubbish_v[i])
    return keys, label, w, b, btree
def __init__(self, id=None):
    super(LinkCheckTool, self).__init__(id)

    # This is the work queue; items in this queue are scheduled
    # for link validity check.
    self.queue = CompositeQueue()

    # Additional queue for internal crawler to revalidate the site
    self.crawl_queue = CompositeQueue()

    # This is the link database. It maps a hyperlink index to a
    # tuple (timestamp, status, referers).
    self.checked = IOBTree()

    # Indexes
    self.index = OIBTree()
    self.links = IOBTree()

    # This is a counter that allows us to add new hyperlinks and
    # provide an index quickly.
    self.counter = 0
def __init__(self, id="++conversation++default"): self.id = id # username -> count of comments; key is removed when count reaches 0 self._commentators = OIBTree() # id -> comment - find comment by id self._comments = LOBTree() # id -> LLSet (children) - find all children for a given comment. # 0 signifies root. self._children = LOBTree()
def setupAnnotations(context):
    """
    set up the annotations if they haven't been set up
    already. The rest of the functions in here assume that
    this has already been set up
    """
    annotations = IAnnotations(context)
    changed = False

    if yays not in annotations:
        annotations[yays] = OIBTree()
        changed = True

    if nays not in annotations:
        annotations[nays] = OIBTree()
        changed = True

    if changed:
        request = getRequest()
        alsoProvides(request, IDisableCSRFProtection)

    return annotations
def _convertBTrees(self, threshold=200):
    if (type(self._lexicon) is OIBTree and
            type(getattr(self, '_inverseLex', None)) is IOBTree):
        return

    from BTrees.convert import convert

    lexicon = self._lexicon
    self._lexicon = OIBTree()
    self._lexicon._p_jar = self._p_jar
    convert(lexicon, self._lexicon, threshold)

    try:
        inverseLex = self._inverseLex
        self._inverseLex = IOBTree()
    except AttributeError:
        # older lexicons didn't have an inverse lexicon
        self._inverseLex = IOBTree()
        inverseLex = self._inverseLex

    self._inverseLex._p_jar = self._p_jar
    convert(inverseLex, self._inverseLex, threshold)
def clear(self): """ clear catalog """ self.data = IOBTree() # mapping of rid to meta_data self.uids = OIBTree() # mapping of uid to rid self.paths = IOBTree() # mapping of rid to uid # convert old-style Catalog object to new in-place try: self.__len__.set(0) except AttributeError: self.__len__=BTrees.Length.Length() for index in self.indexes.keys(): self.getIndex(index).clear()
def clear(self): """Empty the index""" IOBTree = BTrees.family64.IO.BTree self._index = IOBTree() # {rangeid: [document_id, ...]} self._unindex = IOBTree() # {document_id: [rangeid, ...]} self._range_mapping = IOBTree() # {rangeid: range} self._reverse_range_mapping = OIBTree() # {range: rangeid} self._since_index = IOBTree() # {since: [rangeid,...]} self._until_index = IOBTree() # {until: [rangeid,...]} self._length = BTrees.Length.Length() self._unique_values_length = BTrees.Length.Length()
def __init__(self, *pipeline):
    self._wids = OIBTree()   # word -> wid
    self._words = IOBTree()  # wid -> word
    # wid 0 is reserved for words that aren't in the lexicon (OOV -- out
    # of vocabulary). This can happen, e.g., if a query contains a word
    # we never saw before, and that isn't a known stopword (or otherwise
    # filtered out). Returning a special wid value for OOV words is a
    # way to let clients know when an OOV word appears.
    self._nextwid = 1
    self._pipeline = pipeline

    # Keep some statistics about indexing
    self._nbytes = 0  # Number of bytes indexed (at start of pipeline)
    self._nwords = 0  # Number of words indexed (after pipeline)
def importNode(self, node, mode=PURGE):
    """Import the object from the DOM node.
    """
    pipeline = []
    for child in node.childNodes:
        if child.nodeName == 'element':
            element = element_factory.instantiate(
                child.getAttribute('group'),
                child.getAttribute('name'))
            pipeline.append(element)
    self.context._pipeline = tuple(pipeline)

    # clear lexicon
    self.context._wids = OIBTree()
    self.context._words = IOBTree()
    self.context.length = Length()
class MessageService(Persistent, Location):
    interface.implements(IMessageService)

    def __init__(self, storage):
        self.__parent__ = storage
        self.index = OIBTree()
        self.unread = Length(0)

    def __len__(self):
        return len(self.index)

    def __iter__(self):
        return iter(self.index.values())

    def __contains__(self, key):
        msg = self.__parent__.getMessage(key)
        if msg is not None:
            return True
        else:
            return False

    def get(self, msgId, default=None):
        msg = self.__parent__.getMessage(msgId)
        if msg is not None:
            if msg.__date__ in self.index:
                return msg
        return default

    def append(self, message):
        message.__parent__ = self
        if self.__parent__.readStatus(message):
            self.unread.change(1)
        self.index[message.__date__] = message.__id__

    def remove(self, message):
        id = message.__date__
        if id in self.index:
            del self.index[id]
            if self.__parent__.readStatus(message) and self.unread() > 0:
                self.unread.change(-1)

    def create(self, **data):
        raise NotImplementedError('create')
def _setOb(self, id, object):
    """Store the named object in the folder.
    """
    tree = self._tree
    if tree.has_key(id):
        raise KeyError('There is already an item named "%s".' % id)
    tree[id] = object
    self._count.change(1)

    # Update the meta type index.
    mti = self._mt_index
    meta_type = getattr(object, 'meta_type', None)
    if meta_type is not None:
        ids = mti.get(meta_type, None)
        if ids is None:
            ids = OIBTree()
            mti[meta_type] = ids
        ids[id] = 1
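# The inner OIBTree here serves as a persistent set: ids are keys and the
# value is a dummy 1. A standalone sketch of the same bookkeeping, with
# a hypothetical helper name:
from BTrees.OOBTree import OOBTree
from BTrees.OIBTree import OIBTree

mt_index = OOBTree()  # meta_type -> OIBTree of ids

def note_object(mt_index, meta_type, obj_id):
    ids = mt_index.get(meta_type, None)
    if ids is None:
        ids = mt_index[meta_type] = OIBTree()
    ids[obj_id] = 1   # membership is what matters, not the value

note_object(mt_index, 'Folder', 'news')
note_object(mt_index, 'Folder', 'events')
print(list(mt_index['Folder'].keys()))  # ['events', 'news'] (kept sorted)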
def create_token(self, userid, timeout=None, data=None):
    payload = {}
    payload['sub'] = userid
    if timeout is None:
        timeout = self.token_timeout
    if timeout:
        payload['exp'] = datetime.utcnow() + timedelta(seconds=timeout)
    if data is not None:
        payload.update(data)
    token = jwt.encode(payload, self._signing_secret(), algorithm='HS256')
    if self.store_tokens:
        if self._tokens is None:
            self._tokens = OOBTree()
        if userid not in self._tokens:
            self._tokens[userid] = OIBTree()
        self._tokens[userid][token] = int(time.time())
    return token
def __init__(self, id=None):
    super(LinkCheckTool, self).__init__(id)

    # This is the work queue; items in this queue are scheduled
    # for link validity check.
    self.queue = CompositeQueue()

    # This is the link database. It maps a hyperlink index to a
    # tuple (timestamp, status, referers).
    self.checked = IOBTree()

    # Indexes
    self.index = OIBTree()
    self.links = IOBTree()

    # This is a counter that allows us to add new hyperlinks and
    # provide an index quickly.
    self.counter = 0
class Ballots(object):
    """ Simple object to help counting votes. It's not addable anywhere.
        Should be treated as an internal object for polls.
    """

    def __init__(self):
        """ Ballots attr is an OIBTree, since they can have any object as key.
        """
        self.ballots = OIBTree()

    def result(self):
        """ Return a tuple with sorted ballot items. """
        return tuple(sorted(self.ballots.iteritems()))

    def add(self, value):
        """ Add a dict of results - a ballot - to the pool. Append and
            increase counter.
        """
        if value in self.ballots:
            self.ballots[value] += 1
        else:
            self.ballots[value] = 1
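# A usage sketch for Ballots (under Python 2, matching the iteritems()
# call above): any consistently orderable value can serve as a ballot
# key, and result() returns (ballot, count) pairs sorted by ballot value.
ballots = Ballots()
ballots.add(('alice',))
ballots.add(('alice',))
ballots.add(('bob',))
print(ballots.result())  # ((('alice',), 2), (('bob',), 1))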
def testCleanup(self):
    self.assert_(self.f._cleanup())
    key = TrojanKey('a')
    self.f._tree[key] = 'b'
    self.assert_(self.f._cleanup())

    key.value = 'z'
    # With a key in the wrong place, there should now be damage.
    self.assert_(not self.f._cleanup())
    # Now it's fixed.
    self.assert_(self.f._cleanup())

    from BTrees.OIBTree import OIBTree
    tree = self.f._mt_index['d'] = OIBTree()
    tree['e'] = 1
    self.assert_(not self.f._cleanup())

    # Verify the management interface also works,
    # but don't test return values.
    self.f.manage_cleanup()
    key.value = 'a'
    self.f.manage_cleanup()
def fixupPloneLexicon(context):
    """Updates the plone_lexicon pipeline with the new splitter
    and case normalizer.
    """
    catalog = getToolByName(context, 'portal_catalog', None)
    if catalog is not None:
        if 'plone_lexicon' in catalog.objectIds():
            lexicon = catalog.plone_lexicon
            pipeline = list(lexicon._pipeline)
            if len(pipeline) >= 2:
                if (not isinstance(pipeline[0], Splitter) or
                        not isinstance(pipeline[1], CaseNormalizer)):
                    pipeline[0] = Splitter()
                    pipeline[1] = CaseNormalizer()
                    lexicon._pipeline = tuple(pipeline)
                    # Clear the lexicon
                    from BTrees.OIBTree import OIBTree
                    from BTrees.IOBTree import IOBTree
                    from BTrees.Length import Length
                    lexicon._wids = OIBTree()
                    lexicon._words = IOBTree()
                    lexicon.length = Length()
                    logger.info('Updated plone_lexicon pipeline.')
def _convertBTrees(self, threshold=200):
    from BTrees.convert import convert

    if type(self.data) is not IOBTree:
        data = self.data
        self.data = IOBTree()
        convert(data, self.data, threshold)

        self.__len__ = BTrees.Length.Length(len(data))

        uids = self.uids
        self.uids = OIBTree()
        convert(uids, self.uids, threshold)

        paths = self.paths
        self.paths = IOBTree()
        convert(paths, self.paths, threshold)

    for index in self.indexes.values():
        if hasattr(index, '__of__'):
            index = index.__of__(self)
        index._convertBTrees(threshold)
def create_token(self, userid, timeout=None, data=None): payload = {} payload["sub"] = userid if timeout is None: timeout = self.token_timeout if timeout: payload["exp"] = datetime.utcnow() + timedelta(seconds=timeout) if data is not None: payload.update(data) algorithm = "HS256" if self.haveRSAKeys(): algorithm = "RS256" token = jwt.encode(payload, self._signing_secret(), algorithm=algorithm) if not six.PY2: token = token.decode("utf-8") if self.store_tokens: if self._tokens is None: self._tokens = OOBTree() if userid not in self._tokens: self._tokens[userid] = OIBTree() self._tokens[userid][token] = int(time.time()) return token
def initialize_storage(self):
    ann = IAnnotations(self.context)
    if self.ANNOTATIONS_KEY not in ann:
        ann[self.ANNOTATIONS_KEY] = OOBTree()
    self._storage = ann[self.ANNOTATIONS_KEY]

    # Actual list of actions
    if self.STORAGE_ACTIONS_KEY not in self._storage:
        self._storage[self.STORAGE_ACTIONS_KEY] = IOBTree()
    self._actions = self._storage[self.STORAGE_ACTIONS_KEY]

    # Indexes needed for fast lookups
    if self.STORAGE_INDEXES_KEY not in self._storage:
        self._storage[self.STORAGE_INDEXES_KEY] = OOBTree()
    self._indexes = self._storage[self.STORAGE_INDEXES_KEY]

    # Index: unique_name -> action_id
    if self.IDX_UNIQUE_NAME not in self._indexes:
        self._indexes[self.IDX_UNIQUE_NAME] = OIBTree()

    # Counter for the next 'action_id'
    if self.STORAGE_NEXT_ID_KEY not in self._storage:
        self._storage[self.STORAGE_NEXT_ID_KEY] = 0
class UUIDIndex(UnIndex):
    """Index for uuid fields with an unique value per key.

    The internal structure is:

    self._index = {datum: documentId}
    self._unindex = {documentId: datum}

    For each datum only one documentId can exist.
    """

    meta_type = "UUIDIndex"

    manage_options = (
        {'label': 'Settings', 'action': 'manage_main'},
        {'label': 'Browse', 'action': 'manage_browse'},
    )

    query_options = ["query", "range"]

    manage = manage_main = DTMLFile('dtml/manageUUIDIndex', globals())
    manage_main._setName('manage_main')
    manage_browse = DTMLFile('../dtml/browseIndex', globals())

    def clear(self):
        self._length = Length()
        self._index = OIBTree()
        self._unindex = IOBTree()
        self._counter = Length()

    def numObjects(self):
        """Return the number of indexed objects. Since we have a 1:1
        mapping from documents to values, we can reuse the stored length.
        """
        return self.indexSize()

    def uniqueValues(self, name=None, withLengths=0):
        """returns the unique values for name

        if withLengths is true, returns a sequence of
        tuples of (value, length)
        """
        if name is None:
            name = self.id
        elif name != self.id:
            # PEP 479: a bare return ends the generator
            return
        if not withLengths:
            for key in self._index.keys():
                yield key
        else:
            # We know the length for each value is one
            for key in self._index.keys():
                yield (key, 1)

    def insertForwardIndexEntry(self, entry, documentId):
        """Take the entry provided and put it in the correct place
        in the forward index.
        """
        if entry is None:
            return

        old_docid = self._index.get(entry, _marker)
        if old_docid is _marker:
            self._index[entry] = documentId
            self._length.change(1)
        elif old_docid != documentId:
            logger.error("A different document with value '%s' already "
                         "exists in the index." % entry)

    def removeForwardIndexEntry(self, entry, documentId):
        """Take the entry provided and remove any reference to documentId
        in its entry in the index.
        """
        old_docid = self._index.get(entry, _marker)
        if old_docid is not _marker:
            del self._index[entry]
            self._length.change(-1)

    def _get_object_datum(self, obj, attr):
        # for a uuid it never makes sense to acquire a parent value via
        # Acquisition
        has_attr = getattr(aq_base(obj), attr, _marker)
        if has_attr is _marker:
            return _marker
        return super(UUIDIndex, self)._get_object_datum(obj, attr)
class Indexer(object):

    filestorage = database = connection = root = None

    def __init__(self, datafs, writable=0, trans=0, pack=0):
        self.trans_limit = trans
        self.pack_limit = pack
        self.trans_count = 0
        self.pack_count = 0
        self.stopdict = get_stopdict()
        self.mh = mhlib.MH()
        self.filestorage = FileStorage(datafs, read_only=(not writable))
        self.database = DB(self.filestorage)
        self.connection = self.database.open()
        self.root = self.connection.root()
        try:
            self.index = self.root["index"]
        except KeyError:
            self.index = self.root["index"] = TextIndexWrapper()
        try:
            self.docpaths = self.root["docpaths"]
        except KeyError:
            self.docpaths = self.root["docpaths"] = IOBTree()
        try:
            self.doctimes = self.root["doctimes"]
        except KeyError:
            self.doctimes = self.root["doctimes"] = IIBTree()
        try:
            self.watchfolders = self.root["watchfolders"]
        except KeyError:
            self.watchfolders = self.root["watchfolders"] = {}
        self.path2docid = OIBTree()
        for docid in self.docpaths.keys():
            path = self.docpaths[docid]
            self.path2docid[path] = docid
        try:
            self.maxdocid = max(self.docpaths.keys())
        except ValueError:
            self.maxdocid = 0
        print(len(self.docpaths), "Document ids")
        print(len(self.path2docid), "Pathnames")
        print(self.index.lexicon.length(), "Words")

    def dumpfreqs(self):
        lexicon = self.index.lexicon
        index = self.index.index
        assert isinstance(index, OkapiIndex)
        L = []
        for wid in lexicon.wids():
            freq = 0
            for f in index._wordinfo.get(wid, {}).values():
                freq += f
            L.append((freq, wid, lexicon.get_word(wid)))
        L.sort()
        L.reverse()
        for freq, wid, word in L:
            print("%10d %10d %s" % (wid, freq, word))

    def dumpwids(self):
        lexicon = self.index.lexicon
        index = self.index.index
        assert isinstance(index, OkapiIndex)
        for wid in lexicon.wids():
            freq = 0
            for f in index._wordinfo.get(wid, {}).values():
                freq += f
            print("%10d %10d %s" % (wid, freq, lexicon.get_word(wid)))

    def dumpwords(self):
        lexicon = self.index.lexicon
        index = self.index.index
        assert isinstance(index, OkapiIndex)
        for word in lexicon.words():
            wid = lexicon.get_wid(word)
            freq = 0
            for f in index._wordinfo.get(wid, {}).values():
                freq += f
            print("%10d %10d %s" % (wid, freq, word))

    def close(self):
        self.root = None
        if self.connection is not None:
            self.connection.close()
            self.connection = None
        if self.database is not None:
            self.database.close()
            self.database = None
        if self.filestorage is not None:
            self.filestorage.close()
            self.filestorage = None

    def interact(self, nbest=NBEST, maxlines=MAXLINES):
        try:
            import readline
        except ImportError:
            pass
        text = ""
        top = 0
        results = []
        while 1:
            try:
                line = raw_input("Query: ")
            except EOFError:
                print("\nBye.")
                break
            line = line.strip()
            if line.startswith("/"):
                self.specialcommand(line, results, top - nbest)
                continue
            if line:
                text = line
                top = 0
            else:
                if not text:
                    continue
            try:
                results, n = self.timequery(text, top + nbest)
            except KeyboardInterrupt:
                raise
            except:
                reportexc()
                text = ""
                continue
            if len(results) <= top:
                if not n:
                    print("No hits for %r." % text)
                else:
                    print("No more hits for %r." % text)
                text = ""
                continue
            print("[Results %d-%d from %d" % (top+1, min(n, top+nbest), n),
                  end=" ")
            print("for query %s]" % repr(text))
            self.formatresults(text, results, maxlines, top, top+nbest)
            top += nbest

    def specialcommand(self, line, results, first):
        assert line.startswith("/")
        line = line[1:]
        if not line:
            n = first
        else:
            try:
                n = int(line) - 1
            except:
                print("Huh?")
                return
        if n < 0 or n >= len(results):
            print("Out of range")
            return
        docid, score = results[n]
        path = self.docpaths[docid]
        i = path.rfind("/")
        assert i > 0
        folder = path[:i]
        n = path[i+1:]
        cmd = "show +%s %s" % (folder, n)
        if os.getenv("DISPLAY"):
            os.system("xterm -e sh -c '%s | less' &" % cmd)
        else:
            os.system(cmd)

    def query(self, text, nbest=NBEST, maxlines=MAXLINES):
        results, n = self.timequery(text, nbest)
        if not n:
            print("No hits for %r." % text)
            return
        print("[Results 1-%d from %d]" % (len(results), n))
        self.formatresults(text, results, maxlines)

    def timequery(self, text, nbest):
        t0 = time.time()
        c0 = time.clock()
        results, n = self.index.query(text, 0, nbest)
        t1 = time.time()
        c1 = time.clock()
        print("[Query time: %.3f real, %.3f user]" % (t1-t0, c1-c0))
        return results, n

    def formatresults(self, text, results, maxlines=MAXLINES,
                      lo=0, hi=sys.maxint):
        stop = self.stopdict.has_key
        words = [w for w in re.findall(r"\w+\*?", text.lower())
                 if not stop(w)]
        pattern = r"\b(" + "|".join(words) + r")\b"
        pattern = pattern.replace("*", ".*")  # glob -> re syntax
        prog = re.compile(pattern, re.IGNORECASE)
        print('='*70)
        rank = lo
        for docid, score in results[lo:hi]:
            rank += 1
            path = self.docpaths[docid]
            score *= 100.0
            print("Rank: %d Score: %d%% File: %s" % (rank, score, path))
            path = os.path.join(self.mh.getpath(), path)
            try:
                fp = open(path)
            except (IOError, OSError) as msg:
                print("Can't open:", msg)
                continue
            msg = mhlib.Message("<folder>", 0, fp)
            for header in "From", "To", "Cc", "Bcc", "Subject", "Date":
                h = msg.getheader(header)
                if h:
                    print("%-8s %s" % (header+":", h))
            text = self.getmessagetext(msg)
            if text:
                print()
                nleft = maxlines
                for part in text:
                    for line in part.splitlines():
                        if prog.search(line):
                            print(line)
                            nleft -= 1
                            if nleft <= 0:
                                break
                    if nleft <= 0:
                        break
            print('-'*70)

    def update(self, args):
        folder = None
        seqs = []
        for arg in args:
            if arg.startswith("+"):
                if folder is None:
                    folder = arg[1:]
                else:
                    print("only one folder at a time")
                    return
            else:
                seqs.append(arg)
        if not folder:
            folder = self.mh.getcontext()
        if not seqs:
            seqs = ['all']
        try:
            f = self.mh.openfolder(folder)
        except mhlib.Error as msg:
            print(msg)
            return
        dict = {}
        for seq in seqs:
            try:
                nums = f.parsesequence(seq)
            except mhlib.Error as msg:
                print(msg or "unparsable message sequence: %s" % repr(seq))
                return
            for n in nums:
                dict[n] = n
        msgs = dict.keys()
        msgs.sort()
        self.updatefolder(f, msgs)
        self.commit()

    def optimize(self, args):
        uniqwords = {}
        for folder in args:
            if folder.startswith("+"):
                folder = folder[1:]
            print("\nOPTIMIZE FOLDER", folder)
            try:
                f = self.mh.openfolder(folder)
            except mhlib.Error as msg:
                print(msg)
                continue
            self.prescan(f, f.listmessages(), uniqwords)
        L = [(uniqwords[word], word) for word in uniqwords.keys()]
        L.sort()
        L.reverse()
        for i in range(100):
            print("%3d. %6d %s" % ((i+1,) + L[i]))
        self.index.lexicon.sourceToWordIds([word for (count, word) in L])

    def prescan(self, f, msgs, uniqwords):
        pipeline = [Splitter(), CaseNormalizer(), StopWordRemover()]
        for n in msgs:
            print("prescanning", n)
            m = f.openmessage(n)
            text = self.getmessagetext(m, f.name)
            for p in pipeline:
                text = p.process(text)
            for word in text:
                uniqwords[word] = uniqwords.get(word, 0) + 1

    def bulkupdate(self, args):
        if not args:
            print("No folders specified; use ALL to bulk-index all folders")
            return
        if "ALL" in args:
            i = args.index("ALL")
            args[i:i+1] = self.mh.listfolders()
        for folder in args:
            if folder.startswith("+"):
                folder = folder[1:]
            print("\nFOLDER", folder)
            try:
                f = self.mh.openfolder(folder)
            except mhlib.Error as msg:
                print(msg)
                continue
            self.updatefolder(f, f.listmessages())
            print("Total", len(self.docpaths))
        self.commit()
        print("Indexed", self.index.lexicon._nbytes, "bytes and",)
        print(self.index.lexicon._nwords, "words;",)
        print(len(self.index.lexicon._words), "unique words.")

    def updatefolder(self, f, msgs):
        self.watchfolders[f.name] = self.getmtime(f.name)
        for n in msgs:
            path = "%s/%s" % (f.name, n)
            docid = self.path2docid.get(path, 0)
            if docid and self.getmtime(path) == self.doctimes.get(docid, 0):
                print("unchanged", docid, path)
                continue
            docid = self.newdocid(path)
            try:
                m = f.openmessage(n)
            except IOError:
                print("disappeared", docid, path)
                self.unindexpath(path)
                continue
            text = self.getmessagetext(m, f.name)
            if not text:
                self.unindexpath(path)
                continue
            print("indexing", docid, path)
            self.index.index_doc(docid, text)
            self.maycommit()
        # Remove messages from the folder that no longer exist
        for path in list(self.path2docid.keys(f.name)):
            if not path.startswith(f.name + "/"):
                break
            if self.getmtime(path) == 0:
                self.unindexpath(path)
        print("done.")

    def unindexpath(self, path):
        if self.path2docid.has_key(path):
            docid = self.path2docid[path]
            print("unindexing", docid, path)
            del self.docpaths[docid]
            del self.doctimes[docid]
            del self.path2docid[path]
            try:
                self.index.unindex_doc(docid)
            except KeyError as msg:
                print("KeyError", msg)
            self.maycommit()

    def getmessagetext(self, m, name=None):
        L = []
        if name:
            L.append("_folder " + name)  # To restrict search to a folder
        self.getheaders(m, L)
        try:
            self.getmsgparts(m, L, 0)
        except KeyboardInterrupt:
            raise
        except:
            print("(getmsgparts failed:)")
            reportexc()
        return L

    def getmsgparts(self, m, L, level):
        ctype = m.gettype()
        if level or ctype != "text/plain":
            print(". "*level + str(ctype))
        if ctype == "text/plain":
            L.append(m.getbodytext())
        elif ctype in ("multipart/alternative", "multipart/mixed"):
            for part in m.getbodyparts():
                self.getmsgparts(part, L, level+1)
        elif ctype == "message/rfc822":
            f = StringIO(m.getbodytext())
            m = mhlib.Message("<folder>", 0, f)
            self.getheaders(m, L)
            self.getmsgparts(m, L, level+1)

    def getheaders(self, m, L):
        H = []
        for key in "from", "to", "cc", "bcc", "subject":
            value = m.get(key)
            if value:
                H.append(value)
        if H:
            L.append("\n".join(H))

    def newdocid(self, path):
        docid = self.path2docid.get(path)
        if docid is not None:
            self.doctimes[docid] = self.getmtime(path)
            return docid
        docid = self.maxdocid + 1
        self.maxdocid = docid
        self.docpaths[docid] = path
        self.doctimes[docid] = self.getmtime(path)
        self.path2docid[path] = docid
        return docid

    def getmtime(self, path):
        path = os.path.join(self.mh.getpath(), path)
        try:
            st = os.stat(path)
        except os.error as msg:
            return 0
        return int(st[ST_MTIME])

    def maycommit(self):
        self.trans_count += 1
        if self.trans_count >= self.trans_limit > 0:
            self.commit()

    def commit(self):
        if self.trans_count > 0:
            print("committing...")
            transaction.commit()
            self.trans_count = 0
            self.pack_count += 1
            if self.pack_count >= self.pack_limit > 0:
                self.pack()

    def pack(self):
        if self.pack_count > 0:
            print("packing...")
            self.database.pack()
            self.pack_count = 0
class Path(String):

    root = None      # root as passed to Catalog()
    path2rid = None  # OIBTree mapping path to rid (one:one)
    rid2path = None  # IOBTree mapping rid to path (one:one)
    parts = None     # OOBTree mapping (level, part) to rids (one:many)
    levels = None    # IOBTree mapping level to a list of rids (one:many)
    case_sensitive = None
    sorted = None    # OOBTree for sorting; inherited from Path

    def __init__(self, root, case_sensitive=None):

        # Root
        # ====
        if not isinstance(root, basestring):
            raise TypeError("root is not a string: '%s'" % root)
        elif not isdir(root):
            raise ValueError("root doesn't point to a directory: '%s'" % root)
        self.root = root.rstrip(os.sep)

        # Case Sensitivity
        # ================
        if case_sensitive is None:
            if 'win' in sys.platform:
                case_sensitive = False
            else:
                case_sensitive = True
        if case_sensitive not in (False, True, 0, 1):
            raise TypeError(
                "case_sensitive isn't a boolean: '%s'" % case_sensitive)
        self.case_sensitive = bool(case_sensitive)

        self.reset()

    # Index contract
    # ==============

    __name__ = 'Path'  # used in command-line interface

    def reset(self):
        """Forget everything; usually called from __init__.
        """
        String.reset(self)
        self.path2rid = OIBTree()  # {path: rid}
        self.rid2path = IOBTree()  # {rid: path}
        self.parts = OOBTree()     # {(level, part): rids}
        self.rids = IOBTree()      # {rid: (level, part)s}
        self.levels = IOBTree()    # {level: rids}

    def learn(self, rid, value):
        """Given an rid and a value, associate them.
        """
        String.learn(self, rid, value)

        # Parse and validate.
        # ===================
        # Value is an absolute path, rooted in self.root.

        if not isinstance(value, basestring):
            raise TypeError("string expected")
        elif value and not value.startswith(os.sep):
            raise ValueError("path not specified absolutely: '%s'" % value)

        if self.case_sensitive:
            path = value
        else:
            path = value.lower()
        path = path.rstrip(os.sep)  # safety net; should never need this
        parts = value.split(os.sep)
        #parts = value.split(os.sep)[1:]

        # Add to simple identity indices.
        # ===============================

        self.path2rid[path] = rid
        self.rid2path[rid] = path

        # Add to complex level/part indices.
        # ==================================

        for level in range(len(parts)):
            token_ = (level, parts[level])

            # Add to (one:many) mapping of (level,part) to [rids].
            # ====================================================

            if token_ not in self.parts:
                self.parts[token_] = IITreeSet([rid])
            else:
                self.parts[token_].insert(rid)

            # Add to the (one:many) mapping of rid to (level,part)s.
            # ======================================================
            # This exists so we know how to forget about this rid when
            # the time comes.

            if rid not in self.rids:
                self.rids[rid] = OOSet([token_])
            else:
                self.rids[rid].insert(token_)

        # Add to (one:many) mapping of levels to rids.
        # ============================================
        # This is used to implement level limits.

        if level not in self.levels:
            self.levels[level] = IITreeSet([rid])
        else:
            self.levels[level].insert(rid)

    def forget(self, rid):
        """Given an rid, remove it from all indices.
        """
        String.forget(self, rid)

        # Remove from the (one:many) mapping of (level, part) to rids.
        # ============================================================
        # We also track the level here and remove the rid from the
        # (one:many) mapping of levels to rids.

        level = -1
        for token_ in self.rids[rid]:
            if token_[0] > level:
                level = token_[0]
            self.parts[token_].remove(rid)
            if len(self.parts[token_]) == 0:
                del self.parts[token_]
        self.levels[level].remove(rid)
        if len(self.levels[level]) == 0:
            del self.levels[level]

        # Remove from the (one:many) mapping of rid to tokens.
        # ====================================================

        del self.rids[rid]

        # Remove from simple identity indices.
        # ====================================

        path = self.rid2path[rid]
        del self.path2rid[path]
        del self.rid2path[rid]

    # Searches
    # ========

    def above(self, arg):
        """Find all resources at or above path, within the limits given.

        Here we actually call below() on <path> and all of its ancestors,
        passing the limits straight through, with the exception that
        limits default to 0:1 rather than None:None. Use '0:' for the
        latter.
        """
        # Parse and validate.
        # ===================
        path, upper, lower = self._path_and_limits(arg)
        rid = self.path2rid.get(path, None)
        if rid is None:
            return

        # Build
        # =====
        tmpl = "%s "
        if (upper, lower) == (None, None):
            tmpl += '0:1'  # default: breadcrumbs
        else:
            if upper is not None:
                tmpl += str(upper)
            tmpl += ":"
            if lower is not None:
                tmpl += str(lower)

        parts = path.split(os.sep)
        rids = []
        for level in range(len(parts)):
            ancestor = os.sep.join(parts[:level+1])
            ancestor = ancestor and ancestor or '/'
            rids.append(self.below(tmpl % ancestor))
        rids = multiunion(rids)
        return rids

    def below(self, arg):
        """Find all resources at or below path, within the limits given.
        """
        # Parse and validate.
        # ===================
        path, upper, lower = self._path_and_limits(arg)
        rid = self.path2rid.get(path, None)
        if rid is None:
            return

        # Build
        # =====
        parts = path.split(os.sep)
        rids = None
        for level in range(len(parts)):
            rids = intersection(rids, self.parts[(level, parts[level])])
        if rids is None:
            return IISet()  # short-cut

        # Limits
        # ======
        # Remove rids that are above any upper limit, and then only
        # include rids that are above any lower limit. Limits are
        # relative to the level of the requested path.

        if upper is not None:
            upper += level
            for i in range(level, upper):
                if i not in self.levels:
                    break
                rids = difference(rids, self.levels[i])

        if lower is not None:
            lower += level
            _rids = []
            for i in range(level, lower):
                if i not in self.levels:
                    break
                _rids.append(self.levels[i])
            rids = intersection(rids, multiunion(_rids))

        return rids

    def is_(self, arg):
        """Return the rid corresponding to a single path.

        Root is special-cased.
        """
        path, foo, bar = self._path_and_limits(arg)
        return self.path2rid.get(arg, None)

    # Parser
    # ======

    def _path_and_limits(self, arg):
        """Given an argument from a Collection constraint, return three
        params.

        Arg is of the form:

            /some/path 0:4

        The first token is the path, the second is a limits specification.
        The path must not contain a space (@@: really should support that).
        The limits spec is optional; if given, it must have a colon and at
        least one end specified. To the left of the colon is the upper
        bound; to the right is the lower bound. These bounds specify the
        tree levels that the path filter should apply to, but the
        specifics of how it applies depend on the searches above.

        (Yes this nomenclature is all wacky. The root is conceptually
        'higher' for some reason, even though the root is 0 and a real
        tree's roots are lower than its branches. Go figure.)
        """
        path = ''
        upper = None
        lower = None

        parts = arg.split()
        nparts = len(parts)
        assert nparts in (1, 2), "either need path or path and limits"

        # Path
        # ====
        if nparts == 1:
            path = parts[0]
        elif nparts == 2:
            path = parts[0]

            # Limits
            # ======
            limits = parts[1]
            if not limits.count(':') == 1:
                raise ValueError("malformed limits (no colon): '%s'" % limits)
            upper, lower = limits.split(':')
            #if not (upper + lower):
            #    raise ValueError("no limits given: '%s'" % limits)

            if not upper:
                upper = None
            else:
                if not upper.isdigit():
                    raise ValueError("bad upper limit: '%s'" % upper)
                upper = int(upper)

            if not lower:
                lower = None
            else:
                if not lower.isdigit():
                    raise ValueError("bad lower limit: '%s'" % lower)
                lower = int(lower)

        if None not in (upper, lower):
            if upper > lower:
                raise ValueError(
                    "upper limit greater than lower: %d > %d"
                    % (upper, lower))

        if path == os.sep:
            path = ''
        if not self.case_sensitive:
            path = path.lower()

        return path, upper, lower
class Repository(Implicit, Persistent):
    """The repository implementation manages the actual data of versions
       and version histories. It does not handle user interface issues."""

    def __init__(self):
        # These keep track of symbolic label and branch names that
        # have been used to ensure that they don't collide.
        self._branches = OIBTree()
        self._branches['mainline'] = 1
        self._labels = OIBTree()

        self._histories = OOBTree()
        self._created = time.time()

    security = ClassSecurityInfo()

    security.declarePrivate('createVersionHistory')
    def createVersionHistory(self, object):
        """Internal: create a new version history for a resource."""
        # When one creates the first version in a version history, neither
        # the version or version history yet have a _p_jar, which causes
        # copy operations to fail. To work around that, we share our _p_jar.
        history_id = None
        while history_id is None or self._histories.has_key(history_id):
            history_id = str(randint(1, 9999999999))
        history = ZopeVersionHistory(history_id, object)
        self._histories[history_id] = history
        return history.__of__(self)

    security.declarePrivate('getVersionHistory')
    def getVersionHistory(self, history_id):
        """Internal: return a version history given a version history id."""
        return self._histories[history_id].__of__(self)

    security.declarePrivate('replaceState')
    def replaceState(self, obj, new_state):
        """Internal: replace the state of a persistent object.
        """
        non_versioned = getNonVersionedData(obj)
        # XXX There ought to be some way to do this more cleanly.
        # This fills the __dict__ of the old object with new state.
        # The other way to achieve the desired effect is to replace
        # the object in its container, but this method preserves the
        # identity of the object.
        if obj.__class__ is not new_state.__class__:
            raise VersionControlError(
                "The class of the versioned object has changed. %s != %s"
                % (repr(obj.__class__), repr(new_state.__class__)))
        obj._p_changed = 1
        for key in obj.__dict__.keys():
            if not new_state.__dict__.has_key(key):
                del obj.__dict__[key]
        for key, value in new_state.__dict__.items():
            obj.__dict__[key] = value
        if non_versioned:
            # Restore the non-versioned data into the new state.
            restoreNonVersionedData(obj, non_versioned)
        return obj

    #####################################################################
    # This is the implementation of the public version control interface.
    #####################################################################

    security.declarePublic('isAVersionableResource')
    def isAVersionableResource(self, obj):
        # For now, an object must be persistent (have its own db record)
        # in order to be considered a versionable resource.
        return isAVersionableResource(obj)

    security.declarePublic('isUnderVersionControl')
    def isUnderVersionControl(self, object):
        return hasattr(object, '__vc_info__')

    security.declarePublic('isResourceUpToDate')
    def isResourceUpToDate(self, object, require_branch=0):
        info = self.getVersionInfo(object)
        history = self.getVersionHistory(info.history_id)
        branch = 'mainline'
        if info.sticky:
            if info.sticky[0] == 'B':
                branch = info.sticky[1]
            elif require_branch:
                # The object is updated to a particular version
                # rather than a branch. The caller requires a branch.
                return 0
        return history.isLatestVersion(info.version_id, branch)

    security.declarePublic('isResourceChanged')
    def isResourceChanged(self, object):
        # Return true if the state of a resource has changed in a
        # transaction *after* the version bookkeeping was saved. Note
        # that this method is not appropriate for detecting changes
        # within a transaction!
        info = self.getVersionInfo(object)
        itime = getattr(info, '_p_mtime', None)
        if itime is None:
            return 0
        mtime = Utility._findModificationTime(object)
        if mtime is None:
            return 0
        return mtime > itime

    security.declarePublic('getVersionInfo')
    def getVersionInfo(self, object):
        info = getattr(object, '__vc_info__', None)
        if info is not None:
            return info
        raise VersionControlError(
            'The specified resource is not under version control.'
            )

    security.declareProtected(use_vc_permission, 'applyVersionControl')
    def applyVersionControl(self, object, message=None):
        if self.isUnderVersionControl(object):
            raise VersionControlError(
                'The resource is already under version control.'
                )
        if not self.isAVersionableResource(object):
            raise VersionControlError(
                'This resource cannot be put under version control.'
                )

        # Need to check the parent to see if the container of the object
        # being put under version control is itself a version-controlled
        # object. If so, we need to use the branch id of the container.
        branch = 'mainline'
        parent = aq_parent(aq_inner(object))
        p_info = getattr(parent, '__vc_info__', None)
        if p_info is not None:
            sticky = p_info.sticky
            if sticky and sticky[0] == 'B':
                branch = sticky[1]

        # Create a new version history and initial version object.
        history = self.createVersionHistory(object)
        version = history.createVersion(object, branch)

        history_id = history.getId()
        version_id = version.getId()

        # Add bookkeeping information to the version controlled object.
        info = VersionInfo(history_id, version_id, VersionInfo.CHECKED_IN)
        if branch != 'mainline':
            info.sticky = ('B', branch)
        object.__vc_info__ = info

        # Save an audit record of the action being performed.
        history.addLogEntry(version_id,
                            LogEntry.ACTION_CHECKIN,
                            _findPath(object),
                            message is None and 'Initial checkin.' or message
                            )
        return object

    security.declareProtected(use_vc_permission, 'checkoutResource')
    def checkoutResource(self, object):
        info = self.getVersionInfo(object)
        if info.status != info.CHECKED_IN:
            raise VersionControlError(
                'The selected resource is already checked out.'
                )

        if info.sticky and info.sticky[0] != 'B':
            raise VersionControlError(
                'The selected resource has been updated to a particular '
                'version, label or date. The resource must be updated to '
                'the mainline or a branch before it may be checked out.'
                )

        if not self.isResourceUpToDate(object):
            raise VersionControlError(
                'The selected resource is not up to date!'
                )

        history = self.getVersionHistory(info.history_id)
        ob_path = _findPath(object)

        # Save an audit record of the action being performed.
        history.addLogEntry(info.version_id,
                            LogEntry.ACTION_CHECKOUT,
                            ob_path
                            )

        # Update bookkeeping information.
        newinfo = info.clone()
        newinfo.status = newinfo.CHECKED_OUT
        object.__vc_info__ = newinfo
        return object

    security.declareProtected(use_vc_permission, 'checkinResource')
    def checkinResource(self, object, message=''):
        info = self.getVersionInfo(object)
        if info.status != info.CHECKED_OUT:
            raise VersionControlError(
                'The selected resource is not checked out.'
                )

        if info.sticky and info.sticky[0] != 'B':
            raise VersionControlError(
                'The selected resource has been updated to a particular '
                'version, label or date. The resource must be updated to '
                'the mainline or a branch before it may be checked in.'
                )

        if not self.isResourceUpToDate(object):
            raise VersionControlError(
                'The selected resource is not up to date!'
                )

        history = self.getVersionHistory(info.history_id)
        ob_path = _findPath(object)

        branch = 'mainline'
        if info.sticky is not None and info.sticky[0] == 'B':
            branch = info.sticky[1]

        version = history.createVersion(object, branch)

        # Save an audit record of the action being performed.
        history.addLogEntry(version.getId(),
                            LogEntry.ACTION_CHECKIN,
                            ob_path,
                            message
                            )

        # Update bookkeeping information.
        newinfo = info.clone()
        newinfo.version_id = version.getId()
        newinfo.status = newinfo.CHECKED_IN
        object.__vc_info__ = newinfo
        return object

    security.declareProtected(use_vc_permission, 'uncheckoutResource')
    def uncheckoutResource(self, object):
        info = self.getVersionInfo(object)
        if info.status != info.CHECKED_OUT:
            raise VersionControlError(
                'The selected resource is not checked out.'
                )

        history = self.getVersionHistory(info.history_id)
        ob_path = _findPath(object)

        version = history.getVersionById(info.version_id)
        new_obj = version.copyState()

        # Save an audit record of the action being performed.
        history.addLogEntry(info.version_id,
                            LogEntry.ACTION_UNCHECKOUT,
                            ob_path
                            )

        # Replace the state of the object with a reverted state.
        new_obj = self.replaceState(object, new_obj)

        # Update bookkeeping information.
        newinfo = info.clone()
        newinfo.version_id = version.getId()
        newinfo.status = newinfo.CHECKED_IN
        new_obj.__vc_info__ = newinfo
        return new_obj

    security.declareProtected(use_vc_permission, 'updateResource')
    def updateResource(self, object, selector=None):
        info = self.getVersionInfo(object)
        if info.status != info.CHECKED_IN:
            raise VersionControlError(
                'The selected resource must be checked in to be updated.'
                )

        history = self.getVersionHistory(info.history_id)
        version = None
        sticky = info.sticky

        if not selector:
            # If selector is null, update to the latest version taking any
            # sticky attrs into account (branch, date). Note that the
            # sticky tag could also be a date or version id. We don't
            # bother checking for those, since in both cases we do nothing
            # (because we'll always be up to date until the sticky tag
            # changes).
            if sticky and sticky[0] == 'L':
                # A label sticky tag, so update to that label (since it is
                # possible, but unlikely, that the label has been moved).
                version = history.getVersionByLabel(sticky[1])
            elif sticky and sticky[0] == 'B':
                # A branch sticky tag. Update to latest version on branch.
                version = history.getLatestVersion(selector)
            else:
                # Update to mainline, forgetting any date or version id
                # sticky tag that was previously associated with the
                # object.
                version = history.getLatestVersion('mainline')
                sticky = None
        else:
            # If the selector is non-null, we find the version specified
            # and update the sticky tag. Later we'll check the version we
            # found and decide whether we really need to update the object.
            if history.hasVersionId(selector):
                version = history.getVersionById(selector)
                sticky = ('V', selector)
            elif self._labels.has_key(selector):
                version = history.getVersionByLabel(selector)
                sticky = ('L', selector)
            elif self._branches.has_key(selector):
                version = history.getLatestVersion(selector)
                if selector == 'mainline':
                    sticky = None
                else:
                    sticky = ('B', selector)
            else:
                try:
                    date = DateTime(selector)
                except:
                    raise VersionControlError(
                        'Invalid version selector: %s' % selector
                        )
                else:
                    timestamp = date.timeTime()
                    sticky = ('D', timestamp)
                    # Fix!
                    branch = history.findBranchId(info.version_id)
                    version = history.getVersionByDate(branch, timestamp)

        # If the state of the resource really needs to be changed, do the
        # update and make a log entry for the update.
        version_id = version and version.getId() or info.version_id
        new_object = object
        if version and (version_id != info.version_id):
            new_object = version.copyState()
            new_object = self.replaceState(object, new_object)

            history.addLogEntry(version_id,
                                LogEntry.ACTION_UPDATE,
                                _findPath(new_object)
                                )

        # Update bookkeeping information.
        newinfo = info.clone(1)
        newinfo.version_id = version_id
        newinfo.status = newinfo.CHECKED_IN
        if sticky is not None:
            newinfo.sticky = sticky
        new_object.__vc_info__ = newinfo
        return new_object

    security.declareProtected(use_vc_permission, 'labelResource')
    def labelResource(self, object, label, force=0):
        info = self.getVersionInfo(object)
        if info.status != info.CHECKED_IN:
            raise VersionControlError(
                'The selected resource must be checked in to be labeled.'
                )

        # Make sure that labels and branch ids do not collide.
        if self._branches.has_key(label) or label == 'mainline':
            raise VersionControlError(
                'The label value given is already in use as an activity id.'
                )
        if not self._labels.has_key(label):
            self._labels[label] = 1

        history = self.getVersionHistory(info.history_id)
        history.labelVersion(info.version_id, label, force)
        return object

    security.declareProtected(use_vc_permission, 'makeActivity')
    def makeActivity(self, object, branch_id):
        # Note - this is not part of the official version control API yet.
        # It is here to allow unit testing of the architectural aspects
        # that are already in place to support activities in the future.

        info = self.getVersionInfo(object)
        if info.status != info.CHECKED_IN:
            raise VersionControlError(
                'The selected resource must be checked in.'
                )

        branch_id = branch_id or None

        # Make sure that activity ids and labels do not collide.
        if self._labels.has_key(branch_id) or branch_id == 'mainline':
            raise VersionControlError(
                'The value given is already in use as a version label.'
                )

        if not self._branches.has_key(branch_id):
            self._branches[branch_id] = 1

        history = self.getVersionHistory(info.history_id)

        if history._branches.has_key(branch_id):
            raise VersionControlError(
                'The resource is already associated with the given activity.'
                )

        history.createBranch(branch_id, info.version_id)
        return object

    security.declareProtected(use_vc_permission, 'getVersionOfResource')
    def getVersionOfResource(self, history_id, selector):
        history = self.getVersionHistory(history_id)
        sticky = None

        if not selector or selector == 'mainline':
            version = history.getLatestVersion('mainline')
        else:
            if history.hasVersionId(selector):
                version = history.getVersionById(selector)
                sticky = ('V', selector)
            elif self._labels.has_key(selector):
                version = history.getVersionByLabel(selector)
                sticky = ('L', selector)
            elif self._branches.has_key(selector):
                version = history.getLatestVersion(selector)
                sticky = ('B', selector)
            else:
                try:
                    date = DateTime(selector)
                except:
                    raise VersionControlError(
                        'Invalid version selector: %s' % selector
                        )
                else:
                    timestamp = date.timeTime()
                    sticky = ('D', timestamp)
                    version = history.getVersionByDate('mainline', timestamp)

        object = version.copyState()

        info = VersionInfo(history_id, version.getId(),
                           VersionInfo.CHECKED_IN)
        if sticky is not None:
            info.sticky = sticky
        object.__vc_info__ = info
        return object

    security.declareProtected(use_vc_permission, 'getVersionIds')
    def getVersionIds(self, object):
        info = self.getVersionInfo(object)
        history = self.getVersionHistory(info.history_id)
        return history.getVersionIds()

    security.declareProtected(use_vc_permission, 'getLabelsForResource')
    def getLabelsForResource(self, object):
        info = self.getVersionInfo(object)
        history = self.getVersionHistory(info.history_id)
        return history.getLabels()

    security.declareProtected(use_vc_permission, 'getLogEntries')
    def getLogEntries(self, object):
        info = self.getVersionInfo(object)
        history = self.getVersionHistory(info.history_id)
        return history.getLogEntries()
class GlobbingLexicon(Lexicon): """Lexicon which supports basic globbing function ('*' and '?'). This lexicon keeps several data structures around that are useful for searching. They are: '_lexicon' -- Contains the mapping from word => word_id '_inverseLex' -- Contains the mapping from word_id => word '_digrams' -- Contains a mapping from digram => word_id Before going further, it is necessary to understand what a digram is, as it is a core component of the structure of this lexicon. A digram is a two-letter sequence in a word. For example, the word 'zope' would be converted into the digrams:: ['$z', 'zo', 'op', 'pe', 'e$'] where the '$' is a word marker. It is used at the beginning and end of the words. Those digrams are significant. """ multi_wc = '*' single_wc = '?' eow = '$' def __init__(self,useSplitter=None,extra=None): self.clear() self.useSplitter = useSplitter self.splitterParams = extra self.SplitterFunc = Splitter.getSplitter(self.useSplitter) def clear(self): self._lexicon = OIBTree() self._inverseLex = IOBTree() self._digrams = OOBTree() def _convertBTrees(self, threshold=200): Lexicon._convertBTrees(self, threshold) if type(self._digrams) is OOBTree: return from BTrees.convert import convert _digrams=self._digrams self._digrams=OOBTree() self._digrams._p_jar=self._p_jar convert(_digrams, self._digrams, threshold, IITreeSet) def createDigrams(self, word): """Returns a list with the set of digrams in the word.""" word = '$'+word+'$' return [ word[i:i+2] for i in range(len(word)-1)] def getWordId(self, word): """Provided 'word', return the matching integer word id.""" if self._lexicon.has_key(word): return self._lexicon[word] else: return self.assignWordId(word) set = getWordId # Kludge for old code def getWord(self, wid): return self._inverseLex.get(wid, None) def assignWordId(self, word): """Assigns a new word id to the provided word, and return it.""" # Double check it's not in the lexicon already, and if it is, just # return it. if self._lexicon.has_key(word): return self._lexicon[word] # Get word id. BBB Backward compat pain. inverse=self._inverseLex try: insert=inverse.insert except AttributeError: # we have an "old" BTree object if inverse: wid=inverse.keys()[-1]+1 else: self._inverseLex=IOBTree() wid=1 inverse[wid] = word else: # we have a "new" IOBTree object wid=randid() while not inverse.insert(wid, word): wid=randid() self._lexicon[word] = wid # Now take all the digrams and insert them into the digram map. for digram in self.createDigrams(word): set = self._digrams.get(digram, None) if set is None: self._digrams[digram] = set = IISet() set.insert(wid) return wid def get(self, pattern): """ Query the lexicon for words matching a pattern.""" # single word pattern produce a slicing problem below. # Because the splitter throws away single characters we can # return an empty tuple here. 
if len(pattern)==1: return () wc_set = [self.multi_wc, self.single_wc] digrams = [] globbing = 0 for i in range(len(pattern)): if pattern[i] in wc_set: globbing = 1 continue if i == 0: digrams.insert(i, (self.eow + pattern[i]) ) digrams.append((pattern[i] + pattern[i+1])) else: try: if pattern[i+1] not in wc_set: digrams.append( pattern[i] + pattern[i+1] ) except IndexError: digrams.append( (pattern[i] + self.eow) ) if not globbing: result = self._lexicon.get(pattern, None) if result is None: return () return (result, ) ## now get all of the intsets that contain the result digrams result = None for digram in digrams: result=union(result, self._digrams.get(digram, None)) if not result: return () else: ## now we have narrowed the list of possible candidates ## down to those words which contain digrams. However, ## some words may have been returned that match digrams, ## but do not match 'pattern'. This is because some words ## may contain all matching digrams, but in the wrong ## order. expr = re.compile(self.createRegex(pattern)) words = [] hits = IISet() for x in result: if expr.match(self._inverseLex[x]): hits.insert(x) return hits def __getitem__(self, word): """ """ return self.get(word) def query_hook(self, q): """expand wildcards""" ListType = type([]) i = len(q) - 1 while i >= 0: e = q[i] if isinstance(e, ListType): self.query_hook(e) elif isinstance(e, Op): pass elif ( (self.multi_wc in e) or (self.single_wc in e) ): wids = self.get(e) words = [] for wid in wids: if words: words.append(Or) words.append(wid) if not words: # if words is empty, return something that will make # textindex's __getitem__ return an empty result list words.append('') q[i] = words i = i - 1 return q def Splitter(self, astring, words=None, encoding="latin1"): """ wrap the splitter """ ## don't do anything, less efficient but there's not much ## sense in stemming a globbing lexicon. try: return self.SplitterFunc( astring, words, encoding=encoding, singlechar=self.splitterParams.splitterSingleChars, indexnumbers=self.splitterParams.splitterIndexNumbers, casefolding=self.splitterParams.splitterCasefolding ) except: return self.SplitterFunc(astring, words) def createRegex(self, pat): """Translate a PATTERN to a regular expression. There is no way to quote meta-characters. """ # Remove characters that are meaningful in a regex if not isinstance(pat, UnicodeType): transTable = string.maketrans("", "") result = string.translate(pat, transTable, r'()&|!@#$%^{}\<>.') else: transTable={} for ch in r'()&|!@#$%^{}\<>.': transTable[ord(ch)]=None result=pat.translate(transTable) # First, deal with multi-character globbing result = result.replace( '*', '.*') # Next, we need to deal with single-character globbing result = result.replace( '?', '.') return "%s$" % result
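To see why get() above finishes with a regex pass, here is a self-contained rendition of the digram filter over plain dicts: a word can contain every wildcard-free digram of a pattern yet arrange them in the wrong order, so the digram intersection only narrows the candidate set. The helper names are illustrative, not part of the original class.

import re

EOW = '$'

def create_digrams(word):
    # same scheme as createDigrams above: mark both word boundaries,
    # then take every adjacent two-character slice
    word = EOW + word + EOW
    return [word[i:i + 2] for i in range(len(word) - 1)]

def create_regex(pattern):
    # same idea as createRegex above: strip regex metacharacters,
    # then map the glob wildcards onto their regex equivalents
    cleaned = re.sub(r'[()&|!@#%^{}\\<>.$]', '', pattern)
    return cleaned.replace('*', '.*').replace('?', '.') + '$'

vocabulary = ['abba', 'acab']
digram_index = {}
for word in vocabulary:
    for digram in create_digrams(word):
        digram_index.setdefault(digram, set()).add(word)

# Both words contain the wildcard-free digrams of 'ab*' ('$a' and 'ab'),
# so the digram filter alone cannot tell them apart ...
candidates = digram_index['$a'] & digram_index['ab']
# ... which is exactly why get() re-checks candidates with the regex:
expr = re.compile(create_regex('ab*'))
print(sorted(w for w in candidates if expr.match(w)))   # ['abba']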
class Lexicon(Persistent): """ Implementation of :class:`zope.index.text.interfaces.ILexicon`. """ def __init__(self, *pipeline): self._wids = OIBTree() # word -> wid self._words = IOBTree() # wid -> word # wid 0 is reserved for words that aren't in the lexicon (OOV -- out # of vocabulary). This can happen, e.g., if a query contains a word # we never saw before, and that isn't a known stopword (or otherwise # filtered out). Returning a special wid value for OOV words is a # way to let clients know when an OOV word appears. self.wordCount = Length() self._pipeline = pipeline def wordCount(self): """Return the number of unique terms in the lexicon.""" # overridden per instance return len(self._wids) def words(self): return self._wids.keys() def wids(self): return self._words.keys() def items(self): return self._wids.items() def sourceToWordIds(self, text): if text is None: text = '' last = _text2list(text) for element in self._pipeline: last = element.process(last) if not isinstance(self.wordCount, Length): # Make sure wordCount is overridden with a BTrees.Length.Length self.wordCount = Length(self.wordCount()) # Strategically unload the length value so that we get the most # recent value written to the database to minimize conflicting wids # Because length is independent, this will load the most # recent value stored, regardless of whether MVCC is enabled self.wordCount._p_deactivate() return list(map(self._getWordIdCreate, last)) def termToWordIds(self, text): last = _text2list(text) for element in self._pipeline: last = element.process(last) wids = [] for word in last: wids.append(self._wids.get(word, 0)) return wids def parseTerms(self, text): last = _text2list(text) for element in self._pipeline: process = getattr(element, "processGlob", element.process) last = process(last) return last def isGlob(self, word): return "*" in word or "?" in word def get_word(self, wid): return self._words[wid] def get_wid(self, word): return self._wids.get(word, 0) def globToWordIds(self, pattern): # Implement * and ? just as in the shell, except the pattern # must not start with either of these prefix = "" while pattern and pattern[0] not in "*?": prefix += pattern[0] pattern = pattern[1:] if not pattern: # There were no globbing characters in the pattern wid = self._wids.get(prefix, 0) if wid: return [wid] else: return [] if not prefix: # The pattern starts with a globbing character. # This is too inefficient, so we raise an exception. raise QueryError( "pattern %r shouldn't start with glob character" % pattern) pat = prefix for c in pattern: if c == "*": pat += ".*" elif c == "?": pat += "." else: pat += re.escape(c) pat += "$" prog = re.compile(pat) keys = self._wids.keys(prefix) # Keys starting at prefix wids = [] for key in keys: if not key.startswith(prefix): break if prog.match(key): wids.append(self._wids[key]) return wids def _getWordIdCreate(self, word): wid = self._wids.get(word) if wid is None: wid = self._new_wid() self._wids[word] = wid self._words[wid] = word return wid def _new_wid(self): count = self.wordCount count.change(1) while count() in self._words: # just to be safe count.change(1) return count()
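A dict-based sketch of the wid bookkeeping above, under the simplifying assumption that the pipeline is plain whitespace splitting: indexing (sourceToWordIds) assigns fresh wids on first sight, while query-side lookups (termToWordIds) map unknown words to the reserved OOV wid 0.

class MiniLexicon:
    # dict stand-ins for the _wids/_words BTrees above
    def __init__(self):
        self._wids = {}    # word -> wid
        self._words = {}   # wid -> word

    def source_to_word_ids(self, text):
        wids = []
        for word in text.split():
            wid = self._wids.get(word)
            if wid is None:
                wid = len(self._words) + 1   # wid 0 stays reserved for OOV
                self._wids[word] = wid
                self._words[wid] = word
            wids.append(wid)
        return wids

    def term_to_word_ids(self, text):
        # lookups never create wids; unknown words map to 0 (OOV)
        return [self._wids.get(word, 0) for word in text.split()]

lex = MiniLexicon()
print(lex.source_to_word_ids('the quick fox'))  # [1, 2, 3]
print(lex.term_to_word_ids('quick wolf'))       # [2, 0]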
class DocumentMap(Persistent): """ A two-way map between addresses (e.g. location paths) and document ids. The map is a persistent object meant to live in a ZODB storage. Additionally, the map is capable of mapping 'metadata' to docids. """ _v_nextid = None family = BTrees.family32 _randrange = random.randrange docid_to_metadata = None # latch for b/c def __init__(self): self.docid_to_address = IOBTree() self.address_to_docid = OIBTree() self.docid_to_metadata = IOBTree() def docid_for_address(self, address): """ Retrieve a document id for a given address. ``address`` is a string or other hashable object which represents a token known by the application. Return the integer document id corresponding to ``address``. If ``address`` doesn't exist in the document map, return None. """ return self.address_to_docid.get(address) def address_for_docid(self, docid): """ Retrieve an address for a given document id. ``docid`` is an integer document id. Return the address corresponding to ``docid``. If ``docid`` doesn't exist in the document map, return None. """ return self.docid_to_address.get(docid) def add(self, address, docid=_marker): """ Add a new document to the document map. ``address`` is a string or other hashable object which represents a token known by the application. ``docid``, if passed, must be an int. In this case, remove any previous address stored for it before mapping it to the new address. Passing an explicit ``docid`` also removes any metadata associated with that docid. If ``docid`` is not passed, generate a new docid. Return the integer document id mapped to ``address``. """ if docid is _marker: docid = self.new_docid() self.remove_docid(docid) self.remove_address(address) self.docid_to_address[docid] = address self.address_to_docid[address] = docid return docid def remove_docid(self, docid): """ Remove a document from the document map for the given document ID. ``docid`` is an integer document id. Remove any corresponding metadata for ``docid`` as well. Return a True if ``docid`` existed in the map, else return False. """ # It should be an invariant that if one entry exists in # docid_to_address for a docid/address pair, exactly one # corresponding entry exists in address_to_docid for the same # docid/address pair. However, versions of this code before # r.catalog 0.7.3 had a bug which, if this method was called # multiple times, each time with the same address but a # different docid, the ``docid_to_address`` mapping could # contain multiple entries for the same address each with a # different docid, causing this invariant to be violated. The # symptom: in systems that used r.catalog 0.7.2 and lower, # there might be more entries in docid_to_address than there # are in address_to_docid. The conditional fuzziness in the # code directly below is a runtime kindness to systems in that # state. Technically, the administrator of a system in such a # state should normalize the two data structures by running a # script after upgrading to 0.7.3. If we made the admin do # this, some of the code fuzziness below could go away, # replaced with something simpler. But there's no sense in # breaking systems at runtime through being a hardass about # consistency if an unsuspecting upgrader has not yet run the # data fixer script. 
The "fix the data" mantra rings a # little hollow when you weren't the one who broke the data in # the first place ;-) self._check_metadata() address = self.docid_to_address.get(docid, _marker) if address is _marker: return False old_docid = self.address_to_docid.get(address, _marker) if (old_docid is not _marker) and (old_docid != docid): self.remove_docid(old_docid) if docid in self.docid_to_address: del self.docid_to_address[docid] if address in self.address_to_docid: del self.address_to_docid[address] if docid in self.docid_to_metadata: del self.docid_to_metadata[docid] return True def remove_address(self, address): """ Remove a document from the document map using an address. ``address`` is a string or other hashable object which represents a token known by the application. Remove any corresponding metadata for ``address`` as well. Return a True if ``address`` existed in the map, else return False. """ # See the comment in remove_docid for complexity rationalization self._check_metadata() docid = self.address_to_docid.get(address, _marker) if docid is _marker: return False old_address = self.docid_to_address.get(docid, _marker) if (old_address is not _marker) and (old_address != address): self.remove_address(old_address) if docid in self.docid_to_address: del self.docid_to_address[docid] if address in self.address_to_docid: del self.address_to_docid[address] if docid in self.docid_to_metadata: del self.docid_to_metadata[docid] return True def _check_metadata(self): # backwards compatibility if self.docid_to_metadata is None: self.docid_to_metadata = IOBTree() def add_metadata(self, docid, data): """ Add metadata related to a given document id. ``data`` must be a mapping, such as a dictionary. For each key/value pair in ``data`` insert a metadata key/value pair into the metadata stored for ``docid``. Overwrite any existing values for the keys in ``data``, leaving values unchanged for other existing keys. Raise a KeyError If ``docid`` doesn't relate to an address in the document map. """ if not docid in self.docid_to_address: raise KeyError(docid) if len(list(data.keys())) == 0: return self._check_metadata() meta = self.docid_to_metadata.setdefault(docid, OOBTree()) for k in data: meta[k] = data[k] def remove_metadata(self, docid, *keys): """ Remove metadata related to a given document id. If ``docid`` doesn't exist in the metadata map, raise a KeyError. For each key in ``keys``, remove the metadata value for the docid related to that key. Do not raise any error if no value exists for a given key. If no keys are specified, remove all metadata related to the docid. """ self._check_metadata() if keys: meta = self.docid_to_metadata.get(docid, _marker) if meta is _marker: raise KeyError(docid) for k in keys: if k in meta: del meta[k] if not meta: del self.docid_to_metadata[docid] else: if not (docid in self.docid_to_metadata): raise KeyError(docid) del self.docid_to_metadata[docid] def get_metadata(self, docid): """ Return the metadata for ``docid``. Return a mapping of the keys and values set using ``add_metadata``. Raise a KeyError If metadata does not exist for ``docid``. """ if self.docid_to_metadata is None: raise KeyError(docid) meta = self.docid_to_metadata[docid] return meta def new_docid(self): """ Return a new document id. The returned value is guaranteed not to be used already in this document map. 
""" while True: if self._v_nextid is None: self._v_nextid = self._randrange(self.family.minint, self.family.maxint) uid = self._v_nextid self._v_nextid += 1 if uid not in self.docid_to_address: return uid self._v_nextid = None
def __init__(self): self.docid_to_address = IOBTree() self.address_to_docid = OIBTree() self.docid_to_metadata = IOBTree()
def clear(self): self._length = Length() self._index = OIBTree() self._unindex = IOBTree() self._counter = Length()
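A plausible reading of the _counter in the clear() variant above (a hypothetical reading, not stated in the snippet): it serves as a generation stamp, bumped rather than reset so that consumers caching derived state can detect that the index changed underneath them. A toy version with a plain int in place of BTrees.Length.Length:

class CountedIndex:
    # toy stand-in: a plain int where the snippet uses
    # BTrees.Length.Length (hypothetical reading of its purpose)
    def __init__(self):
        self._counter = 0
        self.clear()

    def clear(self):
        self._data = {}
        self._counter += 1    # bump, never reset, so caches notice

    def getCounter(self):
        return self._counter

index = CountedIndex()
stamp = index.getCounter()
index.clear()
assert index.getCounter() != stamp   # cached consumers know to refresh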
class Catalog(Persistent, Acquisition.Implicit, ExtensionClass.Base): """ An Object Catalog An Object Catalog maintains a table of object metadata, and a series of manageable indexes to quickly search for objects (references in the metadata) that satisfy a search query. This class is not Zope specific, and can be used in any python program to build catalogs of objects. Note that it does require the objects to be Persistent, and thus must be used with ZODB3. """ _v_brains = NoBrainer def __init__(self, vocabulary=None, brains=None): # Catalogs no longer care about vocabularies and lexicons # so the vocabulary argument is ignored. (Casey) self.schema = {} # mapping from attribute name to column number self.names = () # sequence of column names self.indexes = {} # mapping from index name to index object # The catalog maintains a BTree of object meta_data for # convenient display on result pages. meta_data attributes # are turned into brain objects and returned by # searchResults. The indexing machinery indexes all records # by an integer id (rid). self.data is a mapping from the # integer id to the meta_data, self.uids is a mapping of the # object unique identifier to the rid, and self.paths is a # mapping of the rid to the unique identifier. self.clear() if brains is not None: self._v_brains = brains self.updateBrains() def __len__(self): return self._length() def clear(self): """ clear catalog """ self.data = IOBTree() # mapping of rid to meta_data self.uids = OIBTree() # mapping of uid to rid self.paths = IOBTree() # mapping of rid to uid self._length = BTrees.Length.Length() for index in self.indexes.keys(): self.getIndex(index).clear() def updateBrains(self): self.useBrains(self._v_brains) def __getitem__(self, index): """ Returns instances of self._v_brains, or whatever is passed into self.useBrains. """ if isinstance(index, tuple): # then it contains a score... normalized_score, score, key = index else: # otherwise no score, set all scores to 1 normalized_score, score, key = (1, 1, index) data = self.data[key] klass = self._v_result_class schema_len = len(klass.__record_schema__) if schema_len == len(data) + 3: # if we have complete data, create in a single pass r = klass(tuple(data) + (key, score, normalized_score)) else: r = klass(data) r.data_record_id_ = key r.data_record_score_ = score r.data_record_normalized_score_ = normalized_score r = r.__of__(aq_parent(self)) return r def __setstate__(self, state): """ initialize your brains. This method is called when the catalog is first activated (from the persistent storage) """ Persistent.__setstate__(self, state) self.updateBrains() def useBrains(self, brains): """ Sets up the Catalog to return an object (ala ZTables) that is created on the fly from the tuple stored in the self.data Btree. """ class mybrains(AbstractCatalogBrain, brains): pass scopy = self.schema.copy() schema_len = len(self.schema.keys()) scopy['data_record_id_'] = schema_len scopy['data_record_score_'] = schema_len + 1 scopy['data_record_normalized_score_'] = schema_len + 2 mybrains.__record_schema__ = scopy self._v_brains = brains self._v_result_class = mybrains def addColumn(self, name, default_value=None, threshold=10000): """Adds a column to the meta data schema""" schema = self.schema names = list(self.names) if name != name.strip(): # Someone could have mistakenly added a space at the end # of the input field.
LOG.warn("stripped space from new column %r -> %r", name, name.strip()) name = name.strip() if name in schema: raise CatalogError('The column %s already exists' % name) if name[0] == '_': raise CatalogError('Cannot cache fields beginning with "_"') values = schema.values() if values: schema[name] = max(values) + 1 else: schema[name] = 0 names.append(name) if default_value in (None, ''): default_value = MV if len(self): pghandler = ZLogHandler(threshold) pghandler.init('Adding %s column' % name, len(self)) for i, (key, value) in enumerate(self.data.iteritems()): pghandler.report(i) self.data[key] = value + (default_value, ) pghandler.finish() self.names = tuple(names) self.schema = schema # new column? update the brain self.updateBrains() def delColumn(self, name, threshold=10000): """Deletes a row from the meta data schema""" names = list(self.names) _index = names.index(name) if not name in self.schema: LOG.error('delColumn attempted to delete nonexistent ' 'column %s.' % str(name)) return del names[_index] # rebuild the schema schema = {} for i, name in enumerate(names): schema[name] = i self.schema = schema self.names = tuple(names) # update the brain self.updateBrains() # remove the column value from each record if len(self): _next_index = _index + 1 pghandler = ZLogHandler(threshold) pghandler.init('Deleting %s column' % name, len(self)) for i, (key, value) in enumerate(self.data.iteritems()): pghandler.report(i) self.data[key] = value[:_index] + value[_next_index:] pghandler.finish() def addIndex(self, name, index_type): """Create a new index, given a name and a index_type. Old format: index_type was a string, 'FieldIndex' 'TextIndex' or 'KeywordIndex' is no longer valid; the actual index must be instantiated and passed in to addIndex. New format: index_type is the actual index object to be stored. """ if name in self.indexes: raise CatalogError('The index %s already exists' % name) if name.startswith('_'): raise CatalogError('Cannot index fields beginning with "_"') if not name: raise CatalogError('Name of index is empty') if name != name.strip(): # Someone could have mistakenly added a space at the end # of the input field. LOG.warn("stripped space from new index %r -> %r", name, name.strip()) name = name.strip() indexes = self.indexes if isinstance(index_type, str): raise TypeError("Catalog addIndex now requires the index type to" "be resolved prior to adding; create the proper " "index in the caller.") indexes[name] = index_type self.indexes = indexes def delIndex(self, name): """ deletes an index """ if not name in self.indexes: raise CatalogError('The index %s does not exist' % name) indexes = self.indexes del indexes[name] self.indexes = indexes def getIndex(self, name): """ get an index wrapped in the catalog """ return self.indexes[name].__of__(self) def updateMetadata(self, object, uid, index): """ Given an object and a uid, update the column data for the uid with the object data iff the object has changed """ data = self.data newDataRecord = self.recordify(object) if index is None: index = getattr(self, '_v_nextid', 0) if index % 4000 == 0: index = randint(-2000000000, 2000000000) while not data.insert(index, newDataRecord): index = randint(-2000000000, 2000000000) # We want ids to be somewhat random, but there are # advantages for having some ids generated # sequentially when many catalog updates are done at # once, such as when reindexing or bulk indexing. # We allocate ids sequentially using a volatile base, # so different threads get different bases. 
This # further reduces conflict and reduces churn in # here and in result sets when bulk indexing. self._v_nextid = index + 1 else: if data.get(index, 0) != newDataRecord: data[index] = newDataRecord return index # the cataloging API def catalogObject(self, object, uid, threshold=None, idxs=None, update_metadata=True): """ Adds an object to the Catalog by iteratively applying it to all indexes. 'object' is the object to be cataloged 'uid' is the unique Catalog identifier for this object If 'idxs' is specified (as a sequence), apply the object only to the named indexes. If 'update_metadata' is true (the default), also update metadata for the object. If the object is new to the catalog, this flag has no effect (metadata is always created for new objects). """ if idxs is None: idxs = [] index = self.uids.get(uid, None) if index is None: # we are inserting new data index = self.updateMetadata(object, uid, None) self._length.change(1) self.uids[uid] = index self.paths[index] = uid elif update_metadata: # we are updating and we need to update metadata self.updateMetadata(object, uid, index) # do indexing total = 0 if idxs == []: use_indexes = self.indexes.keys() else: use_indexes = idxs for name in use_indexes: x = self.getIndex(name) if hasattr(x, 'index_object'): blah = x.index_object(index, object, threshold) total = total + blah else: LOG.error('catalogObject was passed bad index ' 'object %s.' % str(x)) return total def uncatalogObject(self, uid): """ Uncatalog an object from the Catalog. 'uid' is the unique Catalog identifier for the object. Note: the uid must be the same as when the object was cataloged, otherwise it will not get removed from the catalog. This method should not raise an exception if the uid cannot be found in the catalog. """ data = self.data uids = self.uids paths = self.paths indexes = self.indexes.keys() rid = uids.get(uid, None) if rid is not None: for name in indexes: x = self.getIndex(name) if hasattr(x, 'unindex_object'): x.unindex_object(rid) del data[rid] del paths[rid] del uids[uid] self._length.change(-1) else: LOG.error('uncatalogObject unsuccessfully ' 'attempted to uncatalog an object ' 'with a uid of %s. ' % str(uid)) def uniqueValuesFor(self, name): """ return unique values for FieldIndex name """ return tuple(self.getIndex(name).uniqueValues()) def hasuid(self, uid): """ return the rid if catalog contains an object with uid """ return self.uids.get(uid) def recordify(self, object): """ turns an object into a record tuple """ record = [] # the unique id is always the first element for x in self.names: attr = getattr(object, x, MV) if (attr is not MV and safe_callable(attr)): attr = attr() record.append(attr) return tuple(record) def instantiate(self, record): r = self._v_result_class(record[1]) r.data_record_id_ = record[0] return r.__of__(self) def getMetadataForRID(self, rid): record = self.data[rid] result = {} for (key, pos) in self.schema.items(): result[key] = record[pos] return result def getIndexDataForRID(self, rid): result = {} for name in self.indexes.keys(): result[name] = self.getIndex(name).getEntryForObject(rid, "") return result # This is the Catalog search engine.
Most of the heavy lifting happens # below def make_query(self, request): # This is a bit of a mess, but the ZCatalog API has traditionally # supported passing in query restrictions in almost arbitrary ways real_req = None if isinstance(request, dict): query = request.copy() elif isinstance(request, CatalogSearchArgumentsMap): query = {} query.update(request.keywords) real_req = request.request if isinstance(real_req, dict): query.update(real_req) real_req = None else: real_req = request if real_req: warnings.warn('You have specified a query using either a request ' 'object or a mixture of a query dict and keyword ' 'arguments. Please use only a simple query dict. ' 'Your query contained "%s". This support is ' 'deprecated and will be removed in Zope 4.' % repr(real_req), DeprecationWarning, stacklevel=4) known_keys = query.keys() # The request has too many places where an index restriction # might be specified. Putting all of request.form, # request.other, ... into the query isn't what we want. # So we iterate over all known indexes instead and see if they # are in the request. for iid in self.indexes.keys(): if iid in known_keys: continue value = real_req.get(iid) if value: query[iid] = value return query def _get_index_query_names(self, index): if hasattr(index, 'getIndexQueryNames'): return index.getIndexQueryNames() return (index.getId(),) def _sorted_search_indexes(self, query): # Simple implementation ordering only by limited result support query_keys = query.keys() order = [] for name, index in self.indexes.items(): for attr in self._get_index_query_names(index): if attr in query_keys: order.append((ILimitedResultIndex.providedBy(index), name)) order.sort() return [i[1] for i in order] def _limit_sequence(self, sequence, slen, b_start=0, b_size=None, switched_reverse=False): if b_size is not None: sequence = sequence[b_start:b_start + b_size] if slen: slen = len(sequence) if switched_reverse: sequence.reverse() return (sequence, slen) def search(self, query, sort_index=None, reverse=False, limit=None, merge=True): """Iterate through the indexes, applying the query to each one. If merge is true then return a lazy result set (sorted if appropriate) otherwise return the raw (possibly scored) results for later merging. Limit is used in conjunction with sorting or scored results to inform the catalog how many results you are really interested in. The catalog can then use optimizations to save time and memory. The number of results is not guaranteed to fall within the limit, however, so you should still slice or batch the results as usual.""" # Indexes fulfill a fairly large contract here. We hand each # index the query mapping we are given (which may be composed # of some combination of web request, kw mappings or plain old dicts) # and the index decides what to do with it. If the index finds work # for itself in the query, it returns the results and a tuple of # the attributes that were used. If the index finds nothing for it # to do then it returns None.
# Canonicalize the request into a sensible query before passing it on query = self.make_query(query) cr = self.getCatalogPlan(query) cr.start() plan = cr.plan() if not plan: plan = self._sorted_search_indexes(query) rs = None # result set indexes = self.indexes.keys() for i in plan: if i not in indexes: # We can have bogus keys or the plan can contain index names # that have been removed in the meantime continue index = self.getIndex(i) _apply_index = getattr(index, "_apply_index", None) if _apply_index is None: continue cr.start_split(i) limit_result = ILimitedResultIndex.providedBy(index) if limit_result: r = _apply_index(query, rs) else: r = _apply_index(query) if r is not None: r, u = r # Short circuit if empty result # BBB: We can remove the "r is not None" check in Zope 4 # once we don't need to support the "return everything" case # anymore if r is not None and not r: cr.stop_split(i, result=None, limit=limit_result) return LazyCat([]) # provide detailed info about the pure intersection time intersect_id = i + '#intersection' cr.start_split(intersect_id) # weightedIntersection preserves the values from any mappings # we get, as some indexes don't return simple sets if hasattr(rs, 'items') or hasattr(r, 'items'): _, rs = weightedIntersection(rs, r) else: rs = intersection(rs, r) cr.stop_split(intersect_id) # consider the time it takes to intersect the index result # with the total result set to be part of the index time cr.stop_split(i, result=r, limit=limit_result) if not rs: break else: cr.stop_split(i, result=None, limit=limit_result) # Try to deduce the sort limit from batching arguments b_start = int(query.get('b_start', 0)) b_size = query.get('b_size', None) if b_size is not None: b_size = int(b_size) if b_size is not None: limit = b_start + b_size elif limit and b_size is None: b_size = limit if sort_index is None: sort_report_name = None else: if isinstance(sort_index, list): sort_name = '-'.join(i.getId() for i in sort_index) else: sort_name = sort_index.getId() if isinstance(reverse, list): reverse_name = '-'.join( 'desc' if r else 'asc' for r in reverse) else: reverse_name = 'desc' if reverse else 'asc' sort_report_name = 'sort_on#' + sort_name + '#' + reverse_name if limit is not None: sort_report_name += '#limit-%s' % limit if rs is None: # None of the indexes found anything to do with the query # We take this to mean that the query was empty (an empty filter) # and so we return everything in the catalog warnings.warn('Your query %s produced no query restriction. ' 'Currently the entire catalog content is returned. ' 'In Zope 4 this will result in an empty LazyCat ' 'to be returned.' % repr(cr.make_key(query)), DeprecationWarning, stacklevel=3) rlen = len(self) if sort_index is None: sequence, slen = self._limit_sequence(self.data.items(), rlen, b_start, b_size) result = LazyMap(self.instantiate, sequence, slen, actual_result_count=rlen) else: cr.start_split(sort_report_name) result = self.sortResults( self.data, sort_index, reverse, limit, merge, actual_result_count=rlen, b_start=b_start, b_size=b_size) cr.stop_split(sort_report_name, None) elif rs: # We got some results from the indexes. # Sort and convert to sequences. # XXX: The check for 'values' is really stupid since we call # items() and *not* values() rlen = len(rs) if sort_index is None and hasattr(rs, 'items'): # having a 'items' means we have a data structure with # scores. Build a new result set, sort it by score, reverse # it, compute the normalized score, and Lazify it. 
if not merge: # Don't bother to sort here, return a list of # three tuples to be passed later to mergeResults # note that data_record_normalized_score_ cannot be # calculated and will always be 1 in this case getitem = self.__getitem__ result = [(score, (1, score, rid), getitem) for rid, score in rs.items()] else: cr.start_split('sort_on#score') # sort it by score rs = rs.byValue(0) max = float(rs[0][0]) # Here we define our getter function inline so that # we can conveniently store the max value as a default arg # and make the normalized score computation lazy def getScoredResult(item, max=max, self=self): """ Returns instances of self._v_brains, or whatever is passed into self.useBrains. """ score, key = item data = self.data[key] klass = self._v_result_class schema_len = len(klass.__record_schema__) norm_score = int(100.0 * score / max) if schema_len == len(data) + 3: r = klass(tuple(data) + (key, score, norm_score)) else: r = klass(data) r.data_record_id_ = key r.data_record_score_ = score r.data_record_normalized_score_ = norm_score r = r.__of__(aq_parent(self)) return r sequence, slen = self._limit_sequence(rs, rlen, b_start, b_size) result = LazyMap(getScoredResult, sequence, slen, actual_result_count=rlen) cr.stop_split('sort_on#score', None) elif sort_index is None and not hasattr(rs, 'values'): # no scores if hasattr(rs, 'keys'): rs = rs.keys() sequence, slen = self._limit_sequence(rs, rlen, b_start, b_size) result = LazyMap(self.__getitem__, sequence, slen, actual_result_count=rlen) else: # sort. If there are scores, then this block is not # reached, therefore 'sort-on' does not happen in the # context of a text index query. This should probably # sort by relevance first, then the 'sort-on' attribute. cr.start_split(sort_report_name) result = self.sortResults(rs, sort_index, reverse, limit, merge, actual_result_count=rlen, b_start=b_start, b_size=b_size) cr.stop_split(sort_report_name, None) else: # Empty result set result = LazyCat([]) cr.stop() return result def sortResults(self, rs, sort_index, reverse=False, limit=None, merge=True, actual_result_count=None, b_start=0, b_size=None): # Sort a result set using one or more sort indexes. Both sort_index # and reverse can be lists of indexes and reverse specifications. # Return a lazy result set in sorted order if merge is true otherwise # returns a list of (sortkey, uid, getter_function) tuples, where # sortkey can be a tuple on its own. 
second_indexes = None second_indexes_key_map = None sort_index_length = 1 if isinstance(sort_index, list): sort_index_length = len(sort_index) if sort_index_length > 1: second_indexes = sort_index[1:] second_indexes_key_map = [] for si in second_indexes: second_indexes_key_map.append(si.documentToKeyMap()) sort_index = sort_index[0] _self__getitem__ = self.__getitem__ index_key_map = sort_index.documentToKeyMap() result = [] r_append = result.append r_insert = result.insert if hasattr(rs, 'keys'): rs = rs.keys() if actual_result_count is None: rlen = len(rs) actual_result_count = rlen else: rlen = actual_result_count # don't limit to more than what we have if limit is not None and limit >= rlen: limit = rlen # if we want a batch from the end of the result set, reverse sorting # order and limit it, then reverse the result set again switched_reverse = False if b_size and b_start and b_start > rlen / 2: if isinstance(reverse, list): reverse = [not r for r in reverse] else: reverse = not reverse switched_reverse = True b_end = b_start + b_size if b_end >= rlen: overrun = rlen - b_end if b_start >= rlen: # bail out, we are outside the possible range return LazyCat([], 0, actual_result_count) else: b_size += overrun b_start = 0 else: b_start = rlen - b_end limit = b_start + b_size # determine sort_spec if isinstance(reverse, list): sort_spec = [r and -1 or 1 for r in reverse] # limit to current maximum of sort indexes sort_spec = sort_spec[:sort_index_length] # use first sort order for choosing the algorithm first_reverse = reverse[0] else: sort_spec = [] for i in xrange(sort_index_length): sort_spec.append(reverse and -1 or 1) first_reverse = reverse if merge and limit is None and ( rlen > (len(sort_index) * (rlen / 100 + 1))): # The result set is much larger than the sorted index, # so iterate over the sorted index for speed. # TODO: len(sort_index) isn't actually what we want for a keyword # index, as it's only the unique values, not the documents. # Don't use this case while using limit, as we return results of # non-flattened intsets, and would have to merge/unflattened those # before limiting. length = 0 try: intersection(rs, IISet(())) except TypeError: # rs is not an object in the IIBTree family. # Try to turn rs into an IISet. rs = IISet(rs) if sort_index_length == 1: for k, intset in sort_index.items(): # We have an index that has a set of values for # each sort key, so we intersect with each set and # get a sorted sequence of the intersections. intset = intersection(rs, intset) if intset: keys = getattr(intset, 'keys', None) if keys is not None: # Is this ever true? intset = keys() length += len(intset) r_append((k, intset, _self__getitem__)) result.sort(reverse=reverse) else: for k, intset in sort_index.items(): # We have an index that has a set of values for # each sort key, so we intersect with each set and # get a sorted sequence of the intersections. intset = intersection(rs, intset) if intset: keys = getattr(intset, 'keys', None) if keys is not None: # Is this ever true? 
intset = keys() length += len(intset) # sort on secondary index keysets = defaultdict(list) for i in intset: full_key = (k, ) for km in second_indexes_key_map: try: full_key += (km[i], ) except KeyError: pass keysets[full_key].append(i) for k2, v2 in keysets.items(): r_append((k2, v2, _self__getitem__)) result = multisort(result, sort_spec) sequence, slen = self._limit_sequence(result, length, b_start, b_size, switched_reverse) result = LazyCat(LazyValues(sequence), slen, actual_result_count) elif limit is None or (limit * 4 > rlen): # Iterate over the result set getting sort keys from the index. # If we are interested in at least 25% or more of the result set, # the N-Best algorithm is slower, so we iterate over all. if sort_index_length == 1: for did in rs: try: key = index_key_map[did] except KeyError: # This document is not in the sort key index, skip it. actual_result_count -= 1 else: # The reference back to __getitem__ is used in case # we do not merge now and need to intermingle the # results with those of other catalogs while avoiding # the cost of instantiating a LazyMap per result r_append((key, did, _self__getitem__)) if merge: result.sort(reverse=reverse) else: for did in rs: try: full_key = (index_key_map[did], ) for km in second_indexes_key_map: full_key += (km[did], ) except KeyError: # This document is not in the sort key index, skip it. actual_result_count -= 1 else: r_append((full_key, did, _self__getitem__)) if merge: result = multisort(result, sort_spec) if merge: if limit is not None: result = result[:limit] sequence, _ = self._limit_sequence(result, 0, b_start, b_size, switched_reverse) result = LazyValues(sequence) result.actual_result_count = actual_result_count else: sequence, _ = self._limit_sequence(result, 0, b_start, b_size, switched_reverse) return sequence elif first_reverse: # Limit / sort results using N-Best algorithm # This is faster for large sets then a full sort # And uses far less memory keys = [] k_insert = keys.insert n = 0 worst = None if sort_index_length == 1: for did in rs: try: key = index_key_map[did] except KeyError: # This document is not in the sort key index, skip it. actual_result_count -= 1 else: if n >= limit and key <= worst: continue i = bisect(keys, key) k_insert(i, key) r_insert(i, (key, did, _self__getitem__)) if n == limit: del keys[0], result[0] else: n += 1 worst = keys[0] result.reverse() else: for did in rs: try: key = index_key_map[did] full_key = (key, ) for km in second_indexes_key_map: full_key += (km[did], ) except KeyError: # This document is not in the sort key index, skip it. actual_result_count -= 1 else: if n >= limit and key <= worst: continue i = bisect(keys, key) k_insert(i, key) r_insert(i, (full_key, did, _self__getitem__)) if n == limit: del keys[0], result[0] else: n += 1 worst = keys[0] result = multisort(result, sort_spec) sequence, _ = self._limit_sequence(result, 0, b_start, b_size, switched_reverse) if merge: result = LazyValues(sequence) result.actual_result_count = actual_result_count else: return sequence elif not first_reverse: # Limit / sort results using N-Best algorithm in reverse (N-Worst?) keys = [] k_insert = keys.insert n = 0 best = None if sort_index_length == 1: for did in rs: try: key = index_key_map[did] except KeyError: # This document is not in the sort key index, skip it. 
actual_result_count -= 1 else: if n >= limit and key >= best: continue i = bisect(keys, key) k_insert(i, key) r_insert(i, (key, did, _self__getitem__)) if n == limit: del keys[-1], result[-1] else: n += 1 best = keys[-1] else: for did in rs: try: key = index_key_map[did] full_key = (key, ) for km in second_indexes_key_map: full_key += (km[did], ) except KeyError: # This document is not in the sort key index, skip it. actual_result_count -= 1 else: if n >= limit and key >= best: continue i = bisect(keys, key) k_insert(i, key) r_insert(i, (full_key, did, _self__getitem__)) if n == limit: del keys[-1], result[-1] else: n += 1 best = keys[-1] result = multisort(result, sort_spec) sequence, _ = self._limit_sequence(result, 0, b_start, b_size, switched_reverse) if merge: result = LazyValues(sequence) result.actual_result_count = actual_result_count else: return sequence return LazyMap(self.__getitem__, result, len(result), actual_result_count=actual_result_count) def _get_sort_attr(self, attr, kw): """Helper function to find sort-on or sort-order.""" # There are three different ways to find the attribute: # 1. kw[sort-attr] # 2. self.sort-attr # 3. kw[sort_attr] # kw may be a dict or an ExtensionClass MultiMapping, which # differ in what get() returns with no default value. name = "sort-%s" % attr val = kw.get(name, None) if val is not None: return val val = getattr(self, name, None) if val is not None: return val return kw.get("sort_%s" % attr, None) def _getSortIndex(self, args): """Returns a list of search index objects or None.""" sort_index_names = self._get_sort_attr("on", args) if sort_index_names is not None: # self.indexes is always a dict, so get() w/ 1 arg works sort_indexes = [] if not isinstance(sort_index_names, (list, tuple)): sort_index_names = [sort_index_names] for name in sort_index_names: sort_index = self.indexes.get(name) if sort_index is None: raise CatalogError('Unknown sort_on index: %s' % repr(name)) else: if not hasattr(sort_index, 'documentToKeyMap'): raise CatalogError('The index chosen for sort_on is ' 'not capable of being used as a sort index: ' '%s' % repr(name)) sort_indexes.append(sort_index) if len(sort_indexes) == 1: # be nice and keep the old API intact for single sort_on's return sort_indexes[0] return sort_indexes return None def searchResults(self, REQUEST=None, used=None, _merge=True, **kw): # You should pass in a simple dictionary as the request argument, # which only contains the relevant query. # The used argument is deprecated and is ignored if REQUEST is None and not kw: # Try to acquire request if we get no args for bw compat warnings.warn('Calling searchResults without a query argument nor ' 'keyword arguments is deprecated. 
In Zope 4 the ' 'query will no longer be automatically taken from ' 'the acquired request.', DeprecationWarning, stacklevel=3) REQUEST = getattr(self, 'REQUEST', None) if isinstance(REQUEST, dict) and not kw: # short cut for the best practice args = REQUEST else: args = CatalogSearchArgumentsMap(REQUEST, kw) sort_indexes = self._getSortIndex(args) sort_limit = self._get_sort_attr('limit', args) reverse = False if sort_indexes is not None: order = self._get_sort_attr("order", args) reverse = [] if order is None: order = [''] elif isinstance(order, str): order = [order] for o in order: reverse.append(o.lower() in ('reverse', 'descending')) if len(reverse) == 1: # be nice and keep the old API intact for single sort_order reverse = reverse[0] # Perform searches with indexes and sort_index return self.search(args, sort_indexes, reverse, sort_limit, _merge) __call__ = searchResults def getCatalogPlan(self, query=None): """Query time reporting and planning. """ parent = aq_base(aq_parent(self)) threshold = getattr(parent, 'long_query_time', 0.1) return CatalogPlan(self, query, threshold)
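The limit branches of sortResults keep only the N best keys with bisect rather than sorting the whole result set. The standalone helper below reproduces that bookkeeping (skip anything no better than the current worst, insert in sorted position, evict the worst once full) without the catalog plumbing; the function name and plain tuples are illustrative.

from bisect import bisect

def n_best(pairs, limit):
    # pairs: iterable of (sort_key, docid); returns the `limit`
    # largest keys, best last, mirroring the first_reverse branch
    keys, result, worst = [], [], None
    n = 0
    for key, did in pairs:
        if n >= limit and key <= worst:
            continue
        i = bisect(keys, key)
        keys.insert(i, key)
        result.insert(i, (key, did))
        if n == limit:
            del keys[0], result[0]
        else:
            n += 1
        worst = keys[0]
    return result

print(n_best([(3, 'a'), (9, 'b'), (1, 'c'), (7, 'd')], 2))
# [(7, 'd'), (9, 'b')]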
class IntegerRangesIndex(SimpleItem): """ Index a set of integer ranges: [(1,2), (12,23), (12, 22)] """ implements(IPluggableIndex) meta_type = 'IntegerRangesIndex' def __init__(self, id, caller=None, extra=None): self.id = id self.caller = caller self.clear() self.__genid = 0 def __len__(self): return self._length() def getId(self): """Return Id of index.""" return self.id def clear(self): """Empty the index""" IOBTree = BTrees.family64.IO.BTree self._index = IOBTree() # {rangeid: [document_id, ...]} self._unindex = IOBTree() # {document_id: [rangeid, ...]} self._range_mapping = IOBTree() # {rangeid: range} self._reverse_range_mapping = OIBTree() # {range: rangeid} self._since_index = IOBTree() # {since: [rangeid,...]} self._until_index = IOBTree() # {until: [rangeid,...]} self._length = BTrees.Length.Length() self._unique_values_length = BTrees.Length.Length() def __get_range_id(self, range_): return self._reverse_range_mapping.get(range_, None) def __get_range(self, range_id): return self._range_mapping.get(range_id, None) def __index_range(self, range_): """ index range if needed and return the rangeid """ range_id = self.__get_range_id(range_) if range_id is None: range_id = self.genid() # index range self._unique_values_length.change(1) self._range_mapping[range_id] = range_ self._reverse_range_mapping[range_] = range_id # index range boundaries since, until = range_ self.__insert_in_index_set(self._since_index, since, range_id) self.__insert_in_index_set(self._until_index, until, range_id) return range_id def __unindex_range(self, range_id): range_ = self.__get_range(range_id) if range_ is None: return None since, until = range_ self.__remove_in_index_set(self._since_index, since, range_id) self.__remove_in_index_set(self._until_index, until, range_id) self._unique_values_length.change(-1) del self._range_mapping[range_id] del self._reverse_range_mapping[range_] return range_ def genid(self): self.__genid += 1 return self.__genid def getEntryForObject(self, document_id, default=_marker): """Get all information contained for 'document_id'.""" if default is _marker: return self._unindex.get(document_id) else: return self._unindex.get(document_id, default) def getIndexSourceNames(self): """Get a sequence of attribute names that are indexed by the index. """ return [self.id] def index_object(self, document_id, obj, threshold=None): """Index an object. 'document_id' is the integer ID of the document. 'obj' is the object to be indexed. 'threshold' is the number of words to process between committing subtransactions. If None, subtransactions are disabled. """ new_ranges = self._get_object_data(obj, self.id) if new_ranges: new_set = IISet(map(self.__index_range, new_ranges)) else: new_set = IISet() old_set = self._unindex.get(document_id, IISet()) new_entries = difference(new_set, old_set) expired_entries = difference(old_set, new_set) if not (new_entries or expired_entries): # nothing to do, bail out !
return 0 for expired_entry in expired_entries: self.__remove_in_index_set(self._unindex, document_id, expired_entry) if self.__remove_in_index_set(self._index, expired_entry, \ document_id): # range is not used anymore, retire it self.__unindex_range(expired_entry) for new_entry in new_entries: if self.__insert_in_index_set(self._unindex, document_id, new_entry): self._length.change(1) self.__insert_in_index_set(self._index, new_entry, document_id) return 1 def unindex_object(self, document_id): """Remove the document_id from the index.""" entries = self._unindex.get(document_id, _marker) if entries is _marker: return if isinstance(entries, int): entries = [entries] for expired_entry in entries: if self.__remove_in_index_set(self._index, expired_entry, \ document_id): # range is not used anymore, retire it self.__unindex_range(expired_entry) self._length.change(-1) del self._unindex[document_id] def __insert_in_index_set(self, index, key, value, set_type=IISet): """ Insert value in the index. If the key was not present and the index row was created, it returns True """ index_row = index.get(key, _marker) if index_row is _marker: index[key] = value return True if isinstance(index_row, set_type): index_row.insert(value) return False # it was an int index[key] = set_type((index_row, value,)) return False def __remove_in_index_set(self, index, key, value, set_type=IISet): """ Remove the value from the index; the index row is a Set. It returns True if the index row has been removed (the set was empty). """ index_row = index.get(key, _marker) if index_row is _marker: return True if isinstance(index_row, IISet): index_row.remove(value) if len(index_row) == 0: del index[key] return True if len(index_row) == 1: index[key] = index_row[0] return False del index[key] return True def _apply_index(self, request): record = parseIndexRequest(request, self.id) try: qstart, qend = record.keys except TypeError: return None minint = BTrees.family64.minint maxint = BTrees.family64.maxint qstart = min(maxint, max(minint, qstart)) qend = max(minint, min(maxint, qend)) # query start is inside a range start = multiunion(self._since_index.values(max=qstart)) end = multiunion(self._until_index.values(min=qstart)) start_into = intersection(start, end) # query end is inside a range start = multiunion(self._since_index.values(max=qend)) end = multiunion(self._until_index.values(min=qend)) end_into = intersection(start, end) # range lies entirely within the query start = multiunion(self._since_index.values(min=qstart)) end = multiunion(self._until_index.values(max=qend)) start_before_end_after = intersection(start, end) result = union(start_into, end_into) result = union(result, start_before_end_after) return multiunion(map(self._index.__getitem__, result)), (self.id,) def numObjects(self): """Return the number of indexed objects""" return self._length() def indexSize(self): """Return the size of the index in terms of distinct values""" return self._unique_values_length() def _get_object_data(self, obj, attr): # self.id is the name of the index, which is also the name of the # attribute we're interested in. If the attribute is callable, # we call it. try: datum = getattr(obj, attr) if safe_callable(datum): datum = datum() except AttributeError: datum = _marker return datum
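The overlap test in _apply_index above decomposes into three unions: ranges containing the query start, ranges containing the query end, and ranges lying entirely inside the query. The same decomposition with plain sets in place of the BTree unions:

def overlapping(ranges, qstart, qend):
    # ranges: {range_id: (since, until)}; a range overlaps the query
    # iff it contains qstart, contains qend, or lies entirely inside
    # the query: the same three unions _apply_index builds
    contains_start = {rid for rid, (s, u) in ranges.items() if s <= qstart <= u}
    contains_end = {rid for rid, (s, u) in ranges.items() if s <= qend <= u}
    inside = {rid for rid, (s, u) in ranges.items() if qstart <= s and u <= qend}
    return contains_start | contains_end | inside

ranges = {1: (0, 5), 2: (4, 8), 3: (6, 7), 4: (10, 12)}
print(sorted(overlapping(ranges, 5, 9)))   # [1, 2, 3]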