class DateIndex(UnIndex):
    """ Index for Dates """

    __implements__ = (PluggableIndex.PluggableIndexInterface,)

    meta_type = 'DateIndex'
    query_options = ['query', 'range']

    manage = manage_main = DTMLFile( 'dtml/manageDateIndex', globals() )
    manage_main._setName( 'manage_main' )
    manage_options = ( { 'label' : 'Settings' , 'action' : 'manage_main' },
                     )

    def clear( self ):
        """ Complete reset """
        self._index = IOBTree()
        self._unindex = OIBTree()

    def index_object( self, documentId, obj, threshold=None ):
        """index an object, normalizing the indexed value to an integer

           o Normalized value has granularity of one minute.

           o Objects which have 'None' as indexed value are *omitted*,
             by design.
        """
        returnStatus = 0

        try:
            date_attr = getattr( obj, self.id )
            if callable( date_attr ):
                date_attr = date_attr()
            ConvertedDate = self._convert( value=date_attr, default=_marker )
        except AttributeError:
            ConvertedDate = _marker

        oldConvertedDate = self._unindex.get( documentId, _marker )

        if ConvertedDate != oldConvertedDate:
            if oldConvertedDate is not _marker:
                self.removeForwardIndexEntry(oldConvertedDate, documentId)

            if ConvertedDate is not _marker:
                self.insertForwardIndexEntry( ConvertedDate, documentId )
                self._unindex[documentId] = ConvertedDate

            returnStatus = 1

        return returnStatus

    def _apply_index( self, request, cid='', type=type, None=None ):
class DocumentMap(Persistent):
    """ A two-way map between addresses (e.g. location paths) and
    document ids.

    The map is a persistent object meant to live in a ZODB storage.

    Additionally, the map is capable of mapping 'metadata' to docids.
    """
    _v_nextid = None
    family = BTrees.family32
    _randrange = random.randrange
    docid_to_metadata = None  # latch for b/c

    def __init__(self):
        self.docid_to_address = IOBTree()
        self.address_to_docid = OIBTree()
        self.docid_to_metadata = IOBTree()

    def docid_for_address(self, address):
        """ Retrieve a document id for a given address.

        ``address`` is a string or other hashable object which represents
        a token known by the application.

        Return the integer document id corresponding to ``address``.

        If ``address`` doesn't exist in the document map, return None.
        """
        return self.address_to_docid.get(address)

    def address_for_docid(self, docid):
        """ Retrieve an address for a given document id.

        ``docid`` is an integer document id.

        Return the address corresponding to ``docid``.

        If ``docid`` doesn't exist in the document map, return None.
        """
        return self.docid_to_address.get(docid)

    def add(self, address, docid=_marker):
        """ Add a new document to the document map.

        ``address`` is a string or other hashable object which represents
        a token known by the application.

        ``docid``, if passed, must be an int.  In this case, remove any
        previous address stored for it before mapping it to the new
        address.  Passing an explicit ``docid`` also removes any metadata
        associated with that docid.

        If ``docid`` is not passed, generate a new docid.

        Return the integer document id mapped to ``address``.
        """
        if docid is _marker:
            docid = self.new_docid()

        self.remove_docid(docid)
        self.remove_address(address)

        self.docid_to_address[docid] = address
        self.address_to_docid[address] = docid

        return docid

    def remove_docid(self, docid):
        """ Remove a document from the document map for the given
        document ID.

        ``docid`` is an integer document id.

        Remove any corresponding metadata for ``docid`` as well.

        Return True if ``docid`` existed in the map, else return False.
        """
        # It should be an invariant that if one entry exists in
        # docid_to_address for a docid/address pair, exactly one
        # corresponding entry exists in address_to_docid for the same
        # docid/address pair.  However, versions of this code before
        # r.catalog 0.7.3 had a bug which, if this method was called
        # multiple times, each time with the same address but a
        # different docid, the ``docid_to_address`` mapping could
        # contain multiple entries for the same address each with a
        # different docid, causing this invariant to be violated.  The
        # symptom: in systems that used r.catalog 0.7.2 and lower,
        # there might be more entries in docid_to_address than there
        # are in address_to_docid.  The conditional fuzziness in the
        # code directly below is a runtime kindness to systems in that
        # state.  Technically, the administrator of a system in such a
        # state should normalize the two data structures by running a
        # script after upgrading to 0.7.3.  If we made the admin do
        # this, some of the code fuzziness below could go away,
        # replaced with something simpler.  But there's no sense in
        # breaking systems at runtime through being a hardass about
        # consistency if an unsuspecting upgrader has not yet run the
        # data fixer script.  The "fix the data" mantra rings a
        # little hollow when you weren't the one who broke the data in
        # the first place ;-)
        self._check_metadata()

        address = self.docid_to_address.get(docid, _marker)
        if address is _marker:
            return False

        old_docid = self.address_to_docid.get(address, _marker)
        if (old_docid is not _marker) and (old_docid != docid):
            self.remove_docid(old_docid)

        if docid in self.docid_to_address:
            del self.docid_to_address[docid]
        if address in self.address_to_docid:
            del self.address_to_docid[address]
        if docid in self.docid_to_metadata:
            del self.docid_to_metadata[docid]
        return True

    def remove_address(self, address):
        """ Remove a document from the document map using an address.

        ``address`` is a string or other hashable object which represents
        a token known by the application.

        Remove any corresponding metadata for ``address`` as well.

        Return True if ``address`` existed in the map, else return False.
        """
        # See the comment in remove_docid for complexity rationalization
        self._check_metadata()

        docid = self.address_to_docid.get(address, _marker)
        if docid is _marker:
            return False

        old_address = self.docid_to_address.get(docid, _marker)
        if (old_address is not _marker) and (old_address != address):
            self.remove_address(old_address)

        if docid in self.docid_to_address:
            del self.docid_to_address[docid]
        if address in self.address_to_docid:
            del self.address_to_docid[address]
        if docid in self.docid_to_metadata:
            del self.docid_to_metadata[docid]
        return True

    def _check_metadata(self):
        # backwards compatibility
        if self.docid_to_metadata is None:
            self.docid_to_metadata = IOBTree()

    def add_metadata(self, docid, data):
        """ Add metadata related to a given document id.

        ``data`` must be a mapping, such as a dictionary.

        For each key/value pair in ``data`` insert a metadata key/value
        pair into the metadata stored for ``docid``.

        Overwrite any existing values for the keys in ``data``, leaving
        values unchanged for other existing keys.

        Raise a KeyError if ``docid`` doesn't relate to an address in the
        document map.
        """
        if docid not in self.docid_to_address:
            raise KeyError(docid)
        if len(list(data.keys())) == 0:
            return
        self._check_metadata()
        meta = self.docid_to_metadata.setdefault(docid, OOBTree())
        for k in data:
            meta[k] = data[k]

    def remove_metadata(self, docid, *keys):
        """ Remove metadata related to a given document id.

        If ``docid`` doesn't exist in the metadata map, raise a KeyError.

        For each key in ``keys``, remove the metadata value for the
        docid related to that key.

        Do not raise any error if no value exists for a given key.

        If no keys are specified, remove all metadata related to the docid.
        """
        self._check_metadata()
        if keys:
            meta = self.docid_to_metadata.get(docid, _marker)
            if meta is _marker:
                raise KeyError(docid)
            for k in keys:
                if k in meta:
                    del meta[k]
            if not meta:
                del self.docid_to_metadata[docid]
        else:
            if docid not in self.docid_to_metadata:
                raise KeyError(docid)
            del self.docid_to_metadata[docid]

    def get_metadata(self, docid):
        """ Return the metadata for ``docid``.

        Return a mapping of the keys and values set using ``add_metadata``.

        Raise a KeyError if metadata does not exist for ``docid``.
        """
        if self.docid_to_metadata is None:
            raise KeyError(docid)
        meta = self.docid_to_metadata[docid]
        return meta

    def new_docid(self):
        """ Return a new document id.

        The returned value is guaranteed not to be used already in this
        document map.
        """
        while True:
            if self._v_nextid is None:
                self._v_nextid = self._randrange(self.family.minint,
                                                 self.family.maxint)
            uid = self._v_nextid
            self._v_nextid += 1
            if uid not in self.docid_to_address:
                return uid
            self._v_nextid = None
class DateIndex(UnIndex, PropertyManager):
    """Index for dates.
    """
    implements(IDateIndex)

    meta_type = 'DateIndex'
    query_options = ('query', 'range')

    index_naive_time_as_local = True  # False means index as UTC
    _properties = ({'id': 'index_naive_time_as_local',
                    'type': 'boolean',
                    'mode': 'w'},
                  )

    manage = manage_main = DTMLFile('dtml/manageDateIndex', globals())
    manage_browse = DTMLFile('../dtml/browseIndex', globals())

    manage_main._setName('manage_main')
    manage_options = ({'label': 'Settings',
                       'action': 'manage_main'},
                      {'label': 'Browse',
                       'action': 'manage_browse'},
                     ) + PropertyManager.manage_options

    def clear(self):
        """ Complete reset """
        self._index = IOBTree()
        self._unindex = OIBTree()
        self._length = Length()

    def index_object(self, documentId, obj, threshold=None):
        """index an object, normalizing the indexed value to an integer

           o Normalized value has granularity of one minute.

           o Objects which have 'None' as indexed value are *omitted*,
             by design.
        """
        returnStatus = 0

        try:
            date_attr = getattr(obj, self.id)
            if safe_callable(date_attr):
                date_attr = date_attr()

            ConvertedDate = self._convert(value=date_attr, default=_marker)
        except AttributeError:
            ConvertedDate = _marker

        oldConvertedDate = self._unindex.get(documentId, _marker)

        if ConvertedDate != oldConvertedDate:
            if oldConvertedDate is not _marker:
                self.removeForwardIndexEntry(oldConvertedDate, documentId)
                if ConvertedDate is _marker:
                    try:
                        del self._unindex[documentId]
                    except ConflictError:
                        raise
                    except:
                        LOG.error("Should not happen: ConvertedDate was there,"
                                  " now it's not, for document with id %s" %
                                  documentId)

            if ConvertedDate is not _marker:
                self.insertForwardIndexEntry(ConvertedDate, documentId)
                self._unindex[documentId] = ConvertedDate

            returnStatus = 1

        return returnStatus

    def _apply_index(self, request, resultset=None):
        """Apply the index to query parameters given in the argument

        Normalize the 'query' arguments into integer values at minute
        precision before querying.
        """
        record = parseIndexRequest(request, self.id, self.query_options)
        if record.keys is None:
            return None

        keys = map(self._convert, record.keys)

        index = self._index
        r = None
        opr = None

        # experimental code for specifying the operator
        operator = record.get('operator', self.useOperator)
        if operator not in self.operators:
            raise RuntimeError("operator not valid: %s" % operator)

        # depending on the operator we use intersection or union
        if operator == "or":
            set_func = union
        else:
            set_func = intersection

        # range parameter
        range_arg = record.get('range', None)
        if range_arg:
            opr = "range"
            opr_args = []
            if range_arg.find("min") > -1:
                opr_args.append("min")
            if range_arg.find("max") > -1:
                opr_args.append("max")

        if record.get('usage', None):
            # see if any usage params are sent to field
            opr = record.usage.lower().split(':')
            opr, opr_args = opr[0], opr[1:]

        if opr == "range":  # range search
            if 'min' in opr_args:
                lo = min(keys)
            else:
                lo = None

            if 'max' in opr_args:
                hi = max(keys)
            else:
                hi = None

            if hi:
                setlist = index.values(lo, hi)
            else:
                setlist = index.values(lo)

            r = multiunion(setlist)

        else:  # not a range search
            for key in keys:
                set = index.get(key, None)
                if set is not None:
                    if isinstance(set, int):
                        set = IISet((set, ))
                    else:
                        # set can't be bigger than resultset
                        set = intersection(set, resultset)
                    r = set_func(r, set)

        if isinstance(r, int):
            r = IISet((r, ))

        if r is None:
            return IISet(), (self.id, )
        else:
            return r, (self.id, )

    def _convert(self, value, default=None):
        """Convert Date/Time value to our internal representation"""
        # XXX: Code patched 20/May/2003 by Kiran Jonnalagadda to
        # convert dates to UTC first.
        if isinstance(value, DateTime):
            t_tup = value.toZone('UTC').parts()
        elif isinstance(value, (float, int)):
            t_tup = time.gmtime(value)
        elif isinstance(value, str) and value:
            t_obj = DateTime(value).toZone('UTC')
            t_tup = t_obj.parts()
        elif isinstance(value, datetime):
            if self.index_naive_time_as_local and value.tzinfo is None:
                value = value.replace(tzinfo=Local)
            # else if tzinfo is None, naive time interpreted as UTC
            t_tup = value.utctimetuple()
        elif isinstance(value, date):
            t_tup = value.timetuple()
        else:
            return default

        yr = t_tup[0]
        mo = t_tup[1]
        dy = t_tup[2]
        hr = t_tup[3]
        mn = t_tup[4]

        t_val = ((((yr * 12 + mo) * 31 + dy) * 24 + hr) * 60 + mn)

        if t_val > MAX32:
            # t_val must be integer fitting in the 32bit range
            raise OverflowError(
                "%s is not within the range of indexable dates (index: %s)"
                % (value, self.id))

        return t_val
class GlobbingLexicon(Lexicon):
    """Lexicon which supports basic globbing function ('*' and '?').

    This lexicon keeps several data structures around that are useful
    for searching. They are:

      '_lexicon' -- Contains the mapping from word => word_id

      '_inverseLex' -- Contains the mapping from word_id => word

      '_digrams' -- Contains a mapping from digram => word_id

    Before going further, it is necessary to understand what a digram is,
    as it is a core component of the structure of this lexicon.  A digram
    is a two-letter sequence in a word.  For example, the word 'zope'
    would be converted into the digrams::

      ['$z', 'zo', 'op', 'pe', 'e$']

    where the '$' is a word marker.  It is used at the beginning and end
    of the words.  Those digrams are significant.
    """

    multi_wc = '*'
    single_wc = '?'
    eow = '$'

    def __init__(self, useSplitter=None, extra=None):
        self.clear()
        self.useSplitter = useSplitter
        self.splitterParams = extra
        self.SplitterFunc = Splitter.getSplitter(self.useSplitter)

    def clear(self):
        self._lexicon = OIBTree()
        self._inverseLex = IOBTree()
        self._digrams = OOBTree()

    def _convertBTrees(self, threshold=200):
        Lexicon._convertBTrees(self, threshold)
        if type(self._digrams) is OOBTree:
            return

        from BTrees.convert import convert

        _digrams = self._digrams
        self._digrams = OOBTree()
        self._digrams._p_jar = self._p_jar
        convert(_digrams, self._digrams, threshold, IITreeSet)

    def createDigrams(self, word):
        """Returns a list with the set of digrams in the word."""
        word = '$' + word + '$'
        return [word[i:i + 2] for i in range(len(word) - 1)]

    def getWordId(self, word):
        """Provided 'word', return the matching integer word id."""
        if self._lexicon.has_key(word):
            return self._lexicon[word]
        else:
            return self.assignWordId(word)

    set = getWordId  # Kludge for old code

    def getWord(self, wid):
        return self._inverseLex.get(wid, None)

    def assignWordId(self, word):
        """Assigns a new word id to the provided word, and return it."""
        # Double check it's not in the lexicon already, and if it is, just
        # return it.
        if self._lexicon.has_key(word):
            return self._lexicon[word]

        # Get word id. BBB Backward compat pain.
        inverse = self._inverseLex
        try:
            insert = inverse.insert
        except AttributeError:
            # we have an "old" BTree object
            if inverse:
                wid = inverse.keys()[-1] + 1
            else:
                self._inverseLex = IOBTree()
                wid = 1
            inverse[wid] = word
        else:
            # we have a "new" IOBTree object
            wid = randid()
            while not inverse.insert(wid, word):
                wid = randid()

        self._lexicon[word] = wid

        # Now take all the digrams and insert them into the digram map.
        for digram in self.createDigrams(word):
            set = self._digrams.get(digram, None)
            if set is None:
                self._digrams[digram] = set = IISet()
            set.insert(wid)

        return wid

    def get(self, pattern):
        """ Query the lexicon for words matching a pattern."""
        # single word pattern produce a slicing problem below.
        # Because the splitter throws away single characters we can
        # return an empty tuple here.
        if len(pattern) == 1:
            return ()

        wc_set = [self.multi_wc, self.single_wc]

        digrams = []
        globbing = 0
        for i in range(len(pattern)):
            if pattern[i] in wc_set:
                globbing = 1
                continue

            if i == 0:
                digrams.insert(i, (self.eow + pattern[i]))
                digrams.append((pattern[i] + pattern[i + 1]))
            else:
                try:
                    if pattern[i + 1] not in wc_set:
                        digrams.append(pattern[i] + pattern[i + 1])
                except IndexError:
                    digrams.append((pattern[i] + self.eow))

        if not globbing:
            result = self._lexicon.get(pattern, None)
            if result is None:
                return ()
            return (result, )

        ## now get all of the intsets that contain the result digrams
        result = None
        for digram in digrams:
            result = union(result, self._digrams.get(digram, None))

        if not result:
            return ()
        else:
            ## now we have narrowed the list of possible candidates
            ## down to those words which contain digrams.  However,
            ## some words may have been returned that match digrams,
            ## but do not match 'pattern'.  This is because some words
            ## may contain all matching digrams, but in the wrong
            ## order.
            expr = re.compile(self.createRegex(pattern))
            words = []
            hits = IISet()
            for x in result:
                if expr.match(self._inverseLex[x]):
                    hits.insert(x)
            return hits

    def __getitem__(self, word):
        """ """
        return self.get(word)

    def query_hook(self, q):
        """expand wildcards"""
        ListType = type([])
        i = len(q) - 1
        while i >= 0:
            e = q[i]
            if isinstance(e, ListType):
                self.query_hook(e)
            elif isinstance(e, Op):
                pass
            elif ((self.multi_wc in e) or (self.single_wc in e)):
                wids = self.get(e)
                words = []
                for wid in wids:
                    if words:
                        words.append(Or)
                    words.append(wid)
                if not words:
                    # if words is empty, return something that will make
                    # textindex's __getitem__ return an empty result list
                    words.append('')
                q[i] = words
            i = i - 1

        return q

    def Splitter(self, astring, words=None, encoding="latin1"):
        """ wrap the splitter """
        ## don't do anything, less efficient but there's not much
        ## sense in stemming a globbing lexicon.
        try:
            return self.SplitterFunc(
                astring,
                words,
                encoding=encoding,
                singlechar=self.splitterParams.splitterSingleChars,
                indexnumbers=self.splitterParams.splitterIndexNumbers,
                casefolding=self.splitterParams.splitterCasefolding
                )
        except:
            return self.SplitterFunc(astring, words)

    def createRegex(self, pat):
        """Translate a PATTERN to a regular expression.

        There is no way to quote meta-characters.
        """
        # Remove characters that are meaningful in a regex
        if not isinstance(pat, UnicodeType):
            transTable = string.maketrans("", "")
            result = string.translate(pat, transTable, r'()&|!@#$%^{}\<>.')
        else:
            transTable = {}
            for ch in r'()&|!@#$%^{}\<>.':
                transTable[ord(ch)] = None
            result = pat.translate(transTable)

        # First, deal with multi-character globbing
        result = result.replace('*', '.*')

        # Next, we need to deal with single-character globbing
        result = result.replace('?', '.')

        return "%s$" % result
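
# Illustrative sketch (not part of the original module; assumes the Zope 2
# environment this class was written for, where Splitter.getSplitter(None)
# returns a default splitter so the constructor succeeds).  It shows the two
# helpers that make globbing work: digram extraction for candidate lookup
# and the pattern-to-regex translation used for the final filtering pass.
def _globbing_lexicon_example():
    lex = GlobbingLexicon()
    # 'zope' yields the digrams listed in the class docstring.
    digrams = lex.createDigrams('zope')    # ['$z', 'zo', 'op', 'pe', 'e$']
    # '*' becomes '.*', '?' becomes '.', and a trailing '$' anchors the match.
    pattern = lex.createRegex('zop*')      # 'zop.*$'
    return digrams, pattern
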
class UUIDIndex(UnIndex):
    """Index for uuid fields with a unique value per key.

    The internal structure is:

    self._index = {datum: documentId}
    self._unindex = {documentId: datum}

    For each datum only one documentId can exist.
    """

    meta_type = "UUIDIndex"

    manage_options = (
        {'label': 'Settings', 'action': 'manage_main'},
        {'label': 'Browse', 'action': 'manage_browse'},
    )

    query_options = ["query", "range"]

    manage = manage_main = DTMLFile('dtml/manageUUIDIndex', globals())
    manage_main._setName('manage_main')
    manage_browse = DTMLFile('../dtml/browseIndex', globals())

    def clear(self):
        self._length = Length()
        self._index = OIBTree()
        self._unindex = IOBTree()
        self._counter = Length()

    def numObjects(self):
        """Return the number of indexed objects. Since we have a 1:1
        mapping from documents to values, we can reuse the stored length.
        """
        return self.indexSize()

    def uniqueValues(self, name=None, withLengths=0):
        """returns the unique values for name

        if withLengths is true, returns a sequence of
        tuples of (value, length)
        """
        if name is None:
            name = self.id
        elif name != self.id:
            raise StopIteration

        if not withLengths:
            for key in self._index.keys():
                yield key
        else:
            # We know the length for each value is one
            for key in self._index.keys():
                yield (key, 1)

    def insertForwardIndexEntry(self, entry, documentId):
        """Take the entry provided and put it in the correct place
        in the forward index.
        """
        if entry is None:
            return

        old_docid = self._index.get(entry, _marker)
        if old_docid is _marker:
            self._index[entry] = documentId
            self._length.change(1)
        elif old_docid != documentId:
            logger.error("A different document with value '%s' already "
                         "exists in the index." % entry)

    def removeForwardIndexEntry(self, entry, documentId):
        """Take the entry provided and remove any reference to documentId
        in its entry in the index.
        """
        old_docid = self._index.get(entry, _marker)
        if old_docid is not _marker:
            del self._index[entry]
            self._length.change(-1)

    def _get_object_datum(self, obj, attr):
        # for a uuid it never makes sense to acquire a parent value via
        # Acquisition
        has_attr = getattr(aq_base(obj), attr, _marker)
        if has_attr is _marker:
            return _marker
        return super(UUIDIndex, self)._get_object_datum(obj, attr)
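
# Behaviour sketch (illustrative only, not part of the original module;
# assumes the inherited UnIndex constructor and indexSize() helper work as
# in stock Products.PluginIndexes): UUIDIndex keeps a strict 1:1 mapping
# from value to document, so a second document carrying an already indexed
# value is rejected with a logged error instead of overwriting the entry.
def _uuidindex_collision_sketch():
    index = UUIDIndex('uuid')
    index.insertForwardIndexEntry('uuid-1', 1)   # stored: 'uuid-1' -> 1
    index.insertForwardIndexEntry('uuid-1', 2)   # error logged, entry kept
    assert index._index['uuid-1'] == 1
    assert index.numObjects() == 1
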
class Catalog(Persistent, Acquisition.Implicit, ExtensionClass.Base): """ An Object Catalog An Object Catalog maintains a table of object metadata, and a series of manageable indexes to quickly search for objects (references in the metadata) that satisfy a search query. This class is not Zope specific, and can be used in any python program to build catalogs of objects. Note that it does require the objects to be Persistent, and thus must be used with ZODB3. """ _v_brains = NoBrainer def __init__(self, vocabulary=None, brains=None): # Catalogs no longer care about vocabularies and lexicons # so the vocabulary argument is ignored. (Casey) self.schema = {} # mapping from attribute name to column number self.names = () # sequence of column names self.indexes = {} # mapping from index name to index object # The catalog maintains a BTree of object meta_data for # convenient display on result pages. meta_data attributes # are turned into brain objects and returned by # searchResults. The indexing machinery indexes all records # by an integer id (rid). self.data is a mapping from the # integer id to the meta_data, self.uids is a mapping of the # object unique identifier to the rid, and self.paths is a # mapping of the rid to the unique identifier. self.clear() if brains is not None: self._v_brains = brains self.updateBrains() def __len__(self): return self._length() def clear(self): """ clear catalog """ self.data = IOBTree() # mapping of rid to meta_data self.uids = OIBTree() # mapping of uid to rid self.paths = IOBTree() # mapping of rid to uid self._length = BTrees.Length.Length() for index in self.indexes: self.getIndex(index).clear() def updateBrains(self): self.useBrains(self._v_brains) def __getitem__(self, index): """ Returns instances of self._v_brains, or whatever is passed into self.useBrains. """ if isinstance(index, tuple): # then it contains a score... normalized_score, score, key = index else: # otherwise no score, set all scores to 1 normalized_score, score, key = (1, 1, index) return self.instantiate((key, self.data[key]), score_data=(score, normalized_score)) def __setstate__(self, state): """ initialize your brains. This method is called when the catalog is first activated (from the persistent storage) """ Persistent.__setstate__(self, state) self.updateBrains() def useBrains(self, brains): """ Sets up the Catalog to return an object (ala ZTables) that is created on the fly from the tuple stored in the self.data Btree. """ class mybrains(AbstractCatalogBrain, brains): # NOQA pass scopy = self.schema.copy() schema_len = len(self.schema.keys()) scopy['data_record_id_'] = schema_len scopy['data_record_score_'] = schema_len + 1 scopy['data_record_normalized_score_'] = schema_len + 2 mybrains.__record_schema__ = scopy self._v_brains = brains self._v_result_class = mybrains def addColumn(self, name, default_value=None, threshold=10000): """Adds a row to the meta data schema""" schema = self.schema names = list(self.names) threshold = threshold if threshold is not None else 10000 if name != name.strip(): # Someone could have mistakenly added a space at the end # of the input field. 
LOG.warning('stripped space from new column %r -> %r', name, name.strip()) name = name.strip() if name in schema: raise CatalogError('The column %s already exists' % name) if name[0] == '_': raise CatalogError('Cannot cache fields beginning with "_"') values = schema.values() if values: schema[name] = max(values) + 1 else: schema[name] = 0 names.append(name) if default_value in (None, ''): default_value = MV if len(self): pghandler = ZLogHandler(threshold) pghandler.init('Adding %s column' % name, len(self)) for i, (key, value) in enumerate(self.data.iteritems()): pghandler.report(i) self.data[key] = value + (default_value, ) pghandler.finish() self.names = tuple(names) self.schema = schema # new column? update the brain self.updateBrains() def delColumn(self, name, threshold=10000): """Deletes a row from the meta data schema""" names = list(self.names) _index = names.index(name) threshold = threshold if threshold is not None else 10000 if name not in self.schema: LOG.error( 'delColumn attempted to delete nonexistent ' 'column %s.', str(name)) return del names[_index] # rebuild the schema schema = {} for i, name in enumerate(names): schema[name] = i self.schema = schema self.names = tuple(names) # update the brain self.updateBrains() # remove the column value from each record if len(self): _next_index = _index + 1 pghandler = ZLogHandler(threshold) pghandler.init('Deleting %s column' % name, len(self)) for i, (key, value) in enumerate(self.data.iteritems()): pghandler.report(i) self.data[key] = value[:_index] + value[_next_index:] pghandler.finish() def addIndex(self, name, index_type): """Create a new index, given a name and a index_type. Old format: index_type was a string, 'FieldIndex' 'TextIndex' or 'KeywordIndex' is no longer valid; the actual index must be instantiated and passed in to addIndex. New format: index_type is the actual index object to be stored. """ if name in self.indexes: raise CatalogError('The index %s already exists' % name) if name.startswith('_'): raise CatalogError('Cannot index fields beginning with "_"') if not name: raise CatalogError('Name of index is empty') if name != name.strip(): # Someone could have mistakenly added a space at the end # of the input field. LOG.warning('stripped space from new index %r -> %r', name, name.strip()) name = name.strip() indexes = self.indexes if isinstance(index_type, str): raise TypeError("Catalog addIndex now requires the index type to" "be resolved prior to adding; create the proper " "index in the caller.") indexes[name] = index_type self.indexes = indexes def delIndex(self, name): """ deletes an index """ if name not in self.indexes: raise CatalogError('The index %s does not exist' % name) indexes = self.indexes del indexes[name] self.indexes = indexes def getIndex(self, name): """ get an index wrapped in the catalog """ return self.indexes[name].__of__(self) def updateMetadata(self, object, uid, index): """ Given an object and a uid, update the column data for the uid with the object data iff the object has changed """ data = self.data newDataRecord = self.recordify(object) if index is None: index = getattr(self, '_v_nextid', 0) if index % 4000 == 0: index = randint(-2000000000, 2000000000) while not data.insert(index, newDataRecord): index = randint(-2000000000, 2000000000) # We want ids to be somewhat random, but there are # advantages for having some ids generated # sequentially when many catalog updates are done at # once, such as when reindexing or bulk indexing. 
# We allocate ids sequentially using a volatile base, # so different threads get different bases. This # further reduces conflict and reduces churn in # here and it result sets when bulk indexing. self._v_nextid = index + 1 else: if data.get(index, 0) != newDataRecord: data[index] = newDataRecord return index # the cataloging API def catalogObject(self, object, uid, threshold=None, idxs=None, update_metadata=True): """ Adds an object to the Catalog by iteratively applying it to all indexes. 'object' is the object to be cataloged 'uid' is the unique Catalog identifier for this object If 'idxs' is specified (as a sequence), apply the object only to the named indexes. If 'update_metadata' is true (the default), also update metadata for the object. If the object is new to the catalog, this flag has no effect (metadata is always created for new objects). """ if idxs is None: idxs = [] index = self.uids.get(uid, None) if index is None: # we are inserting new data index = self.updateMetadata(object, uid, None) self._length.change(1) self.uids[uid] = index self.paths[index] = uid elif update_metadata: # we are updating and we need to update metadata self.updateMetadata(object, uid, index) # do indexing total = 0 if idxs == []: use_indexes = self.indexes.keys() else: use_indexes = set(idxs) for iid in self.indexes: x = self.getIndex(iid) if ITransposeQuery.providedBy(x): # supported index names for query optimization names = x.getIndexNames() intersec = use_indexes.intersection(names) # add current index for indexing if supported index # names are member of idxs if intersec: use_indexes.update([iid]) use_indexes = list(use_indexes) for name in use_indexes: x = self.getIndex(name) if hasattr(x, 'index_object'): blah = x.index_object(index, object, threshold) total = total + blah else: LOG.error('catalogObject was passed bad index ' 'object %s.', str(x)) return total def uncatalogObject(self, uid): """ Uncatalog and object from the Catalog. and 'uid' is a unique Catalog identifier Note, the uid must be the same as when the object was catalogued, otherwise it will not get removed from the catalog This method should not raise an exception if the uid cannot be found in the catalog. """ data = self.data uids = self.uids paths = self.paths indexes = self.indexes.keys() rid = uids.get(uid, None) if rid is not None: for name in indexes: x = self.getIndex(name) if hasattr(x, 'unindex_object'): x.unindex_object(rid) del data[rid] del paths[rid] del uids[uid] self._length.change(-1) else: LOG.error( 'uncatalogObject unsuccessfully ' 'attempted to uncatalog an object ' 'with a uid of %s. ', str(uid)) def uniqueValuesFor(self, name): """ return unique values for FieldIndex name """ return tuple(self.getIndex(name).uniqueValues()) def hasuid(self, uid): """ return the rid if catalog contains an object with uid """ return self.uids.get(uid) def recordify(self, object): """ turns an object into a record tuple """ record = [] # the unique id is always the first element for x in self.names: attr = getattr(object, x, MV) if (attr is not MV and safe_callable(attr)): attr = attr() record.append(attr) return tuple(record) def _maintain_zodb_cache(self): parent = aq_parent(self) if hasattr(aq_base(parent), 'maintain_zodb_cache'): parent.maintain_zodb_cache() def instantiate(self, record, score_data=None): """ internal method: create and initialise search result object. 
record should be a tuple of (document RID, metadata columns tuple), score_data can be a tuple of (scode, normalized score) or be omitted""" self._maintain_zodb_cache() key, data = record klass = self._v_result_class if score_data: score, normalized_score = score_data schema_len = len(klass.__record_schema__) if schema_len == len(data) + 3: # if we have complete data, create in a single pass data = tuple(data) + (key, score, normalized_score) return klass(data).__of__(aq_parent(self)) r = klass(data) r.data_record_id_ = key if score_data: # preserved during refactoring for compatibility reasons: # can only be reached if score_data is present, # but schema length is not equal to len(data) + 3 # no known use cases r.data_record_score_ = score r.data_record_normalized_score_ = normalized_score return r.__of__(aq_parent(self)) return r.__of__(self) def getMetadataForRID(self, rid): record = self.data[rid] result = {} for (key, pos) in self.schema.items(): result[key] = record[pos] return result def getIndexDataForRID(self, rid): result = {} for name in self.indexes: result[name] = self.getIndex(name).getEntryForObject(rid, "") return result def merge_query_args(self, query=None, **kw): if not kw and isinstance(query, dict): # Short cut for the best practice. return query merged_query = {} if isinstance(query, dict): merged_query.update(query) merged_query.update(kw) return merged_query def make_query(self, query): for iid in self.indexes: index = self.getIndex(iid) if ITransposeQuery.providedBy(index): query = index.make_query(query) # Canonicalize tuple/list query arguments. new_query = {} for key, value in query.items(): if isinstance(value, (list, tuple)): new_query[key] = list(sorted(value)) else: new_query[key] = value return new_query def _get_index_query_names(self, index): if hasattr(index, 'getIndexQueryNames'): return index.getIndexQueryNames() return (index.getId(), ) def _sort_limit_arguments(self, query, sort_index, reverse, limit): b_start = int(query.get('b_start', 0)) b_size = query.get('b_size', None) if b_size is not None: b_size = int(b_size) if b_size is not None: limit = b_start + b_size elif limit and b_size is None: b_size = limit if sort_index is None: sort_report_name = None else: if isinstance(sort_index, list): sort_name = '-'.join(i.getId() for i in sort_index) else: sort_name = sort_index.getId() if isinstance(reverse, list): reverse_name = '-'.join('desc' if r else 'asc' for r in reverse) else: reverse_name = 'desc' if reverse else 'asc' sort_report_name = 'sort_on#' + sort_name + '#' + reverse_name if limit is not None: sort_report_name += '#limit-%s' % limit return (b_start, b_size, limit, sort_report_name) def _sorted_search_indexes(self, query): # Simple implementation ordering only by limited result support query_keys = query.keys() order = [] for name, index in self.indexes.items(): for attr in self._get_index_query_names(index): if attr in query_keys: order.append((ILimitedResultIndex.providedBy(index), name)) order.sort() return [i[1] for i in order] def _limit_sequence(self, sequence, slen, b_start=0, b_size=None, switched_reverse=False): if b_size is not None: sequence = sequence[b_start:b_start + b_size] if slen: slen = len(sequence) if switched_reverse: sequence.reverse() return (sequence, slen) def _search_index(self, cr, index_id, query, rs): cr.start_split(index_id) index_rs = None index = self.getIndex(index_id) limit_result = ILimitedResultIndex.providedBy(index) if IQueryIndex.providedBy(index): index_query = IndexQuery(query, index.id, 
index.query_options, index.operators, index.useOperator) if index_query.keys is not None: index_rs = index.query_index(index_query, rs) else: if limit_result: index_result = index._apply_index(query, rs) else: index_result = index._apply_index(query) # Parse (resultset, used_attributes) index return value. if index_result: index_rs, _ = index_result if not index_rs: # Short circuit if empty index result. rs = None else: # Provide detailed info about the pure intersection time. intersect_id = index_id + '#intersection' cr.start_split(intersect_id) # weightedIntersection preserves the values from any mappings # we get, as some indexes don't return simple sets. if hasattr(rs, 'items') or hasattr(index_rs, 'items'): _, rs = weightedIntersection(rs, index_rs) else: rs = intersection(rs, index_rs) cr.stop_split(intersect_id) # Consider the time it takes to intersect the index result # with the total result set to be part of the index time. cr.stop_split(index_id, result=index_rs, limit=limit_result) return rs def search(self, query, sort_index=None, reverse=False, limit=None, merge=True): """Iterate through the indexes, applying the query to each one. If merge is true then return a lazy result set (sorted if appropriate) otherwise return the raw (possibly scored) results for later merging. Limit is used in conjunction with sorting or scored results to inform the catalog how many results you are really interested in. The catalog can then use optimizations to save time and memory. The number of results is not guaranteed to fall within the limit however, you should still slice or batch the results as usual.""" # Indexes fulfill a fairly large contract here. We hand each # index the query mapping we are given (which may be composed # of some combination of web request, kw mappings or plain old dicts) # and the index decides what to do with it. If the index finds work # for itself in the query, it returns the results and a tuple of # the attributes that were used. If the index finds nothing for it # to do then it returns None. # Canonicalize the request into a sensible query before passing it on query = self.make_query(query) cr = self.getCatalogPlan(query) cr.start() plan = cr.plan() if not plan: plan = self._sorted_search_indexes(query) rs = None # result set for index_id in plan: # The actual core loop over all indices. if index_id not in self.indexes: # We can have bogus keys or the plan can contain index names # that have been removed in the meantime. continue rs = self._search_index(cr, index_id, query, rs) if not rs: break if not rs: # None of the indexes found anything to do with the query. result = LazyCat([]) cr.stop() return result # Try to deduce the sort limit from batching arguments. b_start, b_size, limit, sort_report_name = self._sort_limit_arguments( query, sort_index, reverse, limit) # We got some results from the indexes, sort and convert to sequences. rlen = len(rs) if sort_index is None and hasattr(rs, 'items'): # Having a 'items' means we have a data structure with # scores. Build a new result set, sort it by score, reverse # it, compute the normalized score, and Lazify it. if not merge: # Don't bother to sort here, return a list of # three tuples to be passed later to mergeResults. # Note that data_record_normalized_score_ cannot be # calculated and will always be 1 in this case. result = [(score, (1, score, rid), self.__getitem__) for rid, score in rs.items()] else: cr.start_split('sort_on#score') # Sort it by score. 
rs = rs.byValue(0) max = float(rs[0][0]) # Here we define our getter function inline so that # we can conveniently store the max value as a default arg # and make the normalized score computation lazy def getScoredResult(item, max=max, self=self): """ Returns instances of self._v_brains, or whatever is passed into self.useBrains. """ score, key = item norm_score = int(100.0 * score / max) return self.instantiate((key, self.data[key]), score_data=(score, norm_score)) sequence, slen = self._limit_sequence(rs, rlen, b_start, b_size) result = LazyMap(getScoredResult, sequence, slen, actual_result_count=rlen) cr.stop_split('sort_on#score', None) elif sort_index is None and not hasattr(rs, 'values'): # no scores if hasattr(rs, 'keys'): rs = rs.keys() sequence, slen = self._limit_sequence(rs, rlen, b_start, b_size) result = LazyMap(self.__getitem__, sequence, slen, actual_result_count=rlen) else: # Sort. If there are scores, then this block is not # reached, therefore 'sort-on' does not happen in the # context of a text index query. This should probably # sort by relevance first, then the 'sort-on' attribute. cr.start_split(sort_report_name) result = self.sortResults(rs, sort_index, reverse, limit, merge, actual_result_count=rlen, b_start=b_start, b_size=b_size) cr.stop_split(sort_report_name, None) cr.stop() return result def _sort_iterate_index(self, actual_result_count, result, rs, limit, merge, reverse, sort_index, sort_index_length, sort_spec, second_indexes_key_map): # The result set is much larger than the sorted index, # so iterate over the sorted index for speed. # TODO: len(sort_index) isn't actually what we want for a keyword # index, as it's only the unique values, not the documents. # Don't use this case while using limit, as we return results of # non-flattened intsets, and would have to merge/unflattened those # before limiting. length = 0 try: intersection(rs, IISet(())) except TypeError: # rs is not an object in the IIBTree family. # Try to turn rs into an IISet. rs = IISet(rs) if sort_index_length == 1: for k, intset in sort_index.items(): # We have an index that has a set of values for # each sort key, so we intersect with each set and # get a sorted sequence of the intersections. intset = intersection(rs, intset) if intset: keys = getattr(intset, 'keys', None) if keys is not None: # Is this ever true? intset = keys() length += len(intset) result.append((k, intset, self.__getitem__)) result.sort(reverse=reverse) else: for k, intset in sort_index.items(): # We have an index that has a set of values for # each sort key, so we intersect with each set and # get a sorted sequence of the intersections. intset = intersection(rs, intset) if intset: keys = getattr(intset, 'keys', None) if keys is not None: # Is this ever true? intset = keys() length += len(intset) # sort on secondary index keysets = defaultdict(list) for i in intset: full_key = (k, ) for km in second_indexes_key_map: try: full_key += (km[i], ) except KeyError: pass keysets[full_key].append(i) for k2, v2 in keysets.items(): result.append((k2, v2, self.__getitem__)) result = multisort(result, sort_spec) return (actual_result_count, length, result) def _sort_iterate_resultset(self, actual_result_count, result, rs, limit, merge, reverse, sort_index, sort_index_length, sort_spec, second_indexes_key_map): # Iterate over the result set getting sort keys from the index. # If we are interested in at least 25% or more of the result set, # the N-Best algorithm is slower, so we iterate over all. 
index_key_map = sort_index.documentToKeyMap() if sort_index_length == 1: for did in rs: try: key = index_key_map[did] except KeyError: # This document is not in the sort key index, skip it. actual_result_count -= 1 else: # The reference back to __getitem__ is used in case # we do not merge now and need to intermingle the # results with those of other catalogs while avoiding # the cost of instantiating a LazyMap per result result.append((key, did, self.__getitem__)) if merge: result = sorted(result, key=lambda x: (0, ) if x[0] is None else x, reverse=reverse) else: for did in rs: try: full_key = (index_key_map[did], ) for km in second_indexes_key_map: full_key += (km[did], ) except KeyError: # This document is not in the sort key index, skip it. actual_result_count -= 1 else: result.append((full_key, did, self.__getitem__)) if merge: result = multisort(result, sort_spec) if merge and limit is not None: result = result[:limit] return (actual_result_count, 0, result) def _sort_nbest(self, actual_result_count, result, rs, limit, merge, reverse, sort_index, sort_index_length, sort_spec, second_indexes_key_map): # Limit / sort results using N-Best algorithm # This is faster for large sets then a full sort # And uses far less memory index_key_map = sort_index.documentToKeyMap() keys = [] n = 0 worst = None if sort_index_length == 1: for did in rs: try: key = index_key_map[did] except KeyError: # This document is not in the sort key index, skip it. actual_result_count -= 1 else: if n >= limit and key <= worst: continue i = bisect(keys, key) keys.insert(i, key) result.insert(i, (key, did, self.__getitem__)) if n == limit: del keys[0], result[0] else: n += 1 worst = keys[0] result.reverse() else: for did in rs: try: key = index_key_map[did] full_key = (key, ) for km in second_indexes_key_map: full_key += (km[did], ) except KeyError: # This document is not in the sort key index, skip it. actual_result_count -= 1 else: if n >= limit and key <= worst: continue i = bisect(keys, key) keys.insert(i, key) result.insert(i, (full_key, did, self.__getitem__)) if n == limit: del keys[0], result[0] else: n += 1 worst = keys[0] result = multisort(result, sort_spec) return (actual_result_count, 0, result) def _sort_nbest_reverse(self, actual_result_count, result, rs, limit, merge, reverse, sort_index, sort_index_length, sort_spec, second_indexes_key_map): # Limit / sort results using N-Best algorithm in reverse (N-Worst?) index_key_map = sort_index.documentToKeyMap() keys = [] n = 0 best = None if sort_index_length == 1: for did in rs: try: key = index_key_map[did] except KeyError: # This document is not in the sort key index, skip it. actual_result_count -= 1 else: if n >= limit and key >= best: continue i = bisect(keys, key) keys.insert(i, key) result.insert(i, (key, did, self.__getitem__)) if n == limit: del keys[-1], result[-1] else: n += 1 best = keys[-1] else: for did in rs: try: key = index_key_map[did] full_key = (key, ) for km in second_indexes_key_map: full_key += (km[did], ) except KeyError: # This document is not in the sort key index, skip it. 
actual_result_count -= 1 else: if n >= limit and key >= best: continue i = bisect(keys, key) keys.insert(i, key) result.insert(i, (full_key, did, self.__getitem__)) if n == limit: del keys[-1], result[-1] else: n += 1 best = keys[-1] result = multisort(result, sort_spec) return (actual_result_count, 0, result) def sortResults(self, rs, sort_index, reverse=False, limit=None, merge=True, actual_result_count=None, b_start=0, b_size=None): # Sort a result set using one or more sort indexes. Both sort_index # and reverse can be lists of indexes and reverse specifications. # Return a lazy result set in sorted order if merge is true otherwise # returns a list of (sortkey, uid, getter_function) tuples, where # sortkey can be a tuple on its own. second_indexes = None second_indexes_key_map = None sort_index_length = 1 if isinstance(sort_index, list): sort_index_length = len(sort_index) if sort_index_length > 1: second_indexes = sort_index[1:] second_indexes_key_map = [] for si in second_indexes: second_indexes_key_map.append(si.documentToKeyMap()) sort_index = sort_index[0] result = [] if hasattr(rs, 'keys'): rs = rs.keys() if actual_result_count is None: rlen = len(rs) actual_result_count = rlen else: rlen = actual_result_count # don't limit to more than what we have if limit is not None and limit >= rlen: limit = rlen # if we want a batch from the end of the result set, reverse sorting # order and limit it, then reverse the result set again switched_reverse = False if b_size and b_start and b_start > rlen / 2: if isinstance(reverse, list): reverse = [not r for r in reverse] else: reverse = not reverse switched_reverse = True b_end = b_start + b_size if b_end >= rlen: overrun = rlen - b_end if b_start >= rlen: # bail out, we are outside the possible range return LazyCat([], 0, actual_result_count) else: b_size += overrun b_start = 0 else: b_start = rlen - b_end limit = b_start + b_size # determine sort_spec if isinstance(reverse, list): sort_spec = [r and -1 or 1 for r in reverse] # limit to current maximum of sort indexes sort_spec = sort_spec[:sort_index_length] # use first sort order for choosing the algorithm first_reverse = reverse[0] else: sort_spec = [] for i in xrange(sort_index_length): sort_spec.append(reverse and -1 or 1) first_reverse = reverse # Special first condition, as it changes post-processing. iterate_sort_index = (merge and limit is None and (rlen > (len(sort_index) * (rlen / 100 + 1)))) # Choose one of the sort algorithms. if iterate_sort_index: sort_func = self._sort_iterate_index elif limit is None or (limit * 4 > rlen): sort_func = self._sort_iterate_resultset elif first_reverse: sort_func = self._sort_nbest else: sort_func = self._sort_nbest_reverse actual_result_count, length, result = sort_func( actual_result_count, result, rs, limit, merge, reverse, sort_index, sort_index_length, sort_spec, second_indexes_key_map) sequence, slen = self._limit_sequence(result, length, b_start, b_size, switched_reverse) if iterate_sort_index: result = LazyCat(LazyValues(sequence), slen, actual_result_count) else: if not merge: return sequence result = LazyValues(sequence) result.actual_result_count = actual_result_count return LazyMap(self.__getitem__, result, len(result), actual_result_count=actual_result_count) def _get_sort_attr(self, attr, kw): """Helper function to find sort-on or sort-order.""" # There are three different ways to find the attribute: # 1. kw[sort-attr] # 2. self.sort-attr # 3. 
kw[sort_attr] # kw may be a dict or an ExtensionClass MultiMapping, which # differ in what get() returns with no default value. name = "sort-%s" % attr val = kw.get(name, None) if val is not None: return val val = getattr(self, name, None) if val is not None: return val return kw.get("sort_%s" % attr, None) def _getSortIndex(self, args): """Returns a list of search index objects or None.""" sort_index_names = self._get_sort_attr("on", args) if sort_index_names is not None: # self.indexes is always a dict, so get() w/ 1 arg works sort_indexes = [] if not isinstance(sort_index_names, (list, tuple)): sort_index_names = [sort_index_names] for name in sort_index_names: sort_index = self.indexes.get(name) if sort_index is None: raise CatalogError('Unknown sort_on index: %s' % repr(name)) else: if not hasattr(sort_index, 'documentToKeyMap'): raise CatalogError( 'The index chosen for sort_on is ' 'not capable of being used as a sort index: ' '%s' % repr(name)) sort_indexes.append(sort_index) if len(sort_indexes) == 1: # be nice and keep the old API intact for single sort_on's return sort_indexes[0] return sort_indexes return None def searchResults(self, query=None, _merge=True, **kw): # You should pass in a simple dictionary as the first argument, # which only contains the relevant query. query = self.merge_query_args(query, **kw) sort_indexes = self._getSortIndex(query) sort_limit = self._get_sort_attr('limit', query) reverse = False if sort_indexes is not None: order = self._get_sort_attr("order", query) reverse = [] if order is None: order = [''] elif isinstance(order, str): order = [order] for o in order: reverse.append(o.lower() in ('reverse', 'descending')) if len(reverse) == 1: # be nice and keep the old API intact for single sort_order reverse = reverse[0] # Perform searches with indexes and sort_index return self.search(query, sort_indexes, reverse, sort_limit, _merge) __call__ = searchResults def getCatalogPlan(self, query=None): """Query time reporting and planning. """ parent = aq_base(aq_parent(self)) threshold = getattr(parent, 'long_query_time', 0.1) return CatalogPlan(self, query, threshold)
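
# Usage sketch (illustrative, not part of the original module): a plain
# dictionary is the preferred way to call searchResults() above.  'title' is
# an assumed FieldIndex and metadata column, and `catalog` an already
# populated Catalog instance.
def _catalog_query_sketch(catalog):
    brains = catalog.searchResults({'title': 'foo',
                                    'sort_on': 'title',
                                    'sort_order': 'descending',
                                    'sort_limit': 10,
                                    'b_start': 0,
                                    'b_size': 5})
    # each brain exposes the metadata columns plus the record id
    return [(brain.data_record_id_, brain.title) for brain in brains]
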
class Indexer(object): filestorage = database = connection = root = None def __init__(self, datafs, writable=0, trans=0, pack=0): self.trans_limit = trans self.pack_limit = pack self.trans_count = 0 self.pack_count = 0 self.stopdict = get_stopdict() self.mh = mhlib.MH() self.filestorage = FileStorage(datafs, read_only=(not writable)) self.database = DB(self.filestorage) self.connection = self.database.open() self.root = self.connection.root() try: self.index = self.root["index"] except KeyError: self.index = self.root["index"] = TextIndexWrapper() try: self.docpaths = self.root["docpaths"] except KeyError: self.docpaths = self.root["docpaths"] = IOBTree() try: self.doctimes = self.root["doctimes"] except KeyError: self.doctimes = self.root["doctimes"] = IIBTree() try: self.watchfolders = self.root["watchfolders"] except KeyError: self.watchfolders = self.root["watchfolders"] = {} self.path2docid = OIBTree() for docid in self.docpaths.keys(): path = self.docpaths[docid] self.path2docid[path] = docid try: self.maxdocid = max(self.docpaths.keys()) except ValueError: self.maxdocid = 0 print(len(self.docpaths), "Document ids") print(len(self.path2docid), "Pathnames") print(self.index.lexicon.length(), "Words") def dumpfreqs(self): lexicon = self.index.lexicon index = self.index.index assert isinstance(index, OkapiIndex) L = [] for wid in lexicon.wids(): freq = 0 for f in index._wordinfo.get(wid, {}).values(): freq += f L.append((freq, wid, lexicon.get_word(wid))) L.sort() L.reverse() for freq, wid, word in L: print("%10d %10d %s" % (wid, freq, word)) def dumpwids(self): lexicon = self.index.lexicon index = self.index.index assert isinstance(index, OkapiIndex) for wid in lexicon.wids(): freq = 0 for f in index._wordinfo.get(wid, {}).values(): freq += f print("%10d %10d %s" % (wid, freq, lexicon.get_word(wid))) def dumpwords(self): lexicon = self.index.lexicon index = self.index.index assert isinstance(index, OkapiIndex) for word in lexicon.words(): wid = lexicon.get_wid(word) freq = 0 for f in index._wordinfo.get(wid, {}).values(): freq += f print("%10d %10d %s" % (wid, freq, word)) def close(self): self.root = None if self.connection is not None: self.connection.close() self.connection = None if self.database is not None: self.database.close() self.database = None if self.filestorage is not None: self.filestorage.close() self.filestorage = None def interact(self, nbest=NBEST, maxlines=MAXLINES): try: import readline except ImportError: pass text = "" top = 0 results = [] while 1: try: line = raw_input("Query: ") except EOFError: print("\nBye.") break line = line.strip() if line.startswith("/"): self.specialcommand(line, results, top - nbest) continue if line: text = line top = 0 else: if not text: continue try: results, n = self.timequery(text, top + nbest) except KeyboardInterrupt: raise except: reportexc() text = "" continue if len(results) <= top: if not n: print("No hits for %r." % text) else: print("No more hits for %r." 
% text) text = "" continue print("[Results %d-%d from %d" % (top+1, min(n, top+nbest), n), end=" ") print("for query %s]" % repr(text)) self.formatresults(text, results, maxlines, top, top+nbest) top += nbest def specialcommand(self, line, results, first): assert line.startswith("/") line = line[1:] if not line: n = first else: try: n = int(line) - 1 except: print("Huh?") return if n < 0 or n >= len(results): print("Out of range") return docid, score = results[n] path = self.docpaths[docid] i = path.rfind("/") assert i > 0 folder = path[:i] n = path[i+1:] cmd = "show +%s %s" % (folder, n) if os.getenv("DISPLAY"): os.system("xterm -e sh -c '%s | less' &" % cmd) else: os.system(cmd) def query(self, text, nbest=NBEST, maxlines=MAXLINES): results, n = self.timequery(text, nbest) if not n: print("No hits for %r." % text) return print("[Results 1-%d from %d]" % (len(results), n)) self.formatresults(text, results, maxlines) def timequery(self, text, nbest): t0 = time.time() c0 = time.clock() results, n = self.index.query(text, 0, nbest) t1 = time.time() c1 = time.clock() print("[Query time: %.3f real, %.3f user]" % (t1-t0, c1-c0)) return results, n def formatresults(self, text, results, maxlines=MAXLINES, lo=0, hi=sys.maxint): stop = self.stopdict.has_key words = [w for w in re.findall(r"\w+\*?", text.lower()) if not stop(w)] pattern = r"\b(" + "|".join(words) + r")\b" pattern = pattern.replace("*", ".*") # glob -> re syntax prog = re.compile(pattern, re.IGNORECASE) print('='*70) rank = lo for docid, score in results[lo:hi]: rank += 1 path = self.docpaths[docid] score *= 100.0 print("Rank: %d Score: %d%% File: %s" % (rank, score, path)) path = os.path.join(self.mh.getpath(), path) try: fp = open(path) except (IOError, OSError) as msg: print("Can't open:", msg) continue msg = mhlib.Message("<folder>", 0, fp) for header in "From", "To", "Cc", "Bcc", "Subject", "Date": h = msg.getheader(header) if h: print("%-8s %s" % (header+":", h)) text = self.getmessagetext(msg) if text: print() nleft = maxlines for part in text: for line in part.splitlines(): if prog.search(line): print(line) nleft -= 1 if nleft <= 0: break if nleft <= 0: break print('-'*70) def update(self, args): folder = None seqs = [] for arg in args: if arg.startswith("+"): if folder is None: folder = arg[1:] else: print("only one folder at a time") return else: seqs.append(arg) if not folder: folder = self.mh.getcontext() if not seqs: seqs = ['all'] try: f = self.mh.openfolder(folder) except mhlib.Error as msg: print(msg) return dict = {} for seq in seqs: try: nums = f.parsesequence(seq) except mhlib.Error as msg: print(msg or "unparsable message sequence: %s" % repr(seq)) return for n in nums: dict[n] = n msgs = dict.keys() msgs.sort() self.updatefolder(f, msgs) self.commit() def optimize(self, args): uniqwords = {} for folder in args: if folder.startswith("+"): folder = folder[1:] print("\nOPTIMIZE FOLDER", folder) try: f = self.mh.openfolder(folder) except mhlib.Error as msg: print(msg) continue self.prescan(f, f.listmessages(), uniqwords) L = [(uniqwords[word], word) for word in uniqwords.keys()] L.sort() L.reverse() for i in range(100): print("%3d. 
%6d %s" % ((i+1,) + L[i])) self.index.lexicon.sourceToWordIds([word for (count, word) in L]) def prescan(self, f, msgs, uniqwords): pipeline = [Splitter(), CaseNormalizer(), StopWordRemover()] for n in msgs: print("prescanning", n) m = f.openmessage(n) text = self.getmessagetext(m, f.name) for p in pipeline: text = p.process(text) for word in text: uniqwords[word] = uniqwords.get(word, 0) + 1 def bulkupdate(self, args): if not args: print("No folders specified; use ALL to bulk-index all folders") return if "ALL" in args: i = args.index("ALL") args[i:i+1] = self.mh.listfolders() for folder in args: if folder.startswith("+"): folder = folder[1:] print("\nFOLDER", folder) try: f = self.mh.openfolder(folder) except mhlib.Error as msg: print(msg) continue self.updatefolder(f, f.listmessages()) print("Total", len(self.docpaths)) self.commit() print("Indexed", self.index.lexicon._nbytes, "bytes and",) print(self.index.lexicon._nwords, "words;",) print(len(self.index.lexicon._words), "unique words.") def updatefolder(self, f, msgs): self.watchfolders[f.name] = self.getmtime(f.name) for n in msgs: path = "%s/%s" % (f.name, n) docid = self.path2docid.get(path, 0) if docid and self.getmtime(path) == self.doctimes.get(docid, 0): print("unchanged", docid, path) continue docid = self.newdocid(path) try: m = f.openmessage(n) except IOError: print("disappeared", docid, path) self.unindexpath(path) continue text = self.getmessagetext(m, f.name) if not text: self.unindexpath(path) continue print("indexing", docid, path) self.index.index_doc(docid, text) self.maycommit() # Remove messages from the folder that no longer exist for path in list(self.path2docid.keys(f.name)): if not path.startswith(f.name + "/"): break if self.getmtime(path) == 0: self.unindexpath(path) print("done.") def unindexpath(self, path): if self.path2docid.has_key(path): docid = self.path2docid[path] print("unindexing", docid, path) del self.docpaths[docid] del self.doctimes[docid] del self.path2docid[path] try: self.index.unindex_doc(docid) except KeyError as msg: print("KeyError", msg) self.maycommit() def getmessagetext(self, m, name=None): L = [] if name: L.append("_folder " + name) # To restrict search to a folder self.getheaders(m, L) try: self.getmsgparts(m, L, 0) except KeyboardInterrupt: raise except: print("(getmsgparts failed:)") reportexc() return L def getmsgparts(self, m, L, level): ctype = m.gettype() if level or ctype != "text/plain": print(". 
"*level + str(ctype)) if ctype == "text/plain": L.append(m.getbodytext()) elif ctype in ("multipart/alternative", "multipart/mixed"): for part in m.getbodyparts(): self.getmsgparts(part, L, level+1) elif ctype == "message/rfc822": f = StringIO(m.getbodytext()) m = mhlib.Message("<folder>", 0, f) self.getheaders(m, L) self.getmsgparts(m, L, level+1) def getheaders(self, m, L): H = [] for key in "from", "to", "cc", "bcc", "subject": value = m.get(key) if value: H.append(value) if H: L.append("\n".join(H)) def newdocid(self, path): docid = self.path2docid.get(path) if docid is not None: self.doctimes[docid] = self.getmtime(path) return docid docid = self.maxdocid + 1 self.maxdocid = docid self.docpaths[docid] = path self.doctimes[docid] = self.getmtime(path) self.path2docid[path] = docid return docid def getmtime(self, path): path = os.path.join(self.mh.getpath(), path) try: st = os.stat(path) except os.error as msg: return 0 return int(st[ST_MTIME]) def maycommit(self): self.trans_count += 1 if self.trans_count >= self.trans_limit > 0: self.commit() def commit(self): if self.trans_count > 0: print("committing...") transaction.commit() self.trans_count = 0 self.pack_count += 1 if self.pack_count >= self.pack_limit > 0: self.pack() def pack(self): if self.pack_count > 0: print("packing...") self.database.pack() self.pack_count = 0
class Catalog(Persistent, Acquisition.Implicit, ExtensionClass.Base): """ An Object Catalog An Object Catalog maintains a table of object metadata, and a series of manageable indexes to quickly search for objects (references in the metadata) that satisfy a search query. This class is not Zope specific, and can be used in any python program to build catalogs of objects. Note that it does require the objects to be Persistent, and thus must be used with ZODB3. """ _v_brains = NoBrainer def __init__(self, vocabulary=None, brains=None): # Catalogs no longer care about vocabularies and lexicons # so the vocabulary argument is ignored. (Casey) self.schema = {} # mapping from attribute name to column number self.names = () # sequence of column names self.indexes = {} # maping from index name to index object # The catalog maintains a BTree of object meta_data for # convenient display on result pages. meta_data attributes # are turned into brain objects and returned by # searchResults. The indexing machinery indexes all records # by an integer id (rid). self.data is a mapping from the # integer id to the meta_data, self.uids is a mapping of the # object unique identifier to the rid, and self.paths is a # mapping of the rid to the unique identifier. self.clear() if brains is not None: self._v_brains = brains self.updateBrains() def __len__(self): return self._length() def migrate__len__(self): """ migration of old __len__ magic for Zope 2.8 """ if not hasattr(self, '_length'): n = self.__dict__['__len__']() del self.__dict__['__len__'] self._length = BTrees.Length.Length(n) def clear(self): """ clear catalog """ self.data = IOBTree() # mapping of rid to meta_data self.uids = OIBTree() # mapping of uid to rid self.paths = IOBTree() # mapping of rid to uid self._length = BTrees.Length.Length() for index in self.indexes.keys(): self.getIndex(index).clear() def updateBrains(self): self.useBrains(self._v_brains) def __getitem__(self, index, ttype=type(())): """ Returns instances of self._v_brains, or whatever is passed into self.useBrains. """ if type(index) is ttype: # then it contains a score... normalized_score, score, key = index r=self._v_result_class(self.data[key]).__of__(self.aq_parent) r.data_record_id_ = key r.data_record_score_ = score r.data_record_normalized_score_ = normalized_score else: # otherwise no score, set all scores to 1 r=self._v_result_class(self.data[index]).__of__(self.aq_parent) r.data_record_id_ = index r.data_record_score_ = 1 r.data_record_normalized_score_ = 1 return r def __setstate__(self, state): """ initialize your brains. This method is called when the catalog is first activated (from the persistent storage) """ Persistent.__setstate__(self, state) self.updateBrains() def useBrains(self, brains): """ Sets up the Catalog to return an object (ala ZTables) that is created on the fly from the tuple stored in the self.data Btree. 
""" class mybrains(AbstractCatalogBrain, brains): pass scopy = self.schema.copy() scopy['data_record_id_']=len(self.schema.keys()) scopy['data_record_score_']=len(self.schema.keys())+1 scopy['data_record_normalized_score_']=len(self.schema.keys())+2 mybrains.__record_schema__ = scopy self._v_brains = brains self._v_result_class = mybrains def addColumn(self, name, default_value=None): """ adds a row to the meta data schema """ schema = self.schema names = list(self.names) if schema.has_key(name): raise CatalogError, 'The column %s already exists' % name if name[0] == '_': raise CatalogError, \ 'Cannot cache fields beginning with "_"' if not schema.has_key(name): if schema.values(): schema[name] = max(schema.values())+1 else: schema[name] = 0 names.append(name) if default_value is None or default_value == '': default_value = MV for key in self.data.keys(): rec = list(self.data[key]) rec.append(default_value) self.data[key] = tuple(rec) self.names = tuple(names) self.schema = schema # new column? update the brain self.updateBrains() self._p_changed = 1 # why? def delColumn(self, name): """ deletes a row from the meta data schema """ names = list(self.names) _index = names.index(name) if not self.schema.has_key(name): LOG.error('delColumn attempted to delete nonexistent column %s.' % str(name)) return del names[_index] # rebuild the schema i=0; schema = {} for name in names: schema[name] = i i = i + 1 self.schema = schema self.names = tuple(names) # update the brain self.updateBrains() # remove the column value from each record for key in self.data.keys(): rec = list(self.data[key]) del rec[_index] self.data[key] = tuple(rec) def addIndex(self, name, index_type): """Create a new index, given a name and a index_type. Old format: index_type was a string, 'FieldIndex' 'TextIndex' or 'KeywordIndex' is no longer valid; the actual index must be instantiated and passed in to addIndex. New format: index_type is the actual index object to be stored. """ if self.indexes.has_key(name): raise CatalogError, 'The index %s already exists' % name if name.startswith('_'): raise CatalogError, 'Cannot index fields beginning with "_"' if not name: raise CatalogError, 'Name of index is empty' indexes = self.indexes if isinstance(index_type, str): raise TypeError,"""Catalog addIndex now requires the index type to be resolved prior to adding; create the proper index in the caller.""" indexes[name] = index_type; self.indexes = indexes def delIndex(self, name): """ deletes an index """ if not self.indexes.has_key(name): raise CatalogError, 'The index %s does not exist' % name indexes = self.indexes del indexes[name] self.indexes = indexes def getIndex(self, name): """ get an index wrapped in the catalog """ return self.indexes[name].__of__(self) def updateMetadata(self, object, uid): """ Given an object and a uid, update the column data for the uid with the object data iff the object has changed """ data = self.data index = self.uids.get(uid, None) newDataRecord = self.recordify(object) if index is None: if type(data) is IOBTree: # New style, get random id index=getattr(self, '_v_nextid', 0) if index % 4000 == 0: index = randint(-2000000000, 2000000000) while not data.insert(index, newDataRecord): index = randint(-2000000000, 2000000000) # We want ids to be somewhat random, but there are # advantages for having some ids generated # sequentially when many catalog updates are done at # once, such as when reindexing or bulk indexing. 
# We allocate ids sequentially using a volatile base, # so different threads get different bases. This # further reduces conflict and reduces churn in # here and it result sets when bulk indexing. self._v_nextid=index+1 else: if data: # find the next available unique id index = data.keys()[-1] + 1 else: index=0 # meta_data is stored as a tuple for efficiency data[index] = newDataRecord else: if data.get(index, 0) != newDataRecord: data[index] = newDataRecord return index # the cataloging API def catalogObject(self, object, uid, threshold=None, idxs=None, update_metadata=1): """ Adds an object to the Catalog by iteratively applying it to all indexes. 'object' is the object to be cataloged 'uid' is the unique Catalog identifier for this object If 'idxs' is specified (as a sequence), apply the object only to the named indexes. If 'update_metadata' is true (the default), also update metadata for the object. If the object is new to the catalog, this flag has no effect (metadata is always created for new objects). """ if idxs is None: idxs = [] data = self.data index = self.uids.get(uid, None) if index is None: # we are inserting new data index = self.updateMetadata(object, uid) if not hasattr(self, '_length'): self.migrate__len__() self._length.change(1) self.uids[uid] = index self.paths[index] = uid elif update_metadata: # we are updating and we need to update metadata self.updateMetadata(object, uid) # do indexing total = 0 if idxs==[]: use_indexes = self.indexes.keys() else: use_indexes = idxs for name in use_indexes: x = self.getIndex(name) if hasattr(x, 'index_object'): blah = x.index_object(index, object, threshold) total = total + blah else: LOG.error('catalogObject was passed bad index object %s.' % str(x)) return total def uncatalogObject(self, uid): """ Uncatalog and object from the Catalog. and 'uid' is a unique Catalog identifier Note, the uid must be the same as when the object was catalogued, otherwise it will not get removed from the catalog This method should not raise an exception if the uid cannot be found in the catalog. """ data = self.data uids = self.uids paths = self.paths indexes = self.indexes.keys() rid = uids.get(uid, None) if rid is not None: for name in indexes: x = self.getIndex(name) if hasattr(x, 'unindex_object'): x.unindex_object(rid) del data[rid] del paths[rid] del uids[uid] if not hasattr(self, '_length'): self.migrate__len__() self._length.change(-1) else: LOG.error('uncatalogObject unsuccessfully ' 'attempted to uncatalog an object ' 'with a uid of %s. ' % str(uid)) def uniqueValuesFor(self, name): """ return unique values for FieldIndex name """ return self.getIndex(name).uniqueValues() def hasuid(self, uid): """ return the rid if catalog contains an object with uid """ return self.uids.get(uid) def recordify(self, object): """ turns an object into a record tuple """ record = [] # the unique id is allways the first element for x in self.names: attr=getattr(object, x, MV) if(attr is not MV and safe_callable(attr)): attr=attr() record.append(attr) return tuple(record) def instantiate(self, record): r=self._v_result_class(record[1]) r.data_record_id_ = record[0] return r.__of__(self) def getMetadataForRID(self, rid): record = self.data[rid] result = {} for (key, pos) in self.schema.items(): result[key] = record[pos] return result def getIndexDataForRID(self, rid): result = {} for name in self.indexes.keys(): result[name] = self.getIndex(name).getEntryForObject(rid, "") return result ## This is the Catalog search engine. 
Most of the heavy lifting happens below def search(self, request, sort_index=None, reverse=0, limit=None, merge=1): """Iterate through the indexes, applying the query to each one. If merge is true then return a lazy result set (sorted if appropriate) otherwise return the raw (possibly scored) results for later merging. Limit is used in conjuntion with sorting or scored results to inform the catalog how many results you are really interested in. The catalog can then use optimizations to save time and memory. The number of results is not guaranteed to fall within the limit however, you should still slice or batch the results as usual.""" rs = None # resultset # Indexes fulfill a fairly large contract here. We hand each # index the request mapping we are given (which may be composed # of some combination of web request, kw mappings or plain old dicts) # and the index decides what to do with it. If the index finds work # for itself in the request, it returns the results and a tuple of # the attributes that were used. If the index finds nothing for it # to do then it returns None. # For hysterical reasons, if all indexes return None for a given # request (and no attributes were used) then we append all results # in the Catalog. This generally happens when the search values # in request are all empty strings or do not coorespond to any of # the indexes. # Note that if the indexes find query arguments, but the end result # is an empty sequence, we do nothing for i in self.indexes.keys(): index = self.getIndex(i) _apply_index = getattr(index, "_apply_index", None) if _apply_index is None: continue r = _apply_index(request) if r is not None: r, u = r w, rs = weightedIntersection(rs, r) if rs is None: # None of the indexes found anything to do with the request # We take this to mean that the query was empty (an empty filter) # and so we return everything in the catalog if sort_index is None: return LazyMap(self.instantiate, self.data.items(), len(self)) else: return self.sortResults( self.data, sort_index, reverse, limit, merge) elif rs: # We got some results from the indexes. # Sort and convert to sequences. # XXX: The check for 'values' is really stupid since we call # items() and *not* values() if sort_index is None and hasattr(rs, 'values'): # having a 'values' means we have a data structure with # scores. Build a new result set, sort it by score, reverse # it, compute the normalized score, and Lazify it. if not merge: # Don't bother to sort here, return a list of # three tuples to be passed later to mergeResults # note that data_record_normalized_score_ cannot be # calculated and will always be 1 in this case getitem = self.__getitem__ return [(score, (1, score, rid), getitem) for rid, score in rs.items()] rs = rs.byValue(0) # sort it by score max = float(rs[0][0]) # Here we define our getter function inline so that # we can conveniently store the max value as a default arg # and make the normalized score computation lazy def getScoredResult(item, max=max, self=self): """ Returns instances of self._v_brains, or whatever is passed into self.useBrains. """ score, key = item r=self._v_result_class(self.data[key])\ .__of__(self.aq_parent) r.data_record_id_ = key r.data_record_score_ = score r.data_record_normalized_score_ = int(100. * score / max) return r return LazyMap(getScoredResult, rs, len(rs)) elif sort_index is None and not hasattr(rs, 'values'): # no scores if hasattr(rs, 'keys'): rs = rs.keys() return LazyMap(self.__getitem__, rs, len(rs)) else: # sort. 
If there are scores, then this block is not # reached, therefore 'sort-on' does not happen in the # context of a text index query. This should probably # sort by relevance first, then the 'sort-on' attribute. return self.sortResults(rs, sort_index, reverse, limit, merge) else: # Empty result set return LazyCat([]) def sortResults(self, rs, sort_index, reverse=0, limit=None, merge=1): # Sort a result set using a sort index. Return a lazy # result set in sorted order if merge is true otherwise # returns a list of (sortkey, uid, getter_function) tuples # # The two 'for' loops in here contribute a significant # proportion of the time to perform an indexed search. # Try to avoid all non-local attribute lookup inside # those loops. assert limit is None or limit > 0, 'Limit value must be 1 or greater' _lazymap = LazyMap _intersection = intersection _self__getitem__ = self.__getitem__ index_key_map = sort_index.documentToKeyMap() _None = None _keyerror = KeyError result = [] append = result.append if hasattr(rs, 'keys'): rs = rs.keys() rlen = len(rs) if merge and limit is None and ( rlen > (len(sort_index) * (rlen / 100 + 1))): # The result set is much larger than the sorted index, # so iterate over the sorted index for speed. # This is rarely exercised in practice... length = 0 try: intersection(rs, IISet(())) except TypeError: # rs is not an object in the IIBTree family. # Try to turn rs into an IISet. rs = IISet(rs) for k, intset in sort_index.items(): # We have an index that has a set of values for # each sort key, so we intersect with each set and # get a sorted sequence of the intersections. intset = _intersection(rs, intset) if intset: keys = getattr(intset, 'keys', _None) if keys is not _None: # Is this ever true? intset = keys() length += len(intset) append((k, intset, _self__getitem__)) # Note that sort keys are unique. result.sort() if reverse: result.reverse() result = LazyCat(LazyValues(result), length) elif limit is None or (limit * 4 > rlen): # Iterate over the result set getting sort keys from the index for did in rs: try: key = index_key_map[did] except _keyerror: # This document is not in the sort key index, skip it. pass else: append((key, did, _self__getitem__)) # The reference back to __getitem__ is used in case # we do not merge now and need to intermingle the # results with those of other catalogs while avoiding # the cost of instantiating a LazyMap per result if merge: result.sort() if reverse: result.reverse() if limit is not None: result = result[:limit] result = LazyValues(result) else: return result elif reverse: # Limit/sort results using N-Best algorithm # This is faster for large sets then a full sort # And uses far less memory keys = [] n = 0 worst = None for did in rs: try: key = index_key_map[did] except _keyerror: # This document is not in the sort key index, skip it. pass else: if n >= limit and key <= worst: continue i = bisect(keys, key) keys.insert(i, key) result.insert(i, (key, did, _self__getitem__)) if n == limit: del keys[0], result[0] else: n += 1 worst = keys[0] result.reverse() if merge: result = LazyValues(result) else: return result elif not reverse: # Limit/sort results using N-Best algorithm in reverse (N-Worst?) keys = [] n = 0 best = None for did in rs: try: key = index_key_map[did] except _keyerror: # This document is not in the sort key index, skip it. 
pass else: if n >= limit and key >= best: continue i = bisect(keys, key) keys.insert(i, key) result.insert(i, (key, did, _self__getitem__)) if n == limit: del keys[-1], result[-1] else: n += 1 best = keys[-1] if merge: result = LazyValues(result) else: return result result = LazyMap(self.__getitem__, result, len(result)) result.actual_result_count = rlen return result def _get_sort_attr(self, attr, kw): """Helper function to find sort-on or sort-order.""" # There are three different ways to find the attribute: # 1. kw[sort-attr] # 2. self.sort-attr # 3. kw[sort_attr] # kw may be a dict or an ExtensionClass MultiMapping, which # differ in what get() returns with no default value. name = "sort-%s" % attr val = kw.get(name, None) if val is not None: return val val = getattr(self, name, None) if val is not None: return val return kw.get("sort_%s" % attr, None) def _getSortIndex(self, args): """Returns a search index object or None.""" sort_index_name = self._get_sort_attr("on", args) if sort_index_name is not None: # self.indexes is always a dict, so get() w/ 1 arg works sort_index = self.indexes.get(sort_index_name) if sort_index is None: raise CatalogError, 'Unknown sort_on index (%s)' % sort_index_name else: if not hasattr(sort_index, 'keyForDocument'): raise CatalogError( 'The index chosen for sort_on (%s) is not capable of being' ' used as a sort index.' % sort_index_name ) return sort_index else: return None def searchResults(self, REQUEST=None, used=None, _merge=1, **kw): # The used argument is deprecated and is ignored if REQUEST is None and not kw: # Try to acquire request if we get no args for bw compat REQUEST = getattr(self, 'REQUEST', None) args = CatalogSearchArgumentsMap(REQUEST, kw) sort_index = self._getSortIndex(args) sort_limit = self._get_sort_attr('limit', args) reverse = 0 if sort_index is not None: order = self._get_sort_attr("order", args) if (isinstance(order, str) and order.lower() in ('reverse', 'descending')): reverse = 1 # Perform searches with indexes and sort_index return self.search(args, sort_index, reverse, sort_limit, _merge) __call__ = searchResults
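The search machinery above rests on the three-way rid bookkeeping set up in Catalog.__init__: `data` maps rid to a metadata record, `uids` maps uid to rid, and `paths` maps rid back to uid. The toy class below illustrates just that bookkeeping with plain dicts and sequential rids; it is only a sketch and deliberately omits the indexes, brains, and the random rid allocation used by the real class:

class MiniCatalog:
    """Toy illustration of the rid/uid/path bookkeeping used by Catalog,
    with plain dicts standing in for the IOBTree/OIBTree structures."""

    def __init__(self):
        self.data = {}    # rid -> metadata record (tuple)
        self.uids = {}    # uid -> rid
        self.paths = {}   # rid -> uid
        self._next_rid = 0

    def catalog_object(self, record, uid):
        rid = self.uids.get(uid)
        if rid is None:
            # New object: allocate a rid and record both directions of the map.
            rid = self._next_rid
            self._next_rid += 1
            self.uids[uid] = rid
            self.paths[rid] = uid
        self.data[rid] = record
        return rid

    def uncatalog_object(self, uid):
        rid = self.uids.pop(uid, None)
        if rid is not None:
            self.paths.pop(rid, None)
            self.data.pop(rid, None)


cat = MiniCatalog()
rid = cat.catalog_object(("Front page", "published"), "/site/front-page")
assert cat.paths[rid] == "/site/front-page"
cat.uncatalog_object("/site/front-page")
assert not cat.data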
class ExtendedPathIndex(PathIndex): """A path index stores all path components of the physical path of an object. Internal datastructure (regular pathindex): - a physical path of an object is split into its components - every component is kept as a key of a OOBTree in self._indexes - the value is a mapping 'level of the path component' to 'all docids with this path component on this level' In addition - there is a terminator (None) signifying the last component in the path - 2 additional indexes map absolute path to either the doc id or doc ids of contained objects. This allows for rapid answering of common queries. """ meta_type = "ExtendedPathIndex" manage_options = ({'label': 'Settings', 'action': 'manage_main'}, ) indexed_attrs = None query_options = ("query", "level", "operator", "depth", "navtree", "navtree_start") def __init__(self, id, extra=None, caller=None): """ ExtendedPathIndex supports indexed_attrs """ PathIndex.__init__(self, id, caller) if isinstance(extra, dict): attrs = extra.get('indexed_attrs', None) else: attrs = getattr(extra, 'indexed_attrs', None) if attrs is None: return if isinstance(attrs, str): attrs = attrs.split(',') attrs = [a.strip() for a in attrs] attrs = [a for a in attrs if a] if attrs: # We only index the first attribute so snip off the rest self.indexed_attrs = tuple(attrs[:1]) def clear(self): PathIndex.clear(self) self._index_parents = OOBTree() self._index_items = OIBTree() def index_object(self, docid, obj, threshold=100): """ hook for (Z)Catalog """ # PathIndex first checks for an attribute matching its id and # falls back to getPhysicalPath only when failing to get one. # If self.indexed_attrs is not None, it's value overrides this behavior attrs = self.indexed_attrs index = attrs is None and self.id or attrs[0] path = getattr(obj, index, None) if path is not None: if safe_callable(path): path = path() if not isinstance(path, (str, tuple)): raise TypeError('path value must be string or tuple ' 'of strings: (%r, %s)' % (index, repr(path))) else: try: path = obj.getPhysicalPath() except AttributeError: return 0 if isinstance(path, (list, tuple)): path = '/' + '/'.join(path[1:]) comps = [p for p in path.split('/') if p] # Make sure we reindex properly when path change old_path = self._unindex.get(docid, _marker) if old_path is not _marker: if old_path != path: self.unindex_object(docid, _old=old_path) # unindex reduces length, we need to counter that self._length.change(1) else: # We only get a new entry if the value wasn't there before. 
# If it already existed the length is unchanged self._length.change(1) for i, comp in enumerate(comps): self.insertEntry(comp, docid, i) # Add terminator self.insertEntry(None, docid, len(comps) - 1) # Add full-path indexes, to optimize certain edge cases parent_path = '/' + '/'.join(comps[:-1]) parents = self._index_parents.get(parent_path, _marker) if parents is _marker: self._index_parents[parent_path] = parents = IITreeSet() parents.insert(docid) self._index_items[path] = docid self._unindex[docid] = path return 1 def unindex_object(self, docid, _old=_marker): """ hook for (Z)Catalog """ if _old is not _marker: old_value = _old else: old_value = self._unindex.get(docid, _marker) if old_value is _marker: logger.log( logging.INFO, 'Attempt to unindex nonexistent object with id ' '%s' % docid) return # There is an assumption that paths start with / comps = [p for p in old_value.split('/') if p] def unindex(comp, level, docid=docid): index_comp = self._index[comp] index_comp[level].remove(docid) if not index_comp[level]: del index_comp[level] if not index_comp: del self._index[comp] try: for level, comp in enumerate(comps): unindex(comp, level) # Remove the terminator unindex(None, len(comps) - 1) # Remove full-path indexes parent_path = '/' + '/'.join(comps[:-1]) parents = self._index_parents.get(parent_path, _marker) if parents is not _marker: parents.remove(docid) if not parents: del self._index_parents[parent_path] del self._index_items['/'.join([parent_path, comps[-1]])] except KeyError: logger.log( logging.INFO, 'Attempt to unindex object with id ' '%s failed' % docid) self._length.change(-1) del self._unindex[docid] def search(self, path, default_level=0, depth=-1, navtree=0, navtree_start=0, resultset=None): """ path is either a string representing a relative URL or a part of a relative URL or a tuple (path, level). default_level specifies the level to use when no more specific level has been passed in with the path. level >= 0 starts searching at the given level level < 0 finds matches at *any* level depth let's you limit the results to items at most depth levels deeper than the matched path. depth == 0 means no subitems are included at all, with depth == 1 only direct children are included, etc. depth == -1, the default, returns all children at any depth. navtree is treated as a boolean; if it evaluates to True, not only the query match is returned, but also each container in the path. If depth is greater than 0, also all siblings of those containers, as well as the siblings of the match are included as well, plus *all* documents at the starting level. navtree_start limits what containers are included in a navtree search. If greater than 0, only containers (and possibly their siblings) at that level and up will be included in the resultset. 
""" if isinstance(path, string_types): level = default_level else: level = int(path[1]) path = path[0] if level < 0: # Search at every level, return the union of all results return multiunion([ self.search(path, level, depth, navtree, navtree_start) for level in range(self._depth + 1) ]) comps = [p for p in path.split('/') if p] if navtree and depth == -1: # Navtrees don't do recursive depth = 1 # Optimizations pathlength = level + len(comps) - 1 if navtree and navtree_start > min(pathlength + depth, self._depth): # This navtree_start excludes all items that match the depth return IISet() if level == 0 and depth in (0, 1): # We have easy indexes for absolute paths where # we are looking for depth 0 or 1 result sets if navtree: # Optimized absolute path navtree and breadcrumbs cases result = [] add = lambda x: x is not None and result.append(x) if depth == 1: # Navtree case, all sibling elements along the path convert = multiunion index = self._index_parents else: # Breadcrumbs case, all direct elements along the path convert = IISet index = self._index_items # Collect all results along the path for i in range(len(comps), navtree_start - 1, -1): parent_path = '/' + '/'.join(comps[:i]) add(index.get(parent_path)) return convert(result) if not path.startswith('/'): path = '/' + path if depth == 0: # Specific object search res = self._index_items.get(path) return res and IISet([res]) or IISet() else: # Single depth search return self._index_parents.get(path, IISet()) # Avoid using the root set # as it is common for all objects anyway and add overhead # There is an assumption about all indexed values having the # same common base path if level == 0: indexpath = [p for p in self.getPhysicalPath() if p] minlength = min(len(indexpath), len(comps)) # Truncate path to first different element for i in range(minlength): if indexpath[i] != comps[i]: break level += 1 comps = comps[level:] if not comps and depth == -1: # Recursive search for everything return IISet(self._unindex) # Core application of the indexes pathset = None depthset = None # For limiting depth if navtree and depth > 0: # Include the elements up to the matching path depthset = multiunion([ self._index.get(None, {}).get(i, IISet()) for i in range(min(navtree_start, level), max(navtree_start, level) + 1) ]) indexedcomps = enumerate(comps) if not navtree: # Optimize relative-path searches by starting with the # presumed smaller sets at the end of the path first # We can't do this for the navtree case because it needs # the bigger rootset to include siblings along the way. 
indexedcomps = list(indexedcomps) indexedcomps.reverse() for i, comp in indexedcomps: # Find all paths that have comp at the given level res = self._index.get(comp, {}).get(i + level) if res is None: # Non-existing path; navtree is inverse, keep going pathset = IISet() if not navtree: return pathset pathset = intersection(pathset, res) if navtree and i + level >= navtree_start: depthset = union( depthset, intersection(pathset, self._index.get(None, {}).get(i + level))) if depth >= 0: # Limit results to those that terminate within depth levels start = len(comps) - 1 if navtree: start = max(start, (navtree_start - level)) depthset = [depthset] + [ intersection(pathset, self._index.get(None, {}).get(i + level)) for i in range(start, start + depth + 1) ] depthset = multiunion([d for d in depthset if d]) if navtree or depth >= 0: return depthset return pathset def _apply_index(self, request, resultset=None): """ hook for (Z)Catalog 'request' -- mapping type (usually {"path": "..." } additionaly a parameter "path_level" might be passed to specify the level (see search()) """ record = IndexQuery(request, self.id, self.query_options) if record.keys is None: return None return (self.query_index(record), (self.id, )) def query_index(self, record, resultset=None): level = record.get("level", 0) operator = record.get('operator', self.useOperator).lower() depth = getattr(record, 'depth', -1) # use getattr to get 0 value navtree = record.get('navtree', 0) navtree_start = record.get('navtree_start', 0) # depending on the operator we use intersection of union if operator == "or": set_func = union else: set_func = intersection result = None for k in record.keys: rows = self.search(k, level, depth, navtree, navtree_start, resultset=resultset) result = set_func(result, rows) if result: return result return IISet() def getIndexSourceNames(self): """ return names of indexed attributes """ attrs = self.indexed_attrs or ('getPhysicalPath', ) return tuple(attrs)
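To make the layout of ExtendedPathIndex concrete, the sketch below re-creates its three structures with plain dicts and sets: per-component/per-level docid sets (with a `None` terminator marking the last component), the parent-path shortcut, and the full-path shortcut. The paths and docids are made-up examples:

def index_path(index, parents, items, docid, path):
    """Toy sketch of the ExtendedPathIndex data layout:
    index[component][level] -> set of docids, with None marking the last level,
    plus the two shortcut maps for parent paths and full paths."""
    comps = [p for p in path.split("/") if p]
    for level, comp in enumerate(comps):
        index.setdefault(comp, {}).setdefault(level, set()).add(docid)
    # Terminator entry records the depth at which each path ends.
    index.setdefault(None, {}).setdefault(len(comps) - 1, set()).add(docid)
    parents.setdefault("/" + "/".join(comps[:-1]), set()).add(docid)
    items[path] = docid


index, parents, items = {}, {}, {}
index_path(index, parents, items, 1, "/plone/news/item-1")
index_path(index, parents, items, 2, "/plone/news/item-2")
assert index["news"][1] == {1, 2}          # 'news' appears at level 1 for both docs
assert parents["/plone/news"] == {1, 2}    # direct children of /plone/news
assert items["/plone/news/item-1"] == 1    # exact-path lookup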
class UdbBtreeIndex(UdbIndex): is_prefixed = True is_ranged = True is_sorted_asc = True type = 'btree' def __init__(self, schema, name=None): from BTrees.OIBTree import OIBTree UdbIndex.__init__(self, schema, name) self._btree = OIBTree() def __len__(self): return len(self._btree) def clear(self): self._btree.clear() return self def delete(self, key, uid=None): self._btree.pop(key, EMPTY) return self def insert(self, key, uid): self._btree.insert(key, uid) return self def search_by_key(self, key): val = self._btree.get(key, EMPTY) if val != EMPTY: yield val def search_by_key_in(self, keys): for key in keys: val = self._btree.get(key, EMPTY) if val != EMPTY: yield val def search_by_key_prefix(self, key): for val in self._btree.values(key, key + CHAR255): yield val def search_by_key_prefix_in(self, keys): for key in keys: for val in self._btree.values(key, key + CHAR255): yield val def search_by_key_range(self, gte=None, lte=None, gte_excluded=False, lte_excluded=False): for val in self._btree.values(gte, lte, gte_excluded, lte_excluded): yield val def upsert(self, old, new, uid): if old != new: self._btree.pop(old) self._btree.insert(new, uid) return self
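The prefix searches above rely on a range scan from `key` to `key + CHAR255`, where `CHAR255` is presumed to be a sentinel character that sorts after every character occurring in a key (likely `chr(255)`; its definition is not shown here). A small stand-alone illustration of the same idea over a sorted list of string keys:

import bisect

CHAR255 = "\u00ff"  # assumption: sentinel that sorts after ordinary key characters

def search_by_key_prefix(sorted_keys, values, prefix):
    """Toy version of the prefix search: a prefix scan is just a range scan
    from `prefix` up to `prefix + CHAR255` over sorted keys."""
    lo = bisect.bisect_left(sorted_keys, prefix)
    hi = bisect.bisect_right(sorted_keys, prefix + CHAR255)
    for key in sorted_keys[lo:hi]:
        yield values[key]


values = {"apple": 1, "apricot": 2, "banana": 3}
keys = sorted(values)
assert list(search_by_key_prefix(keys, values, "ap")) == [1, 2]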
class DateIndex(UnIndex, PropertyManager): """Index for dates. """ __implements__ = UnIndex.__implements__ implements(IDateIndex) meta_type = 'DateIndex' query_options = ['query', 'range'] index_naive_time_as_local = True # False means index as UTC _properties=({'id':'index_naive_time_as_local', 'type':'boolean', 'mode':'w'},) manage = manage_main = DTMLFile( 'dtml/manageDateIndex', globals() ) manage_browse = DTMLFile('../dtml/browseIndex', globals()) manage_main._setName( 'manage_main' ) manage_options = ( { 'label' : 'Settings' , 'action' : 'manage_main' }, {'label': 'Browse', 'action': 'manage_browse', }, ) + PropertyManager.manage_options def clear( self ): """ Complete reset """ self._index = IOBTree() self._unindex = OIBTree() self._length = BTrees.Length.Length() def index_object( self, documentId, obj, threshold=None ): """index an object, normalizing the indexed value to an integer o Normalized value has granularity of one minute. o Objects which have 'None' as indexed value are *omitted*, by design. """ returnStatus = 0 try: date_attr = getattr( obj, self.id ) if safe_callable( date_attr ): date_attr = date_attr() ConvertedDate = self._convert( value=date_attr, default=_marker ) except AttributeError: ConvertedDate = _marker oldConvertedDate = self._unindex.get( documentId, _marker ) if ConvertedDate != oldConvertedDate: if oldConvertedDate is not _marker: self.removeForwardIndexEntry(oldConvertedDate, documentId) if ConvertedDate is _marker: try: del self._unindex[documentId] except ConflictError: raise except: logger.error( ("Should not happen: ConvertedDate was there," " now it's not, for document with id %s" % documentId)) if ConvertedDate is not _marker: self.insertForwardIndexEntry( ConvertedDate, documentId ) self._unindex[documentId] = ConvertedDate returnStatus = 1 return returnStatus def _apply_index( self, request, cid='', type=type ): """Apply the index to query parameters given in the argument Normalize the 'query' arguments into integer values at minute precision before querying. """ record = parseIndexRequest( request, self.id, self.query_options ) if record.keys == None: return None keys = map( self._convert, record.keys ) index = self._index r = None opr = None #experimental code for specifing the operator operator = record.get( 'operator', self.useOperator ) if not operator in self.operators : raise RuntimeError, "operator not valid: %s" % operator # depending on the operator we use intersection or union if operator=="or": set_func = union else: set_func = intersection # range parameter range_arg = record.get('range',None) if range_arg: opr = "range" opr_args = [] if range_arg.find("min") > -1: opr_args.append("min") if range_arg.find("max") > -1: opr_args.append("max") if record.get('usage',None): # see if any usage params are sent to field opr = record.usage.lower().split(':') opr, opr_args = opr[0], opr[1:] if opr=="range": # range search if 'min' in opr_args: lo = min(keys) else: lo = None if 'max' in opr_args: hi = max(keys) else: hi = None if hi: setlist = index.values(lo,hi) else: setlist = index.values(lo) #for k, set in setlist: #if type(set) is IntType: #set = IISet((set,)) #r = set_func(r, set) # XXX: Use multiunion! 
r = multiunion(setlist) else: # not a range search for key in keys: set = index.get(key, None) if set is not None: if type(set) is IntType: set = IISet((set,)) r = set_func(r, set) if type(r) is IntType: r = IISet((r,)) if r is None: return IISet(), (self.id,) else: return r, (self.id,) def _convert( self, value, default=None ): """Convert Date/Time value to our internal representation""" # XXX: Code patched 20/May/2003 by Kiran Jonnalagadda to # convert dates to UTC first. if isinstance( value, DateTime ): t_tup = value.toZone('UTC').parts() elif type( value ) in (FloatType, IntType): t_tup = time.gmtime( value ) elif type( value ) is StringType and value: t_obj = DateTime( value ).toZone('UTC') t_tup = t_obj.parts() elif type( value ) is date: t_tup = value.timetuple() elif type( value ) is datetime: if self.index_naive_time_as_local and value.tzinfo is None: value = value.replace(tzinfo=Local) # else if tzinfo is None, naive time interpreted as UTC t_tup = value.utctimetuple() else: return default yr = t_tup[0] mo = t_tup[1] dy = t_tup[2] hr = t_tup[3] mn = t_tup[4] t_val = ( ( ( ( yr * 12 + mo ) * 31 + dy ) * 24 + hr ) * 60 + mn ) if isinstance(t_val, long): # t_val must be IntType, not LongType raise OverflowError, ( "%s is not within the range of indexable dates (index: %s)" % (value, self.id)) return t_val
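The packed value produced by `_convert` preserves chronological order because each multiplier (12, 31, 24, 60) is an upper bound on the next component, so later dates always pack to larger integers. A tiny illustration of the same formula:

def convert_to_minutes(yr, mo, dy, hr, mn):
    """Pack a UTC timestamp to minute precision, mirroring DateIndex._convert.
    The multipliers bound each component, so packed values sort in the same
    order as the dates themselves."""
    return ((((yr * 12 + mo) * 31 + dy) * 24 + hr) * 60 + mn)


assert convert_to_minutes(2003, 5, 20, 0, 0) < convert_to_minutes(2003, 5, 21, 0, 0)
assert convert_to_minutes(2003, 5, 20, 23, 59) < convert_to_minutes(2003, 6, 1, 0, 0)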
def index_object(self, documentId, obj, threshold=None): """ Index an object: 'documentId' is the integer id of the document 'obj' is the object to be indexed 'threshold' is the number of words to process between commiting subtransactions. If 'None' subtransactions are disabled. """ # sniff the object for our 'id', the 'document source' of the # index is this attribute. If it smells callable, call it. try: source = getattr(obj, self.id) if safe_callable(source): source = source() if not isinstance(source, UnicodeType): source = str(source) except (AttributeError, TypeError): return 0 # sniff the object for 'id'+'_encoding' try: encoding = getattr(obj, self.id + '_encoding') if safe_callable(encoding): encoding = str(encoding()) else: encoding = str(encoding) except (AttributeError, TypeError): encoding = 'latin1' lexicon = self.getLexicon() splitter = lexicon.Splitter wordScores = OIBTree() last = None # Run through the words and score them for word in list(splitter(source, encoding=encoding)): if word[0] == '\"': last = self._subindex(word[1:-1], wordScores, last, splitter) else: if word == last: continue last = word wordScores[word] = wordScores.get(word, 0) + 1 # Convert scores to use wids: widScores = IIBucket() getWid = lexicon.getWordId for word, score in wordScores.items(): widScores[getWid(word)] = score del wordScores currentWids = IISet(self._unindex.get(documentId, [])) # Get rid of document words that are no longer indexed self.unindex_objectWids(documentId, difference(currentWids, widScores)) # Now index the words. Note that the new xIBTrees are clever # enough to do nothing when there isn't a change. Woo hoo. insert = self.insertForwardIndexEntry for wid, score in widScores.items(): insert(wid, documentId, score) # Save the unindexing info if it's changed: wids = widScores.keys() if wids != currentWids.keys(): self._unindex[documentId] = wids return len(wids)
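The scoring loop above awards one point per occurrence of a word while collapsing immediately repeated words, before the scores are re-keyed by word id. A minimal sketch of just that counting step, using a plain dict in place of the OIBTree and ignoring the quoted-phrase handling:

def score_words(words):
    """Toy version of the scoring loop: one point per occurrence, with
    consecutive duplicates collapsed to a single point."""
    scores = {}
    last = None
    for word in words:
        if word == last:
            continue
        last = word
        scores[word] = scores.get(word, 0) + 1
    return scores


assert score_words(["to", "be", "or", "not", "to", "be"]) == {"to": 2, "be": 2, "or": 1, "not": 1}
assert score_words(["spam", "spam", "eggs"]) == {"spam": 1, "eggs": 1}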
class LinkCheckTool(SimpleItem): security = ClassSecurityInfo() def __init__(self, id=None): super(LinkCheckTool, self).__init__(id) # This is the work queue; items in this queue are scheduled # for link validity check. self.queue = CompositeQueue() # Additional queue for internal crawler to revalidate the site self.crawl_queue = CompositeQueue() # This is the link database. It maps a hyperlink index to a # tuple (timestamp, status, referers). self.checked = IOBTree() # Indexes self.index = OIBTree() self.links = IOBTree() # This is a counter that allows us to add new hyperlinks and # provide an indexc quickly. self.counter = 0 security.declarePrivate("is_available") def is_available(self): return hasattr(self, 'index') and \ hasattr(self, 'checked') and \ hasattr(self, 'queue') and \ hasattr(self, 'counter') security.declarePrivate("clear") def clear(self): while True: try: self.queue.pull() except IndexError: break while True: try: self.crawl_queue.pull() except IndexError: break self.checked.clear() self.index.clear() self.links.clear() self.counter = 0 security.declarePrivate("crawl") def crawl(self): self.clear() query = {} registry = getUtility(IRegistry) settings = registry.forInterface(ISettings) if settings.content_types: query['portal_type'] = settings.content_types if settings.workflow_states: query['review_state'] = settings.workflow_states catalog = api.portal.get_tool('portal_catalog') brains = catalog(query) for brain in brains: # asyncronous crawling not working yet # self.crawl_enqueue(brain.UID) obj = brain.getObject() obj.restrictedTraverse('@@linkcheck')() logger.info('Crawling: checked {0}'.format(brain.getURL())) security.declarePrivate("enqueue") def enqueue(self, url): index = self.index.get(url) if index is None: # a really new url index = self.store(url) else: entry = self.checked.get(index) if entry is not None and entry: entry = None, entry[1], entry[2] self.checked[index] = entry else: # reset empty entry self.remove(url) index = self.store(url) self.queue.put(index) return index security.declarePrivate("register") def register(self, hrefs, referer, timestamp): """Add or update link presence information. If a link has not been checked since the provided timestamp, it will be added to the queue (or if it is not in the database). """ referer = self.index.get(referer) or self.store(referer) registry = getUtility(IRegistry, context=self.aq_parent) try: settings = registry.forInterface(ISettings) except KeyError as exc: logger.warn(exc) return limit = settings.referers for href in hrefs: if self.should_ignore(href, settings.ignore_list): continue # If the hyperlink is not already in the work queue, # compare the provided timestamp to our database to see if # we need to check its validity. Note that internal links # are excempt if we're not using the publisher. index = self.index.get(href) entry = self.checked.get(-1 if index is None else index) if index not in self.queue: if entry is None or entry[0] < timestamp: if settings.use_publisher or not href.startswith('/'): index = self.enqueue(href) elif href not in self.index: index = self.store(href) assert index is not None if entry is None: self.checked[index] = None, None, IISet((referer, )) else: # If the provided paths are a subset of the already # seen paths, and if there is no new referer, we don't # issue an update. 
referers = entry[2] if referer not in referers and len(referers) <= limit: referers.add(referer) security.declarePrivate("store") def store(self, url): index = self.index[url] = self.counter self.links[index] = url self.counter += 1 return index security.declarePrivate("remove") def remove(self, url): index = self.index.get(url) if url in self.index: del self.index[url] if index and index in self.checked: del self.checked[index] security.declarePrivate("update") def update(self, href, status): """Update link status.""" now = datetime.datetime.now() timestamp = int(time.mktime(now.timetuple())) index = self.index.get(href) if index is None: return entry = self.checked.get(-1 if index is None else index) if entry is None: self.checked[index] = timestamp, status, IISet() # If the status changed, we update the entry. elif status != entry[1] or not entry[0]: # If the status was previously good, then we clear the # status. What this means is that we'll wait for the next # check to declare a bad status (it might be temporary). if entry[1] == 200: status = None self.checked[index] = timestamp, status, entry[2] @cache(lambda method, self, ignore_list: ignore_list) def get_matchers(self, ignore_list): matchers = [] for expression in ignore_list: try: matcher = re.compile(expression).search except re.error: pass else: matchers.append(matcher) return matchers def should_ignore(self, href, ignore_list): for matcher in self.get_matchers(ignore_list): if matcher(href): return True return False def crawl_enqueue(self, obj): if not isinstance(obj, basestring): obj = obj.UID() self.crawl_queue.put(obj) def crawl_dequeue(self): if self.crawl_queue._data: return self.crawl_queue.pull()
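The tool's bookkeeping boils down to a counter that hands out link indexes, a two-way `index`/`links` mapping between URLs and indexes, and a `checked` mapping from index to a (timestamp, status, referers) tuple. The sketch below mimics that arrangement with plain dicts; the URL and timestamp are made-up examples:

class MiniLinkDB:
    """Toy sketch of the LinkCheckTool bookkeeping."""

    def __init__(self):
        self.index = {}    # url -> link index
        self.links = {}    # link index -> url
        self.checked = {}  # link index -> (timestamp, status, referer set)
        self.counter = 0

    def store(self, url):
        # Allocate the next index and record both directions of the map.
        idx = self.index[url] = self.counter
        self.links[idx] = url
        self.counter += 1
        return idx

    def update(self, url, timestamp, status):
        idx = self.index.get(url)
        if idx is None:
            return
        entry = self.checked.get(idx)
        referers = entry[2] if entry else set()
        self.checked[idx] = (timestamp, status, referers)


db = MiniLinkDB()
db.store("http://example.com/a")
db.update("http://example.com/a", 1700000000, 200)
assert db.checked[db.index["http://example.com/a"]][1] == 200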
class Table(Persistent): """Simple, generic relational table. """ schema = None _v_record_class = None def __init__(self, schema=None): if schema is not None: self.schema = schema columns = schema.get_columns() self.col_info = [] # [(tuple position, column),] self.positions = {} for i in range(len(columns)): # Leave space for the record ID at position 0. position = i + 1 self.col_info.append((position, columns[i])) self.positions[columns[i].name] = position self.proto_record = [None] * (len(columns) + 1) self.next_rid = 1 self.clear() def clear(self): self.data = IOBTree() # {rid -> record as tuple} self.indexes = {} # {index_name -> OOBTree({value -> IITreeSet})} self.primary_index = OIBTree() # {primary key -> rid} for position, column in self.col_info: if column.indexed: self.indexes[column.name] = OOBTree() def tuplify(self, params): """Accepts a mapping-like object and returns a tuple. """ record = self.proto_record[:] positions = self.positions if hasattr(params, '__record_schema__'): for name in params.__record_schema__.keys(): position = positions[name] record[position] = params[name] else: for name, value in params.items(): position = positions[name] record[position] = value return tuple(record) def insert(self, params): record = self.tuplify(params) # Determine the primary key. primary_key = [] for position, column in self.col_info: if column.primary: if record[position] is None: raise ValueError, ( "No value provided for primary key column %s" % repr(column.name)) primary_key.append(record[position]) if primary_key: primary_key = tuple(primary_key) if self.primary_index.has_key(primary_key): raise DuplicateError("Primary key %s in use" % repr(primary_key)) # Add a record. rid = self.next_rid self.next_rid += 1 # XXX Hotspot! record = (rid, ) + record[1:] self.data[rid] = record if primary_key: self.primary_index[primary_key] = rid # Add to indexes. for position, column in self.col_info: name = column.name value = record[position] if value is not None: if self.indexes.has_key(name): set = self.indexes[name].get(value) if set is None: set = IITreeSet() self.indexes[name][value] = set set.insert(rid) # Return the number of rows inserted. return 1 def delete(self, filter): rids = self._select_rids(self.tuplify(filter)) if rids is None: # Zap everything count = len(self.data) self.clear() return count elif not rids: # No rows selected return 0 rids = tuple(rids) # Make sure rids is a static sequence for rid in rids: old_r = self.data[rid] assert old_r[0] == rid primary_key = [] for position, column in self.col_info: old_value = old_r[position] if old_value is not None: if column.primary: primary_key.append(old_value) # Remove from indexes. index = self.indexes.get(column.name) if index is not None: if index.has_key(old_value): # Remove an index entry. set = index[old_value] set.remove(rid) if not set: del index[old_value] if primary_key: # Remove a primary key. primary_key = tuple(primary_key) assert self.primary_index[primary_key] == rid del self.primary_index[primary_key] # Remove the data. del self.data[rid] return len(rids) def update(self, filter, changes): rids = self._select_rids(self.tuplify(filter)) if rids is None: rids = self.data.keys() elif not rids: # Nothing needs to be updated. return 0 count = len(rids) # Identify changes. 
old_data = {} # rid -> old tuple new_data = {} # rid -> new tuple old_to_new = {} # old primary key -> new primary key new_to_rid = {} # new primary key -> rid record = self.tuplify(changes) for rid in rids: old_r = self.data[rid] old_data[rid] = old_r new_r = list(old_r) # new_r and old_r contain record tuples. for position, column in self.col_info: if record[position] is not None: new_r[position] = record[position] new_data[rid] = tuple(new_r) # Hmm. The code below allows an update to change the primary # key. It might be better to prevent primary key columns from # being changed by an update() call. opk = [] npk = [] for position, column in self.col_info: if column.primary: opk.append(old_r[position]) npk.append(new_r[position]) if opk != npk: opk = tuple(opk) npk = tuple(npk) old_to_new[opk] = npk new_to_rid[npk] = rid # Look for primary key conflicts. A primary key conflict can # occur when changing a record to a different primary key and # the new primary key is already in use. for pk in old_to_new.values(): if (self.primary_index.has_key(pk) and not old_to_new.has_key(pk)): raise DuplicateError("Primary key %s in use" % repr(pk)) # Update the data. self.data.update(new_data) # Remove old primary key indexes and insert new primary key indexes. for pk in old_to_new.keys(): del self.primary_index[pk] self.primary_index.update(new_to_rid) # Update indexes. for rid, old_r in old_data.items(): for position, column in self.col_info: index = self.indexes.get(column.name) if index is not None: new_value = record[position] old_value = old_r[position] if new_value != old_value: if old_value is not None and index.has_key(old_value): # Remove an index entry. set = index[old_value] set.remove(rid) if not set: del index[old_value] if new_value is not None: # Add an index entry. set = index.get(new_value) if set is None: set = IITreeSet() index[new_value] = set set.insert(rid) # Return the number of rows affected. return count def get_record_class(self): klass = self._v_record_class if klass is None: schema = {'rid': 0} for position, column in self.col_info: schema[column.name] = position class TableRecord(TableRecordMixin, Record): __record_schema__ = schema self._v_record_class = klass = TableRecord return klass def select(self, filter): rids = self._select_rids(self.tuplify(filter)) if rids is None: # All klass = self.get_record_class() return [klass(rec) for rec in self.data.values()] elif rids: # Some klass = self.get_record_class() data = self.data return [klass(data[rid]) for rid in rids] else: # None return [] def _select_rids(self, query): """Searches the table for matches, returning record ids. Returns a sequence of record ids, or None for all records. """ primary_key = [] params = 0 # The number of parameters specified primary_params = 0 # The number of primary params specified for position, column in self.col_info: value = query[position] if value is not None: params += 1 if column.primary: primary_params += 1 if primary_key is not None: primary_key.append(value) elif column.primary: # Didn't fully specify the primary key. # Can't search by primary key. primary_key = None if not params: # No query. Select all. return None # First strategy: try to satisfy the request by consulting # the primary key index. if primary_key: # The primary key is complete. The result set will have # either zero rows or one row. primary_key = tuple(primary_key) rid = self.primary_index.get(primary_key) if rid is None: return () # Possibly filter out the single item. 
if params > primary_params: cand = self.data[rid] for position, column in self.col_info: if query[position] is not None: if cand[position] != query[position]: # Not a match. return () return (rid, ) # Second strategy: try to satisfy the request by intersecting # indexes. rids = None iteration_filters = [] for position, column in self.col_info: value = query[position] if value is not None: index = self.indexes.get(column.name) if index is None: iteration_filters.append((position, value)) else: set = index.get(value) if set is None: # No rows satisfy this criterion. return () if rids is None: rids = set else: rids = intersection(rids, set) if not rids: # No rows satisfy all criteria. return () if rids is not None: rids = rids.keys() if not iteration_filters: # Indexes did all the work. No need to search each record. return rids # Fallback strategy: Eliminate items one by one. if rids is None: # Use the whole data set. candidates = self.data.values() else: # Use the specified records. candidates = [self.data[rid] for rid in rids] rids = [] append = rids.append for cand in candidates: for position, value in iteration_filters: if cand[position] != value: # Not a match. break else: # A match. append(cand[0]) return rids def __repr__(self): return "<%s(schema=%s)>" % (self.__class__.__name__, repr(self.schema))
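# --- Usage sketch for the Table class above (not part of the original module).
# The "_StubColumn" and "_StubSchema" classes are hypothetical stand-ins for
# whatever schema objects the real module provides; only the attributes Table
# actually reads (get_columns(), .name, .primary, .indexed) are modeled, and
# the module-level imports the class relies on (IOBTree, OOBTree, etc.) are
# assumed to be present.
class _StubColumn(object):
    def __init__(self, name, primary=False, indexed=False):
        self.name = name
        self.primary = primary
        self.indexed = indexed

class _StubSchema(object):
    def __init__(self, columns):
        self._columns = columns
    def get_columns(self):
        return self._columns

def _table_demo():
    schema = _StubSchema([_StubColumn('uid', primary=True),
                          _StubColumn('title', indexed=True)])
    table = Table(schema)
    table.insert({'uid': 1, 'title': 'alpha'})
    table.insert({'uid': 2, 'title': 'beta'})
    # Columns left out of the filter stay None in the tuplified query and are
    # simply ignored when selecting rows to change.
    table.update({'uid': 2}, {'title': 'gamma'})
    return table.delete({'title': 'alpha'})   # -> 1 row removed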
class Lexicon(Persistent, Implicit): """Maps words to word ids and then some The Lexicon object is an attempt to abstract vocabularies out of Text indexes. This abstraction is not totally cooked yet, this module still includes the parser for the 'Text Index Query Language' and a few other hacks. """ # default for older objects stop_syn={} def __init__(self, stop_syn=None,useSplitter=None,extra=None): self.clear() if stop_syn is None: self.stop_syn = {} else: self.stop_syn = stop_syn self.useSplitter = Splitter.splitterNames[0] if useSplitter: self.useSplitter=useSplitter self.splitterParams = extra self.SplitterFunc = Splitter.getSplitter(self.useSplitter) def clear(self): self._lexicon = OIBTree() self._inverseLex = IOBTree() def _convertBTrees(self, threshold=200): if (type(self._lexicon) is OIBTree and type(getattr(self, '_inverseLex', None)) is IOBTree): return from BTrees.convert import convert lexicon=self._lexicon self._lexicon=OIBTree() self._lexicon._p_jar=self._p_jar convert(lexicon, self._lexicon, threshold) try: inverseLex=self._inverseLex self._inverseLex=IOBTree() except AttributeError: # older lexicons didn't have an inverse lexicon self._inverseLex=IOBTree() inverseLex=self._inverseLex self._inverseLex._p_jar=self._p_jar convert(inverseLex, self._inverseLex, threshold) def set_stop_syn(self, stop_syn): """ pass in a mapping of stopwords and synonyms. Format is: {'word' : [syn1, syn2, ..., synx]} Vocabularies do not necesarily need to implement this if their splitters do not support stemming or stoping. """ self.stop_syn = stop_syn def getWordId(self, word): """ return the word id of 'word' """ wid=self._lexicon.get(word, None) if wid is None: wid=self.assignWordId(word) return wid set = getWordId def getWord(self, wid): """ post-2.3.1b2 method, will not work with unconverted lexicons """ return self._inverseLex.get(wid, None) def assignWordId(self, word): """Assigns a new word id to the provided word and returns it.""" # First make sure it's not already in there if self._lexicon.has_key(word): return self._lexicon[word] try: inverse=self._inverseLex except AttributeError: # woops, old lexicom wo wids inverse=self._inverseLex=IOBTree() for word, wid in self._lexicon.items(): inverse[wid]=word wid=randid() while not inverse.insert(wid, word): wid=randid() if isinstance(word,StringType): self._lexicon[intern(word)] = wid else: self._lexicon[word] = wid return wid def get(self, key, default=None): """Return the matched word against the key.""" r=IISet() wid=self._lexicon.get(key, default) if wid is not None: r.insert(wid) return r def __getitem__(self, key): return self.get(key) def __len__(self): return len(self._lexicon) def Splitter(self, astring, words=None, encoding = "latin1"): """ wrap the splitter """ if words is None: words = self.stop_syn try: return self.SplitterFunc( astring, words, encoding=encoding, singlechar=self.splitterParams.splitterSingleChars, indexnumbers=self.splitterParams.splitterIndexNumbers, casefolding=self.splitterParams.splitterCasefolding ) except: return self.SplitterFunc(astring, words) def query_hook(self, q): """ we don't want to modify the query cuz we're dumb """ return q
class LinkCheckTool(SimpleItem): security = ClassSecurityInfo() def __init__(self, id=None): super(LinkCheckTool, self).__init__(id) # This is the work queue; items in this queue are scheduled # for link validity check. self.queue = CompositeQueue() # Additional queue for internal crawler to revalidate the site self.crawl_queue = CompositeQueue() # This is the link database. It maps a hyperlink index to a # tuple (timestamp, status, referers). self.checked = IOBTree() # Indexes self.index = OIBTree() self.links = IOBTree() # This is a counter that allows us to add new hyperlinks and # provide an index quickly. self.counter = 0 security.declarePrivate("is_available") def is_available(self): return hasattr(self, 'index') and \ hasattr(self, 'checked') and \ hasattr(self, 'queue') and \ hasattr(self, 'counter') security.declarePrivate("clear") def clear(self): while True: try: self.queue.pull() except IndexError: break while True: try: self.crawl_queue.pull() except IndexError: break self.checked.clear() self.index.clear() self.links.clear() self.counter = 0 security.declarePrivate("crawl") def crawl(self): self.clear() query = {} registry = getUtility(IRegistry) settings = registry.forInterface(ISettings) if settings.content_types: query['portal_type'] = settings.content_types if settings.workflow_states: query['review_state'] = settings.workflow_states catalog = api.portal.get_tool('portal_catalog') brains = catalog(query) for brain in brains: # asynchronous crawling not working yet # self.crawl_enqueue(brain.UID) obj = brain.getObject() obj.restrictedTraverse('@@linkcheck')() logger.info('Crawling: checked {0}'.format(brain.getURL())) security.declarePrivate("enqueue") def enqueue(self, url): index = self.index.get(url) if index is None: # a really new url index = self.store(url) else: entry = self.checked.get(index) if entry is not None and entry: entry = None, entry[1], entry[2] self.checked[index] = entry else: # reset empty entry self.remove(url) index = self.store(url) self.queue.put(index) return index security.declarePrivate("register") def register(self, hrefs, referer, timestamp): """Add or update link presence information. If a link has not been checked since the provided timestamp, it will be added to the queue (as will links that are not yet in the database). """ referer = self.index.get(referer) or self.store(referer) registry = getUtility(IRegistry, context=self.aq_parent) try: settings = registry.forInterface(ISettings) except KeyError as exc: logger.warn(exc) return limit = settings.referers for href in hrefs: if self.should_ignore(href, settings.ignore_list): continue # If the hyperlink is not already in the work queue, # compare the provided timestamp to our database to see if # we need to check its validity. Note that internal links # are exempt if we're not using the publisher. index = self.index.get(href) entry = self.checked.get(-1 if index is None else index) if index not in self.queue: if entry is None or entry[0] < timestamp: if settings.use_publisher or not href.startswith('/'): index = self.enqueue(href) elif href not in self.index: index = self.store(href) assert index is not None if entry is None: self.checked[index] = None, None, IISet((referer,)) else: # If the provided paths are a subset of the already # seen paths, and if there is no new referer, we don't # issue an update. 
referers = entry[2] if referer not in referers and len(referers) <= limit: referers.add(referer) security.declarePrivate("store") def store(self, url): index = self.index[url] = self.counter self.links[index] = url self.counter += 1 return index security.declarePrivate("remove") def remove(self, url): index = self.index.get(url) if url in self.index: del self.index[url] if index and index in self.checked: del self.checked[index] security.declarePrivate("update") def update(self, href, status): """Update link status.""" now = datetime.datetime.now() timestamp = int(time.mktime(now.timetuple())) index = self.index.get(href) if index is None: return entry = self.checked.get(-1 if index is None else index) if entry is None: self.checked[index] = timestamp, status, IISet() # If the status changed, we update the entry. elif status != entry[1] or not entry[0]: # If the status was previously good, then we clear the # status. What this means is that we'll wait for the next # check to declare a bad status (it might be temporary). if entry[1] == 200: status = None self.checked[index] = timestamp, status, entry[2] @cache(lambda method, self, ignore_list: ignore_list) def get_matchers(self, ignore_list): matchers = [] for expression in ignore_list: try: matcher = re.compile(expression).search except re.error: pass else: matchers.append(matcher) return matchers def should_ignore(self, href, ignore_list): for matcher in self.get_matchers(ignore_list): if matcher(href): return True return False def crawl_enqueue(self, obj): if not isinstance(obj, basestring): obj = obj.UID() self.crawl_queue.put(obj) def crawl_dequeue(self): if self.crawl_queue._data: return self.crawl_queue.pull()
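# --- Self-contained sketch (not from the original source) of the bookkeeping
# pattern LinkCheckTool uses: a plain counter hands out integer link ids, an
# OIBTree maps url -> id, an IOBTree maps id -> url, and a second IOBTree maps
# id -> (timestamp, status, referer set).  The class and method names below
# are illustrative only.
from BTrees.IIBTree import IISet
from BTrees.IOBTree import IOBTree
from BTrees.OIBTree import OIBTree

class _LinkDB(object):
    def __init__(self):
        self.index = OIBTree()    # url -> link id
        self.links = IOBTree()    # link id -> url
        self.checked = IOBTree()  # link id -> (timestamp, status, referers)
        self.counter = 0

    def store(self, url):
        # Mirrors LinkCheckTool.store(): two mirrored mappings plus a counter.
        index = self.index[url] = self.counter
        self.links[index] = url
        self.counter += 1
        return index

    def record_check(self, url, timestamp, status, referer):
        index = self.index.get(url)
        if index is None:
            index = self.store(url)
        entry = self.checked.get(index)
        referers = entry[2] if entry is not None else IISet()
        referers.insert(referer)
        self.checked[index] = timestamp, status, referers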
class Lexicon(Persistent): _v_nextid = None _wid_length_based = True # Flag to distinguish new and old lexica def __init__(self, *pipeline): self.clear() self._pipeline = pipeline def clear(self): """Empty the lexicon. """ self.length = Length() self._wid_length_based = False self._wids = OIBTree() # word -> wid self._words = IOBTree() # wid -> word # wid 0 is reserved for words that aren't in the lexicon (OOV -- out # of vocabulary). This can happen, e.g., if a query contains a word # we never saw before, and that isn't a known stopword (or otherwise # filtered out). Returning a special wid value for OOV words is a # way to let clients know when an OOV word appears. def length(self): """Return the number of unique terms in the lexicon. """ # Overridden in instances with a BTrees.Length.Length raise NotImplementedError def words(self): return self._wids.keys() def wids(self): return self._words.keys() def items(self): return self._wids.items() def sourceToWordIds(self, text): last = _text2list(text) for element in self._pipeline: last = element.process(last) return list(map(self._getWordIdCreate, last)) def termToWordIds(self, text): last = _text2list(text) for element in self._pipeline: process = getattr(element, "process_post_glob", element.process) last = process(last) wids = [] for word in last: wids.append(self._wids.get(word, 0)) return wids def parseTerms(self, text): last = _text2list(text) for element in self._pipeline: process = getattr(element, "processGlob", element.process) last = process(last) return last def isGlob(self, word): return "*" in word or "?" in word def get_word(self, wid): return self._words[wid] def get_wid(self, word): return self._wids.get(word, 0) def globToWordIds(self, pattern): # Implement * and ? just as in the shell, except the pattern # must not start with either of these prefix = "" while pattern and pattern[0] not in "*?": prefix += pattern[0] pattern = pattern[1:] if not pattern: # There were no globbing characters in the pattern wid = self._wids.get(prefix, 0) if wid: return [wid] else: return [] if not prefix: # The pattern starts with a globbing character. # This is too inefficient, so we raise an exception. raise QueryError( "pattern %r shouldn't start with glob character" % pattern) pat = prefix for c in pattern: if c == "*": pat += ".*" elif c == "?": pat += "." else: pat += re.escape(c) pat += "$" prog = re.compile(pat) keys = self._wids.keys(prefix) # Keys starting at prefix wids = [] for key in keys: if not key.startswith(prefix): break if prog.match(key): wids.append(self._wids[key]) return wids def _getWordIdCreate(self, word): wid = self._wids.get(word) if wid is None: # WidCode requires us to use at least 0x4000 as a base number. # The algorithm in versions before 2.13 used the length as a base # number. So we don't even try to generate numbers below the # length as they are likely all taken minimum = 0x4000 if self._wid_length_based: minimum = max(self.length(), 0x4000) while True: if self._v_nextid is None: self._v_nextid = randrange(minimum, 0x10000000) wid = self._v_nextid self._v_nextid += 1 if wid not in self._words: break self._v_nextid = None self.length.change(1) self._wids[word] = wid self._words[wid] = word return wid
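# --- Hypothetical pipeline element for the Lexicon above (not part of the
# original module).  The Lexicon only requires each pipeline element to expose
# process(), taking and returning a list of words; the query-time variants
# (process_post_glob / processGlob) are optional and fall back to process().
# The commented usage assumes the module-level _text2list() helper wraps a
# plain string into a one-element list, which is not shown in this excerpt.
import re

class _SimpleSplitter(object):
    _rx = re.compile(r"[\w*?]+", re.UNICODE)

    def process(self, texts):
        words = []
        for text in texts:
            words.extend(self._rx.findall(text.lower()))
        return words

# lexicon = Lexicon(_SimpleSplitter())
# wids = lexicon.sourceToWordIds("Some text to index")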
class IntegerRangesIndex(SimpleItem): """ Index a set of integer ranges: [(1,2), (12,23), (12, 22)] """ implements(IPluggableIndex) meta_type = 'IntegerRangesIndex' def __init__(self, id, caller=None, extra=None): self.id = id self.caller = caller self.clear() self.__genid = 0 def __len__(self): return self._length() def getId(self): """Return Id of index.""" return self.id def clear(self): """Empty the index""" IOBTree = BTrees.family64.IO.BTree self._index = IOBTree() # {rangeid: [document_id, ...]} self._unindex = IOBTree() # {document_id: [rangeid, ...]} self._range_mapping = IOBTree() # {rangeid: range} self._reverse_range_mapping = OIBTree() # {range: rangeid} self._since_index = IOBTree() # {since: [rangeid,...]} self._until_index = IOBTree() # {until: [rangeid,...]} self._length = BTrees.Length.Length() self._unique_values_length = BTrees.Length.Length() def __get_range_id(self, range_): return self._reverse_range_mapping.get(range_, None) def __get_range(self, range_id): return self._range_mapping.get(range_id, None) def __index_range(self, range_): """ index range if needed and return the rangeid """ range_id = self.__get_range_id(range_) if range_id is None: range_id = self.genid() # index range self._unique_values_length.change(1) self._range_mapping[range_id] = range_ self._reverse_range_mapping[range_] = range_id # index range boundaries since, until = range_ self.__insert_in_index_set(self._since_index, since, range_id) self.__insert_in_index_set(self._until_index, until, range_id) return range_id def __unindex_range(self, range_id): range_ = self.__get_range(range_id) if range_ is None: return None since, until = range_ self.__remove_in_index_set(self._since_index, since, range_id) self.__remove_in_index_set(self._until_index, until, range_id) self._unique_values_length.change(-1) del self._range_mapping[range_id] del self._reverse_range_mapping[range_] return range_ def genid(self): self.__genid += 1 return self.__genid def getEntryForObject(self, document_id, default=_marker): """Get all information contained for 'document_id'.""" if default is _marker: return self._unindex.get(document_id) else: return self._unindex.get(document_id, default) def getIndexSourceNames(self): """Get a sequence of attribute names that are indexed by the index. """ return [self.id] def index_object(self, document_id, obj, threshold=None): """Index an object. 'document_id' is the integer ID of the document. 'obj' is the object to be indexed. 'threshold' is the number of words to process between committing subtransactions. If None, subtransactions are disabled. """ new_ranges = self._get_object_data(obj, self.id) if new_ranges: new_set = IISet(map(self.__index_range, new_ranges)) else: new_set = IISet() old_set = self._unindex.get(document_id, IISet()) new_entries = difference(new_set, old_set) expired_entries = difference(old_set, new_set) if not (new_entries or expired_entries): # nothing to do, bail out ! 
return 0 for expired_entry in expired_entries: self.__remove_in_index_set(self._unindex, document_id, expired_entry) if self.__remove_in_index_set(self._index, expired_entry, \ document_id): # range is not used anymore, retire it self.__unindex_range(expired_entry) for new_entry in new_entries: if self.__insert_in_index_set(self._unindex, document_id, new_entry): self._length.change(1) self.__insert_in_index_set(self._index, new_entry, document_id) return 1 def unindex_object(self, document_id): """Remove the document_id from the index.""" entries = self._unindex.get(document_id, _marker) if entries is _marker: return if isinstance(entries, int): entries = [entries] for expired_entry in entries: if self.__remove_in_index_set(self._index, expired_entry, \ document_id): # range is not used anymore, retire it self.__unindex_range(expired_entry) self._length.change(-1) del self._unindex[document_id] def __insert_in_index_set(self, index, key, value, set_type=IISet): """ Insert value in the index. If the key was not present and the index row was created it returns True """ index_row = index.get(key, _marker) if index_row is _marker: index[key] = value return True if isinstance(index_row, set_type): index_row.insert(value) return False # it was an int index[key] = set_type((index_row, value,)) return False def __remove_in_index_set(self, index, key, value, set_type=IISet): """ remove the value in the index, index row is a Set It returns true if the index row as been removed (The set was empty) """ index_row = index.get(key, _marker) if index_row is _marker: return True if isinstance(index_row, IISet): index_row.remove(value) if len(index_row) == 0: del index[key] return True if len(index_row) == 1: index[key] = index_row[0] return False del index[key] return True def _apply_index(self, request): record = parseIndexRequest(request, self.id) try: qstart, qend = record.keys except TypeError: return None minint = BTrees.family64.minint maxint = BTrees.family64.maxint qstart = min(maxint, max(minint, qstart)) qend = max(minint, min(maxint, qend)) # start in inside range start = multiunion(self._since_index.values(max=qstart)) end = multiunion(self._until_index.values(min=qstart)) start_into = intersection(start, end) # end inside range start = multiunion(self._since_index.values(max=qend)) end = multiunion(self._until_index.values(min=qend)) end_into = intersection(start, end) # start before range and end after range start = multiunion(self._since_index.values(min=qstart)) end = multiunion(self._until_index.values(max=qend)) start_before_end_after = intersection(start, end) result = union(start_into, end_into) result = union(result, start_before_end_after) return multiunion(map(self._index.__getitem__, result)), (self.id,) def numObjects(self): """Return the number of indexed objects""" return self._length() def indexSize(self): """Return the size of the index in terms of distinct values""" return self._unique_values_length() def _get_object_data(self, obj, attr): # self.id is the name of the index, which is also the name of the # attribute we're interested in. If the attribute is callable, # we'll do so. try: datum = getattr(obj, attr) if safe_callable(datum): datum = datum() except AttributeError: datum = _marker return datum
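# --- Self-contained sketch (not from the original source) of the overlap test
# used by _apply_index above: a stored range (since, until) overlaps the query
# (qstart, qend) when it contains qstart, contains qend, or lies entirely
# inside the query.  The two inputs are assumed to map boundary values to
# IISets of range ids (e.g. IOBTree instances); the original index uses the
# 64-bit BTrees family instead.
from BTrees.IIBTree import intersection, multiunion, union

def _overlapping_range_ids(since_index, until_index, qstart, qend):
    contains_start = intersection(
        multiunion(since_index.values(max=qstart)),   # since <= qstart
        multiunion(until_index.values(min=qstart)))   # until >= qstart
    contains_end = intersection(
        multiunion(since_index.values(max=qend)),     # since <= qend
        multiunion(until_index.values(min=qend)))     # until >= qend
    inside_query = intersection(
        multiunion(since_index.values(min=qstart)),   # since >= qstart
        multiunion(until_index.values(max=qend)))     # until <= qend
    return union(union(contains_start, contains_end), inside_query)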
class PreferenceTool(BaseTool): """ PreferenceTool manages User Preferences / User profiles. TODO: - make the preference tool an action provider (templates) """ id = 'portal_preferences' meta_type = 'ERP5 Preference Tool' portal_type = 'Preference Tool' title = 'Preferences' allowed_types = ('ERP5 Preference', ) security = ClassSecurityInfo() aq_preference_generated = False security.declareProtected(Permissions.ManagePortal, 'manage_overview') manage_overview = DTMLFile('explainPreferenceTool', _dtmldir) security.declarePrivate('manage_afterAdd') def manage_afterAdd(self, item, container): """ init the permissions right after creation """ item.manage_permission(Permissions.AddPortalContent, ['Member', 'Author', 'Manager']) item.manage_permission(Permissions.AddPortalFolders, ['Member', 'Author', 'Manager']) item.manage_permission(Permissions.View, ['Member', 'Auditor', 'Manager']) item.manage_permission(Permissions.CopyOrMove, ['Member', 'Auditor', 'Manager']) item.manage_permission(Permissions.ManageProperties, ['Manager'], acquire=0) item.manage_permission(Permissions.SetOwnPassword, ['Member', 'Author', 'Manager']) BaseTool.inheritedAttribute('manage_afterAdd')(self, item, container) security.declarePublic('getPreference') def getPreference(self, pref_name, default=_marker): """ get the preference on the most appopriate Preference object. """ method = getattr(self, 'get%s' % convertToUpperCase(pref_name), None) if method is not None: return method(default) if default is _marker: return None return default security.declareProtected(Permissions.ModifyPortalContent, "setPreference") def setPreference(self, pref_name, value): """ set the preference on the active Preference object""" self.getActivePreference()._edit(**{pref_name: value}) def _getSortedPreferenceList(self, sql_catalog_id=None): """ return the most appropriate preferences objects, sorted so that the first in the list should be applied first """ tv = getTransactionalVariable() security_manager = getSecurityManager() user = security_manager.getUser() acl_users = self.getPortalObject().acl_users try: # reset a security manager without any proxy role or unrestricted method, # wich affects the catalog search that we do to find applicable # preferences. actual_user = acl_users.getUserById(user.getId()) if actual_user is not None: newSecurityManager(None, actual_user.__of__(acl_users)) tv_key = 'PreferenceTool._getSortedPreferenceList/%s/%s' % ( user.getId(), sql_catalog_id) if tv.get(tv_key, None) is None: prefs = [] # XXX will also cause problems with Manager (too long) # XXX For manager, create a manager specific preference # or better solution user_is_manager = 'Manager' in user.getRolesInContext(self) for pref in self.searchFolder(portal_type='Preference', sql_catalog_id=sql_catalog_id): pref = pref.getObject() # XXX quick workaround so that managers only see user preference # they actually own. 
if pref is not None and ( not user_is_manager or pref.getPriority() != Priority.USER or pref.getOwnerTuple()[1] == user.getId()): if pref.getProperty('preference_state', 'broken') in ('enabled', 'global'): prefs.append(pref) prefs.sort(key=lambda x: x.getPriority(), reverse=True) # add system preferences before user preferences sys_prefs = [x.getObject() for x in self.searchFolder(portal_type='System Preference', sql_catalog_id=sql_catalog_id) \ if x.getObject().getProperty('preference_state', 'broken') in ('enabled', 'global')] sys_prefs.sort(key=lambda x: x.getPriority(), reverse=True) preference_list = sys_prefs + prefs tv[tv_key] = preference_list return tv[tv_key] finally: setSecurityManager(security_manager) def _getActivePreferenceByPortalType(self, portal_type): enabled_prefs = self._getSortedPreferenceList() if len(enabled_prefs) > 0: try: return [ x for x in enabled_prefs if x.getPortalType() == portal_type ][0] except IndexError: pass return None security.declareProtected(Permissions.View, 'getActivePreference') def getActivePreference(self): """ returns the current preference for the user. Note that this preference may be read only. """ return self._getActivePreferenceByPortalType('Preference') security.declareProtected(Permissions.View, 'clearCache') def clearCache(self, preference): """ clear cache when a preference is modified. This is called by an interaction workflow on preferences. """ self._getCacheId() # initialize _preference_cache if needed. if preference.getPriority() == Priority.USER: user_id = getSecurityManager().getUser().getId() self._preference_cache[user_id] = \ self._preference_cache.get(user_id, 0) + 1 self._preference_cache[None] = self._preference_cache.get(None, 0) + 1 def _getCacheId(self): """Return a cache id for preferences. We use: - user_id: because preferences are always different by user - self._preference_cache[user_id] which is increased everytime a user preference is modified - self._preference_cache[None] which is increased everytime a global preference is modified """ user_id = getSecurityManager().getUser().getId() try: self._preference_cache except AttributeError: self._preference_cache = OIBTree() return self._preference_cache.get(None), self._preference_cache.get( user_id), user_id security.declareProtected(Permissions.View, 'getActiveUserPreference') def getActiveUserPreference(self): """ returns the current user preference for the user. If no preference exists, then try to create one with `createUserPreference` type based method. This method returns a preference that the user will be able to edit or None, if `createUserPreference` refused to create a preference. It is intendended for "click here to edit your preferences" actions. """ active_preference = self.getActivePreference() if active_preference is None or active_preference.getPriority( ) != Priority.USER: # If user does not have a preference, let's try to create one user = self.getPortalObject( ).portal_membership.getAuthenticatedMember().getUserValue() if user is not None: createUserPreference = user.getTypeBasedMethod( 'createUserPreference') if createUserPreference is not None: active_preference = createUserPreference() return active_preference security.declareProtected(Permissions.View, 'getActiveSystemPreference') def getActiveSystemPreference(self): """ returns the current system preference for the user. Note that this preference may be read only. 
""" return self._getActivePreferenceByPortalType('System Preference') security.declareProtected(Permissions.View, 'getDocumentTemplateList') def getDocumentTemplateList(self, folder=None): """ returns all document templates that are in acceptable Preferences based on different criteria such as folder, portal_type, etc. """ if folder is None: # as the preference tool is also a Folder, this method is called by # page templates to get the list of document templates for self. folder = self # We must set the user_id as a parameter to make sure each # user can get a different cache def _getDocumentTemplateList(user_id, portal_type=None): acceptable_template_list = [] for pref in self._getSortedPreferenceList(): for doc in pref.contentValues(portal_type=portal_type): acceptable_template_list.append(doc.getRelativeUrl()) return acceptable_template_list _getDocumentTemplateList = CachingMethod( _getDocumentTemplateList, 'portal_preferences.getDocumentTemplateList.{}'.format( self._getCacheId()), cache_factory='erp5_ui_long') allowed_content_types = [ pti.id for pti in folder.allowedContentTypes() ] user_id = getToolByName( self, 'portal_membership').getAuthenticatedMember().getId() template_list = [] for portal_type in allowed_content_types: for template_url in _getDocumentTemplateList( user_id, portal_type=portal_type): template = self.restrictedTraverse(template_url, None) if template is not None: template_list.append(template) return template_list security.declareProtected(Permissions.ManagePortal, 'createActiveSystemPreference') def createActiveSystemPreference(self): """ Create a System Preference and enable it if there is no other enabled System Preference in present. """ if self.getActiveSystemPreference() is not None: raise ValueError("Another Active Preference already exists.") system_preference = self.newContent(portal_type='System Preference') system_preference.enable() security.declareProtected(Permissions.ManagePortal, 'createPreferenceForUser') def createPreferenceForUser(self, user_id, enable=True): """Creates a preference for a given user, and optionnally enable the preference. """ user_folder = self.acl_users user = user_folder.getUserById(user_id) if user is None: raise ValueError("User %r not found" % (user_id, )) security_manager = getSecurityManager() try: newSecurityManager(None, user.__of__(user_folder)) preference = self.newContent(portal_type='Preference') if enable: preference.enable() return preference finally: setSecurityManager(security_manager) security.declarePublic('isAuthenticationPolicyEnabled') def isAuthenticationPolicyEnabled(self): """ Return True if authentication policy is enabled. This method exists here due to bootstrap issues. It should work even if erp5_authentication_policy bt5 is not installed. """ # isPreferredAuthenticationPolicyEnabled exisss if property sheets from # erp5_authentication_policy are installed. method = getattr(self, 'isPreferredAuthenticationPolicyEnabled', None) if method is not None and method(): return True # if it does not exist, for sure authentication policy is not enabled. return False
def index_object(self, documentId, obj, threshold=None): """ Index an object: 'documentId' is the integer id of the document 'obj' is the object to be indexed 'threshold' is the number of words to process between commiting subtransactions. If 'None' subtransactions are disabled. """ # sniff the object for our 'id', the 'document source' of the # index is this attribute. If it smells callable, call it. try: source = getattr(obj, self.id) if safe_callable(source): source = source() if not isinstance(source, UnicodeType): source = str(source) except (AttributeError, TypeError): return 0 # sniff the object for 'id'+'_encoding' try: encoding = getattr(obj, self.id+'_encoding') if safe_callable(encoding ): encoding = str(encoding()) else: encoding = str(encoding) except (AttributeError, TypeError): encoding = 'latin1' lexicon = self.getLexicon() splitter = lexicon.Splitter wordScores = OIBTree() last = None # Run through the words and score them for word in list(splitter(source,encoding=encoding)): if word[0] == '\"': last = self._subindex(word[1:-1], wordScores, last, splitter) else: if word==last: continue last=word wordScores[word]=wordScores.get(word,0)+1 # Convert scores to use wids: widScores=IIBucket() getWid=lexicon.getWordId for word, score in wordScores.items(): widScores[getWid(word)]=score del wordScores currentWids=IISet(self._unindex.get(documentId, [])) # Get rid of document words that are no longer indexed self.unindex_objectWids(documentId, difference(currentWids, widScores)) # Now index the words. Note that the new xIBTrees are clever # enough to do nothing when there isn't a change. Woo hoo. insert=self.insertForwardIndexEntry for wid, score in widScores.items(): insert(wid, documentId, score) # Save the unindexing info if it's changed: wids=widScores.keys() if wids != currentWids.keys(): self._unindex[documentId]=wids return len(wids)
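# --- Minimal sketch (not from the original source) of the scoring step in
# index_object above: count word occurrences, then translate words to wids
# through a lexicon-like word -> wid mapping, producing an IIBucket suitable
# for the forward index.  The real method additionally collapses immediate
# repeats and routes quoted phrases through _subindex, which is omitted here.
from BTrees.IIBTree import IIBucket
from BTrees.OIBTree import OIBTree

def _score_words(words, word_to_wid):
    wordScores = OIBTree()
    for word in words:
        wordScores[word] = wordScores.get(word, 0) + 1
    widScores = IIBucket()
    for word, score in wordScores.items():
        widScores[word_to_wid[word]] = score
    return widScores

# _score_words(['to', 'be', 'or', 'not', 'to', 'be'],
#              {'to': 1, 'be': 2, 'or': 3, 'not': 4})
# -> IIBucket({1: 2, 2: 2, 3: 1, 4: 1})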
class Path(String): root = None # root as passed to Catalog() path2rid = None # OIBTree mapping path to rid (one:one) rid2path = None # IOBTree mapping rid to path (one:one) parts = None # OOBTree mapping (level, part) to rids (one:many) levels = None # IOBTree mapping level to a list of rids (one:many) case_sensitive = None sorted = None # OOBTree for sorting; inherited from Path def __init__(self, root, case_sensitive=None): # Root # ==== if not isinstance(root, basestring): raise TypeError("root is not a string: '%s'" % root) elif not isdir(root): raise ValueError("root doesn't point to a directory: '%s'" % root) self.root = root.rstrip(os.sep) # Case Sensitivity # ================ if case_sensitive is None: if 'win' in sys.platform: case_sensitive = False else: case_sensitive = True if case_sensitive not in (False, True, 0, 1): raise TypeError( "case_sensitive isn't a boolean: " + "'%s'" % case_sensitive ) self.case_sensitive = bool(case_sensitive) self.reset() # Index contract # ============== __name__ = 'Path' # used in command-line interface def reset(self): """Forget everything; usually called from __init__. """ String.reset(self) self.path2rid = OIBTree() # {path:rid} self.rid2path = IOBTree() # {rid:path} self.parts = OOBTree() # {(level,part):rids} self.rids = IOBTree() # {rid:(level,part)s} self.levels = IOBTree() # {level:rids} def learn(self, rid, value): """Given an rid and a value, associate them. """ String.learn(self, rid, value) # Parse and validate. # =================== # Value is an absolute path, rooted in self.root. if not isinstance(value, basestring): raise TypeError("string expected") elif value and not value.startswith(os.sep): raise ValueError("path not specified absolutely: '%s'" % value) if self.case_sensitive: path = value else: path = value.lower() path = path.rstrip(os.sep) # safety net; should never need this parts = value.split(os.sep) #parts = value.split(os.sep)[1:] # Add to simple identity indices. # =============================== self.path2rid[path] = rid self.rid2path[rid] = path # Add to complex level/part indices. # ================================== for level in range(len(parts)): token_ = (level, parts[level]) # Add to (one:many) mapping of (level,part) to [rids]. # ==================================================== if token_ not in self.parts: self.parts[token_] = IITreeSet([rid]) else: self.parts[token_].insert(rid) # Add to the (one:many) mapping of rid to (level,part)s. # ====================================================== # This exists so we know how to forget about this rid when the time # comes. if rid not in self.rids: self.rids[rid] = OOSet([token_]) else: self.rids[rid].insert(token_) # Add to (one:many) mapping of levels to rids. # ============================================ # This is used to implement level limits. if level not in self.levels: self.levels[level] = IITreeSet([rid]) else: self.levels[level].insert(rid) def forget(self, rid): """Given an rid, remove it from all indices. """ String.forget(self, rid) # Remove from the (one:many) mapping of (level, part) to rids. # ============================================================ # We also track the level here and remove the rid from the (one:many) # mapping of levels to rids. 
level = -1 for token_ in self.rids[rid]: if token_[0] > level: level = token_[0] self.parts[token_].remove(rid) if len(self.parts[token_]) == 0: del self.parts[token_] self.levels[level].remove(rid) if len(self.levels[level]) == 0: del self.levels[level] # Remove from the (one:many) mapping of rid to tokens. # ==================================================== del self.rids[rid] # Remove from simple identity indices. # ==================================== path = self.rid2path[rid] del self.path2rid[path] del self.rid2path[rid] # Searches # ======== def above(self, arg): """Find all resources at or above path, within the limits given. Here we actually call below() on <path> and all of its ancestors, passing the limits straight through, with the exception that limits default to 0:1 rather than None:None. Use '0:' for the latter. """ # Parse and validate. # =================== path, upper, lower = self._path_and_limits(arg) rid = self.path2rid.get(path, None) if rid is None: return # Build # ===== tmpl = "%s " if (upper, lower) == (None, None): tmpl += '0:1' # default: breadcrumbs else: if upper is not None: tmpl += str(upper) tmpl += ":" if lower is not None: tmpl += str(lower) parts = path.split(os.sep) rids = [] for level in range(len(parts)): ancestor = os.sep.join(parts[:level+1]) ancestor = ancestor and ancestor or '/' rids.append(self.below(tmpl % ancestor)) rids = multiunion(rids) return rids def below(self, arg): """Find all resources at or below path, within the limits given. """ # Parse and validate. # =================== path, upper, lower = self._path_and_limits(arg) rid = self.path2rid.get(path, None) if rid is None: return # Build # ===== parts = path.split(os.sep) rids = None for level in range(len(parts)): rids = intersection(rids, self.parts[(level, parts[level])]) if rids is None: return IISet() # short-cut # Limits # ====== # Remove rids that are above any upper limit, and then only include rids # that are above any lower limit. Limits are relative to the level of # the requested path. if upper is not None: upper += level for i in range(level, upper): if i not in self.levels: break rids = difference(rids, self.levels[i]) if lower is not None: lower += level _rids = [] for i in range(level, lower): if i not in self.levels: break _rids.append(self.levels[i]) rids = intersection(rids, multiunion(_rids)) return rids def is_(self, arg): """Return the rid corresponding to a single path. Root is special-cased. """ path, foo, bar = self._path_and_limits(arg) return self.path2rid.get(path, None) # Parser # ====== def _path_and_limits(self, arg): """Given an argument from a Collection constraint, return three params. Arg is of the form: /some/path 0:4 The first token is the path, the second is a limits specification. The path must not contain a space (@@: really should support that). The limits spec is optional; if given, it must have a colon and at least one end specified. To the left of the colon is the upper bound; to the right is the lower bound. These bounds specify the tree levels that the path filter should apply to, but the specifics of how it applies depend on the searches above. (Yes this nomenclature is all wacky. The root is conceptually 'higher' for some reason, even though the root is 0 and a real tree's roots are lower than its branches. Go figure.) 
""" path = '' upper = None lower = None parts = arg.split() nparts = len(parts) assert nparts in (1, 2), "either need path or path and limits" # Path # ==== if nparts == 1: path = parts[0] elif nparts == 2: path = parts[0] # Limits # ====== limits = parts[1] if not limits.count(':') == 1: raise ValueError("malformed limits (no colon): '%s'" % limits) upper, lower = limits.split(':') #if not (upper + lower): # raise ValueError("no limits given: '%s'" % limits) if not upper: upper = None else: if not upper.isdigit(): raise ValueError("bad upper limit: '%s'" % upper) upper = int(upper) if not lower: lower = None else: if not lower.isdigit(): raise ValueError("bad lower limit: '%s'" % lower) lower = int(lower) if None not in (upper, lower): if upper > lower: raise ValueError( "upper limit greater than lower: " + "%d > %d" % (upper, lower) ) if path == os.sep: path = '' if not self.case_sensitive: path = path.lower() return path, upper, lower
class ExtendedPathIndex(PathIndex): """A path index stores all path components of the physical path of an object. Internal datastructure (regular pathindex): - a physical path of an object is split into its components - every component is kept as a key of a OOBTree in self._indexes - the value is a mapping 'level of the path component' to 'all docids with this path component on this level' In addition - there is a terminator (None) signifying the last component in the path - 2 additional indexes map absolute path to either the doc id or doc ids of contained objects. This allows for rapid answering of common queries. """ meta_type = "ExtendedPathIndex" manage_options = ( {'label': 'Settings', 'action': 'manage_main'}, ) indexed_attrs = None multi_valued = False query_options = ("query", "level", "operator", "depth", "navtree", "navtree_start") def __init__(self, id, extra=None, caller=None): """ ExtendedPathIndex supports indexed_attrs """ PathIndex.__init__(self, id, caller) if isinstance(extra, dict): attrs = extra.get('indexed_attrs', None) self.multi_valued = extra.get('multi_valued', False) else: attrs = getattr(extra, 'indexed_attrs', None) self.multi_valued = getattr(extra, 'multi_valued', False) if attrs is None: return if isinstance(attrs, str): attrs = attrs.split(',') attrs = [a.strip() for a in attrs] attrs = [a for a in attrs if a] if attrs: # We only index the first attribute so snip off the rest self.indexed_attrs = tuple(attrs[:1]) def clear(self): PathIndex.clear(self) self._index_parents = OOBTree() self._index_items = OIBTree() def index_object(self, docid, obj, threshold=100): """ hook for (Z)Catalog """ # PathIndex first checks for an attribute matching its id and # falls back to getPhysicalPath only when failing to get one. # If self.indexed_attrs is not None, it's value overrides this behavior attrs = self.indexed_attrs index = attrs is None and self.id or attrs[0] path = getattr(obj, index, None) if path is not None: if safe_callable(path): path = path() if not isinstance(path, (str, tuple)): raise TypeError('path value must be string or tuple ' 'of strings: (%r, %s)' % (index, repr(path))) else: try: path = obj.getPhysicalPath() except AttributeError: return 0 if isinstance(path, (list, tuple)): path = '/' + '/'.join(path[1:]) comps = [p for p in path.split('/') if p] # Make sure we reindex properly when path change old_path = self._unindex.get(docid, _marker) if old_path is not _marker: if old_path != path: self.unindex_object(docid, _old=old_path) # unindex reduces length, we need to counter that self._length.change(1) else: # We only get a new entry if the value wasn't there before. 
# If it already existed the length is unchanged self._length.change(1) for i, comp in enumerate(comps): self.insertEntry(comp, docid, i) # Add terminator self.insertEntry(None, docid, len(comps) - 1) # Add full-path indexes, to optimize certain edge cases parent_path = '/' + '/'.join(comps[:-1]) parents = self._index_parents.get(parent_path, _marker) if parents is _marker: self._index_parents[parent_path] = parents = IITreeSet() parents.insert(docid) self._index_items[path] = docid self._unindex[docid] = path return 1 def unindex_object(self, docid, _old=_marker): """ hook for (Z)Catalog """ if _old is not _marker: old_value = _old else: old_value = self._unindex.get(docid, _marker) if old_value is _marker: logger.log(logging.INFO, 'Attempt to unindex nonexistent object with id ' '%s' % docid) return # There is an assumption that paths start with / comps = [p for p in old_value.split('/') if p] def unindex(comp, level, docid=docid): index_comp = self._index[comp] index_comp[level].remove(docid) if not index_comp[level]: del index_comp[level] if not index_comp: del self._index[comp] try: for level, comp in enumerate(comps): unindex(comp, level) # Remove the terminator unindex(None, len(comps) - 1) # Remove full-path indexes parent_path = '/' + '/'.join(comps[:-1]) parents = self._index_parents.get(parent_path, _marker) if parents is not _marker: parents.remove(docid) if not parents: del self._index_parents[parent_path] del self._index_items['/'.join([parent_path, comps[-1]])] except KeyError: logger.log(logging.INFO, 'Attempt to unindex object with id ' '%s failed' % docid) self._length.change(-1) del self._unindex[docid] def search(self, path, default_level=0, depth=-1, navtree=0, navtree_start=0, resultset=None): """ path is either a string representing a relative URL or a part of a relative URL or a tuple (path, level). default_level specifies the level to use when no more specific level has been passed in with the path. level >= 0 starts searching at the given level level < 0 finds matches at *any* level depth let's you limit the results to items at most depth levels deeper than the matched path. depth == 0 means no subitems are included at all, with depth == 1 only direct children are included, etc. depth == -1, the default, returns all children at any depth. navtree is treated as a boolean; if it evaluates to True, not only the query match is returned, but also each container in the path. If depth is greater than 0, also all siblings of those containers, as well as the siblings of the match are included as well, plus *all* documents at the starting level. navtree_start limits what containers are included in a navtree search. If greater than 0, only containers (and possibly their siblings) at that level and up will be included in the resultset. 
""" if isinstance(path, string_types): level = default_level else: level = int(path[1]) path = path[0] if level < 0: # Search at every level, return the union of all results return multiunion( [self.search(path, level, depth, navtree, navtree_start) for level in range(self._depth + 1)]) comps = [p for p in path.split('/') if p] if navtree and depth == -1: # Navtrees don't do recursive depth = 1 # Optimizations pathlength = level + len(comps) - 1 if navtree and navtree_start > min(pathlength + depth, self._depth): # This navtree_start excludes all items that match the depth return IISet() if level == 0 and depth in (0, 1): # We have easy indexes for absolute paths where # we are looking for depth 0 or 1 result sets if navtree: # Optimized absolute path navtree and breadcrumbs cases result = [] add = lambda x: x is not None and result.append(x) if depth == 1 and not self.multi_valued: # Navtree case, all sibling elements along the path convert = multiunion index = self._index_parents else: # Breadcrumbs case, all direct elements along the path convert = IISet index = self._index_items # Collect all results along the path for i in range(len(comps), navtree_start - 1, -1): parent_path = '/' + '/'.join(comps[:i]) add(index.get(parent_path)) return convert(result) if not path.startswith('/'): path = '/' + path if depth == 0 and not self.multi_valued: # Specific object search res = self._index_items.get(path) return res and IISet([res]) or IISet() else: # Single depth search return self._index_parents.get(path, IISet()) # Avoid using the root set # as it is common for all objects anyway and add overhead # There is an assumption about all indexed values having the # same common base path if level == 0: indexpath = [p for p in self.getPhysicalPath() if p] minlength = min(len(indexpath), len(comps)) # Truncate path to first different element for i in range(minlength): if indexpath[i] != comps[i]: break level += 1 comps = comps[level:] if not comps and depth == -1: # Recursive search for everything return IISet(self._unindex) # Core application of the indexes pathset = None depthset = None # For limiting depth if navtree and depth > 0: # Include the elements up to the matching path depthset = multiunion([ self._index.get(None, {}).get(i, IISet()) for i in range(min(navtree_start, level), max(navtree_start, level) + 1)]) indexedcomps = enumerate(comps) if not navtree: # Optimize relative-path searches by starting with the # presumed smaller sets at the end of the path first # We can't do this for the navtree case because it needs # the bigger rootset to include siblings along the way. 
indexedcomps = list(indexedcomps) indexedcomps.reverse() for i, comp in indexedcomps: # Find all paths that have comp at the given level res = self._index.get(comp, {}).get(i + level) if res is None: # Non-existing path; navtree is inverse, keep going pathset = IISet() if not navtree: return pathset pathset = intersection(pathset, res) if navtree and i + level >= navtree_start: depthset = union(depthset, intersection(pathset, self._index.get(None, {}).get(i + level))) if depth >= 0: # Limit results to those that terminate within depth levels start = len(comps) - 1 if navtree: start = max(start, (navtree_start - level)) depthset = [depthset] + [ intersection(pathset, self._index.get(None, {}).get(i + level)) for i in range(start, start + depth + 1) ] depthset = multiunion([d for d in depthset if d]) if navtree or depth >= 0: return depthset return pathset def _apply_index(self, request, resultset=None): """ hook for (Z)Catalog 'request' -- mapping type (usually {"path": "..."}) additionally a parameter "path_level" might be passed to specify the level (see search()) """ record = IndexQuery(request, self.id, self.query_options) if record.keys is None: return None return (self.query_index(record), (self.id, )) def query_index(self, record, resultset=None): level = record.get("level", 0) operator = record.get('operator', self.useOperator).lower() depth = getattr(record, 'depth', -1) # use getattr to get 0 value navtree = record.get('navtree', 0) navtree_start = record.get('navtree_start', 0) # depending on the operator we use intersection or union if operator == "or": set_func = union else: set_func = intersection result = None for k in record.keys: rows = self.search(k, level, depth, navtree, navtree_start, resultset=resultset) result = set_func(result, rows) if result: return result return IISet() def getIndexSourceNames(self): """ return names of indexed attributes """ attrs = self.indexed_attrs or ('getPhysicalPath', ) return tuple(attrs)
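# --- Sketch (not part of the original index) of the two auxiliary indexes
# ExtendedPathIndex maintains on top of the regular component index:
# _index_parents maps a parent path to the set of docids directly inside it,
# and _index_items maps a full path to a single docid, so absolute-path
# queries with depth 0 or 1 avoid the per-component index entirely.
from BTrees.IIBTree import IISet, IITreeSet
from BTrees.OIBTree import OIBTree
from BTrees.OOBTree import OOBTree

parents_demo = OOBTree()   # parent path -> IITreeSet(docids)
items_demo = OIBTree()     # full path   -> docid

def _add_path(docid, path):
    comps = [p for p in path.split('/') if p]
    parent_path = '/' + '/'.join(comps[:-1])
    parents_demo.setdefault(parent_path, IITreeSet()).insert(docid)
    items_demo[path] = docid

_add_path(11, '/plone/news')
_add_path(12, '/plone/news/item-1')
_add_path(13, '/plone/news/item-2')
children = IISet(parents_demo.get('/plone/news', IITreeSet()))   # depth 1: 12, 13
single = IISet([items_demo['/plone/news/item-1']])               # depth 0: 12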
class Lexicon(Persistent): """ Implementation of :class:`zope.index.text.interfaces.ILexicon`. """ def __init__(self, *pipeline): self._wids = OIBTree() # word -> wid self._words = IOBTree() # wid -> word # wid 0 is reserved for words that aren't in the lexicon (OOV -- out # of vocabulary). This can happen, e.g., if a query contains a word # we never saw before, and that isn't a known stopword (or otherwise # filtered out). Returning a special wid value for OOV words is a # way to let clients know when an OOV word appears. self.wordCount = Length() self._pipeline = pipeline def wordCount(self): """Return the number of unique terms in the lexicon.""" # overridden per instance return len(self._wids) def words(self): return self._wids.keys() def wids(self): return self._words.keys() def items(self): return self._wids.items() def sourceToWordIds(self, text): if text is None: text = '' last = _text2list(text) for element in self._pipeline: last = element.process(last) if not isinstance(self.wordCount, Length): # Make sure wordCount is overridden with a BTrees.Length.Length self.wordCount = Length(self.wordCount()) # Strategically unload the length value so that we get the most # recent value written to the database to minimize conflicting wids # Because length is independent, this will load the most # recent value stored, regardless of whether MVCC is enabled self.wordCount._p_deactivate() return list(map(self._getWordIdCreate, last)) def termToWordIds(self, text): last = _text2list(text) for element in self._pipeline: last = element.process(last) wids = [] for word in last: wids.append(self._wids.get(word, 0)) return wids def parseTerms(self, text): last = _text2list(text) for element in self._pipeline: process = getattr(element, "processGlob", element.process) last = process(last) return last def isGlob(self, word): return "*" in word or "?" in word def get_word(self, wid): return self._words[wid] def get_wid(self, word): return self._wids.get(word, 0) def globToWordIds(self, pattern): # Implement * and ? just as in the shell, except the pattern # must not start with either of these prefix = "" while pattern and pattern[0] not in "*?": prefix += pattern[0] pattern = pattern[1:] if not pattern: # There were no globbing characters in the pattern wid = self._wids.get(prefix, 0) if wid: return [wid] else: return [] if not prefix: # The pattern starts with a globbing character. # This is too efficient, so we raise an exception. raise QueryError( "pattern %r shouldn't start with glob character" % pattern) pat = prefix for c in pattern: if c == "*": pat += ".*" elif c == "?": pat += "." else: pat += re.escape(c) pat += "$" prog = re.compile(pat) keys = self._wids.keys(prefix) # Keys starting at prefix wids = [] for key in keys: if not key.startswith(prefix): break if prog.match(key): wids.append(self._wids[key]) return wids def _getWordIdCreate(self, word): wid = self._wids.get(word) if wid is None: wid = self._new_wid() self._wids[word] = wid self._words[wid] = word return wid def _new_wid(self): count = self.wordCount count.change(1) while count() in self._words: # just to be safe count.change(1) return count()
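# --- Side note as a sketch (not from the original source): BTrees.Length.Length
# is a conflict-resolving counter, so concurrent transactions that each call
# change(1) merge instead of raising a write conflict.  The Lexicon above
# stores one as ``wordCount`` (shadowing the method of the same name) and
# deactivates it before allocating wids to read the freshest stored value.
from BTrees.Length import Length

word_count = Length()
word_count.change(1)
word_count.change(1)
assert word_count() == 2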
class DocumentMap(Persistent): """ A two-way map between addresses (e.g. location paths) and document ids. The map is a persistent object meant to live in a ZODB storage. Additionally, the map is capable of mapping 'metadata' to docids. """ _v_nextid = None family = BTrees.family32 _randrange = random.randrange docid_to_metadata = None # latch for b/c def __init__(self): self.docid_to_address = IOBTree() self.address_to_docid = OIBTree() self.docid_to_metadata = IOBTree() def docid_for_address(self, address): """ Retrieve a document id for a given address. ``address`` is a string or other hashable object which represents a token known by the application. Return the integer document id corresponding to ``address``. If ``address`` doesn't exist in the document map, return None. """ return self.address_to_docid.get(address) def address_for_docid(self, docid): """ Retrieve an address for a given document id. ``docid`` is an integer document id. Return the address corresponding to ``docid``. If ``docid`` doesn't exist in the document map, return None. """ return self.docid_to_address.get(docid) def add(self, address, docid=_marker): """ Add a new document to the document map. ``address`` is a string or other hashable object which represents a token known by the application. ``docid``, if passed, must be an int. In this case, remove any previous address stored for it before mapping it to the new address. Passing an explicit ``docid`` also removes any metadata associated with that docid. If ``docid`` is not passed, generate a new docid. Return the integer document id mapped to ``address``. """ if docid is _marker: docid = self.new_docid() self.remove_docid(docid) self.remove_address(address) self.docid_to_address[docid] = address self.address_to_docid[address] = docid return docid def remove_docid(self, docid): """ Remove a document from the document map for the given document ID. ``docid`` is an integer document id. Remove any corresponding metadata for ``docid`` as well. Return a True if ``docid`` existed in the map, else return False. """ # It should be an invariant that if one entry exists in # docid_to_address for a docid/address pair, exactly one # corresponding entry exists in address_to_docid for the same # docid/address pair. However, versions of this code before # r.catalog 0.7.3 had a bug which, if this method was called # multiple times, each time with the same address but a # different docid, the ``docid_to_address`` mapping could # contain multiple entries for the same address each with a # different docid, causing this invariant to be violated. The # symptom: in systems that used r.catalog 0.7.2 and lower, # there might be more entries in docid_to_address than there # are in address_to_docid. The conditional fuzziness in the # code directly below is a runtime kindness to systems in that # state. Technically, the administrator of a system in such a # state should normalize the two data structures by running a # script after upgrading to 0.7.3. If we made the admin do # this, some of the code fuzziness below could go away, # replaced with something simpler. But there's no sense in # breaking systems at runtime through being a hardass about # consistency if an unsuspecting upgrader has not yet run the # data fixer script. 
The "fix the data" mantra rings a # little hollow when you weren't the one who broke the data in # the first place ;-) self._check_metadata() address = self.docid_to_address.get(docid, _marker) if address is _marker: return False old_docid = self.address_to_docid.get(address, _marker) if (old_docid is not _marker) and (old_docid != docid): self.remove_docid(old_docid) if docid in self.docid_to_address: del self.docid_to_address[docid] if address in self.address_to_docid: del self.address_to_docid[address] if docid in self.docid_to_metadata: del self.docid_to_metadata[docid] return True def remove_address(self, address): """ Remove a document from the document map using an address. ``address`` is a string or other hashable object which represents a token known by the application. Remove any corresponding metadata for ``address`` as well. Return a True if ``address`` existed in the map, else return False. """ # See the comment in remove_docid for complexity rationalization self._check_metadata() docid = self.address_to_docid.get(address, _marker) if docid is _marker: return False old_address = self.docid_to_address.get(docid, _marker) if (old_address is not _marker) and (old_address != address): self.remove_address(old_address) if docid in self.docid_to_address: del self.docid_to_address[docid] if address in self.address_to_docid: del self.address_to_docid[address] if docid in self.docid_to_metadata: del self.docid_to_metadata[docid] return True def _check_metadata(self): # backwards compatibility if self.docid_to_metadata is None: self.docid_to_metadata = IOBTree() def add_metadata(self, docid, data): """ Add metadata related to a given document id. ``data`` must be a mapping, such as a dictionary. For each key/value pair in ``data`` insert a metadata key/value pair into the metadata stored for ``docid``. Overwrite any existing values for the keys in ``data``, leaving values unchanged for other existing keys. Raise a KeyError If ``docid`` doesn't relate to an address in the document map. """ if not docid in self.docid_to_address: raise KeyError(docid) if len(data.keys()) == 0: return self._check_metadata() meta = self.docid_to_metadata.setdefault(docid, OOBTree()) for k in data: meta[k] = data[k] def remove_metadata(self, docid, *keys): """ Remove metadata related to a given document id. If ``docid`` doesn't exist in the metadata map, raise a KeyError. For each key in ``keys``, remove the metadata value for the docid related to that key. Do not raise any error if no value exists for a given key. If no keys are specified, remove all metadata related to the docid. """ self._check_metadata() if keys: meta = self.docid_to_metadata.get(docid, _marker) if meta is _marker: raise KeyError(docid) for k in keys: if k in meta: del meta[k] if not meta: del self.docid_to_metadata[docid] else: if not (docid in self.docid_to_metadata): raise KeyError(docid) del self.docid_to_metadata[docid] def get_metadata(self, docid): """ Return the metadata for ``docid``. Return a mapping of the keys and values set using ``add_metadata``. Raise a KeyError If metadata does not exist for ``docid``. """ if self.docid_to_metadata is None: raise KeyError(docid) meta = self.docid_to_metadata[docid] return meta def new_docid(self): """ Return a new document id. The returned value is guaranteed not to be used already in this document map. 
""" while True: if self._v_nextid is None: self._v_nextid = self._randrange(self.family.minint, self.family.maxint) uid = self._v_nextid self._v_nextid += 1 if uid not in self.docid_to_address: return uid self._v_nextid = None
class UUIDIndex(UnIndex): """Index for uuid fields with a unique value per key. The internal structure is: self._index = {datum: documentId} self._unindex = {documentId: datum} For each datum only one documentId can exist. """ meta_type = "UUIDIndex" manage_options = ( { 'label': 'Settings', 'action': 'manage_main' }, { 'label': 'Browse', 'action': 'manage_browse' }, ) query_options = ["query", "range"] manage = manage_main = DTMLFile('dtml/manageUUIDIndex', globals()) manage_main._setName('manage_main') manage_browse = DTMLFile('../dtml/browseIndex', globals()) def clear(self): self._length = Length() self._index = OIBTree() self._unindex = IOBTree() def numObjects(self): """Return the number of indexed objects. Since we have a 1:1 mapping from documents to values, we can reuse the stored length. """ return self.indexSize() def uniqueValues(self, name=None, withLengths=0): """Return the unique values for name. If withLengths is true, return a sequence of tuples of (value, length). """ if name is None: name = self.id elif name != self.id: return [] if not withLengths: return tuple(self._index.keys()) # We know the length for each value is one return [(k, 1) for k in self._index.keys()] def insertForwardIndexEntry(self, entry, documentId): """Take the entry provided and put it in the correct place in the forward index. """ if entry is None: return old_docid = self._index.get(entry, _marker) if old_docid is _marker: self._index[entry] = documentId self._length.change(1) elif old_docid != documentId: logger.error("A different document with value '%s' already " "exists in the index." % entry) def removeForwardIndexEntry(self, entry, documentId): """Take the entry provided and remove any reference to documentId in its entry in the index. """ old_docid = self._index.get(entry, _marker) if old_docid is not _marker: del self._index[entry] self._length.change(-1) def _get_object_datum(self, obj, attr): # for a uuid it never makes sense to acquire a parent value via # Acquisition has_attr = getattr(aq_base(obj), attr, _marker) if has_attr is _marker: return _marker return super(UUIDIndex, self)._get_object_datum(obj, attr)
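# Illustrative sketch, not part of the original module: a plain-dict analogue
# of the 1:1 forward/reverse structure documented in the UUIDIndex docstring,
# showing why numObjects() can simply reuse the stored forward-index length.
# The function name and sample values are hypothetical.
def _uuid_index_shape_example():
    _index = {}    # datum -> documentId  (self._index, an OIBTree)
    _unindex = {}  # documentId -> datum  (self._unindex, an IOBTree)
    _index['f47ac10b'] = 42
    _unindex[42] = 'f47ac10b'
    # Exactly one documentId may exist per datum, so both mappings always
    # have the same length, which equals the number of indexed objects.
    assert len(_index) == len(_unindex) == 1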
class Lexicon(Persistent): implements(ILexicon) def __init__(self, *pipeline): self._wids = OIBTree() # word -> wid self._words = IOBTree() # wid -> word # wid 0 is reserved for words that aren't in the lexicon (OOV -- out # of vocabulary). This can happen, e.g., if a query contains a word # we never saw before, and that isn't a known stopword (or otherwise # filtered out). Returning a special wid value for OOV words is a # way to let clients know when an OOV word appears. self._nextwid = 1 self._pipeline = pipeline # Keep some statistics about indexing self._nbytes = 0 # Number of bytes indexed (at start of pipeline) self._nwords = 0 # Number of words indexed (after pipeline) def wordCount(self): """Return the number of unique terms in the lexicon.""" return self._nextwid - 1 def words(self): return self._wids.keys() def wids(self): return self._words.keys() def items(self): return self._wids.items() def sourceToWordIds(self, text): last = _text2list(text) for t in last: self._nbytes += len(t) for element in self._pipeline: last = element.process(last) self._nwords += len(last) return map(self._getWordIdCreate, last) def termToWordIds(self, text): last = _text2list(text) for element in self._pipeline: last = element.process(last) wids = [] for word in last: wids.append(self._wids.get(word, 0)) return wids def parseTerms(self, text): last = _text2list(text) for element in self._pipeline: process = getattr(element, "processGlob", element.process) last = process(last) return last def isGlob(self, word): return "*" in word or "?" in word def get_word(self, wid): return self._words[wid] def get_wid(self, word): return self._wids.get(word, 0) def globToWordIds(self, pattern): # Implement * and ? just as in the shell, except the pattern # must not start with either of these prefix = "" while pattern and pattern[0] not in "*?": prefix += pattern[0] pattern = pattern[1:] if not pattern: # There were no globbing characters in the pattern wid = self._wids.get(prefix, 0) if wid: return [wid] else: return [] if not prefix: # The pattern starts with a globbing character. # This is too efficient, so we raise an exception. raise QueryError( "pattern %r shouldn't start with glob character" % pattern) pat = prefix for c in pattern: if c == "*": pat += ".*" elif c == "?": pat += "." else: pat += re.escape(c) pat += "$" prog = re.compile(pat) keys = self._wids.keys(prefix) # Keys starting at prefix wids = [] for key in keys: if not key.startswith(prefix): break if prog.match(key): wids.append(self._wids[key]) return wids def _getWordIdCreate(self, word): wid = self._wids.get(word) if wid is None: wid = self._new_wid() self._wids[word] = wid self._words[wid] = word return wid def _new_wid(self): wid = self._nextwid self._nextwid += 1 return wid
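# Illustrative sketch, not part of the original module: a minimal pipeline
# element and a short walk through the Lexicon indexing/query API above.
# Pipeline elements only need a process(seq) method (and optionally
# processGlob); the class name LowercaseNormalizer is hypothetical, and the
# sketch assumes the module-level _text2list helper splits plain strings on
# whitespace.
class LowercaseNormalizer(object):
    def process(self, seq):
        return [w.lower() for w in seq]

def _example_lexicon_usage():
    lexicon = Lexicon(LowercaseNormalizer())
    # Indexing assigns fresh word ids starting at 1 (wid 0 is reserved for
    # out-of-vocabulary words).
    wids = lexicon.sourceToWordIds('Zope catalogs Zope')
    assert lexicon.wordCount() == 2  # 'zope' and 'catalogs'
    # Querying reuses known ids and maps unseen words to the OOV id 0.
    assert lexicon.termToWordIds('zope unknown') == [wids[0], 0]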
class Catalog(Persistent, Acquisition.Implicit, ExtensionClass.Base): """ An Object Catalog An Object Catalog maintains a table of object metadata, and a series of manageable indexes to quickly search for objects (references in the metadata) that satisfy a search query. This class is not Zope specific, and can be used in any python program to build catalogs of objects. Note that it does require the objects to be Persistent, and thus must be used with ZODB3. """ _v_brains = NoBrainer def __init__(self, vocabulary=None, brains=None): # Catalogs no longer care about vocabularies and lexicons # so the vocabulary argument is ignored. (Casey) self.schema = {} # mapping from attribute name to column number self.names = () # sequence of column names self.indexes = {} # maping from index name to index object # The catalog maintains a BTree of object meta_data for # convenient display on result pages. meta_data attributes # are turned into brain objects and returned by # searchResults. The indexing machinery indexes all records # by an integer id (rid). self.data is a mapping from the # integer id to the meta_data, self.uids is a mapping of the # object unique identifier to the rid, and self.paths is a # mapping of the rid to the unique identifier. self.clear() if brains is not None: self._v_brains = brains self.updateBrains() def __len__(self): return self._length() def clear(self): """ clear catalog """ self.data = IOBTree() # mapping of rid to meta_data self.uids = OIBTree() # mapping of uid to rid self.paths = IOBTree() # mapping of rid to uid self._length = BTrees.Length.Length() for index in self.indexes.keys(): self.getIndex(index).clear() def updateBrains(self): self.useBrains(self._v_brains) def __getitem__(self, index, ttype=type(())): """ Returns instances of self._v_brains, or whatever is passed into self.useBrains. """ if type(index) is ttype: # then it contains a score... normalized_score, score, key = index r = self._v_result_class(self.data[key]).__of__(aq_parent(self)) r.data_record_id_ = key r.data_record_score_ = score r.data_record_normalized_score_ = normalized_score else: # otherwise no score, set all scores to 1 r = self._v_result_class(self.data[index]).__of__(aq_parent(self)) r.data_record_id_ = index r.data_record_score_ = 1 r.data_record_normalized_score_ = 1 return r def __setstate__(self, state): """ initialize your brains. This method is called when the catalog is first activated (from the persistent storage) """ Persistent.__setstate__(self, state) self.updateBrains() def useBrains(self, brains): """ Sets up the Catalog to return an object (ala ZTables) that is created on the fly from the tuple stored in the self.data Btree. 
""" class mybrains(AbstractCatalogBrain, brains): pass scopy = self.schema.copy() scopy['data_record_id_'] = len(self.schema.keys()) scopy['data_record_score_'] = len(self.schema.keys()) + 1 scopy['data_record_normalized_score_'] = len(self.schema.keys()) + 2 mybrains.__record_schema__ = scopy self._v_brains = brains self._v_result_class = mybrains def addColumn(self, name, default_value=None): """ adds a row to the meta data schema """ schema = self.schema names = list(self.names) if name in schema: raise CatalogError('The column %s already exists' % name) if name[0] == '_': raise CatalogError('Cannot cache fields beginning with "_"') values = schema.values() if values: schema[name] = max(values) + 1 else: schema[name] = 0 names.append(name) if default_value in (None, ''): default_value = MV for key, value in self.data.items(): rec = list(value) rec.append(default_value) self.data[key] = tuple(rec) self.names = tuple(names) self.schema = schema # new column? update the brain self.updateBrains() self._p_changed = 1 # why? def delColumn(self, name): """ deletes a row from the meta data schema """ names = list(self.names) _index = names.index(name) if not name in self.schema: LOG.error('delColumn attempted to delete nonexistent ' 'column %s.' % str(name)) return del names[_index] # rebuild the schema i = 0 schema = {} for name in names: schema[name] = i i = i + 1 self.schema = schema self.names = tuple(names) # update the brain self.updateBrains() # remove the column value from each record for key, value in self.data.items(): rec = list(value) del rec[_index] self.data[key] = tuple(rec) def addIndex(self, name, index_type): """Create a new index, given a name and a index_type. Old format: index_type was a string, 'FieldIndex' 'TextIndex' or 'KeywordIndex' is no longer valid; the actual index must be instantiated and passed in to addIndex. New format: index_type is the actual index object to be stored. """ if name in self.indexes: raise CatalogError('The index %s already exists' % name) if name.startswith('_'): raise CatalogError('Cannot index fields beginning with "_"') if not name: raise CatalogError('Name of index is empty') indexes = self.indexes if isinstance(index_type, str): raise TypeError("Catalog addIndex now requires the index type to" "be resolved prior to adding; create the proper " "index in the caller.") indexes[name] = index_type self.indexes = indexes def delIndex(self, name): """ deletes an index """ if not name in self.indexes: raise CatalogError('The index %s does not exist' % name) indexes = self.indexes del indexes[name] self.indexes = indexes def getIndex(self, name): """ get an index wrapped in the catalog """ return self.indexes[name].__of__(self) def updateMetadata(self, object, uid): """ Given an object and a uid, update the column data for the uid with the object data iff the object has changed """ data = self.data index = self.uids.get(uid, None) newDataRecord = self.recordify(object) if index is None: if type(data) is IOBTree: # New style, get random id index = getattr(self, '_v_nextid', 0) if index % 4000 == 0: index = randint(-2000000000, 2000000000) while not data.insert(index, newDataRecord): index = randint(-2000000000, 2000000000) # We want ids to be somewhat random, but there are # advantages for having some ids generated # sequentially when many catalog updates are done at # once, such as when reindexing or bulk indexing. # We allocate ids sequentially using a volatile base, # so different threads get different bases. 
This # further reduces conflict and reduces churn in # here and it result sets when bulk indexing. self._v_nextid = index + 1 else: if data: # find the next available unique id index = data.keys()[-1] + 1 else: index = 0 # meta_data is stored as a tuple for efficiency data[index] = newDataRecord else: if data.get(index, 0) != newDataRecord: data[index] = newDataRecord return index # the cataloging API def catalogObject(self, object, uid, threshold=None, idxs=None, update_metadata=1): """ Adds an object to the Catalog by iteratively applying it to all indexes. 'object' is the object to be cataloged 'uid' is the unique Catalog identifier for this object If 'idxs' is specified (as a sequence), apply the object only to the named indexes. If 'update_metadata' is true (the default), also update metadata for the object. If the object is new to the catalog, this flag has no effect (metadata is always created for new objects). """ if idxs is None: idxs = [] index = self.uids.get(uid, None) if index is None: # we are inserting new data index = self.updateMetadata(object, uid) self._length.change(1) self.uids[uid] = index self.paths[index] = uid elif update_metadata: # we are updating and we need to update metadata self.updateMetadata(object, uid) # do indexing total = 0 if idxs == []: use_indexes = self.indexes.keys() else: use_indexes = idxs for name in use_indexes: x = self.getIndex(name) if hasattr(x, 'index_object'): blah = x.index_object(index, object, threshold) total = total + blah else: LOG.error('catalogObject was passed bad index ' 'object %s.' % str(x)) return total def uncatalogObject(self, uid): """ Uncatalog and object from the Catalog. and 'uid' is a unique Catalog identifier Note, the uid must be the same as when the object was catalogued, otherwise it will not get removed from the catalog This method should not raise an exception if the uid cannot be found in the catalog. """ data = self.data uids = self.uids paths = self.paths indexes = self.indexes.keys() rid = uids.get(uid, None) if rid is not None: for name in indexes: x = self.getIndex(name) if hasattr(x, 'unindex_object'): x.unindex_object(rid) del data[rid] del paths[rid] del uids[uid] self._length.change(-1) else: LOG.error('uncatalogObject unsuccessfully ' 'attempted to uncatalog an object ' 'with a uid of %s. ' % str(uid)) def uniqueValuesFor(self, name): """ return unique values for FieldIndex name """ return self.getIndex(name).uniqueValues() def hasuid(self, uid): """ return the rid if catalog contains an object with uid """ return self.uids.get(uid) def recordify(self, object): """ turns an object into a record tuple """ record = [] # the unique id is always the first element for x in self.names: attr = getattr(object, x, MV) if (attr is not MV and safe_callable(attr)): attr = attr() record.append(attr) return tuple(record) def instantiate(self, record): r = self._v_result_class(record[1]) r.data_record_id_ = record[0] return r.__of__(self) def getMetadataForRID(self, rid): record = self.data[rid] result = {} for (key, pos) in self.schema.items(): result[key] = record[pos] return result def getIndexDataForRID(self, rid): result = {} for name in self.indexes.keys(): result[name] = self.getIndex(name).getEntryForObject(rid, "") return result ## This is the Catalog search engine. 
Most of the heavy lifting happens # below def make_query(self, request): # This is a bit of a mess, but the ZCatalog API has traditionally # supported passing in query restrictions in almost arbitary ways real_req = None if isinstance(request, dict): query = request.copy() elif isinstance(request, CatalogSearchArgumentsMap): query = {} query.update(request.keywords) real_req = request.request if isinstance(real_req, dict): query.update(real_req) real_req = None else: real_req = request if real_req: warnings.warn('You have specified a query using either a request ' 'object or a mixture of a query dict and keyword ' 'arguments. Please use only a simple query dict. ' 'Your query contained "%s". This support is ' 'deprecated and will be removed in Zope 2.14.' % repr(real_req), DeprecationWarning, stacklevel=4) known_keys = query.keys() # The request has too many places where an index restriction # might be specified. Putting all of request.form, # request.other, ... into the query isn't what we want. # So we iterate over all known indexes instead and see if they # are in the request. for iid in self.indexes.keys(): if iid in known_keys: continue value = real_req.get(iid) if value: query[iid] = value return query def _sorted_search_indexes(self, query): # Simple implementation doing no ordering. query_keys = query.keys() order = [] for name, index in self.indexes.items(): if name not in query_keys: continue order.append((ILimitedResultIndex.providedBy(index), name)) order.sort() return [i[1] for i in order] def _limit_sequence(self, sequence, slen, b_start=0, b_size=None, switched_reverse=False): if b_size is not None: sequence = sequence[b_start:b_start + b_size] if slen: slen = len(sequence) if switched_reverse: sequence.reverse() return (sequence, slen) def search(self, query, sort_index=None, reverse=0, limit=None, merge=1): """Iterate through the indexes, applying the query to each one. If merge is true then return a lazy result set (sorted if appropriate) otherwise return the raw (possibly scored) results for later merging. Limit is used in conjuntion with sorting or scored results to inform the catalog how many results you are really interested in. The catalog can then use optimizations to save time and memory. The number of results is not guaranteed to fall within the limit however, you should still slice or batch the results as usual.""" rs = None # resultset # Indexes fulfill a fairly large contract here. We hand each # index the query mapping we are given (which may be composed # of some combination of web request, kw mappings or plain old dicts) # and the index decides what to do with it. If the index finds work # for itself in the query, it returns the results and a tuple of # the attributes that were used. If the index finds nothing for it # to do then it returns None. 
# Canonicalize the request into a sensible query before passing it on query = self.make_query(query) cr = self.getCatalogPlan(query) cr.start() plan = cr.plan() if not plan: plan = self._sorted_search_indexes(query) indexes = self.indexes.keys() for i in plan: if i not in indexes: # We can have bogus keys or the plan can contain index names # that have been removed in the meantime continue index = self.getIndex(i) _apply_index = getattr(index, "_apply_index", None) if _apply_index is None: continue cr.start_split(i) limit_result = ILimitedResultIndex.providedBy(index) if limit_result: r = _apply_index(query, rs) else: r = _apply_index(query) if r is not None: r, u = r # Short circuit if empty result # BBB: We can remove the "r is not None" check in Zope 2.14 # once we don't need to support the "return everything" case # anymore if r is not None and not r: cr.stop_split(i, result=None, limit=limit_result) return LazyCat([]) # provide detailed info about the pure intersection time intersect_id = i + '#intersection' cr.start_split(intersect_id) w, rs = weightedIntersection(rs, r) cr.stop_split(intersect_id) # consider the time it takes to intersect the index result with # the total resultset to be part of the index time cr.stop_split(i, result=r, limit=limit_result) if not rs: break else: cr.stop_split(i, result=None, limit=limit_result) # Try to deduce the sort limit from batching arguments b_start = int(query.get('b_start', 0)) b_size = query.get('b_size', None) if b_size is not None: b_size = int(b_size) if b_size is not None: limit = b_start + b_size elif limit and b_size is None: b_size = limit if rs is None: # None of the indexes found anything to do with the query # We take this to mean that the query was empty (an empty filter) # and so we return everything in the catalog warnings.warn('Your query %s produced no query restriction. ' 'Currently the entire catalog content is returned. ' 'In Zope 2.14 this will result in an empty LazyCat ' 'to be returned.' % repr(make_key(self, query)), DeprecationWarning, stacklevel=3) rlen = len(self) if sort_index is None: sequence, slen = self._limit_sequence(self.data.items(), rlen, b_start, b_size) result = LazyMap(self.instantiate, sequence, slen, actual_result_count=rlen) else: cr.start_split('sort_on') result = self.sortResults(self.data, sort_index, reverse, limit, merge, actual_result_count=rlen, b_start=b_start, b_size=b_size) cr.stop_split('sort_on', None) elif rs: # We got some results from the indexes. # Sort and convert to sequences. # XXX: The check for 'values' is really stupid since we call # items() and *not* values() rlen = len(rs) if sort_index is None and hasattr(rs, 'values'): # having a 'values' means we have a data structure with # scores. Build a new result set, sort it by score, reverse # it, compute the normalized score, and Lazify it. if not merge: # Don't bother to sort here, return a list of # three tuples to be passed later to mergeResults # note that data_record_normalized_score_ cannot be # calculated and will always be 1 in this case getitem = self.__getitem__ result = [(score, (1, score, rid), getitem) for rid, score in rs.items()] else: cr.start_split('sort_on') rs = rs.byValue(0) # sort it by score max = float(rs[0][0]) # Here we define our getter function inline so that # we can conveniently store the max value as a default arg # and make the normalized score computation lazy def getScoredResult(item, max=max, self=self): """ Returns instances of self._v_brains, or whatever is passed into self.useBrains. 
""" score, key = item r=self._v_result_class(self.data[key])\ .__of__(aq_parent(self)) r.data_record_id_ = key r.data_record_score_ = score r.data_record_normalized_score_ = int(100. * score / max) return r sequence, slen = self._limit_sequence( rs, rlen, b_start, b_size) result = LazyMap(getScoredResult, sequence, slen, actual_result_count=rlen) cr.stop_split('sort_on', None) elif sort_index is None and not hasattr(rs, 'values'): # no scores if hasattr(rs, 'keys'): rs = rs.keys() sequence, slen = self._limit_sequence(rs, rlen, b_start, b_size) result = LazyMap(self.__getitem__, sequence, slen, actual_result_count=rlen) else: # sort. If there are scores, then this block is not # reached, therefore 'sort-on' does not happen in the # context of a text index query. This should probably # sort by relevance first, then the 'sort-on' attribute. cr.start_split('sort_on') result = self.sortResults(rs, sort_index, reverse, limit, merge, actual_result_count=rlen, b_start=b_start, b_size=b_size) cr.stop_split('sort_on', None) else: # Empty result set result = LazyCat([]) cr.stop() return result def sortResults(self, rs, sort_index, reverse=0, limit=None, merge=1, actual_result_count=None, b_start=0, b_size=None): # Sort a result set using a sort index. Return a lazy # result set in sorted order if merge is true otherwise # returns a list of (sortkey, uid, getter_function) tuples # # The two 'for' loops in here contribute a significant # proportion of the time to perform an indexed search. # Try to avoid all non-local attribute lookup inside # those loops. _intersection = intersection _self__getitem__ = self.__getitem__ index_key_map = sort_index.documentToKeyMap() _None = None _keyerror = KeyError result = [] append = result.append if hasattr(rs, 'keys'): rs = rs.keys() if actual_result_count is None: rlen = len(rs) actual_result_count = rlen else: rlen = actual_result_count # don't limit to more than what we have if limit is not None and limit >= rlen: limit = rlen # if we want a batch from the end of the resultset, reverse sorting # order and limit it, then reverse the resultset again switched_reverse = False if b_size and b_start and b_start > rlen / 2: reverse = not reverse switched_reverse = True b_end = b_start + b_size if b_end >= rlen: overrun = rlen - b_end if b_start >= rlen: # bail out, we are outside the possible range return LazyCat([], 0, actual_result_count) else: b_size += overrun b_start = 0 else: b_start = b_end - b_start limit = b_start + b_size if merge and limit is None and (rlen > (len(sort_index) * (rlen / 100 + 1))): # The result set is much larger than the sorted index, # so iterate over the sorted index for speed. # This is rarely exercised in practice... length = 0 try: intersection(rs, IISet(())) except TypeError: # rs is not an object in the IIBTree family. # Try to turn rs into an IISet. rs = IISet(rs) for k, intset in sort_index.items(): # We have an index that has a set of values for # each sort key, so we intersect with each set and # get a sorted sequence of the intersections. intset = _intersection(rs, intset) if intset: keys = getattr(intset, 'keys', _None) if keys is not _None: # Is this ever true? intset = keys() length += len(intset) append((k, intset, _self__getitem__)) # Note that sort keys are unique. 
if reverse: result.sort(reverse=True) else: result.sort() sequence, slen = self._limit_sequence(result, length, b_start, b_size, switched_reverse) result = LazyCat(LazyValues(sequence), slen, actual_result_count) elif limit is None or (limit * 4 > rlen): # Iterate over the result set getting sort keys from the index for did in rs: try: key = index_key_map[did] except _keyerror: # This document is not in the sort key index, skip it. pass else: append((key, did, _self__getitem__)) # The reference back to __getitem__ is used in case # we do not merge now and need to intermingle the # results with those of other catalogs while avoiding # the cost of instantiating a LazyMap per result if merge: if reverse: result.sort(reverse=True) else: result.sort() if limit is not None: result = result[:limit] sequence, _ = self._limit_sequence(result, 0, b_start, b_size, switched_reverse) result = LazyValues(sequence) result.actual_result_count = actual_result_count else: sequence, _ = self._limit_sequence(result, 0, b_start, b_size, switched_reverse) return sequence elif reverse: # Limit/sort results using N-Best algorithm # This is faster for large sets then a full sort # And uses far less memory keys = [] n = 0 worst = None for did in rs: try: key = index_key_map[did] except _keyerror: # This document is not in the sort key index, skip it. pass else: if n >= limit and key <= worst: continue i = bisect(keys, key) keys.insert(i, key) result.insert(i, (key, did, _self__getitem__)) if n == limit: del keys[0], result[0] else: n += 1 worst = keys[0] result.reverse() if merge: sequence, _ = self._limit_sequence(result, 0, b_start, b_size, switched_reverse) result = LazyValues(sequence) result.actual_result_count = actual_result_count else: sequence, _ = self._limit_sequence(result, 0, b_start, b_size, switched_reverse) return sequence elif not reverse: # Limit/sort results using N-Best algorithm in reverse (N-Worst?) keys = [] n = 0 best = None for did in rs: try: key = index_key_map[did] except _keyerror: # This document is not in the sort key index, skip it. pass else: if n >= limit and key >= best: continue i = bisect(keys, key) keys.insert(i, key) result.insert(i, (key, did, _self__getitem__)) if n == limit: del keys[-1], result[-1] else: n += 1 best = keys[-1] if merge: sequence, _ = self._limit_sequence(result, 0, b_start, b_size, switched_reverse) result = LazyValues(sequence) result.actual_result_count = actual_result_count else: sequence, _ = self._limit_sequence(result, 0, b_start, b_size, switched_reverse) return sequence return LazyMap(self.__getitem__, result, len(result), actual_result_count=actual_result_count) def _get_sort_attr(self, attr, kw): """Helper function to find sort-on or sort-order.""" # There are three different ways to find the attribute: # 1. kw[sort-attr] # 2. self.sort-attr # 3. kw[sort_attr] # kw may be a dict or an ExtensionClass MultiMapping, which # differ in what get() returns with no default value. 
name = "sort-%s" % attr val = kw.get(name, None) if val is not None: return val val = getattr(self, name, None) if val is not None: return val return kw.get("sort_%s" % attr, None) def _getSortIndex(self, args): """Returns a search index object or None.""" sort_index_name = self._get_sort_attr("on", args) if sort_index_name is not None: # self.indexes is always a dict, so get() w/ 1 arg works sort_index = self.indexes.get(sort_index_name) if sort_index is None: raise CatalogError('Unknown sort_on index (%s)' % sort_index_name) else: if not hasattr(sort_index, 'documentToKeyMap'): raise CatalogError( 'The index chosen for sort_on (%s) is not capable of ' 'being used as a sort index.' % sort_index_name) return sort_index else: return None def searchResults(self, REQUEST=None, used=None, _merge=1, **kw): # You should pass in a simple dictionary as the request argument, # which only contains the relevant query. # The used argument is deprecated and is ignored if REQUEST is None and not kw: # Try to acquire request if we get no args for bw compat warnings.warn( 'Calling searchResults without a query argument nor ' 'keyword arguments is deprecated. In Zope 2.14 the ' 'query will no longer be automatically taken from ' 'the acquired request.', DeprecationWarning, stacklevel=3) REQUEST = getattr(self, 'REQUEST', None) if isinstance(REQUEST, dict) and not kw: # short cut for the best practice args = REQUEST else: args = CatalogSearchArgumentsMap(REQUEST, kw) sort_index = self._getSortIndex(args) sort_limit = self._get_sort_attr('limit', args) reverse = 0 if sort_index is not None: order = self._get_sort_attr("order", args) if (isinstance(order, str) and order.lower() in ('reverse', 'descending')): reverse = 1 # Perform searches with indexes and sort_index return self.search(args, sort_index, reverse, sort_limit, _merge) __call__ = searchResults def getCatalogPlan(self, query=None): """Query time reporting and planning. """ parent = aq_base(aq_parent(self)) threshold = getattr(parent, 'long_query_time', 0.1) return CatalogPlan(self, query, threshold)
class GlobbingLexicon(Lexicon): """Lexicon which supports basic globbing function ('*' and '?'). This lexicon keeps several data structures around that are useful for searching. They are: '_lexicon' -- Contains the mapping from word => word_id '_inverseLex' -- Contains the mapping from word_id => word '_digrams' -- Contains a mapping from digram => word_id Before going further, it is necessary to understand what a digram is, as it is a core component of the structure of this lexicon. A digram is a two-letter sequence in a word. For example, the word 'zope' would be converted into the digrams:: ['$z', 'zo', 'op', 'pe', 'e$'] where the '$' is a word marker. It is used at the beginning and end of the words. Those digrams are significant. """ multi_wc = '*' single_wc = '?' eow = '$' def __init__(self, useSplitter=None, extra=None): self.clear() self.useSplitter = useSplitter self.splitterParams = extra self.SplitterFunc = Splitter.getSplitter(self.useSplitter) def clear(self): self._lexicon = OIBTree() self._inverseLex = IOBTree() self._digrams = OOBTree() def _convertBTrees(self, threshold=200): Lexicon._convertBTrees(self, threshold) if type(self._digrams) is OOBTree: return from BTrees.convert import convert _digrams = self._digrams self._digrams = OOBTree() self._digrams._p_jar = self._p_jar convert(_digrams, self._digrams, threshold, IITreeSet) def createDigrams(self, word): """Returns a list with the set of digrams in the word.""" word = '$' + word + '$' return [word[i:i + 2] for i in range(len(word) - 1)] def getWordId(self, word): """Provided 'word', return the matching integer word id.""" if self._lexicon.has_key(word): return self._lexicon[word] else: return self.assignWordId(word) set = getWordId # Kludge for old code def getWord(self, wid): return self._inverseLex.get(wid, None) def assignWordId(self, word): """Assigns a new word id to the provided word, and return it.""" # Double check it's not in the lexicon already, and if it is, just # return it. if self._lexicon.has_key(word): return self._lexicon[word] # Get word id. BBB Backward compat pain. inverse = self._inverseLex try: insert = inverse.insert except AttributeError: # we have an "old" BTree object if inverse: wid = inverse.keys()[-1] + 1 else: self._inverseLex = IOBTree() wid = 1 inverse[wid] = word else: # we have a "new" IOBTree object wid = randid() while not inverse.insert(wid, word): wid = randid() self._lexicon[word] = wid # Now take all the digrams and insert them into the digram map. for digram in self.createDigrams(word): set = self._digrams.get(digram, None) if set is None: self._digrams[digram] = set = IISet() set.insert(wid) return wid def get(self, pattern): """ Query the lexicon for words matching a pattern.""" # single word pattern produce a slicing problem below. # Because the splitter throws away single characters we can # return an empty tuple here. 
if len(pattern) == 1: return () wc_set = [self.multi_wc, self.single_wc] digrams = [] globbing = 0 for i in range(len(pattern)): if pattern[i] in wc_set: globbing = 1 continue if i == 0: digrams.insert(i, (self.eow + pattern[i])) digrams.append((pattern[i] + pattern[i + 1])) else: try: if pattern[i + 1] not in wc_set: digrams.append(pattern[i] + pattern[i + 1]) except IndexError: digrams.append((pattern[i] + self.eow)) if not globbing: result = self._lexicon.get(pattern, None) if result is None: return () return (result, ) ## now get all of the intsets that contain the result digrams result = None for digram in digrams: result = union(result, self._digrams.get(digram, None)) if not result: return () else: ## now we have narrowed the list of possible candidates ## down to those words which contain digrams. However, ## some words may have been returned that match digrams, ## but do not match 'pattern'. This is because some words ## may contain all matching digrams, but in the wrong ## order. expr = re.compile(self.createRegex(pattern)) words = [] hits = IISet() for x in result: if expr.match(self._inverseLex[x]): hits.insert(x) return hits def __getitem__(self, word): """ """ return self.get(word) def query_hook(self, q): """expand wildcards""" ListType = type([]) i = len(q) - 1 while i >= 0: e = q[i] if isinstance(e, ListType): self.query_hook(e) elif isinstance(e, Op): pass elif ((self.multi_wc in e) or (self.single_wc in e)): wids = self.get(e) words = [] for wid in wids: if words: words.append(Or) words.append(wid) if not words: # if words is empty, return something that will make # textindex's __getitem__ return an empty result list words.append('') q[i] = words i = i - 1 return q def Splitter(self, astring, words=None, encoding="latin1"): """ wrap the splitter """ ## don't do anything, less efficient but there's not much ## sense in stemming a globbing lexicon. try: return self.SplitterFunc( astring, words, encoding=encoding, singlechar=self.splitterParams.splitterSingleChars, indexnumbers=self.splitterParams.splitterIndexNumbers, casefolding=self.splitterParams.splitterCasefolding) except: return self.SplitterFunc(astring, words) def createRegex(self, pat): """Translate a PATTERN to a regular expression. There is no way to quote meta-characters. """ # Remove characters that are meaningful in a regex if not isinstance(pat, UnicodeType): transTable = string.maketrans("", "") result = string.translate(pat, transTable, r'()&|!@#$%^{}\<>.') else: transTable = {} for ch in r'()&|!@#$%^{}\<>.': transTable[ord(ch)] = None result = pat.translate(transTable) # First, deal with multi-character globbing result = result.replace('*', '.*') # Next, we need to deal with single-character globbing result = result.replace('?', '.') return "%s$" % result
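# Illustrative sketch, not part of the original module: the digram and regex
# helpers of GlobbingLexicon in isolation. The expected values follow the
# behaviour documented above; the function name is hypothetical.
def _example_globbing_helpers(lexicon):
    # A word is bracketed with '$' end-of-word markers and cut into
    # overlapping two-letter digrams.
    assert lexicon.createDigrams('zope') == ['$z', 'zo', 'op', 'pe', 'e$']
    # Shell-style glob patterns are translated to anchored regular
    # expressions: '*' -> '.*' and '?' -> '.'.
    assert lexicon.createRegex('zo*e') == 'zo.*e$'
    assert lexicon.createRegex('z?pe') == 'z.pe$'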
class Catalog(Persistent, Acquisition.Implicit, ExtensionClass.Base): """ An Object Catalog An Object Catalog maintains a table of object metadata, and a series of manageable indexes to quickly search for objects (references in the metadata) that satisfy a search query. This class is not Zope specific, and can be used in any python program to build catalogs of objects. Note that it does require the objects to be Persistent, and thus must be used with ZODB3. """ _v_brains = NoBrainer def __init__(self, vocabulary=None, brains=None): # Catalogs no longer care about vocabularies and lexicons # so the vocabulary argument is ignored. (Casey) self.schema = {} # mapping from attribute name to column number self.names = () # sequence of column names self.indexes = {} # mapping from index name to index object # The catalog maintains a BTree of object meta_data for # convenient display on result pages. meta_data attributes # are turned into brain objects and returned by # searchResults. The indexing machinery indexes all records # by an integer id (rid). self.data is a mapping from the # integer id to the meta_data, self.uids is a mapping of the # object unique identifier to the rid, and self.paths is a # mapping of the rid to the unique identifier. self.clear() if brains is not None: self._v_brains = brains self.updateBrains() def __len__(self): return self._length() def clear(self): """ clear catalog """ self.data = IOBTree() # mapping of rid to meta_data self.uids = OIBTree() # mapping of uid to rid self.paths = IOBTree() # mapping of rid to uid self._length = BTrees.Length.Length() for index in self.indexes.keys(): self.getIndex(index).clear() def updateBrains(self): self.useBrains(self._v_brains) def __getitem__(self, index): """ Returns instances of self._v_brains, or whatever is passed into self.useBrains. """ if isinstance(index, tuple): # then it contains a score... normalized_score, score, key = index else: # otherwise no score, set all scores to 1 normalized_score, score, key = (1, 1, index) data = self.data[key] klass = self._v_result_class schema_len = len(klass.__record_schema__) if schema_len == len(data) + 3: # if we have complete data, create in a single pass r = klass(tuple(data) + (key, score, normalized_score)) else: r = klass(data) r.data_record_id_ = key r.data_record_score_ = score r.data_record_normalized_score_ = normalized_score r = r.__of__(aq_parent(self)) return r def __setstate__(self, state): """ initialize your brains. This method is called when the catalog is first activated (from the persistent storage) """ Persistent.__setstate__(self, state) self.updateBrains() def useBrains(self, brains): """ Sets up the Catalog to return an object (ala ZTables) that is created on the fly from the tuple stored in the self.data Btree. """ class mybrains(AbstractCatalogBrain, brains): pass scopy = self.schema.copy() schema_len = len(self.schema.keys()) scopy['data_record_id_'] = schema_len scopy['data_record_score_'] = schema_len + 1 scopy['data_record_normalized_score_'] = schema_len + 2 mybrains.__record_schema__ = scopy self._v_brains = brains self._v_result_class = mybrains def addColumn(self, name, default_value=None, threshold=10000): """Adds a row to the meta data schema""" schema = self.schema names = list(self.names) if name != name.strip(): # Someone could have mistakenly added a space at the end # of the input field. 
LOG.warn("stripped space from new column %r -> %r", name, name.strip()) name = name.strip() if name in schema: raise CatalogError('The column %s already exists' % name) if name[0] == '_': raise CatalogError('Cannot cache fields beginning with "_"') values = schema.values() if values: schema[name] = max(values) + 1 else: schema[name] = 0 names.append(name) if default_value in (None, ''): default_value = MV if len(self): pghandler = ZLogHandler(threshold) pghandler.init('Adding %s column' % name, len(self)) for i, (key, value) in enumerate(self.data.iteritems()): pghandler.report(i) self.data[key] = value + (default_value, ) pghandler.finish() self.names = tuple(names) self.schema = schema # new column? update the brain self.updateBrains() def delColumn(self, name, threshold=10000): """Deletes a row from the meta data schema""" names = list(self.names) _index = names.index(name) if not name in self.schema: LOG.error('delColumn attempted to delete nonexistent ' 'column %s.' % str(name)) return del names[_index] # rebuild the schema schema = {} for i, name in enumerate(names): schema[name] = i self.schema = schema self.names = tuple(names) # update the brain self.updateBrains() # remove the column value from each record if len(self): _next_index = _index + 1 pghandler = ZLogHandler(threshold) pghandler.init('Deleting %s column' % name, len(self)) for i, (key, value) in enumerate(self.data.iteritems()): pghandler.report(i) self.data[key] = value[:_index] + value[_next_index:] pghandler.finish() def addIndex(self, name, index_type): """Create a new index, given a name and a index_type. Old format: index_type was a string, 'FieldIndex' 'TextIndex' or 'KeywordIndex' is no longer valid; the actual index must be instantiated and passed in to addIndex. New format: index_type is the actual index object to be stored. """ if name in self.indexes: raise CatalogError('The index %s already exists' % name) if name.startswith('_'): raise CatalogError('Cannot index fields beginning with "_"') if not name: raise CatalogError('Name of index is empty') if name != name.strip(): # Someone could have mistakenly added a space at the end # of the input field. LOG.warn("stripped space from new index %r -> %r", name, name.strip()) name = name.strip() indexes = self.indexes if isinstance(index_type, str): raise TypeError("Catalog addIndex now requires the index type to" "be resolved prior to adding; create the proper " "index in the caller.") indexes[name] = index_type self.indexes = indexes def delIndex(self, name): """ deletes an index """ if not name in self.indexes: raise CatalogError('The index %s does not exist' % name) indexes = self.indexes del indexes[name] self.indexes = indexes def getIndex(self, name): """ get an index wrapped in the catalog """ return self.indexes[name].__of__(self) def updateMetadata(self, object, uid, index): """ Given an object and a uid, update the column data for the uid with the object data iff the object has changed """ data = self.data newDataRecord = self.recordify(object) if index is None: index = getattr(self, '_v_nextid', 0) if index % 4000 == 0: index = randint(-2000000000, 2000000000) while not data.insert(index, newDataRecord): index = randint(-2000000000, 2000000000) # We want ids to be somewhat random, but there are # advantages for having some ids generated # sequentially when many catalog updates are done at # once, such as when reindexing or bulk indexing. # We allocate ids sequentially using a volatile base, # so different threads get different bases. 
This # further reduces conflict and reduces churn in # here and it result sets when bulk indexing. self._v_nextid = index + 1 else: if data.get(index, 0) != newDataRecord: data[index] = newDataRecord return index # the cataloging API def catalogObject(self, object, uid, threshold=None, idxs=None, update_metadata=True): """ Adds an object to the Catalog by iteratively applying it to all indexes. 'object' is the object to be cataloged 'uid' is the unique Catalog identifier for this object If 'idxs' is specified (as a sequence), apply the object only to the named indexes. If 'update_metadata' is true (the default), also update metadata for the object. If the object is new to the catalog, this flag has no effect (metadata is always created for new objects). """ if idxs is None: idxs = [] index = self.uids.get(uid, None) if index is None: # we are inserting new data index = self.updateMetadata(object, uid, None) self._length.change(1) self.uids[uid] = index self.paths[index] = uid elif update_metadata: # we are updating and we need to update metadata self.updateMetadata(object, uid, index) # do indexing total = 0 if idxs == []: use_indexes = self.indexes.keys() else: use_indexes = idxs for name in use_indexes: x = self.getIndex(name) if hasattr(x, 'index_object'): blah = x.index_object(index, object, threshold) total = total + blah else: LOG.error('catalogObject was passed bad index ' 'object %s.' % str(x)) return total def uncatalogObject(self, uid): """ Uncatalog and object from the Catalog. and 'uid' is a unique Catalog identifier Note, the uid must be the same as when the object was catalogued, otherwise it will not get removed from the catalog This method should not raise an exception if the uid cannot be found in the catalog. """ data = self.data uids = self.uids paths = self.paths indexes = self.indexes.keys() rid = uids.get(uid, None) if rid is not None: for name in indexes: x = self.getIndex(name) if hasattr(x, 'unindex_object'): x.unindex_object(rid) del data[rid] del paths[rid] del uids[uid] self._length.change(-1) else: LOG.error('uncatalogObject unsuccessfully ' 'attempted to uncatalog an object ' 'with a uid of %s. ' % str(uid)) def uniqueValuesFor(self, name): """ return unique values for FieldIndex name """ return tuple(self.getIndex(name).uniqueValues()) def hasuid(self, uid): """ return the rid if catalog contains an object with uid """ return self.uids.get(uid) def recordify(self, object): """ turns an object into a record tuple """ record = [] # the unique id is always the first element for x in self.names: attr = getattr(object, x, MV) if (attr is not MV and safe_callable(attr)): attr = attr() record.append(attr) return tuple(record) def instantiate(self, record): r = self._v_result_class(record[1]) r.data_record_id_ = record[0] return r.__of__(self) def getMetadataForRID(self, rid): record = self.data[rid] result = {} for (key, pos) in self.schema.items(): result[key] = record[pos] return result def getIndexDataForRID(self, rid): result = {} for name in self.indexes.keys(): result[name] = self.getIndex(name).getEntryForObject(rid, "") return result # This is the Catalog search engine. 
Most of the heavy lifting happens # below def make_query(self, request): # This is a bit of a mess, but the ZCatalog API has traditionally # supported passing in query restrictions in almost arbitary ways real_req = None if isinstance(request, dict): query = request.copy() elif isinstance(request, CatalogSearchArgumentsMap): query = {} query.update(request.keywords) real_req = request.request if isinstance(real_req, dict): query.update(real_req) real_req = None else: real_req = request if real_req: warnings.warn('You have specified a query using either a request ' 'object or a mixture of a query dict and keyword ' 'arguments. Please use only a simple query dict. ' 'Your query contained "%s". This support is ' 'deprecated and will be removed in Zope 4.' % repr(real_req), DeprecationWarning, stacklevel=4) known_keys = query.keys() # The request has too many places where an index restriction # might be specified. Putting all of request.form, # request.other, ... into the query isn't what we want. # So we iterate over all known indexes instead and see if they # are in the request. for iid in self.indexes.keys(): if iid in known_keys: continue value = real_req.get(iid) if value: query[iid] = value return query def _get_index_query_names(self, index): if hasattr(index, 'getIndexQueryNames'): return index.getIndexQueryNames() return (index.getId(),) def _sorted_search_indexes(self, query): # Simple implementation ordering only by limited result support query_keys = query.keys() order = [] for name, index in self.indexes.items(): for attr in self._get_index_query_names(index): if attr in query_keys: order.append((ILimitedResultIndex.providedBy(index), name)) order.sort() return [i[1] for i in order] def _limit_sequence(self, sequence, slen, b_start=0, b_size=None, switched_reverse=False): if b_size is not None: sequence = sequence[b_start:b_start + b_size] if slen: slen = len(sequence) if switched_reverse: sequence.reverse() return (sequence, slen) def search(self, query, sort_index=None, reverse=False, limit=None, merge=True): """Iterate through the indexes, applying the query to each one. If merge is true then return a lazy result set (sorted if appropriate) otherwise return the raw (possibly scored) results for later merging. Limit is used in conjuntion with sorting or scored results to inform the catalog how many results you are really interested in. The catalog can then use optimizations to save time and memory. The number of results is not guaranteed to fall within the limit however, you should still slice or batch the results as usual.""" # Indexes fulfill a fairly large contract here. We hand each # index the query mapping we are given (which may be composed # of some combination of web request, kw mappings or plain old dicts) # and the index decides what to do with it. If the index finds work # for itself in the query, it returns the results and a tuple of # the attributes that were used. If the index finds nothing for it # to do then it returns None. 
# Canonicalize the request into a sensible query before passing it on query = self.make_query(query) cr = self.getCatalogPlan(query) cr.start() plan = cr.plan() if not plan: plan = self._sorted_search_indexes(query) rs = None # result set indexes = self.indexes.keys() for i in plan: if i not in indexes: # We can have bogus keys or the plan can contain index names # that have been removed in the meantime continue index = self.getIndex(i) _apply_index = getattr(index, "_apply_index", None) if _apply_index is None: continue cr.start_split(i) limit_result = ILimitedResultIndex.providedBy(index) if limit_result: r = _apply_index(query, rs) else: r = _apply_index(query) if r is not None: r, u = r # Short circuit if empty result # BBB: We can remove the "r is not None" check in Zope 4 # once we don't need to support the "return everything" case # anymore if r is not None and not r: cr.stop_split(i, result=None, limit=limit_result) return LazyCat([]) # provide detailed info about the pure intersection time intersect_id = i + '#intersection' cr.start_split(intersect_id) # weightedIntersection preserves the values from any mappings # we get, as some indexes don't return simple sets if hasattr(rs, 'items') or hasattr(r, 'items'): _, rs = weightedIntersection(rs, r) else: rs = intersection(rs, r) cr.stop_split(intersect_id) # consider the time it takes to intersect the index result # with the total result set to be part of the index time cr.stop_split(i, result=r, limit=limit_result) if not rs: break else: cr.stop_split(i, result=None, limit=limit_result) # Try to deduce the sort limit from batching arguments b_start = int(query.get('b_start', 0)) b_size = query.get('b_size', None) if b_size is not None: b_size = int(b_size) if b_size is not None: limit = b_start + b_size elif limit and b_size is None: b_size = limit if sort_index is None: sort_report_name = None else: if isinstance(sort_index, list): sort_name = '-'.join(i.getId() for i in sort_index) else: sort_name = sort_index.getId() if isinstance(reverse, list): reverse_name = '-'.join( 'desc' if r else 'asc' for r in reverse) else: reverse_name = 'desc' if reverse else 'asc' sort_report_name = 'sort_on#' + sort_name + '#' + reverse_name if limit is not None: sort_report_name += '#limit-%s' % limit if rs is None: # None of the indexes found anything to do with the query # We take this to mean that the query was empty (an empty filter) # and so we return everything in the catalog warnings.warn('Your query %s produced no query restriction. ' 'Currently the entire catalog content is returned. ' 'In Zope 4 this will result in an empty LazyCat ' 'to be returned.' % repr(cr.make_key(query)), DeprecationWarning, stacklevel=3) rlen = len(self) if sort_index is None: sequence, slen = self._limit_sequence(self.data.items(), rlen, b_start, b_size) result = LazyMap(self.instantiate, sequence, slen, actual_result_count=rlen) else: cr.start_split(sort_report_name) result = self.sortResults( self.data, sort_index, reverse, limit, merge, actual_result_count=rlen, b_start=b_start, b_size=b_size) cr.stop_split(sort_report_name, None) elif rs: # We got some results from the indexes. # Sort and convert to sequences. # XXX: The check for 'values' is really stupid since we call # items() and *not* values() rlen = len(rs) if sort_index is None and hasattr(rs, 'items'): # having a 'items' means we have a data structure with # scores. Build a new result set, sort it by score, reverse # it, compute the normalized score, and Lazify it. 
if not merge: # Don't bother to sort here, return a list of # three tuples to be passed later to mergeResults # note that data_record_normalized_score_ cannot be # calculated and will always be 1 in this case getitem = self.__getitem__ result = [(score, (1, score, rid), getitem) for rid, score in rs.items()] else: cr.start_split('sort_on#score') # sort it by score rs = rs.byValue(0) max = float(rs[0][0]) # Here we define our getter function inline so that # we can conveniently store the max value as a default arg # and make the normalized score computation lazy def getScoredResult(item, max=max, self=self): """ Returns instances of self._v_brains, or whatever is passed into self.useBrains. """ score, key = item data = self.data[key] klass = self._v_result_class schema_len = len(klass.__record_schema__) norm_score = int(100.0 * score / max) if schema_len == len(data) + 3: r = klass(tuple(data) + (key, score, norm_score)) else: r = klass(data) r.data_record_id_ = key r.data_record_score_ = score r.data_record_normalized_score_ = norm_score r = r.__of__(aq_parent(self)) return r sequence, slen = self._limit_sequence(rs, rlen, b_start, b_size) result = LazyMap(getScoredResult, sequence, slen, actual_result_count=rlen) cr.stop_split('sort_on#score', None) elif sort_index is None and not hasattr(rs, 'values'): # no scores if hasattr(rs, 'keys'): rs = rs.keys() sequence, slen = self._limit_sequence(rs, rlen, b_start, b_size) result = LazyMap(self.__getitem__, sequence, slen, actual_result_count=rlen) else: # sort. If there are scores, then this block is not # reached, therefore 'sort-on' does not happen in the # context of a text index query. This should probably # sort by relevance first, then the 'sort-on' attribute. cr.start_split(sort_report_name) result = self.sortResults(rs, sort_index, reverse, limit, merge, actual_result_count=rlen, b_start=b_start, b_size=b_size) cr.stop_split(sort_report_name, None) else: # Empty result set result = LazyCat([]) cr.stop() return result def sortResults(self, rs, sort_index, reverse=False, limit=None, merge=True, actual_result_count=None, b_start=0, b_size=None): # Sort a result set using one or more sort indexes. Both sort_index # and reverse can be lists of indexes and reverse specifications. # Return a lazy result set in sorted order if merge is true otherwise # returns a list of (sortkey, uid, getter_function) tuples, where # sortkey can be a tuple on its own. 
second_indexes = None second_indexes_key_map = None sort_index_length = 1 if isinstance(sort_index, list): sort_index_length = len(sort_index) if sort_index_length > 1: second_indexes = sort_index[1:] second_indexes_key_map = [] for si in second_indexes: second_indexes_key_map.append(si.documentToKeyMap()) sort_index = sort_index[0] _self__getitem__ = self.__getitem__ index_key_map = sort_index.documentToKeyMap() result = [] r_append = result.append r_insert = result.insert if hasattr(rs, 'keys'): rs = rs.keys() if actual_result_count is None: rlen = len(rs) actual_result_count = rlen else: rlen = actual_result_count # don't limit to more than what we have if limit is not None and limit >= rlen: limit = rlen # if we want a batch from the end of the result set, reverse sorting # order and limit it, then reverse the result set again switched_reverse = False if b_size and b_start and b_start > rlen / 2: if isinstance(reverse, list): reverse = [not r for r in reverse] else: reverse = not reverse switched_reverse = True b_end = b_start + b_size if b_end >= rlen: overrun = rlen - b_end if b_start >= rlen: # bail out, we are outside the possible range return LazyCat([], 0, actual_result_count) else: b_size += overrun b_start = 0 else: b_start = rlen - b_end limit = b_start + b_size # determine sort_spec if isinstance(reverse, list): sort_spec = [r and -1 or 1 for r in reverse] # limit to current maximum of sort indexes sort_spec = sort_spec[:sort_index_length] # use first sort order for choosing the algorithm first_reverse = reverse[0] else: sort_spec = [] for i in xrange(sort_index_length): sort_spec.append(reverse and -1 or 1) first_reverse = reverse if merge and limit is None and ( rlen > (len(sort_index) * (rlen / 100 + 1))): # The result set is much larger than the sorted index, # so iterate over the sorted index for speed. # TODO: len(sort_index) isn't actually what we want for a keyword # index, as it's only the unique values, not the documents. # Don't use this case while using limit, as we return results of # non-flattened intsets, and would have to merge/unflattened those # before limiting. length = 0 try: intersection(rs, IISet(())) except TypeError: # rs is not an object in the IIBTree family. # Try to turn rs into an IISet. rs = IISet(rs) if sort_index_length == 1: for k, intset in sort_index.items(): # We have an index that has a set of values for # each sort key, so we intersect with each set and # get a sorted sequence of the intersections. intset = intersection(rs, intset) if intset: keys = getattr(intset, 'keys', None) if keys is not None: # Is this ever true? intset = keys() length += len(intset) r_append((k, intset, _self__getitem__)) result.sort(reverse=reverse) else: for k, intset in sort_index.items(): # We have an index that has a set of values for # each sort key, so we intersect with each set and # get a sorted sequence of the intersections. intset = intersection(rs, intset) if intset: keys = getattr(intset, 'keys', None) if keys is not None: # Is this ever true? 
                            intset = keys()
                        length += len(intset)
                        # sort on secondary index
                        keysets = defaultdict(list)
                        for i in intset:
                            full_key = (k, )
                            for km in second_indexes_key_map:
                                try:
                                    full_key += (km[i], )
                                except KeyError:
                                    pass
                            keysets[full_key].append(i)
                        for k2, v2 in keysets.items():
                            r_append((k2, v2, _self__getitem__))
                result = multisort(result, sort_spec)
            sequence, slen = self._limit_sequence(result, length, b_start,
                                                  b_size, switched_reverse)
            result = LazyCat(LazyValues(sequence), slen, actual_result_count)
        elif limit is None or (limit * 4 > rlen):
            # Iterate over the result set getting sort keys from the index.
            # If we are interested in 25% or more of the result set,
            # the N-Best algorithm is slower, so we iterate over all of it.
            if sort_index_length == 1:
                for did in rs:
                    try:
                        key = index_key_map[did]
                    except KeyError:
                        # This document is not in the sort key index, skip it.
                        actual_result_count -= 1
                    else:
                        # The reference back to __getitem__ is used in case
                        # we do not merge now and need to intermingle the
                        # results with those of other catalogs while avoiding
                        # the cost of instantiating a LazyMap per result
                        r_append((key, did, _self__getitem__))
                if merge:
                    result.sort(reverse=reverse)
            else:
                for did in rs:
                    try:
                        full_key = (index_key_map[did], )
                        for km in second_indexes_key_map:
                            full_key += (km[did], )
                    except KeyError:
                        # This document is not in the sort key index, skip it.
                        actual_result_count -= 1
                    else:
                        r_append((full_key, did, _self__getitem__))
                if merge:
                    result = multisort(result, sort_spec)
            if merge:
                if limit is not None:
                    result = result[:limit]
                sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
                                                   switched_reverse)
                result = LazyValues(sequence)
                result.actual_result_count = actual_result_count
            else:
                sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
                                                   switched_reverse)
                return sequence
        elif first_reverse:
            # Limit / sort results using the N-Best algorithm. This is
            # faster for large sets than a full sort and uses far less
            # memory. (A minimal standalone sketch of this selection
            # technique follows this class.)
            keys = []
            k_insert = keys.insert
            n = 0
            worst = None
            if sort_index_length == 1:
                for did in rs:
                    try:
                        key = index_key_map[did]
                    except KeyError:
                        # This document is not in the sort key index, skip it.
                        actual_result_count -= 1
                    else:
                        if n >= limit and key <= worst:
                            continue
                        i = bisect(keys, key)
                        k_insert(i, key)
                        r_insert(i, (key, did, _self__getitem__))
                        if n == limit:
                            del keys[0], result[0]
                        else:
                            n += 1
                        worst = keys[0]
                result.reverse()
            else:
                for did in rs:
                    try:
                        key = index_key_map[did]
                        full_key = (key, )
                        for km in second_indexes_key_map:
                            full_key += (km[did], )
                    except KeyError:
                        # This document is not in the sort key index, skip it.
                        actual_result_count -= 1
                    else:
                        if n >= limit and key <= worst:
                            continue
                        i = bisect(keys, key)
                        k_insert(i, key)
                        r_insert(i, (full_key, did, _self__getitem__))
                        if n == limit:
                            del keys[0], result[0]
                        else:
                            n += 1
                        worst = keys[0]
                result = multisort(result, sort_spec)
            sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
                                               switched_reverse)
            if merge:
                result = LazyValues(sequence)
                result.actual_result_count = actual_result_count
            else:
                return sequence
        elif not first_reverse:
            # Limit / sort results using the N-Best algorithm in reverse
            # (N-Worst?)
            keys = []
            k_insert = keys.insert
            n = 0
            best = None
            if sort_index_length == 1:
                for did in rs:
                    try:
                        key = index_key_map[did]
                    except KeyError:
                        # This document is not in the sort key index, skip it.
                        actual_result_count -= 1
                    else:
                        if n >= limit and key >= best:
                            continue
                        i = bisect(keys, key)
                        k_insert(i, key)
                        r_insert(i, (key, did, _self__getitem__))
                        if n == limit:
                            del keys[-1], result[-1]
                        else:
                            n += 1
                        best = keys[-1]
            else:
                for did in rs:
                    try:
                        key = index_key_map[did]
                        full_key = (key, )
                        for km in second_indexes_key_map:
                            full_key += (km[did], )
                    except KeyError:
                        # This document is not in the sort key index, skip it.
                        actual_result_count -= 1
                    else:
                        if n >= limit and key >= best:
                            continue
                        i = bisect(keys, key)
                        k_insert(i, key)
                        r_insert(i, (full_key, did, _self__getitem__))
                        if n == limit:
                            del keys[-1], result[-1]
                        else:
                            n += 1
                        best = keys[-1]
                result = multisort(result, sort_spec)
            sequence, _ = self._limit_sequence(result, 0, b_start, b_size,
                                               switched_reverse)
            if merge:
                result = LazyValues(sequence)
                result.actual_result_count = actual_result_count
            else:
                return sequence

        return LazyMap(self.__getitem__, result, len(result),
                       actual_result_count=actual_result_count)

    def _get_sort_attr(self, attr, kw):
        """Helper function to find sort-on or sort-order."""
        # There are three different ways to find the attribute:
        # 1. kw[sort-attr]
        # 2. self.sort-attr
        # 3. kw[sort_attr]
        # kw may be a dict or an ExtensionClass MultiMapping, which
        # differ in what get() returns with no default value.
        name = "sort-%s" % attr
        val = kw.get(name, None)
        if val is not None:
            return val
        val = getattr(self, name, None)
        if val is not None:
            return val
        return kw.get("sort_%s" % attr, None)

    def _getSortIndex(self, args):
        """Returns a list of search index objects or None."""
        sort_index_names = self._get_sort_attr("on", args)
        if sort_index_names is not None:
            # self.indexes is always a dict, so get() w/ 1 arg works
            sort_indexes = []
            if not isinstance(sort_index_names, (list, tuple)):
                sort_index_names = [sort_index_names]
            for name in sort_index_names:
                sort_index = self.indexes.get(name)
                if sort_index is None:
                    raise CatalogError('Unknown sort_on index: %s' %
                                       repr(name))
                else:
                    if not hasattr(sort_index, 'documentToKeyMap'):
                        raise CatalogError(
                            'The index chosen for sort_on is '
                            'not capable of being used as a sort index: '
                            '%s' % repr(name))
                sort_indexes.append(sort_index)
            if len(sort_indexes) == 1:
                # be nice and keep the old API intact for single sort_on's
                return sort_indexes[0]
            return sort_indexes
        return None

    def searchResults(self, REQUEST=None, used=None, _merge=True, **kw):
        # You should pass in a simple dictionary as the request argument,
        # which contains only the relevant query.
        # The used argument is deprecated and is ignored.
        if REQUEST is None and not kw:
            # Try to acquire request if we get no args for bw compat
            warnings.warn('Calling searchResults without a query argument or '
                          'keyword arguments is deprecated. '
                          'In Zope 4 the query will no longer be '
                          'automatically taken from the acquired request.',
                          DeprecationWarning, stacklevel=3)
            REQUEST = getattr(self, 'REQUEST', None)

        if isinstance(REQUEST, dict) and not kw:
            # short cut for the best practice
            args = REQUEST
        else:
            args = CatalogSearchArgumentsMap(REQUEST, kw)

        sort_indexes = self._getSortIndex(args)
        sort_limit = self._get_sort_attr('limit', args)
        reverse = False
        if sort_indexes is not None:
            order = self._get_sort_attr("order", args)
            reverse = []
            if order is None:
                order = ['']
            elif isinstance(order, str):
                order = [order]
            for o in order:
                reverse.append(o.lower() in ('reverse', 'descending'))
            if len(reverse) == 1:
                # be nice and keep the old API intact for single sort_order
                reverse = reverse[0]
        # Perform searches with indexes and sort_index
        return self.search(args, sort_indexes, reverse, sort_limit, _merge)

    __call__ = searchResults

    def getCatalogPlan(self, query=None):
        """Query time reporting and planning.
        """
        parent = aq_base(aq_parent(self))
        threshold = getattr(parent, 'long_query_time', 0.1)
        return CatalogPlan(self, query, threshold)
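
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): sortResults() above
# limits sorting work with an N-Best selection that keeps only the ``limit``
# best sort keys seen so far in a small sorted list maintained with
# ``bisect``, instead of sorting the whole result set.  The helper below
# shows that technique in isolation; its name and arguments are invented for
# this example.
from bisect import bisect as _bisect


def _n_best(items, key_map, limit):
    """Return the ``limit`` largest (key, item) pairs found in ``items``.

    ``key_map`` maps an item to its sort key; items without a key are
    skipped, mirroring how sortResults() skips documents that are missing
    from the sort index.
    """
    keys = []     # sorted list of the best keys seen so far
    best = []     # (key, item) pairs kept parallel to ``keys``
    worst = None  # smallest key currently kept
    for item in items:
        try:
            key = key_map[item]
        except KeyError:
            continue
        if len(keys) >= limit and key <= worst:
            continue
        i = _bisect(keys, key)
        keys.insert(i, key)
        best.insert(i, (key, item))
        if len(keys) > limit:
            # drop the current worst entry to stay within the limit
            del keys[0], best[0]
        worst = keys[0]
    best.reverse()  # highest key first
    return best

# Example (hypothetical document ids and sort keys):
#   _n_best([1, 2, 3, 4, 5], {1: 10, 2: 50, 3: 30, 4: 40, 5: 20}, 3)
#   -> [(50, 2), (40, 4), (30, 3)]
#
# A similarly hedged sketch of how the sort arguments handled by
# searchResults() might be passed (the index names here are invented):
#   catalog.searchResults({'sort_on': ('modified', 'getId'),
#                          'sort_order': 'descending',
#                          'sort_limit': 50})
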
class Lexicon(Persistent):
    """Maps words to word ids """

    __implements__ = LexiconInterface

    def __init__(self, truncate_left=0):
        self.truncate_left = truncate_left
        self.clear()

    def clear(self):
        self._nextid = BTrees.Length.Length()
        self._forward_idx = OIBTree()
        self._inverse_idx = IOBTree()
        if self.truncate_left:
            self._lforward_idx = OIBTree()
        else:
            self._lforward_idx = None

    def getWordIdList(self, words):
        """ return a list of wordIds for a list of words """
        fw_idx = self._forward_idx
        fw_idx_get = fw_idx.get
        rev_idx = self._inverse_idx
        if self.truncate_left:
            lfw_idx = self._lforward_idx
        nextid = self._nextid
        wids = []
        append = wids.append
        for word in words:
            wid = fw_idx_get(word)
            if not wid:
                nextid.change(1)
                wid = nextid()
                fw_idx[word] = wid
                rev_idx[wid] = word
                if self.truncate_left:
                    lfw_idx[word[::-1]] = wid
            append(wid)
        return wids

    def getWordId(self, word, default=None):
        """ return the word id for 'word', or 'default' if unknown """
        return self._forward_idx.get(word, default)

    def getWord(self, wid):
        """ return a word by its wid """
        return self._inverse_idx[wid]

    def deleteWord(self, word):
        wid = self._forward_idx[word]
        del self._inverse_idx[wid]
        del self._forward_idx[word]

    def deleteWordId(self, wid):
        word = self._inverse_idx[wid]
        del self._forward_idx[word]
        del self._inverse_idx[wid]

    def getWordsForRightTruncation(self, prefix):
        """ return a sequence of words starting with 'prefix'.
            We use the BTrees range search to perform the lookup.
        """
        assert isinstance(prefix, unicode)
        return self._forward_idx.keys(prefix, prefix + u'\uffff')

    def getWordsForLeftTruncation(self, suffix):
        """ return a sequence of words ending with 'suffix' """
        suffix = suffix[::-1]
        assert isinstance(suffix, unicode)
        return [w[::-1] for w in
                self._lforward_idx.keys(suffix, suffix + u'\uffff')]

    def createRegex(self, pattern):
        """ translate a PATTERN to a regular expression """
        return '%s$' % pattern.replace('*', '.*').replace('?', '.')

    def getSimiliarWords(self, term, threshold=0.75):
        """ return a list of (word, ratio) tuples for words whose
            Levenshtein ratio against 'term' is above 'threshold'
        """
        return [(w, ratio(w, term)) for w in self._forward_idx.keys()
                if ratio(w, term) > threshold]

    def getWordsForPattern(self, pattern):
        """ perform full pattern matching """
        # find the first globbing character in the pattern
        mo = re.search(r'([?*])', pattern)
        if mo is None:
            return [pattern]

        pos = mo.start(1)
        if pos == 0:
            raise QueryParserError(
                'word "%s" should not start with a globbing character'
                % pattern)

        # narrow the candidates down to the literal prefix, then filter
        # with the regular expression built from the pattern
        prefix = pattern[:pos]
        words = self._forward_idx.keys(prefix, prefix + u'\uffff')
        regex = re.compile(self.createRegex(pattern))
        return [word for word in words if regex.match(word)]

    def getWordsInRange(self, w1, w2):
        """ return all words within w1...w2 """
        return self._forward_idx.keys(w1, w2)

    def getWordsForSubstring(self, sub):
        """ return all words that contain *sub* """
        return [word for word in self._forward_idx.keys() if sub in word]

    def getWordIds(self):
        """ return all wids """
        return self._inverse_idx.keys()

    def removeWordId(self, wid):
        """ remove word id 'wid' """
        word = self._inverse_idx[wid]
        del self._inverse_idx[wid]
        del self._forward_idx[word]

    def __len__(self):
        return len(self._inverse_idx.keys())
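
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): typical use of the
# Lexicon above.  Words are interned to integer ids, prefix ("right
# truncation") queries are answered with an OIBTree range search from
# ``prefix`` to ``prefix + u'\uffff'``, and glob patterns are narrowed to
# their literal prefix first and then filtered with the regular expression
# built by createRegex().  The helper name below is invented for this
# example and is never called at import time.

def _lexicon_usage_example():
    lexicon = Lexicon(truncate_left=1)
    lexicon.getWordIdList([u'apple', u'applet', u'banana'])

    # Right truncation: all words starting with 'app'
    assert list(lexicon.getWordsForRightTruncation(u'app')) == \
        [u'apple', u'applet']

    # Left truncation: all words ending with 'let'
    assert lexicon.getWordsForLeftTruncation(u'let') == [u'applet']

    # Glob pattern: prefix range search plus regex filtering
    assert lexicon.getWordsForPattern(u'app*') == [u'apple', u'applet']

    # word -> wid -> word round trip
    wid = lexicon.getWordId(u'banana')
    assert lexicon.getWord(wid) == u'banana'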