class Repository(Implicit, Persistent):
    """The repository implementation manages the actual data of versions
       and version histories. It does not handle user interface issues."""

    def __init__(self):
        # These keep track of symbolic label and branch names that
        # have been used to ensure that they don't collide.
        self._branches = OIBTree()
        self._branches['mainline'] = 1
        self._labels = OIBTree()

        # Mapping: history_id (string) -> ZopeVersionHistory.
        self._histories = OOBTree()
        self._created = time.time()

    security = ClassSecurityInfo()

    security.declarePrivate('createVersionHistory')
    def createVersionHistory(self, object):
        """Internal: create a new version history for a resource."""
        # When one creates the first version in a version history, neither
        # the version or version history yet have a _p_jar, which causes
        # copy operations to fail. To work around that, we share our _p_jar.
        history_id = None
        # Draw random ids until we find one not already in use.
        while history_id is None or self._histories.has_key(history_id):
            history_id = str(randint(1, 9999999999))
        history = ZopeVersionHistory(history_id, object)
        self._histories[history_id] = history
        return history.__of__(self)

    security.declarePrivate('getVersionHistory')
    def getVersionHistory(self, history_id):
        """Internal: return a version history given a version history id."""
        return self._histories[history_id].__of__(self)

    security.declarePrivate('replaceState')
    def replaceState(self, obj, new_state):
        """Internal: replace the state of a persistent object."""
        non_versioned = getNonVersionedData(obj)
        # XXX There ought to be some way to do this more cleanly.
        # This fills the __dict__ of the old object with new state.
        # The other way to achieve the desired effect is to replace
        # the object in its container, but this method preserves the
        # identity of the object.
        if obj.__class__ is not new_state.__class__:
            # Bug fix: repr() takes a single argument; the original called
            # repr(obj.__class__, new_state.__class__), which raised
            # TypeError instead of the intended VersionControlError.
            raise VersionControlError(
                'The class of the versioned object has changed. %s != %s'
                % (repr(obj.__class__), repr(new_state.__class__)))
        obj._p_changed = 1
        # Drop attributes that no longer exist in the new state ...
        for key in obj.__dict__.keys():
            if not new_state.__dict__.has_key(key):
                del obj.__dict__[key]
        # ... and copy over everything the new state carries.
        for key, value in new_state.__dict__.items():
            obj.__dict__[key] = value
        if non_versioned:
            # Restore the non-versioned data into the new state.
            restoreNonVersionedData(obj, non_versioned)
        return obj

    #####################################################################
    # This is the implementation of the public version control interface.
    #####################################################################

    security.declarePublic('isAVersionableResource')
    def isAVersionableResource(self, obj):
        """Return true if the object may be placed under version control."""
        # For now, an object must be persistent (have its own db record)
        # in order to be considered a versionable resource.
        return isAVersionableResource(obj)

    security.declarePublic('isUnderVersionControl')
    def isUnderVersionControl(self, object):
        """Return true if the object is already version controlled."""
        return hasattr(object, '__vc_info__')

    security.declarePublic('isResourceUpToDate')
    def isResourceUpToDate(self, object, require_branch=0):
        """Return true if the object is the latest version on its branch.

        With require_branch true, an object stuck to a non-branch selector
        (version, label or date) is never considered up to date.
        """
        info = self.getVersionInfo(object)
        history = self.getVersionHistory(info.history_id)
        branch = 'mainline'
        if info.sticky:
            if info.sticky[0] == 'B':
                branch = info.sticky[1]
            elif require_branch:
                # The object is updated to a particular version
                # rather than a branch. The caller requires a branch.
                return 0
        return history.isLatestVersion(info.version_id, branch)

    security.declarePublic('isResourceChanged')
    def isResourceChanged(self, object):
        """Return true if the state of a resource has changed in a
        transaction *after* the version bookkeeping was saved.  Note that
        this method is not appropriate for detecting changes within a
        transaction!"""
        info = self.getVersionInfo(object)
        itime = getattr(info, '_p_mtime', None)
        if itime is None:
            return 0
        mtime = Utility._findModificationTime(object)
        if mtime is None:
            return 0
        return mtime > itime

    security.declarePublic('getVersionInfo')
    def getVersionInfo(self, object):
        """Return the VersionInfo bookkeeping object for a resource.

        Raises VersionControlError if the resource is not controlled.
        """
        info = getattr(object, '__vc_info__', None)
        if info is not None:
            return info
        raise VersionControlError(
            'The specified resource is not under version control.'
            )

    security.declareProtected(use_vc_permission, 'applyVersionControl')
    def applyVersionControl(self, object, message=None):
        """Place a resource under version control.

        Creates the version history and initial version and stamps the
        object with its __vc_info__ bookkeeping record.
        """
        if self.isUnderVersionControl(object):
            raise VersionControlError(
                'The resource is already under version control.'
                )
        if not self.isAVersionableResource(object):
            raise VersionControlError(
                'This resource cannot be put under version control.'
                )

        # Need to check the parent to see if the container of the object
        # being put under version control is itself a version-controlled
        # object. If so, we need to use the branch id of the container.
        branch = 'mainline'
        parent = aq_parent(aq_inner(object))
        p_info = getattr(parent, '__vc_info__', None)
        if p_info is not None:
            sticky = p_info.sticky
            if sticky and sticky[0] == 'B':
                branch = sticky[1]

        # Create a new version history and initial version object.
        history = self.createVersionHistory(object)
        version = history.createVersion(object, branch)

        history_id = history.getId()
        version_id = version.getId()

        # Add bookkeeping information to the version controlled object.
        info = VersionInfo(history_id, version_id, VersionInfo.CHECKED_IN)
        if branch != 'mainline':
            info.sticky = ('B', branch)
        object.__vc_info__ = info

        # Save an audit record of the action being performed.
        history.addLogEntry(version_id,
                            LogEntry.ACTION_CHECKIN,
                            _findPath(object),
                            message is None and 'Initial checkin.' or message
                            )
        return object

    security.declareProtected(use_vc_permission, 'checkoutResource')
    def checkoutResource(self, object):
        """Check out a resource, marking it as being edited."""
        info = self.getVersionInfo(object)
        if info.status != info.CHECKED_IN:
            raise VersionControlError(
                'The selected resource is already checked out.'
                )

        if info.sticky and info.sticky[0] != 'B':
            raise VersionControlError(
                'The selected resource has been updated to a particular '
                'version, label or date. The resource must be updated to '
                'the mainline or a branch before it may be checked out.'
                )

        if not self.isResourceUpToDate(object):
            raise VersionControlError(
                'The selected resource is not up to date!'
                )

        history = self.getVersionHistory(info.history_id)
        ob_path = _findPath(object)

        # Save an audit record of the action being performed.
        history.addLogEntry(info.version_id,
                            LogEntry.ACTION_CHECKOUT,
                            ob_path
                            )

        # Update bookkeeping information.
        newinfo = info.clone()
        newinfo.status = newinfo.CHECKED_OUT
        object.__vc_info__ = newinfo
        return object

    security.declareProtected(use_vc_permission, 'checkinResource')
    def checkinResource(self, object, message=''):
        """Check in a checked-out resource, creating a new version."""
        info = self.getVersionInfo(object)
        if info.status != info.CHECKED_OUT:
            raise VersionControlError(
                'The selected resource is not checked out.'
                )

        if info.sticky and info.sticky[0] != 'B':
            raise VersionControlError(
                'The selected resource has been updated to a particular '
                'version, label or date. The resource must be updated to '
                'the mainline or a branch before it may be checked in.'
                )

        if not self.isResourceUpToDate(object):
            raise VersionControlError(
                'The selected resource is not up to date!'
                )

        history = self.getVersionHistory(info.history_id)
        ob_path = _findPath(object)

        branch = 'mainline'
        if info.sticky is not None and info.sticky[0] == 'B':
            branch = info.sticky[1]

        version = history.createVersion(object, branch)

        # Save an audit record of the action being performed.
        history.addLogEntry(version.getId(),
                            LogEntry.ACTION_CHECKIN,
                            ob_path,
                            message
                            )

        # Update bookkeeping information.
        newinfo = info.clone()
        newinfo.version_id = version.getId()
        newinfo.status = newinfo.CHECKED_IN
        object.__vc_info__ = newinfo
        return object

    security.declareProtected(use_vc_permission, 'uncheckoutResource')
    def uncheckoutResource(self, object):
        """Discard changes, reverting to the checked-out base version."""
        info = self.getVersionInfo(object)
        if info.status != info.CHECKED_OUT:
            raise VersionControlError(
                'The selected resource is not checked out.'
                )

        history = self.getVersionHistory(info.history_id)
        ob_path = _findPath(object)

        version = history.getVersionById(info.version_id)
        new_obj = version.copyState()

        # Save an audit record of the action being performed.
        history.addLogEntry(info.version_id,
                            LogEntry.ACTION_UNCHECKOUT,
                            ob_path
                            )

        # Replace the state of the object with a reverted state.
        new_obj = self.replaceState(object, new_obj)

        # Update bookkeeping information.
        newinfo = info.clone()
        newinfo.version_id = version.getId()
        newinfo.status = newinfo.CHECKED_IN
        new_obj.__vc_info__ = newinfo
        return new_obj

    security.declareProtected(use_vc_permission, 'updateResource')
    def updateResource(self, object, selector=None):
        """Update a checked-in resource to the version named by selector.

        The selector may be a version id, a label, a branch id or a date;
        with no selector, the resource is updated along its sticky tag.
        """
        info = self.getVersionInfo(object)
        if info.status != info.CHECKED_IN:
            raise VersionControlError(
                'The selected resource must be checked in to be updated.'
                )

        history = self.getVersionHistory(info.history_id)
        version = None
        sticky = info.sticky

        if not selector:
            # If selector is null, update to the latest version taking any
            # sticky attrs into account (branch, date). Note that the sticky
            # tag could also be a date or version id. We don't bother checking
            # for those, since in both cases we do nothing (because we'll
            # always be up to date until the sticky tag changes).
            if sticky and sticky[0] == 'L':
                # A label sticky tag, so update to that label (since it is
                # possible, but unlikely, that the label has been moved).
                version = history.getVersionByLabel(sticky[1])
            elif sticky and sticky[0] == 'B':
                # A branch sticky tag. Update to latest version on branch.
                # Bug fix: the original passed `selector` here, which is
                # always falsy in this code path; the branch id is sticky[1].
                version = history.getLatestVersion(sticky[1])
            else:
                # Update to mainline, forgetting any date or version id
                # sticky tag that was previously associated with the object.
                version = history.getLatestVersion('mainline')
                sticky = None
        else:
            # If the selector is non-null, we find the version specified
            # and update the sticky tag. Later we'll check the version we
            # found and decide whether we really need to update the object.
            if history.hasVersionId(selector):
                version = history.getVersionById(selector)
                sticky = ('V', selector)
            elif self._labels.has_key(selector):
                version = history.getVersionByLabel(selector)
                sticky = ('L', selector)
            elif self._branches.has_key(selector):
                version = history.getLatestVersion(selector)
                if selector == 'mainline':
                    sticky = None
                else:
                    sticky = ('B', selector)
            else:
                try:
                    date = DateTime(selector)
                except:
                    raise VersionControlError(
                        'Invalid version selector: %s' % selector
                        )
                else:
                    timestamp = date.timeTime()
                    sticky = ('D', timestamp)
                    # Fix!
                    branch = history.findBranchId(info.version_id)
                    version = history.getVersionByDate(branch, timestamp)

        # If the state of the resource really needs to be changed, do the
        # update and make a log entry for the update.
        version_id = version and version.getId() or info.version_id
        new_object = object
        if version and (version_id != info.version_id):
            new_object = version.copyState()
            new_object = self.replaceState(object, new_object)

            history.addLogEntry(version_id,
                                LogEntry.ACTION_UPDATE,
                                _findPath(new_object)
                                )

        # Update bookkeeping information.
        newinfo = info.clone(1)
        newinfo.version_id = version_id
        newinfo.status = newinfo.CHECKED_IN
        if sticky is not None:
            newinfo.sticky = sticky
        new_object.__vc_info__ = newinfo
        return new_object

    security.declareProtected(use_vc_permission, 'labelResource')
    def labelResource(self, object, label, force=0):
        """Apply a symbolic label to the current version of a resource."""
        info = self.getVersionInfo(object)
        if info.status != info.CHECKED_IN:
            raise VersionControlError(
                'The selected resource must be checked in to be labeled.'
                )

        # Make sure that labels and branch ids do not collide.
        if self._branches.has_key(label) or label == 'mainline':
            raise VersionControlError(
                'The label value given is already in use as an activity id.'
                )
        if not self._labels.has_key(label):
            self._labels[label] = 1

        history = self.getVersionHistory(info.history_id)
        history.labelVersion(info.version_id, label, force)
        return object

    security.declareProtected(use_vc_permission, 'makeActivity')
    def makeActivity(self, object, branch_id):
        """Create a new branch (activity) rooted at the current version.

        Note - this is not part of the official version control API yet.
        It is here to allow unit testing of the architectural aspects
        that are already in place to support activities in the future.
        """
        info = self.getVersionInfo(object)
        if info.status != info.CHECKED_IN:
            raise VersionControlError(
                'The selected resource must be checked in.'
                )

        branch_id = branch_id or None

        # Make sure that activity ids and labels do not collide.
        if self._labels.has_key(branch_id) or branch_id == 'mainline':
            raise VersionControlError(
                'The value given is already in use as a version label.'
                )

        if not self._branches.has_key(branch_id):
            self._branches[branch_id] = 1

        history = self.getVersionHistory(info.history_id)

        if history._branches.has_key(branch_id):
            raise VersionControlError(
                'The resource is already associated with the given activity.'
                )

        history.createBranch(branch_id, info.version_id)
        return object

    security.declareProtected(use_vc_permission, 'getVersionOfResource')
    def getVersionOfResource(self, history_id, selector):
        """Return a new object with the state of the selected version.

        The selector may be a version id, label, branch id or date.
        """
        history = self.getVersionHistory(history_id)
        sticky = None

        if not selector or selector == 'mainline':
            version = history.getLatestVersion('mainline')
        else:
            if history.hasVersionId(selector):
                version = history.getVersionById(selector)
                sticky = ('V', selector)
            elif self._labels.has_key(selector):
                version = history.getVersionByLabel(selector)
                sticky = ('L', selector)
            elif self._branches.has_key(selector):
                version = history.getLatestVersion(selector)
                sticky = ('B', selector)
            else:
                try:
                    date = DateTime(selector)
                except:
                    raise VersionControlError(
                        'Invalid version selector: %s' % selector
                        )
                else:
                    timestamp = date.timeTime()
                    sticky = ('D', timestamp)
                    version = history.getVersionByDate('mainline', timestamp)

        object = version.copyState()

        info = VersionInfo(history_id, version.getId(),
                           VersionInfo.CHECKED_IN)
        if sticky is not None:
            info.sticky = sticky
        object.__vc_info__ = info
        return object

    security.declareProtected(use_vc_permission, 'getVersionIds')
    def getVersionIds(self, object):
        """Return the sequence of version ids for a resource's history."""
        info = self.getVersionInfo(object)
        history = self.getVersionHistory(info.history_id)
        return history.getVersionIds()

    security.declareProtected(use_vc_permission, 'getLabelsForResource')
    def getLabelsForResource(self, object):
        """Return the labels defined in a resource's version history."""
        info = self.getVersionInfo(object)
        history = self.getVersionHistory(info.history_id)
        return history.getLabels()

    security.declareProtected(use_vc_permission, 'getLogEntries')
    def getLogEntries(self, object):
        """Return the audit log entries for a resource."""
        info = self.getVersionInfo(object)
        history = self.getVersionHistory(info.history_id)
        return history.getLogEntries()
class GlobbingLexicon(Lexicon):
    """Lexicon which supports basic globbing function ('*' and '?').

    This lexicon keeps several data structures around that are useful
    for searching. They are:

      '_lexicon' -- Contains the mapping from word => word_id

      '_inverseLex' -- Contains the mapping from word_id => word

      '_digrams' -- Contains a mapping from digram => word_id

    Before going further, it is necessary to understand what a digram is,
    as it is a core component of the structure of this lexicon.  A digram
    is a two-letter sequence in a word.  For example, the word 'zope'
    would be converted into the digrams::

      ['$z', 'zo', 'op', 'pe', 'e$']

    where the '$' is a word marker.  It is used at the beginning and end
    of the words.  Those digrams are significant.
    """

    multi_wc = '*'    # multi-character wildcard
    single_wc = '?'   # single-character wildcard
    eow = '$'         # end-of-word marker used when building digrams

    def __init__(self, useSplitter=None, extra=None):
        """Initialize empty indexes and resolve the splitter function.

        `useSplitter` names the splitter implementation; `extra` carries
        its configuration parameters (see Splitter() below).
        """
        self.clear()
        self.useSplitter = useSplitter
        self.splitterParams = extra
        # NOTE: `Splitter` here is the module-level splitter registry, not
        # the method of the same name defined further down.
        self.SplitterFunc = Splitter.getSplitter(self.useSplitter)

    def clear(self):
        """Reset the word, inverse-word and digram indexes."""
        self._lexicon = OIBTree()
        self._inverseLex = IOBTree()
        self._digrams = OOBTree()

    def _convertBTrees(self, threshold=200):
        """Upgrade old-generation BTrees in place (BBB migration hook)."""
        Lexicon._convertBTrees(self, threshold)
        if type(self._digrams) is OOBTree:
            return  # already converted

        from BTrees.convert import convert
        _digrams = self._digrams
        self._digrams = OOBTree()
        # Share our jar so the fresh tree is persisted in the same database.
        self._digrams._p_jar = self._p_jar
        convert(_digrams, self._digrams, threshold, IITreeSet)

    def createDigrams(self, word):
        """Returns a list with the set of digrams in the word."""
        word = '$'+word+'$'
        return [word[i:i+2] for i in range(len(word)-1)]

    def getWordId(self, word):
        """Provided 'word', return the matching integer word id."""
        if self._lexicon.has_key(word):
            return self._lexicon[word]
        else:
            return self.assignWordId(word)

    set = getWordId  # Kludge for old code

    def getWord(self, wid):
        """Return the word for a word id, or None if unknown."""
        return self._inverseLex.get(wid, None)

    def assignWordId(self, word):
        """Assigns a new word id to the provided word, and return it."""
        # Double check it's not in the lexicon already, and if it is, just
        # return it.
        if self._lexicon.has_key(word):
            return self._lexicon[word]

        # Get word id. BBB Backward compat pain.
        inverse = self._inverseLex
        try:
            insert = inverse.insert
        except AttributeError:
            # we have an "old" BTree object: allocate ids sequentially.
            if inverse:
                wid = inverse.keys()[-1]+1
            else:
                self._inverseLex = IOBTree()
                wid = 1
            inverse[wid] = word
        else:
            # we have a "new" IOBTree object: draw random ids until one
            # is free (insert() returns false on collision).
            wid = randid()
            while not inverse.insert(wid, word):
                wid = randid()

        self._lexicon[word] = wid

        # Now take all the digrams and insert them into the digram map.
        for digram in self.createDigrams(word):
            set = self._digrams.get(digram, None)
            if set is None:
                self._digrams[digram] = set = IISet()
            set.insert(wid)

        return wid

    def get(self, pattern):
        """ Query the lexicon for words matching a pattern."""
        # single word pattern produce a slicing problem below.
        # Because the splitter throws away single characters we can
        # return an empty tuple here.
        if len(pattern) == 1:
            return ()

        wc_set = [self.multi_wc, self.single_wc]

        digrams = []
        globbing = 0
        # Build the digrams of the non-wildcard runs of the pattern.
        for i in range(len(pattern)):
            if pattern[i] in wc_set:
                globbing = 1
                continue

            if i == 0:
                digrams.insert(i, (self.eow + pattern[i]))
                digrams.append((pattern[i] + pattern[i+1]))
            else:
                try:
                    if pattern[i+1] not in wc_set:
                        digrams.append(pattern[i] + pattern[i+1])
                except IndexError:
                    # Last character: close the word with the eow marker.
                    digrams.append((pattern[i] + self.eow))

        if not globbing:
            # No wildcards at all: a plain exact-word lookup.
            result = self._lexicon.get(pattern, None)
            if result is None:
                return ()
            return (result, )

        ## now get all of the intsets that contain the result digrams
        result = None
        for digram in digrams:
            # union(x, None) returns x, so unknown digrams are ignored.
            result = union(result, self._digrams.get(digram, None))

        if not result:
            return ()
        else:
            ## now we have narrowed the list of possible candidates
            ## down to those words which contain digrams. However,
            ## some words may have been returned that match digrams,
            ## but do not match 'pattern'. This is because some words
            ## may contain all matching digrams, but in the wrong
            ## order.
            expr = re.compile(self.createRegex(pattern))
            words = []
            hits = IISet()
            for x in result:
                if expr.match(self._inverseLex[x]):
                    hits.insert(x)
            return hits

    def __getitem__(self, word):
        """ """
        return self.get(word)

    def query_hook(self, q):
        """expand wildcards"""
        ListType = type([])
        i = len(q) - 1
        # Walk the parsed query backwards, replacing each glob term with
        # an Or-joined list of the matching word ids.
        while i >= 0:
            e = q[i]
            if isinstance(e, ListType):
                self.query_hook(e)
            elif isinstance(e, Op):
                pass
            elif ((self.multi_wc in e) or (self.single_wc in e)):
                wids = self.get(e)
                words = []
                for wid in wids:
                    if words:
                        words.append(Or)
                    words.append(wid)
                if not words:
                    # if words is empty, return something that will make
                    # textindex's __getitem__ return an empty result list
                    words.append('')
                q[i] = words
            i = i - 1

        return q

    def Splitter(self, astring, words=None, encoding="latin1"):
        """ wrap the splitter """
        ## don't do anything, less efficient but there's not much
        ## sense in stemming a globbing lexicon.
        # NOTE(review): the bare except silently falls back to the plain
        # call signature whenever splitterParams is unset (or the splitter
        # rejects the keyword arguments) — it also swallows everything else.
        try:
            return self.SplitterFunc(
                astring,
                words,
                encoding=encoding,
                singlechar=self.splitterParams.splitterSingleChars,
                indexnumbers=self.splitterParams.splitterIndexNumbers,
                casefolding=self.splitterParams.splitterCasefolding
                )
        except:
            return self.SplitterFunc(astring, words)

    def createRegex(self, pat):
        """Translate a PATTERN to a regular expression.

        There is no way to quote meta-characters.
        """
        # Remove characters that are meaningful in a regex
        if not isinstance(pat, UnicodeType):
            transTable = string.maketrans("", "")
            result = string.translate(pat, transTable, r'()&|!@#$%^{}\<>.')
        else:
            transTable = {}
            for ch in r'()&|!@#$%^{}\<>.':
                transTable[ord(ch)] = None
            result = pat.translate(transTable)

        # First, deal with multi-character globbing
        result = result.replace('*', '.*')

        # Next, we need to deal with single-character globbing
        result = result.replace('?', '.')

        # Anchor the end; re.match() already anchors the start.
        return "%s$" % result
class GlobbingLexicon(Lexicon):
    """Lexicon which supports basic globbing function ('*' and '?').

    This lexicon keeps several data structures around that are useful
    for searching. They are:

      '_lexicon' -- Contains the mapping from word => word_id

      '_inverseLex' -- Contains the mapping from word_id => word

      '_digrams' -- Contains a mapping from digram => word_id

    Before going further, it is necessary to understand what a digram is,
    as it is a core component of the structure of this lexicon.  A digram
    is a two-letter sequence in a word.  For example, the word 'zope'
    would be converted into the digrams::

      ['$z', 'zo', 'op', 'pe', 'e$']

    where the '$' is a word marker.  It is used at the beginning and end
    of the words.  Those digrams are significant.
    """

    multi_wc = '*'    # multi-character wildcard
    single_wc = '?'   # single-character wildcard
    eow = '$'         # end-of-word marker used when building digrams

    def __init__(self, useSplitter=None, extra=None):
        """Initialize empty indexes and resolve the splitter function."""
        self.clear()
        self.useSplitter = useSplitter
        self.splitterParams = extra
        # `Splitter` here is the module-level registry, not the method below.
        self.SplitterFunc = Splitter.getSplitter(self.useSplitter)

    def clear(self):
        """Reset the word, inverse-word and digram indexes."""
        self._lexicon = OIBTree()
        self._inverseLex = IOBTree()
        self._digrams = OOBTree()

    def _convertBTrees(self, threshold=200):
        """Upgrade old-generation BTrees in place (BBB migration hook)."""
        Lexicon._convertBTrees(self, threshold)
        if type(self._digrams) is OOBTree:
            return  # already converted

        from BTrees.convert import convert
        _digrams = self._digrams
        self._digrams = OOBTree()
        # Share our jar so the fresh tree persists in the same database.
        self._digrams._p_jar = self._p_jar
        convert(_digrams, self._digrams, threshold, IITreeSet)

    def createDigrams(self, word):
        """Returns a list with the set of digrams in the word."""
        word = '$' + word + '$'
        return [word[i:i + 2] for i in range(len(word) - 1)]

    def getWordId(self, word):
        """Provided 'word', return the matching integer word id."""
        if self._lexicon.has_key(word):
            return self._lexicon[word]
        else:
            return self.assignWordId(word)

    set = getWordId  # Kludge for old code

    def getWord(self, wid):
        """Return the word for a word id, or None if unknown."""
        return self._inverseLex.get(wid, None)

    def assignWordId(self, word):
        """Assigns a new word id to the provided word, and return it."""
        # Double check it's not in the lexicon already, and if it is, just
        # return it.
        if self._lexicon.has_key(word):
            return self._lexicon[word]

        # Get word id. BBB Backward compat pain.
        inverse = self._inverseLex
        try:
            insert = inverse.insert
        except AttributeError:
            # we have an "old" BTree object: allocate ids sequentially.
            if inverse:
                wid = inverse.keys()[-1] + 1
            else:
                self._inverseLex = IOBTree()
                wid = 1
            inverse[wid] = word
        else:
            # we have a "new" IOBTree object: draw random ids until one is
            # free (insert() returns false on collision).
            wid = randid()
            while not inverse.insert(wid, word):
                wid = randid()

        self._lexicon[word] = wid

        # Now take all the digrams and insert them into the digram map.
        for digram in self.createDigrams(word):
            set = self._digrams.get(digram, None)
            if set is None:
                self._digrams[digram] = set = IISet()
            set.insert(wid)

        return wid

    def get(self, pattern):
        """ Query the lexicon for words matching a pattern."""
        # single word pattern produce a slicing problem below.
        # Because the splitter throws away single characters we can
        # return an empty tuple here.
        if len(pattern) == 1:
            return ()

        wc_set = [self.multi_wc, self.single_wc]

        digrams = []
        globbing = 0
        # Build the digrams of the non-wildcard runs of the pattern.
        for i in range(len(pattern)):
            if pattern[i] in wc_set:
                globbing = 1
                continue

            if i == 0:
                digrams.insert(i, (self.eow + pattern[i]))
                digrams.append((pattern[i] + pattern[i + 1]))
            else:
                try:
                    if pattern[i + 1] not in wc_set:
                        digrams.append(pattern[i] + pattern[i + 1])
                except IndexError:
                    # Last character: close the word with the eow marker.
                    digrams.append((pattern[i] + self.eow))

        if not globbing:
            # No wildcards at all: a plain exact-word lookup.
            result = self._lexicon.get(pattern, None)
            if result is None:
                return ()
            return (result, )

        ## now get all of the intsets that contain the result digrams
        result = None
        for digram in digrams:
            # union(x, None) returns x, so unknown digrams are ignored.
            result = union(result, self._digrams.get(digram, None))

        if not result:
            return ()
        else:
            ## now we have narrowed the list of possible candidates
            ## down to those words which contain digrams. However,
            ## some words may have been returned that match digrams,
            ## but do not match 'pattern'. This is because some words
            ## may contain all matching digrams, but in the wrong
            ## order.
            expr = re.compile(self.createRegex(pattern))
            # (Removed an unused `words = []` local that the original
            # declared here and never read.)
            hits = IISet()
            for x in result:
                if expr.match(self._inverseLex[x]):
                    hits.insert(x)
            return hits

    def __getitem__(self, word):
        """ """
        return self.get(word)

    def query_hook(self, q):
        """expand wildcards"""
        ListType = type([])
        i = len(q) - 1
        # Walk the parsed query backwards, replacing each glob term with
        # an Or-joined list of the matching word ids.
        while i >= 0:
            e = q[i]
            if isinstance(e, ListType):
                self.query_hook(e)
            elif isinstance(e, Op):
                pass
            elif ((self.multi_wc in e) or (self.single_wc in e)):
                wids = self.get(e)
                words = []
                for wid in wids:
                    if words:
                        words.append(Or)
                    words.append(wid)
                if not words:
                    # if words is empty, return something that will make
                    # textindex's __getitem__ return an empty result list
                    words.append('')
                q[i] = words
            i = i - 1

        return q

    def Splitter(self, astring, words=None, encoding="latin1"):
        """ wrap the splitter """
        ## don't do anything, less efficient but there's not much
        ## sense in stemming a globbing lexicon.
        try:
            return self.SplitterFunc(
                astring,
                words,
                encoding=encoding,
                singlechar=self.splitterParams.splitterSingleChars,
                indexnumbers=self.splitterParams.splitterIndexNumbers,
                casefolding=self.splitterParams.splitterCasefolding)
        except Exception:
            # Fall back to the plain call signature when splitterParams is
            # unset or the splitter rejects the keyword arguments.
            # (Narrowed from a bare `except:` so KeyboardInterrupt and
            # SystemExit are no longer swallowed.)
            return self.SplitterFunc(astring, words)

    def createRegex(self, pat):
        """Translate a PATTERN to a regular expression.

        There is no way to quote meta-characters.
        """
        # Remove characters that are meaningful in a regex
        if not isinstance(pat, UnicodeType):
            transTable = string.maketrans("", "")
            result = string.translate(pat, transTable, r'()&|!@#$%^{}\<>.')
        else:
            transTable = {}
            for ch in r'()&|!@#$%^{}\<>.':
                transTable[ord(ch)] = None
            result = pat.translate(transTable)

        # First, deal with multi-character globbing
        result = result.replace('*', '.*')

        # Next, we need to deal with single-character globbing
        result = result.replace('?', '.')

        # Anchor the end; re.match() already anchors the start.
        return "%s$" % result
class Indexer(object):
    """Full-text indexer for MH mailboxes backed by a ZODB FileStorage.

    Maintains a text index plus docid<->path bookkeeping trees and offers
    an interactive query loop.  NOTE(review): this is Python 2 era code
    (raw_input, sys.maxint, time.clock, dict.has_key).
    """

    # Class-level fallbacks so close() is safe even if __init__ failed early.
    filestorage = database = connection = root = None

    def __init__(self, datafs, writable=0, trans=0, pack=0):
        """Open (or create) the database at `datafs`.

        trans/pack are commit/pack batch limits; 0 disables auto-commit
        and auto-pack respectively (see maycommit/commit).
        """
        self.trans_limit = trans
        self.pack_limit = pack
        self.trans_count = 0
        self.pack_count = 0
        self.stopdict = get_stopdict()
        self.mh = mhlib.MH()
        self.filestorage = FileStorage(datafs, read_only=(not writable))
        self.database = DB(self.filestorage)
        self.connection = self.database.open()
        self.root = self.connection.root()
        # Lazily create each persistent structure on first use (EAFP).
        try:
            self.index = self.root["index"]
        except KeyError:
            self.index = self.root["index"] = TextIndexWrapper()
        try:
            self.docpaths = self.root["docpaths"]
        except KeyError:
            self.docpaths = self.root["docpaths"] = IOBTree()
        try:
            self.doctimes = self.root["doctimes"]
        except KeyError:
            self.doctimes = self.root["doctimes"] = IIBTree()
        try:
            self.watchfolders = self.root["watchfolders"]
        except KeyError:
            self.watchfolders = self.root["watchfolders"] = {}
        # Non-persistent reverse map, rebuilt from docpaths on every start.
        self.path2docid = OIBTree()
        for docid in self.docpaths.keys():
            path = self.docpaths[docid]
            self.path2docid[path] = docid
        try:
            self.maxdocid = max(self.docpaths.keys())
        except ValueError:
            # Empty docpaths tree.
            self.maxdocid = 0
        print(len(self.docpaths), "Document ids")
        print(len(self.path2docid), "Pathnames")
        print(self.index.lexicon.length(), "Words")

    def dumpfreqs(self):
        """Print every word with its total frequency, most frequent first."""
        lexicon = self.index.lexicon
        index = self.index.index
        assert isinstance(index, OkapiIndex)
        L = []
        for wid in lexicon.wids():
            freq = 0
            # Sum per-document frequencies for this word id.
            for f in index._wordinfo.get(wid, {}).values():
                freq += f
            L.append((freq, wid, lexicon.get_word(wid)))
        L.sort()
        L.reverse()
        for freq, wid, word in L:
            print("%10d %10d %s" % (wid, freq, word))

    def dumpwids(self):
        """Print every word id with its total frequency, in wid order."""
        lexicon = self.index.lexicon
        index = self.index.index
        assert isinstance(index, OkapiIndex)
        for wid in lexicon.wids():
            freq = 0
            for f in index._wordinfo.get(wid, {}).values():
                freq += f
            print("%10d %10d %s" % (wid, freq, lexicon.get_word(wid)))

    def dumpwords(self):
        """Print every word with its total frequency, in word order."""
        lexicon = self.index.lexicon
        index = self.index.index
        assert isinstance(index, OkapiIndex)
        for word in lexicon.words():
            wid = lexicon.get_wid(word)
            freq = 0
            for f in index._wordinfo.get(wid, {}).values():
                freq += f
            print("%10d %10d %s" % (wid, freq, word))

    def close(self):
        """Close connection, database and storage, tolerating partial init."""
        self.root = None
        if self.connection is not None:
            self.connection.close()
            self.connection = None
        if self.database is not None:
            self.database.close()
            self.database = None
        if self.filestorage is not None:
            self.filestorage.close()
            self.filestorage = None

    def interact(self, nbest=NBEST, maxlines=MAXLINES):
        """Interactive query loop; empty input pages through more hits."""
        try:
            import readline  # optional: line editing for raw_input
        except ImportError:
            pass
        text = ""
        top = 0
        results = []
        while 1:
            try:
                line = raw_input("Query: ")
            except EOFError:
                print("\nBye.")
                break
            line = line.strip()
            if line.startswith("/"):
                # "/" commands operate on the previous page of results.
                self.specialcommand(line, results, top - nbest)
                continue
            if line:
                text = line
                top = 0
            else:
                # Blank line: continue paging the previous query, if any.
                if not text:
                    continue
            try:
                results, n = self.timequery(text, top + nbest)
            except KeyboardInterrupt:
                raise
            except:
                # NOTE(review): bare except — any query error is reported
                # and the query is reset.
                reportexc()
                text = ""
                continue
            if len(results) <= top:
                if not n:
                    print("No hits for %r." % text)
                else:
                    print("No more hits for %r." % text)
                text = ""
                continue
            print("[Results %d-%d from %d" % (top+1, min(n, top+nbest), n),
                  end=" ")
            print("for query %s]" % repr(text))
            self.formatresults(text, results, maxlines, top, top+nbest)
            top += nbest

    def specialcommand(self, line, results, first):
        """Handle a "/<n>" command: display message n of the last results."""
        assert line.startswith("/")
        line = line[1:]
        if not line:
            n = first
        else:
            try:
                n = int(line) - 1
            except:
                print("Huh?")
                return
        if n < 0 or n >= len(results):
            print("Out of range")
            return
        docid, score = results[n]
        path = self.docpaths[docid]
        i = path.rfind("/")
        assert i > 0
        folder = path[:i]
        n = path[i+1:]
        cmd = "show +%s %s" % (folder, n)
        # With a display, page the message in a detached xterm.
        if os.getenv("DISPLAY"):
            os.system("xterm -e sh -c '%s | less' &" % cmd)
        else:
            os.system(cmd)

    def query(self, text, nbest=NBEST, maxlines=MAXLINES):
        """One-shot query: run `text` and print the first page of hits."""
        results, n = self.timequery(text, nbest)
        if not n:
            print("No hits for %r." % text)
            return
        print("[Results 1-%d from %d]" % (len(results), n))
        self.formatresults(text, results, maxlines)

    def timequery(self, text, nbest):
        """Run a query and print wall/CPU timing."""
        t0 = time.time()
        c0 = time.clock()
        results, n = self.index.query(text, 0, nbest)
        t1 = time.time()
        c1 = time.clock()
        print("[Query time: %.3f real, %.3f user]" % (t1-t0, c1-c0))
        return results, n

    def formatresults(self, text, results, maxlines=MAXLINES,
                      lo=0, hi=sys.maxint):
        """Print results[lo:hi]: headers plus up to maxlines matching lines."""
        stop = self.stopdict.has_key
        # Query terms (minus stopwords) become a highlight regex; '*' is
        # glob syntax in queries and is translated to '.*'.
        words = [w for w in re.findall(r"\w+\*?", text.lower()) if not stop(w)]
        pattern = r"\b(" + "|".join(words) + r")\b"
        pattern = pattern.replace("*", ".*")  # glob -> re syntax
        prog = re.compile(pattern, re.IGNORECASE)
        print('='*70)
        rank = lo
        for docid, score in results[lo:hi]:
            rank += 1
            path = self.docpaths[docid]
            score *= 100.0
            print("Rank: %d Score: %d%% File: %s" % (rank, score, path))
            path = os.path.join(self.mh.getpath(), path)
            try:
                fp = open(path)
            except (IOError, OSError) as msg:
                print("Can't open:", msg)
                continue
            msg = mhlib.Message("<folder>", 0, fp)
            for header in "From", "To", "Cc", "Bcc", "Subject", "Date":
                h = msg.getheader(header)
                if h:
                    print("%-8s %s" % (header+":", h))
            text = self.getmessagetext(msg)
            if text:
                print()
                nleft = maxlines
                # Print only body lines containing a query term.
                for part in text:
                    for line in part.splitlines():
                        if prog.search(line):
                            print(line)
                            nleft -= 1
                            if nleft <= 0:
                                break
                    if nleft <= 0:
                        break
            print('-'*70)

    def update(self, args):
        """Index messages: args are one "+folder" and MH sequence names."""
        folder = None
        seqs = []
        for arg in args:
            if arg.startswith("+"):
                if folder is None:
                    folder = arg[1:]
                else:
                    print("only one folder at a time")
                    return
            else:
                seqs.append(arg)
        if not folder:
            folder = self.mh.getcontext()
        if not seqs:
            seqs = ['all']
        try:
            f = self.mh.openfolder(folder)
        except mhlib.Error as msg:
            print(msg)
            return
        # Use a dict to de-duplicate message numbers across sequences.
        dict = {}
        for seq in seqs:
            try:
                nums = f.parsesequence(seq)
            except mhlib.Error as msg:
                print(msg or "unparsable message sequence: %s" % repr(seq))
                return
            for n in nums:
                dict[n] = n
        msgs = dict.keys()
        msgs.sort()
        self.updatefolder(f, msgs)
        self.commit()

    def optimize(self, args):
        """Pre-assign word ids for the most frequent words in the folders."""
        uniqwords = {}
        for folder in args:
            if folder.startswith("+"):
                folder = folder[1:]
            print("\nOPTIMIZE FOLDER", folder)
            try:
                f = self.mh.openfolder(folder)
            except mhlib.Error as msg:
                print(msg)
                continue
            self.prescan(f, f.listmessages(), uniqwords)
        L = [(uniqwords[word], word) for word in uniqwords.keys()]
        L.sort()
        L.reverse()
        for i in range(100):
            print("%3d. %6d %s" % ((i+1,) + L[i]))
        # Registering words in frequency order gives them small word ids.
        self.index.lexicon.sourceToWordIds([word for (count, word) in L])

    def prescan(self, f, msgs, uniqwords):
        """Count word frequencies across `msgs` into the uniqwords dict."""
        pipeline = [Splitter(), CaseNormalizer(), StopWordRemover()]
        for n in msgs:
            print("prescanning", n)
            m = f.openmessage(n)
            text = self.getmessagetext(m, f.name)
            for p in pipeline:
                text = p.process(text)
            for word in text:
                uniqwords[word] = uniqwords.get(word, 0) + 1

    def bulkupdate(self, args):
        """Index whole folders; "ALL" expands to every MH folder."""
        if not args:
            print("No folders specified; use ALL to bulk-index all folders")
            return
        if "ALL" in args:
            i = args.index("ALL")
            args[i:i+1] = self.mh.listfolders()
        for folder in args:
            if folder.startswith("+"):
                folder = folder[1:]
            print("\nFOLDER", folder)
            try:
                f = self.mh.openfolder(folder)
            except mhlib.Error as msg:
                print(msg)
                continue
            self.updatefolder(f, f.listmessages())
            print("Total", len(self.docpaths))
        self.commit()
        print("Indexed", self.index.lexicon._nbytes, "bytes and",)
        print(self.index.lexicon._nwords, "words;",)
        print(len(self.index.lexicon._words), "unique words.")

    def updatefolder(self, f, msgs):
        """(Re)index the given message numbers of folder `f`.

        Skips messages whose mtime matches the recorded index time, and
        unindexes paths that have disappeared.
        """
        self.watchfolders[f.name] = self.getmtime(f.name)
        for n in msgs:
            path = "%s/%s" % (f.name, n)
            docid = self.path2docid.get(path, 0)
            if docid and self.getmtime(path) == self.doctimes.get(docid, 0):
                print("unchanged", docid, path)
                continue
            docid = self.newdocid(path)
            try:
                m = f.openmessage(n)
            except IOError:
                print("disappeared", docid, path)
                self.unindexpath(path)
                continue
            text = self.getmessagetext(m, f.name)
            if not text:
                self.unindexpath(path)
                continue
            print("indexing", docid, path)
            self.index.index_doc(docid, text)
            self.maycommit()
        # Remove messages from the folder that no longer exist
        # (keys(f.name) starts the BTree scan at this folder's prefix).
        for path in list(self.path2docid.keys(f.name)):
            if not path.startswith(f.name + "/"):
                break
            if self.getmtime(path) == 0:
                self.unindexpath(path)
        print("done.")

    def unindexpath(self, path):
        """Remove a path from all bookkeeping trees and the text index."""
        if self.path2docid.has_key(path):
            docid = self.path2docid[path]
            print("unindexing", docid, path)
            del self.docpaths[docid]
            del self.doctimes[docid]
            del self.path2docid[path]
            try:
                self.index.unindex_doc(docid)
            except KeyError as msg:
                print("KeyError", msg)
            self.maycommit()

    def getmessagetext(self, m, name=None):
        """Return a list of searchable text parts for message `m`.

        A "_folder <name>" token is prepended so searches can be
        restricted to a folder.
        """
        L = []
        if name:
            L.append("_folder " + name)  # To restrict search to a folder
        self.getheaders(m, L)
        try:
            self.getmsgparts(m, L, 0)
        except KeyboardInterrupt:
            raise
        except:
            # NOTE(review): best-effort — malformed MIME is reported and
            # whatever text was collected so far is still returned.
            print("(getmsgparts failed:)")
            reportexc()
        return L

    def getmsgparts(self, m, L, level):
        """Recursively collect text/plain bodies (and embedded rfc822)."""
        ctype = m.gettype()
        if level or ctype != "text/plain":
            print(". "*level + str(ctype))
        if ctype == "text/plain":
            L.append(m.getbodytext())
        elif ctype in ("multipart/alternative", "multipart/mixed"):
            for part in m.getbodyparts():
                self.getmsgparts(part, L, level+1)
        elif ctype == "message/rfc822":
            f = StringIO(m.getbodytext())
            m = mhlib.Message("<folder>", 0, f)
            self.getheaders(m, L)
            self.getmsgparts(m, L, level+1)

    def getheaders(self, m, L):
        """Append the indexable address/subject headers of `m` to L."""
        H = []
        for key in "from", "to", "cc", "bcc", "subject":
            value = m.get(key)
            if value:
                H.append(value)
        if H:
            L.append("\n".join(H))

    def newdocid(self, path):
        """Return the docid for `path`, allocating one if necessary.

        Also refreshes the recorded modification time.
        """
        docid = self.path2docid.get(path)
        if docid is not None:
            self.doctimes[docid] = self.getmtime(path)
            return docid
        docid = self.maxdocid + 1
        self.maxdocid = docid
        self.docpaths[docid] = path
        self.doctimes[docid] = self.getmtime(path)
        self.path2docid[path] = docid
        return docid

    def getmtime(self, path):
        """Return the mtime of an MH-relative path, or 0 if unreadable."""
        path = os.path.join(self.mh.getpath(), path)
        try:
            st = os.stat(path)
        except os.error as msg:
            return 0
        return int(st[ST_MTIME])

    def maycommit(self):
        """Commit if the per-transaction batch limit has been reached."""
        self.trans_count += 1
        if self.trans_count >= self.trans_limit > 0:
            self.commit()

    def commit(self):
        """Commit pending work; pack when the pack batch limit is reached."""
        if self.trans_count > 0:
            print("committing...")
            transaction.commit()
            self.trans_count = 0
            self.pack_count += 1
            if self.pack_count >= self.pack_limit > 0:
                self.pack()

    def pack(self):
        """Pack the database if any commits happened since the last pack."""
        if self.pack_count > 0:
            print("packing...")
            self.database.pack()
            self.pack_count = 0
class OrderedBTreeContainer(BrowserDefaultMixin):
    """BTree-backed container that keeps an explicit ordering of its
    contained object ids.

    The order is held in two mirrored BTrees that every mutator must keep
    in sync:

      _positionId : IOBTree  {position (int) -> id}
      _idPosition : OIBTree  {id -> position (int)}
    """

    security = ClassSecurityInfo()
    implements(interfaces.IOrderedBTreeContainer)

    meta_type = 'OrderedBTreeContainer'
    _at_rename_after_creation = True

    schema = OrderedBTreeContainer_schema

    ##code-section class-header #fill in your manual code here
    # Methods
    ##/code-section class-header

    # Methods

    # Manually created methods

    security.declarePrivate('_insert')
    def _insert(self, pos, id):
        """Insert *id* at position *pos*, updating both trees."""
        positionId = self._positionId
        idPosition = self._idPosition
        # check if it is not inserted yet; BTree.insert() returns a true
        # value only when the key was not already present.
        if positionId.insert(pos, id):
            idPosition[id] = pos
            return
        # create a gap in the list
        # NOTE(review): _delta(pos, pos+1) takes the pos1 < pos2 branch of
        # _delta, which shifts left and *removes* the current occupant of
        # ``pos`` from _idPosition rather than shifting it right to open a
        # gap -- confirm this is the intended semantics before relying on
        # _insert against an occupied position.
        self._delta(pos, pos+1)
        # assign position and id
        positionId[pos] = id
        idPosition[id] = pos

    security.declarePrivate('_delete')
    def _delete(self, pos):
        """Remove the entry at position *pos*; no-op for None or empty."""
        positionId = self._positionId
        idPosition = self._idPosition
        if pos is None:
            return
        if len(positionId) <= 0:
            return
        if len(positionId) == 1:
            # Last remaining entry: drop it from both trees directly.
            id = positionId[pos]
            del positionId[pos]
            del idPosition[id]
            return
        # Shifting left over the deleted slot also removes the old id
        # from _idPosition (see the pos1 < pos2 branch of _delta).
        self._delta(pos, pos+1)

    security.declarePrivate('_moveObject')
    def _moveObject(self, id, position):
        """ Move id to position

        Does not check if position is sane
        """
        if position < 0:
            position = 0
        obj_pos = self.getObjectPosition(id)
        if obj_pos == position:
            return
        self._delete(obj_pos)
        self._insert(position, id)

    security.declarePrivate('__init__')
    def __init__(self, oid, **kwargs):
        # NOTE(review): the base-class __init__ is not called here and
        # ``oid``/``kwargs`` are unused -- matches the original behavior.
        self._positionId = IOBTree()
        self._idPosition = OIBTree()

    security.declarePrivate('_delta')
    def _delta(self, pos1, pos2):
        """ in the _positionId btree, move either creates a gap or shifts
            a portion left
        """
        if pos1 == pos2:
            return
        #assert(abs(pos1-pos2) == 1) # haven't implemented delta completely yet
        positionId = self._positionId
        idPosition = self._idPosition
        max = 0
        if len(positionId) > 1:
            max = positionId.maxKey()
        else:
            return
        if pos1 < pos2:
            # move left, ie, shortens list, that is, move left from pos2 to
            # pos1
            delIds = []
            for i in range(pos1, pos2):
                delIds.append(positionId[i])
            for i in range(pos1, max-(pos2-pos1)+1):
                idPosition[positionId[i+pos2-pos1]] = i
                positionId[i] = positionId[pos2+i-pos1]
            # clear out ids and positions no longer used
            for id in delIds:
                del idPosition[id]
            for i in range(max-(pos2-pos1)+1, max+1):
                del positionId[i]
        else:
            # create a gap, that is, lengthens the list, move right from pos1
            # to pos2, and shift the rest right
            for i in range(max+abs(pos1-pos2), pos2-1, -1):
                idPosition[positionId[i-abs(pos1-pos2)]] = i
                positionId[i] = positionId[i-abs(pos1-pos2)]
            for i in range(pos2, pos1+1):
                del positionId[i]

    security.declareProtected(permissions.ModifyPortalContent,
                              'getObjectPosition')
    def getObjectPosition(self, id):
        """ Get the object position for a given id """
        if id is not None and self._idPosition.has_key(id):
            return self._idPosition[id]
        return None

    security.declareProtected(permissions.ModifyPortalContent,
                              'getIdsInOrder')
    def getIdsInOrder(self, start, end):
        """ return a list of ids starting at start and ending at end
            if end is None or greater than the end, the list is truncated
            when start is None or less than 0 it is set to 0
            the ids is a list of length(end-start)
        """
        ids = []
        endIds = end
        # check if the tree is empty and if the list is wrapped
        if len(self._positionId) > 0 and (end is None or end < 0):
            endIds = self._positionId.maxKey()
        elif len(self._positionId) == 0:
            return []
        if start is None or start < 0:
            start = 0
        for i in range(start, endIds+1):
            if self._positionId.has_key(i):
                ids.append(self._positionId[i])
            else:
                # Positions are assumed contiguous; stop at the first hole.
                break
        if end is None:
            return ids
        return ids[:end-start]

    security.declareProtected(permissions.ModifyPortalContent, 'getObjectId')
    def getObjectId(self, position):
        """ Return the id stored at *position*, or None. """
        if position is not None and self._positionId.has_key(position):
            return self._positionId[position]
        return None

    security.declareProtected(permissions.ModifyPortalContent,
                              'moveObjectsUp')
    def moveObjectsUp(self, ids, delta=1, RESPONSE=None):
        """ Move an object up """
        if type(ids) is StringType:
            ids = (ids,)
        for id in ids:
            self._moveObject(id, self.getObjectPosition(id)-delta)
        if RESPONSE is not None:
            RESPONSE.redirect('manage_workspace')

    security.declareProtected(permissions.ModifyPortalContent,
                              'moveObjectsDown')
    def moveObjectsDown(self, ids, delta=1, RESPONSE=None):
        """ move an object down """
        if type(ids) is StringType:
            ids = (ids,)
        for id in ids:
            self._moveObject(id, self.getObjectPosition(id)+delta)
        if RESPONSE is not None:
            RESPONSE.redirect('manage_workspace')

    security.declareProtected(permissions.ModifyPortalContent,
                              'moveObjectsToTop')
    def moveObjectsToTop(self, ids, RESPONSE=None):
        """ move an object to the top """
        if type(ids) is StringType:
            ids = (ids,)
        i = 0
        while i < len(ids):
            self._moveObject(ids[i], i)
            i = i + 1
        if RESPONSE is not None:
            RESPONSE.redirect('manage_workspace')

    security.declareProtected(permissions.ModifyPortalContent,
                              'moveObjectsToBottom')
    def moveObjectsToBottom(self, ids, RESPONSE=None):
        """ move an object to the bottom """
        if type(ids) is StringType:
            ids = (ids,)
        i = 0
        max = self._positionId.maxKey()
        length = len(ids)
        while i < length:
            self._moveObject(ids[i], max - (length - 1) + i)
            i += 1
        if RESPONSE is not None:
            RESPONSE.redirect('manage_workspace')

    security.declareProtected(permissions.ModifyPortalContent,
                              'orderObjects')
    def orderObjects(self, key, reverse=None):
        """ Order sub-objects by key and direction.
        """
        ids = [ id for id, obj in sort( self.objectItems(),
                                        ( (key, 'cmp', 'asc'), ) ) ]
        if reverse:
            ids.reverse()
        self._clear_and_rebuild(ids=ids)
        return len(ids)

    security.declareProtected(permissions.ModifyPortalContent,
                              'moveObjectsByDelta')
    def moveObjectsByDelta(self, ids, delta, subset_ids=None):
        """ Move specified sub-objects by delta.
        """
        # FIX: the original raised a *string* exception
        # (``raise "Not implemented yet"``), which is a TypeError on
        # Python >= 2.6 and illegal in Python 3.  Raise a real exception
        # type instead; everything below is unreachable draft code kept
        # verbatim for the eventual implementation.
        raise NotImplementedError("Not implemented yet")
        if type(ids) is StringType:
            ids = (ids,)
        min_position = 0
        objects = list(self._objects)
        if subset_ids == None:
            # OLD: subset_ids = [ obj['id'] for obj in objects ]
            subset_ids = self.getCMFObjectsSubsetIds(objects)
        else:
            subset_ids = list(subset_ids)
        # unify moving direction
        if delta > 0:
            ids = list(ids)
            ids.reverse()
            subset_ids.reverse()
        counter = 0
        for id in ids:
            try:
                old_position = subset_ids.index(id)
            except ValueError:
                continue
            new_position = max( old_position - abs(delta), min_position )
            if new_position == min_position:
                min_position += 1
            if not old_position == new_position:
                subset_ids.remove(id)
                subset_ids.insert(new_position, id)
                counter += 1
        if counter > 0:
            if delta > 0:
                subset_ids.reverse()
            obj_dict = {}
            for obj in objects:
                obj_dict[ obj['id'] ] = obj
            pos = 0
            for i in range( len(objects) ):
                if objects[i]['id'] in subset_ids:
                    try:
                        objects[i] = obj_dict[ subset_ids[pos] ]
                        pos += 1
                    except KeyError:
                        raise ValueError('The object with the id "%s" does '
                                         'not exist.' % subset_ids[pos])
            self._objects = tuple(objects)
        return counter

    security.declareProtected(permissions.ModifyPortalContent,
                              'getFirstEntryId')
    def getFirstEntryId(self):
        """ Return the id at the lowest position, or None when empty. """
        if len(self._positionId) > 0:
            return self._positionId[self._positionId.minKey()]
        return None

    security.declareProtected(permissions.ModifyPortalContent,
                              'getLastEntryId')
    def getLastEntryId(self):
        """ Return the id at the highest position, or None when empty. """
        if len(self._positionId) > 0:
            return self._positionId[self._positionId.maxKey()]
        return None

    def addObject(self, id):
        """Adds object to end of btree, returns position
        """
        if self.getObjectPosition(id) is not None:
            # Python-3-compatible raise syntax (behavior unchanged).
            raise RuntimeError("Object already in tree")
        if len(self._positionId) > 0:
            max = self._positionId.maxKey()
            self._positionId[max+1] = id
            self._idPosition[id] = max + 1
            return max + 1
        else:
            self._positionId[0] = id
            self._idPosition[id] = 0
            return 0

    def numberObjects(self):
        """ Return the number of contained ids. """
        return len(self._positionId)

    security.declarePrivate('_clear_and_rebuild')
    def _clear_and_rebuild(self, ids=None):
        """Reset both trees and re-add *ids* in order.

        ``ids`` defaults to None instead of a shared mutable list; the
        argument is only iterated, so behavior is unchanged.
        """
        if ids is None:
            ids = []
        self._positionId = IOBTree()
        self._idPosition = OIBTree()
        for id in ids:
            self.addObject(id)
class Lexicon(Persistent, Implicit):
    """Maps words to word ids and then some

    The Lexicon object is an attempt to abstract vocabularies out of
    Text indexes. This abstraction is not totally cooked yet, this
    module still includes the parser for the 'Text Index Query
    Language' and a few other hacks.
    """

    # default for older objects (pre-dating per-instance stop_syn)
    stop_syn={}

    def __init__(self, stop_syn=None,useSplitter=None,extra=None):
        # stop_syn: optional {'word': [syn1, ...]} mapping (see
        # set_stop_syn); useSplitter: name of a registered splitter,
        # defaulting to the first registered one; extra: splitter
        # configuration object consulted by Splitter() below.
        self.clear()
        if stop_syn is None:
            self.stop_syn = {}
        else:
            self.stop_syn = stop_syn
        self.useSplitter = Splitter.splitterNames[0]
        if useSplitter:
            self.useSplitter=useSplitter
        self.splitterParams = extra
        self.SplitterFunc = Splitter.getSplitter(self.useSplitter)

    def clear(self):
        # Forward map word -> wid and inverse map wid -> word.
        self._lexicon = OIBTree()
        self._inverseLex = IOBTree()

    def _convertBTrees(self, threshold=200):
        # One-time migration of old BTree flavors to OIBTree/IOBTree.
        # The new trees are given our _p_jar *before* conversion so the
        # copy can proceed while the objects are not yet stored.
        if (type(self._lexicon) is OIBTree and
            type(getattr(self, '_inverseLex', None)) is IOBTree):
            return
        from BTrees.convert import convert
        lexicon=self._lexicon
        self._lexicon=OIBTree()
        self._lexicon._p_jar=self._p_jar
        convert(lexicon, self._lexicon, threshold)
        try:
            inverseLex=self._inverseLex
            self._inverseLex=IOBTree()
        except AttributeError:
            # older lexicons didn't have an inverse lexicon
            self._inverseLex=IOBTree()
            inverseLex=self._inverseLex
        self._inverseLex._p_jar=self._p_jar
        convert(inverseLex, self._inverseLex, threshold)

    def set_stop_syn(self, stop_syn):
        """ pass in a mapping of stopwords and synonyms.  Format is:

        {'word' : [syn1, syn2, ..., synx]}

        Vocabularies do not necessarily need to implement this if their
        splitters do not support stemming or stopping.
        """
        self.stop_syn = stop_syn

    def getWordId(self, word):
        """ return the word id of 'word', assigning a new id on first use """
        wid=self._lexicon.get(word, None)
        if wid is None:
            wid=self.assignWordId(word)
        return wid

    # Historical alias; note it shadows the builtin ``set`` name in
    # class scope (kept for backward compatibility).
    set = getWordId

    def getWord(self, wid):
        """ post-2.3.1b2 method, will not work with unconverted lexicons """
        return self._inverseLex.get(wid, None)

    def assignWordId(self, word):
        """Assigns a new word id to the provided word and returns it."""
        # First make sure it's not already in there
        if self._lexicon.has_key(word):
            return self._lexicon[word]
        try:
            inverse=self._inverseLex
        except AttributeError:
            # woops, old lexicon without wids; rebuild the inverse map.
            # NOTE(review): this loop rebinds the ``word`` parameter, so
            # after recovery the *last* lexicon word -- not the caller's
            # word -- gets the new wid below.  Looks like a latent bug;
            # confirm before changing.
            inverse=self._inverseLex=IOBTree()
            for word, wid in self._lexicon.items():
                inverse[wid]=word
        # Draw random ids until one is free (insert() is false on clash).
        wid=randid()
        while not inverse.insert(wid, word):
            wid=randid()
        if isinstance(word,StringType):
            # intern plain strings so duplicates share one object
            self._lexicon[intern(word)] = wid
        else:
            self._lexicon[word] = wid
        return wid

    def get(self, key, default=None):
        """Return the matched word against the key."""
        # Returns an IISet containing the wid (empty when unknown).
        r=IISet()
        wid=self._lexicon.get(key, default)
        if wid is not None:
            r.insert(wid)
        return r

    def __getitem__(self, key):
        return self.get(key)

    def __len__(self):
        return len(self._lexicon)

    def Splitter(self, astring, words=None, encoding = "latin1"):
        """ wrap the splitter """
        if words is None:
            words = self.stop_syn
        # Try the full-featured call first; fall back to the two-argument
        # form for splitters (or missing splitterParams) that don't
        # accept the keyword options.
        try:
            return self.SplitterFunc(
                astring,
                words,
                encoding=encoding,
                singlechar=self.splitterParams.splitterSingleChars,
                indexnumbers=self.splitterParams.splitterIndexNumbers,
                casefolding=self.splitterParams.splitterCasefolding
            )
        except:
            return self.SplitterFunc(astring, words)

    def query_hook(self, q):
        """ we don't want to modify the query cuz we're dumb """
        return q
class Table(Persistent):
    """Simple, generic relational table.

    Records are stored as tuples with the record id (rid) at position 0
    and one slot per schema column after it.  Maintained structures:

      data          : IOBTree  {rid -> record tuple}
      indexes       : {index_name -> OOBTree({value -> IITreeSet of rids})}
      primary_index : OIBTree  {primary-key tuple -> rid}
    """
    schema = None
    _v_record_class = None

    def __init__(self, schema=None):
        if schema is not None:
            self.schema = schema
        # FIX: read the columns from self.schema rather than the
        # ``schema`` parameter, so a subclass that supplies a class-level
        # schema can call Table() with no argument.  Identical when the
        # parameter is passed (self.schema was just set to it above).
        columns = self.schema.get_columns()
        self.col_info = []  # [(tuple position, column),]
        self.positions = {}
        for i in range(len(columns)):
            # Leave space for the record ID at position 0.
            position = i + 1
            self.col_info.append((position, columns[i]))
            self.positions[columns[i].name] = position
        self.proto_record = [None] * (len(columns) + 1)
        self.next_rid = 1
        self.clear()

    def clear(self):
        """Discard all rows and (re)create the empty containers."""
        self.data = IOBTree()           # {rid -> record as tuple}
        self.indexes = {}               # {index_name -> OOBTree({value -> IITreeSet})}
        self.primary_index = OIBTree()  # {primary key -> rid}
        for position, column in self.col_info:
            if column.indexed:
                self.indexes[column.name] = OOBTree()

    def tuplify(self, params):
        """Accepts a mapping-like object and returns a tuple.

        Unspecified columns are left as None (position 0, the rid slot,
        is always None here).
        """
        record = self.proto_record[:]
        positions = self.positions
        if hasattr(params, '__record_schema__'):
            for name in params.__record_schema__.keys():
                position = positions[name]
                record[position] = params[name]
        else:
            for name, value in params.items():
                position = positions[name]
                record[position] = value
        return tuple(record)

    def insert(self, params):
        """Insert one row; returns 1 (the number of rows inserted).

        Raises ValueError when a primary key column is missing and
        DuplicateError when the primary key is already in use.
        """
        record = self.tuplify(params)
        # Determine the primary key.
        primary_key = []
        for position, column in self.col_info:
            if column.primary:
                if record[position] is None:
                    # Python-3-compatible raise syntax (behavior unchanged).
                    raise ValueError(
                        "No value provided for primary key column %s"
                        % repr(column.name))
                primary_key.append(record[position])
        if primary_key:
            primary_key = tuple(primary_key)
            if self.primary_index.has_key(primary_key):
                raise DuplicateError(
                    "Primary key %s in use" % repr(primary_key))
        # Add a record.
        rid = self.next_rid
        self.next_rid += 1  # XXX Hotspot!
        record = (rid, ) + record[1:]
        self.data[rid] = record
        if primary_key:
            self.primary_index[primary_key] = rid
        # Add to indexes.
        for position, column in self.col_info:
            name = column.name
            value = record[position]
            if value is not None:
                if self.indexes.has_key(name):
                    set = self.indexes[name].get(value)
                    if set is None:
                        set = IITreeSet()
                        self.indexes[name][value] = set
                    set.insert(rid)
        # Return the number of rows inserted.
        return 1

    def delete(self, filter):
        """Delete the rows matching *filter*; returns the count removed."""
        rids = self._select_rids(self.tuplify(filter))
        if rids is None:
            # Zap everything
            count = len(self.data)
            self.clear()
            return count
        elif not rids:
            # No rows selected
            return 0
        rids = tuple(rids)  # Make sure rids is a static sequence
        for rid in rids:
            old_r = self.data[rid]
            assert old_r[0] == rid
            primary_key = []
            for position, column in self.col_info:
                old_value = old_r[position]
                if old_value is not None:
                    if column.primary:
                        primary_key.append(old_value)
                    # Remove from indexes.
                    index = self.indexes.get(column.name)
                    if index is not None:
                        if index.has_key(old_value):
                            # Remove an index entry.
                            set = index[old_value]
                            set.remove(rid)
                            if not set:
                                del index[old_value]
            if primary_key:
                # Remove a primary key.
                primary_key = tuple(primary_key)
                assert self.primary_index[primary_key] == rid
                del self.primary_index[primary_key]
            # Remove the data.
            del self.data[rid]
        return len(rids)

    def update(self, filter, changes):
        """Apply *changes* to rows matching *filter*; returns the count.

        Raises DuplicateError when a changed primary key collides with
        an existing, unchanged one.
        """
        rids = self._select_rids(self.tuplify(filter))
        if rids is None:
            rids = self.data.keys()
        elif not rids:
            # Nothing needs to be updated.
            return 0
        count = len(rids)
        # Identify changes.
        old_data = {}    # rid -> old tuple
        new_data = {}    # rid -> new tuple
        old_to_new = {}  # old primary key -> new primary key
        new_to_rid = {}  # new primary key -> rid
        record = self.tuplify(changes)
        for rid in rids:
            old_r = self.data[rid]
            old_data[rid] = old_r
            new_r = list(old_r)
            # new_r and old_r contain record tuples.
            for position, column in self.col_info:
                if record[position] is not None:
                    new_r[position] = record[position]
            new_data[rid] = tuple(new_r)
            # Hmm.  The code below allows an update to change the primary
            # key.  It might be better to prevent primary key columns from
            # being changed by an update() call.
            opk = []
            npk = []
            for position, column in self.col_info:
                if column.primary:
                    opk.append(old_r[position])
                    npk.append(new_r[position])
            if opk != npk:
                opk = tuple(opk)
                npk = tuple(npk)
                old_to_new[opk] = npk
                new_to_rid[npk] = rid
        # Look for primary key conflicts.  A primary key conflict can
        # occur when changing a record to a different primary key and
        # the new primary key is already in use.
        for pk in old_to_new.values():
            if (self.primary_index.has_key(pk)
                and not old_to_new.has_key(pk)):
                raise DuplicateError("Primary key %s in use" % repr(pk))
        # Update the data.
        self.data.update(new_data)
        # Remove old primary key indexes and insert new primary key indexes.
        for pk in old_to_new.keys():
            del self.primary_index[pk]
        self.primary_index.update(new_to_rid)
        # Update indexes.
        for rid, old_r in old_data.items():
            for position, column in self.col_info:
                index = self.indexes.get(column.name)
                if index is not None:
                    new_value = record[position]
                    old_value = old_r[position]
                    if new_value != old_value:
                        if old_value is not None and index.has_key(old_value):
                            # Remove an index entry.
                            set = index[old_value]
                            set.remove(rid)
                            if not set:
                                del index[old_value]
                        if new_value is not None:
                            # Add an index entry.
                            set = index.get(new_value)
                            if set is None:
                                set = IITreeSet()
                                index[new_value] = set
                            set.insert(rid)
        # Return the number of rows affected.
        return count

    def get_record_class(self):
        """Return (building lazily) the Record subclass for this schema."""
        klass = self._v_record_class
        if klass is None:
            schema = {'rid': 0}
            for position, column in self.col_info:
                schema[column.name] = position
            class TableRecord(TableRecordMixin, Record):
                __record_schema__ = schema
            self._v_record_class = klass = TableRecord
        return klass

    def select(self, filter):
        """Return the matching rows as record instances."""
        rids = self._select_rids(self.tuplify(filter))
        if rids is None:
            # All
            klass = self.get_record_class()
            return [klass(rec) for rec in self.data.values()]
        elif rids:
            # Some
            klass = self.get_record_class()
            data = self.data
            return [klass(data[rid]) for rid in rids]
        else:
            # None
            return []

    def _select_rids(self, query):
        """Searches the table for matches, returning record ids.

        Returns a sequence of record ids, or None for all records.
        """
        primary_key = []
        params = 0          # The number of parameters specified
        primary_params = 0  # The number of primary params specified
        for position, column in self.col_info:
            value = query[position]
            if value is not None:
                params += 1
                if column.primary:
                    primary_params += 1
                    if primary_key is not None:
                        primary_key.append(value)
            elif column.primary:
                # Didn't fully specify the primary key.
                # Can't search by primary key.
                primary_key = None
        if not params:
            # No query.  Select all.
            return None
        # First strategy: try to satisfy the request by consulting
        # the primary key index.
        if primary_key:
            # The primary key is complete.  The result set will have
            # either zero rows or one row.
            primary_key = tuple(primary_key)
            rid = self.primary_index.get(primary_key)
            if rid is None:
                return ()
            # Possibly filter out the single item.
            if params > primary_params:
                cand = self.data[rid]
                for position, column in self.col_info:
                    if query[position] is not None:
                        if cand[position] != query[position]:
                            # Not a match.
                            return ()
            return (rid, )
        # Second strategy: try to satisfy the request by intersecting
        # indexes.
        rids = None
        iteration_filters = []
        for position, column in self.col_info:
            value = query[position]
            if value is not None:
                index = self.indexes.get(column.name)
                if index is None:
                    iteration_filters.append((position, value))
                else:
                    set = index.get(value)
                    if set is None:
                        # No rows satisfy this criterion.
                        return ()
                    if rids is None:
                        rids = set
                    else:
                        rids = intersection(rids, set)
                    if not rids:
                        # No rows satisfy all criteria.
                        return ()
        if rids is not None:
            rids = rids.keys()
        if not iteration_filters:
            # Indexes did all the work.  No need to search each record.
            return rids
        # Fallback strategy: Eliminate items one by one.
        if rids is None:
            # Use the whole data set.
            candidates = self.data.values()
        else:
            # Use the specified records.
            candidates = [self.data[rid] for rid in rids]
        rids = []
        append = rids.append
        for cand in candidates:
            for position, value in iteration_filters:
                if cand[position] != value:
                    # Not a match.
                    break
            else:
                # A match.
                append(cand[0])
        return rids

    def __repr__(self):
        return "<%s(schema=%s)>" % (self.__class__.__name__,
                                    repr(self.schema))
class Indexer(object):
    """Index MH mail folders into a ZODB-backed full-text index.

    State lives in a FileStorage ZODB whose root holds:
      index        : TextIndexWrapper (the text index + lexicon)
      docpaths     : IOBTree {docid -> "folder/msgnum" path}
      doctimes     : IIBTree {docid -> mtime at indexing time}
      watchfolders : dict of folders to re-scan
    path2docid (OIBTree {path -> docid}) is rebuilt in memory on open.

    NOTE(review): this class mixes Python-2-only constructs
    (``mhlib``, ``raw_input``, ``sys.maxint``, ``time.clock``,
    list-returning ``dict.keys()``, ``has_key``) with ``print(...)``
    calls -- presumably Python 2 with ``from __future__ import
    print_function``; it is not Python-3 compatible as written.
    """

    filestorage = database = connection = root = None

    def __init__(self, datafs, writable=0, trans=0, pack=0):
        # datafs: path to the FileStorage; trans/pack: commit every
        # ``trans`` operations and pack every ``pack`` commits (0 = never).
        self.trans_limit = trans
        self.pack_limit = pack
        self.trans_count = 0
        self.pack_count = 0
        self.stopdict = get_stopdict()
        self.mh = mhlib.MH()
        self.filestorage = FileStorage(datafs, read_only=(not writable))
        self.database = DB(self.filestorage)
        self.connection = self.database.open()
        self.root = self.connection.root()
        # Create the persistent containers on first use.
        try:
            self.index = self.root["index"]
        except KeyError:
            self.index = self.root["index"] = TextIndexWrapper()
        try:
            self.docpaths = self.root["docpaths"]
        except KeyError:
            self.docpaths = self.root["docpaths"] = IOBTree()
        try:
            self.doctimes = self.root["doctimes"]
        except KeyError:
            self.doctimes = self.root["doctimes"] = IIBTree()
        try:
            self.watchfolders = self.root["watchfolders"]
        except KeyError:
            self.watchfolders = self.root["watchfolders"] = {}
        # Rebuild the in-memory reverse map path -> docid.
        self.path2docid = OIBTree()
        for docid in self.docpaths.keys():
            path = self.docpaths[docid]
            self.path2docid[path] = docid
        try:
            self.maxdocid = max(self.docpaths.keys())
        except ValueError:
            # empty docpaths
            self.maxdocid = 0
        print(len(self.docpaths), "Document ids")
        print(len(self.path2docid), "Pathnames")
        print(self.index.lexicon.length(), "Words")

    def dumpfreqs(self):
        """Print every word with its total frequency, most frequent first."""
        lexicon = self.index.lexicon
        index = self.index.index
        assert isinstance(index, OkapiIndex)
        L = []
        for wid in lexicon.wids():
            freq = 0
            for f in index._wordinfo.get(wid, {}).values():
                freq += f
            L.append((freq, wid, lexicon.get_word(wid)))
        L.sort()
        L.reverse()
        for freq, wid, word in L:
            print("%10d %10d %s" % (wid, freq, word))

    def dumpwids(self):
        """Print every word id with its total frequency, in wid order."""
        lexicon = self.index.lexicon
        index = self.index.index
        assert isinstance(index, OkapiIndex)
        for wid in lexicon.wids():
            freq = 0
            for f in index._wordinfo.get(wid, {}).values():
                freq += f
            print("%10d %10d %s" % (wid, freq, lexicon.get_word(wid)))

    def dumpwords(self):
        """Print every word with its total frequency, in word order."""
        lexicon = self.index.lexicon
        index = self.index.index
        assert isinstance(index, OkapiIndex)
        for word in lexicon.words():
            wid = lexicon.get_wid(word)
            freq = 0
            for f in index._wordinfo.get(wid, {}).values():
                freq += f
            print("%10d %10d %s" % (wid, freq, word))

    def close(self):
        """Release the connection, database and storage (idempotent)."""
        self.root = None
        if self.connection is not None:
            self.connection.close()
            self.connection = None
        if self.database is not None:
            self.database.close()
            self.database = None
        if self.filestorage is not None:
            self.filestorage.close()
            self.filestorage = None

    def interact(self, nbest=NBEST, maxlines=MAXLINES):
        """Interactive query loop; empty input pages through more hits,
        lines starting with "/" are handled by specialcommand()."""
        try:
            import readline
        except ImportError:
            pass
        text = ""
        top = 0
        results = []
        while 1:
            try:
                line = raw_input("Query: ")
            except EOFError:
                print("\nBye.")
                break
            line = line.strip()
            if line.startswith("/"):
                self.specialcommand(line, results, top - nbest)
                continue
            if line:
                # new query: restart paging
                text = line
                top = 0
            else:
                if not text:
                    continue
            try:
                results, n = self.timequery(text, top + nbest)
            except KeyboardInterrupt:
                raise
            except:
                # best-effort: report and prompt again
                reportexc()
                text = ""
                continue
            if len(results) <= top:
                if not n:
                    print("No hits for %r." % text)
                else:
                    print("No more hits for %r." % text)
                text = ""
                continue
            print("[Results %d-%d from %d" % (top + 1,
                                              min(n, top + nbest), n),
                  end=" ")
            print("for query %s]" % repr(text))
            self.formatresults(text, results, maxlines, top, top + nbest)
            top += nbest

    def specialcommand(self, line, results, first):
        """Handle a "/" command: "/" shows the first hit of the current
        page, "/N" shows hit N; the message is displayed via mh "show"."""
        assert line.startswith("/")
        line = line[1:]
        if not line:
            n = first
        else:
            try:
                n = int(line) - 1
            except:
                print("Huh?")
                return
        if n < 0 or n >= len(results):
            print("Out of range")
            return
        docid, score = results[n]
        path = self.docpaths[docid]
        i = path.rfind("/")
        assert i > 0
        folder = path[:i]
        n = path[i + 1:]
        cmd = "show +%s %s" % (folder, n)
        if os.getenv("DISPLAY"):
            # under X, page the message in a background xterm
            os.system("xterm -e sh -c '%s | less' &" % cmd)
        else:
            os.system(cmd)

    def query(self, text, nbest=NBEST, maxlines=MAXLINES):
        """Run one query and print the formatted results."""
        results, n = self.timequery(text, nbest)
        if not n:
            print("No hits for %r." % text)
            return
        print("[Results 1-%d from %d]" % (len(results), n))
        self.formatresults(text, results, maxlines)

    def timequery(self, text, nbest):
        """Run a query and print wall-clock/CPU timing; returns
        (results, total hit count)."""
        t0 = time.time()
        c0 = time.clock()
        results, n = self.index.query(text, 0, nbest)
        t1 = time.time()
        c1 = time.clock()
        print("[Query time: %.3f real, %.3f user]" % (t1 - t0, c1 - c0))
        return results, n

    def formatresults(self, text, results, maxlines=MAXLINES,
                      lo=0, hi=sys.maxint):
        """Print results[lo:hi] with headers and up to *maxlines*
        matching body lines per message."""
        stop = self.stopdict.has_key
        # Build a regex matching any non-stopword query term ("*" globs).
        words = [w for w in re.findall(r"\w+\*?", text.lower())
                 if not stop(w)]
        pattern = r"\b(" + "|".join(words) + r")\b"
        pattern = pattern.replace("*", ".*")  # glob -> re syntax
        prog = re.compile(pattern, re.IGNORECASE)
        print('=' * 70)
        rank = lo
        for docid, score in results[lo:hi]:
            rank += 1
            path = self.docpaths[docid]
            score *= 100.0
            print("Rank: %d Score: %d%% File: %s" % (rank, score, path))
            path = os.path.join(self.mh.getpath(), path)
            try:
                fp = open(path)
            except (IOError, OSError) as msg:
                print("Can't open:", msg)
                continue
            msg = mhlib.Message("<folder>", 0, fp)
            for header in "From", "To", "Cc", "Bcc", "Subject", "Date":
                h = msg.getheader(header)
                if h:
                    print("%-8s %s" % (header + ":", h))
            text = self.getmessagetext(msg)
            if text:
                print()
                nleft = maxlines
                for part in text:
                    for line in part.splitlines():
                        if prog.search(line):
                            print(line)
                            nleft -= 1
                            if nleft <= 0:
                                break
                    if nleft <= 0:
                        break
            print('-' * 70)

    def update(self, args):
        """Index the given message sequences ("+folder seq ...");
        defaults to the current folder and 'all'."""
        folder = None
        seqs = []
        for arg in args:
            if arg.startswith("+"):
                if folder is None:
                    folder = arg[1:]
                else:
                    print("only one folder at a time")
                    return
            else:
                seqs.append(arg)
        if not folder:
            folder = self.mh.getcontext()
        if not seqs:
            seqs = ['all']
        try:
            f = self.mh.openfolder(folder)
        except mhlib.Error as msg:
            print(msg)
            return
        # Collect the union of all message numbers (dict used as a set;
        # note the local name shadows the ``dict`` builtin).
        dict = {}
        for seq in seqs:
            try:
                nums = f.parsesequence(seq)
            except mhlib.Error as msg:
                print(msg or "unparsable message sequence: %s" % repr(seq))
                return
            for n in nums:
                dict[n] = n
        msgs = dict.keys()
        msgs.sort()
        self.updatefolder(f, msgs)
        self.commit()

    def optimize(self, args):
        """Pre-assign word ids for the given folders' words, most
        frequent first, and print the top 100."""
        uniqwords = {}
        for folder in args:
            if folder.startswith("+"):
                folder = folder[1:]
            print("\nOPTIMIZE FOLDER", folder)
            try:
                f = self.mh.openfolder(folder)
            except mhlib.Error as msg:
                print(msg)
                continue
            self.prescan(f, f.listmessages(), uniqwords)
        L = [(uniqwords[word], word) for word in uniqwords.keys()]
        L.sort()
        L.reverse()
        for i in range(100):
            print("%3d. %6d %s" % ((i + 1, ) + L[i]))
        self.index.lexicon.sourceToWordIds([word for (count, word) in L])

    def prescan(self, f, msgs, uniqwords):
        """Tally word frequencies for *msgs* into *uniqwords* using the
        split/normalize/stopword pipeline."""
        pipeline = [Splitter(), CaseNormalizer(), StopWordRemover()]
        for n in msgs:
            print("prescanning", n)
            m = f.openmessage(n)
            text = self.getmessagetext(m, f.name)
            for p in pipeline:
                text = p.process(text)
            for word in text:
                uniqwords[word] = uniqwords.get(word, 0) + 1

    def bulkupdate(self, args):
        """Index whole folders; "ALL" expands to every folder."""
        if not args:
            print("No folders specified; use ALL to bulk-index all folders")
            return
        if "ALL" in args:
            i = args.index("ALL")
            args[i:i + 1] = self.mh.listfolders()
        for folder in args:
            if folder.startswith("+"):
                folder = folder[1:]
            print("\nFOLDER", folder)
            try:
                f = self.mh.openfolder(folder)
            except mhlib.Error as msg:
                print(msg)
                continue
            self.updatefolder(f, f.listmessages())
            print("Total", len(self.docpaths))
        self.commit()
        print(
            "Indexed",
            self.index.lexicon._nbytes,
            "bytes and",
        )
        print(
            self.index.lexicon._nwords,
            "words;",
        )
        print(len(self.index.lexicon._words), "unique words.")

    def updatefolder(self, f, msgs):
        """(Re)index the given messages of folder *f*, skipping messages
        whose mtime is unchanged, then prune vanished messages."""
        self.watchfolders[f.name] = self.getmtime(f.name)
        for n in msgs:
            path = "%s/%s" % (f.name, n)
            docid = self.path2docid.get(path, 0)
            if docid and self.getmtime(path) == self.doctimes.get(docid, 0):
                print("unchanged", docid, path)
                continue
            docid = self.newdocid(path)
            try:
                m = f.openmessage(n)
            except IOError:
                print("disappeared", docid, path)
                self.unindexpath(path)
                continue
            text = self.getmessagetext(m, f.name)
            if not text:
                self.unindexpath(path)
                continue
            print("indexing", docid, path)
            self.index.index_doc(docid, text)
            self.maycommit()
        # Remove messages from the folder that no longer exist
        # (BTree.keys(min) iterates paths >= the folder prefix).
        for path in list(self.path2docid.keys(f.name)):
            if not path.startswith(f.name + "/"):
                break
            if self.getmtime(path) == 0:
                self.unindexpath(path)
        print("done.")

    def unindexpath(self, path):
        """Remove *path* from all bookkeeping maps and the text index."""
        if self.path2docid.has_key(path):
            docid = self.path2docid[path]
            print("unindexing", docid, path)
            del self.docpaths[docid]
            del self.doctimes[docid]
            del self.path2docid[path]
            try:
                self.index.unindex_doc(docid)
            except KeyError as msg:
                # already absent from the index; report and continue
                print("KeyError", msg)
            self.maycommit()

    def getmessagetext(self, m, name=None):
        """Return a list of indexable text chunks for message *m*."""
        L = []
        if name:
            L.append("_folder " + name)  # To restrict search to a folder
        self.getheaders(m, L)
        try:
            self.getmsgparts(m, L, 0)
        except KeyboardInterrupt:
            raise
        except:
            # best-effort: report the traceback and index what we have
            print("(getmsgparts failed:)")
            reportexc()
        return L

    def getmsgparts(self, m, L, level):
        """Recursively append the text/plain bodies of *m* to *L*."""
        ctype = m.gettype()
        if level or ctype != "text/plain":
            print(". " * level + str(ctype))
        if ctype == "text/plain":
            L.append(m.getbodytext())
        elif ctype in ("multipart/alternative", "multipart/mixed"):
            for part in m.getbodyparts():
                self.getmsgparts(part, L, level + 1)
        elif ctype == "message/rfc822":
            # embedded message: re-parse its body as a message
            f = StringIO(m.getbodytext())
            m = mhlib.Message("<folder>", 0, f)
            self.getheaders(m, L)
            self.getmsgparts(m, L, level + 1)

    def getheaders(self, m, L):
        """Append the indexable header values of *m* to *L* as one chunk."""
        H = []
        for key in "from", "to", "cc", "bcc", "subject":
            value = m.get(key)
            if value:
                H.append(value)
        if H:
            L.append("\n".join(H))

    def newdocid(self, path):
        """Return the docid for *path*, allocating one if needed, and
        refresh its recorded mtime."""
        docid = self.path2docid.get(path)
        if docid is not None:
            self.doctimes[docid] = self.getmtime(path)
            return docid
        docid = self.maxdocid + 1
        self.maxdocid = docid
        self.docpaths[docid] = path
        self.doctimes[docid] = self.getmtime(path)
        self.path2docid[path] = docid
        return docid

    def getmtime(self, path):
        """Return the mtime of *path* (relative to the MH root), or 0
        when it cannot be stat'ed."""
        path = os.path.join(self.mh.getpath(), path)
        try:
            st = os.stat(path)
        except os.error as msg:
            return 0
        return int(st[ST_MTIME])

    def maycommit(self):
        """Commit once trans_limit operations have accumulated."""
        self.trans_count += 1
        if self.trans_count >= self.trans_limit > 0:
            self.commit()

    def commit(self):
        """Commit pending work and pack every pack_limit commits."""
        if self.trans_count > 0:
            print("committing...")
            transaction.commit()
            self.trans_count = 0
        self.pack_count += 1
        if self.pack_count >= self.pack_limit > 0:
            self.pack()

    def pack(self):
        """Pack the underlying database if any commits happened."""
        if self.pack_count > 0:
            print("packing...")
            self.database.pack()
            self.pack_count = 0