class Repository(Implicit, Persistent):
    """The repository implementation manages the actual data of versions
       and version histories. It does not handle user interface issues."""

    # NOTE(review): the public methods of this class are described with
    # comments instead of docstrings, as in the original code. Zope's
    # publisher is understood to treat a docstring as a sign that a method
    # may be published, so adding docstrings could widen what is reachable
    # -- confirm before converting any of these comments.

    def __init__(self):
        # These keep track of symbolic label and branch names that
        # have been used to ensure that they don't collide.
        self._branches = OIBTree()
        self._branches['mainline'] = 1
        self._labels = OIBTree()

        # Version histories, keyed by history id.
        self._histories = OOBTree()
        # Moment at which this repository object was created.
        self._created = time.time()

    security = ClassSecurityInfo()

    security.declarePrivate('createVersionHistory')
    def createVersionHistory(self, object):
        """Internal: create a new version history for a resource."""
        # Draw random ids until one is found that is not yet used as a key
        # in the history mapping.
        history_id = None
        while history_id is None or self._histories.has_key(history_id):
            history_id = str(randint(1, 9999999999))
        history = ZopeVersionHistory(history_id, object)
        self._histories[history_id] = history
        # NOTE(review): the original comment said that a brand new history
        # has no _p_jar yet, which makes copy operations fail, and that the
        # repository's _p_jar is shared to work around it. The only visible
        # step is returning the history wrapped in this repository.
        return history.__of__(self)

    security.declarePrivate('getVersionHistory')
    def getVersionHistory(self, history_id):
        """Internal: return a version history given a version history id."""
        return self._histories[history_id].__of__(self)

    security.declarePrivate('replaceState')
    def replaceState(self, obj, new_state):
        """Internal: replace the state of a persistent object.

        The instance dictionary of obj is overwritten in place with the
        contents of new_state.__dict__, which preserves the identity of
        obj. Whatever getNonVersionedData reports for obj is captured
        first and handed to restoreNonVersionedData afterwards.

        Returns obj. Raises VersionControlError if obj and new_state are
        not instances of the same class.
        """
        non_versioned = getNonVersionedData(obj)
        # XXX There ought to be some way to do this more cleanly.
        # This fills the __dict__ of the old object with new state.
        # The other way to achieve the desired effect is to replace
        # the object in its container, but this method preserves the
        # identity of the object.
        if obj.__class__ is not new_state.__class__:
            # repr() takes a single argument; format each class separately
            # so that the intended error (not a TypeError) is raised.
            raise VersionControlError(
                "The class of the versioned object has changed. %s != %s"
                % (repr(obj.__class__), repr(new_state.__class__)))
        obj._p_changed = 1
        # Iterate over a snapshot of the keys, since entries are deleted
        # from the dictionary inside the loop.
        for key in list(obj.__dict__.keys()):
            if key not in new_state.__dict__:
                del obj.__dict__[key]
        for key, value in new_state.__dict__.items():
            obj.__dict__[key] = value
        if non_versioned:
            # Restore the non-versioned data into the new state.
            restoreNonVersionedData(obj, non_versioned)
        return obj

    #####################################################################
    # This is the implementation of the public version control interface.
    #####################################################################

    security.declarePublic('isAVersionableResource')
    def isAVersionableResource(self, obj):
        # Delegates to the module-level function of the same name.
        # For now, an object must be persistent (have its own db record)
        # in order to be considered a versionable resource.
        return isAVersionableResource(obj)

    security.declarePublic('isUnderVersionControl')
    def isUnderVersionControl(self, object):
        # A resource counts as version controlled as soon as it carries
        # the __vc_info__ bookkeeping attribute.
        return hasattr(object, '__vc_info__')

    security.declarePublic('isResourceUpToDate')
    def isResourceUpToDate(self, object, require_branch=0):
        # Ask the history whether the resource's version is the latest one
        # on its branch. The branch is taken from a 'B' sticky tag and is
        # 'mainline' otherwise.
        info = self.getVersionInfo(object)
        history = self.getVersionHistory(info.history_id)
        branch = 'mainline'
        if info.sticky:
            if info.sticky[0] == 'B':
                branch = info.sticky[1]
            elif require_branch:
                # The object is updated to a particular version
                # rather than a branch.  The caller
                # requires a branch.
                return 0
        return history.isLatestVersion(info.version_id, branch)

    security.declarePublic('isResourceChanged')
    def isResourceChanged(self, object):
        # Return true if the state of a resource has changed in a transaction
        # *after* the version bookkeeping was saved. Note that this method is
        # not appropriate for detecting changes within a transaction!
        info = self.getVersionInfo(object)
        itime = getattr(info, '_p_mtime', None)
        if itime is None:
            return 0
        mtime = Utility._findModificationTime(object)
        if mtime is None:
            return 0
        return mtime > itime

    security.declarePublic('getVersionInfo')
    def getVersionInfo(self, object):
        # Return the __vc_info__ bookkeeping object of the resource, or
        # raise VersionControlError if the attribute is missing or None.
        info = getattr(object, '__vc_info__', None)
        if info is not None:
            return info
        raise VersionControlError(
            'The specified resource is not under version control.'
            )

    security.declareProtected(use_vc_permission, 'applyVersionControl')
    def applyVersionControl(self, object, message=None):
        # Put a resource under version control: create its history and
        # first version, attach bookkeeping info and log the checkin.
        # Returns the resource.
        if self.isUnderVersionControl(object):
            raise VersionControlError(
                'The resource is already under version control.'
                )
        if not self.isAVersionableResource(object):
            raise VersionControlError(
                'This resource cannot be put under version control.'
                )

        # Need to check the parent to see if the container of the object
        # being put under version control is itself a version-controlled
        # object. If so, we need to use the branch id of the container.
        branch = 'mainline'
        parent = aq_parent(aq_inner(object))
        p_info = getattr(parent, '__vc_info__', None)
        if p_info is not None:
            sticky = p_info.sticky
            if sticky and sticky[0] == 'B':
                branch = sticky[1]

        # Create a new version history and initial version object.
        history = self.createVersionHistory(object)
        version = history.createVersion(object, branch)

        history_id = history.getId()
        version_id = version.getId()

        # Add bookkeeping information to the version controlled object.
        info = VersionInfo(history_id, version_id, VersionInfo.CHECKED_IN)
        if branch != 'mainline':
            info.sticky = ('B', branch)
        object.__vc_info__ = info

        # Save an audit record of the action being performed. Only a
        # message of None is replaced by the default text.
        history.addLogEntry(version_id,
                            LogEntry.ACTION_CHECKIN,
                            _findPath(object),
                            message is None and 'Initial checkin.' or message
                            )
        return object

    security.declareProtected(use_vc_permission, 'checkoutResource')
    def checkoutResource(self, object):
        # Mark a checked-in, up-to-date resource as checked out and log
        # the action. Returns the resource.
        info = self.getVersionInfo(object)
        if info.status != info.CHECKED_IN:
            raise VersionControlError(
                'The selected resource is already checked out.'
                )

        if info.sticky and info.sticky[0] != 'B':
            raise VersionControlError(
                'The selected resource has been updated to a particular '
                'version, label or date. The resource must be updated to '
                'the mainline or a branch before it may be checked out.'
                )

        if not self.isResourceUpToDate(object):
            raise VersionControlError(
                'The selected resource is not up to date!'
                )

        history = self.getVersionHistory(info.history_id)
        ob_path = _findPath(object)

        # Save an audit record of the action being performed.
        history.addLogEntry(info.version_id,
                            LogEntry.ACTION_CHECKOUT,
                            ob_path
                            )

        # Update bookkeeping information.
        newinfo = info.clone()
        newinfo.status = newinfo.CHECKED_OUT
        object.__vc_info__ = newinfo
        return object

    security.declareProtected(use_vc_permission, 'checkinResource')
    def checkinResource(self, object, message=''):
        # Store the current state of a checked-out resource as a new
        # version on its branch, log the checkin and mark the resource as
        # checked in. Returns the resource.
        info = self.getVersionInfo(object)
        if info.status != info.CHECKED_OUT:
            raise VersionControlError(
                'The selected resource is not checked out.'
                )

        if info.sticky and info.sticky[0] != 'B':
            raise VersionControlError(
                'The selected resource has been updated to a particular '
                'version, label or date. The resource must be updated to '
                'the mainline or a branch before it may be checked in.'
                )

        if not self.isResourceUpToDate(object):
            raise VersionControlError(
                'The selected resource is not up to date!'
                )

        history = self.getVersionHistory(info.history_id)
        ob_path = _findPath(object)

        branch = 'mainline'
        if info.sticky is not None and info.sticky[0] == 'B':
            branch = info.sticky[1]

        version = history.createVersion(object, branch)

        # Save an audit record of the action being performed.
        history.addLogEntry(version.getId(),
                            LogEntry.ACTION_CHECKIN,
                            ob_path,
                            message
                            )

        # Update bookkeeping information.
        newinfo = info.clone()
        newinfo.version_id = version.getId()
        newinfo.status = newinfo.CHECKED_IN
        object.__vc_info__ = newinfo
        return object

    security.declareProtected(use_vc_permission, 'uncheckoutResource')
    def uncheckoutResource(self, object):
        # Discard a checkout: put the state of the version recorded in the
        # bookkeeping info back into the resource, log the action and mark
        # the resource as checked in. Returns the resource.
        info = self.getVersionInfo(object)
        if info.status != info.CHECKED_OUT:
            raise VersionControlError(
                'The selected resource is not checked out.'
                )

        history = self.getVersionHistory(info.history_id)
        ob_path = _findPath(object)

        version = history.getVersionById(info.version_id)
        new_obj = version.copyState()

        # Save an audit record of the action being performed.
        history.addLogEntry(info.version_id,
                            LogEntry.ACTION_UNCHECKOUT,
                            ob_path
                            )

        # Replace the state of the object with a reverted state.
        new_obj = self.replaceState(object, new_obj)

        # Update bookkeeping information.
        newinfo = info.clone()
        newinfo.version_id = version.getId()
        newinfo.status = newinfo.CHECKED_IN
        new_obj.__vc_info__ = newinfo
        return new_obj

    security.declareProtected(use_vc_permission, 'updateResource')
    def updateResource(self, object, selector=None):
        # Update a checked-in resource to the version chosen by `selector`
        # and return the resource.
        #
        # An empty selector refreshes the resource according to its current
        # sticky tag. A non-empty selector is tried, in order, as a version
        # id of this history, a registered label, a registered branch id
        # and finally as a date accepted by DateTime.
        #
        # Raises VersionControlError if the resource is not checked in or
        # the selector cannot be interpreted.
        info = self.getVersionInfo(object)
        if info.status != info.CHECKED_IN:
            raise VersionControlError(
                'The selected resource must be checked in to be updated.'
                )

        history = self.getVersionHistory(info.history_id)
        version = None
        sticky = info.sticky

        if not selector:
            # If selector is null, update to the latest version taking the
            # sticky tag into account.
            if sticky and sticky[0] == 'L':
                # A label sticky tag, so update to that label (since it is
                # possible, but unlikely, that the label has been moved).
                version = history.getVersionByLabel(sticky[1])
            elif sticky and sticky[0] == 'B':
                # A branch sticky tag. Update to latest version on branch.
                # The branch id comes from the sticky tag; `selector` is
                # empty in this code path and must not be used here.
                version = history.getLatestVersion(sticky[1])
            else:
                # Update to mainline, forgetting any date or version id
                # sticky tag that was previously associated with the object.
                version = history.getLatestVersion('mainline')
                sticky = None
        else:
            # If the selector is non-null, we find the version specified
            # and update the sticky tag. Later we'll check the version we
            # found and decide whether we really need to update the object.
            if history.hasVersionId(selector):
                version = history.getVersionById(selector)
                sticky = ('V', selector)

            elif self._labels.has_key(selector):
                version = history.getVersionByLabel(selector)
                sticky = ('L', selector)

            elif self._branches.has_key(selector):
                version = history.getLatestVersion(selector)
                if selector == 'mainline':
                    sticky = None
                else:
                    sticky = ('B', selector)
            else:
                # NOTE(review): the bare except is kept on purpose; any
                # failure to parse the selector as a date is reported as an
                # invalid selector. Narrow it once the exception types
                # raised by DateTime in the supported versions are known.
                try:    date = DateTime(selector)
                except:
                    raise VersionControlError(
                        'Invalid version selector: %s' % selector
                        )
                else:
                    timestamp = date.timeTime()
                    sticky = ('D', timestamp)
                    # Fix!
                    branch = history.findBranchId(info.version_id)
                    version = history.getVersionByDate(branch, timestamp)

        # If the state of the resource really needs to be changed, do the
        # update and make a log entry for the update.
        version_id = version and version.getId() or info.version_id
        new_object = object
        if version and (version_id != info.version_id):
            new_object = version.copyState()
            new_object = self.replaceState(object, new_object)

            history.addLogEntry(version_id,
                                LogEntry.ACTION_UPDATE,
                                _findPath(new_object)
                                )

        # Update bookkeeping information. The sticky tag is only written
        # when one was selected above.
        # NOTE(review): clone(1) presumably yields a copy without the old
        # sticky tag -- confirm against VersionInfo.clone.
        newinfo = info.clone(1)
        newinfo.version_id = version_id
        newinfo.status = newinfo.CHECKED_IN
        if sticky is not None:
            newinfo.sticky = sticky
        new_object.__vc_info__ = newinfo
        return new_object

    security.declareProtected(use_vc_permission, 'labelResource')
    def labelResource(self, object, label, force=0):
        # Attach `label` to the version the checked-in resource is at and
        # return the resource. `force` is passed through to the history.
        info = self.getVersionInfo(object)
        if info.status != info.CHECKED_IN:
            raise VersionControlError(
                'The selected resource must be checked in to be labeled.'
                )

        # Make sure that labels and branch ids do not collide.
        if self._branches.has_key(label) or label == 'mainline':
            raise VersionControlError(
                'The label value given is already in use as an activity id.'
                )
        if not self._labels.has_key(label):
            self._labels[label] = 1

        history = self.getVersionHistory(info.history_id)
        history.labelVersion(info.version_id, label, force)
        return object

    security.declareProtected(use_vc_permission, 'makeActivity')
    def makeActivity(self, object, branch_id):
        # Note - this is not part of the official version control API yet.
        # It is here to allow unit testing of the architectural aspects
        # that are already in place to support activities in the future.
        #
        # Registers `branch_id` as a branch name and creates that branch in
        # the resource's history, starting at the resource's current
        # version. Returns the resource.

        info = self.getVersionInfo(object)
        if info.status != info.CHECKED_IN:
            raise VersionControlError(
                'The selected resource must be checked in.'
                )

        # Any false value is normalized to None.
        branch_id = branch_id or None

        # Make sure that activity ids and labels do not collide.
        if self._labels.has_key(branch_id) or branch_id == 'mainline':
            raise VersionControlError(
                'The value given is already in use as a version label.'
                )

        if not self._branches.has_key(branch_id):
            self._branches[branch_id] = 1

        history = self.getVersionHistory(info.history_id)

        if history._branches.has_key(branch_id):
            raise VersionControlError(
                'The resource is already associated with the given activity.'
                )

        history.createBranch(branch_id, info.version_id)
        return object

    security.declareProtected(use_vc_permission, 'getVersionOfResource')
    def getVersionOfResource(self, history_id, selector):
        # Return a copy of the state of the version chosen by `selector`
        # from the given history, carrying fresh checked-in bookkeeping
        # info. An empty selector or 'mainline' means the latest mainline
        # version; otherwise the selector is tried as a version id, a
        # label, a branch id and finally a date on the mainline.
        history = self.getVersionHistory(history_id)
        sticky = None

        if not selector or selector == 'mainline':
            version = history.getLatestVersion('mainline')
        else:
            if history.hasVersionId(selector):
                version = history.getVersionById(selector)
                sticky = ('V', selector)

            elif self._labels.has_key(selector):
                version = history.getVersionByLabel(selector)
                sticky = ('L', selector)

            elif self._branches.has_key(selector):
                version = history.getLatestVersion(selector)
                sticky = ('B', selector)
            else:
                # NOTE(review): bare except kept on purpose; any failure to
                # parse the selector as a date is reported as an invalid
                # selector.
                try: date = DateTime(selector)
                except:
                    raise VersionControlError(
                        'Invalid version selector: %s' % selector
                        )
                else:
                    timestamp = date.timeTime()
                    sticky = ('D', timestamp)
                    version = history.getVersionByDate('mainline', timestamp)

        object = version.copyState()

        info = VersionInfo(history_id, version.getId(), VersionInfo.CHECKED_IN)
        if sticky is not None:
            info.sticky = sticky
        object.__vc_info__ = info
        return object

    security.declareProtected(use_vc_permission, 'getVersionIds')
    def getVersionIds(self, object):
        # Return whatever the resource's history reports as version ids.
        info = self.getVersionInfo(object)
        history = self.getVersionHistory(info.history_id)
        return history.getVersionIds()

    security.declareProtected(use_vc_permission, 'getLabelsForResource')
    def getLabelsForResource(self, object):
        # Return whatever the resource's history reports as labels.
        info = self.getVersionInfo(object)
        history = self.getVersionHistory(info.history_id)
        return history.getLabels()

    security.declareProtected(use_vc_permission, 'getLogEntries')
    def getLogEntries(self, object):
        # Return whatever the resource's history reports as log entries.
        info = self.getVersionInfo(object)
        history = self.getVersionHistory(info.history_id)
        return history.getLogEntries()
# Example no. 2
# 0
class Repository(Implicit, Persistent):
    """The repository implementation manages the actual data of versions
       and version histories. It does not handle user interface issues."""

    def __init__(self):
        # Registry of branch names that are already in use; 'mainline' is
        # registered from the start. Together with the label registry
        # below it is used to keep label and branch names from colliding.
        branch_names = OIBTree()
        branch_names['mainline'] = 1
        self._branches = branch_names

        # Registry of label names that are already in use.
        self._labels = OIBTree()

        # Version histories keyed by history id, and the moment this
        # repository object was created.
        self._histories = OOBTree()
        self._created = time.time()

    security = ClassSecurityInfo()

    security.declarePrivate('createVersionHistory')
    def createVersionHistory(self, object):
        """Internal: create a new version history for a resource."""
        # Keep drawing random ids until one turns up that is not yet a key
        # of the history mapping.
        while 1:
            candidate_id = str(randint(1, 9999999999))
            if not self._histories.has_key(candidate_id):
                break

        new_history = ZopeVersionHistory(candidate_id, object)
        self._histories[candidate_id] = new_history

        # NOTE(review): the previous comment said that a brand new history
        # has no _p_jar yet, which makes copy operations fail, and that the
        # repository's _p_jar is shared to work around it. The only visible
        # step is handing the history back wrapped in this repository.
        return new_history.__of__(self)

    security.declarePrivate('getVersionHistory')
    def getVersionHistory(self, history_id):
        """Internal: return a version history given a version history id."""
        # Look the history up by id first (an unknown id propagates the
        # mapping's lookup error), then wrap it in this repository.
        stored_history = self._histories[history_id]
        return stored_history.__of__(self)

    security.declarePrivate('replaceState')
    def replaceState(self, obj, new_state):
        """Internal: replace the state of a persistent object.

        The instance dictionary of obj is overwritten in place with the
        contents of new_state.__dict__, which preserves the identity of
        obj. Whatever getNonVersionedData reports for obj is captured
        first and handed to restoreNonVersionedData afterwards.

        Returns obj. Raises VersionControlError if obj and new_state are
        not instances of the same class.
        """
        non_versioned = getNonVersionedData(obj)
        # XXX There ought to be some way to do this more cleanly.
        # This fills the __dict__ of the old object with new state.
        # The other way to achieve the desired effect is to replace
        # the object in its container, but this method preserves the
        # identity of the object.
        if obj.__class__ is not new_state.__class__:
            # repr() takes a single argument; format each class separately
            # so that the intended error (not a TypeError) is raised.
            raise VersionControlError(
                "The class of the versioned object has changed. %s != %s"
                % (repr(obj.__class__), repr(new_state.__class__)))
        obj._p_changed = 1
        # Iterate over a snapshot of the keys, since entries are deleted
        # from the dictionary inside the loop.
        for key in list(obj.__dict__.keys()):
            if key not in new_state.__dict__:
                del obj.__dict__[key]
        for key, value in new_state.__dict__.items():
            obj.__dict__[key] = value
        if non_versioned:
            # Restore the non-versioned data into the new state.
            restoreNonVersionedData(obj, non_versioned)
        return obj

    #####################################################################
    # This is the implementation of the public version control interface.
    #####################################################################

    security.declarePublic('isAVersionableResource')
    def isAVersionableResource(self, obj):
        # The decision is delegated to the module-level function of the
        # same name; its result is returned unchanged.
        # NOTE(review): per the earlier comment, an object presumably has
        # to be persistent (have its own db record) to qualify -- verify
        # against that function.
        versionable = isAVersionableResource(obj)
        return versionable

    security.declarePublic('isUnderVersionControl')
    def isUnderVersionControl(self, object):
        # A resource counts as version controlled as soon as it carries
        # the __vc_info__ bookkeeping attribute.
        has_bookkeeping = hasattr(object, '__vc_info__')
        return has_bookkeeping

    security.declarePublic('isResourceUpToDate')
    def isResourceUpToDate(self, object, require_branch=0):
        # Ask the resource's history whether the resource is at the latest
        # version of its branch. The branch comes from a 'B' sticky tag and
        # is 'mainline' in every other case.
        info = self.getVersionInfo(object)
        history = self.getVersionHistory(info.history_id)

        sticky = info.sticky
        if not sticky:
            branch = 'mainline'
        elif sticky[0] == 'B':
            branch = sticky[1]
        elif require_branch:
            # The resource is pinned to something other than a branch,
            # but the caller insists on a branch.
            return 0
        else:
            branch = 'mainline'

        return history.isLatestVersion(info.version_id, branch)

    security.declarePublic('isResourceChanged')
    def isResourceChanged(self, object):
        # True if the resource was modified in a transaction *after* its
        # version bookkeeping was saved. This compares modification times,
        # so it cannot detect changes made within the current transaction.
        info = self.getVersionInfo(object)
        saved_at = getattr(info, '_p_mtime', None)
        if saved_at is not None:
            modified_at = Utility._findModificationTime(object)
            if modified_at is not None:
                return modified_at > saved_at
        # Without both timestamps nothing can be compared.
        return 0

    security.declarePublic('getVersionInfo')
    def getVersionInfo(self, object):
        # Return the __vc_info__ bookkeeping object of the resource. A
        # missing attribute and a value of None are both reported as "not
        # under version control".
        info = getattr(object, '__vc_info__', None)
        if info is None:
            raise VersionControlError(
                'The specified resource is not under version control.'
                )
        return info

    security.declareProtected(use_vc_permission, 'applyVersionControl')
    def applyVersionControl(self, object, message=None):
        # Put a resource under version control: create its history and
        # first version, attach bookkeeping info to the resource and log
        # the checkin. Returns the resource.
        if self.isUnderVersionControl(object):
            raise VersionControlError(
                'The resource is already under version control.'
                )
        if not self.isAVersionableResource(object):
            raise VersionControlError(
                'This resource cannot be put under version control.'
                )

        # If the container of the resource is itself version controlled
        # and sits on a branch, the new resource starts on that branch.
        container = aq_parent(aq_inner(object))
        container_info = getattr(container, '__vc_info__', None)
        branch = 'mainline'
        if container_info is not None:
            container_sticky = container_info.sticky
            if container_sticky and container_sticky[0] == 'B':
                branch = container_sticky[1]

        # New history plus the initial version on the chosen branch.
        history = self.createVersionHistory(object)
        first_version = history.createVersion(object, branch)
        history_id = history.getId()
        version_id = first_version.getId()

        # Bookkeeping on the resource; a branch other than the mainline is
        # recorded as a sticky tag.
        info = VersionInfo(history_id, version_id, VersionInfo.CHECKED_IN)
        if branch != 'mainline':
            info.sticky = ('B', branch)
        object.__vc_info__ = info

        # Audit record. Only a message of None is replaced by the default
        # text; any other value is logged as given.
        resource_path = _findPath(object)
        if message is None:
            log_message = 'Initial checkin.'
        else:
            log_message = message
        history.addLogEntry(version_id,
                            LogEntry.ACTION_CHECKIN,
                            resource_path,
                            log_message
                            )
        return object

    security.declareProtected(use_vc_permission, 'checkoutResource')
    def checkoutResource(self, object):
        # Mark a checked-in, up-to-date resource as checked out and log
        # the action. Returns the resource.
        info = self.getVersionInfo(object)
        if info.status != info.CHECKED_IN:
            raise VersionControlError(
                'The selected resource is already checked out.'
                )

        # Only resources on the mainline (no sticky tag) or on a branch
        # may be checked out.
        sticky = info.sticky
        if sticky and sticky[0] != 'B':
            raise VersionControlError(
                'The selected resource has been updated to a particular '
                'version, label or date. The resource must be updated to '
                'the mainline or a branch before it may be checked out.'
                )

        up_to_date = self.isResourceUpToDate(object)
        if not up_to_date:
            raise VersionControlError(
                'The selected resource is not up to date!'
                )

        history = self.getVersionHistory(info.history_id)
        resource_path = _findPath(object)

        # Audit record of the checkout.
        history.addLogEntry(info.version_id,
                            LogEntry.ACTION_CHECKOUT,
                            resource_path
                            )

        # Replace the bookkeeping info with a checked-out clone.
        checked_out = info.clone()
        checked_out.status = checked_out.CHECKED_OUT
        object.__vc_info__ = checked_out
        return object

    security.declareProtected(use_vc_permission, 'checkinResource')
    def checkinResource(self, object, message=''):
        # Store the current state of a checked-out resource as a new
        # version on its branch, log the checkin and mark the resource as
        # checked in. Returns the resource.
        info = self.getVersionInfo(object)
        if info.status != info.CHECKED_OUT:
            raise VersionControlError(
                'The selected resource is not checked out.'
                )

        # Only resources on the mainline (no sticky tag) or on a branch
        # may be checked in.
        sticky = info.sticky
        if sticky and sticky[0] != 'B':
            raise VersionControlError(
                'The selected resource has been updated to a particular '
                'version, label or date. The resource must be updated to '
                'the mainline or a branch before it may be checked in.'
                )

        up_to_date = self.isResourceUpToDate(object)
        if not up_to_date:
            raise VersionControlError(
                'The selected resource is not up to date!'
                )

        history = self.getVersionHistory(info.history_id)
        resource_path = _findPath(object)

        # The new version goes onto the branch named by a 'B' sticky tag,
        # or onto the mainline.
        if sticky is not None and sticky[0] == 'B':
            branch = sticky[1]
        else:
            branch = 'mainline'
        new_version = history.createVersion(object, branch)

        # Audit record of the checkin.
        history.addLogEntry(new_version.getId(),
                            LogEntry.ACTION_CHECKIN,
                            resource_path,
                            message
                            )

        # Replace the bookkeeping info with a checked-in clone that points
        # at the new version.
        checked_in = info.clone()
        checked_in.version_id = new_version.getId()
        checked_in.status = checked_in.CHECKED_IN
        object.__vc_info__ = checked_in
        return object

    security.declareProtected(use_vc_permission, 'uncheckoutResource')
    def uncheckoutResource(self, object):
        # Discard a checkout: put the state of the version recorded in the
        # bookkeeping info back into the resource, log the action and mark
        # the resource as checked in. Returns the resource.
        info = self.getVersionInfo(object)
        if info.status != info.CHECKED_OUT:
            raise VersionControlError(
                'The selected resource is not checked out.'
                )

        history = self.getVersionHistory(info.history_id)
        resource_path = _findPath(object)

        # Copy of the state of the version the checkout started from.
        base_version = history.getVersionById(info.version_id)
        saved_state = base_version.copyState()

        # Audit record of the uncheckout.
        history.addLogEntry(info.version_id,
                            LogEntry.ACTION_UNCHECKOUT,
                            resource_path
                            )

        # Overwrite the resource's state with the saved state.
        reverted = self.replaceState(object, saved_state)

        # Replace the bookkeeping info with a checked-in clone.
        checked_in = info.clone()
        checked_in.version_id = base_version.getId()
        checked_in.status = checked_in.CHECKED_IN
        reverted.__vc_info__ = checked_in
        return reverted

    security.declareProtected(use_vc_permission, 'updateResource')
    def updateResource(self, object, selector=None):
        # Update a checked-in resource to the version chosen by `selector`
        # and return the resource.
        #
        # An empty selector refreshes the resource according to its current
        # sticky tag. A non-empty selector is tried, in order, as a version
        # id of this history, a registered label, a registered branch id
        # and finally as a date accepted by DateTime.
        #
        # Raises VersionControlError if the resource is not checked in or
        # the selector cannot be interpreted.
        info = self.getVersionInfo(object)
        if info.status != info.CHECKED_IN:
            raise VersionControlError(
                'The selected resource must be checked in to be updated.'
                )

        history = self.getVersionHistory(info.history_id)
        version = None
        sticky = info.sticky

        if not selector:
            # If selector is null, update to the latest version taking the
            # sticky tag into account.
            if sticky and sticky[0] == 'L':
                # A label sticky tag, so update to that label (since it is
                # possible, but unlikely, that the label has been moved).
                version = history.getVersionByLabel(sticky[1])
            elif sticky and sticky[0] == 'B':
                # A branch sticky tag. Update to latest version on branch.
                # The branch id comes from the sticky tag; `selector` is
                # empty in this code path and must not be used here.
                version = history.getLatestVersion(sticky[1])
            else:
                # Update to mainline, forgetting any date or version id
                # sticky tag that was previously associated with the object.
                version = history.getLatestVersion('mainline')
                sticky = None
        else:
            # If the selector is non-null, we find the version specified
            # and update the sticky tag. Later we'll check the version we
            # found and decide whether we really need to update the object.
            if history.hasVersionId(selector):
                version = history.getVersionById(selector)
                sticky = ('V', selector)

            elif self._labels.has_key(selector):
                version = history.getVersionByLabel(selector)
                sticky = ('L', selector)

            elif self._branches.has_key(selector):
                version = history.getLatestVersion(selector)
                if selector == 'mainline':
                    sticky = None
                else:
                    sticky = ('B', selector)
            else:
                # NOTE(review): the bare except is kept on purpose; any
                # failure to parse the selector as a date is reported as an
                # invalid selector. Narrow it once the exception types
                # raised by DateTime in the supported versions are known.
                try:    date = DateTime(selector)
                except:
                    raise VersionControlError(
                        'Invalid version selector: %s' % selector
                        )
                else:
                    timestamp = date.timeTime()
                    sticky = ('D', timestamp)
                    # Fix!
                    branch = history.findBranchId(info.version_id)
                    version = history.getVersionByDate(branch, timestamp)

        # If the state of the resource really needs to be changed, do the
        # update and make a log entry for the update.
        version_id = version and version.getId() or info.version_id
        new_object = object
        if version and (version_id != info.version_id):
            new_object = version.copyState()
            new_object = self.replaceState(object, new_object)

            history.addLogEntry(version_id,
                                LogEntry.ACTION_UPDATE,
                                _findPath(new_object)
                                )

        # Update bookkeeping information. The sticky tag is only written
        # when one was selected above.
        # NOTE(review): clone(1) presumably yields a copy without the old
        # sticky tag -- confirm against VersionInfo.clone.
        newinfo = info.clone(1)
        newinfo.version_id = version_id
        newinfo.status = newinfo.CHECKED_IN
        if sticky is not None:
            newinfo.sticky = sticky
        new_object.__vc_info__ = newinfo
        return new_object

    security.declareProtected(use_vc_permission, 'labelResource')
    def labelResource(self, object, label, force=0):
        """Attach `label` to the version that `object` is currently at.

        The resource must be checked in, and the label must not already
        be in use as a branch (activity) id.  The label is recorded in the
        repository-wide label registry before the version history is asked
        to label the version; `force` is passed through to the history.
        Returns `object` unchanged.
        """
        vc_info = self.getVersionInfo(object)
        if vc_info.status != vc_info.CHECKED_IN:
            raise VersionControlError(
                'The selected resource must be checked in to be labeled.'
                )

        # Labels and branch ids share one namespace, so they may not collide.
        clashes_with_branch = (
            self._branches.has_key(label) or label == 'mainline'
            )
        if clashes_with_branch:
            raise VersionControlError(
                'The label value given is already in use as an activity id.'
                )

        known_labels = self._labels
        if not known_labels.has_key(label):
            known_labels[label] = 1

        self.getVersionHistory(vc_info.history_id).labelVersion(
            vc_info.version_id, label, force
            )
        return object

    security.declareProtected(use_vc_permission, 'makeActivity')
    def makeActivity(self, object, branch_id):
        """Create the branch (activity) `branch_id` from the resource's
        current version and return `object` unchanged.

        Note - this is not part of the official version control API yet.
        It is here to allow unit testing of the architectural aspects
        that are already in place to support activities in the future.
        """
        vc_info = self.getVersionInfo(object)
        if vc_info.status != vc_info.CHECKED_IN:
            raise VersionControlError(
                'The selected resource must be checked in.'
                )

        # Any false value ('' or 0, for instance) is treated as None.
        if not branch_id:
            branch_id = None

        # Activity ids and labels share one namespace and may not collide.
        clashes_with_label = (
            self._labels.has_key(branch_id) or branch_id == 'mainline'
            )
        if clashes_with_label:
            raise VersionControlError(
                'The value given is already in use as a version label.'
                )

        # Register the id repository-wide before touching the history.
        known_branches = self._branches
        if not known_branches.has_key(branch_id):
            known_branches[branch_id] = 1

        history = self.getVersionHistory(vc_info.history_id)
        already_branched = history._branches.has_key(branch_id)
        if already_branched:
            raise VersionControlError(
                'The resource is already associated with the given activity.'
                )

        history.createBranch(branch_id, vc_info.version_id)
        return object

    security.declareProtected(use_vc_permission, 'getVersionOfResource')
    def getVersionOfResource(self, history_id, selector):
        """Return a copy of the version of history `history_id` that
        `selector` designates, marked as checked in.

        An empty selector or 'mainline' picks the latest mainline version.
        Otherwise the selector is tried, in order, as a version id, a
        label, a branch id and finally a date; a selector matching none of
        these raises VersionControlError.  Every form except the mainline
        one also records a sticky tag on the returned object's version
        info.
        """
        history = self.getVersionHistory(history_id)
        sticky = None

        if not selector or selector == 'mainline':
            version = history.getLatestVersion('mainline')

        elif history.hasVersionId(selector):
            sticky = ('V', selector)
            version = history.getVersionById(selector)

        elif self._labels.has_key(selector):
            sticky = ('L', selector)
            version = history.getVersionByLabel(selector)

        elif self._branches.has_key(selector):
            sticky = ('B', selector)
            version = history.getLatestVersion(selector)

        else:
            # Last resort: interpret the selector as a date.
            try:
                date = DateTime(selector)
            except:
                raise VersionControlError(
                    'Invalid version selector: %s' % selector
                    )
            timestamp = date.timeTime()
            sticky = ('D', timestamp)
            version = history.getVersionByDate('mainline', timestamp)

        result = version.copyState()

        info = VersionInfo(history_id, version.getId(), VersionInfo.CHECKED_IN)
        if sticky is not None:
            info.sticky = sticky
        result.__vc_info__ = info
        return result

    security.declareProtected(use_vc_permission, 'getVersionIds')
    def getVersionIds(self, object):
        """Return the version ids held by the version history of `object`."""
        history_id = self.getVersionInfo(object).history_id
        return self.getVersionHistory(history_id).getVersionIds()

    security.declareProtected(use_vc_permission, 'getLabelsForResource')
    def getLabelsForResource(self, object):
        """Return the labels held by the version history of `object`."""
        history_id = self.getVersionInfo(object).history_id
        return self.getVersionHistory(history_id).getLabels()

    security.declareProtected(use_vc_permission, 'getLogEntries')
    def getLogEntries(self, object):
        """Return the log entries held by the version history of `object`."""
        history_id = self.getVersionInfo(object).history_id
        return self.getVersionHistory(history_id).getLogEntries()
class GlobbingLexicon(Lexicon):
    """Lexicon which supports basic globbing function ('*' and '?').

    This lexicon keeps several data structures around that are useful
    for searching. They are:

      '_lexicon' -- Contains the mapping from word => word_id

      '_inverseLex' -- Contains the mapping from word_id => word

      '_digrams' -- Contains a mapping from digram => set of word_ids

    Before going further, it is necessary to understand what a digram is,
    as it is a core component of the structure of this lexicon.  A digram
    is a two-letter sequence in a word.  For example, the word 'zope'
    would be converted into the digrams::

      ['$z', 'zo', 'op', 'pe', 'e$']

    where the '$' is a word marker.  It is used at the beginning and end
    of the words.  Those digrams are significant.
    """

    multi_wc = '*'
    single_wc = '?'
    eow = '$'

    def __init__(self, useSplitter=None, extra=None):
        """Start with empty mappings and look up the splitter to use.

        'useSplitter' is handed to Splitter.getSplitter(); 'extra' is kept
        as 'splitterParams' and consulted by the Splitter() wrapper method.
        """
        self.clear()
        self.useSplitter = useSplitter
        self.splitterParams = extra
        self.SplitterFunc = Splitter.getSplitter(self.useSplitter)

    def clear(self):
        """Replace all three mappings with fresh, empty BTrees."""
        self._lexicon = OIBTree()
        self._inverseLex = IOBTree()
        self._digrams = OOBTree()

    def _convertBTrees(self, threshold=200):
        """Convert legacy mappings; '_digrams' is rebuilt as an OOBTree
        through BTrees.convert unless it already is one."""
        Lexicon._convertBTrees(self, threshold)
        if type(self._digrams) is OOBTree:
            return

        from BTrees.convert import convert

        old_digrams = self._digrams
        self._digrams = OOBTree()
        self._digrams._p_jar = self._p_jar
        convert(old_digrams, self._digrams, threshold, IITreeSet)

    def createDigrams(self, word):
        """Returns a list with the set of digrams in the word."""
        word = '$' + word + '$'
        return [word[i:i + 2] for i in range(len(word) - 1)]

    def getWordId(self, word):
        """Provided 'word', return the matching integer word id.

        A word that is not in the lexicon yet is assigned a new id.
        """
        if self._lexicon.has_key(word):
            return self._lexicon[word]
        return self.assignWordId(word)

    set = getWordId                     # Kludge for old code

    def getWord(self, wid):
        """Return the word stored for 'wid', or None if there is none."""
        return self._inverseLex.get(wid, None)

    def assignWordId(self, word):
        """Assigns a new word id to the provided word, and return it."""

        # Double check it's not in the lexicon already, and if it is, just
        # return it.
        if self._lexicon.has_key(word):
            return self._lexicon[word]

        # Get word id. BBB Backward compat pain.
        inverse = self._inverseLex
        try:
            insert = inverse.insert
        except AttributeError:
            # we have an "old" BTree object
            if inverse:
                wid = inverse.keys()[-1] + 1
            else:
                # Swap the empty legacy mapping for an IOBTree.  'inverse'
                # must be rebound as well, otherwise the word would be
                # stored in the discarded old object and be missing from
                # the new '_inverseLex'.
                inverse = self._inverseLex = IOBTree()
                wid = 1
            inverse[wid] = word
        else:
            # we have a "new" IOBTree object; insert() returns a false
            # value when the id is already taken, so retry with a new one.
            wid = randid()
            while not insert(wid, word):
                wid = randid()

        self._lexicon[word] = wid

        # Now take all the digrams and insert them into the digram map.
        for digram in self.createDigrams(word):
            wids = self._digrams.get(digram, None)
            if wids is None:
                self._digrams[digram] = wids = IISet()
            wids.insert(wid)

        return wid

    def get(self, pattern):
        """Query the lexicon for words matching a pattern.

        Returns an empty tuple when nothing matches, a one-element tuple
        holding the word id when 'pattern' contains no wildcard, and an
        IISet of word ids when it does.
        """

        # A single-character pattern would produce a slicing problem below.
        # Because the splitter throws away single characters we can
        # return an empty tuple here.
        if len(pattern) == 1:
            return ()

        wc_set = (self.multi_wc, self.single_wc)

        digrams = []
        globbing = 0
        for i in range(len(pattern)):
            if pattern[i] in wc_set:
                globbing = 1
                continue

            if i == 0:
                # The list is still empty here, so appending keeps the
                # begin-of-word digram in front.
                digrams.append(self.eow + pattern[i])
                digrams.append(pattern[i] + pattern[i + 1])
            else:
                try:
                    if pattern[i + 1] not in wc_set:
                        digrams.append(pattern[i] + pattern[i + 1])
                except IndexError:
                    # Last character of the pattern: end-of-word digram.
                    digrams.append(pattern[i] + self.eow)

        if not globbing:
            result = self._lexicon.get(pattern, None)
            if result is None:
                return ()
            return (result, )

        ## now get all of the intsets that contain the result digrams
        result = None
        for digram in digrams:
            result = union(result, self._digrams.get(digram, None))

        if not result:
            return ()

        ## now we have narrowed the list of possible candidates
        ## down to those words which contain digrams.  However,
        ## some words may have been returned that match digrams,
        ## but do not match 'pattern'.  This is because some words
        ## may contain all matching digrams, but in the wrong
        ## order.
        expr = re.compile(self.createRegex(pattern))
        hits = IISet()
        for wid in result:
            if expr.match(self._inverseLex[wid]):
                hits.insert(wid)
        return hits

    def __getitem__(self, word):
        """Same as get(word)."""
        return self.get(word)

    def query_hook(self, q):
        """expand wildcards

        Walks the query list 'q' from the end and replaces, in place,
        every term containing a wildcard by a list of matching word ids
        separated by Or.  Nested lists are processed recursively.
        Returns 'q'.
        """
        for i in range(len(q) - 1, -1, -1):
            e = q[i]
            if isinstance(e, list):
                self.query_hook(e)
            elif isinstance(e, Op):
                pass
            elif (self.multi_wc in e) or (self.single_wc in e):
                words = []
                for wid in self.get(e):
                    if words:
                        words.append(Or)
                    words.append(wid)
                if not words:
                    # if words is empty, return something that will make
                    # textindex's __getitem__ return an empty result list
                    words.append('')
                q[i] = words

        return q

    def Splitter(self, astring, words=None, encoding="latin1"):
        """ wrap the splitter """

        ## don't do anything, less efficient but there's not much
        ## sense in stemming a globbing lexicon.

        try:
            return self.SplitterFunc(
                astring,
                words,
                encoding=encoding,
                singlechar=self.splitterParams.splitterSingleChars,
                indexnumbers=self.splitterParams.splitterIndexNumbers,
                casefolding=self.splitterParams.splitterCasefolding
                )
        except:
            # Deliberately broad best-effort fallback: reached, for
            # instance, when 'splitterParams' is None and the attribute
            # lookups above fail.
            return self.SplitterFunc(astring, words)

    def createRegex(self, pat):
        """Translate a PATTERN to a regular expression.

        There is no way to quote meta-characters.
        """

        # Remove characters that are meaningful in a regex
        if not isinstance(pat, UnicodeType):
            transTable = string.maketrans("", "")
            result = string.translate(pat, transTable,
                                      r'()&|!@#$%^{}\<>.')
        else:
            transTable = {}
            for ch in r'()&|!@#$%^{}\<>.':
                transTable[ord(ch)] = None
            result = pat.translate(transTable)

        # First, deal with multi-character globbing
        result = result.replace('*', '.*')

        # Next, we need to deal with single-character globbing
        result = result.replace('?', '.')

        return "%s$" % result
# Example no. 4
# 0
class GlobbingLexicon(Lexicon):
    """Lexicon which supports basic globbing function ('*' and '?').

    This lexicon keeps several data structures around that are useful
    for searching. They are:

      '_lexicon' -- Contains the mapping from word => word_id

      '_inverseLex' -- Contains the mapping from word_id => word

      '_digrams' -- Contains a mapping from digram => set of word_ids

    Before going further, it is necessary to understand what a digram is,
    as it is a core component of the structure of this lexicon.  A digram
    is a two-letter sequence in a word.  For example, the word 'zope'
    would be converted into the digrams::

      ['$z', 'zo', 'op', 'pe', 'e$']

    where the '$' is a word marker.  It is used at the beginning and end
    of the words.  Those digrams are significant.
    """

    multi_wc = '*'
    single_wc = '?'
    eow = '$'

    def __init__(self, useSplitter=None, extra=None):
        """Start with empty mappings and look up the splitter to use.

        'useSplitter' is handed to Splitter.getSplitter(); 'extra' is kept
        as 'splitterParams' and consulted by the Splitter() wrapper method.
        """
        self.clear()
        self.useSplitter = useSplitter
        self.splitterParams = extra
        self.SplitterFunc = Splitter.getSplitter(self.useSplitter)

    def clear(self):
        """Replace all three mappings with fresh, empty BTrees."""
        self._lexicon = OIBTree()
        self._inverseLex = IOBTree()
        self._digrams = OOBTree()

    def _convertBTrees(self, threshold=200):
        """Convert legacy mappings; '_digrams' is rebuilt as an OOBTree
        through BTrees.convert unless it already is one."""
        Lexicon._convertBTrees(self, threshold)
        if type(self._digrams) is OOBTree:
            return

        from BTrees.convert import convert

        old_digrams = self._digrams
        self._digrams = OOBTree()
        self._digrams._p_jar = self._p_jar
        convert(old_digrams, self._digrams, threshold, IITreeSet)

    def createDigrams(self, word):
        """Returns a list with the set of digrams in the word."""
        word = '$' + word + '$'
        return [word[i:i + 2] for i in range(len(word) - 1)]

    def getWordId(self, word):
        """Provided 'word', return the matching integer word id.

        A word that is not in the lexicon yet is assigned a new id.
        """
        if self._lexicon.has_key(word):
            return self._lexicon[word]
        return self.assignWordId(word)

    set = getWordId  # Kludge for old code

    def getWord(self, wid):
        """Return the word stored for 'wid', or None if there is none."""
        return self._inverseLex.get(wid, None)

    def assignWordId(self, word):
        """Assigns a new word id to the provided word, and return it."""

        # Double check it's not in the lexicon already, and if it is, just
        # return it.
        if self._lexicon.has_key(word):
            return self._lexicon[word]

        # Get word id. BBB Backward compat pain.
        inverse = self._inverseLex
        try:
            insert = inverse.insert
        except AttributeError:
            # we have an "old" BTree object
            if inverse:
                wid = inverse.keys()[-1] + 1
            else:
                # Swap the empty legacy mapping for an IOBTree.  'inverse'
                # must be rebound as well, otherwise the word would be
                # stored in the discarded old object and be missing from
                # the new '_inverseLex'.
                inverse = self._inverseLex = IOBTree()
                wid = 1
            inverse[wid] = word
        else:
            # we have a "new" IOBTree object; insert() returns a false
            # value when the id is already taken, so retry with a new one.
            wid = randid()
            while not insert(wid, word):
                wid = randid()

        self._lexicon[word] = wid

        # Now take all the digrams and insert them into the digram map.
        for digram in self.createDigrams(word):
            wids = self._digrams.get(digram, None)
            if wids is None:
                self._digrams[digram] = wids = IISet()
            wids.insert(wid)

        return wid

    def get(self, pattern):
        """Query the lexicon for words matching a pattern.

        Returns an empty tuple when nothing matches, a one-element tuple
        holding the word id when 'pattern' contains no wildcard, and an
        IISet of word ids when it does.
        """

        # A single-character pattern would produce a slicing problem below.
        # Because the splitter throws away single characters we can
        # return an empty tuple here.
        if len(pattern) == 1:
            return ()

        wc_set = (self.multi_wc, self.single_wc)

        digrams = []
        globbing = 0
        for i in range(len(pattern)):
            if pattern[i] in wc_set:
                globbing = 1
                continue

            if i == 0:
                # The list is still empty here, so appending keeps the
                # begin-of-word digram in front.
                digrams.append(self.eow + pattern[i])
                digrams.append(pattern[i] + pattern[i + 1])
            else:
                try:
                    if pattern[i + 1] not in wc_set:
                        digrams.append(pattern[i] + pattern[i + 1])
                except IndexError:
                    # Last character of the pattern: end-of-word digram.
                    digrams.append(pattern[i] + self.eow)

        if not globbing:
            result = self._lexicon.get(pattern, None)
            if result is None:
                return ()
            return (result, )

        ## now get all of the intsets that contain the result digrams
        result = None
        for digram in digrams:
            result = union(result, self._digrams.get(digram, None))

        if not result:
            return ()

        ## now we have narrowed the list of possible candidates
        ## down to those words which contain digrams.  However,
        ## some words may have been returned that match digrams,
        ## but do not match 'pattern'.  This is because some words
        ## may contain all matching digrams, but in the wrong
        ## order.
        expr = re.compile(self.createRegex(pattern))
        hits = IISet()
        for wid in result:
            if expr.match(self._inverseLex[wid]):
                hits.insert(wid)
        return hits

    def __getitem__(self, word):
        """Same as get(word)."""
        return self.get(word)

    def query_hook(self, q):
        """expand wildcards

        Walks the query list 'q' from the end and replaces, in place,
        every term containing a wildcard by a list of matching word ids
        separated by Or.  Nested lists are processed recursively.
        Returns 'q'.
        """
        for i in range(len(q) - 1, -1, -1):
            e = q[i]
            if isinstance(e, list):
                self.query_hook(e)
            elif isinstance(e, Op):
                pass
            elif (self.multi_wc in e) or (self.single_wc in e):
                words = []
                for wid in self.get(e):
                    if words:
                        words.append(Or)
                    words.append(wid)
                if not words:
                    # if words is empty, return something that will make
                    # textindex's __getitem__ return an empty result list
                    words.append('')
                q[i] = words

        return q

    def Splitter(self, astring, words=None, encoding="latin1"):
        """ wrap the splitter """

        ## don't do anything, less efficient but there's not much
        ## sense in stemming a globbing lexicon.

        try:
            return self.SplitterFunc(
                astring,
                words,
                encoding=encoding,
                singlechar=self.splitterParams.splitterSingleChars,
                indexnumbers=self.splitterParams.splitterIndexNumbers,
                casefolding=self.splitterParams.splitterCasefolding)
        except:
            # Deliberately broad best-effort fallback: reached, for
            # instance, when 'splitterParams' is None and the attribute
            # lookups above fail.
            return self.SplitterFunc(astring, words)

    def createRegex(self, pat):
        """Translate a PATTERN to a regular expression.

        There is no way to quote meta-characters.
        """

        # Remove characters that are meaningful in a regex
        if not isinstance(pat, UnicodeType):
            transTable = string.maketrans("", "")
            result = string.translate(pat, transTable, r'()&|!@#$%^{}\<>.')
        else:
            transTable = {}
            for ch in r'()&|!@#$%^{}\<>.':
                transTable[ord(ch)] = None
            result = pat.translate(transTable)

        # First, deal with multi-character globbing
        result = result.replace('*', '.*')

        # Next, we need to deal with single-character globbing
        result = result.replace('?', '.')

        return "%s$" % result
# Example no. 5
# 0
class Indexer(object):
    """Index MH mail folders into a FileStorage-backed text index and
    query that index from the command line.

    Persistent state lives in the database root under the keys "index",
    "docpaths" (docid -> message path), "doctimes" (docid -> mtime) and
    "watchfolders"; 'path2docid' is an in-memory inverse of "docpaths"
    rebuilt on every start.
    """

    filestorage = database = connection = root = None

    def __init__(self, datafs, writable=0, trans=0, pack=0):
        """Open the storage file 'datafs' and load or create the index state.

        writable -- when false the file storage is opened read-only
        trans -- maycommit() commits once this many changes piled up
                 (0 disables the automatic commit)
        pack -- commit() packs once this many commits happened
                (0 disables the automatic pack)
        """
        self.trans_limit = trans
        self.pack_limit = pack
        self.trans_count = 0
        self.pack_count = 0
        self.stopdict = get_stopdict()
        self.mh = mhlib.MH()
        self.filestorage = FileStorage(datafs, read_only=(not writable))
        self.database = DB(self.filestorage)
        self.connection = self.database.open()
        self.root = self.connection.root()
        self.index = self._rootobject("index", TextIndexWrapper)
        self.docpaths = self._rootobject("docpaths", IOBTree)
        self.doctimes = self._rootobject("doctimes", IIBTree)
        self.watchfolders = self._rootobject("watchfolders", dict)
        self.path2docid = OIBTree()
        for docid, path in self.docpaths.items():
            self.path2docid[path] = docid
        try:
            self.maxdocid = max(self.docpaths.keys())
        except ValueError:
            # max() of an empty sequence: nothing has been indexed yet.
            self.maxdocid = 0
        print(len(self.docpaths), "Document ids")
        print(len(self.path2docid), "Pathnames")
        print(self.index.lexicon.length(), "Words")

    def _rootobject(self, name, factory):
        """Return self.root[name], storing factory() there first if missing."""
        try:
            return self.root[name]
        except KeyError:
            obj = self.root[name] = factory()
            return obj

    def _wordfreq(self, index, wid):
        """Return the sum of the values index._wordinfo holds for 'wid'
        (0 when there is no entry)."""
        return sum(index._wordinfo.get(wid, {}).values())

    def dumpfreqs(self):
        """Print wid, frequency and word for every word, highest
        frequency first."""
        lexicon = self.index.lexicon
        index = self.index.index
        assert isinstance(index, OkapiIndex)
        L = [(self._wordfreq(index, wid), wid, lexicon.get_word(wid))
             for wid in lexicon.wids()]
        L.sort(reverse=True)
        for freq, wid, word in L:
            print("%10d %10d %s" % (wid, freq, word))

    def dumpwids(self):
        """Print wid, frequency and word in the order lexicon.wids() yields."""
        lexicon = self.index.lexicon
        index = self.index.index
        assert isinstance(index, OkapiIndex)
        for wid in lexicon.wids():
            freq = self._wordfreq(index, wid)
            print("%10d %10d %s" % (wid, freq, lexicon.get_word(wid)))

    def dumpwords(self):
        """Print wid, frequency and word in the order lexicon.words() yields."""
        lexicon = self.index.lexicon
        index = self.index.index
        assert isinstance(index, OkapiIndex)
        for word in lexicon.words():
            wid = lexicon.get_wid(word)
            freq = self._wordfreq(index, wid)
            print("%10d %10d %s" % (wid, freq, word))

    def close(self):
        """Close connection, database and storage; safe to call twice."""
        self.root = None
        if self.connection is not None:
            self.connection.close()
            self.connection = None
        if self.database is not None:
            self.database.close()
            self.database = None
        if self.filestorage is not None:
            self.filestorage.close()
            self.filestorage = None

    def interact(self, nbest=NBEST, maxlines=MAXLINES):
        """Run an interactive query loop until end-of-file.

        An empty line shows the next 'nbest' hits of the previous query;
        a line starting with "/" is handed to specialcommand().
        """
        try:
            # Imported only for its effect on raw_input() line editing.
            import readline
        except ImportError:
            pass
        text = ""
        top = 0
        results = []
        while 1:
            try:
                line = raw_input("Query: ")
            except EOFError:
                print("\nBye.")
                break
            line = line.strip()
            if line.startswith("/"):
                self.specialcommand(line, results, top - nbest)
                continue
            if line:
                text = line
                top = 0
            else:
                if not text:
                    continue
            try:
                results, n = self.timequery(text, top + nbest)
            except KeyboardInterrupt:
                raise
            except:
                # Best effort: report any query failure and keep the
                # prompt alive.
                reportexc()
                text = ""
                continue
            if len(results) <= top:
                if not n:
                    print("No hits for %r." % text)
                else:
                    print("No more hits for %r." % text)
                text = ""
                continue
            print("[Results %d-%d from %d" % (top+1, min(n, top+nbest), n),
                  end=" ")
            print("for query %s]" % repr(text))
            self.formatresults(text, results, maxlines, top, top+nbest)
            top += nbest

    def specialcommand(self, line, results, first):
        """Handle a "/" command: show one message of 'results' with the
        external "show" program.

        "/" alone selects index 'first'; "/N" selects the N-th result
        (counting from 1).
        """
        assert line.startswith("/")
        line = line[1:]
        if not line:
            n = first
        else:
            try:
                n = int(line) - 1
            except ValueError:
                print("Huh?")
                return
        if n < 0 or n >= len(results):
            print("Out of range")
            return
        docid, score = results[n]
        path = self.docpaths[docid]
        i = path.rfind("/")
        assert i > 0
        folder = path[:i]
        msgname = path[i+1:]
        cmd = "show +%s %s" % (folder, msgname)
        if os.getenv("DISPLAY"):
            os.system("xterm -e  sh -c '%s | less' &" % cmd)
        else:
            os.system(cmd)

    def query(self, text, nbest=NBEST, maxlines=MAXLINES):
        """Run one query and print up to 'nbest' formatted hits."""
        results, n = self.timequery(text, nbest)
        if not n:
            print("No hits for %r." % text)
            return
        print("[Results 1-%d from %d]" % (len(results), n))
        self.formatresults(text, results, maxlines)

    def timequery(self, text, nbest):
        """Query the index, print the elapsed time, return (results, n)."""
        t0 = time.time()
        c0 = time.clock()
        results, n = self.index.query(text, 0, nbest)
        t1 = time.time()
        c1 = time.clock()
        print("[Query time: %.3f real, %.3f user]" % (t1-t0, c1-c0))
        return results, n

    def formatresults(self, text, results, maxlines=MAXLINES,
                      lo=0, hi=sys.maxint):
        """Print rank, score, selected headers and up to 'maxlines'
        matching body lines for results[lo:hi]."""
        stop = self.stopdict.has_key
        words = [w for w in re.findall(r"\w+\*?", text.lower()) if not stop(w)]
        pattern = r"\b(" + "|".join(words) + r")\b"
        pattern = pattern.replace("*", ".*") # glob -> re syntax
        prog = re.compile(pattern, re.IGNORECASE)
        print('='*70)
        rank = lo
        for docid, score in results[lo:hi]:
            rank += 1
            path = self.docpaths[docid]
            score *= 100.0
            print("Rank:    %d   Score: %d%%   File: %s" % (rank, score, path))
            path = os.path.join(self.mh.getpath(), path)
            try:
                fp = open(path)
            except (IOError, OSError) as msg:
                print("Can't open:", msg)
                continue
            try:
                msg = mhlib.Message("<folder>", 0, fp)
                for header in "From", "To", "Cc", "Bcc", "Subject", "Date":
                    h = msg.getheader(header)
                    if h:
                        print("%-8s %s" % (header+":", h))
                parts = self.getmessagetext(msg)
            finally:
                # Everything needed has been read; don't leak the handle.
                fp.close()
            if parts:
                print()
                nleft = maxlines
                for part in parts:
                    for line in part.splitlines():
                        if prog.search(line):
                            print(line)
                            nleft -= 1
                            if nleft <= 0:
                                break
                    if nleft <= 0:
                        break
            print('-'*70)

    def update(self, args):
        """Index the messages named by 'args' and commit.

        An argument starting with "+" names the folder (only one is
        allowed); the others are message sequences.  Defaults are the
        current MH context folder and the sequence 'all'.
        """
        folder = None
        seqs = []

        for arg in args:
            if arg.startswith("+"):
                if folder is None:
                    folder = arg[1:]
                else:
                    print("only one folder at a time")
                    return
            else:
                seqs.append(arg)

        if not folder:
            folder = self.mh.getcontext()
        if not seqs:
            seqs = ['all']

        try:
            f = self.mh.openfolder(folder)
        except mhlib.Error as msg:
            print(msg)
            return

        # Collect each message number once, even if sequences overlap.
        unique = set()
        for seq in seqs:
            try:
                nums = f.parsesequence(seq)
            except mhlib.Error as msg:
                print(msg or "unparsable message sequence: %s" % repr(seq))
                return
            unique.update(nums)
        msgs = sorted(unique)

        self.updatefolder(f, msgs)
        self.commit()

    def optimize(self, args):
        """Prescan the given folders, print the (up to) 100 most frequent
        words and feed all words, most frequent first, to the lexicon."""
        uniqwords = {}
        for folder in args:
            if folder.startswith("+"):
                folder = folder[1:]
            print("\nOPTIMIZE FOLDER", folder)
            try:
                f = self.mh.openfolder(folder)
            except mhlib.Error as msg:
                print(msg)
                continue
            self.prescan(f, f.listmessages(), uniqwords)
        L = [(count, word) for word, count in uniqwords.items()]
        L.sort(reverse=True)
        # Slice instead of indexing 0..99 so that fewer than 100 unique
        # words do not raise IndexError.
        for i, entry in enumerate(L[:100]):
            print("%3d. %6d %s" % ((i+1,) + entry))
        self.index.lexicon.sourceToWordIds([word for (count, word) in L])

    def prescan(self, f, msgs, uniqwords):
        """Count, into the dict 'uniqwords', the words of messages 'msgs'
        of folder 'f' after splitting, case normalizing and stop-word
        removal."""
        pipeline = [Splitter(), CaseNormalizer(), StopWordRemover()]
        for n in msgs:
            print("prescanning", n)
            m = f.openmessage(n)
            text = self.getmessagetext(m, f.name)
            for p in pipeline:
                text = p.process(text)
            for word in text:
                uniqwords[word] = uniqwords.get(word, 0) + 1

    def bulkupdate(self, args):
        """Index every message of the given folders and commit.

        The pseudo folder "ALL" is replaced, in place in 'args', by all
        folders known to MH.
        """
        if not args:
            print("No folders specified; use ALL to bulk-index all folders")
            return
        if "ALL" in args:
            i = args.index("ALL")
            args[i:i+1] = self.mh.listfolders()
        for folder in args:
            if folder.startswith("+"):
                folder = folder[1:]
            print("\nFOLDER", folder)
            try:
                f = self.mh.openfolder(folder)
            except mhlib.Error as msg:
                print(msg)
                continue
            self.updatefolder(f, f.listmessages())
            print("Total", len(self.docpaths))
        self.commit()
        # One summary line: end=" " keeps the three prints together.
        print("Indexed", self.index.lexicon._nbytes, "bytes and", end=" ")
        print(self.index.lexicon._nwords, "words;", end=" ")
        print(len(self.index.lexicon._words), "unique words.")

    def updatefolder(self, f, msgs):
        """(Re)index messages 'msgs' of folder 'f' and unindex the
        folder's messages that no longer exist on disk."""
        self.watchfolders[f.name] = self.getmtime(f.name)
        for n in msgs:
            path = "%s/%s" % (f.name, n)
            docid = self.path2docid.get(path, 0)
            if docid and self.getmtime(path) == self.doctimes.get(docid, 0):
                print("unchanged", docid, path)
                continue
            docid = self.newdocid(path)
            try:
                m = f.openmessage(n)
            except IOError:
                print("disappeared", docid, path)
                self.unindexpath(path)
                continue
            text = self.getmessagetext(m, f.name)
            if not text:
                self.unindexpath(path)
                continue
            print("indexing", docid, path)
            self.index.index_doc(docid, text)
            self.maycommit()
        # Remove messages from the folder that no longer exist.  The keys
        # are copied into a list because unindexpath() deletes from
        # path2docid while we walk it.
        for path in list(self.path2docid.keys(f.name)):
            if not path.startswith(f.name + "/"):
                break
            if self.getmtime(path) == 0:
                self.unindexpath(path)
        print("done.")

    def unindexpath(self, path):
        """Forget 'path' in all mappings and in the index, if it is known."""
        if self.path2docid.has_key(path):
            docid = self.path2docid[path]
            print("unindexing", docid, path)
            del self.docpaths[docid]
            del self.doctimes[docid]
            del self.path2docid[path]
            try:
                self.index.unindex_doc(docid)
            except KeyError as msg:
                print("KeyError", msg)
            self.maycommit()

    def getmessagetext(self, m, name=None):
        """Return a list of text chunks for message 'm'.

        When 'name' is given, a "_folder <name>" marker and the message
        headers come first.  A failure while extracting the body parts is
        reported and whatever was collected so far is returned.
        """
        L = []
        if name:
            L.append("_folder " + name) # To restrict search to a folder
            self.getheaders(m, L)
        try:
            self.getmsgparts(m, L, 0)
        except KeyboardInterrupt:
            raise
        except:
            # Best effort: a broken message must not stop the indexing.
            print("(getmsgparts failed:)")
            reportexc()
        return L

    def getmsgparts(self, m, L, level):
        """Append the text/plain bodies of 'm' to 'L', descending into
        multipart/alternative, multipart/mixed and message/rfc822 parts."""
        ctype = m.gettype()
        if level or ctype != "text/plain":
            print(". "*level + str(ctype))
        if ctype == "text/plain":
            L.append(m.getbodytext())
        elif ctype in ("multipart/alternative", "multipart/mixed"):
            for part in m.getbodyparts():
                self.getmsgparts(part, L, level+1)
        elif ctype == "message/rfc822":
            f = StringIO(m.getbodytext())
            m = mhlib.Message("<folder>", 0, f)
            self.getheaders(m, L)
            self.getmsgparts(m, L, level+1)

    def getheaders(self, m, L):
        """Append the non-empty from/to/cc/bcc/subject header values of
        'm' to 'L' as one newline-joined string."""
        H = []
        for key in "from", "to", "cc", "bcc", "subject":
            value = m.get(key)
            if value:
                H.append(value)
        if H:
            L.append("\n".join(H))

    def newdocid(self, path):
        """Return the docid for 'path', allocating one if needed, and
        record the file's current mtime for it."""
        docid = self.path2docid.get(path)
        if docid is not None:
            self.doctimes[docid] = self.getmtime(path)
            return docid
        docid = self.maxdocid + 1
        self.maxdocid = docid
        self.docpaths[docid] = path
        self.doctimes[docid] = self.getmtime(path)
        self.path2docid[path] = docid
        return docid

    def getmtime(self, path):
        """Return the integer mtime of 'path' (relative to the MH
        directory), or 0 if it cannot be stat'ed."""
        path = os.path.join(self.mh.getpath(), path)
        try:
            st = os.stat(path)
        except os.error:
            return 0
        return int(st[ST_MTIME])

    def maycommit(self):
        """Count one change and commit when the transaction limit is hit."""
        self.trans_count += 1
        if self.trans_count >= self.trans_limit > 0:
            self.commit()

    def commit(self):
        """Commit pending changes, if any, and pack when the pack limit
        is hit."""
        if self.trans_count > 0:
            print("committing...")
            transaction.commit()
            self.trans_count = 0
            self.pack_count += 1
            if self.pack_count >= self.pack_limit > 0:
                self.pack()

    def pack(self):
        """Pack the database if there was a commit since the last pack."""
        if self.pack_count > 0:
            print("packing...")
            self.database.pack()
            self.pack_count = 0
class OrderedBTreeContainer(BrowserDefaultMixin):
    """Keeps an explicit, integer-indexed ordering of object ids.

    Two BTrees mirror each other: ``_positionId`` maps a position to the
    id stored there and ``_idPosition`` maps an id back to its position.
    addObject() hands out positions 0, 1, 2, ... and getIdsInOrder() stops
    at the first missing position, so positions are kept contiguous.
    """
    security = ClassSecurityInfo()

    implements(interfaces.IOrderedBTreeContainer)

    meta_type = 'OrderedBTreeContainer'
    _at_rename_after_creation = True

    schema = OrderedBTreeContainer_schema

    ##code-section class-header #fill in your manual code here

    # Methods

    ##/code-section class-header

    # Methods

    # Manually created methods

    security.declarePrivate('_insert')
    def _insert(self, pos, id):
        """Place ``id`` at position ``pos``.

        If ``pos`` is free the id is simply stored there.  Otherwise every
        entry from ``pos`` onwards is first shifted one position to the
        right, so no existing entry is overwritten or lost.
        """
        positionId = self._positionId
        idPosition = self._idPosition
        # BTree.insert() stores the value (and returns true) only when the
        # key is not present yet.
        if positionId.insert(pos, id):
            idPosition[id] = pos
            return

        # Position is taken: open a one-slot gap at pos.  pos1 > pos2
        # selects the "create a gap" branch of _delta; passing
        # (pos, pos+1) here would delete the entry at pos instead.
        self._delta(pos+1, pos)

        # assign position and id
        positionId[pos] = id
        idPosition[id] = pos

    security.declarePrivate('_delete')
    def _delete(self, pos):
        """Remove the entry at ``pos`` and close the hole it leaves.

        Does nothing when ``pos`` is None or the container is empty.
        Raises KeyError when ``pos`` is not an occupied position.
        """
        if pos is None:
            return
        if len(self._positionId) <= 0:
            return
        # pos1 < pos2 selects the "shift left" branch of _delta, which
        # drops position pos and moves everything after it down by one.
        self._delta(pos, pos+1)

    security.declarePrivate('_moveObject')
    def _moveObject(self, id, position):
        """ Move id to position
            Does not check if position is sane

            Negative positions are clamped to 0.  An id that is not in
            the container yet is simply inserted at ``position``.
        """
        # NOTE(review): a position beyond the last occupied slot is stored
        # as given and leaves a hole that getIdsInOrder() stops at --
        # callers must pass a position within the current range.
        if position < 0:
            position = 0

        obj_pos = self.getObjectPosition(id)
        if obj_pos == position:
            return

        self._delete(obj_pos)
        self._insert(position, id)

    security.declarePrivate('__init__')
    def __init__(self, oid, **kwargs):
        """Create the two empty BTrees.  ``oid`` and ``kwargs`` are
        accepted but not used here."""
        self._positionId = IOBTree()
        self._idPosition = OIBTree()

    security.declarePrivate('_delta')
    def _delta(self, pos1, pos2):
        """ in the _positionId btree, move either creates a gap or shifts a
            portion left

            pos1 < pos2: drop the entries at positions pos1 .. pos2-1 and
            move every later entry (pos2 - pos1) positions to the left.

            pos1 > pos2: open a gap covering positions pos2 .. pos1-1 by
            moving every entry from pos2 onwards (pos1 - pos2) positions to
            the right.  The gap is left unassigned for the caller to fill.

            Nothing happens when pos1 == pos2 or the container is empty.
        """
        if pos1 == pos2:
            return
        positionId = self._positionId
        idPosition = self._idPosition
        if len(positionId) == 0:
            # Nothing to move, and there is no largest key to ask for.
            return
        last = positionId.maxKey()
        shift = abs(pos1 - pos2)
        if pos1 < pos2:
            # move left, ie, shortens list, that is, move left from pos2 to
            # pos1
            removed = [positionId[i] for i in range(pos1, pos2)]
            for i in range(pos1, last-shift+1):
                moved = positionId[i+shift]
                idPosition[moved] = i
                positionId[i] = moved
            # clear out ids and positions no longer used
            for id in removed:
                del idPosition[id]
            for i in range(last-shift+1, last+1):
                del positionId[i]
        else:
            # create a gap, that is, lengthens the list: walk from the end
            # down to pos1 so every entry is copied before its old slot is
            # overwritten.
            for i in range(last+shift, pos1-1, -1):
                moved = positionId[i-shift]
                idPosition[moved] = i
                positionId[i] = moved
            # The slots pos2 .. pos1-1 still hold stale copies; clear them.
            for i in range(pos2, pos1):
                if positionId.has_key(i):
                    del positionId[i]

    security.declareProtected(permissions.ModifyPortalContent, 'getObjectPosition')
    def getObjectPosition(self, id):
        """ Get the object position for a given id

            Returns None when id is None or not in the container.
        """
        if id is not None and self._idPosition.has_key(id):
            return self._idPosition[id]
        return None

    security.declareProtected(permissions.ModifyPortalContent, 'getIdsInOrder')
    def getIdsInOrder(self, start, end):
        """ return a list of ids starting at start and ending at end
            if end is None or greater than the end, the list is truncated
            when start is None or less than 0 it is set to 0
            the ids is a list of length(end-start)
        """
        ids = []
        endIds = end
        # check if the tree is empty and if the list is wrapped
        if len(self._positionId) > 0 and (end is None or end < 0):
            endIds = self._positionId.maxKey()
        elif len(self._positionId) == 0:
            return []

        if start is None or start < 0:
            start = 0
        for i in range(start, endIds+1):
            if self._positionId.has_key(i):
                ids.append(self._positionId[i])
            else:
                # first unoccupied position ends the listing
                break

        if end is None:
            return ids
        return ids[:end-start]

    security.declareProtected(permissions.ModifyPortalContent, 'getObjectId')
    def getObjectId(self, position):
        """ Get the id stored at position, or None when position is None
            or unoccupied.
        """
        if position is not None and self._positionId.has_key(position):
            return self._positionId[position]
        return None

    security.declareProtected(permissions.ModifyPortalContent, 'moveObjectsUp')
    def moveObjectsUp(self, ids, delta=1, RESPONSE=None):
        """ Move an object up

            ids may be a single id string or a sequence of ids; each one
            is moved delta positions towards position 0.  Redirects to
            'manage_workspace' when RESPONSE is given.
        """

        if isinstance(ids, StringType):
            ids = (ids,)

        for id in ids:
            self._moveObject(id, self.getObjectPosition(id)-delta)

        if RESPONSE is not None:
            RESPONSE.redirect('manage_workspace')

    security.declareProtected(permissions.ModifyPortalContent, 'moveObjectsDown')
    def moveObjectsDown(self, ids, delta=1, RESPONSE=None):
        """ move an object down

            ids may be a single id string or a sequence of ids; each one
            is moved delta positions away from position 0.  Redirects to
            'manage_workspace' when RESPONSE is given.
        """

        if isinstance(ids, StringType):
            ids = (ids,)

        for id in ids:
            self._moveObject(id, self.getObjectPosition(id)+delta)

        if RESPONSE is not None:
            RESPONSE.redirect('manage_workspace')

    security.declareProtected(permissions.ModifyPortalContent, 'moveObjectsToTop')
    def moveObjectsToTop(self, ids, RESPONSE=None):
        """ move an object to the top

            The i-th id in ids is moved to position i.  Redirects to
            'manage_workspace' when RESPONSE is given.
        """

        if isinstance(ids, StringType):
            ids = (ids,)

        for i, id in enumerate(ids):
            self._moveObject(id, i)

        if RESPONSE is not None:
            RESPONSE.redirect('manage_workspace')

    security.declareProtected(permissions.ModifyPortalContent, 'moveObjectsToBottom')
    def moveObjectsToBottom(self, ids, RESPONSE=None):
        """ move an object to the bottom

            The ids are moved, in order, to the last len(ids) positions.
            Redirects to 'manage_workspace' when RESPONSE is given.
        """

        if isinstance(ids, StringType):
            ids = (ids,)

        last = self._positionId.maxKey()
        length = len(ids)
        for i, id in enumerate(ids):
            self._moveObject(id, last - (length - 1) + i)

        if RESPONSE is not None:
            RESPONSE.redirect('manage_workspace')

    security.declareProtected(permissions.ModifyPortalContent, 'orderObjects')
    def orderObjects(self, key, reverse=None):
        """ Order sub-objects by key and direction.

            Rebuilds the whole ordering from the sorted objectItems() and
            returns the number of ids ordered.
        """
        ids = [ id for id, obj in sort( self.objectItems(),
                                        ( (key, 'cmp', 'asc'), ) ) ]
        if reverse:
            ids.reverse()

        self._clear_and_rebuild(ids=ids)
        return len(ids)

    security.declareProtected(permissions.ModifyPortalContent, 'moveObjectsByDelta')
    def moveObjectsByDelta(self, ids, delta, subset_ids=None):
        """ Move specified sub-objects by delta.

            Not implemented: always raises NotImplementedError.
        """
        raise NotImplementedError("Not implemented yet")
        # Everything below is unreachable.  It is kept as the starting
        # point for a real implementation; it still works on
        # ``self._objects`` rather than on the two BTrees.
        if isinstance(ids, StringType):
            ids = (ids,)
        min_position = 0
        objects = list(self._objects)
        if subset_ids is None:
            # OLD: subset_ids = [ obj['id'] for obj in objects ]
            subset_ids = self.getCMFObjectsSubsetIds(objects)
        else:
            subset_ids = list(subset_ids)
        # unify moving direction
        if delta > 0:
            ids = list(ids)
            ids.reverse()
            subset_ids.reverse()
        counter = 0

        for id in ids:
            try:
                old_position = subset_ids.index(id)
            except ValueError:
                continue
            new_position = max( old_position - abs(delta), min_position )
            if new_position == min_position:
                min_position += 1
            if not old_position == new_position:
                subset_ids.remove(id)
                subset_ids.insert(new_position, id)
                counter += 1

        if counter > 0:
            if delta > 0:
                subset_ids.reverse()
            obj_dict = {}
            for obj in objects:
                obj_dict[ obj['id'] ] = obj
            pos = 0
            for i in range( len(objects) ):
                if objects[i]['id'] in subset_ids:
                    try:
                        objects[i] = obj_dict[ subset_ids[pos] ]
                        pos += 1
                    except KeyError:
                        raise ValueError('The object with the id "%s" does '
                                         'not exist.' % subset_ids[pos])
            self._objects = tuple(objects)

        return counter

    security.declareProtected(permissions.ModifyPortalContent, 'getFirstEntryId')
    def getFirstEntryId(self):
        """ Return the id at the lowest position, or None when empty.
        """
        if len(self._positionId) > 0:
            return self._positionId[self._positionId.minKey()]
        return None

    security.declareProtected(permissions.ModifyPortalContent, 'getLastEntryId')
    def getLastEntryId(self):
        """ Return the id at the highest position, or None when empty.
        """
        if len(self._positionId) > 0:
            return self._positionId[self._positionId.maxKey()]
        return None

    def addObject(self, id):
        """Adds object to end of btree, returns position

        Raises RuntimeError when the id is already in the tree.
        """
        if self.getObjectPosition(id) is not None:
            raise RuntimeError("Object already in tree")

        if len(self._positionId) > 0:
            position = self._positionId.maxKey() + 1
        else:
            position = 0
        self._positionId[position] = id
        self._idPosition[id] = position
        return position

    def numberObjects(self):
        """ Return the number of ordered entries.
        """
        return len(self._positionId)

    security.declarePrivate('_clear_and_rebuild')
    def _clear_and_rebuild(self, ids=()):
        """ Throw away the current ordering and re-add ``ids`` in the
            order given.
        """
        self._positionId = IOBTree()
        self._idPosition = OIBTree()

        for id in ids:
            self.addObject(id)
# Example no. 7
# 0
class Lexicon(Persistent, Implicit):
    """Maps words to word ids and then some

    The Lexicon object is an attempt to abstract vocabularies out of
    Text indexes.  This abstraction is not totally cooked yet, this
    module still includes the parser for the 'Text Index Query
    Language' and a few other hacks.

    ``_lexicon`` maps a word to its word id and ``_inverseLex`` maps a
    word id back to the word.
    """

    # default for older objects
    stop_syn={}

    def __init__(self, stop_syn=None,useSplitter=None,extra=None):
        """Create an empty lexicon.

        stop_syn    -- mapping of stop words and synonyms (default: empty)
        useSplitter -- name of the splitter to use; defaults to the first
                       entry of Splitter.splitterNames
        extra       -- splitter parameters, stored as ``splitterParams``
                       and read by the Splitter() method
        """

        self.clear()
        if stop_syn is None:
            self.stop_syn = {}
        else:
            self.stop_syn = stop_syn

        self.useSplitter = Splitter.splitterNames[0]
        if useSplitter: self.useSplitter=useSplitter
        self.splitterParams = extra
        self.SplitterFunc = Splitter.getSplitter(self.useSplitter)


    def clear(self):
        """Drop all words: replace both mappings with empty BTrees."""
        self._lexicon = OIBTree()
        self._inverseLex = IOBTree()

    def _convertBTrees(self, threshold=200):
        """Convert both mappings to OIBTree/IOBTree if they are not yet.

        ``threshold`` is passed through to BTrees.convert.convert().
        """
        if (type(self._lexicon) is OIBTree and
            type(getattr(self, '_inverseLex', None)) is IOBTree):
            return

        from BTrees.convert import convert

        lexicon=self._lexicon
        self._lexicon=OIBTree()
        self._lexicon._p_jar=self._p_jar
        convert(lexicon, self._lexicon, threshold)

        try:
            inverseLex=self._inverseLex
            self._inverseLex=IOBTree()
        except AttributeError:
            # older lexicons didn't have an inverse lexicon
            self._inverseLex=IOBTree()
            inverseLex=self._inverseLex

        self._inverseLex._p_jar=self._p_jar
        convert(inverseLex, self._inverseLex, threshold)

    def set_stop_syn(self, stop_syn):
        """ pass in a mapping of stopwords and synonyms.  Format is:

        {'word' : [syn1, syn2, ..., synx]}

        Vocabularies do not necessarily need to implement this if their
        splitters do not support stemming or stopping.

        """
        self.stop_syn = stop_syn


    def getWordId(self, word):
        """ return the word id of 'word', assigning a new id when the
        word is not in the lexicon yet """

        wid=self._lexicon.get(word, None)
        if wid is None:
            wid=self.assignWordId(word)
        return wid

    # ``set`` is an alias for getWordId.
    set = getWordId

    def getWord(self, wid):
        """ post-2.3.1b2 method, will not work with unconverted lexicons

        Returns the word for ``wid``, or None when the id is unknown.
        """
        return self._inverseLex.get(wid, None)

    def assignWordId(self, word):
        """Assigns a new word id to the provided word and returns it."""
        # First make sure it's not already in there
        if self._lexicon.has_key(word):
            return self._lexicon[word]


        try: inverse=self._inverseLex
        except AttributeError:
            # woops, old lexicon without wids: rebuild the inverse mapping.
            # The loop variables must not reuse the name ``word``, or the
            # new id below would be assigned to the last word of the
            # lexicon instead of the word passed in.
            inverse=self._inverseLex=IOBTree()
            for known_word, known_wid in self._lexicon.items():
                inverse[known_wid]=known_word

        # Draw random ids until one is free; insert() returns false when
        # the id is already taken.
        wid=randid()
        while not inverse.insert(wid, word):
            wid=randid()

        if isinstance(word,StringType):
            self._lexicon[intern(word)] = wid
        else:
            self._lexicon[word] = wid


        return wid


    def get(self, key, default=None):
        """Return the matched word against the key.

        The result is an IISet holding the word id stored for ``key`` (or
        ``default`` when the key is unknown); the set is empty when that
        value is None.
        """
        r=IISet()
        wid=self._lexicon.get(key, default)
        if wid is not None: r.insert(wid)
        return r

    def __getitem__(self, key):
        """Same as get(key)."""
        return self.get(key)


    def __len__(self):
        """Number of words in the lexicon."""
        return len(self._lexicon)


    def Splitter(self, astring, words=None, encoding = "latin1"):
        """ wrap the splitter

        ``words`` defaults to the lexicon's stop word / synonym mapping.
        The splitter is first called with the options stored in
        ``splitterParams``; if that call fails it is retried with just
        the string and the word mapping.
        """
        if words is None: words = self.stop_syn

        try:
            return self.SplitterFunc(
                    astring,
                    words,
                    encoding=encoding,
                    singlechar=self.splitterParams.splitterSingleChars,
                    indexnumbers=self.splitterParams.splitterIndexNumbers,
                    casefolding=self.splitterParams.splitterCasefolding
                    )
        except Exception:
            # Best-effort fallback (e.g. no splitterParams, or a splitter
            # that does not take these options).  KeyboardInterrupt and
            # SystemExit are deliberately not swallowed.
            return self.SplitterFunc(astring, words)


    def query_hook(self, q):
        """ we don't want to modify the query cuz we're dumb """
        return q
# Example no. 8
# 0
class Table(Persistent):
    """Simple, generic relational table.

    Records are stored as tuples in ``data``, keyed by an integer record
    id (rid).  Position 0 of every record holds its rid; the remaining
    positions follow the order of the schema's columns.
    """
    schema = None
    _v_record_class = None

    def __init__(self, schema=None):
        """Set up the column layout and empty storage.

        ``schema`` must provide get_columns(); each column is read for its
        ``name``, ``indexed`` and ``primary`` attributes.  When omitted,
        the class-level ``schema`` attribute is used.
        """
        if schema is not None:
            self.schema = schema
        # Read the columns from self.schema (not the argument) so that a
        # subclass defining ``schema`` can be created without arguments.
        columns = self.schema.get_columns()
        self.col_info = []  # [(tuple position, column),]
        self.positions = {}
        for i, column in enumerate(columns):
            # Leave space for the record ID at position 0.
            position = i + 1
            self.col_info.append((position, column))
            self.positions[column.name] = position
        self.proto_record = [None] * (len(columns) + 1)
        self.next_rid = 1
        self.clear()

    def clear(self):
        """Remove all records and indexes.  ``next_rid`` is not reset."""
        self.data = IOBTree()  # {rid -> record as tuple}
        self.indexes = {}  # {index_name -> OOBTree({value -> IITreeSet})}
        self.primary_index = OIBTree()  # {primary key -> rid}
        for position, column in self.col_info:
            if column.indexed:
                self.indexes[column.name] = OOBTree()

    def tuplify(self, params):
        """Accepts a mapping-like object and returns a tuple.

        The tuple has one slot per column plus slot 0 for the rid; slots
        for names missing from ``params`` are None.  Raises KeyError for
        a name that is not a column.
        """
        record = self.proto_record[:]
        positions = self.positions
        if hasattr(params, '__record_schema__'):
            # Record objects: take the names from their schema.
            for name in params.__record_schema__.keys():
                position = positions[name]
                record[position] = params[name]
        else:
            for name, value in params.items():
                position = positions[name]
                record[position] = value
        return tuple(record)

    def insert(self, params):
        """Add one record built from ``params`` and return 1.

        Raises ValueError when a primary key column has no value and
        DuplicateError when the primary key is already in use.
        """
        record = self.tuplify(params)

        # Determine the primary key.
        primary_key = []
        for position, column in self.col_info:
            if column.primary:
                if record[position] is None:
                    raise ValueError(
                        "No value provided for primary key column %s" %
                        repr(column.name))
                primary_key.append(record[position])
        if primary_key:
            primary_key = tuple(primary_key)
            if self.primary_index.has_key(primary_key):
                raise DuplicateError("Primary key %s in use" %
                                     repr(primary_key))

        # Add a record.
        rid = self.next_rid
        self.next_rid += 1  # XXX Hotspot!
        record = (rid, ) + record[1:]
        self.data[rid] = record
        if primary_key:
            self.primary_index[primary_key] = rid

        # Add to indexes.
        for position, column in self.col_info:
            name = column.name
            value = record[position]
            if value is not None:
                if name in self.indexes:
                    rid_set = self.indexes[name].get(value)
                    if rid_set is None:
                        rid_set = IITreeSet()
                        self.indexes[name][value] = rid_set
                    rid_set.insert(rid)

        # Return the number of rows inserted.
        return 1

    def delete(self, filter):
        """Delete the records matching ``filter``; return how many.

        A filter without any non-None value deletes every record.
        """
        rids = self._select_rids(self.tuplify(filter))
        if rids is None:
            # Zap everything
            count = len(self.data)
            self.clear()
            return count
        elif not rids:
            # No rows selected
            return 0

        rids = tuple(rids)  # Make sure rids is a static sequence
        for rid in rids:
            old_r = self.data[rid]
            assert old_r[0] == rid
            primary_key = []
            for position, column in self.col_info:
                old_value = old_r[position]
                if old_value is not None:
                    if column.primary:
                        primary_key.append(old_value)
                    # Remove from indexes.
                    index = self.indexes.get(column.name)
                    if index is not None:
                        if index.has_key(old_value):
                            # Remove an index entry.
                            rid_set = index[old_value]
                            rid_set.remove(rid)
                            if not rid_set:
                                del index[old_value]

            if primary_key:
                # Remove a primary key.
                primary_key = tuple(primary_key)
                assert self.primary_index[primary_key] == rid
                del self.primary_index[primary_key]

            # Remove the data.
            del self.data[rid]

        return len(rids)

    def update(self, filter, changes):
        """Apply ``changes`` to the records matching ``filter``.

        A column whose value in ``changes`` is missing or None is left
        unchanged.  A filter without any non-None value selects every
        record.  Returns the number of records affected; raises
        DuplicateError when a changed primary key is already in use.
        """
        rids = self._select_rids(self.tuplify(filter))
        if rids is None:
            rids = self.data.keys()
        elif not rids:
            # Nothing needs to be updated.
            return 0
        count = len(rids)

        # Identify changes.
        old_data = {}  # rid -> old tuple
        new_data = {}  # rid -> new tuple
        old_to_new = {}  # old primary key -> new primary key
        new_to_rid = {}  # new primary key -> rid

        record = self.tuplify(changes)
        for rid in rids:
            old_r = self.data[rid]
            old_data[rid] = old_r
            new_r = list(old_r)
            # new_r and old_r contain record tuples.
            for position, column in self.col_info:
                if record[position] is not None:
                    new_r[position] = record[position]
            new_data[rid] = tuple(new_r)
            # Hmm.  The code below allows an update to change the primary
            # key.  It might be better to prevent primary key columns from
            # being changed by an update() call.
            opk = []
            npk = []
            for position, column in self.col_info:
                if column.primary:
                    opk.append(old_r[position])
                    npk.append(new_r[position])
            if opk != npk:
                opk = tuple(opk)
                npk = tuple(npk)
                old_to_new[opk] = npk
                new_to_rid[npk] = rid

        # Look for primary key conflicts.  A primary key conflict can
        # occur when changing a record to a different primary key and
        # the new primary key is already in use.
        for pk in old_to_new.values():
            if (self.primary_index.has_key(pk) and pk not in old_to_new):
                raise DuplicateError("Primary key %s in use" % repr(pk))

        # Update the data.
        self.data.update(new_data)

        # Remove old primary key indexes and insert new primary key indexes.
        for pk in old_to_new.keys():
            del self.primary_index[pk]
        self.primary_index.update(new_to_rid)

        # Update indexes.
        for rid, old_r in old_data.items():
            new_r = new_data[rid]
            for position, column in self.col_info:
                index = self.indexes.get(column.name)
                if index is not None:
                    # Compare against the record as stored after the
                    # update.  Comparing against ``record`` (the changes)
                    # would see None for every untouched column and drop
                    # its index entry.
                    new_value = new_r[position]
                    old_value = old_r[position]
                    if new_value != old_value:
                        if old_value is not None and index.has_key(old_value):
                            # Remove an index entry.
                            rid_set = index[old_value]
                            rid_set.remove(rid)
                            if not rid_set:
                                del index[old_value]
                        if new_value is not None:
                            # Add an index entry.
                            rid_set = index.get(new_value)
                            if rid_set is None:
                                rid_set = IITreeSet()
                                index[new_value] = rid_set
                            rid_set.insert(rid)

        # Return the number of rows affected.
        return count

    def get_record_class(self):
        """Return the Record subclass used for query results.

        The class maps 'rid' to position 0 and every column name to its
        tuple position.  It is built on first use and cached in
        ``_v_record_class``.
        """
        klass = self._v_record_class
        if klass is None:
            schema = {'rid': 0}
            for position, column in self.col_info:
                schema[column.name] = position

            class TableRecord(TableRecordMixin, Record):
                __record_schema__ = schema

            self._v_record_class = klass = TableRecord
        return klass

    def select(self, filter):
        """Return a list of record objects matching ``filter``.

        A filter without any non-None value returns every record.
        """
        rids = self._select_rids(self.tuplify(filter))
        if rids is None:
            # All
            klass = self.get_record_class()
            return [klass(rec) for rec in self.data.values()]
        elif rids:
            # Some
            klass = self.get_record_class()
            data = self.data
            return [klass(data[rid]) for rid in rids]
        else:
            # None
            return []

    def _select_rids(self, query):
        """Searches the table for matches, returning record ids.

        ``query`` is a record tuple as built by tuplify(); None slots are
        not part of the query.

        Returns a sequence of record ids, or None for all records.
        """
        primary_key = []
        params = 0  # The number of parameters specified
        primary_params = 0  # The number of primary params specified
        for position, column in self.col_info:
            value = query[position]
            if value is not None:
                params += 1
                if column.primary:
                    primary_params += 1
                    if primary_key is not None:
                        primary_key.append(value)
            elif column.primary:
                # Didn't fully specify the primary key.
                # Can't search by primary key.
                primary_key = None

        if not params:
            # No query.  Select all.
            return None

        # First strategy: try to satisfy the request by consulting
        # the primary key index.
        if primary_key:
            # The primary key is complete.  The result set will have
            # either zero rows or one row.
            primary_key = tuple(primary_key)
            rid = self.primary_index.get(primary_key)
            if rid is None:
                return ()
            # Possibly filter out the single item.
            if params > primary_params:
                cand = self.data[rid]
                for position, column in self.col_info:
                    if query[position] is not None:
                        if cand[position] != query[position]:
                            # Not a match.
                            return ()
            return (rid, )

        # Second strategy: try to satisfy the request by intersecting
        # indexes.
        rids = None
        iteration_filters = []
        for position, column in self.col_info:
            value = query[position]
            if value is not None:
                index = self.indexes.get(column.name)
                if index is None:
                    # Not indexed: check this column record by record.
                    iteration_filters.append((position, value))
                else:
                    matches = index.get(value)
                    if matches is None:
                        # No rows satisfy this criterion.
                        return ()
                    if rids is None:
                        rids = matches
                    else:
                        rids = intersection(rids, matches)
                    if not rids:
                        # No rows satisfy all criteria.
                        return ()
        if rids is not None:
            rids = rids.keys()

        if not iteration_filters:
            # Indexes did all the work.  No need to search each record.
            return rids

        # Fallback strategy: Eliminate items one by one.
        if rids is None:
            # Use the whole data set.
            candidates = self.data.values()
        else:
            # Use the specified records.
            candidates = [self.data[rid] for rid in rids]

        rids = []
        append = rids.append
        for cand in candidates:
            for position, value in iteration_filters:
                if cand[position] != value:
                    # Not a match.
                    break
            else:
                # A match.
                append(cand[0])
        return rids

    def __repr__(self):
        return "<%s(schema=%s)>" % (self.__class__.__name__, repr(self.schema))
# Example no. 9
# 0
class Indexer(object):

    filestorage = database = connection = root = None

    def __init__(self, datafs, writable=0, trans=0, pack=0):
        """Open the file storage *datafs* and load the indexing state.

        ``trans`` and ``pack`` are kept as the commit and pack thresholds
        used by ``maycommit`` and ``commit``; 0 disables the automatic
        behaviour.  The storage is opened read-only unless ``writable``
        is true.  Entries missing from the database root are created
        empty.  A few statistics are printed at the end.
        """
        self.trans_limit = trans
        self.pack_limit = pack
        self.trans_count = 0
        self.pack_count = 0
        self.stopdict = get_stopdict()
        self.mh = mhlib.MH()
        self.filestorage = FileStorage(datafs, read_only=(not writable))
        self.database = DB(self.filestorage)
        self.connection = self.database.open()
        self.root = root = self.connection.root()

        # Fetch each structure from the root under its own name, creating
        # an empty one of the matching type when the key is missing.
        for key, factory in (("index", TextIndexWrapper),
                             ("docpaths", IOBTree),
                             ("doctimes", IIBTree),
                             ("watchfolders", dict)):
            try:
                stored = root[key]
            except KeyError:
                stored = root[key] = factory()
            setattr(self, key, stored)

        # The path -> docid mapping is not read from the root; it is
        # rebuilt here by inverting docpaths.
        self.path2docid = OIBTree()
        for docid, path in self.docpaths.items():
            self.path2docid[path] = docid

        try:
            self.maxdocid = max(self.docpaths.keys())
        except ValueError:
            # max() of an empty sequence: no documents yet.
            self.maxdocid = 0

        print(len(self.docpaths), "Document ids")
        print(len(self.path2docid), "Pathnames")
        print(self.index.lexicon.length(), "Words")

    def dumpfreqs(self):
        """Print word id, total frequency and word, most frequent first."""
        lexicon = self.index.lexicon
        index = self.index.index
        assert isinstance(index, OkapiIndex)
        wordinfo = index._wordinfo
        # One (frequency, wid, word) row per word id; the frequency is the
        # sum of the per-document values recorded for that word id.
        rows = [(sum(wordinfo.get(wid, {}).values()),
                 wid,
                 lexicon.get_word(wid))
                for wid in lexicon.wids()]
        rows.sort(reverse=True)
        for freq, wid, word in rows:
            print("%10d %10d %s" % (wid, freq, word))

    def dumpwids(self):
        """Print word id, total frequency and word, in lexicon wid order."""
        lexicon = self.index.lexicon
        index = self.index.index
        assert isinstance(index, OkapiIndex)
        wordinfo = index._wordinfo
        for wid in lexicon.wids():
            total = sum(wordinfo.get(wid, {}).values())
            print("%10d %10d %s" % (wid, total, lexicon.get_word(wid)))

    def dumpwords(self):
        """Print word id, total frequency and word, in lexicon word order."""
        lexicon = self.index.lexicon
        index = self.index.index
        assert isinstance(index, OkapiIndex)
        wordinfo = index._wordinfo
        for word in lexicon.words():
            wid = lexicon.get_wid(word)
            total = sum(wordinfo.get(wid, {}).values())
            print("%10d %10d %s" % (wid, total, word))

    def close(self):
        """Close connection, database and storage, in that order.

        Each one is closed only if it is still set and is reset to None
        afterwards, so calling this more than once is harmless.
        """
        self.root = None
        for attr in ("connection", "database", "filestorage"):
            resource = getattr(self, attr)
            if resource is not None:
                resource.close()
                setattr(self, attr, None)

    def interact(self, nbest=NBEST, maxlines=MAXLINES):
        """Run an interactive query prompt until end-of-file.

        A non-empty line starts a new query and shows its first ``nbest``
        hits; an empty line shows the next ``nbest`` hits of the current
        query; a line starting with "/" is handed to ``specialcommand``.
        ``maxlines`` is passed through to ``formatresults``.
        """
        try:
            # Optional: the module is imported only for its side effects
            # and a missing module is silently ignored.
            import readline
        except ImportError:
            pass
        # text: the current query ("" when there is nothing to continue).
        # top: how many hits of the current query were already shown.
        text = ""
        top = 0
        results = []
        while 1:
            try:
                line = raw_input("Query: ")
            except EOFError:
                print("\nBye.")
                break
            line = line.strip()
            if line.startswith("/"):
                # top - nbest is the index of the first hit on the page
                # that was displayed last.
                self.specialcommand(line, results, top - nbest)
                continue
            if line:
                # New query: start again from the first hit.
                text = line
                top = 0
            else:
                if not text:
                    continue
            try:
                # Ask for everything up to the end of the page about to be
                # shown; formatresults() slices out the page itself.
                results, n = self.timequery(text, top + nbest)
            except KeyboardInterrupt:
                raise
            except:
                # Any other failure is reported and the query dropped, so
                # the prompt keeps running.
                reportexc()
                text = ""
                continue
            if len(results) <= top:
                if not n:
                    print("No hits for %r." % text)
                else:
                    print("No more hits for %r." % text)
                text = ""
                continue
            print("[Results %d-%d from %d" % (top + 1, min(n, top + nbest), n),
                  end=" ")
            print("for query %s]" % repr(text))
            self.formatresults(text, results, maxlines, top, top + nbest)
            top += nbest

    def specialcommand(self, line, results, first):
        """Handle a "/" command from the interactive prompt: show one hit.

        ``line`` is the raw command: "/" alone selects the hit at index
        ``first`` of ``results``, "/N" selects the 1-based hit number N.
        The message is displayed by running the ``show`` command, inside
        an xterm piped through ``less`` when DISPLAY is set.  Unparsable
        or out-of-range numbers print a short message and return.
        """
        assert line.startswith("/")
        line = line[1:]
        if not line:
            n = first
        else:
            try:
                n = int(line) - 1
            except ValueError:
                # Only a malformed number is expected here; anything else
                # (e.g. KeyboardInterrupt) should not be swallowed.
                print("Huh?")
                return
        if n < 0 or n >= len(results):
            print("Out of range")
            return
        docid, _score = results[n]
        path = self.docpaths[docid]
        # Paths are stored as "<folder>/<message number>".
        i = path.rfind("/")
        assert i > 0
        folder = path[:i]
        msgnum = path[i + 1:]
        # NOTE(review): folder and message number are interpolated into a
        # shell command line without quoting; a folder name containing
        # shell metacharacters would be interpreted by the shell.
        cmd = "show +%s %s" % (folder, msgnum)
        if os.getenv("DISPLAY"):
            os.system("xterm -e  sh -c '%s | less' &" % cmd)
        else:
            os.system(cmd)

    def query(self, text, nbest=NBEST, maxlines=MAXLINES):
        """Run one query and print its first ``nbest`` hits, formatted."""
        results, n = self.timequery(text, nbest)
        if n:
            print("[Results 1-%d from %d]" % (len(results), n))
            self.formatresults(text, results, maxlines)
        else:
            print("No hits for %r." % text)

    def timequery(self, text, nbest):
        """Query the index for *text*, print the elapsed times, return hits.

        Returns the ``(results, n)`` pair produced by ``self.index.query``
        when asked for hits 0 up to ``nbest``.
        """
        wall_start, cpu_start = time.time(), time.clock()
        results, n = self.index.query(text, 0, nbest)
        wall_end, cpu_end = time.time(), time.clock()
        elapsed = (wall_end - wall_start, cpu_end - cpu_start)
        print("[Query time: %.3f real, %.3f user]" % elapsed)
        return results, n

    def formatresults(self,
                      text,
                      results,
                      maxlines=MAXLINES,
                      lo=0,
                      hi=sys.maxint):
        """Print a summary of the hits ``results[lo:hi]`` for query *text*.

        For every hit the rank, score and path are printed, then the
        From/To/Cc/Bcc/Subject/Date headers that are present, then up to
        ``maxlines`` body lines matching one of the query words (stop
        words excluded; a trailing "*" is treated as a glob).  Messages
        whose file cannot be opened are reported and skipped.
        """
        stopdict = self.stopdict
        words = [w for w in re.findall(r"\w+\*?", text.lower())
                 if w not in stopdict]
        pattern = r"\b(" + "|".join(words) + r")\b"
        pattern = pattern.replace("*", ".*")  # glob -> re syntax
        prog = re.compile(pattern, re.IGNORECASE)
        print('=' * 70)
        rank = lo
        for docid, score in results[lo:hi]:
            rank += 1
            path = self.docpaths[docid]
            score *= 100.0
            print("Rank:    %d   Score: %d%%   File: %s" % (rank, score, path))
            path = os.path.join(self.mh.getpath(), path)
            try:
                fp = open(path)
            except (IOError, OSError) as err:
                print("Can't open:", err)
                continue
            try:
                message = mhlib.Message("<folder>", 0, fp)
                for header in "From", "To", "Cc", "Bcc", "Subject", "Date":
                    h = message.getheader(header)
                    if h:
                        print("%-8s %s" % (header + ":", h))
                parts = self.getmessagetext(message)
            finally:
                # Everything needed from the file has been extracted at
                # this point; close it instead of leaking one handle per
                # displayed hit.
                fp.close()
            if parts:
                print()
                nleft = maxlines
                for part in parts:
                    for line in part.splitlines():
                        if prog.search(line):
                            print(line)
                            nleft -= 1
                            if nleft <= 0:
                                break
                    # The inner break only leaves the line loop; stop
                    # scanning further parts as well.
                    if nleft <= 0:
                        break
            print('-' * 70)

    def update(self, args):
        """Index the messages selected by MH-style arguments, then commit.

        An argument starting with "+" names the folder (at most one is
        allowed); every other argument is a message sequence.  Without a
        folder the current MH context is used; without sequences, 'all'.
        Errors opening the folder or parsing a sequence are printed and
        abort the update.
        """
        folder = None
        seqs = []
        for arg in args:
            if not arg.startswith("+"):
                seqs.append(arg)
                continue
            if folder is not None:
                print("only one folder at a time")
                return
            folder = arg[1:]

        folder = folder or self.mh.getcontext()
        seqs = seqs or ['all']

        try:
            f = self.mh.openfolder(folder)
        except mhlib.Error as msg:
            print(msg)
            return

        # Collect the message numbers of all sequences without duplicates.
        wanted = set()
        for seq in seqs:
            try:
                nums = f.parsesequence(seq)
            except mhlib.Error as msg:
                print(msg or "unparsable message sequence: %s" % repr(seq))
                return
            wanted.update(nums)
        msgs = sorted(wanted)

        self.updatefolder(f, msgs)
        self.commit()

    def optimize(self, args):
        """Feed the words of the given folders to the lexicon by frequency.

        Every folder in ``args`` (an optional leading "+" is stripped) is
        prescanned to count its words.  The most frequent words (at most
        100) are printed, and all words are then passed to the lexicon's
        ``sourceToWordIds`` in descending order of frequency.  Folders
        that cannot be opened are reported and skipped.
        """
        uniqwords = {}
        for folder in args:
            if folder.startswith("+"):
                folder = folder[1:]
            print("\nOPTIMIZE FOLDER", folder)
            try:
                f = self.mh.openfolder(folder)
            except mhlib.Error as msg:
                print(msg)
                continue
            self.prescan(f, f.listmessages(), uniqwords)
        ranked = sorted(((count, word) for word, count in uniqwords.items()),
                        reverse=True)
        # Slice instead of indexing 0..99: fewer than 100 distinct words
        # must not raise IndexError.
        for i, (count, word) in enumerate(ranked[:100]):
            print("%3d. %6d %s" % (i + 1, count, word))
        self.index.lexicon.sourceToWordIds([word for (count, word) in ranked])

    def prescan(self, f, msgs, uniqwords):
        """Count the words of messages *msgs* of folder *f* into *uniqwords*.

        Each message's text is run through a splitter, a case normalizer
        and a stop word remover; *uniqwords* (word -> count) is updated
        in place.
        """
        stages = (Splitter(), CaseNormalizer(), StopWordRemover())
        for n in msgs:
            print("prescanning", n)
            words = self.getmessagetext(f.openmessage(n), f.name)
            for stage in stages:
                words = stage.process(words)
            for word in words:
                uniqwords[word] = uniqwords.get(word, 0) + 1

    def bulkupdate(self, args):
        """Index all messages of the given folders, then commit.

        ``args`` is a list of folder names (an optional leading "+" is
        stripped); the special name "ALL" is replaced by every folder MH
        knows about.  Folders that cannot be opened are reported and
        skipped.  A one-line summary of the lexicon counters is printed
        at the end.
        """
        if not args:
            print("No folders specified; use ALL to bulk-index all folders")
            return
        # Expand "ALL" on a copy so the caller's list is left untouched.
        folders = list(args)
        if "ALL" in folders:
            i = folders.index("ALL")
            folders[i:i + 1] = self.mh.listfolders()
        for folder in folders:
            if folder.startswith("+"):
                folder = folder[1:]
            print("\nFOLDER", folder)
            try:
                f = self.mh.openfolder(folder)
            except mhlib.Error as msg:
                print(msg)
                continue
            self.updatefolder(f, f.listmessages())
            print("Total", len(self.docpaths))
        self.commit()
        # A single print call keeps the summary on one line.
        lexicon = self.index.lexicon
        print("Indexed", lexicon._nbytes, "bytes and",
              lexicon._nwords, "words;",
              len(lexicon._words), "unique words.")

    def updatefolder(self, f, msgs):
        """Bring the index up to date for messages *msgs* of folder *f*.

        Messages whose file mtime equals the recorded one are skipped;
        the others are (re)indexed.  Messages that cannot be opened or
        that yield no text are unindexed.  Afterwards, indexed paths of
        this folder whose file can no longer be stat'ed are unindexed.
        """
        self.watchfolders[f.name] = self.getmtime(f.name)
        for n in msgs:
            path = "%s/%s" % (f.name, n)
            docid = self.path2docid.get(path, 0)
            if docid and self.getmtime(path) == self.doctimes.get(docid, 0):
                print("unchanged", docid, path)
                continue
            # newdocid() also records the current mtime for this path.
            docid = self.newdocid(path)
            try:
                m = f.openmessage(n)
            except IOError:
                print("disappeared", docid, path)
                self.unindexpath(path)
                continue
            text = self.getmessagetext(m, f.name)
            if not text:
                self.unindexpath(path)
                continue
            print("indexing", docid, path)
            self.index.index_doc(docid, text)
            self.maycommit()
        # Remove messages from the folder that no longer exist.  The scan
        # starts at "<folder>/" rather than "<folder>": keys of sibling
        # folders such as "<folder>-old/1" or "<folder>.x/1" sort between
        # the two and would otherwise hit the break below before any path
        # of this folder was examined.
        prefix = f.name + "/"
        for path in list(self.path2docid.keys(prefix)):
            if not path.startswith(prefix):
                break
            # getmtime() returns 0 when the file cannot be stat'ed.
            if self.getmtime(path) == 0:
                self.unindexpath(path)
        print("done.")

    def unindexpath(self, path):
        """Forget *path*: drop it from the mappings and from the text index.

        Does nothing when the path is not indexed.  A KeyError raised by
        the text index is printed rather than propagated.
        """
        if not self.path2docid.has_key(path):
            return
        docid = self.path2docid[path]
        print("unindexing", docid, path)
        del self.docpaths[docid]
        del self.doctimes[docid]
        del self.path2docid[path]
        try:
            self.index.unindex_doc(docid)
        except KeyError as err:
            print("KeyError", err)
        self.maycommit()

    def getmessagetext(self, m, name=None):
        """Return a list of text chunks extracted from message *m*.

        When a folder *name* is given, the list starts with a
        "_folder <name>" marker followed by the message headers.  A
        failure while extracting the body parts is reported and whatever
        was collected so far is returned.
        """
        chunks = []
        if name:
            chunks = ["_folder " + name]  # To restrict search to a folder
            self.getheaders(m, chunks)
        try:
            self.getmsgparts(m, chunks, 0)
        except KeyboardInterrupt:
            raise
        except:
            # Best effort: one broken message must not stop the run.
            print("(getmsgparts failed:)")
            reportexc()
        return chunks

    def getmsgparts(self, m, L, level):
        """Append the plain-text bodies found in message *m* to list *L*.

        multipart/alternative and multipart/mixed messages are walked
        recursively; an embedded message/rfc822 is parsed and contributes
        its headers and parts.  Other content types add nothing.  The
        content type is printed for nested parts and for anything that is
        not text/plain, indented by *level*.
        """
        ctype = m.gettype()
        is_plain = ctype == "text/plain"
        if level or not is_plain:
            print(". " * level + str(ctype))
        if is_plain:
            L.append(m.getbodytext())
            return
        if ctype in ("multipart/alternative", "multipart/mixed"):
            for part in m.getbodyparts():
                self.getmsgparts(part, L, level + 1)
            return
        if ctype == "message/rfc822":
            embedded = mhlib.Message("<folder>", 0, StringIO(m.getbodytext()))
            self.getheaders(embedded, L)
            self.getmsgparts(embedded, L, level + 1)

    def getheaders(self, m, L):
        """Append the from/to/cc/bcc/subject values of *m* to *L*.

        The non-empty values are joined with newlines into one string;
        nothing is appended when none of the headers has a value.
        """
        values = [m.get(key) for key in ("from", "to", "cc", "bcc", "subject")]
        present = [value for value in values if value]
        if present:
            L.append("\n".join(present))

    def newdocid(self, path):
        """Return the document id for *path*, allocating one if needed.

        In both cases the recorded mtime of the document is refreshed
        from the file system.
        """
        docid = self.path2docid.get(path)
        if docid is None:
            # Unknown path: take the next id and register both directions.
            docid = self.maxdocid = self.maxdocid + 1
            self.docpaths[docid] = path
            self.path2docid[path] = docid
        self.doctimes[docid] = self.getmtime(path)
        return docid

    def getmtime(self, path):
        """Return the integer mtime of *path* below the MH directory.

        Returns 0 when the file cannot be stat'ed.
        """
        full_path = os.path.join(self.mh.getpath(), path)
        try:
            return int(os.stat(full_path)[ST_MTIME])
        except os.error:
            return 0

    def maycommit(self):
        """Count one change and commit once the transaction limit is hit.

        A limit of 0 (or less) disables these automatic commits.
        """
        self.trans_count += 1
        limit = self.trans_limit
        if limit > 0 and self.trans_count >= limit:
            self.commit()

    def commit(self):
        """Commit pending changes, packing when the pack limit is reached.

        Does nothing when no change was counted since the last commit.
        A pack limit of 0 (or less) disables automatic packing.
        """
        if self.trans_count <= 0:
            return
        print("committing...")
        transaction.commit()
        self.trans_count = 0
        self.pack_count += 1
        limit = self.pack_limit
        if limit > 0 and self.pack_count >= limit:
            self.pack()

    def pack(self):
        """Pack the database if anything was committed since the last pack."""
        if self.pack_count <= 0:
            return
        print("packing...")
        self.database.pack()
        self.pack_count = 0
# Example no. 10
# 0
class Lexicon(Persistent, Implicit):
    """Maps words to word ids and then some

    The Lexicon object is an attempt to abstract vocabularies out of
    Text indexes.  This abstraction is not totally cooked yet, this
    module still includes the parser for the 'Text Index Query
    Language' and a few other hacks.

    """

    # default for older objects
    stop_syn={}

    def __init__(self, stop_syn=None, useSplitter=None, extra=None):
        # Start with empty mappings, remember the stop word / synonym
        # mapping (a fresh dict when none is given), and pick the splitter:
        # the requested one, or else the first registered splitter name.
        # `extra` is kept as the source of the splitter options.
        self.clear()
        if stop_syn is not None:
            self.stop_syn = stop_syn
        else:
            self.stop_syn = {}

        default_splitter = Splitter.splitterNames[0]
        self.useSplitter = useSplitter or default_splitter
        self.splitterParams = extra
        self.SplitterFunc = Splitter.getSplitter(self.useSplitter)


    def clear(self):
        # Reset both directions of the mapping (word -> wid, wid -> word)
        # to fresh, empty trees.
        # NOTE(review): documented with a comment rather than a docstring
        # on purpose; a docstring may affect whether Zope publishes the
        # method -- confirm before adding one.
        self._lexicon, self._inverseLex = OIBTree(), IOBTree()

    def _convertBTrees(self, threshold=200):
        # Replace the two mappings by fresh OIBTree / IOBTree instances and
        # run BTrees.convert.convert() from the old objects into the new
        # ones (presumably copying the contents -- see BTrees.convert).
        # `threshold` is passed through to convert() unchanged.

        # Nothing to do when both mappings already have the target types.
        if (type(self._lexicon) is OIBTree and
            type(getattr(self, '_inverseLex', None)) is IOBTree):
            return

        from BTrees.convert import convert

        lexicon=self._lexicon
        self._lexicon=OIBTree()
        # The new tree shares this object's _p_jar.
        self._lexicon._p_jar=self._p_jar
        convert(lexicon, self._lexicon, threshold)

        try:
            inverseLex=self._inverseLex
            self._inverseLex=IOBTree()
        except AttributeError:
            # older lexicons didn't have an inverse lexicon
            # In this case source and target below are the same new,
            # empty tree.
            self._inverseLex=IOBTree()
            inverseLex=self._inverseLex

        self._inverseLex._p_jar=self._p_jar
        convert(inverseLex, self._inverseLex, threshold)

    def set_stop_syn(self, stop_syn):
        """Store a mapping of stop words and synonyms.  The format is:

        {'word' : [syn1, syn2, ..., synx]}

        Vocabularies do not necessarily need to implement this if their
        splitters do not support stemming or stopping.

        """
        self.stop_syn = stop_syn


    def getWordId(self, word):
        """Return the word id of 'word', assigning a new id if it has none."""
        wid = self._lexicon.get(word, None)
        if wid is not None:
            return wid
        return self.assignWordId(word)

    # `set` is an alias of getWordId.
    set = getWordId

    def getWord(self, wid):
        """Return the word stored for word id 'wid', or None if unknown.

        Post-2.3.1b2 method: it reads the inverse lexicon and will not
        work with unconverted lexicons.
        """
        inverse = self._inverseLex
        return inverse.get(wid, None)

    def assignWordId(self, word):
        """Assigns a new word id to the provided word and returns it.

        If the word already has an id, that id is returned unchanged.
        """
        # First make sure it's not already in there
        wid = self._lexicon.get(word, None)
        if wid is not None:
            return wid

        try:
            inverse = self._inverseLex
        except AttributeError:
            # Old lexicon without the wid -> word mapping: rebuild it.
            # The loop variables must not reuse the name `word`, or the
            # word being registered would be replaced by the last word of
            # the existing lexicon.
            inverse = self._inverseLex = IOBTree()
            for known_word, known_wid in self._lexicon.items():
                inverse[known_wid] = known_word

        # Draw random ids until insert() reports that the id was free.
        wid = randid()
        while not inverse.insert(wid, word):
            wid = randid()

        if isinstance(word, StringType):
            self._lexicon[intern(word)] = wid
        else:
            self._lexicon[word] = wid

        return wid


    def get(self, key, default=None):
        """Return an IISet holding the word id of 'key'.

        When the key is unknown the set holds 'default' instead, or is
        empty if 'default' is None.
        """
        matches = IISet()
        wid = self._lexicon.get(key, default)
        if wid is None:
            return matches
        matches.insert(wid)
        return matches

    def __getitem__(self, key):
        # Subscript access behaves like get(key): it returns a set of word
        # ids rather than raising KeyError for unknown words.
        return self.get(key)


    def __len__(self):
        # Number of entries in the word -> word id mapping.
        return len(self._lexicon)


    def Splitter(self, astring, words=None, encoding="latin1"):
        """Split 'astring' with the configured splitter function.

        'words' defaults to the stop word / synonym mapping.  The splitter
        is first called with the encoding and the options taken from
        'splitterParams'; if that fails for any reason it is called again
        with just the string and the word mapping.
        """
        if words is None:
            words = self.stop_syn
        split = self.SplitterFunc

        try:
            params = self.splitterParams
            options = {
                'encoding': encoding,
                'singlechar': params.splitterSingleChars,
                'indexnumbers': params.splitterIndexNumbers,
                'casefolding': params.splitterCasefolding,
            }
            return split(astring, words, **options)
        except:
            # Deliberate catch-all: fall back to the plain two-argument
            # call whenever the extended call cannot be made.
            return split(astring, words)


    def query_hook(self, q):
        """Return the query 'q' unchanged; this lexicon does not rewrite queries."""
        return q