class UdbBtreeIndex(UdbIndex):
    """Ascending B-tree index that maps each key to a single record uid.

    UdbIndex, EMPTY and CHAR255 are assumed to be defined elsewhere in
    the package: EMPTY as a "no value" sentinel and CHAR255 as the
    highest single-byte character, used as an upper bound for prefix
    scans.
    """

    is_prefixed = True
    is_ranged = True
    is_sorted_asc = True
    type = 'btree'

    def __init__(self, schema, name=None):
        from BTrees.OIBTree import OIBTree

        UdbIndex.__init__(self, schema, name)

        self._btree = OIBTree()

    def __len__(self):
        return len(self._btree)

    def clear(self):
        self._btree.clear()

        return self

    def delete(self, key, uid=None):
        # Passing EMPTY as the default makes deleting a missing key a no-op.
        self._btree.pop(key, EMPTY)

        return self

    def insert(self, key, uid):
        self._btree.insert(key, uid)

        return self

    def search_by_key(self, key):
        val = self._btree.get(key, EMPTY)

        if val != EMPTY:
            yield val

    def search_by_key_in(self, keys):
        for key in keys:
            val = self._btree.get(key, EMPTY)

            if val != EMPTY:
                yield val

    def search_by_key_prefix(self, key):
        # Every key beginning with the prefix sorts within [key, key + CHAR255].
        for val in self._btree.values(key, key + CHAR255):
            yield val

    def search_by_key_prefix_in(self, keys):
        for key in keys:
            for val in self._btree.values(key, key + CHAR255):
                yield val

    def search_by_key_range(self, gte=None, lte=None, gte_excluded=False, lte_excluded=False):
        # OIBTree.values(min, max, excludemin, excludemax) yields values
        # for keys in the requested range, in ascending key order.
        for val in self._btree.values(gte, lte, gte_excluded, lte_excluded):
            yield val

    def upsert(self, old, new, uid):
        if old != new:
            # Note: unlike delete(), this raises KeyError if the old key
            # is not present.
            self._btree.pop(old)

        self._btree.insert(new, uid)

        return self
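
# --- Usage sketch (illustration only; not part of the module) ---
# A minimal demonstration of the index API above, assuming the UdbIndex
# base accepts the (schema, name) pair as-is. The schema value below is
# a placeholder guess; consult the base class for its expected shape.
if __name__ == '__main__':
    index = UdbBtreeIndex(['name'], name='by_name')

    index.insert('alice', 1).insert('bob', 2).insert('bobby', 3)

    # Exact-key lookup yields the stored uid, or nothing at all.
    print(list(index.search_by_key('alice')))   # [1]
    print(list(index.search_by_key('carol')))   # []

    # Prefix search scans the key interval [key, key + CHAR255], which
    # covers every key starting with 'bob'.
    print(list(index.search_by_key_prefix('bob')))  # [2, 3]

    # Range search maps directly onto OIBTree.values(min, max, ...).
    print(list(index.search_by_key_range(gte='alice', lte='bob')))  # [1, 2]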
# The surrounding module is assumed to provide the usual Zope/Plone
# imports, e.g.:
#   from AccessControl import ClassSecurityInfo
#   from OFS.SimpleItem import SimpleItem
#   from BTrees.IOBTree import IOBTree
#   from BTrees.OIBTree import OIBTree
#   from BTrees.IIBTree import IISet
#   from zope.component import getUtility
#   from plone.registry.interfaces import IRegistry
#   from plone import api
# plus datetime, re and time; CompositeQueue, ISettings, cache and
# logger are package-local.
class LinkCheckTool(SimpleItem):
    security = ClassSecurityInfo()

    def __init__(self, id=None):
        super(LinkCheckTool, self).__init__(id)

        # This is the work queue; items in this queue are scheduled
        # for link validity check.
        self.queue = CompositeQueue()

        # Additional queue for the internal crawler to revalidate the site.
        self.crawl_queue = CompositeQueue()

        # This is the link database. It maps a hyperlink index to a
        # tuple (timestamp, status, referers).
        self.checked = IOBTree()

        # Indexes.
        self.index = OIBTree()
        self.links = IOBTree()

        # This is a counter that allows us to add new hyperlinks and
        # provide an index quickly.
        self.counter = 0

    security.declarePrivate("is_available")
    def is_available(self):
        return hasattr(self, 'index') and \
            hasattr(self, 'checked') and \
            hasattr(self, 'queue') and \
            hasattr(self, 'counter')

    security.declarePrivate("clear")
    def clear(self):
        # Drain both queues, then reset the database and the counter.
        while True:
            try:
                self.queue.pull()
            except IndexError:
                break

        while True:
            try:
                self.crawl_queue.pull()
            except IndexError:
                break

        self.checked.clear()
        self.index.clear()
        self.links.clear()
        self.counter = 0

    security.declarePrivate("crawl")
    def crawl(self):
        self.clear()

        query = {}
        registry = getUtility(IRegistry)
        settings = registry.forInterface(ISettings)

        if settings.content_types:
            query['portal_type'] = settings.content_types
        if settings.workflow_states:
            query['review_state'] = settings.workflow_states

        catalog = api.portal.get_tool('portal_catalog')
        brains = catalog(query)
        for brain in brains:
            # Asynchronous crawling is not working yet:
            # self.crawl_enqueue(brain.UID)
            obj = brain.getObject()
            obj.restrictedTraverse('@@linkcheck')()
            logger.info('Crawling: checked {0}'.format(brain.getURL()))

    security.declarePrivate("enqueue")
    def enqueue(self, url):
        index = self.index.get(url)
        if index is None:
            # A genuinely new URL.
            index = self.store(url)
        else:
            entry = self.checked.get(index)
            if entry:
                # Clear the timestamp so the link is checked again.
                entry = None, entry[1], entry[2]
                self.checked[index] = entry
            else:
                # Reset an empty entry.
                self.remove(url)
                index = self.store(url)

        self.queue.put(index)
        return index

    security.declarePrivate("register")
    def register(self, hrefs, referer, timestamp):
        """Add or update link presence information.

        A link is queued for checking if it is not yet in the database,
        or if it has not been checked since the provided timestamp.
        """

        referer = self.index.get(referer) or self.store(referer)

        registry = getUtility(IRegistry, context=self.aq_parent)
        try:
            settings = registry.forInterface(ISettings)
        except KeyError as exc:
            logger.warn(exc)
            return

        limit = settings.referers

        for href in hrefs:
            if self.should_ignore(href, settings.ignore_list):
                continue

            # If the hyperlink is not already in the work queue,
            # compare the provided timestamp to our database to see if
            # we need to check its validity. Note that internal links
            # are exempt if we're not using the publisher.
            index = self.index.get(href)

            # -1 is never a valid index; it stands in for "unknown URL"
            # because BTrees reject None keys.
            entry = self.checked.get(-1 if index is None else index)

            if index not in self.queue:
                if entry is None or entry[0] < timestamp:
                    if settings.use_publisher or not href.startswith('/'):
                        index = self.enqueue(href)
                    elif href not in self.index:
                        index = self.store(href)

            assert index is not None

            if entry is None:
                self.checked[index] = None, None, IISet((referer,))
            else:
                # Only record the referer if it is new and the referer
                # list has not yet grown past its limit; otherwise we
                # don't issue an update.
                referers = entry[2]
                if referer not in referers and len(referers) <= limit:
                    referers.add(referer)

    security.declarePrivate("store")
    def store(self, url):
        index = self.index[url] = self.counter
        self.links[index] = url
        self.counter += 1
        return index

    security.declarePrivate("remove")
    def remove(self, url):
        index = self.index.get(url)

        if url in self.index:
            del self.index[url]

        # Note: "is not None" matters here; index 0 is valid.
        if index is not None and index in self.checked:
            del self.checked[index]

    security.declarePrivate("update")
    def update(self, href, status):
        """Update link status."""
        now = datetime.datetime.now()
        timestamp = int(time.mktime(now.timetuple()))

        index = self.index.get(href)
        if index is None:
            return

        entry = self.checked.get(index)
        if entry is None:
            self.checked[index] = timestamp, status, IISet()

        # If the status changed, we update the entry.
        elif status != entry[1] or not entry[0]:
            # If the status was previously good, then we clear the
            # status. What this means is that we'll wait for the next
            # check to declare a bad status (it might be temporary).
            if entry[1] == 200:
                status = None

            self.checked[index] = timestamp, status, entry[2]

    @cache(lambda method, self, ignore_list: ignore_list)
    def get_matchers(self, ignore_list):
        # Compile the ignore list into search functions, skipping
        # invalid expressions; the result is cached per ignore list.
        matchers = []
        for expression in ignore_list:
            try:
                matcher = re.compile(expression).search
            except re.error:
                pass
            else:
                matchers.append(matcher)
        return matchers

    def should_ignore(self, href, ignore_list):
        for matcher in self.get_matchers(ignore_list):
            if matcher(href):
                return True

        return False

    def crawl_enqueue(self, obj):
        if not isinstance(obj, basestring):
            obj = obj.UID()
        self.crawl_queue.put(obj)

    def crawl_dequeue(self):
        # Peek at the queue's internal storage to avoid an IndexError
        # when it is empty.
        if self.crawl_queue._data:
            return self.crawl_queue.pull()
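
# --- Usage sketch (illustration only; not part of the tool) ---
# The names below are hypothetical: 'portal_linkcheck' is a plausible
# tool id, and the call sequence mirrors how a view and a worker might
# drive the tool; the exact wiring lives elsewhere in the package.
#
#   tool = api.portal.get_tool('portal_linkcheck')
#
#   # A view parses a rendered document and registers its hyperlinks;
#   # unknown or stale links end up on tool.queue:
#   tool.register(
#       ['http://example.org/', '/front-page'],
#       referer='/my-document',
#       timestamp=int(time.time()),
#   )
#
#   # A worker drains the queue, resolves each index back to a URL,
#   # performs the HTTP request, and records the resulting status:
#   index = tool.queue.pull()
#   url = tool.links[index]
#   tool.update(url, 200)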