Example #1
class UdbBtreeIndex(UdbIndex):
    """BTree-backed unique index mapping each key to a single uid,
    with exact, prefix, and range lookups. `UdbIndex`, the `EMPTY`
    sentinel, and `CHAR255` (presumably chr(255), used as the upper
    bound for prefix scans) come from the surrounding module.
    """
    is_prefixed = True
    is_ranged = True
    is_sorted_asc = True
    type = 'btree'

    def __init__(self, schema, name=None):
        from BTrees.OIBTree import OIBTree

        UdbIndex.__init__(self, schema, name)

        self._btree = OIBTree()

    def __len__(self):
        return len(self._btree)

    def clear(self):
        self._btree.clear()

        return self

    def delete(self, key, uid=None):
        # uid is unused here: a unique index stores a single uid per
        # key, so the key alone identifies the entry to drop.
        self._btree.pop(key, EMPTY)

        return self

    def insert(self, key, uid):
        self._btree.insert(key, uid)

        return self

    def search_by_key(self, key):
        val = self._btree.get(key, EMPTY)

        if val != EMPTY:
            yield val

    def search_by_key_in(self, keys):
        for key in keys:
            val = self._btree.get(key, EMPTY)

            if val != EMPTY:
                yield val

    def search_by_key_prefix(self, key):
        for val in self._btree.values(key, key + CHAR255):
            yield val

    def search_by_key_prefix_in(self, keys):
        for key in keys:
            for val in self._btree.values(key, key + CHAR255):
                yield val

    def search_by_key_range(self,
                            gte=None,
                            lte=None,
                            gte_excluded=False,
                            lte_excluded=False):
        for val in self._btree.values(gte, lte, gte_excluded, lte_excluded):
            yield val

    def upsert(self, old, new, uid):
        if old != new:
            # Drop the stale key; the EMPTY default avoids a KeyError
            # if it was never indexed, mirroring delete().
            self._btree.pop(old, EMPTY)

        self._btree.insert(new, uid)

        return self
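
A minimal, runnable sketch of the OIBTree calls the index wraps, assuming only that the BTrees package is installed. The EMPTY object below is a hypothetical stand-in for the module's sentinel, and chr(255) is a plausible value for CHAR255: it sorts after any other character, so `key + CHAR255` is an upper bound covering every key that shares the prefix.

from BTrees.OIBTree import OIBTree

btree = OIBTree()
for uid, key in enumerate(('alpha', 'alps', 'beta', 'gamma')):
    btree.insert(key, uid)

# Exact lookup with a sentinel default, as search_by_key does.
EMPTY = object()  # hypothetical stand-in for the module's EMPTY
assert btree.get('beta', EMPTY) == 2

# Prefix scan: every key between 'al' and 'al' + chr(255).
assert list(btree.values('al', 'al' + chr(255))) == [0, 1]

# Range scan with bound-exclusion flags, as search_by_key_range does:
# excludemin=True drops 'alpha', excludemax=False keeps 'gamma'.
assert list(btree.values('alpha', 'gamma', True, False)) == [1, 2, 3]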
Example #2
class LinkCheckTool(SimpleItem):
    security = ClassSecurityInfo()

    def __init__(self, id=None):
        super(LinkCheckTool, self).__init__(id)

        # This is the work queue; items in this queue are scheduled
        # for link validity check.
        self.queue = CompositeQueue()

        # Additional queue for internal crawler to revalidate the site
        self.crawl_queue = CompositeQueue()

        # This is the link database. It maps a hyperlink index to a
        # tuple (timestamp, status, referers).
        self.checked = IOBTree()

        # Indexes
        self.index = OIBTree()
        self.links = IOBTree()

        # This is a counter that allows us to add new hyperlinks and
        # assign an index quickly.
        self.counter = 0

    security.declarePrivate("is_available")
    def is_available(self):
        return hasattr(self, 'index') and \
               hasattr(self, 'checked') and \
               hasattr(self, 'queue') and \
               hasattr(self, 'counter')

    security.declarePrivate("clear")
    def clear(self):
        while True:
            try:
                self.queue.pull()
            except IndexError:
                break
        while True:
            try:
                self.crawl_queue.pull()
            except IndexError:
                break

        self.checked.clear()
        self.index.clear()
        self.links.clear()
        self.counter = 0

    security.declarePrivate("crawl")
    def crawl(self):
        self.clear()
        query = {}
        registry = getUtility(IRegistry)
        settings = registry.forInterface(ISettings)
        if settings.content_types:
            query['portal_type'] = settings.content_types

        if settings.workflow_states:
            query['review_state'] = settings.workflow_states

        catalog = api.portal.get_tool('portal_catalog')
        brains = catalog(query)
        for brain in brains:
            # asynchronous crawling not working yet
            # self.crawl_enqueue(brain.UID)

            obj = brain.getObject()
            obj.restrictedTraverse('@@linkcheck')()
            logger.info('Crawling: checked {0}'.format(brain.getURL()))

    security.declarePrivate("enqueue")
    def enqueue(self, url):
        index = self.index.get(url)
        if index is None:
            # a URL we have not seen before
            index = self.store(url)
        else:
            entry = self.checked.get(index)
            if entry:
                # Clear the timestamp so the link gets rechecked.
                entry = None, entry[1], entry[2]
                self.checked[index] = entry
            else:
                # reset empty entry
                self.remove(url)
                index = self.store(url)
        self.queue.put(index)
        return index

    security.declarePrivate("register")
    def register(self, hrefs, referer, timestamp):
        """Add or update link presence information.

        If a link has not been checked since the provided timestamp,
        it will be added to the queue (or if it is not in the
        database).
        """

        # Index 0 is falsy, so `get(...) or store(...)` would misfire;
        # test against None instead.
        referer_index = self.index.get(referer)
        if referer_index is None:
            referer_index = self.store(referer)
        referer = referer_index

        registry = getUtility(IRegistry, context=self.aq_parent)
        try:
            settings = registry.forInterface(ISettings)
        except KeyError as exc:
            logger.warning(exc)
            return

        limit = settings.referers

        for href in hrefs:
            if self.should_ignore(href, settings.ignore_list):
                continue

            # If the hyperlink is not already in the work queue,
            # compare the provided timestamp to our database to see if
            # we need to check its validity. Note that internal links
            # are exempt if we're not using the publisher.
            index = self.index.get(href)
            # -1 is never a valid index, so this yields None for unseen links.
            entry = self.checked.get(-1 if index is None else index)

            if index not in self.queue:
                if entry is None or entry[0] < timestamp:
                    if settings.use_publisher or not href.startswith('/'):
                        index = self.enqueue(href)
                    elif href not in self.index:
                        index = self.store(href)

            assert index is not None

            if entry is None:
                self.checked[index] = None, None, IISet((referer,))
            else:
                # Record the referer unless it is already known or the
                # configured referer limit has been reached; either way
                # no update needs to be issued.
                referers = entry[2]
                if referer not in referers and len(referers) <= limit:
                    referers.add(referer)

    security.declarePrivate("store")
    def store(self, url):
        index = self.index[url] = self.counter
        self.links[index] = url
        self.counter += 1
        return index

    security.declarePrivate("remove")
    def remove(self, url):
        index = self.index.get(url)
        if url in self.index:
            del self.index[url]
        # Index 0 is valid, so test against None rather than truthiness.
        if index is not None and index in self.checked:
            del self.checked[index]

    security.declarePrivate("update")
    def update(self, href, status):
        """Update link status."""
        now = datetime.datetime.now()
        timestamp = int(time.mktime(now.timetuple()))

        index = self.index.get(href)
        if index is None:
            return

        entry = self.checked.get(index)
        if entry is None:
            self.checked[index] = timestamp, status, IISet()

        # If the status changed, we update the entry.
        elif status != entry[1] or not entry[0]:

            # If the status was previously good, then we clear the
            # status. What this means is that we'll wait for the next
            # check to declare a bad status (it might be temporary).
            if entry[1] == 200:
                status = None

            self.checked[index] = timestamp, status, entry[2]

    @cache(lambda method, self, ignore_list: ignore_list)
    def get_matchers(self, ignore_list):
        matchers = []
        for expression in ignore_list:
            try:
                matcher = re.compile(expression).search
            except re.error:
                pass
            else:
                matchers.append(matcher)

        return matchers

    def should_ignore(self, href, ignore_list):
        for matcher in self.get_matchers(ignore_list):
            if matcher(href):
                return True

        return False

    def crawl_enqueue(self, obj):
        if not isinstance(obj, basestring):
            obj = obj.UID()
        self.crawl_queue.put(obj)

    def crawl_dequeue(self):
        # Peek at the queue's internal buffer; pull() raises IndexError
        # on an empty queue.
        if self.crawl_queue._data:
            return self.crawl_queue.pull()
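
The least obvious rule above lives in update(): a link that last reported a good status (200) does not flip straight to bad; its status is first cleared to None, so one transient failure is never recorded and a second consecutive failure is required. A minimal, dependency-free sketch of that transition rule, using a plain dict and set in place of IOBTree and IISet (the helper name update_entry is hypothetical):

import time

def update_entry(checked, index, status):
    """Mirror LinkCheckTool.update's status-transition rule."""
    timestamp = int(time.time())
    entry = checked.get(index)
    if entry is None:
        checked[index] = (timestamp, status, set())
    elif status != entry[1] or not entry[0]:
        # A previously good link has its status cleared rather than
        # marked bad immediately; the next check must confirm it.
        if entry[1] == 200:
            status = None
        checked[index] = (timestamp, status, entry[2])

checked = {}
update_entry(checked, 7, 200)  # first check: good
update_entry(checked, 7, 404)  # first failure: cleared to None
assert checked[7][1] is None
update_entry(checked, 7, 404)  # second failure: recorded as bad
assert checked[7][1] == 404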