    def test_unknown_uris(self):

        unique_filter = UniqueUriFilter("sha1")

        self.assertFalse(unique_filter.is_known("http://www.google.de", add_if_unknown=True))
        self.assertFalse(unique_filter.is_known("http://www.yahoo.com", add_if_unknown=True))
        self.assertTrue(unique_filter.is_known("http://www.google.de"))
        self.assertTrue(unique_filter.is_known("http://www.yahoo.com"))
Example #2
    def test_unknown_uris(self):

        unique_filter = UniqueUriFilter('sha1')

        self.assertFalse(
            unique_filter.is_known("http://www.google.de",
                                   add_if_unknown=True))
        self.assertFalse(
            unique_filter.is_known("http://www.yahoo.com",
                                   add_if_unknown=True))
        self.assertTrue(unique_filter.is_known("http://www.google.de"))
        self.assertTrue(unique_filter.is_known("http://www.yahoo.com"))
Example #3
    def __init__(self,
                 settings,
                 log_handler,
                 front_end_queues,
                 prioritizer,
                 unique_hash='sha1'):
        """
        Initialize the frontier and instantiate the
        :class:`SQLiteSingleHostUriQueue`.

        By default the frontier uses the `sha1` hash function for the
        unique uri filter. For very large crawls you might want to use a
        larger hash function (e.g. `sha512`).
        """
        LoggingMixin.__init__(self, log_handler, settings.LOG_LEVEL_MASTER)
        # front end queue
        self._prioritizer = prioritizer
        self._front_end_queues = front_end_queues
        # checkpointing
        self._checkpoint_interval = settings.FRONTIER_CHECKPOINTING
        self._uris_added = 0

        # the heap
        self._heap = PriorityQueue(maxsize=settings.FRONTIER_HEAP_SIZE)
        self._heap_min_size = settings.FRONTIER_HEAP_MIN

        # a dict of uris currently being crawled, keyed by url.
        self._current_uris = dict()
        # dns cache
        self._dns_cache = DnsCache(settings)
        # unique uri filter
        self._unique_uri = UniqueUriFilter(unique_hash)
        for url in self._front_end_queues.all_uris():
            assert not self._unique_uri.is_known(url, add_if_unknown=True)

        # the sinks
        self._sinks = []

        # timezone
        self._timezone = settings.LOCAL_TIMEZONE
        self._logger.info("frontier::initialized")
Example #4
    def __init__(self, settings, log_handler, front_end_queues, prioritizer,
        unique_hash='sha1'):
        """
        Initialize the frontier and instantiate the
        :class:`SQLiteSingleHostUriQueue`.

        By default the frontier uses the `sha1` hash function for the
        unique uri filter. For very large crawls you might want to use a
        larger hash function (e.g. `sha512`).
        """
        LoggingMixin.__init__(self, log_handler, settings.LOG_LEVEL_MASTER)
        # front end queue
        self._prioritizer = prioritizer
        self._front_end_queues = front_end_queues
        # checkpointing
        self._checkpoint_interval = settings.FRONTIER_CHECKPOINTING
        self._uris_added = 0

        # the heap
        self._heap = PriorityQueue(maxsize=settings.FRONTIER_HEAP_SIZE)
        self._heap_min_size = settings.FRONTIER_HEAP_MIN

        # a dict of uris currently being crawled, keyed by url.
        self._current_uris = dict()
        # dns cache
        self._dns_cache = DnsCache(settings)
        # unique uri filter
        self._unique_uri = UniqueUriFilter(unique_hash)
        for url in self._front_end_queues.all_uris():
            assert not self._unique_uri.is_known(url, add_if_unknown=True)

        # the sinks
        self._sinks = []

        # timezone
        self._timezone = settings.LOCAL_TIMEZONE
        self._logger.info("frontier::initialized")
Example #5
class AbstractBaseFrontier(object, LoggingMixin):
    """
    A base class for implementing frontiers.

    This class provides the general methods and configuration parameters
    used by frontiers.
    """

    def __init__(self, settings, log_handler, front_end_queues, prioritizer,
        unique_hash='sha1'):
        """
        Initialize the frontier and instantiate the
        :class:`SQLiteSingleHostUriQueue`.

        By default the frontier uses the `sha1` hash function for the
        unique uri filter. For very large crawls you might want to use a
        larger hash function (e.g. `sha512`).
        """
        LoggingMixin.__init__(self, log_handler, settings.LOG_LEVEL_MASTER)
        # front end queue
        self._prioritizer = prioritizer
        self._front_end_queues = front_end_queues
        # checkpointing
        self._checkpoint_interval = settings.FRONTIER_CHECKPOINTING
        self._uris_added = 0

        # the heap
        self._heap = PriorityQueue(maxsize=settings.FRONTIER_HEAP_SIZE)
        self._heap_min_size = settings.FRONTIER_HEAP_MIN

        # a dict of uris currently being crawled, keyed by url.
        self._current_uris = dict()
        # dns cache
        self._dns_cache = DnsCache(settings)
        # unique uri filter
        self._unique_uri = UniqueUriFilter(unique_hash)
        for url in self._front_end_queues.all_uris():
            assert not self._unique_uri.is_known(url, add_if_unknown=True)

        # the sinks
        self._sinks = []

        # timezone
        self._timezone = settings.LOCAL_TIMEZONE
        self._logger.info("frontier::initialized")

    def add_sink(self, sink):
        """
        Add a sink to the frontier. A sink is responsible for the long-term
        storage of the crawled contents.
        """
        self._sinks.append(sink)

    def add_uri(self, curi):
        """
        Add the specified :class:`CrawlUri` to the frontier.

        `next_date` is a datetime object for the next time the uri should be
        crawled.

        Note: time-based crawling is never strict; it is generally used as some
        kind of prioritization.
        """
        if self._unique_uri.is_known(curi.url, add_if_unknown=True):
            # we already know this uri
            self._logger.debug("frontier::Trying to update a known uri... " + \
                    "(%s)" % (curi.url,))
            return

        self._logger.info("frontier::Adding '%s' to the frontier" % curi.url)
        self._front_end_queues.add_uri(self._uri_from_curi(curi))
        self._maybe_checkpoint()

    def update_uri(self, curi):
        """
        Update a given uri.
        """
        self._front_end_queues.update_uri(self._uri_from_curi(curi))
        self._maybe_checkpoint()

    def get_next(self):
        """
        Return the next uri scheduled for crawling.
        """
        if self._heap.qsize() < self._heap_min_size:
            self._update_heap()

        try:
            (_next_date, next_uri) = self._heap.get_nowait()
        except Empty:
            # heap is empty, there is nothing to crawl right now!
            # maybe log this in the future
            raise

        return self._crawluri_from_uri(next_uri)

    def close(self):
        """
        Close the underlying frontend queues.
        """
        self._front_end_queues.checkpoint()
        self._front_end_queues.close()

    def _add_to_heap(self, uri, next_date):
        """
        Add a URI that is ready to be crawled to the heap.
        """
        self._heap.put_nowait((next_date, uri))
        (url, _etag, _mod_date, _next_date, _prio) = uri
        self._current_uris[url] = uri
        self._logger.debug("frontier::Adding '%s' to the heap" % url)

    def _reschedule_uri(self, curi):
        """
        Return the priority and the `next_crawl_date` timestamp for a
        :class:`CrawlUri`.
        """
        (prio, delta) = self._prioritizer.calculate_priority(curi)
        now = datetime.now(self._timezone)
        return (prio, time.mktime((now + delta).timetuple()))

    def _ignore_uri(self, curi):
        """
        Ignore a :class:`CrawlUri` from now on.
        """
        self._front_end_queues.ignore_uri(curi.url, curi.status_code)

    def _uri_from_curi(self, curi):
        """
        Create the uri tuple from the :class:`CrawlUri` and calculate the
        priority.

        Override this method in more specific frontiers.
        """
        etag = mod_date = None
        if curi.rep_header:
            if "Etag" in curi.rep_header:
                etag = curi.rep_header["Etag"]
            if "Last-Modified" in curi.rep_header:
                mod_date = time.mktime(deserialize_date_time(
                    curi.rep_header["Last-Modified"]).timetuple())
            if not mod_date and 'Date' in curi.rep_header:
                mod_date = time.mktime(deserialize_date_time(
                    curi.rep_header["Date"]).timetuple())

        if mod_date:
            # only reschedule if it has been crawled before
            (prio, next_crawl_date) = self._reschedule_uri(curi)
        else:
            (prio, next_crawl_date) = (1,
                    time.mktime(datetime.now(self._timezone).timetuple()))

        return (curi.url, etag, mod_date, next_crawl_date, prio)

    def _crawluri_from_uri(self, uri):
        """
        Convert a URI tuple to a :class:`CrawlUri`.

        Replace the hostname with the real IP in order to cache DNS queries.
        """
        (url, etag, mod_date, _next_date, prio) = uri

        parsed_url = urlparse(url)

        # dns resolution and caching
        port = parsed_url.port
        if not port:
            port = PROTOCOLS_DEFAULT_PORT[parsed_url.scheme]

        effective_netloc = self._dns_cache["%s:%s" % (parsed_url.hostname,
            port)]

        curi = CrawlUri(url)
        curi.effective_url = url.replace(parsed_url.netloc, "%s:%s" %
                effective_netloc)
        curi.current_priority = prio
        curi.req_header = dict()
        if etag:
            curi.req_header["Etag"] = etag
        if mod_date:
            mod_date_time = datetime.fromtimestamp(mod_date)
            curi.req_header["Last-Modified"] = serialize_date_time(
                    mod_date_time)

        curi.optional_vars = dict()
        if parsed_url.username and parsed_url.password:
            curi.optional_vars[CURI_SITE_USERNAME] = \
                parsed_url.username.encode()
            curi.optional_vars[CURI_SITE_PASSWORD] = \
                parsed_url.password.encode()

        return curi

    def _update_heap(self):
        """
        Abstract method. Implement this in the actual Frontier.

        The implementation should really only add uris to the heap if they can
        be downloaded right away.
        """
        pass

    def _maybe_checkpoint(self, force_checkpoint=False):
        """
        Periodically checkpoint the state db.
        """
        self._uris_added += 1
        if self._uris_added > self._checkpoint_interval or force_checkpoint:
            self._front_end_queues.checkpoint()
            self._uris_added = 0

    def process_successful_crawl(self, curi):
        """
        Called when a URI has been crawled successfully.

        `curi` is a :class:`CrawlUri`
        """
        self.update_uri(curi)

        if curi.optional_vars and CURI_EXTRACTED_URLS in curi.optional_vars:
            for url in curi.optional_vars[CURI_EXTRACTED_URLS].split("\n"):
                if len(url) > 5 and not self._unique_uri.is_known(url):
                    self.add_uri(CrawlUri(url))

        del self._current_uris[curi.url]

        for sink in self._sinks:
            sink.process_successful_crawl(curi)

    def process_not_found(self, curi):
        """
        Called when a URL was not found.

        This could mean that the URL has been removed from the server. If so,
        do something about it!

        Override this method in the actual frontier implementation.
        """
        del self._current_uris[curi.url]
        self._ignore_uri(curi)

        for sink in self._sinks:
            sink.process_not_found(curi)

    def process_redirect(self, curi):
        """
        Called when there were too many redirects for a URL, or the site has
        not been updated since the last visit.

        In the latter case, update the internal uri and increase the priority
        level.
        """
        del self._current_uris[curi.url]

        if curi.status_code in [301, 302]:
            # simply ignore the URL. The URL that is being redirected to is
            # extracted and added in the processing
            self._ignore_uri(curi)

        if curi.status_code == 304:
            # the page has not been modified since the last visit! Update it
            # NOTE: prio increasing happens in the prioritizer
            self.update_uri(curi)

        for sink in self._sinks:
            sink.process_redirect(curi)

    def process_server_error(self, curi):
        """
        Called when there was some kind of server error.

        Override this method in the actual frontier implementation.
        """
        del self._current_uris[curi.url]
        self._ignore_uri(curi)

        for sink in self._sinks:
            sink.process_server_error(curi)
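`_update_heap` is left abstract in the class above. A hypothetical subclass might refill the heap from the front-end queues roughly as follows; this is a sketch only, and the `queue_head` helper and its parameters are assumptions, not taken from the examples:

import time
from datetime import datetime


class SingleHostFrontierSketch(AbstractBaseFrontier):
    """Hypothetical concrete frontier; illustrates one way to fill the heap."""

    def _update_heap(self):
        # only fetch as many uris as the heap can still hold
        free_slots = self._heap.maxsize - self._heap.qsize()
        now = time.mktime(datetime.now(self._timezone).timetuple())
        # assumed front-end queue helper yielding
        # (url, etag, mod_date, next_date, prio) tuples ordered by next_date
        for uri in self._front_end_queues.queue_head(n=free_slots):
            (url, _etag, _mod_date, next_date, _prio) = uri
            if url not in self._current_uris and next_date <= now:
                # only add uris that can be downloaded right away
                self._add_to_heap(uri, next_date)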
Example #6
class AbstractBaseFrontier(object, LoggingMixin):
    """
    A base class for implementing frontiers.

    This class provides the general methods and configuration parameters
    used by frontiers.
    """
    def __init__(self,
                 settings,
                 log_handler,
                 front_end_queues,
                 prioritizer,
                 unique_hash='sha1'):
        """
        Initialize the frontier and instantiate the
        :class:`SQLiteSingleHostUriQueue`.

        By default the frontier uses the `sha1` hash function for the
        unique uri filter. For very large crawls you might want to use a
        larger hash function (e.g. `sha512`).
        """
        LoggingMixin.__init__(self, log_handler, settings.LOG_LEVEL_MASTER)
        # front end queue
        self._prioritizer = prioritizer
        self._front_end_queues = front_end_queues
        # checkpointing
        self._checkpoint_interval = settings.FRONTIER_CHECKPOINTING
        self._uris_added = 0

        # the heap
        self._heap = PriorityQueue(maxsize=settings.FRONTIER_HEAP_SIZE)
        self._heap_min_size = settings.FRONTIER_HEAP_MIN

        # a dict of uris currently being crawled, keyed by url.
        self._current_uris = dict()
        # dns cache
        self._dns_cache = DnsCache(settings)
        # unique uri filter
        self._unique_uri = UniqueUriFilter(unique_hash)
        for url in self._front_end_queues.all_uris():
            assert not self._unique_uri.is_known(url, add_if_unknown=True)

        # the sinks
        self._sinks = []

        # timezone
        self._timezone = settings.LOCAL_TIMEZONE
        self._logger.info("frontier::initialized")

    def add_sink(self, sink):
        """
        Add a sink to the frontier. A sink is responsible for the long-term
        storage of the crawled contents.
        """
        self._sinks.append(sink)

    def add_uri(self, curi):
        """
        Add the specified :class:`CrawlUri` to the frontier.

        `next_date` is a datetime object for the next time the uri should be
        crawled.

        Note: time-based crawling is never strict; it is generally used as some
        kind of prioritization.
        """
        if self._unique_uri.is_known(curi.url, add_if_unknown=True):
            # we already know this uri
            self._logger.debug("frontier::Trying to update a known uri... " + \
                    "(%s)" % (curi.url,))
            return

        self._logger.info("frontier::Adding '%s' to the frontier" % curi.url)
        self._front_end_queues.add_uri(self._uri_from_curi(curi))
        self._maybe_checkpoint()

    def update_uri(self, curi):
        """
        Update a given uri.
        """
        self._front_end_queues.update_uri(self._uri_from_curi(curi))
        self._maybe_checkpoint()

    def get_next(self):
        """
        Return the next uri scheduled for crawling.
        """
        if self._heap.qsize() < self._heap_min_size:
            self._update_heap()

        try:
            (_next_date, next_uri) = self._heap.get_nowait()
        except Empty:
            # heap is empty, there is nothing to crawl right now!
            # maybe log this in the future
            raise

        return self._crawluri_from_uri(next_uri)

    def close(self):
        """
        Close the underlying frontend queues.
        """
        self._front_end_queues.checkpoint()
        self._front_end_queues.close()

    def _crawl_now(self, uri):
        """
        Convenience method for crawling a uri right away.
        """
        self._add_to_heap(uri, 3000)

    def _add_to_heap(self, uri, next_date):
        """
        Add a URI that is ready to be crawled to the heap.
        """
        self._heap.put_nowait((next_date, uri))
        (url, _etag, _mod_date, _next_date, _prio) = uri
        self._current_uris[url] = uri
        self._logger.debug("frontier::Adding '%s' to the heap" % url)

    def _reschedule_uri(self, curi):
        """
        Return the priority and the `next_crawl_date` timestamp for a
        :class:`CrawlUri`.
        """
        (prio, delta) = self._prioritizer.calculate_priority(curi)
        now = datetime.now(self._timezone)
        return (prio, time.mktime((now + delta).timetuple()))

    def _ignore_uri(self, curi):
        """
        Ignore a :class:`CrawlUri` from now on.
        """
        self._front_end_queues.ignore_uri(curi.url, curi.status_code)

    def _uri_from_curi(self, curi):
        """
        Create the uri tuple from the :class:`CrawlUri` and calculate the
        priority.

        Override this method in more specific frontiers.
        """
        etag = mod_date = None
        if curi.rep_header:
            if "Etag" in curi.rep_header:
                etag = curi.rep_header["Etag"]
            if "Last-Modified" in curi.rep_header:
                mod_date = time.mktime(
                    deserialize_date_time(
                        curi.rep_header["Last-Modified"]).timetuple())
            if not mod_date and 'Date' in curi.rep_header:
                mod_date = time.mktime(
                    deserialize_date_time(curi.rep_header["Date"]).timetuple())

        if mod_date:
            # only reschedule if it has been crawled before
            (prio, next_crawl_date) = self._reschedule_uri(curi)
        else:
            (prio,
             next_crawl_date) = (1,
                                 time.mktime(
                                     datetime.now(self._timezone).timetuple()))

        return (curi.url, etag, mod_date, next_crawl_date, prio)

    def _crawluri_from_uri(self, uri):
        """
        Convert a URI tuple to a :class:`CrawlUri`.

        Replace the hostname with the real IP in order to cache DNS queries.
        """
        (url, etag, mod_date, _next_date, prio) = uri

        parsed_url = urlparse(url)

        # dns resolution and caching
        port = parsed_url.port
        if not port:
            port = PROTOCOLS_DEFAULT_PORT[parsed_url.scheme]

        effective_netloc = self._dns_cache["%s:%s" %
                                           (parsed_url.hostname, port)]

        curi = CrawlUri(url)
        curi.effective_url = url.replace(parsed_url.netloc,
                                         "%s:%s" % effective_netloc)
        curi.current_priority = prio
        curi.req_header = dict()
        if etag:
            curi.req_header["Etag"] = etag
        if mod_date:
            mod_date_time = datetime.fromtimestamp(mod_date)
            curi.req_header["Last-Modified"] = serialize_date_time(
                mod_date_time)

        curi.optional_vars = dict()
        if parsed_url.username and parsed_url.password:
            curi.optional_vars[CURI_SITE_USERNAME] = \
                parsed_url.username.encode()
            curi.optional_vars[CURI_SITE_PASSWORD] = \
                parsed_url.password.encode()

        return curi

    def _update_heap(self):
        """
        Abstract method. Implement this in the actual Frontier.

        The implementation should really only add uris to the heap if they can
        be downloaded right away.
        """
        pass

    def _maybe_checkpoint(self, force_checkpoint=False):
        """
        Periodically checkpoint the state db.
        """
        self._uris_added += 1
        if self._uris_added > self._checkpoint_interval or force_checkpoint:
            self._front_end_queues.checkpoint()
            self._uris_added = 0

    def process_successful_crawl(self, curi):
        """
        Called when a URI has been crawled successfully.

        `curi` is a :class:`CrawlUri`
        """
        self.update_uri(curi)

        if curi.optional_vars and CURI_EXTRACTED_URLS in curi.optional_vars:
            for url in curi.optional_vars[CURI_EXTRACTED_URLS].split("\n"):
                if len(url) > 5 and not self._unique_uri.is_known(url):
                    self.add_uri(CrawlUri(url))

        del self._current_uris[curi.url]

        for sink in self._sinks:
            sink.process_successful_crawl(curi)

    def process_not_found(self, curi):
        """
        Called when a URL was not found.

        This could mean that the URL has been removed from the server. If so,
        do something about it!

        Override this method in the actual frontier implementation.
        """
        del self._current_uris[curi.url]
        self._ignore_uri(curi)

        for sink in self._sinks:
            sink.process_not_found(curi)

    def process_redirect(self, curi):
        """
        Called when there were too many redirects for a URL, or the site has
        not been updated since the last visit.

        In the latter case, update the internal uri and increase the priority
        level.
        """
        del self._current_uris[curi.url]

        if curi.status_code in [301, 302]:
            # simply ignore the URL. The URL that is being redirected to is
            # extracted and added in the processing
            self._ignore_uri(curi)

        if curi.status_code == 304:
            # the page has not been modified since the last visit! Update it
            # NOTE: prio increasing happens in the prioritizer
            self.update_uri(curi)

        for sink in self._sinks:
            sink.process_redirect(curi)

    def process_server_error(self, curi):
        """
        Called when there was some kind of server error.

        Override this method in the actual frontier implementation.
        """
        del self._current_uris[curi.url]
        self._ignore_uri(curi)

        for sink in self._sinks:
            sink.process_server_error(curi)
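Putting the pieces together, a rough usage sketch, assuming a concrete subclass such as the hypothetical `SingleHostFrontierSketch` shown earlier and pre-built `settings`, `log_handler`, `front_end_queues`, `prioritizer` and `storage_sink` objects, might look like this:

# hedged usage sketch: all constructor arguments are assumed to be
# configured elsewhere (settings module, queue, prioritizer and sink objects)
frontier = SingleHostFrontierSketch(settings, log_handler, front_end_queues,
                                    prioritizer, unique_hash='sha1')
frontier.add_sink(storage_sink)                  # long-term storage for results
frontier.add_uri(CrawlUri("http://www.example.com/"))  # seed the crawl

curi = frontier.get_next()                       # next CrawlUri due for crawling
# ... download curi, then report the outcome back to the frontier
frontier.process_successful_crawl(curi)

frontier.close()                                 # checkpoint and close the queues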