Example #1
0
    def _get_links(self, elements, attribute, base_url_split,
                   original_url_split):
        """Collect a ``Link`` for each element whose ``attribute`` is usable.

        Elements lacking ``attribute`` are skipped, as are values rejected
        by ``is_link`` and absolute URLs rejected by
        ``is_supported_scheme``. Outside strict mode, surrounding
        whitespace is stripped from the raw value before any check.

        :param elements: iterable of parsed elements exposing ``attrs``,
            ``name`` and item access by attribute name.
        :param attribute: name of the attribute holding the URL.
        :param base_url_split: base used to resolve the value into an
            absolute URL.
        :param original_url_split: stored unchanged on every ``Link``.
        :return: list of ``Link`` objects, in element order.
        """
        found = []
        for tag in elements:
            if attribute not in tag.attrs:
                continue

            raw_url = tag[attribute]
            if not self.worker_config.strict_mode:
                # Lenient mode tolerates padding around the value.
                raw_url = raw_url.strip()
            if not is_link(raw_url):
                continue

            absolute = get_absolute_url_split(raw_url, base_url_split)
            supported = is_supported_scheme(
                absolute, self.worker_config.ignore_bad_tel_urls)
            if not supported:
                continue

            found.append(Link(type=unicode(tag.name),
                              url_split=absolute,
                              original_url_split=original_url_split,
                              source_str=unicode(tag)))

        return found
Example #2
0
    def __init__(self, worker_init):
        """Set up a crawl worker from the ``worker_init`` bundle.

        Copies the configuration, queues and logger from ``worker_init``,
        resolves the URL-opening helpers and, when both a username and a
        password are configured, precomputes the HTTP Basic
        ``Authorization`` header as a ``(name, value)`` tuple.

        :param worker_init: object exposing ``worker_config``,
            ``input_queue``, ``output_queue`` and ``logger`` attributes.
        """
        self.worker_config = worker_init.worker_config
        self.input_queue = worker_init.input_queue
        self.output_queue = worker_init.output_queue
        self.urlopen = get_url_open()
        self.request_class = get_url_request()
        # Fall back to a fresh logger when none was supplied.
        self.logger = worker_init.logger or get_logger()

        # We do this here to allow patching by gevent
        import socket
        self.timeout_exception = socket.timeout

        self.auth_header = None

        if self.worker_config.username and self.worker_config.password:
            credentials = '{0}:{1}'.format(
                self.worker_config.username,
                self.worker_config.password).encode("utf-8")
            # b64encode emits no line breaks. The previous encodestring
            # call appended a trailing newline (and wrapped long output),
            # which ended up inside the header value.
            base64string = unicode(base64.b64encode(credentials), "utf-8")
            self.auth_header = ("Authorization",
                                "Basic {0}".format(base64string))
Example #3
0
    def _get_links(self, elements, attribute, base_url_split,
                   original_url_split):
        """Collect a ``Link`` for each element whose ``attribute`` is usable.

        Elements lacking ``attribute`` are skipped, as are values rejected
        by ``is_link`` and absolute URLs whose scheme is not listed in
        ``SUPPORTED_SCHEMES``. Outside strict mode, surrounding whitespace
        is stripped from the raw value before any check.

        :param elements: iterable of parsed elements exposing ``attrs``,
            ``name`` and item access by attribute name.
        :param attribute: name of the attribute holding the URL.
        :param base_url_split: base used to resolve the value into an
            absolute URL.
        :param original_url_split: stored unchanged on every ``Link``.
        :return: list of ``Link`` objects, in element order.
        """
        found = []
        for tag in elements:
            if attribute not in tag.attrs:
                continue

            raw_url = tag[attribute]
            if not self.worker_config.strict_mode:
                # Lenient mode tolerates padding around the value.
                raw_url = raw_url.strip()
            if not is_link(raw_url):
                continue

            absolute = get_absolute_url_split(raw_url, base_url_split)
            if absolute.scheme not in SUPPORTED_SCHEMES:
                continue

            found.append(Link(type=unicode(tag.name),
                              url_split=absolute,
                              original_url_split=original_url_split,
                              source_str=unicode(tag)))

        return found
Example #4
0
    def __init__(self, worker_init):
        """Set up a crawl worker from the ``worker_init`` bundle.

        Copies the configuration, queues and logger from ``worker_init``,
        resolves the URL-opening helpers and, when both a username and a
        password are configured, precomputes the HTTP Basic
        ``Authorization`` header as a ``(name, value)`` tuple.

        :param worker_init: object exposing ``worker_config``,
            ``input_queue``, ``output_queue`` and ``logger`` attributes.
        """
        self.worker_config = worker_init.worker_config
        self.input_queue = worker_init.input_queue
        self.output_queue = worker_init.output_queue
        self.urlopen = get_url_open()
        self.request_class = get_url_request()
        # Fall back to a fresh logger when none was supplied.
        self.logger = worker_init.logger or get_logger()

        # We do this here to allow patching by gevent
        import socket
        self.timeout_exception = socket.timeout

        self.auth_header = None

        if self.worker_config.username and self.worker_config.password:
            credentials = '{0}:{1}'.format(
                self.worker_config.username,
                self.worker_config.password).encode("utf-8")
            # b64encode emits no line breaks. The previous encodestring
            # call appended a trailing newline (and wrapped long output),
            # which ended up inside the header value.
            base64string = unicode(base64.b64encode(credentials), "utf-8")
            self.auth_header = ("Authorization",
                                "Basic {0}".format(base64string))
Example #5
0
    def _crawl_page(self, worker_input):
        """Fetch one URL and describe the outcome as a ``PageCrawl``.

        The URL in ``worker_input.url_split`` is opened with ``open_url``.
        A response carrying an exception is reported as an HTTP error
        (when a status is present), a timeout, or a wrapped exception.
        Otherwise the page is parsed for links when it is HTML and
        ``worker_input.should_crawl`` is set, and content checks run when
        ``_has_content_to_check`` says so. Any exception raised along the
        way is logged and reported as a wrapped exception.

        :param worker_input: object providing ``url_split``, ``depth``,
            ``site_origin``, ``should_crawl`` and ``content_check``.
        :return: the ``PageCrawl`` describing what happened.
        """
        target_split = worker_input.url_split
        missing_content = []
        erroneous_content = []
        page_crawl = None

        def failed_crawl(status, is_timeout, exception, response_time,
                         process_time):
            # Every unsuccessful outcome shares the remaining fields.
            return PageCrawl(
                original_url_split=target_split,
                final_url_split=None, status=status,
                is_timeout=is_timeout, is_redirect=False, links=[],
                exception=exception, is_html=False,
                depth=worker_input.depth,
                response_time=response_time,
                process_time=process_time,
                site_origin=worker_input.site_origin)

        try:
            response = open_url(
                self.urlopen, self.request_class, target_split.geturl(),
                self.worker_config.timeout, self.timeout_exception,
                self.auth_header,
                extra_headers=self.worker_config.extra_headers,
                logger=self.logger)

            if not response.exception:
                final_url_split = get_clean_url_split(response.final_url)

                message = response.content.info()
                mime_type = get_content_type(message)
                charset = (get_charset(message)
                           if self.worker_config.prefer_server_encoding
                           else None)

                is_html = mime_type == HTML_MIME_TYPE
                links = []
                process_time = None

                if not (is_html and worker_input.should_crawl):
                    self.logger.debug(
                        "Won't crawl %s. MIME Type: %s. Should crawl: %s",
                        final_url_split, mime_type,
                        worker_input.should_crawl)
                    if self._has_content_to_check(worker_input):
                        text_content = self.get_text_content(
                            response.content.read(), charset)
                        missing_content, erroneous_content = \
                            self.check_content(
                                text_content, None, target_split,
                                final_url_split, worker_input.content_check)
                else:
                    # Parsing and checking are timed together.
                    started_at = time.time()
                    html_soup = BeautifulSoup(
                        response.content, self.worker_config.parser,
                        from_encoding=charset)
                    links = self.get_links(html_soup, final_url_split)
                    if self._has_content_to_check(worker_input):
                        missing_content, erroneous_content = \
                            self.check_content(
                                unicode(html_soup), html_soup, target_split,
                                final_url_split, worker_input.content_check)
                    process_time = time.time() - started_at

                page_crawl = PageCrawl(
                    original_url_split=target_split,
                    final_url_split=final_url_split, status=response.status,
                    is_timeout=False, is_redirect=response.is_redirect,
                    links=links, exception=None, is_html=is_html,
                    depth=worker_input.depth,
                    response_time=response.response_time,
                    process_time=process_time,
                    site_origin=worker_input.site_origin,
                    missing_content=missing_content,
                    erroneous_content=erroneous_content)
            elif response.status:
                # This is a http error. Good.
                page_crawl = failed_crawl(
                    status=response.status, is_timeout=False,
                    exception=None,
                    response_time=response.response_time,
                    process_time=None)
            elif response.is_timeout:
                # This is a timeout. No need to wrap the exception
                page_crawl = failed_crawl(
                    status=None, is_timeout=True, exception=None,
                    response_time=response.response_time,
                    process_time=0)
            else:
                # Something bad happened when opening the url
                wrapped = ExceptionStr(
                    unicode(type(response.exception)),
                    unicode(response.exception))
                page_crawl = failed_crawl(
                    status=None, is_timeout=False, exception=wrapped,
                    response_time=response.response_time,
                    process_time=0)
        except Exception as exc:
            wrapped = ExceptionStr(unicode(type(exc)), unicode(exc))
            page_crawl = failed_crawl(
                status=None, is_timeout=False, exception=wrapped,
                response_time=None, process_time=None)
            self.logger.exception("Exception occurred while crawling a page.")

        return page_crawl
Example #6
0
    def _crawl_page(self, worker_input):
        """Fetch one URL and describe the outcome as a ``PageCrawl``.

        The URL in ``worker_input.url_split`` is opened with ``open_url``.
        A response carrying an exception is reported as an HTTP error
        (when a status is present), a timeout, or a wrapped exception.
        Otherwise the page is parsed for links when it is HTML and
        ``worker_input.should_crawl`` is set, and content checks run when
        ``_has_content_to_check`` says so. Any exception raised along the
        way is logged and reported as a wrapped exception.

        :param worker_input: object providing ``url_split``, ``depth``,
            ``site_origin``, ``should_crawl`` and ``content_check``.
        :return: the ``PageCrawl`` describing what happened.
        """
        target_split = worker_input.url_split
        missing_content = []
        erroneous_content = []
        page_crawl = None

        def failed_crawl(status, is_timeout, exception, response_time,
                         process_time):
            # Every unsuccessful outcome shares the remaining fields.
            return PageCrawl(
                original_url_split=target_split,
                final_url_split=None, status=status,
                is_timeout=is_timeout, is_redirect=False, links=[],
                exception=exception, is_html=False,
                depth=worker_input.depth,
                response_time=response_time,
                process_time=process_time,
                site_origin=worker_input.site_origin)

        try:
            response = open_url(
                self.urlopen, self.request_class, target_split.geturl(),
                self.worker_config.timeout, self.timeout_exception,
                self.auth_header,
                extra_headers=self.worker_config.extra_headers,
                logger=self.logger)

            if not response.exception:
                final_url_split = get_clean_url_split(response.final_url)

                message = response.content.info()
                mime_type = get_content_type(message)
                charset = (get_charset(message)
                           if self.worker_config.prefer_server_encoding
                           else None)

                is_html = mime_type == HTML_MIME_TYPE
                links = []
                process_time = None

                if not (is_html and worker_input.should_crawl):
                    self.logger.debug(
                        "Won't crawl %s. MIME Type: %s. Should crawl: %s",
                        final_url_split, mime_type,
                        worker_input.should_crawl)
                    if self._has_content_to_check(worker_input):
                        text_content = self.get_text_content(
                            response.content.read(), charset)
                        missing_content, erroneous_content = \
                            self.check_content(
                                text_content, None, target_split,
                                final_url_split, worker_input.content_check)
                else:
                    # Parsing and checking are timed together.
                    started_at = time.time()
                    html_soup = BeautifulSoup(
                        response.content, self.worker_config.parser,
                        from_encoding=charset)
                    links = self.get_links(html_soup, final_url_split)
                    if self._has_content_to_check(worker_input):
                        missing_content, erroneous_content = \
                            self.check_content(
                                unicode(html_soup), html_soup, target_split,
                                final_url_split, worker_input.content_check)
                    process_time = time.time() - started_at

                page_crawl = PageCrawl(
                    original_url_split=target_split,
                    final_url_split=final_url_split, status=response.status,
                    is_timeout=False, is_redirect=response.is_redirect,
                    links=links, exception=None, is_html=is_html,
                    depth=worker_input.depth,
                    response_time=response.response_time,
                    process_time=process_time,
                    site_origin=worker_input.site_origin,
                    missing_content=missing_content,
                    erroneous_content=erroneous_content)
            elif response.status:
                # This is a http error. Good.
                page_crawl = failed_crawl(
                    status=response.status, is_timeout=False,
                    exception=None,
                    response_time=response.response_time,
                    process_time=None)
            elif response.is_timeout:
                # This is a timeout. No need to wrap the exception
                page_crawl = failed_crawl(
                    status=None, is_timeout=True, exception=None,
                    response_time=response.response_time,
                    process_time=0)
            else:
                # Something bad happened when opening the url
                wrapped = ExceptionStr(
                    unicode(type(response.exception)),
                    unicode(response.exception))
                page_crawl = failed_crawl(
                    status=None, is_timeout=False, exception=wrapped,
                    response_time=response.response_time,
                    process_time=0)
        except Exception as exc:
            wrapped = ExceptionStr(unicode(type(exc)), unicode(exc))
            page_crawl = failed_crawl(
                status=None, is_timeout=False, exception=wrapped,
                response_time=None, process_time=None)
            self.logger.exception("Exception occurred while crawling a page.")

        return page_crawl