Example #1
    def _build_multi_hosts(self, options, start_urls):
        hosts = {}

        extra_hosts = set()
        if options.accepted_hosts:
            for url in options.accepted_hosts.split(','):
                split_result = get_clean_url_split(url)
                extra_hosts.add(split_result.netloc)

        for start_url in start_urls:
            split_result = get_clean_url_split(start_url)
            host = split_result.netloc
            # Each start host may crawl itself plus the extra accepted hosts.
            hosts[host] = extra_hosts.union([host])

        return hosts
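The mapping this builds can be hard to picture from the code alone. Below is a minimal sketch under the assumption that get_clean_url_split(url).netloc behaves like urllib.parse.urlsplit with a default http scheme (the helper itself is not part of this excerpt); the hosts and URLs are invented for illustration:

    from urllib.parse import urlsplit

    def netloc(url):
        # Hypothetical stand-in for get_clean_url_split(url).netloc:
        # add a default scheme when missing so urlsplit can find the host.
        if '://' not in url and not url.startswith('//'):
            url = 'http://' + url
        return urlsplit(url).netloc

    start_urls = ['http://www.example.com/', 'http://blog.example.com/']
    accepted_hosts = 'cdn.example.com,static.example.com'

    extra_hosts = {netloc(url) for url in accepted_hosts.split(',')}
    hosts = {netloc(url): extra_hosts.union([netloc(url)]) for url in start_urls}
    # Each start host maps to the set of hosts it is allowed to crawl, e.g.
    # hosts['www.example.com'] ==
    #     {'www.example.com', 'cdn.example.com', 'static.example.com'}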
Example #2
    def get_links(self, html_soup, original_url_split):
        """Gets links for desired types (e.g., a, link, img, script)

        :param html_soup: The page parsed by BeautifulSoup
        :param original_url_split: The URL of the page used to resolve relative
                links.
        :rtype: A sequence of Link objects
        """

        # An optional <base> tag overrides the base URL used to resolve
        # relative links on the page.
        base_url_split = original_url_split

        bases = html_soup.find_all('base')
        if bases:
            base = bases[0]
            if 'href' in base.attrs:
                base_url_split = get_clean_url_split(base['href'])

        links = []
        for element_type in self.worker_config.types:
            if element_type not in TYPE_ATTRIBUTES:
                raise Exception(
                    "Unknown element type: {0}".format(element_type))
            attribute = TYPE_ATTRIBUTES[element_type]
            element_links = html_soup.find_all(element_type)
            links.extend(self._get_links(
                element_links, attribute, base_url_split, original_url_split))
        return links
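TYPE_ATTRIBUTES is referenced but not defined in this excerpt. Judging from the element types named in the docstring, it is presumably a mapping from tag name to the attribute that carries the link target; the exact contents below are an assumption, not the library's definition:

    # Assumed shape of TYPE_ATTRIBUTES, inferred from the docstring above.
    TYPE_ATTRIBUTES = {
        'a': 'href',
        'link': 'href',
        'img': 'src',
        'script': 'src',
    }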
Example #3
    def get_links(self, html_soup, original_url_split):
        """Gets links for desired types (e.g., a, link, img, script)

        :param html_soup: The page parsed by BeautifulSoup
        :param original_url_split: The URL of the page used to resolve relative
                links.
        :rtype: A sequence of Link objects
        """

        # An optional <base> tag overrides the base URL used to resolve
        # relative links on the page.
        base_url_split = original_url_split

        bases = html_soup.find_all('base')
        if bases:
            base = bases[0]
            if 'href' in base.attrs:
                base_url_split = get_clean_url_split(base['href'])

        links = []
        for element_type in self.worker_config.types:
            if element_type not in TYPE_ATTRIBUTES:
                raise Exception(
                    "Unknown element type: {0}".format(element_type))
            attribute = TYPE_ATTRIBUTES[element_type]
            element_links = html_soup.find_all(element_type)
            links.extend(
                self._get_links(element_links, attribute, base_url_split,
                                original_url_split))
        return links
Example #4
    def _get_prefix_content(self, content, prefix=None):
        if not prefix:
            index = content.find(",")
            prefix = get_clean_url_split(content[:index])
            content = content[index+1:]

        return (prefix, content)
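A standalone sketch of the split performed above, with a made-up content string of the form "<url>,<text>" (the call to get_clean_url_split is left out so the snippet stays self-contained):

    def split_prefix_content(content, prefix=None):
        # Same logic as above: the text before the first comma is the URL
        # prefix; everything after it is the content to check.
        if not prefix:
            index = content.find(',')
            prefix = content[:index]
            content = content[index + 1:]
        return prefix, content

    print(split_prefix_content('http://www.example.com,Welcome!'))
    # ('http://www.example.com', 'Welcome!')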
Example #5
    def _build_single_hosts(self, options, start_urls):
        hosts = set()
        urls = []

        if options.accepted_hosts:
            urls = options.accepted_hosts.split(',')
        urls = urls + start_urls

        for url in urls:
            split_result = get_clean_url_split(url)
            hosts.add(split_result.netloc)

        return hosts
Example #6
    def test_get_absolute_url(self):
        base_url_split = get_clean_url_split(
            "https://www.example.com/hello/index.html")
        self.assertEqual(
            "https://www.example2.com/test.js",
            get_absolute_url_split(
                "//www.example2.com/test.js", base_url_split).geturl())
        self.assertEqual(
            "https://www.example.com/hello2/test.html",
            get_absolute_url_split(
                "/hello2/test.html", base_url_split).geturl())
        self.assertEqual(
            "https://www.example.com/hello/test.html",
            get_absolute_url_split("test.html", base_url_split).geturl())
        self.assertEqual(
            "https://www.example.com/test.html",
            get_absolute_url_split("../test.html", base_url_split).geturl())
Example #7
    def get_page_crawler(self, url):
        url = self.get_url(url)
        url_split = get_clean_url_split(url)
        input_queue = compat.Queue.Queue()
        output_queue = compat.Queue.Queue()

        worker_config = WorkerConfig(
            username=None, password=None, types=['a', 'img', 'link', 'script'],
            timeout=5, parser=PARSER_STDLIB,
            strict_mode=False, prefer_server_encoding=False,
            extra_headers=[])

        worker_init = WorkerInit(
            worker_config=worker_config,
            input_queue=input_queue, output_queue=output_queue,
            logger=get_logger())

        page_crawler = PageCrawler(worker_init)

        return page_crawler, url_split
Example #8
    def test_clean_url_split(self):
        self.assertEqual("http://www.example.com",
                         get_clean_url_split("www.example.com").geturl())
        self.assertEqual("http://www.example.com",
                         get_clean_url_split("//www.example.com").geturl())
        self.assertEqual(
            "http://www.example.com",
            get_clean_url_split("http://www.example.com").geturl())

        self.assertEqual("http://www.example.com/",
                         get_clean_url_split("www.example.com/").geturl())
        self.assertEqual("http://www.example.com/",
                         get_clean_url_split("//www.example.com/").geturl())
        self.assertEqual(
            "http://www.example.com/",
            get_clean_url_split("http://www.example.com/").geturl())
        self.assertEqual(
            "http://www.example.com/media%20gallery",
            get_clean_url_split(
                "http://www.example.com/media gallery").geturl())
Example #9
    def test_clean_url_split(self):
        self.assertEqual(
            "http://www.example.com",
            get_clean_url_split("www.example.com").geturl())
        self.assertEqual(
            "http://www.example.com",
            get_clean_url_split("//www.example.com").geturl())
        self.assertEqual(
            "http://www.example.com",
            get_clean_url_split("http://www.example.com").geturl())

        self.assertEqual(
            "http://www.example.com/",
            get_clean_url_split("www.example.com/").geturl())
        self.assertEqual(
            "http://www.example.com/",
            get_clean_url_split("//www.example.com/").geturl())
        self.assertEqual(
            "http://www.example.com/",
            get_clean_url_split("http://www.example.com/").geturl())
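The two tests above pin down the observable behaviour of get_clean_url_split: a URL without a scheme (or with only a leading //) defaults to http, and spaces in the path are percent-encoded. A rough sketch that reproduces just these cases, not the library's actual implementation:

    from urllib.parse import quote, urlsplit

    def clean_url_split(url):
        # Add a default http scheme when none is given, then
        # percent-encode characters such as spaces in the path.
        if not url.startswith(('http://', 'https://')):
            url = 'http:' + url if url.startswith('//') else 'http://' + url
        split = urlsplit(url)
        return split._replace(path=quote(split.path))

    print(clean_url_split('www.example.com/media gallery').geturl())
    # http://www.example.com/media%20gallery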
Example #10
    def _crawl_page(self, worker_input):
        page_crawl = None
        erroneous_content = []
        missing_content = []
        url_split_to_crawl = worker_input.url_split

        try:
            response = open_url(
                self.urlopen, self.request_class,
                url_split_to_crawl.geturl(), self.worker_config.timeout,
                self.timeout_exception, self.auth_header,
                extra_headers=self.worker_config.extra_headers,
                logger=self.logger)

            if response.exception:
                if response.status:
                    # This is an HTTP error: there is a status code to report.
                    page_crawl = PageCrawl(
                        original_url_split=url_split_to_crawl,
                        final_url_split=None, status=response.status,
                        is_timeout=False, is_redirect=False, links=[],
                        exception=None, is_html=False,
                        depth=worker_input.depth,
                        response_time=response.response_time,
                        process_time=None,
                        site_origin=worker_input.site_origin)
                elif response.is_timeout:
                    # This is a timeout. No need to wrap the exception
                    page_crawl = PageCrawl(
                        original_url_split=url_split_to_crawl,
                        final_url_split=None, status=None,
                        is_timeout=True, is_redirect=False, links=[],
                        exception=None, is_html=False,
                        depth=worker_input.depth,
                        response_time=response.response_time,
                        process_time=0,
                        site_origin=worker_input.site_origin)
                else:
                    # Something bad happened when opening the url
                    exception = ExceptionStr(
                        unicode(type(response.exception)),
                        unicode(response.exception))
                    page_crawl = PageCrawl(
                        original_url_split=url_split_to_crawl,
                        final_url_split=None, status=None,
                        is_timeout=False, is_redirect=False, links=[],
                        exception=exception, is_html=False,
                        depth=worker_input.depth,
                        response_time=response.response_time,
                        process_time=0,
                        site_origin=worker_input.site_origin)
            else:
                final_url_split = get_clean_url_split(response.final_url)

                message = response.content.info()
                mime_type = get_content_type(message)
                if self.worker_config.prefer_server_encoding:
                    charset = get_charset(message)
                else:
                    charset = None
                links = []

                is_html = mime_type == HTML_MIME_TYPE
                process_time = None

                if is_html and worker_input.should_crawl:
                    start = time.time()
                    html_soup = BeautifulSoup(
                        response.content, self.worker_config.parser,
                        from_encoding=charset)
                    links = self.get_links(html_soup, final_url_split)
                    if self._has_content_to_check(worker_input):
                        (missing_content, erroneous_content) =\
                            self.check_content(
                                unicode(html_soup), html_soup,
                                url_split_to_crawl,
                                final_url_split, worker_input.content_check)
                    process_time = time.time() - start
                else:
                    self.logger.debug(
                        "Won't crawl %s. MIME Type: %s. Should crawl: %s",
                        final_url_split, mime_type,
                        worker_input.should_crawl)
                    if self._has_content_to_check(worker_input):
                        text_content = self.get_text_content(
                            response.content.read(), charset)
                        (missing_content, erroneous_content) =\
                            self.check_content(
                                text_content, None, url_split_to_crawl,
                                final_url_split, worker_input.content_check)

                page_crawl = PageCrawl(
                    original_url_split=url_split_to_crawl,
                    final_url_split=final_url_split, status=response.status,
                    is_timeout=False, is_redirect=response.is_redirect,
                    links=links, exception=None, is_html=is_html,
                    depth=worker_input.depth,
                    response_time=response.response_time,
                    process_time=process_time,
                    site_origin=worker_input.site_origin,
                    missing_content=missing_content,
                    erroneous_content=erroneous_content)
        except Exception as exc:
            exception = ExceptionStr(unicode(type(exc)), unicode(exc))
            page_crawl = PageCrawl(
                original_url_split=url_split_to_crawl,
                final_url_split=None, status=None,
                is_timeout=False, is_redirect=False, links=[],
                exception=exception, is_html=False,
                depth=worker_input.depth,
                response_time=None,
                process_time=None,
                site_origin=worker_input.site_origin)
            self.logger.exception("Exception occurred while crawling a page.")

        return page_crawl
Example #11
    def _crawl_page(self, worker_input):
        page_crawl = None
        erroneous_content = []
        missing_content = []
        url_split_to_crawl = worker_input.url_split

        try:
            response = open_url(self.urlopen,
                                self.request_class,
                                url_split_to_crawl.geturl(),
                                self.worker_config.timeout,
                                self.timeout_exception,
                                self.auth_header,
                                extra_headers=self.worker_config.extra_headers,
                                logger=self.logger)

            if response.exception:
                if response.status:
                    # This is an HTTP error: there is a status code to report.
                    page_crawl = PageCrawl(
                        original_url_split=url_split_to_crawl,
                        final_url_split=None,
                        status=response.status,
                        is_timeout=False,
                        is_redirect=False,
                        links=[],
                        exception=None,
                        is_html=False,
                        depth=worker_input.depth,
                        response_time=response.response_time,
                        process_time=None,
                        site_origin=worker_input.site_origin)
                elif response.is_timeout:
                    # This is a timeout. No need to wrap the exception
                    page_crawl = PageCrawl(
                        original_url_split=url_split_to_crawl,
                        final_url_split=None,
                        status=None,
                        is_timeout=True,
                        is_redirect=False,
                        links=[],
                        exception=None,
                        is_html=False,
                        depth=worker_input.depth,
                        response_time=response.response_time,
                        process_time=0,
                        site_origin=worker_input.site_origin)
                else:
                    # Something bad happened when opening the url
                    exception = ExceptionStr(unicode(type(response.exception)),
                                             unicode(response.exception))
                    page_crawl = PageCrawl(
                        original_url_split=url_split_to_crawl,
                        final_url_split=None,
                        status=None,
                        is_timeout=False,
                        is_redirect=False,
                        links=[],
                        exception=exception,
                        is_html=False,
                        depth=worker_input.depth,
                        response_time=response.response_time,
                        process_time=0,
                        site_origin=worker_input.site_origin)
            else:
                final_url_split = get_clean_url_split(response.final_url)

                message = response.content.info()
                mime_type = get_content_type(message)
                if self.worker_config.prefer_server_encoding:
                    charset = get_charset(message)
                else:
                    charset = None
                links = []

                is_html = mime_type == HTML_MIME_TYPE
                process_time = None

                if is_html and worker_input.should_crawl:
                    start = time.time()
                    html_soup = BeautifulSoup(response.content,
                                              self.worker_config.parser,
                                              from_encoding=charset)
                    links = self.get_links(html_soup, final_url_split)
                    if self._has_content_to_check(worker_input):
                        (missing_content, erroneous_content) =\
                            self.check_content(
                                unicode(html_soup), html_soup,
                                url_split_to_crawl,
                                final_url_split, worker_input.content_check)
                    process_time = time.time() - start
                else:
                    self.logger.debug(
                        "Won't crawl %s. MIME Type: %s. Should crawl: %s",
                        final_url_split, mime_type, worker_input.should_crawl)
                    if self._has_content_to_check(worker_input):
                        text_content = self.get_text_content(
                            response.content.read(), charset)
                        (missing_content, erroneous_content) =\
                            self.check_content(
                                text_content, None, url_split_to_crawl,
                                final_url_split, worker_input.content_check)

                page_crawl = PageCrawl(original_url_split=url_split_to_crawl,
                                       final_url_split=final_url_split,
                                       status=response.status,
                                       is_timeout=False,
                                       is_redirect=response.is_redirect,
                                       links=links,
                                       exception=None,
                                       is_html=is_html,
                                       depth=worker_input.depth,
                                       response_time=response.response_time,
                                       process_time=process_time,
                                       site_origin=worker_input.site_origin,
                                       missing_content=missing_content,
                                       erroneous_content=erroneous_content)
        except Exception as exc:
            exception = ExceptionStr(unicode(type(exc)), unicode(exc))
            page_crawl = PageCrawl(original_url_split=url_split_to_crawl,
                                   final_url_split=None,
                                   status=None,
                                   is_timeout=False,
                                   is_redirect=False,
                                   links=[],
                                   exception=exception,
                                   is_html=False,
                                   depth=worker_input.depth,
                                   response_time=None,
                                   process_time=None,
                                   site_origin=worker_input.site_origin)
            self.logger.exception("Exception occurred while crawling a page.")

        return page_crawl
Example #12
    def _process_start_urls(self):
        for start_url in self.start_urls:
            self.start_url_splits.append(get_clean_url_split(start_url))