Example #1
0
    def get_links(self, html_soup, original_url_split):
        """Get Links for the configured element types (e.g., a, link, img,
        script).

        :param html_soup: The page parsed by BeautifulSoup
        :param original_url_split: The URL of the page used to resolve relative
                links.
        :rtype: A sequence of Link objects
        :raises ValueError: if a configured element type is not present in
                TYPE_ATTRIBUTES.
        """

        # The <base> tag, if present with an href, overrides the URL used
        # to resolve relative links on this page.
        base_url_split = original_url_split

        bases = html_soup.find_all('base')
        if bases:
            base = bases[0]
            if 'href' in base.attrs:
                base_url_split = get_clean_url_split(base['href'])

        links = []
        for element_type in self.worker_config.types:
            if element_type not in TYPE_ATTRIBUTES:
                # ValueError is more precise than a bare Exception for a bad
                # configuration value and is still caught by callers that
                # handle Exception.
                raise ValueError("Unknown element type: {0}".
                        format(element_type))
            attribute = TYPE_ATTRIBUTES[element_type]
            element_links = html_soup.find_all(element_type)
            links.extend(self._get_links(element_links, attribute,
                    base_url_split, original_url_split))
        return links
Example #2
0
    def get_links(self, html_soup, original_url_split):
        """Get Link for desired types (e.g., a, link, img, script)

        :param html_soup: The page parsed by BeautifulSoup
        :param original_url_split: The URL of the page used to resolve relative
                links.
        :rtype: A sequence of Link objects
        """

        # A <base> tag with an href, when present, overrides the URL
        # against which relative links are resolved.
        base_url_split = original_url_split
        base_elements = html_soup.find_all('base')
        if base_elements and 'href' in base_elements[0].attrs:
            base_url_split = get_clean_url_split(base_elements[0]['href'])

        collected = []
        for element_type in self.worker_config.types:
            if element_type not in TYPE_ATTRIBUTES:
                raise Exception("Unknown element type: {0}".
                        format(element_type))
            collected.extend(self._get_links(
                    html_soup.find_all(element_type),
                    TYPE_ATTRIBUTES[element_type],
                    base_url_split, original_url_split))
        return collected
Example #3
0
 def __init__(self, config, logger):
     """Initialize crawler state: cleaned start URLs, queues, and site."""
     self.config = config
     # Normalize each configured start URL into its clean split form.
     self.start_url_splits = [
         get_clean_url_split(start_url) for start_url in config.start_urls]
     self.workers = []
     self.input_queue = self.build_queue(config)
     self.output_queue = self.build_queue(config)
     self.logger = logger
     self.site = Site(self.start_url_splits, config, self.logger)
Example #4
0
 def __init__(self, config, logger):
     """Build the crawler's runtime state from the given config and logger."""
     self.config = config
     # Every start URL is cleaned into a split form before crawling begins.
     self.start_url_splits = list(
         map(get_clean_url_split, config.start_urls))
     self.workers = []
     self.input_queue = self.build_queue(config)
     self.output_queue = self.build_queue(config)
     self.logger = logger
     self.site = Site(self.start_url_splits, config, self.logger)
Example #5
0
    def _build_accepted_hosts(self, options, start_urls):
        """Return the set of netlocs accepted for crawling.

        Merges the comma-separated ``accepted_hosts`` option with the
        hosts of the start URLs.
        """
        # NOTE(review): the ``options`` parameter is unused; the method
        # reads ``self.options`` instead — confirm this is intentional.
        extra_urls = []
        if self.options.accepted_hosts:
            extra_urls = self.options.accepted_hosts.split(',')

        return set(
            get_clean_url_split(url).netloc
            for url in extra_urls + start_urls)
Example #6
0
 def test_get_absolute_url(self):
     """Protocol-relative, absolute-path, relative, and parent-relative
     URLs all resolve correctly against the base URL."""
     base_url_split = get_clean_url_split(
             "https://www.example.com/hello/index.html")
     cases = [
         ("//www.example2.com/test.js",
          "https://www.example2.com/test.js"),
         ("/hello2/test.html",
          "https://www.example.com/hello2/test.html"),
         ("test.html",
          "https://www.example.com/hello/test.html"),
         ("../test.html",
          "https://www.example.com/test.html"),
     ]
     for relative_url, expected in cases:
         self.assertEqual(
             expected,
             get_absolute_url_split(relative_url, base_url_split).geturl())
Example #7
0
    def _build_accepted_hosts(self, options, start_urls):
        """Compute the set of host names (netlocs) the crawler accepts.

        The comma-separated ``accepted_hosts`` option is combined with the
        hosts of all start URLs.
        """
        # NOTE(review): ``options`` is ignored here in favor of
        # ``self.options`` — verify that this is the intended source.
        candidate_urls = list(start_urls)
        if self.options.accepted_hosts:
            candidate_urls = (
                self.options.accepted_hosts.split(',') + candidate_urls)

        hosts = set()
        for candidate in candidate_urls:
            hosts.add(get_clean_url_split(candidate).netloc)

        return hosts
Example #8
0
 def test_get_absolute_url(self):
     """URLs of several relative forms resolve correctly against a base."""
     base = get_clean_url_split(
             "https://www.example.com/hello/index.html")
     expectations = {
         "//www.example2.com/test.js": "https://www.example2.com/test.js",
         "/hello2/test.html": "https://www.example.com/hello2/test.html",
         "test.html": "https://www.example.com/hello/test.html",
         "../test.html": "https://www.example.com/test.html",
     }
     for relative_url, absolute_url in expectations.items():
         resolved = get_absolute_url_split(relative_url, base).geturl()
         self.assertEqual(absolute_url, resolved)
Example #9
0
    def get_page_crawler(self, url):
        """Build a PageCrawler ready to crawl *url*; also return the
        cleaned url split."""
        full_url = self.get_url(url)
        url_split = get_clean_url_split(full_url)

        # Standalone input/output queues so the crawler can run in
        # isolation; crawl the usual link-bearing elements.
        worker_init = WorkerInit(
                worker_config=WorkerConfig(
                    username=None, password=None,
                    types=['a', 'img', 'link', 'script'],
                    timeout=5, parser=PARSER_STDLIB),
                input_queue=compat.Queue.Queue(),
                output_queue=compat.Queue.Queue(),
                logger=get_logger())

        return PageCrawler(worker_init), url_split
Example #10
0
    def get_page_crawler(self, url):
        """Create a PageCrawler for *url* and return it with the url split."""
        resolved_url = self.get_url(url)
        url_split = get_clean_url_split(resolved_url)

        input_queue = compat.Queue.Queue()
        output_queue = compat.Queue.Queue()

        # Crawl the usual link-bearing elements with a short timeout and
        # non-strict parsing.
        config = WorkerConfig(
                username=None, password=None,
                types=['a', 'img', 'link', 'script'], timeout=5,
                parser=PARSER_STDLIB, strict_mode=False)

        init = WorkerInit(
                worker_config=config, input_queue=input_queue,
                output_queue=output_queue, logger=get_logger())

        return PageCrawler(init), url_split
Example #11
0
    def test_clean_url_split(self):
        """Scheme-less and protocol-relative URLs gain http://; a trailing
        slash is preserved."""
        for expected, raw in [
                ("http://www.example.com", "www.example.com"),
                ("http://www.example.com", "//www.example.com"),
                ("http://www.example.com", "http://www.example.com"),
                ("http://www.example.com/", "www.example.com/"),
                ("http://www.example.com/", "//www.example.com/"),
                ("http://www.example.com/", "http://www.example.com/")]:
            self.assertEqual(expected, get_clean_url_split(raw).geturl())
Example #12
0
    def test_clean_url_split(self):
        """get_clean_url_split normalizes bare and protocol-relative URLs
        to http://, with and without a trailing slash."""
        bare_variants = ("www.example.com", "//www.example.com",
                         "http://www.example.com")
        for variant in bare_variants:
            self.assertEqual("http://www.example.com",
                get_clean_url_split(variant).geturl())

        slash_variants = ("www.example.com/", "//www.example.com/",
                          "http://www.example.com/")
        for variant in slash_variants:
            self.assertEqual("http://www.example.com/",
                get_clean_url_split(variant).geturl())
Example #13
0
    def _crawl_page(self, worker_input):
        """Fetch a single page and wrap the outcome in a PageCrawl result.

        Never raises: HTTP errors, timeouts, and unexpected exceptions are
        all captured in the fields of the returned PageCrawl.

        :param worker_input: carries the url_split to fetch and the
                should_crawl flag that gates link extraction.
        :rtype: PageCrawl
        """
        page_crawl = None
        url_split_to_crawl = worker_input.url_split

        try:
            # Perform the fetch; open_url reports failures through the
            # response object rather than raising.
            response = open_url(self.urlopen, self.request_class,
                    url_split_to_crawl.geturl(), self.worker_config.timeout,
                    self.timeout_exception, self.worker_config.user_agent, self.auth_header)

            if response.exception:
                if response.status:
                    # This is a http error (status code present). Good:
                    # record the status with no exception details.
                    page_crawl = PageCrawl(
                            original_url_split=url_split_to_crawl,
                            final_url_split=None, status=response.status,
                            is_timeout=False, is_redirect=False, links=[],
                            exception=None, is_html=False)
                elif response.is_timeout:
                    # This is a timeout. No need to wrap the exception
                    page_crawl = PageCrawl(
                            original_url_split=url_split_to_crawl,
                            final_url_split=None, status=None,
                            is_timeout=True, is_redirect=False, links=[],
                            exception=None, is_html=False)
                else:
                    # Something bad happened when opening the url: keep a
                    # stringified form of the exception for reporting.
                    # NOTE(review): ``unicode`` implies Python 2 or a compat
                    # shim imported elsewhere — confirm.
                    exception = ExceptionStr(unicode(type(response.exception)),
                        unicode(response.exception))
                    page_crawl = PageCrawl(
                            original_url_split=url_split_to_crawl,
                            final_url_split=None, status=None,
                            is_timeout=False, is_redirect=False, links=[],
                            exception=exception, is_html=False)
            else:
                # Success path: the final URL may differ from the requested
                # one after redirects.
                final_url_split = get_clean_url_split(response.final_url)

                mime_type = get_content_type(response.content.info())
                links = []

                is_html = mime_type == HTML_MIME_TYPE

                # Only parse and extract links from HTML pages the caller
                # asked us to crawl.
                if is_html and worker_input.should_crawl:
                    html_soup = BeautifulSoup(response.content,
                            self.worker_config.parser)
                    links = self.get_links(html_soup, final_url_split)
                else:
                    self.logger.debug("Won't crawl %s. MIME Type: %s. Should crawl: %s",
                            final_url_split, mime_type,
                            worker_input.should_crawl)

                page_crawl = PageCrawl(original_url_split=url_split_to_crawl,
                    final_url_split=final_url_split, status=response.status,
                    is_timeout=False, is_redirect=response.is_redirect,
                    links=links, exception=None, is_html=is_html)
        except Exception as exc:
            # Catch-all boundary: crawling must not kill the worker, so any
            # unexpected error is stringified into the result and logged.
            exception = ExceptionStr(unicode(type(exc)), unicode(exc))
            page_crawl = PageCrawl(original_url_split=url_split_to_crawl,
                    final_url_split=None, status=None,
                    is_timeout=False, is_redirect=False, links=[],
                    exception=exception, is_html=False)
            self.logger.exception("Exception occurred while crawling a page.")

        return page_crawl
Example #14
0
    def _crawl_page(self, worker_input):
        """Fetch one page and convert the outcome into a PageCrawl.

        Classifies the fetch result as HTTP error, timeout, other failure,
        or success; on a successful HTML response, extracts the page's
        links. All failures are captured in the returned object — this
        method does not raise.

        :param worker_input: provides the url_split to crawl and the
                should_crawl flag controlling link extraction.
        :rtype: PageCrawl
        """
        page_crawl = None
        url_split_to_crawl = worker_input.url_split

        try:
            # open_url signals failures via the response object's fields
            # instead of raising.
            response = open_url(self.urlopen, self.request_class,
                    url_split_to_crawl.geturl(), self.worker_config.timeout,
                    self.timeout_exception, self.auth_header)

            if response.exception:
                if response.status:
                    # This is a http error. Good: a status code is enough,
                    # no exception details needed.
                    page_crawl = PageCrawl(
                            original_url_split=url_split_to_crawl,
                            final_url_split=None, status=response.status,
                            is_timeout=False, is_redirect=False, links=[],
                            exception=None, is_html=False)
                elif response.is_timeout:
                    # This is a timeout. No need to wrap the exception
                    page_crawl = PageCrawl(
                            original_url_split=url_split_to_crawl,
                            final_url_split=None, status=None,
                            is_timeout=True, is_redirect=False, links=[],
                            exception=None, is_html=False)
                else:
                    # Something bad happened when opening the url; keep a
                    # string form of the exception for the report.
                    # NOTE(review): ``unicode`` implies Python 2 or a
                    # compat shim imported elsewhere — confirm.
                    exception = ExceptionStr(unicode(type(response.exception)),
                        unicode(response.exception))
                    page_crawl = PageCrawl(
                            original_url_split=url_split_to_crawl,
                            final_url_split=None, status=None,
                            is_timeout=False, is_redirect=False, links=[],
                            exception=exception, is_html=False)
            else:
                # Success: redirects may leave us at a different final URL.
                final_url_split = get_clean_url_split(response.final_url)

                mime_type = get_content_type(response.content.info())
                links = []

                is_html = mime_type == HTML_MIME_TYPE

                # Links are only extracted from HTML pages that were
                # requested to be crawled.
                if is_html and worker_input.should_crawl:
                    html_soup = BeautifulSoup(response.content,
                            self.worker_config.parser)
                    links = self.get_links(html_soup, final_url_split)
                else:
                    self.logger.debug("Won't crawl %s. MIME Type: %s. Should crawl: %s",
                            final_url_split, mime_type,
                            worker_input.should_crawl)

                page_crawl = PageCrawl(original_url_split=url_split_to_crawl,
                    final_url_split=final_url_split, status=response.status,
                    is_timeout=False, is_redirect=response.is_redirect,
                    links=links, exception=None, is_html=is_html)
        except Exception as exc:
            # Broad catch is deliberate: a crawl failure must not kill the
            # worker. The error is stringified into the result and logged.
            exception = ExceptionStr(unicode(type(exc)), unicode(exc))
            page_crawl = PageCrawl(original_url_split=url_split_to_crawl,
                    final_url_split=None, status=None,
                    is_timeout=False, is_redirect=False, links=[],
                    exception=exception, is_html=False)
            self.logger.exception("Exception occurred while crawling a page.")

        return page_crawl