Example #1
    def _get_links(self, elements, attribute, base_url_split,
        original_url_split):
        links = []
        for element in elements:
            if attribute in element.attrs:
                url = element[attribute]
                if not is_link(url):
                    continue
                abs_url_split = get_absolute_url_split(url, base_url_split)

                if abs_url_split.scheme not in SUPPORTED_SCHEMES:
                    continue

                link = Link(type=unicode(element.name), url_split=abs_url_split,
                    original_url_split=original_url_split,
                    source_str=unicode(element))
                links.append(link)

        return links
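
This method walks the parsed HTML elements it is given (for example all a or img tags) and reads one attribute from each (href, src, ...). Every value is resolved against the base URL of the page, values that are not links or whose scheme is not in SUPPORTED_SCHEMES are dropped, and the rest are wrapped in Link objects recording the element type, the resolved URL, the page it was found on, and the element's source text. The following is a minimal standalone sketch of the same resolve-then-filter idea; it uses only the Python 3 standard library and a simplified (tag_name, attrs) element shape, so the names and signatures are illustrative rather than pylinkvalidator's actual API.

# Standalone sketch of resolve-then-filter link extraction (Python 3 stdlib only).
from urllib.parse import urljoin, urlsplit

SUPPORTED_SCHEMES = ("http", "https")  # assumed set of accepted schemes

def extract_links(elements, attribute, base_url):
    links = []
    for tag_name, attrs in elements:
        url = attrs.get(attribute)
        if not url:
            continue
        abs_url = urljoin(base_url, url)              # resolve relative URLs
        if urlsplit(abs_url).scheme not in SUPPORTED_SCHEMES:
            continue                                  # drop mailto:, javascript:, ...
        links.append((tag_name, abs_url))
    return links

# Only the http(s) target survives the scheme filter:
print(extract_links(
    [("a", {"href": "/about"}), ("a", {"href": "mailto:x@example.com"})],
    "href", "http://example.com/index.html"))
# [('a', 'http://example.com/about')]
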
Example #2
    def _get_links(self, elements, attribute, base_url_split,
        original_url_split):
        links = []
        for element in elements:
            if attribute in element.attrs:
                url = element[attribute]

                if not self.worker_config.strict_mode:
                    url = url.strip()

                if not is_link(url):
                    continue
                abs_url_split = get_absolute_url_split(url, base_url_split)

                if abs_url_split.scheme not in SUPPORTED_SCHEMES:
                    continue

                link = Link(type=unicode(element.name), url_split=abs_url_split,
                    original_url_split=original_url_split,
                    source_str=unicode(element))
                links.append(link)

        return links
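
The only difference from Example #1 is the whitespace handling: when strict mode is disabled in the worker configuration, the attribute value is stripped of surrounding whitespace before it reaches is_link and the scheme check, so slightly malformed markup still yields usable links.
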
Example #3
    def __init__(self, worker_init):
        self.worker_config = worker_init.worker_config
        self.input_queue = worker_init.input_queue
        self.output_queue = worker_init.output_queue
        self.urlopen = get_url_open()
        self.request_class = get_url_request()
        self.logger = worker_init.logger
        if not self.logger:
            # Get a new one!
            self.logger = get_logger()

        # We do this here to allow patching by gevent
        import socket
        self.timeout_exception = socket.timeout

        self.auth_header = None

        if self.worker_config.username and self.worker_config.password:
            base64string = unicode(base64.encodestring(
                    '{0}:{1}'.format(self.worker_config.username,
                    self.worker_config.password).encode("utf-8")), "utf-8")
            self.auth_header = ("Authorization", "Basic {0}".format(base64string))
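
This constructor copies the queues, configuration, and logger from the shared worker_init object (creating a logger when none was supplied), imports socket only at this point so that gevent monkey patching can substitute its own version, and prepares an HTTP Basic Authorization header when both a username and a password are configured. The code is Python 2 (unicode, base64.encodestring). A sketch of the same header built with Python 3 idioms, where base64.b64encode takes the place of the legacy encodestring, could look like this (basic_auth_header is an illustrative helper, not part of pylinkvalidator):

# Basic-Auth header with Python 3's base64.b64encode.
import base64

def basic_auth_header(username, password):
    token = base64.b64encode(
        "{0}:{1}".format(username, password).encode("utf-8")).decode("ascii")
    return ("Authorization", "Basic {0}".format(token))

print(basic_auth_header("alice", "secret"))
# ('Authorization', 'Basic YWxpY2U6c2VjcmV0')
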
Example #4
    def _crawl_page(self, worker_input):
        page_crawl = None
        url_split_to_crawl = worker_input.url_split

        try:
            response = open_url(self.urlopen, self.request_class,
                    url_split_to_crawl.geturl(), self.worker_config.timeout,
                    self.timeout_exception, self.worker_config.user_agent, self.auth_header)

            if response.exception:
                if response.status:
                    # This is an HTTP error. Good.
                    page_crawl = PageCrawl(
                            original_url_split=url_split_to_crawl,
                            final_url_split=None, status=response.status,
                            is_timeout=False, is_redirect=False, links=[],
                            exception=None, is_html=False)
                elif response.is_timeout:
                    # This is a timeout. No need to wrap the exception
                    page_crawl = PageCrawl(
                            original_url_split=url_split_to_crawl,
                            final_url_split=None, status=None,
                            is_timeout=True, is_redirect=False, links=[],
                            exception=None, is_html=False)
                else:
                    # Something bad happened when opening the url
                    exception = ExceptionStr(unicode(type(response.exception)),
                        unicode(response.exception))
                    page_crawl = PageCrawl(
                            original_url_split=url_split_to_crawl,
                            final_url_split=None, status=None,
                            is_timeout=False, is_redirect=False, links=[],
                            exception=exception, is_html=False)
            else:
                final_url_split = get_clean_url_split(response.final_url)

                mime_type = get_content_type(response.content.info())
                links = []

                is_html = mime_type == HTML_MIME_TYPE

                if is_html and worker_input.should_crawl:
                    html_soup = BeautifulSoup(response.content,
                            self.worker_config.parser)
                    links = self.get_links(html_soup, final_url_split)
                else:
                    self.logger.debug("Won't crawl %s. MIME Type: %s. Should crawl: %s",
                            final_url_split, mime_type,
                            worker_input.should_crawl)

                page_crawl = PageCrawl(original_url_split=url_split_to_crawl,
                    final_url_split=final_url_split, status=response.status,
                    is_timeout=False, is_redirect=response.is_redirect,
                    links=links, exception=None, is_html=is_html)
        except Exception as exc:
            exception = ExceptionStr(unicode(type(exc)), unicode(exc))
            page_crawl = PageCrawl(original_url_split=url_split_to_crawl,
                    final_url_split=None, status=None,
                    is_timeout=False, is_redirect=False, links=[],
                    exception=exception, is_html=False)
            self.logger.exception("Exception occurred while crawling a page.")

        return page_crawl
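
_crawl_page turns one queued URL into a PageCrawl result. It opens the URL with the configured timeout, user agent, and optional auth header, then classifies the outcome: an HTTP error status, a timeout, and any other failure each produce a PageCrawl recording what happened, while a successful response is checked for an HTML content type and, when worker_input.should_crawl is set, parsed with BeautifulSoup so its links can be extracted. A final catch-all converts unexpected exceptions into an ExceptionStr so the worker never dies on a single bad page. The condensed sketch below shows the same fetch-then-classify pattern using only the Python 3 standard library; FetchResult and fetch are illustrative names, not pylinkvalidator's.

# Condensed fetch-then-classify sketch (Python 3 stdlib only).
import socket
import urllib.error
import urllib.request
from collections import namedtuple

FetchResult = namedtuple("FetchResult", "status is_timeout is_html exception")

def fetch(url, timeout=10.0):
    try:
        response = urllib.request.urlopen(url, timeout=timeout)
        is_html = response.headers.get_content_type() == "text/html"
        return FetchResult(response.status, False, is_html, None)
    except urllib.error.HTTPError as exc:
        # The server answered with an error status: still a useful result.
        return FetchResult(exc.code, False, False, None)
    except socket.timeout:
        return FetchResult(None, True, False, None)
    except Exception as exc:
        # Anything else (DNS failure, connection refused, ...) is recorded as text.
        return FetchResult(None, False, False, str(exc))
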
Example #5
    def _crawl_page(self, worker_input):
        page_crawl = None
        url_split_to_crawl = worker_input.url_split

        try:
            response = open_url(self.urlopen, self.request_class,
                    url_split_to_crawl.geturl(), self.worker_config.timeout,
                    self.timeout_exception, self.auth_header)

            if response.exception:
                if response.status:
                    # This is an HTTP error. Good.
                    page_crawl = PageCrawl(
                            original_url_split=url_split_to_crawl,
                            final_url_split=None, status=response.status,
                            is_timeout=False, is_redirect=False, links=[],
                            exception=None, is_html=False)
                elif response.is_timeout:
                    # This is a timeout. No need to wrap the exception
                    page_crawl = PageCrawl(
                            original_url_split=url_split_to_crawl,
                            final_url_split=None, status=None,
                            is_timeout=True, is_redirect=False, links=[],
                            exception=None, is_html=False)
                else:
                    # Something bad happened when opening the url
                    exception = ExceptionStr(unicode(type(response.exception)),
                        unicode(response.exception))
                    page_crawl = PageCrawl(
                            original_url_split=url_split_to_crawl,
                            final_url_split=None, status=None,
                            is_timeout=False, is_redirect=False, links=[],
                            exception=exception, is_html=False)
            else:
                final_url_split = get_clean_url_split(response.final_url)

                mime_type = get_content_type(response.content.info())
                links = []

                is_html = mime_type == HTML_MIME_TYPE

                if is_html and worker_input.should_crawl:
                    html_soup = BeautifulSoup(response.content,
                            self.worker_config.parser)
                    links = self.get_links(html_soup, final_url_split)
                else:
                    self.logger.debug("Won't crawl %s. MIME Type: %s. Should crawl: %s",
                            final_url_split, mime_type,
                            worker_input.should_crawl)

                page_crawl = PageCrawl(original_url_split=url_split_to_crawl,
                    final_url_split=final_url_split, status=response.status,
                    is_timeout=False, is_redirect=response.is_redirect,
                    links=links, exception=None, is_html=is_html)
        except Exception as exc:
            exception = ExceptionStr(unicode(type(exc)), unicode(exc))
            page_crawl = PageCrawl(original_url_split=url_split_to_crawl,
                    final_url_split=None, status=None,
                    is_timeout=False, is_redirect=False, links=[],
                    exception=exception, is_html=False)
            self.logger.exception("Exception occurred while crawling a page.")

        return page_crawl
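
This last example differs from Example #4 only in the open_url call, which does not pass self.worker_config.user_agent; the error, timeout, and exception handling and the HTML check before parsing are identical.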