def _get_links(self, elements, attribute, base_url_split, original_url_split):
    """Build Link objects from the given tags' attribute values.

    For each element that carries ``attribute``, the value is resolved
    against ``base_url_split``; entries that are not links or whose
    scheme is unsupported are skipped.

    :param elements: iterable of parsed tags (each exposing ``.attrs``,
        ``.name``, and item access).
    :param attribute: attribute name holding the URL (e.g. href, src).
    :param base_url_split: split URL used to absolutize relative links.
    :param original_url_split: split URL of the page being crawled,
        recorded on every produced Link.
    :rtype: list of Link
    """
    collected = []
    strict = self.worker_config.strict_mode
    for tag in elements:
        if attribute not in tag.attrs:
            continue
        candidate = tag[attribute]
        # Outside strict mode, tolerate surrounding whitespace in the
        # attribute value.
        if not strict:
            candidate = candidate.strip()
        if not is_link(candidate):
            continue
        resolved = get_absolute_url_split(candidate, base_url_split)
        if not is_supported_scheme(
                resolved, self.worker_config.ignore_bad_tel_urls):
            continue
        collected.append(Link(
            type=unicode(tag.name), url_split=resolved,
            original_url_split=original_url_split,
            source_str=unicode(tag)))
    return collected
def __init__(self, worker_init):
    """Initialize the page-crawling worker from a WorkerInit bundle.

    Copies the worker configuration and the input/output queues, resolves
    the URL-opening callables, obtains a logger (creating one if the
    bundle did not provide any), captures the socket timeout exception,
    and precomputes the HTTP Basic Authorization header when credentials
    are configured.

    :param worker_init: object exposing ``worker_config``,
        ``input_queue``, ``output_queue`` and ``logger``.
    """
    self.worker_config = worker_init.worker_config
    self.input_queue = worker_init.input_queue
    self.output_queue = worker_init.output_queue
    self.urlopen = get_url_open()
    self.request_class = get_url_request()

    self.logger = worker_init.logger
    if not self.logger:
        # Get a new one!
        self.logger = get_logger()

    # We do this here to allow patching by gevent
    import socket
    self.timeout_exception = socket.timeout

    self.auth_header = None

    if self.worker_config.username and self.worker_config.password:
        # Use b64encode rather than the deprecated encodestring:
        # encodestring appends a trailing newline (and wraps long
        # input every 76 chars), which would corrupt the header value.
        base64string = unicode(
            base64.b64encode(
                '{0}:{1}'.format(
                    self.worker_config.username,
                    self.worker_config.password)
                .encode("utf-8")),
            "utf-8")
        self.auth_header = ("Authorization",
                            "Basic {0}".format(base64string))
def _get_links(self, elements, attribute, base_url_split, original_url_split):
    """Extract Link objects from the tags that carry ``attribute``.

    Each attribute value is resolved against ``base_url_split``; values
    that are not links, or whose absolute scheme is not in
    SUPPORTED_SCHEMES, are dropped.

    :param elements: iterable of parsed tags (each exposing ``.attrs``,
        ``.name``, and item access).
    :param attribute: attribute name holding the URL (e.g. href, src).
    :param base_url_split: split URL used to absolutize relative links.
    :param original_url_split: split URL of the page being crawled,
        recorded on every produced Link.
    :rtype: list of Link
    """
    found = []
    for tag in elements:
        if attribute not in tag.attrs:
            continue
        raw_url = tag[attribute]
        # Outside strict mode, tolerate surrounding whitespace in the
        # attribute value.
        if not self.worker_config.strict_mode:
            raw_url = raw_url.strip()
        if not is_link(raw_url):
            continue
        absolute = get_absolute_url_split(raw_url, base_url_split)
        if absolute.scheme not in SUPPORTED_SCHEMES:
            continue
        found.append(Link(
            type=unicode(tag.name), url_split=absolute,
            original_url_split=original_url_split,
            source_str=unicode(tag)))
    return found
def __init__(self, worker_init):
    """Initialize the page-crawling worker from a WorkerInit bundle.

    Copies the worker configuration and the input/output queues, resolves
    the URL-opening callables, obtains a logger (creating one if the
    bundle did not provide any), captures the socket timeout exception,
    and precomputes the HTTP Basic Authorization header when credentials
    are configured.

    :param worker_init: object exposing ``worker_config``,
        ``input_queue``, ``output_queue`` and ``logger``.
    """
    self.worker_config = worker_init.worker_config
    self.input_queue = worker_init.input_queue
    self.output_queue = worker_init.output_queue
    self.urlopen = get_url_open()
    self.request_class = get_url_request()

    self.logger = worker_init.logger
    if not self.logger:
        # Get a new one!
        self.logger = get_logger()

    # We do this here to allow patching by gevent
    import socket
    self.timeout_exception = socket.timeout

    self.auth_header = None

    if self.worker_config.username and self.worker_config.password:
        # Use b64encode rather than the deprecated encodestring:
        # encodestring appends a trailing newline (and wraps long
        # input every 76 chars), which would corrupt the header value.
        base64string = unicode(
            base64.b64encode('{0}:{1}'.format(
                self.worker_config.username,
                self.worker_config.password).encode("utf-8")),
            "utf-8")
        self.auth_header = ("Authorization",
                            "Basic {0}".format(base64string))
def _crawl_page(self, worker_input):
    """Fetch one URL and summarize the outcome as a PageCrawl.

    Opens ``worker_input.url_split``, and depending on the response
    produces exactly one PageCrawl covering one of: HTTP error status,
    timeout, open failure, successful fetch (with links extracted and
    content checks applied when the page is HTML and crawlable), or an
    unexpected exception raised during processing.

    :param worker_input: carries the URL split, crawl depth, crawl/check
        flags, content-check spec and site origin.
    :returns: a PageCrawl describing the result; never None on exit.
    """
    page_crawl = None
    erroneous_content = []
    missing_content = []
    url_split_to_crawl = worker_input.url_split

    try:
        response = open_url(
            self.urlopen, self.request_class,
            url_split_to_crawl.geturl(), self.worker_config.timeout,
            self.timeout_exception, self.auth_header,
            extra_headers=self.worker_config.extra_headers,
            logger=self.logger)

        if response.exception:
            if response.status:
                # This is a http error. Good.
                page_crawl = PageCrawl(
                    original_url_split=url_split_to_crawl,
                    final_url_split=None, status=response.status,
                    is_timeout=False, is_redirect=False, links=[],
                    exception=None, is_html=False,
                    depth=worker_input.depth,
                    response_time=response.response_time,
                    process_time=None,
                    site_origin=worker_input.site_origin)
            elif response.is_timeout:
                # This is a timeout. No need to wrap the exception
                page_crawl = PageCrawl(
                    original_url_split=url_split_to_crawl,
                    final_url_split=None, status=None,
                    is_timeout=True, is_redirect=False, links=[],
                    exception=None, is_html=False,
                    depth=worker_input.depth,
                    response_time=response.response_time,
                    process_time=0,
                    site_origin=worker_input.site_origin)
            else:
                # Something bad happened when opening the url
                exception = ExceptionStr(
                    unicode(type(response.exception)),
                    unicode(response.exception))
                page_crawl = PageCrawl(
                    original_url_split=url_split_to_crawl,
                    final_url_split=None, status=None,
                    is_timeout=False, is_redirect=False, links=[],
                    exception=exception, is_html=False,
                    depth=worker_input.depth,
                    response_time=response.response_time,
                    process_time=0,
                    site_origin=worker_input.site_origin)
        else:
            # Successful fetch: resolve the final URL (after redirects).
            final_url_split = get_clean_url_split(response.final_url)

            message = response.content.info()
            mime_type = get_content_type(message)
            # Server-declared charset is honored only when configured.
            if self.worker_config.prefer_server_encoding:
                charset = get_charset(message)
            else:
                charset = None

            links = []

            is_html = mime_type == HTML_MIME_TYPE
            process_time = None

            if is_html and worker_input.should_crawl:
                # Parse the page, extract links, and time the work.
                start = time.time()
                html_soup = BeautifulSoup(
                    response.content, self.worker_config.parser,
                    from_encoding=charset)
                links = self.get_links(html_soup, final_url_split)
                if self._has_content_to_check(worker_input):
                    (missing_content, erroneous_content) =\
                        self.check_content(
                            unicode(html_soup), html_soup,
                            url_split_to_crawl, final_url_split,
                            worker_input.content_check)
                process_time = time.time() - start
            else:
                self.logger.debug(
                    "Won't crawl %s. MIME Type: %s. Should crawl: %s",
                    final_url_split, mime_type,
                    worker_input.should_crawl)
                # Non-HTML (or non-crawlable) pages may still need
                # plain-text content checks.
                if self._has_content_to_check(worker_input):
                    text_content = self.get_text_content(
                        response.content.read(), charset)
                    (missing_content, erroneous_content) =\
                        self.check_content(
                            text_content, None, url_split_to_crawl,
                            final_url_split, worker_input.content_check)

            page_crawl = PageCrawl(
                original_url_split=url_split_to_crawl,
                final_url_split=final_url_split,
                status=response.status, is_timeout=False,
                is_redirect=response.is_redirect, links=links,
                exception=None, is_html=is_html,
                depth=worker_input.depth,
                response_time=response.response_time,
                process_time=process_time,
                site_origin=worker_input.site_origin,
                missing_content=missing_content,
                erroneous_content=erroneous_content)
    except Exception as exc:
        # Catch-all boundary: wrap any unexpected failure so one bad
        # page never kills the worker; log with traceback.
        exception = ExceptionStr(unicode(type(exc)), unicode(exc))
        page_crawl = PageCrawl(
            original_url_split=url_split_to_crawl,
            final_url_split=None, status=None,
            is_timeout=False, is_redirect=False, links=[],
            exception=exception, is_html=False,
            depth=worker_input.depth,
            response_time=None, process_time=None,
            site_origin=worker_input.site_origin)
        self.logger.exception("Exception occurred while crawling a page.")

    return page_crawl
def _crawl_page(self, worker_input):
    """Fetch one URL and summarize the outcome as a PageCrawl.

    Opens ``worker_input.url_split``, and depending on the response
    produces exactly one PageCrawl covering one of: HTTP error status,
    timeout, open failure, successful fetch (with links extracted and
    content checks applied when the page is HTML and crawlable), or an
    unexpected exception raised during processing.

    :param worker_input: carries the URL split, crawl depth, crawl/check
        flags, content-check spec and site origin.
    :returns: a PageCrawl describing the result; never None on exit.
    """
    page_crawl = None
    erroneous_content = []
    missing_content = []
    url_split_to_crawl = worker_input.url_split

    try:
        response = open_url(
            self.urlopen, self.request_class,
            url_split_to_crawl.geturl(), self.worker_config.timeout,
            self.timeout_exception, self.auth_header,
            extra_headers=self.worker_config.extra_headers,
            logger=self.logger)

        if response.exception:
            if response.status:
                # This is a http error. Good.
                page_crawl = PageCrawl(
                    original_url_split=url_split_to_crawl,
                    final_url_split=None, status=response.status,
                    is_timeout=False, is_redirect=False, links=[],
                    exception=None, is_html=False,
                    depth=worker_input.depth,
                    response_time=response.response_time,
                    process_time=None,
                    site_origin=worker_input.site_origin)
            elif response.is_timeout:
                # This is a timeout. No need to wrap the exception
                page_crawl = PageCrawl(
                    original_url_split=url_split_to_crawl,
                    final_url_split=None, status=None,
                    is_timeout=True, is_redirect=False, links=[],
                    exception=None, is_html=False,
                    depth=worker_input.depth,
                    response_time=response.response_time,
                    process_time=0,
                    site_origin=worker_input.site_origin)
            else:
                # Something bad happened when opening the url
                exception = ExceptionStr(
                    unicode(type(response.exception)),
                    unicode(response.exception))
                page_crawl = PageCrawl(
                    original_url_split=url_split_to_crawl,
                    final_url_split=None, status=None,
                    is_timeout=False, is_redirect=False, links=[],
                    exception=exception, is_html=False,
                    depth=worker_input.depth,
                    response_time=response.response_time,
                    process_time=0,
                    site_origin=worker_input.site_origin)
        else:
            # Successful fetch: resolve the final URL (after redirects).
            final_url_split = get_clean_url_split(response.final_url)

            message = response.content.info()
            mime_type = get_content_type(message)
            # Server-declared charset is honored only when configured.
            if self.worker_config.prefer_server_encoding:
                charset = get_charset(message)
            else:
                charset = None

            links = []

            is_html = mime_type == HTML_MIME_TYPE
            process_time = None

            if is_html and worker_input.should_crawl:
                # Parse the page, extract links, and time the work.
                start = time.time()
                html_soup = BeautifulSoup(
                    response.content, self.worker_config.parser,
                    from_encoding=charset)
                links = self.get_links(html_soup, final_url_split)
                if self._has_content_to_check(worker_input):
                    (missing_content, erroneous_content) =\
                        self.check_content(
                            unicode(html_soup), html_soup,
                            url_split_to_crawl, final_url_split,
                            worker_input.content_check)
                process_time = time.time() - start
            else:
                self.logger.debug(
                    "Won't crawl %s. MIME Type: %s. Should crawl: %s",
                    final_url_split, mime_type,
                    worker_input.should_crawl)
                # Non-HTML (or non-crawlable) pages may still need
                # plain-text content checks.
                if self._has_content_to_check(worker_input):
                    text_content = self.get_text_content(
                        response.content.read(), charset)
                    (missing_content, erroneous_content) =\
                        self.check_content(
                            text_content, None, url_split_to_crawl,
                            final_url_split, worker_input.content_check)

            page_crawl = PageCrawl(
                original_url_split=url_split_to_crawl,
                final_url_split=final_url_split,
                status=response.status, is_timeout=False,
                is_redirect=response.is_redirect, links=links,
                exception=None, is_html=is_html,
                depth=worker_input.depth,
                response_time=response.response_time,
                process_time=process_time,
                site_origin=worker_input.site_origin,
                missing_content=missing_content,
                erroneous_content=erroneous_content)
    except Exception as exc:
        # Catch-all boundary: wrap any unexpected failure so one bad
        # page never kills the worker; log with traceback.
        exception = ExceptionStr(unicode(type(exc)), unicode(exc))
        page_crawl = PageCrawl(
            original_url_split=url_split_to_crawl,
            final_url_split=None, status=None,
            is_timeout=False, is_redirect=False, links=[],
            exception=exception, is_html=False,
            depth=worker_input.depth,
            response_time=None, process_time=None,
            site_origin=worker_input.site_origin)
        self.logger.exception("Exception occurred while crawling a page.")

    return page_crawl