def _get_links(self, elements, attribute, base_url_split,
               original_url_split):
    links = []
    for element in elements:
        if attribute in element.attrs:
            url = element[attribute]

            if not self.worker_config.strict_mode:
                # Outside of strict mode, tolerate surrounding whitespace
                # in attribute values.
                url = url.strip()

            if not is_link(url):
                continue
            abs_url_split = get_absolute_url_split(url, base_url_split)
            if abs_url_split.scheme not in SUPPORTED_SCHEMES:
                continue
            link = Link(
                type=unicode(element.name), url_split=abs_url_split,
                original_url_split=original_url_split,
                source_str=unicode(element))
            links.append(link)
    return links
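# _crawl_page below calls self.get_links(html_soup, final_url_split), which
# this excerpt does not define. The following is a minimal, hypothetical
# sketch of such a dispatcher, assuming it simply fans _get_links out over
# the usual link-bearing tag/attribute pairs; the real method may also honor
# a <base href> tag and cover additional elements.
def get_links(self, html_soup, original_url_split):
    links = []
    # Hypothetical tag/attribute pairs; not the project's verified list.
    for tag, attribute in (("a", "href"), ("img", "src"),
                           ("script", "src"), ("link", "href")):
        elements = html_soup.find_all(tag)
        # Without <base> handling, the page URL doubles as the base URL.
        links.extend(self._get_links(
            elements, attribute, original_url_split, original_url_split))
    return links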
def __init__(self, worker_init):
    self.worker_config = worker_init.worker_config
    self.input_queue = worker_init.input_queue
    self.output_queue = worker_init.output_queue
    self.urlopen = get_url_open()
    self.request_class = get_url_request()
    self.logger = worker_init.logger
    if not self.logger:
        # Get a new one!
        self.logger = get_logger()

    # We do this here to allow patching by gevent
    import socket
    self.timeout_exception = socket.timeout

    self.auth_header = None

    if self.worker_config.username and self.worker_config.password:
        base64string = unicode(
            base64.encodestring(
                '{0}:{1}'.format(
                    self.worker_config.username,
                    self.worker_config.password).encode("utf-8")),
            "utf-8")
        self.auth_header = ("Authorization",
                            "Basic {0}".format(base64string))
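# __init__ above unpacks a worker_init bundle into its fields. A minimal
# sketch of the shape it expects, assuming a plain namedtuple; the concrete
# type used by the project may differ.
import collections

WorkerInit = collections.namedtuple(
    "WorkerInit",
    ["worker_config", "input_queue", "output_queue", "logger"])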
def _crawl_page(self, worker_input):
    page_crawl = None
    url_split_to_crawl = worker_input.url_split

    try:
        response = open_url(
            self.urlopen, self.request_class,
            url_split_to_crawl.geturl(), self.worker_config.timeout,
            self.timeout_exception, self.worker_config.user_agent,
            self.auth_header)

        if response.exception:
            if response.status:
                # This is a http error. Good.
                page_crawl = PageCrawl(
                    original_url_split=url_split_to_crawl,
                    final_url_split=None, status=response.status,
                    is_timeout=False, is_redirect=False, links=[],
                    exception=None, is_html=False)
            elif response.is_timeout:
                # This is a timeout. No need to wrap the exception
                page_crawl = PageCrawl(
                    original_url_split=url_split_to_crawl,
                    final_url_split=None, status=None,
                    is_timeout=True, is_redirect=False, links=[],
                    exception=None, is_html=False)
            else:
                # Something bad happened when opening the url
                exception = ExceptionStr(
                    unicode(type(response.exception)),
                    unicode(response.exception))
                page_crawl = PageCrawl(
                    original_url_split=url_split_to_crawl,
                    final_url_split=None, status=None,
                    is_timeout=False, is_redirect=False, links=[],
                    exception=exception, is_html=False)
        else:
            final_url_split = get_clean_url_split(response.final_url)

            mime_type = get_content_type(response.content.info())
            links = []

            is_html = mime_type == HTML_MIME_TYPE

            if is_html and worker_input.should_crawl:
                html_soup = BeautifulSoup(
                    response.content, self.worker_config.parser)
                links = self.get_links(html_soup, final_url_split)
            else:
                self.logger.debug(
                    "Won't crawl %s. MIME Type: %s. Should crawl: %s",
                    final_url_split, mime_type, worker_input.should_crawl)

            page_crawl = PageCrawl(
                original_url_split=url_split_to_crawl,
                final_url_split=final_url_split, status=response.status,
                is_timeout=False, is_redirect=response.is_redirect,
                links=links, exception=None, is_html=is_html)
    except Exception as exc:
        exception = ExceptionStr(unicode(type(exc)), unicode(exc))
        page_crawl = PageCrawl(
            original_url_split=url_split_to_crawl,
            final_url_split=None, status=None,
            is_timeout=False, is_redirect=False, links=[],
            exception=exception, is_html=False)
        self.logger.exception("Exception occurred while crawling a page.")

    return page_crawl
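# _crawl_page above only touches a handful of attributes on the object
# returned by open_url. A sketch of that implied contract, assuming a simple
# namedtuple-style record; the name and the project's actual response type
# are assumptions, and the real type may carry additional fields.
import collections

UrlResponse = collections.namedtuple(
    "UrlResponse",
    ["content",      # file-like body; .info() exposes the response headers
     "status",       # HTTP status code, or None if no response was received
     "exception",    # exception raised while opening the URL, or None
     "final_url",    # URL after any redirects were followed
     "is_redirect",  # True if at least one redirect was followed
     "is_timeout"])  # True if the request exceeded the configured timeout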