# Module-level imports these diagnostic helpers rely on (as in bs4.diagnose).
import os
import sys
import time
import traceback

from bs4 import BeautifulSoup, __version__
from bs4.builder import builder_registry


def diagnose(data):
    """Diagnostic suite for isolating common problems."""
    print "Diagnostic running on Beautiful Soup %s" % __version__
    print "Python version %s" % sys.version

    basic_parsers = ["html.parser", "html5lib", "lxml"]
    # Iterate over a copy so that removing a missing parser does not skip
    # the next entry in the list.
    for name in list(basic_parsers):
        for builder in builder_registry.builders:
            if name in builder.features:
                break
        else:
            basic_parsers.remove(name)
            print(
                "I noticed that %s is not installed. Installing it may help."
                % name)

    if 'lxml' in basic_parsers:
        basic_parsers.append(["lxml", "xml"])
        from lxml import etree
        print "Found lxml version %s" % ".".join(map(str, etree.LXML_VERSION))

    if 'html5lib' in basic_parsers:
        import html5lib
        print "Found html5lib version %s" % html5lib.__version__

    # Accept a file object, a filename, or a raw markup string.
    if hasattr(data, 'read'):
        data = data.read()
    elif os.path.exists(data):
        print '"%s" looks like a filename. Reading data from the file.' % data
        data = open(data).read()
    elif data.startswith("http:") or data.startswith("https:"):
        print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data
        print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup."
        return
    print

    for parser in basic_parsers:
        print "Trying to parse your markup with %s" % parser
        success = False
        try:
            soup = BeautifulSoup(data, parser)
            success = True
        except Exception, e:
            print "%s could not parse the markup." % parser
            traceback.print_exc()
        if success:
            print "Here's what %s did with the markup:" % parser
            print soup.prettify()
        print "-" * 80
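# Usage sketch (not part of bs4.diagnose itself): diagnose() accepts a markup
# string, an open file object, or a filename, so a quick way to exercise it is
# shown below. The markup literal and file name are made-up example values.
if __name__ == "__main__":
    diagnose("<p>Unclosed <b>tag")       # parses the string with every installed parser
    # diagnose(open("some_page.html"))   # a file object or a filename also works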
def _compute_single_content_check(
        self, content_list, html_dict, raw_dict, prefix=None):
    """Turn one list of content checks into HTML checks and raw/regex checks."""
    if not content_list:
        # Catch None
        return

    for content in content_list:
        temp_prefix, content = self._get_prefix_content(content, prefix)
        content = content.strip()
        if content.startswith("<"):
            # html.parser because we do not want to automatically create
            # surrounding tags
            soup = BeautifulSoup(content, "html.parser")
            children = list(soup.children)
            if children:
                child = children[0]
                string = child.string
                if child.string and child.string.startswith(REGEX_CONTENT):
                    string = re.compile(
                        child.string[len(REGEX_CONTENT):], re.MULTILINE)
                html_check = HTMLCheck(child.name, child.attrs, string)
                html_dict[temp_prefix].append(html_check)
        else:
            if content and content.startswith(REGEX_CONTENT):
                content = re.compile(
                    content[len(REGEX_CONTENT):], re.MULTILINE)
            raw_dict[temp_prefix].append(content)
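# Minimal illustration (sample values are made up, not taken from the module
# above) of why "html.parser" is used there: it parses a fragment without
# wrapping it in <html>/<body>, so the first child is exactly the tag the user
# wrote and its name, attrs and string can be stored in an HTMLCheck.
from bs4 import BeautifulSoup

fragment = '<h1 class="title">Welcome</h1>'
soup = BeautifulSoup(fragment, "html.parser")
child = list(soup.children)[0]
print child.name      # h1
print child.attrs     # the attribute dict, e.g. class -> [u'title']
print child.string    # Welcome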
def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark."""
    print "Comparative parser benchmark on Beautiful Soup %s" % __version__
    data = rdoc(num_elements)
    print "Generated a large invalid HTML document (%d bytes)." % len(data)

    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        success = False
        try:
            a = time.time()
            soup = BeautifulSoup(data, parser)
            b = time.time()
            success = True
        except Exception, e:
            print "%s could not parse the markup." % parser
            traceback.print_exc()
        if success:
            print "BS4+%s parsed the markup in %.2fs." % (parser, b - a)
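# Usage sketch: num_elements is simply the element count handed to rdoc() when
# it generates the synthetic document, so smaller values give a quicker (if less
# representative) comparison. The values below are arbitrary examples.
benchmark_parsers()        # default: a 100000-element document
benchmark_parsers(10000)   # a lighter run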
def _crawl_page(self, worker_input):
    page_crawl = None
    erroneous_content = []
    missing_content = []
    url_split_to_crawl = worker_input.url_split

    try:
        response = open_url(
            self.urlopen, self.request_class,
            url_split_to_crawl.geturl(), self.worker_config.timeout,
            self.timeout_exception, self.auth_header,
            extra_headers=self.worker_config.extra_headers,
            logger=self.logger)

        if response.exception:
            if response.status:
                # This is a http error. Good.
                page_crawl = PageCrawl(
                    original_url_split=url_split_to_crawl,
                    final_url_split=None, status=response.status,
                    is_timeout=False, is_redirect=False, links=[],
                    exception=None, is_html=False,
                    depth=worker_input.depth,
                    response_time=response.response_time,
                    process_time=None,
                    site_origin=worker_input.site_origin)
            elif response.is_timeout:
                # This is a timeout. No need to wrap the exception
                page_crawl = PageCrawl(
                    original_url_split=url_split_to_crawl,
                    final_url_split=None, status=None,
                    is_timeout=True, is_redirect=False, links=[],
                    exception=None, is_html=False,
                    depth=worker_input.depth,
                    response_time=response.response_time,
                    process_time=0,
                    site_origin=worker_input.site_origin)
            else:
                # Something bad happened when opening the url
                exception = ExceptionStr(
                    unicode(type(response.exception)),
                    unicode(response.exception))
                page_crawl = PageCrawl(
                    original_url_split=url_split_to_crawl,
                    final_url_split=None, status=None,
                    is_timeout=False, is_redirect=False, links=[],
                    exception=exception, is_html=False,
                    depth=worker_input.depth,
                    response_time=response.response_time,
                    process_time=0,
                    site_origin=worker_input.site_origin)
        else:
            final_url_split = get_clean_url_split(response.final_url)

            message = response.content.info()
            mime_type = get_content_type(message)
            if self.worker_config.prefer_server_encoding:
                charset = get_charset(message)
            else:
                charset = None
            links = []

            is_html = mime_type == HTML_MIME_TYPE
            process_time = None

            if is_html and worker_input.should_crawl:
                start = time.time()
                html_soup = BeautifulSoup(
                    response.content, self.worker_config.parser,
                    from_encoding=charset)
                links = self.get_links(html_soup, final_url_split)
                if self._has_content_to_check(worker_input):
                    (missing_content, erroneous_content) =\
                        self.check_content(
                            unicode(html_soup), html_soup,
                            url_split_to_crawl, final_url_split,
                            worker_input.content_check)
                process_time = time.time() - start
            else:
                self.logger.debug(
                    "Won't crawl %s. MIME Type: %s. Should crawl: %s",
                    final_url_split, mime_type,
                    worker_input.should_crawl)
                if self._has_content_to_check(worker_input):
                    text_content = self.get_text_content(
                        response.content.read(), charset)
                    (missing_content, erroneous_content) =\
                        self.check_content(
                            text_content, None,
                            url_split_to_crawl, final_url_split,
                            worker_input.content_check)

            page_crawl = PageCrawl(
                original_url_split=url_split_to_crawl,
                final_url_split=final_url_split,
                status=response.status,
                is_timeout=False,
                is_redirect=response.is_redirect,
                links=links,
                exception=None,
                is_html=is_html,
                depth=worker_input.depth,
                response_time=response.response_time,
                process_time=process_time,
                site_origin=worker_input.site_origin,
                missing_content=missing_content,
                erroneous_content=erroneous_content)
    except Exception as exc:
        exception = ExceptionStr(unicode(type(exc)), unicode(exc))
        page_crawl = PageCrawl(
            original_url_split=url_split_to_crawl,
            final_url_split=None, status=None,
            is_timeout=False, is_redirect=False, links=[],
            exception=exception, is_html=False,
            depth=worker_input.depth, response_time=None,
            process_time=None,
            site_origin=worker_input.site_origin)
        self.logger.exception("Exception occurred while crawling a page.")

    return page_crawl
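# Self-contained sketch of the parsing step above (the markup and encoding are
# made-up values, and the real link extraction lives in self.get_links(), which
# is not shown here). BeautifulSoup receives the raw response body plus an
# optional from_encoding hint, and links are then read off the resulting tree.
from bs4 import BeautifulSoup

body = '<html><body><a href="/about">About</a> <a href="http://example.com/">Home</a></body></html>'
soup = BeautifulSoup(body, "html.parser", from_encoding="utf-8")
hrefs = [a.get("href") for a in soup.find_all("a")]
print hrefs   # the two href values, in document order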