Example 1
    import os
    import sys
    import traceback

    from bs4 import BeautifulSoup, __version__
    from bs4.builder import builder_registry

    def diagnose(data):
        """Diagnostic suite for isolating common problems."""
        print("Diagnostic running on Beautiful Soup %s" % __version__)
        print("Python version %s" % sys.version)

        basic_parsers = ["html.parser", "html5lib", "lxml"]
        # Iterate over a copy: removing from the list being iterated would
        # silently skip the next parser.
        for name in list(basic_parsers):
            for builder in builder_registry.builders:
                if name in builder.features:
                    break
            else:
                basic_parsers.remove(name)
                print(
                    "I noticed that %s is not installed. Installing it may help."
                    % name)

        if 'lxml' in basic_parsers:
            basic_parsers.append(["lxml", "xml"])
            from lxml import etree
            print("Found lxml version %s" % ".".join(
                map(str, etree.LXML_VERSION)))

        if 'html5lib' in basic_parsers:
            import html5lib
            print("Found html5lib version %s" % html5lib.__version__)

        if hasattr(data, 'read'):
            data = data.read()
        elif os.path.exists(data):
            print('"%s" looks like a filename. Reading data from the file.' % data)
            data = open(data).read()
        elif data.startswith("http:") or data.startswith("https:"):
            print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
            print("You need to use some other library to get the document "
                  "behind the URL, and feed that document to Beautiful Soup.")
            return
        print()

        for parser in basic_parsers:
            print("Trying to parse your markup with %s" % parser)
            success = False
            try:
                soup = BeautifulSoup(data, parser)
                success = True
            except Exception:
                print("%s could not parse the markup." % parser)
                traceback.print_exc()
            if success:
                print("Here's what %s did with the markup:" % parser)
                print(soup.prettify())

            print("-" * 80)
Example 2
    def _compute_single_content_check(
            self, content_list, html_dict, raw_dict, prefix=None):
        if not content_list:
            # Catch None
            return

        for content in content_list:
            temp_prefix, content = self._get_prefix_content(content, prefix)
            content = content.strip()
            if content.startswith("<"):
                # html.parser because we do not want to automatically create
                # surrounding tags
                soup = BeautifulSoup(content, "html.parser")
                children = list(soup.children)
                if children:
                    child = children[0]
                    string = child.string
                    if child.string and child.string.startswith(REGEX_CONTENT):
                        string = re.compile(child.string[len(REGEX_CONTENT):],
                                            re.MULTILINE)
                    html_check = HTMLCheck(
                        child.name, child.attrs, string)
                    html_dict[temp_prefix].append(html_check)
            else:
                if content and content.startswith(REGEX_CONTENT):
                    content = re.compile(content[len(REGEX_CONTENT):],
                                         re.MULTILINE)
                raw_dict[temp_prefix].append(content)
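
The check above relies on a Beautiful Soup detail worth spelling out: parsing a fragment with "html.parser" does not wrap it in <html>/<body> tags, so the first child of the soup is exactly the tag described in the configuration. A small sketch of that idea, with illustrative markup:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<a href="/about" class="nav">About</a>', "html.parser")
    child = list(soup.children)[0]
    print(child.name)    # 'a'
    print(child.attrs)   # {'href': '/about', 'class': ['nav']}
    print(child.string)  # 'About'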
Example 3
    import time
    import traceback

    from bs4 import BeautifulSoup, __version__

    def benchmark_parsers(num_elements=100000):
        """Very basic head-to-head performance benchmark."""
        print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
        # rdoc() generates a large random HTML document; it is defined
        # alongside this function in bs4.diagnose.
        data = rdoc(num_elements)
        print("Generated a large invalid HTML document (%d bytes)." % len(data))

        for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
            success = False
            try:
                a = time.time()
                soup = BeautifulSoup(data, parser)
                b = time.time()
                success = True
            except Exception:
                print("%s could not parse the markup." % parser)
                traceback.print_exc()
            if success:
                print("BS4+%s parsed the markup in %.2fs." % (parser, b - a))
Example 4
    def _crawl_page(self, worker_input):
        page_crawl = None
        erroneous_content = []
        missing_content = []
        url_split_to_crawl = worker_input.url_split

        try:
            response = open_url(self.urlopen,
                                self.request_class,
                                url_split_to_crawl.geturl(),
                                self.worker_config.timeout,
                                self.timeout_exception,
                                self.auth_header,
                                extra_headers=self.worker_config.extra_headers,
                                logger=self.logger)

            if response.exception:
                if response.status:
                    # This is a http error. Good.
                    page_crawl = PageCrawl(
                        original_url_split=url_split_to_crawl,
                        final_url_split=None,
                        status=response.status,
                        is_timeout=False,
                        is_redirect=False,
                        links=[],
                        exception=None,
                        is_html=False,
                        depth=worker_input.depth,
                        response_time=response.response_time,
                        process_time=None,
                        site_origin=worker_input.site_origin)
                elif response.is_timeout:
                    # This is a timeout. No need to wrap the exception
                    page_crawl = PageCrawl(
                        original_url_split=url_split_to_crawl,
                        final_url_split=None,
                        status=None,
                        is_timeout=True,
                        is_redirect=False,
                        links=[],
                        exception=None,
                        is_html=False,
                        depth=worker_input.depth,
                        response_time=response.response_time,
                        process_time=0,
                        site_origin=worker_input.site_origin)
                else:
                    # Something bad happened when opening the url
                    exception = ExceptionStr(unicode(type(response.exception)),
                                             unicode(response.exception))
                    page_crawl = PageCrawl(
                        original_url_split=url_split_to_crawl,
                        final_url_split=None,
                        status=None,
                        is_timeout=False,
                        is_redirect=False,
                        links=[],
                        exception=exception,
                        is_html=False,
                        depth=worker_input.depth,
                        response_time=response.response_time,
                        process_time=0,
                        site_origin=worker_input.site_origin)
            else:
                final_url_split = get_clean_url_split(response.final_url)

                message = response.content.info()
                mime_type = get_content_type(message)
                if self.worker_config.prefer_server_encoding:
                    charset = get_charset(message)
                else:
                    charset = None
                links = []

                is_html = mime_type == HTML_MIME_TYPE
                process_time = None

                if is_html and worker_input.should_crawl:
                    start = time.time()
                    html_soup = BeautifulSoup(response.content,
                                              self.worker_config.parser,
                                              from_encoding=charset)
                    links = self.get_links(html_soup, final_url_split)
                    if self._has_content_to_check(worker_input):
                        (missing_content, erroneous_content) =\
                            self.check_content(
                                unicode(html_soup), html_soup,
                                url_split_to_crawl,
                                final_url_split, worker_input.content_check)
                    process_time = time.time() - start
                else:
                    self.logger.debug(
                        "Won't crawl %s. MIME Type: %s. Should crawl: %s",
                        final_url_split, mime_type, worker_input.should_crawl)
                    if self._has_content_to_check(worker_input):
                        text_content = self.get_text_content(
                            response.content.read(), charset)
                        (missing_content, erroneous_content) =\
                            self.check_content(
                                text_content, None, url_split_to_crawl,
                                final_url_split, worker_input.content_check)

                page_crawl = PageCrawl(original_url_split=url_split_to_crawl,
                                       final_url_split=final_url_split,
                                       status=response.status,
                                       is_timeout=False,
                                       is_redirect=response.is_redirect,
                                       links=links,
                                       exception=None,
                                       is_html=is_html,
                                       depth=worker_input.depth,
                                       response_time=response.response_time,
                                       process_time=process_time,
                                       site_origin=worker_input.site_origin,
                                       missing_content=missing_content,
                                       erroneous_content=erroneous_content)
        except Exception as exc:
            exception = ExceptionStr(unicode(type(exc)), unicode(exc))
            page_crawl = PageCrawl(original_url_split=url_split_to_crawl,
                                   final_url_split=None,
                                   status=None,
                                   is_timeout=False,
                                   is_redirect=False,
                                   links=[],
                                   exception=exception,
                                   is_html=False,
                                   depth=worker_input.depth,
                                   response_time=None,
                                   process_time=None,
                                   site_origin=worker_input.site_origin)
            self.logger.exception("Exception occurred while crawling a page.")

        return page_crawl
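
get_links() in the crawler above is project-specific code that is not shown here. As a rough stand-alone sketch of that step, one might collect href/src attributes from the parsed document and resolve them against the final URL; the helper name, the markup, and the base URL below are all illustrative.

    from urllib.parse import urljoin
    from bs4 import BeautifulSoup

    def extract_links(html, base_url):
        soup = BeautifulSoup(html, "html.parser")
        links = [urljoin(base_url, a["href"])
                 for a in soup.find_all("a", href=True)]
        links += [urljoin(base_url, tag["src"])
                  for tag in soup.find_all(["img", "script"], src=True)]
        return links

    print(extract_links('<a href="/next">next</a>', "http://example.com/start"))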