예제 #1
0
 def test_combination(self):
     """Feature combinations resolve to the expected tree builders.

     The lxml and html5lib expectations are only checked when
     LXML_PRESENT / HTML5LIB_PRESENT is set.
     """
     strict_html = registry.lookup('strict', 'html')
     assert strict_html == HTMLParserTreeBuilder
     if LXML_PRESENT:
         fast_html = registry.lookup('fast', 'html')
         assert fast_html == LXMLTreeBuilder
         permissive_xml = registry.lookup('permissive', 'xml')
         assert permissive_xml == LXMLTreeBuilderForXML
     if HTML5LIB_PRESENT:
         named_html5lib = registry.lookup('html5lib', 'html')
         assert named_html5lib == HTML5TreeBuilder
예제 #2
0
    def test_named_library(self):
        """Looking a builder up by its library name returns that builder."""
        lookup = registry.lookup
        # The lxml builders are looked up by library name plus markup type.
        for features, expected in (
            (("lxml", "xml"), LXMLTreeBuilderForXML),
            (("lxml", "html"), LXMLTreeBuilder),
        ):
            self.assertEqual(lookup(*features), expected)
        if HTML5LIB_PRESENT:
            self.assertEqual(lookup("html5lib"), HTML5TreeBuilder)

        self.assertEqual(lookup("html.parser"), HTMLParserTreeBuilder)
예제 #3
0
    def test_named_library(self):
        """A library name given as a feature selects that library's builder."""
        find = registry.lookup

        lxml_for_xml = find('lxml', 'xml')
        self.assertEqual(lxml_for_xml, LXMLTreeBuilderForXML)
        lxml_for_html = find('lxml', 'html')
        self.assertEqual(lxml_for_html, LXMLTreeBuilder)
        # The html5lib expectation only applies when HTML5LIB_PRESENT is set.
        if HTML5LIB_PRESENT:
            self.assertEqual(find('html5lib'), HTML5TreeBuilder)

        self.assertEqual(find('html.parser'), HTMLParserTreeBuilder)
예제 #4
0
 def test_combination(self):
     """Feature combinations resolve to the expected tree builders.

     Uses ``assertEqual``: the ``assertEquals`` alias is deprecated and
     no longer exists in Python 3.12 and later.
     """
     self.assertEqual(registry.lookup('fast', 'html'), LXMLTreeBuilder)
     self.assertEqual(registry.lookup('permissive', 'xml'),
                      LXMLTreeBuilderForXML)
     self.assertEqual(registry.lookup('strict', 'html'),
                      HTMLParserTreeBuilder)
     self.assertEqual(registry.lookup('permissive', 'html'),
                      HTML5TreeBuilder)
    def test_named_library(self):
        """Looking a builder up by its library name returns that builder.

        The lxml and html5lib expectations are only checked when
        LXML_PRESENT / HTML5LIB_PRESENT is set.
        """
        lookup = registry.lookup
        if LXML_PRESENT:
            for markup_type, expected in (
                ("xml", LXMLTreeBuilderForXML),
                ("html", LXMLTreeBuilder),
            ):
                self.assertEqual(lookup("lxml", markup_type), expected)
        if HTML5LIB_PRESENT:
            self.assertEqual(lookup("html5lib"), HTML5TreeBuilder)

        self.assertEqual(lookup("html.parser"), HTMLParserTreeBuilder)
예제 #6
0
    def test_named_library(self):
        """A library name given as a feature selects that library's builder."""
        if LXML_PRESENT:
            lxml_for_xml = registry.lookup('lxml', 'xml')
            assert lxml_for_xml == LXMLTreeBuilderForXML
            lxml_for_html = registry.lookup('lxml', 'html')
            assert lxml_for_html == LXMLTreeBuilder
        if HTML5LIB_PRESENT:
            html5lib_builder = registry.lookup('html5lib')
            assert html5lib_builder == HTML5TreeBuilder

        stdlib_builder = registry.lookup('html.parser')
        assert stdlib_builder == HTMLParserTreeBuilder
예제 #7
0
 def test_combination(self):
     """Feature combinations resolve to the expected tree builders.

     Uses ``assertEqual``: the ``assertEquals`` alias is deprecated and
     no longer exists in Python 3.12 and later.
     """
     self.assertEqual(registry.lookup('fast', 'html'),
                      LXMLTreeBuilder)
     self.assertEqual(registry.lookup('permissive', 'xml'),
                      LXMLTreeBuilderForXML)
     self.assertEqual(registry.lookup('strict', 'html'),
                      HTMLParserTreeBuilder)
     self.assertEqual(registry.lookup('permissive', 'html'),
                      HTML5TreeBuilder)
예제 #8
0
 def test_lookup_by_markup_type(self):
     """A bare markup type resolves according to the installed parsers.

     With LXML_PRESENT set, both 'html' and 'xml' are expected to
     resolve to the lxml builders. Otherwise 'xml' is expected to
     resolve to nothing, and 'html' to html5lib's builder when
     HTML5LIB_PRESENT is set or to the html.parser builder when not.
     """
     if LXML_PRESENT:
         self.assertEqual(registry.lookup('html'), LXMLTreeBuilder)
         self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML)
         return

     # assertIsNone checks identity and reports a clearer failure than
     # comparing against None with assertEqual.
     self.assertIsNone(registry.lookup('xml'))
     if HTML5LIB_PRESENT:
         self.assertEqual(registry.lookup('html'), HTML5TreeBuilder)
     else:
         self.assertEqual(registry.lookup('html'), HTMLParserTreeBuilder)
예제 #9
0
    def test_named_library(self):
        """Looking a builder up by its library name returns that builder.

        Uses ``assertEqual``: the ``assertEquals`` alias is deprecated
        and no longer exists in Python 3.12 and later.
        """
        self.assertEqual(registry.lookup('lxml', 'xml'),
                         LXMLTreeBuilderForXML)
        self.assertEqual(registry.lookup('lxml', 'html'),
                         LXMLTreeBuilder)
        self.assertEqual(registry.lookup('html5lib'),
                         HTML5TreeBuilder)

        self.assertEqual(registry.lookup('html.parser'),
                         HTMLParserTreeBuilder)
예제 #10
0
 def test_lookup_by_markup_type(self):
     """A bare markup type resolves according to the installed parsers.

     With LXML_PRESENT set, both 'html' and 'xml' are expected to
     resolve to the lxml builders. Otherwise 'xml' is expected to
     resolve to nothing, and 'html' to html5lib's builder when
     HTML5LIB_PRESENT is set or to the html.parser builder when not.
     """
     if LXML_PRESENT:
         assert registry.lookup('html') == LXMLTreeBuilder
         assert registry.lookup('xml') == LXMLTreeBuilderForXML
         return

     # None is a singleton, so compare by identity rather than equality.
     assert registry.lookup('xml') is None
     if HTML5LIB_PRESENT:
         assert registry.lookup('html') == HTML5TreeBuilder
     else:
         assert registry.lookup('html') == HTMLParserTreeBuilder
    def test_combination(self):
        """Feature combinations resolve to the expected tree builders.

        The lxml and html5lib expectations are only checked when
        LXML_PRESENT / HTML5LIB_PRESENT is set.
        """
        lookup = registry.lookup
        if LXML_PRESENT:
            fast_html = lookup("fast", "html")
            self.assertEqual(fast_html, LXMLTreeBuilder)
            permissive_xml = lookup("permissive", "xml")
            self.assertEqual(permissive_xml, LXMLTreeBuilderForXML)
        strict_html = lookup("strict", "html")
        self.assertEqual(strict_html, HTMLParserTreeBuilder)
        if HTML5LIB_PRESENT:
            named_html5lib = lookup("html5lib", "html")
            self.assertEqual(named_html5lib, HTML5TreeBuilder)
예제 #12
0
    def test_combination(self):
        """Check which builder each combination of features selects."""
        find = registry.lookup
        # Both lxml expectations share a single LXML_PRESENT guard.
        if LXML_PRESENT:
            self.assertEqual(find('fast', 'html'), LXMLTreeBuilder)
            self.assertEqual(find('permissive', 'xml'), LXMLTreeBuilderForXML)
        self.assertEqual(find('strict', 'html'), HTMLParserTreeBuilder)
        if HTML5LIB_PRESENT:
            self.assertEqual(find('html5lib', 'html'), HTML5TreeBuilder)
    def test_combination(self):
        """Combined features pick the matching builder class.

        Expectations for lxml and html5lib are skipped unless
        LXML_PRESENT / HTML5LIB_PRESENT is set.
        """
        if LXML_PRESENT:
            lxml_expectations = (
                (('fast', 'html'), LXMLTreeBuilder),
                (('permissive', 'xml'), LXMLTreeBuilderForXML),
            )
            for features, expected in lxml_expectations:
                self.assertEqual(registry.lookup(*features), expected)
        self.assertEqual(
            registry.lookup('strict', 'html'), HTMLParserTreeBuilder)
        if HTML5LIB_PRESENT:
            self.assertEqual(
                registry.lookup('html5lib', 'html'), HTML5TreeBuilder)
예제 #14
0
 def test_new_tag_creation(self):
     """Tags constructed directly can be inserted into a parsed tree."""
     builder = builder_registry.lookup('html')()
     soup = self.soup("<body></body>", builder=builder)
     link = Tag(soup, builder, 'a')
     ordered_list = Tag(soup, builder, 'ol')
     link['href'] = 'http://foo.com/'
     # Insert the link first and the list after it.
     soup.body.insert(0, link)
     soup.body.insert(1, ordered_list)
     expected = b'<body><a href="http://foo.com/"></a><ol></ol></body>'
     assert soup.body.encode() == expected
예제 #15
0
파일: test_tree.py 프로젝트: speg/probet
 def test_new_tag_creation(self):
     """Tags constructed directly can be inserted into a parsed tree."""
     builder = builder_registry.lookup('html')()
     soup = self.soup("<body></body>", builder=builder)
     link = Tag(soup, builder, 'a')
     ordered_list = Tag(soup, builder, 'ol')
     link['href'] = 'http://foo.com/'
     # Insert the link first and the list after it.
     soup.body.insert(0, link)
     soup.body.insert(1, ordered_list)
     rendered = soup.body.encode()
     expected = b'<body><a href="http://foo.com/"></a><ol></ol></body>'
     self.assertEqual(rendered, expected)
예제 #16
0
import copy
import pickle
import re
import warnings
from bs4 import BeautifulSoup
from bs4.builder import (
    builder_registry,
    HTMLParserTreeBuilder,
)
from bs4.element import CData, NavigableString, SoupStrainer, Tag
from bs4.testing import (
    SoupTest,
    skipIf,
)

# Flags recording whether the builder registry returns a builder for
# the "xml" and "lxml" features.
XML_BUILDER_PRESENT = builder_registry.lookup("xml") is not None
LXML_PRESENT = builder_registry.lookup("lxml") is not None

class TreeTest(SoupTest):

    def assertSelects(self, tags, should_match):
        """Assert that the strings of the given tags equal `should_match`.

        Intended for tests that define several tags, each holding a
        single string, and then pick some of them out by some
        mechanism: the `.string` of every tag in `tags` is collected,
        in order, and compared against the `should_match` list.
        """
        found_strings = [element.string for element in tags]
        self.assertEqual(found_strings, should_match)

    def assertSelectsIDs(self, tags, should_match):
        """Make sure that the given tags have the correct IDs.
예제 #17
0
    def __init__(self, markup="", features=None, builder=None,
                 parse_only=None, from_encoding=None,
                 exclude_encodings=None,
                 **kwargs):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser.

        :param markup: A string, a bytestring, or an object with a
            ``read()`` method whose result is parsed.
        :param features: A feature name, or a sequence of feature
            names, passed to ``builder_registry.lookup`` to choose a
            tree builder. Only consulted when ``builder`` is None.
        :param builder: A tree builder instance to use as-is.
        :param parse_only: Stored as ``self.parse_only``.
        :param from_encoding: Encoding hint passed to the builder's
            ``prepare_markup``. Ignored, with a warning, when
            ``markup`` is a ``str``.
        :param exclude_encodings: Passed through to the builder's
            ``prepare_markup``.
        :param kwargs: Only legacy argument names are accepted here.
            ``convertEntities``, ``markupMassage``, ``smartQuotesTo``,
            ``selfClosingTags`` and ``isHTML`` are discarded with a
            warning. ``parseOnlyThese`` and ``fromEncoding`` are used,
            with a warning, when ``parse_only`` / ``from_encoding``
            were not given.
        :raises TypeError: If any other keyword argument is left over.
        :raises FeatureNotFound: If no registered tree builder
            provides the requested features.
        """

        # Legacy arguments that no longer have any effect. Each one is
        # removed from kwargs so that it does not trip the
        # unexpected-keyword check further down.
        ignored_arguments = (
            ('convertEntities',
             "BS4 does not respect the convertEntities argument to the "
             "BeautifulSoup constructor. Entities are always converted "
             "to Unicode characters."),
            ('markupMassage',
             "BS4 does not respect the markupMassage argument to the "
             "BeautifulSoup constructor. The tree builder is responsible "
             "for any necessary markup massage."),
            ('smartQuotesTo',
             "BS4 does not respect the smartQuotesTo argument to the "
             "BeautifulSoup constructor. Smart quotes are always converted "
             "to Unicode characters."),
            ('selfClosingTags',
             "BS4 does not respect the selfClosingTags argument to the "
             "BeautifulSoup constructor. The tree builder is responsible "
             "for understanding self-closing tags."),
            ('isHTML',
             "BS4 does not respect the isHTML argument to the "
             "BeautifulSoup constructor. Suggest you use "
             "features='lxml' for HTML and features='lxml-xml' for "
             "XML."),
        )
        for name, message in ignored_arguments:
            if name in kwargs:
                del kwargs[name]
                warnings.warn(message)

        def deprecated_argument(old_name, new_name):
            """Remove and return a renamed argument, warning about it."""
            if old_name in kwargs:
                warnings.warn(
                    'The "%s" argument to the BeautifulSoup constructor '
                    'has been renamed to "%s."' % (old_name, new_name))
                return kwargs.pop(old_name)
            return None

        parse_only = parse_only or deprecated_argument(
            "parseOnlyThese", "parse_only")

        from_encoding = from_encoding or deprecated_argument(
            "fromEncoding", "from_encoding")

        if from_encoding and isinstance(markup, str):
            warnings.warn(
                "You provided Unicode markup but also provided a value for "
                "from_encoding. Your from_encoding will be ignored.")
            from_encoding = None

        # Anything still left in kwargs is an argument we don't know.
        if kwargs:
            arg = list(kwargs).pop()
            raise TypeError(
                "__init__() got an unexpected keyword argument '%s'" % arg)

        if builder is None:
            # Remember what the caller literally asked for, so we can
            # tell below whether a parser was named explicitly.
            original_features = features
            if isinstance(features, str):
                features = [features]
            if features is None or len(features) == 0:
                features = self.DEFAULT_BUILDER_FEATURES
            builder_class = builder_registry.lookup(*features)
            if builder_class is None:
                raise FeatureNotFound(
                    "Couldn't find a tree builder with the features you "
                    "requested: %s. Do you need to install a parser library?"
                    % ",".join(features))
            builder = builder_class()
            if not (original_features == builder.NAME or
                    original_features in builder.ALTERNATE_NAMES):
                # The caller did not ask for this builder by name, so
                # warn about which parser was picked on their behalf.
                if builder.is_xml:
                    markup_type = "XML"
                else:
                    markup_type = "HTML"

                # NOTE(review): extract_stack()[0] is the outermost
                # frame of the call stack, which is not necessarily
                # the code that called this constructor -- confirm
                # that this is the frame the warning should report.
                caller = traceback.extract_stack()[0]
                filename = caller[0]
                line_number = caller[1]
                warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
                    filename=filename,
                    line_number=line_number,
                    parser=builder.NAME,
                    markup_type=markup_type))

        self.builder = builder
        self.is_xml = builder.is_xml
        self.known_xml = self.is_xml
        self.builder.soup = self

        self.parse_only = parse_only

        if hasattr(markup, 'read'):  # It's a file-type object.
            markup = markup.read()
        elif len(markup) <= 256 and (
                (isinstance(markup, bytes) and b'<' not in markup)
                or (isinstance(markup, str) and '<' not in markup)):
            # Print out warnings for a couple beginner problems
            # involving passing non-markup to Beautiful Soup.
            # Beautiful Soup will still parse the input as markup,
            # just in case that's what the user really wants.
            if (isinstance(markup, str)
                    and not os.path.supports_unicode_filenames):
                possible_filename = markup.encode("utf8")
            else:
                possible_filename = markup
            is_file = False
            try:
                is_file = os.path.exists(possible_filename)
            except Exception:
                # This is almost certainly a problem involving
                # characters not valid in filenames on this
                # system. Just let it go.
                pass
            if is_file:
                if isinstance(markup, str):
                    markup = markup.encode("utf8")
                # Each continuation literal starts with a space so the
                # concatenated sentence keeps its word breaks.
                warnings.warn(
                    '"%s" looks like a filename, not markup. You should'
                    ' probably open this file and pass the filehandle into'
                    ' Beautiful Soup.' % markup)
            self._check_markup_is_url(markup)

        # The builder may offer several candidate decodings of the
        # markup; use the first one the parser does not reject.
        for (self.markup, self.original_encoding,
             self.declared_html_encoding,
             self.contains_replacement_characters) in (
                self.builder.prepare_markup(
                    markup, from_encoding,
                    exclude_encodings=exclude_encodings)):
            self.reset()
            try:
                self._feed()
                break
            except ParserRejectedMarkup:
                pass

        # Clear out the markup and remove the builder's circular
        # reference to this object.
        self.markup = None
        self.builder.soup = None
예제 #18
0
 def test_lookup_by_markup_type(self):
     """A bare markup type resolves to the expected tree builder.

     Uses ``assertEqual``: the ``assertEquals`` alias is deprecated and
     no longer exists in Python 3.12 and later.
     """
     self.assertEqual(registry.lookup('html'), HTML5TreeBuilder)
     self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML)
예제 #19
0
 def test_unimplemented_combinations(self):
     """A feature combination with no matching builder looks up as None.

     Uses ``assertIsNone`` in place of the deprecated ``assertEquals``
     alias, which no longer exists in Python 3.12 and later.
     """
     self.assertIsNone(registry.lookup('fast', 'permissive', 'html'))
예제 #20
0
    def __init__(self, markup="", features=None, builder=None,
                 parse_only=None, from_encoding=None,
                 exclude_encodings=None,
                 **kwargs):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser.

        :param markup: A string, a bytestring, or an object with a
            ``read()`` method whose result is parsed.
        :param features: A feature name, or a sequence of feature
            names, passed to ``builder_registry.lookup`` to choose a
            tree builder. Only consulted when ``builder`` is None.
        :param builder: A tree builder instance to use as-is.
        :param parse_only: Stored as ``self.parse_only``.
        :param from_encoding: Encoding hint passed to the builder's
            ``prepare_markup``. Ignored, with a warning, when
            ``markup`` is a ``str``.
        :param exclude_encodings: Passed through to the builder's
            ``prepare_markup``.
        :param kwargs: Only legacy argument names are accepted here.
            ``convertEntities``, ``markupMassage``, ``smartQuotesTo``,
            ``selfClosingTags`` and ``isHTML`` are discarded with a
            warning. ``parseOnlyThese`` and ``fromEncoding`` are used,
            with a warning, when ``parse_only`` / ``from_encoding``
            were not given.
        :raises TypeError: If any other keyword argument is left over.
        :raises FeatureNotFound: If no registered tree builder
            provides the requested features.
        """

        # Legacy arguments that no longer have any effect. Each one is
        # removed from kwargs so that it does not trip the
        # unexpected-keyword check further down.
        ignored_arguments = (
            ('convertEntities',
             "BS4 does not respect the convertEntities argument to the "
             "BeautifulSoup constructor. Entities are always converted "
             "to Unicode characters."),
            ('markupMassage',
             "BS4 does not respect the markupMassage argument to the "
             "BeautifulSoup constructor. The tree builder is responsible "
             "for any necessary markup massage."),
            ('smartQuotesTo',
             "BS4 does not respect the smartQuotesTo argument to the "
             "BeautifulSoup constructor. Smart quotes are always converted "
             "to Unicode characters."),
            ('selfClosingTags',
             "BS4 does not respect the selfClosingTags argument to the "
             "BeautifulSoup constructor. The tree builder is responsible "
             "for understanding self-closing tags."),
            ('isHTML',
             "BS4 does not respect the isHTML argument to the "
             "BeautifulSoup constructor. Suggest you use "
             "features='lxml' for HTML and features='lxml-xml' for "
             "XML."),
        )
        for name, message in ignored_arguments:
            if name in kwargs:
                del kwargs[name]
                warnings.warn(message)

        def deprecated_argument(old_name, new_name):
            """Remove and return a renamed argument, warning about it."""
            if old_name in kwargs:
                warnings.warn(
                    'The "%s" argument to the BeautifulSoup constructor '
                    'has been renamed to "%s."' % (old_name, new_name))
                return kwargs.pop(old_name)
            return None

        parse_only = parse_only or deprecated_argument(
            "parseOnlyThese", "parse_only")

        from_encoding = from_encoding or deprecated_argument(
            "fromEncoding", "from_encoding")

        if from_encoding and isinstance(markup, str):
            warnings.warn(
                "You provided Unicode markup but also provided a value for "
                "from_encoding. Your from_encoding will be ignored.")
            from_encoding = None

        # Anything still left in kwargs is an argument we don't know.
        if kwargs:
            arg = list(kwargs).pop()
            raise TypeError(
                "__init__() got an unexpected keyword argument '%s'" % arg)

        if builder is None:
            # Remember what the caller literally asked for, so we can
            # tell below whether a parser was named explicitly.
            original_features = features
            if isinstance(features, str):
                features = [features]
            if features is None or len(features) == 0:
                features = self.DEFAULT_BUILDER_FEATURES
            builder_class = builder_registry.lookup(*features)
            if builder_class is None:
                raise FeatureNotFound(
                    "Couldn't find a tree builder with the features you "
                    "requested: %s. Do you need to install a parser library?"
                    % ",".join(features))
            builder = builder_class()
            if not (original_features == builder.NAME or
                    original_features in builder.ALTERNATE_NAMES):
                # The caller did not ask for this builder by name, so
                # warn about which parser was picked on their behalf.
                if builder.is_xml:
                    markup_type = "XML"
                else:
                    markup_type = "HTML"

                # NOTE(review): extract_stack()[0] is the outermost
                # frame of the call stack, which is not necessarily
                # the code that called this constructor -- confirm
                # that this is the frame the warning should report.
                caller = traceback.extract_stack()[0]
                filename = caller[0]
                line_number = caller[1]
                warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
                    filename=filename,
                    line_number=line_number,
                    parser=builder.NAME,
                    markup_type=markup_type))

        self.builder = builder
        self.is_xml = builder.is_xml
        self.known_xml = self.is_xml
        self.builder.soup = self

        self.parse_only = parse_only

        if hasattr(markup, 'read'):  # It's a file-type object.
            markup = markup.read()
        elif len(markup) <= 256 and (
                (isinstance(markup, bytes) and b'<' not in markup)
                or (isinstance(markup, str) and '<' not in markup)):
            # Print out warnings for a couple beginner problems
            # involving passing non-markup to Beautiful Soup.
            # Beautiful Soup will still parse the input as markup,
            # just in case that's what the user really wants.
            if (isinstance(markup, str)
                    and not os.path.supports_unicode_filenames):
                possible_filename = markup.encode("utf8")
            else:
                possible_filename = markup
            is_file = False
            try:
                is_file = os.path.exists(possible_filename)
            except Exception:
                # This is almost certainly a problem involving
                # characters not valid in filenames on this
                # system. Just let it go.
                pass
            if is_file:
                if isinstance(markup, str):
                    markup = markup.encode("utf8")
                # Each continuation literal starts with a space so the
                # concatenated sentence keeps its word breaks.
                warnings.warn(
                    '"%s" looks like a filename, not markup. You should'
                    ' probably open this file and pass the filehandle into'
                    ' Beautiful Soup.' % markup)
            self._check_markup_is_url(markup)

        # The builder may offer several candidate decodings of the
        # markup; use the first one the parser does not reject.
        for (self.markup, self.original_encoding,
             self.declared_html_encoding,
             self.contains_replacement_characters) in (
                self.builder.prepare_markup(
                    markup, from_encoding,
                    exclude_encodings=exclude_encodings)):
            self.reset()
            try:
                self._feed()
                break
            except ParserRejectedMarkup:
                pass

        # Clear out the markup and remove the builder's circular
        # reference to this object.
        self.markup = None
        self.builder.soup = None
예제 #21
0
 def test_unimplemented_combinations(self):
     """A feature combination with no matching builder looks up as None.

     Uses ``assertIsNone`` in place of the deprecated ``assertEquals``
     alias, which no longer exists in Python 3.12 and later.
     """
     self.assertIsNone(registry.lookup('fast', 'permissive', 'html'))
예제 #22
0
 def test_lookup_by_markup_type(self):
     """A bare markup type resolves to the expected tree builder.

     Uses ``assertEqual``: the ``assertEquals`` alias is deprecated and
     no longer exists in Python 3.12 and later.
     """
     self.assertEqual(registry.lookup('html'), HTML5TreeBuilder)
     self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML)
예제 #23
0
 def test_combination(self):
     """Check which builder each combination of features selects."""
     lookup = registry.lookup
     fast_html = lookup("fast", "html")
     self.assertEqual(fast_html, LXMLTreeBuilder)
     permissive_xml = lookup("permissive", "xml")
     self.assertEqual(permissive_xml, LXMLTreeBuilderForXML)
     strict_html = lookup("strict", "html")
     self.assertEqual(strict_html, HTMLParserTreeBuilder)
     # The html5lib expectation only applies when HTML5LIB_PRESENT is set.
     if HTML5LIB_PRESENT:
         self.assertEqual(lookup("html5lib", "html"), HTML5TreeBuilder)
예제 #24
0
from bs4.builder import builder_registry
import requests
import csv

url = "https://issues.apache.org/jira/browse/CAMEL-10597"
# NOTE(review): the response is only referenced by the commented-out
# parsing attempts below; the live code parses the local file instead.
r = requests.get(url)
#soup=BeautifulSoup(r.content,"html")

#xml_doc = open('xml_doc')
#soup = BeautifulSoup(r.content, 'xml')
#soup = BeautifulSoup(xml_doc,'lxml')

# Parse the local document inside a `with` block so the file handle is
# closed once the soup has been built, instead of being left open.
with open('xml_file') as doc:
    soup = BeautifulSoup(doc, 'lxml')
#print(soup)
print(builder_registry.lookup('html').DEFAULT_CDATA_LIST_ATTRIBUTES)

#print( builder_registry.lookup('html').DEFAULT_CDATA_LIST_ATTRIBUTES)
#print(soup.prettify())

tag = soup.title
print(tag)
people = {}
date = {}
for i in range(1):

    # Pull the first tag with each of these names out of the document.
    typee = soup.type
    title = soup.summary
    prio = soup.priority
    resol = soup.resolution
예제 #25
0
 def test_lookup_by_markup_type(self):
     """A bare markup type resolves to the expected tree builder.

     For "html" the expected builder is lxml's when LXML_PRESENT is
     set and html5lib's otherwise.
     """
     expected_html = LXMLTreeBuilder if LXML_PRESENT else HTML5TreeBuilder
     self.assertEqual(registry.lookup("html"), expected_html)
     self.assertEqual(registry.lookup("xml"), LXMLTreeBuilderForXML)
예제 #26
0
    def __init__(self, markup="", features=None, builder=None,
                 parse_only=None, from_encoding=None, **kwargs):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser."""

        if 'convertEntities' in kwargs:
            warnings.warn(
                "BS4 does not respect the convertEntities argument to the "
                "BeautifulSoup constructor. Entities are always converted "
                "to Unicode characters.")

        if 'markupMassage' in kwargs:
            del kwargs['markupMassage']
            warnings.warn(
                "BS4 does not respect the markupMassage argument to the "
                "BeautifulSoup constructor. The tree builder is responsible "
                "for any necessary markup massage.")

        if 'smartQuotesTo' in kwargs:
            del kwargs['smartQuotesTo']
            warnings.warn(
                "BS4 does not respect the smartQuotesTo argument to the "
                "BeautifulSoup constructor. Smart quotes are always converted "
                "to Unicode characters.")

        if 'selfClosingTags' in kwargs:
            del kwargs['selfClosingTags']
            warnings.warn(
                "BS4 does not respect the selfClosingTags argument to the "
                "BeautifulSoup constructor. The tree builder is responsible "
                "for understanding self-closing tags.")

        if 'isHTML' in kwargs:
            del kwargs['isHTML']
            warnings.warn(
                "BS4 does not respect the isHTML argument to the "
                "BeautifulSoup constructor. You can pass in features='html' "
                "or features='xml' to get a builder capable of handling "
                "one or the other.")

        def deprecated_argument(old_name, new_name):
            if old_name in kwargs:
                warnings.warn(
                    'The "%s" argument to the BeautifulSoup constructor '
                    'has been renamed to "%s."' % (old_name, new_name))
                value = kwargs[old_name]
                del kwargs[old_name]
                return value
            return None

        parse_only = parse_only or deprecated_argument(
            "parseOnlyThese", "parse_only")

        from_encoding = from_encoding or deprecated_argument(
            "fromEncoding", "from_encoding")

        if len(kwargs) > 0:
            arg = kwargs.keys().pop()
            raise TypeError(
                "__init__() got an unexpected keyword argument '%s'" % arg)

        if builder is None:
            if isinstance(features, basestring):
                features = [features]
            if features is None or len(features) == 0:
                features = self.DEFAULT_BUILDER_FEATURES
            builder_class = builder_registry.lookup(*features)
            if builder_class is None:
                raise FeatureNotFound(
                    "Couldn't find a tree builder with the features you "
                    "requested: %s. Do you need to install a parser library?"
                    % ",".join(features))
            builder = builder_class()
        self.builder = builder
        self.is_xml = builder.is_xml
        self.builder.soup = self

        self.parse_only = parse_only

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        elif len(markup) <= 256:
            # Print out warnings for a couple beginner problems
            # involving passing non-markup to Beautiful Soup.
            # Beautiful Soup will still parse the input as markup,
            # just in case that's what the user really wants.
            if (isinstance(markup, unicode)
                and not os.path.supports_unicode_filenames):
                possible_filename = markup.encode("utf8")
            else:
                possible_filename = markup
            is_file = False
            try:
                is_file = os.path.exists(possible_filename)
            except Exception, e:
                # This is almost certainly a problem involving
                # characters not valid in filenames on this
                # system. Just let it go.
                pass
            if is_file:
                warnings.warn(
                    '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
            if markup[:5] == "http:" or markup[:6] == "https:":
                # TODO: This is ugly but I couldn't get it to work in
                # Python 3 otherwise.
                if ((isinstance(markup, bytes) and not b' ' in markup)
                    or (isinstance(markup, unicode) and not u' ' in markup)):
                    warnings.warn(
                        '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)