def test_combination(self):
    # A feature (or library name) combined with a markup type should
    # resolve to the builder class named in each assertion. Checks for
    # lxml and html5lib only run when their *_PRESENT flag is set.
    lookup = registry.lookup

    strict_html = lookup('strict', 'html')
    assert strict_html == HTMLParserTreeBuilder

    if LXML_PRESENT:
        fast_html = lookup('fast', 'html')
        assert fast_html == LXMLTreeBuilder
        permissive_xml = lookup('permissive', 'xml')
        assert permissive_xml == LXMLTreeBuilderForXML

    if HTML5LIB_PRESENT:
        html5lib_html = lookup('html5lib', 'html')
        assert html5lib_html == HTML5TreeBuilder
def test_named_library(self):
    # Each entry pairs the arguments given to registry.lookup() with the
    # builder class the lookup is expected to return.
    expectations = [
        (("lxml", "xml"), LXMLTreeBuilderForXML),
        (("lxml", "html"), LXMLTreeBuilder),
    ]
    if HTML5LIB_PRESENT:
        # Only checked when the HTML5LIB_PRESENT flag is set.
        expectations.append((("html5lib",), HTML5TreeBuilder))
    expectations.append((("html.parser",), HTMLParserTreeBuilder))

    for features, builder_class in expectations:
        self.assertEqual(registry.lookup(*features), builder_class)
def test_named_library(self):
    # Asking the registry for a parser library by name should return that
    # library's tree builder class.
    find = registry.lookup

    lxml_for_xml = find('lxml', 'xml')
    self.assertEqual(lxml_for_xml, LXMLTreeBuilderForXML)
    lxml_for_html = find('lxml', 'html')
    self.assertEqual(lxml_for_html, LXMLTreeBuilder)

    if HTML5LIB_PRESENT:
        # Only checked when the HTML5LIB_PRESENT flag is set.
        self.assertEqual(find('html5lib'), HTML5TreeBuilder)

    self.assertEqual(find('html.parser'), HTMLParserTreeBuilder)
def test_combination(self):
    # A quality feature ('fast', 'permissive', 'strict') combined with a
    # markup type should resolve to the builder class named in each
    # assertion.
    #
    # assertEqual is used instead of the assertEquals alias: the alias is
    # deprecated and was removed from unittest in Python 3.12.
    self.assertEqual(registry.lookup('fast', 'html'), LXMLTreeBuilder)
    self.assertEqual(
        registry.lookup('permissive', 'xml'), LXMLTreeBuilderForXML)
    self.assertEqual(
        registry.lookup('strict', 'html'), HTMLParserTreeBuilder)
    self.assertEqual(
        registry.lookup('permissive', 'html'), HTML5TreeBuilder)
def test_named_library(self):
    # Collect (lookup arguments, expected builder) pairs for the parser
    # libraries whose *_PRESENT flag is set, then verify them in order.
    checks = []
    if LXML_PRESENT:
        checks.append((("lxml", "xml"), LXMLTreeBuilderForXML))
        checks.append((("lxml", "html"), LXMLTreeBuilder))
    if HTML5LIB_PRESENT:
        checks.append((("html5lib",), HTML5TreeBuilder))
    # The html.parser lookup is not guarded by any flag.
    checks.append((("html.parser",), HTMLParserTreeBuilder))

    for lookup_args, expected_builder in checks:
        self.assertEqual(registry.lookup(*lookup_args), expected_builder)
def test_named_library(self):
    # Looking a parser library up by name should return that library's
    # tree builder class.
    lookup = registry.lookup

    if LXML_PRESENT:
        lxml_for_xml = lookup('lxml', 'xml')
        assert lxml_for_xml == LXMLTreeBuilderForXML
        lxml_for_html = lookup('lxml', 'html')
        assert lxml_for_html == LXMLTreeBuilder

    if HTML5LIB_PRESENT:
        html5lib_builder = lookup('html5lib')
        assert html5lib_builder == HTML5TreeBuilder

    # This lookup is not guarded by any availability flag.
    stdlib_builder = lookup('html.parser')
    assert stdlib_builder == HTMLParserTreeBuilder
def test_lookup_by_markup_type(self):
    # Looking up by markup type alone returns whichever builder the
    # registry prefers, which depends on the parsers flagged as present.
    if LXML_PRESENT:
        self.assertEqual(registry.lookup('html'), LXMLTreeBuilder)
        self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML)
    else:
        # Without lxml the registry is expected to have no XML builder.
        # assertIsNone states the None check directly and gives a clearer
        # failure message than assertEqual(..., None).
        self.assertIsNone(registry.lookup('xml'))
        if HTML5LIB_PRESENT:
            self.assertEqual(registry.lookup('html'), HTML5TreeBuilder)
        else:
            self.assertEqual(
                registry.lookup('html'), HTMLParserTreeBuilder)
def test_named_library(self):
    # Looking a parser library up by name should return that library's
    # tree builder class.
    #
    # assertEqual is used instead of the assertEquals alias: the alias is
    # deprecated and was removed from unittest in Python 3.12.
    self.assertEqual(registry.lookup('lxml', 'xml'), LXMLTreeBuilderForXML)
    self.assertEqual(registry.lookup('lxml', 'html'), LXMLTreeBuilder)
    self.assertEqual(registry.lookup('html5lib'), HTML5TreeBuilder)
    self.assertEqual(registry.lookup('html.parser'), HTMLParserTreeBuilder)
def test_lookup_by_markup_type(self):
    # Looking up by markup type alone returns whichever builder the
    # registry prefers, which depends on the parsers flagged as present.
    if LXML_PRESENT:
        assert registry.lookup('html') == LXMLTreeBuilder
        assert registry.lookup('xml') == LXMLTreeBuilderForXML
    else:
        # Without lxml the registry is expected to have no XML builder.
        # Compare to None by identity (PEP 8) rather than with ==.
        assert registry.lookup('xml') is None
        if HTML5LIB_PRESENT:
            assert registry.lookup('html') == HTML5TreeBuilder
        else:
            assert registry.lookup('html') == HTMLParserTreeBuilder
def test_combination(self):
    # Pair a quality feature or library name with a markup type and check
    # which builder class the registry hands back.
    lookup = registry.lookup

    if LXML_PRESENT:
        # Both lxml-backed combinations share a single guard.
        self.assertEqual(lookup("fast", "html"), LXMLTreeBuilder)
        self.assertEqual(lookup("permissive", "xml"), LXMLTreeBuilderForXML)

    self.assertEqual(lookup("strict", "html"), HTMLParserTreeBuilder)

    if HTML5LIB_PRESENT:
        self.assertEqual(lookup("html5lib", "html"), HTML5TreeBuilder)
def test_combination(self):
    # Build the list of (lookup arguments, expected builder) pairs that
    # apply given the *_PRESENT flags, then verify them in order.
    combos = []
    if LXML_PRESENT:
        combos.append((('fast', 'html'), LXMLTreeBuilder))
        combos.append((('permissive', 'xml'), LXMLTreeBuilderForXML))
    # The strict/html combination is checked unconditionally.
    combos.append((('strict', 'html'), HTMLParserTreeBuilder))
    if HTML5LIB_PRESENT:
        combos.append((('html5lib', 'html'), HTML5TreeBuilder))

    for features, expected_builder in combos:
        self.assertEqual(registry.lookup(*features), expected_builder)
def test_new_tag_creation(self):
    # Construct two Tag objects by hand, insert them into an empty <body>,
    # and check the serialized bytes.
    html_builder = builder_registry.lookup('html')()
    soup = self.soup("<body></body>", builder=html_builder)

    anchor = Tag(soup, html_builder, 'a')
    ordered_list = Tag(soup, html_builder, 'ol')
    anchor['href'] = 'http://foo.com/'

    soup.body.insert(0, anchor)
    soup.body.insert(1, ordered_list)

    rendered = soup.body.encode()
    expected = b'<body><a href="http://foo.com/"></a><ol></ol></body>'
    assert rendered == expected
def test_new_tag_creation(self):
    # Hand-built tags inserted into <body> should serialize in insertion
    # order, with the attribute set on the <a> tag included.
    tree_builder = builder_registry.lookup('html')()
    soup = self.soup("<body></body>", builder=tree_builder)

    link = Tag(soup, tree_builder, 'a')
    listing = Tag(soup, tree_builder, 'ol')
    link['href'] = 'http://foo.com/'

    # Position 0 receives the <a> tag, position 1 the <ol> tag.
    for position, new_tag in enumerate((link, listing)):
        soup.body.insert(position, new_tag)

    expected = b'<body><a href="http://foo.com/"></a><ol></ol></body>'
    self.assertEqual(soup.body.encode(), expected)
import copy import pickle import re import warnings from bs4 import BeautifulSoup from bs4.builder import ( builder_registry, HTMLParserTreeBuilder, ) from bs4.element import CData, NavigableString, SoupStrainer, Tag from bs4.testing import ( SoupTest, skipIf, ) XML_BUILDER_PRESENT = (builder_registry.lookup("xml") is not None) LXML_PRESENT = (builder_registry.lookup("lxml") is not None) class TreeTest(SoupTest): def assertSelects(self, tags, should_match): """Make sure that the given tags have the correct text. This is used in tests that define a bunch of tags, each containing a single string, and then select certain strings by some mechanism. """ self.assertEqual([tag.string for tag in tags], should_match) def assertSelectsIDs(self, tags, should_match): """Make sure that the given tags have the correct IDs.
def __init__(self, markup="", features=None, builder=None,
             parse_only=None, from_encoding=None, exclude_encodings=None,
             **kwargs):
    """The Soup object is initialized as the 'root tag', and the
    provided markup (which can be a string or a file-like object)
    is fed into the underlying parser.

    :param markup: A str or bytes object holding the markup, or any
        object with a ``read()`` method, which is read in full.
    :param features: A feature name, or a sequence of feature names,
        passed to ``builder_registry.lookup()`` when no ``builder`` is
        given. Falls back to ``self.DEFAULT_BUILDER_FEATURES`` when
        empty or None.
    :param builder: A tree builder instance to use. When None, a
        builder class is looked up from ``features`` and instantiated.
    :param parse_only: Stored as ``self.parse_only``.
    :param from_encoding: Passed to the builder's ``prepare_markup()``.
        Ignored, with a warning, when ``markup`` is a str.
    :param exclude_encodings: Passed to the builder's
        ``prepare_markup()``.
    :param kwargs: Only legacy argument names are recognised here. Each
        one triggers a warning and is otherwise ignored or mapped onto
        its renamed replacement.
    :raises TypeError: If an unrecognised keyword argument remains.
    :raises FeatureNotFound: If no registered builder provides the
        requested features.
    """
    if 'convertEntities' in kwargs:
        # Drop the argument like every other legacy keyword below;
        # otherwise the warning is immediately followed by the
        # "unexpected keyword argument" TypeError.
        del kwargs['convertEntities']
        warnings.warn(
            "BS4 does not respect the convertEntities argument to the "
            "BeautifulSoup constructor. Entities are always converted "
            "to Unicode characters.")

    if 'markupMassage' in kwargs:
        del kwargs['markupMassage']
        warnings.warn(
            "BS4 does not respect the markupMassage argument to the "
            "BeautifulSoup constructor. The tree builder is responsible "
            "for any necessary markup massage.")

    if 'smartQuotesTo' in kwargs:
        del kwargs['smartQuotesTo']
        warnings.warn(
            "BS4 does not respect the smartQuotesTo argument to the "
            "BeautifulSoup constructor. Smart quotes are always converted "
            "to Unicode characters.")

    if 'selfClosingTags' in kwargs:
        del kwargs['selfClosingTags']
        warnings.warn(
            "BS4 does not respect the selfClosingTags argument to the "
            "BeautifulSoup constructor. The tree builder is responsible "
            "for understanding self-closing tags.")

    if 'isHTML' in kwargs:
        del kwargs['isHTML']
        warnings.warn(
            "BS4 does not respect the isHTML argument to the "
            "BeautifulSoup constructor. Suggest you use "
            "features='lxml' for HTML and features='lxml-xml' for "
            "XML.")

    def deprecated_argument(old_name, new_name):
        # Pop a renamed legacy argument out of kwargs, warning about the
        # rename; returns None when the old name was not supplied.
        if old_name in kwargs:
            warnings.warn(
                'The "%s" argument to the BeautifulSoup constructor '
                'has been renamed to "%s."' % (old_name, new_name))
            value = kwargs[old_name]
            del kwargs[old_name]
            return value
        return None

    parse_only = parse_only or deprecated_argument(
        "parseOnlyThese", "parse_only")

    from_encoding = from_encoding or deprecated_argument(
        "fromEncoding", "from_encoding")

    if from_encoding and isinstance(markup, str):
        warnings.warn(
            "You provided Unicode markup but also provided a value for "
            "from_encoding. Your from_encoding will be ignored.")
        from_encoding = None

    # Anything still left in kwargs is not a recognised argument.
    if len(kwargs) > 0:
        arg = list(kwargs.keys()).pop()
        raise TypeError(
            "__init__() got an unexpected keyword argument '%s'" % arg)

    if builder is None:
        # Remember what the caller literally asked for, so we can tell
        # below whether a specific parser was named.
        original_features = features
        if isinstance(features, str):
            features = [features]
        if features is None or len(features) == 0:
            features = self.DEFAULT_BUILDER_FEATURES
        builder_class = builder_registry.lookup(*features)
        if builder_class is None:
            raise FeatureNotFound(
                "Couldn't find a tree builder with the features you "
                "requested: %s. Do you need to install a parser library?"
                % ",".join(features))
        builder = builder_class()
        if not (original_features == builder.NAME or
                original_features in builder.ALTERNATE_NAMES):
            # The caller did not name this parser explicitly, so warn
            # about which one was chosen on their behalf.
            if builder.is_xml:
                markup_type = "XML"
            else:
                markup_type = "HTML"

            # extract_stack() lists frames oldest-first, so [0] is the
            # outermost frame of the running program.
            caller = traceback.extract_stack()[0]
            filename = caller[0]
            line_number = caller[1]
            warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
                filename=filename,
                line_number=line_number,
                parser=builder.NAME,
                markup_type=markup_type))

    self.builder = builder
    self.is_xml = builder.is_xml
    self.known_xml = self.is_xml
    self.builder.soup = self

    self.parse_only = parse_only

    if hasattr(markup, 'read'):        # It's a file-type object.
        markup = markup.read()
    elif len(markup) <= 256 and (
            (isinstance(markup, bytes) and b'<' not in markup)
            or (isinstance(markup, str) and '<' not in markup)
    ):
        # Print out warnings for a couple beginner problems
        # involving passing non-markup to Beautiful Soup.
        # Beautiful Soup will still parse the input as markup,
        # just in case that's what the user really wants.
        if (isinstance(markup, str)
                and not os.path.supports_unicode_filenames):
            possible_filename = markup.encode("utf8")
        else:
            possible_filename = markup
        is_file = False
        try:
            is_file = os.path.exists(possible_filename)
        except Exception:
            # This is almost certainly a problem involving
            # characters not valid in filenames on this
            # system. Just let it go.
            pass
        if is_file:
            if isinstance(markup, str):
                markup = markup.encode("utf8")
            # The adjacent literals are joined at compile time, so each
            # fragment carries its own trailing space.
            warnings.warn(
                '"%s" looks like a filename, not markup. You should '
                'probably open this file and pass the filehandle into '
                'Beautiful Soup.' % markup)
        self._check_markup_is_url(markup)

    # Try each candidate the builder proposes for this markup until the
    # parser accepts one.
    for (self.markup, self.original_encoding, self.declared_html_encoding,
         self.contains_replacement_characters) in (
             self.builder.prepare_markup(
                 markup, from_encoding,
                 exclude_encodings=exclude_encodings)):
        self.reset()
        try:
            self._feed()
            break
        except ParserRejectedMarkup:
            pass

    # Clear out the markup and remove the builder's circular
    # reference to this object.
    self.markup = None
    self.builder.soup = None
def test_lookup_by_markup_type(self):
    # Looking up by markup type alone should return the builder class
    # named in each assertion.
    #
    # assertEqual is used instead of the assertEquals alias: the alias is
    # deprecated and was removed from unittest in Python 3.12.
    self.assertEqual(registry.lookup('html'), HTML5TreeBuilder)
    self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML)
def test_unimplemented_combinations(self):
    # A feature combination that no builder provides should make the
    # lookup return None.
    #
    # assertIsNone replaces assertEquals(..., None): the assertEquals
    # alias is deprecated and was removed from unittest in Python 3.12,
    # and assertIsNone states the check directly.
    self.assertIsNone(registry.lookup('fast', 'permissive', 'html'))
def test_combination(self):
    # A feature (or library name) combined with a markup type should
    # resolve to the builder class named in each assertion.
    lookup = registry.lookup

    fast_html = lookup("fast", "html")
    self.assertEqual(fast_html, LXMLTreeBuilder)

    permissive_xml = lookup("permissive", "xml")
    self.assertEqual(permissive_xml, LXMLTreeBuilderForXML)

    strict_html = lookup("strict", "html")
    self.assertEqual(strict_html, HTMLParserTreeBuilder)

    if HTML5LIB_PRESENT:
        # Only checked when the HTML5LIB_PRESENT flag is set.
        html5lib_html = lookup("html5lib", "html")
        self.assertEqual(html5lib_html, HTML5TreeBuilder)
from bs4.builder import builder_registry
import requests
import csv

# Fetch the JIRA issue page. Only the commented-out experiments below use
# the response; the live code parses a local file instead.
url = "https://issues.apache.org/jira/browse/CAMEL-10597"
r = requests.get(url)
#soup=BeautifulSoup(r.content,"html")
#xml_doc = open('xml_doc')
#soup = BeautifulSoup(r.content, 'xml')
#soup = BeautifulSoup(xml_doc,'lxml')

# Parse the local file inside a context manager so the handle is closed
# as soon as the soup has been built, instead of being left open.
with open('xml_file') as doc:
    soup = BeautifulSoup(doc, 'lxml')
#print(soup)
print(builder_registry.lookup('html').DEFAULT_CDATA_LIST_ATTRIBUTES)
#print( builder_registry.lookup('html').DEFAULT_CDATA_LIST_ATTRIBUTES)
#print(soup.prettify())

tag = soup.title
print(tag)

people = {}
date = {}
# Single-pass loop that pulls individual fields out of the parsed document.
for i in range(1):
    typee = soup.type
    title = soup.summary
    prio = soup.priority
    resol = soup.resolution
def test_lookup_by_markup_type(self):
    # The builder expected for plain 'html' depends on the LXML_PRESENT
    # flag; the 'xml' expectation is checked unconditionally.
    expected_html = LXMLTreeBuilder if LXML_PRESENT else HTML5TreeBuilder
    self.assertEqual(registry.lookup("html"), expected_html)

    xml_builder = registry.lookup("xml")
    self.assertEqual(xml_builder, LXMLTreeBuilderForXML)
def __init__(self, markup="", features=None, builder=None, parse_only=None, from_encoding=None, **kwargs): """The Soup object is initialized as the 'root tag', and the provided markup (which can be a string or a file-like object) is fed into the underlying parser.""" if 'convertEntities' in kwargs: warnings.warn( "BS4 does not respect the convertEntities argument to the " "BeautifulSoup constructor. Entities are always converted " "to Unicode characters.") if 'markupMassage' in kwargs: del kwargs['markupMassage'] warnings.warn( "BS4 does not respect the markupMassage argument to the " "BeautifulSoup constructor. The tree builder is responsible " "for any necessary markup massage.") if 'smartQuotesTo' in kwargs: del kwargs['smartQuotesTo'] warnings.warn( "BS4 does not respect the smartQuotesTo argument to the " "BeautifulSoup constructor. Smart quotes are always converted " "to Unicode characters.") if 'selfClosingTags' in kwargs: del kwargs['selfClosingTags'] warnings.warn( "BS4 does not respect the selfClosingTags argument to the " "BeautifulSoup constructor. The tree builder is responsible " "for understanding self-closing tags.") if 'isHTML' in kwargs: del kwargs['isHTML'] warnings.warn( "BS4 does not respect the isHTML argument to the " "BeautifulSoup constructor. 
You can pass in features='html' " "or features='xml' to get a builder capable of handling " "one or the other.") def deprecated_argument(old_name, new_name): if old_name in kwargs: warnings.warn( 'The "%s" argument to the BeautifulSoup constructor ' 'has been renamed to "%s."' % (old_name, new_name)) value = kwargs[old_name] del kwargs[old_name] return value return None parse_only = parse_only or deprecated_argument( "parseOnlyThese", "parse_only") from_encoding = from_encoding or deprecated_argument( "fromEncoding", "from_encoding") if len(kwargs) > 0: arg = kwargs.keys().pop() raise TypeError( "__init__() got an unexpected keyword argument '%s'" % arg) if builder is None: if isinstance(features, basestring): features = [features] if features is None or len(features) == 0: features = self.DEFAULT_BUILDER_FEATURES builder_class = builder_registry.lookup(*features) if builder_class is None: raise FeatureNotFound( "Couldn't find a tree builder with the features you " "requested: %s. Do you need to install a parser library?" % ",".join(features)) builder = builder_class() self.builder = builder self.is_xml = builder.is_xml self.builder.soup = self self.parse_only = parse_only if hasattr(markup, 'read'): # It's a file-type object. markup = markup.read() elif len(markup) <= 256: # Print out warnings for a couple beginner problems # involving passing non-markup to Beautiful Soup. # Beautiful Soup will still parse the input as markup, # just in case that's what the user really wants. if (isinstance(markup, unicode) and not os.path.supports_unicode_filenames): possible_filename = markup.encode("utf8") else: possible_filename = markup is_file = False try: is_file = os.path.exists(possible_filename) except Exception, e: # This is almost certainly a problem involving # characters not valid in filenames on this # system. Just let it go. pass if is_file: warnings.warn( '"%s" looks like a filename, not markup. 
You should probably open this file and pass the filehandle into Beautiful Soup.' % markup) if markup[:5] == "http:" or markup[:6] == "https:": # TODO: This is ugly but I couldn't get it to work in # Python 3 otherwise. if ((isinstance(markup, bytes) and not b' ' in markup) or (isinstance(markup, unicode) and not u' ' in markup)): warnings.warn( '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)