def do_parse(num):
    """Parse HTML ``num`` times and return the growth in gc-tracked objects.

    Used to detect reference leaks: after a warm-up, repeated parsing
    should leave the live-object count essentially unchanged.
    """
    collect()
    baseline = len(gc.get_objects())
    for _ in range(num):
        parse(HTML)
    collect()
    return len(gc.get_objects()) - baseline
def get_soup(self, src, url=None):
    """Parse raw HTML ``src`` into a soup, applying the recipe's cleanup hooks.

    Pipeline: decode to unicode, run ``preprocess_raw_html`` and the
    ``preprocess_regexps`` substitutions, parse, then apply
    ``keep_only_tags`` / ``remove_tags_before`` / ``remove_tags_after`` /
    ``remove_tags`` pruning, and finally ``preprocess_html_ext``.

    :param src: raw HTML (bytes or unicode) fetched from ``url``
    :param url: the URL the HTML came from, passed to preprocess hooks
    :return: the processed soup as returned by ``preprocess_html_ext``
    """
    nmassage = []
    nmassage.extend(self.preprocess_regexps)
    # Remove comments as they can leave detritus when extracting tags leaves
    # multiple nested comments
    nmassage.append((re.compile(r'<!--.*?-->', re.DOTALL), lambda m: ''))
    usrc = xml_to_unicode(src, self.verbose, strip_encoding_pats=True)[0]
    usrc = self.preprocess_raw_html(usrc, url)
    for pat, repl in nmassage:
        usrc = pat.sub(repl, usrc)
    set_soup_module(sys.modules[BeautifulSoup.__module__])
    soup = parse(usrc, return_root=False)
    # The recipe may return replacement markup from prepreprocess_html_ext;
    # if so, re-run the regexp massage on it and re-parse from scratch.
    replace = self.prepreprocess_html_ext(soup)
    if replace is not None:
        replace = xml_to_unicode(replace, self.verbose, strip_encoding_pats=True)[0]
        for pat, repl in nmassage:
            replace = pat.sub(repl, replace)
        soup = parse(replace, return_root=False)

    if self.keep_only_tags:
        # Build a fresh <body> containing only the tags matching the
        # keep_only_tags specs, then swap it in for the original body.
        body = Tag(soup, 'body')
        try:
            if isinstance(self.keep_only_tags, dict):
                self.keep_only_tags = [self.keep_only_tags]
            for spec in self.keep_only_tags:
                for tag in soup.find('body').findAll(**spec):
                    body.insert(len(body.contents), tag)
            soup.find('body').replaceWith(body)
        except AttributeError:  # soup has no body element
            pass

    def remove_beyond(tag, next):
        # Extract every sibling of ``tag`` in the ``next`` direction
        # ('nextSibling' or 'previousSibling'), then climb to the parent
        # and repeat, stopping once the <body> element is reached.
        while tag is not None and getattr(tag, 'name', None) != 'body':
            after = getattr(tag, next)
            while after is not None:
                # NOTE(review): ns is re-read from ``tag`` (not ``after``)
                # before the extract; after extraction tag's sibling link
                # advances, so the walk still terminates — confirm before
                # restructuring this loop.
                ns = getattr(tag, next)
                after.extract()
                after = ns
            tag = tag.parent

    if self.remove_tags_after is not None:
        rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
        for spec in rt:
            tag = soup.find(**spec)
            remove_beyond(tag, 'nextSibling')

    if self.remove_tags_before is not None:
        rt = [self.remove_tags_before] if isinstance(self.remove_tags_before, dict) else self.remove_tags_before
        for spec in rt:
            tag = soup.find(**spec)
            remove_beyond(tag, 'previousSibling')

    # Finally drop every tag matching the remove_tags specs outright.
    for kwds in self.remove_tags:
        for tag in soup.findAll(**kwds):
            tag.extract()

    return self.preprocess_html_ext(soup)
def test_soup_leak(self):
    """Repeated parsing must not leak: the live-object count stays flat."""
    HTML = '<p a=1>\n<a b=2 id=3>y</a>z<x:x class=4>1</x:x>'
    # Warm-up parse so BS and html_parser set up any internal objects first.
    parse(HTML)

    def live_objects():
        collect()
        return len(gc.get_objects())

    def leaked_after(iterations):
        start = live_objects()
        for _ in range(iterations):
            parse(HTML)
        return live_objects() - start

    for count in (1, 10, 100):
        self.assertLess(leaked_after(count), 2)
def test_doctype_stays_intact(self):
    """With keep_doctype=True the DOCTYPE must round-trip unchanged."""
    tail = '\n<html><body><p>xxx</p></body></html>'
    declarations = (
        'html',
        'html PUBLIC "-//W3C//DTD HTML 4.01//EN" '
        '"http://www.w3.org/TR/html4/strict.dtd"',
    )
    for decl in declarations:
        doctype = '<!DOCTYPE {}>'.format(decl)
        soup = parse(doctype + tail, return_root=False, keep_doctype=True)
        # The serialized doctype is the first line of the output.
        first_line, _, _ = str(soup).partition('\n')
        self.ae(doctype, first_line)
def test_soup_list_attrs(self):
    """bs4 exposes multi-valued attributes (class, rel) as token lists."""
    if is_bs3():
        self.skipTest('No bs4 module found')
    root = parse('<a class="a b" rel="x y">')
    expected = {'class': ['a', 'b'], 'rel': ['x', 'y']}
    self.ae(root.body.a.attrs, expected)
def test_attr_soup(self):
    """Attribute handling: name lowercasing, namespaced attrs, xmlns."""
    root = parse('<p a=1 b=2 ID=3><a a=a>')
    # Attribute names are normalized to lowercase (ID -> id).
    self.ae(dict(root.body.p.attrs), {'a': '1', 'b': '2', 'id': '3'})
    self.ae(dict(root.body.p.a.attrs), {'a': 'a'})
    self.ae(str(root.find(name='a', a='a')), '<a a="a"></a>')

    # Namespaced attributes like xlink:href survive serialization.
    root = parse('<p a=1><svg><image xlink:href="h">')
    expected = (
        '<html><head></head><body>'
        '<p a="1"><svg><image xlink:href="h"/></svg></p>'
        '</body></html>'
    )
    self.ae(str(root), expected)

    # xml:lang and lang are kept as distinct attributes on <html>.
    root = parse('<html xml:lang="en" lang="fr"><p>')
    self.ae(dict(root.attrs), {'xml:lang': 'en', 'lang': 'fr'})

    # xmlns declarations on arbitrary tags are preserved.
    root = parse('<p><x xmlns:a="b">')
    self.ae(
        str(root),
        '<html><head></head><body><p><x xmlns:a="b"></x></p></body></html>'
    )
def test_simple_soup(self):
    """Basic tree building: text nodes, namespaced tags, comments, nesting."""
    cases = (
        ('<p>\n<a>y</a>z<x:x>1</x:x>',
         '<html><head></head><body><p>\n<a>y</a>z<x:x>1</x:x></p></body></html>'),
        ('<svg><image>',
         '<html><head></head><body><svg><image></image></svg></body></html>'),
        ('<p><!-- ---->',
         '<html><head></head><body><p><!-- ----></p></body></html>'),
        ('<p><i><b>',
         '<html><head></head><body><p><i><b></b></i></p></body></html>'),
    )
    for markup, expected in cases:
        self.ae(str(parse(markup)), expected)
def parse_html(markup):
    """Normalize *markup* (unicode or bytes) to clean text and parse it.

    :param markup: HTML as a unicode string or raw bytes
    :return: the soup produced by ``html5_soup.parse`` (full document, not root)
    """
    if isinstance(markup, str):
        # Already unicode: drop encoding declarations and resolve entities.
        markup = chardet.strip_encoding_declarations(markup)
        markup = chardet.substitute_entites(markup)
    else:
        # Raw bytes: decode while stripping encoding declarations and
        # resolving entities in one pass.
        markup = chardet.xml_to_unicode(
            markup, strip_encoding_pats=True, resolve_entities=True)[0]
    cleaned = cleantext.clean_xml_chars(markup)
    return html5_soup.parse(cleaned, return_root=False)
def parse_html(markup):
    """Clean *markup* (unicode or bytes) and build a soup with html5-parser.

    :param markup: HTML as a unicode string or raw bytes
    :return: the soup produced by ``html5_parser.soup.parse``
    """
    from calibre.ebooks.chardet import strip_encoding_declarations, substitute_entites, xml_to_unicode
    from calibre.utils.cleantext import clean_xml_chars
    from html5_parser.soup import parse

    if isinstance(markup, unicode_type):
        # Already unicode: drop encoding declarations and resolve entities.
        markup = strip_encoding_declarations(markup)
        markup = substitute_entites(markup)
    else:
        # Raw bytes: decode, stripping encoding declarations and resolving
        # entities in a single pass.
        markup = xml_to_unicode(
            markup, strip_encoding_pats=True, resolve_entities=True)[0]
    return parse(clean_xml_chars(markup), return_root=False)