def _work(listener, cached_doc_path):
    """Yield one ``{'label': ..., 'url': ...}`` dict per "Articles" list item.

    The document is obtained from ``_ps_lib().soup_via_locators_`` using the
    module-level ``_url``, the given ``cached_doc_path`` and ``listener``
    (both are passed straight through). If that call returns ``None`` the
    generator finishes without yielding anything.

    The items come from the element that directly follows the H2 whose text
    is exactly "Articles" inside the ``div#readme``.

    Raises:
        AssertionError: the page does not have the expected structure (no
            "Articles" H2, the element after it is not a UL, or a list item
            holds something other than an A optionally followed by a CODE).
        ValueError: there is not exactly one ``div#readme``, or a list item
            has a number of child tags other than one or two (both come from
            tuple unpacking).
    """
    soup = _ps_lib().soup_via_locators_(
        url=_url, html_document_path=cached_doc_path, listener=listener)
    if soup is None:
        return

    readme, = soup.find_all('div', id='readme')

    # find the H2 called "Articles"
    for heading in readme.find_all('h2'):
        if 'Articles' != heading.text:
            continue
        ul = heading.find_next_sibling()  # not the H2 but whatever is after it
        assert ('ul' == ul.name)
        break
    else:
        # Without this guard `ul` would be unbound below and the failure
        # would surface as an uninformative UnboundLocalError.
        raise AssertionError('no H2 called "Articles" found in the readme')

    for li in sv.filter('li', ul):
        children = sv.filter('*', li)  # immediate children, skipping strings
        if 1 == len(children):
            a, = children
            desc_tag_name = None
        else:
            a, desc = children
            desc_tag_name = desc.name
        assert ('a' == a.name)

        # NOTE this skips over some interesting strings that are like descs
        dct = {'label': a.text, 'url': a['href']}
        if desc_tag_name is not None:
            assert ('code' == desc_tag_name)  # meh
        yield dct
def test_invalid_type_input_filter(self):
    """Filtering a plain string with debug flags set must raise `TypeError`."""
    debug_flags = sv.DEBUG

    with self.assertRaises(TypeError):
        sv.filter('div', "not a tag", flags=debug_flags)
def test_filter(self):
    """Filter `pre#6` from a body tag and from a list of its child tags."""
    markup = """ <!-- before header --> <html> <head> </head> <body> <!-- comment --> <p id="1"><code id="2"></code><img id="3" src="./image.png"/></p> <pre id="4"></pre> <p><span id="5" class="some-class"></span><span id="some-id"></span></p> <pre id="6" class='ignore'> <!-- don't ignore --> </pre> </body> </html> """

    soup = bs4.BeautifulSoup(markup, 'html5lib')

    # First pass: hand the body tag itself to the filter.
    matched = sv.filter('pre#\\36', soup.html.body)
    self.assertEqual(len(matched), 1)
    self.assertEqual(matched[0].attrs['id'], '6')

    # Second pass: hand over an explicit list holding only the child tags.
    child_tags = [
        child for child in soup.html.body.children
        if isinstance(child, bs4.Tag)
    ]
    matched = sv.filter('pre#\\36', child_tags)
    self.assertEqual(len(matched), 1)
    self.assertEqual(matched[0].attrs['id'], '6')
def test_invalid_type_input(self):
    """Each API entry point must raise `TypeError` when given a plain string."""
    # Calls are deferred so that every lookup and call happens inside its
    # own `assertRaises` context, one after the other.
    bad_calls = (
        lambda: sv.match('div', "not a tag"),
        lambda: sv.select('div', "not a tag"),
        lambda: sv.filter('div', "not a tag"),
        lambda: sv.comments('div', "not a tag"),
    )

    for bad_call in bad_calls:
        with self.assertRaises(TypeError):
            bad_call()
def test_filter_list(self):
    """
    Filter a plain list built from a tag's children.

    The list holds every child node of the body, not only tags; filter is
    expected to cope with that as long as the items are document nodes.
    """
    markup = """ <!-- before header --> <html> <head> </head> <body> <!-- comment --> <p id="1"><code id="2"></code><img id="3" src="./image.png"/></p> <pre id="4"></pre> <p><span id="5" class="some-class"></span><span id="some-id"></span></p> <pre id="6" class='ignore'> <!-- don't ignore --> </pre> </body> </html> """

    soup = self.soup(markup, 'html5lib')
    body_children = list(soup.html.body.children)

    matched = sv.filter('pre#\\36', body_children)
    self.assertEqual(len(matched), 1)
    self.assertEqual(matched[0].attrs['id'], '6')
def test_filter_tag_order(self):
    """Filtering the first paragraph for `[id]` yields ids '2' then '3'."""
    markup = """ <!-- before header --> <html> <head> </head> <body> <!-- comment --> <p id="1"><code id="2"></code><img id="3" src="./image.png"/></p> <pre id="4"></pre> <p><span id="5" class="some-class"></span><span id="some-id"></span></p> <pre id="6" class='ignore'> <!-- don't ignore --> </pre> </body> </html> """

    soup = self.soup(markup, 'html.parser')
    first_paragraph = soup.html.body.p

    matched = sv.filter('[id]', first_paragraph)
    ids = [tag['id'] for tag in matched]
    self.assertEqual(['2', '3'], ids)
def test_filter_tag(self):
    """Filtering the body tag for `pre#6` returns exactly that one element."""
    markup = """ <!-- before header --> <html> <head> </head> <body> <!-- comment --> <p id="1"><code id="2"></code><img id="3" src="./image.png"/></p> <pre id="4"></pre> <p><span id="5" class="some-class"></span><span id="some-id"></span></p> <pre id="6" class='ignore'> <!-- don't ignore --> </pre> </body> </html> """

    soup = self.soup(markup, 'html5lib')
    body = soup.html.body

    matched = sv.filter('pre#\\36', body)
    self.assertEqual(len(matched), 1)
    self.assertEqual(matched[0].attrs['id'], '6')
def _filter(sel, el):
    """Return the result of `sv.filter` for selector `sel` applied to `el`."""
    matches = sv.filter(sel, el)
    return matches
def _filter(sel, el):
    """Return `soupsieve.filter(sel, el)`; soupsieve is imported on call."""
    import soupsieve

    return soupsieve.filter(sel, el)
def _filter(sel, el):
    """Return `soupsieve.filter(sel, el)`; soupsieve is imported on call.

    Note carried with this helper: at #history-A.1 BeautifulSoup changed.
    """
    import soupsieve

    return soupsieve.filter(sel, el)
def _direct_children(node):
    """Return `sv.filter('*', node)`: the children of `node`, omitting strings.

    Same idea as `_filter`, with the selector fixed to match any tag.
    """
    children = sv.filter('*', node)
    return children