def test_invalid_type_input_match(self):
    """Check that `sv.match` raises `TypeError` for a non-tag input with debug flags."""

    # A plain string is passed where a tag object is expected.
    with self.assertRaises(TypeError):
        sv.match('div', "not a tag", flags=sv.DEBUG)
def test_invalid_type_input(self):
    """Check that `match`, `select`, `filter`, and `comments` raise `TypeError` for a non-tag input."""

    # Each API is looked up and called inside its own `assertRaises` context,
    # in the same order as the individual checks would run.
    for api_name in ('match', 'select', 'filter', 'comments'):
        with self.assertRaises(TypeError):
            getattr(sv, api_name)('div', "not a tag")
def extract(tag):
    """Flatten a parsed node into text.

    A bare ``NavigableString`` is passed through ``stringify``; a ``Comment``
    yields an empty string. For any other node the children are walked in
    order:

    * plain strings are stringified, comments are skipped;
    * ``<br>`` becomes a newline;
    * ``<p>`` contributes a newline followed by its extracted content;
    * ``<b>`` contributes its extracted content with no surrounding markers;
    * an element matching ``div.codebox`` is wrapped in ``[code]``/``[/code]``
      around the result of ``extact_code``;
    * anything else contributes a newline followed by its extracted content.
    """
    # Exact type checks (not isinstance): Comment must not be treated as a
    # plain NavigableString.
    node_type = type(tag)
    if node_type is NavigableString:
        return stringify(tag)
    if node_type is Comment:
        return ''

    pieces = []
    for child in tag.contents:
        child_type = type(child)
        if child_type is NavigableString:
            pieces.append(stringify(child))
        elif child_type is Comment:
            continue
        elif child.name == 'br':
            pieces.append('\n')
        elif child.name == 'p':
            pieces.append('\n')
            pieces.append(extract(child))
        elif child.name == 'b':
            # Bold content is emitted without any [b]...[/b] markers.
            pieces.append(extract(child))
        elif sv.match('div.codebox', child):
            pieces.append('[code]')
            pieces.append(extact_code(child))
            pieces.append('[/code]')
        else:
            pieces.append('\n')
            pieces.append(extract(child))
    return ''.join(pieces)
def test_nth_child_no_parent(self):
    """Test `nth` child with no parent.

    A lone `<p>` is parsed with each available parser, detached from the
    soup with `extract()`, and must still match `p:nth-child(1)`.
    """

    # Paragraph is the root. There is no document.
    # (The markup is the same for every parser, so it is defined once here;
    # the larger multi-element markup that used to precede the loop was never
    # read before being overwritten and has been dropped.)
    markup = """<p id="1">text</p>"""

    for parser in util.available_parsers('html.parser', 'lxml', 'html5lib'):
        soup = self.soup(markup, parser)
        fragment = soup.p.extract()
        self.assertTrue(sv.match("p:nth-child(1)", fragment, flags=sv.DEBUG))
def test_dir_on_input_root(self):
    """Check `:root:dir(ltr)` against a detached `<input dir="auto">` element."""

    markup = """<input id="1" type="text" dir="auto">"""

    # The input element is extracted from the soup, so it is its own root.
    parsers = util.available_parsers('html.parser', 'lxml', 'html5lib')
    for parser_name in parsers:
        detached_input = self.soup(markup, parser_name).input.extract()
        matched = sv.match(":root:dir(ltr)", detached_input, flags=sv.DEBUG)
        self.assertTrue(matched)
def test_dir_on_input_root(self):
    """Check `:root:dir(ltr)` on an extracted `<input dir="auto">` for a fixed parser list."""

    markup = """<input id="1" type="text" dir="auto">"""

    # Input is root: it is pulled out of the parsed soup before matching.
    for parser_name in ('html.parser', 'lxml', 'html5lib'):
        parsed = self.soup(markup, parser_name)
        root_input = parsed.input.extract()
        result = sv.match(":root:dir(ltr)", root_input, flags=sv.DEBUG)
        self.assertTrue(result)
def extract(self, response, element_names, attr_names):
    """Yield a ``Node`` for each non-empty requested attribute in the response markup.

    The content obtained from ``self.get_content(response)`` is parsed with
    ``html.parser``. An element is considered when ``element_names`` is empty
    or contains the element's name, and when it matches none of
    ``self.ignore_css_selectors``. For every name in ``attr_names`` whose
    value on the element is truthy, a ``Node`` is yielded with the element
    name as ``source`` and the attribute value, stripped of its URL fragment,
    as ``path``.
    """
    soup = BeautifulSoup(self.get_content(response), "html.parser")

    for element in soup.find_all():
        # Skip elements outside the requested tag names (an empty filter
        # means "all elements").
        if element_names and element.name not in element_names:
            continue

        # Skip elements excluded by any configured CSS selector.
        if any(soupsieve.match(selector, element)
               for selector in self.ignore_css_selectors):
            continue

        for attr_name in attr_names:
            value = element.get(attr_name)
            if not value:
                continue
            yield Node(source=element.name, path=urldefrag(value)[0])
def test_match(self):
    """Check `sv.match` with an escaped ID selector.

    `span#\\35` uses a CSS code-point escape; the first `span` carrying an
    `id` (id "5") must match and the second (id "some-id") must not.
    """

    markup = """ <!-- before header --> <html> <head> </head> <body> <!-- comment --> <p id="1"><code id="2"></code><img id="3" src="./image.png"/></p> <pre id="4"></pre> <p><span id="5" class="some-class"></span><span id="some-id"></span></p> <pre id="6" class='ignore'> <!-- don't ignore --> </pre> </body> </html> """

    escaped_id_selector = 'span#\\35'
    spans_with_id = sv.select('span[id]', self.soup(markup, 'html5lib'))

    self.assertTrue(sv.match(escaped_id_selector, spans_with_id[0]))
    self.assertFalse(sv.match(escaped_id_selector, spans_with_id[1]))
def extract_forms(self, path, response, ignore_form_fields=None):
    """Build a ``Node`` for each ``<form>`` in the response that is not ignored.

    The content from ``self.get_content(response)`` is parsed with
    ``html.parser``. Forms matching any of ``self.ignore_css_selectors`` are
    skipped. For the rest, the node's method falls back to ``GET`` when the
    form has no ``method`` attribute, its path falls back to ``path`` when
    there is no ``action``, and its params map each named ``<input>`` to its
    ``value`` (empty string when absent). ``ignore_form_fields`` is passed
    through to every ``Node``.

    Returns the list of nodes in document order.
    """
    soup = BeautifulSoup(self.get_content(response), "html.parser")

    forms = []
    for form in soup.find_all('form'):
        if any(soupsieve.match(sel, form) for sel in self.ignore_css_selectors):
            continue

        method = form.get('method', GET)
        action = form.get('action', path)

        # Only inputs that carry a `name` attribute contribute a parameter.
        params = {}
        for field in form.find_all('input', {'name': True}):
            params[field['name']] = field.get('value', '')

        forms.append(Node(source=FORM,
                          method=method,
                          path=action,
                          params=params,
                          ignore_form_fields=ignore_form_fields))
    return forms
def test_nth_child(self):
    """Exercise `:nth-child()` with a range of `an+b`, keyword, and negative forms.

    Each case pairs a selector with the list of element IDs passed to
    `assert_selector` as the expected result. A final check matches
    `p:nth-child(1)` against a paragraph extracted from its document.
    """

    markup = """ <p id="0"></p> <p id="1"></p> <span id="2"></span> <span id="3"></span> <span id="4"></span> <span id="5"></span> <span id="6"></span> <p id="7"></p> <p id="8"></p> <p id="9"></p> <p id="10"></p> <span id="11"></span> """

    # Cases invoked without an explicit `flags` argument.
    unflagged_cases = (
        ("p:nth-child(2n-5)", ['0', '8', '10']),
        ("p:NTH-CHILD(2n-5)", ['0', '8', '10']),
        ("p:nth-child(-2n+20)", ['1', '7', '9']),
        ("p:nth-child(50n-20)", []),
        ("p:nth-child(-2n-2)", []),
    )
    for selector, expected_ids in unflagged_cases:
        self.assert_selector(markup, selector, expected_ids)

    # Cases invoked with `flags=util.HTML5`.
    html5_cases = (
        ("p:nth-child(-2)", []),
        ("p:nth-child(2)", ['1']),
        ("p:nth-child(9n - 1)", ['7']),
        ("p:nth-child(2n + 1)", ['0', '8', '10']),
        ("p:nth-child(-n+3)", ['0', '1']),
        ("span:nth-child(-n+3)", ['2']),
        ("body *:nth-child(-n+3)", ['0', '1', '2']),
        ("p:nth-child(odd)", ['0', '8', '10']),
        ("p:nth-child(even)", ['1', '7', '9']),
    )
    for selector, expected_ids in html5_cases:
        self.assert_selector(markup, selector, expected_ids, flags=util.HTML5)

    # Paragraph is the root. There is no document.
    markup = """<p id="1">text</p>"""
    parsed = bs4.BeautifulSoup(markup, 'html5lib')
    detached_paragraph = parsed.p.extract()
    self.assertTrue(sv.match("p:nth-child(1)", detached_paragraph, flags=sv.DEBUG))