def test_get_tags_by_filter(self):
    """
    Filtering an HTML response by the ('a', 'b') tag names must return one
    Tag per matching element, including its attributes and text.
    """
    html = '<html><a href="/abc">foo</a><b>bar</b></html>'
    target_url = URL('http://www.w3af.com/')

    response_headers = Headers()
    response_headers['content-type'] = 'text/html'

    response = HTTPResponse(200,
                            html,
                            response_headers,
                            target_url,
                            target_url,
                            charset='utf-8')

    found_tags = self.mpdoc.get_tags_by_filter(response,
                                               ('a', 'b'),
                                               yield_text=True)

    expected_tags = [Tag('a', {'href': '/abc'}, 'foo'),
                     Tag('b', {}, 'bar')]

    self.assertEqual(expected_tags, found_tags)
def test_get_tags_by_filter_empty_tag(self):
    """
    A tag without any text content (an empty <script>) must still be
    returned by the filter, with its attributes and None as its text.
    """
    html = '<html><script src="foo.js"></script></html>'
    target_url = URL('http://www.w3af.com/')

    response_headers = Headers()
    response_headers['content-type'] = 'text/html'

    response = HTTPResponse(200,
                            html,
                            response_headers,
                            target_url,
                            target_url,
                            charset='utf-8')

    found_tags = self.mpdoc.get_tags_by_filter(response,
                                               ('script',),
                                               yield_text=True)

    # Note that lxml returns None for this tag text:
    expected_tags = [Tag('script', {'src': 'foo.js'}, None)]

    self.assertEqual(expected_tags, found_tags)
def test_nested_with_text(self):
    """
    When an <a> tag holds text followed by a nested <div>, the Tag yielded
    for <a> is expected to carry only the leading text ('foo'); the nested
    <div> is not part of the ('a', 'b') filter and is not returned.
    """
    html = '<html><a href="/abc">foo<div>bar</div></a></html>'
    target_url = URL('http://www.w3af.com/')

    response_headers = Headers()
    response_headers['content-type'] = 'text/html'

    response = HTTPResponse(200,
                            html,
                            response_headers,
                            target_url,
                            target_url,
                            charset='utf-8')

    parser = SGMLParser(response)

    # get_tags_by_filter() is consumed through list() here, so the
    # comparison is made against a concrete list
    found_tags = list(parser.get_tags_by_filter(('a', 'b'), yield_text=True))

    expected_tags = [Tag('a', {'href': '/abc'}, 'foo')]

    self.assertEqual(expected_tags, found_tags)
def load_tags_from_temp_file(filename, remove=True):
    """
    Read the msgpack-serialized tags stored in a file and rebuild them as
    Tag instances.

    :param filename: The filename that holds the Tags as msgpack
    :param remove: Remove the file after reading. The removal is attempted
                   both when the read succeeds and when it fails.
    :return: A list containing tags
    :raises: Any exception raised while opening the file, decoding the
             msgpack data or building the Tag instances is propagated to the
             caller once the (optional) file removal was performed.
    """
    try:
        # The context manager guarantees that the handle is closed before the
        # file is removed in the finally clause below. open() is used instead
        # of the py2-only file() builtin.
        with open(filename, 'rb') as tags_file:
            data = msgpack.load(tags_file, raw=False)

        return [Tag.from_dict(t) for t in data]
    finally:
        # Runs on success and on failure; in the failure case the original
        # exception continues to propagate after the cleanup
        if remove:
            remove_file_if_exists(filename)