def test_extract_emails_mailto(self):
    """An email inside a mailto: href must be extracted by the parser."""
    body = u'<a href="mailto:[email protected]">test</a>'
    response = build_http_response(self.url, body)

    parser = SGMLParser(response)
    parser.parse()

    self.assertEqual(parser.get_emails(), {u'*****@*****.**'})
def test_mailto_ignored_in_links(self):
    """mailto: links must NOT appear in the parsed references."""
    body = u'<a href="mailto:[email protected]">a</a>'
    response = build_http_response(self.url, body)

    parser = SGMLParser(response)
    parser.parse()

    parsed_urls, _ = parser.references
    self.assertEqual(parsed_urls, [])
def test_mailto_subject_body(self):
    """Email extraction still works when mailto: carries query params."""
    body = (u'<a href="mailto:[email protected]?subject=testing out mailto'
            u'&body=Just testing">test</a>')
    response = build_http_response(self.url, body)

    parser = SGMLParser(response)
    parser.parse()

    self.assertEqual(parser.get_emails(), {u'*****@*****.**'})
def test_get_clear_text_body(self):
    """get_clear_text_body() must strip tags and keep the text content."""
    html = 'header <b>ABC</b>-<b>DEF</b>-<b>XYZ</b> footer'
    clear_text = 'header ABC-DEF-XYZ footer'
    headers = Headers([('Content-Type', 'text/html')])

    r = build_http_response(self.url, html, headers)
    p = SGMLParser(r)
    p.parse()

    # FIX: assertEquals is a deprecated alias; use assertEqual
    self.assertEqual(clear_text, p.get_clear_text_body())
def test_meta_tags(self):
    """Both <meta refresh> values are captured and the URL is extracted."""
    body = HTML_DOC % {'head': META_REFRESH + META_REFRESH_WITH_URL,
                       'body': ''}
    resp = build_http_response(self.url, body)
    p = SGMLParser(resp)
    p.parse()

    # BUG FIX: assertTrue(2, len(...)) always passed -- 2 is truthy and
    # len(...) was silently treated as the failure message. The intent
    # was clearly an equality check.
    self.assertEqual(2, len(p.meta_redirs))
    self.assertIn("2;url=http://crawler.w3af.com/", p.meta_redirs)
    self.assertIn("600", p.meta_redirs)
    # FIX: assertEquals is a deprecated alias of assertEqual
    self.assertEqual([URL('http://crawler.w3af.com/')], p.references[0])
def test_meta_tags_duplicate(self):
    """
    Both <meta refresh> values are captured and the URL is extracted.

    NOTE(review): the original file defined ``test_meta_tags`` twice with
    identical bodies, so one definition silently shadowed the other and
    never ran. Renamed so both are collected; consider deleting one copy.
    """
    body = HTML_DOC % {'head': META_REFRESH + META_REFRESH_WITH_URL,
                       'body': ''}
    resp = build_http_response(self.url, body)
    p = SGMLParser(resp)
    p.parse()

    # BUG FIX: assertTrue(2, len(...)) always passed -- 2 is truthy and
    # len(...) was silently treated as the failure message.
    self.assertEqual(2, len(p.meta_redirs))
    self.assertIn("2;url=http://crawler.w3af.com/", p.meta_redirs)
    self.assertIn("600", p.meta_redirs)
    # FIX: assertEquals is a deprecated alias of assertEqual
    self.assertEqual([URL('http://crawler.w3af.com/')], p.references[0])
def test_meta_tags_with_single_quotes(self):
    """Meta refresh URLs wrapped in single quotes are parsed too."""
    head_content = META_REFRESH + META_REFRESH_WITH_URL_AND_QUOTES
    body = HTML_DOC % {'head': head_content, 'body': ''}

    parser = SGMLParser(build_http_response(self.url, body))
    parser.parse()

    self.assertEqual(2, len(parser.meta_redirs))
    self.assertIn("2;url='http://crawler.w3af.com/'", parser.meta_redirs)
    self.assertIn("600", parser.meta_redirs)
    self.assertEqual([URL('http://crawler.w3af.com/')],
                     parser.references[0])
def test_reference_with_colon(self):
    """An href with an unknown "d:" scheme-like prefix yields no refs."""
    body = """
    <html>
        <a href="d:url.html?id=13&subid=3">foo</a>
    </html>"""
    r = build_http_response(self.url, body)
    p = SGMLParser(r)
    p.parse()
    parsed_refs = p.references[0]
    #
    # Finding zero URLs is the correct behavior based on what
    # I've seen in Opera and Chrome.
    #
    # FIX: assertEquals is a deprecated alias; use assertEqual
    self.assertEqual(0, len(parsed_refs))
def test_parsed_references(self):
    """
    The *parsed* urls *must* come both from valid tags and tag attributes.
    Invalid urls (like javascript: instructions) must be ignored.
    """
    body = """
    <html>
        <a href="/x.py?a=1" Invalid_Attr="/invalid_url.php">
        <form action="javascript:history.back(1)">
            <tagX href="/py.py"/>
        </form>
    </html>"""
    r = build_http_response(self.url, body)
    p = SGMLParser(r)
    p.parse()
    parsed_refs = p.references[0]

    # FIX: assertEquals is a deprecated alias; use assertEqual
    self.assertEqual(1, len(parsed_refs))
    self.assertEqual('http://w3af.com/x.py?a=1',
                     parsed_refs[0].url_string)
def test_get_clear_text_body_encodings(self):
    """Clear-text extraction should round-trip every sample encoding."""
    # Skipped on purpose: the equality check below fails and the root
    # cause was never tracked down.
    raise SkipTest('Not sure why this one is failing :S')

    for lang_desc, (body, encoding) in TEST_RESPONSES.iteritems():
        content_type = 'text/html; charset=%s' % encoding
        headers = Headers([('Content-Type', content_type)])
        encoded_body = body.encode(encoding)

        response = build_http_response(self.url, encoded_body, headers)
        parser = SGMLParser(response)
        parser.parse()

        # These test strings don't really have tags, so they should be eq
        self.assertEqual(parser.get_clear_text_body(), body)
def test_get_clear_text_issue_4402(self):
    """
    get_clear_text_body() must not crash on the issue-4402 sample,
    whatever charset the Content-Type header claims.

    :see: https://github.com/andresriancho/w3af/issues/4402
    """
    test_file_path = 'core/data/url/tests/data/encoding_4402.php'
    test_file = os.path.join(ROOT_PATH, test_file_path)

    # BUG FIX: file(test_file, 'rb').read() leaked the file handle;
    # use a context manager so it is always closed.
    with open(test_file, 'rb') as f:
        body = f.read()

    sample_encodings = [encoding for _, (_, encoding)
                        in TEST_RESPONSES.iteritems()]
    # Also exercise the "no charset" and the common utf-8 cases
    sample_encodings.extend(['', 'utf-8'])

    for encoding in sample_encodings:
        encoding_header = 'text/html; charset=%s' % encoding
        headers = Headers([('Content-Type', encoding_header)])

        r = build_http_response(self.url, body, headers)
        p = SGMLParser(r)
        p.parse()
        # Only checking that this does not raise; there is no expected
        # value to compare against.
        p.get_clear_text_body()
def test_case_sensitivity(self):
    """
    Ensure handler methods are *always* called with lowered-cased
    tag and attribute names
    """
    def islower(s):
        # Accepts either a string (tag name) or an iterable of keys
        # (attribute dict); asserts every name is lower-case.
        il = False
        if isinstance(s, basestring):
            il = s.islower()
        else:
            il = all(k.islower() for k in s)
        assert il, "'%s' is not lowered-case" % s
        return il

    def start_wrapper(orig_start, tag):
        # Checks casing before delegating to the parser's real start()
        islower(tag.tag)
        islower(tag.attrib)
        return orig_start(tag)

    tags = (A_LINK_ABSOLUTE, INPUT_CHECKBOX_WITH_NAME, SELECT_WITH_NAME,
            TEXTAREA_WITH_ID_AND_DATA, INPUT_HIDDEN)
    ops = "lower", "upper", "title"

    # For every pair of tag positions, randomly re-case those two tags
    # (lower/upper/title) and verify the parser normalizes them anyway.
    for indexes in combinations(range(len(tags)), 2):
        body_elems = []

        for index, tag in enumerate(tags):
            ele = tag
            if index in indexes:
                ele = getattr(tag, choice(ops))()
            body_elems.append(ele)

        body = HTML_DOC % {'head': '', 'body': ''.join(body_elems)}
        resp = build_http_response(self.url, body)
        p = SGMLParser(resp)

        # Monkey-patch the start handler so every invocation is checked
        orig_start = p.start
        wrapped_start = partial(start_wrapper, orig_start)
        p.start = wrapped_start
        p.parse()
def test_baseurl(self):
    """A <base> tag in <head> must set the parser's base URL."""
    body = HTML_DOC % {'head': BASE_TAG, 'body': ''}
    resp = build_http_response(self.url, body)
    p = SGMLParser(resp)
    p.parse()
    # FIX: assertEquals is a deprecated alias; use assertEqual
    self.assertEqual(URL('http://www.w3afbase.com/'), p._base_url)