def test(test_url):
    """Returns one URL extracted from an anchor tag pointing at test_url.

    The href is embedded in a minimal ``<a>`` element and handed to
    ``site_diff.extract_urls`` together with ``base_url`` (which must be
    available from the enclosing scope). Returns None when the extraction
    result is empty/falsy, otherwise the first element of the result once
    it has been converted to a list.
    """
    anchor_html = '<a href="%s">my link here</a>' % test_url
    extracted = site_diff.extract_urls(base_url, anchor_html)
    return list(extracted)[0] if extracted else None
def testAll(self):
    """Tests all the variations.

    Every case feeds a snippet of markup to ``site_diff.extract_urls``
    together with ``base_url`` and checks the URL(s) that come back.
    Cases under "Known bad results" pin the current output even though
    it is acknowledged to be wrong.
    """
    base_url = 'http://www.example.com/my-url/here'

    def test(test_url):
        """Returns one URL extracted for an <a href> of test_url, or None."""
        data = '<a href="%s">my link here</a>' % test_url
        result = site_diff.extract_urls(base_url, data)
        if not result:
            return None
        return list(result)[0]

    self.assertEqual('http://www.example.com/my-url/dummy_page2.html',
                     test('dummy_page2.html'))

    self.assertEqual('http://www.example.com/',
                     test('/'))

    self.assertEqual('http://www.example.com/mypath-here',
                     test('/mypath-here'))

    # A fragment-only link is expected to produce no URL at all.
    self.assertIsNone(test('#fragment-only'))

    self.assertEqual('http://www.example.com/my/path/over/here.html',
                     test('/my/path/01/13/../../over/here.html'))

    self.assertEqual('http://www.example.com/my/path/01/over/here.html',
                     test('/my/path/01/13/./../over/here.html'))

    self.assertEqual('http://www.example.com/my-url/same-directory.html',
                     test('same-directory.html'))

    self.assertEqual('http://www.example.com/relative-but-no/child',
                     test('../../relative-but-no/child'))

    self.assertEqual('http://www.example.com/too/many/relative/paths',
                     test('../../../../too/many/relative/paths'))

    self.assertEqual(
        'http://www.example.com/this/is/scheme-relative.html',
        test('//www.example.com/this/is/scheme-relative.html'))

    self.assertEqual(
        'http://www.example.com/okay-then',  # Scheme changed
        test('https://www.example.com/okay-then#blah'))

    self.assertEqual('http://www.example.com/another-one',
                     test('http://www.example.com/another-one'))

    self.assertEqual('http://www.example.com/this-has/a',
                     test('/this-has/a?query=string'))

    self.assertEqual(
        'http://www.example.com/this-also-has/a/',
        test('/this-also-has/a/?query=string&but=more-complex'))

    self.assertEqual(
        'http://www.example.com/relative-with/some-(parenthesis%20here)',
        test('/relative-with/some-(parenthesis%20here)'))

    self.assertEqual(
        'http://www.example.com/relative-with/some-(parenthesis%20here)',
        test('//www.example.com/relative-with/some-(parenthesis%20here)'))

    self.assertEqual(
        'http://www.example.com/relative-with/some-(parenthesis%20here)',
        test('http://www.example.com/relative-with/some-'
             '(parenthesis%20here)'))

    self.assertIsNone(test('mailto:[email protected]'))

    # Known bad results
    self.assertEqual(
        'http://www.example.com/my-url/ftp://[email protected]/',
        test('ftp://[email protected]/'))

    self.assertEqual(
        'http://www.example.com/my-url/javascript:runme()',
        test('javascript:runme()'))

    self.assertEqual(
        'http://www.example.com/my-url/tel:1-555-555-5555',
        test('tel:1-555-555-5555'))

    self.assertEqual('http://www.example.com/test.js',
                     test('/test.js'))

    # Escaped sources (e.g. inside inline JavaScript) are scraped,
    # even though they shouldn't be.
    # Raw strings keep the literal backslashes without relying on
    # invalid "\/" escape sequences.
    script_tag = (r'<script type="text\/javascript"'
                  r' src="\/\/platform.twitter.com\/widgets.js"><\/script>')
    self.assertEqual(
        {
            'http://www.example.com/my-url/'
            r'\/\/platform.twitter.com\/widgets.js'
        },
        site_diff.extract_urls(base_url, script_tag))

    spaces_in_tag = "<a href = 'spaced.html'>"
    self.assertEqual(
        {'http://www.example.com/my-url/spaced.html'},
        site_diff.extract_urls(base_url, spaces_in_tag))

    # JavaScript variable assignment isn't handled correctly.
    js_text = "var src = true;"
    self.assertEqual(
        {'http://www.example.com/my-url/true'},
        site_diff.extract_urls(base_url, js_text))
def testAll(self):
    """Tests all the variations.

    Every case feeds a snippet of markup to ``site_diff.extract_urls``
    together with ``base_url`` and checks the URL(s) that come back.
    Cases under "Known bad results" pin the current output even though
    it is acknowledged to be wrong.
    """
    base_url = "http://www.example.com/my-url/here"

    def test(test_url):
        """Returns one URL extracted for an <a href> of test_url, or None."""
        data = '<a href="%s">my link here</a>' % test_url
        result = site_diff.extract_urls(base_url, data)
        if not result:
            return None
        return list(result)[0]

    self.assertEqual("http://www.example.com/my-url/dummy_page2.html", test("dummy_page2.html"))

    self.assertEqual("http://www.example.com/", test("/"))

    self.assertEqual("http://www.example.com/mypath-here", test("/mypath-here"))

    # A fragment-only link is expected to produce no URL at all.
    self.assertIsNone(test("#fragment-only"))

    self.assertEqual(
        "http://www.example.com/my/path/over/here.html",
        test("/my/path/01/13/../../over/here.html"),
    )

    self.assertEqual(
        "http://www.example.com/my/path/01/over/here.html",
        test("/my/path/01/13/./../over/here.html"),
    )

    self.assertEqual("http://www.example.com/my-url/same-directory.html", test("same-directory.html"))

    self.assertEqual("http://www.example.com/relative-but-no/child", test("../../relative-but-no/child"))

    self.assertEqual(
        "http://www.example.com/too/many/relative/paths",
        test("../../../../too/many/relative/paths"),
    )

    self.assertEqual(
        "http://www.example.com/this/is/scheme-relative.html",
        test("//www.example.com/this/is/scheme-relative.html"),
    )

    self.assertEqual(
        "http://www.example.com/okay-then",
        test("https://www.example.com/okay-then#blah"),  # Scheme changed
    )

    self.assertEqual("http://www.example.com/another-one", test("http://www.example.com/another-one"))

    self.assertEqual("http://www.example.com/this-has/a", test("/this-has/a?query=string"))

    self.assertEqual(
        "http://www.example.com/this-also-has/a/",
        test("/this-also-has/a/?query=string&but=more-complex"),
    )

    self.assertEqual(
        "http://www.example.com/relative-with/some-(parenthesis%20here)",
        test("/relative-with/some-(parenthesis%20here)"),
    )

    self.assertEqual(
        "http://www.example.com/relative-with/some-(parenthesis%20here)",
        test("//www.example.com/relative-with/some-(parenthesis%20here)"),
    )

    self.assertEqual(
        "http://www.example.com/relative-with/some-(parenthesis%20here)",
        test("http://www.example.com/relative-with/some-" "(parenthesis%20here)"),
    )

    # Known bad results
    self.assertEqual(
        "http://www.example.com/my-url/ftp://[email protected]/",
        test("ftp://[email protected]/"),
    )

    self.assertEqual(
        "http://www.example.com/my-url/mailto:[email protected]",
        test("mailto:[email protected]"),
    )

    self.assertEqual("http://www.example.com/my-url/javascript:runme()", test("javascript:runme()"))

    self.assertEqual("http://www.example.com/my-url/tel:1-555-555-5555", test("tel:1-555-555-5555"))

    self.assertEqual("http://www.example.com/test.js", test("/test.js"))

    # Escaped sources (e.g. inside inline JavaScript) are scraped,
    # even though they shouldn't be.
    # Raw strings keep the literal backslashes without relying on
    # invalid "\/" escape sequences.
    script_tag = (
        r'<script type="text\/javascript"'
        r' src="\/\/platform.twitter.com\/widgets.js"><\/script>'
    )
    self.assertEqual(
        {"http://www.example.com/my-url/" r"\/\/platform.twitter.com\/widgets.js"},
        site_diff.extract_urls(base_url, script_tag),
    )

    spaces_in_tag = "<a href = 'spaced.html'>"
    self.assertEqual(
        {"http://www.example.com/my-url/spaced.html"},
        site_diff.extract_urls(base_url, spaces_in_tag),
    )

    # JavaScript variable assignment isn't handled correctly.
    js_text = "var src = true;"
    self.assertEqual(
        {"http://www.example.com/my-url/true"},
        site_diff.extract_urls(base_url, js_text),
    )
def testAll(self):
    """Tests all the variations.

    Every case feeds a snippet of markup to ``site_diff.extract_urls``
    together with ``base_url`` and checks the URL(s) that come back.
    Cases under "Known bad results" pin the current output even though
    it is acknowledged to be wrong.
    """
    base_url = 'http://www.example.com/my-url/here'

    def test(test_url):
        """Returns one URL extracted for an <a href> of test_url, or None."""
        data = '<a href="%s">my link here</a>' % test_url
        result = site_diff.extract_urls(base_url, data)
        if not result:
            return None
        return list(result)[0]

    self.assertEqual('http://www.example.com/my-url/dummy_page2.html',
                     test('dummy_page2.html'))

    self.assertEqual('http://www.example.com/',
                     test('/'))

    self.assertEqual('http://www.example.com/mypath-here',
                     test('/mypath-here'))

    # A fragment-only link is expected to produce no URL at all.
    self.assertIsNone(test('#fragment-only'))

    self.assertEqual('http://www.example.com/my/path/over/here.html',
                     test('/my/path/01/13/../../over/here.html'))

    self.assertEqual('http://www.example.com/my/path/01/over/here.html',
                     test('/my/path/01/13/./../over/here.html'))

    self.assertEqual('http://www.example.com/my-url/same-directory.html',
                     test('same-directory.html'))

    self.assertEqual('http://www.example.com/relative-but-no/child',
                     test('../../relative-but-no/child'))

    self.assertEqual('http://www.example.com/too/many/relative/paths',
                     test('../../../../too/many/relative/paths'))

    self.assertEqual(
        'http://www.example.com/this/is/scheme-relative.html',
        test('//www.example.com/this/is/scheme-relative.html'))

    self.assertEqual(
        'http://www.example.com/okay-then',  # Scheme changed
        test('https://www.example.com/okay-then#blah'))

    self.assertEqual('http://www.example.com/another-one',
                     test('http://www.example.com/another-one'))

    self.assertEqual('http://www.example.com/this-has/a',
                     test('/this-has/a?query=string'))

    self.assertEqual(
        'http://www.example.com/this-also-has/a/',
        test('/this-also-has/a/?query=string&but=more-complex'))

    self.assertEqual(
        'http://www.example.com/relative-with/some-(parenthesis%20here)',
        test('/relative-with/some-(parenthesis%20here)'))

    self.assertEqual(
        'http://www.example.com/relative-with/some-(parenthesis%20here)',
        test('//www.example.com/relative-with/some-(parenthesis%20here)'))

    self.assertEqual(
        'http://www.example.com/relative-with/some-(parenthesis%20here)',
        test('http://www.example.com/relative-with/some-'
             '(parenthesis%20here)'))

    # Known bad results
    self.assertEqual(
        'http://www.example.com/my-url/ftp://[email protected]/',
        test('ftp://[email protected]/'))

    self.assertEqual(
        'http://www.example.com/my-url/mailto:[email protected]',
        test('mailto:[email protected]'))

    self.assertEqual(
        'http://www.example.com/my-url/javascript:runme()',
        test('javascript:runme()'))

    self.assertEqual(
        'http://www.example.com/my-url/tel:1-555-555-5555',
        test('tel:1-555-555-5555'))

    self.assertEqual('http://www.example.com/test.js',
                     test('/test.js'))

    # Escaped sources (e.g. inside inline JavaScript) are scraped,
    # even though they shouldn't be.
    # Raw strings keep the literal backslashes without relying on
    # invalid "\/" escape sequences.
    script_tag = (r'<script type="text\/javascript"'
                  r' src="\/\/platform.twitter.com\/widgets.js"><\/script>')
    self.assertEqual(
        {
            'http://www.example.com/my-url/'
            r'\/\/platform.twitter.com\/widgets.js'
        },
        site_diff.extract_urls(base_url, script_tag))

    spaces_in_tag = "<a href = 'spaced.html'>"
    self.assertEqual(
        {'http://www.example.com/my-url/spaced.html'},
        site_diff.extract_urls(base_url, spaces_in_tag))

    # JavaScript variable assignment isn't handled correctly.
    js_text = "var src = true;"
    self.assertEqual(
        {'http://www.example.com/my-url/true'},
        site_diff.extract_urls(base_url, js_text))