Ejemplo n.º 1
0
    def test_backward_filename_filter(self):
        '''Exercise ``BackwardFilenameFilter`` with accept and reject patterns.'''
        filename_filter = BackwardFilenameFilter(
            accepted=['html', 'image.*.png'],
            rejected=['bmp', 'jp[eg]', 'image.123.png']
        )

        record = MockURLTableRecord()
        record.url = 'http://example.com/'

        # (URL, expected verdict) pairs, checked in this order.
        cases = (
            ('http://example/index.html', True),
            ('http://example/myimage.1003.png', True),
            ('http://example/myimage.123.png', False),
            ('http://example/blah.png', False),
            ('http://example/image.1003.png.bmp', False),
        )

        for url, expected in cases:
            verdict = filename_filter.test(URLInfo.parse(url), record)

            if expected:
                self.assertTrue(verdict)
            else:
                self.assertFalse(verdict)
Ejemplo n.º 2
0
 def test_url_info_invalids(self):
     '''Check that ``URLInfo.parse`` raises ``ValueError`` on bad URLs.

     The only URL expected to parse here is ``http://[a]``, whose hostname
     must not contain the square brackets.
     '''
     first_batch = (
         '',
         '#',
         'http://',
         'example....com',
         'http://example....com',
         'http://example…com',
         'http://[34.4kf]::4',
         'http://[34.4kf::4',
         'http://dmn3]:3a:45',
         ':38/3',
         'http://][a:@1]',
         'http://[[aa]]:4:]6',
     )
     for bad_url in first_batch:
         self.assertRaises(ValueError, URLInfo.parse, bad_url)

     bracketed = 'http://[a]'
     self.assertNotIn('[', URLInfo.parse(bracketed).hostname)
     self.assertNotIn(']', URLInfo.parse(bracketed).hostname)

     # A single, very long host name assembled from adjacent literals.
     overlong_host_url = (
         'http://a-long-long-time-ago-the-earth-was-ruled-by-dinosaurs-'
         'they-were-big-so-not-a-lot-of-people-went-around-hassling-them-'
         'actually-no-people-went-around-hassling-them-'
         'because-there-weren-t-any-people-yet-'
         'just-the-first-tiny-mammals-'
         'basically-life-was-good-'
         'lou-it-just-dont-get-no-better-than-this-'
         'yeah-'
         'then-something-happened-'
         'a-giant-meteorite-struck-the-earth-'
         'goodbye-dinosaurs-'
         'but-what-if-the-dinosaurs-werent-all-destroyed-'
         'what-if-the-impact-of-that-meteorite-created-a-parallel-dimension-'
         'where-the-dinosaurs-continue-to-thrive-'
         'and-evolved-into-intelligent-vicious-aggressive-beings-'
         'just-like-us-'
         'and-hey-what-if-they-found-their-way-back.movie'
     )

     second_batch = (
         'http://[[a]',
         'http://[[a]]a]',
         'http://[[a:a]]',
         'http:///',
         'http:///horse',
         'http://?what?',
         'http://#egg=wpull',
         'http://:@example.com:?@/',
         'http://\x00/',
         'http:/a',
         'http://@@example.com/@',
         'http://fat32defragmenter.internets::80',
         'http://fat32defragmenter.internets:80/',
         'http:// /spaaaace',
         overlong_host_url,
         'http://[...]/python.xml%22',
         'http://[…]/python.xml%22',
         'http://[.]/python.xml%22',
     )
     for bad_url in second_batch:
         self.assertRaises(ValueError, URLInfo.parse, bad_url)
Ejemplo n.º 3
0
    def test_regex_filter(self):
        '''Exercise ``RegexFilter`` with no pattern, an accepted pattern and
        a rejected pattern.
        '''
        record = MockURLTableRecord()
        record.url = 'http://example.com/blog/'

        def verdict(regex_filter, url):
            # Run one URL through the given filter against the shared record.
            return regex_filter.test(URLInfo.parse(url), record)

        self.assertTrue(verdict(RegexFilter(), 'http://example.net'))

        accept_only = RegexFilter(accepted=r'blo[a-z]/$')
        self.assertTrue(verdict(accept_only, 'http://example.net/blob/'))
        self.assertFalse(verdict(accept_only, 'http://example.net/blob/123'))

        reject_only = RegexFilter(rejected=r'\.gif$')
        self.assertTrue(verdict(reject_only, 'http://example.net/blob/'))
        self.assertFalse(
            verdict(reject_only, 'http://example.net/blob/123.gif'))
Ejemplo n.º 4
0
    def test_directory_filter(self):
        '''Exercise ``DirectoryFilter`` with no directories, an accepted
        directory and a rejected directory.
        '''
        record = MockURLTableRecord()
        record.url = 'http://example.com/blog/'

        def verdict(directory_filter, url):
            # Run one URL through the given filter against the shared record.
            return directory_filter.test(URLInfo.parse(url), record)

        unrestricted = DirectoryFilter()
        self.assertTrue(verdict(unrestricted, 'http://example.com'))

        blog_only = DirectoryFilter(accepted=['/blog'])
        self.assertFalse(verdict(blog_only, 'http://example.com'))
        self.assertTrue(verdict(blog_only, 'http://example.com/blog/'))

        no_cgi = DirectoryFilter(rejected=['/cgi-bin/'])
        self.assertTrue(verdict(no_cgi, 'http://example.com/blog/'))
        self.assertFalse(verdict(no_cgi, 'http://example.com/cgi-bin'))
Ejemplo n.º 5
0
    def test_parent_filter(self):
        '''Exercise ``ParentFilter`` against different top URLs, for both
        non-inline and inline records.
        '''
        record = MockURLTableRecord()
        record.inline = False
        parent_filter = ParentFilter()

        def passes(url):
            # The record is shared and mutated between assertions below.
            return parent_filter.test(URLInfo.parse(url), record)

        record.top_url = 'http://example.com/blog/topic2/'
        self.assertTrue(passes('http://example.com/blog/topic2/'))

        record.top_url = 'http://example.com/blog/topic1/'
        self.assertTrue(passes('http://example.com/blog/topic1/blah.html'))
        self.assertFalse(passes('http://example.com/blog/'))

        record.inline = True
        self.assertTrue(passes('http://example.com/styles.css'))
Ejemplo n.º 6
0
 def test_url_info_ipv6(self):
     '''Check ``url`` and ``hostname_with_port`` for IPv6 literal hosts,
     with and without an explicit port.
     '''
     # (expected value, URL to parse, attribute read from the result)
     cases = (
         (
             'https://[2001:db8:85a3:8d3:1319:8a2e:370:7348]:8080/ipv6',
             'https://[2001:db8:85a3:8d3:1319:8a2e:370:7348]:8080/ipv6',
             'url',
         ),
         (
             '[2001:db8:85a3:8d3:1319:8a2e:370:7348]:8080',
             'http://[2001:db8:85a3:8d3:1319:8a2e:370:7348]:8080/ipv6',
             'hostname_with_port',
         ),
         (
             'http://[2001:db8:85a3:8d3:1319:8a2e:370:7348]/ipv6',
             'http://[2001:db8:85a3:8d3:1319:8a2e:370:7348]/ipv6',
             'url',
         ),
         (
             '[2001:db8:85a3:8d3:1319:8a2e:370:7348]',
             'http://[2001:db8:85a3:8d3:1319:8a2e:370:7348]/ipv6',
             'hostname_with_port',
         ),
     )

     for expected, raw_url, attribute in cases:
         parsed = URLInfo.parse(raw_url)
         self.assertEqual(expected, getattr(parsed, attribute))
Ejemplo n.º 7
0
    def test_css_detect(self):
        '''Check CSS detection by file content, URL, request and response.'''
        def sniff(data):
            # Wrap raw bytes in a file object for CSSReader.is_file.
            return CSSReader.is_file(io.BytesIO(data))

        self.assertTrue(sniff('body { color: white }'.encode('utf-16le')))
        self.assertFalse(sniff('hello world!'.encode('utf-16le')))
        self.assertFalse(sniff(b'<html><body>hello'))
        # HTML input must yield the VeryFalse object itself, not just any
        # falsy value. assertIs reports both operands when it fails.
        self.assertIs(sniff(b'<html><body>hello'), VeryFalse)
        self.assertTrue(sniff(b'h1 { background-color: red }'))
        self.assertTrue(sniff(b'@import url.css;'))

        self.assertTrue(
            CSSReader.is_url(URLInfo.parse('example.com/index.css')))
        self.assertFalse(
            CSSReader.is_url(URLInfo.parse('example.com/image.jpg')))

        self.assertTrue(CSSReader.is_request(Request('example.com/index.css')))
        self.assertFalse(CSSReader.is_request(
            Request('example.com/image.jpg')))

        response = Response(200, 'OK')
        response.fields['Content-Type'] = 'text/css'
        self.assertTrue(CSSReader.is_response(response))

        response = Response(200, 'OK')
        response.fields['Content-Type'] = 'image/png'
        self.assertFalse(CSSReader.is_response(response))
Ejemplo n.º 8
0
    def test_regex_filter(self):
        '''Exercise ``RegexFilter`` with no pattern, an accepted pattern and
        a rejected pattern.
        '''
        record = MockURLTableRecord()
        record.url = 'http://example.com/blog/'

        def verdict(regex_filter, url):
            # Run one URL through the given filter against the shared record.
            return regex_filter.test(URLInfo.parse(url), record)

        self.assertTrue(verdict(RegexFilter(), 'http://example.net'))

        accept_only = RegexFilter(accepted=r'blo[a-z]/$')
        self.assertTrue(verdict(accept_only, 'http://example.net/blob/'))
        self.assertFalse(verdict(accept_only, 'http://example.net/blob/123'))

        reject_only = RegexFilter(rejected=r'\.gif$')
        self.assertTrue(verdict(reject_only, 'http://example.net/blob/'))
        self.assertFalse(
            verdict(reject_only, 'http://example.net/blob/123.gif'))
Ejemplo n.º 9
0
    def test_directory_filter(self):
        '''Exercise ``DirectoryFilter`` with no directories, an accepted
        directory and a rejected directory.
        '''
        record = MockURLTableRecord()
        record.url = 'http://example.com/blog/'

        def verdict(directory_filter, url):
            # Run one URL through the given filter against the shared record.
            return directory_filter.test(URLInfo.parse(url), record)

        unrestricted = DirectoryFilter()
        self.assertTrue(verdict(unrestricted, 'http://example.com'))

        blog_only = DirectoryFilter(accepted=['/blog'])
        self.assertFalse(verdict(blog_only, 'http://example.com'))
        self.assertTrue(verdict(blog_only, 'http://example.com/blog/'))

        no_cgi = DirectoryFilter(rejected=['/cgi-bin/'])
        self.assertTrue(verdict(no_cgi, 'http://example.com/blog/'))
        self.assertFalse(verdict(no_cgi, 'http://example.com/cgi-bin'))
Ejemplo n.º 10
0
    def test_backward_filename_filter(self):
        '''Exercise ``BackwardFilenameFilter`` with accept and reject patterns.'''
        filename_filter = BackwardFilenameFilter(
            accepted=['html', 'image.*.png'],
            rejected=['bmp', 'jp[eg]', 'image.123.png']
        )

        record = MockURLTableRecord()
        record.url = 'http://example.com/'

        def verdict(url):
            return filename_filter.test(URLInfo.parse(url), record)

        # URLs the filter is expected to let through.
        self.assertTrue(verdict('http://example/index.html'))
        self.assertTrue(verdict('http://example/myimage.1003.png'))

        # URLs the filter is expected to turn away.
        self.assertFalse(verdict('http://example/myimage.123.png'))
        self.assertFalse(verdict('http://example/blah.png'))
        self.assertFalse(verdict('http://example/image.1003.png.bmp'))
Ejemplo n.º 11
0
 def test_url_info_ipv6(self):
     '''Check ``url`` and ``hostname_with_port`` for IPv6 literal hosts,
     with and without an explicit port.
     '''
     # (expected value, URL to parse, attribute read from the result)
     cases = (
         (
             'https://[2001:db8:85a3:8d3:1319:8a2e:370:7348]:8080/ipv6',
             'https://[2001:db8:85a3:8d3:1319:8a2e:370:7348]:8080/ipv6',
             'url',
         ),
         (
             '[2001:db8:85a3:8d3:1319:8a2e:370:7348]:8080',
             'http://[2001:db8:85a3:8d3:1319:8a2e:370:7348]:8080/ipv6',
             'hostname_with_port',
         ),
         (
             'http://[2001:db8:85a3:8d3:1319:8a2e:370:7348]/ipv6',
             'http://[2001:db8:85a3:8d3:1319:8a2e:370:7348]/ipv6',
             'url',
         ),
         (
             '[2001:db8:85a3:8d3:1319:8a2e:370:7348]',
             'http://[2001:db8:85a3:8d3:1319:8a2e:370:7348]/ipv6',
             'hostname_with_port',
         ),
     )

     for expected, raw_url, attribute in cases:
         parsed = URLInfo.parse(raw_url)
         self.assertEqual(expected, getattr(parsed, attribute))
Ejemplo n.º 12
0
    def test_xml_detect(self):
        '''Check XML detection by file content, URL, request and response.'''
        def sniff(data):
            # Wrap raw bytes in a file object for XMLDetector.is_file.
            return XMLDetector.is_file(io.BytesIO(data))

        self.assertTrue(sniff('<?xml version='.encode('utf-16le')))
        self.assertFalse(
            sniff('<!DOCTYPE html><html><body>'.encode('utf-16le')))
        self.assertFalse(sniff(b'<html><body>hello'))
        self.assertTrue(sniff(b'<?xml version'))

        xml_location = 'example.com/index.xml'
        image_location = 'example.com/image.jpg'

        self.assertTrue(XMLDetector.is_url(URLInfo.parse(xml_location)))
        self.assertFalse(XMLDetector.is_url(URLInfo.parse(image_location)))
        self.assertTrue(XMLDetector.is_request(Request(xml_location)))
        self.assertFalse(XMLDetector.is_request(Request(image_location)))

        # (Content-Type header, whether the response counts as XML)
        content_types = (
            ('text/xml', True),
            ('application/xml', True),
            ('image/png', False),
        )

        for content_type, expected in content_types:
            response = Response(200, 'OK')
            response.fields['Content-Type'] = content_type
            detected = XMLDetector.is_response(response)

            if expected:
                self.assertTrue(detected)
            else:
                self.assertFalse(detected)
Ejemplo n.º 13
0
 def test_url_info_query(self):
     '''Check the normalised URL when a query follows the host directly.'''
     # (expected normalised URL, URL to parse)
     cases = (
         ('http://example.com/?a=', 'http://example.com?a='),
         ('http://example.com/?a=1', 'http://example.com?a=1'),
         ('http://example.com/?a=1&b', 'http://example.com?a=1&b'),
         ('http://example.com/?a=1&b=', 'http://example.com?a=1&b='),
     )

     for expected, raw_url in cases:
         self.assertEqual(expected, URLInfo.parse(raw_url).url)
Ejemplo n.º 14
0
 def test_to_dir_path_url(self):
     '''Check ``to_dir_path_url`` on a root URL and on file-like paths.'''
     # (expected directory URL, URL to parse)
     cases = (
         ('ftp://putfile.com/', 'ftp://putfile.com/'),
         ('ftp://putfile.com/', 'ftp://putfile.com/asdf'),
         ('ftp://putfile.com/asdf/', 'ftp://putfile.com/asdf/qwer'),
     )

     for expected, raw_url in cases:
         url_info = URLInfo.parse(raw_url)
         self.assertEqual(expected, to_dir_path_url(url_info))
Ejemplo n.º 15
0
    def test_url_info_trailing_dot(self):
        '''Check that a trailing dot on the host name is kept in ``url``,
        with and without an explicit port.
        '''
        for unchanged_url in ('http://example.com./',
                              'http://example.com.:81/'):
            self.assertEqual(unchanged_url, URLInfo.parse(unchanged_url).url)
Ejemplo n.º 16
0
    def test_url_info_trailing_dot(self):
        '''Check that a trailing dot on the host name is kept in ``url``,
        with and without an explicit port.
        '''
        without_port = URLInfo.parse('http://example.com./')
        self.assertEqual('http://example.com./', without_port.url)

        with_port = URLInfo.parse('http://example.com.:81/')
        self.assertEqual('http://example.com.:81/', with_port.url)
Ejemplo n.º 17
0
    def test_url_info_naked(self):
        '''Check parsing of URLs that omit the scheme and/or the path.'''
        # (expected normalised URL, URL to parse)
        for expected, raw_url in (
                ('http://example.com/', 'Example.Com'),
                ('http://example.com/', '//example.com'),
                ('http://example.com/Blah', '//example.com/Blah'),
        ):
            self.assertEqual(expected, URLInfo.parse(raw_url).url)

        # Host-and-port forms: (URL to parse, expected URL, expected host)
        for raw_url, expected_url, expected_host in (
                ('example.com:8080',
                 'http://example.com:8080/',
                 'example.com:8080'),
                ('localhost:8080/A/b/C:',
                 'http://localhost:8080/A/b/C:',
                 'localhost:8080'),
        ):
            parsed = URLInfo.parse(raw_url)
            self.assertEqual(expected_url, parsed.url)
            self.assertEqual(expected_host, parsed.hostname_with_port)
            self.assertEqual(8080, parsed.port)

        for expected, raw_url in (
                ('http://example.com/Asdf', 'example.com/Asdf#Blah'),
                ('http://example.com/asdf/Ghjk', 'example.com/asdf/Ghjk#blah'),
                ('http://example.com/', 'example.com/'),
                ('https://example.com/', 'https://example.com'),
        ):
            self.assertEqual(expected, URLInfo.parse(raw_url).url)
Ejemplo n.º 18
0
 def test_url_info_default_port(self):
     '''Check default ports for http/https and that a default port is
     left out of ``hostname_with_port`` and ``url``.
     '''
     # (expected value, URL to parse, attribute read from the result)
     cases = (
         (80, 'http://example.com', 'port'),
         (443, 'https://example.com', 'port'),
         ('example.com', 'http://example.com', 'hostname_with_port'),
         ('example.com', 'https://example.com', 'hostname_with_port'),
         ('http://example.com/', 'http://example.com:80', 'url'),
     )

     for expected, raw_url, attribute in cases:
         parsed = URLInfo.parse(raw_url)
         self.assertEqual(expected, getattr(parsed, attribute))
Ejemplo n.º 19
0
 def test_url_info_reserved_char_is_ok(self):
     '''Check that reserved characters in the path survive normalisation
     while a non-ASCII character is percent-encoded.
     '''
     # (expected normalised URL, URL to parse)
     cases = (
         (
             'http://example.com/@49IMG.DLL/$SESSION$/image.png;large',
             'http://example.com/@49IMG.DLL/$SESSION$/image.png;large',
         ),
         (
             'http://example.com/@49IMG.DLL/$SESSION$/imag%C3%A9.png;large',
             'http://example.com/@49IMG.DLL/$SESSION$/imagé.png;large',
         ),
         (
             'http://example.com/$c/%system.exe/',
             'http://example.com/$c/%system.exe/',
         ),
     )

     for expected, raw_url in cases:
         self.assertEqual(expected, URLInfo.parse(raw_url).url)
Ejemplo n.º 20
0
    def test_http_filter(self):
        '''Check that ``HTTPFilter`` passes an http URL and turns away
        ``mailto:`` and ``javascript:`` URLs.
        '''
        mock_record = MockURLTableRecord()
        url_filter = HTTPFilter()

        def verdict(url):
            return url_filter.test(URLInfo.parse(url), mock_record)

        self.assertTrue(verdict('http://example.net'))
        # NOTE(review): the literal here used to be the placeholder text
        # "[email protected]", which is not an address. The exact original
        # address is unknown; any well-formed mailto URL serves the test.
        self.assertFalse(verdict('mailto:user@example.com'))
        self.assertFalse(verdict("javascript:alert('hello!')"))
Ejemplo n.º 21
0
 def test_to_dir_path_url(self):
     '''Check ``to_dir_path_url`` on a root URL and on file-like paths.'''
     def directory_of(raw_url):
         return to_dir_path_url(URLInfo.parse(raw_url))

     self.assertEqual('ftp://putfile.com/',
                      directory_of('ftp://putfile.com/'))
     self.assertEqual('ftp://putfile.com/',
                      directory_of('ftp://putfile.com/asdf'))
     self.assertEqual('ftp://putfile.com/asdf/',
                      directory_of('ftp://putfile.com/asdf/qwer'))
Ejemplo n.º 22
0
    def test_url_info_round_trip(self):
        '''Parse assorted URLs, then parse their normalised form again.

        Nothing is asserted about the values; the test passes as long as
        neither parse raises.
        '''
        samples = (
            'http://example.com/blah%20blah/',
            'example.com:81?blah=%c3%B0',
            'http://example.com/a/../../b/style.css',
            'http://example.com/'
            '?blah=http%3A%2F%2Fexample.com%2F%3Ffail%3Dtrue',
            'http://example.com/??blah=blah[0:]=bl%61h?blah"&d%26_',
            'http://[2001:db8:85a3:8d3:1319:8a2e:370:7348]/ipv6',
        )

        for sample in samples:
            normalised = URLInfo.parse(sample).url
            URLInfo.parse(normalised)
Ejemplo n.º 23
0
 def test_url_info_path_folding(self):
     '''Check that ``.`` and ``..`` path segments are folded away.'''
     # (expected normalised URL, URL to parse)
     cases = (
         ('http://example.com/', 'http://example.com/.'),
         ('http://example.com/', 'http://example.com/../'),
         ('http://example.com/index.html',
          'http://example.com/../index.html'),
         ('http://example.com/b/style.css',
          'http://example.com/a/../../b/style.css'),
         ('http://example.com/a/style.css',
          'http://example.com/a/b/../style.css'),
     )

     for expected, raw_url in cases:
         self.assertEqual(expected, URLInfo.parse(raw_url).url)
Ejemplo n.º 24
0
    def test_url_info_round_trip(self):
        '''Parse assorted URLs, then parse their normalised form again.

        Nothing is asserted about the values; the test passes as long as
        neither parse raises.
        '''
        samples = (
            'http://example.com/blah%20blah/',
            'example.com:81?blah=%c3%B0',
            'http://example.com/a/../../b/style.css',
            'http://example.com/'
            '?blah=http%3A%2F%2Fexample.com%2F%3Ffail%3Dtrue',
            'http://example.com/??blah=blah[0:]=bl%61h?blah"&d%26_',
            'http://[2001:db8:85a3:8d3:1319:8a2e:370:7348]/ipv6',
        )

        for sample in samples:
            first_pass = URLInfo.parse(sample)
            URLInfo.parse(first_pass.url)
Ejemplo n.º 25
0
 def test_ip_address_normalization(self):
     '''Check that alternative IPv4 and IPv6 spellings are normalised.'''
     # Five spellings of the same IPv4 address.
     ipv4_spellings = (
         'http://0xC0.0x00.0x02.0xEB',
         'http://0300.0000.0002.0353',
         'http://0xC00002EB/',
         'http://3221226219/',
         'http://030000001353/',
     )
     for raw_url in ipv4_spellings:
         self.assertEqual('http://192.0.2.235/', URLInfo.parse(raw_url).url)

     # (expected normalised URL, URL to parse)
     ipv6_cases = (
         ('http://[2001:db8:85a3:8d3:1319:8a2e:370:7348]:8080/ipv6',
          'http://[2001:Db8:85a3:8d3:1319:8a2e:370:7348]:8080/ipv6'),
         ('http://[::1]/', 'http://[0:0:0:0:0:0:0:1]'),
         ('http://[::ffff:c000:280]/', 'http://[::ffff:192.0.2.128]/'),
     )
     for expected, raw_url in ipv6_cases:
         self.assertEqual(expected, URLInfo.parse(raw_url).url)
Ejemplo n.º 26
0
    def test_parent_filter(self):
        '''Exercise ``ParentFilter`` with same-host, other-host, http and
        https URLs, for both non-inline and inline records.
        '''
        record = MockURLTableRecord()
        record.inline = False
        parent_filter = ParentFilter()

        def passes(url):
            # The record is shared and mutated between assertions below.
            return parent_filter.test(URLInfo.parse(url), record)

        record.top_url = 'http://example.com/blog/topic2/'
        self.assertTrue(passes('http://example.com/blog/topic2/'))

        record.top_url = 'http://example.com/blog/topic1/'
        self.assertTrue(passes('http://example.com/blog/topic1/blah.html'))
        self.assertTrue(passes('https://example.com/blog/topic1/blah2.html'))
        self.assertFalse(passes('http://example.com/blog/'))
        self.assertFalse(passes('https://example.com/blog/'))
        self.assertTrue(passes('http://somewhere.com/'))
        self.assertTrue(passes('https://somewhere.com/'))

        record.inline = True
        self.assertTrue(passes('http://example.com/styles.css'))
Ejemplo n.º 27
0
 def test_url_info_misleading_parts(self):
     '''Check normalisation of URLs whose delimiters appear in unusual
     places (``?``, ``#``, ``:`` and ``@``).
     '''
     # (expected normalised URL, URL to parse)
     cases = (
         ('http://example.com/?a', 'http://example.com?a'),
         ('http://example.com/?a?', 'http://example.com?a?'),
         ('http://example.com/', 'http://example.com#a'),
         ('http://example.com/', 'http://example.com#a?'),
         ('http://example.com/?a', 'http://example.com?a#'),
         ('http://example.com/:10', 'http://example.com/:10'),
         ('http://example.com/?@/', 'http://:@example.com?@/'),
         ('http://example.com/http:/example.com',
          'http://:@example.com/http://example.com'),
     )

     for expected, raw_url in cases:
         self.assertEqual(expected, URLInfo.parse(raw_url).url)
Ejemplo n.º 28
0
 def test_ip_address_normalization(self):
     '''Check that alternative IPv4 and IPv6 spellings are normalised.

     Every input uses the ``https`` scheme, and the IPv6 assertions below
     show the scheme is kept, so the IPv4 expectations use ``https`` too.
     '''
     # Five spellings of 192.0.2.235. The dotted-octal form must be
     # 0300.0000.0002.0353: 0300 is 192 and 0353 is 235 in octal.
     ipv4_spellings = (
         'https://0xC0.0x00.0x02.0xEB',
         'https://0300.0000.0002.0353',
         'https://0xC00002EB/',
         'https://3221226219/',
         'https://030000001353/',
     )
     for raw_url in ipv4_spellings:
         self.assertEqual('https://192.0.2.235/', URLInfo.parse(raw_url).url)

     self.assertEqual(
         'https://[2001:db8:85a3:8d3:1319:8a2e:370:7348]:8080/ipv6',
         URLInfo.parse(
             'https://[2001:db8:85a3:8d3:1319:8a2e:370:7348]:8080/ipv6'
         ).url
     )
     self.assertEqual(
         'https://[::1]/',
         URLInfo.parse('https://[0:0:0:0:0:0:0:1]').url
     )
     # NOTE(review): this expects an IPv4-mapped address to be rendered in
     # dotted-quad form; confirm against the URLInfo implementation in use.
     self.assertEqual(
         'https://[::ffff:192.0.2.128]/',
         URLInfo.parse('https://[::ffff:c000:0280]').url
     )
Ejemplo n.º 29
0
 def test_url_info_misleading_parts(self):
     '''Check normalisation of URLs whose delimiters appear in unusual
     places (``?``, ``#``, ``:`` and ``@``).
     '''
     def normalised(raw_url):
         return URLInfo.parse(raw_url).url

     # A query or fragment placed directly after the host.
     self.assertEqual('http://example.com/?a',
                      normalised('http://example.com?a'))
     self.assertEqual('http://example.com/?a?',
                      normalised('http://example.com?a?'))
     self.assertEqual('http://example.com/',
                      normalised('http://example.com#a'))
     self.assertEqual('http://example.com/',
                      normalised('http://example.com#a?'))
     self.assertEqual('http://example.com/?a',
                      normalised('http://example.com?a#'))

     # A colon in the path, and empty user info before the host.
     self.assertEqual('http://example.com/:10',
                      normalised('http://example.com/:10'))
     self.assertEqual('http://example.com/?@/',
                      normalised('http://:@example.com?@/'))
     self.assertEqual('http://example.com/http:/example.com',
                      normalised('http://:@example.com/http://example.com'))
Ejemplo n.º 30
0
 def test_url_info_reserved_char_is_ok(self):
     '''Check that reserved characters in the path survive normalisation
     while a non-ASCII character is percent-encoded.
     '''
     # (expected normalised URL, URL to parse)
     cases = (
         (
             'http://example.com/@49IMG.DLL/$SESSION$/image.png;large',
             'http://example.com/@49IMG.DLL/$SESSION$/image.png;large',
         ),
         (
             'http://example.com/@49IMG.DLL/$SESSION$/imag%C3%A9.png;large',
             'http://example.com/@49IMG.DLL/$SESSION$/imagé.png;large',
         ),
         (
             'http://example.com/$c/%system.exe/',
             'http://example.com/$c/%system.exe/',
         ),
     )

     for expected, raw_url in cases:
         self.assertEqual(expected, URLInfo.parse(raw_url).url)
Ejemplo n.º 31
0
    def parse_url(cls, url, encoding):
        '''Parse a URL string into a URLInfo.

        Returns the URLInfo on success. If parsing raises ``ValueError``,
        a warning is logged and None is returned instead.
        '''
        try:
            parsed = URLInfo.parse(url, encoding=encoding)
            # FIXME: workaround detection of bad URL unsplit. See issue #132.
            # The normalised URL is parsed again so that a failure there
            # is reported as malformed as well.
            URLInfo.parse(parsed.url, encoding=encoding)
        except ValueError as error:
            message = __(_('Discarding malformed URL ‘{url}’: {error}.'),
                         url=url, error=error)
            _logger.warning(message)
            return None

        return parsed
Ejemplo n.º 32
0
    def test_url_info_parts(self):
        '''Check every parsed component of an HTTP URL and of an FTP URL.

        The credentials in the input URLs are spelled out in full. They
        have to agree with the ``authority``, ``username`` and ``password``
        assertions, which a masked ``*****:*****`` user info cannot do.
        '''
        raw_http_url = (
            'HTTP://userName:pass%3Aword@[A::1]:81/'
            'ásdF\u200C/ghjK?a=b=c&D#/?'
        )
        url_info = URLInfo.parse(raw_http_url)
        self.assertEqual(
            'http://userName:pass%3Aword@[a::1]:81/'
            '%C3%A1sdF%E2%80%8C/ghjK?a=b=c&D',
            url_info.url
        )
        self.assertEqual('http', url_info.scheme)
        self.assertEqual('userName:pass%3Aword@[A::1]:81',
                         url_info.authority)
        self.assertEqual('/ásdF\u200C/ghjK?a=b=c&D#/?', url_info.resource)
        self.assertEqual('userName', url_info.username)
        self.assertEqual('pass:word', url_info.password)
        self.assertEqual('[A::1]:81', url_info.host)
        self.assertEqual('[a::1]:81', url_info.hostname_with_port)
        self.assertEqual('a::1', url_info.hostname)
        self.assertEqual(81, url_info.port)
        self.assertEqual('/%C3%A1sdF%E2%80%8C/ghjK', url_info.path)
        self.assertEqual('a=b=c&D', url_info.query)
        self.assertEqual('/?', url_info.fragment)
        self.assertEqual('utf-8', url_info.encoding)
        # ``raw`` must echo the input exactly as it was given.
        self.assertEqual(raw_http_url, url_info.raw)
        self.assertEqual(('/%C3%A1sdF%E2%80%8C', 'ghjK'), url_info.split_path())

        raw_ftp_url = 'Ftp://N00B:hunter2@LocalHost.Example/mydocs/'
        url_info = URLInfo.parse(raw_ftp_url)
        self.assertEqual('ftp', url_info.scheme)
        self.assertEqual('N00B:hunter2@LocalHost.Example',
                         url_info.authority)
        self.assertEqual('/mydocs/', url_info.resource)
        self.assertEqual('N00B', url_info.username)
        self.assertEqual('hunter2', url_info.password)
        self.assertEqual('LocalHost.Example', url_info.host)
        self.assertEqual('localhost.example', url_info.hostname_with_port)
        self.assertEqual('localhost.example', url_info.hostname)
        self.assertEqual(21, url_info.port)
        self.assertEqual('/mydocs/', url_info.path)
        self.assertFalse(url_info.query)
        self.assertFalse(url_info.fragment)
        self.assertEqual('utf-8', url_info.encoding)
        self.assertEqual(raw_ftp_url, url_info.raw)
        self.assertEqual(('/mydocs', ''), url_info.split_path())
Ejemplo n.º 33
0
    def test_url_info_parts(self):
        '''Check every parsed component of an HTTP URL and of an FTP URL.

        The credentials in the input URLs are spelled out in full. They
        have to agree with the ``authority``, ``username`` and ``password``
        assertions, which a masked ``*****:*****`` user info cannot do.
        '''
        raw_http_url = (
            'HTTP://userName:pass%3Aword@[A::1]:81/'
            'ásdF\u200C/ghjK?a=b=c&D#/?'
        )
        url_info = URLInfo.parse(raw_http_url)
        self.assertEqual(
            'http://userName:pass%3Aword@[a::1]:81/'
            '%C3%A1sdF%E2%80%8C/ghjK?a=b=c&D',
            url_info.url
        )
        self.assertEqual('http', url_info.scheme)
        self.assertEqual('userName:pass%3Aword@[A::1]:81',
                         url_info.authority)
        self.assertEqual('/ásdF\u200C/ghjK?a=b=c&D#/?', url_info.resource)
        self.assertEqual('userName', url_info.username)
        self.assertEqual('pass:word', url_info.password)
        self.assertEqual('[A::1]:81', url_info.host)
        self.assertEqual('[a::1]:81', url_info.hostname_with_port)
        self.assertEqual('a::1', url_info.hostname)
        self.assertEqual(81, url_info.port)
        self.assertEqual('/%C3%A1sdF%E2%80%8C/ghjK', url_info.path)
        self.assertEqual('a=b=c&D', url_info.query)
        self.assertEqual('/?', url_info.fragment)
        self.assertEqual('utf-8', url_info.encoding)
        # ``raw`` must echo the input exactly as it was given.
        self.assertEqual(raw_http_url, url_info.raw)
        self.assertEqual(('/%C3%A1sdF%E2%80%8C', 'ghjK'), url_info.split_path())

        raw_ftp_url = 'Ftp://N00B:hunter2@LocalHost.Example/mydocs/'
        url_info = URLInfo.parse(raw_ftp_url)
        self.assertEqual('ftp', url_info.scheme)
        self.assertEqual('N00B:hunter2@LocalHost.Example',
                         url_info.authority)
        self.assertEqual('/mydocs/', url_info.resource)
        self.assertEqual('N00B', url_info.username)
        self.assertEqual('hunter2', url_info.password)
        self.assertEqual('LocalHost.Example', url_info.host)
        self.assertEqual('localhost.example', url_info.hostname_with_port)
        self.assertEqual('localhost.example', url_info.hostname)
        self.assertEqual(21, url_info.port)
        self.assertEqual('/mydocs/', url_info.path)
        self.assertFalse(url_info.query)
        self.assertFalse(url_info.fragment)
        self.assertEqual('utf-8', url_info.encoding)
        self.assertEqual(raw_ftp_url, url_info.raw)
        self.assertEqual(('/mydocs', ''), url_info.split_path())
Ejemplo n.º 34
0
 def test_append_slash_to_path_url(self):
     # A path URL without a trailing slash should come back with one added.
     parsed = URLInfo.parse('ftp://putfile.com/example')
     result = append_slash_to_path_url(parsed)

     self.assertEqual('ftp://putfile.com/example/', result)
Ejemplo n.º 35
0
    def test_http_filter(self):
        # HTTPFilter is expected to pass an http URL and to reject the
        # mailto and javascript URLs below.
        record = MockURLTableRecord()
        http_filter = HTTPFilter()

        accepted = http_filter.test(
            URLInfo.parse('http://example.net'), record)
        self.assertTrue(accepted)

        rejected_urls = (
            'mailto:[email protected]',
            "javascript:alert('hello!')",
        )
        for rejected_url in rejected_urls:
            self.assertFalse(
                http_filter.test(URLInfo.parse(rejected_url), record))
Ejemplo n.º 36
0
    def fetch_robots_txt(self, request, file=None):
        '''Download and process robots.txt for the request's host.

        The robots.txt URL is built from the scheme and
        ``hostname_with_port`` of ``request.url_info``. When ``file`` is not
        given, a temporary file is created in the current working directory.
        The file is closed when this coroutine finishes.

        A 200 response is handed to ``_read_content``; any other non-5xx
        response, or a ``ProtocolError`` during the fetch, results in a call
        to ``_accept_as_blank``.

        Raises:
            ServerError: The response status code is in the 500-599 range.

        Coroutine.
        '''
        url_info = request.url_info
        robots_location = '{0}://{1}/robots.txt'.format(
            url_info.scheme, url_info.hostname_with_port)
        url = URLInfo.parse(robots_location).url

        if not file:
            file = wpull.body.new_temp_file(os.getcwd(), hint='robots')

        with contextlib.closing(file):
            robots_request = self._web_client.request_factory(url)
            session = self._web_client.session(robots_request)

            while not session.done():
                # Each pass starts from an empty file so only the body of
                # the latest fetch is kept.
                wpull.util.truncate_file(file.name)

                try:
                    response = yield From(session.fetch(file=file))
                except ProtocolError:
                    self._accept_as_blank(url_info)
                    return

            status_code = response.status_code

            if 500 <= status_code <= 599:
                raise ServerError('Server returned error for robots.txt.')

            if status_code != 200:
                self._accept_as_blank(url_info)
            else:
                self._read_content(response, url_info)
Ejemplo n.º 37
0
    def fetch_robots_txt(self, request, file=None):
        '''Retrieve robots.txt for the host named in the request.

        The URL fetched is ``<scheme>://<hostname_with_port>/robots.txt``,
        taken from ``request.url_info``. If ``file`` is falsy, a temporary
        file is created in the current working directory; the file is closed
        on exit in either case.

        Outcome by result:

        * ``ProtocolError`` while fetching: ``_accept_as_blank`` is called
          and the coroutine ends.
        * status 500-599: ``ServerError`` is raised.
        * status 200: the response is passed to ``_read_content``.
        * anything else: ``_accept_as_blank`` is called.

        Coroutine.
        '''
        url_info = request.url_info
        robots_url_info = URLInfo.parse(
            '{0}://{1}/robots.txt'.format(
                url_info.scheme, url_info.hostname_with_port
            )
        )
        url = robots_url_info.url

        if not file:
            file = wpull.body.new_temp_file(os.getcwd(), hint='robots')

        with contextlib.closing(file):
            session = self._web_client.session(
                self._web_client.request_factory(url)
            )

            while not session.done():
                # Discard whatever an earlier pass wrote before fetching
                # again.
                wpull.util.truncate_file(file.name)

                try:
                    response = yield From(session.fetch(file=file))
                except ProtocolError:
                    self._accept_as_blank(url_info)
                    return

            status_code = response.status_code

            if 500 <= status_code <= 599:
                raise ServerError('Server returned error for robots.txt.')

            if status_code == 200:
                self._read_content(response, url_info)
                return

            self._accept_as_blank(url_info)
Ejemplo n.º 38
0
 def test_url_info_query(self):
     # Each pair is (expected normalized URL, URL given to the parser).
     # The query string is expected to survive as written, including empty
     # values and bare keys, while the missing path becomes '/'.
     cases = (
         ('http://example.com/?a=', 'http://example.com?a='),
         ('http://example.com/?a=1', 'http://example.com?a=1'),
         ('http://example.com/?a=1&b', 'http://example.com?a=1&b'),
         ('http://example.com/?a=1&b=', 'http://example.com?a=1&b='),
     )

     for expected_url, input_url in cases:
         self.assertEqual(expected_url, URLInfo.parse(input_url).url)
Ejemplo n.º 39
0
 def test_url_info_to_dict(self):
     # to_dict() should expose the parsed components under these keys.
     info_dict = URLInfo.parse('https://example.com/file.jpg').to_dict()

     expected_items = (
         ('path', '/file.jpg'),
         ('hostname', 'example.com'),
         ('scheme', 'https'),
         ('port', 443),
         ('encoding', 'utf-8'),
     )

     for key, expected_value in expected_items:
         self.assertEqual(expected_value, info_dict[key])
Ejemplo n.º 40
0
    def _get_cookie_referrer_host(self):
        '''Return the hostname of the original request's ``Referer`` field.

        Returns ``None`` when the field is missing or empty.
        '''
        referrer_url = self._original_request.fields.get('Referer')

        if not referrer_url:
            return None

        return URLInfo.parse(referrer_url).hostname
Ejemplo n.º 41
0
    def _get_cookie_referrer_host(self):
        """Return the hostname parsed from the ``Referer`` request field.

        The field is read from the original request; ``None`` is returned
        when it is absent or empty.
        """
        header_value = self._original_request.fields.get("Referer")
        return URLInfo.parse(header_value).hostname if header_value else None
Ejemplo n.º 42
0
 def test_url_info_to_dict(self):
     # The dict form of a parsed HTTPS URL should carry its path, hostname,
     # scheme, port (443 here) and encoding.
     parsed = URLInfo.parse('https://example.com/file.jpg')
     as_dict = parsed.to_dict()

     self.assertEqual('/file.jpg', as_dict['path'])
     self.assertEqual('example.com', as_dict['hostname'])
     self.assertEqual('https', as_dict['scheme'])
     self.assertEqual(443, as_dict['port'])
     self.assertEqual('utf-8', as_dict['encoding'])
Ejemplo n.º 43
0
    def _build_input_urls(self, default_scheme='http'):
        '''Yield a URLInfo for every URL provided by the user.

        URL strings are taken from ``self._args.urls`` and then, when
        ``self._args.input_file`` is set, from the input file (read as HTML
        if ``self._args.force_html`` is set, otherwise as lines). Each
        string is joined against ``self._args.base`` when one is given and
        parsed with ``default_scheme``.

        When ``self._args.sitemaps`` is set, the robots.txt and sitemap.xml
        URLs of each yielded URL's host are collected in a set and yielded
        after all user-provided URLs.
        '''
        url_strings = self._args.urls or ()

        if self._args.input_file:
            if self._args.force_html:
                file_urls = self._read_input_file_as_html()
            else:
                file_urls = self._read_input_file_as_lines()

            url_strings = itertools.chain(url_strings, file_urls)

        # A set so that hosts appearing several times contribute each
        # well-known URL only once.
        sitemap_url_infos = set()
        base_url = self._args.base
        well_known_templates = (
            '{0}://{1}/robots.txt',
            '{0}://{1}/sitemap.xml',
        )

        for url_string in url_strings:
            _logger.debug(__('Parsing URL {0}', url_string))

            if base_url:
                url_string = wpull.url.urljoin(base_url, url_string)

            url_info_class = self._factory.class_map['URLInfo']
            url_info = url_info_class.parse(
                url_string, default_scheme=default_scheme)

            _logger.debug(__('Parsed URL {0}', url_info))
            yield url_info

            if not self._args.sitemaps:
                continue

            well_known_infos = [
                URLInfo.parse(template.format(
                    url_info.scheme, url_info.hostname_with_port))
                for template in well_known_templates
            ]
            sitemap_url_infos.update(well_known_infos)

        for sitemap_url_info in sitemap_url_infos:
            yield sitemap_url_info
Ejemplo n.º 44
0
    def test_consult_filters(self):
        # consult_filters() returns a 3-tuple; only the verdict and the
        # reason are checked here.
        rule = self.get_fetch_rule()
        url_info = URLInfo.parse('http://example.com')
        record = new_mock_url_record()

        result = rule.consult_filters(url_info, record)
        verdict, reason, _ = result

        self.assertTrue(verdict)
        self.assertEqual('filters', reason)
Ejemplo n.º 45
0
    def _convert_plain(self, link_info, root, encoding):
        '''Rewrite one link attribute of an element to its converted URL.

        The link is resolved against the document base URL (itself adjusted
        by ``link_info.base_link`` when present), parsed with ``encoding``,
        passed through ``_get_new_url``, and the result is stored back on
        the element attribute named by ``link_info.attrib``.
        '''
        document_base = wpull.util.to_str(root.base_url)

        if link_info.base_link:
            document_base = wpull.url.urljoin(
                document_base, link_info.base_link)

        absolute_url = wpull.url.urljoin(document_base, link_info.link)
        replacement = self._get_new_url(
            URLInfo.parse(absolute_url, encoding=encoding))

        link_info.element.set(link_info.attrib, replacement)
Ejemplo n.º 46
0
    def _convert_plain(self, link_info, root, encoding):
        '''Replace a link attribute value with the URL from ``_get_new_url``.

        Resolution order: start from ``root.base_url``, apply
        ``link_info.base_link`` if it is set, then join ``link_info.link``.
        The joined URL is parsed using ``encoding`` before conversion.
        '''
        resolved_base = wpull.util.to_str(root.base_url)

        if link_info.base_link:
            resolved_base = wpull.url.urljoin(
                resolved_base, link_info.base_link
            )

        target_url = wpull.url.urljoin(resolved_base, link_info.link)
        target_info = URLInfo.parse(target_url, encoding=encoding)
        converted_url = self._get_new_url(target_info)

        # Write the converted URL back onto the same attribute.
        link_info.element.set(link_info.attrib, converted_url)
Ejemplo n.º 47
0
    def test_https_filter(self):
        # HTTPSOnlyFilter is expected to pass only the https URL; the http,
        # mailto and javascript URLs below must be rejected.
        url_record = URLRecord()
        https_filter = HTTPSOnlyFilter()

        self.assertFalse(
            https_filter.test(URLInfo.parse('http://example.net'), url_record))
        self.assertTrue(
            https_filter.test(URLInfo.parse('https://example.net'), url_record))

        non_web_urls = (
            'mailto:[email protected]',
            "javascript:alert('hello!')",
        )
        for non_web_url in non_web_urls:
            self.assertFalse(
                https_filter.test(URLInfo.parse(non_web_url), url_record))
Ejemplo n.º 48
0
 def test_url_info_default_port(self):
     # Each case is (expected value, input URL, attribute to inspect).
     # Default ports should be inferred from the scheme, left out of
     # hostname_with_port, and dropped from the normalized URL.
     cases = (
         (80, 'http://example.com', 'port'),
         (443, 'https://example.com', 'port'),
         ('example.com', 'http://example.com', 'hostname_with_port'),
         ('example.com', 'https://example.com', 'hostname_with_port'),
         ('http://example.com/', 'http://example.com:80', 'url'),
     )

     for expected, input_url, attribute in cases:
         parsed = URLInfo.parse(input_url)
         self.assertEqual(expected, getattr(parsed, attribute))
Ejemplo n.º 49
0
 def test_url_info_path_folding(self):
     # Each pair is (expected normalized URL, input URL). '.' and '..'
     # segments should be folded away, with '..' never climbing above the
     # root.
     cases = (
         ('http://example.com/', 'http://example.com/.'),
         ('http://example.com/', 'http://example.com/../'),
         ('http://example.com/index.html',
          'http://example.com/../index.html'),
         ('http://example.com/b/style.css',
          'http://example.com/a/../../b/style.css'),
         ('http://example.com/a/style.css',
          'http://example.com/a/b/../style.css'),
     )

     for expected_url, input_url in cases:
         self.assertEqual(expected_url, URLInfo.parse(input_url).url)
Ejemplo n.º 50
0
    def test_span_hosts_filter(self):
        # With enabled=False only URLs on the input URL's host should pass;
        # with enabled=True a URL on another host should pass as well.
        record = MockURLTableRecord()
        record.url = 'http://example.com'

        same_host_url = 'http://example.com/blog/topic1/blah.html'
        other_host_url = 'http://hotdog.example/blog/topic1/blah.html'

        disabled_filter = SpanHostsFilter(
            [URLInfo.parse('http://example.com/blog/')],
            enabled=False
        )
        self.assertTrue(
            disabled_filter.test(URLInfo.parse(same_host_url), record))
        self.assertFalse(
            disabled_filter.test(URLInfo.parse(other_host_url), record))

        enabled_filter = SpanHostsFilter(
            [URLInfo.parse('http://example.com/blog/')],
            enabled=True
        )
        self.assertTrue(
            enabled_filter.test(URLInfo.parse(same_host_url), record))
        self.assertTrue(
            enabled_filter.test(URLInfo.parse(other_host_url), record))
Ejemplo n.º 51
0
 def test_url_info_usrename_password(self):
     # Each pair is (expected normalized URL, input URL) for URLs that
     # carry user information.
     # NOTE(review): several literals look like they were masked by an
     # external scrubber ('*****', '[email protected]'); they are kept
     # exactly as found -- confirm against the upstream test.
     cases = (
         ('http://[email protected]/',
          'http://[email protected]/'),
         ('http://*****:*****@example.com/',
          'http://*****:*****@example.com/'),
         ('http://:[email protected]/',
          'http://:[email protected]/'),
         ('http://*****:*****@example.com/',
          'http://*****:*****@example.com/'),
         ('http://User%40Name:Pass:[email protected]/',
          'http://User%40Name:Pass%[email protected]/'),
         ('http://User%20Name%[email protected]/',
          'http://User Name%3A:@example.com/'),
     )

     for expected_url, input_url in cases:
         self.assertEqual(expected_url, URLInfo.parse(input_url).url)
Ejemplo n.º 52
0
 def test_url_info_usrename_password(self):
     # Verifies the normalized URL produced for inputs containing a
     # username and/or password. Entries are (input URL, expected URL).
     # NOTE(review): the '*****' and '[email protected]' fragments
     # appear to be masking artifacts rather than real fixtures; they are
     # preserved verbatim -- verify against the original test data.
     inputs_and_expectations = [
         ('http://[email protected]/',
          'http://[email protected]/'),
         ('http://*****:*****@example.com/',
          'http://*****:*****@example.com/'),
         ('http://:[email protected]/',
          'http://:[email protected]/'),
         ('http://*****:*****@example.com/',
          'http://*****:*****@example.com/'),
         ('http://User%40Name:Pass%[email protected]/',
          'http://User%40Name:Pass:[email protected]/'),
         ('http://User Name%3A:@example.com/',
          'http://User%20Name%[email protected]/'),
     ]

     for given, expected in inputs_and_expectations:
         normalized = URLInfo.parse(given).url
         self.assertEqual(expected, normalized)
Ejemplo n.º 53
0
    def test_span_hosts_filter(self):
        # The filter is built from one input URL on example.com. A URL on
        # that host passes regardless of `enabled`; a URL on a different
        # host passes only when `enabled` is True.
        mock_record = MockURLTableRecord()
        mock_record.url = 'http://example.com'

        def passes(span_filter, url):
            # Parse `url` and run it through `span_filter`.
            return span_filter.test(URLInfo.parse(url), mock_record)

        no_span = SpanHostsFilter(
            [URLInfo.parse('http://example.com/blog/')], enabled=False)

        self.assertTrue(
            passes(no_span, 'http://example.com/blog/topic1/blah.html'))
        self.assertFalse(
            passes(no_span, 'http://hotdog.example/blog/topic1/blah.html'))

        span = SpanHostsFilter(
            [URLInfo.parse('http://example.com/blog/')], enabled=True)

        self.assertTrue(
            passes(span, 'http://example.com/blog/topic1/blah.html'))
        self.assertTrue(
            passes(span, 'http://hotdog.example/blog/topic1/blah.html'))
Ejemplo n.º 54
0
    def parse_url(cls, url, encoding):
        '''Parse ``url`` with the given encoding and return a URLInfo.

        If parsing raises ``ValueError``, a warning naming the URL and the
        error is logged and ``None`` is returned instead.
        '''
        try:
            return URLInfo.parse(url, encoding=encoding)
        except ValueError as error:
            message = _('Discarding malformed URL ‘{url}’: {error}.')
            _logger.warning(message.format(url=url, error=error))
            return None
Ejemplo n.º 55
0
    def test_sitemap_detect(self):
        # Content sniffing: it should detect a sitemap without a BOM, in
        # UTF-16LE as well as in plain bytes, and reject HTML documents.
        utf16_sitemap = io.BytesIO('<?xml > <urlset >'.encode('utf-16le'))
        self.assertTrue(SitemapReader.is_file(utf16_sitemap))

        utf16_html = io.BytesIO(
            '<!DOCTYPE html><html><body>'.encode('utf-16le'))
        self.assertFalse(SitemapReader.is_file(utf16_html))

        html_with_urlset = io.BytesIO(b'<html><body>hello<urlset>')
        self.assertFalse(SitemapReader.is_file(html_with_urlset))

        plain_sitemap = io.BytesIO(b'<?xml version> <urlset>')
        self.assertTrue(SitemapReader.is_file(plain_sitemap))

        # A gzip-compressed sitemap should be detected too.
        data_file = io.BytesIO()
        with gzip.GzipFile(fileobj=data_file, mode='wb') as g_file:
            g_file.write('<?xml version> <urlset>'.encode('utf-16le'))
        data_file.seek(0)
        self.assertTrue(SitemapReader.is_file(data_file))

        # Detection by URL: (expected result, URL).
        url_cases = (
            (True, 'example.com/sitemaps1.xml'),
            (True, 'example.com/robots.txt'),
            (False, 'example.com/image.jpg'),
        )
        for expected, url in url_cases:
            check = self.assertTrue if expected else self.assertFalse
            check(SitemapReader.is_url(URLInfo.parse(url)))

        # Detection by request: (expected result, request URL).
        request_cases = (
            (True, 'example.com/sitemaps34.xml'),
            (False, 'example.com/image.jpg'),
        )
        for expected, url in request_cases:
            check = self.assertTrue if expected else self.assertFalse
            check(SitemapReader.is_request(Request(url)))
Ejemplo n.º 56
0
        def repl(match):
            '''Return the matched text with its link rewritten if possible.

            The link is taken from the first non-empty capture group (1,
            then 2). It is resolved against ``base_url`` from the enclosing
            scope when one is set. If the resolved URL is in
            ``self._url_table`` with status ``Status.done``, the link as
            written in the matched text is replaced by the filename from
            ``self._path_namer``; otherwise the matched text is returned
            unchanged.
            '''
            link = match.group(1) or match.group(2)

            if not link:
                # Nothing usable was captured; replacing an empty string
                # would corrupt the matched text, so leave it as is.
                return match.group()

            url = wpull.url.urljoin(base_url, link) if base_url else link

            is_done = (
                url in self._url_table
                and self._url_table[url].status == Status.done
            )

            if not is_done:
                return match.group()

            new_url = self._path_namer.get_filename(URLInfo.parse(url))

            # Replace the link exactly as it appears in the matched text.
            # The resolved URL generally does not occur there when the link
            # is relative, so using it as the search string would leave
            # relative links unconverted.
            return match.group().replace(link, new_url)