Ejemplo n.º 1
0
 def test_host_normalization(self):
     """
     Check the case normalization performed by get_host().

     Every expected scheme is lower-case. The expected host is
     lower-case as well, except for the 'about' and 'http+unix'
     entries, whose host is expected back exactly as written.
     """
     # Each entry pairs an input URL with the tuple get_host() must return.
     cases = [
         ('HTTP://GOOGLE.COM/mail/', ('http', 'google.com', None)),
         ('GOogle.COM/mail', ('http', 'google.com', None)),
         ('HTTP://GoOgLe.CoM:8000/mail/', ('http', 'google.com', 8000)),
         ('HTTP://*****:*****@EXAMPLE.COM:1234',
          ('http', 'example.com', 1234)),
         ('173.194.35.7', ('http', '173.194.35.7', None)),
         ('HTTP://173.194.35.7', ('http', '173.194.35.7', None)),
         ('HTTP://[2a00:1450:4001:c01::67]:80/test',
          ('http', '[2a00:1450:4001:c01::67]', 80)),
         ('HTTP://[FEDC:BA98:7654:3210:FEDC:BA98:7654:3210]:8000/index.html',
          ('http', '[fedc:ba98:7654:3210:fedc:ba98:7654:3210]', 8000)),
         ('HTTPS://[1080:0:0:0:8:800:200c:417A]/index.html',
          ('https', '[1080:0:0:0:8:800:200c:417a]', None)),
         # The two schemes below keep the host's original casing.
         ('abOut://eXamPlE.com?info=1', ('about', 'eXamPlE.com', None)),
         ('http+UNIX://%2fvar%2frun%2fSOCKET/path',
          ('http+unix', '%2fvar%2frun%2fSOCKET', None)),
     ]
     for raw_url, expected in cases:
         self.assertEqual(get_host(raw_url), expected)
Ejemplo n.º 2
0
 def test_host_normalization(self):
     """
     Verify that get_host() lower-cases the scheme, and lower-cases the
     host for the schemes where the expectations below say it should.
     """
     # Maps each input URL to the exact tuple get_host() must return.
     normalized = {
         'HTTP://GOOGLE.COM/mail/': ('http', 'google.com', None),
         'GOogle.COM/mail': ('http', 'google.com', None),
         'HTTP://GoOgLe.CoM:8000/mail/': ('http', 'google.com', 8000),
         'HTTP://*****:*****@EXAMPLE.COM:1234': (
             'http', 'example.com', 1234),
         '173.194.35.7': ('http', '173.194.35.7', None),
         'HTTP://173.194.35.7': ('http', '173.194.35.7', None),
         'HTTP://[2a00:1450:4001:c01::67]:80/test': (
             'http', '[2a00:1450:4001:c01::67]', 80),
         'HTTP://[FEDC:BA98:7654:3210:FEDC:BA98:7654:3210]:8000/index.html': (
             'http', '[fedc:ba98:7654:3210:fedc:ba98:7654:3210]', 8000),
         'HTTPS://[1080:0:0:0:8:800:200c:417A]/index.html': (
             'https', '[1080:0:0:0:8:800:200c:417a]', None),
         # For these two schemes the host is expected back unchanged.
         'abOut://eXamPlE.com?info=1': ('about', 'eXamPlE.com', None),
         'http+UNIX://%2fvar%2frun%2fSOCKET/path': (
             'http+unix', '%2fvar%2frun%2fSOCKET', None),
     }
     for raw_url in normalized:
         self.assertEqual(get_host(raw_url), normalized[raw_url])
Ejemplo n.º 3
0
    def work(self, site):
        """Fetch *site*, record a summary of the response, follow same-prefix redirects.

        A dict describing the response is appended to ``self.site_info_list``.
        When the response status is 301 or 302 and the resolved ``Location``
        differs from *site* but still starts with it, this method calls
        itself on that location.

        Args:
            site: URL to request; also used as the base for resolving a
                relative ``Location`` header.
        """
        _scheme, hostname, _port = get_host(site)

        response = utils.http_req(site)
        record = {
            "site": site,
            "hostname": hostname,
            "ip": "",
            "title": utils.get_title(response.content),
            "status": response.status_code,
            "headers": utils.get_headers(response),
            "http_server": response.headers.get("Server", ""),
            "body_length": len(response.content),
            "finger": [],
            "favicon": fetch_favicon(site),
        }

        parsed = utils.domain_parsed(hostname)
        if not parsed:
            # NOTE(review): presumably the hostname is already an IP literal
            # when it cannot be parsed as a domain -- confirm against utils.
            record["ip"] = hostname
        else:
            record["fld"] = parsed["fld"]
            addresses = utils.get_ip(hostname)
            if addresses:
                # Only the first returned address is recorded.
                record["ip"] = addresses[0]

        self.site_info_list.append(record)

        if response.status_code not in (301, 302):
            return

        redirect_target = urljoin(site, response.headers.get("Location", ""))
        # Follow the redirect only when it stays under the original URL prefix.
        if redirect_target != site and redirect_target.startswith(site):
            self.work(redirect_target)
Ejemplo n.º 4
0
    def test_get_host(self):
        """Check the tuple get_host() returns for host names, IPv4 and IPv6 URLs."""
        # Each entry pairs an input URL with the expected get_host() result.
        plain_hosts = [
            ('http://google.com/mail', ('http', 'google.com', None)),
            ('http://google.com/mail/', ('http', 'google.com', None)),
            ('google.com/mail', ('http', 'google.com', None)),
            ('http://google.com/', ('http', 'google.com', None)),
            ('http://google.com', ('http', 'google.com', None)),
            ('http://www.google.com', ('http', 'www.google.com', None)),
            ('http://mail.google.com', ('http', 'mail.google.com', None)),
            ('http://google.com:8000/mail/', ('http', 'google.com', 8000)),
            ('http://google.com:8000', ('http', 'google.com', 8000)),
            ('https://google.com', ('https', 'google.com', None)),
            ('https://google.com:8000', ('https', 'google.com', 8000)),
            ('http://*****:*****@127.0.0.1:1234', ('http', '127.0.0.1', 1234)),
            # A second URL embedded in the path, query or fragment must not
            # change the reported host or port.
            ('http://google.com/foo=http://bar:42/baz',
             ('http', 'google.com', None)),
            ('http://google.com?foo=http://bar:42/baz',
             ('http', 'google.com', None)),
            ('http://google.com#foo=http://bar:42/baz',
             ('http', 'google.com', None)),
        ]
        ipv4_hosts = [
            ('173.194.35.7', ('http', '173.194.35.7', None)),
            ('http://173.194.35.7', ('http', '173.194.35.7', None)),
            ('http://173.194.35.7/test', ('http', '173.194.35.7', None)),
            ('http://173.194.35.7:80', ('http', '173.194.35.7', 80)),
            ('http://173.194.35.7:80/test', ('http', '173.194.35.7', 80)),
        ]
        ipv6_hosts = [
            ('[2a00:1450:4001:c01::67]',
             ('http', '[2a00:1450:4001:c01::67]', None)),
            ('http://[2a00:1450:4001:c01::67]',
             ('http', '[2a00:1450:4001:c01::67]', None)),
            ('http://[2a00:1450:4001:c01::67]/test',
             ('http', '[2a00:1450:4001:c01::67]', None)),
            ('http://[2a00:1450:4001:c01::67]:80',
             ('http', '[2a00:1450:4001:c01::67]', 80)),
            ('http://[2a00:1450:4001:c01::67]:80/test',
             ('http', '[2a00:1450:4001:c01::67]', 80)),
        ]
        # More IPv6 from http://www.ietf.org/rfc/rfc2732.txt
        rfc2732_hosts = [
            ('http://[FEDC:BA98:7654:3210:FEDC:BA98:7654:3210]:8000/index.html',
             ('http', '[FEDC:BA98:7654:3210:FEDC:BA98:7654:3210]', 8000)),
            ('http://[1080:0:0:0:8:800:200C:417A]/index.html',
             ('http', '[1080:0:0:0:8:800:200C:417A]', None)),
            ('http://[3ffe:2a00:100:7031::1]',
             ('http', '[3ffe:2a00:100:7031::1]', None)),
            ('http://[1080::8:800:200C:417A]/foo',
             ('http', '[1080::8:800:200C:417A]', None)),
            ('http://[::192.9.5.5]/ipng', ('http', '[::192.9.5.5]', None)),
            ('http://[::FFFF:129.144.52.38]:42/index.html',
             ('http', '[::FFFF:129.144.52.38]', 42)),
            ('http://[2010:836B:4179::836B:4179]',
             ('http', '[2010:836B:4179::836B:4179]', None)),
        ]

        every_case = plain_hosts + ipv4_hosts + ipv6_hosts + rfc2732_hosts
        for url, expected in every_case:
            self.assertEqual(get_host(url), expected)
Ejemplo n.º 5
0
    def test_get_host(self):
        """Run get_host() over host-name, IPv4 and IPv6 URLs and compare results."""
        # (input URL, expected get_host() result), checked in this order.
        cases = (
            # Hosts
            ('http://google.com/mail', ('http', 'google.com', None)),
            ('http://google.com/mail/', ('http', 'google.com', None)),
            ('google.com/mail', ('http', 'google.com', None)),
            ('http://google.com/', ('http', 'google.com', None)),
            ('http://google.com', ('http', 'google.com', None)),
            ('http://www.google.com', ('http', 'www.google.com', None)),
            ('http://mail.google.com', ('http', 'mail.google.com', None)),
            ('http://google.com:8000/mail/', ('http', 'google.com', 8000)),
            ('http://google.com:8000', ('http', 'google.com', 8000)),
            ('https://google.com', ('https', 'google.com', None)),
            ('https://google.com:8000', ('https', 'google.com', 8000)),
            ('http://*****:*****@127.0.0.1:1234', ('http', '127.0.0.1', 1234)),
            ('http://google.com/foo=http://bar:42/baz',
             ('http', 'google.com', None)),
            ('http://google.com?foo=http://bar:42/baz',
             ('http', 'google.com', None)),
            ('http://google.com#foo=http://bar:42/baz',
             ('http', 'google.com', None)),

            # IPv4
            ('173.194.35.7', ('http', '173.194.35.7', None)),
            ('http://173.194.35.7', ('http', '173.194.35.7', None)),
            ('http://173.194.35.7/test', ('http', '173.194.35.7', None)),
            ('http://173.194.35.7:80', ('http', '173.194.35.7', 80)),
            ('http://173.194.35.7:80/test', ('http', '173.194.35.7', 80)),

            # IPv6
            ('[2a00:1450:4001:c01::67]',
             ('http', '[2a00:1450:4001:c01::67]', None)),
            ('http://[2a00:1450:4001:c01::67]',
             ('http', '[2a00:1450:4001:c01::67]', None)),
            ('http://[2a00:1450:4001:c01::67]/test',
             ('http', '[2a00:1450:4001:c01::67]', None)),
            ('http://[2a00:1450:4001:c01::67]:80',
             ('http', '[2a00:1450:4001:c01::67]', 80)),
            ('http://[2a00:1450:4001:c01::67]:80/test',
             ('http', '[2a00:1450:4001:c01::67]', 80)),

            # More IPv6 from http://www.ietf.org/rfc/rfc2732.txt
            ('http://[fedc:ba98:7654:3210:fedc:ba98:7654:3210]:8000/index.html',
             ('http', '[fedc:ba98:7654:3210:fedc:ba98:7654:3210]', 8000)),
            ('http://[1080:0:0:0:8:800:200c:417a]/index.html',
             ('http', '[1080:0:0:0:8:800:200c:417a]', None)),
            ('http://[3ffe:2a00:100:7031::1]',
             ('http', '[3ffe:2a00:100:7031::1]', None)),
            ('http://[1080::8:800:200c:417a]/foo',
             ('http', '[1080::8:800:200c:417a]', None)),
            ('http://[::192.9.5.5]/ipng', ('http', '[::192.9.5.5]', None)),
            ('http://[::ffff:129.144.52.38]:42/index.html',
             ('http', '[::ffff:129.144.52.38]', 42)),
            ('http://[2010:836b:4179::836b:4179]',
             ('http', '[2010:836b:4179::836b:4179]', None)),
        )
        for candidate, expected in cases:
            self.assertEqual(get_host(candidate), expected)
Ejemplo n.º 6
0
 def test_host_normalization(self):
     """Check that get_host() returns the scheme and host in lower-case."""
     # (mixed-case input URL, expected lower-cased get_host() result)
     cases = (
         ('HTTP://GOOGLE.COM/mail/', ('http', 'google.com', None)),
         ('GOogle.COM/mail', ('http', 'google.com', None)),
         ('HTTP://GoOgLe.CoM:8000/mail/', ('http', 'google.com', 8000)),
         ('HTTP://*****:*****@EXAMPLE.COM:1234',
          ('http', 'example.com', 1234)),
         ('173.194.35.7', ('http', '173.194.35.7', None)),
         ('HTTP://173.194.35.7', ('http', '173.194.35.7', None)),
         ('HTTP://[2a00:1450:4001:c01::67]:80/test',
          ('http', '[2a00:1450:4001:c01::67]', 80)),
         ('HTTP://[FEDC:BA98:7654:3210:FEDC:BA98:7654:3210]:8000/index.html',
          ('http', '[fedc:ba98:7654:3210:fedc:ba98:7654:3210]', 8000)),
         ('HTTPS://[1080:0:0:0:8:800:200c:417A]/index.html',
          ('https', '[1080:0:0:0:8:800:200c:417a]', None)),
     )
     for mixed_case_url, expected in cases:
         self.assertEqual(get_host(mixed_case_url), expected)
Ejemplo n.º 7
0
 def test_get_host(self):
     """Compare get_host() output with the expected tuple for each sample URL."""
     # Ordered (input URL, expected get_host() result) pairs.
     samples = [
         # Hosts
         ("http://google.com/mail", ("http", "google.com", None)),
         ("http://google.com/mail/", ("http", "google.com", None)),
         ("google.com/mail", ("http", "google.com", None)),
         ("http://google.com/", ("http", "google.com", None)),
         ("http://google.com", ("http", "google.com", None)),
         ("http://www.google.com", ("http", "www.google.com", None)),
         ("http://mail.google.com", ("http", "mail.google.com", None)),
         ("http://google.com:8000/mail/", ("http", "google.com", 8000)),
         ("http://google.com:8000", ("http", "google.com", 8000)),
         ("https://google.com", ("https", "google.com", None)),
         ("https://google.com:8000", ("https", "google.com", 8000)),
         ("http://*****:*****@127.0.0.1:1234", ("http", "127.0.0.1", 1234)),
         (
             "http://google.com/foo=http://bar:42/baz",
             ("http", "google.com", None),
         ),
         (
             "http://google.com?foo=http://bar:42/baz",
             ("http", "google.com", None),
         ),
         (
             "http://google.com#foo=http://bar:42/baz",
             ("http", "google.com", None),
         ),
         # IPv4
         ("173.194.35.7", ("http", "173.194.35.7", None)),
         ("http://173.194.35.7", ("http", "173.194.35.7", None)),
         ("http://173.194.35.7/test", ("http", "173.194.35.7", None)),
         ("http://173.194.35.7:80", ("http", "173.194.35.7", 80)),
         ("http://173.194.35.7:80/test", ("http", "173.194.35.7", 80)),
         # IPv6
         (
             "[2a00:1450:4001:c01::67]",
             ("http", "[2a00:1450:4001:c01::67]", None),
         ),
         (
             "http://[2a00:1450:4001:c01::67]",
             ("http", "[2a00:1450:4001:c01::67]", None),
         ),
         (
             "http://[2a00:1450:4001:c01::67]/test",
             ("http", "[2a00:1450:4001:c01::67]", None),
         ),
         (
             "http://[2a00:1450:4001:c01::67]:80",
             ("http", "[2a00:1450:4001:c01::67]", 80),
         ),
         (
             "http://[2a00:1450:4001:c01::67]:80/test",
             ("http", "[2a00:1450:4001:c01::67]", 80),
         ),
         # More IPv6 from http://www.ietf.org/rfc/rfc2732.txt
         (
             "http://[FEDC:BA98:7654:3210:FEDC:BA98:7654:3210]:8000/index.html",
             ("http", "[FEDC:BA98:7654:3210:FEDC:BA98:7654:3210]", 8000),
         ),
         (
             "http://[1080:0:0:0:8:800:200C:417A]/index.html",
             ("http", "[1080:0:0:0:8:800:200C:417A]", None),
         ),
         (
             "http://[3ffe:2a00:100:7031::1]",
             ("http", "[3ffe:2a00:100:7031::1]", None),
         ),
         (
             "http://[1080::8:800:200C:417A]/foo",
             ("http", "[1080::8:800:200C:417A]", None),
         ),
         ("http://[::192.9.5.5]/ipng", ("http", "[::192.9.5.5]", None)),
         (
             "http://[::FFFF:129.144.52.38]:42/index.html",
             ("http", "[::FFFF:129.144.52.38]", 42),
         ),
         (
             "http://[2010:836B:4179::836B:4179]",
             ("http", "[2010:836B:4179::836B:4179]", None),
         ),
     ]
     for sample_url, expected in samples:
         self.assertEqual(get_host(sample_url), expected)
Ejemplo n.º 8
0
 def test_host_normalization(self):
     """Verify get_host() hands back a lower-case scheme and host."""
     # Maps each mixed-case input URL to the tuple get_host() must return.
     lowered = {
         'HTTP://GOOGLE.COM/mail/': ('http', 'google.com', None),
         'GOogle.COM/mail': ('http', 'google.com', None),
         'HTTP://GoOgLe.CoM:8000/mail/': ('http', 'google.com', 8000),
         'HTTP://*****:*****@EXAMPLE.COM:1234': (
             'http', 'example.com', 1234),
         '173.194.35.7': ('http', '173.194.35.7', None),
         'HTTP://173.194.35.7': ('http', '173.194.35.7', None),
         'HTTP://[2a00:1450:4001:c01::67]:80/test': (
             'http', '[2a00:1450:4001:c01::67]', 80),
         'HTTP://[FEDC:BA98:7654:3210:FEDC:BA98:7654:3210]:8000/index.html': (
             'http', '[fedc:ba98:7654:3210:fedc:ba98:7654:3210]', 8000),
         'HTTPS://[1080:0:0:0:8:800:200c:417A]/index.html': (
             'https', '[1080:0:0:0:8:800:200c:417a]', None),
     }
     for mixed_case_url in lowered:
         self.assertEqual(get_host(mixed_case_url), lowered[mixed_case_url])
Ejemplo n.º 9
0
 def test_invalid_host(self, location):
     """Expect get_host() to raise LocationParseError for *location*."""
     expect_parse_error = pytest.raises(LocationParseError)
     with expect_parse_error:
         get_host(location)
Ejemplo n.º 10
0
 def test_get_host(self, url, expected_host):
     """Check that get_host(url) equals *expected_host*."""
     assert get_host(url) == expected_host
Ejemplo n.º 11
0
 def test_invalid_host(self, location):
     """Expect get_host() to raise LocationParseError for *location*."""
     expect_parse_error = pytest.raises(LocationParseError)
     with expect_parse_error:
         get_host(location)
Ejemplo n.º 12
0
 def test_get_host(self, url, expected_host):
     """Check that get_host(url) equals *expected_host*."""
     assert get_host(url) == expected_host