Example #1
    def test_basic_requests(self):
        proxy_http_client = Client(recorder=DebugPrintRecorder())
        proxy_server = HTTPProxyServer(proxy_http_client)
        proxy_socket, proxy_port = tornado.testing.bind_unused_port()

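        # Start the proxy server on the port reserved above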
        yield From(trollius.start_server(proxy_server, sock=proxy_socket))

        connection_pool = HTTPProxyConnectionPool(('127.0.0.1', proxy_port))
        http_client = Client(connection_pool=connection_pool,
                             recorder=DebugPrintRecorder())

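        # Run several sessions through the proxy to check it stays usable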
        for dummy in range(3):
            with http_client.session() as session:
                response = yield From(session.fetch(Request(self.get_url('/'))))
                self.assertEqual(200, response.status_code)

                file = io.BytesIO()
                yield From(session.read_content(file=file))
                data = file.getvalue().decode('ascii', 'replace')
                self.assertTrue(data.endswith('</html>'))

            with http_client.session() as session:
                response = yield From(session.fetch(Request(
                    self.get_url('/always_error'))))
                self.assertEqual(500, response.status_code)
                self.assertEqual('Dragon In Data Center', response.reason)

                file = io.BytesIO()
                yield From(session.read_content(file=file))
                data = file.getvalue().decode('ascii', 'replace')
                self.assertEqual('Error', data)
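These snippets are excerpted from wpull's test suite, so each omits its imports and TestCase scaffolding, and each test method is a coroutine driven by a trollius-based test runner. A minimal sketch of the common setup they assume (the module paths are assumptions from the wpull 1.x era and may differ between versions):

import io
import os
import shutil
import warnings

import trollius
from trollius import From

# Assumed wpull module paths -- verify against the wpull version in use.
from wpull.body import Body
from wpull.http.client import Client
from wpull.http.request import Request, Response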
Example #2
    def test_client_did_not_complete(self):
        client = Client()

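        # Record all warnings; leaving the session without reading the
        # content should warn that the HTTP session did not complete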
        with warnings.catch_warnings(record=True) as warn_list:
            warnings.simplefilter("always")

            with client.session() as session:
                request = Request(self.get_url('/'))
                yield From(session.fetch(request))
                self.assertFalse(session.done())

            for warn_obj in warn_list:
                print(warn_obj)

            # Unrelated warnings may occur in PyPy
            # https://travis-ci.org/chfoo/wpull/jobs/51420202
            self.assertGreaterEqual(len(warn_list), 1)

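            # for/else: fail only if no warning carried the expected message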
            for warn_obj in warn_list:
                if str(warn_obj.message) == 'HTTP session did not complete.':
                    break
            else:
                self.fail('Warning did not occur.')

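        # An exception raised inside the session context must propagate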
        client = Client()

        with self.assertRaises(MyException):
            with client.session() as session:
                request = Request(self.get_url('/'))
                yield From(session.fetch(request))
                raise MyException('Oops')
Example #3
    def test_xml_detect(self):
        self.assertTrue(
            XMLDetector.is_file(io.BytesIO(
                '<?xml version='.encode('utf-16le'))))
        self.assertFalse(
            XMLDetector.is_file(
                io.BytesIO('<!DOCTYPE html><html><body>'.encode('utf-16le'))))
        self.assertFalse(XMLDetector.is_file(io.BytesIO(b'<html><body>hello')))
        self.assertTrue(XMLDetector.is_file(io.BytesIO(b'<?xml version')))
        self.assertTrue(
            XMLDetector.is_url(URLInfo.parse('example.com/index.xml')))
        self.assertFalse(
            XMLDetector.is_url(URLInfo.parse('example.com/image.jpg')))
        self.assertTrue(
            XMLDetector.is_request(Request('example.com/index.xml')))
        self.assertFalse(
            XMLDetector.is_request(Request('example.com/image.jpg')))

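        # For responses, detection is based on the Content-Type header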
        response = Response(200, 'OK')
        response.fields['Content-Type'] = 'text/xml'
        self.assertTrue(XMLDetector.is_response(response))

        response = Response(200, 'OK')
        response.fields['Content-Type'] = 'application/xml'
        self.assertTrue(XMLDetector.is_response(response))

        response = Response(200, 'OK')
        response.fields['Content-Type'] = 'image/png'
        self.assertFalse(XMLDetector.is_response(response))
Example #4
    def test_to_dict_body(self):
        request = Request()
        request.body = Body()
        request_dict = request.to_dict()

        self.assertTrue(request_dict['body'])
        request.body.close()

        request = Request()
        request.body = NotImplemented
        request_dict = request.to_dict()

        self.assertFalse(request_dict['body'])

        response = Response()
        response.body = Body()
        response_dict = response.to_dict()

        self.assertTrue(response_dict['body'])
        response.body.close()

        response = Response()
        response.body = NotImplemented
        response_dict = response.to_dict()

        self.assertFalse(response_dict['body'])
Example #5
    def test_overrun(self):
        stream = self.new_stream()
        request = Request(self.get_url('/overrun'))

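        # Each fetch should return exactly the 100 declared bytes, and the
        # stream should still be usable for the plain '/' request below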
        for dummy in range(3):
            response, content = yield From(self.fetch(stream, request))

            self.assertEqual(b'a' * 100, content)

        request = Request(self.get_url('/'))
        yield From(self.fetch(stream, request))
Example #6
    def test_header_early_close(self):
        stream = self.new_stream()
        request = Request(self.get_url('/header_early_close'))
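        # The server presumably closes the connection mid-headers here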
        try:
            yield From(self.fetch(stream, request))
        except NetworkError:
            pass
        else:
            self.fail()  # pragma: no cover

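        # The stream should recover and serve a normal request afterwards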
        request = Request(self.get_url('/'))
        yield From(self.fetch(stream, request))
Example #7
    def test_html_krokozyabry(self):
        element_walker = ElementWalker(
            css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
        scraper = HTMLScraper(self.get_html_parser(), element_walker)
        request = Request('http://example.com/')
        response = Response(200, '')
        response.body = Body()
        response.fields['content-type'] = 'text/html; charset=KOI8-R'

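        # Copy the KOI8-R encoded sample page into the response body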
        with wpull.util.reset_file_offset(response.body):
            html_file_path = os.path.join(ROOT_PATH,
                                          'testing', 'samples',
                                          'krokozyabry.html')
            with open(html_file_path, 'rb') as in_file:
                shutil.copyfileobj(in_file, response.body)

        scrape_result = scraper.scrape(request, response)
        inline_urls = scrape_result.inline_links
        linked_urls = scrape_result.linked_links

        self.assertEqual('koi8-r', scrape_result.encoding)

        self.assertEqual(
            set(),
            inline_urls
        )
        self.assertEqual(
            {'http://example.com/Кракозябры'},
            linked_urls
        )
Example #8
    def test_html_soup(self):
        element_walker = ElementWalker(
            css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
        scraper = HTMLScraper(self.get_html_parser(), element_walker)
        request = Request('http://example.com/')
        response = Response(200, '')
        response.body = Body()
        response.fields['Refresh'] = 'yes'

        with wpull.util.reset_file_offset(response.body):
            html_file_path = os.path.join(ROOT_PATH,
                                          'testing', 'samples', 'soup.html')
            with open(html_file_path, 'rb') as in_file:
                shutil.copyfileobj(in_file, response.body)

        scrape_result = scraper.scrape(request, response)
        inline_urls = scrape_result.inline_links
        linked_urls = scrape_result.linked_links

        self.assertEqual(
            {'http://example.com/ABOUTM~1.JPG'},
            inline_urls
        )
        self.assertEqual(
            {
                'http://example.com/BLOG',
                'http://example.com/web ring/Join.htm',
            },
            linked_urls
        )
Example #9
    def test_client_duration_timeout(self):
        client = Client()

        with self.assertRaises(DurationTimeout), client.session() as session:
            request = Request(self.get_url('/sleep_long'))
            yield From(session.fetch(request))
            yield From(session.read_content(duration_timeout=0.1))
Example #10
    def test_xhtml_invalid(self):
        element_walker = ElementWalker(
            css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
        scraper = HTMLScraper(self.get_html_parser(), element_walker)
        request = Request('http://example.com/')
        response = Response(200, '')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            html_file_path = os.path.join(ROOT_PATH,
                                          'testing', 'samples',
                                          'xhtml_invalid.html')
            with open(html_file_path, 'rb') as in_file:
                shutil.copyfileobj(in_file, response.body)

        scrape_result = scraper.scrape(request, response)
        inline_urls = scrape_result.inline_links
        linked_urls = scrape_result.linked_links

        self.assertEqual(
            {
                'http://example.com/image.png',
                'http://example.com/script.js',
            },
            inline_urls
        )
        self.assertEqual(
            {
                'http://example.com/link'
            },
            linked_urls
        )
Example #11
    def test_http_request(self):
        request = Request('http://example.com')
        request.fields['hello'] = 'world'
        new_request = convert_http_request(request)

        self.assertEqual('example.com', new_request.host)
        self.assertEqual('world', new_request.get_header('Hello'))
Example #12
    def test_html_not_quite_charset(self):
        element_walker = ElementWalker(
            css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
        scraper = HTMLScraper(self.get_html_parser(), element_walker)
        request = Request('http://example.com/')
        response = Response(200, '')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            html_file_path = os.path.join(ROOT_PATH,
                                          'testing', 'samples',
                                          'videogame_top.htm')
            with open(html_file_path, 'rb') as in_file:
                shutil.copyfileobj(in_file, response.body)

        scrape_result = scraper.scrape(request, response)
        inline_urls = scrape_result.inline_links
        linked_urls = scrape_result.linked_links

        self.assertIn(
            'http://example.com/copyright_2001_2006_rtype.gif',
            inline_urls
        )
        self.assertIn(
            'http://www.geocities.jp/gamehouse_grindcrusher/',
            linked_urls
        )
Example #13
    def test_sitemap_scraper_xml(self):
        scraper = SitemapScraper(self.get_html_parser())
        request = Request('http://example.com/sitemap.xml')
        response = Response(200, 'OK')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            response.body.write(b'''<?xml version="1.0" encoding="UTF-8"?>
                <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
                   <url>
                      <loc>http://www.example.com/</loc>
                      <lastmod>2005-01-01</lastmod>
                      <changefreq>monthly</changefreq>
                      <priority>0.8</priority>
                   </url>
                </urlset>
            ''')

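        # The <loc> entry should be reported as a linked URL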
        scrape_result = scraper.scrape(request, response)
        inline_urls = scrape_result.inline_links
        linked_urls = scrape_result.linked_links

        self.assertEqual({
            'http://www.example.com/',
        }, linked_urls)
        self.assertFalse(inline_urls)
Example #14
    def test_content_length_and_chunked(self):
        stream = self.new_stream()
        request = Request(self.get_url('/content_length_and_chunked'))
        response, content = yield From(self.fetch(stream, request))
        self.assertEqual(200, response.status_code)
        self.assertEqual('chunked', response.fields['Transfer-Encoding'])
        self.assertEqual(b'hello world!', content)
Example #15
    def test_false_gzip(self):
        stream = self.new_stream('127.0.0.1', self._port)
        request = Request(self.get_url('/false_gzip'))
        response, content = yield From(self.fetch(stream, request))

        self.assertEqual('gzip', response.fields['Content-Encoding'])
        self.assertEqual(b'a' * 100, content)
Example #16
    def test_utf8_header(self):
        stream = self.new_stream()
        request = Request(self.get_url('/utf8_header'))
        response, dummy = yield From(self.fetch(stream, request))
        self.assertEqual(200, response.status_code)
        self.assertEqual('🐱'.encode('utf-8').decode('latin-1'),
                         response.fields['whoa'])
Example #17
    def test_javascript_heavy_inline_monstrosity(self):
        scraper = JavaScriptScraper()
        request = Request('http://example.com/test.js')
        response = Response(200, 'OK')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            html_file_path = os.path.join(ROOT_PATH, 'testing', 'samples',
                                          'twitchplayspokemonfirered.html')
            with open(html_file_path, 'rb') as in_file:
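                # Offset 0x147 presumably skips the sample's HTML preamble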
                in_file.seek(0x147)
                shutil.copyfileobj(in_file, response.body)

        scrape_result = scraper.scrape(request, response)
        inline_urls = scrape_result.inline_links
        linked_urls = scrape_result.linked_links

        self.assertIn(
            'http://cdn.bulbagarden.net/upload/archive/a/a4/'
            '20090718115357%21195Quagsire.png', inline_urls)
        self.assertIn(
            'http://www.google.com/url?q=http%3A%2F%2Fwww.reddit.com%2F'
            'user%2FGoldenSandslash15&sa=D&sntz=1&'
            'usg=AFQjCNElFBxZYdNm5mWoRSncf5tbdIJQ-A', linked_urls)

        print('\n'.join(inline_urls))
        print('\n'.join(linked_urls))
Example #18
    def test_status_line_only(self):
        stream = self.new_stream('127.0.0.1', self._port)
        request = Request(self.get_url('/status_line_only'))
        response, content = yield From(self.fetch(stream, request))

        self.assertEqual(200, response.status_code)
        self.assertEqual(b'Hey', content)
Example #19
    def test_sitemap_scraper_xml_index(self):
        scraper = SitemapScraper(self.get_html_parser())
        request = Request('http://example.com/sitemap.xml')
        response = Response(200, 'OK')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            response.body.write(b'''<?xml version="1.0" encoding="UTF-8"?>
                <sitemapindex
                xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
                   <sitemap>
                      <loc>http://www.example.com/sitemap1.xml.gz</loc>
                      <lastmod>2004-10-01T18:23:17+00:00</lastmod>
                   </sitemap>
                </sitemapindex>
            ''')

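        # Sitemap index entries are likewise reported as linked URLs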
        scrape_result = scraper.scrape(request, response)
        inline_urls = scrape_result.inline_links
        linked_urls = scrape_result.linked_links

        self.assertEqual({
            'http://www.example.com/sitemap1.xml.gz',
        }, linked_urls)
        self.assertFalse(inline_urls)
Example #20
    def test_rss_as_html(self):
        element_walker = ElementWalker(
            css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
        scraper = HTMLScraper(self.get_html_parser(), element_walker)
        request = Request('http://example.com/')
        response = Response(200, '')
        response.body = Body()
        response.fields['content-type'] = 'application/rss+xml'

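        # The scraper should still produce a result for the RSS document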
        with wpull.util.reset_file_offset(response.body):
            html_file_path = os.path.join(ROOT_PATH,
                                          'testing', 'samples', 'rss.xml')
            with open(html_file_path, 'rb') as in_file:
                shutil.copyfileobj(in_file, response.body)

        scrape_result = scraper.scrape(request, response)

        self.assertTrue(scrape_result)
        inline_urls = scrape_result.inline_links
        linked_urls = scrape_result.linked_links
        self.assertFalse(
            inline_urls
        )
        self.assertEqual(
            {
                'http://www.someexamplerssdomain.com/main.html',
                'http://www.wikipedia.org/'
            },
            linked_urls
        )
Example #21
    def test_client_exception_throw(self):
        client = Client()

        with client.session() as session:
            request = Request('http://wpull-no-exist.invalid')

            with self.assertRaises(NetworkError):
                yield From(session.fetch(request))
Example #22
    def test_basic_chunked_trailer(self):
        stream = self.new_stream()
        request = Request(self.get_url('/chunked_trailer'))
        response, content = yield From(self.fetch(stream, request))
        self.assertEqual(200, response.status_code)
        self.assertEqual('chunked', response.fields['Transfer-Encoding'])
        self.assertEqual('dolphin', response.fields['Animal'])
        self.assertEqual(b'hello world!', content)
Example #23
    def test_basic_content_length(self):
        stream = self.new_stream()
        request = Request(self.get_url('/content_length'))
        response, content = yield From(self.fetch(stream, request))
        self.assertEqual(200, response.status_code)
        self.assertEqual('100', response.fields['Content-Length'])
        self.assertEqual(b'a' * 100, content)
        self.assertEqual(100, len(content))
Example #24
    def test_ignore_length(self):
        stream = self.new_stream('127.0.0.1', self._port,
                                 keep_alive=False, ignore_length=True)
        request = Request(self.get_url('/underrun'))

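        # With ignore_length, the body is read until the connection closes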
        response, content = yield From(self.fetch(stream, request))

        self.assertEqual(50, len(content))
Example #25
    def test_connection_reuse(self):
        stream = self.new_stream()
        request = Request(self.get_url('/'))
        # Force HTTP/1.0 and verify the stream still serves two requests
        request.version = 'HTTP/1.0'
        response, dummy = yield From(self.fetch(stream, request))
        self.assertEqual(200, response.status_code)
        response, dummy = yield From(self.fetch(stream, request))
        self.assertEqual(200, response.status_code)
Example #26
    def test_connection_refused(self):
        stream = self.new_stream('127.0.0.1', 1)
        try:
            yield From(self.fetch(stream, Request('http://localhost:1/')))
        except ConnectionRefused:
            pass
        else:
            self.fail()  # pragma: no cover
Example #27
    def test_basic(self):
        client = WebClient()
        session = client.session(Request(self.get_url('/')))

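        # The session is not done until the fetch has completed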
        self.assertFalse(session.done())
        response = yield From(session.fetch())

        self.assertEqual(200, response.status_code)
        self.assertTrue(session.done())
Example #28
    def test_bad_chunk_size(self):
        stream = self.new_stream()
        request = Request(self.get_url('/bad_chunk_size'))
        try:
            yield From(self.fetch(stream, request))
        except ProtocolError:
            pass
        else:
            self.fail()  # pragma: no cover
Example #29
    def test_no_such_host(self):
        stream = self.new_stream('wpull-no-exist.invalid', 80)
        try:
            yield From(
                self.fetch(stream, Request('http://wpull-no-exist.invalid')))
        except NetworkError:
            pass
        else:
            self.fail()  # pragma: no cover
Example #30
    def test_gzip_corrupt_footer(self):
        stream = self.new_stream()
        request = Request(self.get_url('/gzip_corrupt_footer'))
        try:
            yield From(self.fetch(stream, request))
        except ProtocolError:
            pass
        else:
            self.fail()  # pragma: no cover