Example #1
    def test_header_early_close(self):
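        # The server closes the connection while the header is still being
        # sent; the fetch should fail cleanly with NetworkError.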
        stream = self.new_stream()
        request = Request(self.get_url('/header_early_close'))
        try:
            yield from self.fetch(stream, request)
        except NetworkError:
            pass
        else:
            self.fail()  # pragma: no cover

        # The stream must remain usable for a normal fetch afterwards.
        request = Request(self.get_url('/'))
        yield from self.fetch(stream, request)
Example #2
    def test_html_detect(self):
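        # Byte sniffing: is_file() should detect HTML markup even in
        # UTF-16 input, and reject plain text.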
        self.assertTrue(HTMLReader.is_file(
            io.BytesIO('<html><body>hi</body></html>'.encode('utf-16le'))
        ))
        self.assertFalse(HTMLReader.is_file(
            io.BytesIO('hello world!'.encode('utf-16le'))
        ))
        self.assertTrue(HTMLReader.is_file(
            io.BytesIO(b'<title>hello</title>hi')
        ))
        self.assertTrue(HTMLReader.is_file(
            io.BytesIO(b'<html><body>hello')
        ))
        self.assertTrue(HTMLReader.is_file(
            io.BytesIO(
                b'The document has moved <a href="somewhere.html">here</a>'
            )
        ))
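        # URL sniffing: .htm, .html, .dhtm, .xhtml and .xht extensions
        # count as HTML; .jpg does not.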
        self.assertTrue(
            HTMLReader.is_url(URLInfo.parse('example.com/index.htm'))
        )
        self.assertTrue(
            HTMLReader.is_url(URLInfo.parse('example.com/index.html'))
        )
        self.assertTrue(
            HTMLReader.is_url(URLInfo.parse('example.com/index.dhtm'))
        )
        self.assertTrue(
            HTMLReader.is_url(URLInfo.parse('example.com/index.xhtml'))
        )
        self.assertTrue(
            HTMLReader.is_url(URLInfo.parse('example.com/index.xht'))
        )
        self.assertFalse(
            HTMLReader.is_url(URLInfo.parse('example.com/image.jpg'))
        )
        self.assertTrue(
            HTMLReader.is_request(Request('example.com/index.html'))
        )
        self.assertFalse(
            HTMLReader.is_request(Request('example.com/image.jpg'))
        )

        # Finally, detection by response Content-Type.
        response = Response(200, 'OK')
        response.fields['Content-Type'] = 'text/html'
        self.assertTrue(HTMLReader.is_response(response))

        response = Response(200, 'OK')
        response.fields['Content-Type'] = 'image/png'
        self.assertFalse(HTMLReader.is_response(response))
Example #3
    def test_javascript_heavy_inline_monstrosity(self):
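        # Scrape a sample page dominated by inline JavaScript and check
        # that known inline and linked URLs are both recovered.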
        scraper = JavaScriptScraper()
        request = Request('http://example.com/test.js')
        response = Response(200, 'OK')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            html_file_path = os.path.join(ROOT_PATH, 'testing', 'samples',
                                          'twitchplayspokemonfirered.html')
            with open(html_file_path, 'rb') as in_file:
                in_file.seek(0x147)
                shutil.copyfileobj(in_file, response.body)

        scrape_result = scraper.scrape(request, response)
        inline_urls = scrape_result.inline_links
        linked_urls = scrape_result.linked_links

        self.assertIn(
            'http://cdn.bulbagarden.net/upload/archive/a/a4/'
            '20090718115357%21195Quagsire.png', inline_urls)
        self.assertIn(
            'http://www.google.com/url?q=http%3A%2F%2Fwww.reddit.com%2F'
            'user%2FGoldenSandslash15&sa=D&sntz=1&'
            'usg=AFQjCNElFBxZYdNm5mWoRSncf5tbdIJQ-A', linked_urls)

        print('\n'.join(inline_urls))
        print('\n'.join(linked_urls))
Example #4
    def test_http_request(self):
        # Conversion should keep the host and carry the custom header
        # across; field names are case-insensitive ('hello' is readable
        # as 'Hello').
        request = Request('http://example.com')
        request.fields['hello'] = 'world'
        new_request = convert_http_request(request)

        self.assertEqual('example.com', new_request.host)
        self.assertEqual('world', new_request.get_header('Hello'))
Example #5
    def test_duration_timeout(self):
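        # download() must abort with DurationTimeout once the transfer
        # exceeds its 0.1-second budget.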
        client = WebClient()
        session = client.session(Request(self.get_url('/sleep_long')))

        with self.assertRaises(DurationTimeout):
            yield from session.start()
            yield from session.download(duration_timeout=0.1)
Example #6
    def test_sitemap_scraper_xml(self):
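        # <loc> entries in the sitemap are reported as linked URLs.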
        scraper = SitemapScraper(self.get_html_parser())
        request = Request('http://example.com/sitemap.xml')
        response = Response(200, 'OK')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            response.body.write(b'''<?xml version="1.0" encoding="UTF-8"?>
                <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
                   <url>
                      <loc>http://www.example.com/</loc>
                      <lastmod>2005-01-01</lastmod>
                      <changefreq>monthly</changefreq>
                      <priority>0.8</priority>
                   </url>
                </urlset>
            ''')

        scrape_result = scraper.scrape(request, response)
        inline_urls = scrape_result.inline_links
        linked_urls = scrape_result.linked_links

        self.assertEqual({
            'http://www.example.com/',
        }, linked_urls)
        self.assertFalse(inline_urls)
Example #7
    def test_status_line_only(self):
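        # A response with no header fields after the status line should
        # still parse, with the remaining bytes as the body.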
        stream = self.new_stream('127.0.0.1', self._port)
        request = Request(self.get_url('/status_line_only'))
        response, content = yield from self.fetch(stream, request)

        self.assertEqual(200, response.status_code)
        self.assertEqual(b'Hey', content)
Example #8
    def test_utf8_header(self):
        # Header values are decoded as Latin-1, so a raw UTF-8 value
        # arrives as mojibake instead of raising an error.
        stream = self.new_stream()
        request = Request(self.get_url('/utf8_header'))
        response, dummy = yield from self.fetch(stream, request)
        self.assertEqual(200, response.status_code)
        self.assertEqual('🐱'.encode('utf-8').decode('latin-1'),
                         response.fields['whoa'])
Example #9
    def test_false_gzip(self):
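        # The server claims gzip but sends plain bytes; the body should
        # pass through undecoded instead of raising.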
        stream = self.new_stream('127.0.0.1', self._port)
        request = Request(self.get_url('/false_gzip'))
        response, content = yield from self.fetch(stream, request)

        self.assertEqual('gzip', response.fields['Content-Encoding'])
        self.assertEqual(b'a' * 100, content)
Example #10
    def test_redirect_loop(self):
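        # If fetching robots.txt dies in a redirect loop, the checker
        # should fail open and permit the fetch.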
        checker = RobotsTxtChecker(web_client=MockWebClient())
        request = Request('http://example.com')
        request.prepare_for_send()

        nonlocal_dict = {'counter': 0}

        def response_callback(request):
            request.prepare_for_send()
            self.assertTrue(request.url_info.url.endswith('robots.txt'))
            response = Response(302, 'See else')
            response.request = request
            response.fields['Location'] = '/robots.txt'

            nonlocal_dict['counter'] += 1

            if nonlocal_dict['counter'] > 20:
                raise ProtocolError('Mock redirect loop error.')

            return response

        checker.web_client.mock_response_callback = response_callback

        self.assertTrue((yield from checker.can_fetch(request)))
        self.assertTrue(checker.can_fetch_pool(request))
Example #11
    def test_rss_as_html(self):
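        # An RSS feed run through the HTML scraper should still be
        # handled and give up its links.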
        element_walker = ElementWalker(css_scraper=CSSScraper(),
                                       javascript_scraper=JavaScriptScraper())
        scraper = HTMLScraper(HTMLParser(), element_walker)
        request = Request('http://example.com/')
        response = Response(200, '')
        response.body = Body()
        response.fields['content-type'] = 'application/rss+xml'

        with wpull.util.reset_file_offset(response.body):
            html_file_path = os.path.join(ROOT_PATH, 'testing', 'samples',
                                          'rss.xml')
            with open(html_file_path, 'rb') as in_file:
                shutil.copyfileobj(in_file, response.body)

        scrape_result = scraper.scrape(request, response)

        self.assertTrue(scrape_result)
        inline_urls = scrape_result.inline_links
        linked_urls = scrape_result.linked_links
        self.assertFalse(inline_urls)
        self.assertEqual(
            {
                'http://www.someexamplerssdomain.com/main.html',
                'http://www.wikipedia.org/'
            }, linked_urls)
Example #12
    def test_sitemap_scraper_xml_index(self):
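        # A sitemap index: the <loc> of each sub-sitemap is a linked URL.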
        scraper = SitemapScraper(self.get_html_parser())
        request = Request('http://example.com/sitemap.xml')
        response = Response(200, 'OK')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            response.body.write(b'''<?xml version="1.0" encoding="UTF-8"?>
                <sitemapindex
                xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
                   <sitemap>
                      <loc>http://www.example.com/sitemap1.xml.gz</loc>
                      <lastmod>2004-10-01T18:23:17+00:00</lastmod>
                   </sitemap>
                </sitemapindex>
            ''')

        scrape_result = scraper.scrape(request, response)
        inline_urls = scrape_result.inline_links
        linked_urls = scrape_result.linked_links

        self.assertEqual({
            'http://www.example.com/sitemap1.xml.gz',
        }, linked_urls)
        self.assertFalse(inline_urls)
Example #13
    def test_html_soup(self):
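        # 'Tag soup' HTML should still scrape: the image counts as
        # inline, the anchors as linked.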
        element_walker = ElementWalker(css_scraper=CSSScraper(),
                                       javascript_scraper=JavaScriptScraper())
        scraper = HTMLScraper(HTMLParser(), element_walker)
        request = Request('http://example.com/')
        response = Response(200, '')
        response.body = Body()
        response.fields['Refresh'] = 'yes'

        with wpull.util.reset_file_offset(response.body):
            html_file_path = os.path.join(ROOT_PATH, 'testing', 'samples',
                                          'soup.html')
            with open(html_file_path, 'rb') as in_file:
                shutil.copyfileobj(in_file, response.body)

        scrape_result = scraper.scrape(request, response)
        inline_urls = scrape_result.inline_links
        linked_urls = scrape_result.linked_links

        self.assertEqual({'http://example.com/ABOUTM~1.JPG'}, inline_urls)
        self.assertEqual(
            {
                'http://example.com/BLOG',
                'http://example.com/web ring/Join.htm',
            }, linked_urls)
Example #14
    def test_html_scraper_links_base_href(self):
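        # Relative links must be resolved against the <base href>
        # declared in the document.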
        element_walker = ElementWalker(css_scraper=CSSScraper(),
                                       javascript_scraper=JavaScriptScraper())
        scraper = HTMLScraper(HTMLParser(), element_walker)
        request = Request('http://example.com/')
        response = Response(200, 'OK')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            html_file_path = os.path.join(ROOT_PATH, 'testing', 'samples',
                                          'basehref.html')
            with open(html_file_path, 'rb') as in_file:
                shutil.copyfileobj(in_file, response.body)

        scrape_result = scraper.scrape(request, response)
        inline_urls = scrape_result.inline_links
        linked_urls = scrape_result.linked_links

        self.assertEqual('utf-8', scrape_result.encoding)

        self.assertEqual(
            {
                'http://cdn.example.com/stylesheet1.css',
                'http://www.example.com/stylesheet2.css',
                'http://example.com/a/stylesheet3.css',
                'http://example.com/a/dir/image1.png',
                'http://example.com/dir/image2.png',
                'http://example.net/image3.png',
                'http://example.com/dir/image4.png',
            }, inline_urls)
        self.assertEqual({'http://example.com/a/'}, linked_urls)
Example #15
    def test_client_duration_timeout(self):
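        # Same timeout behaviour as the WebClient test above, exercised
        # through the low-level Client.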
        client = Client()

        with self.assertRaises(DurationTimeout), client.session() as session:
            request = Request(self.get_url('/sleep_long'))
            yield from session.start(request)
            yield from session.download(duration_timeout=0.1)
Example #16
    def test_content_length_and_chunked(self):
        # When both Content-Length and Transfer-Encoding: chunked are
        # present, the chunked framing is used to read the body.
        stream = self.new_stream()
        request = Request(self.get_url('/content_length_and_chunked'))
        response, content = yield from self.fetch(stream, request)
        self.assertEqual(200, response.status_code)
        self.assertEqual('chunked', response.fields['Transfer-Encoding'])
        self.assertEqual(b'hello world!', content)
Example #17
    def test_basic_chunked_trailer(self):
        # Trailer fields sent after the last chunk ('Animal') must be
        # merged into the response header fields.
        stream = self.new_stream()
        request = Request(self.get_url('/chunked_trailer'))
        response, content = yield from self.fetch(stream, request)
        self.assertEqual(200, response.status_code)
        self.assertEqual('chunked', response.fields['Transfer-Encoding'])
        self.assertEqual('dolphin', response.fields['Animal'])
        self.assertEqual(b'hello world!', content)
Example #18
    def test_connection_reuse(self):
        # Two sequential fetches on the same stream, forced down to
        # HTTP/1.0, should both succeed.
        stream = self.new_stream()
        request = Request(self.get_url('/'))
        request.version = 'HTTP/1.0'
        response, dummy = yield from self.fetch(stream, request)
        self.assertEqual(200, response.status_code)
        response, dummy = yield from self.fetch(stream, request)
        self.assertEqual(200, response.status_code)
Example #19
    def test_client_exception_recovery(self):
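        # Repeated network failures must not poison the connection pool;
        # fresh sessions afterwards should complete normally.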
        connection_factory = functools.partial(Connection, timeout=2.0)
        connection_pool = ConnectionPool(connection_factory=connection_factory)
        client = Client(connection_pool=connection_pool)

        for dummy in range(7):
            with self.assertRaises(NetworkError), client.session() as session:
                request = Request(self.get_url('/header_early_close'))
                yield from session.start(request)

        for dummy in range(7):
            with client.session() as session:
                request = Request(self.get_url('/'))
                response = yield from session.start(request)
                self.assertEqual(200, response.status_code)
                yield from session.download()
                self.assertTrue(session.done())
Example #20
    def test_client_exception_throw(self):
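        # Starting a request to an unresolvable host must raise
        # NetworkError.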
        client = Client()

        with client.session() as session:
            request = Request('http://wpull-no-exist.invalid')

        with self.assertRaises(NetworkError):
            yield from session.start(request)
Example #21
    def test_basic_content_length(self):
        # A plain Content-Length response: the declared length and the
        # delivered body must agree.
        stream = self.new_stream()
        request = Request(self.get_url('/content_length'))
        response, content = yield from self.fetch(stream, request)
        self.assertEqual(200, response.status_code)
        self.assertEqual('100', response.fields['Content-Length'])
        self.assertEqual(b'a' * 100, content)
        self.assertEqual(100, len(content))
Example #22
    def test_connection_refused(self):
        # Port 1 is expected to be closed; the fetch must raise
        # ConnectionRefused.
        stream = self.new_stream('127.0.0.1', 1)
        try:
            yield from self.fetch(stream, Request('http://localhost:1/'))
        except ConnectionRefused:
            pass
        else:
            self.fail()  # pragma: no cover
Example #23
    def test_bad_redirect_ipv6(self):
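        # A redirect pointing at a malformed IPv6 URL should surface as
        # a ProtocolError.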
        client = WebClient()
        session = client.session(Request(self.get_url('/bad_redirect_ipv6')))

        with self.assertRaises(ProtocolError):
            while not session.done():
                yield from session.start()
                yield from session.download()
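
For reference, the WebClient examples above all drive a session the same way: call start() for each exchange, download() to pull the body, and loop until done() reports that the redirect chain is exhausted. Below is a minimal sketch of that loop; the import paths and the event-loop driver are assumptions for illustration, not taken from these tests:

    import asyncio

    from wpull.http.request import Request
    from wpull.http.web import WebClient

    @asyncio.coroutine
    def fetch(url):
        # A single web session may span several HTTP exchanges
        # (redirects); done() reports when the chain is finished.
        client = WebClient()
        session = client.session(Request(url))
        response = None
        while not session.done():
            response = yield from session.start()
            yield from session.download()
        return response

    # Example driver:
    # asyncio.get_event_loop().run_until_complete(
    #     fetch('http://example.com/'))
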
Example #24
    def test_add_referer_https_to_http(self):
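        # Privacy rule: no Referer is added when stepping down from
        # HTTPS to HTTP.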
        request = Request()
        url_record = URLRecord()
        url_record.parent_url = 'https://example.com/'
        url_record.url = 'http://example.com/image.png'

        WebProcessorSession._add_referrer(request, url_record)

        self.assertNotIn('referer', request.fields)
Example #25
    def test_no_such_host(self):
        # A DNS lookup failure surfaces as NetworkError.
        stream = self.new_stream('wpull-no-exist.invalid', 80)
        try:
            yield from \
                self.fetch(stream, Request('http://wpull-no-exist.invalid'))
        except NetworkError:
            pass
        else:
            self.fail()  # pragma: no cover
Example #26
    def test_read_timeout(self):
        # With a 0.1-second connection timeout, the slow endpoint must
        # trip a NetworkError.
        stream = self.new_stream(connection_kwargs=dict(timeout=0.1))
        request = Request(self.get_url('/sleep_long'))
        try:
            yield from self.fetch(stream, request)
        except NetworkError:
            pass
        else:
            self.fail()  # pragma: no cover
Example #27
    def test_add_referer(self):
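        # The parent page URL becomes the Referer for a plain HTTP child.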
        request = Request()
        url_record = URLRecord()
        url_record.parent_url = 'http://example.com/'
        url_record.url = 'http://example.com/image.png'

        WebProcessorSession._add_referrer(request, url_record)

        self.assertEqual('http://example.com/', request.fields['Referer'])
Example #28
    def test_gzip_corrupt_footer(self):
        # A gzip stream with a corrupt footer must fail the fetch with a
        # ProtocolError.
        stream = self.new_stream()
        request = Request(self.get_url('/gzip_corrupt_footer'))
        try:
            yield from self.fetch(stream, request)
        except ProtocolError:
            pass
        else:
            self.fail()  # pragma: no cover
Example #29
    def test_buffer_overflow_header(self):
        # An oversized header must be rejected with a ProtocolError
        # instead of being buffered without bound.
        stream = self.new_stream()
        request = Request(self.get_url('/buffer_overflow_header'))
        try:
            yield from self.fetch(stream, request)
        except ProtocolError:
            pass
        else:
            self.fail()  # pragma: no cover
Example #30
    def test_bad_chunk_size(self):
        # A malformed chunk-size line must raise ProtocolError.
        stream = self.new_stream()
        request = Request(self.get_url('/bad_chunk_size'))
        try:
            yield from self.fetch(stream, request)
        except ProtocolError:
            pass
        else:
            self.fail()  # pragma: no cover