Ejemplo n.º 1
0
    def test_http_request(self):
        """Convert a wpull request to the other request type and check
        that the host and the header field survive the conversion."""
        original = Request('http://example.com')
        original.fields['hello'] = 'world'

        converted = convert_http_request(original)

        self.assertEqual('example.com', converted.host)
        self.assertEqual('world', converted.get_header('Hello'))
Ejemplo n.º 2
0
    def test_xml_detect(self):
        """Exercise XMLDetector heuristics: file content sniffing
        (UTF-16 and ASCII), URL extension, request URL, and response
        Content-Type."""
        # Content sniffing: an XML declaration marks the stream as XML,
        # even when the document is UTF-16 encoded.
        self.assertTrue(
            XMLDetector.is_file(io.BytesIO(
                '<?xml version='.encode('utf-16le'))))
        self.assertFalse(
            XMLDetector.is_file(
                io.BytesIO('<!DOCTYPE html><html><body>'.encode('utf-16le'))))
        self.assertFalse(XMLDetector.is_file(io.BytesIO(b'<html><body>hello')))
        self.assertTrue(XMLDetector.is_file(io.BytesIO(b'<?xml version')))

        # URL and request detection go by the file extension.
        self.assertTrue(
            XMLDetector.is_url(URLInfo.parse('example.com/index.xml')))
        self.assertFalse(
            XMLDetector.is_url(URLInfo.parse('example.com/image.jpg')))
        self.assertTrue(
            XMLDetector.is_request(Request('example.com/index.xml')))
        self.assertFalse(
            XMLDetector.is_request(Request('example.com/image.jpg')))

        # Response detection goes by the Content-Type header.
        for content_type, expected_xml in (
                ('text/xml', True),
                ('application/xml', True),
                ('image/png', False)):
            response = Response(200, 'OK')
            response.fields['Content-Type'] = content_type
            if expected_xml:
                self.assertTrue(XMLDetector.is_response(response))
            else:
                self.assertFalse(XMLDetector.is_response(response))
Ejemplo n.º 3
0
    def test_basic_requests(self):
        """End-to-end proxy check: start an HTTPProxyServer on an unused
        port, route a client through it, and fetch both a 200 page and a
        500 error page.

        The loop repeats three times to confirm the proxy keeps working
        across multiple client sessions.
        """
        proxy_http_client = Client(recorder=DebugPrintRecorder())
        proxy_server = HTTPProxyServer(proxy_http_client)
        proxy_socket, proxy_port = tornado.testing.bind_unused_port()

        # Serve the proxy on the pre-bound socket.
        yield From(trollius.start_server(proxy_server, sock=proxy_socket))

        # All traffic of this client goes through the proxy just started.
        connection_pool = HTTPProxyConnectionPool(('127.0.0.1', proxy_port))
        http_client = Client(connection_pool=connection_pool,
                             recorder=DebugPrintRecorder())

        for dummy in range(3):
            # Successful fetch: body should be a complete HTML document.
            with http_client.session() as session:
                response = yield From(session.fetch(Request(self.get_url('/'))))
                self.assertEqual(200, response.status_code)

                file = io.BytesIO()
                yield From(session.read_content(file=file))
                data = file.getvalue().decode('ascii', 'replace')
                self.assertTrue(data.endswith('</html>'))

            # Error fetch: status, reason and body must pass through the
            # proxy unchanged.
            with http_client.session() as session:
                response = yield From(session.fetch(Request(
                    self.get_url('/always_error'))))
                self.assertEqual(500, response.status_code)
                self.assertEqual('Dragon In Data Center', response.reason)

                file = io.BytesIO()
                yield From(session.read_content(file=file))
                data = file.getvalue().decode('ascii', 'replace')
                self.assertEqual('Error', data)
Ejemplo n.º 4
0
    def test_client_did_not_complete(self):
        """Leaving a session without reading the body must warn; raising
        inside the session context must propagate the exception."""
        client = Client()

        with warnings.catch_warnings(record=True) as warn_list:
            warnings.simplefilter("always")

            # Fetch but deliberately skip read_content() so the session
            # is abandoned half-done.
            with client.session() as session:
                request = Request(self.get_url('/'))
                yield From(session.fetch(request))
                self.assertFalse(session.done())

            for warn_obj in warn_list:
                print(warn_obj)

            # Unrelated warnings may occur in PyPy
            # https://travis-ci.org/chfoo/wpull/jobs/51420202
            self.assertGreaterEqual(len(warn_list), 1)

            # Scan for the specific warning instead of assuming it is
            # the only one captured.
            for warn_obj in warn_list:
                if str(warn_obj.message) == 'HTTP session did not complete.':
                    break
            else:
                self.fail('Warning did not occur.')

        client = Client()

        # An exception raised inside the session context must not be
        # swallowed by the session cleanup.
        with self.assertRaises(MyException):
            with client.session() as session:
                request = Request(self.get_url('/'))
                yield From(session.fetch(request))
                raise MyException('Oops')
Ejemplo n.º 5
0
 def test_connection_reuse(self):
     """An HTTP/1.0 request must not prevent the stream from being
     reused for a second fetch."""
     stream = self.new_stream()
     request = Request(self.get_url('/'))
     request.version = 'HTTP/1.0'
     for dummy in range(2):
         response, dummy_content = yield From(self.fetch(stream, request))
         self.assertEqual(200, response.status_code)
Ejemplo n.º 6
0
 def test_request(self):
     """A prepared GET request serializes to the expected raw bytes."""
     request = Request('http://example.com/robots.txt')
     request.prepare_for_send()
     expected = (b'GET /robots.txt HTTP/1.1\r\n'
                 b'Host: example.com\r\n'
                 b'\r\n')
     self.assertEqual(expected, request.to_bytes())
Ejemplo n.º 7
0
    def test_overrun(self):
        """A server sending more data than declared: the declared bytes
        are returned and the stream stays usable afterwards."""
        stream = self.new_stream()
        request = Request(self.get_url('/overrun'))

        for dummy in range(3):
            dummy_response, content = yield From(self.fetch(stream, request))
            self.assertEqual(b'a' * 100, content)

        # The stream must still serve a normal request afterwards.
        request = Request(self.get_url('/'))
        yield From(self.fetch(stream, request))
Ejemplo n.º 8
0
    def test_header_early_close(self):
        """A connection closed mid-header surfaces as NetworkError and
        leaves the stream reusable for the next request."""
        stream = self.new_stream()
        request = Request(self.get_url('/header_early_close'))

        with self.assertRaises(NetworkError):
            yield From(self.fetch(stream, request))

        # A follow-up request on the same stream must still succeed.
        request = Request(self.get_url('/'))
        yield From(self.fetch(stream, request))
Ejemplo n.º 9
0
    def test_to_dict_body(self):
        """to_dict() reports a truthy 'body' entry only when a real Body
        is attached — for both requests and responses."""
        for message_class in (Request, Response):
            message = message_class()
            message.body = Body()
            self.assertTrue(message.to_dict()['body'])
            message.body.close()

            message = message_class()
            message.body = NotImplemented
            self.assertFalse(message.to_dict()['body'])
Ejemplo n.º 10
0
    def test_html_detect(self):
        """Exercise HTMLReader detection heuristics: file content
        sniffing, URL file extension, request URL, and response
        Content-Type."""
        # Content sniffing must cope with UTF-16-encoded markup.
        self.assertTrue(HTMLReader.is_file(
            io.BytesIO('<html><body>hi</body></html>'.encode('utf-16le'))
        ))
        self.assertFalse(HTMLReader.is_file(
            io.BytesIO('hello world!'.encode('utf-16le'))
        ))
        self.assertTrue(HTMLReader.is_file(
            io.BytesIO(b'<title>hello</title>hi')
        ))
        self.assertTrue(HTMLReader.is_file(
            io.BytesIO(b'<html><body>hello')
        ))
        # A bare redirect stub with an anchor still counts as HTML.
        self.assertTrue(HTMLReader.is_file(
            io.BytesIO(
                b'The document has moved <a href="somewhere.html">here</a>'
            )
        ))
        # URL detection accepts the common HTML/XHTML extensions.
        self.assertTrue(
            HTMLReader.is_url(URLInfo.parse('example.com/index.htm'))
        )
        self.assertTrue(
            HTMLReader.is_url(URLInfo.parse('example.com/index.html'))
        )
        self.assertTrue(
            HTMLReader.is_url(URLInfo.parse('example.com/index.dhtm'))
        )
        self.assertTrue(
            HTMLReader.is_url(URLInfo.parse('example.com/index.xhtml'))
        )
        self.assertTrue(
            HTMLReader.is_url(URLInfo.parse('example.com/index.xht'))
        )
        self.assertFalse(
            HTMLReader.is_url(URLInfo.parse('example.com/image.jpg'))
        )
        # Request detection mirrors the URL rules.
        self.assertTrue(
            HTMLReader.is_request(Request.new('example.com/index.html'))
        )
        self.assertFalse(
            HTMLReader.is_request(Request.new('example.com/image.jpg'))
        )

        # Response detection goes by the Content-Type header.
        response = Response('HTTP/1.0', '200', 'OK')
        response.fields['Content-Type'] = 'text/html'
        self.assertTrue(HTMLReader.is_response(response))

        response = Response('HTTP/1.0', '200', 'OK')
        response.fields['Content-Type'] = 'image/png'
        self.assertFalse(HTMLReader.is_response(response))
Ejemplo n.º 11
0
    def test_html_detect(self):
        """HTMLReader detection heuristics: file sniffing, URL
        extension, request URL, and response Content-Type."""
        # Content sniffing, including UTF-16-encoded markup.
        self.assertTrue(HTMLReader.is_file(
            io.BytesIO('<html><body>hi</body></html>'.encode('utf-16le'))
        ))
        self.assertFalse(HTMLReader.is_file(
            io.BytesIO('hello world!'.encode('utf-16le'))
        ))
        for snippet in (
                b'<title>hello</title>hi',
                b'<html><body>hello',
                b'The document has moved <a href="somewhere.html">here</a>'):
            self.assertTrue(HTMLReader.is_file(io.BytesIO(snippet)))

        # URL and request detection go by the file extension.
        for extension in ('htm', 'html', 'dhtm', 'xhtml', 'xht'):
            self.assertTrue(HTMLReader.is_url(
                URLInfo.parse('example.com/index.' + extension)))
        self.assertFalse(HTMLReader.is_url(
            URLInfo.parse('example.com/image.jpg')))
        self.assertTrue(HTMLReader.is_request(
            Request.new('example.com/index.html')))
        self.assertFalse(HTMLReader.is_request(
            Request.new('example.com/image.jpg')))

        # Response detection goes by the Content-Type header.
        for content_type, expected_html in (
                ('text/html', True),
                ('image/png', False)):
            response = Response('HTTP/1.0', '200', 'OK')
            response.fields['Content-Type'] = content_type
            if expected_html:
                self.assertTrue(HTMLReader.is_response(response))
            else:
                self.assertFalse(HTMLReader.is_response(response))
Ejemplo n.º 12
0
    def test_warc_recorder_rollback(self):
        """A failure while a record is being written must roll the WARC
        file back to its previous size and remove the in-progress
        marker file."""
        warc_filename = 'asdf.warc'
        warc_prefix = 'asdf'

        # Pre-seed the WARC file so there is an existing offset for the
        # recorder to roll back to.
        with open(warc_filename, 'wb') as warc_file:
            warc_file.write(b'a' * 10)

        warc_recorder = WARCRecorder(
            warc_prefix,
            params=WARCRecorderParams(
                compress=False,
            )
        )

        request = HTTPRequest('http://example.com/')
        request.address = ('0.0.0.0', 80)
        response = HTTPResponse(200, 'OK')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            response.body.write(b'KITTEH DOGE')

        with warc_recorder.session() as session:
            session.pre_request(request)
            session.request_data(request.to_bytes())

            class BadRecord(WARCRecord):
                """Record that raises OSError partway through
                serialization, after emitting some data."""
                def __init__(self, original_record):
                    super().__init__()
                    self.block_file = original_record.block_file
                    self.fields = original_record.fields

                def __iter__(self):
                    # Emit enough chunks that some data is actually
                    # written before the failure occurs.
                    for dummy in range(1000):
                        yield b"where's my elephant?"
                    raise OSError('Oops')

            # Swap in the failing record behind the session's back.
            session._child_session._request_record = \
                BadRecord(session._child_session._request_record)
            original_offset = os.path.getsize(warc_filename)

            with self.assertRaises((OSError, IOError)):
                session.request(request)

            # File size unchanged and no leftover '-wpullinc' marker:
            # the partial write was rolled back.
            new_offset = os.path.getsize(warc_filename)
            self.assertEqual(new_offset, original_offset)
            self.assertFalse(os.path.exists(warc_filename + '-wpullinc'))

            _logger.debug('original offset {0}'.format(original_offset))
Ejemplo n.º 13
0
    def test_warc_max_size_and_append(self):
        """With appending enabled and a tiny max_size, the recorder must
        skip existing sequence files and start new ones after them."""
        file_prefix = 'asdf'

        # Pre-create two empty sequence files; appending mode should
        # leave them alone and continue the numbering at 00002.
        with open('asdf-00000.warc', 'w'):
            pass

        with open('asdf-00001.warc', 'w'):
            pass

        warc_recorder = WARCRecorder(
            file_prefix,
            params=WARCRecorderParams(
                compress=False,
                max_size=1,  # force a rollover after every record
                appending=True
            ),
        )

        request = HTTPRequest('http://example.com/1')
        request.address = ('0.0.0.0', 80)
        response = HTTPResponse(200, 'OK')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            response.body.write(b'BLAH')

        # Record one full request/response exchange.
        with warc_recorder.session() as session:
            session.pre_request(request)
            session.request_data(request.to_bytes())
            session.request(request)
            session.pre_response(response)
            session.response_data(response.to_bytes())
            session.response_data(response.body.content())
            session.response(response)

        warc_recorder.close()

        self.assertTrue(os.path.exists('asdf-00000.warc'))
        self.assertTrue(os.path.exists('asdf-00001.warc'))
        self.assertTrue(os.path.exists('asdf-00002.warc'))
        self.assertTrue(os.path.exists('asdf-00003.warc'))
        self.assertTrue(os.path.exists('asdf-meta.warc'))

        # The pre-existing files stay empty; only the new sequence
        # files and the meta file receive data.
        self.assertEqual(0, os.path.getsize('asdf-00000.warc'))
        self.assertEqual(0, os.path.getsize('asdf-00001.warc'))
        self.assertNotEqual(0, os.path.getsize('asdf-00002.warc'))
        self.assertNotEqual(0, os.path.getsize('asdf-00003.warc'))
        self.assertNotEqual(0, os.path.getsize('asdf-meta.warc'))
Ejemplo n.º 14
0
    def test_javascript_heavy_inline_monstrosity(self):
        """Scrape a sample page whose URLs live mostly inside inline
        JavaScript and check representative inline/linked URLs are
        extracted."""
        scraper = HTMLScraper()
        request = Request.new('http://example.com/')
        response = Response('HTTP/1.0', 200, 'OK')

        sample_path = os.path.join(
            os.path.dirname(__file__),
            'testing', 'samples', 'twitchplayspokemonfirered.html')

        with wpull.util.reset_file_offset(response.body.content_file):
            with open(sample_path, 'rb') as sample_file:
                shutil.copyfileobj(sample_file, response.body.content_file)

        scrape_info = scraper.scrape(request, response)

        self.assertIn(
            'http://cdn.bulbagarden.net/upload/archive/a/a4/'
            '20090718115357%21195Quagsire.png',
            scrape_info['inline_urls']
        )
        self.assertIn(
            'http://www.google.com/url?q=http%3A%2F%2Fwww.reddit.com%2F'
            'user%2FGoldenSandslash15&sa=D&sntz=1&'
            'usg=AFQjCNElFBxZYdNm5mWoRSncf5tbdIJQ-A',
            scrape_info['linked_urls']
        )
Ejemplo n.º 15
0
    def test_sitemap_scraper_xml(self):
        """A <urlset> sitemap contributes its <loc> entries as linked
        URLs and nothing as inline URLs."""
        scraper = SitemapScraper()
        request = Request.new('http://example.com/sitemap.xml')
        response = Response('HTTP/1.0', 200, 'OK')

        with wpull.util.reset_file_offset(response.body.content_file):
            response.body.content_file.write(
                b'''<?xml version="1.0" encoding="UTF-8"?>
                <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
                   <url>
                      <loc>http://www.example.com/</loc>
                      <lastmod>2005-01-01</lastmod>
                      <changefreq>monthly</changefreq>
                      <priority>0.8</priority>
                   </url>
                </urlset>
            '''
            )

        scrape_info = scraper.scrape(request, response)

        self.assertEqual(
            {'http://www.example.com/'},
            scrape_info['linked_urls']
        )
        self.assertFalse(scrape_info['inline_urls'])
Ejemplo n.º 16
0
    def test_sitemap_scraper_xml_index(self):
        """A <sitemapindex> document: the nested sitemap locations must
        come out as linked URLs, with no inline URLs."""
        scraper = SitemapScraper()
        request = Request.new('http://example.com/sitemap.xml')
        response = Response('HTTP/1.0', 200, 'OK')

        # Write a minimal sitemap index straight into the response body.
        with wpull.util.reset_file_offset(response.body.content_file):
            response.body.content_file.write(
                b'''<?xml version="1.0" encoding="UTF-8"?>
                <sitemapindex
                xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
                   <sitemap>
                      <loc>http://www.example.com/sitemap1.xml.gz</loc>
                      <lastmod>2004-10-01T18:23:17+00:00</lastmod>
                   </sitemap>
                </sitemapindex>
            '''
            )

        scrape_info = scraper.scrape(request, response)
        inline_urls = scrape_info['inline_urls']
        linked_urls = scrape_info['linked_urls']

        self.assertEqual({
            'http://www.example.com/sitemap1.xml.gz',
            },
            linked_urls
        )
        self.assertFalse(inline_urls)
Ejemplo n.º 17
0
    def test_rss_as_html(self):
        """An RSS feed served with an RSS content type is still handled
        by HTMLScraper: feed links come out as linked URLs, none
        inline."""
        scraper = HTMLScraper()
        request = Request.new('http://example.com/')
        response = Response('HTTP/1.0', 200, '')
        response.fields['content-type'] = 'application/rss+xml'

        sample_path = os.path.join(
            os.path.dirname(__file__), 'testing', 'samples', 'rss.xml')

        with wpull.util.reset_file_offset(response.body.content_file):
            with open(sample_path, 'rb') as sample_file:
                shutil.copyfileobj(sample_file, response.body.content_file)

        scrape_info = scraper.scrape(request, response)

        self.assertTrue(scrape_info)
        self.assertFalse(scrape_info['inline_urls'])
        self.assertEqual(
            {
                'http://www.someexamplerssdomain.com/main.html',
                'http://www.wikipedia.org/'
            },
            scrape_info['linked_urls']
        )
Ejemplo n.º 18
0
    def test_xhtml_invalid(self):
        """Malformed XHTML must still be scraped: both inline resources
        and the anchor link are recovered."""
        scraper = HTMLScraper()
        request = Request.new('http://example.com/')
        response = Response('HTTP/1.0', 200, '')

        sample_path = os.path.join(
            os.path.dirname(__file__),
            'testing', 'samples', 'xhtml_invalid.html')

        with wpull.util.reset_file_offset(response.body.content_file):
            with open(sample_path, 'rb') as sample_file:
                shutil.copyfileobj(sample_file, response.body.content_file)

        scrape_info = scraper.scrape(request, response)

        self.assertEqual(
            {
                'http://example.com/image.png',
                'http://example.com/script.js',
            },
            scrape_info['inline_urls']
        )
        self.assertEqual(
            {
                'http://example.com/link'
            },
            scrape_info['linked_urls']
        )
Ejemplo n.º 19
0
    def test_html_krokozyabry(self):
        """A KOI8-R page: the declared charset must be detected so the
        Cyrillic anchor text decodes into the correct link URL."""
        element_walker = ElementWalker(
            css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
        scraper = HTMLScraper(self.get_html_parser(), element_walker)
        request = Request('http://example.com/')
        response = Response(200, '')
        response.body = Body()
        # Charset comes from the header, not from sniffing the body.
        response.fields['content-type'] = 'text/html; charset=KOI8-R'

        with wpull.util.reset_file_offset(response.body):
            html_file_path = os.path.join(ROOT_PATH,
                                          'testing', 'samples',
                                          'krokozyabry.html')
            with open(html_file_path, 'rb') as in_file:
                shutil.copyfileobj(in_file, response.body)

        scrape_result = scraper.scrape(request, response)
        inline_urls = scrape_result.inline_links
        linked_urls = scrape_result.linked_links

        # The declared KOI8-R charset must win.
        self.assertEqual('koi8-r', scrape_result.encoding)

        self.assertEqual(
            set(),
            inline_urls
        )
        self.assertEqual(
            {'http://example.com/Кракозябры'},
            linked_urls
        )
Ejemplo n.º 20
0
    def test_xhtml_invalid(self):
        """Malformed XHTML must still be scraped: inline resources and
        the anchor link are both recovered."""
        scraper = HTMLScraper()
        request = Request.new('http://example.com/')
        response = Response('HTTP/1.0', 200, '')

        # Load the broken-XHTML sample into the response body.
        with wpull.util.reset_file_offset(response.body.content_file):
            html_file_path = os.path.join(os.path.dirname(__file__),
                                          'testing', 'samples',
                                          'xhtml_invalid.html')
            with open(html_file_path, 'rb') as in_file:
                shutil.copyfileobj(in_file, response.body.content_file)

        scrape_info = scraper.scrape(request, response)
        inline_urls = scrape_info['inline_urls']
        linked_urls = scrape_info['linked_urls']

        self.assertEqual(
            {
                'http://example.com/image.png',
                'http://example.com/script.js',
            },
            inline_urls
        )
        self.assertEqual(
            {
                'http://example.com/link'
            },
            linked_urls
        )
Ejemplo n.º 21
0
    def test_html_krokozyabry(self):
        """A KOI8-R page: the declared charset must be detected so the
        Cyrillic anchor decodes into the correct link URL."""
        scraper = HTMLScraper()
        request = Request.new('http://example.com/')
        response = Response('HTTP/1.0', 200, '')
        response.fields['content-type'] = 'text/html; charset=KOI8-R'

        sample_path = os.path.join(
            os.path.dirname(__file__),
            'testing', 'samples', 'krokozyabry.html')

        with wpull.util.reset_file_offset(response.body.content_file):
            with open(sample_path, 'rb') as sample_file:
                shutil.copyfileobj(sample_file, response.body.content_file)

        scrape_info = scraper.scrape(request, response)

        # The declared KOI8-R charset must win over any sniffing.
        self.assertEqual('koi8-r', scrape_info['encoding'])
        self.assertEqual(set(), scrape_info['inline_urls'])
        self.assertEqual(
            {'http://example.com/Кракозябры'},
            scrape_info['linked_urls']
        )
Ejemplo n.º 22
0
    def test_html_not_quite_charset(self):
        """A sample page with an unusual/ambiguous charset declaration
        must still yield its inline and linked URLs."""
        element_walker = ElementWalker(
            css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
        scraper = HTMLScraper(self.get_html_parser(), element_walker)
        request = Request('http://example.com/')
        response = Response(200, '')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            html_file_path = os.path.join(ROOT_PATH,
                                          'testing', 'samples',
                                          'videogame_top.htm')
            with open(html_file_path, 'rb') as in_file:
                shutil.copyfileobj(in_file, response.body)

        scrape_result = scraper.scrape(request, response)
        inline_urls = scrape_result.inline_links
        linked_urls = scrape_result.linked_links

        # Spot-check one representative URL from each category.
        self.assertIn(
            'http://example.com/copyright_2001_2006_rtype.gif',
            inline_urls
        )
        self.assertIn(
            'http://www.geocities.jp/gamehouse_grindcrusher/',
            linked_urls
        )
Ejemplo n.º 23
0
    def test_sitemap_scraper_xml(self):
        """A <urlset> sitemap contributes its <loc> entries as linked
        URLs and nothing as inline URLs."""
        scraper = SitemapScraper(self.get_html_parser())
        request = Request('http://example.com/sitemap.xml')
        response = Response(200, 'OK')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            response.body.write(b'''<?xml version="1.0" encoding="UTF-8"?>
                <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
                   <url>
                      <loc>http://www.example.com/</loc>
                      <lastmod>2005-01-01</lastmod>
                      <changefreq>monthly</changefreq>
                      <priority>0.8</priority>
                   </url>
                </urlset>
            ''')

        scrape_result = scraper.scrape(request, response)

        self.assertEqual(
            {'http://www.example.com/'}, scrape_result.linked_links)
        self.assertFalse(scrape_result.inline_links)
Ejemplo n.º 24
0
    def test_sitemap_scraper_xml_index(self):
        """A <sitemapindex> document: the nested sitemap locations come
        out as linked URLs, with no inline URLs."""
        scraper = SitemapScraper(self.get_html_parser())
        request = Request('http://example.com/sitemap.xml')
        response = Response(200, 'OK')
        response.body = Body()

        # Write a minimal sitemap index straight into the response body.
        with wpull.util.reset_file_offset(response.body):
            response.body.write(b'''<?xml version="1.0" encoding="UTF-8"?>
                <sitemapindex
                xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
                   <sitemap>
                      <loc>http://www.example.com/sitemap1.xml.gz</loc>
                      <lastmod>2004-10-01T18:23:17+00:00</lastmod>
                   </sitemap>
                </sitemapindex>
            ''')

        scrape_result = scraper.scrape(request, response)
        inline_urls = scrape_result.inline_links
        linked_urls = scrape_result.linked_links

        self.assertEqual({
            'http://www.example.com/sitemap1.xml.gz',
        }, linked_urls)
        self.assertFalse(inline_urls)
Ejemplo n.º 25
0
    def test_fetch_disallow(self):
        """A robots.txt that disallows everything: the session first
        fetches robots.txt, then refuses the real request with
        RobotsDenied."""
        http_client = MockHTTPClient()
        pool = RobotsTxtPool()
        client = RichClient(http_client, pool)
        session = MockRobotsTxtRichClientSession(
            client, Request.new('http://example.com'))

        self.assertEqual(RobotsState.unknown, session._robots_state)

        # First request issued must be for robots.txt itself.
        request = session.next_request
        self.assertTrue(request.url_info.url.endswith('robots.txt'))

        # Serve a deny-all robots.txt.
        response = Response('HTTP/1.0', 200, 'OK')
        response.body.content_file = io.StringIO('User-agent:*\nDisallow: /\n')

        http_client.response = response
        yield session.fetch()

        self.assertEqual(RobotsState.denied, session._robots_state)

        # No further request is offered once fetching is denied.
        request = session.next_request
        self.assertIsNone(request)

        try:
            yield session.fetch()
        except RobotsDenied:
            pass
        else:
            self.fail()

        self.assertTrue(session.done)
Ejemplo n.º 26
0
    def test_sitemap_scraper_xml_index(self):
        """A <sitemapindex> document: the nested sitemap locations come
        out as linked URLs, with no inline URLs."""
        scraper = SitemapScraper()
        request = Request.new('http://example.com/sitemap.xml')
        response = Response('HTTP/1.0', 200, 'OK')

        with wpull.util.reset_file_offset(response.body.content_file):
            response.body.content_file.write(
                b'''<?xml version="1.0" encoding="UTF-8"?>
                <sitemapindex
                xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
                   <sitemap>
                      <loc>http://www.example.com/sitemap1.xml.gz</loc>
                      <lastmod>2004-10-01T18:23:17+00:00</lastmod>
                   </sitemap>
                </sitemapindex>
            '''
            )

        scrape_info = scraper.scrape(request, response)

        self.assertEqual(
            {'http://www.example.com/sitemap1.xml.gz'},
            scrape_info['linked_urls']
        )
        self.assertFalse(scrape_info['inline_urls'])
Ejemplo n.º 27
0
    def test_html_soup(self):
        """Scrape the tag-soup sample and verify the expected inline
        image and the two page links are extracted."""
        element_walker = ElementWalker(
            css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
        scraper = HTMLScraper(self.get_html_parser(), element_walker)
        request = Request('http://example.com/')
        response = Response(200, '')
        response.body = Body()
        response.fields['Refresh'] = 'yes'

        with wpull.util.reset_file_offset(response.body):
            html_file_path = os.path.join(ROOT_PATH,
                                          'testing', 'samples', 'soup.html')
            with open(html_file_path, 'rb') as in_file:
                shutil.copyfileobj(in_file, response.body)

        scrape_result = scraper.scrape(request, response)
        inline_urls = scrape_result.inline_links
        linked_urls = scrape_result.linked_links

        self.assertEqual(
            {'http://example.com/ABOUTM~1.JPG'},
            inline_urls
        )
        # Note the space in 'web ring/Join.htm' is preserved as-is.
        self.assertEqual(
            {
                'http://example.com/BLOG',
                'http://example.com/web ring/Join.htm',
            },
            linked_urls
        )
Ejemplo n.º 28
0
    def test_javascript_heavy_inline_monstrosity(self):
        """Scrape a sample page whose URLs live mostly inside inline
        JavaScript and spot-check one inline and one linked URL."""
        scraper = HTMLScraper()
        request = Request.new('http://example.com/')
        response = Response('HTTP/1.0', 200, 'OK')

        with wpull.util.reset_file_offset(response.body.content_file):
            html_file_path = os.path.join(os.path.dirname(__file__),
                                          'testing', 'samples',
                                          'twitchplayspokemonfirered.html')
            with open(html_file_path, 'rb') as in_file:
                shutil.copyfileobj(in_file, response.body.content_file)

        scrape_info = scraper.scrape(request, response)
        inline_urls = scrape_info['inline_urls']
        linked_urls = scrape_info['linked_urls']

        # Representative URLs pulled out of script text.
        self.assertIn(
            'http://cdn.bulbagarden.net/upload/archive/a/a4/'
            '20090718115357%21195Quagsire.png',
            inline_urls
        )
        self.assertIn(
            'http://www.google.com/url?q=http%3A%2F%2Fwww.reddit.com%2F'
            'user%2FGoldenSandslash15&sa=D&sntz=1&'
            'usg=AFQjCNElFBxZYdNm5mWoRSncf5tbdIJQ-A',
            linked_urls
        )
Ejemplo n.º 29
0
    def test_redirect_loop(self):
        """robots.txt that redirects to itself: after the redirect limit
        the session gives up on robots.txt and proceeds as allowed."""
        http_client = MockHTTPClient()
        pool = RobotsTxtPool()
        client = RichClient(http_client, pool)
        session = MockRobotsTxtRichClientSession(
            client, Request.new('http://example.com')
        )

        self.assertEqual(RobotsState.unknown, session._robots_state)

        # Feed 21 self-redirects to exhaust the redirect budget.
        for dummy in range(21):
            request = session.next_request
            self.assertTrue(request.url_info.url.endswith('robots.txt'))

            response = Response('HTTP/1.0', 302, 'See else')
            response.url_info = request.url_info
            response.fields['location'] = '/robots.txt'

            http_client.response = response
            yield session.fetch()

        # The session must now move on to the original request.
        request = session.next_request
        self.assertTrue(request)

        response = Response('HTTP/1.0', 200, 'OK')

        http_client.response = response
        yield session.fetch()

        self.assertEqual(RobotsState.ok, session._robots_state)

        print(session.next_request)
        self.assertTrue(session.done)
Ejemplo n.º 30
0
    def test_fetch_disallow(self):
        """A deny-all robots.txt: the session fetches robots.txt first,
        then refuses the real request with RobotsDenied."""
        http_client = MockHTTPClient()
        pool = RobotsTxtPool()
        client = RichClient(http_client, pool)
        session = MockRobotsTxtRichClientSession(
            client, Request.new('http://example.com')
        )

        self.assertEqual(RobotsState.unknown, session._robots_state)

        # First request issued must be for robots.txt itself.
        request = session.next_request
        self.assertTrue(request.url_info.url.endswith('robots.txt'))

        # Serve a deny-all robots.txt.
        response = Response('HTTP/1.0', 200, 'OK')
        response.body.content_file = io.StringIO('User-agent:*\nDisallow: /\n')

        http_client.response = response
        yield session.fetch()

        self.assertEqual(RobotsState.denied, session._robots_state)

        # No further request is offered once fetching is denied.
        request = session.next_request
        self.assertIsNone(request)

        try:
            yield session.fetch()
        except RobotsDenied:
            pass
        else:
            self.fail()

        self.assertTrue(session.done)
Ejemplo n.º 31
0
    def test_rss_as_html(self):
        """An RSS feed served with an RSS content type is still handled
        by HTMLScraper: feed links become linked URLs, none inline."""
        element_walker = ElementWalker(
            css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
        scraper = HTMLScraper(self.get_html_parser(), element_walker)
        request = Request('http://example.com/')
        response = Response(200, '')
        response.body = Body()
        response.fields['content-type'] = 'application/rss+xml'

        with wpull.util.reset_file_offset(response.body):
            html_file_path = os.path.join(ROOT_PATH,
                                          'testing', 'samples', 'rss.xml')
            with open(html_file_path, 'rb') as in_file:
                shutil.copyfileobj(in_file, response.body)

        scrape_result = scraper.scrape(request, response)

        # A result object (not None/empty) signals the scraper accepted
        # the document despite the XML content type.
        self.assertTrue(scrape_result)
        inline_urls = scrape_result.inline_links
        linked_urls = scrape_result.linked_links
        self.assertFalse(
            inline_urls
        )
        self.assertEqual(
            {
                'http://www.someexamplerssdomain.com/main.html',
                'http://www.wikipedia.org/'
            },
            linked_urls
        )
Ejemplo n.º 32
0
    def test_client_duration_timeout(self):
        """Reading content slower than the duration timeout must raise."""
        http_client = Client()

        with self.assertRaises(DurationTimeout), http_client.session() as session:
            yield From(session.fetch(Request(self.get_url('/sleep_long'))))
            yield From(session.read_content(duration_timeout=0.1))
Ejemplo n.º 33
0
    def test_html_soup(self):
        """A quirky HTML document is scraped for inline and linked URLs."""
        html_scraper = HTMLScraper()
        request = Request.new('http://example.com/')
        response = Response('HTTP/1.0', 200, '')
        response.fields['Refresh'] = 'yes'

        sample_path = os.path.join(
            os.path.dirname(__file__), 'testing', 'samples', 'soup.html')
        with wpull.util.reset_file_offset(response.body.content_file):
            with open(sample_path, 'rb') as sample_file:
                shutil.copyfileobj(sample_file, response.body.content_file)

        scrape_info = html_scraper.scrape(request, response)

        self.assertEqual(
            {'http://example.com/ABOUTM~1.JPG'},
            scrape_info['inline_urls']
        )
        self.assertEqual(
            {
                'http://example.com/BLOG',
                'http://example.com/web ring/Join.htm',
            },
            scrape_info['linked_urls']
        )
Ejemplo n.º 34
0
    def test_html_soup(self):
        """Scraping the soup sample extracts its image and anchor URLs."""
        scraper = HTMLScraper()
        req = Request.new('http://example.com/')
        resp = Response('HTTP/1.0', 200, '')
        resp.fields['Refresh'] = 'yes'

        content_file = resp.body.content_file
        with wpull.util.reset_file_offset(content_file):
            path = os.path.join(os.path.dirname(__file__),
                                'testing', 'samples', 'soup.html')
            with open(path, 'rb') as source:
                shutil.copyfileobj(source, content_file)

        info = scraper.scrape(req, resp)

        expected_inline = {'http://example.com/ABOUTM~1.JPG'}
        expected_linked = {
            'http://example.com/BLOG',
            'http://example.com/web ring/Join.htm',
        }
        self.assertEqual(expected_inline, info['inline_urls'])
        self.assertEqual(expected_linked, info['linked_urls'])
Ejemplo n.º 35
0
    def test_html_scraper_links_base_href(self):
        """<base href> must rewrite relative URLs during HTML scraping."""
        scraper = HTMLScraper()
        request = Request.new('http://example.com/')
        response = Response('HTTP/1.0', 200, 'OK')

        sample_path = os.path.join(
            os.path.dirname(__file__), 'testing', 'samples', 'basehref.html')
        with wpull.util.reset_file_offset(response.body.content_file):
            with open(sample_path, 'rb') as sample_file:
                shutil.copyfileobj(sample_file, response.body.content_file)

        scrape_info = scraper.scrape(request, response)

        self.assertEqual('utf-8', scrape_info['encoding'])

        expected_inline = {
            'http://cdn.example.com/stylesheet1.css',
            'http://www.example.com/stylesheet2.css',
            'http://example.com/a/stylesheet3.css',
            'http://example.com/a/dir/image1.png',
            'http://example.com/dir/image2.png',
            'http://example.net/image3.png',
            'http://example.com/dir/image4.png',
        }
        self.assertEqual(expected_inline, scrape_info['inline_urls'])
        self.assertEqual({'http://example.com/a/'}, scrape_info['linked_urls'])
Ejemplo n.º 36
0
    def test_html_krokozyabry(self):
        """The KOI8-R charset from Content-Type is honored when scraping."""
        scraper = HTMLScraper()
        request = Request.new('http://example.com/')
        response = Response('HTTP/1.0', 200, '')
        response.fields['content-type'] = 'text/html; charset=KOI8-R'

        body_file = response.body.content_file
        with wpull.util.reset_file_offset(body_file):
            sample_path = os.path.join(os.path.dirname(__file__),
                                       'testing', 'samples', 'krokozyabry.html')
            with open(sample_path, 'rb') as sample_file:
                shutil.copyfileobj(sample_file, body_file)

        info = scraper.scrape(request, response)

        self.assertEqual('koi8-r', info['encoding'])
        self.assertEqual(set(), info['inline_urls'])
        self.assertEqual({'http://example.com/Кракозябры'}, info['linked_urls'])
Ejemplo n.º 37
0
    def test_rss_as_html(self):
        """An RSS feed served as HTML content still yields its links."""
        scraper = HTMLScraper()
        request = Request.new('http://example.com/')
        response = Response('HTTP/1.0', 200, '')
        response.fields['content-type'] = 'application/rss+xml'

        with wpull.util.reset_file_offset(response.body.content_file):
            feed_path = os.path.join(os.path.dirname(__file__),
                                     'testing', 'samples', 'rss.xml')
            with open(feed_path, 'rb') as feed_file:
                shutil.copyfileobj(feed_file, response.body.content_file)

        info = scraper.scrape(request, response)

        self.assertTrue(info)
        # Feed entries are linked URLs; nothing should be inline.
        self.assertFalse(info['inline_urls'])
        self.assertEqual(
            {
                'http://www.someexamplerssdomain.com/main.html',
                'http://www.wikipedia.org/'
            },
            info['linked_urls']
        )
Ejemplo n.º 38
0
    def test_html_scraper_links_base_href(self):
        """URLs in the basehref sample resolve against its <base href>."""
        html_scraper = HTMLScraper()
        req = Request.new('http://example.com/')
        resp = Response('HTTP/1.0', 200, 'OK')

        content_file = resp.body.content_file
        with wpull.util.reset_file_offset(content_file):
            path = os.path.join(os.path.dirname(__file__),
                                'testing', 'samples', 'basehref.html')
            with open(path, 'rb') as source:
                shutil.copyfileobj(source, content_file)

        info = html_scraper.scrape(req, resp)

        self.assertEqual('utf-8', info['encoding'])
        self.assertEqual(
            {
                'http://cdn.example.com/stylesheet1.css',
                'http://www.example.com/stylesheet2.css',
                'http://example.com/a/stylesheet3.css',
                'http://example.com/a/dir/image1.png',
                'http://example.com/dir/image2.png',
                'http://example.net/image3.png',
                'http://example.com/dir/image4.png',
            },
            info['inline_urls']
        )
        self.assertEqual({'http://example.com/a/'}, info['linked_urls'])
Ejemplo n.º 39
0
    def test_sitemap_scraper_xml(self):
        """A minimal XML sitemap yields its <loc> URL as a linked URL."""
        sitemap_scraper = SitemapScraper()
        request = Request.new('http://example.com/sitemap.xml')
        response = Response('HTTP/1.0', 200, 'OK')

        with wpull.util.reset_file_offset(response.body.content_file):
            # Document bytes are written exactly as they would be served.
            response.body.content_file.write(
                b'''<?xml version="1.0" encoding="UTF-8"?>
                <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
                   <url>
                      <loc>http://www.example.com/</loc>
                      <lastmod>2005-01-01</lastmod>
                      <changefreq>monthly</changefreq>
                      <priority>0.8</priority>
                   </url>
                </urlset>
            '''
            )

        info = sitemap_scraper.scrape(request, response)

        self.assertEqual({'http://www.example.com/'}, info['linked_urls'])
        self.assertFalse(info['inline_urls'])
Ejemplo n.º 40
0
    def test_xhtml_invalid(self):
        """Malformed XHTML is still scraped for inline and linked URLs."""
        walker = ElementWalker(
            css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
        scraper = HTMLScraper(self.get_html_parser(), walker)
        request = Request('http://example.com/')
        response = Response(200, '')
        response.body = Body()

        sample_path = os.path.join(
            ROOT_PATH, 'testing', 'samples', 'xhtml_invalid.html')
        with wpull.util.reset_file_offset(response.body):
            with open(sample_path, 'rb') as sample_file:
                shutil.copyfileobj(sample_file, response.body)

        result = scraper.scrape(request, response)

        self.assertEqual(
            {
                'http://example.com/image.png',
                'http://example.com/script.js',
            },
            result.inline_links
        )
        self.assertEqual({'http://example.com/link'}, result.linked_links)
Ejemplo n.º 41
0
    def test_server_error(self):
        """Persistent 5xx responses for robots.txt eventually deny fetching."""
        http_client = MockHTTPClient()
        robots_pool = RobotsTxtPool()
        rich_client = RichClient(http_client, robots_pool)
        session = MockRobotsTxtRichClientSession(
            rich_client, Request.new('http://example.com')
        )

        self.assertEqual(RobotsState.unknown, session._robots_state)

        # Keep answering the robots.txt request with a server error.
        for dummy in range(21):
            robots_request = session.next_request
            self.assertTrue(robots_request.url_info.url.endswith('robots.txt'))

            http_client.response = Response('HTTP/1.0', 500, 'Opps')
            yield session.fetch()

        # After exhausting retries no further request is offered.
        self.assertIsNone(session.next_request)

        with self.assertRaises(RobotsDenied):
            yield session.fetch()

        self.assertTrue(session.done)
Ejemplo n.º 42
0
    def test_status_line_only(self):
        """A response consisting of only a status line still parses."""
        stream = self.new_stream('127.0.0.1', self._port)
        response, content = yield From(
            self.fetch(stream, Request(self.get_url('/status_line_only'))))

        self.assertEqual(200, response.status_code)
        self.assertEqual(b'Hey', content)
Ejemplo n.º 43
0
 def test_utf8_header(self):
     """Non-ASCII header bytes surface decoded as latin-1."""
     stream = self.new_stream()
     response, dummy = yield From(
         self.fetch(stream, Request(self.get_url('/utf8_header'))))

     self.assertEqual(200, response.status_code)
     expected = '🐱'.encode('utf-8').decode('latin-1')
     self.assertEqual(expected, response.fields['whoa'])
Ejemplo n.º 44
0
    def test_false_gzip(self):
        """A body mislabeled as gzip is passed through undecoded."""
        stream = self.new_stream('127.0.0.1', self._port)
        response, content = yield From(
            self.fetch(stream, Request(self.get_url('/false_gzip'))))

        self.assertEqual('gzip', response.fields['Content-Encoding'])
        self.assertEqual(b'a' * 100, content)
Ejemplo n.º 45
0
 def test_content_length_and_chunked(self):
     """When both framings are present, chunked decoding is applied."""
     stream = self.new_stream()
     request = Request(self.get_url('/content_length_and_chunked'))
     response, content = yield From(self.fetch(stream, request))

     self.assertEqual(200, response.status_code)
     self.assertEqual('chunked', response.fields['Transfer-Encoding'])
     self.assertEqual(b'hello world!', content)
Ejemplo n.º 46
0
 def test_ssl_fail(self):
     """Fetching from the test server must fail SSL verification."""
     connection = Connection('localhost', self.get_http_port())

     with self.assertRaises(SSLVerficationError):
         yield connection.fetch(Request.new(self.get_url('/')))
Ejemplo n.º 47
0
 def test_request(self):
     """A new GET request serializes to the expected HTTP/1.1 header bytes."""
     request = Request.new('http://example.com/robots.txt')

     expected_header = (b'GET /robots.txt HTTP/1.1\r\n'
                        b'Host: example.com\r\n'
                        b'\r\n')
     self.assertEqual(expected_header, request.header())
Ejemplo n.º 48
0
 def test_connection_reuse(self):
     """The same HTTP/1.0 request can be fetched twice on one connection."""
     connection = Connection('localhost', self._port)
     request = Request.new(self.get_url('/'))
     request.version = 'HTTP/1.0'

     # Two sequential fetches verify the connection is reused.
     for dummy in range(2):
         response = yield connection.fetch(request)
         self.assertEqual(200, response.status_code)
Ejemplo n.º 49
0
    def test_warc_recorder_journal(self):
        """The WARC journal file exists during a record write and is removed after.

        NOTE(review): '-wpullinc' appears to be the recorder's in-progress
        journal suffix — confirmed only by the two existence checks below,
        not by visible recorder code.
        """
        warc_filename = 'asdf.warc'
        warc_prefix = 'asdf'

        warc_recorder = WARCRecorder(
            warc_prefix,
            params=WARCRecorderParams(
                compress=False,
            )
        )

        request = HTTPRequest('http://example.com/')
        request.address = ('0.0.0.0', 80)
        response = HTTPResponse(200, 'OK')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            response.body.write(b'KITTEH DOGE')

        with warc_recorder.session() as session:
            session.pre_request(request)
            session.request_data(request.to_bytes())
            # Captured so the assertion can run inside MockRecord.__iter__.
            test_instance = self

            class MockRecord(WARCRecord):
                # Wraps the real record but asserts mid-iteration that the
                # journal file exists while the record is being written out.
                def __init__(self, original_record):
                    super().__init__()
                    self.block_file = original_record.block_file
                    self.fields = original_record.fields

                def __iter__(self):
                    print(list(os.walk('.')))
                    test_instance.assertTrue(
                        os.path.exists(warc_filename + '-wpullinc')
                    )

                    # Emit many chunks so the write is long enough to observe.
                    for dummy in range(1000):
                        yield b"where's my elephant?"

            # Swap the real request record for the asserting mock
            # (reaches into session internals deliberately).
            session._child_session._request_record = \
                MockRecord(session._child_session._request_record)

            session.request(request)

            # After the write completes the journal must be gone.
            self.assertFalse(os.path.exists(warc_filename + '-wpullinc'))
Ejemplo n.º 50
0
 def test_connection_refused(self):
     """Connecting to a closed port raises ConnectionRefused."""
     connection = Connection('localhost', 1)

     with self.assertRaises(ConnectionRefused):
         yield connection.fetch(
             Request.new('http://localhost:1/'))
Ejemplo n.º 51
0
    def test_client_exception_throw(self):
        """Fetching a nonexistent host surfaces a NetworkError."""
        client = Client()

        with self.assertRaises(NetworkError):
            yield client.fetch(Request.new('http://wpull-no-exist.invalid'))
Ejemplo n.º 52
0
 def test_no_such_host(self):
     """DNS failure on an invalid host raises NetworkError."""
     connection = Connection('wpull-no-exist.invalid', 80)

     with self.assertRaises(NetworkError):
         yield connection.fetch(
             Request.new('http://wpull-no-exist.invalid'))
Ejemplo n.º 53
0
 def test_connection_timeout(self):
     """An unroutable address trips the connect timeout as NetworkError."""
     connection = Connection('1.0.0.0', 1, connect_timeout=0.1)

     with self.assertRaises(NetworkError):
         yield connection.fetch(
             Request.new('http://1.0.0.0:1/'))
Ejemplo n.º 54
0
 def test_read_timeout(self):
     """A server that stalls past read_timeout raises NetworkError."""
     connection = Connection('localhost', self._port, read_timeout=0.1)

     with self.assertRaises(NetworkError):
         yield connection.fetch(Request.new(self.get_url('/sleep_long')))
Ejemplo n.º 55
0
    def _read_request_header(self):
        """Read and parse an HTTP request header from the stream reader.

        Coroutine. Delivers the parsed request via ``raise Return(request)``.
        Raises :class:`ProtocolError` when 100 lines arrive without a blank
        terminator line.

        NOTE(review): the bare ``return`` on a partial line ends the
        coroutine with ``None`` — presumably the peer closed mid-header;
        confirm callers handle a ``None`` result.
        """
        request = Request()

        # Cap the number of header lines to bound memory use.
        for dummy in range(100):
            line = yield From(self._reader.readline())

            _logger.debug(__('Got line {0}', line))

            # A line without its trailing newline means the stream ended early.
            if line[-1:] != b'\n':
                return

            # A blank line terminates the header section.
            if not line.strip():
                break

            request.parse(line)
        else:
            raise ProtocolError('Request has too many headers.')

        raise Return(request)
Ejemplo n.º 56
0
 def test_buffer_overflow(self):
     """A response exceeding the buffer raises a protocol or network error."""
     connection = Connection('localhost', self._port,
         connect_timeout=2.0, read_timeout=5.0, buffer_size=1000)
     request = Request.new(self.get_url('/buffer_overflow'))

     with self.assertRaises((ProtocolError, NetworkError)):
         yield connection.fetch(request)
Ejemplo n.º 57
0
    def test_basic(self):
        """A rich-client session fetches one URL and is then done."""
        rich_client = RichClient(Client())
        session = rich_client.session(Request.new(self.get_url('/')))

        self.assertFalse(session.done)

        response = yield session.fetch()

        self.assertEqual(200, response.status_code)
        self.assertTrue(session.done)
Ejemplo n.º 58
0
    def test_http_request(self):
        """A converted request exposes host and headers via the urllib API."""
        original = Request.new('http://example.com')
        original.fields['hello'] = 'world'
        converted = convert_http_request(original)

        # urllib2 (Python 2) uses get_host(); urllib.request uses .host.
        if sys.version_info[0] == 2:
            self.assertEqual('example.com', converted.get_host())
        else:
            self.assertEqual('example.com', converted.host)

        self.assertEqual('world', converted.get_header('Hello'))