Python Response.body Examples, wpull.protocol.http.request.Response.body Python Examples

Example #1

0

Show file

File: request_test.py Project: fakegit/ludios_wpull

    def test_to_dict_body(self):
        '''``to_dict()['body']`` is truthy with a Body and falsy without one.

        The same pair of checks runs for a Request first, then a Response.
        '''
        for message_class in (Request, Response):
            # A Body instance attached: the dict entry must be truthy.
            message = message_class()
            message.body = Body()
            self.assertTrue(message.to_dict()['body'])
            message.body.close()

            # body set to NotImplemented: the dict entry must be falsy.
            message = message_class()
            message.body = NotImplemented
            self.assertFalse(message.to_dict()['body'])

Example #2

0

Show file

File: request_test.py Project: Super-Rad/wpull

    def test_to_dict_body(self):
        '''Check the ``'body'`` entry of ``to_dict()`` on requests/responses.

        With a Body attached the entry must be truthy; with the body set
        to NotImplemented it must be falsy.
        '''
        # --- Request ---
        req = Request()
        req.body = Body()
        body_entry = req.to_dict()['body']
        self.assertTrue(body_entry)
        req.body.close()

        req = Request()
        req.body = NotImplemented
        body_entry = req.to_dict()['body']
        self.assertFalse(body_entry)

        # --- Response ---
        resp = Response()
        resp.body = Body()
        body_entry = resp.to_dict()['body']
        self.assertTrue(body_entry)
        resp.body.close()

        resp = Response()
        resp.body = NotImplemented
        body_entry = resp.to_dict()['body']
        self.assertFalse(body_entry)

Example #3

0

Show file

    def test_sitemap_scraper_xml(self):
        '''A ``<urlset>`` document yields its ``<loc>`` as the only link.'''
        sitemap_scraper = SitemapScraper(self.get_html_parser())
        sitemap_request = Request('http://example.com/sitemap.xml')
        sitemap_response = Response(200, 'OK')
        sitemap_response.body = Body()

        with wpull.util.reset_file_offset(sitemap_response.body):
            sitemap_response.body.write(b'''<?xml version="1.0" encoding="UTF-8"?>
                <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
                   <url>
                      <loc>http://www.example.com/</loc>
                      <lastmod>2005-01-01</lastmod>
                      <changefreq>monthly</changefreq>
                      <priority>0.8</priority>
                   </url>
                </urlset>
            ''')

        result = sitemap_scraper.scrape(sitemap_request, sitemap_response)
        inline, linked = result.inline_links, result.linked_links

        # Only the <loc> URL is expected, and only as a linked URL.
        self.assertEqual({'http://www.example.com/'}, linked)
        self.assertFalse(inline)

Example #4

0

Show file

File: html_test.py Project: charygao/wpull

    def test_html_scraper_links_base_href(self):
        """Scrape the basehref.html sample and check encoding and URL sets."""
        walker = ElementWalker(css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
        html_scraper = HTMLScraper(self.get_html_parser(), walker)
        page_request = Request("http://example.com/")
        page_response = Response(200, "OK")
        page_response.body = Body()

        # Copy the sample document into the response body.
        sample_path = os.path.join(ROOT_PATH, "testing", "samples", "basehref.html")
        with wpull.util.reset_file_offset(page_response.body), open(sample_path, "rb") as sample_file:
            shutil.copyfileobj(sample_file, page_response.body)

        result = html_scraper.scrape(page_request, page_response)
        inline, linked = result.inline_links, result.linked_links

        self.assertEqual("utf-8", result.encoding)

        expected_inline = {
            "http://cdn.example.com/stylesheet1.css",
            "http://www.example.com/stylesheet2.css",
            "http://example.com/a/stylesheet3.css",
            "http://example.com/a/dir/image1.png",
            "http://example.com/dir/image2.png",
            "http://example.net/image3.png",
            "http://example.com/dir/image4.png",
        }
        self.assertEqual(expected_inline, inline)
        self.assertEqual({"http://example.com/a/"}, linked)

Example #5

0

Show file

File: html_test.py Project: charygao/wpull

    def test_html_wrong_charset(self):
        """Scrape the kcna.html sample and check the detected encoding and links.

        The scrape result is expected to report ``utf-16-le`` as the
        encoding and to contain the inline and linked URL sets below.
        """
        element_walker = ElementWalker(css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
        scraper = HTMLScraper(self.get_html_parser(), element_walker)
        request = Request("http://example.com/")
        response = Response(200, "")
        response.body = Body()

        # Copy the sample document into the response body.
        with wpull.util.reset_file_offset(response.body):
            html_file_path = os.path.join(ROOT_PATH, "testing", "samples", "kcna.html")
            with open(html_file_path, "rb") as in_file:
                shutil.copyfileobj(in_file, response.body)

        scrape_result = scraper.scrape(request, response)
        inline_urls = scrape_result.inline_links
        linked_urls = scrape_result.linked_links

        self.assertEqual("utf-16-le", scrape_result.encoding)

        # Each URL is listed once; the set literal previously repeated
        # Knewskage.gif, which had no effect on the comparison.
        self.assertEqual(
            {
                "http://example.com/utm/__utm.js",
                "http://example.com/Knewskage.gif",
                "http://example.com/Lline.gif",
                "http://example.com/Sline.gif",
                "http://example.com/korean01.gif",
                "http://example.com/korean02.gif",
                "http://example.com/english01.gif",
                "http://example.com/english02.gif",
                "http://example.com/Tongsinkage.gif",
            },
            inline_urls,
        )
        self.assertEqual({"http://example.com/index-k.htm", "http://example.com/index-e.htm"}, linked_urls)

Example #6

0

Show file

File: javascript_test.py Project: Super-Rad/wpull

    def test_javascript_heavy_inline_monstrosity(self):
        '''Scrape the twitchplayspokemonfirered.html sample as JavaScript.

        The sample is read starting at byte offset 0x147, and two specific
        URLs must show up in the inline and linked results respectively.
        '''
        js_scraper = JavaScriptScraper()
        js_request = Request('http://example.com/test.js')
        js_response = Response(200, 'OK')
        js_response.body = Body()

        sample_path = os.path.join(
            ROOT_PATH, 'testing', 'samples', 'twitchplayspokemonfirered.html')

        with wpull.util.reset_file_offset(js_response.body), \
                open(sample_path, 'rb') as sample_file:
            # Skip the first 0x147 bytes of the sample before copying.
            sample_file.seek(0x147)
            shutil.copyfileobj(sample_file, js_response.body)

        result = js_scraper.scrape(js_request, js_response)
        inline, linked = result.inline_links, result.linked_links

        expected_inline = (
            'http://cdn.bulbagarden.net/upload/archive/a/a4/'
            '20090718115357%21195Quagsire.png'
        )
        expected_linked = (
            'http://www.google.com/url?q=http%3A%2F%2Fwww.reddit.com%2F'
            'user%2FGoldenSandslash15&sa=D&sntz=1&'
            'usg=AFQjCNElFBxZYdNm5mWoRSncf5tbdIJQ-A'
        )
        self.assertIn(expected_inline, inline)
        self.assertIn(expected_linked, linked)

        # Dump every scraped URL, one per line.
        print('\n'.join(inline))
        print('\n'.join(linked))

Example #7

0

Show file

File: html_test.py Project: fakegit/ludios_wpull

    def test_html_soup(self):
        '''Scrape the soup.html sample with a ``Refresh: yes`` header set.'''
        walker = ElementWalker(css_scraper=CSSScraper(),
                               javascript_scraper=JavaScriptScraper())
        html_scraper = HTMLScraper(HTMLParser(), walker)
        page_request = Request('http://example.com/')
        page_response = Response(200, '')
        page_response.body = Body()
        page_response.fields['Refresh'] = 'yes'

        sample_path = os.path.join(ROOT_PATH, 'testing', 'samples', 'soup.html')
        with wpull.util.reset_file_offset(page_response.body), \
                open(sample_path, 'rb') as sample_file:
            shutil.copyfileobj(sample_file, page_response.body)

        result = html_scraper.scrape(page_request, page_response)
        inline, linked = result.inline_links, result.linked_links

        expected_linked = {
            'http://example.com/BLOG',
            'http://example.com/web ring/Join.htm',
        }
        self.assertEqual({'http://example.com/ABOUTM~1.JPG'}, inline)
        self.assertEqual(expected_linked, linked)

Example #8

0

Show file

    def test_javascript_heavy_inline_monstrosity(self):
        '''Run the JavaScript scraper over a large sample document.

        Only the part of twitchplayspokemonfirered.html from byte 0x147
        onwards is fed to the scraper.
        '''
        scraper = JavaScriptScraper()
        request = Request('http://example.com/test.js')
        response = Response(200, 'OK')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            sample_path = os.path.join(
                ROOT_PATH, 'testing', 'samples',
                'twitchplayspokemonfirered.html',
            )
            with open(sample_path, 'rb') as source:
                source.seek(0x147)
                shutil.copyfileobj(source, response.body)

        outcome = scraper.scrape(request, response)
        inline_found = outcome.inline_links
        linked_found = outcome.linked_links

        quagsire_url = ('http://cdn.bulbagarden.net/upload/archive/a/a4/'
                        '20090718115357%21195Quagsire.png')
        self.assertIn(quagsire_url, inline_found)

        redirect_url = (
            'http://www.google.com/url?q=http%3A%2F%2Fwww.reddit.com%2F'
            'user%2FGoldenSandslash15&sa=D&sntz=1&'
            'usg=AFQjCNElFBxZYdNm5mWoRSncf5tbdIJQ-A')
        self.assertIn(redirect_url, linked_found)

        # Print all scraped URLs, one per line.
        print('\n'.join(inline_found))
        print('\n'.join(linked_found))

Example #9

0

Show file

    def test_sitemap_scraper_xml_index(self):
        '''A ``<sitemapindex>`` yields its ``<loc>`` as the only link.'''
        index_scraper = SitemapScraper(self.get_html_parser())
        index_request = Request('http://example.com/sitemap.xml')
        index_response = Response(200, 'OK')
        index_response.body = Body()

        with wpull.util.reset_file_offset(index_response.body):
            index_response.body.write(b'''<?xml version="1.0" encoding="UTF-8"?>
                <sitemapindex
                xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
                   <sitemap>
                      <loc>http://www.example.com/sitemap1.xml.gz</loc>
                      <lastmod>2004-10-01T18:23:17+00:00</lastmod>
                   </sitemap>
                </sitemapindex>
            ''')

        result = index_scraper.scrape(index_request, index_response)
        inline, linked = result.inline_links, result.linked_links

        # The nested sitemap location is the sole expected linked URL.
        self.assertEqual({'http://www.example.com/sitemap1.xml.gz'}, linked)
        self.assertFalse(inline)

Example #10

0

Show file

File: html_test.py Project: fakegit/ludios_wpull

    def test_html_scraper_links_base_href(self):
        '''Scrape basehref.html and verify encoding plus both URL sets.'''
        walker = ElementWalker(css_scraper=CSSScraper(),
                               javascript_scraper=JavaScriptScraper())
        html_scraper = HTMLScraper(HTMLParser(), walker)
        page_request = Request('http://example.com/')
        page_response = Response(200, 'OK')
        page_response.body = Body()

        sample_path = os.path.join(
            ROOT_PATH, 'testing', 'samples', 'basehref.html')
        with wpull.util.reset_file_offset(page_response.body), \
                open(sample_path, 'rb') as sample_file:
            shutil.copyfileobj(sample_file, page_response.body)

        result = html_scraper.scrape(page_request, page_response)
        inline, linked = result.inline_links, result.linked_links

        self.assertEqual('utf-8', result.encoding)

        expected_inline = {
            'http://cdn.example.com/stylesheet1.css',
            'http://www.example.com/stylesheet2.css',
            'http://example.com/a/stylesheet3.css',
            'http://example.com/a/dir/image1.png',
            'http://example.com/dir/image2.png',
            'http://example.net/image3.png',
            'http://example.com/dir/image4.png',
        }
        self.assertEqual(expected_inline, inline)
        self.assertEqual({'http://example.com/a/'}, linked)

Example #11

0

Show file

File: html_test.py Project: fakegit/ludios_wpull

    def test_rss_as_html(self):
        '''Feed the rss.xml sample (RSS content type) to the HTML scraper.'''
        walker = ElementWalker(css_scraper=CSSScraper(),
                               javascript_scraper=JavaScriptScraper())
        html_scraper = HTMLScraper(HTMLParser(), walker)
        feed_request = Request('http://example.com/')
        feed_response = Response(200, '')
        feed_response.body = Body()
        feed_response.fields['content-type'] = 'application/rss+xml'

        sample_path = os.path.join(ROOT_PATH, 'testing', 'samples', 'rss.xml')
        with wpull.util.reset_file_offset(feed_response.body), \
                open(sample_path, 'rb') as sample_file:
            shutil.copyfileobj(sample_file, feed_response.body)

        result = html_scraper.scrape(feed_request, feed_response)

        # The scraper must return a truthy result for this document.
        self.assertTrue(result)
        inline, linked = result.inline_links, result.linked_links
        self.assertFalse(inline)

        expected_linked = {
            'http://www.someexamplerssdomain.com/main.html',
            'http://www.wikipedia.org/'
        }
        self.assertEqual(expected_linked, linked)

Example #12

0

Show file

File: sitemap_test.py Project: charygao/wpull

    def test_sitemap_scraper_xml_index(self):
        """A sitemap index document yields its ``<loc>`` as the only link."""
        index_scraper = SitemapScraper(self.get_html_parser())
        index_request = Request("http://example.com/sitemap.xml")
        index_response = Response(200, "OK")
        index_response.body = Body()

        with wpull.util.reset_file_offset(index_response.body):
            index_response.body.write(
                b"""<?xml version="1.0" encoding="UTF-8"?>
                <sitemapindex
                xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
                   <sitemap>
                      <loc>http://www.example.com/sitemap1.xml.gz</loc>
                      <lastmod>2004-10-01T18:23:17+00:00</lastmod>
                   </sitemap>
                </sitemapindex>
            """
            )

        result = index_scraper.scrape(index_request, index_response)
        inline, linked = result.inline_links, result.linked_links

        expected_linked = {"http://www.example.com/sitemap1.xml.gz"}
        self.assertEqual(expected_linked, linked)
        self.assertFalse(inline)

Example #13

0

Show file

File: sitemap_test.py Project: charygao/wpull

    def test_sitemap_scraper_xml(self):
        """A ``<urlset>`` sitemap yields its ``<loc>`` as the only link."""
        sitemap_scraper = SitemapScraper(self.get_html_parser())
        sitemap_request = Request("http://example.com/sitemap.xml")
        sitemap_response = Response(200, "OK")
        sitemap_response.body = Body()

        with wpull.util.reset_file_offset(sitemap_response.body):
            sitemap_response.body.write(
                b"""<?xml version="1.0" encoding="UTF-8"?>
                <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
                   <url>
                      <loc>http://www.example.com/</loc>
                      <lastmod>2005-01-01</lastmod>
                      <changefreq>monthly</changefreq>
                      <priority>0.8</priority>
                   </url>
                </urlset>
            """
            )

        result = sitemap_scraper.scrape(sitemap_request, sitemap_response)
        inline, linked = result.inline_links, result.linked_links

        expected_linked = {"http://www.example.com/"}
        self.assertEqual(expected_linked, linked)
        self.assertFalse(inline)

Example #14

0

Show file

 def response_callback(request):
     '''Reply to a robots.txt request with a disallow-all body.'''
     request.prepare_for_send()
     fetched_url = request.url_info.url
     self.assertTrue(fetched_url.endswith('robots.txt'))

     reply = Response(200, 'OK')
     reply.request = request
     reply.body = io.StringIO('User-agent:*\nDisallow: /\n')

     # Set done_value on the checker's session object before returning.
     checker.web_client.session_obj.done_value = True
     return reply

Example #15

0

Show file

        def response_callback_3(request):
            '''Reply to the example.net robots.txt request with allow-all.'''
            request.prepare_for_send()
            fetched_url = request.url_info.url
            self.assertEqual('http://www.example.net/robots.txt', fetched_url)

            reply = Response(200, 'OK')
            reply.request = request
            reply.body = io.StringIO('User-agent:*\nAllow: /\n')

            # Set done_value on the checker's session object before returning.
            checker.web_client.session_obj.done_value = True
            return reply

Example #16

0

Show file

File: html_test.py Project: charygao/wpull

    def test_html_scraper_reject_type(self):
        """The HTML scraper returns a falsy result when asked for CSS links."""
        walker = ElementWalker(css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
        html_scraper = HTMLScraper(self.get_html_parser(), walker)
        page_request = Request("http://example.com/")
        page_response = Response(200, "OK")
        page_response.body = Body()

        sample_path = os.path.join(ROOT_PATH, "testing", "samples", "many_urls.html")
        with wpull.util.reset_file_offset(page_response.body), open(sample_path, "rb") as sample_file:
            shutil.copyfileobj(sample_file, page_response.body)

        self.assertFalse(html_scraper.scrape(page_request, page_response, link_type=LinkType.css))

Example #17

0

Show file

File: recorder_test.py Project: Super-Rad/wpull

    def test_warc_max_size_and_append(self):
        '''Record one exchange with ``max_size=1`` and ``appending=True``.

        Two empty WARC files exist beforehand. Afterwards they must still
        be empty, while the -00002, -00003 and -meta files must exist and
        be non-empty.
        '''
        file_prefix = 'asdf'
        preexisting = ('asdf-00000.warc', 'asdf-00001.warc')
        newly_written = (
            'asdf-00002.warc', 'asdf-00003.warc', 'asdf-meta.warc')

        # Create the empty files that are already present before recording.
        for filename in preexisting:
            with open(filename, 'w'):
                pass

        recorder_params = WARCRecorderParams(
            compress=False, max_size=1, appending=True)
        warc_recorder = WARCRecorder(file_prefix, params=recorder_params)

        request = HTTPRequest('http://example.com/1')
        request.address = ('0.0.0.0', 80)
        response = HTTPResponse(200, 'OK')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            response.body.write(b'BLAH')

        # Drive one full request/response cycle through a recorder session.
        session = warc_recorder.new_http_recorder_session()
        session.begin_request(request)
        session.request_data(request.to_bytes())
        session.end_request(request)
        session.begin_response(response)
        session.response_data(response.to_bytes())
        session.response_data(response.body.content())
        session.end_response(response)
        session.close()

        warc_recorder.close()

        for filename in preexisting + newly_written:
            self.assertTrue(os.path.exists(filename))

        for filename in preexisting:
            self.assertEqual(0, os.path.getsize(filename))

        for filename in newly_written:
            self.assertNotEqual(0, os.path.getsize(filename))

Example #18

0

Show file

File: sitemap_test.py Project: charygao/wpull

    def test_sitemap_scraper_robots(self):
        """A ``Sitemap:`` line in robots.txt is reported as a linked URL."""
        robots_scraper = SitemapScraper(self.get_html_parser())
        robots_request = Request("http://example.com/robots.txt")
        robots_response = Response(200, "OK")
        robots_response.body = Body()

        with wpull.util.reset_file_offset(robots_response.body):
            robots_response.body.write(b"Sitemap: http://example.com/sitemap00.xml")

        result = robots_scraper.scrape(robots_request, robots_response)
        inline, linked = result.inline_links, result.linked_links

        expected_linked = {"http://example.com/sitemap00.xml"}
        self.assertEqual(expected_linked, linked)
        self.assertFalse(inline)

Example #19

0

Show file

File: css_test.py Project: Super-Rad/wpull

    def test_css_scraper_reject_type(self):
        '''The CSS scraper returns a falsy result when asked for HTML links.'''
        css_scraper = CSSScraper()
        css_request = Request('http://example.com/styles.css')
        css_response = Response(200, 'OK')
        css_response.body = Body()

        sample_path = os.path.join(
            ROOT_PATH, 'testing', 'samples', 'styles.css')
        with wpull.util.reset_file_offset(css_response.body), \
                open(sample_path, 'rb') as sample_file:
            shutil.copyfileobj(sample_file, css_response.body)

        self.assertFalse(
            css_scraper.scrape(
                css_request, css_response, link_type=LinkType.html))

Example #20

0

Show file

File: sitemap_test.py Project: charygao/wpull

    def test_sitemap_scraper_invalid_robots(self):
        """A junk robots.txt body produces no inline and no linked URLs."""
        robots_scraper = SitemapScraper(self.get_html_parser())
        robots_request = Request("http://example.com/robots.txt")
        robots_response = Response(200, "OK")
        robots_response.body = Body()

        junk = b"dsfju3wrji kjasSItemapsdmjfkl wekie;er :Ads fkj3m /Dk"
        with wpull.util.reset_file_offset(robots_response.body):
            robots_response.body.write(junk)

        result = robots_scraper.scrape(robots_request, robots_response)
        inline, linked = result.inline_links, result.linked_links

        self.assertFalse(linked)
        self.assertFalse(inline)

Example #21

0

Show file

File: recorder_test.py Project: Super-Rad/wpull

    def test_warc_recorder_rollback(self):
        '''A record that fails mid-write must not change the WARC file size.

        The request record is swapped for one whose iteration raises
        OSError; after ``end_request`` fails, the file size must match the
        size measured beforehand and no ``-wpullinc`` file may remain.
        '''
        warc_prefix = 'asdf'
        warc_filename = 'asdf.warc'

        # Seed the WARC file with ten bytes of existing content.
        with open(warc_filename, 'wb') as seeded_file:
            seeded_file.write(b'a' * 10)

        recorder_params = WARCRecorderParams(compress=False)
        warc_recorder = WARCRecorder(warc_prefix, params=recorder_params)

        request = HTTPRequest('http://example.com/')
        request.address = ('0.0.0.0', 80)
        response = HTTPResponse(200, 'OK')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            response.body.write(b'KITTEH DOGE')

        session = warc_recorder.new_http_recorder_session()
        session.begin_request(request)
        session.request_data(request.to_bytes())

        class BadRecord(WARCRecord):
            '''Record that yields 1000 chunks and then raises OSError.'''

            def __init__(self, original_record):
                super().__init__()
                # Reuse the block file and fields of the wrapped record.
                self.block_file = original_record.block_file
                self.fields = original_record.fields

            def __iter__(self):
                for _ in range(1000):
                    yield b"where's my elephant?"
                raise OSError('Oops')

        session._request_record = BadRecord(session._request_record)
        original_offset = os.path.getsize(warc_filename)

        with self.assertRaises((OSError, IOError)):
            session.end_request(request)

        self.assertEqual(os.path.getsize(warc_filename), original_offset)
        leftover_path = warc_filename + '-wpullinc'
        self.assertFalse(os.path.exists(leftover_path))

        _logger.debug('original offset {0}'.format(original_offset))

Example #22

0

Show file

    def test_javascript_reject_type(self):
        '''The JavaScript scraper returns a falsy result for CSS link type.'''
        js_scraper = JavaScriptScraper()
        js_request = Request('http://example.com/script.js')
        js_response = Response(200, 'OK')
        js_response.body = Body()

        sample_path = os.path.join(
            ROOT_PATH, 'testing', 'samples', 'script.js')
        with wpull.util.reset_file_offset(js_response.body), \
                open(sample_path, 'rb') as sample_file:
            shutil.copyfileobj(sample_file, js_response.body)

        self.assertFalse(
            js_scraper.scrape(js_request, js_response, link_type=LinkType.css))

Example #23

0

Show file

File: html_test.py Project: charygao/wpull

    def test_html_encoding_lxml_name_mismatch(self):
        """It should accept encoding names with underscore.

        The response declares ``charset=EUC_KR`` and the scrape result is
        expected to report ``euc_kr`` as its encoding.
        """
        walker = ElementWalker(css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
        html_scraper = HTMLScraper(self.get_html_parser(), walker)
        page_request = Request("http://example.com/")
        page_response = Response(200, "")
        page_response.body = Body()
        page_response.fields["content-type"] = "text/html; charset=EUC_KR"

        encoded_text = "힖".encode("euc_kr")
        with wpull.util.reset_file_offset(page_response.body):
            page_response.body.write(encoded_text)

        info = html_scraper.scrape(page_request, page_response)

        self.assertTrue(info)
        self.assertEqual("euc_kr", info["encoding"])

Example #24

0

Show file

File: html_test.py Project: charygao/wpull

    def test_html_serious_bad_encoding(self):
        """Scraping xkcd_1_evil.html with a utf8 override gives a truthy result."""
        walker = ElementWalker(css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
        html_scraper = HTMLScraper(self.get_html_parser(), walker, encoding_override="utf8")
        page_request = Request("http://example.com/")
        page_response = Response(200, "")
        page_response.body = Body()
        page_response.fields["content-type"] = "text/html; charset=utf8"

        sample_path = os.path.join(ROOT_PATH, "testing", "samples", "xkcd_1_evil.html")
        with wpull.util.reset_file_offset(page_response.body), open(sample_path, "rb") as sample_file:
            shutil.copyfileobj(sample_file, page_response.body)

        self.assertTrue(html_scraper.scrape(page_request, page_response))

Example #25

0

Show file

    def test_sitemap_scraper_invalid_robots(self):
        '''A junk robots.txt body produces no inline and no linked URLs.'''
        robots_scraper = SitemapScraper(self.get_html_parser())
        robots_request = Request('http://example.com/robots.txt')
        robots_response = Response(200, 'OK')
        robots_response.body = Body()

        junk = b'dsfju3wrji kjasSItemapsdmjfkl wekie;er :Ads fkj3m /Dk'
        with wpull.util.reset_file_offset(robots_response.body):
            robots_response.body.write(junk)

        result = robots_scraper.scrape(robots_request, robots_response)
        inline, linked = result.inline_links, result.linked_links

        self.assertFalse(linked)
        self.assertFalse(inline)

Example #26

0

Show file

File: html_test.py Project: fakegit/ludios_wpull

    def test_bad_xml(self):
        '''Scraping foxstripcomics_bad_xml.html must complete without error.'''
        walker = ElementWalker(css_scraper=CSSScraper(),
                               javascript_scraper=JavaScriptScraper())
        html_scraper = HTMLScraper(HTMLParser(), walker)
        page_request = Request('http://example.com/')
        page_response = Response(200, 'OK')
        page_response.body = Body()

        sample_path = os.path.join(
            ROOT_PATH, 'testing', 'samples', 'foxstripcomics_bad_xml.html')
        with wpull.util.reset_file_offset(page_response.body), \
                open(sample_path, 'rb') as sample_file:
            shutil.copyfileobj(sample_file, page_response.body)

        # No assertion on the result: the call itself is the test.
        html_scraper.scrape(
            page_request, page_response, link_type=LinkType.html)

Example #27

0

Show file

File: recorder_test.py Project: fakegit/ludios_wpull

    def test_warc_max_size_and_append(self):
        '''Check which WARC files get written when appending with max_size=1.

        The -00000 and -00001 files are created empty up front and must
        stay empty; -00002, -00003 and -meta must exist and hold data.
        '''
        file_prefix = 'asdf'
        empty_names = ['asdf-00000.warc', 'asdf-00001.warc']
        filled_names = ['asdf-00002.warc', 'asdf-00003.warc',
                        'asdf-meta.warc']

        for name in empty_names:
            with open(name, 'w'):
                pass

        params = WARCRecorderParams(compress=False,
                                    max_size=1,
                                    appending=True)
        warc_recorder = WARCRecorder(file_prefix, params=params)

        request = HTTPRequest('http://example.com/1')
        request.address = ('0.0.0.0', 80)
        response = HTTPResponse(200, 'OK')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            response.body.write(b'BLAH')

        # One complete request/response exchange through a session.
        session = warc_recorder.new_http_recorder_session()
        session.begin_request(request)
        session.request_data(request.to_bytes())
        session.end_request(request)
        session.begin_response(response)
        session.response_data(response.to_bytes())
        session.response_data(response.body.content())
        session.end_response(response)
        session.close()

        warc_recorder.close()

        for name in empty_names + filled_names:
            self.assertTrue(os.path.exists(name))

        for name in empty_names:
            self.assertEqual(0, os.path.getsize(name))

        for name in filled_names:
            self.assertNotEqual(0, os.path.getsize(name))

Example #28

0

Show file

File: html_test.py Project: fakegit/ludios_wpull

    def test_html_encoding_lxml_name_mismatch(self):
        '''It should accept encoding names with underscore.

        The response declares ``charset=EUC_KR`` and the scrape result is
        expected to report ``euc_kr`` as its encoding.
        '''
        walker = ElementWalker(css_scraper=CSSScraper(),
                               javascript_scraper=JavaScriptScraper())
        html_scraper = HTMLScraper(HTMLParser(), walker)
        page_request = Request('http://example.com/')
        page_response = Response(200, '')
        page_response.body = Body()
        page_response.fields['content-type'] = 'text/html; charset=EUC_KR'

        encoded_text = '힖'.encode('euc_kr')
        with wpull.util.reset_file_offset(page_response.body):
            page_response.body.write(encoded_text)

        info = html_scraper.scrape(page_request, page_response)

        self.assertTrue(info)
        self.assertEqual('euc_kr', info['encoding'])

Example #29

0

Show file

    def test_sitemap_scraper_robots(self):
        '''A ``Sitemap:`` line in robots.txt is reported as a linked URL.'''
        robots_scraper = SitemapScraper(self.get_html_parser())
        robots_request = Request('http://example.com/robots.txt')
        robots_response = Response(200, 'OK')
        robots_response.body = Body()

        with wpull.util.reset_file_offset(robots_response.body):
            robots_response.body.write(
                b'Sitemap: http://example.com/sitemap00.xml')

        result = robots_scraper.scrape(robots_request, robots_response)
        inline, linked = result.inline_links, result.linked_links

        expected_linked = {'http://example.com/sitemap00.xml'}
        self.assertEqual(expected_linked, linked)
        self.assertFalse(inline)

Example #30

0

Show file

File: recorder_test.py Project: fakegit/ludios_wpull

    def test_warc_recorder_rollback(self):
        '''Ending a request with a failing record must not grow the file.

        ``end_request`` is expected to raise, the WARC file must keep the
        size it had just before, and no ``-wpullinc`` file may be left.
        '''
        warc_filename = 'asdf.warc'
        warc_prefix = 'asdf'

        # Start from a WARC file that already holds ten bytes.
        with open(warc_filename, 'wb') as out_file:
            out_file.write(b'a' * 10)

        params = WARCRecorderParams(compress=False, )
        warc_recorder = WARCRecorder(warc_prefix, params=params)

        request = HTTPRequest('http://example.com/')
        request.address = ('0.0.0.0', 80)
        response = HTTPResponse(200, 'OK')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            response.body.write(b'KITTEH DOGE')

        session = warc_recorder.new_http_recorder_session()
        session.begin_request(request)
        session.request_data(request.to_bytes())

        class BadRecord(WARCRecord):
            '''Record whose iteration raises OSError after 1000 chunks.'''

            def __init__(self, original_record):
                super().__init__()
                # Borrow the block file and fields from the real record.
                self.block_file = original_record.block_file
                self.fields = original_record.fields

            def __iter__(self):
                for _ in range(1000):
                    yield b"where's my elephant?"
                raise OSError('Oops')

        session._request_record = BadRecord(session._request_record)
        size_before = os.path.getsize(warc_filename)

        with self.assertRaises((OSError, IOError)):
            session.end_request(request)

        size_after = os.path.getsize(warc_filename)
        self.assertEqual(size_after, size_before)
        self.assertFalse(os.path.exists(warc_filename + '-wpullinc'))

        _logger.debug('original offset {0}'.format(size_before))

Example #31

0

Show file

File: html_test.py Project: charygao/wpull

    def test_html_not_quite_charset(self):
        """Scrape videogame_top.htm and check one inline and one linked URL."""
        walker = ElementWalker(css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
        html_scraper = HTMLScraper(self.get_html_parser(), walker)
        page_request = Request("http://example.com/")
        page_response = Response(200, "")
        page_response.body = Body()

        sample_path = os.path.join(ROOT_PATH, "testing", "samples", "videogame_top.htm")
        with wpull.util.reset_file_offset(page_response.body), open(sample_path, "rb") as sample_file:
            shutil.copyfileobj(sample_file, page_response.body)

        result = html_scraper.scrape(page_request, page_response)
        inline, linked = result.inline_links, result.linked_links

        self.assertIn("http://example.com/copyright_2001_2006_rtype.gif", inline)
        self.assertIn("http://www.geocities.jp/gamehouse_grindcrusher/", linked)

Example #32

0

Show file

File: html_test.py Project: charygao/wpull

    def test_html_garbage(self):
        """Scrape binary junk labelled ``text/html`` and expect a truthy result."""
        walker = ElementWalker(
            css_scraper=CSSScraper(),
            javascript_scraper=JavaScriptScraper(),
        )
        html_scraper = HTMLScraper(self.get_html_parser(), walker)
        request = Request("http://example.com/")
        response = Response(200, "")
        response.body = Body()
        response.fields["content-type"] = "text/html"

        garbage = (
            b"\x01\x00\x01\x00l~Z\xff\x0f`y\x80\x00p<\x7f"
            b"\xffndo\xff\xff-\x83{d\xec</\xfe\x80\x00\xb4Bo"
            b"\x7f\xff\xff\xffV\xc1\xff\x7f\xff7"
        )

        with wpull.util.reset_file_offset(response.body):
            response.body.write(garbage)

        result = html_scraper.scrape(request, response)

        self.assertTrue(result)

Example #33

0

Show file

File: html_test.py Project: charygao/wpull

    def test_xhtml_invalid(self):
        """Scrape the ``xhtml_invalid.html`` sample and check the exact link sets."""
        walker = ElementWalker(
            css_scraper=CSSScraper(),
            javascript_scraper=JavaScriptScraper(),
        )
        html_scraper = HTMLScraper(self.get_html_parser(), walker)
        request = Request("http://example.com/")
        response = Response(200, "")
        response.body = Body()

        sample_path = os.path.join(ROOT_PATH, "testing", "samples", "xhtml_invalid.html")

        with wpull.util.reset_file_offset(response.body):
            with open(sample_path, "rb") as sample_file:
                shutil.copyfileobj(sample_file, response.body)

        result = html_scraper.scrape(request, response)
        inline_urls = result.inline_links
        linked_urls = result.linked_links

        expected_inline = {
            "http://example.com/image.png",
            "http://example.com/script.js",
        }
        expected_linked = {"http://example.com/link"}

        self.assertEqual(expected_inline, inline_urls)
        self.assertEqual(expected_linked, linked_urls)

Example #34

0

Show file

File: html_test.py Project: fakegit/ludios_wpull

    def test_html_garbage(self):
        """Scrape binary junk labelled ``text/html`` and expect a truthy result."""
        css = CSSScraper()
        javascript = JavaScriptScraper()
        walker = ElementWalker(css_scraper=css, javascript_scraper=javascript)
        html_scraper = HTMLScraper(HTMLParser(), walker)
        request = Request('http://example.com/')
        response = Response(200, '')
        response.body = Body()
        response.fields['content-type'] = 'text/html'

        garbage = (
            b'\x01\x00\x01\x00l~Z\xff\x0f`y\x80\x00p<\x7f'
            b'\xffndo\xff\xff-\x83{d\xec</\xfe\x80\x00\xb4Bo'
            b'\x7f\xff\xff\xffV\xc1\xff\x7f\xff7'
        )

        with wpull.util.reset_file_offset(response.body):
            response.body.write(garbage)

        result = html_scraper.scrape(request, response)

        self.assertTrue(result)

Example #35

0

Show file

File: html_test.py Project: fakegit/ludios_wpull

    def test_html_scraper_reject_type(self):
        """Scrape ``many_urls.html`` with ``link_type=LinkType.css``.

        The HTML scraper is expected to return a falsy result for this
        link type.
        """
        css = CSSScraper()
        javascript = JavaScriptScraper()
        walker = ElementWalker(css_scraper=css, javascript_scraper=javascript)
        html_scraper = HTMLScraper(HTMLParser(), walker)
        request = Request('http://example.com/')
        response = Response(200, 'OK')
        response.body = Body()

        sample_path = os.path.join(
            ROOT_PATH, 'testing', 'samples', 'many_urls.html')

        with wpull.util.reset_file_offset(response.body):
            with open(sample_path, 'rb') as sample_file:
                shutil.copyfileobj(sample_file, response.body)

        result = html_scraper.scrape(
            request, response, link_type=LinkType.css)

        self.assertFalse(result)

Example #36

0

Show file

File: recorder_test.py Project: Super-Rad/wpull

    def test_warc_recorder_journal(self):
        """Check the ``-wpullinc`` file exists during a write and not after.

        The request record is replaced by one that asserts, when iterated,
        that ``asdf.warc-wpullinc`` exists.  Once ``end_request`` returns,
        that file must no longer exist.
        """
        warc_prefix = 'asdf'
        warc_filename = 'asdf.warc'

        recorder_params = WARCRecorderParams(compress=False)
        warc_recorder = WARCRecorder(warc_prefix, params=recorder_params)

        request = HTTPRequest('http://example.com/')
        request.address = ('0.0.0.0', 80)
        response = HTTPResponse(200, 'OK')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            response.body.write(b'KITTEH DOGE')

        session = warc_recorder.new_http_recorder_session()
        session.begin_request(request)
        session.request_data(request.to_bytes())

        # Inside MockRecord, ``self`` is the record, so the test case is
        # captured under another name for the assertion.
        test_case = self

        class MockRecord(WARCRecord):
            def __init__(self, original_record):
                super().__init__()
                self.block_file = original_record.block_file
                self.fields = original_record.fields

            def __iter__(self):
                print(list(os.walk('.')))
                journal_exists = os.path.exists(warc_filename + '-wpullinc')
                test_case.assertTrue(journal_exists)

                for _ in range(1000):
                    yield b"where's my elephant?"

        session._request_record = MockRecord(session._request_record)

        session.end_request(request)

        self.assertFalse(os.path.exists(warc_filename + '-wpullinc'))

Example #37

0

Show file

File: html_test.py Project: charygao/wpull

    def test_html_soup(self):
        """Scrape the ``soup.html`` sample and check the exact link sets.

        The response also carries a ``Refresh`` field set to ``"yes"``.
        """
        walker = ElementWalker(
            css_scraper=CSSScraper(),
            javascript_scraper=JavaScriptScraper(),
        )
        html_scraper = HTMLScraper(self.get_html_parser(), walker)
        request = Request("http://example.com/")
        response = Response(200, "")
        response.body = Body()
        response.fields["Refresh"] = "yes"

        sample_path = os.path.join(ROOT_PATH, "testing", "samples", "soup.html")

        with wpull.util.reset_file_offset(response.body):
            with open(sample_path, "rb") as sample_file:
                shutil.copyfileobj(sample_file, response.body)

        result = html_scraper.scrape(request, response)
        inline_urls = result.inline_links
        linked_urls = result.linked_links

        expected_inline = {"http://example.com/ABOUTM~1.JPG"}
        expected_linked = {
            "http://example.com/BLOG",
            "http://example.com/web ring/Join.htm",
        }

        self.assertEqual(expected_inline, inline_urls)
        self.assertEqual(expected_linked, linked_urls)

Example #38

0

Show file

File: html_test.py Project: fakegit/ludios_wpull

    def test_html_serious_bad_encoding(self):
        """Scrape ``xkcd_1_evil.html`` with ``encoding_override='utf8'``.

        The response declares ``charset=utf8``; the scraper is expected to
        return a truthy result.
        """
        css = CSSScraper()
        javascript = JavaScriptScraper()
        walker = ElementWalker(css_scraper=css, javascript_scraper=javascript)
        html_scraper = HTMLScraper(
            HTMLParser(), walker, encoding_override='utf8')
        request = Request('http://example.com/')
        response = Response(200, '')
        response.body = Body()
        response.fields['content-type'] = 'text/html; charset=utf8'

        sample_path = os.path.join(
            ROOT_PATH, 'testing', 'samples', 'xkcd_1_evil.html')

        with wpull.util.reset_file_offset(response.body):
            with open(sample_path, 'rb') as sample_file:
                shutil.copyfileobj(sample_file, response.body)

        result = html_scraper.scrape(request, response)

        self.assertTrue(result)

Example #39

0

Show file

File: css_test.py Project: fakegit/ludios_wpull

    def test_css_scraper_mojibake(self):
        """Scrape the ``mojibake.css`` sample and check the extracted URLs.

        Exactly one inline URL with a non-ASCII file name is expected and
        no linked URLs.
        """
        css_scraper = CSSScraper()
        request = Request('http://example.com/styles.css')
        response = Response(200, 'OK')
        response.body = Body()

        sample_path = os.path.join(
            ROOT_PATH, 'testing', 'samples', 'mojibake.css')

        with wpull.util.reset_file_offset(response.body):
            with open(sample_path, 'rb') as sample_file:
                shutil.copyfileobj(sample_file, response.body)

        result = css_scraper.scrape(request, response)
        inline_urls = result.inline_links
        linked_urls = result.linked_links

        expected_inline = {'http://example.com/文字化け.png'}

        self.assertEqual(expected_inline, inline_urls)
        self.assertFalse(linked_urls)

Example #40

0

Show file

File: html_test.py Project: charygao/wpull

    def test_rss_as_html(self):
        """Scrape the ``rss.xml`` sample served as ``application/rss+xml``.

        The HTML scraper is expected to return a truthy result with no
        inline URLs and exactly two linked URLs.
        """
        walker = ElementWalker(
            css_scraper=CSSScraper(),
            javascript_scraper=JavaScriptScraper(),
        )
        html_scraper = HTMLScraper(self.get_html_parser(), walker)
        request = Request("http://example.com/")
        response = Response(200, "")
        response.body = Body()
        response.fields["content-type"] = "application/rss+xml"

        sample_path = os.path.join(ROOT_PATH, "testing", "samples", "rss.xml")

        with wpull.util.reset_file_offset(response.body):
            with open(sample_path, "rb") as sample_file:
                shutil.copyfileobj(sample_file, response.body)

        result = html_scraper.scrape(request, response)

        self.assertTrue(result)

        inline_urls = result.inline_links
        linked_urls = result.linked_links
        expected_linked = {
            "http://www.someexamplerssdomain.com/main.html",
            "http://www.wikipedia.org/",
        }

        self.assertFalse(inline_urls)
        self.assertEqual(expected_linked, linked_urls)

Example #41

0

Show file

File: javascript_test.py Project: Super-Rad/wpull

    def test_javascript_scraper(self):
        """Scrape the ``script.js`` sample and check the exact link sets.

        One expected linked URL depends on the interpreter version: below
        Python 3.5 the ``..`` path segment is expected to be kept.
        """
        js_scraper = JavaScriptScraper()
        request = Request('http://example.com/script.js')
        response = Response(200, 'OK')
        response.body = Body()

        sample_path = os.path.join(
            ROOT_PATH, 'testing', 'samples', 'script.js'
        )

        with wpull.util.reset_file_offset(response.body):
            with open(sample_path, 'rb') as sample_file:
                shutil.copyfileobj(sample_file, response.body)

        result = js_scraper.scrape(request, response)
        inline_urls = result.inline_links
        linked_urls = result.linked_links

        if sys.version_info < (3, 5):
            relative_dir_url = \
                'http://example.com/../relative_dir_script_variable'
        else:
            relative_dir_url = \
                'http://example.com/relative_dir_script_variable'

        expected_inline = {
            'http://example.com/script_variable.png',
            'http://example.com/dragonquery.js',
        }
        expected_linked = {
            'http://example.com/document_write.html',
            'http://example.com/http_document_write.html',
            'http://example.com/http_document_write2.html',
            'http://example.com/http document write.html',
            'http://example.com/script_variable.html',
            'http://example.com/http_script_variable.html',
            'https://example.com/https_script_variable.html',
            'ftp://example.com/ftp_script_variable.html',
            'http://example.com/end_dir_script_variable/',
            'http://example.com/start_dir_script_variable',
            relative_dir_url,
            'http://example.com/script_json.html',
            'http://example.com/http_script_json.html?a=b',
        }

        self.assertEqual(expected_inline, inline_urls)
        self.assertEqual(expected_linked, linked_urls)

Example #42

0

Show file

File: html_test.py Project: charygao/wpull

    def test_html_krokozyabry(self):
        """Scrape the ``krokozyabry.html`` sample declared as KOI8-R.

        The result is expected to report ``koi8-r`` as its encoding, no
        inline URLs, and a single linked URL containing Cyrillic text.
        """
        walker = ElementWalker(
            css_scraper=CSSScraper(),
            javascript_scraper=JavaScriptScraper(),
        )
        html_scraper = HTMLScraper(self.get_html_parser(), walker)
        request = Request("http://example.com/")
        response = Response(200, "")
        response.body = Body()
        response.fields["content-type"] = "text/html; charset=KOI8-R"

        sample_path = os.path.join(ROOT_PATH, "testing", "samples", "krokozyabry.html")

        with wpull.util.reset_file_offset(response.body):
            with open(sample_path, "rb") as sample_file:
                shutil.copyfileobj(sample_file, response.body)

        result = html_scraper.scrape(request, response)
        inline_urls = result.inline_links
        linked_urls = result.linked_links

        self.assertEqual("koi8-r", result.encoding)

        expected_linked = {"http://example.com/Кракозябры"}
        self.assertEqual(set(), inline_urls)
        self.assertEqual(expected_linked, linked_urls)

Example #43

0

Show file

    def test_sitemap_scraper_invalid_xml(self):
        """Scrape a truncated sitemap document and still expect its one URL.

        The XML written to the body is cut off after the first ``<loc>``
        element; that location is expected as the only linked URL, with no
        inline URLs.
        """
        sitemap_scraper = SitemapScraper(HTMLParser())
        request = Request('http://example.com/sitemap.xml')
        response = Response(200, 'OK')
        response.body = Body()

        truncated_sitemap = b'''<?xml version="1.0" encoding="UTF-8"?>
                <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
                   <url>
                      <loc>http://www.example.com/</loc>
            '''

        with wpull.util.reset_file_offset(response.body):
            response.body.write(truncated_sitemap)

        result = sitemap_scraper.scrape(request, response)
        inline_urls = result.inline_links
        linked_urls = result.linked_links

        expected_linked = {'http://www.example.com/'}

        self.assertEqual(expected_linked, linked_urls)
        self.assertFalse(inline_urls)

Example #44

0

Show file

    def test_sitemap_scraper_reject_type(self):
        """Scrape a sitemap index with ``link_type=LinkType.css``.

        The sitemap scraper is expected to return a falsy result for this
        link type.
        """
        sitemap_scraper = SitemapScraper(HTMLParser())
        request = Request('http://example.com/sitemap.xml')
        response = Response(200, 'OK')
        response.body = Body()

        sitemap_index = b'''<?xml version="1.0" encoding="UTF-8"?>
                <sitemapindex
                xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
                   <sitemap>
                      <loc>http://www.example.com/sitemap1.xml.gz</loc>
                      <lastmod>2004-10-01T18:23:17+00:00</lastmod>
                   </sitemap>
                </sitemapindex>
            '''

        with wpull.util.reset_file_offset(response.body):
            response.body.write(sitemap_index)

        result = sitemap_scraper.scrape(
            request, response, link_type=LinkType.css)

        self.assertFalse(result)

Example #45

0

Show file

File: recorder_test.py Project: Super-Rad/wpull

    def test_warc_move_max_size(self):
        """Record one exchange with ``max_size=1`` and ``move_to='./blah/'``.

        After the recorder is closed, two numbered WARC files, the meta
        WARC file and the CDX file are expected inside ``./blah/``.
        """
        file_prefix = 'asdf'
        cdx_filename = 'asdf.cdx'

        os.mkdir('./blah/')

        recorder_params = WARCRecorderParams(
            compress=False,
            cdx=True,
            move_to='./blah/',
            max_size=1,
        )
        warc_recorder = WARCRecorder(file_prefix, params=recorder_params)

        request = HTTPRequest('http://example.com/1')
        request.address = ('0.0.0.0', 80)
        response = HTTPResponse(200, 'OK')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            response.body.write(b'BLAH')

        # Drive one full request/response cycle through a recorder session.
        session = warc_recorder.new_http_recorder_session()
        session.begin_request(request)
        session.request_data(request.to_bytes())
        session.end_request(request)
        session.begin_response(response)
        session.response_data(response.to_bytes())
        session.response_data(response.body.content())
        session.end_response(response)
        session.close()

        warc_recorder.close()

        expected_paths = (
            './blah/asdf-00000.warc',
            './blah/asdf-00001.warc',
            './blah/asdf-meta.warc',
            './blah/' + cdx_filename,
        )
        for expected_path in expected_paths:
            self.assertTrue(os.path.exists(expected_path))

Example #46

0

Show file

File: recorder_test.py Project: fakegit/ludios_wpull

    def test_warc_move_max_size(self):
        """Record one exchange with ``max_size=1`` and ``move_to='./blah/'``.

        After the recorder is closed, two numbered WARC files, the meta
        WARC file and the CDX file are expected inside ``./blah/``.
        """
        file_prefix = 'asdf'
        cdx_filename = 'asdf.cdx'

        os.mkdir('./blah/')

        recorder_params = WARCRecorderParams(
            compress=False,
            cdx=True,
            move_to='./blah/',
            max_size=1,
        )
        warc_recorder = WARCRecorder(file_prefix, params=recorder_params)

        request = HTTPRequest('http://example.com/1')
        request.address = ('0.0.0.0', 80)
        response = HTTPResponse(200, 'OK')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            response.body.write(b'BLAH')

        # Drive one full request/response cycle through a recorder session.
        session = warc_recorder.new_http_recorder_session()
        session.begin_request(request)
        session.request_data(request.to_bytes())
        session.end_request(request)
        session.begin_response(response)
        session.response_data(response.to_bytes())
        session.response_data(response.body.content())
        session.end_response(response)
        session.close()

        warc_recorder.close()

        expected_paths = (
            './blah/asdf-00000.warc',
            './blah/asdf-00001.warc',
            './blah/asdf-meta.warc',
            './blah/' + cdx_filename,
        )
        for expected_path in expected_paths:
            self.assertTrue(os.path.exists(expected_path))

Example #47

0

Show file

File: recorder_test.py Project: fakegit/ludios_wpull

    def test_warc_recorder_journal(self):
        """Check the ``-wpullinc`` file exists during a write and not after.

        The request record is replaced by one that asserts, when iterated,
        that ``asdf.warc-wpullinc`` exists.  Once ``end_request`` returns,
        that file must no longer exist.
        """
        warc_prefix = 'asdf'
        warc_filename = 'asdf.warc'

        recorder_params = WARCRecorderParams(compress=False)
        warc_recorder = WARCRecorder(warc_prefix, params=recorder_params)

        request = HTTPRequest('http://example.com/')
        request.address = ('0.0.0.0', 80)
        response = HTTPResponse(200, 'OK')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            response.body.write(b'KITTEH DOGE')

        session = warc_recorder.new_http_recorder_session()
        session.begin_request(request)
        session.request_data(request.to_bytes())

        # Inside MockRecord, ``self`` is the record, so the test case is
        # captured under another name for the assertion.
        test_case = self

        class MockRecord(WARCRecord):
            def __init__(self, original_record):
                super().__init__()
                self.block_file = original_record.block_file
                self.fields = original_record.fields

            def __iter__(self):
                print(list(os.walk('.')))
                journal_exists = os.path.exists(warc_filename + '-wpullinc')
                test_case.assertTrue(journal_exists)

                for _ in range(1000):
                    yield b"where's my elephant?"

        session._request_record = MockRecord(session._request_record)

        session.end_request(request)

        self.assertFalse(os.path.exists(warc_filename + '-wpullinc'))

Example #48

0

Show file

File: css_test.py Project: Super-Rad/wpull

    def test_css_scraper_mojibake(self):
        """Scrape the ``mojibake.css`` sample and check the extracted URLs.

        Exactly one inline URL with a non-ASCII file name is expected and
        no linked URLs.
        """
        css_scraper = CSSScraper()
        request = Request('http://example.com/styles.css')
        response = Response(200, 'OK')
        response.body = Body()

        sample_path = os.path.join(
            ROOT_PATH, 'testing', 'samples', 'mojibake.css'
        )

        with wpull.util.reset_file_offset(response.body):
            with open(sample_path, 'rb') as sample_file:
                shutil.copyfileobj(sample_file, response.body)

        result = css_scraper.scrape(request, response)
        inline_urls = result.inline_links
        linked_urls = result.linked_links

        expected_inline = {'http://example.com/文字化け.png'}

        self.assertEqual(expected_inline, inline_urls)
        self.assertFalse(linked_urls)

Example #49

0

Show file

    def test_javascript_scraper(self):
        """Scrape the ``script.js`` sample and check the exact link sets.

        One expected linked URL depends on the interpreter version: below
        Python 3.5 the ``..`` path segment is expected to be kept.
        """
        js_scraper = JavaScriptScraper()
        request = Request('http://example.com/script.js')
        response = Response(200, 'OK')
        response.body = Body()

        sample_path = os.path.join(
            ROOT_PATH, 'testing', 'samples', 'script.js')

        with wpull.util.reset_file_offset(response.body):
            with open(sample_path, 'rb') as sample_file:
                shutil.copyfileobj(sample_file, response.body)

        result = js_scraper.scrape(request, response)
        inline_urls = result.inline_links
        linked_urls = result.linked_links

        if sys.version_info < (3, 5):
            relative_dir_url = \
                'http://example.com/../relative_dir_script_variable'
        else:
            relative_dir_url = \
                'http://example.com/relative_dir_script_variable'

        expected_inline = {
            'http://example.com/script_variable.png',
            'http://example.com/dragonquery.js',
        }
        expected_linked = {
            'http://example.com/document_write.html',
            'http://example.com/http_document_write.html',
            'http://example.com/http_document_write2.html',
            'http://example.com/http document write.html',
            'http://example.com/script_variable.html',
            'http://example.com/http_script_variable.html',
            'https://example.com/https_script_variable.html',
            'ftp://example.com/ftp_script_variable.html',
            'http://example.com/end_dir_script_variable/',
            'http://example.com/start_dir_script_variable',
            relative_dir_url,
            'http://example.com/script_json.html',
            'http://example.com/http_script_json.html?a=b',
        }

        self.assertEqual(expected_inline, inline_urls)
        self.assertEqual(expected_linked, linked_urls)

Example #50

0

Show file

File: html_test.py Project: fakegit/ludios_wpull

    def test_html_not_quite_charset(self):
        """Scrape the ``videogame_top.htm`` sample and look for two known URLs."""
        css = CSSScraper()
        javascript = JavaScriptScraper()
        walker = ElementWalker(css_scraper=css, javascript_scraper=javascript)
        html_scraper = HTMLScraper(HTMLParser(), walker)
        request = Request('http://example.com/')
        response = Response(200, '')
        response.body = Body()

        sample_path = os.path.join(
            ROOT_PATH, 'testing', 'samples', 'videogame_top.htm')

        with wpull.util.reset_file_offset(response.body):
            with open(sample_path, 'rb') as sample_file:
                shutil.copyfileobj(sample_file, response.body)

        result = html_scraper.scrape(request, response)
        inline_urls = result.inline_links
        linked_urls = result.linked_links

        expected_inline_url = \
            'http://example.com/copyright_2001_2006_rtype.gif'
        expected_linked_url = \
            'http://www.geocities.jp/gamehouse_grindcrusher/'

        self.assertIn(expected_inline_url, inline_urls)
        self.assertIn(expected_linked_url, linked_urls)

Example #51

0

Show file

File: html_test.py Project: fakegit/ludios_wpull

    def test_html_wrong_charset(self):
        """Scrape the ``kcna.html`` sample and verify its encoding and links.

        The scrape result is expected to report ``utf-16-le`` as its
        encoding and to contain exactly the inline and linked URLs listed
        in the expected sets below.
        """
        element_walker = ElementWalker(css_scraper=CSSScraper(),
                                       javascript_scraper=JavaScriptScraper())
        scraper = HTMLScraper(HTMLParser(), element_walker)
        request = Request('http://example.com/')
        response = Response(200, '')
        response.body = Body()

        # Fill the response body with the sample document.
        with wpull.util.reset_file_offset(response.body):
            html_file_path = os.path.join(ROOT_PATH, 'testing', 'samples',
                                          'kcna.html')
            with open(html_file_path, 'rb') as in_file:
                shutil.copyfileobj(in_file, response.body)

        scrape_result = scraper.scrape(request, response)
        inline_urls = scrape_result.inline_links
        linked_urls = scrape_result.linked_links

        self.assertEqual('utf-16-le', scrape_result.encoding)

        # Each URL appears once: repeating an element in a set literal has
        # no effect on the resulting set.
        self.assertEqual(
            {
                'http://example.com/utm/__utm.js',
                'http://example.com/Knewskage.gif',
                'http://example.com/Lline.gif',
                'http://example.com/Sline.gif',
                'http://example.com/korean01.gif',
                'http://example.com/korean02.gif',
                'http://example.com/english01.gif',
                'http://example.com/english02.gif',
                'http://example.com/Tongsinkage.gif',
            }, inline_urls)
        self.assertEqual(
            {
                'http://example.com/index-k.htm',
                'http://example.com/index-e.htm',
            }, linked_urls)

Example #52

0

Show file

File: html_test.py Project: fakegit/ludios_wpull

    def test_html_krokozyabry(self):
        """Scrape the ``krokozyabry.html`` sample declared as KOI8-R.

        The result is expected to report ``koi8-r`` as its encoding, no
        inline URLs, and a single linked URL containing Cyrillic text.
        """
        css = CSSScraper()
        javascript = JavaScriptScraper()
        walker = ElementWalker(css_scraper=css, javascript_scraper=javascript)
        html_scraper = HTMLScraper(HTMLParser(), walker)
        request = Request('http://example.com/')
        response = Response(200, '')
        response.body = Body()
        response.fields['content-type'] = 'text/html; charset=KOI8-R'

        sample_path = os.path.join(
            ROOT_PATH, 'testing', 'samples', 'krokozyabry.html')

        with wpull.util.reset_file_offset(response.body):
            with open(sample_path, 'rb') as sample_file:
                shutil.copyfileobj(sample_file, response.body)

        result = html_scraper.scrape(request, response)
        inline_urls = result.inline_links
        linked_urls = result.linked_links

        self.assertEqual('koi8-r', result.encoding)

        expected_linked = {'http://example.com/Кракозябры'}
        self.assertEqual(set(), inline_urls)
        self.assertEqual(expected_linked, linked_urls)

Example #53

0

Show file

File: html_test.py Project: fakegit/ludios_wpull

    def test_xhtml_invalid(self):
        """Scrape the ``xhtml_invalid.html`` sample and check the exact link sets."""
        css = CSSScraper()
        javascript = JavaScriptScraper()
        walker = ElementWalker(css_scraper=css, javascript_scraper=javascript)
        html_scraper = HTMLScraper(HTMLParser(), walker)
        request = Request('http://example.com/')
        response = Response(200, '')
        response.body = Body()

        sample_path = os.path.join(
            ROOT_PATH, 'testing', 'samples', 'xhtml_invalid.html')

        with wpull.util.reset_file_offset(response.body):
            with open(sample_path, 'rb') as sample_file:
                shutil.copyfileobj(sample_file, response.body)

        result = html_scraper.scrape(request, response)
        inline_urls = result.inline_links
        linked_urls = result.linked_links

        expected_inline = {
            'http://example.com/image.png',
            'http://example.com/script.js',
        }
        expected_linked = {'http://example.com/link'}

        self.assertEqual(expected_inline, inline_urls)
        self.assertEqual(expected_linked, linked_urls)

Example #54

0

Show file

File: html_test.py Project: charygao/wpull

    def test_html_scraper_links(self):
        """Scrape the ``many_urls.html`` sample and check every extracted URL.

        The response carries a ``Refresh`` field pointing at
        ``header_refresh.html``.  The result is expected to report
        ``utf-8`` as its encoding and to match the expected inline and
        linked URL sets exactly; every URL must be a ``str``.
        """
        walker = ElementWalker(
            css_scraper=CSSScraper(),
            javascript_scraper=JavaScriptScraper(),
        )
        html_scraper = HTMLScraper(self.get_html_parser(), walker)
        request = Request("http://example.com/")
        response = Response(200, "OK")
        response.body = Body()
        response.fields["Refresh"] = "3; url=header_refresh.html"

        sample_path = os.path.join(ROOT_PATH, "testing", "samples", "many_urls.html")

        with wpull.util.reset_file_offset(response.body):
            with open(sample_path, "rb") as sample_file:
                shutil.copyfileobj(sample_file, response.body)

        result = html_scraper.scrape(request, response)
        inline_urls = result.inline_links
        linked_urls = result.linked_links

        self.assertEqual("utf-8", result.encoding)

        expected_inline = {
            "http://example.com/style_import_url.css",
            "http://example.com/style_import_quote_url.css",
            "http://example.com/style_single_quote_import.css",
            "http://example.com/style_double_quote_import.css",
            "http://example.com/link_href.css",
            "http://example.com/script.js",
            "http://example.com/body_background.png",
            "http://example.com/images/table_background.png",
            "http://example.com/images/td_background.png",
            "http://example.com/images/th_background.png",
            "http://example.com/style_url1.png",
            "http://example.com/style_url2.png",
            "http://example.com/applet/",  # returned by lxml
            "http://example.com/applet/applet_code.class",
            "http://example.com/applet/applet_src.class",
            "http://example.com/bgsound.mid",
            "http://example.com/audio_src.wav",
            "http://example.net/source_src.wav",
            "http://example.com/embed_src.mov",
            "http://example.com/fig_src.png",
            "http://example.com/frame_src.html",
            "http://example.com/iframe_src.html",
            "http://example.com/img_href.png",
            "http://example.com/img_lowsrc.png",
            "http://example.com/img_src.png",
            "http://example.com/img_data.png",
            "http://example.com/img_srcset_1.jpeg",
            "http://example.com/img_srcset_2.jpeg",
            "http://example.com/img_srcset_3.jpeg",
            "http://example.com/input_src.png",
            "http://example.com/layer_src.png",
            "http://example.com/object/",  # returned by lxml
            "http://example.com/object/object_data.swf",
            "http://example.com/object/object_archive.dat",
            "mailto:internet",
            "object_not_url_codebase",
            "http://example.com/param_ref_value.php",
            "http://example.com/overlay_src.html",
            "http://example.com/script_variable.png",
        }
        self.assertEqual(expected_inline, inline_urls)

        # The expected form of this URL depends on the interpreter version.
        if sys.version_info < (3, 5):
            relative_dir_url = "http://example.com/../relative_dir_script_variable"
        else:
            relative_dir_url = "http://example.com/relative_dir_script_variable"

        expected_linked = {
            "http://example.com/og_image.png",
            "http://example.com/og_url.html",
            "http://example.com/og_audio.mp3",
            "http://example.com/og_video.webm",
            "http://example.com/twitter_image.png",
            "http://example.com/twitter_image0.png",
            "http://example.com/twitter_image1.png",
            "http://example.com/twitter_image2.png",
            "http://example.com/twitter_image3.png",
            "http://example.com/twitter_player.html",
            "http://example.com/twitter_stream.mp4",
            "http://example.net/soup.html",
            "http://example.com/a_href.html",
            "http://example.com/area_href.html",
            "http://example.com/frame_src.html",
            "http://example.com/embed_href.html",
            "http://example.com/embed_src.mov",
            "http://example.com/form_action.html",
            "http://example.com/iframe_src.html",
            "http://example.com/layer_src.png",
            "http://example.com/overlay_src.html",
            "ftp://ftp.protocol.invalid/",
            "mailto:[email protected]",
            "http://a-double-slash.example",
            "http://example.com/header_refresh.html",
            "https://[2001:db8:85a3:8d3:1319:8a2e:370:7348]:8080/ipv6",
            "http://example.com/document_write.html",
            "http://example.com/http_document_write.html",
            "http://example.com/http_document_write2.html",
            "http://example.com/http document write.html",
            "http://example.com/script_variable.html",
            "http://example.com/http_script_variable.html",
            "https://example.com/https_script_variable.html",
            "ftp://example.com/ftp_script_variable.html",
            "http://example.com/end_dir_script_variable/",
            "http://example.com/start_dir_script_variable",
            relative_dir_url,
            "http://example.com/script_json.html",
            "http://example.com/http_script_json.html?a=b",
            "http://example.com/a_javascript_link.html",
            "http://example.com/a_onclick_link.html",
        }
        self.assertEqual(expected_linked, linked_urls)

        all_urls = inline_urls | linked_urls
        for found_url in all_urls:
            self.assertIsInstance(found_url, str)

Example #55

0

Show file

File: recorder_test.py Project: Super-Rad/wpull

    def test_warc_recorder_max_size(self):
        # Record two HTTP exchanges through a recorder configured with
        # ``max_size=1``. The assertions below expect each response payload
        # in its own numbered WARC file, both URLs in the shared CDX index,
        # and the log message in the separate ``-meta`` WARC file.
        file_prefix = 'asdf'
        cdx_filename = 'asdf.cdx'

        recorder_params = WARCRecorderParams(
            compress=False,
            extra_fields=[('Extra-field', 'my_extra_field')],
            cdx=True,
            max_size=1,
        )
        warc_recorder = WARCRecorder(file_prefix, params=recorder_params)

        # (URL, response payload) for each exchange, in recording order.
        exchanges = (
            ('http://example.com/1', b'KITTEH DOGE'),
            ('http://example.com/2', b'DOGE KITTEH'),
        )

        for url, payload in exchanges:
            request = HTTPRequest(url)
            request.address = ('0.0.0.0', 80)
            response = HTTPResponse(200, 'OK')
            response.body = Body()

            with wpull.util.reset_file_offset(response.body):
                response.body.write(payload)

            # Feed the request, then the response header and body, through
            # a fresh recorder session.
            session = warc_recorder.new_http_recorder_session()
            session.begin_request(request)
            session.request_data(request.to_bytes())
            session.end_request(request)
            session.begin_response(response)
            session.response_data(response.to_bytes())
            session.response_data(response.body.content())
            session.end_response(response)
            session.close()

        _logger.info('FINISHED')

        warc_recorder.close()

        # Each numbered WARC file must start with a WARC header, contain a
        # warcinfo record, and hold the matching payload.
        expected_payloads = (
            ('asdf-00000.warc', b'KITTEH DOGE'),
            ('asdf-00001.warc', b'DOGE KITTEH'),
        )

        for warc_name, payload in expected_payloads:
            with open(warc_name, 'rb') as in_file:
                warc_file_content = in_file.read()

            self.assertTrue(warc_file_content.startswith(b'WARC/1.0'))
            self.assertIn(b'WARC-Type: warcinfo', warc_file_content)
            self.assertIn(payload, warc_file_content)

        with open(cdx_filename, 'rb') as in_file:
            cdx_file_content = in_file.read()

        cdx_lines = cdx_file_content.split(b'\n')
        header_line = cdx_lines[0]
        cdx_labels = header_line.strip().split(b' ')

        print(cdx_lines)

        self.assertEqual(4, len(cdx_lines))
        self.assertEqual(10, len(cdx_labels))

        for url in (b'http://example.com/1', b'http://example.com/2'):
            self.assertIn(url, cdx_file_content)

        with open('asdf-meta.warc', 'rb') as in_file:
            meta_file_content = in_file.read()

        self.assertIn(b'FINISHED', meta_file_content)

        for warc_name in ('asdf-00000.warc', 'asdf-00001.warc',
                          'asdf-meta.warc'):
            self.validate_warc(warc_name)

Example #56

0

Show file

File: recorder_test.py Project: Super-Rad/wpull

    def test_cdx_dedup(self):
        # Seed the URL table with an earlier visit of /fennec, then record
        # a /fennec response and a /horse response that carry the same
        # body. The assertions expect a revisit record that refers to the
        # seeded record ID, and the body text to be stored only once.
        url_table = URLTable()
        recorder_params = WARCRecorderParams(
            compress=False, cdx=True, url_table=url_table
        )
        warc_recorder = WARCRecorder('asdf', params=recorder_params)

        # The tuple is passed to add_visits() unchanged.
        previous_visit = (
            'http://example.com/fennec',
            '<urn:uuid:8a534d31-bd06-4056-8a0f-bdc5fd611036>',
            'B62D734VFEKIDLFAB7TTSCSZF64BKAYJ',
        )
        url_table.add_visits([previous_visit])

        def record_exchange(http_request, http_response):
            # Drive one request/response pair through a new session.
            recorder_session = warc_recorder.new_http_recorder_session()
            recorder_session.begin_request(http_request)
            recorder_session.request_data(http_request.to_bytes())
            recorder_session.end_request(http_request)
            recorder_session.begin_response(http_response)
            recorder_session.response_data(http_response.to_bytes())
            recorder_session.response_data(http_response.body.content())
            recorder_session.end_response(http_response)
            recorder_session.close()

        # First exchange: the URL that was already visited.
        request = HTTPRequest('http://example.com/fennec')
        request.address = ('0.0.0.0', 80)
        response = HTTPResponse(200, 'OK')
        response.body = Body()
        # Size of the serialized response header, measured before any body
        # data is written; compared against a Content-Length field below.
        revisit_response_header_size = len(response.to_bytes())

        with wpull.util.reset_file_offset(response.body):
            response.body.write(b'kitbit')

        record_exchange(request, response)

        # Second exchange: a different URL and reason phrase, same body.
        request = HTTPRequest('http://example.com/horse')
        request.address = ('0.0.0.0', 80)
        response = HTTPResponse(200, 'OKaaaaaaaaaaaaaaaaaaaaaaaaaa')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            response.body.write(b'kitbit')

        record_exchange(request, response)

        _logger.info('FINISHED')

        warc_recorder.close()

        with open('asdf.warc', 'rb') as in_file:
            warc_file_content = in_file.read()

        with open('asdf.cdx', 'rb') as in_file:
            cdx_file_content = in_file.read()

        self.assertTrue(warc_file_content.startswith(b'WARC/1.0'))

        header_size_text = str(revisit_response_header_size).encode('ascii')
        expected_fragments = (
            b'WARC-Type: revisit\r\n',
            b'WARC-Refers-To: '
            b'<urn:uuid:8a534d31-bd06-4056-8a0f-bdc5fd611036>\r\n',
            b'WARC-Truncated: length\r\n',
            b'WARC-Profile: http://netpreserve.org/warc/1.0/revisit/'
            b'identical-payload-digest\r\n',
            b'Content-Length: ' + header_size_text + b'\r\n',
            b'WARC-Target-URI: http://example.com/fennec\r\n',
            b'WARC-Target-URI: http://example.com/horse\r\n',
        )

        for fragment in expected_fragments:
            self.assertIn(fragment, warc_file_content)

        # The shared body text must appear exactly once in the WARC file.
        body_occurrences = warc_file_content.count(b'kitbit')
        self.assertEqual(1, body_occurrences)

        self.assertIn(b'http://example.com/horse ', cdx_file_content)

Example #57

0

Show file

File: recorder_test.py Project: Super-Rad/wpull

    def test_warc_recorder(self):
        # Record one HTTP exchange and a log message, then verify the WARC
        # file's record headers and payload, and check that the CDX entry's
        # length/offset fields locate a record inside the WARC file.
        file_prefix = 'asdf'
        warc_filename = 'asdf.warc'
        cdx_filename = 'asdf.cdx'

        warc_recorder = WARCRecorder(
            file_prefix,
            params=WARCRecorderParams(
                compress=False,
                extra_fields=[('Extra-field', 'my_extra_field')],
                cdx=True,
            ),
        )

        request = HTTPRequest('http://example.com/')
        # NOTE(review): prepare_for_send() is invoked both before and after
        # the address is assigned; kept unchanged -- confirm both are needed.
        request.prepare_for_send()
        request.address = ('0.0.0.0', 80)
        request.prepare_for_send()
        response = HTTPResponse(200, 'OK')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            response.body.write(b'KITTEH DOGE')

        session = warc_recorder.new_http_recorder_session()
        session.begin_request(request)
        session.request_data(request.to_bytes())
        session.end_request(request)
        session.begin_response(response)
        session.response_data(response.to_bytes())
        session.response_data(response.body.content())
        session.end_response(response)
        session.close()

        _logger.info('FINISHED')

        warc_recorder.close()

        with open(warc_filename, 'rb') as in_file:
            warc_file_content = in_file.read()

        with open(cdx_filename, 'rb') as in_file:
            cdx_file_content = in_file.read()

        self.assertTrue(warc_file_content.startswith(b'WARC/1.0'))

        # Byte fragments that must all be present somewhere in the WARC
        # file, checked in this order.
        expected_fragments = (
            b'WARC-Type: warcinfo\r\n',
            b'Content-Type: application/warc-fields',
            b'WARC-Date: ',
            b'WARC-Record-ID: <urn:uuid:',
            b'WARC-Block-Digest: sha1:',
            b'WARC-Payload-Digest: sha1:',
            b'WARC-Type: request\r\n',
            b'WARC-Target-URI: http://',
            b'Content-Type: application/http;msgtype=request',
            b'WARC-Type: response',
            b'WARC-Concurrent-To: <urn:uuid:',
            b'Content-Type: application/http;msgtype=response',
            'Wpull/{0}'.format(wpull.version.__version__).encode('utf-8'),
            'Python/{0}'.format(
                wpull.util.python_version()).encode('utf-8'),
            b'Extra-Field: my_extra_field',
            b'GET / HTTP',
            b'KITTEH DOGE',
            b'FINISHED',
            b'WARC-Target-URI: urn:X-wpull:log',
            b'Content-Length:',
        )

        for fragment in expected_fragments:
            self.assertIn(fragment, warc_file_content)

        self.assertNotIn(b'Content-Length: 0', warc_file_content)

        cdx_lines = cdx_file_content.split(b'\n')
        cdx_labels = cdx_lines[0].strip().split(b' ')
        cdx_fields = cdx_lines[1].split(b' ')

        print(cdx_lines)

        self.assertEqual(3, len(cdx_lines))
        self.assertEqual(10, len(cdx_labels))
        self.assertEqual(9, len(cdx_fields))
        self.assertTrue(cdx_lines[0].startswith(b' CDX'))

        self.assertEqual(b'http://example.com/', cdx_fields[0])
        self.assertEqual(b'-', cdx_fields[2])
        self.assertEqual(b'200', cdx_fields[3])
        self.assertNotEqual(b'-', cdx_fields[4])
        self.assertNotEqual(b'0', cdx_fields[5])
        self.assertNotEqual(b'0', cdx_fields[6])
        self.assertEqual(
            os.path.basename(warc_filename), cdx_fields[7].decode('ascii'))

        # Fields 5 and 6 are parsed as the record's length and offset and
        # used to read that slice back out of the WARC file.
        length = int(cdx_fields[5])
        offset = int(cdx_fields[6])

        with open(warc_filename, 'rb') as in_file:
            in_file.seek(offset)
            data = in_file.read(length)

        # A TestCase assertion instead of a bare ``assert``: it is not
        # stripped under ``python -O`` and reports both values on failure.
        self.assertEqual(length, len(data))

        self.assertEqual(b'WARC/1.0', data[:8])

        self.assertIn(b'KITTEH DOGE', data)

        self.validate_warc(warc_filename)

Example #58

0

Show file

File: html_test.py Project: fakegit/ludios_wpull

    def test_html_scraper_links(self):
        # Scrape the ``many_urls.html`` sample (served with a Refresh
        # response header) and compare the scraper's inline and linked URL
        # sets against the full expected sets.
        walker = ElementWalker(css_scraper=CSSScraper(),
                               javascript_scraper=JavaScriptScraper())
        scraper = HTMLScraper(HTMLParser(), walker)
        request = Request('http://example.com/')
        response = Response(200, 'OK')
        response.body = Body()
        response.fields['Refresh'] = '3; url=header_refresh.html'

        sample_path = os.path.join(
            ROOT_PATH, 'testing', 'samples', 'many_urls.html')

        # Copy the sample document into the response body.
        with wpull.util.reset_file_offset(response.body):
            with open(sample_path, 'rb') as sample_file:
                shutil.copyfileobj(sample_file, response.body)

        scrape_result = scraper.scrape(request, response)
        inline_urls = scrape_result.inline_links
        linked_urls = scrape_result.linked_links

        self.assertEqual('utf-8', scrape_result.encoding)

        expected_inline_urls = frozenset({
            'http://example.com/style_import_url.css',
            'http://example.com/style_import_quote_url.css',
            'http://example.com/style_single_quote_import.css',
            'http://example.com/style_double_quote_import.css',
            'http://example.com/bg.png',
            'http://example.com/link_href.css',
            'http://example.com/script.js',
            'http://example.com/body_background.png',
            'http://example.com/images/table_background.png',
            'http://example.com/images/td_background.png',
            'http://example.com/images/th_background.png',
            'http://example.com/style_url1.png',
            'http://example.com/style_url2.png',
            'http://example.com/applet/',  # returned by lxml
            'http://example.com/applet/applet_code.class',
            'http://example.com/applet/applet_src.class',
            'http://example.com/bgsound.mid',
            'http://example.com/audio_src.wav',
            'http://example.com/audio_poster.jpeg',
            'http://example.net/source_src.wav',
            'http://example.com/video_src.webm',
            'http://example.com/video_poster.jpeg',
            'http://example.net/track_src.vtt',
            'http://example.net/source_src.webm',
            'http://example.com/embed_src.mov',
            'http://example.com/fig_src.png',
            'http://example.com/frame_src.html',
            'http://example.com/iframe_src.html',
            'http://example.com/img_href.png',
            'http://example.com/img_lowsrc.png',
            'http://example.com/img_src.png',
            'http://example.com/img_data.png',
            'http://example.com/img_srcset_1.jpeg',
            'http://example.com/img_srcset_2.jpeg',
            'http://example.com/img_srcset_3.jpeg',
            'http://example.com/input_src.png',
            'http://example.com/layer_src.png',
            'http://example.com/object/',  # returned by lxml
            'http://example.com/object/object_data.swf',
            'http://example.com/object/object_archive.dat',
            'mailto:internet',
            'object_not_url_codebase',
            'http://example.com/param_ref_value.php',
            'http://example.com/overlay_src.html',
            'http://example.com/script_variable.png',
        })
        self.assertEqual(expected_inline_urls, inline_urls)

        # The expected form of the "../" script-variable URL depends on the
        # running Python version.
        if sys.version_info < (3, 5):
            relative_dir_url = (
                'http://example.com/../relative_dir_script_variable')
        else:
            relative_dir_url = (
                'http://example.com/relative_dir_script_variable')

        expected_linked_urls = frozenset({
            'http://example.com/og_image.png',
            'http://example.com/og_url.html',
            'http://example.com/og_audio.mp3',
            'http://example.com/og_video.webm',
            'http://example.com/twitter_image.png',
            'http://example.com/twitter_image0.png',
            'http://example.com/twitter_image1.png',
            'http://example.com/twitter_image2.png',
            'http://example.com/twitter_image3.png',
            'http://example.com/twitter_player.html',
            'http://example.com/twitter_stream.mp4',
            'http://example.net/soup.html',
            'http://example.com/a_href.html',
            'http://example.com/area_href.html',
            'http://example.com/frame_src.html',
            'http://example.com/embed_href.html',
            'http://example.com/embed_src.mov',
            'http://example.com/form_action.html',
            'http://example.com/iframe_src.html',
            'http://example.com/layer_src.png',
            'http://example.com/overlay_src.html',
            'ftp://ftp.protocol.invalid/',
            # NOTE(review): this value looks like an e-mail obfuscation
            # artifact from wherever this snippet was copied -- confirm the
            # intended address against many_urls.html.
            'mailto:[email protected]',
            'http://a-double-slash.example',
            'http://example.com/header_refresh.html',
            'https://[2001:db8:85a3:8d3:1319:8a2e:370:7348]:8080/ipv6',
            'http://example.com/document_write.html',
            'http://example.com/http_document_write.html',
            'http://example.com/http_document_write2.html',
            'http://example.com/http document write.html',
            'http://example.com/script_variable.html',
            'http://example.com/http_script_variable.html',
            'https://example.com/https_script_variable.html',
            'ftp://example.com/ftp_script_variable.html',
            'http://example.com/end_dir_script_variable/',
            'http://example.com/start_dir_script_variable',
            relative_dir_url,
            'http://example.com/script_json.html',
            'http://example.com/http_script_json.html?a=b',
            'http://example.com/a_javascript_link.html',
            'http://example.com/a_onclick_link.html',
        })
        self.assertEqual(expected_linked_urls, linked_urls)

        # Every scraped URL must be a text string.
        all_urls = inline_urls | linked_urls
        for url in all_urls:
            self.assertIsInstance(url, str)