コード例 #1
0
ファイル: converter_test.py プロジェクト: flatron18116/wpull
    def test_css_converter(self):
        """Convert a stylesheet with CSSConverter and check the output URLs.

        ``image.png`` is marked done with a local filename and is expected to
        appear as ``url('image.png')`` in the converted file. ``cat.jpg`` is
        only added to the table (no filename recorded) and is expected to
        remain an absolute URL.
        """
        url_table = URLTable()
        temp_root = self.temp_dir.name
        css_filename = os.path.join(temp_root, 'styles.css')
        image_filename = os.path.join(temp_root, 'image.png')
        new_css_filename = os.path.join(temp_root, 'styles.css-new')

        # Note: cat.jpg appears twice in the fixture list.
        url_table.add_many([
            {'url': url} for url in (
                'http://example.com/styles.css',
                'http://example.com/image.png',
                'http://example.com/cat.jpg',
                'http://example.com/cat.jpg',
            )
        ])

        # URLs recorded as done, with the extra fields stored for each one.
        done_entries = (
            (
                'http://example.com/styles.css',
                {
                    'link_type': 'css',
                    'filename': os.path.relpath(css_filename, temp_root),
                },
            ),
            (
                'http://example.com/image.png',
                {'filename': os.path.relpath(image_filename, temp_root)},
            ),
        )
        for url, extra_fields in done_entries:
            url_table.update_one(url, status=Status.done, **extra_fields)

        with open(css_filename, 'w') as css_file:
            css_file.write(CSS_TEXT)

        # Create an empty placeholder for the image on disk.
        with open(image_filename, 'wb'):
            pass

        css_converter = CSSConverter(url_table)
        css_converter.convert(
            css_filename,
            new_css_filename,
            base_url='http://example.com/styles.css',
        )

        with open(new_css_filename, 'r') as result_file:
            converted_text = result_file.read()

        expected_fragments = (
            "url('image.png')",
            "url('http://example.com/cat.jpg')",
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, converted_text)
コード例 #2
0
ファイル: converter_test.py プロジェクト: pombredanne/wpull
    def test_css_converter(self):
        """Convert a stylesheet with CSSConverter and check the output URLs.

        ``image.png`` is marked done with a local filename and is expected to
        appear as ``url('image.png')`` in the converted file. ``cat.jpg`` is
        only added to the table (no filename recorded) and is expected to
        remain an absolute URL.
        """
        url_table = URLTable()
        temp_root = self.temp_dir.name
        css_filename = os.path.join(temp_root, "styles.css")
        image_filename = os.path.join(temp_root, "image.png")
        new_css_filename = os.path.join(temp_root, "styles.css-new")

        # Note: cat.jpg appears twice in the fixture list.
        fixture_urls = (
            "http://example.com/styles.css",
            "http://example.com/image.png",
            "http://example.com/cat.jpg",
            "http://example.com/cat.jpg",
        )
        url_table.add_many([AddURLInfo(url, None, None) for url in fixture_urls])

        # URLs recorded as done, with the extra fields stored for each one.
        done_entries = (
            (
                "http://example.com/styles.css",
                {
                    "link_type": "css",
                    "filename": os.path.relpath(css_filename, temp_root),
                },
            ),
            (
                "http://example.com/image.png",
                {"filename": os.path.relpath(image_filename, temp_root)},
            ),
        )
        for url, extra_fields in done_entries:
            url_table.update_one(url, status=Status.done.value, **extra_fields)

        with open(css_filename, "w") as css_file:
            css_file.write(CSS_TEXT)

        # Create an empty placeholder for the image on disk.
        with open(image_filename, "wb"):
            pass

        css_converter = CSSConverter(url_table)
        css_converter.convert(
            css_filename,
            new_css_filename,
            base_url="http://example.com/styles.css",
        )

        with open(new_css_filename, "r") as result_file:
            converted_text = result_file.read()

        expected_fragments = (
            "url('image.png')",
            "url('http://example.com/cat.jpg')",
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, converted_text)
コード例 #3
0
ファイル: converter_test.py プロジェクト: flatron18116/wpull
    def test_xhtml_converter(self):
        """Convert an XHTML document with HTMLConverter and check its links.

        URLs recorded with a local filename (image.png, tubes.html,
        ferret.jpg) are expected to show up as relative references in the
        output; the other asserted URLs are expected to remain absolute.
        The test also checks that some literal text and a self-closed
        ``<hr/>`` are present in the converted document.
        """
        url_table = URLTable()
        temp_root = self.temp_dir.name

        image_filename = os.path.join(temp_root, 'image.png')
        tubes_filename = os.path.join(temp_root, 'tubes.html')
        ferret_filename = os.path.join(temp_root, 'ferret.jpg')

        url_table.add_many([
            {'url': url} for url in (
                'http://example.com/styles.css',
                'http://example.com/image.png',
                'http://example.com/cat.jpg',
                'http://example.com/fox.jpg',
                'http://example.com/ferret.jpg',
                'http://example.com/tubes.html',
            )
        ])

        # The stylesheet is marked done but gets no local filename.
        url_table.update_one(
            'http://example.com/styles.css',
            status=Status.done, link_type='css',
        )

        # URL -> local file pairs; drives both the table updates and the
        # placeholder files created further down.
        local_copies = (
            ('http://example.com/image.png', image_filename),
            ('http://example.com/tubes.html', tubes_filename),
            ('http://example.com/ferret.jpg', ferret_filename),
        )
        for url, local_path in local_copies:
            url_table.update_one(
                url,
                status=Status.done,
                filename=os.path.relpath(local_path, temp_root),
            )

        html_filename = os.path.join(temp_root, 'index.html')
        new_html_filename = os.path.join(temp_root, 'index.html-new')

        with open(html_filename, 'w') as html_file:
            html_file.write(XHTML_TEXT)

        # Create empty placeholder files on disk.
        for _, local_path in local_copies:
            with open(local_path, 'wb'):
                pass

        element_walker = ElementWalker(css_scraper=CSSScraper())
        html_parser = self.get_html_parser()
        converter = HTMLConverter(html_parser, element_walker, url_table)
        converter.convert(
            html_filename,
            new_html_filename,
            base_url='http://example.com/index.html',
        )

        with open(new_html_filename, 'r') as result_file:
            converted_text = result_file.read()

        expected_fragments = (
            "url('image.png')",
            "url('http://example.com/cat.jpg')",
            '"tubes.html"',
            '"http://example.com/lol.html"',
            "url('http://example.com/fox.jpg')",
            "url('ferret.jpg')",
            "hello world!!",
            "<hr/>",
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, converted_text)
コード例 #4
0
ファイル: converter_test.py プロジェクト: pombredanne/wpull
    def test_xhtml_converter(self):
        """Convert an XHTML document with HTMLConverter and check its links.

        URLs recorded with a local filename (image.png, tubes.html,
        ferret.jpg) are expected to show up as relative references in the
        output; the other asserted URLs are expected to remain absolute.
        The test also checks that some literal text and a self-closed
        ``<hr/>`` are present in the converted document.
        """
        url_table = URLTable()
        temp_root = self.temp_dir.name

        image_filename = os.path.join(temp_root, "image.png")
        tubes_filename = os.path.join(temp_root, "tubes.html")
        ferret_filename = os.path.join(temp_root, "ferret.jpg")

        fixture_urls = (
            "http://example.com/styles.css",
            "http://example.com/image.png",
            "http://example.com/cat.jpg",
            "http://example.com/fox.jpg",
            "http://example.com/ferret.jpg",
            "http://example.com/tubes.html",
        )
        url_table.add_many([AddURLInfo(url, None, None) for url in fixture_urls])

        # The stylesheet is marked done but gets no local filename.
        url_table.update_one(
            "http://example.com/styles.css",
            status=Status.done.value,
            link_type="css",
        )

        # URL -> local file pairs; drives both the table updates and the
        # placeholder files created further down.
        local_copies = (
            ("http://example.com/image.png", image_filename),
            ("http://example.com/tubes.html", tubes_filename),
            ("http://example.com/ferret.jpg", ferret_filename),
        )
        for url, local_path in local_copies:
            url_table.update_one(
                url,
                status=Status.done.value,
                filename=os.path.relpath(local_path, temp_root),
            )

        html_filename = os.path.join(temp_root, "index.html")
        new_html_filename = os.path.join(temp_root, "index.html-new")

        with open(html_filename, "w") as html_file:
            html_file.write(XHTML_TEXT)

        # Create empty placeholder files on disk.
        for _, local_path in local_copies:
            with open(local_path, "wb"):
                pass

        element_walker = ElementWalker(css_scraper=CSSScraper())
        html_parser = self.get_html_parser()
        converter = HTMLConverter(html_parser, element_walker, url_table)
        converter.convert(
            html_filename,
            new_html_filename,
            base_url="http://example.com/index.html",
        )

        with open(new_html_filename, "r") as result_file:
            converted_text = result_file.read()

        expected_fragments = (
            "url('image.png')",
            "url('http://example.com/cat.jpg')",
            '"tubes.html"',
            '"http://example.com/lol.html"',
            "url('http://example.com/fox.jpg')",
            "url('ferret.jpg')",
            "hello world!!",
            "<hr/>",
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, converted_text)
コード例 #5
0
ファイル: warc_test.py プロジェクト: flatron18116/wpull
    def test_cdx_dedup(self):
        """Record two responses with a pre-seeded visit and inspect the output.

        A visit entry is added to the URL table before two request/response
        pairs carrying the same body bytes are recorded. The test then checks
        that ``asdf.warc`` contains a revisit record referring to the seeded
        record ID, that the payload bytes occur exactly once, and that
        ``asdf.cdx`` lists the second URL.
        """
        url_table = URLTable()
        recorder_params = WARCRecorderParams(
            compress=False, cdx=True, url_table=url_table)
        warc_recorder = WARCRecorder('asdf', params=recorder_params)

        # (url, WARC record ID, payload digest) -- presumably the digest
        # matches the body written below; TODO confirm.
        seeded_visit = (
            'http://example.com/fennec',
            '<urn:uuid:8a534d31-bd06-4056-8a0f-bdc5fd611036>',
            'B62D734VFEKIDLFAB7TTSCSZF64BKAYJ',
        )
        url_table.add_visits([seeded_visit])

        def record_exchange(http_request, http_response):
            # Push one request/response pair through a recorder session.
            with warc_recorder.session() as session:
                session.pre_request(http_request)
                session.request_data(http_request.to_bytes())
                session.request(http_request)
                session.pre_response(http_response)
                session.response_data(http_response.to_bytes())
                session.response_data(http_response.body.content())
                session.response(http_response)

        request = HTTPRequest('http://example.com/fennec')
        request.address = ('0.0.0.0', 80)
        response = HTTPResponse(200, 'OK')
        response.body = Body()
        # Measured before any body bytes are written.
        revisit_response_header_size = len(response.to_bytes())

        with wpull.util.reset_file_offset(response.body):
            response.body.write(b'kitbit')

        record_exchange(request, response)

        request = HTTPRequest('http://example.com/horse')
        request.address = ('0.0.0.0', 80)
        response = HTTPResponse(200, 'OKaaaaaaaaaaaaaaaaaaaaaaaaaa')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            response.body.write(b'kitbit')

        record_exchange(request, response)

        _logger.info('FINISHED')

        warc_recorder.close()

        with open('asdf.warc', 'rb') as warc_file:
            warc_file_content = warc_file.read()

        with open('asdf.cdx', 'rb') as cdx_file:
            cdx_file_content = cdx_file.read()

        self.assertTrue(warc_file_content.startswith(b'WARC/1.0'))

        content_length_line = 'Content-Length: {}\r\n'.format(
            revisit_response_header_size).encode('ascii')
        expected_warc_lines = (
            b'WARC-Type: revisit\r\n',
            b'WARC-Refers-To: '
            b'<urn:uuid:8a534d31-bd06-4056-8a0f-bdc5fd611036>\r\n',
            b'WARC-Truncated: length\r\n',
            b'WARC-Profile: http://netpreserve.org/warc/1.0/revisit/'
            b'identical-payload-digest\r\n',
            content_length_line,
            b'WARC-Target-URI: http://example.com/fennec\r\n',
            b'WARC-Target-URI: http://example.com/horse\r\n',
        )
        for header_line in expected_warc_lines:
            self.assertIn(header_line, warc_file_content)

        # The body bytes must appear only once across both records.
        self.assertEqual(1, warc_file_content.count(b'kitbit'))

        self.assertIn(b'http://example.com/horse ', cdx_file_content)
コード例 #6
0
ファイル: recorder_test.py プロジェクト: Super-Rad/wpull
    def test_cdx_dedup(self):
        """Record two responses with a pre-seeded visit and inspect the output.

        A visit entry is added to the URL table before two request/response
        pairs carrying the same body bytes are recorded. The test then checks
        that ``asdf.warc`` contains a revisit record referring to the seeded
        record ID, that the payload bytes occur exactly once, and that
        ``asdf.cdx`` lists the second URL.
        """
        url_table = URLTable()
        recorder_params = WARCRecorderParams(
            compress=False, cdx=True, url_table=url_table
        )
        warc_recorder = WARCRecorder('asdf', params=recorder_params)

        # (url, WARC record ID, payload digest) -- presumably the digest
        # matches the body written below; TODO confirm.
        seeded_visit = (
            'http://example.com/fennec',
            '<urn:uuid:8a534d31-bd06-4056-8a0f-bdc5fd611036>',
            'B62D734VFEKIDLFAB7TTSCSZF64BKAYJ',
        )
        url_table.add_visits([seeded_visit])

        def record_exchange(http_request, http_response):
            # Push one request/response pair through a fresh recorder
            # session and close that session afterwards.
            session = warc_recorder.new_http_recorder_session()
            session.begin_request(http_request)
            session.request_data(http_request.to_bytes())
            session.end_request(http_request)
            session.begin_response(http_response)
            session.response_data(http_response.to_bytes())
            session.response_data(http_response.body.content())
            session.end_response(http_response)
            session.close()

        request = HTTPRequest('http://example.com/fennec')
        request.address = ('0.0.0.0', 80)
        response = HTTPResponse(200, 'OK')
        response.body = Body()
        # Measured before any body bytes are written.
        revisit_response_header_size = len(response.to_bytes())

        with wpull.util.reset_file_offset(response.body):
            response.body.write(b'kitbit')

        record_exchange(request, response)

        request = HTTPRequest('http://example.com/horse')
        request.address = ('0.0.0.0', 80)
        response = HTTPResponse(200, 'OKaaaaaaaaaaaaaaaaaaaaaaaaaa')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            response.body.write(b'kitbit')

        record_exchange(request, response)

        _logger.info('FINISHED')

        warc_recorder.close()

        with open('asdf.warc', 'rb') as warc_file:
            warc_file_content = warc_file.read()

        with open('asdf.cdx', 'rb') as cdx_file:
            cdx_file_content = cdx_file.read()

        self.assertTrue(warc_file_content.startswith(b'WARC/1.0'))

        content_length_line = 'Content-Length: {}\r\n'.format(
            revisit_response_header_size
        ).encode('ascii')
        expected_warc_lines = (
            b'WARC-Type: revisit\r\n',
            b'WARC-Refers-To: '
            b'<urn:uuid:8a534d31-bd06-4056-8a0f-bdc5fd611036>\r\n',
            b'WARC-Truncated: length\r\n',
            b'WARC-Profile: http://netpreserve.org/warc/1.0/revisit/'
            b'identical-payload-digest\r\n',
            content_length_line,
            b'WARC-Target-URI: http://example.com/fennec\r\n',
            b'WARC-Target-URI: http://example.com/horse\r\n',
        )
        for header_line in expected_warc_lines:
            self.assertIn(header_line, warc_file_content)

        # The body bytes must appear only once across both records.
        self.assertEqual(1, warc_file_content.count(b'kitbit'))

        self.assertIn(b'http://example.com/horse ', cdx_file_content)