Example #1
0
    def test_css_converter(self):
        """Run ``CSSConverter`` on a stylesheet and check the resulting links.

        ``styles.css`` and ``image.png`` are marked as done in the URL table
        with filenames relative to the temporary directory; ``cat.jpg`` is
        passed to ``add`` twice but never updated. The converted text must
        contain ``url('image.png')`` and
        ``url('http://example.com/cat.jpg')``.
        """
        css_url = 'http://example.com/styles.css'
        image_url = 'http://example.com/image.png'
        cat_url = 'http://example.com/cat.jpg'

        with cd_tempdir() as work_dir:
            table = URLTable()
            source_path = os.path.join(work_dir, 'styles.css')
            image_path = os.path.join(work_dir, 'image.png')
            target_path = os.path.join(work_dir, 'styles.css-new')

            table.add([css_url, image_url, cat_url, cat_url])
            table.update(
                css_url,
                status=Status.done,
                link_type='css',
                filename=os.path.relpath(source_path, work_dir)
            )
            table.update(
                image_url,
                status=Status.done,
                filename=os.path.relpath(image_path, work_dir)
            )

            with open(source_path, 'w') as css_file:
                css_file.write(CSS_TEXT)

            # Create the image as an empty file.
            open(image_path, 'wb').close()

            CSSConverter(table).convert(
                source_path, target_path, base_url=css_url
            )

            with open(target_path, 'r') as converted_file:
                output = converted_file.read()

            for expected in (
                "url('image.png')",
                "url('http://example.com/cat.jpg')",
            ):
                self.assertIn(expected, output)
Example #2
0
    def test_css_converter(self):
        """Run ``CSSConverter`` (built with a ``PathNamer``) on a stylesheet.

        ``styles.css`` and ``image.png`` are marked as done in the URL table;
        ``cat.jpg`` is passed to ``add`` twice but never updated. The
        converted text must contain ``url('image.png')`` and
        ``url('http://example.com/cat.jpg')``.
        """
        # (url, extra keyword arguments) for every URL that is marked done.
        finished = (
            ('http://example.com/styles.css', {'link_type': 'css'}),
            ('http://example.com/image.png', {}),
        )

        with TemporaryDirectory() as work_dir:
            namer = PathNamer(work_dir)
            table = URLTable()

            table.add([
                'http://example.com/styles.css',
                'http://example.com/image.png',
                'http://example.com/cat.jpg',
                'http://example.com/cat.jpg',
            ])
            for url, extra in finished:
                table.update(url, status=Status.done, **extra)

            source_path = os.path.join(work_dir, 'styles.css')
            target_path = os.path.join(work_dir, 'styles.css-new')

            with open(source_path, 'w') as css_file:
                css_file.write(CSS_TEXT)

            converter = CSSConverter(namer, table)
            converter.convert(
                source_path,
                target_path,
                base_url='http://example.com/styles.css'
            )

            with open(target_path, 'r') as converted_file:
                output = converted_file.read()

            for expected in (
                "url('image.png')",
                "url('http://example.com/cat.jpg')",
            ):
                self.assertIn(expected, output)
Example #3
0
    def test_css_converter(self):
        """Convert ``CSS_TEXT`` and check which ``url()`` forms come out.

        The URL table holds ``styles.css`` and ``image.png`` as done, plus
        ``cat.jpg`` (added twice, never updated). After conversion the
        output must contain ``url('image.png')`` and
        ``url('http://example.com/cat.jpg')``.
        """
        stylesheet_url = 'http://example.com/styles.css'

        with TemporaryDirectory() as temp_dir:
            path_namer = PathNamer(temp_dir)
            url_table = URLTable()

            def mark_done(url, **fields):
                # Flag *url* as done, forwarding any extra table fields.
                url_table.update(url, status=Status.done, **fields)

            url_table.add([
                stylesheet_url,
                'http://example.com/image.png',
                'http://example.com/cat.jpg',
                'http://example.com/cat.jpg',
            ])
            mark_done(stylesheet_url, link_type='css')
            mark_done('http://example.com/image.png')

            original_css = os.path.join(temp_dir, 'styles.css')
            converted_css = os.path.join(temp_dir, 'styles.css-new')

            with open(original_css, 'w') as handle:
                handle.write(CSS_TEXT)

            CSSConverter(path_namer, url_table).convert(
                original_css, converted_css, base_url=stylesheet_url
            )

            with open(converted_css, 'r') as handle:
                result = handle.read()

            self.assertIn("url('image.png')", result)
            self.assertIn("url('http://example.com/cat.jpg')", result)
Example #4
0
    def test_cdx_dedup(self):
        """Record two exchanges with the same body and check deduplication.

        The URL table is seeded with one visit for ``/fennec`` (record ID
        plus a digest string, presumably the payload digest of
        ``b'kitbit'`` -- confirm against the recorder). Both ``/fennec`` and
        ``/horse`` are then recorded with the body ``b'kitbit'``.

        The WARC output must contain a ``revisit`` record that refers to the
        seeded record ID, a ``Content-Length`` equal to the size of the
        first response's header, both target URIs, and the body exactly
        once. The CDX output must mention ``/horse``.
        """
        url_table = URLTable()
        recorder_params = WARCRecorderParams(
            compress=False, cdx=True, url_table=url_table
        )
        warc_recorder = WARCRecorder('asdf', params=recorder_params)

        def record_exchange(request, response):
            # Give the response its body, then replay the whole exchange
            # through a fresh recorder session.
            with wpull.util.reset_file_offset(response.body.content_file):
                response.body.content_file.write(b'kitbit')

            with warc_recorder.session() as session:
                session.pre_request(request)
                session.request_data(request.header())
                session.request(request)
                session.pre_response(response)
                session.response_data(response.header())
                session.response_data(response.body.content)
                session.response(response)

        previous_visit = (
            'http://example.com/fennec',
            '<urn:uuid:8a534d31-bd06-4056-8a0f-bdc5fd611036>',
            'B62D734VFEKIDLFAB7TTSCSZF64BKAYJ'
        )
        url_table.add_visits([previous_visit])

        # First exchange: its header size is measured before the body is
        # written and is checked against a Content-Length further down.
        request = Request.new('http://example.com/fennec')
        request.address = ('0.0.0.0', 80)
        response = Response('HTTP/1.1', '200', 'OK')
        revisit_response_header_size = len(response.header())
        record_exchange(request, response)

        # Second exchange: same body, different URL and a longer reason
        # phrase.
        request = Request.new('http://example.com/horse')
        request.address = ('0.0.0.0', 80)
        response = Response('HTTP/1.1', '200', 'OKaaaaaaaaaaaaaaaaaaaaaaaaaa')
        record_exchange(request, response)

        _logger.info('FINISHED')

        warc_recorder.close()

        with open('asdf.warc', 'rb') as warc_file:
            warc_file_content = warc_file.read()

        with open('asdf.cdx', 'rb') as cdx_file:
            cdx_file_content = cdx_file.read()

        self.assertTrue(warc_file_content.startswith(b'WARC/1.0'))

        expected_fragments = (
            b'WARC-Type: revisit\r\n',
            b'WARC-Refers-To: '
            b'<urn:uuid:8a534d31-bd06-4056-8a0f-bdc5fd611036>\r\n',
            b'WARC-Truncated: length\r\n',
            b'WARC-Profile: http://netpreserve.org/warc/1.0/revisit/'
            b'identical-payload-digest\r\n',
            b'Content-Length: '
            + str(revisit_response_header_size).encode('ascii') + b'\r\n',
            b'WARC-Target-URI: http://example.com/fennec\r\n',
            b'WARC-Target-URI: http://example.com/horse\r\n',
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, warc_file_content)

        # The shared body must be stored only once.
        self.assertEqual(1, warc_file_content.count(b'kitbit'))

        self.assertIn(b'http://example.com/horse ', cdx_file_content)
Example #5
0
    def test_html_converter(self):
        """Run ``HTMLConverter`` on ``HTML_TEXT`` and check the links.

        ``styles.css``, ``image.png``, ``tubes.html`` and ``ferret.jpg`` are
        marked as done in the URL table; ``cat.jpg`` and ``fox.jpg`` are only
        added. The output is expected to contain the short forms
        ``image.png``, ``tubes.html`` and ``ferret.jpg`` and the absolute
        forms of ``cat.jpg``, ``fox.jpg`` and ``lol.html`` (the latter is
        never added to the table).
        """
        # (url, extra keyword arguments) for every URL that is marked done.
        finished = (
            ('http://example.com/styles.css', {'link_type': 'css'}),
            ('http://example.com/image.png', {}),
            ('http://example.com/tubes.html', {}),
            ('http://example.com/ferret.jpg', {}),
        )
        expected_snippets = (
            "url('image.png')",
            "url('http://example.com/cat.jpg')",
            '"tubes.html"',
            '"http://example.com/lol.html"',
            "url('http://example.com/fox.jpg')",
            "url('ferret.jpg')",
        )

        with TemporaryDirectory() as work_dir:
            namer = PathNamer(work_dir)
            table = URLTable()

            table.add([
                'http://example.com/styles.css',
                'http://example.com/image.png',
                'http://example.com/cat.jpg',
                'http://example.com/fox.jpg',
                'http://example.com/ferret.jpg',
                'http://example.com/tubes.html',
            ])
            for url, extra in finished:
                table.update(url, status=Status.done, **extra)

            source_path = os.path.join(work_dir, 'index.html')
            target_path = os.path.join(work_dir, 'index.html-new')

            with open(source_path, 'w') as html_file:
                html_file.write(HTML_TEXT)

            converter = HTMLConverter(namer, table)
            converter.convert(
                source_path,
                target_path,
                base_url='http://example.com/index.html'
            )

            with open(target_path, 'r') as converted_file:
                output = converted_file.read()

            for snippet in expected_snippets:
                self.assertIn(snippet, output)
Example #6
0
    def test_xhtml_converter(self):
        """Run ``HTMLConverter`` on ``XHTML_TEXT`` and check the output.

        ``image.png``, ``tubes.html`` and ``ferret.jpg`` are marked as done
        with filenames relative to the temporary directory and are created
        there as empty files; ``styles.css`` is marked as done without a
        filename; ``cat.jpg`` and ``fox.jpg`` are only added.

        Besides the link checks, the output must still contain
        ``hello world!!`` and ``<hr/>``.
        """
        with cd_tempdir() as work_dir:
            table = URLTable()

            # URL -> local path of every resource that exists on disk.
            local_copies = [
                ('http://example.com/image.png',
                 os.path.join(work_dir, 'image.png')),
                ('http://example.com/tubes.html',
                 os.path.join(work_dir, 'tubes.html')),
                ('http://example.com/ferret.jpg',
                 os.path.join(work_dir, 'ferret.jpg')),
            ]

            table.add([
                'http://example.com/styles.css',
                'http://example.com/image.png',
                'http://example.com/cat.jpg',
                'http://example.com/fox.jpg',
                'http://example.com/ferret.jpg',
                'http://example.com/tubes.html',
            ])
            table.update(
                'http://example.com/styles.css',
                status=Status.done,
                link_type='css'
            )
            for url, local_path in local_copies:
                table.update(
                    url,
                    status=Status.done,
                    filename=os.path.relpath(local_path, work_dir)
                )

            source_path = os.path.join(work_dir, 'index.html')
            target_path = os.path.join(work_dir, 'index.html-new')

            with open(source_path, 'w') as html_file:
                html_file.write(XHTML_TEXT)

            # Create each local copy as an empty file.
            for _, local_path in local_copies:
                open(local_path, 'wb').close()

            HTMLConverter(table).convert(
                source_path, target_path,
                base_url='http://example.com/index.html'
            )

            with open(target_path, 'r') as converted_file:
                output = converted_file.read()

            for snippet in (
                "url('image.png')",
                "url('http://example.com/cat.jpg')",
                '"tubes.html"',
                '"http://example.com/lol.html"',
                "url('http://example.com/fox.jpg')",
                "url('ferret.jpg')",
                "hello world!!",
                "<hr/>",
            ):
                self.assertIn(snippet, output)
Example #7
0
    def test_html_converter(self):
        """Convert ``HTML_TEXT`` and check which link forms come out.

        Marked as done in the URL table: ``styles.css`` (as CSS),
        ``image.png``, ``tubes.html`` and ``ferret.jpg``. Only added:
        ``cat.jpg`` and ``fox.jpg``. ``lol.html`` is never added.

        The output must contain ``image.png``, ``tubes.html`` and
        ``ferret.jpg`` in short form and ``cat.jpg``, ``fox.jpg`` and
        ``lol.html`` as absolute URLs.
        """
        page_url = 'http://example.com/index.html'

        with TemporaryDirectory() as temp_dir:
            path_namer = PathNamer(temp_dir)
            url_table = URLTable()

            def mark_done(url, **fields):
                # Flag *url* as done, forwarding any extra table fields.
                url_table.update(url, status=Status.done, **fields)

            url_table.add([
                'http://example.com/styles.css',
                'http://example.com/image.png',
                'http://example.com/cat.jpg',
                'http://example.com/fox.jpg',
                'http://example.com/ferret.jpg',
                'http://example.com/tubes.html',
            ])
            mark_done('http://example.com/styles.css', link_type='css')
            mark_done('http://example.com/image.png')
            mark_done('http://example.com/tubes.html')
            mark_done('http://example.com/ferret.jpg')

            original_html = os.path.join(temp_dir, 'index.html')
            converted_html = os.path.join(temp_dir, 'index.html-new')

            with open(original_html, 'w') as handle:
                handle.write(HTML_TEXT)

            HTMLConverter(path_namer, url_table).convert(
                original_html, converted_html, base_url=page_url
            )

            with open(converted_html, 'r') as handle:
                result = handle.read()

            # Short forms are expected for the URLs marked as done,
            # absolute forms for everything else.
            self.assertIn("url('image.png')", result)
            self.assertIn("url('http://example.com/cat.jpg')", result)
            self.assertIn('"tubes.html"', result)
            self.assertIn('"http://example.com/lol.html"', result)
            self.assertIn("url('http://example.com/fox.jpg')", result)
            self.assertIn("url('ferret.jpg')", result)
Example #8
0
    def test_cdx_dedup(self):
        """Check that a previously visited payload is written as a revisit.

        A visit for ``/fennec`` is registered in the URL table up front
        (record ID and a digest string -- presumably the digest of the body
        used below; confirm against the recorder). ``/fennec`` and
        ``/horse`` are then recorded, both with the body ``b'kitbit'``.

        Expected in the WARC file: a ``revisit`` record pointing at the
        registered record ID, the truncation and profile headers, a
        ``Content-Length`` matching the first response's header size, both
        target URIs, and ``b'kitbit'`` exactly once. Expected in the CDX
        file: an entry for ``/horse``.
        """
        url_table = URLTable()
        warc_recorder = WARCRecorder(
            'asdf',
            params=WARCRecorderParams(
                compress=False, cdx=True, url_table=url_table
            )
        )

        def new_exchange(url, reason):
            # Build a request/response pair for *url* with a 200 status.
            new_request = Request.new(url)
            new_request.address = ('0.0.0.0', 80)
            return new_request, Response('HTTP/1.1', '200', reason)

        def record(req, resp):
            # Write the shared body, then feed the exchange to a session.
            with wpull.util.reset_file_offset(resp.body.content_file):
                resp.body.content_file.write(b'kitbit')

            with warc_recorder.session() as session:
                session.pre_request(req)
                session.request_data(req.header())
                session.request(req)
                session.pre_response(resp)
                session.response_data(resp.header())
                session.response_data(resp.body.content)
                session.response(resp)

        url_table.add_visits([
            (
                'http://example.com/fennec',
                '<urn:uuid:8a534d31-bd06-4056-8a0f-bdc5fd611036>',
                'B62D734VFEKIDLFAB7TTSCSZF64BKAYJ',
            )
        ])

        request, response = new_exchange('http://example.com/fennec', 'OK')
        # Measured before the body is written; compared with a
        # Content-Length in the WARC output below.
        revisit_response_header_size = len(response.header())
        record(request, response)

        request, response = new_exchange(
            'http://example.com/horse', 'OKaaaaaaaaaaaaaaaaaaaaaaaaaa'
        )
        record(request, response)

        _logger.info('FINISHED')

        warc_recorder.close()

        with open('asdf.warc', 'rb') as warc_file:
            warc_file_content = warc_file.read()

        with open('asdf.cdx', 'rb') as cdx_file:
            cdx_file_content = cdx_file.read()

        self.assertTrue(warc_file_content.startswith(b'WARC/1.0'))

        content_length_line = (
            b'Content-Length: '
            + str(revisit_response_header_size).encode('ascii')
            + b'\r\n'
        )
        for header_line in (
            b'WARC-Type: revisit\r\n',
            b'WARC-Refers-To: '
            b'<urn:uuid:8a534d31-bd06-4056-8a0f-bdc5fd611036>\r\n',
            b'WARC-Truncated: length\r\n',
            b'WARC-Profile: http://netpreserve.org/warc/1.0/revisit/'
            b'identical-payload-digest\r\n',
            content_length_line,
            b'WARC-Target-URI: http://example.com/fennec\r\n',
            b'WARC-Target-URI: http://example.com/horse\r\n',
        ):
            self.assertIn(header_line, warc_file_content)

        # The body shared by both responses must appear only once.
        self.assertEqual(1, warc_file_content.count(b'kitbit'))

        self.assertIn(b'http://example.com/horse ', cdx_file_content)