Example #1
0
    def test_html_layout(self):
        '''Check the order of the elements produced by ``read_tree``.

        A small ASCII document is read and the ``tag`` of each of the first
        ten emitted elements is compared with the expected sequence. Every
        tag shows up twice in that sequence -- presumably once for the start
        and once for the end of the element (TODO confirm against
        ``HTMLReader.read_tree``).
        '''
        document = io.BytesIO(b'''
            <html>
                <head>
                    <title>hi</title>
                </head>
                <body>
                    <img>
                </body>
            </html>''')
        reader = HTMLReader()
        elements = tuple(reader.read_tree(document, encoding='ascii'))

        print(elements)

        expected_tags = (
            'html', 'head', 'title', 'title', 'head',
            'body', 'img', 'img', 'body', 'html',
        )

        # Index explicitly (rather than zip) so that a too-short result
        # still fails with an IndexError.
        for index, expected_tag in enumerate(expected_tags):
            self.assertEqual(expected_tag, elements[index].tag)
Example #2
0
    def test_html_layout(self):
        '''Check the order of the elements produced by ``read_tree``.

        A small ASCII document is read and the ``tag`` of each of the first
        ten emitted elements is compared with the expected sequence. Every
        tag shows up twice in that sequence -- presumably once for the start
        and once for the end of the element (TODO confirm against
        ``HTMLReader.read_tree``).
        '''
        document = io.BytesIO(b'''
            <html>
                <head>
                    <title>hi</title>
                </head>
                <body>
                    <img>
                </body>
            </html>''')
        reader = HTMLReader()
        elements = tuple(reader.read_tree(document, encoding='ascii'))

        print(elements)

        expected_tags = (
            'html', 'head', 'title', 'title', 'head',
            'body', 'img', 'img', 'body', 'html',
        )

        # Index explicitly (rather than zip) so that a too-short result
        # still fails with an IndexError.
        for index, expected_tag in enumerate(expected_tags):
            self.assertEqual(expected_tag, elements[index].tag)
Example #3
0
    def test_html_encoding(self):
        '''Check ``read_links`` with every codec listed in ``CODEC_NAMES``.

        For each codec, ``<img>`` is encoded with it and read back using the
        same codec name; the first element returned must have the tag
        ``html``.
        '''
        reader = HTMLReader()

        for codec_name in CODEC_NAMES:
            encoded_document = '<img>'.encode(codec_name)
            elements = tuple(
                reader.read_links(
                    io.BytesIO(encoded_document), encoding=codec_name
                )
            )
            first_element = elements[0]

            self.assertEqual('html', first_element.tag)
Example #4
0
    def test_html_encoding(self):
        '''Check ``read_links`` with every codec listed in ``CODEC_NAMES``.

        For each codec, ``<img>`` is encoded with it and read back using the
        same codec name; the first element returned must have the tag
        ``html``.
        '''
        reader = HTMLReader()

        for codec_name in CODEC_NAMES:
            encoded_document = '<img>'.encode(codec_name)
            elements = tuple(
                reader.read_links(
                    io.BytesIO(encoded_document), encoding=codec_name
                )
            )
            first_element = elements[0]

            self.assertEqual('html', first_element.tag)
Example #5
0
    def _process_phantomjs(self, request, response):
        '''Load the request URL in PhantomJS and scrape the resulting page.

        Nothing happens unless a PhantomJS controller is configured, the
        response status code is 200 and ``HTMLReader.is_supported`` accepts
        the request/response pair.

        This is a generator: each remote PhantomJS operation is yielded,
        presumably to a coroutine scheduler/decorator that is outside this
        view (TODO confirm).
        '''
        # Same checks, same order as separate guard clauses; ``or``
        # short-circuits so later conditions are only evaluated if needed.
        if (not self._processor.instances.phantomjs_controller
                or response.status_code != 200
                or not HTMLReader.is_supported(
                    request=request, response=response)):
            return

        _logger.debug('Starting PhantomJS processing.')

        phantomjs = self._processor.instances.phantomjs_controller

        with phantomjs.client.remote() as rpc:
            self._hook_phantomjs_logging(rpc)

            yield phantomjs.apply_page_size(rpc)
            yield rpc.call('page.open', request.url_info.url)
            yield rpc.wait_page_event('load_finished')
            yield phantomjs.control(rpc)

            # FIXME: not sure where the logic should fit in
            if phantomjs._snapshot:
                yield self._take_phantomjs_snapshot(phantomjs, rpc)

            page_content = yield rpc.eval('page.content')

        # Wrap the content read from PhantomJS in a response object and
        # scrape that instead of the original response.
        self._scrape_document(
            request, self._new_phantomjs_response(response, page_content)
        )

        _logger.debug('Ended PhantomJS processing.')
Example #6
0
    def _process_phantomjs(self, request, response):
        '''Load the request URL in PhantomJS and scrape the resulting page.

        Nothing happens unless a PhantomJS controller is configured, the
        response status code is 200 and ``HTMLReader.is_supported`` accepts
        the request/response pair.

        The page load is attempted up to ``WPULL_PHANTOMJS_TRIES`` times
        (environment variable, default 5); an attempt is retried when it
        raises ``PhantomJSRPCTimedOut``. If no attempt produces page
        content, scraping is skipped.

        This is a generator: each remote PhantomJS operation is yielded,
        presumably to a coroutine scheduler/decorator that is outside this
        view (TODO confirm).
        '''
        if not self._processor.instances.phantomjs_controller:
            return

        if response.status_code != 200:
            return

        if not HTMLReader.is_supported(request=request, response=response):
            return

        _logger.debug('Starting PhantomJS processing.')

        controller = self._processor.instances.phantomjs_controller

        attempts = int(os.environ.get('WPULL_PHANTOMJS_TRIES', 5))

        # Stays None when every attempt times out (or when zero attempts
        # are configured), so the code below never reads an unbound name.
        content = None

        for dummy in range(attempts):
            # FIXME: this is a quick hack for handling time outs. See #137.
            try:
                with controller.client.remote() as remote:
                    self._hook_phantomjs_logging(remote)

                    yield controller.apply_page_size(remote)
                    yield remote.call('page.open', request.url_info.url)
                    yield remote.wait_page_event('load_finished')
                    yield controller.control(remote)

                    # FIXME: not sure where the logic should fit in
                    if controller._snapshot:
                        yield self._take_phantomjs_snapshot(controller, remote)

                    content = yield remote.eval('page.content')
            except PhantomJSRPCTimedOut:
                _logger.exception('PhantomJS timed out.')
            else:
                break

        # Only scrape when page content was actually retrieved.
        if content is not None:
            mock_response = self._new_phantomjs_response(response, content)

            self._scrape_document(request, mock_response)

        _logger.debug('Ended PhantomJS processing.')
Example #7
0
 def test_html_parse_doctype(self):
     '''Check ``HTMLReader.parse_doctype`` on HTML, XHTML and other input.

     The result must contain ``html`` for a short HTML doctype and
     ``XHTML`` for an XHTML 1.0 Transitional doctype, and must be falsy
     for input that carries no doctype.
     '''
     html_doctype = HTMLReader.parse_doctype(
         io.BytesIO(b'<!DOCTYPE HTML><html></html>')
     )
     self.assertIn('html', html_doctype)

     xhtml_doctype = HTMLReader.parse_doctype(
         io.BytesIO(b'''
             <!DOCTYPE html PUBLIC
             "-//W3C//DTD XHTML 1.0 Transitional//EN"
             "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
             <html></html>
             ''')
     )
     self.assertIn('XHTML', xhtml_doctype)

     # Plain text, empty input, a NUL byte and a non-ASCII byte.
     for non_html in (b'hello world!', b'', b'\x00', b'A\xfe'):
         self.assertFalse(HTMLReader.parse_doctype(io.BytesIO(non_html)))
Example #8
0
 def test_html_parse_doctype(self):
     '''Check ``HTMLReader.parse_doctype`` on HTML, XHTML and other input.

     The result must contain ``html`` for a short HTML doctype and
     ``XHTML`` for an XHTML 1.0 Transitional doctype, and must be falsy
     for input that carries no doctype.
     '''
     html_doctype = HTMLReader.parse_doctype(
         io.BytesIO(b'<!DOCTYPE HTML><html></html>')
     )
     self.assertIn('html', html_doctype)

     xhtml_doctype = HTMLReader.parse_doctype(
         io.BytesIO(b'''
             <!DOCTYPE html PUBLIC
             "-//W3C//DTD XHTML 1.0 Transitional//EN"
             "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
             <html></html>
             ''')
     )
     self.assertIn('XHTML', xhtml_doctype)

     # Plain text, empty input, a NUL byte and a non-ASCII byte.
     for non_html in (b'hello world!', b'', b'\x00', b'A\xfe'):
         self.assertFalse(HTMLReader.parse_doctype(io.BytesIO(non_html)))
Example #9
0
    def test_html_early_html(self):
        '''Check that an ``<img>`` outside the ``<html>`` element is read.

        Every sample places ``<img>`` before or after the
        ``<html>...</html>`` pair. For each one, the last element from
        ``read_links`` and the fourth-from-last element from ``read_tree``
        must have the tag ``img``.
        '''
        documents = [
            b'''<!DOCTYPE HTML><html></html><img>''',
            b'''<html></html><img>''',
            b'''<!DOCTYPE HTML><img><html></html>''',
            b'''<img><html></html>''',
            b'''<!DOCTYPE HTML>
                <html><body></body></html><p><img>''',
            b'''
                <html><body></body></html><p><img>''',
            b'''
                <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
                "http://www.w3.org/TR/html4/loose.dtd">
                <html>
                <head>
                <title>Download</title>
                </head>
                <body>
                <br />
                </body>
                </html>
                <pre><img></pre>
            ''',
            b'''
                <html>
                <head>
                <title>Download</title>
                </head>
                <body>
                <br />
                </body>
                </html>
                <pre><img></pre>
            ''',
            b'''
                <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
                "http://www.w3.org/TR/html4/loose.dtd">
                <html>
                <body>
                <br />
                </body>
                <head>
                <title>Download</title>
                </head>
                </html>
                <pre><img></pre>
            ''',
            b'''
                <html>
                <body>
                <br />
                </body>
                <head>
                <title>Download</title>
                </head>
                </html>
                <pre><img></pre>
            ''',
        ]
        reader = HTMLReader()

        for document in documents:
            link_elements = tuple(
                reader.read_links(io.BytesIO(document), encoding='ascii')
            )
            self.assertEqual('img', link_elements[-1].tag)

            tree_elements = tuple(
                reader.read_tree(io.BytesIO(document), encoding='ascii')
            )
            self.assertEqual('img', tree_elements[-4].tag)
Example #10
0
    def test_html_detect(self):
        '''Check the HTML detection helpers of ``HTMLReader``.

        Exercises ``is_file``, ``is_url``, ``is_request`` and
        ``is_response`` with inputs that should and should not be detected
        as HTML.
        '''
        def check(expected, result):
            # Truthiness checks, not equality with True/False.
            if expected:
                self.assertTrue(result)
            else:
                self.assertFalse(result)

        file_cases = (
            (True, '<html><body>hi</body></html>'.encode('utf-16le')),
            (False, 'hello world!'.encode('utf-16le')),
            (True, b'<title>hello</title>hi'),
            (True, b'<html><body>hello'),
            (
                True,
                b'The document has moved <a href="somewhere.html">here</a>',
            ),
        )
        for expected, data in file_cases:
            check(expected, HTMLReader.is_file(io.BytesIO(data)))

        url_cases = (
            (True, 'example.com/index.htm'),
            (True, 'example.com/index.html'),
            (True, 'example.com/index.dhtm'),
            (True, 'example.com/index.xhtml'),
            (True, 'example.com/index.xht'),
            (False, 'example.com/image.jpg'),
        )
        for expected, url in url_cases:
            check(expected, HTMLReader.is_url(URLInfo.parse(url)))

        request_cases = (
            (True, 'example.com/index.html'),
            (False, 'example.com/image.jpg'),
        )
        for expected, url in request_cases:
            check(expected, HTMLReader.is_request(Request.new(url)))

        response_cases = (
            (True, 'text/html'),
            (False, 'image/png'),
        )
        for expected, content_type in response_cases:
            response = Response('HTTP/1.0', '200', 'OK')
            response.fields['Content-Type'] = content_type
            check(expected, HTMLReader.is_response(response))
Example #11
0
    def test_html_early_html(self):
        '''Check that an ``<img>`` outside the ``<html>`` element is read.

        Every sample places ``<img>`` before or after the
        ``<html>...</html>`` pair. For each one, the last element from
        ``read_links`` and the fourth-from-last element from ``read_tree``
        must have the tag ``img``.
        '''
        documents = [
            b'''<!DOCTYPE HTML><html></html><img>''',
            b'''<html></html><img>''',
            b'''<!DOCTYPE HTML><img><html></html>''',
            b'''<img><html></html>''',
            b'''<!DOCTYPE HTML>
                <html><body></body></html><p><img>''',
            b'''
                <html><body></body></html><p><img>''',
            b'''
                <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
                "http://www.w3.org/TR/html4/loose.dtd">
                <html>
                <head>
                <title>Download</title>
                </head>
                <body>
                <br />
                </body>
                </html>
                <pre><img></pre>
            ''',
            b'''
                <html>
                <head>
                <title>Download</title>
                </head>
                <body>
                <br />
                </body>
                </html>
                <pre><img></pre>
            ''',
            b'''
                <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
                "http://www.w3.org/TR/html4/loose.dtd">
                <html>
                <body>
                <br />
                </body>
                <head>
                <title>Download</title>
                </head>
                </html>
                <pre><img></pre>
            ''',
            b'''
                <html>
                <body>
                <br />
                </body>
                <head>
                <title>Download</title>
                </head>
                </html>
                <pre><img></pre>
            ''',
        ]
        reader = HTMLReader()

        for document in documents:
            link_elements = tuple(
                reader.read_links(io.BytesIO(document), encoding='ascii')
            )
            self.assertEqual('img', link_elements[-1].tag)

            tree_elements = tuple(
                reader.read_tree(io.BytesIO(document), encoding='ascii')
            )
            self.assertEqual('img', tree_elements[-4].tag)
Example #12
0
    def test_html_detect(self):
        '''Check the HTML detection helpers of ``HTMLReader``.

        Exercises ``is_file``, ``is_url``, ``is_request`` and
        ``is_response`` with inputs that should and should not be detected
        as HTML.
        '''
        def check(expected, result):
            # Truthiness checks, not equality with True/False.
            if expected:
                self.assertTrue(result)
            else:
                self.assertFalse(result)

        file_cases = (
            (True, '<html><body>hi</body></html>'.encode('utf-16le')),
            (False, 'hello world!'.encode('utf-16le')),
            (True, b'<title>hello</title>hi'),
            (True, b'<html><body>hello'),
            (
                True,
                b'The document has moved <a href="somewhere.html">here</a>',
            ),
        )
        for expected, data in file_cases:
            check(expected, HTMLReader.is_file(io.BytesIO(data)))

        url_cases = (
            (True, 'example.com/index.htm'),
            (True, 'example.com/index.html'),
            (True, 'example.com/index.dhtm'),
            (True, 'example.com/index.xhtml'),
            (True, 'example.com/index.xht'),
            (False, 'example.com/image.jpg'),
        )
        for expected, url in url_cases:
            check(expected, HTMLReader.is_url(URLInfo.parse(url)))

        request_cases = (
            (True, 'example.com/index.html'),
            (False, 'example.com/image.jpg'),
        )
        for expected, url in request_cases:
            check(expected, HTMLReader.is_request(Request.new(url)))

        response_cases = (
            (True, 'text/html'),
            (False, 'image/png'),
        )
        for expected, content_type in response_cases:
            response = Response('HTTP/1.0', '200', 'OK')
            response.fields['Content-Type'] = content_type
            check(expected, HTMLReader.is_response(response))