Example #1
0
    def test_html_layout(self):
        """Verify the sequence of element events for a simple document."""
        parser = self.get_html_parser()
        html_reader = HTMLReader(parser)

        doc = io.BytesIO(b'''
            <html>
                <head>
                    <title>hi</title>
                </head>
                <body>
                    <img>
                </body>
            </html>''')
        elements = tuple(html_reader.iter_elements(doc, encoding='ascii'))

        print(elements)

        # Open events shared by both parser back-ends.
        expected_tags = ['html', 'head', 'title', 'title', 'head',
                         'body', 'img']

        if isinstance(parser, LxmlHTMLParser):
            # lxml emits an extra event for the void <img> element.
            expected_tags += ['img', 'body', 'html']
        else:
            expected_tags += ['body', 'html']

        for index, expected_tag in enumerate(expected_tags):
            self.assertEqual(expected_tag, elements[index].tag)
Example #2
0
    def test_html_encoding(self):
        """Decode a one-tag document under every supported codec name."""
        parser = self.get_html_parser()
        lxml_in_use = isinstance(parser, LxmlHTMLParser)
        html_reader = HTMLReader(parser)

        byte_order_marks = {
            'utf_16_le': codecs.BOM_UTF16_LE,
            'utf_16_be': codecs.BOM_UTF16_BE,
            'utf_32_le': codecs.BOM_UTF32_LE,
            'utf_32_be': codecs.BOM_UTF32_BE,
        }

        for codec_name in CODEC_NAMES:
            # XXX: we're assuming that all codecs are ASCII backward
            # compatible
            if codec_name in EBCDIC or codec_name == 'utf_8_sig':
                continue

            # FIXME: libxml/lxml doesn't like it when we pass in a codec
            # name but don't specify the endian but BOM is included
            if lxml_in_use and codec_name.startswith(('utf_16', 'utf_32')):
                continue

            print('->', codec_name)

            prefix = byte_order_marks.get(codec_name, b'')
            stream = io.BytesIO(prefix + '<img>'.encode(codec_name))
            elements = tuple(
                html_reader.iter_elements(stream, encoding=codec_name))

            first_element = elements[0]
            if lxml_in_use:
                self.assertEqual('html', first_element.tag)
            else:
                self.assertEqual('img', first_element.tag)
Example #3
0
    def test_html_encoding(self):
        """The parser should yield an <html> root under every codec."""
        reader = HTMLReader(HTMLParser())

        boms = {
            'utf_16_le': codecs.BOM_UTF16_LE,
            'utf_16_be': codecs.BOM_UTF16_BE,
            'utf_32_le': codecs.BOM_UTF32_LE,
            'utf_32_be': codecs.BOM_UTF32_BE,
        }

        for codec in CODEC_NAMES:
            if codec in EBCDIC or codec == 'utf_8_sig':
                # XXX: we're assuming that all codecs are ASCII backward
                # compatible
                continue

            if codec.startswith(('utf_16', 'utf_32')):
                # FIXME: libxml/lxml doesn't like it when we pass in a codec
                # name but don't specify the endian but BOM is included
                continue

            print('->', codec)

            payload = boms.get(codec, b'') + '<img>'.encode(codec)
            elements = tuple(
                reader.iter_elements(io.BytesIO(payload), encoding=codec))

            self.assertEqual('html', elements[0].tag)
Example #4
0
    def test_html_script_comment(self):
        """An HTML comment inside <script> must still parse into elements."""
        document = b'''<script><!-- blah --></script>'''

        reader = HTMLReader(self.get_html_parser())
        parsed = tuple(
            reader.iter_elements(io.BytesIO(document), encoding='ascii'))

        for item in parsed:
            self.assertTrue(isinstance(item, Element))
Example #5
0
    def test_html_script_comment(self):
        """A comment nested in <script> should yield only Element objects."""
        document = b'''<script><!-- blah --></script>'''

        parsed = tuple(HTMLReader(HTMLParser()).iter_elements(
            io.BytesIO(document), encoding='ascii'))

        self.assertTrue(
            all(isinstance(item, Element) for item in parsed))
Example #6
0
    def scrape_document(self, item_session):
        """Skip link extraction for response bodies seen before.

        Hashes the (normalized) body and consults the dupes database so
        that identical pages do not have their links scraped repeatedly.
        """
        response = item_session.response
        url = item_session.request.url_info.raw

        # Bodies of 30 MiB or more are never checked for duplicates.
        if response_body_size(response) < 30 * 1024 * 1024:
            body = response.body.content()
            if HTMLReader.is_response(response):
                # Normalize HTML so trivial differences don't defeat hashing.
                body = dupespotter.process_body(body, url)
            digest = hashlib.md5(body).digest()

            dupe_of = None
            if self.dupes_db is not None:
                dupe_of = self.dupes_db.get_old_url(digest)

            if dupe_of is not None:
                # Don't extract links from pages we've already seen
                # to avoid loops that descend a directory endlessly
                print("DUPE {}\n  OF {}".format(url, dupe_of))
                return

            if self.dupes_db is not None:
                self.dupes_db.set_old_url(digest, url)

        super().scrape_document(item_session)
Example #7
0
    def process(self, url_item, request, response, file_writer_session):
        '''Process PhantomJS.

        Coroutine.
        '''
        # Only render pages that were fetched successfully.
        if response.status_code != 200:
            return

        # PhantomJS rendering only applies to HTML content.
        if not HTMLReader.is_supported(request=request, response=response):
            return

        _logger.debug('Starting PhantomJS processing.')

        self._file_writer_session = file_writer_session

        # FIXME: this is a quick hack for crashes. See #137.
        attempts = int(os.environ.get('WPULL_PHANTOMJS_TRIES', 5))

        # Retry the driver on crashes; stop immediately on success
        # or on a page-load timeout.
        for dummy in range(attempts):
            try:
                yield From(self._run_driver(url_item, request, response))
            except trollius.TimeoutError:
                _logger.warning(_('Waiting for page load timed out.'))
                break
            except PhantomJSCrashed as error:
                _logger.exception(__('PhantomJS crashed: {}', error))
            else:
                break
        else:
            # Loop exhausted without a break: every attempt crashed.
            _logger.warning(
                __(_('PhantomJS failed to fetch ‘{url}’. I am sorry.'),
                   url=request.url_info.url))
Example #8
0
    def process(self, item_session: ItemSession, request, response, file_writer_session):
        '''Process PhantomJS.

        Coroutine.
        '''
        # Only render pages that were fetched successfully.
        if response.status_code != 200:
            return

        # PhantomJS rendering only applies to HTML content.
        if not HTMLReader.is_supported(request=request, response=response):
            return

        _logger.debug('Starting PhantomJS processing.')

        self._file_writer_session = file_writer_session

        # FIXME: this is a quick hack for crashes. See #137.
        attempts = int(os.environ.get('WPULL_PHANTOMJS_TRIES', 5))

        # Retry the driver on crashes; stop immediately on success
        # or on a page-load timeout.
        for dummy in range(attempts):
            try:
                yield from self._run_driver(item_session, request, response)
            except asyncio.TimeoutError:
                _logger.warning(_('Waiting for page load timed out.'))
                break
            except PhantomJSCrashed as error:
                _logger.exception(__('PhantomJS crashed: {}', error))
            else:
                break
        else:
            # Loop exhausted without a break: every attempt crashed.
            _logger.warning(__(
                _('PhantomJS failed to fetch ‘{url}’. I am sorry.'),
                url=request.url_info.url
            ))
Example #9
0
    def _append_filename_extension(self, response: BaseResponse):
        '''Append an HTML/CSS file suffix as needed.'''
        filename = self._filename

        # Nothing to do without a filename or for non-HTTP schemes.
        if not filename:
            return
        if response.request.url_info.scheme not in ('http', 'https'):
            return

        if not re.search(r'\.[hH][tT][mM][lL]?$', filename) \
                and HTMLReader.is_response(response):
            self._filename += '.html'
        elif not re.search(r'\.[cC][sS][sS]$', filename) \
                and CSSReader.is_response(response):
            self._filename += '.css'
Example #10
0
    def test_html_encoding(self):
        """Check the first element's tag for every BOM-free codec."""
        parser = self.get_html_parser()
        reader = HTMLReader(parser)
        # lxml wraps the fragment in an <html> root; the fallback
        # parser yields <img> directly.
        expected_tag = (
            'html' if isinstance(parser, LxmlHTMLParser) else 'img')

        for codec_name in CODEC_NAMES:
            if codec_name in EBCDIC or codec_name == 'utf_8_sig':
                # XXX: we're assuming that all codecs are ASCII backward
                # compatible
                continue

            if codec_name.endswith(('_le', '_be')):
                # XXX: Assume BOM is always included
                continue

            print(codec_name)
            stream = io.BytesIO('<img>'.encode(codec_name))
            elements = tuple(
                reader.iter_elements(stream, encoding=codec_name))

            self.assertEqual(expected_tag, elements[0].tag)
Example #11
0
    def _append_filename_extension(self, response):
        '''Append an HTML/CSS file suffix as needed.'''
        filename = self._filename

        # Nothing to do without a filename or for non-HTTP schemes.
        if not filename:
            return
        if response.request.url_info.scheme not in ('http', 'https'):
            return

        if not re.search(r'\.[hH][tT][mM][lL]?$', filename) \
                and HTMLReader.is_response(response):
            self._filename += '.html'
        elif not re.search(r'\.[cC][sS][sS]$', filename) \
                and CSSReader.is_response(response):
            self._filename += '.css'
Example #12
0
    def process(self, url_item, request, response, file_writer_session):
        """Hand a successfully fetched HTML page to youtube-dl.

        Coroutine.
        """
        # Only HTML pages fetched with status 200 are processed.
        if response.status_code != 200:
            return
        if not HTMLReader.is_supported(request=request, response=response):
            return

        session = Session(
            self._proxy_address, self._youtube_dl_path, self._root_path,
            url_item, file_writer_session, self._user_agent,
            self._warc_recorder, self._inet_family, self._check_certificate)

        url = url_item.url_info.url
        _logger.info(__(_('youtube-dl fetching ‘{url}’.'), url=url))

        # Ensure the session is closed even if the run fails.
        with contextlib.closing(session):
            yield From(session.run())

        _logger.info(__(_('youtube-dl fetched ‘{url}’.'), url=url))
Example #13
0
    def process(self, item_session: ItemSession, request, response, file_writer_session):
        """Hand a successfully fetched HTML page to youtube-dl.

        Coroutine.
        """
        # Only HTML pages fetched with status 200 are processed.
        if response.status_code != 200:
            return
        if not HTMLReader.is_supported(request=request, response=response):
            return

        session = Session(self._proxy_address, self._youtube_dl_path,
                          self._root_path, item_session,
                          file_writer_session, self._user_agent,
                          self._warc_recorder, self._inet_family,
                          self._check_certificate)

        url = item_session.url_record.url
        _logger.info(__(_('youtube-dl fetching ‘{url}’.'), url=url))

        # Ensure the session is closed even if the run fails.
        with contextlib.closing(session):
            yield from session.run()

        _logger.info(__(_('youtube-dl fetched ‘{url}’.'), url=url))
Example #14
0
	def scrape_document(self, request, response, url_item):
		"""Skip link extraction for response bodies seen before."""
		# Bodies of 30 MiB or more are never checked for duplicates.
		if response.body.size() < 30*1024*1024:
			dupes_db = self.dupes_db
			body = response.body.content()
			# Normalize HTML so trivial differences don't defeat hashing.
			if HTMLReader.is_response(response):
				body = dupespotter.process_body(body, response.request.url)
			digest = hashlib.md5(body).digest()
			if dupes_db is not None:
				dupe_of = dupes_db.get_old_url(digest)
			else:
				dupe_of = None
			if dupe_of is not None:
				# Don't extract links from pages we've already seen
				# to avoid loops that descend a directory endlessly
				print("  DUPE {}\n      OF {}".format(response.request.url, dupe_of))
				return
			else:
				# First sighting: remember the digest for later requests.
				if dupes_db is not None:
					dupes_db.set_old_url(digest, response.request.url)

		super().scrape_document(request, response, url_item)
Example #15
0
    def scrape_document(self, request, response, url_item):
        """Suppress link extraction when the body duplicates a prior page."""
        # Bodies of 30 MiB or more are never checked for duplicates.
        if response.body.size() < 30*1024*1024:
            dupes_db = self.dupes_db
            body = response.body.content()
            if HTMLReader.is_response(response):
                # Normalize HTML so trivial differences don't defeat hashing.
                body = archivebot.dupespotter.dupespotter.process_body(body, response.request.url)

            digest = hashlib.md5(body).digest()
            dupe_of = (dupes_db.get_old_url(digest)
                       if dupes_db is not None else None)

            if dupe_of is not None:
                # Don't extract links from pages we've already seen
                # to avoid loops that descend a directory endlessly
                print("  DUPE {}\n      OF {}".format(response.request.url, dupe_of))
                sys.stdout.flush()
                return

            if dupes_db is not None:
                dupes_db.set_old_url(digest, response.request.url)

        super().scrape_document(request, response, url_item)
Example #16
0
    def test_html_early_html(self):
        """An <img> must be found even when it sits outside <html>."""
        reader = HTMLReader(self.get_html_parser())

        # Documents with content before, after, or out of order relative
        # to the <html> element; all of them contain an <img> somewhere.
        for test_string in [
            b'''<!DOCTYPE HTML><html></html><img>''',
            b'''<html></html><img>''',
            b'''<!DOCTYPE HTML><img><html></html>''',
            b'''<img><html></html>''',
            b'''<!DOCTYPE HTML>
                <html><body></body></html><p><img>''',
            b'''
                <html><body></body></html><p><img>''',
            b'''
                <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
                "http://www.w3.org/TR/html4/loose.dtd">
                <html>
                <head>
                <title>Download</title>
                </head>
                <body>
                <br />
                </body>
                </html>
                <pre><img></pre>
            ''',
            b'''
                <html>
                <head>
                <title>Download</title>
                </head>
                <body>
                <br />
                </body>
                </html>
                <pre><img></pre>
            ''',
            b'''
                <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
                "http://www.w3.org/TR/html4/loose.dtd">
                <html>
                <body>
                <br />
                </body>
                <head>
                <title>Download</title>
                </head>
                </html>
                <pre><img></pre>
            ''',
            b'''
                <html>
                <body>
                <br />
                </body>
                <head>
                <title>Download</title>
                </head>
                </html>
                <pre><img></pre>
            ''',
        ]:
            elements = []
            print()
            print('a' * 10)
            print(test_string)
            # Keep only Element events; other event types are ignored.
            for element in reader.iter_elements(io.BytesIO(test_string),
                                                encoding='ascii'):
                if isinstance(element, Element):
                    print(element)
                    elements.append(element)

            element_tags = tuple(element.tag for element in elements)
            self.assertIn('img', element_tags)
Example #17
0
    def test_html_detect(self):
        """Exercise the HTML sniffing helpers on files, URLs and responses."""
        # (file contents, should be detected as HTML)
        file_cases = (
            ('<html><body>hi</body></html>'.encode('utf-16le'), True),
            ('hello world!'.encode('utf-16le'), False),
            (b'<title>hello</title>hi', True),
            (b'<html><body>hello', True),
            (b'The document has moved <a href="somewhere.html">here</a>',
             True),
        )
        for content, is_html in file_cases:
            result = HTMLReader.is_file(io.BytesIO(content))
            if is_html:
                self.assertTrue(result)
            else:
                self.assertFalse(result)

        # Every conventional HTML-ish extension should be recognized.
        for url in ('example.com/index.htm', 'example.com/index.html',
                    'example.com/index.dhtm', 'example.com/index.xhtml',
                    'example.com/index.xht'):
            self.assertTrue(HTMLReader.is_url(URLInfo.parse(url)))

        self.assertFalse(
            HTMLReader.is_url(URLInfo.parse('example.com/image.jpg')))

        self.assertTrue(
            HTMLReader.is_request(Request('example.com/index.html')))
        self.assertFalse(
            HTMLReader.is_request(Request('example.com/image.jpg')))

        response = Response(200, 'OK')
        response.fields['Content-Type'] = 'text/html'
        self.assertTrue(HTMLReader.is_response(response))

        response = Response(200, 'OK')
        response.fields['Content-Type'] = 'image/png'
        self.assertFalse(HTMLReader.is_response(response))
Example #18
0
    def test_html_early_html(self):
        """An <img> must be found even when it sits outside <html>."""
        reader = HTMLReader(HTMLParser())

        # Documents with content before, after, or out of order relative
        # to the <html> element; all of them contain an <img> somewhere.
        for test_string in [
                b'''<!DOCTYPE HTML><html></html><img>''',
                b'''<html></html><img>''',
                b'''<!DOCTYPE HTML><img><html></html>''',
                b'''<img><html></html>''',
                b'''<!DOCTYPE HTML>
                <html><body></body></html><p><img>''',
                b'''
                <html><body></body></html><p><img>''',
                b'''
                <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
                "http://www.w3.org/TR/html4/loose.dtd">
                <html>
                <head>
                <title>Download</title>
                </head>
                <body>
                <br />
                </body>
                </html>
                <pre><img></pre>
            ''',
                b'''
                <html>
                <head>
                <title>Download</title>
                </head>
                <body>
                <br />
                </body>
                </html>
                <pre><img></pre>
            ''',
                b'''
                <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
                "http://www.w3.org/TR/html4/loose.dtd">
                <html>
                <body>
                <br />
                </body>
                <head>
                <title>Download</title>
                </head>
                </html>
                <pre><img></pre>
            ''',
                b'''
                <html>
                <body>
                <br />
                </body>
                <head>
                <title>Download</title>
                </head>
                </html>
                <pre><img></pre>
            ''',
        ]:
            elements = []
            print()
            print('a' * 10)
            print(test_string)
            # Keep only Element events; other event types are ignored.
            for element in reader.iter_elements(io.BytesIO(test_string),
                                                encoding='ascii'):
                if isinstance(element, Element):
                    print(element)
                    elements.append(element)

            element_tags = tuple(element.tag for element in elements)
            self.assertIn('img', element_tags)
Example #19
0
    def test_html_detect(self):
        """Exercise the HTML sniffing helpers on files, URLs and responses."""
        # File contents that should sniff as HTML.
        for blob in (
                '<html><body>hi</body></html>'.encode('utf-16le'),
                b'<title>hello</title>hi',
                b'<html><body>hello',
                b'The document has moved <a href="somewhere.html">here</a>'):
            self.assertTrue(HTMLReader.is_file(io.BytesIO(blob)))

        self.assertFalse(HTMLReader.is_file(
            io.BytesIO('hello world!'.encode('utf-16le'))
        ))

        # Every conventional HTML-ish extension should be recognized.
        for url in ('example.com/index.htm', 'example.com/index.html',
                    'example.com/index.dhtm', 'example.com/index.xhtml',
                    'example.com/index.xht'):
            self.assertTrue(HTMLReader.is_url(URLInfo.parse(url)))

        self.assertFalse(
            HTMLReader.is_url(URLInfo.parse('example.com/image.jpg'))
        )

        self.assertTrue(
            HTMLReader.is_request(Request('example.com/index.html'))
        )
        self.assertFalse(
            HTMLReader.is_request(Request('example.com/image.jpg'))
        )

        response = Response(200, 'OK')
        response.fields['Content-Type'] = 'text/html'
        self.assertTrue(HTMLReader.is_response(response))

        response = Response(200, 'OK')
        response.fields['Content-Type'] = 'image/png'
        self.assertFalse(HTMLReader.is_response(response))