def test_html_layout(self):
    '''Check the order of element events emitted for a small document.'''
    parser = self.get_html_parser()
    reader = HTMLReader(parser)
    doc = io.BytesIO(b''' <html> <head> <title>hi</title> </head> <body> <img> </body> </html>''')
    elements = tuple(reader.iter_elements(doc, encoding='ascii'))
    print(elements)

    # Every parser backend reports the same leading sequence.
    leading = ('html', 'head', 'title', 'title', 'head', 'body', 'img')

    for index, tag in enumerate(leading):
        self.assertEqual(tag, elements[index].tag)

    if isinstance(parser, LxmlHTMLParser):
        # lxml emits an extra close event for the void img element.
        trailing = ('img', 'body', 'html')
    else:
        trailing = ('body', 'html')

    for index, tag in enumerate(trailing, start=len(leading)):
        self.assertEqual(tag, elements[index].tag)
def test_html_encoding(self):
    '''Parse a tiny document encoded in each supported codec.'''
    parser = self.get_html_parser()
    is_lxml = isinstance(parser, LxmlHTMLParser)
    reader = HTMLReader(parser)
    bom_map = {
        'utf_16_le': codecs.BOM_UTF16_LE,
        'utf_16_be': codecs.BOM_UTF16_BE,
        'utf_32_le': codecs.BOM_UTF32_LE,
        'utf_32_be': codecs.BOM_UTF32_BE,
    }

    for name in CODEC_NAMES:
        if name in EBCDIC or name == 'utf_8_sig':
            # XXX: we're assuming that all codecs are ASCII backward
            # compatible
            continue

        if is_lxml and name.startswith(('utf_16', 'utf_32')):
            # FIXME: libxml/lxml doesn't like it when we pass in a codec
            # name but don't specify the endian but BOM is included
            continue

        print('->', name)

        payload = bom_map.get(name, b'') + '<img>'.encode(name)
        elements = tuple(
            reader.iter_elements(io.BytesIO(payload), encoding=name))
        first = elements[0]

        # lxml wraps the fragment in a full document; html5lib does not.
        self.assertEqual('html' if is_lxml else 'img', first.tag)
def test_html_encoding(self):
    '''Parse a tiny document encoded in each supported codec.'''
    reader = HTMLReader(HTMLParser())
    bom_map = {
        'utf_16_le': codecs.BOM_UTF16_LE,
        'utf_16_be': codecs.BOM_UTF16_BE,
        'utf_32_le': codecs.BOM_UTF32_LE,
        'utf_32_be': codecs.BOM_UTF32_BE,
    }

    for name in CODEC_NAMES:
        if name in EBCDIC or name == 'utf_8_sig':
            # XXX: we're assuming that all codecs are ASCII backward
            # compatible
            continue

        if name.startswith(('utf_16', 'utf_32')):
            # FIXME: libxml/lxml doesn't like it when we pass in a codec
            # name but don't specify the endian but BOM is included
            continue

        print('->', name)

        payload = bom_map.get(name, b'') + '<img>'.encode(name)
        elements = tuple(
            reader.iter_elements(io.BytesIO(payload), encoding=name))

        self.assertEqual('html', elements[0].tag)
def test_html_script_comment(self):
    '''A comment inside a script tag must not break parsing.'''
    markup = b'''<script><!-- blah --></script>'''
    reader = HTMLReader(self.get_html_parser())
    parsed = tuple(
        reader.iter_elements(io.BytesIO(markup), encoding='ascii'))

    self.assertTrue(all(isinstance(item, Element) for item in parsed))
def test_html_script_comment(self):
    '''A comment inside a script tag must not break parsing.'''
    markup = b'''<script><!-- blah --></script>'''
    reader = HTMLReader(HTMLParser())
    parsed = tuple(
        reader.iter_elements(io.BytesIO(markup), encoding='ascii'))

    self.assertTrue(all(isinstance(item, Element) for item in parsed))
def scrape_document(self, item_session):
    '''Skip link extraction for pages whose body duplicates one seen before.'''
    response = item_session.response
    url = item_session.request.url_info.raw

    # Only bother hashing bodies below 30 MiB.
    if response_body_size(response) < 30 * 1024 * 1024:
        db = self.dupes_db
        content = response.body.content()

        if HTMLReader.is_response(response):
            content = dupespotter.process_body(content, url)

        fingerprint = hashlib.md5(content).digest()
        previous = db.get_old_url(fingerprint) if db is not None else None

        if previous is not None:
            # Don't extract links from pages we've already seen
            # to avoid loops that descend a directory endlessly
            print("DUPE {}\n OF {}".format(url, previous))
            return

        if db is not None:
            db.set_old_url(fingerprint, url)

    super().scrape_document(item_session)
def process(self, url_item, request, response, file_writer_session):
    '''Process PhantomJS.

    Coroutine.
    '''
    if response.status_code != 200:
        return

    if not HTMLReader.is_supported(request=request, response=response):
        return

    _logger.debug('Starting PhantomJS processing.')
    self._file_writer_session = file_writer_session

    # FIXME: this is a quick hack for crashes. See #137.
    max_tries = int(os.environ.get('WPULL_PHANTOMJS_TRIES', 5))

    for try_num in range(max_tries):
        try:
            yield From(self._run_driver(url_item, request, response))
        except trollius.TimeoutError:
            _logger.warning(_('Waiting for page load timed out.'))
            break
        except PhantomJSCrashed as error:
            # Crash: retry up to max_tries times.
            _logger.exception(__('PhantomJS crashed: {}', error))
        else:
            break
    else:
        # Every attempt crashed.
        _logger.warning(__(
            _('PhantomJS failed to fetch ‘{url}’. I am sorry.'),
            url=request.url_info.url))
def process(self, item_session: ItemSession, request, response, file_writer_session):
    '''Process PhantomJS.

    Coroutine.
    '''
    if response.status_code != 200:
        return

    if not HTMLReader.is_supported(request=request, response=response):
        return

    _logger.debug('Starting PhantomJS processing.')
    self._file_writer_session = file_writer_session

    # FIXME: this is a quick hack for crashes. See #137.
    max_tries = int(os.environ.get('WPULL_PHANTOMJS_TRIES', 5))

    for try_num in range(max_tries):
        try:
            yield from self._run_driver(item_session, request, response)
        except asyncio.TimeoutError:
            _logger.warning(_('Waiting for page load timed out.'))
            break
        except PhantomJSCrashed as error:
            # Crash: retry up to max_tries times.
            _logger.exception(__('PhantomJS crashed: {}', error))
        else:
            break
    else:
        # Every attempt crashed.
        _logger.warning(__(
            _('PhantomJS failed to fetch ‘{url}’. I am sorry.'),
            url=request.url_info.url
        ))
def _append_filename_extension(self, response: BaseResponse):
    '''Append an HTML/CSS file suffix as needed.'''
    filename = self._filename

    if not filename:
        return

    if response.request.url_info.scheme not in ('http', 'https'):
        return

    # Add a suffix only when one matching the content type isn't present.
    if not re.search(r'\.[hH][tT][mM][lL]?$', filename) \
            and HTMLReader.is_response(response):
        self._filename = filename + '.html'
    elif not re.search(r'\.[cC][sS][sS]$', filename) \
            and CSSReader.is_response(response):
        self._filename = filename + '.css'
def test_html_encoding(self):
    '''Parse a tiny document encoded in each supported codec.'''
    parser = self.get_html_parser()
    reader = HTMLReader(parser)
    expect_document = isinstance(parser, LxmlHTMLParser)

    for name in CODEC_NAMES:
        if name in EBCDIC or name == 'utf_8_sig':
            # XXX: we're assuming that all codecs are ASCII backward
            # compatible
            continue

        if name.endswith(('_le', '_be')):
            # XXX: Assume BOM is always included
            continue

        print(name)

        stream = io.BytesIO('<img>'.encode(name))
        first = tuple(reader.iter_elements(stream, encoding=name))[0]

        # lxml wraps the fragment in a full document; html5lib does not.
        self.assertEqual('html' if expect_document else 'img', first.tag)
def _append_filename_extension(self, response):
    '''Append an HTML/CSS file suffix as needed.'''
    if not self._filename:
        return

    scheme = response.request.url_info.scheme

    if scheme not in ('http', 'https'):
        return

    # Add a suffix only when one matching the content type isn't present.
    missing_html_ext = not re.search(r'\.[hH][tT][mM][lL]?$', self._filename)

    if missing_html_ext and HTMLReader.is_response(response):
        self._filename += '.html'
    elif not re.search(r'\.[cC][sS][sS]$', self._filename) \
            and CSSReader.is_response(response):
        self._filename += '.css'
def process(self, url_item, request, response, file_writer_session):
    '''Hand a supported HTML response off to youtube-dl. Coroutine.'''
    if response.status_code != 200:
        return

    if not HTMLReader.is_supported(request=request, response=response):
        return

    session = Session(
        self._proxy_address,
        self._youtube_dl_path,
        self._root_path,
        url_item,
        file_writer_session,
        self._user_agent,
        self._warc_recorder,
        self._inet_family,
        self._check_certificate,
    )

    url = url_item.url_info.url

    _logger.info(__(_('youtube-dl fetching ‘{url}’.'), url=url))

    # closing() guarantees the session is cleaned up even on error.
    with contextlib.closing(session):
        yield From(session.run())

    _logger.info(__(_('youtube-dl fetched ‘{url}’.'), url=url))
def process(self, item_session: ItemSession, request, response, file_writer_session):
    '''Hand a supported HTML response off to youtube-dl. Coroutine.'''
    if response.status_code != 200:
        return

    if not HTMLReader.is_supported(request=request, response=response):
        return

    session = Session(
        self._proxy_address,
        self._youtube_dl_path,
        self._root_path,
        item_session,
        file_writer_session,
        self._user_agent,
        self._warc_recorder,
        self._inet_family,
        self._check_certificate,
    )

    url = item_session.url_record.url

    _logger.info(__(_('youtube-dl fetching ‘{url}’.'), url=url))

    # closing() guarantees the session is cleaned up even on error.
    with contextlib.closing(session):
        yield from session.run()

    _logger.info(__(_('youtube-dl fetched ‘{url}’.'), url=url))
def scrape_document(self, request, response, url_item):
    '''Skip link extraction for pages whose body duplicates one seen before.'''
    # Only bother hashing bodies below 30 MiB.
    if response.body.size() < 30*1024*1024:
        db = self.dupes_db
        content = response.body.content()

        if HTMLReader.is_response(response):
            content = dupespotter.process_body(content, response.request.url)

        fingerprint = hashlib.md5(content).digest()
        dupe_of = db.get_old_url(fingerprint) if db is not None else None

        if dupe_of is not None:
            # Don't extract links from pages we've already seen
            # to avoid loops that descend a directory endlessly
            print(" DUPE {}\n OF {}".format(response.request.url, dupe_of))
            return

        if db is not None:
            db.set_old_url(fingerprint, response.request.url)

    super().scrape_document(request, response, url_item)
def scrape_document(self, request, response, url_item):
    '''Skip link extraction for pages whose body duplicates one seen before.'''
    # Only bother hashing bodies below 30 MiB.
    if response.body.size() < 30*1024*1024:
        db = self.dupes_db
        content = response.body.content()

        if HTMLReader.is_response(response):
            content = archivebot.dupespotter.dupespotter.process_body(
                content, response.request.url)

        fingerprint = hashlib.md5(content).digest()
        dupe_of = db.get_old_url(fingerprint) if db is not None else None

        if dupe_of is not None:
            # Don't extract links from pages we've already seen
            # to avoid loops that descend a directory endlessly
            print(" DUPE {}\n OF {}".format(response.request.url, dupe_of))
            sys.stdout.flush()
            return

        if db is not None:
            db.set_old_url(fingerprint, response.request.url)

    super().scrape_document(request, response, url_item)
def test_html_early_html(self):
    '''The img element is reported even when it sits outside the
    normal html/head/body structure.'''
    reader = HTMLReader(self.get_html_parser())
    documents = [
        b'''<!DOCTYPE HTML><html></html><img>''',
        b'''<html></html><img>''',
        b'''<!DOCTYPE HTML><img><html></html>''',
        b'''<img><html></html>''',
        b'''<!DOCTYPE HTML> <html><body></body></html><p><img>''',
        b''' <html><body></body></html><p><img>''',
        b''' <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> <html> <head> <title>Download</title> </head> <body> <br /> </body> </html> <pre><img></pre> ''',
        b''' <html> <head> <title>Download</title> </head> <body> <br /> </body> </html> <pre><img></pre> ''',
        b''' <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> <html> <body> <br /> </body> <head> <title>Download</title> </head> </html> <pre><img></pre> ''',
        b''' <html> <body> <br /> </body> <head> <title>Download</title> </head> </html> <pre><img></pre> ''',
    ]

    for test_string in documents:
        print()
        print('a' * 10)
        print(test_string)

        collected = []

        for item in reader.iter_elements(
                io.BytesIO(test_string), encoding='ascii'):
            if isinstance(item, Element):
                print(item)
                collected.append(item)

        self.assertIn('img', tuple(item.tag for item in collected))
def test_html_detect(self):
    '''Exercise the HTML sniffing helpers: is_file, is_url,
    is_request and is_response.'''
    self.assertTrue(HTMLReader.is_file(
        io.BytesIO('<html><body>hi</body></html>'.encode('utf-16le'))))
    self.assertFalse(HTMLReader.is_file(
        io.BytesIO('hello world!'.encode('utf-16le'))))
    self.assertTrue(HTMLReader.is_file(
        io.BytesIO(b'<title>hello</title>hi')))
    self.assertTrue(HTMLReader.is_file(
        io.BytesIO(b'<html><body>hello')))
    self.assertTrue(HTMLReader.is_file(io.BytesIO(
        b'The document has moved <a href="somewhere.html">here</a>')))

    html_like_urls = (
        'example.com/index.htm',
        'example.com/index.html',
        'example.com/index.dhtm',
        'example.com/index.xhtml',
        'example.com/index.xht',
    )

    for url in html_like_urls:
        self.assertTrue(HTMLReader.is_url(URLInfo.parse(url)))

    self.assertFalse(
        HTMLReader.is_url(URLInfo.parse('example.com/image.jpg')))

    self.assertTrue(
        HTMLReader.is_request(Request('example.com/index.html')))
    self.assertFalse(
        HTMLReader.is_request(Request('example.com/image.jpg')))

    html_response = Response(200, 'OK')
    html_response.fields['Content-Type'] = 'text/html'
    self.assertTrue(HTMLReader.is_response(html_response))

    png_response = Response(200, 'OK')
    png_response.fields['Content-Type'] = 'image/png'
    self.assertFalse(HTMLReader.is_response(png_response))
def test_html_early_html(self):
    '''The img element is reported even when it sits outside the
    normal html/head/body structure.'''
    reader = HTMLReader(HTMLParser())
    documents = [
        b'''<!DOCTYPE HTML><html></html><img>''',
        b'''<html></html><img>''',
        b'''<!DOCTYPE HTML><img><html></html>''',
        b'''<img><html></html>''',
        b'''<!DOCTYPE HTML> <html><body></body></html><p><img>''',
        b''' <html><body></body></html><p><img>''',
        b''' <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> <html> <head> <title>Download</title> </head> <body> <br /> </body> </html> <pre><img></pre> ''',
        b''' <html> <head> <title>Download</title> </head> <body> <br /> </body> </html> <pre><img></pre> ''',
        b''' <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> <html> <body> <br /> </body> <head> <title>Download</title> </head> </html> <pre><img></pre> ''',
        b''' <html> <body> <br /> </body> <head> <title>Download</title> </head> </html> <pre><img></pre> ''',
    ]

    for test_string in documents:
        print()
        print('a' * 10)
        print(test_string)

        found = [
            item for item in reader.iter_elements(
                io.BytesIO(test_string), encoding='ascii')
            if isinstance(item, Element)
        ]

        for item in found:
            print(item)

        self.assertIn('img', tuple(item.tag for item in found))
def test_html_detect(self):
    '''Exercise the HTML sniffing helpers: is_file, is_url,
    is_request and is_response.'''
    file_cases = (
        ('<html><body>hi</body></html>'.encode('utf-16le'), True),
        ('hello world!'.encode('utf-16le'), False),
        (b'<title>hello</title>hi', True),
        (b'<html><body>hello', True),
        (b'The document has moved <a href="somewhere.html">here</a>', True),
    )

    for payload, expected in file_cases:
        self.assertEqual(
            expected, HTMLReader.is_file(io.BytesIO(payload)))

    url_cases = (
        ('example.com/index.htm', True),
        ('example.com/index.html', True),
        ('example.com/index.dhtm', True),
        ('example.com/index.xhtml', True),
        ('example.com/index.xht', True),
        ('example.com/image.jpg', False),
    )

    for url, expected in url_cases:
        self.assertEqual(
            expected, HTMLReader.is_url(URLInfo.parse(url)))

    self.assertTrue(
        HTMLReader.is_request(Request('example.com/index.html')))
    self.assertFalse(
        HTMLReader.is_request(Request('example.com/image.jpg')))

    html_response = Response(200, 'OK')
    html_response.fields['Content-Type'] = 'text/html'
    self.assertTrue(HTMLReader.is_response(html_response))

    png_response = Response(200, 'OK')
    png_response.fields['Content-Type'] = 'image/png'
    self.assertFalse(HTMLReader.is_response(png_response))