@classmethod
def is_file(cls, file):
    '''Return whether the file is likely a Sitemap.'''
    peeked_data = wpull.util.peek_file(file)

    if is_gzip(peeked_data):
        try:
            peeked_data = wpull.decompression.gzip_uncompress(
                peeked_data, truncated=True
            )
        except zlib.error:
            pass

    peeked_data = wpull.string.printable_bytes(peeked_data)

    if b'<?xml' in peeked_data \
            and (b'<sitemapindex' in peeked_data
                 or b'<urlset' in peeked_data):
        return True
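# Note: the helpers wpull.util.peek_file, wpull.decompression.gzip_uncompress,
# wpull.string.printable_bytes and is_gzip come from the surrounding wpull
# codebase and are not shown here.  As a rough illustration only (not wpull's
# actual implementation), the gzip check amounts to a magic-byte test:
def _is_gzip_sketch(data):
    '''Illustrative stand-in: gzip streams start with the bytes 0x1f 0x8b.'''
    return data.startswith(b'\x1f\x8b')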
def iter_links(self, file, encoding=None):
    peeked_data = wpull.util.peek_file(file)

    if is_gzip(peeked_data):
        file = gzip.GzipFile(mode='rb', fileobj=file)

    if self.is_file(file):
        for html_obj in self._html_parser.parse(file, encoding):
            # Beware: some .tag values are not a str:
            #   html_obj=<?xml version="1.0" encoding="UTF-8" ?>
            #   tag=<cyfunction ProcessingInstruction at 0x7f17e49d78e8>
            if isinstance(html_obj, Element) and \
                    isinstance(html_obj.tag, str) and \
                    html_obj.tag.endswith('loc'):
                if html_obj.text:
                    yield html_obj.text
    else:
        parser = robotexclusionrulesparser.RobotExclusionRulesParser()
        parser.parse(file.read(self.MAX_ROBOTS_FILE_SIZE))

        for link in parser.sitemaps:
            yield link
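# Why the isinstance(html_obj.tag, str) guard above is needed: lxml's parse
# results can include comment and processing-instruction nodes whose .tag is
# a callable rather than a string, so calling .tag.endswith('loc') on them
# raises AttributeError.  A minimal standalone sketch using lxml.etree
# directly (independent of wpull's parser, illustration only):
def _demo_processing_instruction_tag():
    from lxml import etree

    document = etree.fromstring(
        b'<urlset>'
        b'<?xml-stylesheet type="text/xsl" href="sitemap.xsl"?>'
        b'<url><loc>http://example.com/</loc></url>'
        b'</urlset>'
    )

    for node in document.iter():
        # For the processing instruction, node.tag is a cyfunction, not a
        # str, so guard before using str methods on it.
        if isinstance(node.tag, str) and node.tag.endswith('loc'):
            print(node.text)  # -> http://example.com/
        else:
            print('skipped node with non-matching tag:', node.tag)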