def __init__(self, path_to_html_file, level, encoding, verbose, referrer=None):
    '''
    Load the file at `path_to_html_file`, decide whether it is HTML or
    binary and, for HTML, decode it, pick up its title and collect its links.

    :param path_to_html_file: Path of the file to load. It is passed through
        ``unicode_path(..., abs=True)`` and stored as ``self.path``.
    :param level: The level of this file. Should be 0 for the root file.
        Only files with ``level > 0`` are sniffed: they are flagged as
        binary when their first 4096 bytes do not match the HTML pattern.
    :param encoding: Use `encoding` to decode HTML. When falsy, the encoding
        sniffed from the first 4096 bytes of the file is used instead.
    :param verbose: Passed to ``detect_xml_encoding`` when neither the caller
        nor the initial header sniff supplied an encoding.
    :param referrer: The :class:`HTMLFile` that first refers to this file.
    :raises OSError: If the root file (``level == 0``) cannot be read.
    :raises IgnoreFile: If a non-root file cannot be read.
    :raises ValueError: If the root file is empty.
    '''
    self.path = unicode_path(path_to_html_file, abs=True)
    # Default title is the file name without extension; it is replaced below
    # when TITLE_PAT matches the decoded source.
    self.title = os.path.splitext(os.path.basename(self.path))[0]
    self.base = os.path.dirname(self.path)
    self.level = level
    self.referrer = referrer
    self.links = []

    try:
        with open(self.path, 'rb') as f:
            src = header = f.read(4096)
            # Keep the sniffed encoding in its own name so that the
            # caller-supplied `encoding` is not silently discarded.
            detected = detect_xml_encoding(src)[1]
            if detected:
                try:
                    header = header.decode(detected, errors='replace')
                except (ValueError, LookupError):
                    # LookupError: the sniffed codec name is unknown to
                    # Python. Fall back to matching the raw bytes.
                    pass
            self.is_binary = False
            if level > 0:
                # A str pattern cannot search bytes (and vice versa), so
                # pick the pattern that matches the type of `header`.
                if isinstance(header, bytes):
                    pat = self.HTML_PAT_BIN
                else:
                    pat = self.HTML_PAT
                self.is_binary = not bool(pat.search(header))
            if not self.is_binary:
                # Only HTML files are read completely.
                src += f.read()
    except OSError as err:
        msg = 'Could not read from file: %s with error: %s' % (
            self.path, as_unicode(err))
        if level == 0:
            raise OSError(msg)
        raise IgnoreFile(msg, err.errno)

    if not src:
        if level == 0:
            raise ValueError('The file %s is empty' % self.path)
        # An empty linked file has nothing to parse; treat it as binary.
        self.is_binary = True

    if self.is_binary:
        return

    if not encoding:
        encoding = detected or detect_xml_encoding(
            src[:4096], verbose=verbose)[1]
    try:
        text = src.decode(encoding, 'replace')
    except (LookupError, TypeError):
        # Unknown codec name, or no encoding could be determined at all:
        # decode as UTF-8 rather than aborting the whole traversal.
        encoding = 'utf-8'
        text = src.decode(encoding, 'replace')
    self.encoding = encoding

    match = self.TITLE_PAT.search(text)
    if match is not None:
        self.title = match.group(1)
    self.find_links(text)
def __init__(self, path_to_html_file, level, encoding, verbose, referrer=None):
    '''
    Load the file at `path_to_html_file`, decide whether it is HTML or
    binary and, for HTML, decode it, pick up its title and collect its links.

    :param path_to_html_file: Path of the file to load. It is passed through
        ``unicode_path(..., abs=True)`` and stored as ``self.path``.
    :param level: The level of this file. Should be 0 for the root file.
        Only files with ``level > 0`` are sniffed: they are flagged as
        binary when their first 4096 bytes do not match the HTML pattern.
    :param encoding: Use `encoding` to decode HTML. When falsy, the encoding
        sniffed from the first 4096 bytes of the file is used instead.
    :param verbose: Passed to ``detect_xml_encoding`` when neither the caller
        nor the initial header sniff supplied an encoding.
    :param referrer: The :class:`HTMLFile` that first refers to this file.
    :raises IOError: If the root file (``level == 0``) cannot be read.
    :raises IgnoreFile: If a non-root file cannot be read.
    :raises ValueError: If the root file is empty.
    '''
    self.path = unicode_path(path_to_html_file, abs=True)
    # Default title is the file name without extension; it is replaced below
    # when TITLE_PAT matches the decoded source.
    self.title = os.path.splitext(os.path.basename(self.path))[0]
    self.base = os.path.dirname(self.path)
    self.level = level
    self.referrer = referrer
    self.links = []

    try:
        with open(self.path, 'rb') as f:
            src = header = f.read(4096)
            # Keep the sniffed encoding in its own name so that the
            # caller-supplied `encoding` is not silently discarded.
            detected = detect_xml_encoding(src)[1]
            if detected:
                try:
                    # 'replace' keeps a multi-byte character that was cut
                    # at the 4096-byte boundary from failing the decode.
                    header = header.decode(detected, 'replace')
                except (ValueError, LookupError):
                    # LookupError: the sniffed codec name is unknown to
                    # Python. Fall back to matching the raw bytes.
                    pass
            self.is_binary = False
            if level > 0:
                # A str pattern cannot search bytes on Python 3, so use the
                # bytes pattern whenever `header` could not be decoded.
                if isinstance(header, bytes):
                    pat = self.HTML_PAT_BIN
                else:
                    pat = self.HTML_PAT
                self.is_binary = not bool(pat.search(header))
            if not self.is_binary:
                # Only HTML files are read completely.
                src += f.read()
    except IOError as err:  # alias of OSError on Python 3
        msg = 'Could not read from file: %s with error: %s'%(self.path, as_unicode(err))
        if level == 0:
            raise IOError(msg)
        raise IgnoreFile(msg, err.errno)

    if not src:
        if level == 0:
            raise ValueError('The file %s is empty'%self.path)
        # An empty linked file has nothing to parse; treat it as binary.
        self.is_binary = True

    if self.is_binary:
        return

    if not encoding:
        encoding = detected or detect_xml_encoding(
            src[:4096], verbose=verbose)[1]
    try:
        text = src.decode(encoding, 'replace')
    except (LookupError, TypeError):
        # Unknown codec name, or no encoding could be determined at all:
        # decode as UTF-8 rather than aborting the whole traversal.
        encoding = 'utf-8'
        text = src.decode(encoding, 'replace')
    self.encoding = encoding

    match = self.TITLE_PAT.search(text)
    if match is not None:
        self.title = match.group(1)
    self.find_links(text)