def ExtractFiles(self, output_dir=os.getcwdu(), debug_dump=False):
    """Write every entry of the CHM under *output_dir* and settle ``self.hhc_path``.

    Each path yielded by ``self.Contents()`` is fetched with
    ``self.GetFile()`` and written below *output_dir*. Files whose guessed
    mimetype contains ``'html'`` are afterwards rewritten in place with the
    result of ``self._reformat()``. ``self._extracted`` is then set to True
    and ``self.hhc_path`` is repaired if it does not name a regular file
    directly inside *output_dir*.

    :param output_dir: Directory that receives the extracted files.
        NOTE: the default is computed once, when this ``def`` is executed,
        not on every call.
    :param debug_dump: When truthy, used as a directory path; the extracted
        tree is copied to ``<debug_dump>/debug_dump`` before the HTML files
        are reformatted.
    :raises Exception: Any error raised while writing an entry is re-raised,
        except on Windows when the target path is longer than 250
        characters (that entry is logged and skipped).
    """
    html_files = set()

    # Use the encoding reported by the reader only if Python knows the codec.
    try:
        candidate = self.get_encoding()
        codecs.lookup(candidate)
        enc = candidate
    except Exception:
        enc = 'cp1252'

    for path in self.Contents():
        fpath = path
        if not isinstance(path, unicode_type):
            fpath = path.decode(enc)
        lpath = os.path.join(output_dir, fpath)
        self._ensure_dir(lpath)

        try:
            data = self.GetFile(path)
        except Exception:
            self.log.exception('Failed to extract %s from CHM, ignoring' % path)
            continue

        if lpath.find(';') != -1:
            # fix file names with ";<junk>" at the end, see _reformat()
            lpath = lpath.split(';')[0]

        try:
            with open(lpath, 'wb') as f:
                f.write(data)
        except Exception:
            if iswindows and len(lpath) > 250:
                self.log.warn('%r filename too long, skipping' % path)
                continue
            raise

        # Best effort: an entry whose mimetype cannot be tested is simply not
        # treated as HTML.
        try:
            if 'html' in guess_mimetype(path)[0]:
                html_files.add(lpath)
        except Exception:
            pass

    if debug_dump:
        import shutil
        shutil.copytree(output_dir, os.path.join(debug_dump, 'debug_dump'))

    for lpath in html_files:
        with open(lpath, 'r+b') as f:
            data = f.read()
            data = self._reformat(data, lpath)
            if isinstance(data, unicode_type):
                data = data.encode('utf-8')
            # Replace the file contents in place.
            f.seek(0)
            f.truncate()
            f.write(data)

    self._extracted = True

    # Only regular files directly inside output_dir are candidates here.
    files = [
        name for name in os.listdir(output_dir)
        if os.path.isfile(os.path.join(output_dir, name))
    ]

    # 1) Same name, different case.
    if self.hhc_path not in files:
        wanted = self.hhc_path.lower()
        for name in files:
            if name.lower() == wanted:
                self.hhc_path = name
                break

    # 2) First top-level HTML file. The extension is taken after the LAST
    #    dot so that names such as "my.book.html" are recognised too.
    if self.hhc_path not in files and files:
        for name in files:
            _, dot, ext = name.rpartition('.')
            if dot and ext.lower() in {'html', 'htm', 'xhtm', 'xhtml'}:
                self.hhc_path = name
                break

    # 3) hhc_path is still the bare '.hhc': look for a conventional index
    #    page anywhere below output_dir.
    if self.hhc_path == '.hhc' and self.hhc_path not in files:
        from calibre import walk
        for candidate in walk(output_dir):
            if os.path.basename(candidate).lower() in (
                    'index.htm', 'index.html', 'contents.htm', 'contents.html'):
                self.hhc_path = os.path.relpath(candidate, output_dir)
                break

    # 4) Last resort: whatever top-level file comes first.
    if self.hhc_path not in files and files:
        self.hhc_path = files[0]
def ExtractFiles(self, output_dir=os.getcwdu(), debug_dump=False):
    """Write every entry of the CHM under *output_dir* and settle ``self.hhc_path``.

    Each path yielded by ``self.Contents()`` is fetched with
    ``self.GetFile()`` and written below *output_dir*. Files whose guessed
    mimetype contains ``'html'`` are afterwards rewritten in place with the
    result of ``self._reformat()``. ``self._extracted`` is then set to True
    and ``self.hhc_path`` is repaired if it does not name a regular file
    directly inside *output_dir*.

    :param output_dir: Directory that receives the extracted files.
        NOTE: the default is computed once, when this ``def`` is executed,
        not on every call.
    :param debug_dump: When truthy, used as a directory path; the extracted
        tree is copied to ``<debug_dump>/debug_dump`` before the HTML files
        are reformatted.
    :raises Exception: Any error raised while writing an entry is re-raised,
        except on Windows when the target path is longer than 250
        characters (that entry is logged and skipped).
    """
    html_files = set()

    # Use the encoding reported by the reader only if Python knows the codec.
    try:
        candidate = self.GetEncoding()
        codecs.lookup(candidate)
        enc = candidate
    except Exception:
        enc = 'cp1252'

    for path in self.Contents():
        fpath = path
        if not isinstance(path, unicode):
            fpath = path.decode(enc)
        lpath = os.path.join(output_dir, fpath)
        self._ensure_dir(lpath)

        try:
            data = self.GetFile(path)
        except Exception:
            self.log.exception('Failed to extract %s from CHM, ignoring'%path)
            continue

        if lpath.find(';') != -1:
            # fix file names with ";<junk>" at the end, see _reformat()
            lpath = lpath.split(';')[0]

        try:
            with open(lpath, 'wb') as f:
                f.write(data)
        except Exception:
            if iswindows and len(lpath) > 250:
                self.log.warn('%r filename too long, skipping'%path)
                continue
            raise

        # Best effort: an entry whose mimetype cannot be tested is simply not
        # treated as HTML.
        try:
            if 'html' in guess_mimetype(path)[0]:
                html_files.add(lpath)
        except Exception:
            pass

    if debug_dump:
        import shutil
        shutil.copytree(output_dir, os.path.join(debug_dump, 'debug_dump'))

    for lpath in html_files:
        with open(lpath, 'r+b') as f:
            data = f.read()
            data = self._reformat(data, lpath)
            if isinstance(data, unicode):
                data = data.encode('utf-8')
            # Replace the file contents in place.
            f.seek(0)
            f.truncate()
            f.write(data)

    self._extracted = True

    # Only regular files directly inside output_dir are candidates here.
    files = [
        name for name in os.listdir(output_dir)
        if os.path.isfile(os.path.join(output_dir, name))
    ]

    # 1) Same name, different case.
    if self.hhc_path not in files:
        wanted = self.hhc_path.lower()
        for name in files:
            if name.lower() == wanted:
                self.hhc_path = name
                break

    # 2) First top-level HTML file. The extension is taken after the LAST
    #    dot so that names such as "my.book.html" are recognised too.
    if self.hhc_path not in files and files:
        for name in files:
            _, dot, ext = name.rpartition('.')
            if dot and ext.lower() in {'html', 'htm', 'xhtm', 'xhtml'}:
                self.hhc_path = name
                break

    # 3) hhc_path is still the bare '.hhc': look for a conventional index
    #    page anywhere below output_dir.
    if self.hhc_path == '.hhc' and self.hhc_path not in files:
        from calibre import walk
        for candidate in walk(output_dir):
            if os.path.basename(candidate).lower() in (
                    'index.htm', 'index.html', 'contents.htm', 'contents.html'):
                self.hhc_path = os.path.relpath(candidate, output_dir)
                break

    # 4) Last resort: whatever top-level file comes first.
    if self.hhc_path not in files and files:
        self.hhc_path = files[0]