Exemple #1
0
    def ExtractFiles(self, output_dir=os.getcwdu(), debug_dump=False):
        html_files = set([])
        try:
            x = self.get_encoding()
            codecs.lookup(x)
            enc = x
        except:
            enc = 'cp1252'
        for path in self.Contents():
            fpath = path
            if not isinstance(path, unicode_type):
                fpath = path.decode(enc)
            lpath = os.path.join(output_dir, fpath)
            self._ensure_dir(lpath)
            try:
                data = self.GetFile(path)
            except:
                self.log.exception('Failed to extract %s from CHM, ignoring' %
                                   path)
                continue
            if lpath.find(';') != -1:
                # fix file names with ";<junk>" at the end, see _reformat()
                lpath = lpath.split(';')[0]
            try:
                with open(lpath, 'wb') as f:
                    f.write(data)
                try:
                    if 'html' in guess_mimetype(path)[0]:
                        html_files.add(lpath)
                except:
                    pass
            except:
                if iswindows and len(lpath) > 250:
                    self.log.warn('%r filename too long, skipping' % path)
                    continue
                raise

        if debug_dump:
            import shutil
            shutil.copytree(output_dir, os.path.join(debug_dump, 'debug_dump'))
        for lpath in html_files:
            with open(lpath, 'r+b') as f:
                data = f.read()
                data = self._reformat(data, lpath)
                if isinstance(data, unicode_type):
                    data = data.encode('utf-8')
                f.seek(0)
                f.truncate()
                f.write(data)

        self._extracted = True
        files = [
            y for y in os.listdir(output_dir)
            if os.path.isfile(os.path.join(output_dir, y))
        ]
        if self.hhc_path not in files:
            for f in files:
                if f.lower() == self.hhc_path.lower():
                    self.hhc_path = f
                    break
        if self.hhc_path not in files and files:
            for f in files:
                if f.partition('.')[-1].lower() in {
                        'html', 'htm', 'xhtm', 'xhtml'
                }:
                    self.hhc_path = f
                    break

        if self.hhc_path == '.hhc' and self.hhc_path not in files:
            from calibre import walk
            for x in walk(output_dir):
                if os.path.basename(x).lower() in ('index.htm', 'index.html',
                                                   'contents.htm',
                                                   'contents.html'):
                    self.hhc_path = os.path.relpath(x, output_dir)
                    break

        if self.hhc_path not in files and files:
            self.hhc_path = files[0]
Exemple #2
0
    def ExtractFiles(self, output_dir=os.getcwdu(), debug_dump=False):
        html_files = set([])
        try:
            x = self.GetEncoding()
            codecs.lookup(x)
            enc = x
        except:
            enc = 'cp1252'
        for path in self.Contents():
            fpath = path
            if not isinstance(path, unicode):
                fpath = path.decode(enc)
            lpath = os.path.join(output_dir, fpath)
            self._ensure_dir(lpath)
            try:
                data = self.GetFile(path)
            except:
                self.log.exception('Failed to extract %s from CHM, ignoring'%path)
                continue
            if lpath.find(';') != -1:
                # fix file names with ";<junk>" at the end, see _reformat()
                lpath = lpath.split(';')[0]
            try:
                with open(lpath, 'wb') as f:
                    f.write(data)
                try:
                    if 'html' in guess_mimetype(path)[0]:
                        html_files.add(lpath)
                except:
                    pass
            except:
                if iswindows and len(lpath) > 250:
                    self.log.warn('%r filename too long, skipping'%path)
                    continue
                raise

        if debug_dump:
            import shutil
            shutil.copytree(output_dir, os.path.join(debug_dump, 'debug_dump'))
        for lpath in html_files:
            with open(lpath, 'r+b') as f:
                data = f.read()
                data = self._reformat(data, lpath)
                if isinstance(data, unicode):
                    data = data.encode('utf-8')
                f.seek(0)
                f.truncate()
                f.write(data)

        self._extracted = True
        files = [x for x in os.listdir(output_dir) if
                os.path.isfile(os.path.join(output_dir, x))]
        if self.hhc_path not in files:
            for f in files:
                if f.lower() == self.hhc_path.lower():
                    self.hhc_path = f
                    break
        if self.hhc_path not in files and files:
            for f in files:
                if f.partition('.')[-1].lower() in {'html', 'htm', 'xhtm',
                        'xhtml'}:
                    self.hhc_path = f
                    break

        if self.hhc_path == '.hhc' and self.hhc_path not in files:
            from calibre import walk
            for x in walk(output_dir):
                if os.path.basename(x).lower() in ('index.htm', 'index.html',
                        'contents.htm', 'contents.html'):
                    self.hhc_path = os.path.relpath(x, output_dir)
                    break

        if self.hhc_path not in files and files:
            self.hhc_path = files[0]