def update_cover(self):
        # get the calibre cover
        cover_path = self.db.cover(self.book_id, as_path=True)
        fmt = cover_path.rpartition('.')[-1]
        new_cover_name = "00000000_cover." + fmt

        self.make_temp_cbz_file()

        # search for a previously embeded cover
        zf = ZipFile(self.file)
        cover_info = ""
        for name in zf.namelist():
            if name.rsplit(".", 1)[0] == "00000000_cover":
                cover_info = name
                break

        # delete previous cover
        if cover_info != "":
            with open(self.file, 'r+b') as zf, open(cover_path, 'r+b') as cp:
                safe_replace(zf, cover_info, cp)

        # save the cover in the file
        else:
            zf = ZipFile(self.file, "a")
            zf.write(cover_path, new_cover_name)
            zf.close()

        delete_temp_file(cover_path)
Example #2
0
def get_metadata(stream):
    from calibre.ebooks.metadata.meta import get_metadata
    from calibre.ebooks.metadata.archive import is_comic
    stream_type = None
    zf = ZipFile(stream, 'r')
    names = zf.namelist()
    if is_comic(names):
        # Is probably a comic
        return get_metadata(stream, 'cbz')

    for f in names:
        stream_type = os.path.splitext(f)[1].lower()
        if stream_type:
            stream_type = stream_type[1:]
            if stream_type in ('lit', 'opf', 'prc', 'mobi', 'fb2', 'epub',
                               'rb', 'imp', 'pdf', 'lrf', 'azw', 'azw1', 'azw3'):
                with TemporaryDirectory() as tdir:
                    with CurrentDir(tdir):
                        path = zf.extract(f)
                        mi = get_metadata(open(path,'rb'), stream_type)
                        if stream_type == 'opf' and mi.application_id is None:
                            try:
                                # zip archive opf files without an application_id were assumed not to have a cover
                                # reparse the opf and if cover exists read its data from zip archive for the metadata
                                nmi = zip_opf_metadata(path, zf)
                                nmi.timestamp = None
                                return nmi
                            except:
                                pass
                        mi.timestamp = None
                        return mi
    raise ValueError('No ebook found in ZIP archive (%s)' % os.path.basename(getattr(stream, 'name', '') or '<stream>'))
Example #3
0
    def get_picture_size(self):
        from calibre.utils.magick import Image

        self.make_temp_cbz_file()
        zf = ZipFile(self.file)
        files = zf.namelist()

        size_x, size_y = 0, 0
        index = 1
        while index < 10 and index < len(files):
            fname = files[index]
            if fname.lower().rpartition('.')[-1] in IMG_EXTENSIONS:
                with zf.open(fname) as ffile:
                    img = Image()
                    try:
                        img.open(ffile)
                        size_x, size_y = img.size
                    except:
                        pass
                if size_x < size_y:
                    break
            index += 1
        zf.close()
        size = round(size_x * size_y / 1000000, 2)
        return size
Example #4
0
    def update_cover(self):
        # get the calibre cover
        cover_path = self.db.cover(self.book_id, as_path=True)
        fmt = cover_path.rpartition('.')[-1]
        new_cover_name = "00000000_cover." + fmt

        self.make_temp_cbz_file()

        # search for a previously embeded cover
        zf = ZipFile(self.file)
        cover_info = ""
        for name in zf.namelist():
            if name.rsplit(".", 1)[0] == "00000000_cover":
                cover_info = name
                break
        zf.close()

        # delete previous cover
        if cover_info != "":
            with open(self.file, 'r+b') as zf, open(cover_path, 'r+b') as cp:
                safe_replace(zf, cover_info, cp)

        # save the cover in the file
        else:
            zf = ZipFile(self.file, "a")
            zf.write(cover_path, new_cover_name)
            zf.close()

        delete_temp_file(cover_path)
Example #5
0
 def __enter__(self, *args):
     '''
     Add this plugin to the python path so that it's contents become directly importable.
     Useful when bundling large python libraries into the plugin. Use it like this::
         with plugin:
             import something
     '''
     if self.plugin_path is not None:
         from calibre.utils.zipfile import ZipFile
         zf = ZipFile(self.plugin_path)
         extensions = {x.rpartition('.')[-1].lower() for x in zf.namelist()}
         zip_safe = True
         for ext in ('pyd', 'so', 'dll', 'dylib'):
             if ext in extensions:
                 zip_safe = False
                 break
         if zip_safe:
             sys.path.insert(0, self.plugin_path)
             self.sys_insertion_path = self.plugin_path
         else:
             from calibre.ptempfile import TemporaryDirectory
             self._sys_insertion_tdir = TemporaryDirectory('plugin_unzip')
             self.sys_insertion_path = self._sys_insertion_tdir.__enter__(
                 *args)
             zf.extractall(self.sys_insertion_path)
             sys.path.insert(0, self.sys_insertion_path)
         zf.close()
Example #6
0
    def get_comic_metadata_from_cbz(self):
        '''
        Reads the comic metadata from the comic cbz file as comictagger metadata
        '''
        self.make_temp_cbz_file()
        # open the zipfile
        zf = ZipFile(self.file)

        # get cix metadata
        for name in zf.namelist():
            if name.lower() == "comicinfo.xml":
                self.cix_metadata = ComicInfoXml().metadataFromString(
                    zf.read(name))
                self.zipinfo = name
                if not prefs['auto_count_pages']:
                    break
            elif prefs['auto_count_pages'] and name.lower().rpartition(
                    '.')[-1] in IMG_EXTENSIONS:
                self.pages += 1

        # get the cbi metadata
        if ComicBookInfo().validateString(zf.comment):
            self.cbi_metadata = ComicBookInfo().metadataFromString(zf.comment)
        zf.close()

        # get combined metadata
        self._get_combined_metadata()
Example #7
0
def get_metadata(stream):
    from calibre.ebooks.metadata.meta import get_metadata
    from calibre.ebooks.metadata.archive import is_comic
    stream_type = None
    zf = ZipFile(stream, 'r')
    names = zf.namelist()
    if is_comic(names):
        # Is probably a comic
        return get_metadata(stream, 'cbz')

    for f in names:
        stream_type = os.path.splitext(f)[1].lower()
        if stream_type:
            stream_type = stream_type[1:]
            if stream_type in ('lit', 'opf', 'prc', 'mobi', 'fb2', 'epub',
                               'rb', 'imp', 'pdf', 'lrf', 'azw', 'azw1', 'azw3'):
                with TemporaryDirectory() as tdir:
                    with CurrentDir(tdir):
                        path = zf.extract(f)
                        mi = get_metadata(open(path,'rb'), stream_type)
                        if stream_type == 'opf' and mi.application_id is None:
                            try:
                                # zip archive opf files without an application_id were assumed not to have a cover
                                # reparse the opf and if cover exists read its data from zip archive for the metadata
                                nmi = zip_opf_metadata(path, zf)
                                nmi.timestamp = None
                                return nmi
                            except:
                                pass
                        mi.timestamp = None
                        return mi
    raise ValueError('No ebook found in ZIP archive (%s)' % os.path.basename(getattr(stream, 'name', '') or '<stream>'))
Example #8
0
 def __enter__(self, *args):
     '''
     Add this plugin to the python path so that it's contents become directly importable.
     Useful when bundling large python libraries into the plugin. Use it like this::
         with plugin:
             import something
     '''
     if self.plugin_path is not None:
         from calibre.utils.zipfile import ZipFile
         zf = ZipFile(self.plugin_path)
         extensions = set([x.rpartition('.')[-1].lower() for x in
             zf.namelist()])
         zip_safe = True
         for ext in ('pyd', 'so', 'dll', 'dylib'):
             if ext in extensions:
                 zip_safe = False
                 break
         if zip_safe:
             sys.path.insert(0, self.plugin_path)
             self.sys_insertion_path = self.plugin_path
         else:
             from calibre.ptempfile import TemporaryDirectory
             self._sys_insertion_tdir = TemporaryDirectory('plugin_unzip')
             self.sys_insertion_path = self._sys_insertion_tdir.__enter__(*args)
             zf.extractall(self.sys_insertion_path)
             sys.path.insert(0, self.sys_insertion_path)
         zf.close()
 def count_pages(self):
     self.make_temp_cbz_file()
     zf = ZipFile(self.file)
     pages = 0
     for name in zf.namelist():
         if name.lower().rpartition('.')[-1] in IMG_EXTENSIONS:
             pages += 1
     return pages
Example #10
0
    def run(self, archive):
        from calibre.utils.zipfile import ZipFile
        is_rar = archive.lower().endswith('.rar')
        if is_rar:
            from calibre.utils.unrar import extract_member, names
        else:
            zf = ZipFile(archive, 'r')

        if is_rar:
            with open(archive, 'rb') as rf:
                fnames = list(names(rf))
        else:
            fnames = zf.namelist()

        def fname_ok(fname):
            bn = os.path.basename(fname).lower()
            if bn == 'thumbs.db':
                return False
            if '.' not in bn:
                return False
            if bn.rpartition('.')[-1] in {'diz', 'nfo'}:
                return False
            if '__MACOSX' in fname.split('/'):
                return False
            return True

        fnames = list(filter(fname_ok, fnames))
        if is_comic(fnames):
            ext = '.cbr' if is_rar else '.cbz'
            of = self.temporary_file('_archive_extract' + ext)
            with open(archive, 'rb') as f:
                of.write(f.read())
            of.close()
            return of.name
        if len(fnames) > 1 or not fnames:
            return archive
        fname = fnames[0]
        ext = os.path.splitext(fname)[1][1:]
        if ext.lower() not in {
                'lit', 'epub', 'mobi', 'prc', 'rtf', 'pdf', 'mp3', 'pdb',
                'azw', 'azw1', 'azw3', 'fb2', 'docx', 'doc', 'odt'
        }:
            return archive

        of = self.temporary_file('_archive_extract.' + ext)
        with closing(of):
            if is_rar:
                with open(archive, 'rb') as f:
                    data = extract_member(f, match=None, name=fname)[1]
                of.write(data)
            else:
                of.write(zf.read(fname))
        return of.name
Example #11
0
    def run(self, archive):
        from calibre.utils.zipfile import ZipFile
        is_rar = archive.lower().endswith('.rar')
        if is_rar:
            from calibre.utils.unrar import extract_member, names
        else:
            zf = ZipFile(archive, 'r')

        if is_rar:
            with open(archive, 'rb') as rf:
                fnames = list(names(rf))
        else:
            fnames = zf.namelist()

        def fname_ok(fname):
            bn = os.path.basename(fname).lower()
            if bn == 'thumbs.db':
                return False
            if '.' not in bn:
                return False
            if bn.rpartition('.')[-1] in {'diz', 'nfo'}:
                return False
            if '__MACOSX' in fname.split('/'):
                return False
            return True

        fnames = list(filter(fname_ok, fnames))
        if is_comic(fnames):
            ext = '.cbr' if is_rar else '.cbz'
            of = self.temporary_file('_archive_extract'+ext)
            with open(archive, 'rb') as f:
                of.write(f.read())
            of.close()
            return of.name
        if len(fnames) > 1 or not fnames:
            return archive
        fname = fnames[0]
        ext = os.path.splitext(fname)[1][1:]
        if ext.lower() not in {
                'lit', 'epub', 'mobi', 'prc', 'rtf', 'pdf', 'mp3', 'pdb',
                'azw', 'azw1', 'azw3', 'fb2', 'docx', 'doc', 'odt'}:
            return archive

        of = self.temporary_file('_archive_extract.'+ext)
        with closing(of):
            if is_rar:
                with open(archive, 'rb') as f:
                    data = extract_member(f, match=None, name=fname)[1]
                of.write(data)
            else:
                of.write(zf.read(fname))
        return of.name
Example #12
0
def get_fb2_data(stream):
    from calibre.utils.zipfile import ZipFile, BadZipfile
    pos = stream.tell()
    try:
        zf = ZipFile(stream)
    except BadZipfile:
        stream.seek(pos)
        ans = stream.read()
        zip_file_name = None
    else:
        names = zf.namelist()
        names = [x for x in names if x.lower().endswith('.fb2')] or names
        zip_file_name = names[0]
        ans = zf.open(zip_file_name).read()
    return ans, zip_file_name
Example #13
0
def get_fb2_data(stream):
    from calibre.utils.zipfile import ZipFile, BadZipfile
    pos = stream.tell()
    try:
        zf = ZipFile(stream)
    except BadZipfile:
        stream.seek(pos)
        ans = stream.read()
        zip_file_name = None
    else:
        names = zf.namelist()
        names = [x for x in names if x.lower().endswith('.fb2')] or names
        zip_file_name = names[0]
        ans = zf.open(zip_file_name).read()
    return ans, zip_file_name
Example #14
0
    def count_pages(self):
        self.make_temp_cbz_file()
        # open the zipfile
        zf = ZipFile(self.file)

        # count the pages
        for name in zf.namelist():
            if name.lower().rpartition('.')[-1] in IMG_EXTENSIONS:
                self.pages += 1

        if self.pages == 0:
            return False

        update_custom_column(prefs['pages_column'], self.pages,
                             self.calibre_metadata,
                             self.db.field_metadata.custom_field_metadata())
        self.db.set_metadata(self.book_id, self.calibre_metadata)
        return True
Example #15
0
    def run(self, archive):
        from calibre.utils.zipfile import ZipFile
        is_rar = archive.lower().endswith('.rar')
        if is_rar:
            from calibre.utils.unrar import extract_member, names
        else:
            zf = ZipFile(archive, 'r')

        if is_rar:
            with open(archive, 'rb') as rf:
                fnames = list(names(rf))
        else:
            fnames = zf.namelist()

        fnames = [
            x for x in fnames
            if '.' in x and x.lower().rpartition('/')[-1] != 'thumbs.db'
        ]
        if is_comic(fnames):
            ext = '.cbr' if is_rar else '.cbz'
            of = self.temporary_file('_archive_extract' + ext)
            with open(archive, 'rb') as f:
                of.write(f.read())
            of.close()
            return of.name
        if len(fnames) > 1 or not fnames:
            return archive
        fname = fnames[0]
        ext = os.path.splitext(fname)[1][1:]
        if ext.lower() not in ('lit', 'epub', 'mobi', 'prc', 'rtf', 'pdf',
                               'mp3', 'pdb', 'azw', 'azw1', 'azw3', 'fb2'):
            return archive

        of = self.temporary_file('_archive_extract.' + ext)
        with closing(of):
            if is_rar:
                with open(archive, 'rb') as f:
                    data = extract_member(f, match=None, name=fname)[1]
                of.write(data)
            else:
                of.write(zf.read(fname))
        return of.name
Example #16
0
 def __enter__(self, *args):
     if self.plugin_path is not None:
         from calibre.utils.zipfile import ZipFile
         zf = ZipFile(self.plugin_path)
         extensions = set([x.rpartition('.')[-1].lower() for x in
             zf.namelist()])
         zip_safe = True
         for ext in ('pyd', 'so', 'dll', 'dylib'):
             if ext in extensions:
                 zip_safe = False
         if zip_safe:
             sys.path.insert(0, self.plugin_path)
             self.sys_insertion_path = self.plugin_path
         else:
             from calibre.ptempfile import TemporaryDirectory
             self._sys_insertion_tdir = TemporaryDirectory('plugin_unzip')
             self.sys_insertion_path = self._sys_insertion_tdir.__enter__(*args)
             zf.extractall(self.sys_insertion_path)
             sys.path.insert(0, self.sys_insertion_path)
         zf.close()
Example #17
0
 def __enter__(self, *args):
     if self.plugin_path is not None:
         from calibre.utils.zipfile import ZipFile
         zf = ZipFile(self.plugin_path)
         extensions = set([x.rpartition('.')[-1].lower() for x in
             zf.namelist()])
         zip_safe = True
         for ext in ('pyd', 'so', 'dll', 'dylib'):
             if ext in extensions:
                 zip_safe = False
         if zip_safe:
             sys.path.insert(0, self.plugin_path)
             self.sys_insertion_path = self.plugin_path
         else:
             from calibre.ptempfile import TemporaryDirectory
             self._sys_insertion_tdir = TemporaryDirectory('plugin_unzip')
             self.sys_insertion_path = self._sys_insertion_tdir.__enter__(*args)
             zf.extractall(self.sys_insertion_path)
             sys.path.insert(0, self.sys_insertion_path)
         zf.close()
Example #18
0
    def run(self, archive):
        from calibre.utils.zipfile import ZipFile
        is_rar = archive.lower().endswith('.rar')
        if is_rar:
            from calibre.utils.unrar import extract_member, names
        else:
            zf = ZipFile(archive, 'r')

        if is_rar:
            with open(archive, 'rb') as rf:
                fnames = list(names(rf))
        else:
            fnames = zf.namelist()

        fnames = [x for x in fnames if '.' in x and x.lower().rpartition('/')[-1] != 'thumbs.db']
        if is_comic(fnames):
            ext = '.cbr' if is_rar else '.cbz'
            of = self.temporary_file('_archive_extract'+ext)
            with open(archive, 'rb') as f:
                of.write(f.read())
            of.close()
            return of.name
        if len(fnames) > 1 or not fnames:
            return archive
        fname = fnames[0]
        ext = os.path.splitext(fname)[1][1:]
        if ext.lower() not in ('lit', 'epub', 'mobi', 'prc', 'rtf', 'pdf',
                'mp3', 'pdb', 'azw', 'azw1', 'azw3', 'fb2'):
            return archive

        of = self.temporary_file('_archive_extract.'+ext)
        with closing(of):
            if is_rar:
                with open(archive, 'rb') as f:
                    data = extract_member(f, match=None, name=fname)[1]
                of.write(data)
            else:
                of.write(zf.read(fname))
        return of.name
    def get_comic_metadata_from_cbz(self):
        '''
        Reads the comic metadata from the comic cbz file as comictagger metadata
        '''
        self.make_temp_cbz_file()
        # open the zipfile
        zf = ZipFile(self.file)

        # get cix metadata
        for name in zf.namelist():
            if name.lower() == "comicinfo.xml":
                self.cix_metadata = ComicInfoXml().metadataFromString(zf.read(name))
                self.zipinfo = name
                break

        # get the cbi metadata
        if ComicBookInfo().validateString(zf.comment):
            self.cbi_metadata = ComicBookInfo().metadataFromString(zf.comment)
        zf.close()

        # get combined metadata
        self._get_combined_metadata()
    def get_comic_metadata_from_cbz(self):
        '''
        Reads the comic metadata from the comic cbz file as comictagger metadata
        '''
        self.make_temp_cbz_file()
        # open the zipfile
        zf = ZipFile(self.file)

        # get cix metadata
        for name in zf.namelist():
            if name.lower() == "comicinfo.xml":
                self.cix_metadata = ComicInfoXml().metadataFromString(zf.read(name))
                self.zipinfo = name
                break

        # get the cbi metadata
        if ComicBookInfo().validateString(zf.comment):
            self.cbi_metadata = ComicBookInfo().metadataFromString(zf.comment)
        zf.close()

        # get combined metadata
        self._get_combined_metadata()
Example #21
0
class DOCX(object):

    def __init__(self, path_or_stream, log=None, extract=True):
        stream = path_or_stream if hasattr(path_or_stream, 'read') else open(path_or_stream, 'rb')
        self.name = getattr(stream, 'name', None) or '<stream>'
        self.log = log or default_log
        if extract:
            self.extract(stream)
        else:
            self.init_zipfile(stream)
        self.read_content_types()
        self.read_package_relationships()

    def init_zipfile(self, stream):
        self.zipf = ZipFile(stream)
        self.names = frozenset(self.zipf.namelist())

    def extract(self, stream):
        self.tdir = PersistentTemporaryDirectory('docx_container')
        try:
            zf = ZipFile(stream)
            zf.extractall(self.tdir)
        except:
            self.log.exception('DOCX appears to be invalid ZIP file, trying a'
                    ' more forgiving ZIP parser')
            from calibre.utils.localunzip import extractall
            stream.seek(0)
            extractall(stream, self.tdir)

        self.names = {}
        for f in walk(self.tdir):
            name = os.path.relpath(f, self.tdir).replace(os.sep, '/')
            self.names[name] = f

    def exists(self, name):
        return name in self.names

    def read(self, name):
        if hasattr(self, 'zipf'):
            return self.zipf.open(name).read()
        path = self.names[name]
        with open(path, 'rb') as f:
            return f.read()

    def read_content_types(self):
        try:
            raw = self.read('[Content_Types].xml')
        except KeyError:
            raise InvalidDOCX('The file %s docx file has no [Content_Types].xml' % self.name)
        root = fromstring(raw)
        self.content_types = {}
        self.default_content_types = {}
        for item in root.xpath('//*[local-name()="Types"]/*[local-name()="Default" and @Extension and @ContentType]'):
            self.default_content_types[item.get('Extension').lower()] = item.get('ContentType')
        for item in root.xpath('//*[local-name()="Types"]/*[local-name()="Override" and @PartName and @ContentType]'):
            name = item.get('PartName').lstrip('/')
            self.content_types[name] = item.get('ContentType')

    def content_type(self, name):
        if name in self.content_types:
            return self.content_types[name]
        ext = name.rpartition('.')[-1].lower()
        if ext in self.default_content_types:
            return self.default_content_types[ext]
        return guess_type(name)[0]

    def read_package_relationships(self):
        try:
            raw = self.read('_rels/.rels')
        except KeyError:
            raise InvalidDOCX('The file %s docx file has no _rels/.rels' % self.name)
        root = fromstring(raw)
        self.relationships = {}
        self.relationships_rmap = {}
        for item in root.xpath('//*[local-name()="Relationships"]/*[local-name()="Relationship" and @Type and @Target]'):
            target = item.get('Target').lstrip('/')
            typ = item.get('Type')
            self.relationships[typ] = target
            self.relationships_rmap[target] = typ

    @property
    def document_name(self):
        name = self.relationships.get(DOCUMENT, None)
        if name is None:
            names = tuple(n for n in self.names if n == 'document.xml' or n.endswith('/document.xml'))
            if not names:
                raise InvalidDOCX('The file %s docx file has no main document' % self.name)
            name = names[0]
        return name

    @property
    def document(self):
        return fromstring(self.read(self.document_name))

    @property
    def document_relationships(self):
        return self.get_relationships(self.document_name)

    def get_relationships(self, name):
        base = '/'.join(name.split('/')[:-1])
        by_id, by_type = {}, {}
        parts = name.split('/')
        name = '/'.join(parts[:-1] + ['_rels', parts[-1] + '.rels'])
        try:
            raw = self.read(name)
        except KeyError:
            pass
        else:
            root = fromstring(raw)
            for item in root.xpath('//*[local-name()="Relationships"]/*[local-name()="Relationship" and @Type and @Target]'):
                target = item.get('Target')
                if item.get('TargetMode', None) != 'External' and not target.startswith('#'):
                    target = '/'.join((base, target.lstrip('/')))
                typ = item.get('Type')
                Id = item.get('Id')
                by_id[Id] = by_type[typ] = target

        return by_id, by_type

    @property
    def metadata(self):
        mi = Metadata(_('Unknown'))
        name = self.relationships.get(DOCPROPS, None)
        if name is None:
            names = tuple(n for n in self.names if n.lower() == 'docprops/core.xml')
            if names:
                name = names[0]
        if name:
            try:
                raw = self.read(name)
            except KeyError:
                pass
            else:
                read_doc_props(raw, mi)
        if mi.is_null('language'):
            try:
                raw = self.read('word/styles.xml')
            except KeyError:
                pass
            else:
                read_default_style_language(raw, mi)

        name = self.relationships.get(APPPROPS, None)
        if name is None:
            names = tuple(n for n in self.names if n.lower() == 'docprops/app.xml')
            if names:
                name = names[0]
        if name:
            try:
                raw = self.read(name)
            except KeyError:
                pass
            else:
                read_app_props(raw, mi)

        return mi

    def close(self):
        if hasattr(self, 'zipf'):
            self.zipf.close()
        else:
            try:
                shutil.rmtree(self.tdir)
            except EnvironmentError:
                pass
Example #22
0
    def _get_epub_toc(self, cid=None, path=None, prepend_title=None):
        '''
        Given a calibre id, return the epub TOC indexed by section
        If cid, use copy in library, else use path to copy on device
        '''
        toc = None
        #         if cid is not None:
        #             mi = self.opts.gui.current_db.get_metadata(cid, index_is_id=True)
        #             toc = None
        #             if 'EPUB' in mi.formats:
        #                 fpath = self.opts.gui.current_db.format(cid, 'EPUB', index_is_id=True, as_path=True)
        #             else:
        #                 return toc
        #         elif path is not None:
        #             fpath = path
        #         else:
        #             return toc
        fpath = path

        # iBooks stores books unzipped
        # Marvin stores books zipped
        # Need spine, ncx_tree to construct toc

        if self.ios.stat(fpath) and self.ios.stat(
                fpath)['st_ifmt'] == 'S_IFDIR':
            # Find the OPF in the unzipped ePub
            fp = '/'.join([fpath, 'META-INF', 'container.xml'])
            cf = io.BytesIO(self.ios.read(fp))
            container = etree.parse(cf)
            opf_file = container.xpath('.//*[local-name()="rootfile"]')[0].get(
                'full-path')
            oebps = opf_file.rpartition('/')[0]

            fp = '/'.join([fpath, opf_file])
            opf = io.BytesIO(self.ios.read(fp))
            opf_tree = etree.parse(opf)
            spine = opf_tree.xpath('.//*[local-name()="spine"]')[0]
            ncx_fs = spine.get('toc')
            manifest = opf_tree.xpath('.//*[local-name()="manifest"]')[0]
            ncx_file = manifest.find('.//*[@id="%s"]' % ncx_fs).get('href')

            fp = '/'.join([fpath, oebps, ncx_file])
            ncxf = io.BytesIO(self.ios.read(fp))
            ncx_tree = etree.parse(ncxf)
            #self._log(etree.tostring(ncx_tree, pretty_print=True))

        else:
            # Find the OPF file in the zipped ePub
            zfo = io.BytesIO(self.ios.read(fpath, mode='rb'))
            try:
                zf = ZipFile(zfo, 'r')
                container = etree.fromstring(zf.read('META-INF/container.xml'))
                opf_tree = etree.fromstring(
                    zf.read(
                        container.xpath('.//*[local-name()="rootfile"]')
                        [0].get('full-path')))

                spine = opf_tree.xpath('.//*[local-name()="spine"]')[0]
                ncx_fs = spine.get('toc')
                manifest = opf_tree.xpath('.//*[local-name()="manifest"]')[0]
                ncx = manifest.find('.//*[@id="%s"]' % ncx_fs).get('href')

                # Find the ncx file
                fnames = zf.namelist()
                _ncx = [x for x in fnames if ncx in x][0]
                ncx_tree = etree.fromstring(zf.read(_ncx))
            except:
                import traceback
                self._log_location()
                self._log(" unable to unzip '%s'" % fpath)
                self._log(traceback.format_exc())
                return toc

        # fpath points to epub (zipped or unzipped dir)
        # spine, ncx_tree populated
        try:
            toc = OrderedDict()
            # 1. capture idrefs from spine
            for i, el in enumerate(spine):
                toc[str(i)] = el.get('idref')

            # 2. Resolve <spine> idrefs to <manifest> hrefs
            for el in toc:
                toc[el] = manifest.find('.//*[@id="%s"]' % toc[el]).get('href')

            # 3. Build a dict of src:toc_entry
            src_map = OrderedDict()
            navMap = ncx_tree.xpath('.//*[local-name()="navMap"]')[0]
            for navPoint in navMap:
                # Get the first-level entry
                src = re.sub(
                    r'#.*$', '',
                    navPoint.xpath('.//*[local-name()="content"]')[0].get(
                        'src'))
                toc_entry = navPoint.xpath('.//*[local-name()="text"]')[0].text
                src_map[src] = toc_entry

                # Get any nested navPoints
                nested_navPts = navPoint.xpath('.//*[local-name()="navPoint"]')
                for nnp in nested_navPts:
                    src = re.sub(
                        r'#.*$', '',
                        nnp.xpath('.//*[local-name()="content"]')[0].get(
                            'src'))
                    toc_entry = nnp.xpath('.//*[local-name()="text"]')[0].text
                    src_map[src] = toc_entry

            # Resolve src paths to toc_entry
            for section in toc:
                if toc[section] in src_map:
                    if prepend_title:
                        toc[section] = "%s &middot; %s" % (
                            prepend_title, src_map[toc[section]])
                    else:
                        toc[section] = src_map[toc[section]]
                else:
                    toc[section] = None

            # 5. Fill in the gaps
            current_toc_entry = None
            for section in toc:
                if toc[section] is None:
                    toc[section] = current_toc_entry
                else:
                    current_toc_entry = toc[section]
        except:
            import traceback
            self._log_location()
            self._log("{:~^80}".format(" error parsing '%s' " % fpath))
            self._log(traceback.format_exc())
            self._log("{:~^80}".format(" end traceback "))

        return toc
Example #23
0
class DOCX(object):

    def __init__(self, path_or_stream, log=None, extract=True):
        self.docx_is_transitional = True
        stream = path_or_stream if hasattr(path_or_stream, 'read') else open(path_or_stream, 'rb')
        self.name = getattr(stream, 'name', None) or '<stream>'
        self.log = log or default_log
        if extract:
            self.extract(stream)
        else:
            self.init_zipfile(stream)
        self.read_content_types()
        self.read_package_relationships()
        self.namespace = DOCXNamespace(self.docx_is_transitional)

    def init_zipfile(self, stream):
        self.zipf = ZipFile(stream)
        self.names = frozenset(self.zipf.namelist())

    def extract(self, stream):
        self.tdir = PersistentTemporaryDirectory('docx_container')
        try:
            zf = ZipFile(stream)
            zf.extractall(self.tdir)
        except:
            self.log.exception('DOCX appears to be invalid ZIP file, trying a'
                    ' more forgiving ZIP parser')
            from calibre.utils.localunzip import extractall
            stream.seek(0)
            extractall(stream, self.tdir)

        self.names = {}
        for f in walk(self.tdir):
            name = os.path.relpath(f, self.tdir).replace(os.sep, '/')
            self.names[name] = f

    def exists(self, name):
        return name in self.names

    def read(self, name):
        if hasattr(self, 'zipf'):
            return self.zipf.open(name).read()
        path = self.names[name]
        with open(path, 'rb') as f:
            return f.read()

    def read_content_types(self):
        try:
            raw = self.read('[Content_Types].xml')
        except KeyError:
            raise InvalidDOCX('The file %s docx file has no [Content_Types].xml' % self.name)
        root = fromstring(raw)
        self.content_types = {}
        self.default_content_types = {}
        for item in root.xpath('//*[local-name()="Types"]/*[local-name()="Default" and @Extension and @ContentType]'):
            self.default_content_types[item.get('Extension').lower()] = item.get('ContentType')
        for item in root.xpath('//*[local-name()="Types"]/*[local-name()="Override" and @PartName and @ContentType]'):
            name = item.get('PartName').lstrip('/')
            self.content_types[name] = item.get('ContentType')

    def content_type(self, name):
        if name in self.content_types:
            return self.content_types[name]
        ext = name.rpartition('.')[-1].lower()
        if ext in self.default_content_types:
            return self.default_content_types[ext]
        return guess_type(name)[0]

    def read_package_relationships(self):
        try:
            raw = self.read('_rels/.rels')
        except KeyError:
            raise InvalidDOCX('The file %s docx file has no _rels/.rels' % self.name)
        root = fromstring(raw)
        self.relationships = {}
        self.relationships_rmap = {}
        for item in root.xpath('//*[local-name()="Relationships"]/*[local-name()="Relationship" and @Type and @Target]'):
            target = item.get('Target').lstrip('/')
            typ = item.get('Type')
            if target == 'word/document.xml':
                self.docx_is_transitional = typ != 'http://purl.oclc.org/ooxml/officeDocument/relationships/officeDocument'
            self.relationships[typ] = target
            self.relationships_rmap[target] = typ

    @property
    def document_name(self):
        name = self.relationships.get(self.namespace.names['DOCUMENT'], None)
        if name is None:
            names = tuple(n for n in self.names if n == 'document.xml' or n.endswith('/document.xml'))
            if not names:
                raise InvalidDOCX('The file %s docx file has no main document' % self.name)
            name = names[0]
        return name

    @property
    def document(self):
        return fromstring(self.read(self.document_name))

    @property
    def document_relationships(self):
        return self.get_relationships(self.document_name)

    def get_relationships(self, name):
        base = '/'.join(name.split('/')[:-1])
        by_id, by_type = {}, {}
        parts = name.split('/')
        name = '/'.join(parts[:-1] + ['_rels', parts[-1] + '.rels'])
        try:
            raw = self.read(name)
        except KeyError:
            pass
        else:
            root = fromstring(raw)
            for item in root.xpath('//*[local-name()="Relationships"]/*[local-name()="Relationship" and @Type and @Target]'):
                target = item.get('Target')
                if item.get('TargetMode', None) != 'External' and not target.startswith('#'):
                    target = '/'.join((base, target.lstrip('/')))
                typ = item.get('Type')
                Id = item.get('Id')
                by_id[Id] = by_type[typ] = target

        return by_id, by_type

    def get_document_properties_names(self):
        name = self.relationships.get(self.namespace.names['DOCPROPS'], None)
        if name is None:
            names = tuple(n for n in self.names if n.lower() == 'docprops/core.xml')
            if names:
                name = names[0]
        yield name
        name = self.relationships.get(self.namespace.names['APPPROPS'], None)
        if name is None:
            names = tuple(n for n in self.names if n.lower() == 'docprops/app.xml')
            if names:
                name = names[0]
        yield name

    @property
    def metadata(self):
        mi = Metadata(_('Unknown'))
        dp_name, ap_name = self.get_document_properties_names()
        if dp_name:
            try:
                raw = self.read(dp_name)
            except KeyError:
                pass
            else:
                read_doc_props(raw, mi, self.namespace.XPath)
        if mi.is_null('language'):
            try:
                raw = self.read('word/styles.xml')
            except KeyError:
                pass
            else:
                read_default_style_language(raw, mi, self.namespace.XPath)

        ap_name = self.relationships.get(self.namespace.names['APPPROPS'], None)
        if ap_name:
            try:
                raw = self.read(ap_name)
            except KeyError:
                pass
            else:
                read_app_props(raw, mi)

        return mi

    def close(self):
        if hasattr(self, 'zipf'):
            self.zipf.close()
        else:
            try:
                shutil.rmtree(self.tdir)
            except EnvironmentError:
                pass
    def _get_epub_toc(self, cid=None, path=None, prepend_title=None):
        '''
        Given a calibre id, return the epub TOC indexed by section
        If cid, use copy in library, else use path to copy on device
        '''
        toc = None
#         if cid is not None:
#             mi = self.opts.gui.current_db.get_metadata(cid, index_is_id=True)
#             toc = None
#             if 'EPUB' in mi.formats:
#                 fpath = self.opts.gui.current_db.format(cid, 'EPUB', index_is_id=True, as_path=True)
#             else:
#                 return toc
#         elif path is not None:
#             fpath = path
#         else:
#             return toc
        fpath = path

        # iBooks stores books unzipped
        # Marvin stores books zipped
        # Need spine, ncx_tree to construct toc

        if self.ios.stat(fpath) and self.ios.stat(fpath)['st_ifmt'] == 'S_IFDIR':
            # Find the OPF in the unzipped ePub
            fp = '/'.join([fpath, 'META-INF', 'container.xml'])
            cf = cStringIO.StringIO(self.ios.read(fp))
            container = etree.parse(cf)
            opf_file = container.xpath('.//*[local-name()="rootfile"]')[0].get('full-path')
            oebps = opf_file.rpartition('/')[0]

            fp = '/'.join([fpath, opf_file])
            opf = cStringIO.StringIO(self.ios.read(fp))
            opf_tree = etree.parse(opf)
            spine = opf_tree.xpath('.//*[local-name()="spine"]')[0]
            ncx_fs = spine.get('toc')
            manifest = opf_tree.xpath('.//*[local-name()="manifest"]')[0]
            ncx_file = manifest.find('.//*[@id="%s"]' % ncx_fs).get('href')

            fp = '/'.join([fpath, oebps, ncx_file])
            ncxf = cStringIO.StringIO(self.ios.read(fp))
            ncx_tree = etree.parse(ncxf)
            #self._log(etree.tostring(ncx_tree, pretty_print=True))

        else:
            # Find the OPF file in the zipped ePub
            zfo = cStringIO.StringIO(self.ios.read(fpath, mode='rb'))
            try:
                zf = ZipFile(zfo, 'r')
                container = etree.fromstring(zf.read('META-INF/container.xml'))
                opf_tree = etree.fromstring(zf.read(container.xpath('.//*[local-name()="rootfile"]')[0].get('full-path')))

                spine = opf_tree.xpath('.//*[local-name()="spine"]')[0]
                ncx_fs = spine.get('toc')
                manifest = opf_tree.xpath('.//*[local-name()="manifest"]')[0]
                ncx = manifest.find('.//*[@id="%s"]' % ncx_fs).get('href')

                # Find the ncx file
                fnames = zf.namelist()
                _ncx = [x for x in fnames if ncx in x][0]
                ncx_tree = etree.fromstring(zf.read(_ncx))
            except:
                import traceback
                self._log_location()
                self._log(" unable to unzip '%s'" % fpath)
                self._log(traceback.format_exc())
                return toc

        # fpath points to epub (zipped or unzipped dir)
        # spine, ncx_tree populated
        try:
            toc = OrderedDict()
            # 1. capture idrefs from spine
            for i, el in enumerate(spine):
                toc[str(i)] = el.get('idref')

            # 2. Resolve <spine> idrefs to <manifest> hrefs
            for el in toc:
                toc[el] = manifest.find('.//*[@id="%s"]' % toc[el]).get('href')

            # 3. Build a dict of src:toc_entry
            src_map = OrderedDict()
            navMap = ncx_tree.xpath('.//*[local-name()="navMap"]')[0]
            for navPoint in navMap:
                # Get the first-level entry
                src = re.sub(r'#.*$', '', navPoint.xpath('.//*[local-name()="content"]')[0].get('src'))
                toc_entry = navPoint.xpath('.//*[local-name()="text"]')[0].text
                src_map[src] = toc_entry

                # Get any nested navPoints
                nested_navPts = navPoint.xpath('.//*[local-name()="navPoint"]')
                for nnp in nested_navPts:
                    src = re.sub(r'#.*$', '', nnp.xpath('.//*[local-name()="content"]')[0].get('src'))
                    toc_entry = nnp.xpath('.//*[local-name()="text"]')[0].text
                    src_map[src] = toc_entry

            # Resolve src paths to toc_entry
            for section in toc:
                if toc[section] in src_map:
                    if prepend_title:
                        toc[section] = "%s &middot; %s" % (prepend_title,  src_map[toc[section]])
                    else:
                        toc[section] = src_map[toc[section]]
                else:
                    toc[section] = None

            # 5. Fill in the gaps
            current_toc_entry = None
            for section in toc:
                if toc[section] is None:
                    toc[section] = current_toc_entry
                else:
                    current_toc_entry = toc[section]
        except:
            import traceback
            self._log_location()
            self._log("{:~^80}".format(" error parsing '%s' " % fpath))
            self._log(traceback.format_exc())
            self._log("{:~^80}".format(" end traceback "))

        return toc