Exemple #1
0
def performOPFSourceUpdates(data, newbkpath, oldbkpath, keylist, valuelist):
    data = _remove_xml_header(data)
    # lxml on a Mac does not seem to handle full unicode properly, so encode as utf-8
    data = data.encode('utf-8')
    # rebuild serialized lookup dictionary
    updates = {}
    for i in range(0, len(keylist)):
        updates[keylist[i]] = valuelist[i]
    xmlbuilder = LXMLTreeBuilderForXML(parser=None,
                                       empty_element_tags=ebook_xml_empty_tags)
    soup = BeautifulSoup(data,
                         features=None,
                         from_encoding="utf-8",
                         builder=xmlbuilder)
    for tag in soup.find_all(["link", "item", "reference", "site"]):
        if "href" in tag.attrs:
            href = tag["href"]
            if href.find(":") == -1:
                parts = href.split('#')
                ahref = unquoteurl(parts[0])
                fragment = ""
                if len(parts) > 1:
                    fragment = parts[1]
                oldtarget = buildBookPath(ahref, startingDir(oldbkpath))
                newtarget = updates.get(oldtarget, oldtarget)
                attribute_value = buildRelativePath(newbkpath, newtarget)
                if fragment != "":
                    attribute_value = attribute_value + "#" + fragment
                attribute_value = quoteurl(attribute_value)
                tag["href"] = attribute_value
    newdata = soup.decodexml(indent_level=0,
                             formatter='minimal',
                             indent_chars="  ")
    return newdata
Exemple #2
0
def anchorNCXUpdatesAfterMerge(data, ncx_bookpath, sink_bookpath,
                               merged_bookpaths):
    data = _remove_xml_header(data)
    startdir = startingDir(ncx_bookpath)
    # lxml on a Mac does not seem to handle full unicode properly, so encode as utf-8
    data = data.encode('utf-8')
    xmlbuilder = LXMLTreeBuilderForXML(parser=None,
                                       empty_element_tags=ebook_xml_empty_tags)
    soup = BeautifulSoup(data,
                         features=None,
                         from_encoding="utf-8",
                         builder=xmlbuilder)
    for tag in soup.find_all("content"):
        if "src" in tag.attrs:
            src = tag["src"]
            if src.find(":") == -1:
                parts = src.split('#')
                if parts is not None:
                    ahref = unquoteurl(parts[0])
                    target_bookpath = buildBookPath(ahref, startdir)
                    if target_bookpath in merged_bookpaths:
                        attribute_value = buildRelativePath(
                            ncx_bookpath, sink_bookpath)
                        if len(parts) > 1 and parts[1] != "":
                            attribute_value += "#" + parts[1]
                        tag["src"] = quoteurl(attribute_value)
    newdata = soup.decodexml(indent_level=0,
                             formatter='minimal',
                             indent_chars="  ")
    return newdata
Exemple #3
0
def performPageMapUpdates(data, newbkpath, oldbkpath, keylist, valuelist):
    data = _remove_xml_header(data)
    # lxml on a Mac does not seem to handle full unicode properly, so encode as utf-8
    data = data.encode('utf-8')
    # rebuild serialized lookup dictionary of xml_updates properly adjusted
    updates = OrderedDict()
    for i in range(0, len(keylist)):
        updates[ keylist[i] ] = valuelist[i]
    xml_empty_tags = ["page"]
    xmlbuilder = LXMLTreeBuilderForXML(parser=None, empty_element_tags=xml_empty_tags)
    soup = BeautifulSoup(data, features=None, from_encoding="utf-8", builder=xmlbuilder)
    for tag in soup.find_all(["page"]):
        for att in ["href"]:
            if att in tag.attrs :
                ref = tag[att]
                if ref.find(":") == -1 :
                    parts = ref.split('#')
                    apath = urldecodepart(parts[0])
                    fragment = ""
                    if len(parts) > 1:
                        fragment = urldecodepart(parts[1])
                    oldtarget = buildBookPath(apath, startingDir(oldbkpath))
                    newtarget = updates.get(oldtarget, oldtarget)
                    attribute_value = urlencodepart(buildRelativePath(newbkpath, newtarget))
                    if fragment != "":
                        attribute_value = attribute_value + "#" + urlencodepart(fragment)
                    tag[att] = attribute_value
    newdata = soup.decodexml(indent_level=0, formatter='minimal', indent_chars="  ")
    return newdata
Exemple #4
0
def anchorNCXUpdates(data, ncx_bookpath, originating_bookpath, keylist, valuelist):
    data = _remove_xml_header(data)
    # lxml on a Mac does not seem to handle full unicode properly, so encode as utf-8
    data = data.encode('utf-8')
    # rebuild serialized lookup dictionary
    id_dict = OrderedDict()
    for i in range(0, len(keylist)):
        id_dict[ keylist[i] ] = valuelist[i]
    startdir = startingDir(ncx_bookpath)
    xmlbuilder = LXMLTreeBuilderForXML(parser=None, empty_element_tags=ebook_xml_empty_tags)
    soup = BeautifulSoup(data, features=None, from_encoding="utf-8", builder=xmlbuilder)
    for tag in soup.find_all("content"):
        if "src" in tag.attrs:
            src = tag["src"]
            if src.find(":") == -1:
                parts = src.split('#')
                apath = urldecodepart(parts[0])
                # convert this path to its target bookpath
                target_bookpath = buildBookPath(apath, startdir)
                if (parts is not None) and (len(parts) > 1) and (target_bookpath == originating_bookpath) and (parts[1] != ""):
                    fragment_id = urldecodepart(parts[1])
                    if fragment_id in id_dict:
                        target_bookpath = id_dict[fragment_id]
                        attribute_value = urlencodepart(buildRelativePath(ncx_bookpath, target_bookpath))
                        attribute_value = attribute_value + "#" + urlencodepart(fragment_id)
                        tag["src"] = attribute_value;
    newdata = soup.decodexml(indent_level=0, formatter='minimal', indent_chars="  ")
    return newdata
Exemple #5
0
    def __init__(self, opf_path, opf_bookpath, debug=False):
        self._debug = debug
        opf_path = pathof(opf_path)
        self.opfname = os.path.basename(opf_path)
        self.opf_bookpath = opf_bookpath
        self.opf_dir = startingDir(opf_bookpath)
        self.opf = None
        with open(opf_path, 'rb') as fp:
            self.opf = fp.read().decode('utf-8')
        self.opos = 0
        self.package = None
        self.metadata_attr = None
        self.metadata = []
        self.cover_id = None

        # let downstream invert any invertable dictionaries when needed
        self.manifest_id_to_href = {}
        self.manifest_id_to_bookpath = {}

        # create non-invertable dictionaries
        self.manifest_id_to_mime = {}
        self.manifest_id_to_properties = {}
        self.manifest_id_to_fallback = {}
        self.manifest_id_to_overlay = {}

        # spine and guide
        self.spine = []
        self.spine_ppd = None
        self.guide = []
        self.bindings = []

        # determine folder structure
        self.group_folder = {}
        self.group_count = {}
        self.group_folder["epub"] = ['META-INF']
        self.group_count["epub"] = [1]
        self.group_folder["opf"] = [self.opf_dir]
        self.group_count["opf"] = [1]

        # self.bookpaths = []
        # self.bookpaths.append(self.opf_bookpath)

        self._parseData()
Exemple #6
0
 def get_startingdir(self, bookpath):
     bookpath = _unicodestr(bookpath)
     return startingDir(bookpath)
Exemple #7
0
    def _parseData(self):
        cnt = 0
        for prefix, tname, tattr, tcontent in self._opf_tag_iter():
            if self._debug:
                print("   Parsing OPF: ", prefix, tname, tattr, tcontent)
            # package
            if tname == "package":
                ver = tattr.pop("version", "2.0")
                uid = tattr.pop("unique-identifier", "bookid")
                self.package = (ver, uid, tattr)
                continue
            # metadata
            if tname == "metadata":
                self.metadata_attr = tattr
                continue
            if tname in ["meta", "link"
                         ] or tname.startswith("dc:") and "metadata" in prefix:
                self.metadata.append((tname, tattr, tcontent))
                if tattr.get("name", "") == "cover":
                    self.cover_id = tattr.get("content", None)
                continue
            # manifest
            if tname == "item" and "manifest" in prefix:
                nid = "xid%03d" % cnt
                cnt += 1
                id = tattr.pop("id", nid)
                href = tattr.pop("href", '')
                mtype = tattr.pop("media-type", '')
                if mtype == "text/html":
                    mtype = "application/xhtml+xml"
                if mtype not in mime_group_map:
                    print("****Opf_Parser Warning****: Unknown MediaType: ",
                          mtype)
                href = unquoteurl(href)
                properties = tattr.pop("properties", None)
                fallback = tattr.pop("fallback", None)
                overlay = tattr.pop("media-overlay", None)

                # external resources are now allowed in the opf under epub3
                # we can ignore fragments here as these are links to files
                self.manifest_id_to_href[id] = href

                bookpath = ""
                if href.find(":") == -1:
                    bookpath = buildBookPath(href, self.opf_dir)
                self.manifest_id_to_bookpath[id] = bookpath
                self.manifest_id_to_mime[id] = mtype
                # self.bookpaths.append(bookpath)
                group = mime_group_map.get(mtype, '')
                if bookpath != "" and group != "":
                    folderlst = self.group_folder.get(group, [])
                    countlst = self.group_count.get(group, [])
                    sdir = startingDir(bookpath)
                    if sdir not in folderlst:
                        folderlst.append(sdir)
                        countlst.append(1)
                    else:
                        pos = folderlst.index(sdir)
                        countlst[pos] = countlst[pos] + 1
                    self.group_folder[group] = folderlst
                    self.group_count[group] = countlst
                self.manifest_id_to_properties[id] = properties
                self.manifest_id_to_fallback[id] = fallback
                self.manifest_id_to_overlay[id] = overlay
                continue
            # spine
            if tname == "spine":
                if tattr is not None:
                    self.spine_ppd = tattr.get("page-progression-direction",
                                               None)
                continue
            if tname == "itemref" and "spine" in prefix:
                idref = tattr.pop("idref", "")
                linear = tattr.pop("linear", None)
                properties = tattr.pop("properties", None)
                self.spine.append((idref, linear, properties))
                continue
            # guide
            if tname == "reference" and "guide" in prefix:
                type = tattr.pop("type", '')
                title = tattr.pop("title", '')
                href = unquoteurl(tattr.pop("href", ''))
                self.guide.append((type, title, href))
                continue
            # bindings (stored but ignored for now)
            if tname in ["mediaType", "mediatype"] and "bindings" in prefix:
                mtype = tattr.pop("media-type", "")
                handler = tattr.pop("handler", "")
                self.bindings.append((mtype, handler))
                continue

        # determine unique ShortPathName for each bookpath
        # start with filename and work back up the folders
        # spn = {}
        # dupset = set()
        # nameset = {}
        # lvl = 1
        # for bkpath in self.bookpaths:
        #     aname = build_short_name(bkpath, lvl)
        #     spn[bkpath] = aname
        #     if aname in nameset:
        #         dupset.add(aname)
        #         nameset[aname].append(bkpath)
        #     else:
        #         nameset[aname]=[bkpath]
        #
        # now work just through any to-do list of duplicates
        # until all duplicates are gone
        #
        # todolst = list(dupset)
        # while(todolst):
        #     dupset = set()
        #     lvl += 1
        #     for aname in todolst:
        #         bklst = nameset[aname]
        #         del nameset[aname]
        #         for bkpath in bklst:
        #             newname = build_short_name(bkpath, lvl)
        #             spn[bkpath] = newname
        #             if newname in nameset:
        #                 dupset.add(newname)
        #                 nameset[newname].append(bkpath)
        #             else:
        #                 nameset[newname] = [bkpath]
        #     todolst = list(dupset)

        # finally sort by number of files in dir to find default folders for each group
        dirlst = []
        use_lower_case = False
        for group in self.group_folder.keys():
            folders = self.group_folder[group]
            cnts = self.group_count[group]
            folders = [x for _, x in sorted(zip(cnts, folders), reverse=True)]
            self.group_folder[group] = folders
            if group in [
                    "Text", "Styles", "Images", "Audio", "Fonts", "Video",
                    "Misc"
            ]:
                afolder = folders[0]
                if afolder.find(group.lower()) > -1:
                    use_lower_case = True
                dirlst.append(folders[0])

        # now back fill any missing values
        # commonbase will end with a /
        commonbase = longestCommonPath(dirlst)
        if commonbase == "/":
            commonbase = ""
        for group in ["Styles", "Images", "Audio", "Fonts", "Video", "Misc"]:
            folders = self.group_folder.get(group, [])
            gname = group
            if use_lower_case:
                gname = gname.lower()
            if not folders:
                folders = [commonbase + gname]
                self.group_folder[group] = folders
Exemple #8
0
def parse_nav(qp, navdata, navbkpath, newdir):
    qp.setContent(navdata)
    toclist = []
    pagelist = []
    landmarks = []
    lvl = 0
    pgcnt = 0
    maxlvl = -1
    nav_type = None
    href = None
    title = ""
    play = 0
    navdir = startingDir(navbkpath)

    for txt, tp, tname, ttype, tattr in qp.parse_iter():
        if txt is not None:
            if ".a." in tp or tp.endswith(".a"):
                title = title + txt
            else:
                title = ""
        else:
            if tname == "nav":
                if ttype == "begin":
                    nav_type = tattr.get("epub:type", None)
                if ttype == "end":
                    nav_type = None
                continue
            if tname == "ol" and nav_type is not None and nav_type in ("toc","page-list","landmarks"):
                if ttype == "begin":
                    lvl += 1
                    if nav_type == "toc":
                        if lvl > maxlvl: maxlvl = lvl
                if ttype == "end": lvl -= 1
                continue
            if tname == "a" and ttype == "begin":
                href = tattr.get("href", "")
                href = unquoteurl(href)
                if href.find(":") == -1:
                    # first strip off any fragment
                    fragment = ""
                    if href.find("#") != -1:
                        href, fragment = href.split("#")
                    # find destination bookpath
                    if href.startswith("./"): href=href[2:]
                    if href == "":
                        destbkpath = navbkpath
                    else:
                        destbkpath = buildBookPath(href, navdir)
                    # create relative path to destbkpath from newdir
                    href = relativePath(destbkpath, newdir)
                    if fragment != "":
                        href = href + "#" + fragment
                epubtype = tattr.get("epub:type", None)
                continue
            if tname == "a" and ttype == "end":
                if nav_type == "toc":
                    play += 1
                    toclist.append((play, lvl, href, title))
                elif nav_type == "page-list":
                    pgcnt += 1
                    pagelist.append((pgcnt, href, title))
                elif nav_type == "landmarks":
                    if epubtype is not None:
                        gtype = _epubtype_guide_map.get(epubtype, None)
                        landmarks.append((gtype, href, title))
                title = ""
                continue

    return toclist, pagelist, landmarks, maxlvl, pgcnt