Exemple #1
0
    def extract_recognized_metadata(self):
        """Parse ``self.opfdata`` and sort its metadata into recognized/other.

        An ``OPFMetadataParser`` built from ``self.opfdata`` supplies the
        metadata entries (``self.md``), the list of ids (``self.idlst``),
        the attributes of the metadata tag (``self.metadata_attr``, stored
        as a copy) and the package tuple (``self.pkg``).

        Each metadata entry is a ``(name, content, attributes)`` tuple and
        is routed as follows:

        * the ``dc:identifier`` whose ``id`` equals the package unique id
          goes to ``self.other`` unchanged;
        * names found in ``_recognized_dc`` go to ``self.rec`` unchanged;
        * ``meta`` entries whose ``name`` attribute is not in ``_skip_meta``
          are rewritten to ``(name, content, remaining attributes)`` and go
          to ``self.rec``;
        * everything else goes to ``self.other``.

        For every recognized entry that has an ``id``, ``self.id2rec`` maps
        that id to the entry's position and the id is dropped from
        ``self.idlst``.

        Note: the attribute dict of a rewritten ``meta`` entry is modified
        in place, so the corresponding entry in ``self.md`` loses its
        ``name`` and ``content`` keys.
        """
        self.op = OPFMetadataParser(self.opfdata)
        self.md = self.op.get_metadata()
        self.idlst = self.op.get_idlst()
        # Check for None *before* copying; calling .copy() on None would
        # raise and make the fallback below unreachable.
        parsed_attr = self.op.get_metadata_attr()
        if parsed_attr is None:
            self.metadata_attr = OrderedDict()
        else:
            self.metadata_attr = parsed_attr.copy()
        self.pkg = self.op.get_package()

        # add the opf attribute namespace to the metadata tag for OPF 2
        # and make sure the dc namespace is there as well
        # (existing declarations are kept as they are)
        self.metadata_attr.setdefault(
            "xmlns:opf", "http://www.idpf.org/2007/opf")
        self.metadata_attr.setdefault(
            "xmlns:dc", "http://purl.org/dc/elements/1.1/")

        # first sort out recognized dc and other metadata
        # while building up id2rec map, and removing id from idlst

        # special case the cover image meta and the unique id meta
        (_ver, uid, _attr) = self.pkg

        numrec = 0
        for mentry in self.md:
            (mname, mcontent, mattr) = mentry

            if mname == "dc:identifier" and mattr.get("id", "") == uid:
                self.other.append(mentry)
                continue

            if mname in _recognized_dc:
                recognized = mentry
            elif (mname == "meta" and "name" in mattr
                    and mattr["name"] not in _skip_meta):
                # normal meta tag: its name attribute becomes the entry name
                # and its content attribute (if any) the entry content
                meta_name = mattr.pop("name")
                meta_content = mattr.pop("content", "")
                recognized = (meta_name, meta_content, mattr)
            else:
                self.other.append(mentry)
                continue

            self.rec.append(recognized)
            entry_id = mattr.get("id", None)
            if entry_id is not None:
                self.id2rec[entry_id] = numrec
                # tolerate ids the parser did not list (or listed only once
                # for a duplicated id) instead of raising ValueError
                if entry_id in self.idlst:
                    self.idlst.remove(entry_id)
            numrec += 1

        if _DEBUG:
            print("recongized", self.rec)
            print("other", self.other)
            print("idlst", self.idlst)
Exemple #2
0
    def extract_recognized_metadata(self):
        """Parse ``self.opfdata`` and sort its metadata into recognized/other.

        The parser results are stored on the instance: ``self.md`` (metadata
        entries), ``self.idlst`` (ids), ``self.metadata_attr`` (a copy of the
        metadata tag attributes, with the ``xmlns:opf`` and ``xmlns:dc``
        declarations added when absent) and ``self.pkg`` (package tuple).

        Every metadata entry is a ``(name, content, attributes)`` tuple:

        * the ``dc:identifier`` carrying the package unique id is appended
          to ``self.other`` untouched;
        * entries named in ``_recognized_dc`` are appended to ``self.rec``;
        * ``meta`` entries with a ``name`` attribute outside ``_skip_meta``
          are appended to ``self.rec`` as ``(name, content, other attrs)``;
        * all remaining entries are appended to ``self.other``.

        Recognized entries with an ``id`` are indexed in ``self.id2rec`` and
        that id is removed from ``self.idlst``.

        Note: rewriting a ``meta`` entry removes ``name``/``content`` from
        its attribute dict in place, which is the same dict held by
        ``self.md``.
        """
        self.op = OPFMetadataParser(self.opfdata)
        self.md = self.op.get_metadata()
        self.idlst = self.op.get_idlst()
        # Decide on the fallback before copying: None has no .copy().
        parsed_attr = self.op.get_metadata_attr()
        self.metadata_attr = {} if parsed_attr is None else parsed_attr.copy()
        self.pkg = self.op.get_package()

        # add the opf attribute namespace to the metadata tag for OPF 2
        # and make sure the dc namespace is there as well
        if "xmlns:opf" not in self.metadata_attr:
            self.metadata_attr["xmlns:opf"] = "http://www.idpf.org/2007/opf"
        if "xmlns:dc" not in self.metadata_attr:
            self.metadata_attr["xmlns:dc"] = "http://purl.org/dc/elements/1.1/"

        # first sort out recognized dc and other metadata
        # while building up id2rec map, and removing id from idlst

        # special case the cover image meta and the unique id meta
        (_ver, uid, _attr) = self.pkg

        numrec = 0
        for mentry in self.md:
            (mname, mcontent, mattr) = mentry

            if mname == "dc:identifier" and mattr.get("id", "") == uid:
                self.other.append(mentry)
                continue

            is_named_meta = (
                mname == "meta"
                and "name" in mattr
                and mattr["name"] not in _skip_meta
            )
            if mname not in _recognized_dc and not is_named_meta:
                self.other.append(mentry)
                continue

            if mname not in _recognized_dc:
                # normal meta tag; a missing content attribute yields ""
                # rather than a KeyError
                mname = mattr.pop("name")
                mcontent = mattr.pop("content", "")
                mentry = (mname, mcontent, mattr)

            self.rec.append(mentry)
            rec_id = mattr.get("id", None)
            if rec_id is not None:
                self.id2rec[rec_id] = numrec
                # list.remove raises ValueError for an absent value
                if rec_id in self.idlst:
                    self.idlst.remove(rec_id)
            numrec += 1

        if _DEBUG:
            print("recongized", self.rec)
            print("other", self.other)
            print("idlst", self.idlst)
Exemple #3
0
class MetadataProcessor(object):
    """Split OPF metadata into recognized entries and everything else.

    After ``extract_recognized_metadata()`` has run:

    * ``rec``   - recognized ``(name, content, attributes)`` entries
    * ``other`` - entries left untouched
    * ``id2rec`` - maps an entry id to its position in ``rec``
    * ``idlst`` - ids reported by the parser minus those of recognized entries
    * ``metadata_attr`` - attributes for the opening metadata tag
    * ``pkg`` - the package tuple returned by the parser
    """

    def __init__(self, opfdata):
        """Store the raw OPF data; nothing is parsed until extraction."""
        self.opfdata = opfdata
        self.rec = []
        self.pkg = None
        self.other = []
        self.op = None
        self.md = None
        self.id2rec = {}
        self.idlst = []
        self.metadata_attr = None

    def extract_recognized_metadata(self):
        """Parse ``self.opfdata`` and sort its metadata into ``rec``/``other``.

        Routing of each ``(name, content, attributes)`` entry:

        * the ``dc:identifier`` whose ``id`` equals the package unique id
          is appended to ``self.other`` unchanged;
        * names in ``_recognized_dc`` are appended to ``self.rec``;
        * ``meta`` entries whose ``name`` attribute is not in ``_skip_meta``
          are appended to ``self.rec`` as ``(name, content, other attrs)``;
        * anything else is appended to ``self.other``.

        Note: a rewritten ``meta`` entry has ``name``/``content`` removed
        from its attribute dict in place (the dict is shared with
        ``self.md``).
        """
        self.op = OPFMetadataParser(self.opfdata)
        self.md = self.op.get_metadata()
        self.idlst = self.op.get_idlst()
        # Test for None before copying; None has no .copy() method.
        parsed_attr = self.op.get_metadata_attr()
        self.metadata_attr = {} if parsed_attr is None else parsed_attr.copy()
        self.pkg = self.op.get_package()

        # add the opf attribute namespace to the metadata tag for OPF 2
        # and make sure the dc namespace is there as well
        self.metadata_attr.setdefault(
            "xmlns:opf", "http://www.idpf.org/2007/opf")
        self.metadata_attr.setdefault(
            "xmlns:dc", "http://purl.org/dc/elements/1.1/")

        # first sort out recognized dc and other metadata
        # while building up id2rec map, and removing id from idlst

        # special case the cover image meta and the unique id meta
        (_ver, uid, _attr) = self.pkg

        numrec = 0
        for mentry in self.md:
            (mname, mcontent, mattr) = mentry

            if mname == "dc:identifier" and mattr.get("id", "") == uid:
                self.other.append(mentry)
                continue

            if mname in _recognized_dc:
                recognized = mentry
            elif (mname == "meta" and "name" in mattr
                    and mattr["name"] not in _skip_meta):
                # normal meta tag; content defaults to "" when absent
                meta_name = mattr.pop("name")
                meta_content = mattr.pop("content", "")
                recognized = (meta_name, meta_content, mattr)
            else:
                self.other.append(mentry)
                continue

            self.rec.append(recognized)
            entry_id = mattr.get("id", None)
            if entry_id is not None:
                self.id2rec[entry_id] = numrec
                # an id missing from idlst must not abort the extraction
                if entry_id in self.idlst:
                    self.idlst.remove(entry_id)
            numrec += 1

        if _DEBUG:
            print("recongized", self.rec)
            print("other", self.other)
            print("idlst", self.idlst)

    def get_recognized_metadata(self):
        """Return the recognized metadata as a text based tree.

        Each entry yields ``name + _US + content + _RS`` followed by one
        ``_IN + key + _US + value + _RS`` record per attribute, attributes
        in sorted key order. Content and values pass through ``xmldecode``.
        """
        data = []
        for (dname, dcontent, dattr) in self.rec:
            data.append(dname + _US + xmldecode(dcontent) + _RS)
            for key in sorted(dattr):
                data.append(_IN + key + _US + xmldecode(dattr[key]) + _RS)
        return "".join(data)

    def get_other_meta_xml(self):
        """Return ``buildxml`` output for every entry in ``self.other``.

        Each fragment is prefixed with two spaces and the fragments are
        concatenated without any separator.
        """
        return "".join('  ' + buildxml(mentry) for mentry in self.other)

    def get_id_list(self):
        """Return the list of ids not claimed by recognized entries."""
        return self.idlst

    def get_metadata_tag(self):
        """Return the opening ``<metadata ...>`` tag followed by a newline.

        Attributes from ``self.metadata_attr`` are written in iteration
        order; values are inserted verbatim, without escaping.
        """
        res = ['<metadata']
        if self.metadata_attr is not None:
            for key, val in self.metadata_attr.items():
                res.append(' ' + key + '="' + val + '"')
        res.append('>\n')
        return "".join(res)
Exemple #4
0
class MetadataProcessor(object):
    """Split OPF metadata into recognized entries, refinements and the rest.

    After ``extract_recognized_metadata()`` has run:

    * ``rec``     - recognized ``(name, content, attributes)`` entries, with
      applicable refinements folded into their attributes
    * ``refines`` - every ``meta`` entry that had a ``refines`` attribute
    * ``other``   - entries left untouched
    * ``id2rec``  - maps an entry id to its position in ``rec``
    * ``idlst``   - ids reported by the parser minus the consumed ones
    * ``metadata_attr`` - attributes for the opening metadata tag
    * ``pkg``     - the package tuple returned by the parser
    """

    def __init__(self, opfdata):
        """Store the raw OPF data; nothing is parsed until extraction."""
        self.opfdata = opfdata
        self.rec = []
        self.refines = []
        self.other = []
        self.op = None
        self.md = None
        self.pkg = None
        self.id2rec = {}
        self.idlst = []
        self.metadata_attr = {}

    def extract_recognized_metadata(self):
        """Parse ``self.opfdata`` and classify every metadata entry.

        First pass, per ``(name, content, attributes)`` entry:

        * the ``dc:identifier`` whose ``id`` equals the package unique id
          goes to ``self.other`` unchanged;
        * names in ``_recognized_dc`` go to ``self.rec``;
        * ``meta`` entries with a ``refines`` attribute are collected in
          ``self.refines``;
        * ``meta`` entries whose ``property`` is in ``_recognized_meta`` go
          to ``self.rec`` with the property value as their name (the
          ``property`` key is removed from the attribute dict in place);
        * anything else goes to ``self.other``.

        Second pass: each refinement whose ``refines`` value is ``#`` plus
        the id of a recognized entry is folded into that entry's attributes;
        all other refinements are appended to ``self.other``.
        """
        self.op = OPFMetadataParser(self.opfdata)
        self.md = self.op.get_metadata()
        self.idlst = self.op.get_idlst()
        self.metadata_attr = self.op.get_metadata_attr()
        self.pkg = self.op.get_package()

        # first sort out recognized dc and primary meta from refines, and other metadata
        # while building up id2rec map, and removing id from idlst
        numrec = 0
        (_ver, uid, _attr) = self.pkg
        for mentry in self.md:
            (mname, mcontent, mattr) = mentry

            # do not allow the gui to play with the unique-identifier to
            # prevent font obfuscation issues later
            if mname == "dc:identifier" and mattr.get("id", "") == uid:
                self.other.append(mentry)
                continue

            if mname in _recognized_dc:
                recognized = mentry
            elif mname == "meta" and "refines" in mattr:
                self.refines.append(mentry)
                continue
            elif (mname == "meta" and "property" in mattr
                    and mattr["property"] in _recognized_meta):
                # primary meta tag: the property value becomes the name
                recognized = (mattr.pop("property"), mcontent, mattr)
            else:
                self.other.append(mentry)
                continue

            self.rec.append(recognized)
            entry_id = mattr.get("id", None)
            if entry_id is not None:
                self.id2rec[entry_id] = numrec
                # an id missing from idlst must not abort the extraction
                if entry_id in self.idlst:
                    self.idlst.remove(entry_id)
            numrec += 1

        # finally convert any refines on metadata to be extra attributes on their target tag
        # all other types of metadata are added to "others" so they are not touched in any way
        for mentry in self.refines:
            (_rname, rcontent, rattr) = mentry
            target = rattr["refines"]
            prop = rattr.get("property", None)
            # A refinement is folded in only if it names a property and
            # points ("#id") at a recognized entry; anything else (including
            # a refinement lacking a property attribute) is kept verbatim.
            if (prop is None or not target.startswith("#")
                    or target[1:] not in self.id2rec):
                self.other.append(mentry)
                continue

            pos = self.id2rec[target[1:]]
            (dname, dcontent, dattr) = self.rec[pos]
            dattr[prop] = rcontent
            scheme = rattr.get("scheme", None)
            if scheme is not None:
                dattr["scheme"] = scheme
            # altlang is only recorded when the refinement declares a
            # language; a missing xml:lang no longer raises KeyError
            if prop == "alternate-script" and "xml:lang" in rattr:
                dattr["altlang"] = rattr["xml:lang"]
            self.rec[pos] = (dname, dcontent, dattr)
            rid = rattr.get("id", None)
            if rid is not None and rid in self.idlst:
                self.idlst.remove(rid)

        if _DEBUG:
            print("recongized", self.rec)
            print("other", self.other)
            print("idlst", self.idlst)

    def get_recognized_metadata(self):
        """Return recognized metadata, refinements included, as a text tree.

        Each entry yields ``name + _US + content + _RS`` followed by one
        ``_IN + key + _US + value + _RS`` record per attribute, attributes
        in sorted key order. Content and values pass through ``xmldecode``.
        """
        data = []
        for (dname, dcontent, dattr) in self.rec:
            data.append(dname + _US + xmldecode(dcontent) + _RS)
            for key in sorted(dattr):
                data.append(_IN + key + _US + xmldecode(dattr[key]) + _RS)
        return "".join(data)

    def get_other_meta_xml(self):
        """Return ``buildxml`` output for every entry in ``self.other``.

        Each fragment is prefixed with two spaces and the fragments are
        concatenated without any separator.
        """
        return "".join('  ' + buildxml(mentry) for mentry in self.other)

    def get_id_list(self):
        """Return the list of ids not consumed during extraction."""
        return self.idlst

    def get_metadata_tag(self):
        """Return the opening ``<metadata ...>`` tag followed by a newline.

        Attributes from ``self.metadata_attr`` are written in iteration
        order; values are inserted verbatim, without escaping.
        """
        res = ['<metadata']
        if self.metadata_attr is not None:
            for key, val in self.metadata_attr.items():
                res.append(' ' + key + '="' + val + '"')
        res.append('>\n')
        return "".join(res)
Exemple #5
0
    def extract_recognized_metadata(self):
        """Parse ``self.opfdata`` and classify every metadata entry.

        The parser results are stored in ``self.md``, ``self.idlst``,
        ``self.metadata_attr`` and ``self.pkg``. Entries are
        ``(name, content, attributes)`` tuples.

        First pass:

        * the ``dc:identifier`` whose ``id`` equals the package unique id
          is appended to ``self.other`` unchanged;
        * names in ``_recognized_dc`` are appended to ``self.rec``;
        * ``meta`` entries with a ``refines`` attribute are collected in
          ``self.refines``;
        * ``meta`` entries whose ``property`` is in ``_recognized_meta`` are
          appended to ``self.rec`` under the property value as name (the
          ``property`` key is removed from the attribute dict in place);
        * anything else is appended to ``self.other``.

        Second pass: refinements targeting (``#id``) a recognized entry are
        merged into that entry's attributes; the rest go to ``self.other``.
        Ids of recognized entries and of merged refinements are removed
        from ``self.idlst``.
        """
        self.op = OPFMetadataParser(self.opfdata)
        self.md = self.op.get_metadata()
        self.idlst = self.op.get_idlst()
        self.metadata_attr = self.op.get_metadata_attr()
        self.pkg = self.op.get_package()

        # first sort out recognized dc and primary meta from refines, and other metadata
        # while building up id2rec map, and removing id from idlst
        numrec = 0
        (_ver, uid, _attr) = self.pkg
        for mentry in self.md:
            (mname, mcontent, mattr) = mentry

            # do not allow the gui to play with the unique-identifier to
            # prevent font obfuscation issues later
            if mname == "dc:identifier" and mattr.get("id", "") == uid:
                self.other.append(mentry)
                continue

            if mname in _recognized_dc:
                recognized = mentry
            elif mname == "meta" and "refines" in mattr:
                self.refines.append(mentry)
                continue
            elif (mname == "meta" and "property" in mattr
                    and mattr["property"] in _recognized_meta):
                # primary meta tag: the property value becomes the name
                recognized = (mattr.pop("property"), mcontent, mattr)
            else:
                self.other.append(mentry)
                continue

            self.rec.append(recognized)
            entry_id = mattr.get("id", None)
            if entry_id is not None:
                self.id2rec[entry_id] = numrec
                # list.remove raises ValueError for an absent value
                if entry_id in self.idlst:
                    self.idlst.remove(entry_id)
            numrec += 1

        # finally convert any refines on metadata to be extra attributes on their target tag
        # all other types of metadata are added to "others" so they are not touched in any way
        for mentry in self.refines:
            (_rname, rcontent, rattr) = mentry
            target = rattr["refines"]
            prop = rattr.get("property", None)
            # Keep verbatim whatever cannot be merged: no property to name
            # the new attribute, no "#" reference, or an unrecognized target.
            if (prop is None or not target.startswith("#")
                    or target[1:] not in self.id2rec):
                self.other.append(mentry)
                continue

            pos = self.id2rec[target[1:]]
            (dname, dcontent, dattr) = self.rec[pos]
            dattr[prop] = rcontent
            scheme = rattr.get("scheme", None)
            if scheme is not None:
                dattr["scheme"] = scheme
            # only record altlang when the refinement declares a language
            if prop == "alternate-script" and "xml:lang" in rattr:
                dattr["altlang"] = rattr["xml:lang"]
            self.rec[pos] = (dname, dcontent, dattr)
            rid = rattr.get("id", None)
            if rid is not None and rid in self.idlst:
                self.idlst.remove(rid)

        if _DEBUG:
            print("recongized", self.rec)
            print("other", self.other)
            print("idlst", self.idlst)
Exemple #6
0
class MetadataProcessor(object):
    """Classify OPF metadata for editing.

    ``extract_recognized_metadata()`` fills these attributes:

    * ``rec``     - recognized ``(name, content, attributes)`` entries, with
      matching refinements merged into their attributes
    * ``refines`` - every ``meta`` entry that had a ``refines`` attribute
    * ``other``   - entries passed through untouched
    * ``id2rec``  - entry id -> position in ``rec``
    * ``idlst``   - ids reported by the parser minus the consumed ones
    * ``metadata_attr`` - attributes for the opening metadata tag
    * ``pkg``     - the package tuple returned by the parser
    """

    def __init__(self, opfdata):
        """Remember the raw OPF data and start with empty result containers."""
        self.opfdata = opfdata
        self.rec = []
        self.refines = []
        self.other = []
        self.op = None
        self.md = None
        self.pkg = None
        self.id2rec = {}
        self.idlst = []
        self.metadata_attr = {}

    def extract_recognized_metadata(self):
        """Parse ``self.opfdata`` and classify every metadata entry.

        First pass, per ``(name, content, attributes)`` entry:

        * the ``dc:identifier`` whose ``id`` equals the package unique id
          goes to ``self.other`` unchanged;
        * names in ``_recognized_dc`` go to ``self.rec``;
        * ``meta`` entries with a ``refines`` attribute are collected in
          ``self.refines``;
        * ``meta`` entries whose ``property`` is in ``_recognized_meta`` go
          to ``self.rec`` with the property value as their name (the
          ``property`` key is removed from the attribute dict in place);
        * anything else goes to ``self.other``.

        Second pass: each refinement whose ``refines`` value is ``#`` plus
        the id of a recognized entry is merged into that entry's
        attributes; all other refinements are appended to ``self.other``.
        """
        self.op = OPFMetadataParser(self.opfdata)
        self.md = self.op.get_metadata()
        self.idlst = self.op.get_idlst()
        self.metadata_attr = self.op.get_metadata_attr()
        self.pkg = self.op.get_package()

        # first sort out recognized dc and primary meta from refines, and other metadata
        # while building up id2rec map, and removing id from idlst
        numrec = 0
        (_ver, uid, _attr) = self.pkg
        for mentry in self.md:
            (mname, mcontent, mattr) = mentry

            # do not allow the gui to play with the unique-identifier to
            # prevent font obfuscation issues later
            if mname == "dc:identifier" and mattr.get("id", "") == uid:
                self.other.append(mentry)
                continue

            if mname in _recognized_dc:
                recognized = mentry
            elif mname == "meta" and "refines" in mattr:
                self.refines.append(mentry)
                continue
            elif (mname == "meta" and "property" in mattr
                    and mattr["property"] in _recognized_meta):
                # primary meta tag: the property value becomes the name
                recognized = (mattr.pop("property"), mcontent, mattr)
            else:
                self.other.append(mentry)
                continue

            self.rec.append(recognized)
            entry_id = mattr.get("id", None)
            if entry_id is not None:
                self.id2rec[entry_id] = numrec
                # an id missing from idlst must not abort the extraction
                if entry_id in self.idlst:
                    self.idlst.remove(entry_id)
            numrec += 1

        # finally convert any refines on metadata to be extra attributes on their target tag
        # all other types of metadata are added to "others" so they are not touched in any way
        for mentry in self.refines:
            (_rname, rcontent, rattr) = mentry
            target = rattr["refines"]
            prop = rattr.get("property", None)
            # A refinement is merged only if it names a property and points
            # ("#id") at a recognized entry; anything else is kept verbatim.
            if (prop is None or not target.startswith("#")
                    or target[1:] not in self.id2rec):
                self.other.append(mentry)
                continue

            pos = self.id2rec[target[1:]]
            (dname, dcontent, dattr) = self.rec[pos]
            dattr[prop] = rcontent
            scheme = rattr.get("scheme", None)
            if scheme is not None:
                dattr["scheme"] = scheme
            # altlang is only recorded when the refinement declares a
            # language; a missing xml:lang no longer raises KeyError
            if prop == "alternate-script" and "xml:lang" in rattr:
                dattr["altlang"] = rattr["xml:lang"]
            self.rec[pos] = (dname, dcontent, dattr)
            rid = rattr.get("id", None)
            if rid is not None and rid in self.idlst:
                self.idlst.remove(rid)

        if _DEBUG:
            print("recongized", self.rec)
            print("other", self.other)
            print("idlst", self.idlst)

    def get_recognized_metadata(self):
        """Return recognized metadata, refinements included, as a text tree.

        Each entry yields ``name + _US + content + _RS`` followed by one
        ``_IN + key + _US + value + _RS`` record per attribute, attributes
        in sorted key order. Content and values pass through ``xmldecode``.
        """
        data = []
        for (dname, dcontent, dattr) in self.rec:
            data.append(dname + _US + xmldecode(dcontent) + _RS)
            for key in sorted(dattr):
                data.append(_IN + key + _US + xmldecode(dattr[key]) + _RS)
        return "".join(data)

    def get_other_meta_xml(self):
        """Return ``buildxml`` output for every entry in ``self.other``.

        Each fragment is prefixed with two spaces and the fragments are
        concatenated without any separator.
        """
        return "".join('  ' + buildxml(mentry) for mentry in self.other)

    def get_id_list(self):
        """Return the list of ids not consumed during extraction."""
        return self.idlst

    def get_metadata_tag(self):
        """Return the opening ``<metadata ...>`` tag followed by a newline.

        Attributes from ``self.metadata_attr`` are written in iteration
        order; values are inserted verbatim, without escaping.
        """
        res = ['<metadata']
        if self.metadata_attr is not None:
            for key, val in self.metadata_attr.items():
                res.append(' ' + key + '="' + val + '"')
        res.append('>\n')
        return "".join(res)
Exemple #7
0
    def extract_recognized_metadata(self):
        """Parse ``self.opfdata`` and classify every metadata entry.

        Stores the parser results in ``self.md``, ``self.idlst``,
        ``self.metadata_attr`` and ``self.pkg``, then processes the
        ``(name, content, attributes)`` entries in two passes.

        First pass:

        * the ``dc:identifier`` whose ``id`` equals the package unique id
          is appended to ``self.other`` unchanged;
        * names in ``_recognized_dc`` are appended to ``self.rec``;
        * ``meta`` entries with a ``refines`` attribute are collected in
          ``self.refines``;
        * ``meta`` entries whose ``property`` is in ``_recognized_meta`` are
          appended to ``self.rec`` under the property value as name (the
          ``property`` key is removed from the attribute dict in place);
        * anything else is appended to ``self.other``.

        Second pass: refinements targeting (``#id``) a recognized entry are
        merged into that entry's attributes; the rest go to ``self.other``.
        Ids of recognized entries and of merged refinements are removed
        from ``self.idlst``.
        """
        self.op = OPFMetadataParser(self.opfdata)
        self.md = self.op.get_metadata()
        self.idlst = self.op.get_idlst()
        self.metadata_attr = self.op.get_metadata_attr()
        self.pkg = self.op.get_package()

        # first sort out recognized dc and primary meta from refines, and other metadata
        # while building up id2rec map, and removing id from idlst
        numrec = 0
        (_ver, uid, _attr) = self.pkg
        for mentry in self.md:
            (mname, mcontent, mattr) = mentry

            # do not allow the gui to play with the unique-identifier to
            # prevent font obfuscation issues later
            if mname == "dc:identifier" and mattr.get("id", "") == uid:
                self.other.append(mentry)
                continue

            if mname == "meta" and mname not in _recognized_dc:
                if "refines" in mattr:
                    # handled in the second pass below
                    self.refines.append(mentry)
                    continue
                if not ("property" in mattr
                        and mattr["property"] in _recognized_meta):
                    self.other.append(mentry)
                    continue
                # primary meta tag: the property value becomes the name
                mentry = (mattr.pop("property"), mcontent, mattr)
            elif mname not in _recognized_dc:
                self.other.append(mentry)
                continue

            self.rec.append(mentry)
            rec_id = mattr.get("id", None)
            if rec_id is not None:
                self.id2rec[rec_id] = numrec
                # list.remove raises ValueError for an absent value
                if rec_id in self.idlst:
                    self.idlst.remove(rec_id)
            numrec += 1

        # finally convert any refines on metadata to be extra attributes on their target tag
        # all other types of metadata are added to "others" so they are not touched in any way
        for mentry in self.refines:
            (_rname, rcontent, rattr) = mentry
            target = rattr["refines"]
            prop = rattr.get("property", None)
            # Keep verbatim whatever cannot be merged: no property to name
            # the new attribute, no "#" reference, or an unrecognized target.
            if (prop is None or not target.startswith("#")
                    or target[1:] not in self.id2rec):
                self.other.append(mentry)
                continue

            pos = self.id2rec[target[1:]]
            (dname, dcontent, dattr) = self.rec[pos]
            dattr[prop] = rcontent
            scheme = rattr.get("scheme", None)
            if scheme is not None:
                dattr["scheme"] = scheme
            # only record altlang when the refinement declares a language
            if prop == "alternate-script" and "xml:lang" in rattr:
                dattr["altlang"] = rattr["xml:lang"]
            self.rec[pos] = (dname, dcontent, dattr)
            rid = rattr.get("id", None)
            if rid is not None and rid in self.idlst:
                self.idlst.remove(rid)

        if _DEBUG:
            print("recongized", self.rec)
            print("other", self.other)
            print("idlst", self.idlst)