Beispiel #1
0
    def to_etree(self, parent=None):
        """Return an ``rdf:RDF`` element holding this object's metadata.

        The returned element has a single ``rdf:Description`` child. That
        child gets an ``rdf:about`` attribute when ``self.about`` is truthy
        and one sub-element (tagged with ``field.uri``) per value of every
        entry in ``self.FIELDS`` whose attribute on ``self`` is not ``None``.

        :param parent: optional element; when given, the root is created
            through ``parent.makeelement()``. This method does not itself
            append the root to ``parent``.
        :returns: the newly built ``rdf:RDF`` element.
        """
        rdf_tag = RDFNS('RDF')
        if parent is not None:
            root = parent.makeelement(rdf_tag)
        else:
            root = etree.Element(rdf_tag)

        description = etree.SubElement(root, RDFNS('Description'))
        if self.about:
            description.set(RDFNS('about'), self.about)

        for field in self.FIELDS:
            value = getattr(self, field.name, None)
            if value is None:
                continue

            if not field.multiple:
                # Single-valued field: exactly one child element.
                node = etree.Element(field.uri)
                node.text = six.text_type(value)
                description.append(node)
                continue

            if len(value) == 0:
                continue
            for item in value:
                # A None item still produces an (empty) element.
                node = etree.Element(field.uri)
                if item is not None:
                    node.text = six.text_type(item)
                description.append(node)

        return root
Beispiel #2
0
    def from_file(cls, xmlfile, *args, **kwargs):
        """Build an instance from the first ``rdf:RDF`` section of ``xmlfile``.

        The file is parsed incrementally: events are consumed until the start
        of an ``rdf:RDF`` element is seen, then until an end event for an
        ``rdf:RDF`` element, and the element captured at the start event is
        handed to ``cls.from_element``.

        :param xmlfile: source passed straight to ``etree.iterparse``.
        :param args: extra positional arguments forwarded to ``from_element``.
        :param kwargs: extra keyword arguments forwarded to ``from_element``.
        :returns: whatever ``cls.from_element`` returns.
        :raises NoDublinCore: when no ``rdf:RDF`` start event is found.
        :raises ParseError: wrapping any ``XMLSyntaxError`` or ``ExpatError``
            raised while parsing.
        """
        rdf_tag = RDFNS('RDF')
        desc_tag = None
        try:
            events = etree.iterparse(xmlfile, ['start', 'end'])
            for (event, element) in events:
                if element.tag == rdf_tag and event == 'start':
                    desc_tag = element
                    break

            if desc_tag is None:
                # Adjacent literals instead of a backslash continuation, so
                # the source indentation does not leak into the message.
                raise NoDublinCore(
                    "DublinCore section not found. "
                    "Check if there are rdf:RDF and rdf:Description tags.")

            # continue 'till the end of RDF section
            for (event, element) in events:
                if element.tag == rdf_tag and event == 'end':
                    break

            # if there is no end, Expat should yell at us with an ExpatError

            # extract data from the element and make the info
            return cls.from_element(desc_tag, *args, **kwargs)
        except (XMLSyntaxError, ExpatError) as e:
            raise ParseError(e)
Beispiel #3
0
def mark_subauthors(doc):
    """Flag nested works whose author differs from the root and previous one.

    For every element matched by ``/utwor/utwor`` the texts of its
    ``creator_parsed`` descendants are joined with ``', '``. When that string
    equals neither the root document's author string nor the author string of
    the previously visited nested work, an empty ``use_subauthor`` element is
    appended to the nested work's ``rdf:RDF`` element.

    :param doc: parsed document; it is modified in place.
    """
    creator_tag = DCNS('creator_parsed')
    rdf_tag = RDFNS('RDF')

    def joined_authors(node, path):
        # All creator texts found under ``path``, comma separated.
        return ', '.join(found.text for found in node.findall(path))

    root_author = joined_authors(doc, './' + rdf_tag + '//' + creator_tag)
    previous_author = None

    # If the author differs from the author of the whole work and from the
    # previous author, do we insert some marker into the rdf?
    for part in doc.xpath('/utwor/utwor'):
        part_author = joined_authors(part, './/' + creator_tag)
        differs_from_previous = part_author != previous_author
        differs_from_root = part_author != root_author
        if differs_from_previous and differs_from_root:
            part_rdf = part.find('.//' + rdf_tag)
            part_rdf.append(etree.Element('use_subauthor'))
        previous_author = part_author
Beispiel #4
0
    def __init__(self,
                 edoc,
                 parse_dublincore=True,
                 provider=None,
                 strict=False,
                 meta_fallbacks=None):
        """Wrap a parsed document whose root element must be ``utwor``.

        :param edoc: parsed tree; its ``getroot()`` supplies the root element.
        :param parse_dublincore: when true, locate the ``rdf:RDF`` element and
            build ``self.book_info`` from it; otherwise ``book_info`` is None.
        :param provider: stored unchanged as ``self.provider``.
        :param strict: forwarded to ``dcparser.BookInfo.from_element``.
        :param meta_fallbacks: forwarded to ``from_element`` as ``fallbacks``.
        :raises ValidationError: when the root tag is not ``utwor``.
        :raises NoDublinCore: when Dublin Core parsing is requested but no
            ``rdf:RDF`` element exists below the root.
        """
        self.edoc = edoc
        self.provider = provider

        root = edoc.getroot()
        if root.tag != 'utwor':
            message = (
                "Invalid root element. Found '%s', should be 'utwor'" %
                root.tag)
            raise ValidationError(message)

        if not parse_dublincore:
            self.book_info = None
            return

        self.rdf_elem = root.find('.//' + RDFNS('RDF'))
        if self.rdf_elem is None:
            raise NoDublinCore(
                'Document has no DublinCore - which is required.')

        self.book_info = dcparser.BookInfo.from_element(
            self.rdf_elem, fallbacks=meta_fallbacks, strict=strict)
Beispiel #5
0
    def from_element(cls, rdf_tag, *args, **kwargs):
        """Build an instance from an already parsed ``rdf:RDF`` element.

        Every child of the first ``rdf:Description`` descendant contributes
        one value to a dictionary mapping the child's tag to a list of
        values. A child with text yields a ``TextPlus`` whose ``lang`` is its
        own ``xml:lang`` attribute, falling back to the nearest ``xml:lang``
        found on the description or its ancestors; a child without text
        yields ``None``. A ``meta`` child with text whose ``id`` attribute
        ends in ``-id`` is additionally stored under that id, with ``ISBN-``
        replaced by ``ISBN ``.

        :param rdf_tag: element searched for ``rdf:Description``.
        :param args: extra positional arguments forwarded to ``cls``.
        :param kwargs: extra keyword arguments forwarded to ``cls``.
        :returns: ``cls(desc.attrib, field_dict, *args, **kwargs)``.
        :raises NoDublinCore: when no ``rdf:Description`` is found.
        """
        # the tree is already parsed, so we don't need to worry about Expat errors
        desc = rdf_tag.find(".//" + RDFNS('Description'))

        if desc is None:
            raise NoDublinCore("No DublinCore section found.")

        lang_attr = XMLNS('lang')

        # Walk up from the description until some element declares xml:lang.
        lang = None
        p = desc
        while p is not None and lang is None:
            lang = p.attrib.get(lang_attr)
            p = p.getparent()

        field_dict = {}
        # Iterate the element itself: getchildren() is deprecated in lxml.
        for e in desc:
            if e.text is not None:
                text = e.text
                if not isinstance(text, six.text_type):
                    text = text.decode('utf-8')
                val = TextPlus(text)
                val.lang = e.attrib.get(lang_attr, lang)
                if e.tag == 'meta':
                    meta_id = e.attrib.get('id')
                    if meta_id and meta_id.endswith('-id'):
                        field_dict[meta_id] = [val.replace('ISBN-', 'ISBN ')]
            else:
                val = e.text
            field_dict.setdefault(e.tag, []).append(val)

        return cls(desc.attrib, field_dict, *args, **kwargs)
Beispiel #6
0
    def __init__(self, rdf_attrs, dc_fields, fallbacks=None, strict=False):
        """Populate the instance from RDF attributes and Dublin Core values.

        :param rdf_attrs: dictionary-like object with the attributes of the
            RDF:Description; only ``rdf:about`` is read here.
        :param dc_fields: dictionary mapping DC fields (with namespace) to a
            list of text values for the given field.
        :param fallbacks: passed through to every ``field.validate`` call.
        :param strict: passed through to every ``field.validate`` call.

        Each entry of ``self.FIELDS`` is validated and the result stored as
        attribute ``prop_<field.name>``. ``self.fmap`` maps every field name,
        and its ``salias`` when truthy, to the field object.
        """
        self.about = rdf_attrs.get(RDFNS('about'))

        field_map = {}
        self.fmap = field_map

        for field in self.FIELDS:
            validated = field.validate(
                dc_fields, fallbacks=fallbacks, strict=strict)
            setattr(self, 'prop_' + field.name, validated)

            field_map[field.name] = field
            alias = field.salias
            if alias:
                field_map[alias] = field
Beispiel #7
0
    def serialize(self):
        """Return a plain-dict representation of this object.

        The result has two keys: ``'about'`` (the ``rdf:about`` URI and
        ``self.about``) and ``'fields'``, which maps field names to
        ``{'uri': ..., 'value': ...}``. Fields whose attribute is ``None`` and
        multi-valued fields of length zero are left out. Multi-valued fields
        become lists of text with ``None`` items dropped; other fields become
        a single text value.
        """
        fields = {}
        for field in self.FIELDS:
            raw = getattr(self, field.name, None)
            if raw is None:
                continue

            if field.multiple:
                if len(raw) == 0:
                    continue
                value = [
                    six.text_type(item) for item in raw if item is not None
                ]
            else:
                value = six.text_type(raw)

            fields[field.name] = {'uri': field.uri, 'value': value}

        return {
            'about': {'uri': RDFNS('about'), 'value': self.about},
            'fields': fields,
        }
Beispiel #8
0
    def __init__(self, edoc, parse_dublincore=True, image_store=None):
        """Wrap a parsed document whose root element must be ``picture``.

        :param edoc: parsed tree; its ``getroot()`` supplies the root element.
        :param parse_dublincore: when true, locate the ``rdf:RDF`` element and
            build ``self.picture_info`` from it; otherwise it is ``None``.
        :param image_store: stored unchanged as ``self.image_store``.
        :raises ValidationError: when the root tag is not ``picture``.
        :raises NoDublinCore: when Dublin Core parsing is requested but no
            ``rdf:RDF`` element exists below the root.
        """
        self.edoc = edoc
        self.image_store = image_store

        root = edoc.getroot()
        if root.tag != 'picture':
            message = (
                "Invalid root element. Found '%s', should be 'picture'" %
                root.tag)
            raise ValidationError(message)

        if parse_dublincore:
            rdf = root.find('.//' + RDFNS('RDF'))
            self.rdf_elem = rdf
            if rdf is None:
                raise NoDublinCore(
                    'Document has no DublinCore - which is required.')
            info = PictureInfo.from_element(rdf)
        else:
            info = None

        self.picture_info = info
        self.frame = None
Beispiel #9
0
    def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
        """Process one input file and proceed to its children.

        Writes the title page (for the first file) or a per-parent title
        chunk (for later files that have parts), then every chunk produced by
        ``chop`` from the main text, and finally recurses into
        ``wldoc.parts()``.

        Besides its parameters this function uses ``hyphenate``,
        ``output_type``, ``zip``, ``manifest``, ``spine`` and ``annotations``,
        none of which are defined here (presumably they come from the
        enclosing scope -- confirm against the surrounding function).

        :param wldoc: document to process; ``edoc``, ``book_info`` and
            ``parts()`` are read from it.
        :param chunk_counter: number used for the next ``OPS/part%d.html``.
        :param first: true for the top-level document, which gets the book
            title page instead of a chunk title page.
        :param sample: remaining paragraph budget, or ``None`` for no limit;
            once it is ``<= 0`` chunks are transformed as empty.
        :returns: tuple ``(toc, chunk_counter, chars, sample)`` with the TOC
            built for this file, the next free chunk number, the set of
            characters used and the remaining sample budget.
        """
        # Both title-page serializations below emit the same DOCTYPE.
        xhtml_doctype = ('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"' +
                         ' "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">')

        replace_characters(wldoc.edoc.getroot())

        hyphenator = set_hyph_language(wldoc.edoc.getroot()) if hyphenate else None
        hyphenate_and_fix_conjunctions(wldoc.edoc.getroot(), hyphenator)

        # every input file will have a TOC entry,
        # pointing to starting chunk
        toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
        chars = set()
        if first:
            # write book title page
            html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'), outputtype=output_type)
            chars = used_chars(html_tree.getroot())
            zip.writestr(
                'OPS/title.html',
                etree.tostring(
                    html_tree, pretty_print=True, xml_declaration=True,
                    encoding="utf-8",
                    doctype=xhtml_doctype
                )
            )
            # add a title page TOC entry
            toc.add(u"Strona tytułowa", "title.html")
        elif wldoc.book_info.parts:
            # write title page for every parent
            if sample is not None and sample <= 0:
                # Sample budget exhausted: emit the placeholder chunk. The
                # context manager closes the resource file, which the
                # previous bare open().read() never did.
                with open(get_resource('epub/emptyChunk.html')) as empty_chunk:
                    html_string = empty_chunk.read()
            else:
                html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
                chars = used_chars(html_tree.getroot())
                html_string = etree.tostring(
                    html_tree, pretty_print=True, xml_declaration=True,
                    encoding="utf-8",
                    doctype=xhtml_doctype
                )
            zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
            add_to_manifest(manifest, chunk_counter)
            add_to_spine(spine, chunk_counter)
            chunk_counter += 1

        if len(wldoc.edoc.getroot()) > 1:
            # rdf before style master
            main_text = wldoc.edoc.getroot()[1]
        else:
            # rdf in style master
            main_text = wldoc.edoc.getroot()[0]
            if main_text.tag == RDFNS('RDF'):
                main_text = None

        if main_text is not None:
            for chunk_xml in chop(main_text):
                empty = False
                if sample is not None:
                    if sample <= 0:
                        empty = True
                    else:
                        # NOTE(review): XPath expressions starting with '//'
                        # are evaluated from the document root, so this may
                        # count more than just this chunk -- confirm intent.
                        sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
                chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)

                toc.extend(chunk_toc)
                chars = chars.union(chunk_chars)
                zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
                add_to_manifest(manifest, chunk_counter)
                add_to_spine(spine, chunk_counter)
                chunk_counter += 1

        for child in wldoc.parts():
            # The recursion threads the chunk counter and the remaining
            # sample budget through every child document.
            child_toc, chunk_counter, chunk_chars, sample = transform_file(
                child, chunk_counter, first=False, sample=sample)
            toc.append(child_toc)
            chars = chars.union(chunk_chars)

        return toc, chunk_counter, chars, sample