def to_etree(self, parent=None):
    """Build the rdf:RDF element tree describing this object.

    When `parent` is given, the RDF root is created with
    ``parent.makeelement`` (it is not appended to `parent` here);
    otherwise a standalone element is created.

    The root gets a single rdf:Description child holding one element per
    value of every field in ``self.FIELDS`` whose attribute is not None.

    Returns the rdf:RDF root element.
    """
    rdf_tag = RDFNS('RDF')
    if parent is not None:
        root = parent.makeelement(rdf_tag)
    else:
        root = etree.Element(rdf_tag)

    description = etree.SubElement(root, RDFNS('Description'))
    if self.about:
        description.set(RDFNS('about'), self.about)

    for field in self.FIELDS:
        value = getattr(self, field.name, None)
        if value is None:
            continue

        if not field.multiple:
            # Single-valued field: exactly one element with the value as text.
            node = etree.Element(field.uri)
            node.text = six.text_type(value)
            description.append(node)
            continue

        if len(value) == 0:
            continue
        for item in value:
            # A None item still yields an element, just without text.
            node = etree.Element(field.uri)
            if item is not None:
                node.text = six.text_type(item)
            description.append(node)

    return root
def from_file(cls, xmlfile, *args, **kwargs):
    """Incrementally parse `xmlfile` and build an instance from its rdf:RDF section.

    The file is read only up to the end of the first rdf:RDF element.
    Extra positional and keyword arguments are forwarded unchanged to
    ``cls.from_element``.

    Raises:
        NoDublinCore: if no rdf:RDF element starts anywhere in the file.
        ParseError: wrapping an XMLSyntaxError or ExpatError raised while
            parsing.
    """
    desc_tag = None
    rdf_tag = RDFNS('RDF')
    try:
        # Named `events` (not `iter`) so the builtin is not shadowed.
        events = etree.iterparse(xmlfile, ['start', 'end'])

        # Find the opening of the RDF section.
        for event, element in events:
            if element.tag == rdf_tag and event == 'start':
                desc_tag = element
                break

        if desc_tag is None:
            # Adjacent literals instead of a backslash continuation inside
            # the string, which embedded stray whitespace in the message.
            raise NoDublinCore(
                "DublinCore section not found. "
                "Check if there are rdf:RDF and rdf:Description tags.")

        # Keep consuming events until the RDF section is closed, so the
        # element is fully populated.  If the end never comes, the parser
        # raises and the error is reported as ParseError below.
        for event, element in events:
            if element.tag == rdf_tag and event == 'end':
                break

        # Extract data from the element and build the info object.
        return cls.from_element(desc_tag, *args, **kwargs)
    except (XMLSyntaxError, ExpatError) as e:
        raise ParseError(e)
def mark_subauthors(doc):
    """Mark sub-works whose author differs from the surrounding context.

    For every ``/utwor/utwor`` element, the parsed creators are joined
    into one string.  If that string differs both from the author of the
    whole document and from the author of the previous sub-work, an empty
    ``use_subauthor`` element is appended to the sub-work's rdf:RDF
    element.
    """
    creator_tag = DCNS('creator_parsed')

    def joined_authors(node, path):
        # Comma-separated text of every creator element found under `path`.
        return ', '.join(elem.text for elem in node.findall(path))

    root_author = joined_authors(doc, './' + RDFNS('RDF') + '//' + creator_tag)
    previous_author = None

    # If the author differs from the author of the whole work and from the
    # previous author, insert some marker into the rdf?
    for subutwor in doc.xpath('/utwor/utwor'):
        author = joined_authors(subutwor, './/' + creator_tag)
        if author != previous_author and author != root_author:
            rdf_elem = subutwor.find('.//' + RDFNS('RDF'))
            rdf_elem.append(etree.Element('use_subauthor'))
        previous_author = author
def __init__(self, edoc, parse_dublincore=True, provider=None,
             strict=False, meta_fallbacks=None):
    """Wrap a parsed document tree whose root element must be ``utwor``.

    Stores `edoc` and `provider` on the instance.  When
    `parse_dublincore` is true, the first rdf:RDF descendant of the root
    is stored as ``self.rdf_elem`` and parsed into ``self.book_info``
    (`meta_fallbacks` and `strict` are forwarded to
    ``dcparser.BookInfo.from_element``); otherwise ``self.book_info`` is
    None.

    Raises:
        ValidationError: if the root element is not ``utwor``.
        NoDublinCore: if parsing was requested and no rdf:RDF element
            exists.
    """
    self.edoc = edoc
    self.provider = provider

    root_elem = edoc.getroot()
    if root_elem.tag != 'utwor':
        raise ValidationError(
            "Invalid root element. Found '%s', should be 'utwor'"
            % root_elem.tag)

    if not parse_dublincore:
        self.book_info = None
        return

    self.rdf_elem = root_elem.find('.//' + RDFNS('RDF'))
    if self.rdf_elem is None:
        raise NoDublinCore(
            'Document has no DublinCore - which is required.')

    self.book_info = dcparser.BookInfo.from_element(
        self.rdf_elem, fallbacks=meta_fallbacks, strict=strict)
def from_element(cls, rdf_tag, *args, **kwargs):
    """Build an instance from an already parsed rdf:RDF element.

    Collects the children of the first rdf:Description descendant of
    `rdf_tag` into a dict mapping each child's tag to the list of its
    values, then calls ``cls(desc.attrib, field_dict, *args, **kwargs)``.

    Each non-empty text is wrapped in ``TextPlus`` and gets a ``lang``
    attribute: the child's own ``XMLNS('lang')`` attribute, falling back
    to the nearest one found on the description or its ancestors.
    Children without text contribute None.

    Raises:
        NoDublinCore: if `rdf_tag` has no rdf:Description descendant.
    """
    # The tree is already parsed, so there are no Expat errors to handle.
    field_dict = {}
    desc = rdf_tag.find(".//" + RDFNS('Description'))
    if desc is None:
        raise NoDublinCore("No DublinCore section found.")

    # Default language: first XMLNS('lang') attribute found walking up
    # from the description element.
    lang = None
    p = desc
    while p is not None and lang is None:
        lang = p.attrib.get(XMLNS('lang'))
        p = p.getparent()

    # Iterate the element directly; getchildren() is deprecated and
    # yields the same children.
    for e in desc:
        fv = field_dict.get(e.tag, [])
        if e.text is not None:
            text = e.text
            if not isinstance(text, six.text_type):
                text = text.decode('utf-8')
            val = TextPlus(text)
            val.lang = e.attrib.get(XMLNS('lang'), lang)

            if e.tag == 'meta':
                # <meta id="...-id"> values are additionally exposed under
                # their id, with the "ISBN-" prefix normalized to "ISBN ".
                meta_id = e.attrib.get('id')
                if meta_id and meta_id.endswith('-id'):
                    field_dict[meta_id] = [val.replace('ISBN-', 'ISBN ')]
        else:
            val = e.text
        fv.append(val)
        field_dict[e.tag] = fv

    return cls(desc.attrib, field_dict, *args, **kwargs)
def __init__(self, rdf_attrs, dc_fields, fallbacks=None, strict=False):
    """Populate the object from RDF attributes and Dublin Core fields.

    rdf_attrs -- dictionary-like object with the attributes of the
        rdf:Description element.
    dc_fields -- dictionary mapping DC field names (with namespace) to
        lists of text values for the given field.
    fallbacks, strict -- passed through to each field's ``validate``.

    Every field in ``self.FIELDS`` is validated and stored under
    ``prop_<field name>``; ``self.fmap`` maps each field's name (and its
    ``salias``, when set) to the field object.
    """
    self.about = rdf_attrs.get(RDFNS('about'))
    self.fmap = {}

    for field in self.FIELDS:
        validated = field.validate(
            dc_fields, fallbacks=fallbacks, strict=strict)
        setattr(self, 'prop_' + field.name, validated)

        # Register the field under its name first, then under its alias.
        if field.salias:
            lookup_keys = (field.name, field.salias)
        else:
            lookup_keys = (field.name,)
        for key in lookup_keys:
            self.fmap[key] = field
def serialize(self):
    """Return a plain-dict representation of this object.

    The result has two keys: ``'about'`` (uri and value of the rdf:about
    attribute) and ``'fields'``, mapping each field name to its uri and
    text value.  Fields whose attribute is None, and multi-valued fields
    with no items, are left out; None items inside a multi-valued field
    are dropped.
    """
    fields = {}
    for field in self.FIELDS:
        raw = getattr(self, field.name, None)
        if raw is None:
            continue

        if field.multiple:
            if len(raw) == 0:
                continue
            value = [six.text_type(item) for item in raw if item is not None]
        else:
            value = six.text_type(raw)

        fields[field.name] = {'uri': field.uri, 'value': value}

    return {
        'about': {'uri': RDFNS('about'), 'value': self.about},
        'fields': fields,
    }
def __init__(self, edoc, parse_dublincore=True, image_store=None):
    """Wrap a parsed document tree whose root element must be ``picture``.

    Stores `edoc` and `image_store` on the instance.  When
    `parse_dublincore` is true, the first rdf:RDF descendant of the root
    is stored as ``self.rdf_elem`` and parsed into ``self.picture_info``;
    otherwise ``self.picture_info`` is None.  ``self.frame`` starts as
    None.

    Raises:
        ValidationError: if the root element is not ``picture``.
        NoDublinCore: if parsing was requested and no rdf:RDF element
            exists.
    """
    self.edoc = edoc
    self.image_store = image_store

    root_elem = edoc.getroot()
    if root_elem.tag != 'picture':
        raise ValidationError(
            "Invalid root element. Found '%s', should be 'picture'"
            % root_elem.tag)

    if not parse_dublincore:
        self.picture_info = None
    else:
        self.rdf_elem = root_elem.find('.//' + RDFNS('RDF'))
        if self.rdf_elem is None:
            raise NoDublinCore(
                'Document has no DublinCore - which is required.')
        self.picture_info = PictureInfo.from_element(self.rdf_elem)

    self.frame = None
def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
    """Process one input document, then recurse into its child parts.

    Writes a title page (for the first document) or a per-parent title
    chunk (for documents that have parts), then one ``OPS/part<N>.html``
    entry per chunk of the main text, registering every chunk in the
    manifest and spine.

    NOTE(review): `hyphenate`, `output_type`, `zip`, `manifest`, `spine`
    and `annotations` are not defined in this function; presumably they
    come from an enclosing scope - confirm against the caller.

    Args:
        wldoc: document to process; its ``edoc`` tree is modified in place
            by the character/hyphenation helpers.
        chunk_counter: number to use for the next ``part<N>.html`` chunk.
        first: True for the top-level document, which gets the book title
            page instead of a chunk title page.
        sample: None for no limit; otherwise a remaining budget that is
            decreased by the number of strofa/akap/akap_cd/akap_dialog
            elements matched for each chunk.  Once it is <= 0, remaining
            chunks are passed to ``transform_chunk`` with ``empty=True``.

    Returns:
        Tuple ``(toc, chunk_counter, chars, sample)``: the TOC for this
        document, the next free chunk number, the set of characters
        collected from the output, and the remaining sample budget.
    """
    # Shared by both title-page serializations below.
    xhtml_doctype = (
        '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"'
        ' "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
    )

    replace_characters(wldoc.edoc.getroot())

    hyphenator = set_hyph_language(wldoc.edoc.getroot()) if hyphenate else None
    hyphenate_and_fix_conjunctions(wldoc.edoc.getroot(), hyphenator)

    # Every input file gets a TOC entry pointing to its starting chunk.
    toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
    chars = set()

    if first:
        # Write the book title page.
        html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'),
                         outputtype=output_type)
        chars = used_chars(html_tree.getroot())
        zip.writestr(
            'OPS/title.html',
            etree.tostring(
                html_tree, pretty_print=True, xml_declaration=True,
                encoding="utf-8", doctype=xhtml_doctype
            )
        )
        # Add a TOC entry for the title page.
        toc.add(u"Strona tytułowa", "title.html")
    elif wldoc.book_info.parts:
        # Write a title page for every parent document.
        if sample is not None and sample <= 0:
            # Sample budget exhausted: emit the empty placeholder chunk.
            # The context manager closes the resource file deterministically.
            with open(get_resource('epub/emptyChunk.html')) as empty_chunk:
                html_string = empty_chunk.read()
        else:
            html_tree = xslt(wldoc.edoc,
                             get_resource('epub/xsltChunkTitle.xsl'))
            chars = used_chars(html_tree.getroot())
            html_string = etree.tostring(
                html_tree, pretty_print=True, xml_declaration=True,
                encoding="utf-8", doctype=xhtml_doctype
            )
        zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
        add_to_manifest(manifest, chunk_counter)
        add_to_spine(spine, chunk_counter)
        chunk_counter += 1

    if len(wldoc.edoc.getroot()) > 1:
        # rdf before style master
        main_text = wldoc.edoc.getroot()[1]
    else:
        # rdf in style master
        main_text = wldoc.edoc.getroot()[0]
        if main_text.tag == RDFNS('RDF'):
            # The only child is the metadata: there is no text to chop.
            main_text = None

    if main_text is not None:
        for chunk_xml in chop(main_text):
            empty = False
            if sample is not None:
                if sample <= 0:
                    empty = True
                else:
                    sample -= len(chunk_xml.xpath(
                        '//strofa|//akap|//akap_cd|//akap_dialog'))
            chunk_html, chunk_toc, chunk_chars = transform_chunk(
                chunk_xml, chunk_counter, annotations, empty)

            toc.extend(chunk_toc)
            chars = chars.union(chunk_chars)
            zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
            add_to_manifest(manifest, chunk_counter)
            add_to_spine(spine, chunk_counter)
            chunk_counter += 1

    # Children continue the chunk numbering and share the sample budget.
    for child in wldoc.parts():
        child_toc, chunk_counter, chunk_chars, sample = transform_file(
            child, chunk_counter, first=False, sample=sample)
        toc.append(child_toc)
        chars = chars.union(chunk_chars)

    return toc, chunk_counter, chars, sample