def from_file(cls, xmlfile, *args, **kwargs):
    """Build an instance from the rdf:RDF section of an XML source.

    The source is parsed incrementally with ``etree.iterparse``; parsing
    stops as soon as the closing rdf:RDF tag has been seen, so the rest of
    the document is never read.

    :param xmlfile: anything ``etree.iterparse`` accepts as its source.
    :param args: extra positional arguments forwarded to ``cls.from_element``.
    :param kwargs: extra keyword arguments forwarded to ``cls.from_element``.
    :returns: whatever ``cls.from_element`` returns for the rdf:RDF element.
    :raises NoDublinCore: if no rdf:RDF start tag is encountered.
    :raises ParseError: if the parser reports XMLSyntaxError or ExpatError.
    """
    desc_tag = None
    try:
        # A single iterator is shared by both loops below, so the second
        # loop resumes exactly where the first one stopped.
        events = etree.iterparse(xmlfile, ['start', 'end'])
        for (event, element) in events:
            if element.tag == RDFNS('RDF') and event == 'start':
                desc_tag = element
                break

        if desc_tag is None:
            # Adjacent literals instead of a backslash continuation, which
            # would embed the source indentation in the message.
            raise NoDublinCore(
                "DublinCore section not found. "
                "Check if there are rdf:RDF and rdf:Description tags.")

        # continue 'till the end of RDF section, so that the element
        # captured at its 'start' event gets fully populated
        for (event, element) in events:
            if element.tag == RDFNS('RDF') and event == 'end':
                break

        # if there is no end, Expat should yell at us with an ExpatError

        # extract data from the element and make the info
        return cls.from_element(desc_tag, *args, **kwargs)
    except (XMLSyntaxError, ExpatError) as e:
        raise ParseError(e)
def from_file(cls, xmlfile, parse_dublincore=True, image_store=None):
    """Parse an XML document and build an instance of ``cls`` from it.

    :param xmlfile: either a path (``basestring``) that is opened in binary
        mode, or an object with a ``read()`` method.
    :param parse_dublincore: forwarded unchanged to the ``cls`` constructor.
    :param image_store: forwarded to the ``cls`` constructor; when it is
        None and ``xmlfile`` has a non-empty ``name`` attribute, an
        ``ImageStore`` rooted at the directory of that name is used.
    :returns: the new instance, after ``load_frame_info()`` was called on it.
    :raises ParseError: wrapping ExpatError, XMLSyntaxError or
        XSLTApplyError raised while parsing or constructing the instance.
    """
    # first, prepare for parsing
    if isinstance(xmlfile, basestring):
        # the context manager closes the file even if read() fails
        with open(xmlfile, 'rb') as source:
            data = source.read()
    else:
        data = xmlfile.read()

    if not isinstance(data, unicode):
        data = data.decode('utf-8')

    # drop every U+FEFF (byte order mark) character from the text
    data = data.replace(u'\ufeff', '')

    # assume images are in the same directory
    # (only applies to file-like objects exposing ``name``; a plain path
    # string has no such attribute)
    if image_store is None and getattr(xmlfile, 'name', None):
        image_store = ImageStore(path.dirname(xmlfile.name))

    try:
        parser = etree.XMLParser(remove_blank_text=False)
        tree = etree.parse(StringIO(data.encode('utf-8')), parser)

        me = cls(tree, parse_dublincore=parse_dublincore,
                 image_store=image_store)
        me.load_frame_info()
        return me
    except (ExpatError, XMLSyntaxError, XSLTApplyError) as e:
        # ``as`` form works on Python 2.6+ and 3, matching the rest of
        # the code base
        raise ParseError(e)
def transform(wldoc, stylesheet='legacy', options=None, flags=None):
    """Transform the WL document to XHTML.

    The input document is deep-copied first, so ``wldoc`` itself is not
    modified by the clean-up steps.

    :param wldoc: the document to transform.
    :param stylesheet: name passed to ``get_stylesheet`` to locate the XSLT.
    :param options: optional mapping of extra keyword arguments for the
        XSLT transform; ``gallery`` defaults to ``"''"`` when absent.
        The mapping is copied, never modified in place.
    :param flags: optional iterable of attribute names; each one is set to
        ``'yes'`` on the root element before transforming.
    :returns: an ``OutputFile`` holding the UTF-8 encoded HTML, or None when
        ``html_has_content`` reports that the result has no content.
    :raises ValueError: if ``stylesheet`` is not a known stylesheet name.
    :raises ParseError: wrapping XMLSyntaxError or XSLTApplyError.
    """
    # Resolve the stylesheet on its own, so that only an unknown name is
    # reported as ValueError; a KeyError raised deeper in the pipeline is
    # not mislabelled as a stylesheet problem.
    try:
        style_filename = get_stylesheet(stylesheet)
    except KeyError:
        raise ValueError("'%s' is not a valid stylesheet." % (stylesheet,))

    try:
        # Parse XSLT
        style = etree.parse(style_filename)

        document = copy.deepcopy(wldoc)
        del wldoc
        document.swap_endlines()

        if flags:
            for flag in flags:
                document.edoc.getroot().set(flag, 'yes')

        document.clean_ed_note()
        document.clean_ed_note('abstrakt')

        # work on a copy so the caller's dict is left untouched
        options = dict(options) if options else {}
        options.setdefault('gallery', "''")

        result = document.transform(style, **options)
        del document  # no longer needed large object :)

        if not html_has_content(result):
            return None

        add_anchors(result.getroot())
        add_table_of_themes(result.getroot())
        add_table_of_contents(result.getroot())

        return OutputFile.from_bytes(
            etree.tostring(result, method='html', xml_declaration=False,
                           pretty_print=True, encoding='utf-8'))
    except (XMLSyntaxError, XSLTApplyError) as e:
        raise ParseError(e)
def from_file(cls, xmlfile, *args, **kwargs):
    """Parse an XML document and build an instance of ``cls`` from it.

    :param xmlfile: either a path (``basestring``) that is opened in binary
        mode, or an object with a ``read()`` method.
    :param args: extra positional arguments forwarded to the constructor.
    :param kwargs: extra keyword arguments forwarded to the constructor.
    :returns: ``cls(tree, *args, **kwargs)`` for the parsed tree.
    :raises ParseError: wrapping ExpatError, XMLSyntaxError or
        XSLTApplyError raised while parsing or constructing the instance.
    """
    # first, prepare for parsing
    if isinstance(xmlfile, basestring):
        # the context manager closes the file even if read() fails
        with open(xmlfile, 'rb') as source:
            data = source.read()
    else:
        data = xmlfile.read()

    if not isinstance(data, unicode):
        data = data.decode('utf-8')

    # drop every U+FEFF (byte order mark) character from the text
    data = data.replace(u'\ufeff', '')

    try:
        parser = etree.XMLParser(remove_blank_text=False)
        tree = etree.parse(StringIO(data.encode('utf-8')), parser)

        return cls(tree, *args, **kwargs)
    except (ExpatError, XMLSyntaxError, XSLTApplyError) as e:
        # ``as`` form works on Python 2.6+ and 3, matching the rest of
        # the code base
        raise ParseError(e)
class WorkInfo(object):
    """Metadata of a work, read from the rdf:RDF section of a document.

    ``FIELDS`` declares the known metadata entries; how the declarations
    are interpreted is up to ``Field``, the ``DCInfo`` metaclass and
    ``from_element``, none of which are visible in this block.
    """
    __metaclass__ = DCInfo

    FIELDS = (
        Field(DCNS('creator'), 'authors', as_person, salias='author',
              multiple=True),
        Field(DCNS('title'), 'title'),
        Field(DCNS('type'), 'type', required=False, multiple=True),

        Field(DCNS('contributor.editor'), 'editors',
              as_person, salias='editor', multiple=True, default=[]),
        Field(DCNS('contributor.technical_editor'), 'technical_editors',
              as_person, salias='technical_editor', multiple=True,
              default=[]),
        Field(DCNS('contributor.funding'), 'funders', salias='funder',
              multiple=True, default=[]),
        Field(DCNS('contributor.thanks'), 'thanks', required=False),

        Field(DCNS('date'), 'created_at'),
        Field(DCNS('date.pd'), 'released_to_public_domain_at', as_date,
              required=False),
        Field(DCNS('publisher'), 'publisher', multiple=True),

        Field(DCNS('language'), 'language'),
        Field(DCNS('description'), 'description', required=False),

        Field(DCNS('source'), 'source_name', required=False),
        Field(DCNS('source.URL'), 'source_url', required=False),
        Field(DCNS('identifier.url'), 'url', WLURI, strict=as_wluri_strict),
        Field(DCNS('rights.license'), 'license', required=False),
        Field(DCNS('rights'), 'license_description'),

        Field(PLMETNS('digitisationSponsor'), 'sponsors', multiple=True,
              default=[]),
        Field(WLNS('digitisationSponsorNote'), 'sponsor_note',
              required=False),
        Field(WLNS('developmentStage'), 'stage', required=False),
    )

    @classmethod
    def from_string(cls, xml, *args, **kwargs):
        """Build an instance from XML given as a string.

        The string is wrapped in a ``StringIO`` and handed to
        ``from_file`` together with any extra arguments.
        """
        from StringIO import StringIO
        return cls.from_file(StringIO(xml), *args, **kwargs)

    @classmethod
    def from_file(cls, xmlfile, *args, **kwargs):
        """Build an instance from the rdf:RDF section of an XML source.

        The source is parsed incrementally with ``etree.iterparse``;
        parsing stops once the closing rdf:RDF tag has been seen.

        :param xmlfile: anything ``etree.iterparse`` accepts as a source.
        :param args: positional arguments forwarded to ``from_element``.
        :param kwargs: keyword arguments forwarded to ``from_element``.
        :returns: whatever ``cls.from_element`` returns.
        :raises NoDublinCore: if no rdf:RDF start tag is encountered.
        :raises ParseError: on XMLSyntaxError or ExpatError.
        """
        desc_tag = None
        try:
            # One iterator is shared by both loops, so the second loop
            # resumes where the first one stopped.
            events = etree.iterparse(xmlfile, ['start', 'end'])
            for (event, element) in events:
                if element.tag == RDFNS('RDF') and event == 'start':
                    desc_tag = element
                    break

            if desc_tag is None:
                # Adjacent literals instead of a backslash continuation,
                # which would embed the source indentation in the message.
                raise NoDublinCore(
                    "DublinCore section not found. "
                    "Check if there are rdf:RDF and rdf:Description tags.")

            # continue 'till the end of RDF section
            for (event, element) in events:
                if element.tag == RDFNS('RDF') and event == 'end':
                    break

            # if there is no end, Expat should yell at us with an ExpatError

            # extract data from the element and make the info
            return cls.from_element(desc_tag, *args, **kwargs)
        except (XMLSyntaxError, ExpatError) as e:
            # ``as`` form is valid on Python 2.6+ and 3
            raise ParseError(e)
def transform(wldoc, verbose=False, save_tex=None, morefloats=None,
              cover=None, flags=None, customizations=None,
              ilustr_path='', latex_dir=False):
    """ produces a PDF file with XeLaTeX

    wldoc: a WLDocument
    verbose: prints all output from LaTeX
    save_tex: path to save the intermediary LaTeX file to
    morefloats (old/new/none): force specific morefloats
    cover: a cover.Cover factory or True for default
    flags: less-advertising,
    customizations: user requested customizations regarding various
        formatting parameters (passed to wl LaTeX class)
    ilustr_path: directory joined with the ``src`` attribute of every
        ``ilustr`` element to locate the files copied next to the .tex file
    latex_dir: when true, stop after the LaTeX sources are prepared and
        return the path of the temporary directory instead of a PDF

    Returns an OutputFile for the generated PDF (or the temporary directory
    path when latex_dir is set). Raises ParseError when XSLT processing
    fails or when xelatex exits with a non-zero status.
    """

    # Parse XSLT
    try:
        book_info = wldoc.book_info
        document = load_including_children(wldoc)
        root = document.edoc.getroot()

        if cover:
            if cover is True:
                cover = make_cover
            bound_cover = cover(book_info, width=1200)
            root.set('data-cover-width', str(bound_cover.width))
            root.set('data-cover-height', str(bound_cover.height))
            if bound_cover.uses_dc_cover:
                if book_info.cover_by:
                    root.set('data-cover-by', book_info.cover_by)
                if book_info.cover_source:
                    root.set('data-cover-source', book_info.cover_source)
        if flags:
            for flag in flags:
                root.set('flag-' + flag, 'yes')

        # check for LaTeX packages
        if morefloats:
            root.set('morefloats', morefloats.lower())
        elif package_available('morefloats', 'maxfloats=19'):
            root.set('morefloats', 'new')

        # add customizations
        if customizations is not None:
            root.set('customizations', u','.join(customizations))

        # add editors info
        editors = document.editors()
        if editors:
            root.set(
                'editors',
                u', '.join(sorted(editor.readable() for editor in editors)))
        if document.book_info.funders:
            root.set('funders', u', '.join(document.book_info.funders))
        if document.book_info.thanks:
            root.set('thanks', document.book_info.thanks)

        # hack the tree
        move_motifs_inside(document.edoc)
        hack_motifs(document.edoc)
        parse_creator(document.edoc)
        substitute_hyphens(document.edoc)
        fix_hanging(document.edoc)
        fix_tables(document.edoc)
        mark_subauthors(document.edoc)

        # wl -> TeXML
        style_filename = get_stylesheet("wl2tex")
        style = etree.parse(style_filename)
        functions.reg_mathml_latex()

        # TeXML -> LaTeX
        temp = mkdtemp('-wl2pdf')

        for ilustr in document.edoc.findall("//ilustr"):
            shutil.copy(os.path.join(ilustr_path, ilustr.get("src")), temp)

        for sponsor in book_info.sponsors:
            ins = etree.Element("data-sponsor", name=sponsor)
            logo = sponsor_logo(sponsor)
            if logo:
                fname = 'sponsor-%s' % os.path.basename(logo)
                shutil.copy(logo, os.path.join(temp, fname))
                ins.set('src', fname)
            # the element is inserted even when no logo file was found
            root.insert(0, ins)

        if book_info.sponsor_note:
            root.set("sponsor-note", book_info.sponsor_note)

        texml = document.transform(style)

        if cover:
            # image data is binary: text mode would corrupt it or fail
            with open(os.path.join(temp, 'cover.png'), 'wb') as f:
                bound_cover.save(f, quality=80)

        del document  # no longer needed large object :)

        tex_path = os.path.join(temp, 'doc.tex')
        # the context manager closes the file even if process() raises
        with open(tex_path, 'wb') as fout:
            process(six.BytesIO(texml), fout, 'utf-8')
        del texml

        if save_tex:
            shutil.copy(tex_path, save_tex)

        # LaTeX -> PDF
        shutil.copy(get_resource('pdf/wl.cls'), temp)
        shutil.copy(get_resource('res/wl-logo.png'), temp)

        if latex_dir:
            return temp

        try:
            cwd = os.getcwd()
        except OSError:
            # the current directory may no longer exist; nothing to restore
            cwd = None
        os.chdir(temp)

        try:
            # some things work better when compiled twice
            # (table of contents, [line numbers - disabled])
            for _ in range(2):
                if verbose:
                    p = call(['xelatex', tex_path])
                else:
                    # Discard the output instead of piping it: nobody reads
                    # the pipes, so a full pipe buffer would block xelatex
                    # forever.
                    with open(os.devnull, 'wb') as devnull:
                        p = call(
                            ['xelatex', '-interaction=batchmode', tex_path],
                            stdout=devnull, stderr=devnull)
                if p:
                    raise ParseError("Error parsing .tex file")
        finally:
            # restore the working directory on failure as well
            if cwd is not None:
                os.chdir(cwd)

        output_file = NamedTemporaryFile(
            prefix='librarian', suffix='.pdf', delete=False)
        # only the reserved name is needed; release the handle before the
        # PDF is moved over it
        output_file.close()
        pdf_path = os.path.join(temp, 'doc.pdf')
        shutil.move(pdf_path, output_file.name)
        shutil.rmtree(temp)
        return OutputFile.from_filename(output_file.name)

    except (XMLSyntaxError, XSLTApplyError) as e:
        raise ParseError(e)