def copy_aux_files(self, job, dest_dir):
    """ Copy image files to dest_dir. Use image data cached in parsers. """

    for p in job.spider.parsers:
        if hasattr(p, 'resize_image'):
            src_uri = p.attribs.url
            fn_dest = gg.make_url_relative(webify_url(job.base_url), src_uri)
            fn_dest = os.path.join(dest_dir, fn_dest)

            # debug('base_url = %s, src_uri = %s' % (job.base_url, src_uri))

            if gg.is_same_path(src_uri, fn_dest):
                debug('Not copying %s to %s: same file' % (src_uri, fn_dest))
                continue
            debug('Copying %s to %s' % (src_uri, fn_dest))

            fn_dest = gg.normalize_path(fn_dest)
            gg.mkdir_for_filename(fn_dest)
            try:
                with open(fn_dest, 'wb') as fp_dest:
                    fp_dest.write(p.serialize())
            except IOError as what:
                error('Cannot copy %s to %s: %s' % (src_uri, fn_dest, what))
def format_date(date):
    """ Format a date. """

    if date is None:
        return ''
    try:
        # datetime
        return date.replace(tzinfo=gg.UTC(), microsecond=0).isoformat()
    except TypeError:
        # date
        return datetime.datetime.combine(
            date, datetime.time(tzinfo=gg.UTC())).isoformat()
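# Illustration (not from the source): format_date() accepts either a datetime
# or a plain date. A date object has no tzinfo/microsecond arguments to
# replace(), so that call raises TypeError and the except branch builds a
# midnight datetime instead. Assuming gg.UTC() is a zero-offset tzinfo:
#
#   format_date(datetime.datetime(2009, 8, 26, 21, 11, 14, 123456))
#       -> '2009-08-26T21:11:14+00:00'
#   format_date(datetime.date(2009, 8, 26))
#       -> '2009-08-26T00:00:00+00:00'
#   format_date(None)
#       -> ''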
def package(self, job):
    self.setup(job)
    zipfilename = job.outputfile  # filename is zipfile

    m = re.match(r'\d+', zipfilename)
    if m:
        ebook_no = m.group(0)
    else:
        error('Invalid filename %s for push packager.' % zipfilename)
        return

    zip_ = self.create(zipfilename)

    for suffix in '.txt -8.txt -0.txt .zip -8.zip -0.zip -rst.zip -h.zip'.split():
        filename = '%s%s' % (ebook_no, suffix)
        memberfilename = '%s/%s' % (ebook_no, filename)
        self.add(zip_, filename, memberfilename)

    for suffix, ext in (('-h', 'html'), ('-rst', 'rst')):
        filename = '%s%s.%s' % (ebook_no, suffix, ext)
        memberfilename = '%s/%s%s/%s' % (ebook_no, ebook_no, suffix, filename)
        self.add(zip_, filename, memberfilename)

    # image files
    for url in options.html_images_list:
        rel_url = gg.make_url_relative(job.base_url, url)
        filename = os.path.join(self.path, rel_url)
        memberfilename = '%s/%s%s/%s' % (ebook_no, ebook_no, suffix, rel_url)
        self.add(zip_, filename, memberfilename)

    zip_.close()
    info('Done Zip file: %s' % zipfilename)
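# Rough sketch of the member layout the push packager builds (illustrative
# only, for an assumed ebook number 12345; which members exist depends on
# the files the job produced):
#
#   12345/12345.txt
#   12345/12345-8.txt
#   12345/12345-0.txt
#   12345/12345.zip ... 12345/12345-h.zip
#   12345/12345-h/12345-h.html
#   12345/12345-rst/12345-rst.rst
#   12345/12345-rst/<image paths relative to job.base_url>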
def add_meta(xhtml, name, content):
    """ Add a meta tag. """

    for head in gg.xpath(xhtml, '//xhtml:head'):
        meta = em.meta(name=name, content=content)
        meta.tail = '\n'
        head.append(meta)
def format_title_filing(row):
    """ Format a book title for display in results. """

    title = gg.cut_at_newline(row.get('filing') or 'No Title')
    for lang_id in row.get('fk_langs') or []:
        if lang_id != 'en':
            title += " (%s)" % cherrypy.response.i18n.locale.languages.get(
                lang_id, lang_id)
    return title
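# Illustration (hypothetical row, not from the source): titles of non-English
# books get the language appended, using the locale's display name when the
# i18n catalog knows the code:
#
#   format_title_filing({'filing': 'Faust', 'fk_langs': ['de']})
#       -> 'Faust (German)'        # with an English response locale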
def make_output_filename(type_, dc):
    """ Make a suitable filename for output type. """

    if dc.project_gutenberg_id:
        # PG book: use PG naming convention
        return FILENAMES[type_].format(id=dc.project_gutenberg_id)
    # not a PG ebook
    return FILENAMES[type_].format(id=gg.string_to_filename(dc.title)[:65])
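# Illustration (the FILENAMES entry shown is an assumed example, not from the
# source): the output name is a per-type template filled either with the PG
# number or with a filename-safe, 65-character slice of the title.
#
#   FILENAMES = {'epub.images': 'pg{id}-images.epub'}   # assumed shape
#   make_output_filename('epub.images', dc)             # project_gutenberg_id == 74
#       -> 'pg74-images.epub'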
def get_header_text(header):
    """ clean header text """

    text = gg.normalize(
        etree.tostring(header, method="text", encoding=six.text_type,
                       with_tail=False))
    return header.get('title', text).strip()
def add_internal_css(xhtml, css_as_string):
    """ Add internal stylesheet to html. """

    if css_as_string and xhtml is not None:
        css_as_string = '\n' + css_as_string.strip(' \n') + '\n'
        for head in gg.xpath(xhtml, '//xhtml:head'):
            style = em.style(css_as_string, type='text/css')
            style.tail = '\n'
            head.append(style)
def add_dublincore(self, job, tree):
    """ Add Dublin Core metadata to <head>. """

    source = gg.archive2files(options.ebook, job.url)

    if hasattr(options.config, 'FILESDIR'):
        job.dc.source = source.replace(options.config.FILESDIR,
                                       options.config.PGURL)

    for head in xpath(tree, '//xhtml:head'):
        for e in job.dc.to_html():
            e.tail = '\n'
            head.append(e)
def serve(books, size, session):
    """ Output a gallery of coverpages. """

    cherrypy.response.headers['Content-Type'] = 'text/html; charset=utf-8'
    cherrypy.response.headers['Content-Language'] = 'en'
    s = ''

    for book_id in books:
        dc = DublinCoreMapping.DublinCoreObject(session=session, pooled=True)
        dc.load_from_database(book_id)
        cover = session.execute(
            select(Models.File.archive_path).where(
                Models.File.fk_books == book_id,
                Models.File.fk_filetypes == size)).scalars().first()
        if not cover:
            continue
        url = '/' + cover
        href = '/ebooks/%d' % book_id

        if dc.title:
            title = gg.xmlspecialchars(dc.title)  # handles <, >, &
            # escape quotes so the title is safe inside attribute values
            title = title.replace('"', '&quot;')
            title = title.replace("'", '&apos;')
        else:
            title = '!! missing title !!'

        short_title = dc.make_pretty_title()

        def author_name(author):
            return DublinCore.DublinCore.make_pretty_name(author.name)

        author_name_list = map(author_name, dc.authors)
        authors = ', '.join(author_name_list)

        s += f"""
            <a href="{href}" title="{title}" authors="{authors}" target="_top">
                <div class="cover_image">
                    <div class="cover_img">
                        <img src="{url}" alt="{title}, {authors}" title="{title}"
                             authors="{authors}" draggable="false">
                    </div>
                    <div class="cover_title">
                        <h5>{short_title}</h5>
                    </div>
                </div>
            </a>
        """

    return s.encode('utf-8')
def __init__(self):
    self.nsmap = gg.build_nsmap('opf dc dcterms xsi')

    # FIXME: remove this when lxml is fixed
    # workaround for lxml fat-fingering the default attribute namespaces
    self.nsmap[None] = str(NS.opf) + 'lxml-bug-workaround'

    self.opf = ElementMaker(namespace=self.nsmap[None], nsmap=self.nsmap)

    self.metadata = self.opf.metadata()
    self.manifest = self.opf.manifest()
    self.spine = self.opf.spine()
    self.guide = self.opf.guide()
    self.item_id = 0
def add_external_css(self, spider, xhtml, css_as_string, url):
    """ Add external stylesheet to html. """

    if css_as_string:
        attribs = parsers.ParserAttributes()
        attribs.orig_mediatype = attribs.HeaderElement('text/css')
        attribs.url = attribs.orig_url = url
        p = ParserFactory.ParserFactory.get(attribs)
        p.parse_string(css_as_string)
        spider.parsers.append(p)

    if xhtml is not None:
        for head in gg.xpath(xhtml, '//xhtml:head'):
            link = em.link(href=url, rel='stylesheet', type='text/css')
            link.tail = '\n'
            head.append(link)
def topological_sort(self):
    """ Do a topological sort of documents using <link rel='next'> """

    relnext = [(p.attribs.referrer, p.attribs.url) for p in self.parsers
               if 'next' in p.attribs.rel]
    if relnext:
        try:
            d = {}
            for order, url in enumerate(gg.topological_sort(relnext)):
                d[url] = order
                debug("%s order %d" % (url, order))
            for parser in self.parsers:
                parser.order = d.get(parser.attribs.url, 999999)
            self.parsers.sort(key=lambda p: p.order)
        except Exception:
            pass
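# Illustration (hypothetical URLs, not from the source): every parser reached
# through <link rel="next"> contributes a (referrer, url) edge, and
# gg.topological_sort() is assumed to return the node URLs in a compatible
# linear order; parsers it never saw sort to the end with order 999999.
#
#   edges = [('ch1.html', 'ch2.html'), ('ch2.html', 'ch3.html')]
#   gg.topological_sort(edges)  ->  ['ch1.html', 'ch2.html', 'ch3.html']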
def serve(rows, size):
    """ Output a gallery of coverpages. """

    cherrypy.response.headers['Content-Type'] = 'text/html; charset=utf-8'
    cherrypy.response.headers['Content-Language'] = 'en'

    s = """<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">

<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en"
      xml:base="http://www.gutenberg.org">
<head>
<title>Cover Flow</title>
<style>
.cover-thumb {
    display: inline-block;
    background-position: center;
    background-repeat: no-repeat;
}
.cover-thumb-small  { width:  76px; height: 110px; }
.cover-thumb-medium { width: 210px; height: 310px; }
</style>
</head>
<body><div>"""

    for row in rows:
        url = '/' + row.filename
        href = '/ebooks/%d' % row.pk
        title = gg.xmlspecialchars(row.title)
        # escape double quotes so the title is safe inside attribute values
        title = title.replace('"', '&quot;')

        s += """<a href="{href}" title="{title}"
                   class="cover-thumb cover-thumb-{size}" target="_top"
                   style="background-image: url({url})"> </a>\n""".format(
                       url=url, href=href, title=title, size=size)

    return (s + '</div></body></html>\n').encode('utf-8')
def serve(rows, size):
    """ Output a gallery of coverpages. """

    cherrypy.response.headers['Content-Type'] = 'text/html; charset=utf-8'
    cherrypy.response.headers['Content-Language'] = 'en'
    s = ''

    for row in rows:
        url = '/' + row.filename
        href = '/ebooks/%d' % row.pk

        if row.title:
            title = gg.xmlspecialchars(row.title)  # handles <, >, &
            # escape quotes so the title is safe inside attribute values
            title = title.replace('"', '&quot;')
            title = title.replace("'", '&apos;')
        else:
            title = '!! missing title !!'

        # shorten long titles for the latest covers
        short_title = title
        title_len = len(title)
        short_title = re.sub(r"\-+", " ", short_title)
        short_title = short_title.splitlines()[0]
        if title_len > 80:
            short_title = textwrap.wrap(short_title, 80)[0]

        s += """
            <a href="{href}" title="{title}" target="_top">
                <div class="cover_image">
                    <div class="cover_img">
                        <img src="{url}" alt="{title}" title="{title}" draggable="false">
                    </div>
                    <div class="cover_title">
                        <h5>{short_title}</h5>
                    </div>
                </div>
            </a>
        """.format(url=url, href=href, title=title,
                   short_title=short_title, size=size)

    return s.encode('utf-8')
def package(self, job):
    self.setup(job)
    try:
        aux_file_list = list(job.spider.aux_file_iter())
    except AttributeError:
        aux_file_list = []

    filename = job.outputfile
    zipfilename = os.path.join(self.path, self.name) + '.zip'
    memberfilename = os.path.join(self.name, self.name) + self.ext

    zip_ = self.create(zipfilename)
    self.add(zip_, filename, memberfilename)

    # now images
    for url in aux_file_list:
        rel_url = gg.make_url_relative(job.base_url, url)
        filename = os.path.join(self.path, rel_url)
        memberfilename = os.path.join(self.name, rel_url)
        self.add(zip_, filename, memberfilename)

    zip_.close()
    info('Done Zip file: %s' % zipfilename)
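# Rough sketch of the resulting archive (illustrative only; self.name and
# self.ext are assumed values, e.g. 'pg12345-h' and '.html'):
#
#   pg12345-h.zip
#     pg12345-h/pg12345-h.html
#     pg12345-h/images/cover.jpg    <- one member per aux file, stored under
#     ...                              its URL relative to job.base_url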
def format_suggestion(self, row):
    """ Format a suggestion for display in results. """

    query = ' '.join(self.query.split()[0:-1])
    if query:
        query += ' '
    return query + gg.cut_at_newline(row.get('title') or '')
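# Illustration (hypothetical query and row, not from the source): the last,
# partially typed word of the current query is replaced by the suggested
# title. With self.query == 'alice in wond':
#
#   format_suggestion({'title': 'Wonderland'})  ->  'alice in Wonderland'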
def add_body_class(xhtml, classname):
    """ Add a class to the body element. """

    if classname and xhtml is not None:
        for body in gg.xpath(xhtml, '//xhtml:body'):
            HTMLishWriter.add_class(body, classname)
def index(self, **dummy_kwargs):
    """ A bibrec page. """

    os = BaseSearcher.OpenSearch()
    os.log_request('bibrec')

    dc = BaseSearcher.DC(cherrypy.engine.pool)

    # the bulk of the work is done here
    dc.load_from_database(os.id)
    if not dc.files:
        # NOTE: Error message
        cherrypy.tools.rate_limiter.e404()
        raise cherrypy.HTTPError(404, _('No ebook by that number.'))

    # add these fields so we won't have to test for their existence later
    dc.extra_info = None
    dc.url = None

    dc.translate()
    dc.header = gg.cut_at_newline(dc.title)
    os.title = dc.make_pretty_title()
    dc.extra_info = ''
    dc.class_ = BaseSearcher.ClassAttr()
    dc.order = 10
    dc.icon = 'book'
    if 'Sound' in dc.categories:
        dc.icon = 'audiobook'
    os.title_icon = dc.icon
    os.twit = os.title
    os.qrcode_url = '//%s/cache/epub/%d/pg%d.qrcode.png' % (os.file_host, os.id, os.id)
    os.entries.append(dc)

    s = cherrypy.session
    last_visited = s.get('last_visited', [])
    last_visited.append(os.id)
    s['last_visited'] = last_visited

    # can we find some meaningful breadcrumbs ?
    for a in dc.authors:
        if a.marcrel in ('aut', 'cre'):
            book_cnt = BaseSearcher.sql_get(
                "select count (*) from mn_books_authors where fk_authors = %(aid)s",
                aid=a.id)
            if book_cnt > 1:
                os.breadcrumbs.append((
                    __('One by {author}', '{count} by {author}',
                       book_cnt).format(count=book_cnt,
                                        author=dc.make_pretty_name(a.name)),
                    _('Find more ebooks by the same author.'),
                    os.url('author', id=a.id)))

    if os.format in ('html', 'mobile'):
        cat = BaseSearcher.Cat()
        cat.header = _('Similar Books')
        cat.title = _('Readers also downloaded…')
        cat.rel = 'related'
        cat.url = os.url('also', id=os.id)
        cat.class_ += 'navlink grayed noprint'
        cat.icon = 'suggestion'
        cat.order = 30
        os.entries.append(cat)

        for bookshelf in dc.bookshelves:
            cat = BaseSearcher.Cat()
            cat.title = _('In {bookshelf}').format(bookshelf=bookshelf.bookshelf)
            cat.rel = 'related'
            cat.url = os.url('bookshelf', id=bookshelf.id)
            cat.class_ += 'navlink grayed'
            cat.icon = 'bookshelf'
            cat.order = 33
            os.entries.append(cat)

    if os.format in ('mobile', ):
        for author in dc.authors:
            cat = BaseSearcher.Cat()
            cat.title = _('By {author}').format(author=author.name_and_dates)
            cat.rel = 'related'
            cat.url = os.url('author', id=author.id)
            cat.class_ += 'navlink grayed'
            cat.icon = 'author'
            cat.order = 31
            os.entries.append(cat)

        for subject in dc.subjects:
            cat = BaseSearcher.Cat()
            cat.title = _('On {subject}').format(subject=subject.subject)
            cat.rel = 'related'
            cat.url = os.url('subject', id=subject.id)
            cat.class_ += 'navlink grayed'
            cat.icon = 'subject'
            cat.order = 32
            os.entries.append(cat)

    os.total_results = 1

    os.template = 'results' if os.format == 'mobile' else 'bibrec'
    os.page = 'bibrec'
    os.og_type = 'book'
    os.finalize()

    return self.format(os)
def make_toc(self, xhtml):
    """ Build a TOC from HTML headers.

    Return a list of tuples (url, text, depth).

    Page numbers are also inserted because DTBook NCX needs the
    play_order to be sequential.

    """

    def id_generator(i=0):
        """ Generate an id for the TOC to link to. """
        while True:
            yield 'pgepubid%05d' % i
            i += 1

    idg = id_generator()

    def get_id(elem):
        """ Get the id of the element or generate and set one. """
        if not elem.get('id'):
            elem.set('id', six.next(idg))
        return elem.get('id')

    toc = []
    last_depth = 0

    for header in xpath(
            xhtml,
            '//xhtml:h1|//xhtml:h2|//xhtml:h3|//xhtml:h4|'
            # DP page number
            '//xhtml:*[contains (@class, "pageno")]|'
            # DocUtils contents header
            '//xhtml:p[contains (@class, "topic-title")]'):

        text = gg.normalize(
            etree.tostring(header, method="text", encoding=six.text_type,
                           with_tail=False))
        text = header.get('title', text).strip()

        if not text:
            # so <h2 title=""> may be used to suppress TOC entry
            continue

        if header.get('class', '').find('pageno') > -1:
            toc.append(["%s#%s" % (self.attribs.url, get_id(header)), text, -1])
        else:
            # header
            if text.lower().startswith('by '):
                # common error in PG: <h2>by Lewis Carroll</h2> should
                # yield no TOC entry
                continue

            try:
                depth = int(header.tag[-1:])
            except ValueError:
                depth = 2  # avoid top level

            # fix bogus header numberings
            if depth > last_depth + 1:
                depth = last_depth + 1
            last_depth = depth

            # if <h*> is first element of a <div> use <div> instead
            parent = header.getparent()
            if (parent.tag == NS.xhtml.div and parent[0] == header
                    and parent.text and parent.text.strip() == ''):
                header = parent

            toc.append(["%s#%s" % (self.attribs.url, get_id(header)),
                        text, depth])

    return toc
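# Illustration (hypothetical document, not from the source): the returned list
# mixes real headers (depth >= 1) with page numbers (depth == -1), e.g.
#
#   [['ch01.html#pgepubid00000', 'CHAPTER I.',       1],
#    ['ch01.html#Page_5',        '5',                -1],
#    ['ch01.html#pgepubid00001', 'A Mad Tea-Party',   2]]
#
# so the NCX writer can assign one sequential play_order over both kinds.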
def parse(self):
    """ Parse the plain text.

    Try to find semantic units in the character soup.

    """

    debug("GutenbergTextParser.parse () ...")

    if self.xhtml is not None:
        return

    text = self.unicode_content()
    text = parsers.RE_RESTRICTED.sub('', text)
    text = gg.xmlspecialchars(text)

    lines = [line.rstrip() for line in text.splitlines()]
    lines.append("")
    del text

    blanks = 0
    par = Par()

    for line in lines:
        if len(line) == 0:
            blanks += 1
        else:
            if blanks and par.lines:  # don't append empty pars
                par.after = blanks
                self.pars.append(par)
                if self.body == 1:
                    self.max_blanks = max(blanks, self.max_blanks)
                par = Par()
                par.before = blanks
            blanks = 0
            par.lines.append(line)

    par.after = blanks
    if par.lines:
        self.pars.append(par)

    lines = None

    self.analyze()

    # build xhtml tree
    em = parsers.em
    self.xhtml = em.html(
        em.head(
            em.title(' '),  # pylint: disable=W0142
            em.meta(**{'http-equiv': 'Content-Style-Type',
                       'content': 'text/css'}),
            em.meta(**{'http-equiv': 'Content-Type',
                       'content': mt.xhtml + '; charset=utf-8'})),
        em.body())

    for body in xpath(self.xhtml, '//xhtml:body'):
        xhtmlparser = lxml.html.XHTMLParser()
        for par in self.pars:
            p = etree.fromstring(self.ship_out(par), xhtmlparser)
            p.tail = '\n\n'
            body.append(p)

    self.pars = []
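# Illustration (not from the source): paragraphs are split on runs of blank
# lines, and each Par records how many blanks preceded and followed it,
# which analyze() can later use to tell headings from body text. For
#
#   'Line one\nline two\n\n\nNext par'
#
# the loop yields two Par objects: the first with before=0, after=2 and
# lines == ['Line one', 'line two']; the second with before=2 and
# lines == ['Next par'] (its `after` counts the sentinel blank line
# appended to the end of `lines`).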
def strip_pagenumbers(xhtml, strip_classes):
    """ Strip dp page numbers.

    Rationale: DP implements page numbers either with float or with
    absolute positioning. Float is not supported by Kindle. Absolute
    positioning is not allowed in epub.

    If we'd leave these in, they would show up as numbers in the
    middle of the text.

    To still keep links working, we replace all page number
    contraptions we can find with empty <a>'s.

    """

    # look for elements with a class that is in strip_classes
    for class_ in strip_classes:
        xp = ("//xhtml:*[@class and contains(concat(' ', "
              "normalize-space(@class), ' '), ' %s ')]" % class_)

        count = 0
        for elem in xpath(xhtml, xp):

            # save textual content
            text = gg.normalize(
                etree.tostring(elem, method="text", encoding=six.text_type,
                               with_tail=False))
            if len(text) > 10:
                # safeguard against removing things that are not pagenumbers
                continue

            if not text:
                text = elem.get('title')

            # look for id anywhere inside element
            id_ = elem.xpath(".//@id")

            # transmogrify element into empty <a>
            tail = elem.tail
            elem.clear()
            elem.tag = NS.xhtml.a
            if id_:
                # some blockheaded PPers include more than
                # one page number in one span. take the last id
                # because the others represent empty pages.
                elem.set('id', id_[-1])

            if class_ in DP_PAGENUMBER_CLASSES:
                # mark element as rewritten pagenumber. we
                # actually don't use this class for styling
                # because it is on an empty element
                elem.set('class', 'x-ebookmaker-pageno')

            if text:
                elem.set('title', text)
            elem.tail = tail
            count += 1

            # The OPS Spec 2.0 is very clear: "Reading Systems
            # must be XML processors as defined in XML 1.1."
            # Nevertheless many browser-plugin ebook readers use
            # the HTML parsers of the browser. But HTML parsers
            # don't grok the minimized form of empty elements.
            #
            # This will force lxml to output the non-minimized form
            # of the element.
            elem.text = ''

        if count:
            warning("%d elements having class %s have been rewritten." %
                    (count, class_))
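# Illustration (markup is hypothetical, not from the source): a typical DP
# page-number span such as
#
#   <span class="pageno" id="Page_25">25</span>
#
# is rewritten into an empty anchor that keeps the id (so links to the page
# still resolve) and moves the number into a title attribute; assuming
# 'pageno' is listed in DP_PAGENUMBER_CLASSES it also gets the marker class:
#
#   <a id="Page_25" class="x-ebookmaker-pageno" title="25"></a>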
def metadata_item(self, dc):
    """ Build metadata from DublinCore struct.

    Example of metadata:

    <metadata xmlns:dcterms='http://purl.org/dc/terms/'
              xmlns:xsi='http://www.w3.org/2001/XMLSchema-instance'
              xmlns:opf='http://www.idpf.org/2007/opf'>

      <dcterms:identifier opf:scheme='URI' id='id'>http://www.gutenberg.org/ebooks/29000</dcterms:identifier>
      <dcterms:creator opf:file-as='Trollope, Anthony'>Anthony Trollope</dcterms:creator>
      <dcterms:title>The Macdermots of Ballycloran</dcterms:title>
      <dcterms:language xsi:type='dcterms:RFC3066'>en</dcterms:language>
      <dcterms:subject>Domestic fiction</dcterms:subject>
      <dcterms:subject>Ireland -- Fiction</dcterms:subject>
      <dcterms:created>1890</dcterms:created>
      <dcterms:publisher>Project Gutenberg</dcterms:publisher>
      <dcterms:date opf:event='publication'>2009-05-31</dcterms:date>
      <dcterms:date opf:event='conversion'>2009-08-26T21:11:14Z</dcterms:date>
      <dcterms:rights>Public domain</dcterms:rights>
      <dcterms:source>29000-h.htm</dcterms:source>

      <meta name='cover' content='item0' />

    </metadata>
    """

    # OPF 2.0 v1.0 specifies to use the
    # Dublin Core Metadata Element Set, Version 1.1
    # http://dublincore.org/documents/2004/12/20/dces/
    # but that has been superseded by DCMI Metadata Terms
    # http://dublincore.org/documents/dcmi-terms/
    # we use NS.dc for now but should switch to NS.dcterms later

    dcterms = ElementMaker(nsmap=self.nsmap, namespace=str(NS.dc))

    if dc.publisher:
        self.metadata.append(dcterms.publisher(dc.publisher))
    if dc.rights:
        self.metadata.append(dcterms.rights(dc.rights))

    self.metadata.append(dcterms.identifier(
        dc.opf_identifier,
        {NS.opf.scheme: 'URI',
         'id': 'id'}))  # should be NS.xml.id

    for author in dc.authors:
        pretty_name = dc.make_pretty_name(author.name)
        if author.marcrel == 'aut' or author.marcrel == 'cre':
            self.metadata.append(dcterms.creator(
                pretty_name, {NS.opf['file-as']: author.name}))
        else:
            self.metadata.append(dcterms.contributor(
                pretty_name,
                {NS.opf.role: author.marcrel,
                 NS.opf['file-as']: author.name}))

    # replace newlines with /
    title = re.sub(r'\s*[\r\n]+\s*', ' / ', dc.title)
    self.metadata.append(dcterms.title(title))

    for language in dc.languages:
        self.metadata.append(dcterms.language(
            language.id, {NS.xsi.type: 'dcterms:RFC4646'}))

    for subject in dc.subjects:
        self.metadata.append(dcterms.subject(subject.subject))

    if dc.created:
        self.metadata.append(dcterms.date(
            dc.created, {NS.opf.event: 'creation'}))

    if dc.release_date:
        self.metadata.append(dcterms.date(
            dc.release_date.isoformat(), {NS.opf.event: 'publication'}))

    self.metadata.append(dcterms.date(
        datetime.datetime.now(gg.UTC()).isoformat(),
        {NS.opf.event: 'conversion'}))

    source = dc.source
    if hasattr(options.config, 'FILESDIR'):
        if source.startswith(options.config.FILESDIR):
            source = source[len(options.config.FILESDIR):]
            source = urllib.parse.urljoin(options.config.PGURL, source)

    self.metadata.append(dcterms.source(source))