def __init__(self, out_name, metadata, content_dir='OEBPS/'):
    """Open the output zip and set up an empty EPUB skeleton.

    Writes the ``mimetype`` entry (first, with deflate=False) and
    ``META-INF/container.xml``, builds the OPF and NCX trees, initialises
    the navigation / part bookkeeping, and adds the static stylesheet.

    out_name    -- passed to zipfile.ZipFile as the archive to write
    metadata    -- book metadata, read via common.get_metadata_tag_data
    content_dir -- stored on the instance and passed to make_container_info
    """
    self.content_dir = content_dir
    self.dt = datetime.now()
    self.z = zipfile.ZipFile(out_name, 'w')
    # The mimetype entry is the first thing written and is not deflated.
    self.add('mimetype', 'application/epub+zip', deflate=False)

    self.book_id = common.get_metadata_tag_data(metadata, 'identifier')
    self.title = common.get_metadata_tag_data(metadata, 'title')
    if self.title is None:
        self.title = 'none'
    self.author = common.get_metadata_tag_data(metadata, 'creator')
    if self.author is None:
        self.author = 'none'

    tree_str = make_container_info(content_dir)
    self.add('META-INF/container.xml', tree_str)

    (self.opf,
     self.opf_manifest_el,
     self.opf_spine_el,
     self.opf_guide_el) = make_opf(metadata)

    (self.ncx,
     self.ncx_head_el,
     self.ncx_navmap_el) = make_ncx(self.book_id, self.title, self.author)
    self.ncx_pagelist_el = None

    # Navigation bookkeeping; the navmap is the root of the navpoint stack.
    self.navpoint_stack = [self.ncx_navmap_el]
    self.id_index = 1
    self.nav_number = 1
    self.depth = 0
    self.current_depth = 0

    # Buffered content elements and the running length of that buffer.
    self.el_stack = []
    self.el_len_total = 0
    self.max_el_len_total = 150000
    self.part_number = 0
    self.current_part = None

    # Add static extra files - style sheet, etc.
    static_files = [('css', 'stylesheet.css', 'text/css')]
    for item_id, href, media_type in static_files:
        content_src = os.path.join(sys.path[0], 'epub_files', href)
        # Context manager so the handle is closed even if read() fails.
        with open(content_src, 'r') as src:
            content_str = src.read()
        self.add_content(item_id, href, media_type, content_str)
def _daisy_page_type(page_scandata):
    """Return the lower-cased scandata pageType, or 'normal' if unavailable."""
    try:
        return page_scandata.pageType.text.lower()
    except AttributeError:
        # Missing pageType element, or one without text.
        return 'normal'


def _daisy_include_page(page_scandata, scandata_ns):
    """Return True when scandata marks the page addToAccessFormats 'true'."""
    if page_scandata is None:
        return False
    add = page_scandata.find(scandata_ns + 'addToAccessFormats')
    if add is None:
        # Fall back to the un-namespaced attribute lookup.
        add = page_scandata.addToAccessFormats
    return add is not None and add.text == 'true'


def _daisy_par_text(par):
    """Join the formatting runs of one ABBYY paragraph into a single string.

    Runs are separated by a space.  When the previous run ends in '-' no
    space is inserted, and the hyphen itself is dropped if the first child
    of the next run has wordStart == 'false' (a word continued across
    lines).
    """
    pieces = []
    prev_line = ''
    for line in par:
        for fmt in line:
            fmt_text = etree.tostring(fmt, method='text', encoding=unicode)
            if len(fmt_text) == 0:
                continue
            if prev_line[-1:] == '-':
                if fmt[0].get('wordStart') == 'false':
                    # ? and wordFromDictionary = true ?
                    pieces.append(prev_line[:-1])
                else:
                    pieces.append(prev_line)
            else:
                pieces.append(prev_line)
                pieces.append(' ')
            prev_line = fmt_text
    pieces.append(prev_line)
    return ''.join(pieces)


def process_book(iabook, ebook, alt_booktext=None):
    """Fill a DAISY book from an Internet Archive scanned book.

    Emits a fixed frontmatter (title, author and a "Producer's Note"
    section), then walks the ABBYY pages, adding page targets, chapter
    navpoints from the table of contents, and one 'p' tag per paragraph,
    and finally a short rearmatter.

    iabook       -- source book; provides ABBYY xml, scandata, toc, metadata
    ebook        -- DAISY book being built (push/pop tag and navpoint API)
    alt_booktext -- if not None, this text is added as a single paragraph
                    when the first ABBYY page is reached and page
                    processing stops there
    """
    aby_ns = "{http://www.abbyy.com/FineReader_xml/FineReader6-schema-v1.xml}"
    text_tag = aby_ns + 'text'
    ignored_tags = (aby_ns + 'region', aby_ns + 'row')
    title_types = ('title', 'title page')

    # NOTE(review): the results of get_scandata() / get_bookdata() are not
    # used here; the calls are retained in case they prime state on iabook
    # -- confirm and drop if they are pure getters.
    iabook.get_scandata()
    aby_file = iabook.get_abbyy()
    scandata_ns = iabook.get_scandata_ns()
    iabook.get_bookdata()

    contents = iabook.get_toc()
    metadata = iabook.get_metadata()
    title = common.get_metadata_tag_data(metadata, 'title')
    if title is None:
        title = ''
    author = common.get_metadata_tag_data(metadata, 'creator')
    if author is None:
        author = ''

    # ---- frontmatter -----------------------------------------------------
    ebook.push_tag('frontmatter')
    ebook.add_tag('doctitle', title)
    ebook.add_tag('docauthor', author)

    ebook.push_navpoint('level', 'h', "Producer's Note")

    ebook.push_navpoint('level', 'h', 'About Internet Archive Daisy Books')
    ebook.add_tag('p', (
        "This book was produced in DAISY format by the Internet Archive. "
        "The book pages were scanned and converted to DAISY format "
        "automatically. This process relies on optical character "
        "recognition, and is somewhat susceptible to errors. These errors "
        "may include weird characters, non-words, and incorrect guesses at "
        "structure. Page numbers and headers or footers may remain from the "
        "scanned page. The Internet Archive is working to improve the "
        "scanning process and resulting books, but in the meantime, we hope "
        "that this book will be useful to you. \n"))
    ebook.pop_navpoint()

    ebook.push_navpoint('level', 'h', 'About this DAISY book')
    has_page_nav = iabook.has_pagenos()
    has_chapter_nav = contents is not None
    if has_page_nav:
        ebook.add_tag('p', "This book has page navigation.")
    if has_chapter_nav:
        ebook.add_tag('p', "This book has chapter navigation.")
    if not (has_page_nav or has_chapter_nav):
        ebook.add_tag('p', "This book has paragraph navigation, "
                      "but is otherwise unstructured.")
    ebook.pop_navpoint()

    ebook.push_navpoint('level', 'h', 'About the Internet Archive')
    ebook.add_tag('p', (
        "The Internet Archive was founded in 1996 to build an Internet "
        "library and to promote universal access to all knowledge. The "
        "Archive's purposes include offering permanent access for "
        "researchers, historians, scholars, people with disabilities, and "
        "the general public to historical collections that exist in digital "
        "format. The Internet Archive includes texts, audio, moving images, "
        "and software as well as archived web pages, and provides "
        "specialized services for information access for the blind and "
        "other persons with disabilities. "))
    ebook.pop_navpoint()

    ebook.pop_navpoint()  # Producer's Note
    ebook.pop_tag()       # frontmatter

    # ---- bodymatter ------------------------------------------------------
    ebook.push_tag('bodymatter')
    if contents is None:
        # Without a table of contents everything hangs off one 'Book' level.
        ebook.push_navpoint('level', 'h', 'Book')

    pushed_navpoint = False
    context = etree.iterparse(aby_file,
                              tag=aby_ns + 'page',
                              resolve_entities=False)

    # 'normal' pages that precede the title page are skipped, but only when
    # the scandata actually contains a title page.
    before_title_page = any(_daisy_page_type(p) in title_types
                            for p in iabook.get_scandata_pages())

    for i, (event, page) in enumerate(context):
        # wrap in try/finally to ensure page.clear() is called
        try:
            if alt_booktext is not None:
                ebook.add_tag('p', alt_booktext)
                break

            page_scandata = iabook.get_page_scandata(i)
            pageno = None
            if page_scandata is not None:
                pageno = page_scandata.find(scandata_ns + 'pageNumber')
                # NOTE(review): relies on the truthiness of the element
                # returned by find(); confirm that matches the scandata
                # element type in use.
                if pageno:
                    pageno = pageno.text
            if pageno:
                if contents is not None and pageno in contents:
                    # Start of a new chapter: close the previous one first.
                    if pushed_navpoint:
                        ebook.pop_navpoint()
                    ebook.push_navpoint('level', 'h', contents[pageno])
                    pushed_navpoint = True
                ebook.add_pagetarget(pageno, pageno)

            if not _daisy_include_page(page_scandata, scandata_ns):
                continue

            page_type = _daisy_page_type(page_scandata)
            if page_type in title_types:
                before_title_page = False
                continue
            # Cover, copyright, contents and any other non-'normal' page
            # contribute no text, nor do normal pages before the title page.
            if page_type != 'normal' or before_title_page:
                continue

            first_par = True
            saw_pageno_header_footer = False
            for block in page:
                for el in block:
                    if el.tag in ignored_tags:
                        continue
                    if el.tag != text_tag:
                        print('unexpected tag type' + el.tag)
                        sys.exit(-1)
                    for par in el:
                        # skip if it's the first par and it could be a header
                        if (first_par
                                and common.par_is_pageno_header_footer(par)):
                            saw_pageno_header_footer = True
                            first_par = False
                            continue
                        first_par = False

                        # skip if it's the last par and it could be a footer
                        if (not saw_pageno_header_footer
                                and block == page[-1]
                                and el == block[-1]
                                and par == el[-1]
                                and common.par_is_pageno_header_footer(par)):
                            saw_pageno_header_footer = True
                            continue

                        ebook.add_tag('p', _daisy_par_text(par))
        finally:
            page.clear()

    if pushed_navpoint:
        ebook.pop_navpoint()
    if contents is None:
        ebook.pop_navpoint()  # the 'Book' level pushed above
    ebook.pop_tag()           # bodymatter

    # ---- rearmatter ------------------------------------------------------
    ebook.push_tag('rearmatter')
    ebook.push_tag('level1')
    ebook.add_tag('p', 'End of book')
    ebook.pop_tag()
    ebook.pop_tag()
def _epub_page_type(page_scandata):
    """Return the lower-cased scandata pageType, or 'normal' if unavailable."""
    try:
        return page_scandata.pageType.text.lower()
    except AttributeError:
        # Missing pageType element, or one without text.
        return 'normal'


def _epub_include_page(page_scandata, scandata_ns):
    """Return True when scandata marks the page addToAccessFormats 'true'."""
    if page_scandata is None:
        return False
    add = page_scandata.find(scandata_ns + 'addToAccessFormats')
    if add is None:
        # Fall back to the un-namespaced attribute lookup.
        add = page_scandata.addToAccessFormats
    return add is not None and add.text == 'true'


def _epub_par_text(par):
    """Join the formatting runs of one ABBYY paragraph into a single string.

    Runs are separated by a space.  When the previous run ends in '-' no
    space is inserted, and the hyphen itself is dropped if the first child
    of the next run has wordStart == 'false' (a word continued across
    lines).
    """
    pieces = []
    prev_line = ''
    for line in par:
        for fmt in line:
            fmt_text = etree.tostring(fmt, method='text', encoding=unicode)
            if len(fmt_text) == 0:
                continue
            if prev_line[-1:] == '-':
                if fmt[0].get('wordStart') == 'false':
                    # ? and wordFromDictionary = true ?
                    pieces.append(prev_line[:-1])
                else:
                    pieces.append(prev_line)
            else:
                pieces.append(prev_line)
                pieces.append(' ')
            prev_line = fmt_text
    pieces.append(prev_line)
    return ''.join(pieces)


def process_book(iabook, ebook):
    """Fill an EPUB book from an Internet Archive scanned book.

    Walks the ABBYY pages in order.  Cover, title, copyright and contents
    pages become page images with navpoints and guide items; normal pages
    contribute paragraphs and cropped picture regions.  Page targets and
    chapter navpoints are added from the scandata page numbers and the
    table of contents.

    iabook -- source book; provides ABBYY xml, scandata, toc, page images
    ebook  -- EPUB book being built
    """
    aby_ns = "{http://www.abbyy.com/FineReader_xml/FineReader6-schema-v1.xml}"
    text_tag = aby_ns + 'text'
    ignored_tags = (aby_ns + 'region', aby_ns + 'row')
    title_types = ('title', 'title page')

    # NOTE(review): the results of get_scandata() / get_bookdata() /
    # get_metadata() are not used here; the calls are retained in case they
    # prime state on iabook -- confirm and drop if they are pure getters.
    iabook.get_scandata()
    scandata_ns = iabook.get_scandata_ns()
    iabook.get_bookdata()
    aby_file = iabook.get_abbyy()

    contents = iabook.get_toc()
    iabook.get_metadata()

    cover_number = 0
    toc_item_number = 0
    picture_number = 0
    pushed_chapters = False
    made_contents_navpoint = False
    made_pages = False

    context = etree.iterparse(aby_file,
                              tag=aby_ns + 'page',
                              resolve_entities=False)

    # 'normal' pages that precede the title page are treated specially, but
    # only when the scandata actually contains a title page.
    before_title_page = any(_epub_page_type(p) in title_types
                            for p in iabook.get_scandata_pages())

    for i, (event, page) in enumerate(context):
        # wrap in try/finally to ensure page.clear() is called
        try:
            page_scandata = iabook.get_page_scandata(i)
            pageno = None
            if page_scandata is not None:
                pageno = page_scandata.find(scandata_ns + 'pageNumber')
                # NOTE(review): relies on the truthiness of the element
                # returned by find(); confirm that matches the scandata
                # element type in use.
                if pageno:
                    pageno = pageno.text
            if pageno:
                if contents is not None and pageno in contents:
                    ebook.flush_els()
                    if not pushed_chapters:
                        # First chapter seen: open the 'Chapters' parent.
                        cdiv = E.div({'class': 'newnav', 'id': 'chapters'})
                        href = ebook.add_el(cdiv) + '#' + 'chapters'
                        ebook.push_navpoint('Chapters', href)
                        pushed_chapters = True
                    toc_id = 'toc-' + str(toc_item_number)
                    toc_item_number += 1
                    cdiv = E.div({'class': 'newnav', 'id': toc_id})
                    href = ebook.add_el(cdiv) + '#' + toc_id
                    ebook.add_navpoint(contents[pageno], href)

                page_id = 'page-' + pageno
                pdiv = E.div({'class': 'newpage', 'id': page_id})
                href = ebook.add_el(pdiv) + '#' + page_id
                ebook.add_pagetarget(pageno, pageno, href)

            if not _epub_include_page(page_scandata, scandata_ns):
                continue

            page_type = _epub_page_type(page_scandata)

            if page_type == 'cover':
                front_cover = (cover_number == 0)
                if front_cover:
                    cover_title = 'Front Cover'
                else:
                    cover_title = 'Back Cover'  # xxx detect back page?
                ebook.flush_els()
                if pushed_chapters:
                    ebook.pop_navpoint()
                    pushed_chapters = False
                (img_id, filename) = make_html_page_image(i, iabook, ebook,
                                                          cover=front_cover)
                if img_id is not None:
                    ebook.add_navpoint(cover_title, filename)
                    if cover_number == 0:
                        ebook.add_guide_item({'href': filename,
                                              'type': 'cover',
                                              'title': cover_title})
                        # Add intro page after the first cover page
                        tree = make_html(
                            'Archive',
                            [E.p('This book made available by the '
                                 'Internet Archive.')])
                        ebook.add_content(
                            'intro', 'intro.html', 'application/xhtml+xml',
                            common.tree_to_str(tree, xml_declaration=False))
                        ebook.add_spine_item({'idref': 'intro',
                                              'linear': 'no'})
                    cover_number += 1

            elif page_type in title_types:
                before_title_page = False
                (img_id, filename) = make_html_page_image(i, iabook, ebook)
                if img_id is not None:
                    ebook.add_navpoint('Title Page', filename)
                    ebook.add_guide_item({'href': filename,
                                          'type': 'title-page',
                                          'title': 'Title Page'})

            elif page_type == 'copyright':
                (img_id, filename) = make_html_page_image(i, iabook, ebook)
                if img_id is not None:
                    ebook.add_navpoint('Copyright', filename)
                    # Guide title now matches the navpoint label (it was a
                    # copy-pasted 'Title Page').
                    ebook.add_guide_item({'href': filename,
                                          'type': 'copyright-page',
                                          'title': 'Copyright'})

            elif page_type == 'contents':
                (img_id, filename) = make_html_page_image(i, iabook, ebook)
                if img_id is not None:
                    if not made_contents_navpoint:
                        ebook.add_navpoint('Table of Contents', filename)
                        made_contents_navpoint = True
                    # Guide title now matches the navpoint label (it was a
                    # copy-pasted 'Title Page').
                    ebook.add_guide_item({'href': filename,
                                          'type': 'toc',
                                          'title': 'Table of Contents'})

            elif page_type == 'normal':
                if before_title_page:
                    page_text = etree.tostring(page,
                                               method='text',
                                               encoding=unicode)
                    # Skip if not much text
                    if len(page_text) >= 10:
                        # XXX note that this might return None, None and
                        # do nothing...
                        make_html_page_image(i, iabook, ebook)
                    continue

                first_par = True
                saw_pageno_header_footer = False
                for block in page:
                    if block.get('blockType') == 'Picture':
                        left, top = int(block.get('l')), int(block.get('t'))
                        right, bottom = (int(block.get('r')),
                                         int(block.get('b')))
                        region = ((left, top), (right, bottom))
                        region_width = right - left
                        region_height = bottom - top
                        orig_page_size = (int(page.get('width')),
                                          int(page.get('height')))
                        page_width, page_height = orig_page_size
                        # XXX bad aspect ratio!
                        # XXX need fixed code to get requested size
                        req_width = int(
                            max_width * (region_width / float(page_width)))
                        req_height = int(
                            max_height * (region_height / float(page_height)))
                        image = iabook.get_page_image(
                            i, (req_width, req_height), orig_page_size,
                            kdu_reduce=2, region=region)
                        if image is not None:
                            pic_id = 'picture' + str(picture_number)
                            pic_href = 'images/' + pic_id + '.jpg'
                            picture_number += 1
                            ebook.add_content(pic_id, pic_href, 'image/jpeg',
                                              image, deflate=False)
                            illus = E.p({'class': 'illus'},
                                        E.img(src=pic_href, alt=pic_id))
                            ebook.add_el(illus)
                        # Picture blocks contribute no text.
                        continue

                    for el in block:
                        if el.tag in ignored_tags:
                            continue
                        if el.tag != text_tag:
                            print('unexpected tag type' + el.tag)
                            sys.exit(-1)
                        for par in el:
                            # skip if it's the first par and it could be
                            # a header
                            if (first_par and
                                    common.par_is_pageno_header_footer(par)):
                                saw_pageno_header_footer = True
                                first_par = False
                                continue
                            first_par = False

                            # skip if it's the last par and it could be
                            # a footer
                            if (not saw_pageno_header_footer
                                    and block == page[-1]
                                    and el == block[-1]
                                    and par == el[-1]
                                    and common.par_is_pageno_header_footer(
                                        par)):
                                saw_pageno_header_footer = True
                                continue

                            to_add = _epub_par_text(par)

                            if not made_pages:
                                made_pages = True
                                if not contents:
                                    # No table of contents: give the text a
                                    # single 'Pages' navpoint instead.
                                    href = ebook.add_el(
                                        E.div({'class': 'pages',
                                               'id': 'pages'}))
                                    ebook.add_navpoint('Pages', href)

                            ebook.add_el(E.p(to_add), len(to_add))
        finally:
            page.clear()

    ebook.flush_els()
    if pushed_chapters:
        ebook.pop_navpoint()
def __init__(self, out_name, metadata, content_dir=''):
    """Open the output zip and set up an empty DAISY book skeleton.

    Derives the OPF / DTBook / SMIL / NCX file names from the book
    identifier, builds the DTBook, SMIL and NCX trees, initialises the tag
    and navpoint stacks, copies the static support files into the archive
    and prepares the manifest item list.

    out_name    -- passed to zipfile.ZipFile as the archive to write
    metadata    -- book metadata, read via common.get_metadata_tag_data
    content_dir -- prefix prepended to the static file names in the archive
    """
    self.dt = datetime.now()
    self.z = zipfile.ZipFile(out_name, 'w')
    self.content_dir = content_dir

    self.book_id = common.get_metadata_tag_data(metadata, 'identifier')
    self.title = common.get_metadata_tag_data(metadata, 'title')
    self.author = common.get_metadata_tag_data(metadata, 'creator')

    self.nav_number = 1

    # All generated files are named after the book identifier.
    self.opf_file = self.book_id + '_daisy.opf'

    self.dtbook_file = self.book_id + '_daisy.xml'
    self.dtbook, self.dtbook_book_el = make_dtbook(self.book_id, self.title)

    self.smil_file = self.book_id + '_daisy.smil'
    self.smil, self.smil_seq_el = make_smil(self.book_id)

    self.ncx_file = self.book_id + '_daisy.ncx'
    (self.ncx,
     self.ncx_head_el,
     self.ncx_navmap_el,
     self.ncx_pagelist_el) = make_ncx(self.book_id, self.title, self.author)

    # The DTBook <book> element and the NCX navmap are the stack roots.
    self.tag_stack = [self.dtbook_book_el]
    self.navpoint_stack = [self.ncx_navmap_el]

    self.id_index = 1
    self.depth = 0
    self.current_depth = 0
    self.total_page_count = 0
    self.max_page_number = 0

    # style sheet, etc.
    static_files = ['daisy.css',
                    'daisyTransform.xsl',
                    'dtbook-2005-3.dtd',
                    'html.css',
                    'resource.res']
    for content in static_files:
        content_src = os.path.join(sys.path[0], 'daisy_files', content)
        # Context manager so the handle is closed even if read() fails.
        with open(content_src, 'r') as src:
            content_str = src.read()
        self.add(self.content_dir + content, content_str)

    self.manifest_items = [
        {'id': 'xml',
         'href': self.dtbook_file,
         'media-type': 'application/x-dtbook+xml'},
        {'id': 'opf',
         'href': self.opf_file,
         'media-type': 'text/xml'},
        {'id': 'ncx',
         'href': self.ncx_file,
         'media-type': 'application/x-dtbncx+xml'},
        {'id': 'smil',
         'href': self.smil_file,
         'media-type': 'application/smil'},
        {'id': 'daisyTransform',
         'href': 'daisyTransform.xsl',
         'media-type': 'text/xsl'},
        {'id': 'daisyCss',
         'href': 'daisy.css',
         'media-type': 'text/css'},
        {'id': 'htmlCss',
         'href': 'html.css',
         'media-type': 'text/css'},
        {'id': 'resource',
         'href': 'resource.res',
         'media-type': 'application/x-dtbresource+xml'},
    ]