Ejemplo n.º 1
0
Archivo: epub.py Proyecto: hornc/epub
    def finish(self, metadata):
        """Write the NCX and OPF documents into the archive and close it.

        ``metadata`` is accepted for interface compatibility; this method
        does not read it.
        """
        # ... Any remaining html?

        # The NCX goes through add_content (id, href, media type, data).
        ncx_xml = common.tree_to_str(self.ncx)
        self.add_content('ncx', 'toc.ncx', 'application/x-dtbncx+xml', ncx_xml)

        # The OPF is written directly under the content directory.
        opf_xml = common.tree_to_str(self.opf)
        self.add(self.content_dir + 'content.opf', opf_xml)

        self.z.close()
Ejemplo n.º 2
0
def make_opf(meta_info_items,
             manifest_items,
             spine_items,
             guide_items,
             include_page_map,
             cover_id=None):
    """Build an OPF 2.0 ``package`` document and return it serialized.

    meta_info_items -- dicts with an 'item' tag name, optional 'atts'
                       attribute dict and optional 'text' content.
    manifest_items  -- attribute dicts, one per manifest ``item``.
    spine_items     -- attribute dicts, one per spine ``itemref``; the
                       ``spine`` element is only created when non-empty.
    guide_items     -- attribute dicts, one per guide ``reference``; the
                       ``guide`` element is only created when non-empty.
    include_page_map -- when true, the spine gets a ``page-map`` attribute.
    cover_id        -- currently unused (the code that consumed it is
                       disabled).

    The result is whatever ``common.tree_to_str`` returns for the tree.
    """
    package_attrs = {'xmlns': 'http://www.idpf.org/2007/opf',
                     'unique-identifier': 'bookid',
                     'version': '2.0'}
    package = etree.Element('package', package_attrs, nsmap={'dc': dc})

    metadata_el = etree.SubElement(package, 'metadata')
    for entry in meta_info_items:
        child = etree.SubElement(metadata_el, entry['item'],
                                 entry.get('atts'))
        if 'text' in entry:
            child.text = entry['text']

    manifest_el = etree.SubElement(package, 'manifest')
    for entry in manifest_items:
        etree.SubElement(manifest_el, 'item', entry)

    # Disabled upstream: emitting a <meta name="cover"> for cover_id.

    if len(spine_items) > 0:
        spine_attrs = {'toc': 'ncx'}
        if include_page_map:
            spine_attrs['page-map'] = 'page-map'
        spine_el = etree.SubElement(package, 'spine', spine_attrs)
        for entry in spine_items:
            etree.SubElement(spine_el, 'itemref', entry)

    if len(guide_items) > 0:
        guide_el = etree.SubElement(package, 'guide')
        for entry in guide_items:
            etree.SubElement(guide_el, 'reference', entry)

    return common.tree_to_str(package)
Ejemplo n.º 3
0
Archivo: daisy.py Proyecto: hornc/epub
def make_opf(metadata,
             manifest_items):
    """Build the OEB 1.2 package (OPF) document for a DAISY book.

    metadata       -- iterable of dicts with 'tag' and 'text' keys; only
                      the Dublin Core tag names listed below are emitted.
    manifest_items -- attribute dicts, one per manifest ``item``.

    Returns the tree serialized by ``common.tree_to_str``.
    """
    skeleton = """<?xml version='1.0' encoding='utf-8'?>
<!DOCTYPE package PUBLIC "+//ISBN 0-9673008-1-9//DTD OEB 1.2 Package//EN"
"http://openebook.org/dtds/oeb-1.2/oebpkg12.dtd">
<package xmlns="http://openebook.org/namespaces/oeb-package/1.0/"
unique-identifier="bookid"/>
"""
    # Parsing the skeleton (rather than building the root by hand) keeps
    # the DOCTYPE attached to the document.
    root_el = etree.parse(StringIO(skeleton)).getroot()

    metadata_el = etree.SubElement(root_el, 'metadata')
    dc_nsmap = {'dc': dc,
                'oebpackage': 'http://openebook.org/namespaces/oeb-package/1.0/'}
    dc_metadata_el = etree.SubElement(metadata_el, 'dc-metadata',
                                      nsmap=dc_nsmap)
    etree.SubElement(dc_metadata_el,
                     dcb + 'Format').text = 'ANSI/NISO Z39.86-2005'

    # TODO: ensure required elements: (copy code from epub.py)
    # title
    # publisher
    # date
    # format (must be 'ANSI/NISO Z39.86-2005')
    # language
    # identifier

    allowed_tags = ('title', 'creator', 'subject', 'description',
                    'publisher', 'contributor', 'date', 'type',
                    'format', 'identifier', 'source', 'language',
                    'relation', 'coverage', 'rights')
    for md in metadata:
        tagname = md['tag']
        if tagname not in allowed_tags:
            continue
        # Element names are the capitalized tag in the dcb namespace.
        dctag = dcb + tagname[:1].upper() + tagname[1:]
        # The identifier carries the id the package's unique-identifier
        # attribute points at; every other tag has no attributes.
        attrib = {'id': 'bookid'} if tagname == 'identifier' else None
        etree.SubElement(dc_metadata_el, dctag, attrib).text = md['text']

    x_metadata_el = etree.SubElement(metadata_el, 'x-metadata')
    for meta_name, meta_content in (('dtb:multimediaType', 'textNCX'),
                                    ('dtb:multimediaContent', 'text'),
                                    ('dtb:totalTime', '0')):
        etree.SubElement(x_metadata_el, 'meta',
                         {'name': meta_name, 'content': meta_content})

    manifest_el = etree.SubElement(root_el, 'manifest')
    for manifest_attrs in manifest_items:
        etree.SubElement(manifest_el, 'item', manifest_attrs)

    spine_el = etree.SubElement(root_el, 'spine')
    etree.SubElement(spine_el, 'itemref', {'idref': 'smil'})

    return common.tree_to_str(etree.ElementTree(root_el))
Ejemplo n.º 4
0
def make_container_info(content_dir='OEBPS/'):
    """Build the OCF ``container`` document pointing at the package file.

    content_dir -- path prefix (including trailing separator) under which
                   ``content.opf`` lives.

    Returns the tree serialized by ``common.tree_to_str``.
    """
    container = etree.Element(
        'container',
        version='1.0',
        xmlns='urn:oasis:names:tc:opendocument:xmlns:container')
    rootfile_attrs = {'full-path': content_dir + 'content.opf',
                      'media-type': 'application/oebps-package+xml'}
    rootfiles_el = etree.SubElement(container, 'rootfiles')
    etree.SubElement(rootfiles_el, 'rootfile', rootfile_attrs)
    return common.tree_to_str(container)
Ejemplo n.º 5
0
Archivo: daisy.py Proyecto: hornc/epub
    def finish(self, metadata):
        """Write the OPF, NCX, DTBook and SMIL documents, then close self.z.

        metadata -- passed through to ``make_opf`` together with the
                    manifest items collected on this object.
        """
        opf_xml = make_opf(metadata, self.manifest_items)
        self.add(self.content_dir + self.opf_file, opf_xml)

        # These NCX head values are only known once the whole book has
        # been processed, so they are appended here, before serializing.
        head_values = (('dtb:depth', str(self.depth)),
                       ('dtb:totalPageCount', str(self.total_page_count)),
                       ('dtb:maxPageNumber', str(self.max_page_number)))
        for meta_name, meta_content in head_values:
            etree.SubElement(self.ncx_head_el, 'meta',
                             {'name': meta_name, 'content': meta_content})

        ncx_xml = common.tree_to_str(self.ncx)
        self.add(self.content_dir + self.ncx_file, ncx_xml)

        dtbook_xml = common.tree_to_str(self.dtbook)
        self.add(self.content_dir + self.dtbook_file, dtbook_xml)

        smil_xml = common.tree_to_str(self.smil)
        self.add(self.content_dir + self.smil_file, smil_xml)

        self.z.close()
Ejemplo n.º 6
0
def make_ncx(navpoints, page_items):
    """Build an NCX navigation document and return it serialized.

    navpoints  -- iterable of dicts with 'playOrder', 'text' and
                  'content' keys; one ``navPoint`` is emitted per entry.
    page_items -- sized collection of dicts with 'name', 'value', 'type',
                  'playOrder' and 'href' keys; a ``pageList`` is emitted
                  only when it is non-empty.

    The head metadata (``dtb:uid`` etc.) and the document title are
    hard-coded placeholder values.

    Returns the tree serialized by ``common.tree_to_str``.
    """
    import StringIO
    xml = """<?xml version='1.0' encoding='utf-8'?>
<!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN"
"http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1"/>
"""
    # Parse a skeleton so the DOCTYPE stays attached to the document.
    tree = etree.parse(StringIO.StringIO(xml))
    root = tree.getroot()
    head = etree.SubElement(root, 'head')
    metas = [
        { 'name' : 'dtb:uid', 'content' : 'test id' },
        { 'name' : 'dtb:depth', 'content' : '1' },
        { 'name' : 'dtb:totalPageCount', 'content' : '0' },
        { 'name' : 'dtb:maxPageNumber', 'content' : '0' },
        ]
    for item in metas:
        etree.SubElement(head, 'meta', item)
    doctitle = etree.SubElement(root, 'docTitle')
    etree.SubElement(doctitle, 'text').text = 'Hello World'

    # navMap element
    navmap = etree.SubElement(root, 'navMap')
    for item in navpoints:
        play_order = str(item['playOrder'])
        navpoint = etree.SubElement(navmap, 'navPoint',
                                    { 'id':'navpoint-' + play_order,
                                      'playOrder':play_order })
        navlabel = etree.SubElement(navpoint, 'navLabel')
        etree.SubElement(navlabel, 'text').text = item['text']
        # XXX 'content' should be 'href'
        etree.SubElement(navpoint, 'content', src=item['content'])

    # pageList element
    if len(page_items) > 0:
        pagelist = etree.SubElement(root, 'pageList',
                                    { 'id':'page-mapping', 'class':'pagelist' })
        navlabel = etree.SubElement(pagelist, 'navLabel')
        text = etree.SubElement(navlabel, 'text')
        text.text = 'Pages'
        for item in page_items:
            page_id = 'page-' + item['name']
            # Attribute values must be strings, so playOrder is converted
            # the same way 'value' is.
            pagetarget = etree.SubElement(pagelist, 'pageTarget',
                                          { 'id':page_id,
                                            'value':str(item['value']),
                                            'type':item['type'],
                                            'playOrder':str(item['playOrder']) })
            # Fixed: this previously referenced the undefined name
            # 'pageTarget', raising NameError for any non-empty page list.
            navlabel = etree.SubElement(pagetarget, 'navLabel')
            etree.SubElement(navlabel, 'text').text = 'Page ' + item['name']
            etree.SubElement(pagetarget, 'content', src=item['href'])

    tree = etree.ElementTree(root)
    return common.tree_to_str(tree)
Ejemplo n.º 7
0
Archivo: epub.py Proyecto: hornc/epub
 def flush_els(self):
     """Write the part currently being built, if any, and reset the state.

     Does nothing when ``self.current_part`` is None. Otherwise the part
     is serialized without an XML declaration, added as content and as a
     spine item, and the part counter is advanced. The very first part
     (number 0) is also registered as the guide's 'text' entry.
     """
     if self.current_part is None:
         return

     part_id = 'part' + str(self.part_number).zfill(4)
     part_href = part_id + '.html'
     part_xml = common.tree_to_str(self.current_part, xml_declaration=False)
     self.add_content(part_id, part_href, 'application/xhtml+xml', part_xml)
     self.add_spine_item({'idref': part_id})

     if self.part_number == 0:
         self.add_guide_item({'href': part_href,
                              'type': 'text',
                              'title': 'Book'})

     # Start from a clean slate for the next part.
     self.part_number += 1
     self.el_stack = []  # xxx ? require popped?
     self.el_len_total = 0
     self.current_part = None
Ejemplo n.º 8
0
def make_html_page_image(i, iabook, ebook):
    """Add page ``i`` to ``ebook`` as a JPEG plus an HTML page wrapping it.

    The image is requested from ``iabook`` at 600x800, quality 90. Both
    the image and the wrapper page are added as content, and the wrapper
    is added to the spine as a non-linear item.

    Returns a ``(leaf_id, href)`` tuple for the wrapper page.
    """
    image = iabook.get_page_image(i, width=600, height=800, quality=90)

    padded = str(i).zfill(4)
    leaf_id = 'leaf' + padded
    leaf_image_id = 'leaf-image' + padded
    image_href = 'images/' + leaf_image_id + '.jpg'
    page_href = leaf_id + '.html'

    ebook.add_content({'id': leaf_image_id,
                       'href': image_href,
                       'media-type': 'image/jpeg'},
                      image)

    img_tag = E.img({'src': image_href, 'alt': 'leaf ' + str(i)})
    tree = make_html('leaf ' + padded, [img_tag])
    ebook.add_content({'id': leaf_id,
                       'href': page_href,
                       'media-type': 'application/xhtml+xml'},
                      common.tree_to_str(tree, xml_declaration=False))
    ebook.add_spine_item({'idref': leaf_id, 'linear': 'no'})

    return leaf_id, page_href
Ejemplo n.º 9
0
def make_html_page_image(i, iabook, ebook, cover=False):
    """Add page ``i`` to ``ebook`` as a JPEG plus an HTML page wrapping it.

    Pending elements on ``ebook`` are flushed first. The image is
    requested at the module-level ``(max_width, max_height)`` size; when
    ``iabook`` returns None for it, nothing is added.

    cover -- when true the image is stored under the id 'cover-image'
             instead of a per-leaf id.

    Returns ``(leaf_id, href)`` for the wrapper page, or ``(None, None)``
    when no image was available.
    """
    ebook.flush_els()

    image = iabook.get_page_image(i, (max_width, max_height))
    if image is None:
        return None, None

    padded = str(i).zfill(4)
    leaf_id = 'leaf' + padded
    leaf_image_id = 'cover-image' if cover else 'leaf-image' + padded
    image_href = 'images/' + leaf_image_id + '.jpg'
    page_href = leaf_id + '.html'

    # JPEG data is stored with deflate disabled.
    ebook.add_content(leaf_image_id, image_href, 'image/jpeg', image,
                      deflate=False)

    img_tag = E.img({'src': image_href, 'alt': 'leaf ' + str(i)})
    tree = make_html('leaf ' + padded, [img_tag])
    ebook.add_content(leaf_id, page_href, 'application/xhtml+xml',
                      common.tree_to_str(tree, xml_declaration=False))
    ebook.add_spine_item({'idref': leaf_id, 'linear': 'no'})
    return leaf_id, page_href
Ejemplo n.º 10
0
def make_page_map(page_items):
    """Build a ``page-map`` document and return it serialized.

    page_items -- iterable of dicts with 'name' and 'href' keys; one
                  ``page`` element is emitted per entry.

    Returns the tree serialized by ``common.tree_to_str``.
    """
    root = etree.Element('page-map',
                         xmlns='http://www.idpf.org/2007/opf')
    # Fixed: the loop previously iterated 'page_map_items', a name that is
    # not defined in this function, instead of the 'page_items' parameter.
    for item in page_items:
        etree.SubElement(root, 'page', name=item['name'], href=item['href'])
    return common.tree_to_str(root)
Ejemplo n.º 11
0
def process_book(iabook, ebook):
    """Walk the ABBYY OCR pages of ``iabook`` and feed them into ``ebook``.

    Pages are streamed with ``etree.iterparse`` and matched by index with
    the per-page scandata from ``iabook``. For each included page the
    scandata page type decides what is emitted:

    * cover / title / copyright / contents pages become page images
      (via ``make_html_page_image``) with navpoints and guide entries;
    * 'normal' pages before the title page become page images when they
      hold at least 10 characters of text;
    * other 'normal' pages are converted block by block: Picture blocks
      become cropped images, text blocks become ``<p>`` elements.

    Page numbers found in scandata produce page targets, and, when they
    appear in the table of contents returned by ``iabook.get_toc()``,
    chapter navpoints nested under a 'Chapters' navpoint.

    An unexpected child tag inside a block prints a message and exits the
    process with status -1. Nothing is returned.
    """
    aby_ns="{http://www.abbyy.com/FineReader_xml/FineReader6-schema-v1.xml}"
    scandata = iabook.get_scandata()

    scandata_ns = iabook.get_scandata_ns()
    bookData = iabook.get_bookdata()
    
    aby_file = iabook.get_abbyy()

    # some books no scanlog
#     scanLog = scandata.find(scandata_ns + 'scanLog')
#     if scanLog is None:
#         scanLog = scandata.scanLog

    contents = iabook.get_toc()
    metadata = iabook.get_metadata()
    # NOTE(review): title and author are computed here but not used
    # anywhere else in this function.
    title = common.get_metadata_tag_data(metadata, 'title')
    if title is None:
        title = 'none'
    author = common.get_metadata_tag_data(metadata, 'creator')
    if author is None:
        author = 'none'

    # Running state for the page loop below.
    cover_number = 0
    toc_item_number = 0
    picture_number = 0
    pushed_chapters = False
    made_contents_navpoint = False
    made_pages = False
    # Stream only the <page> elements of the ABBYY file.
    context = etree.iterparse(aby_file,
                              tag=aby_ns+'page',
                              resolve_entities=False)
    # Pre-scan the scandata to find out whether the book has a title page.
    found_title = False
    for page_scandata in iabook.get_scandata_pages(): #confirm title exists
        try:
            t = page_scandata.pageType.text.lower()
        except AttributeError:
            # Missing pageType (or missing text) is treated as 'normal'.
            t = 'normal'

        if t == 'title' or t == 'title page':
            found_title = True
            break
    # Starts True only when a title page exists; it is switched to False
    # when that title page is reached in the loop below. Without a title
    # page it is False from the start.
    before_title_page = found_title
    for i, (event, page) in enumerate(context):
        # wrap in try/finally to ensure page.clear() is called
        try:
            page_scandata = iabook.get_page_scandata(i)
            pageno = None
            if page_scandata is not None:
                pageno = page_scandata.find(scandata_ns + 'pageNumber')
                # NOTE(review): this tests the truthiness of the element
                # returned by find(); confirm it behaves as intended for
                # the element class in use.
                if pageno:
                    pageno = pageno.text
            if pageno:
                if contents is not None and pageno in contents:
                    # This page starts a table-of-contents entry: begin a
                    # new part and anchor a navpoint at it.
                    ebook.flush_els()
                    if not pushed_chapters:
                        # First TOC entry: open the enclosing 'Chapters'
                        # navpoint once.
                        cdiv = E.div({ 'class':'newnav', 'id':'chapters' })
                        href = ebook.add_el(cdiv) + '#' + 'chapters'
                        ebook.push_navpoint('Chapters', href)
                        pushed_chapters = True
                    id = 'toc-' + str(toc_item_number)
                    toc_item_number += 1
                    cdiv = E.div({ 'class':'newnav', 'id':id })
                    href = ebook.add_el(cdiv) + '#' + id
                    ebook.add_navpoint(contents[pageno], href)

                # Anchor and page target for every numbered page.
                id = 'page-' + pageno
                pdiv = E.div({ 'class':'newpage', 'id':id })
                href = ebook.add_el(pdiv) + '#' + id
                ebook.add_pagetarget(pageno, pageno, href)

            # A page is included only when its scandata exists and its
            # addToAccessFormats value is the text 'true'.
            def include_page(page_scandata):
                if page_scandata is None:
                    return False
                add = page_scandata.find(scandata_ns + 'addToAccessFormats')
                if add is None:
                    add = page_scandata.addToAccessFormats
                if add is not None and add.text == 'true':
                    return True
                else:
                    return False

            if not include_page(page_scandata):
                continue

            try:
                page_type = page_scandata.pageType.text.lower()
            except AttributeError:
                page_type = 'normal'

            if page_type == 'cover':
                if cover_number == 0:
                    cover_title = 'Front Cover'
                    front_cover = True
                else:
                    cover_title = 'Back Cover' ## xxx detect back page?
                    front_cover = False
                    # Any later cover closes the open part and the
                    # 'Chapters' navpoint, if one was pushed.
                    ebook.flush_els()
                    if pushed_chapters:
                        ebook.pop_navpoint()
                        pushed_chapters = False

                (id, filename) = make_html_page_image(i, iabook, ebook,
                                                      cover=front_cover)
                # id is None when no page image could be produced.
                if id is not None:
                    ebook.add_navpoint(cover_title, filename)
                    if cover_number == 0:
                        ebook.add_guide_item({ 'href':filename,
                                               'type':'cover',
                                               'title':cover_title })

                        # Add intro page after 1rst cover page
                        tree = make_html('Archive',
                             [E.p('This book made available by the Internet Archive.')])
                        ebook.add_content('intro', 'intro.html',
                                          'application/xhtml+xml',
                                          common.tree_to_str(tree,
                                                             xml_declaration=False))
                        ebook.add_spine_item({ 'idref':'intro', 'linear':'no' })
                    # The counter only advances when an image was added.
                    cover_number += 1

            elif page_type == 'title' or page_type == 'title page':
                # From here on, 'normal' pages are converted to text.
                before_title_page = False
                (id, filename) = make_html_page_image(i, iabook, ebook)
                if id is not None:
                    ebook.add_navpoint('Title Page', filename)
                    ebook.add_guide_item({ 'href':filename,
                                           'type':'title-page',
                                           'title':'Title Page' })
            elif page_type == 'copyright':
                (id, filename) = make_html_page_image(i, iabook, ebook)
                if id is not None:
                    ebook.add_navpoint('Copyright', filename)
                    # NOTE(review): the guide title is 'Title Page' for a
                    # copyright page; possibly a copy/paste leftover.
                    ebook.add_guide_item({ 'href':filename,
                                           'type':'copyright-page',
                                           'title':'Title Page' })
            elif page_type == 'contents':
                (id, filename) = make_html_page_image(i, iabook, ebook)
                if id is not None:
                    # Only the first contents page gets a navpoint; every
                    # contents page gets a guide entry.
                    if not made_contents_navpoint:
                        ebook.add_navpoint('Table of Contents', filename)
                        made_contents_navpoint = True
                    # NOTE(review): the guide title is 'Title Page' for a
                    # contents page; possibly a copy/paste leftover.
                    ebook.add_guide_item({ 'href':filename,
                                           'type':'toc',
                                           'title':'Title Page' })

            elif page_type == 'normal':
                if before_title_page:
                    # Front matter before the title page is kept as an
                    # image rather than converted to text.
                    page_text = etree.tostring(page,
                                               method='text',
                                               encoding=unicode)
                    # Skip if not much text
                    if len(page_text) >= 10:
                        (id, filename) = make_html_page_image(i, iabook, ebook)
                    # XXX note that above might return None, None and do nothing...
                else:
                    first_par = True
                    saw_pageno_header_footer = False

                    for block in page:
                        if block.get('blockType') == 'Picture':
                            # Picture block: crop the block's rectangle
                            # out of the page image.
                            region = ((int(block.get('l')),
                                       int(block.get('t'))),
                                      (int(block.get('r')),
                                       int(block.get('b'))))
                            (l, t), (r, b) = region
                            region_width = r - l
                            region_height = b - t
                            orig_page_size = (int(page.get('width')),
                                         int(page.get('height')))
                            page_width, page_height = orig_page_size

                            # Requested size is the block's share of the
                            # page, scaled to max_width / max_height.
                            # XXX bad aspect ratio!
                            # XXX need fixed code to get requested size
                            req_width = int(max_width *
                                            (region_width / float(page_width)))
                            req_height = int(max_height *
                                             (region_height / float(page_height)))
                            image = iabook.get_page_image(i,
                                                          (req_width, req_height),
                                                          orig_page_size,
                                                          kdu_reduce=2,
                                                          region=region)
                            if image is not None:
                                pic_id = 'picture' + str(picture_number)
                                pic_href = 'images/' + pic_id + '.jpg'
                                picture_number += 1
                                ebook.add_content(pic_id, pic_href,
                                                  'image/jpeg', image, deflate=False)
                                el = E.p({ 'class':'illus' },
                                         E.img(src=pic_href,
                                               alt=pic_id))
                                ebook.add_el(el)
                            # Picture blocks are never scanned for text.
                            continue
                        for el in block:
                            if el.tag == aby_ns+'region':
                                # Region rectangles are ignored.
                                for rect in el:
                                    pass
                            elif el.tag == aby_ns+'text':
                                for par in el:
                                    # skip if its the first line and it could be a header
                                    if first_par and common.par_is_pageno_header_footer(par):
                                        saw_pageno_header_footer = True
                                        first_par = False
                                        continue
                                    first_par = False

                                    # skip if it's the last par and it could be a header
                                    # (only when no header/footer was
                                    # already dropped on this page)
                                    if (not saw_pageno_header_footer
                                        and block == page[-1]
                                        and el == block[-1]
                                        and par == el[-1]
                                        and common.par_is_pageno_header_footer(par)):
                                        saw_pageno_header_footer = True
                                        continue

                                    # Join the paragraph's formatting runs
                                    # into one string. prev_line holds the
                                    # run not yet appended, so the end of
                                    # one run can be inspected when the
                                    # next one arrives.
                                    lines = []
                                    prev_line = ''
                                    for line in par:
                                        for fmt in line:
                                            fmt_text = etree.tostring(fmt,
                                                                  method='text',
                                                                  encoding=unicode)
                                            if len(fmt_text) > 0:
                                                if prev_line[-1:] == '-':
                                                    # Previous run ends in
                                                    # a hyphen: drop it if
                                                    # this run continues
                                                    # the same word.
                                                    if fmt[0].get('wordStart') == 'false':
                                                        # ? and wordFromDictionary = true ?
                                                        lines.append(prev_line[:-1])
                                                    else:
                                                        lines.append(prev_line)
                                                else:
                                                    # Otherwise runs are
                                                    # separated by a space.
                                                    lines.append(prev_line)
                                                    lines.append(' ')
                                                prev_line = fmt_text
                                    lines.append(prev_line)

                                    # On the first text paragraph of the
                                    # book, add a 'Pages' navpoint unless
                                    # a table of contents is available.
                                    if not made_pages:
                                        made_pages = True
                                        if not contents:
                                            href = ebook.add_el(E.div({ 'class':'pages', 'id':'pages' }))
                                            ebook.add_navpoint('Pages', href)
                                    to_add = ''.join(lines)
                                    ebook.add_el(E.p(to_add), len(to_add))
                            elif (el.tag == aby_ns+'row'):
                                # Rows are ignored.
                                pass
                            else:
                                # Any other tag is fatal.
                                print('unexpected tag type' + el.tag)
                                sys.exit(-1)
        finally:
            # Clear the parsed page element whether it was used or skipped.
            page.clear()

    # Write out whatever is still pending and close the 'Chapters'
    # navpoint if it is still open.
    ebook.flush_els()
    if pushed_chapters:
        ebook.pop_navpoint()
Ejemplo n.º 12
0
def process_book(iabook, ebook):
    """Walk the ABBYY OCR pages of ``iabook`` and feed them into ``ebook``.

    Pages are streamed with ``etree.iterparse`` and matched by index with
    the per-page scandata from ``iabook``. Cover, title, copyright and
    contents pages (and 'normal' pages that precede the title page) are
    added as page images through ``make_html_page_image``. The text of
    the remaining 'normal' pages is collected paragraph by paragraph and
    written out as HTML parts of a little over 100 paragraphs each.

    An unexpected child tag inside a block prints a message and exits the
    process with status -1. Nothing is returned.

    Fixes relative to the previous revision:
    * the paragraphs left over after the last page are now written out
      (the old ``> 100`` test could never be true at that point, so they
      were silently dropped);
    * the final part used the undefined name ``book`` instead of
      ``ebook``;
    * the title-page pre-scan now compares page types case-insensitively,
      like the main loop does;
    * ``page.clear()`` is also called for pages that are skipped.
    """
    aby_ns="{http://www.abbyy.com/FineReader_xml/FineReader6-schema-v1.xml}"
    scandata = iabook.get_scandata()
    metadata = objectify.parse(iabook.get_metadata_path()).getroot()
    aby_file = iabook.get_abbyy()

    bookData = scandata.find('bookData')
    # XXX should fix below and similar by ensuring that scandata is always the same fmt...
    # scandata.zip/scandata.xml parses different?
    if bookData is None:
        bookData = scandata.bookData

    # Lower-case roman numerals recognized as bare page-number headers.
    rnums = ('i', 'ii', 'iii', 'iv',
             'v', 'vi', 'vii', 'viii',
             'ix', 'x', 'xi', 'xii',
             'xiii', 'xiv', 'xv', 'xvi',
             'xvii', 'xviii', 'xix', 'xx',
             'xxi', 'xxii')

    def include_page(page_scandata):
        # A page is included only when its scandata exists and its
        # addToAccessFormats value is the text 'true'.
        if page_scandata is None:
            return False
        add = page_scandata.find('addToAccessFormats')
        if add is None:
            add = page_scandata.addToAccessFormats
        return add is not None and add.text == 'true'

    def par_is_header(par):
        # True when a single-line paragraph looks like a page-number
        # header: it contains a formatting run of at most 6 children that
        # is either entirely wordNumeric or spells a roman numeral.
        if len(par) != 1:
            return False
        for fmt in par[0]:
            if len(fmt) > 6:
                continue
            if all(cp.get('wordNumeric') == 'true' for cp in fmt):
                return True
            hdr_text = etree.tostring(fmt,
                                      method='text',
                                      encoding=unicode)
            if hdr_text in rnums:
                return True
        return False

    def par_to_text(par):
        # Join the paragraph's formatting runs into one string. prev_line
        # holds the run not yet appended, so its trailing hyphen can be
        # dropped when the next run continues the same word; otherwise
        # runs are separated by a space.
        lines = []
        prev_line = ''
        for line in par:
            for fmt in line:
                fmt_text = etree.tostring(fmt,
                                          method='text',
                                          encoding=unicode)
                if len(fmt_text) > 0:
                    if prev_line[-1:] == '-':
                        if fmt[0].get('wordStart') == 'false':
                            # ? and wordFromDictionary = true ?
                            lines.append(prev_line[:-1])
                        else:
                            lines.append(prev_line)
                    else:
                        lines.append(prev_line)
                        lines.append(' ')
                    prev_line = fmt_text
        lines.append(prev_line)
        return ''.join(lines)

    def emit_part(part_paragraphs, number, page_index=None):
        # Write one HTML part holding part_paragraphs. The page-map entry
        # is only added when a page index is given; part 0 is also
        # registered as the guide 'text' entry and the 'Pages' navpoint.
        part_str = 'part' + str(number).zfill(4)
        part_str_href = part_str + '.html'
        tree = make_html('sample title', part_paragraphs)
        ebook.add_content({ 'id':part_str,
                            'href':part_str_href,
                            'media-type':'application/xhtml+xml' },
                          common.tree_to_str(tree, xml_declaration=False))
        ebook.add_spine_item({ 'idref':part_str })
        if page_index is not None:
            ebook.add_page_map_item(page_index, part_str_href)
        if number == 0:
            ebook.add_guide_item( { 'href':part_str_href,
                                    'type':'text',
                                    'title':'Book' } )
            ebook.add_navpoint({ 'text':'Pages',
                                 'content':part_str_href })

    # some books no scanlog
#     scanLog = scandata.find('scanLog')
#     if scanLog is None:
#         scanLog = scandata.scanLog

    paragraphs = []
    part_number = 0
    cover_number = 0
    context = etree.iterparse(aby_file,
                              tag=aby_ns+'page',
                              resolve_entities=False)

    # Pre-scan the scandata to find out whether the book has a title page.
    found_title = False
    for page_scandata in iabook.get_scandata_pages(): #confirm title exists
        t = (page_scandata.pageType.text or '').lower()
        if t == 'title' or t == 'title page':
            found_title = True
            break
    # Starts True only when a title page exists; it is switched to False
    # once that title page is reached below.
    before_title_page = found_title

    for i, (event, page) in enumerate(context):
        # try/finally so the parsed page is cleared even when skipped.
        try:
            page_scandata = iabook.get_page_scandata(i)
            if not include_page(page_scandata):
                continue
            page_type = page_scandata.pageType.text.lower()
            if page_type == 'cover':
                (leaf_id, filename) = make_html_page_image(i, iabook, ebook)
                if cover_number == 0:
                    cover_title = 'Front Cover'
                else:
                    cover_title = 'Back Cover' ## xxx detect back page?
                ebook.add_navpoint( { 'text':cover_title, 'content':filename } )
                if cover_number == 0:
                    ebook.add_guide_item( { 'href':filename,
                                            'type':'cover',
                                            'title':cover_title } )
                    ebook.add_cover_id(leaf_id)

                    # Add intro page after 1rst cover page
                    tree = make_html('Archive',
                         [E.p('This book made available by the Internet Archive.')])
                    ebook.add_content({ 'id':'intro',
                                        'href':'intro.html',
                                        'media-type':'application/xhtml+xml' },
                                      common.tree_to_str(tree,
                                                         xml_declaration=False))
                    ebook.add_spine_item({ 'idref':'intro' })

                cover_number += 1

            elif page_type == 'title' or page_type == 'title page':
                # From here on, 'normal' pages are converted to text.
                before_title_page = False
                (leaf_id, filename) = make_html_page_image(i, iabook, ebook)
                ebook.add_navpoint( { 'text':'Title Page', 'content':filename } )
                ebook.add_guide_item( { 'href':filename,
                                        'type':'title-page',
                                        'title':'Title Page' } )
            elif page_type == 'copyright':
                (leaf_id, filename) = make_html_page_image(i, iabook, ebook)
                ebook.add_navpoint( { 'text':'Copyright', 'content':filename } )
                # NOTE(review): guide title is 'Title Page' for a copyright
                # page; possibly a copy/paste leftover.
                ebook.add_guide_item( { 'href':filename,
                                        'type':'copyright-page',
                                        'title':'Title Page' } )
            elif page_type == 'contents':
                (leaf_id, filename) = make_html_page_image(i, iabook, ebook)
                ebook.add_navpoint( { 'text':'Contents', 'content':filename } )
                # NOTE(review): guide title is 'Title Page' for a contents
                # page; possibly a copy/paste leftover.
                ebook.add_guide_item( { 'href':filename,
                                        'type':'toc',
                                        'title':'Title Page' } )
            elif page_type == 'normal':
                if before_title_page:
                    # XXX consider skipping if blank + no words?
                    # make page image
                    (leaf_id, filename) = make_html_page_image(i, iabook, ebook)
                else:
                    first_par = True
                    for block in page:
                        for el in block:
                            if el.tag == aby_ns+'region':
                                # Region rectangles are ignored.
                                pass
                            elif el.tag == aby_ns+'text':
                                for par in el:
                                    # Drop a page-number header when it is
                                    # the first paragraph on the page.
                                    if first_par and par_is_header(par):
                                        first_par = False
                                        continue
                                    first_par = False
                                    paragraphs.append(E.p(par_to_text(par)))
                            elif (el.tag == aby_ns+'row'):
                                # Rows are ignored.
                                pass
                            else:
                                print('unexpected tag type' + el.tag)
                                sys.exit(-1)
        finally:
            page.clear()

        if len(paragraphs) > 100:
            # make a chunk! The page-map entry records the index of the
            # page following the one just processed.
            emit_part(paragraphs, part_number, page_index=i + 1)
            part_number += 1
            paragraphs = []

    # make chunk from last paragraphs
    if len(paragraphs) > 0:
        emit_part(paragraphs, part_number)