def _climb_and_split(src, dest, chapters):
    """Recursively copy the children of src into dest, starting a new
    chapter tree whenever a marker is met.

    A marker is an <hr> whose class attribute equals MARKER_CLASS.
    Markers themselves are not copied.

    src is the element whose children are being walked, dest is the
    element currently receiving the copies, and chapters is a list
    that gets the root of a new tree appended for every marker found.

    Returns the element that is the destination once all the children
    of src have been handled.  This differs from the dest passed in
    if a marker was found at this level or below."""
    for node in src.iterchildren():
        is_marker = node.tag == 'hr' and node.get('class') == MARKER_CLASS

        if not is_marker:
            # Ordinary child: copy it under dest, then descend.
            clone = copy_element(node, dest.makeelement)
            clone.text = node.text
            dest.append(clone)
            deepest = _climb_and_split(node, clone, chapters)
            if deepest != clone:
                # A split happened somewhere below, so copying
                # continues in the new tree, at this level.
                dest = deepest.getparent()
            continue

        log('got a marker')
        fresh = copy_element(src, lxml.html.Element)

        # Rebuild the chain of ancestors above the fresh copy, so the
        # new chapter has the same nesting as the current position.
        top = fresh
        for ancestor in src.iterancestors():
            wrapper = copy_element(ancestor, top.makeelement)
            wrapper.append(top)
            top = wrapper
        chapters.append(top)

        # The finished tree ends here: drop any tail along its spine.
        dest.tail = None
        for ancestor in dest.iterancestors():
            ancestor.tail = None

        # From now on, copies go into the new tree.
        dest = fresh

    return dest
def jostle_markers(root):
    """If a marker is not separating block level elements, try to move
    it out until it is, without completely ruining everything.

    root is the lxml element to search.  Every <hr> below it whose
    class is MARKER_CLASS is repeatedly moved to just after its parent
    until one of these holds:

     * its parent is <html> or <body>;
     * its parent is an ancestor (as recorded before any moves) of
       the previous or next marker, so two markers never end up in
       the same position;
     * its parent is one of INESCAPABLE_TAGS and the marker is not the
       last thing inside it;
     * it has no parent, or its parent is the root of the tree, so
       there is nowhere further out to go.

    The tree is modified in place; nothing is returned."""
    # Record each marker with the set of its ancestors as they are
    # before anything is moved.
    stacks = []
    for hr in root.iter(tag='hr'):
        if hr.get('class') == MARKER_CLASS:
            stacks.append((hr, frozenset(hr.iterancestors())))

    last = len(stacks) - 1
    for i, (hr, _ancestors) in enumerate(stacks):
        while True:
            parent = hr.getparent()
            log('i is %s hr is %s, parent is %s' % (i, hr, parent))
            if parent is None:
                #the marker is the root element itself; it can't move.
                log('hit root')
                break
            if parent.tag in ('html', 'body'):
                log('hit body')
                break
            #don't allow two stacks to merge
            if ((i > 0 and parent in stacks[i - 1][1]) or
                (i < last and parent in stacks[i + 1][1])):
                log('hit neighbour')
                break
            #unless hr is right before the closing tag, don't jump
            #out of div, center or blockquote.
            if (parent.tag in INESCAPABLE_TAGS and
                not (hr.getnext() is None and not hr.tail)):
                log('hit %s' % parent.tag)
                break
            if parent.getparent() is None:
                #parent is the root of a tree that is not html or
                #body (e.g. a parsed fragment).  lxml only accepts
                #comments and processing instructions as siblings of
                #the root, so addnext would raise here.
                log('hit root')
                break
            parent.addnext(hr)
def split_file(fn, splitter):
    """Split the HTML file fn into chapters, writing the results
    under /tmp.

    The number of splits is the larger of the compressed size divided
    by config.EPUB_COMPRESSED_SIZE_MAX and the raw length divided by
    config.EPUB_FILE_SIZE_MAX.  If that comes to zero, nothing more is
    done.

    Otherwise a marker <hr> is inserted before the first '<' found at
    or after each evenly spaced target offset, the result is parsed,
    and jostle_markers() shifts the markers to safer positions.  The
    marked-up document is written to /tmp/marked.html.

    splitter is then called with the parsed tree and must return a
    (chapters, name) pair.  Each chapter is serialised to
    /tmp/<name>_<n>.html, with n counting from 1.  The time taken by
    splitter is printed.

    Nothing is returned."""
    with open(fn) as f:
        html = f.read()

    compressed_size = get_compressed_size(html)
    splits = max(compressed_size // config.EPUB_COMPRESSED_SIZE_MAX,
                 len(html) // config.EPUB_FILE_SIZE_MAX)
    log("uncompressed: %s, compressed: %s, splits: %s" %
        (len(html), compressed_size, splits))

    if not splits:
        return

    target = len(html) // (splits + 1)
    s = 0
    fragments = []
    for i in range(splits):
        e = html.find('<', target * (i + 1))
        if e == -1:
            # No tag starts beyond this offset (nor, therefore, beyond
            # any later one).  Using -1 as a slice index would put the
            # marker just before the last character of the document,
            # so stop placing markers instead.
            break
        fragments.append(html[s:e])
        fragments.append('<hr class="%s" id="split_%s" />' % (MARKER_CLASS, i))
        s = e
    fragments.append(html[s:])
    log([len(x) for x in fragments])

    tree = lxml.html.fromstring(''.join(fragments))
    jostle_markers(tree)

    # etree.tostring with an encoding gives encoded bytes, hence the
    # binary mode for the output files.
    html2 = etree.tostring(tree, encoding='UTF-8', method='html')
    with open('/tmp/marked.html', 'wb') as f:
        f.write(html2)

    t = time.time()
    chapters, name = splitter(tree)
    # A single parenthesised argument prints the same thing whether
    # print is a statement or a function.
    print("%s took %s" % (splitter, time.time() - t))
    log(chapters)

    for i, c in enumerate(chapters):
        with open('/tmp/%s_%s.html' % (name, i + 1), 'wb') as f:
            f.write(etree.tostring(c, encoding='UTF-8', method='html'))