def main():
    input_assembled_file = Path(sys.argv[1]).resolve(strict=True)
    uuid_to_revised_path = Path(sys.argv[2]).resolve(strict=True)
    output_file_path = sys.argv[3]

    with open(uuid_to_revised_path, 'r') as f:
        uuid_to_revised_map = json.load(f)

    json_data = {}
    with open(input_assembled_file, "r") as in_file:
        binder = reconstitute(in_file)

    for doc in flatten_to_documents(binder):
        abstract = doc.metadata.get("summary")
        # Use the revised value from the map if available, otherwise expect it
        # from the metadata parsed from the assembled XHTML
        revised = uuid_to_revised_map.get(doc.id) or doc.metadata["revised"]
        json_data[doc.ident_hash] = {
            "abstract": abstract,
            "revised": utils.ensure_isoformat(revised)
        }

    with open(output_file_path, "w") as out_file:
        json.dump(json_data, out_file)
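# A minimal sketch of the `utils.ensure_isoformat` helper referenced above,
# which is not shown in these excerpts. Assumption: it accepts a datetime or
# a timestamp string and normalizes it to a timezone-aware ISO 8601 string.
# Illustrative only, not the repo's actual implementation.
from datetime import datetime, timezone

def ensure_isoformat(value):
    """Normalize a datetime or timestamp string to ISO 8601 (sketch)."""
    if isinstance(value, datetime):
        dt = value
    else:
        # fromisoformat covers strings that are already ISO 8601
        dt = datetime.fromisoformat(str(value))
    if dt.tzinfo is None:
        # Assume naive timestamps are UTC
        dt = dt.replace(tzinfo=timezone.utc)
    return dt.isoformat()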
def parse_collection_binders(input_dir):
    """Create a list of binders from book collections"""
    baked_collections = Path(input_dir).glob("*.baked.xhtml")
    binders = []
    for baked_collection in baked_collections:
        with open(baked_collection, "r") as baked_file:
            binder = reconstitute(baked_file)
            binders.append(binder)
    return binders
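# Example usage (the directory path is hypothetical): each returned binder
# is a cnx-epub model whose ident_hash identifies the collection.
binders = parse_collection_binders("/data/books")
for binder in binders:
    print(binder.ident_hash)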
def main(argv=None): """Parse passed in cooked single HTML.""" parser = argparse.ArgumentParser(description=__doc__) parser.add_argument('collated_html', type=argparse.FileType('r'), help='Path to the collated html' ' file (use - for stdin)') parser.add_argument('-d', '--dump-tree', action='store_true', help='Print out parsed model tree.') parser.add_argument('-o', '--output', type=argparse.FileType('w+'), help='Write out epub of parsed tree.') parser.add_argument('-i', '--input', type=argparse.FileType('r'), help='Read and copy resources/ for output epub.') args = parser.parse_args(argv) if args.input and args.output == sys.stdout: raise ValueError('Cannot output to stdout if reading resources') from cnxepub.collation import reconstitute binder = reconstitute(args.collated_html) if args.dump_tree: print(pformat(cnxepub.model_to_tree(binder)), file=sys.stdout) if args.output: cnxepub.adapters.make_epub(binder, args.output) if args.input: args.output.seek(0) zout = ZipFile(args.output, 'a', ZIP_DEFLATED) zin = ZipFile(args.input, 'r') for res in zin.namelist(): if res.startswith('resources'): zres = zin.open(res) zi = zin.getinfo(res) zout.writestr(zi, zres.read(), ZIP_DEFLATED) zout.close() # TODO Check for documents that have no identifier. # These should likely be composite-documents # or the the metadata got wiped out. # docs = [x for x in cnxepub.flatten_to(binder, only_documents_filter) # if x.ident_hash is None] return 0
def main(): """Main function""" in_dir = Path(sys.argv[1]).resolve(strict=True) out_dir = (in_dir / "disassembled").resolve(strict=True) baked_file = (in_dir / "collection.baked.xhtml").resolve(strict=True) baked_metdata_file = (in_dir / "collection.baked-metadata.json").resolve( strict=True) with open(baked_file, "rb") as file: html_root = etree.parse(file) binder = reconstitute(file) # It's important that we generate slug metadata in parallel with disassemble # so that where ident_hash values are based upon potentially randomly # generated UUIDs we can still use them as unique keys in JSON outputs # without diverging slugs = extract_slugs_from_binder(binder) nav = html_root.xpath("//xhtml:nav", namespaces=HTML_DOCUMENT_NAMESPACES)[0] toc_maker = ElementMaker(namespace=None, nsmap={None: "http://www.w3.org/1999/xhtml"}) toc = toc_maker.html(E.head(E.title("Table of Contents")), E.body(nav)) with open(f"{out_dir}/collection.toc.xhtml", "wb") as out: out.write(etree.tostring(toc, encoding="utf8", pretty_print=True)) with open(baked_metdata_file, "r") as baked_json: baked_metadata = json.load(baked_json) for doc in flatten_to(binder, lambda d: isinstance(d, Document)): with open(f"{out_dir / doc.ident_hash}.xhtml", "wb") as out: out.write(bytes(DocumentContentFormatter(doc))) with open(f"{out_dir / doc.ident_hash}-metadata.json", "w") as json_out: # Incorporate metadata from disassemble step while setting defaults # for cases like composite pages which may not have metadata from # previous stages json_metadata = { "slug": slugs.get(doc.ident_hash), "title": doc.metadata.get("title"), "abstract": None } # Add / override metadata from baking if available json_metadata.update(baked_metadata.get(doc.ident_hash, {})) json.dump(json_metadata, json_out)
def main(): """Main function""" xhtml_file = Path(sys.argv[1]).resolve(strict=True) metadata_file = Path(sys.argv[2]).resolve(strict=True) book_slug = sys.argv[3] out_dir = Path(sys.argv[4]) with open(xhtml_file, "rb") as file: html_root = etree.parse(file) binder = reconstitute(file) slugs = extract_slugs_from_binder(binder) with open(metadata_file, "r") as baked_json: baked_metadata = json.load(baked_json) book_toc_metadata = baked_metadata.get(binder.ident_hash) nav = html_root.xpath("//xhtml:nav", namespaces=HTML_DOCUMENT_NAMESPACES)[0] toc_maker = ElementMaker(namespace=None, nsmap={None: "http://www.w3.org/1999/xhtml"}) toc = toc_maker.html(E.head(E.title("Table of Contents")), E.body(nav)) nav_links = toc.xpath("//xhtml:a", namespaces=HTML_DOCUMENT_NAMESPACES) for doc in flatten_to_documents(binder): id_with_context = f'{binder.ident_hash}:{doc.id}' module_etree = content_to_etree(doc.content) for link in nav_links: link_href = link.attrib['href'] if not link_href.startswith('#'): continue if module_etree.xpath( f"/xhtml:body/xhtml:div[@id='{link_href[1:]}']", namespaces=HTML_DOCUMENT_NAMESPACES): link.attrib['href'] = f'./{id_with_context}.xhtml' # Add metadata to same-book-different-module links. # The module in which same-book link targets reside is only fully known # at time of disassembly. Different pipelines can make use of this # metadata in different ways for node in module_etree.xpath( '//xhtml:a[@href and starts-with(@href, "/contents/")]', namespaces=HTML_DOCUMENT_NAMESPACES): print('BEFORE:') print(node.attrib) page_link = node.attrib["href"].split("/")[-1] # Link may have fragment if "#" in page_link: page_uuid, page_fragment = page_link.split("#") else: page_uuid = page_link page_fragment = '' # This is either an intra-book link or inter-book link. We can # differentiate the latter by data-book-uuid attrib). 
if not node.attrib.get("data-book-uuid"): node.attrib["data-page-slug"] = slugs.get(page_uuid) node.attrib["data-page-uuid"] = page_uuid node.attrib["data-page-fragment"] = page_fragment print('AFTER:') print(node.attrib) doc.content = etree_to_content(module_etree) # Inject some styling and JS for QA xml_parser = etree.XMLParser(ns_clean=True) root = etree.XML(bytes(DocumentContentFormatter(doc)), xml_parser) head = root.xpath("//xhtml:head", namespaces=HTML_DOCUMENT_NAMESPACES) if not head: head = etree.Element("head") root.insert(0, head) style = etree.Element("style") script = etree.Element("script") style.text = u''' /* STYLING_FOR_DEVS */ /* Linking to a specific element should highlight the element */ :target { background-color: #ffffcc; border: 1px dotted #000000; animation-name: cssAnimation; animation-duration: 10s; animation-timing-function: ease-out; animation-delay: 0s; animation-fill-mode: forwards; } @keyframes cssAnimation { to { background-color: initial; border: initial; } } /* Style footnotes so that they stand out */ [role="doc-footnote"] { background-color: #ffcccc; border: 1px dashed #ff0000; } [role="doc-footnote"]:before { content: "FOOTNOTE " ; } /* Show a permalink when hovering over a heading or paragraph */ *:not(:hover) > a.-dev-permalinker { display: none; } * > a.-dev-permalinker { margin-left: .1rem; text-decoration: none; } ''' script.text = u'''//<![CDATA[ // SCRIPTS_FOR_DEVS window.addEventListener('load', () => { const pilcrow = '¶' function addPermalink(parent, id) { const link = window.document.createElement('a') link.classList.add('-dev-permalinker') link.setAttribute('href', '#' + id) link.textContent = pilcrow parent.appendChild(link) } const paragraphs = Array.from( document.querySelectorAll('p[id]') ) paragraphs.forEach(p => addPermalink(p, p.getAttribute('id')) ) const headings = Array.from( document.querySelectorAll( '*[id] > h1, *[id] > h2, *[id] > h3, ' + '*[id] > h4, *[id] > h5, *[id] > h6' ) ) headings.forEach(h => addPermalink( h, h.parentElement.getAttribute('id')) ) }) // ]]>''' head.append(style) head.append(script) with open(f"{out_dir / id_with_context}.xhtml", "wb") as out: out.write(etree.tostring(root)) with open(f"{out_dir / id_with_context}-metadata.json", "w") as json_out: # Incorporate metadata from disassemble step while setting defaults # for cases like composite pages which may not have metadata from # previous stages json_metadata = { "slug": slugs.get(doc.id), "title": doc.metadata.get("title"), "abstract": None, "id": doc.id, "revised": datetime.now(timezone.utc).isoformat() } # Add / override metadata from baking if available json_metadata.update(baked_metadata.get(doc.ident_hash, {})) json.dump(json_metadata, json_out) with open(f"{out_dir}/{book_slug}.toc.xhtml", "wb") as out: out.write(etree.tostring(toc, encoding="utf8", pretty_print=True)) with open(f"{out_dir}/{book_slug}.toc-metadata.json", "w") as toc_json: json.dump(book_toc_metadata, toc_json)
import sys
import json

from cnxepub.collation import reconstitute
from cnxepub.models import flatten_to_documents

in_path, out_path = sys.argv[1:3]
json_data = {}

with open(in_path, "r") as in_file:
    binder = reconstitute(in_file)

for doc in flatten_to_documents(binder):
    abstract = doc.metadata.get("summary")
    json_data[doc.ident_hash] = {"abstract": abstract}

with open(out_path, "w") as out_file:
    json.dump(json_data, out_file)
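# Example invocation (script and file names are hypothetical): reads an
# assembled XHTML file and writes a JSON map of ident_hash -> abstract.
#
#   python abstracts.py collection.assembled.xhtml abstracts.json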
def test_html(self):
    page_path = os.path.join(TEST_DATA_DIR, 'desserts-single-page.html')
    with open(page_path) as html:
        from cnxepub.collation import reconstitute
        desserts = reconstitute(html)
    self.check_desserts(desserts)