import json
import sys
from pathlib import Path

from cnxepub.collation import reconstitute
from cnxepub.models import flatten_to_documents

import utils  # project-local module assumed to provide ensure_isoformat


def main():
    input_assembled_file = Path(sys.argv[1]).resolve(strict=True)
    uuid_to_revised_path = Path(sys.argv[2]).resolve(strict=True)
    output_file_path = sys.argv[3]

    with open(uuid_to_revised_path, 'r') as f:
        uuid_to_revised_map = json.load(f)

    json_data = {}

    with open(input_assembled_file, "r") as in_file:
        binder = reconstitute(in_file)

    for doc in flatten_to_documents(binder):
        abstract = doc.metadata.get("summary")
        # Prefer the revised value from the map when available; otherwise
        # expect it in the metadata parsed from the assembled XHTML
        revised = uuid_to_revised_map.get(doc.id) or doc.metadata["revised"]
        json_data[doc.ident_hash] = {
            "abstract": abstract,
            "revised": utils.ensure_isoformat(revised)
        }

    with open(output_file_path, "w") as out_file:
        json.dump(json_data, out_file)
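
For context, here is a minimal sketch of what the utils.ensure_isoformat helper used above is assumed to do (the real project helper may differ): normalize a revised timestamp, whether a datetime or an ISO 8601 string, into a canonical ISO 8601 string.

from datetime import datetime

def ensure_isoformat(value):
    """Return an ISO 8601 string for a datetime or an ISO-formatted string."""
    if isinstance(value, datetime):
        return value.isoformat()
    # Round-trip strings through fromisoformat to validate and normalize them.
    return datetime.fromisoformat(value).isoformat()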
from pathlib import Path

from cnxepub.collation import reconstitute


def parse_collection_binders(input_dir):
    """Create a list of binders from book collections."""
    baked_collections = Path(input_dir).glob("*.baked.xhtml")
    binders = []

    for baked_collection in baked_collections:
        with open(baked_collection, "r") as baked_file:
            binder = reconstitute(baked_file)
            binders.append(binder)

    return binders
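
A usage sketch (the directory path is illustrative): build a binder for every baked collection in a directory and index the results by ident_hash.

binders = parse_collection_binders("/data/baked-books")
binders_by_ident = {binder.ident_hash: binder for binder in binders}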
Example #3
import argparse
import sys
from pprint import pformat
from zipfile import ZIP_DEFLATED, ZipFile

import cnxepub


def main(argv=None):
    """Parse passed in cooked single HTML."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('collated_html',
                        type=argparse.FileType('r'),
                        help='Path to the collated html'
                        ' file (use - for stdin)')
    parser.add_argument('-d',
                        '--dump-tree',
                        action='store_true',
                        help='Print out parsed model tree.')

    parser.add_argument('-o',
                        '--output',
                        type=argparse.FileType('w+'),
                        help='Write out epub of parsed tree.')

    parser.add_argument('-i',
                        '--input',
                        type=argparse.FileType('r'),
                        help='Read and copy resources/ for output epub.')

    args = parser.parse_args(argv)

    if args.input and args.output == sys.stdout:
        raise ValueError('Cannot output to stdout if reading resources')

    from cnxepub.collation import reconstitute
    binder = reconstitute(args.collated_html)

    if args.dump_tree:
        print(pformat(cnxepub.model_to_tree(binder)), file=sys.stdout)
    if args.output:
        cnxepub.adapters.make_epub(binder, args.output)

    if args.input:
        args.output.seek(0)
        zout = ZipFile(args.output, 'a', ZIP_DEFLATED)
        zin = ZipFile(args.input, 'r')
        for res in zin.namelist():
            if res.startswith('resources'):
                zres = zin.open(res)
                zi = zin.getinfo(res)
                zout.writestr(zi, zres.read(), ZIP_DEFLATED)
        zout.close()

    # TODO Check for documents that have no identifier.
    #      These should likely be composite-documents
    #      or the metadata got wiped out.
    # docs = [x for x in cnxepub.flatten_to(binder, only_documents_filter)
    #         if x.ident_hash is None]

    return 0
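
Example invocations of this CLI (the script and file names are illustrative):

# Dump the parsed model tree:
#   python parse_collated.py collection.collated.xhtml --dump-tree
# Build an epub, copying resources/ from an existing zip:
#   python parse_collated.py collection.collated.xhtml -i resources.zip -o book.epub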
import json
import sys
from pathlib import Path

from lxml import etree
from lxml.builder import E, ElementMaker

from cnxepub.collation import reconstitute
from cnxepub.formatters import DocumentContentFormatter
from cnxepub.html_parsers import HTML_DOCUMENT_NAMESPACES
from cnxepub.models import Document, flatten_to

# extract_slugs_from_binder is assumed to be a project-local helper; see
# the sketch after this example.


def main():
    """Main function"""
    in_dir = Path(sys.argv[1]).resolve(strict=True)
    out_dir = (in_dir / "disassembled").resolve(strict=True)
    baked_file = (in_dir / "collection.baked.xhtml").resolve(strict=True)
    baked_metadata_file = (in_dir / "collection.baked-metadata.json").resolve(
        strict=True)

    with open(baked_file, "rb") as file:
        html_root = etree.parse(file)
        # Rewind so reconstitute() can re-read the same file object
        file.seek(0)
        binder = reconstitute(file)

        # It's important to generate slug metadata in the same pass as
        # disassembly so that ident_hash values based upon potentially
        # randomly generated UUIDs can still be used as unique keys in
        # JSON outputs without diverging
        slugs = extract_slugs_from_binder(binder)

    nav = html_root.xpath("//xhtml:nav",
                          namespaces=HTML_DOCUMENT_NAMESPACES)[0]

    toc_maker = ElementMaker(namespace=None,
                             nsmap={None: "http://www.w3.org/1999/xhtml"})
    toc = toc_maker.html(E.head(E.title("Table of Contents")), E.body(nav))

    with open(f"{out_dir}/collection.toc.xhtml", "wb") as out:
        out.write(etree.tostring(toc, encoding="utf8", pretty_print=True))

    with open(baked_metadata_file, "r") as baked_json:
        baked_metadata = json.load(baked_json)

    for doc in flatten_to(binder, lambda d: isinstance(d, Document)):
        with open(f"{out_dir / doc.ident_hash}.xhtml", "wb") as out:
            out.write(bytes(DocumentContentFormatter(doc)))

        with open(f"{out_dir / doc.ident_hash}-metadata.json",
                  "w") as json_out:
            # Incorporate metadata from disassemble step while setting defaults
            # for cases like composite pages which may not have metadata from
            # previous stages
            json_metadata = {
                "slug": slugs.get(doc.ident_hash),
                "title": doc.metadata.get("title"),
                "abstract": None
            }

            # Add / override metadata from baking if available
            json_metadata.update(baked_metadata.get(doc.ident_hash, {}))

            json.dump(json_metadata, json_out)
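
A rough sketch of the project-local extract_slugs_from_binder helper used above (the traversal and the "slug" metadata key are assumptions, not the canonical implementation):

from cnxepub.models import flatten_to

def extract_slugs_from_binder(binder):
    # Map each model's ident_hash to the slug recorded in its metadata,
    # where one is present.
    return {
        model.ident_hash: model.metadata.get("slug")
        for model in flatten_to(binder, lambda m: hasattr(m, "metadata"))
        if getattr(model, "ident_hash", None)
    }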
import json
import sys
from datetime import datetime, timezone
from pathlib import Path

from lxml import etree
from lxml.builder import E, ElementMaker

from cnxepub.collation import reconstitute
from cnxepub.formatters import DocumentContentFormatter
from cnxepub.html_parsers import HTML_DOCUMENT_NAMESPACES
from cnxepub.models import flatten_to_documents

# content_to_etree, etree_to_content, and extract_slugs_from_binder are
# assumed to be project-local helpers.


def main():
    """Main function"""
    xhtml_file = Path(sys.argv[1]).resolve(strict=True)
    metadata_file = Path(sys.argv[2]).resolve(strict=True)
    book_slug = sys.argv[3]
    out_dir = Path(sys.argv[4])

    with open(xhtml_file, "rb") as file:
        html_root = etree.parse(file)
        # Rewind so reconstitute() can re-read the same file object
        file.seek(0)
        binder = reconstitute(file)
        slugs = extract_slugs_from_binder(binder)

    with open(metadata_file, "r") as baked_json:
        baked_metadata = json.load(baked_json)
        book_toc_metadata = baked_metadata.get(binder.ident_hash)

    nav = html_root.xpath("//xhtml:nav",
                          namespaces=HTML_DOCUMENT_NAMESPACES)[0]

    toc_maker = ElementMaker(namespace=None,
                             nsmap={None: "http://www.w3.org/1999/xhtml"})
    toc = toc_maker.html(E.head(E.title("Table of Contents")), E.body(nav))

    nav_links = toc.xpath("//xhtml:a", namespaces=HTML_DOCUMENT_NAMESPACES)

    for doc in flatten_to_documents(binder):
        id_with_context = f'{binder.ident_hash}:{doc.id}'

        module_etree = content_to_etree(doc.content)
        for link in nav_links:
            link_href = link.attrib['href']
            if not link_href.startswith('#'):
                continue
            if module_etree.xpath(
                    f"/xhtml:body/xhtml:div[@id='{link_href[1:]}']",
                    namespaces=HTML_DOCUMENT_NAMESPACES):
                link.attrib['href'] = f'./{id_with_context}.xhtml'

        # Add metadata to same-book, different-module links. The module in
        # which a same-book link target resides is only fully known at
        # disassembly time; downstream pipelines can use this metadata in
        # different ways.
        for node in module_etree.xpath(
                '//xhtml:a[@href and starts-with(@href, "/contents/")]',
                namespaces=HTML_DOCUMENT_NAMESPACES):
            print('BEFORE:')
            print(node.attrib)

            page_link = node.attrib["href"].split("/")[-1]
            # Link may have fragment
            if "#" in page_link:
                page_uuid, page_fragment = page_link.split("#")
            else:
                page_uuid = page_link
                page_fragment = ''

            # This is either an intra-book or an inter-book link. We can
            # differentiate the latter by its data-book-uuid attribute.
            if not node.attrib.get("data-book-uuid"):
                node.attrib["data-page-slug"] = slugs.get(page_uuid)
                node.attrib["data-page-uuid"] = page_uuid
                node.attrib["data-page-fragment"] = page_fragment

            print('AFTER:')
            print(node.attrib)

        doc.content = etree_to_content(module_etree)

        # Inject some styling and JS for QA
        xml_parser = etree.XMLParser(ns_clean=True)
        root = etree.XML(bytes(DocumentContentFormatter(doc)), xml_parser)
        # xpath() returns a list; unwrap the element so the append calls
        # below modify the tree rather than a Python list
        head_matches = root.xpath("//xhtml:head",
                                  namespaces=HTML_DOCUMENT_NAMESPACES)
        if head_matches:
            head = head_matches[0]
        else:
            head = etree.Element("head")
            root.insert(0, head)

        style = etree.Element("style")
        script = etree.Element("script")

        style.text = u'''
            /* STYLING_FOR_DEVS */
            /* Linking to a specific element should highlight the element */
            :target {
                background-color: #ffffcc;
                border: 1px dotted #000000;

                animation-name: cssAnimation;
                animation-duration: 10s;
                animation-timing-function: ease-out;
                animation-delay: 0s;
                animation-fill-mode: forwards;
            }
            @keyframes cssAnimation {
                to {
                    background-color: initial;
                    border: initial;
                }
            }

            /* Style footnotes so that they stand out */
            [role="doc-footnote"] {
                background-color: #ffcccc;
                border: 1px dashed #ff0000;
            }
            [role="doc-footnote"]:before { content: "FOOTNOTE " ; }

            /* Show a permalink when hovering over a heading or paragraph */
            *:not(:hover) > a.-dev-permalinker { display: none; }
            * > a.-dev-permalinker {
                margin-left: .1rem;
                text-decoration: none;
            }
        '''

        script.text = u'''//<![CDATA[
            // SCRIPTS_FOR_DEVS
            window.addEventListener('load', () => {
                const pilcrow = '¶'

                function addPermalink(parent, id) {
                    const link = window.document.createElement('a')
                    link.classList.add('-dev-permalinker')
                    link.setAttribute('href', '#' + id)
                    link.textContent = pilcrow
                    parent.appendChild(link)
                }

                const paragraphs = Array.from(
                    document.querySelectorAll('p[id]')
                )
                paragraphs.forEach(p => addPermalink(p, p.getAttribute('id')) )

                const headings = Array.from(
                    document.querySelectorAll(
                        '*[id] > h1, *[id] > h2, *[id] > h3, ' +
                        '*[id] > h4, *[id] > h5, *[id] > h6'
                    )
                )
                headings.forEach(h => addPermalink(
                    h, h.parentElement.getAttribute('id'))
                )
            })
        // ]]>'''

        head.append(style)
        head.append(script)

        with open(f"{out_dir / id_with_context}.xhtml", "wb") as out:
            out.write(etree.tostring(root))

        with open(f"{out_dir / id_with_context}-metadata.json",
                  "w") as json_out:
            # Incorporate metadata from disassemble step while setting defaults
            # for cases like composite pages which may not have metadata from
            # previous stages
            json_metadata = {
                "slug": slugs.get(doc.id),
                "title": doc.metadata.get("title"),
                "abstract": None,
                "id": doc.id,
                "revised": datetime.now(timezone.utc).isoformat()
            }

            # Add / override metadata from baking if available
            json_metadata.update(baked_metadata.get(doc.ident_hash, {}))

            json.dump(json_metadata, json_out)

    with open(f"{out_dir}/{book_slug}.toc.xhtml", "wb") as out:
        out.write(etree.tostring(toc, encoding="utf8", pretty_print=True))

    with open(f"{out_dir}/{book_slug}.toc-metadata.json", "w") as toc_json:
        json.dump(book_toc_metadata, toc_json)
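
A self-contained demo of the same-book link annotation performed above (the UUID, fragment, and slug values are made up):

from lxml import etree

link = etree.fromstring(
    '<a href="/contents/00000000-0000-0000-0000-000000000000#fig-1">Fig. 1</a>'
)
page_link = link.attrib["href"].split("/")[-1]
page_uuid, _, page_fragment = page_link.partition("#")
link.attrib["data-page-uuid"] = page_uuid
link.attrib["data-page-fragment"] = page_fragment
link.attrib["data-page-slug"] = "introduction"  # would come from the slugs map
print(etree.tostring(link).decode())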
import sys
import json
from cnxepub.collation import reconstitute
from cnxepub.models import flatten_to_documents

in_path, out_path = sys.argv[1:3]

json_data = {}

with open(in_path, "r") as in_file:
    binder = reconstitute(in_file)

for doc in flatten_to_documents(binder):
    abstract = doc.metadata.get("summary")
    json_data[doc.ident_hash] = {"abstract": abstract}

with open(out_path, "w") as out_file:
    json.dump(json_data, out_file)
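
Example invocation (the script and file names are illustrative):

# python extract_abstracts.py collection.baked.xhtml abstracts.json
#
# abstracts.json then maps each document ident_hash to its abstract:
#   {"<ident_hash>": {"abstract": "<summary text or null>"}}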
Example #8
def test_html(self):
    page_path = os.path.join(TEST_DATA_DIR, 'desserts-single-page.html')
    with open(page_path) as html:
        from cnxepub.collation import reconstitute
        desserts = reconstitute(html)
    self.check_desserts(desserts)