def colorize(doc):
    """Return a coloured deep copy of *doc*; the argument is left untouched.

    Two passes are applied to the copy:

    1. every ``HorizontalRule`` is swapped for an empty level-2 header;
    2. each top-level level-2 header gets a ``data-background-color``
       key/value pair. The colour comes from ``COLOR_THEME``: the first key
       found in the header's rendered title wins. The last colour found
       keeps being applied to later level-2 headers until another key
       matches.
    """
    doc = copy.deepcopy(doc)

    # Pass 1: locate the rules first, substitute afterwards, so the tree is
    # never modified while it is being traversed.
    rule_slots = [
        path[-1]
        for elt, path in pandoc.iter(doc, path=True)
        if isinstance(elt, HorizontalRule)
    ]
    for holder, index in reversed(rule_slots):
        holder[index] = Header(2, ("", [], []), [])

    # Pass 2: colours.
    color = None
    for block in doc[1]:
        if not isinstance(block, Header):
            continue
        lvl, attr, inlines = block[:]
        if lvl != 2:
            continue
        _id, _classes, kvs = attr
        title = pandoc.write(inlines)
        matched = next((emoji for emoji in COLOR_THEME if emoji in title), None)
        if matched is not None:
            color = f"{COLOR_THEME[matched]}"
        if color is not None:
            kvs.append(("data-background-color", color))
    return doc
def notebookify(doc):
    """Turn the top-level blocks of *doc* into the cells of a new Notebook.

    A ``CodeBlock`` becomes a code cell numbered with a running execution
    count starting at 1. Any other block is rendered to markdown and stored
    in a markdown cell of its own.
    """
    from pandoc.types import Pandoc, Meta, CodeBlock

    notebook = Notebook()
    cells = notebook["cells"]
    # Merging consecutive markdown cells is implemented below but disabled.
    merge_markdown = False
    next_count = 1

    for block in doc[1]:
        if isinstance(block, CodeBlock):
            cell = CodeCell()
            cell["source"] = block[1]
            cell["execution_count"] = next_count
            next_count += 1
            cells.append(cell)
            continue

        # "-smart" is needed for en-dashes, for example: Jupyter cells are
        # not expected to be smart, so smart output is *disabled* to keep
        # '–' from being written as '--'.
        options = ["-t", "markdown-smart"]
        text = pandoc.write(Pandoc(Meta({}), [block]), options=options)

        if (
            merge_markdown
            and len(cells) >= 1
            and cells[-1]["cell_type"] == "markdown"
        ):
            cells[-1]["source"] += "\n" + text
        else:
            cell = MarkdownCell()
            cell["source"] = text
            cells.append(cell)
    return notebook
def print_sections(doc):
    """Print an outline of *doc*, one entry per header.

    Each entry is the nesting depth, then ``"> "``, then four spaces per
    depth level, then the header title as rendered by pandoc. The depth is
    the number of ``Div`` holders on the path leading to the header.
    """
    for elt, path in pandoc.iter(doc, path=True):
        if not isinstance(elt, Header):
            continue
        _level, _attr, inlines = elt[:]
        title = pandoc.write(Pandoc(Meta({}), [Plain(inlines)]))
        depth = sum(1 for holder, _index in path if isinstance(holder, Div))
        indent = " " * (4 * depth)
        # The rendered title supplies the line ending, hence end="".
        print(f"{depth}> {indent}{title}", end="")
def round_trip_check(self, json_doc):
    """Check that *json_doc* survives a pandoc read/write round trip.

    Returns:
        ``True`` when writing back the document read from *json_doc* gives
        a value equal to *json_doc*. Otherwise the pair
        ``(json_doc, result)``, where ``result`` is the re-written document
        or, if reading or writing raised, the exception that was caught.
    """
    try:
        doc = pandoc.read(json_doc)
        json_doc_2 = pandoc.write(doc)
    except Exception as error:
        # Report the failure through the return value. The original code
        # assigned to a misspelled name here, so the exception was lost and
        # the caller received None instead.
        json_doc_2 = error
    if json_doc == json_doc_2:
        return True
    return json_doc, json_doc_2
def to_pandoc(self):
    """Return the rich text as a raw HTML block containing a blockquote.

    The rich text is converted to a pandoc AST, rendered to HTML, stripped
    of trailing whitespace and wrapped in ``<blockquote><p>`` markup.
    """
    ast = self.rich_text.to_pandoc()
    html = pandoc.write(ast, format='html').rstrip()
    # TODO: handle citations
    # The final empty item makes the joined markup end with a newline.
    pieces = ['<blockquote><p>', html, '</p></blockquote>', '']
    return RawBlock(Format('html'), "\n".join(pieces))
def handle_typed_sections(doc):
    """Decorate "typed" headers (theorem, definition, ...) of *doc* in place.

    A header is typed when one of its classes is a key of the local
    ``types`` table. Such a header:

    - gains the classes ``unnumbered`` and ``unlisted``;
    - has its title prefixed with the French label of its first matching
      type, followed by an en dash;
    - is followed by a raw LaTeX ``\\addcontentsline`` paragraph built from
      the original (unprefixed) title.
    """
    types = {
        "theorem": "Théorème",
        "definition": "Définition",
        "lemma": "Lemme",
        "proposition": "Proposition",
    }
    levels = {
        1: "section",
        2: "subsection",
        3: "subsubsection",
        4: "paragraph",
        5: "subparagraph",
    }

    pending = []
    for elt, path in pandoc.iter(doc, path=True):
        if not isinstance(elt, Header):
            continue
        header = elt
        level, attr, inlines = header
        id_, classes, kvs = attr
        shared = [type_ for type_ in classes if type_ in types]
        if not shared:
            continue
        classes.extend(["unnumbered", "unlisted"])
        holder, index = path[-1]
        # NOTE(review): `map` is presumably the mapping type exported by
        # pandoc.types, not the builtin — confirm against the file imports.
        minidoc = Pandoc(Meta(map()), [Para(inlines)])
        latex_title = pandoc.write(minidoc, format="latex").strip()
        pending.append((holder, index, level, latex_title))
        label = types[shared[0]]
        header[2] = [Str(label), Space(), Str("–"), Space()] + inlines

    # Insertions are postponed until the traversal above has finished.
    for holder, index, level, latex_title in pending:
        latex_code = r"\addcontentsline{toc}{%s}{%s}" % (levels[level], latex_title)
        holder.insert(index + 1, Para([RawInline("latex", latex_code)]))

    # NOTE(review): `found` is not bound anywhere in the visible part of this
    # function and the collected positions are not used afterwards. This
    # loop may belong to a neighbouring definition whose header is missing
    # from this excerpt — confirm before relying on it.
    for elt, path in pandoc.iter(doc, path=True):
        if not isinstance(elt, (RawInline, RawBlock)):
            continue
        raw_format = elt[0]
        if raw_format[0] == "html":
            holder, i = path[-1]
            found.insert(0, (holder, i))
doc = transform(doc)

# Code and Doctest
code = generate_code(doc)
if code is not None:
    # Python decodes source files as UTF-8 by default, so the generated
    # module is written with that encoding instead of the locale's.
    with open(doc_py, "w", encoding="utf-8") as py_file:
        py_file.write(code)
    python("-m", "doctest", doc_py)
    # Best-effort cleanup: otherwise, top build has issues. Removal errors
    # are ignored, but unlike a bare `except:` this does not swallow
    # KeyboardInterrupt or programming errors.
    shutil.rmtree(output / "__pycache__", ignore_errors=True)

pandoc.write(doc, file=doc_pdf, options=PDF_options)

# PDF Output (Print)
pandoc.write(doc, file=doc_pdf_print, options=PDF_PRINT_options)

# LaTeX Output
pandoc.write(doc, file=doc_tex, options=TEX_options)


def gl(pattern):
    """Return the paths under `images` that match the glob *pattern*."""
    return list(images.glob(pattern))


# Ship the images with the LaTeX source, zip the bundle, then drop the
# unzipped directory.
image_filenames = gl("*.pdf") + gl("*.png") + gl("*.jpg")
for image in image_filenames:
    shutil.copy(image, output_latex_images)
shutil.make_archive(str(output_latex), "zip", str(output_latex))
shutil.rmtree(str(output_latex))

# HTML and ODT outputs are currently disabled:
# pandoc.write(doc, format="html5", file=doc_html, options=HTML_options)
# pandoc.write(doc, format="odt", file=doc_odt, options=ODT_options)
# NOTE(review): the loop and `return doc` just below are the end of a function
# whose `def` line lies above this excerpt; they are indented accordingly.
    # Walk the recorded divs from last to first: each one is deleted from
    # `root`, and an "unpack" div is replaced in place by its contents.
    for action, index, contents in reversed(divs):
        del root[index]
        if action == "unpack":
            root[index:index] = contents
    return doc


# Build the slides variant of the document and write it as a standalone
# reveal.js HTML file named after the document.
slides_doc = make_slides_doc(doc)
options = [
    "--standalone",
    "-V",
    "theme:white",
    "--mathjax",
    "-V",
    "slideNumber:true"
]
pandoc.write(slides_doc,
             file=doc_name + ".html",
             format="revealjs",
             options=options)

# Notebook Generation
# ------------------------------------------------------------------------------
# Two known issues:
#
# 1. Header content flagged 'slides' should be removed from the notebook
#    output, but currently is not.
# 2. Deeper issue: Headers are NOT the holders of the elements that follow
#    them, so the algorithm needs to be smarter and identify the
#    corresponding content.
#
# Short-term plan: sidestep both issues by using only divs in the document
# to deal with conditional content.
#
# Caveat: the reveal target does not handle divs well (overlap and/or no
# new page). Open question: can this be solved by unpacking slides divs?
# Options shared by every output format.
options.extend(["-V", "lang=fr"])
options.extend(["--table-of-contents"])
if bibliography.exists():
    options.extend(
        ["--bibliography=bibliography.json", "-M", "link-citations=true"]
    )

# Each format starts from its own copy of the shared options.
TEX_options = list(options)
PDF_options = list(options)
# To use package titlesec, see <https://stackoverflow.com/questions/42916124/not-able-to-use-titlesec-with-markdown-and-pandoc>
# Update: titlesec is off limits anyway with pandoc, as it is not compatible
# with hyperref.
# PDF_options += ["--variable", "subparagraph"]
ODT_options = list(options)
HTML_options = list(options)
HTML_options.append("--mathjax")

# PDF Output
doc = transform(pandoc.read(file=doc_md))
pandoc.write(doc, file=doc_pdf, options=PDF_options)
pandoc.write(doc, file=doc_tex, options=TEX_options)


def gl(pattern):
    """Return the paths under `images` that match the glob *pattern*."""
    return list(images.glob(pattern))


# Copy the images next to the LaTeX source, zip the bundle, then remove the
# unzipped directory.
image_filenames = gl("*.pdf") + gl("*.png") + gl("*.jpg")
for image in image_filenames:
    shutil.copy(image, output_latex_images)
latex_dir = str(output_latex)
shutil.make_archive(latex_dir, "zip", latex_dir)
shutil.rmtree(latex_dir)

# HTML and ODT outputs are currently disabled:
# pandoc.write(doc, format="html5", file=doc_html, options=HTML_options)
# pandoc.write(doc, format="odt", file=doc_odt, options=ODT_options)
# NOTE(review): the three indented statements below close a conditional whose
# header lies above this excerpt. Also, the message counts `docs` while the
# document is taken from `_docs` — confirm that both names are intended.
    error = "cannot identify the main document "
    error += f"(found {len(docs)} markdown files)"
    raise RuntimeError(error)

# `doc` holds the document's base name here; it is rebound to the parsed
# pandoc document further down.
doc = _docs[0]
doc_md = doc + ".md"
doc_pdf = str(output / (doc + ".pdf"))
doc_odt = str(output / (doc + ".odt"))
doc_html = str(output / (doc + ".html"))
doc_md_md = str(output / (doc + ".md"))

# Doctest: run directly on the markdown source.
python("-m", "doctest", doc_md)

# Pandoc Options: a shared base list, copied once per output format.
options = ["--standalone"]
options += ["-V", "lang=fr"]
options += ["--table-of-contents"]
if bibliography.exists():
    options += ["--bibliography=bibliography.json", "-M", "link-citations=true"]
PDF_options = options.copy()
ODT_options = options.copy()
HTML_options = options.copy()
HTML_options += ["--mathjax"]

# PDF Output, followed by the HTML and ODT outputs.
doc = pandoc.read(file=doc_md)
doc = transform(doc)
pandoc.write(doc, file=doc_pdf, options=PDF_options)
pandoc.write(doc, format="html5", file=doc_html, options=HTML_options)
pandoc.write(doc, format="odt", file=doc_odt, options=ODT_options)
def notebookify(doc):
    """Build a Notebook from the pandoc document *doc*.

    The notebook opens with a markdown cell made of the fixed heading
    "Control Engineering with Python", the document title and the author
    entry, both read from the document metadata. After that, every
    top-level ``CodeBlock`` becomes a code cell whose execution count is
    ``None``, and every other top-level block becomes a markdown cell of
    its own.
    """
    from pandoc.types import Pandoc, Meta, CodeBlock, Header, Para, Str, Space

    notebook = Notebook()
    cells = notebook["cells"]
    blocks = doc[1]
    metamap = doc[0][0]

    # Heading words separated by explicit Space elements.
    hero_title = []
    for word in ("Control", "Engineering", "with", "Python"):
        if hero_title:
            hero_title.append(Space())
        hero_title.append(Str(word))

    title = metamap["title"][0]
    # NOTE(review): presumably the inlines of the first author — confirm
    # against the metadata layout.
    author = metamap["author"][0][0][0]

    front_matter = Pandoc(
        Meta({}),
        [
            Header(1, ("", [], []), hero_title),
            Header(1, ("", [], []), title),
            Para(author),
        ],
    )
    intro_cell = MarkdownCell()
    intro_cell["source"] = pandoc.write(front_matter)
    cells.append(intro_cell)

    # Merging consecutive markdown cells is implemented below but disabled.
    merge_markdown = False

    for block in blocks:
        if isinstance(block, CodeBlock):
            cell = CodeCell()
            cell["source"] = block[1]
            cell["execution_count"] = None  # cells are emitted unexecuted
            cells.append(cell)
            continue

        # Markdown variant "markdown-smart-raw_attribute-simple_tables":
        #   -smart          needed for en-dashes, for example: Jupyter cells
        #                   are not expected to be smart, so smart output is
        #                   *disabled* to keep '–' from being written as
        #                   '--'. Doesn't work in metadata (?)
        #   -raw_attribute  raw html is output as HTML, not as the
        #                   non-standard markdown syntax
        #                   `<p>Hello</p>`{=html} that Jupyter notebooks do
        #                   not understand.
        #   -simple_tables  gives tables that render properly in notebooks.
        #                   Switching to github-flavored markdown was
        #                   considered for this but rejected because it
        #                   would break the math.
        options = ["-t", "markdown-smart-raw_attribute-simple_tables"]
        text = pandoc.write(Pandoc(Meta({}), [block]), options=options)

        if (
            merge_markdown
            and len(cells) >= 1
            and cells[-1]["cell_type"] == "markdown"
        ):
            cells[-1]["source"] += "\n" + text
        else:
            cell = MarkdownCell()
            cell["source"] = text
            cells.append(cell)
    return notebook
from ics import *
import pandoc

# ------------------------------------------------------------------------------
# Download the public calendar and print its events in sorted order, each
# with its Paris-time date, its time range and its description as markdown.
url = "https://calendar.google.com/calendar/ical/o1rahvtc75kj2qcc5tmsrfr6e0%40group.calendar.google.com/public/basic.ics"
calendar = Calendar(urlopen(url).read().decode("utf-8"))

# The number of events is deliberately not asserted (it used to be 60):
# there are more slots now because of the EC2 exam.
events = sorted(calendar.events)

for i, event in enumerate(events):
    print(f"{i+1:2d}) {event.name}")
    # Every event must start and end on the same day. The 1.5-hour duration
    # check is disabled.
    assert event.begin.date() == event.end.date()
    begin = event.begin.to("Europe/Paris")
    end = event.end.to("Europe/Paris")
    day = begin.format("dddd DD MMMM YYYY", locale="fr_FR")
    print(" " + day, end=", ")
    print(f"{begin.format('HH:mm')}-{end.format('HH:mm')}.")
    description = event.description
    if description:
        # Descriptions are stored as HTML; show them as markdown.
        doc = pandoc.read(description, format="html")
        print(pandoc.write(doc, format="markdown"))
    print()
def handle_typed_sections(doc):
    """Decorate "typed" headers (theorem, exercise, ...) of *doc* in place.

    A header is typed when one of its classes is a key of the local
    ``types`` table. Such a header:

    - gains the classes ``unnumbered`` and ``unlisted``;
    - has its title prefixed with the French label of its first matching
      type and an en dash, unless that label is ``None`` (question, answer);
    - if it is an exercise or a question, gets a parenthesised difficulty
      mark for each of the classes ``zero`` to ``four`` that it carries;
    - is followed by a raw LaTeX ``\\addcontentsline`` paragraph built from
      the original (undecorated) title.
    """
    types = {
        "theorem": "Théorème",
        "corollary": "Corollaire",
        "definition": "Définition",
        "lemma": "Lemme",
        "proposition": "Proposition",
        "remark": "Remarque",
        "exercise": "Exercice",
        "example": "Exemple",
        "question": None,
        "answer": None,
    }
    levels = {
        1: "section",
        2: "subsection",
        3: "subsubsection",
        4: "paragraph",
        5: "subparagraph",
    }
    # Difficulty class -> math source of its mark, in the order they are
    # checked. An alternative symbol was noted in the original:
    # r"\mathord{\pmb{\infty}}"
    difficulty_marks = (
        ("zero", r"\mathord{\boldsymbol{\circ}}"),
        ("one", r"\mathord{\bullet}"),
        ("two", r"\mathord{\bullet}" * 2),
        ("three", r"\mathord{\bullet}" * 3),
        ("four", r"\mathord{\bullet}" * 4),
    )

    pending = []
    for elt, path in pandoc.iter(doc, path=True):
        if not isinstance(elt, Header):
            continue
        header = elt
        level, attr, inlines = header
        id_, classes, kvs = attr
        shared = [type_ for type_ in classes if type_ in types]
        if not shared:
            continue
        classes.extend(["unnumbered", "unlisted"])
        holder, index = path[-1]
        # NOTE(review): `map` is presumably the mapping type exported by
        # pandoc.types, not the builtin — confirm against the file imports.
        minidoc = Pandoc(Meta(map()), [Para(inlines)])
        latex_title = pandoc.write(minidoc, format="latex").strip()
        pending.append((holder, index, level, latex_title))

        label = types[shared[0]]
        if label:
            inlines = [Str(label), Space(), Str("–"), Space()] + inlines
        if "exercise" in classes or "question" in classes:
            for tag, symbol in difficulty_marks:
                if tag in classes:
                    inlines += [
                        Space(),
                        Str("("),
                        Math(InlineMath(), symbol),
                        Str(")"),
                    ]
        header[2] = inlines

    # Insertions are postponed until the traversal above has finished.
    for holder, index, level, latex_title in pending:
        latex_code = r"\addcontentsline{toc}{%s}{%s}" % (levels[level], latex_title)
        holder.insert(index + 1, Para([RawInline("latex", latex_code)]))

    # NOTE(review): `found` is not bound anywhere in the visible part of this
    # function and the collected positions are not used afterwards. This
    # loop may belong to a neighbouring definition whose header is missing
    # from this excerpt — confirm before relying on it.
    for elt, path in pandoc.iter(doc, path=True):
        if not isinstance(elt, (RawInline, RawBlock)):
            continue
        raw_format = elt[0]
        if raw_format[0] == "html":
            holder, i = path[-1]
            found.insert(0, (holder, i))
doc = transform(doc)

# Code and Doctest
code = generate_code(doc)
if code is not None:
    # Python decodes source files as UTF-8 by default, so the generated
    # module is written with that encoding instead of the locale's.
    with open(doc_py, "w", encoding="utf-8") as py_file:
        py_file.write(code)
    python("-m", "doctest", doc_py)
    # Best-effort cleanup: otherwise, top build has issues. Removal errors
    # are ignored, but unlike a bare `except:` this does not swallow
    # KeyboardInterrupt or programming errors.
    shutil.rmtree(output / "__pycache__", ignore_errors=True)

# PDF output, plus a markdown rendering written with the same options.
pandoc.write(doc, file=doc_pdf, options=PDF_options)
pandoc.write(doc, file=doc_md_md, options=PDF_options)

# PDF Output (Print)
pandoc.write(doc, file=doc_pdf_print, options=PDF_PRINT_options)

# LaTeX Output
pandoc.write(doc, file=doc_tex, options=TEX_options)


def gl(pattern):
    """Return the paths under `images` that match the glob *pattern*."""
    return list(images.glob(pattern))


# Ship the images with the LaTeX source, zip the bundle, then drop the
# unzipped directory.
image_filenames = gl("*.pdf") + gl("*.png") + gl("*.jpg")
for image in image_filenames:
    shutil.copy(image, output_latex_images)
shutil.make_archive(str(output_latex), "zip", str(output_latex))
shutil.rmtree(str(output_latex))

# HTML and ODT outputs are currently disabled:
# pandoc.write(doc, format="html5", file=doc_html, options=HTML_options)
# pandoc.write(doc, format="odt", file=doc_odt, options=ODT_options)
def notebookify(doc):
    """Build a Notebook from the pandoc document *doc*.

    The notebook opens with a markdown cell made of the fixed heading
    "Control Engineering with Python", the document title and the author
    entry, both read from the document metadata. After that, every
    top-level ``CodeBlock`` becomes a code cell whose execution count is
    ``None``, and every other top-level block becomes a markdown cell of
    its own.
    """
    from pandoc.types import Pandoc, Meta, CodeBlock, Header, Para, Str, Space

    notebook = Notebook()
    cells = notebook["cells"]
    blocks = doc[1]
    metamap = doc[0][0]

    # Heading words separated by explicit Space elements.
    hero_title = []
    for word in ("Control", "Engineering", "with", "Python"):
        if hero_title:
            hero_title.append(Space())
        hero_title.append(Str(word))

    title = metamap["title"][0]
    # NOTE(review): presumably the inlines of the first author — confirm
    # against the metadata layout.
    author = metamap["author"][0][0][0]

    front_matter = Pandoc(
        Meta({}),
        [
            Header(1, ("", [], []), hero_title),
            Header(1, ("", [], []), title),
            Para(author),
        ],
    )
    intro_cell = MarkdownCell()
    intro_cell["source"] = pandoc.write(front_matter)
    cells.append(intro_cell)

    # Merging consecutive markdown cells is implemented below but disabled.
    merge_markdown = False

    for block in blocks:
        if isinstance(block, CodeBlock):
            cell = CodeCell()
            cell["source"] = block[1]
            cell["execution_count"] = None  # cells are emitted unexecuted
            cells.append(cell)
            continue

        # -smart: needed for en-dashes, for example: Jupyter cells are not
        #   expected to be smart, so smart output is *disabled* to keep '–'
        #   from being written as '--'.
        # -raw_attribute: raw html is output as HTML, not as the
        #   non-standard markdown syntax `<p>Hello</p>`{=html} that Jupyter
        #   notebooks do not understand.
        options = ["-t", "markdown-smart-raw_attribute"]
        text = pandoc.write(Pandoc(Meta({}), [block]), options=options)

        if (
            merge_markdown
            and len(cells) >= 1
            and cells[-1]["cell_type"] == "markdown"
        ):
            cells[-1]["source"] += "\n" + text
        else:
            cell = MarkdownCell()
            cell["source"] = text
            cells.append(cell)
    return notebook
# NOTE(review): the indented statements just below are the body of a function
# whose `def` line lies above this excerpt (presumably the `remove_preamble`
# called by `simplify` — confirm).
    "Remove everything before the first real paragraph"
    blocks = doc[1]
    # A "real" paragraph is a Para whose first inline is a Str; the loop
    # stops on the first one, leaving its position in `i`.
    for i, block in enumerate(blocks):
        if isinstance(block, Para):
            para = block
            inlines = para[0]
            if len(inlines) > 0 and isinstance(inlines[0], Str):
                break
    # NOTE(review): when no paragraph qualifies the loop ends without a
    # break, so `i` is the last index and only the last block is kept; with
    # no blocks at all `i` is never bound. Confirm this is acceptable.
    doc[1] = blocks[i:]
    return doc


# Simplify
# ------------------------------------------------------------------------------
def simplify(doc):
    """Return *doc* after `unpack_divs` and then `remove_preamble`."""
    doc = unpack_divs(doc)
    # doc = unpack_divs_2(doc)
    doc = remove_preamble(doc)
    return doc


# Entry Point
# ------------------------------------------------------------------------------
if __name__ == "__main__":
    # Demo: fetch pandoc's "Getting started" page, simplify it and print the
    # result as standalone markdown.
    url = 'https://pandoc.org/getting-started.html'
    src = urllib.request.urlopen(url).read()
    doc = pandoc.read(src, format="html")
    doc = simplify(doc)
    print(pandoc.write(doc, format="markdown", options=["-s"]))