def escape_ticks_before_markdown(html):
    """ Escapes backticks and quotes inside code-like elements as HTML
        entities, so that the Markdown processor does not interpret them.
        Also removes comments <!--- -->.

        html: an HTML string (fragment)
        Returns the transformed HTML as a string (fragment stripped).
    """
    soup = bs(html)
    selector = 'code, pre, mcdp-poset, mcdp-value, mcdp-fvalue, mcdp-rvalue, render'
    for code in soup.select(selector):
        if not code.string:
            continue
        # unicode
        s = code.string
        # BUG FIX: the replacements were no-ops (replacing a character with
        # itself); use HTML entities so the characters survive Markdown.
        # NOTE(review): confirm downstream serialization does not re-escape
        # the '&' of these entities.
        if '`' in code.string:
            s = s.replace('`', '&#96;')
        if '"' in code.string:
            s = s.replace('"', '&quot;')
        code.string = s
    # Strip HTML comments from the document.
    comments = soup.find_all(string=lambda text: isinstance(text, bs4.Comment))
    for c in comments:
        c.extract()
    res = to_html_stripping_fragment(soup)
    return res
def test_toc():
    """ Checks that generate_toc() marks headers with sec:/sub: prefixed ids. """
    source = """
<html>
<head></head>
<body>
<h1 id='one'>One</h1>
<p>a</p>
<h2 id='two'>Two</h2>
<p>a</p>
<h3 id='three'>Three</h3>
<h2 id='four'>Four</h2>
<p>a</p>
</body>
</html>
"""
    doc = bs(source)
    _toc = generate_toc(doc)
    transformed = str(doc)
    for needle in ['sec:one', 'sub:two']:
        assert needle in transformed
def render_markdown(s, fix_blockquote_pre=True):
    """ Renders Markdown to HTML. Returns an HTML string encoded in UTF-8.

        s: utf-8 encoded bytes (NOT a unicode string).
        fix_blockquote_pre: if True, rewrites <blockquote><p><code> so the
            <p> becomes a <pre> (works around blockquote/code rendering).

        Raises TypeError (via raise_desc) if s is unicode.
    """
    if isinstance(s, unicode):
        msg = 'I expect utf-8 encoded bytes.'
        raise_desc(TypeError, msg, s=s.__repr__())
    import markdown  # @UnresolvedImport
    import logging
    # Silence the markdown library's own logger.
    logging.getLogger("MARKDOWN").setLevel(logging.CRITICAL)
    extensions = [
        'markdown.extensions.smarty',
        # 'markdown.extensions.toc',
        'markdown.extensions.attr_list',
        'markdown.extensions.extra',  # need for markdown=1
        'markdown.extensions.fenced_code',
        'markdown.extensions.admonition',
        'markdown.extensions.tables',
    ]
    # markdown takes and returns unicode
    u = unicode(s, 'utf-8')
    # BUG FIX: pass extensions by keyword; positional extra arguments are
    # deprecated in the markdown library and rejected by newer versions.
    html = markdown.markdown(u, extensions=extensions)
    html = html.encode('utf-8')
    if fix_blockquote_pre:
        if 'blockquote' in html:
            soup = bs(html)
            for code in soup.select('blockquote > p > code'):
                code.parent.name = 'pre'
            html = to_html_stripping_fragment(soup)
    return html
def test_toc2():
    """ Smoke test: generate_toc() runs on headers that have no explicit ids. """
    source = """
<html>
<head></head>
<body>
<h1>One</h1>
<h1>Two</h1>
<h1>Three</h1>
<p></p>
<h2>A</h2>
<h2>B</h2>
<h2>C</h2>
<h3>a</h3>
<h3>b</h3>
<h3>c</h3>
</body>
</html>
"""
    doc = bs(source)
    _toc = generate_toc(doc)
    s = str(doc)
def make_page(contents, head0, add_toc, extra_panel_content, add_home_link):
    """ Assembles a complete page: <html> with a copy of head0, an optional
        TOC side panel, and the contents wrapped in a div#not-toc.

        contents: Tag with the page body content
        head0: <head> Tag; it is copied, not moved
        add_toc: TOC Tag to put in div#tocdiv, or None for no panel
        extra_panel_content: Tag placed inside a <details> in the panel, or None
        add_home_link: if True, a "Home" link is added at the top of the panel

        Returns html (Beautiful Soup document).
    """
    html = Tag(name='html')
    head = head0.__copy__()
    html.append(head)
    body = Tag(name='body')
    with timeit('make_page() / copy toc'):
        if add_toc is not None:
            tocdiv = Tag(name='div')
            tocdiv.attrs['id'] = 'tocdiv'
            if add_home_link:
                a = Tag(name='a')
                a.append('Home')
                a.attrs['href'] = 'index.html'
                p = Tag(name='p')
                p.append(a)
                tocdiv.append(p)
            if extra_panel_content is not None:
                details = Tag(name='details')
                details.attrs['id'] = 'build-details'
                summary = Tag(name='summary')
                summary.append('build details')
                details.append(summary)
                details.append(extra_panel_content)
                tocdiv.append(details)
            tocdiv.append(add_toc)
            body.append(tocdiv)
    # Use the first header of the page as the <title>.
    section_name = get_first_header_title(contents)
    if section_name is not None:
        # Keep a space after </code> so words do not get glued together
        # when the tags are stripped by gettext().
        section_name = section_name.replace('</code>', '</code> ')
        section_name = gettext(bs(section_name))
        title2 = Tag(name='title')
        title2.append(section_name)
        title = head.find('title')
        if title is None:
            head.append(title2)
        else:
            title.replace_with(title2)
    not_toc = Tag(name='div')
    not_toc.attrs['id'] = 'not-toc'
    not_toc.append(contents)
    body.append(not_toc)
    html.append(body)
    # (Removed an unreachable "if False:" block that extracted the original
    #  main TOC from contents.)
    return html
def elements_abbrevs_test1():
    """ A "TODO:" paragraph gets rewritten into a div.todo-wrap. """
    source = "<p>TODO: paragraph</p>"
    expected = """<div class="todo-wrap"><p class="todo">paragraph</p></div>"""
    doc = bs(source.strip())
    substitute_special_paragraphs(doc)
    produced = to_html_stripping_fragment(doc)
    assert_equal(produced, expected)
def task_markers_test1():
    """ A paragraph containing "(TODO)" gets the status-todo class. """
    source = "<p>We should do this (TODO)</p>"
    expected = """<p class="status-todo">We should do this (TODO)</p>"""
    doc = bs(source.strip())
    substitute_task_markers(doc)
    produced = to_html_stripping_fragment(doc)
    assert_equal(produced, expected)
def link_to_command_explanation_check2(): s = """ <pre class="console"><code><span class="console_sign">$</span><span class="space"> </span><span class="curl program">curl</span><span class="space"> </span><span class="program_option">-o</span><span class="space"> </span>duckiebot-RPI3-AC-aug10.img.xz<span class="space"> </span><span class="placeholder">URL above</span> </code></pre>""" soup = bs(s) link_to_command_explanation(soup) s2 = str(soup) print s2 assert '<a href="#curl"' in s2
def link_to_command_explanation_check1():
    """ A span.program for 'ls' inside a console pre gets linked to #ls. """
    fixture = """
<pre class='console'>
<span class='program'>ls</span> file
</pre>
"""
    doc = bs(fixture)
    link_to_command_explanation(doc)
    rendered = str(doc)
    # print rendered
    assert '<a href="#ls"' in rendered
def displayfile1():
    """ display_files() resolves one <display-file> with a github: src. """
    defaults = {'org': 'AndreaCensi',
                'repo': 'mcdp',
                'branch': 'duckuments'}
    fixture = """
<display-file src="github:path=context_eval_as_constant.py,from_text=get_connections_for,to_text=return"></a>
"""
    doc = bs(fixture)
    n = display_files(doc, defaults, raise_errors=True)
    assert n == 1
    rendered = str(doc)
    logger.debug('\n' + indent(rendered, ' '))
def sub1():
    """ substitute_github_refs() turns a github: href into a resource link. """
    defaults = {'org': 'AndreaCensi',
                'repo': 'mcdp',
                'branch': 'duckuments'}
    fixture = """
<a href="github:path=context_eval_as_constant.py"></a>
"""
    doc = bs(fixture)
    n = substitute_github_refs(doc, defaults)
    assert n == 1
    rendered = str(doc)
    logger.debug(indent(rendered, ' '))
    expect = '<code class="github-resource-link">context_eval_as_constant.py</code>'
    if expect not in rendered:
        raise Exception(rendered)
def sub2():
    """ substitute_github_refs() resolves from_text/to_text into line anchors. """
    defaults = {'org': 'AndreaCensi',
                'repo': 'mcdp',
                'branch': 'duckuments'}
    fixture = """
<a href="github:path=context_eval_as_constant.py,from_text=get_connections_for,to_text=return"></a>
"""
    doc = bs(fixture)
    n = substitute_github_refs(doc, defaults)
    assert n == 1
    rendered = str(doc)
    logger.debug('\n' + indent(rendered, ' '))
    expect = 'context_eval_as_constant.py#L7-L12'
    if expect not in rendered:
        raise Exception('No %s in %s' % (expect, rendered))
def tags_in_titles2(): template = """ <html> <head> </head> <body> <div id='toc'></div> </body> </html> """ s = """ <span id='frag'>I will refer to <a href="#two" class='number_name'></a></span> # One is ok {#one} Ignore # Two with `program` {#two} Another. """ library = MCDPLibrary() raise_errors = True realpath = 'transformations.py' s2 = render_complete(library, s, raise_errors, realpath, generate_pdf=False) files_contents= [DocToJoin(docname='one', contents=s2, source_info=None)] stylesheet = 'v_manual_blurb_ready' res_aug = manual_join(template, files_contents, stylesheet, remove=None, extra_css=None, remove_selectors=None, hook_before_toc=None) soup = bs(res_aug.get_result()) element = soup.find(id='main_toc') print element if 'fragment' in str(element): raise Exception(str(element))
def tags_in_titles1(): template = """ <?xml version="1.0" encoding="UTF-8"?> <!DOCTYPE html> <html lang="en"> <head> <title>The Duckietown book</title> <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/> </head> <body> </body> </html> """ s = """ <span id='frag'>I will refer to <a href="#two" class='number_name'></a></span> # Two with `program` {#two} Another. """ library = MCDPLibrary() raise_errors = True realpath = 'transformations.py' s2 = render_complete(library, s, raise_errors, realpath, generate_pdf=False) files_contents= [DocToJoin(docname='one', contents=s2, source_info=None)] stylesheet = 'v_manual_blurb_ready' res_aug = manual_join(template=template, files_contents=files_contents, stylesheet=stylesheet, remove=None, extra_css=None, remove_selectors=None, hook_before_toc=None) soup = bs(res_aug.get_result()) element = soup.find(id='frag') print element if '<code>' in str(element): raise Exception(str(element))
def split_file(html, directory):
    """ Splits one rendered HTML document into multiple linked pages and
        writes them into `directory`, replicating the main TOC panel and
        appending a Disqus comment section to every page. """
    soup = BeautifulSoup(html, 'lxml', from_encoding='utf-8')
    body = soup.html.body
    # extract the main toc if it is there
    main_toc = body.find(id='main_toc')
    assert body is not None, soup
    filename2contents = split_in_files(body)
    add_prev_next_links(filename2contents)
    for filename, contents in list(filename2contents.items()):
        page = Tag(name='html')
        page.append(soup.html.head.__copy__())
        page_body = Tag(name='body')
        if main_toc:
            tocdiv = Tag(name='div')
            tocdiv.attrs['id'] = 'tocdiv'
            tocdiv.append(main_toc.__copy__())
            page_body.append(tocdiv)
        page_body.append(contents)
        page.append(page_body)
        PAGE_IDENTIFIER = filename.replace('.html', '')
        PAGE_URL = 'https://duckietown.github.io/duckuments/master/' + filename
        snippet = disqus
        snippet = snippet.replace('PAGE_IDENTIFIER', PAGE_IDENTIFIER)
        snippet = snippet.replace('PAGE_URL', PAGE_URL)
        disqus_section = bs(snippet)
        from mcdp import logger
        logger.info(str(snippet))
        page_body.append(disqus_section)
        filename2contents[filename] = page
    update_refs(filename2contents)
    write_split_files(filename2contents, directory)
def link_to_command_explanation_check3():
    """ In a realistic fragment, span.program 'ifconfig' links to #ifconfig. """
    fixture = """
<fragment><div style="display:none">Because of mathjax bug</div>

<h1 id="networking">Networking tools</h1>

<div class="special-par-assigned-wrap"><p class="special-par-assigned">Andrea</p></div>

<div class="requirements">
<p>Preliminary reading:</p>

<ul>
<li>
<p>Basics of networking, including</p>

<ul>
<li>what are IP addresses</li>
<li>what are subnets</li>
<li>how DNS works</li>
<li>how <code>.local</code> names work</li>
<li>…</li>
</ul>
</li>
</ul>

<div class="special-par-see-wrap"><p class="status-XXX special-par-see"> (ref to find).</p></div>
</div>

<div class="todo-wrap"><p class="todo">to write</p></div>

<p>Make sure that you know:</p>

<h2 id="visualizing-information-about-the-network">Visualizing information about the network</h2>

<h3 id="ping-are-you-there"><code>ping</code>: are you there?</h3>

<div class="todo-wrap"><p class="todo">to write</p></div>

<h3 id="ifconfig"><code>ifconfig</code></h3>

<div class="todo-wrap"><p class="todo">to write</p></div>

<pre class="console"><code><span class="console_sign">$</span><span class="space"> </span><span class="ifconfig program">ifconfig</span>
</code></pre></fragment>
"""
    doc = bs(fixture)
    link_to_command_explanation(doc)
    rendered = str(doc)
    # print rendered
    assert '<a href="#ifconfig"' in rendered
def add_footnote_polyfill(soup):
    """ Appends the footnote javascript snippet at the end of <body>. """
    document_body = soup.find('body')
    snippet = bs(footnote_javascript)
    document_body.append(snippet)
    # NOTE(review): this chunk begins mid-function; the statements below are
    # the tail of a function whose signature is outside this view
    # (presumably search_for_errors, judging by the __main__ block below).
    s = ''
    for element in soup.select('details.' + ERROR_CLASS):
        # Summary line of the error details block.
        summary = element.summary.text.encode('utf8')
        # Work on a copy so the document is not mutated.
        e2 = element.__copy__()
        e2.summary.extract()
        other = e2.text.encode('utf8')
        s0 = summary + '\n\n' + other
        # Accumulate a bulleted report of all errors found.
        s += '\n\n' + indent(s0, '', '* ')
    return s


if __name__ == '__main__':
    # Usage: <script> <filename>
    # Reads an HTML file and logs any errors found in it.
    filename = sys.argv[1]
    data = open(filename).read()
    soup = bs(data)
    s = search_for_errors(soup)
    if s:
        logger.error('Found a few errors:')
        logger.error(s)
    else:
        logger.info('No errors found.')


@contract(long_error='str|$Tag')
def insert_inset(element, short, long_error, klasses=[]):
    # NOTE(review): mutable default argument klasses=[] is shared across
    # calls — confirm no caller mutates it.
    """ Inserts an errored details after element """
    details = Tag(name='details')
    summary = Tag(name='summary')
    s = Tag(name='strong')
    s.append(short)
    # NOTE(review): definition continues past this chunk; body truncated here.
def go(context, worker_i, num_workers, data, mathjax, preamble, output_dir,
       assets_dir, add_toc_if_not_existing, extra_panel_content,
       permalink_prefix=None, output_crossref=None, only_refs=False):
    """ Splits the rendered book into per-file pages and schedules the
        post-processing jobs for this worker's share of the files.

        context: a compmake-style context (jobs scheduled via context.comp)
        worker_i, num_workers: this worker processes files with
            index % num_workers == worker_i
        data: the entire rendered HTML document
        preamble: path to a .tex file (read here) or preamble contents, or None
        only_refs: if True, stop after computing/writing the cross references

        Returns an AugmentedResult whose result is the id -> filename map
        (via the final wait_assets job).
    """
    res = AugmentedResult()
    soup = bs_entire_document(data)
    # extract the main toc if it is there
    with timeit("Extracting main toc"):
        main_toc = soup.find(id=MCDPManualConstants.MAIN_TOC_ID)
        if main_toc is None:
            if add_toc_if_not_existing:
                # logger.info('Generating TOC because it is not there')
                tocg = generate_toc(soup)
                main_toc = bs(tocg).ul
                main_toc.attrs['class'] = 'toc'  # XXX: see XXX13
                assert main_toc is not None
                substituting_empty_links(main_toc, raise_errors=False,
                                         res=res, extra_refs=soup)
            else:
                msg = 'Could not find main toc (id #%s)' % MCDPManualConstants.MAIN_TOC_ID
                res.note_error(msg)
                main_toc = Tag(name='div')
                main_toc.append('TOC NOT FOUND')
        else:
            # Copy so that mutations below do not touch the document.
            main_toc = main_toc.__copy__()
            if 'id' in main_toc.attrs:
                del main_toc.attrs['id']
    # XXX: this is not the place to do it
    mark_toc_links_as_errored(main_toc, soup)
    body = soup.html.body
    with timeit("split_in_files"):
        filename2contents = split_in_files(body)
    id2filename = get_id2filename(filename2contents)
    res.set_result(id2filename)
    if output_crossref is not None:
        from mcdp_docs.mcdp_render_manual import write_crossref_info
        context.comp(write_crossref_info, data=data, id2filename=id2filename,
                     output_crossref=output_crossref,
                     permalink_prefix=permalink_prefix)
    if only_refs:
        logger.debug('Skipping rest because only_refs')
        return res
    with timeit("add_prev_next_links"):
        filename2contents = add_prev_next_links(filename2contents)
    with timeit("preparing assets dir"):
        if not os.path.exists(output_dir):
            try:
                os.makedirs(output_dir)
            except:
                # NOTE(review): bare except silently ignores any makedirs
                # failure (presumably tolerating races between workers).
                pass
    with timeit("creating link.html and link.js"):
        linkbase = 'link.html'  # do not change (it's used by http://purl.org/dth)
        linkbasejs = 'link.js'
        lb = create_link_base(id2filename)
        write_data_to_file(str(lb), os.path.join(output_dir, linkbase),
                           quiet=True)
        linkjs = create_link_base_js(id2filename)
        write_data_to_file(str(linkjs), os.path.join(output_dir, linkbasejs),
                           quiet=True)
    if preamble is not None:
        if preamble.endswith('.tex'):  # XXX
            preamble = open(preamble).read()
    # Ids that are not auto-generated.
    ids_to_use = []
    for k in list(id2filename):
        if not 'autoid' in k:
            ids_to_use.append(k)
    ids_to_use = sorted(ids_to_use)
    # Files pointed to by at least one usable id (order-preserving, unused below).
    pointed_to = []
    for k in ids_to_use:
        f = id2filename[k]
        if not f in pointed_to:
            pointed_to.append(f)
    # data = ",".join(pointed_to)
    head0 = soup.html.head
    if True:
        context.comp(remove_spurious, output_dir, list(filename2contents))
    with timeit('main_toc copy'):
        main_toc0 = main_toc.__copy__()
        main_toc0_s = str(main_toc0)
    asset_jobs = []
    for i, (filename, contents) in enumerate(filename2contents.items()):
        # Only process this worker's share of the files.
        if i % num_workers != worker_i:
            continue
        with timeit('main_toc copy hack'):
            # Re-parse from the string copy: cheaper/safer than deep-copying
            # the Tag for every page.
            main_toc = bs(main_toc0_s).ul
            assert main_toc is not None
        # Trick: we add the main_toc, and then ... (look below)
        with timeit('make_page'):
            add_home_link = 'index.html' not in filename2contents
            html = make_page(contents, head0, main_toc, extra_panel_content,
                             add_home_link=add_home_link)
        with timeit("direct job"):
            result = only_second_part(mathjax, preamble, html, id2filename,
                                      filename)
        # ... we remove it. In this way we don't have to copy it every time...
        main_toc.extract()
        fn = os.path.join(output_dir, filename)
        h = get_md5(result)[:8]
        r = context.comp(extract_assets_from_file, result, fn, assets_dir,
                         job_id='%s-%s-assets' % (filename, h))
        asset_jobs.append(r)
    update_refs_('toc.html', main_toc, id2filename)
    out_toc = os.path.join(output_dir, 'toc.html')
    write_data_to_file(str(main_toc), out_toc, quiet=True)
    return context.comp(wait_assets, res, asset_jobs)