def convert_ipynb_to_gallery(file_name): """ Blatantly stolen + adapted from https://gist.github.com/wuhuikai/4a7ceb8bc52454e17a4eb8327d538d85 """ python_file = "" nb_dict = json.load(open(file_name)) cells = nb_dict['cells'] for i, cell in enumerate(cells): if i == 0: assert cell['cell_type'] == 'markdown', \ 'First cell has to be markdown' md_source = ''.join(cell['source']) rst_source = pdoc.convert_text(md_source, 'rst', 'md') python_file = '"""\n' + rst_source + '\n"""' else: if cell['cell_type'] == 'markdown': md_source = ''.join(cell['source']) rst_source = pdoc.convert_text(md_source, 'rst', 'md') commented_source = '\n'.join([ '# ' + x for x in rst_source.split('\n') ]) python_file = python_file + '\n\n\n' + '#' * 70 + '\n' + \ commented_source elif cell['cell_type'] == 'code': source = ''.join(cell['source']) python_file = python_file + '\n' * 2 + source open(file_name.replace('.ipynb', '.py'), 'w').write(python_file)
def expand_description(self, exp):
    return {
        "general": pypandoc.convert_text(
            exp.find("./description/general").text, "latex", format="md"),
        "details": [
            pypandoc.convert_text(detail.text.strip(), "latex", format="md")
            for detail in exp.findall("./description/details/detail")
        ],
    }
def main():
    if len(sys.argv) <= 1:
        sys.exit("Please supply a filename")

    input_format = "markdown"

    pdf_output = common_md()
    html_output = pdf_output["html"]
    pdf_output = pdf_output["pdf"]

    print()
    for arg in sys.argv[1:]:
        p = Path(arg).resolve()
        print(f"Generating: {p}")
        ext = p.suffix
        if ext == ".md":
            p.write_text(pdf_output)
        elif ext == ".html":
            html_output = "# " + VERSION_STR + "\n\n" + html_output
            pypandoc.convert_text(
                html_output, format=input_format, to="html5",
                outputfile=str(p),
                extra_args=["--standalone", "--self-contained",
                            "--toc", "--toc-depth=2",
                            "--css=" + str(TEMPLATE_DIR / "docs.css"),
                            "--template=" + str(TEMPLATE_DIR / "template.html")])
        elif ext == ".pdf" or ext == ".tex":
            latex_preamble = env.get_template("latex_preamble.jinja2.md")
            latex = latex_preamble \
                .render(title=VERSION_STR, fonts_dir=FONTS_DIR) + "\n\n"
            latex += pdf_output

            pandoc_version = int(pypandoc.get_pandoc_version()[0])
            engine = ("--pdf-engine=xelatex" if pandoc_version >= 2
                      else "--latex-engine=xelatex")
            pypandoc.convert_text(
                latex, format=input_format, to=ext[1:],
                outputfile=str(p),
                extra_args=["--standalone", "--column=80",
                            "--toc", "--toc-depth=2",
                            engine, "--variable=papersize:A4"])
def render_to_format(request, format, title, template_src, context):
    # for some weird reason we have to cast here explicitly
    format = str(format)
    title = str(title)

    if format in settings.EXPORT_FORMATS:
        # render the template to a html string
        template = get_template(template_src)
        html = template.render(context)

        # remove empty lines
        html = os.linesep.join([line for line in html.splitlines() if line.strip()])

        if format == 'html':
            # create the response object
            response = HttpResponse(html)
        else:
            if format == 'pdf':
                args = ['-V', 'geometry:margin=1in']
                content_disposition = 'filename=%s.%s' % (title, format)
            else:
                args = []
                content_disposition = 'attachment; filename=%s.%s' % (title, format)

            print(content_disposition)

            # create a temporary file
            (tmp_fd, tmp_filename) = mkstemp('.' + format)

            # convert the file using pandoc
            pypandoc.convert_text(html, format, format='html',
                                  outputfile=tmp_filename, extra_args=args)

            # read the temporary file
            file_handler = os.fdopen(tmp_fd, 'rb')
            file_content = file_handler.read()
            file_handler.close()

            # delete the temporary file
            os.remove(tmp_filename)

            # create the response object
            response = HttpResponse(file_content,
                                    content_type='application/%s' % format)
            response['Content-Disposition'] = content_disposition

        return response
    else:
        return HttpResponseBadRequest(_('This format is not supported.'))
def ChangeSpellDesc2MD():
    with open(json_file['spells']) as json_data:
        spells = json.load(json_data)

    for spell in spells:
        # print(spell)
        spell['desc'] = pypandoc.convert_text(
            spell['desc'], 'md', format='html', extra_args=['--wrap=none'])
        if 'higher_level' in spell:
            spell['higher_level'] = pypandoc.convert_text(
                spell['higher_level'], 'md', format='html',
                extra_args=['--wrap=none'])
        if 'material' in spell:
            spell['material'] = pypandoc.convert_text(
                spell['material'], 'md', format='html',
                extra_args=['--wrap=none'])

    with open(json_file['spells'], 'w') as outfile:
        json.dump(spells, outfile)
def parse(self, response):
    talk_ids = collections.defaultdict(list)
    for day in response.css('div.schedule__day.iframe_schedule_day'):
        curr_date = day.css('p.schedule__date::text').get()
        for r in day.css('div::attr(data-link)'):
            talk_ids[r.get()] = curr_date
    yield talk_ids

    for talk in response.css('div.details.uv-card__mask'):
        for session in talk.css('div.uv-card--session'):
            time_of_day = session.css(
                'span.session__time:nth-child(1)').xpath(
                    'normalize-space()').get()
            talk_id = talk.xpath('@id').get()
            desc = session.css('div.safe-description').get()
            try:
                desc_md = html2text(desc)
                desc = pypandoc.convert_text(desc_md, 'rst', format='md')
            except:
                pass
            yield {'title': session.xpath('string(.//h2)').get(),
                   'datetime': dateparser.parse('{date} {year} {tod}'.format(
                       date=talk_ids[talk_id], year=2016, tod=time_of_day)),
                   'description': desc,
                   'spearkers': session.css('''
                       div.session__speakers-box
                       div.uv-shortcard__title::text''').extract()}
def convert(self, text):
    text = '\n\n'.join([re.sub(self.regexCodeBlock, r'<pre>\1</pre>', block)
                        for block in text.split('\n\n')])

    # convert from textile to markdown
    text = pypandoc.convert_text(text, 'markdown_strict', format='textile')

    # pandoc does not convert everything, notably the [[link|text]] syntax
    # is not handled. So let's fix that.

    # [[ wikipage | link_text ]] -> [link_text](wikipage)
    text = re.sub(self.regexWikiLinkWithText, self.wiki_link, text,
                  flags=re.MULTILINE | re.DOTALL)

    # [[ link_url ]] -> [link_url](link_url)
    text = re.sub(self.regexWikiLinkWithoutText, self.wiki_link, text,
                  flags=re.MULTILINE | re.DOTALL)

    # nested lists, fix at least the common issues
    text = text.replace(" \\#\\*", " -")
    text = text.replace(" \\*\\#", " 1.")

    # Redmine is using '>' for blockquote, which is not textile
    text = text.replace("> ", ">")

    # wiki note macros
    text = re.sub(self.regexTipMacro, r'---\n**TIP**: \1\n---\n', text,
                  flags=re.MULTILINE | re.DOTALL)
    text = re.sub(self.regexNoteMacro, r'---\n**NOTE**: \1\n---\n', text,
                  flags=re.MULTILINE | re.DOTALL)
    text = re.sub(self.regexWarningMacro, r'---\n**WARNING**: \1\n---\n', text,
                  flags=re.MULTILINE | re.DOTALL)
    text = re.sub(self.regexImportantMacro, r'---\n**IMPORTANT**: \1\n---\n',
                  text, flags=re.MULTILINE | re.DOTALL)

    # all other macros
    text = re.sub(self.regexAnyMacro, r'\1', text, flags=re.MULTILINE | re.DOTALL)

    return text
def test_pdf_conversion(self):
    with closed_tempfile('.pdf') as file_name:
        ret = pypandoc.convert_text('#some title\n', to='pdf', format='md',
                                    outputfile=file_name)
        assert ret == ""
        with io.open(file_name, mode='rb') as f:
            written = f.read()
        assert written[:4] == b"%PDF"
    # TODO: find a test for the content?

    def f():
        # needs an outputfile
        pypandoc.convert_text('#some title\n', to='pdf', format='md')

    self.assertRaises(RuntimeError, f)

    def f():
        # outputfile needs to end in pdf
        with closed_tempfile('.WRONG') as file_name:
            pypandoc.convert_text('#some title\n', to='pdf', format='md',
                                  outputfile=file_name)

    self.assertRaises(RuntimeError, f)

    def f():
        # no extensions allowed
        with closed_tempfile('.pdf') as file_name:
            pypandoc.convert_text('#some title\n', to='pdf+somethign',
                                  format='md', outputfile=file_name)

    self.assertRaises(RuntimeError, f)
def render_markdown(value):
    """Render Markdown"""
    try:
        output = pypandoc.convert_text(value, to='html5', format='md',
                                       extra_args=['--mathjax'])
    except RuntimeError:
        output = value
    return output
def md2rst(comment):
    """Convert a comment from protobuf markdown to restructuredtext.

    This method:
    - Replaces proto links with literals (e.g. [Foo][bar.baz.Foo] -> `Foo`)
    - Resolves relative URLs to https://cloud.google.com
    - Runs pandoc to convert from markdown to restructuredtext
    """
    comment = _replace_proto_link(comment)
    comment = _replace_relative_link(comment)
    # Calling pypandoc.convert_text is slow, so we try to avoid it if there are
    # no special characters in the markdown.
    if any([i in comment for i in '`[]*_']):
        comment = pypandoc.convert_text(comment, 'rst', format='commonmark')
        # Comments are now valid restructuredtext, but there is a problem. They
        # are being inserted back into a descriptor set, and there is an
        # expectation that each line of a comment will begin with a space, to
        # separate it from the '//' that begins the comment. You would think
        # that we could ignore this detail, but it will cause formatting
        # problems down the line in gapic-generator because parsing code will
        # try to remove the leading space, affecting the indentation of lines
        # that actually do begin with a space, so we insert the additional
        # space now. Comments that are not processed by pypandoc will already
        # have a leading space, so should not be changed.
        comment = _insert_spaces(comment)
    return comment
def tokenize_block(source: str, pandoc_extra_args: list = None) -> list:
    """
    Convert a Jupyter output to Pandoc's JSON AST.
    """
    if pandoc_extra_args is None:
        pandoc_extra_args = []
    json_doc = pypandoc.convert_text(source, to='json', format='markdown',
                                     extra_args=pandoc_extra_args)
    return json.loads(json_doc)['blocks']
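# Hedged usage sketch for tokenize_block above; the sample Markdown string is
# illustrative only, and it assumes pypandoc plus a working pandoc binary.
blocks = tokenize_block("# Title\n\nSome *emphasised* text.")
for block in blocks:
    print(block["t"])  # pandoc AST node types, e.g. 'Header', 'Para'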
def test_basic_conversion_from_string(self):
    expected = u'some title{0}=========={0}{0}'.format(os.linesep)
    received = pypandoc.convert('#some title', 'rst', format='md')
    self.assertEqualExceptForNewlineEnd(expected, received)

    expected = u'some title{0}=========={0}{0}'.format(os.linesep)
    received = pypandoc.convert_text('#some title', 'rst', format='md')
    self.assertEqualExceptForNewlineEnd(expected, received)
def test_convert_text_with_existing_file(self):
    with closed_tempfile('.md', text='#some title\n') as file_name:
        received = pypandoc.convert_text(file_name, 'rst', format='md')
        self.assertTrue("title" not in received)

        # The following is a problematic case
        received = pypandoc.convert(file_name, 'rst', format='md')
        self.assertTrue("title" in received)
def text_decode(text):
    if re.search(r'\\u', text):
        body = fix_arnaud_post(text)
    elif is_html(text):
        text = escape_special_characters(text)
        body = pypandoc.convert_text(text, 'markdown_strict', format='html')
    else:
        body = text
    return body
def create(self, variables, md_output, pdf_output):
    env = Environment(loader=PackageLoader('qanta', 'reporting/templates'))
    template = env.get_template(self.template)
    markdown = template.render(variables)
    if md_output is not None:
        with open(md_output, 'w') as f:
            f.write(markdown)
    try:
        import pypandoc
        pypandoc.convert_text(
            markdown, 'pdf', format='md', outputfile=pdf_output,
            extra_args=['-V', 'geometry:margin=.75in']
        )
    except Exception as e:
        log.warn('Pandoc was not installed or there was an error calling it, omitting PDF report')
        log.warn(str(e))
def save_url(chapter, title, url):
    file_name = '{}.tex'.format(title.replace('/', '\\').replace(':', ' -'))
    path = pathlib.Path(os.path.join('content', chapter, 'images'))
    path.mkdir(parents=True, exist_ok=True)
    p = mercury.parse(url)
    html = save_images(p.content, path)
    content = pypandoc.convert_text(html, 'tex', format='html')
    write_content(path.parent.joinpath(file_name), content)
def main():
    if len(sys.argv) <= 1:
        sys.exit("Please supply a filename")

    input_format = "markdown"
    output = common_md()

    print()
    for arg in sys.argv[1:]:
        p = Path(arg).resolve()
        print(f"Generating: {p}")
        ext = p.suffix
        if ext == ".md":
            p.write_text(output)
        elif ext == ".html":
            pypandoc.convert_text(
                output, format=input_format, to="html5",
                outputfile=str(p),
                extra_args=["--standalone", "--self-contained",
                            "--toc", "--toc-depth=2",
                            "--css=" + str(TEMPLATE_DIR / "docs.css")])
        elif ext == ".pdf" or ext == ".tex":
            latex = Path(TEMPLATE_DIR / "latex_preamble.md").read_text()
            latex += output
            pypandoc.convert_text(
                latex, format=input_format, to=ext[1:],
                outputfile=str(p),
                extra_args=["--standalone", "--column=80",
                            "--toc", "--toc-depth=2",
                            "--latex-engine=xelatex",
                            "--variable=papersize:A4"])
def read_long_description():
    try:
        import pypandoc
        with open("README.md") as f:
            text = f.read()
        # Remove screenshots as they get rendered poorly on PyPi
        stripped_text = text[:text.index("# Screenshots")].rstrip()
        return pypandoc.convert_text(stripped_text, 'rst', format='md')
    except:
        return ""
def test_basic_pypandoc_example(self):
    """
    This test is testing a basic pypandoc function call.
    """
    pypandoc_result = pypandoc.convert_text(
        '- *foo* bar', 'html5', format='org')
    expected_html5_result = '<ul>\n<li><strong>foo</strong> bar</li>\n</ul>\n'

    self.assertEqual(
        Utils.normalize_lineendings(pypandoc_result),
        Utils.normalize_lineendings(expected_html5_result))
def test_pypandoc_with_umlauts(self):
    """
    This test is testing umlauts and charset handling with pypandoc.
    """
    pypandoc_result = pypandoc.convert_text(
        'This is an umlaut test: öÄ߀', 'html5', format='org',
        encoding='utf-8')
    expected_html5_result = '<p>This is an umlaut test: öÄ߀</p>\n'

    # FIXXME: Umlaut conversion does have encoding issues.
    self.assertEqual(Utils.normalize_lineendings(pypandoc_result),
                     Utils.normalize_lineendings(expected_html5_result))
def _init_settings():
    import yaml

    def adjust_path(loader, node):
        return os.path.join(BASE_DIR, loader.construct_scalar(node))

    yaml.add_constructor('!path', adjust_path)

    configuration_files = ('settings.yml', 'static/settings.yml', 'local_settings.yml')
    for filename in configuration_files:
        with open(os.path.join(BASE_DIR, 'lerna', filename), encoding='utf-8-sig') as f:
            for yml_key, yml_data in yaml.load(f).items():
                if yml_key == 'PREPEND':
                    for key, value in yml_data.items():
                        globals()[key] = value + globals()[key]
                elif yml_key == 'APPEND':
                    for key, value in yml_data.items():
                        globals()[key] += value
                elif yml_key == 'OVERRIDE':
                    for cnf_name, sub_data in yml_data.items():
                        cnf = globals()[cnf_name]
                        for key, value in sub_data.items():
                            cnf[key] = value
                else:
                    globals()[yml_key] = yml_data

    # TODO: Log every failure.
    try:
        import pypandoc as pd
    except ImportError:
        pass
    else:
        try:
            pd.get_pandoc_version()
        except OSError:
            pass
        else:
            output = pd.convert_text('', 'html', format='latex')
            if output not in ('', '\n'):
                raise Exception('pandoc is found, but has not passed a sample test (%r)' % output)

            def check_filter(f):
                try:
                    pd.convert_text('', 'html', format='latex', filters=[f])
                    return True
                except RuntimeError:
                    return False

            PANDOC['REQUIRED'] = True
            PANDOC['FILTERS'] = list(filter(check_filter, PANDOC['FILTERS']))
def rst_to_notebook(infile, outfile):
    """Convert an rst file to a notebook file."""
    # Read infile into a string
    with open(infile, 'r') as fin:
        rststr = fin.read()
    # Convert string from rst to markdown
    mdfmt = 'markdown_github+tex_math_dollars+fenced_code_attributes'
    mdstr = pypandoc.convert_text(rststr, mdfmt, format='rst',
                                  extra_args=['--atx-headers'])
    # In links, replace .py extensions with .ipynb
    mdstr = re.sub(r'\(([^\)]+).py\)', r'(\1.ipynb)', mdstr)
    # Enclose the markdown within triple quotes and convert from
    # python to notebook
    mdstr = '"""' + mdstr + '"""'
    nb = py2jn.py_string_to_notebook(mdstr)
    py2jn.tools.write_notebook(nb, outfile, nbver=4)
def exercise(src):
    # import pdb; pdb.set_trace()
    d, p, *_ = html.fragments_fromstring(src)
    # title = d.attrib['data-title']
    title = d.find('h1').text_content().strip().replace("Exercise: ", "")
    question = convert_text(p.text, "latex", format="markdown")
    tpl = dedent('''\
        ---
        \\begin{{Exercise}}[title={{{title}}}]
        {question}
        \\end{{Exercise}}
    ''').format(title=title, question=question)
    return tpl
def convert_md_2_rst_process(filename_root):
    filename_source = filename_root + ".md"
    filename_target = filename_root + ".rst"

    # using convert_text
    print 'Converting', os.path.basename(filename_source), 'to', os.path.basename(filename_target)

    file_source = open(filename_source)
    lines = file_source.readlines()
    file_source.close()
    data = '\n'.join(lines)
    data = data.encode('utf-8')
    data = pypandoc.convert_text(data, 'rst', format='md')

    file_target = open(filename_target, "w")
    file_target.write(data)
    file_target.flush()
    file_target.close()
    # shutil.move(filename_target, os.path.abspath('.') + '/source/' + os.path.basename(filename_target))

    # using convert_file
    """
def copy_md2rst(infile, outfile):
    # Read infile
    try:
        with open(infile) as f:
            text = f.read()
    except:
        text = ''
    # Strip top of file
    try:
        text = text[text.index('## Introduction') + 1:]
    except:
        pass
    # Write to outfile
    with open(outfile, 'w') as f:
        f.write(pypandoc.convert_text(text, 'rst',
                                      format='md').replace('\r\n', '\n'))
def pandoc_process(app, what, name, obj, options, lines):
    """Convert docstrings in Markdown into reStructuredText using pandoc."""
    if not lines:
        return None

    input_format = app.config.mkdsupport_use_parser
    output_format = 'rst'

    # Since the default encoding for sphinx.ext.autodoc is unicode and
    # pypandoc.convert_text, which will always return a unicode string,
    # expects a unicode or utf-8 encoded string, there is no need for
    # dealing with encodings here
    text = SEP.join(lines)
    text = pypandoc.convert_text(text, output_format, format=input_format)

    # The 'lines' in Sphinx is a list of strings and the value should be changed
    del lines[:]
    lines.extend(text.split(SEP))
def renderer(self, text):
    """
    Renders a flat page to HTML.

    :param text: the text of the flat page
    :type text: string
    """
    # if type(text) == str:
    #     text = str(text, self.app.config["FLATPAGES_ENCODING"])
    if self.pre_render:
        text = render_template_string(Markup(text))

    extra_args = [
        "--filter=pandoc-crossref",
        "--filter=pandoc-citeproc",
        "--filter=pandoc-sidenote",
        "--standalone",
        "--mathml",
        "--base-header-level=2",
        "--highlight-style", "pygments",
        "--bibliography=pages/all.bib",
        "--csl=pages/lncs.csl",
        "-Mreference-section-title=References",
        "-Mlink-citations=true"
    ]
    pandocver = int(pypandoc.get_pandoc_version()[0])
    if pandocver < 2:
        extra_args.append("-S")
        format_str = "markdown+raw_tex+yaml_metadata_block"
    else:
        format_str = "markdown+raw_tex+smart+yaml_metadata_block"

    output = pypandoc.convert_text(
        text.encode("utf8"),
        'html',
        format=format_str,
        extra_args=extra_args
    )
    return output
def convert(source: str, to: str, extra_args=(), output_file: str = None) -> None:
    """
    Convert a source document to an output file.

    Parameters
    ----------
    source : str
    to : str
    extra_args : iterable
    output_file : str

    Notes
    -----
    Either writes to ``output_file`` or prints to stdout.
    """
    output_name = (os.path.splitext(os.path.basename(output_file))[0]
                   if output_file is not None
                   else 'std_out')
    standalone = '--standalone' in extra_args
    self_contained = '--self-contained' in extra_args
    use_prompt = '--use-prompt' in extra_args
    extra_args = [item for item in extra_args if item != '--use-prompt']
    stitcher = Stitch(name=output_name, to=to, standalone=standalone,
                      self_contained=self_contained, use_prompt=use_prompt)
    result = stitcher.stitch(source)
    result = json.dumps(result)
    newdoc = pypandoc.convert_text(result, to, format='json',
                                   extra_args=extra_args,
                                   outputfile=output_file)
    if output_file is None:
        print(newdoc)
def convert(ctx, name, destination_format, destination_file, list_docs, formats):
    """Convert to destination_format and print to stdout or save to file if provided."""
    # yew = ctx.obj["YEW"]
    if formats or not destination_format:
        formats = pypandoc.get_pandoc_formats()
        click.echo("Input formats:")
        for f in formats[0]:
            click.echo("\t" + f)
        click.echo("Output formats:")
        for f in formats[1]:
            click.echo("\t" + f)
        sys.exit(0)

    docs = shared.get_document_selection(ctx, name, list_docs)
    if not docs:
        sys.exit(1)

    doc = docs[0]
    click.echo(doc.name)
    click.echo(doc.kind)
    click.echo(destination_format)
    if destination_format in ["docx", "pdf", "odt"]:
        destination_file = "{}.{}".format(slugify(doc.name), destination_format)
    if destination_file:
        dest = pypandoc.convert(
            doc.get_content(),
            format=doc.kind,
            to=destination_format,
            outputfile=destination_file,
        )
        click.echo(destination_file)
    else:
        dest = pypandoc.convert_text(doc.get_content(),
                                     format=doc.kind,
                                     to=destination_format)
        click.echo(dest)
    sys.stdout.flush()
def __call__(self, fh, fh_w):
    try:
        cal = Calendar.from_ical(fh.read())
    except ValueError as e:
        msg = "Parsing error: {}".format(e)
        raise IcalError(msg)

    now = datetime.now()
    start = now - timedelta(days=self.days)
    end = now + timedelta(days=self.days)
    events = recurring_ical_events.of(cal).between(start, end)

    for event in tqdm(events):
        summary = event["SUMMARY"]
        summary = summary.replace('\\,', ',')
        location = None
        if event.get("LOCATION", None):
            location = event['LOCATION'].replace('\\,', ',')
        if not any((summary, location)):
            summary = u"(No title)"
        else:
            summary += " - " + location if location and self.include_location else ''
        fh_w.write(u"* {}".format(summary))
        fh_w.write(u"\n")
        if isinstance(event["DTSTART"].dt, datetime):
            fh_w.write(u" {}--{}\n".format(
                org_datetime(event["DTSTART"].dt, self.tz),
                org_datetime(event["DTEND"].dt, self.tz)))
        else:
            # all day event
            fh_w.write(u" {}--{}\n".format(
                org_date(event["DTSTART"].dt, timezone('UTC')),
                org_date(event["DTEND"].dt - timedelta(days=1), timezone('UTC'))))
        description = event.get("DESCRIPTION", None)
        if description:
            if bool(BeautifulSoup(description, "html.parser").find()):
                description = pypandoc.convert_text(description, "org",
                                                    format="html")
            description = '\n'.join(description.split('\\n'))
            description = description.replace('\\,', ',')
            fh_w.write(u"{}\n".format(description))
        fh_w.write(u"\n")
def convert(self, text):
    text = '\n\n'.join([re.sub(self.regexCodeBlock, r'<pre>\1</pre>', block)
                        for block in text.split('\n\n')])

    # convert from textile to markdown
    text = pypandoc.convert_text(text, 'markdown_strict', format='textile')

    # gitlab does not support escaped underscores in a url (???)
    text = re.sub(self.regexHttpLink, self.unescape_link_underscore, text)

    # if the markdown starts with a code block, gitlab will trim the start of the string
    if text[0:4] == '    ':
        text = "Codeblock:\n\n" + text

    # pandoc does not convert everything, notably the [[link|text]] syntax
    # is not handled. So let's fix that.

    # [[ wikipage | link_text ]] -> [link_text](wikipage)
    text = re.sub(self.regexWikiLinkWithText, self.wiki_link, text,
                  flags=re.MULTILINE | re.DOTALL)

    # [[ link_url ]] -> [link_url](link_url)
    text = re.sub(self.regexWikiLinkWithoutText, self.wiki_link, text,
                  flags=re.MULTILINE | re.DOTALL)

    # nested lists, fix at least the common issues
    text = text.replace(" \\#\\*", " -")
    text = text.replace(" \\*\\#", " 1.")

    # Redmine is using '>' for blockquote, which is not textile
    text = text.replace("> ", ">")

    # wiki note macros
    text = re.sub(self.regexTipMacro, r'---\n**TIP**: \1\n---\n', text,
                  flags=re.MULTILINE | re.DOTALL)
    text = re.sub(self.regexNoteMacro, r'---\n**NOTE**: \1\n---\n', text,
                  flags=re.MULTILINE | re.DOTALL)
    text = re.sub(self.regexWarningMacro, r'---\n**WARNING**: \1\n---\n', text,
                  flags=re.MULTILINE | re.DOTALL)
    text = re.sub(self.regexImportantMacro, r'---\n**IMPORTANT**: \1\n---\n',
                  text, flags=re.MULTILINE | re.DOTALL)

    # all other macros
    text = re.sub(self.regexAnyMacro, r'\1', text, flags=re.MULTILINE | re.DOTALL)

    return text
def build_shell_dict(self):
    """
    This will house all values the templates need.

    :return:
    """
    shell_dict = {
        'id': self.json_file['name'].lower().split(' ')[0],
        'name': Template('GATK4 AUTO $name').substitute(self.json_file),
        'short_name': self.json_file['name'].split(' ')[0],
        'profile': self.profile,
        'description': self.json_file['summary'].rstrip(' '),
        'summary': pypandoc.convert_text(self.json_file['description'],
                                         'rst', format='html')
    }
    return shell_dict
def with_markdown(content, space, name):
    """Use pandoc to get markdown from MediaWiki format."""
    try:
        json_converted = pypandoc.convert_text(content, 'json',
                                               format='mediawiki')
        stream = io.StringIO(json_converted)
        traversable_doc = panflute.load(stream)
        panflute.run_filter(drop_loose_categories, doc=traversable_doc)
        panflute.run_filter(rewrite_internal_links, doc=traversable_doc)
        content = back_to_markdown(traversable_doc)
    except Exception:
        click.echo('Failed to parse content! Continuing ...\n')
        with open(FAILURE_LOG, 'a') as handle:
            handle.write(('Failed to parse content. Could not re-write links '
                          'and drop categories for page {}\n'.format(name)))
    return convert_image_format(content)
def rst_to_notebook(infile, outfile, diridx=False):
    """Convert an rst file to a notebook file."""
    # Read infile into a string
    with open(infile, 'r') as fin:
        rststr = fin.read()
    # Convert string from rst to markdown
    mdfmt = 'markdown_github+tex_math_dollars+fenced_code_attributes'
    mdstr = pypandoc.convert_text(rststr, mdfmt, format='rst',
                                  extra_args=['--atx-headers'])
    # In links, replace .py extensions with .ipynb
    mdstr = re.sub(r'\(([^\)]+).py\)', r'(\1.ipynb)', mdstr)
    # Links to subdirectories require explicit index file inclusion
    if diridx:
        mdstr = re.sub(r']\(([^\)/]+)\)', r'](\1/index.ipynb)', mdstr)
    # Enclose the markdown within triple quotes and convert from
    # python to notebook
    mdstr = '"""' + mdstr + '"""'
    nb = py2jn.py_string_to_notebook(mdstr)
    py2jn.tools.write_notebook(nb, outfile, nbver=4)
def show_dataobj(dataobj_id):
    dataobj = data.get_item(dataobj_id)
    if not dataobj:
        flash("Data could not be found!")
        return redirect("/")

    if request.args.get("raw") == "1":
        return frontmatter.dumps(dataobj)

    extra_pandoc_args = ["--highlight-style=" +
                         app.config['PANDOC_HIGHLIGHT_THEME'],
                         "--standalone"]
    content = pypandoc.convert_text(dataobj.content, 'html', format='md',
                                    extra_args=extra_pandoc_args)

    return render_template(
        "dataobjs/show.html",
        title=dataobj["title"],
        dataobj=dataobj,
        content=content,
        form=forms.DeleteDataForm())
def test_basic_conversion_to_file(self):
    with closed_tempfile('.rst') as file_name:
        expected = u'some title{0}=========={0}{0}'.format(os.linesep)
        received = pypandoc.convert_text('# some title\n', to='rst',
                                         format='md', outputfile=file_name)
        self.assertEqualExceptForNewlineEnd("", received)
        with io.open(file_name) as f:
            written = f.read()
        self.assertEqualExceptForNewlineEnd(expected, written)

    # to odf does not work without a file
    def f():
        pypandoc.convert_text('# some title\n', to='odf', format='md',
                              outputfile=None)

    with self.assertRaisesRegex(RuntimeError, "Invalid output format! Got odf but "):
        f()
def test_pdf_conversion(self):
    with closed_tempfile('.pdf') as file_name:
        ret = pypandoc.convert_text('# some title\n', to='pdf', format='md',
                                    outputfile=file_name)
        assert ret == ""
        with io.open(file_name, mode='rb') as f:
            written = f.read()
        assert written[:4] == b"%PDF"
    # TODO: find a test for the content?

    def f():
        # needs an outputfile
        pypandoc.convert_text('# some title\n', to='pdf', format='md')

    self.assertRaises(RuntimeError, f)

    # outputfile needs to end in pdf
    with closed_tempfile('.WRONG') as file_name:
        def f():
            pypandoc.convert_text('# some title\n', to='pdf', format='md',
                                  outputfile=file_name)

        self.assertRaises(RuntimeError, f)

    # no extensions allowed
    with closed_tempfile('.pdf') as file_name:
        def f():
            pypandoc.convert_text('# some title\n', to='pdf+somethign',
                                  format='md', outputfile=file_name)

        self.assertRaises(RuntimeError, f)
def markdown_to_reveal(text: str, config: Config) -> str:
    """
    Transform a Markdown input file to an HTML (reveal.js) output string.

    Parameters
    ----------
    text
        Markdown text to convert to HTML.
    config
        Markdownreveal configuration.

    Returns
    -------
    The converted string.
    """
    extra_args = [
        '-s',
        '--slide-level=2',
        '-V', 'revealjs-url=revealjs',
    ]
    if config['katex']:
        pandoc_version = get_pandoc_version()
        if LooseVersion(pandoc_version) < LooseVersion('2.0'):
            extra_args.extend([
                '--katex=katex/katex.min.js',
                '--katex-stylesheet=katex/katex.min.css',
            ])
        else:
            extra_args.extend(['--katex=katex/'])
    extra_args.extend(pandoc_extra_to_args(config))
    extra_args.extend(reveal_extra_to_args(config))
    input_format = 'markdown'
    if config['emoji_codes']:
        input_format += '+emoji'
    output = convert_text(
        source=text,
        format=input_format,
        to='revealjs',
        extra_args=extra_args,
    )

    # HTML substitution
    output = tweak_html(output, config)

    return output
def refresh_page(self):
    """Convert markdown to html and set webView"""
    parsed_stylesheet = parse_stylesheet(get_resource('ViewPaneStyle.css'),
                                         CONSTANTS.theme)

    # Write parsed stylesheet to file so it can be passed to pandoc
    with open(get_resource("parsed_stylesheet.css"), "w") as file:
        file.write(parsed_stylesheet)

    # Convert markdown to html using pandoc
    html = pypandoc.convert_text(
        self.edit_pane.toPlainText(), "html", format="markdown",
        extra_args=[
            f"--highlight-style={get_resource('syntax.theme')}",
            "-s",
            f"--css={get_resource('parsed_stylesheet.css')}",
            f"--katex={get_resource('katex/')}"
        ])

    self.setHtml(html, QtCore.QUrl().fromLocalFile(self.edit_pane.current_file))
def gen():
    for name, option in options:
        default = option.get("default")
        if default is not None:
            default = json.dumps(default)

        example = option.get("example")
        if example is not None:
            if type(example) == dict and example.get("_type") == "literalExample":
                example = json.dumps(example["text"])
            else:
                example = json.dumps(example)

        description = option.get("description")
        if description is not None:
            xml_description = (
                f'<xml xmlns:xlink="http://www.w3.org/1999/xlink">'
                f"<para>{description}</para>"
                f"</xml>"
            )
            # we first check if there are some xml elements before using pypandoc
            # since pypandoc calls are quite slow
            root = xml.etree.ElementTree.fromstring(xml_description)
            if len(list(root.find("para"))) > 0:
                description = pypandoc.convert_text(
                    xml_description,
                    "html",
                    format="docbook",
                )

        yield dict(
            type="option",
            option_name=name,
            option_name_query=parse_query(name),
            option_description=description,
            option_type=option.get("type"),
            option_default=default,
            option_example=example,
            option_source=option.get("declarations", [None])[0],
        )
def handle_law_from_xml(self, book, book_xml) -> LawBook:
    previous_law = None
    law_order = 1

    # Parse XML tree
    tree = etree.fromstring(book_xml)

    for sect in tree.xpath('sect1'):
        section_title = sect.xpath('title/text()')[0]
        logger.debug('Section: %s' % section_title)

        # if section_title == 'Grundgesetz für die Bundesrepublik Deutschland':
        #     continue

        book.add_section(from_order=law_order, title=section_title.strip())

        for law_key, law_raw in enumerate(sect.xpath('sect2')):
            law_title = law_raw.xpath('title')[0]
            law_title.getparent().remove(law_title)

            # law_docbook = tostring(law_raw).decode('utf-8')
            law_docbook = '\n'.join(tostring(x).decode('utf-8')
                                    for x in law_raw.iterchildren())
            law_text = pypandoc.convert_text(law_docbook, 'html',
                                             format='docbook')
            law_section = tostring(law_title, method="text").decode('utf-8').strip()

            law = Law(book=book,
                      title='',
                      section=law_section,
                      slug=slugify(law_section),
                      content=law_text,
                      previous=previous_law,
                      order=law_order)
            law.save()

            law_order += 1
            previous_law = law

    return book
def convert_issue_data(self, redmine_issue):
    """ Generate the data for a new GitHub issue """
    description_md = convert_text(
        redmine_issue['description'], 'markdown_github', 'textile'
    )
    porting_note = '###### ported from Redmine #%s (created %s)' % (
        redmine_issue['id'],
        redmine_issue['created_on'].split('T')[0]
    )
    if self.is_closed(redmine_issue):
        porting_note = '%s (CLOSED %s)' % (
            porting_note,
            redmine_issue['closed_on'].split('T')[0]
        )
    body = "%s\n\n%s" % (porting_note, description_md)
    title = "%(subject)s (RM#%(id)s)" % redmine_issue
    return {
        "title": title,
        "body": body,
        "assignees": ["adam-iris"],
    }
def pdf(data, check_type):
    """Generate a PDF for the given document and return its file path."""
    rendered = _pdf_string(data, check_type)
    document_id = data['id']
    outputfile = os.path.join(app.config['MEDIA_FOLDER'],
                              f'{document_id}_{check_type}.pdf')
    pdf = pypandoc.convert_text(rendered, 'pdf', format='html',
                                outputfile=outputfile,
                                extra_args=[
                                    '--latex-engine=xelatex',
                                    '-V', 'mainfont="FreeSerifBold"'
                                ])
    return outputfile
def sendEmail(display_config_params, coordinates, address, to, sending_email,
              message=None):
    # TODO
    yag = yagmail.SMTP(sending_email,
                       oauth2_file="~/.config/bounds/gm_oauth2.json")
    if message is None:
        contents = [
            pypandoc.convert_text(
                'Sent with [bounds](https://github.com/hdb/bounds)',
                to='html', format='md')
        ]
    else:
        contents = message

    img_file = '/tmp/folium.png'
    display(*display_config_params, coordinates, img_file)

    yag.send(to, address, contents, attachments=img_file)
def make_cloze_roles(self):
    """
    Pandoc *sometimes* inserts an escape before my backticks, so I'm catching
    only those escaped backticks that happen in the cloze translations.
    It also randomly inserts newlines, which is a dilemma. For now I will
    simply remove them, since the cloze text is intended to be short.
    """
    def clozerepl(matchobj):
        return ':c' + matchobj.group(1) + ':`' + matchobj.group(2) + 'removeme`'

    t0 = re.sub(r'{{c([0-9]+)::(.+?)}}', clozerepl, self.text)
    clozeIDs = [r[1] for r in set(re.findall('(:)(c[0-9]+)(:)', t0))]
    t1 = pypandoc.convert_text(t0, 'rst', format='html',
                               extra_args=['--wrap=preserve'])
    t2 = re.sub(r'removeme\\*', '', t1)
    text = re.sub('(?<![\n])(\r?\n)(?![\n])', ' ', t2)
    roles = '\n'.join(['.. role:: ' + c + '(emphasis)' for c in sorted(clozeIDs)])
    return (roles, text.strip())
def previewNewsItem(self, item):
    # 1. Get first lines from markdown content that almost fit 400 characters.
    # 2. Append `...` at the end to signify `more`
    # 3. Append links section of markdown content
    # 4. Convert resulting content to HTML
    lines = item.contents.splitlines(True)

    # Compose preview section.
    preview = ""
    for ln in lines:
        preview += ln
        if (len(preview) >= self.PREVIEW_LIMIT):
            break
    preview += "..."

    # Compose links section.
    links = ""
    for ln in lines:
        if re.match(r"\[.+\]:.+", ln):
            links += ln

    contents = preview + "\n\n\n" + links

    html = pypandoc.convert_text(contents, "html", format="md")
    html = html.encode("utf-8")
    return html
def newsExcerpt(article):
    # 1. Get first lines from markdown content that almost fit 400 characters.
    # 2. Append `...` at the end to signify `more`
    # 3. Append links section of markdown content
    # 4. Convert resulting content to HTML, this is an excerpt
    charsLimit = 250

    lines = article.contents.splitlines(True)

    # Compose excerpt section.
    excerpt = ""
    for ln in lines:
        excerpt += ln
        if (len(excerpt) >= charsLimit):
            break
    excerpt += "..."

    # Compose links section.
    links = ""
    for ln in lines:
        if re.match(r"\[.+\]:.+", ln):
            links += ln

    content = excerpt + "\n\n\n" + links

    html = pypandoc.convert_text(content, "html", format="md")
    return html
def _convert_md_table_to_rst(table):
    """Convert a markdown table to rst format"""
    if len(table) < 3:
        return ''
    out = '```eval_rst\n.. list-table::\n   :header-rows: 1\n\n'
    for i, l in enumerate(table):
        cols = l.split('|')[1:-1]
        if i == 0:
            ncol = len(cols)
        else:
            if len(cols) != ncol:
                return ''
        if i == 1:
            for c in cols:
                if len(c) != 0 and '---' not in c:
                    return ''
        else:
            for j, c in enumerate(cols):
                out += '   * - ' if j == 0 else '     - '
                out += pypandoc.convert_text(c, 'rst', format='md').replace(
                    '\n', ' ').replace('\r', '') + '\n'
    out += '```\n'
    return out
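# Hedged usage sketch for _convert_md_table_to_rst above; the sample table is
# illustrative only and requires pypandoc plus a pandoc binary to run.
sample_table = [
    '| Name | Value |',
    '|------|-------|',
    '| foo  | 1     |',
]
print(_convert_md_table_to_rst(sample_table))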
def getfeed(mastodon, limit=10):
    toots = mastodon.timeline_home(limit=limit)
    formatted = []
    for t in reversed(toots):
        if t['reblog'] is not None:
            continue
        try:
            toot = pypandoc.convert_text(t['content'], 'plain', format='html')
            toot = '\n'.join(textwrap.wrap(toot, 80))
            toot = textwrap.indent(toot, ' | ')
            formatted += [{
                'content': toot,
                'author': t['account']['display_name'],
                'timestamp': t['created_at'],
            }]
        except KeyError as e:
            print(t.keys())
    return formatted
def download_wikis(fb):
    resp = fb.listWikis()
    for wiki in resp.wikis.childGenerator():
        wiki_id = wiki.ixWiki.string
        wiki_name = wiki.sWiki.string
        print(wiki_id, wiki_name)

        # Create a subdirectory with the name of the wiki
        if not os.path.exists(wiki_name):
            os.mkdir(wiki_name)

        article_ids = get_article_ids(fb, wiki_id)
        for article_id in article_ids:
            article = fb.viewArticle(ixWikiPage=article_id)
            headline = article.wikipage.sHeadline.string
            body = article.wikipage.sBody.string
            print(headline)

            filename = headline.replace('/', '') + '.html'
            path = os.path.join(wiki_name, filename)

            # Block for just writing out HTML
            if False:
                with open(path, 'w') as f:
                    try:
                        f.write(body)
                    except:
                        print("Unable to write {} - {}".format(wiki_name, headline))

            # Convert to markdown and write
            try:
                output = pypandoc.convert_text(body, to='md', format='html',
                                               outputfile=path)
            except:
                print("Unable to write {} - {}".format(wiki_name, headline))
def fill_notebook(work_notebook, script_blocks, gallery_conf):
    """Writes the Jupyter notebook cells

    If available, uses pypandoc to convert rst to markdown.

    Parameters
    ----------
    script_blocks : list
        Each list element should be a tuple of (label, content, lineno).
    """
    for blabel, bcontent, lineno in script_blocks:
        if blabel == 'code':
            add_code_cell(work_notebook, bcontent)
        else:
            if gallery_conf["pypandoc"] is False:
                markdown = rst2md(bcontent + '\n')
            else:
                import pypandoc
                # pandoc automatically adds \n to the end
                markdown = pypandoc.convert_text(
                    bcontent, to='md', format='rst',
                    **gallery_conf["pypandoc"])
            add_markdown_cell(work_notebook, markdown)
def html(self, pandoc=False):
    """Returns the note formatted as HTML.

    Will use markdown2 as default, with the option of pandoc (WIP)."""
    # LOG.debug(f"Converting {self.title} into HTML...")
    if pandoc:
        # Still WIP
        import pypandoc
        filters = ['pandoc-xnos']
        args = []
        html = pypandoc.convert_text(self.content, 'html', format='md',
                                     filters=filters, extra_args=args)
    else:
        html = render_markdown(self.content)

    # Wrapping converted markdown in a div for styling
    html = f"<div id=\"content\">{html}</div>"
    # LOG.debug(f"{self.title} converted into HTML and placed inside div with id=\"content\"")
    return html
def make_tex(metadata_dict, markdown_text):
    metadata = yaml.dump(metadata_dict, default_flow_style=False)
    markdown_text = '---\n' + metadata + '\n---\n\n' + markdown_text
    latex_text = convert_text(
        source=markdown_text,
        to='latex',
        format='markdown',
        extra_args=(
            '--natbib',
            '--bibliography', 'refs.bib',
            '--template', os.path.join(os.getcwd(), 'assets', 'template.latex'),
            # Variables
            '-V', 'documentclass:report',
            '-V', 'classoption:a4paper',
            # Filters
            '--filter', 'pandoc-crossref',
        )
    )
    latex_text = latex_text.replace(
        '\\begin{table}[]', '\\begin{table}[htpb]\n\\centering')
    return latex_text
def md2gopher(self, md):
    # move links below the current block
    md = pypandoc.convert_text(md, "md", format="md", extra_args=[
        "--wrap=preserve",
        "--reference-links",
        "--reference-location=block"
    ])

    # make links into actual links
    # also try to repair Gopher lines corrupted by pandoc
    entries = list()
    for line in md.splitlines():
        match = MARKDOWN_LINK_PATTERN.match(line)
        if match:
            entries.append(self.gopher_menu.html(*match.groups()))
            continue
        match = CORRUPTED_LINE_PATTERN.match(line)
        if match:
            entries.append(self.gopher_menu.entry(*match.groups()))
            continue
        entries.append(line)

    # remove remaining HTML tags and make it a gophermap
    return self.html2gopher("\n".join(entries))
def as_github_issue(self):
    import pypandoc
    is_bug = self.type == 'BUG'
    front_matter = f"""
| Reported by | URL | OS | Browser | Device type | Bug happened at |
|----------------------|------------|-----------|----------------|---------------|-----------------|
| {self.user.username} | {self.url} | {self.os} | {self.browser} | {self.device} | {self.happened} |
"""
    return {
        'title': self.title or self.content[0:30],
        'body': (front_matter.strip() if is_bug else '') +
                pypandoc.convert_text(self.content, 'markdown_github',
                                      format='html'),
        'assignees': ['ewen-lbh'],
        'labels': [
            'lang:' + self.language,
            {
                'FEATURE': 'enhancement',
                'BUG': 'bug'
            }[self.type],
            'from:schoolsyst.com'
        ]
    }
def generate_provider_pdf(url, filename, s=None):
    s = rh.HTMLSession() if not s else s
    r1 = s.get(url)
    html = ""
    anchors = r1.html.find('.nav-visible a')
    links = [a.absolute_links.pop() for a in anchors]
    # filter out links that are not data or resource pages
    links = filter(lambda href: href.find('/r/') != -1 or href.find('/d/') != -1, links)

    print("downloading...")
    for l1 in links:
        r2 = s.get(l1)
        # r2.html.render()
        div = r2.html.find('#inner', first=True)
        # with open("/tmp/b.html", "wt") as f:
        #     f.write(content.html)
        if div:
            html += div.html

    print("generating pdf...")
    try:
        output = pypandoc.convert_text(html, "pdf", format="html",
                                       outputfile="./{}.pdf".format(filename),
                                       extra_args=['--pdf-engine=xelatex'])
    except Exception as e:
        print(e)
def twlight_wikicode2html(value):
    """Passes string through pandoc and returns html"""
    output = pypandoc.convert_text(value, 'html', format='mediawiki')
    return output
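# Hedged usage sketch for twlight_wikicode2html above; the sample wikitext is
# illustrative only and assumes pypandoc plus a pandoc binary are installed.
html = twlight_wikicode2html("== Heading ==\n'''bold''' and ''italic'' text")
print(html)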
def append_markdown(self, markdown, metadata):
    markdown = _ensure_string(markdown)
    html = pypandoc.convert_text(markdown, 'html', format='md')
    # ignore metadata, not supported.
    self._output.write(self._create_tag('text', html) + "\n")