Esempio n. 1
0
def run(args):
    """Extract rule example code blocks from CppCoreGuidelines.md to JSON.

    Args:
        args: command-line arguments; args[0] is the guidelines markdown
            path, args[1] is the output JSON path.

    Returns:
        -1 on bad usage; otherwise None after writing the output file.
    """
    if not args or len(args) < 2:
        print('Usage: rexex.py <path of CppCoreGuidelines.md> <output file>',
              file=sys.stderr)
        return -1

    input_file = args[0]
    output_file = args[1]
    # Markdown -> pandoc JSON AST -> panflute Doc; the filter `action`
    # collects headers/codeblocks onto the doc attributes set up here.
    data = pypandoc.convert_file(input_file, to='json')
    doc = panflute.load(io.StringIO(data))
    doc.headers = []
    doc.codeblocks = []
    doc = panflute.run_filter(action, doc=doc)
    # rules[rule_id][example_class] -> list of code block texts
    rules = collections.defaultdict(lambda: collections.defaultdict(list))

    for codeblock in doc.codeblocks:
        # Walk backwards through preceding siblings until the owning rule
        # header is found, remembering the nearest example header on the way.
        possible_header = codeblock
        header_match = match_rule_id(possible_header)
        example_match = match_example_header(possible_header)
        while not header_match:
            if not possible_header:
                # FIX: report errors on stderr, consistent with the usage
                # message above (previously printed to stdout).
                print('ERROR: cannot identify rule of codeblock\n{}'.format(
                    codeblock.text), file=sys.stderr)
                break
            if not example_match:
                example_match = match_example_header(possible_header)
            possible_header = possible_header.prev
            header_match = match_rule_id(possible_header)
        if not header_match:
            continue
        example_class = classify_example(codeblock.text, str(example_match))
        rules[header_match][example_class].append(codeblock.text)

    with open(output_file, 'w') as output:
        output.write(json.dumps(rules))
Esempio n. 2
0
def main(ctx, filter_to, input_file, read, output, to, standalone,
         self_contained):
    """Read a document from stdin, pass it through knitty_pandoc_filter,
    and write the result to stdout.

    The artefact directory name is derived from the output path, else the
    input path, else the literal 'stdout', suffixed with the target format.

    Raises:
        KnittyError: when *filter_to* is empty/falsy.
    """
    if not filter_to:
        raise KnittyError(f"Invalid Pandoc filter arg: '{filter_to}'")

    # Collapse the markdown flavours to a common 'md' suffix.
    fmts = dict(commonmark='md', markdown='md', gfm='md')
    if output and (output != '-'):
        dir_name = p.basename(output).replace('.', '_')
    elif input_file and (input_file != '-'):
        dir_name = p.basename(input_file).replace('.', '_') + '_' + fmts.get(
            filter_to, filter_to)
    else:
        dir_name = 'stdout' + '_' + fmts.get(filter_to, filter_to)

    # FIX: copy ctx.args instead of aliasing it -- the appends below would
    # otherwise mutate the click context's shared argument list.
    pandoc_extra_args = list(ctx.args)
    if standalone:
        pandoc_extra_args.append('--standalone')
    if self_contained:
        pandoc_extra_args.append('--self-contained')

    out = knitty_pandoc_filter(sys.stdin.read(),
                               name=dir_name,
                               filter_to=filter_to,
                               standalone=standalone,
                               self_contained=self_contained,
                               pandoc_format=read,
                               pandoc_extra_args=pandoc_extra_args)
    # For notebook output, run the additional panflute action over the result.
    if filter_to == 'ipynb':
        with io.StringIO(out) as f:
            doc = pf.load(f)
        pf.run_filter(action, doc=doc)
        with io.StringIO() as f:
            pf.dump(doc, f)
            out = f.getvalue()
    sys.stdout.write(out)
Esempio n. 3
0
def convert_markdown_to_json(markdown_file):
    """Convert a markdown file into a traversable panflute document.

    Pandoc first turns the markdown into its JSON AST; panflute then loads
    that JSON into a ``Doc``.  An empty ``mermaid`` dict is attached to the
    document to collect the code blocks that will later be replaced with
    images.

    Args:
        markdown_file (str): Path of the markdown file to parse/convert.

    Return:
        panflute.Doc: Pandoc document container.

    """
    try:
        ast_json = pypandoc.convert_file(str(markdown_file), "json")
    except OSError as err:
        # pypandoc raises OSError when the pandoc binary cannot be found.
        logger.error(f"Pandoc is not installed on the host machine. {err}")
        sys.exit(1)

    document = panflute.load(io.StringIO(ast_json))
    document.mermaid = {}
    return document
Esempio n. 4
0
def run_filters(
    actions,
    prepare=None,
    finalize=None,
    input_stream=None,
    output_stream=None,
    doc=None,
    **kwargs,
):
    """Apply a sequence of filter actions to a pandoc document.

    When *doc* is None the document is loaded from *input_stream* and the
    filtered result is dumped to *output_stream*; otherwise the filtered
    document is returned to the caller.
    """
    should_io = doc is None
    if should_io:
        doc = pf.load(input_stream=input_stream)

    if prepare is not None:
        prepare(doc)

    for act in actions:
        if kwargs:
            # _BeforeAction objects bind the extra kwargs themselves; plain
            # callables get wrapped with functools.partial instead.
            if isinstance(act, _BeforeAction):
                act.partial(**kwargs)
            else:
                act = functools.partial(act, **kwargs)
        doc = doc.walk(act, doc)

    if finalize is not None:
        finalize(doc)

    if should_io:
        pf.dump(doc, output_stream=output_stream)
        return None
    return doc
Esempio n. 5
0
def main(doc: Optional[Doc] = None) -> None:
    """Read pandoc JSON from stdin, run the tangle passes, and dump it."""
    import sys
    import io
    import panflute

    source = io.StringIO(sys.stdin.read())
    document = panflute.load(source)
    tangle.prepare(document)
    document = document.walk(tangle.action)
    document = document.walk(action)
    panflute.dump(document)
Esempio n. 6
0
def main(doc=None):
    """Remove empty headings from Vimwiki file.

    Pandoc filter using panflute.

    Args:
        doc: optional pre-loaded panflute Doc; when None (the normal
            filter invocation) the document is read from stdin.

    Returns:
        The result of ``pf.dump`` on the filtered document.
    """
    # FIX: honour the ``doc`` argument instead of silently ignoring it.
    newdoc = pf.load() if doc is None else doc
    # Run the filter repeatedly -- presumably because removing an empty
    # heading can make its parent empty in turn (TODO confirm the bound 5).
    for _ in range(5):
        newdoc = pf.run_filter(action,
                               prepare=prepare,
                               finalize=finalize,
                               doc=newdoc)

    return pf.dump(newdoc)
Esempio n. 7
0
def main():
    """Entry point: load a pandoc doc, process citations, and dump it."""
    from manubot.command import setup_logging_and_errors, exit_if_error_handler_fired

    diagnostics = setup_logging_and_errors()
    args = parse_args()
    # Let panflute handle io to sys.stdout / sys.stdin to set utf-8 encoding.
    # args.input=None for stdin, args.output=None for stdout
    document = pf.load(input_stream=args.input)
    level_name = document.get_metadata("manubot-log-level", "WARNING")
    diagnostics["logger"].setLevel(getattr(logging, level_name))
    process_citations(document)
    pf.dump(document, output_stream=args.output)
    if document.get_metadata("manubot-fail-on-errors", False):
        exit_if_error_handler_fired(diagnostics["error_handler"])
Esempio n. 8
0
def inner_test_idempotent(input_fn, output_fn):
    """Check that loading and dumping a pandoc JSON file is idempotent.

    The document is dumped twice -- once straight after loading, and once
    after a trivial (no-op) filter pass -- and compared byte-for-byte with
    the original file each time.  Decomposed into helpers to remove the
    duplicated dump/compare code; printed output is unchanged.
    """

    def _dump(doc):
        # Serialize *doc* to output_fn with a trailing newline.
        with open(output_fn, mode='w', encoding='utf-8') as f:
            pf.dump(doc, f)
            f.write('\n')

    def _compare(header_prefix, item_prefix):
        # Re-read both files, report the comparison, return the contents.
        with open(input_fn, encoding='utf-8') as f:
            input_data = f.read()
        with open(output_fn, encoding='utf-8') as f:
            output_data = f.read()
        print(header_prefix + 'Are both files the same?')
        print(item_prefix + ' - Length:',
              len(input_data) == len(output_data), len(input_data),
              len(output_data))
        print(item_prefix + ' - Content:', input_data == output_data)
        return input_data, output_data

    print('\nLoading JSON...')
    with open(input_fn, encoding='utf-8') as f:
        doc = pf.load(f)

    print('Dumping JSON...')
    _dump(doc)
    print(' - Done!')

    print('\nComparing...')
    _compare('', '')

    print('\nApplying trivial filter...')
    doc = doc.walk(action=empty_test, doc=doc)
    print(' - Done!')

    print(' - Dumping JSON...')
    _dump(doc)
    print(' - Done!')
    print(' - Comparing...')
    input_data, output_data = _compare(' - ', '  ')

    assert input_data == output_data
Esempio n. 9
0
def get_doc_from_markup(markup,
                        style="innoconv-debug",
                        output=None,
                        lang="de"):
    """Run panzer on markup and return Doc.

    Args:
        markup (str): LaTeX source, parsed as ``latex+raw_tex``.
        style (str): value for panzer's ``style`` metadata field.
        output: optional path forwarded as panzer's ``--output``.
        lang (str): value for the ``lang`` metadata field.

    Returns:
        panflute.Doc parsed from panzer's JSON output, or ``None`` if the
        output was not valid JSON (the raw output is logged instead).

    Raises:
        RuntimeError: if panzer exits with a non-zero return code.
    """

    cmd = [
        "panzer",
        "---panzer-support",
        PANZER_SUPPORT_DIR,
        "--metadata=style:{}".format(style),
        "--metadata=lang:{}".format(lang),
        "--from=latex+raw_tex",
        "--to=json",
        "--standalone",
    ]

    if output:
        cmd.append("--output={}".format(output))

    env = os.environ.copy()
    proc = subprocess.Popen(
        cmd,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        env=env,
    )

    # Feed the markup on stdin; communicate() closes the pipe and collects
    # stdout/stderr, killing panzer if it takes longer than 30 seconds.
    proc.stdin.write(markup.encode(ENCODING))
    try:
        outs, errs = proc.communicate(timeout=30)
    except subprocess.TimeoutExpired:
        proc.kill()
        outs, errs = proc.communicate()

    # Forward anything panzer wrote to stderr to panflute's debug output.
    errout = errs.decode(ENCODING).strip()
    if errout:
        pf.debug(errout)

    if proc.returncode != 0:
        raise RuntimeError("Failed to run panzer!")

    json_raw = outs.decode(ENCODING)
    try:
        return pf.load(StringIO(json_raw))
    except JSONDecodeError:
        log("Couldn't decode JSON: {}".format(json_raw))
Esempio n. 10
0
def inner_test_idempotent(input_fn, output_fn):
    """Round-trip *input_fn* through panflute and verify byte equality,
    both before and after a trivial no-op filter pass."""

    print('\nLoading JSON...')
    with open(input_fn, encoding='utf-8') as handle:
        document = pf.load(handle)

    print('Dumping JSON...')
    with open(output_fn, mode='w', encoding='utf-8') as handle:
        pf.dump(document, handle)
        handle.write('\n')
    print(' - Done!')

    print('\nComparing...')
    with open(input_fn, encoding='utf-8') as handle:
        src_text = handle.read()
    with open(output_fn, encoding='utf-8') as handle:
        dst_text = handle.read()

    print('Are both files the same?')
    print(' - Length:', len(src_text) == len(dst_text), len(src_text), len(dst_text))
    print(' - Content:', src_text == dst_text)

    print('\nApplying trivial filter...')
    document = document.walk(action=empty_test, doc=document)
    print(' - Done!')

    print(' - Dumping JSON...')
    with open(output_fn, mode='w', encoding='utf-8') as handle:
        pf.dump(document, handle)
        handle.write('\n')
    print(' - Done!')
    print(' - Comparing...')
    with open(input_fn, encoding='utf-8') as handle:
        src_text = handle.read()
    with open(output_fn, encoding='utf-8') as handle:
        dst_text = handle.read()
    print(' - Are both files the same?')
    print('   - Length:', len(src_text) == len(dst_text), len(src_text), len(dst_text))
    print('   - Content:', src_text == dst_text)

    assert src_text == dst_text
Esempio n. 11
0
def run():
    """Load benchmark.json, apply a no-op filter, and write panflute.json."""
    print('\nLoading JSON...')
    source_path = 'benchmark.json'
    target_path = 'panflute.json'

    with open(source_path, encoding='utf-8') as handle:
        document = pf.load(handle)

    print('\nApplying trivial filter...')
    document = document.walk(action=empty_test, doc=document)

    print('Dumping JSON...')
    with open(target_path, mode='w', encoding='utf-8') as handle:
        pf.dump(document, handle)
        handle.write('\n')

    print(' - Done!')
Esempio n. 12
0
def pandoc_filters():
    """Run a set of ipypublish pandoc filters directly on the pandoc AST,
    via ``pandoc --filter ipubpandoc``.

    Reads the document from stdin, hoists rmarkdown's ``jupyter`` metadata
    to the top level, applies the configured filter chain, and dumps the
    filtered document to stdout.
    """
    doc = pf.load()

    # in an rmarkdown file, the metadata will be under a root `jupyter` key
    jmeta = doc.get_metadata('jupyter', {})
    meta = pf.tools.meta2builtin(doc.metadata)
    if 'jupyter' in meta and hasattr(meta["jupyter"], 'items'):
        jmeta = meta.pop("jupyter")
        meta.update(jmeta)
        doc.metadata = meta  # builtin2meta(meta)

    apply_filters = doc.get_metadata(IPUB_META_ROUTE + ".apply_filters",
                                     default=True)
    convert_raw = doc.get_metadata(IPUB_META_ROUTE + ".convert_raw",
                                   default=True)

    if apply_filters:
        if convert_raw:
            filters = [
                prepare_raw.main,
                prepare_cites.main,
                prepare_labels.main,
                format_cite_elements.main,
                format_raw_spans.main,
                format_label_elements.main,
                rmarkdown_to_mpe.main
            ]
        else:
            filters = [
                prepare_cites.main,
                prepare_labels.main,
                format_cite_elements.main,
                format_label_elements.main,
                rmarkdown_to_mpe.main
            ]
    else:
        filters = []

    out_doc = doc
    for func in filters:
        out_doc = func(out_doc)  # type: pf.Doc
    # TODO strip meta?
    # FIX: dump the filtered document; dumping ``doc`` discarded every
    # filter's result.
    pf.dump(out_doc)
Esempio n. 13
0
def run():
    """Benchmark round-trip: load JSON, run a no-op filter, dump JSON."""
    print('\nLoading JSON...')
    in_path = 'benchmark.json'
    out_path = 'panflute.json'

    with open(in_path, encoding='utf-8') as stream:
        document = pf.load(stream)

    print('\nApplying trivial filter...')
    document = document.walk(action=empty_test, doc=document)

    print('Dumping JSON...')
    with open(out_path, mode='w', encoding='utf-8') as stream:
        pf.dump(document, stream)
        stream.write('\n')

    print(' - Done!')
Esempio n. 14
0
def test_all():
    """Run the fenced-filter test suite against the fixture JSON files."""
    input_fn = './tests/fenced/input.json'
    output_fn = './tests/fenced/output.json'

    # Round-trip the fixture first.
    print('\nLoading JSON...')
    with open(input_fn, encoding='utf-8') as fh:
        document = pf.load(fh)
    print('Dumping JSON...')
    with open(output_fn, mode='w', encoding='utf-8') as fh:
        pf.dump(document, fh)
        fh.write('\n')
    print(' - Done!')

    print('\nComparing...')
    with open(input_fn, encoding='utf-8') as fh:
        left = fh.read()
    with open(output_fn, encoding='utf-8') as fh:
        right = fh.read()

    print('Are both files the same?')
    print(' - Length:',
          len(left) == len(right), len(left),
          len(right))
    print(' - Content:', left == right)

    # Then run each filter variant and re-compare after every one.
    print('\nApplying trivial filter...')
    pf.run_filter(empty_filter, doc=document)
    print(' - Done!')
    dump_and_compare(document, input_fn, output_fn)

    print('\nApplying YAML filter...')
    pf.run_filter(pf.yaml_filter, tag='spam', function=fenced_action, doc=document)
    print(' - Done!')
    dump_and_compare(document, input_fn, output_fn)

    print('\nApplying Strict YAML filter...')
    pf.run_filter(pf.yaml_filter,
                  tag='eggs',
                  function=fenced_action,
                  doc=document,
                  strict_yaml=True)
    print(' - Done!')
    dump_and_compare(document, input_fn, output_fn)
Esempio n. 15
0
def inner_test_stringify(input_fn, output_fn):
    """Write panflute's and pandocfilters' stringify output to temp files."""

    benchmark_txt = './tests/temp_benchmark.txt'
    panflute_txt = './tests/temp_panflute.txt'

    print('Testing stringify()')
    # panflute path: load the Doc and stringify it.
    with open(input_fn, encoding='utf-8') as handle:
        document = pf.load(handle)
    text = pf.stringify(document)
    #print(repr(text).encode('utf-8'))
    with open(panflute_txt, encoding='utf-8', mode='w') as handle:
        handle.write(text)

    # pandocfilters path: stringify the raw JSON dict.
    with open(input_fn, encoding='utf-8') as handle:
        raw_doc = json.load(handle)
    text = pandocfilters.stringify(raw_doc)
    with open(benchmark_txt, encoding='utf-8', mode='w') as handle:
        handle.write(text)
Esempio n. 16
0
def inner_test_stringify(input_fn, output_fn):
    """Compare stringify() between panflute and pandocfilters via temp files."""

    out_benchmark = './tests/temp_benchmark.txt'
    out_panflute = './tests/temp_panflute.txt'

    print('Testing stringify()')
    with open(input_fn, encoding='utf-8') as src:
        doc_pf = pf.load(src)
    result = pf.stringify(doc_pf)
    #print(repr(result).encode('utf-8'))
    with open(out_panflute, encoding='utf-8', mode='w') as dst:
        dst.write(result)

    with open(input_fn, encoding='utf-8') as src:
        doc_raw = json.load(src)
    result = pandocfilters.stringify(doc_raw)
    with open(out_benchmark, encoding='utf-8', mode='w') as dst:
        dst.write(result)
Esempio n. 17
0
def main() -> None:
    """Load a pandoc doc from stdin, run the tangle/annotate/doctest
    passes in order, and dump the result to stdout."""
    ## ------ begin <<load-document>>[0]
    import io
    import sys

    doc = panflute.load(io.StringIO(sys.stdin.read()))
    ## ------ end
    doc.config = read_config()

    # Each pass module follows the same prepare/walk protocol.
    for pass_module in (tangle, annotate, doctest):
        pass_module.prepare(doc)
        doc = doc.walk(pass_module.action)

    panflute.dump(doc)
Esempio n. 18
0
def pandoc_filters():
    """ run a set of rst2myst pandoc filters directly on the pandoc AST,
    via ``pandoc --filter rst2myst``

    Reads the document from stdin, applies the (currently empty) filter
    chain, and dumps the filtered document to stdout.
    """
    doc = pf.load()
    # NOTE(review): meta / apply_filters / convert_raw are computed but not
    # yet used -- this looks like template scaffolding awaiting filters.
    meta = pf.tools.meta2builtin(doc.metadata)

    apply_filters = doc.get_metadata(IPUB_META_ROUTE + ".apply_filters",
                                     default=True)
    convert_raw = doc.get_metadata(IPUB_META_ROUTE + ".convert_raw",
                                   default=True)

    filters = [
        # Filters
    ]

    out_doc = doc
    for func in filters:
        out_doc = func(out_doc)  # type: pf.Doc

    # FIX: dump the filtered document -- dumping ``doc`` would discard the
    # filters' results once the chain above is populated.
    pf.dump(out_doc)
Esempio n. 19
0
def with_markdown(content, space, name):
    """Use pandoc to get markdown from MediaWiki format, rewriting
    internal links and dropping loose categories along the way.

    On any failure the original content is kept and the failure is
    recorded in the failure log.
    """
    try:
        ast_json = pypandoc.convert_text(content,
                                         'json',
                                         format='mediawiki')

        doc = panflute.load(io.StringIO(ast_json))

        panflute.run_filter(drop_loose_categories, doc=doc)
        panflute.run_filter(rewrite_internal_links, doc=doc)

        content = back_to_markdown(doc)
    except Exception:
        # Deliberate best-effort: log the failure and fall through with the
        # original, unconverted content.
        click.echo('Failed to parse content! Continuing ...\n')
        with open(FAILURE_LOG, 'a') as handle:
            handle.write(('Failed to parse content. Could not re-write links '
                          'and drop categories for page {}\n'.format(name)))

    return convert_image_format(content)
Esempio n. 20
0
def test_all():
    """Exercise the fenced-filter pipeline end to end on the fixtures."""
    fixture_in = './tests/fenced/input.json'
    fixture_out = './tests/fenced/output.json'

    # Test fenced filter: plain round-trip first.
    print('\nLoading JSON...')
    with open(fixture_in, encoding='utf-8') as stream:
        doc = pf.load(stream)
    print('Dumping JSON...')
    with open(fixture_out, mode='w', encoding='utf-8') as stream:
        pf.dump(doc, stream)
        stream.write('\n')
    print(' - Done!')

    print('\nComparing...')
    with open(fixture_in, encoding='utf-8') as stream:
        before = stream.read()
    with open(fixture_out, encoding='utf-8') as stream:
        after = stream.read()

    print('Are both files the same?')
    print(' - Length:', len(before) == len(after), len(before), len(after))
    print(' - Content:', before == after)

    print('\nApplying trivial filter...')
    pf.run_filter(empty_filter, doc=doc)
    print(' - Done!')
    dump_and_compare(doc, fixture_in, fixture_out)

    print('\nApplying YAML filter...')
    pf.run_filter(pf.yaml_filter, tag='spam', function=fenced_action, doc=doc)
    print(' - Done!')
    dump_and_compare(doc, fixture_in, fixture_out)

    print('\nApplying Strict YAML filter...')
    pf.run_filter(pf.yaml_filter, tag='eggs', function=fenced_action, doc=doc, strict_yaml=True)
    print(' - Done!')
    dump_and_compare(doc, fixture_in, fixture_out)
Esempio n. 21
0
def test():
    """Exercise Doc.get_metadata against a heavy-metadata fixture."""
    # chcp 65001 --> might be required if running from cmd on Windows

    print('\nLoading JSON...')
    fn = "./tests/input/heavy_metadata/benchmark.json"

    with open(fn, encoding='utf-8') as f:
        doc = pf.load(f)

    # String metadata is returned as a builtin str by default.
    meta = doc.get_metadata('title')
    assert meta == "Lorem Ipsum: Title"

    # builtin=False returns the raw panflute element instead.
    meta = doc.get_metadata('title', builtin=False)
    # FIX: isinstance over ``type(...) ==`` for type checks.
    assert isinstance(meta, pf.MetaInlines)

    # foobar key doesn't exist, so the supplied default is returned.
    meta = doc.get_metadata('foobar', True)
    # FIX: identity check for the True singleton rather than ``== True``.
    assert meta is True

    meta = doc.get_metadata('foobar', 123)
    assert meta == 123

    meta = doc.get_metadata('abstract')
    assert meta.startswith('Bring to the table win-win')

    # Dotted keys traverse nested metadata mappings.
    meta = doc.get_metadata('key1.key1-1')
    assert meta == ['value1-1-1', 'value1-1-2']

    meta = doc.get_metadata('amsthm.plain')
    assert isinstance(meta, list)
    assert meta[0]['Theorem'] == 'Lemma'

    print('--')
    # An empty key returns the whole metadata mapping.
    meta = doc.get_metadata('')
    assert len(meta) > 10

    print('\nDone...')
def apply_to_json(in_json, filter_func):
    # type: (dict, FunctionType) -> dict
    """Run *filter_func* over a pandoc JSON dict and return the new JSON."""
    stream = io.StringIO(u(json.dumps(in_json)))
    loaded = pf.load(stream)
    filtered = filter_func(loaded)  # type: Doc
    return filtered.to_json()
Esempio n. 23
0
def empty_test(element, doc):
	"""No-op filter action: leave every element unchanged."""
	return None

def test_filter(element, doc):
	"""Filter action: drop Header elements and append '!!' to Str text.

	Returning [] removes the element; returning the element keeps the
	modified version; returning None (implicit) leaves others untouched.
	"""
	# FIX: isinstance over ``type(...) ==`` for type checks.
	if isinstance(element, pf.Header):
		return []
	if isinstance(element, pf.Str):
		element.text = element.text + '!!'
		return element


# Round-trip check: load the pandoc JSON, dump it back out, then compare
# the two files.  (Relies on input_fn / output_fn and pf being defined
# earlier in the file.)
print('\nLoading JSON...')

with open(input_fn, encoding='utf-8') as f:
	doc = pf.load(f)

print('Dumping JSON...')
with open(output_fn, mode='w', encoding='utf-8') as f:
	pf.dump(doc, f)
	f.write('\n')

print(' - Done!')


print('\nComparing...')

with open(input_fn, encoding='utf-8') as f:
	input_data = f.read()

with open(output_fn, encoding='utf-8') as f:
Esempio n. 24
0
# NOTE(review): this is an API design sketch, not runnable code --
# released panflute's load()/dump() take stream arguments rather than a
# ``filename`` keyword, walk() is a Doc/Element method, and some_filter
# is undefined here.  Verify against the panflute docs before using.
import panflute as pf


doc = pf.load(filename=None) # If no fn, from stdin
# fmt = pf.format() ???
# doc.content doc.metadata doc.raw_metadata doc.format

doc = pf.walk(doc, some_filter)
doc_json = doc.to_json()

pf.dump(filename=None) # if no fn, to stdout

Esempio n. 25
0
# FIX: 'io' was used below but never imported (NameError at runtime); the
# original also mixed tabs and spaces inside function bodies, which raises
# TabError under Python 3.  Indentation is normalized to 4 spaces.
import io

import pypandoc
import panflute


def prepare(doc):
    """Attach collector lists for images and links to the document."""
    doc.images = []
    doc.links = []


def action(elem, doc):
    """Collect every Image and Link element seen during the walk."""
    if isinstance(elem, panflute.Image):
        doc.images.append(elem)
    elif isinstance(elem, panflute.Link):
        doc.links.append(elem)


if __name__ == '__main__':
    data = pypandoc.convert_file('example.md', 'json')
    f = io.StringIO(data)
    doc = panflute.load(f)
    doc = panflute.run_filter(action, prepare=prepare, doc=doc)

    print("\nImages:")
    for image in doc.images:
        print(image.url)

    print("\nLinks:")
    for link in doc.links:
        print(link.url)
Esempio n. 26
0
def _scanner(node, env, path, arg=None):
    """ Attempt to scan the final target for images and bibliographies

    In Pandoc flavored MarkDown, the only "included" files are the
    images and the bibliographies.  We need to tell SCons about these,
    but we don't want to do this by hand.  To do this, we directly use
    Pandoc's json output and analyze the document tree for the images
    and the metadata for bibliographies.  We need to operate on the
    filtered syntax tree so we can get the final filtered version.  The
    logic should work on any input format Pandoc can translate into its
    AST.

    Note you must respect Pandoc's bibliography file rules.  The command
    line arguments will override files specified in the YAML block of
    the header file.

    This logic is primarily aimed at the MarkDown sources, but it should
    work with the other plain text sources too.  However, this is not
    rigorously tested.  For LaTeX sources, you should really just use
    the SCons builder to have the right thing done.

    Args:
        node: the SCons target node being scanned.
        env: the SCons construction environment.
        path: scanner path argument (unused here).
        arg: optional scanner argument (unused here).

    Returns:
        list: SCons File nodes the target implicitly depends on
        (filters, templates, images, bibliographies, ...).
    """
    import panflute
    logger = logging.getLogger(__name__ + ".scanner")
    # Grab the base command SCons will run and remove the output flag.
    # This does assume the user did not override the command variable
    # and hard code the output.
    cmd = shlex.split(env.subst_target_source("$PANDOCCOM"))
    for flag in ("-o", "--output"):
        try:
            cmd.remove(flag)
        except ValueError:
            # They specified the other flag
            pass

    # If the user provided the --from flag, we need to move it to the
    # beginning of the command
    newidx = 1
    for idx, item in enumerate(cmd):
        match = re.match(r"(-f|--from=?)([-+\w]*)?", item)
        if match:
            cmd[newidx:newidx] = [cmd.pop(idx)]
            newidx += 1
            if not match.group(2):
                # The format value was a separate argv token; move it too.
                cmd[newidx:newidx] = [cmd.pop(idx + 1)]
                newidx += 1

    logger.debug("initial command: '{0}'".format(" ".join(cmd)))
    # Now parse the command line for the known arguments with files that
    # are needed generate the final document.  But, we want to make sure
    # the file is actually in the build tree and not simply an installed
    # executable or file.  To do this, we map destinations in an
    # :class:`argparser.ArgumentParser` to Pandoc flags.  We do not want
    # to deal with searching all over creation so we do not deal with
    # the data directory.
    #
    # .. note:: This does not deal with the --resource-path flag which
    #           provides additional search paths for Pandoc.
    arguments = {
        "filter": ("-F", "--filter"),
        "lua": ("--lua-filter", ),
        "metadata": ("--metadata-file", ),
        "abbreviations": ("--abbreviations", ),
        "highlight": ("--highlight-style", ),
        "syntax": ("--syntax-definition", ),
        "header": ("-H", "--include-in-header"),
        "before": ("-B", "--include-before-body"),
        "after": ("-A", "--include-after-body"),
        "css": ("-c", "--css"),
        "reference": ("--reference-doc", ),
        "epubcover": ("--epub-cover-image", ),
        "epubmeta": ("--epub-metadata", ),
        "epubfont": ("--epub-embed-font", ),
        "bibliography": ("--bibliography", ),
        "csl": ("--csl", ),
        "citeabbrev": ("--citation-abbreviations", ),
    }
    parser = argparse.ArgumentParser()
    for dest in arguments:
        parser.add_argument(*arguments[dest],
                            dest=dest,
                            action="append",
                            default=[])

    # Add the target format in case it was specified as this overrides
    # the output format.  We also need the data directory for finding
    # installed filters.
    parser.add_argument("-t", "--to")
    parser.add_argument("--data-dir", dest="datadir")
    parser.add_argument("--template", default="default")

    args, _ = parser.parse_known_args(cmd)
    files = []
    # Only files that actually exist on disk become dependencies.
    for dest in arguments:
        files.extend(
            [env.File(x) for x in getattr(args, dest) if os.path.exists(x)])

    # Now we need to determine the files inside the document that will
    # influence the output.  To do this, we need to analyze the tree
    # Pandoc will write out after all of the filters have been run.  The
    # best parser for a Pandoc document is Pandoc itself; however, we
    # want to interrupt the processing before the Writer is called.
    # Looking at the filter documentation, we can achieve this by
    # calling Pandoc with the appropriate flags and piping the JSON
    # output through each filter.  The output format is passed as an
    # argument to each filter so we must replicate that behavior to
    # ensure the syntax tree has the final files.
    #
    # If the user provided the ``--to`` flag (with possible extensions),
    # that _is_ the output format.  Otherwise, we take the format from
    # the file extension.  The only exception is the 'beamer' output.
    if args.to:
        if args.to == "beamer":
            format = "latex"
        else:
            # Strip any +extension/-extension suffix from the format name.
            # (NOTE: 'format' mirrors Pandoc terminology; it shadows the
            # builtin within this function.)
            format = re.match(r"(\w+)[-+]?", args.to).group(1)

    else:
        _, format = os.path.splitext(str(node))
        format = format[1:]

    # Now that we have the format, we can figure out if the template was
    # defined and inside the project.  First, we need the root of the
    # build and the template.
    template = args.template
    # Add the extension if needed.
    _, ext = os.path.splitext(template)
    if ext == "":
        template = template + "." + format

    # First, check that the file exists or is findable in the data
    # directory.
    if not os.path.exists(template):
        if args.datadir:
            template = os.path.join(args.datadir, "templates", template)

    if os.path.exists(template) and format not in ("docx", "pptx"):
        files.append(env.File(template))

    def run_command(cmd, proc=None):
        """Helper function for running a command
        """
        logger = logging.getLogger(__name__ + ".scanner.run_command")
        logger.debug("command: '{0}'".format(" ".join(cmd)))
        proc = subprocess.Popen(cmd,
                                stdout=subprocess.PIPE,
                                stdin=proc.stdout if proc else None)
        return proc

    # We need to run each filter in order; however, we also need to
    # run any Lua filters in their proper location.  We can do this
    # by reading from the front of the command until we find a
    # filter.  We consume the command until we find a filter and run
    # each stage.  We start by processing the input files.
    proc = None
    cmd_ = []
    cmd0 = [_detect(env), "--from", "json", "--to", "json"] + (
        ["--data-dir={0}".format(args.datadir)] if args.datadir else [])
    sources = [x.path for x in node.sources if os.path.exists(x.path)]
    while cmd and sources:
        # Grab the first item off the list
        item = cmd.pop(0)
        # Is this a 'to' flag?
        match = re.match(r"(-T|--to=?)([-+\w+]+)?", item)
        if match:
            if not match.group(2):
                cmd.pop(0)

            continue

        # Determine if it is a filter
        match = re.match(r"(-F|--filter=?)([-\w/.]+)?", item)
        if match:
            # Grab the filter
            filt = match.group(2) if match.group(2) else cmd.pop(0)
            logger.debug("cmd : '{0}'".format(" ".join(cmd)))
            logger.debug("item: '{0}'".format(item))
            logger.debug("filt: '{0}'".format(filt))
            logger.debug("cmd_: '{0}'".format(" ".join(cmd_)))

            # First, deal with any intervening commands
            if cmd_:
                if proc:
                    proc = run_command(cmd0 + cmd_, proc)
                else:
                    # If this is the first filter, we need to process the
                    # input files.
                    cmd_.extend(["--to", "json"])
                    cmd_.extend(sources)
                    proc = run_command(cmd_)

            # Now figure out the filter.
            cmd_ = _find_filter(filt, args.datadir, env)
            proc = run_command(cmd_ + [format], proc)
            cmd_ = []
        else:
            # Otherwise, put it on the running command.
            cmd_.append(item)

    # Now process any arguments after the last filter
    if cmd_:
        if proc:
            proc = run_command(cmd0 + cmd_, proc)
        else:
            # If we have no filters, process the sources.
            cmd_.extend(["--to", "json"])
            cmd_.extend(sources)
            proc = run_command(cmd_)

    # If no pipeline was built (no existing sources), doc stays None.
    doc = panflute.load(proc.stdout) if proc else None

    def _path(x):
        """A helper for getting the path right"""
        root = os.path.dirname(str(node))
        if os.path.commonprefix([root, x]) == root:
            return env.File(x)
        else:
            return env.File(os.path.join(root, x))

    # For images, we only concern ourselves with outputs that are a
    # final stage.  This includes formats such as 'docx', 'pptx',
    # 'html', and 'epub'.  It excludes 'markdown' and 'latex'.  The
    # rationale is these are not delivery formats and, therefore, still
    # need to be processed as another stage in SCons.  That is when the
    # scanning needs to be done.  We also exclude PDF because SCons has
    # a better scanner built in. (And why would you want to use SCons if
    # you just want to use Pandoc to go straight to PDF?)
    skip = (
        "asciidoc",
        "commonmark",
        "context",
        "gfm",
        "json",
        "latex",
        "markdown",
        "markdown_mmd",
        "markdown_phpextra",
        "markdown_strict",
        "native",
        "org",
        "plain",
        "rst",
        "tex",
    )
    if format not in skip:

        def walk(src):
            """Walk the tree and find images and bibliographies
            """
            if isinstance(src, panflute.Image):
                return [src.url]
            else:
                tmp = [walk(y) for y in getattr(src, "content", [])]
                return [y for z in tmp for y in z if y]

        images = [x for x in walk(doc) if x]
        logger.debug("images: {0}".format(images))
        files.extend([_path(x) for x in images])

    # And, finally, check the metadata for a bibliography file
    if doc:
        if not args.bibliography:
            bibs = doc.metadata.content.get("bibliography", [])
            if bibs:
                files.extend(
                    [_path(x.text) for x in getattr(bibs, "content", [bibs])])

    logger.debug("{0!s}: {1!s}".format(node, [str(x) for x in files]))
    return files
Esempio n. 27
0
def main():
    """Load a Pandoc AST from stdin, run the gloss filters, and dump to stdout."""
    document = pf.load(input_stream=sys.stdin)
    # fold default settings into the document metadata before filtering
    merge_settings(document)
    filtered = pf.run_filters([gloss, gloss_refs], doc=document)
    pf.dump(filtered, output_stream=sys.stdout)
Esempio n. 28
0
def apply_filter(in_object,
                 filter_func=None,
                 out_format="panflute",
                 in_format="markdown",
                 strip_meta=False,
                 strip_blank_lines=False,
                 replace_api_version=True,
                 dry_run=False,
                 **kwargs):
    """convenience function to apply a panflute filter(s)
    to a string, list of string lines, pandoc AST or panflute.Doc

    Parameters
    ----------
    in_object: str or list[str] or dict or panflute.Doc
        the document to be filtered
    filter_func:
        the filter function or a list of filter functions; if None,
        no filter is applied
    out_format: str
        for use by pandoc or, if 'panflute', return the panflute.Doc
    in_format: str
        the pandoc input format of ``in_object``
        (must be 'json' for dict input)
    strip_meta: bool
        strip the document metadata before final conversion
    strip_blank_lines: bool
        collapse blank lines in the final output string
    replace_api_version: bool
        for dict input only, if True,
        find the api_version of the available pandoc and
        reformat the json as appropriate
    dry_run: bool
        If True, return the Doc object, before applying the filter
    kwargs:
        to parse to filter func

    Returns
    -------
    str or panflute.Doc

    """
    if isinstance(in_object, pf.Doc):
        pass
    elif isinstance(in_object, dict):
        if in_format != "json":
            raise AssertionError("the in_format for a dict should be json, "
                                 "not {}".format(in_format))
        if "meta" not in in_object:
            raise ValueError("the in_object does not contain a 'meta' key")
        if "blocks" not in in_object:
            raise ValueError("the in_object does not contain a 'blocks' key")
        if "pandoc-api-version" not in in_object:
            raise ValueError(
                "the in_object does not contain a 'pandoc-api-version' key")
        if replace_api_version:
            # run pandoc on a null object, to get the correct api version
            # of the locally installed pandoc
            null_raw = pf.run_pandoc("", args=["-t", "json"])
            null_stream = io.StringIO(null_raw)
            api_version = pf.load(null_stream).api_version

            # see panflute.load, w.r.t to legacy (pre api-version) layout
            if api_version is None:
                in_object = [{
                    "unMeta": in_object["meta"]
                }, in_object["blocks"]]
            else:
                ans = OrderedDict()
                ans["pandoc-api-version"] = api_version
                ans["meta"] = in_object["meta"]
                ans["blocks"] = in_object["blocks"]
                in_object = ans
        in_str = json.dumps(in_object)
    elif isinstance(in_object, (list, tuple)):
        in_str = "\n".join(in_object)
    elif isinstance(in_object, string_types):
        in_str = in_object
    else:
        raise TypeError("object not accepted: {}".format(in_object))

    if not isinstance(in_object, pf.Doc):
        # convert the textual/json representation into a panflute Doc
        doc = pf.convert_text(in_str, input_format=in_format, standalone=True)
    else:
        doc = in_object

    doc.format = out_format

    if dry_run:
        return doc

    # normalise filter_func to a list; None means "apply no filters"
    # (previously None crashed with TypeError when called below)
    if filter_func is None:
        filter_func = []
    elif not isinstance(filter_func, (list, tuple, set)):
        filter_func = [filter_func]

    out_doc = doc
    for func in filter_func:
        out_doc = func(out_doc, **kwargs)  # type: Doc

    # post-process Doc
    if strip_meta:
        out_doc.metadata = {}
    if out_format == "panflute":
        return out_doc

    # convert the filtered Doc to the requested output format
    out_str = pf.convert_text(out_doc,
                              input_format="panflute",
                              output_format=out_format)

    # post-process final str
    if strip_blank_lines:
        out_str = out_str.replace("\n\n", "\n")

    return out_str