Esempio n. 1
0
def find(input, xpaths, root_tag, no_root, **kwargs):
    """
    Extracts specified portions of XML data from the input. Requires valid input.

    This command can be used to extract a data subset from within more complex data.

    Note: The 'find' command is both similar to and different than the 'strip -x' command.
    The 'find' command outputs MATCHING input, while 'strip -x' outputs NON-matching input.

    Note: The ElementTree package (Python builtin) has limited XPath support.
    Therefore, some of the examples below will only work if the lxml package is used (instead of ElementTree).

    Examples:

        \b
        Example: Find all b elements:
        $ echo '<a><b><c/></b><b><d><e/></d><d/></b></a>' | python -mclifunzone.xmltool find -x '//b' -nr
        <b><c/></b>
        <b><d><e/></d><d/></b>

        \b
        Example: Find the 1st b element:
        $ echo '<a><b><c/></b><b><d><e/></d><d/></b></a>' | python -mclifunzone.xmltool find -x '//b[1]' -nr
        <b><c/></b>

        \b
        Example: Find the 2nd b element:
        $ echo '<a><b><c/></b><b><d><e/></d><d/></b></a>' | python -mclifunzone.xmltool find -x '//b[2]' -nr
        <b><d><e/></d><d/></b>

        \b
        Example: Find the last b element:
        $ echo '<a><b><c/></b><b><d><e/></d><d/></b></a>' | python -mclifunzone.xmltool find -x '//b[last()]' -nr
        <b><d><e/></d><d/></b>

        \b
        Example: Find all e elements that are a child of a d element:
        $ echo '<a><b><c/></b><b><d><e/></d><d/></b></a>' | python -mclifunzone.xmltool find -x '//d/e' -nr
        <e/>

        \b
        Example: Find all d elements with a child e element:
        $ echo '<a><b><c/></b><b><d><e/></d><d/></b></a>' | python -mclifunzone.xmltool find -x '//d/e/parent::*' -nr
        <d><e/></d>

        \b
        Example: Find all elements with exactly 1 inner element:
        $ echo '<a><b><c/></b><b><d><e/></d><d/></b></a>' | python -mclifunzone.xmltool find -x '//*[count(*)=1]' -nr
        <b><c/></b>
        <d><e/></d>

        \b
        Example: Find all elements with 2 or more inner elements:
        $ echo '<a><b><c/></b><b><d><e/></d><d/></b></a>' | python -mclifunzone.xmltool find -x '//*[count(*)>=2]' -nr
        <a><b><c/></b><b><d><e/></d><d/></b></a>
        <b><d><e/></d><d/></b>

        \b
        Example: Find all elements with 1 child element:
        $ echo '<a><b><c/></b><b><d><e/></d><d/></b></a>' | python -mclifunzone.xmltool find -x '//*[count(/*)=1]' -nr
        <b><c/></b>
        <d><e/></d>

        \b
        Example: Find all elements with 1 inner element with tag c:
        $ echo '<a><b><c/></b><b><d><e/></d><d/></b></a>' | python -mclifunzone.xmltool find -x '//*[count(./c)=1]' -nr
        <b><c/></b>

        \b
        Example: Find all elements with 1 inner element with either the c OR e tag:
        $ echo '<a><b><c/></b><b><d><e/></d><d/></b></a>' | python -mclifunzone.xmltool find -x '//*[count(./c|./e)=1]' -nr
        <b><c/></b>
        <d><e/></d>

        \b
        Example: Find all b elements with attribute @z=1:
        $ echo '<a><b z="1"><c/></b><b z="2"><d><e z="1"/></d><d/></b></a>' | \\
        python -mclifunzone.xmltool find -x '//b[@z="1"]' -nr
        <b z="1"><c/></b>

        \b
        Example: Find all elements with attribute @z=1:
        $ echo '<a><b z="1"><c/></b><b z="2"><d><e z="1"/></d><d/></b></a>' | \\
        python -mclifunzone.xmltool find -x '//*[@z="1"]' -nr
        <b z="1"><c/></b>
        <e z="1"/>

        \b
        Example: Find all elements with attribute @z except those with @z=2:
        $ echo '<a><b z="1"><c/></b><b z="2"><d z="1"><e z="2"/></d></b></a>' | \\
          python -mclifunzone.xmltool find -x '//*[@z and @z!="2"]' -nr
        <b z="1"><c/></b>
        <d z="1"><e z="2"/></d>

        \b
        Example: Find all elements with attribute @z=1 and a node position greater than 2:
        $ echo '<a><b z="1"><c/></b><b z="2"><d z="1"><e z="2"/></d></b></a>' | \\
          python -mclifunzone.xmltool find -x '//*[@z="1" and position()>2]' -nr
        <d z="1"><e z="2"/></d>

        \b
        Example: Find all elements with text that contains "3":
        $ echo '<z><a>1a1</a><b>2b1</b><c>3c1</c><a>4a2</a><b>5b2</b><c>6c2</c><a>7a3</a></z>' | \\
          python -mclifunzone.xmltool find -x '//*[contains(text(),"3")]' -nr
        <c>3c1</c>
        <a>7a3</a>
    """
    # Parameters (kept out of the docstring, which presumably doubles as the CLI help text -- the '\b'
    # markers look like click's "do not rewrap this paragraph" convention; confirm against the decorators):
    #   input    -- name handed to click.open_file (binary read mode); a falsy value is replaced by '-'.
    #   xpaths   -- iterable of XPath expressions; matches are emitted grouped per expression, in order.
    #               A falsy value yields no matches.
    #   root_tag -- when truthy (and no_root is falsy) the matches are wrapped in <root_tag>...</root_tag>.
    #   no_root  -- when truthy, suppresses the wrapper regardless of root_tag.
    #   kwargs   -- accepted and ignored.
    #
    # NOTE(review): in the "1 child element" example, '/*' inside count() is an absolute path (it selects
    # the document element), so under standard XPath that predicate holds for every element; './*' was
    # probably intended -- confirm and correct the example.
    # NOTE(review): in '//*[... and position()>2]' the position is relative to each parent's children, so
    # the last-but-one example likely matches nothing for the given input -- verify the shown output.

    source = input or '-'
    with click.open_file(source, mode='rb') as stream:
        root = ET.parse(stream).getroot()

    # Collect every match up front so that nothing is printed if an expression fails to evaluate.
    matches = [element
               for xpath in (xpaths or ())
               for element in xml_utils.get_elements(root, xpath=xpath)]

    wrapper_tag = None if no_root else root_tag

    if wrapper_tag:
        click.echo('<%s>' % wrapper_tag)
    for element in matches:
        click.echo(ET.tostring(element))
    if wrapper_tag:
        click.echo('</%s>' % wrapper_tag)
Esempio n. 2
0
def strip(input, whitespace, empty, xpaths, tags, attributes, attribute_values, empty_attributes, all_attributes,
          all_text, **kwargs):
    """
    Removes specified portions of XML data from the input. Requires valid input.

    This command can be used to simplify complex data (by discarding specific portions of it).
    Such simplification might be used (for example) as part of an interactive data analysis process.

    Note: The 'find' command is both similar to and different than the 'strip -x' command.
    The 'find' command outputs MATCHING input, while 'strip -x' outputs NON-matching input.

    Examples:

        \b
        Example: Remove all d tags that are direct children of b tags:
        $ echo '<a><b><c/></b><b><d><e/></d><d/></b></a>' | python -mclifunzone.xmltool strip -x "//b/d"
        <a><b><c/></b><b/></a>
    """
    # Parameters (kept out of the docstring, which presumably doubles as the CLI help text -- confirm):
    #   input            -- name handed to click.open_file (binary read mode); falsy is replaced by '-'.
    #   whitespace       -- request whitespace removal (parser-level if supported, manual otherwise).
    #   empty            -- repeatedly remove child elements that xml_utils.is_empty_element reports as empty.
    #   xpaths           -- XPath expressions; each is passed to xml_utils.remove_elements. May be None.
    #   tags             -- tag names; each becomes the XPath '//<tag>' and is handled like xpaths.
    #   attributes       -- attribute names passed to xml_utils.remove_attributes_with_name.
    #   attribute_values -- attribute values passed to xml_utils.remove_attributes_with_value.
    #   empty_attributes -- when truthy, call xml_utils.remove_attributes_with_empty_value.
    #   all_attributes   -- when truthy, clear every element's attributes (the three options above are
    #                       then skipped).
    #   all_text         -- when truthy, blank the text of every element that has text.
    #   kwargs           -- accepted and ignored.
    # The resulting tree is serialized with ET.tostring and written with click.echo.

    if not input:
        input = '-'

    parser = None
    if whitespace:
        try:
            parser = ET.XMLParser(remove_blank_text=True)
        except TypeError:
            # TypeError: __init__() got an unexpected keyword argument 'remove_blank_text'
            # The active ET implementation does not support parser-level removal (lxml not imported?),
            # so leave `whitespace` set and fall back to the manual pass further down.
            parser = None
        else:
            # The parser takes care of the whitespace removal, so the manual pass is not needed.
            whitespace = False

    with click.open_file(input, mode='rb') as f:
        # Test against None explicitly rather than relying on the parser object's truthiness.
        tree = ET.parse(f, parser=parser) if parser is not None else ET.parse(f)
    root = tree.getroot()

    # Build a fresh tuple of expressions: this tolerates xpaths=None even when no tags are given
    # (iterating None would raise TypeError) and never extends a caller-owned list in place.
    xpaths = tuple(xpaths or ())
    xpaths += tuple('//{tag}'.format(tag=tag) for tag in (tags or ()))
    for xpath in xpaths:
        xml_utils.remove_elements(root, xpath=xpath)

    if all_attributes:
        for element in root.iter():
            if element.attrib:
                element.attrib.clear()
    else:
        if empty_attributes:
            xml_utils.remove_attributes_with_empty_value(root)
        for attrib_name in (attributes or ()):
            xml_utils.remove_attributes_with_name(root, attrib_name)
        for attrib_value in (attribute_values or ()):
            xml_utils.remove_attributes_with_value(root, attrib_value)

    if all_text:
        # NOTE(review): only `.text` is blanked; text stored in `.tail` (after an element's end tag) is
        # left in place. Confirm whether that is intended.
        for element in root.iter():
            if element.text:
                element.text = ''

    if whitespace:
        # Manual fallback: trims leading/trailing whitespace of each element's `.text`.
        # NOTE(review): `.tail` is not touched here either, so whitespace between sibling elements
        # survives on this path -- confirm whether it should match the parser-level behaviour.
        for element in root.iter():
            if element.text:
                element.text = element.text.strip()

    if empty:
        # Removing empty children can make their parent empty in turn, so keep sweeping the tree
        # until a full pass removes nothing.
        removed_any = True
        while removed_any:
            removed_any = False
            # Snapshot the candidates first: the tree is mutated while we walk these lists.
            parents = [node for node in root.iter() if xml_utils.is_parent_element(node)]
            for parent in parents:
                empty_children = [child for child in xml_utils.get_elements(parent, xpath='./*')
                                  if xml_utils.is_empty_element(child)]
                for child in empty_children:
                    removed_any = True
                    parent.remove(child)

    click.echo(ET.tostring(root))