Exemple #1
0
 def test_set_caret(self) -> None:
     """Check ``rightmost_branches`` after moving the caret to depths 3, 2, 1."""
     collector = DepthCollector(3)
     # (target depth, expected rightmost_branches after set_caret(depth)),
     # applied in sequence to the same collector.
     steps = (
         (3, [[[[]]], [[]], []]),
         (2, [[[[]]], [[]]]),
         (1, [[[[]]]]),
     )
     for depth, expected in steps:
         collector.set_caret(depth)
         assert collector.rightmost_branches == expected
Exemple #2
0
 def test_raise_caret(self) -> None:
     """Raising the caret removes the last entry of ``rightmost_branches``."""
     collector = DepthCollector(3)  # caret = [[]]

     # Go one level down first so there is a level to come back up from.
     collector.drop_caret()
     assert collector.rightmost_branches == [[[]], []]

     # Coming back up drops the trailing entry; the nested list remains.
     collector.raise_caret()
     assert collector.rightmost_branches == [[[]]]
Exemple #3
0
 def test_caret_will_not_drop_past_item_depth(self) -> None:
     """Dropping the caret below ``item_depth`` raises ``CaretDepthError``."""
     collector = DepthCollector(3)  # at depth 1
     # Two legal drops: depth 1 -> 2 -> 3 (item_depth).
     for _ in range(2):
         collector.drop_caret()
     # A third drop would go past item_depth and must be rejected.
     with pytest.raises(CaretDepthError):
         collector.drop_caret()
Exemple #4
0
def get_text(xml: bytes, context: Dict[str, Any]) -> TablesList:
    """Extract the text of an xml document into a nested list of strings.

    :param xml: an xml bytes object which might contain text
    :param context: dictionary of document attributes generated in get_docx_text.
        This function reads ``context["do_html"]`` and ``context["rId2Target"]``
        and passes the whole dictionary on to ``_get_bullet_string``.
    :returns: A 4-deep nested list of strings.

    Sorts the text into the DepthCollector instance, five-levels deep

    ``[table][row][cell][paragraph][run]`` is a string

    Joins the runs before returning, so return list will be

    ``[table][row][cell][paragraph]`` is a string

    Image elements that lack the expected relationship attribute (``r:embed``
    or ``r:id``) are skipped, exactly like images whose relationship id is not
    in ``context["rId2Target"]``.

    If you'd like to extend or edit this package, this function is probably where you
    want to do it. Nothing tricky here except keeping track of the text formatting.
    """
    tables = DepthCollector(5)
    do_html = context["do_html"]

    # noinspection PyPep8Naming
    def branches(branch: Element) -> None:
        """
        Recursively iterate over descendents of branch. Add text when found.

        :param branch: An Element from an xml file (ElementTree)
        :return: None. Adds text cells to outer variable `tables`.
        """
        for child in branch:
            tag = child.tag

            # set caret depth
            if tag == TABLE:
                tables.set_caret(1)
            elif tag == TABLE_ROW:
                tables.set_caret(2)
            elif tag == TABLE_CELL:
                tables.set_caret(3)
            elif tag == PARAGRAPH:
                tables.set_caret(4)

            # open elements
            if tag == PARAGRAPH:
                tables.insert(_get_bullet_string(child, context))

            elif tag == RUN and do_html is True:
                # new text run
                run_style = get_run_style(child)
                # open_style is stored on the collector so it survives across
                # recursive calls; it is unset until the first styled run.
                open_style = getattr(tables, "open_style", ())
                if run_style != open_style:
                    tables.insert(style_close(open_style))
                    tables.insert(style_open(run_style))
                    tables.open_style = run_style

            elif tag == TEXT:
                # new text object. oddly enough, these don't all contain text
                text = child.text if child.text is not None else ""
                if do_html is True:
                    # keep literal angle brackets from being read as html tags
                    text = text.replace("<", "&lt;")
                    text = text.replace(">", "&gt;")
                tables.insert(text)

            elif tag == FOOTNOTE:
                # skip elements whose w:type mentions "separator"
                if "separator" not in child.attrib.get(qn("w:type"), "").lower():
                    tables.insert("footnote{})\t".format(child.attrib[qn('w:id')]))

            elif tag == ENDNOTE:
                # skip elements whose w:type mentions "separator"
                if "separator" not in child.attrib.get(qn("w:type"), "").lower():
                    tables.insert("endnote{})\t".format(child.attrib[qn('w:id')]))

            # add placeholders
            elif tag == FOOTNOTE_REFERENCE:
                tables.insert("----footnote{}----".format(child.attrib[qn('w:id')]))

            elif tag == ENDNOTE_REFERENCE:
                tables.insert("----endnote{}----".format(child.attrib[qn('w:id')]))

            elif tag == IMAGE:
                # The relationship attribute may be absent; use .get so one odd
                # image element cannot abort extraction with a KeyError.
                rId = child.attrib.get(qn("r:embed"))
                image = (
                    context["rId2Target"].get(rId) if rId is not None else None
                )
                if image:
                    tables.insert("----{}----".format(image))

            elif tag == IMAGEDATA:
                # Same tolerance as for IMAGE: a missing r:id is skipped.
                rId = child.attrib.get(qn("r:id"))
                image = (
                    context["rId2Target"].get(rId) if rId is not None else None
                )
                if image:
                    tables.insert("----{}----".format(image))

            elif tag == TAB:
                tables.insert("\t")

            # enter child element
            branches(child)

            # close elements
            if tag == PARAGRAPH and do_html is True:
                # close whatever style is still open at the end of the paragraph
                tables.insert(style_close(getattr(tables, "open_style", ())))
                tables.open_style = ()

            if tag in {TABLE_ROW, TABLE_CELL, PARAGRAPH}:
                tables.raise_caret()

            elif tag == TABLE:
                # NOTE(review): presumably resets to table depth so content
                # following the table starts a new branch -- confirm against
                # DepthCollector.set_caret.
                tables.set_caret(1)

    branches(ElementTree.fromstring(xml))

    # collapse the innermost level: join each paragraph's runs into one string
    tree = tables.tree
    for (i, j, k, l), paragraph in enum_at_depth(tree, 4):
        tree[i][j][k][l] = "".join(paragraph)

    return tree
Exemple #5
0
 def test_insert(self) -> None:
     """An inserted item ends up nested ``item_depth`` levels deep."""
     collector = DepthCollector(5)
     collector.insert("text")
     # The first rightmost branch is the root; with item_depth 5 the item
     # sits inside five nested lists.
     root_branch = collector.rightmost_branches[0]
     assert root_branch == [[[[["text"]]]]]
Exemple #6
0
 def test_caret_will_not_raise_past_root(self) -> None:
     """Raising the caret on a fresh collector raises ``CaretDepthError``."""
     collector = DepthCollector(3)  # caret = [[]]
     # No drop has happened yet, so there is no level to go back up to.
     with pytest.raises(CaretDepthError):
         collector.raise_caret()
Exemple #7
0
 def test_last_caret(self) -> None:
     """After a drop, the newest branch is the last child of its parent."""
     collector = DepthCollector(3)
     collector.drop_caret()
     assert collector.rightmost_branches == [[[]], []]

     # Identity, not just equality: the last entry must be the very list
     # object stored at the end of the entry before it.
     newest = collector.rightmost_branches[-1]
     parent = collector.rightmost_branches[-2]
     assert newest is parent[-1]
Exemple #8
0
 def test_init(self) -> None:
     """A new collector stores its item depth and starts with one empty branch."""
     depth = 3
     collector = DepthCollector(depth)
     assert collector.item_depth == depth
     assert collector.rightmost_branches == [[]]