コード例 #1
0
    def drop_superfluous_attrs(self, fmx_root):
        """
        Remove the temporary CALS-like attributes once the Formex tree is built.

        For every ``TBL`` selected from *fmx_root*, the ``ROW``, ``CELL``,
        ``GR.NOTES``, ``TI.BLK`` and ``STI.BLK`` elements (tag names built with
        ``get_formex_qname``) lose the following attributes, if present:

        - ``@cals:namest`` and ``@cals:nameend`` are defined by ``@COLSPAN``
        - ``@cals:morerows`` is defined by ``@ROWSPAN``
        - ``@cals:rowstyle`` is defined by ``ROW/@TYPE``, ``GR.NOTES``, ``TI.BLK`` or ``STI.BLK``.

        The tree is modified in place; nothing is returned.

        :type  fmx_root: ElementType
        :param fmx_root: Root element of the Formex file.

        .. versionadded:: 0.5.1
        """
        cals = self.get_cals_qname
        fmx = self.get_formex_qname
        watched_tags = {
            fmx(name).text
            for name in ("ROW", "CELL", "GR.NOTES", "TI.BLK", "STI.BLK")
        }
        superfluous = ("namest", "nameend", "morerows", "rowstyle")
        # NOTE(review): "//TBL" is an absolute, non namespace-qualified path,
        # whereas the watched tags go through get_formex_qname -- confirm that
        # TBL elements are never namespaced at this stage.
        for fmx_tbl in fmx_root.xpath("//TBL"):  # type: ElementType
            for _event, node in iterwalk(fmx_tbl, events=("start", ), tag=watched_tags):
                attributes = node.attrib
                for local_name in superfluous:
                    # a missing attribute is not an error: pop with a default
                    attributes.pop(cals(local_name), None)
コード例 #2
0
    def update_no_seq(self, fmx_root):
        """
        Number the tables: set the ``@NO.SEQ`` attribute of each ``TBL``.

        The value is made of one 4-digit, zero-padded counter per nesting level
        of ``TBL``, joined with dots (for instance ``0001``, ``0002`` and then
        ``0002.0001`` for a table nested in the second one).

        The tree is modified in place; nothing is returned.

        :type  fmx_root: ElementType
        :param fmx_root: The result tree which contains the ``TBL`` elements to update.
        """
        # counters[i] is the number of tables seen so far at nesting level i + 1
        counters = []
        for _event, fmx_tbl in iterwalk(fmx_root, events=("start", ), tag=("TBL", )):
            depth = int(fmx_tbl.xpath("count(ancestor-or-self::TBL)"))
            missing = depth - len(counters)
            if missing > 0:
                # first table met at this (deeper) level: start new counters
                counters.extend([0] * missing)
            else:
                # same or shallower level: forget the counters of deeper levels
                del counters[depth:]
            counters[depth - 1] += 1
            parts = [u"{:04d}".format(counter) for counter in counters]
            fmx_tbl.attrib["NO.SEQ"] = u".".join(parts)
コード例 #3
0
ファイル: __init__.py プロジェクト: waynet/benker
    def parse_table(self, w_tbl):
        """
        Parse an Office Open XML ``<w:tbl>`` element into the parser state.

        The sub-tree is walked once with ``iterwalk``; each ``tbl``,
        ``gridCol``, ``tr`` and ``tc`` start event is delegated to the matching
        ``parse_*`` method, and each ``tr`` end event fills the missing cells
        of the current row. Elements located inside a nested ``<w:tbl>`` are
        ignored here.

        :type  w_tbl: etree._Element
        :param w_tbl: Office Open XML element.

        :return: The table held by the parser state (``self._state.table``).
            NOTE(review): presumably a ``benker.table.Table`` -- confirm.

        :raises NotImplementedError: if a start event is received for a tag
            which has no handler.
        """
        state = self._state
        state.reset()

        tbl_tag = w('tbl')
        grid_tag = w('tblGrid')
        grid_col_tag = w('gridCol')
        row_tag = w('tr')
        cell_tag = w('tc')
        watched_tags = {tbl_tag, grid_tag, grid_col_tag, row_tag, cell_tag}

        nesting = 0
        for event, node in iterwalk(w_tbl, events=('start', 'end'), tag=watched_tags):
            tag = node.tag
            if tag == tbl_tag:
                nesting += 1 if event == 'start' else -1
            if nesting > 1:
                # .. note:: context.skip_subtree() is not available for all version of lxml
                # This element belongs to a <tbl> nested inside the table:
                # it is not handled by this walk.
                continue

            if event != 'start':
                # "end" event: only the end of a row needs some work
                if tag == row_tag:
                    # add missing entries on the current row
                    row_pos = state.row_pos
                    row_box = Box(1, row_pos, len(state.table.cols), row_pos)
                    state.table.fill_missing(row_box, None, nature=state.row.nature)
                continue

            # "start" event: dispatch on the tag
            if tag == tbl_tag:
                self.parse_tbl(node)
            elif tag == grid_tag:
                # this element has no specific data
                pass
            elif tag == grid_col_tag:
                state.next_col()
                self.parse_grid_col(node)
            elif tag == row_tag:
                state.next_row()
                self.parse_tr(node)
            elif tag == cell_tag:
                state.next_col()
                self.parse_tc(node)
            else:
                raise NotImplementedError(tag)

        return state.table
コード例 #4
0
ファイル: test_lxml_Iterwalk.py プロジェクト: waynet/benker
def test_iterwalk_root(events, tag, expected):
    """Check the ``(event, tag)`` pairs yielded by ``iterwalk`` on a tiny tree."""
    tree = etree.XML("<root><a/></root>")
    observed = []
    for event, node in iterwalk(tree, events=events, tag=tag):
        observed.append((event, node.tag))
    assert expected == observed
コード例 #5
0
ファイル: formex.py プロジェクト: waynet/benker
    def parse_table(self, fmx_corpus):
        """
        Convert a ``<CORPUS>`` Formex element into table object.

        The sub-tree of *fmx_corpus* is walked once with ``iterwalk``:

        - on "start" events, ``CELL``, ``ROW``, ``TI.BLK``, ``STI.BLK``,
          ``colspec`` and ``CORPUS`` are delegated to the ``parse_fmx_*``
          methods (``BLK`` is only a container);
        - on the "end" event of ``ROW``, ``TI.BLK`` and ``STI.BLK``, the missing
          cells of the current row are filled;
        - on the "end" event of the ``CORPUS``, each preceding-sibling
          ``GR.NOTES`` is converted into a row and removed from its parent,
          then the missing cells of the whole table are filled.

        A ``CORPUS`` nested inside the table is entirely ignored, including its
        own "start" and "end" events: its "end" event must not be mistaken for
        the end of the table being parsed.

        :type  fmx_corpus: ElementType
        :param fmx_corpus: Formex element.

        :return: The table held by the parser state (``self._state.table``).

        :raises NotImplementedError: if a ``MARGIN`` element is found in the
            table, or if a start event is received for a tag without handler.
        """
        state = self._state
        state.reset()

        # -- Formex elements
        fmx = self.get_formex_qname

        CORPUS = fmx("CORPUS").text
        ROW = fmx("ROW").text
        CELL = fmx("CELL").text
        MARGIN = fmx("MARGIN").text
        BLK = fmx("BLK").text
        TI_BLK = fmx("TI.BLK").text
        STI_BLK = fmx("STI.BLK").text

        # -- CALS-like elements
        cals = self.get_cals_qname

        colspec = cals("colspec").text

        elements = {CORPUS, ROW, CELL, MARGIN, BLK, TI_BLK, STI_BLK, colspec}
        # elements which produce a row in the table
        row_like = {ROW, TI_BLK, STI_BLK}
        context = iterwalk(fmx_corpus, events=("start", "end"), tag=elements)

        depth = 0
        for action, elem in context:
            elem_tag = elem.tag
            is_corpus = elem_tag == CORPUS
            if is_corpus and action == "start":
                depth += 1
            # The nesting test is done *after* the increment of a "start" event
            # but *before* the decrement of an "end" event, so that both events
            # of a nested CORPUS are seen as nested.
            nested = depth > 1
            if is_corpus and action == "end":
                depth -= 1
            if nested:
                # .. note:: context.skip_subtree() is not available for all version of lxml
                # This element belongs to a table nested inside the table:
                # it is not handled by this walk.
                continue
            if action == "start":
                # tags sorted by frequency:
                if elem_tag == CELL:
                    state.next_col()
                    self.parse_fmx_cell(elem)

                elif elem_tag == ROW:
                    state.next_row()
                    self.parse_fmx_row(elem)

                elif elem_tag == BLK:
                    # only a container
                    pass

                elif elem_tag == TI_BLK:
                    state.next_row()
                    self.parse_fmx_ti_blk(elem)

                elif elem_tag == STI_BLK:
                    state.next_row()
                    self.parse_fmx_sti_blk(elem)

                elif elem_tag == colspec:
                    state.next_col()
                    self.parse_fmx_colspec(elem)

                elif elem_tag == CORPUS:
                    self.parse_fmx_corpus(elem)

                elif elem_tag == MARGIN:
                    raise NotImplementedError("MARGIN is not supported yet")

                else:
                    raise NotImplementedError(elem_tag)
            else:
                if elem_tag in row_like:
                    # add missing entries on the current row
                    bounding_box = Box(1, state.row_pos, len(state.table.cols),
                                       state.row_pos)
                    state.table.fill_missing(bounding_box,
                                             None,
                                             nature=state.row.nature)
                elif elem_tag == CORPUS:
                    # if there is a GR.NOTES, we create a row for it with the nature "footer"
                    if self.formex_ns:
                        nodes = elem.xpath("preceding-sibling::fmx:GR.NOTES",
                                           namespaces={"fmx": self.formex_ns})
                    else:
                        nodes = elem.xpath("preceding-sibling::GR.NOTES")
                    for fmx_gr_notes in nodes:
                        # Convert the GR.NOTES and remove it
                        state.next_row()
                        self.parse_gr_notes(fmx_gr_notes)
                        fmx_tbl = fmx_gr_notes.getparent()
                        fmx_tbl.remove(fmx_gr_notes)
                    state.table.fill_missing(state.table.bounding_box, None)

        return state.table
コード例 #6
0
    def parse_table(self, cals_table):
        """
        Convert a ``<table>`` CALS element into table object.

        The sub-tree of *cals_table* is walked once with ``iterwalk``:

        - on "start" events, ``entry``, ``row``, ``colspec``, ``tgroup`` and
          ``table`` are delegated to the ``parse_cals_*`` methods (``thead``,
          ``tfoot`` and ``tbody`` need no specific processing here);
        - on the "end" event of a ``row``, the missing cells of the current row
          are filled;
        - on the "end" event of the ``table``, the missing cells of the whole
          table are filled.

        A ``table`` nested inside the table is entirely ignored, including its
        own "start" and "end" events: its "end" event must not be mistaken for
        the end of the table being parsed.

        :type  cals_table: ElementType
        :param cals_table: CALS element.

        :rtype: benker.table.Table
        :return: Table.

        :raises NotImplementedError: if a start event is received for a tag
            which has no handler.
        """
        state = self._state
        state.reset()

        # -- CALS elements
        cals = self.get_cals_qname

        table = cals("table").text
        # titles = cals("titles").text  # not supported
        tgroup = cals("tgroup").text
        colspec = cals("colspec").text
        thead = cals("thead").text
        tfoot = cals("tfoot").text
        tbody = cals("tbody").text
        row = cals("row").text
        # entrytbl = cals("entrytbl").text  # not supported
        entry = cals("entry").text

        elements = {table, tgroup, colspec, thead, tfoot, tbody, row, entry}
        # containers of rows: they are watched but need no processing
        row_groups = {tbody, tfoot, thead}
        context = iterwalk(cals_table, events=("start", "end"), tag=elements)

        depth = 0
        for action, elem in context:
            elem_tag = elem.tag
            is_table = elem_tag == table
            if is_table and action == "start":
                depth += 1
            # The nesting test is done *after* the increment of a "start" event
            # but *before* the decrement of an "end" event, so that both events
            # of a nested table are seen as nested.
            nested = depth > 1
            if is_table and action == "end":
                depth -= 1
            if nested:
                # .. note:: context.skip_subtree() is not available for all version of lxml
                # This element belongs to a <table> nested inside the table:
                # it is not handled by this walk.
                continue
            if action == "start":
                # tags sorted by frequency:
                if elem_tag == entry:
                    state.next_col()
                    self.parse_cals_entry(elem)

                elif elem_tag == row:
                    state.next_row()
                    self.parse_cals_row(elem)

                elif elem_tag in row_groups:
                    # nothing to do for the row containers in this loop
                    pass

                elif elem_tag == colspec:
                    state.next_col()
                    self.parse_cals_colspec(elem)

                elif elem_tag == tgroup:
                    self.parse_cals_tgroup(elem)

                elif elem_tag == table:
                    self.parse_cals_table(elem)

                else:
                    raise NotImplementedError(elem_tag)
            else:
                if elem_tag == row:
                    # add missing entries on the current row
                    bounding_box = Box(1, state.row_pos, len(state.table.cols), state.row_pos)
                    state.table.fill_missing(bounding_box, None, nature=state.row.nature)
                elif elem_tag == table:
                    # end of the (outermost) table: fill every remaining hole
                    state.table.fill_missing(state.table.bounding_box, None)

        return state.table