def drop_superfluous_attrs(self, fmx_root):
    """
    Remove the superfluous CALS-like attributes once the Formex tree is built.

    - ``@cals:namest`` and ``@cals:nameend`` are defined by ``@COLSPAN``
    - ``@cals:morerows`` is defined by ``@ROWSPAN``
    - ``@cals:rowstyle`` is defined by ``ROW/@TYPE``, ``GR.NOTES``,
      ``TI.BLK`` or ``STI.BLK``.

    The attributes are removed in place from the ``ROW``, ``CELL``,
    ``GR.NOTES``, ``TI.BLK`` and ``STI.BLK`` elements found under each
    ``TBL`` element.

    :type  fmx_root: ElementType
    :param fmx_root: Root element of the Formex file.

    .. versionadded:: 0.5.1
    """
    qualify_cals = self.get_cals_qname
    qualify_fmx = self.get_formex_qname

    # Qualified names of the Formex elements to visit.
    visited_tags = {
        qualify_fmx(local_name).text
        for local_name in ("ROW", "CELL", "GR.NOTES", "TI.BLK", "STI.BLK")
    }
    # Local names of the CALS-like attributes to drop.
    superfluous = ("namest", "nameend", "morerows", "rowstyle")

    for fmx_tbl in fmx_root.xpath("//TBL"):  # type: ElementType
        for _event, node in iterwalk(fmx_tbl, events=("start", ), tag=visited_tags):
            for attr_name in superfluous:
                # The default value makes a missing attribute a no-op.
                node.attrib.pop(qualify_cals(attr_name), None)
def update_no_seq(self, fmx_root):
    """
    Set the ``@NO.SEQ`` attribute (table sequence number) of each ``TBL``.

    The value holds one zero-padded, 4-digit counter per nesting level,
    joined with dots. For instance, two sibling tables get ``0001`` and
    ``0002``, and the first table nested in the second one gets
    ``0002.0001``.

    :type  fmx_root: ElementType
    :param fmx_root: The result tree which contains the ``TBL`` elements
        to update (modified in place).
    """
    # counters[i] is the number of TBL seen so far at nesting level i + 1
    counters = []
    walker = iterwalk(fmx_root, events=("start", ), tag=("TBL", ))
    for _event, fmx_tbl in walker:  # type: str, ElementType
        # Nesting level of this TBL: 1 for a table which has no TBL ancestor.
        level = int(fmx_tbl.xpath("count(ancestor-or-self::TBL)"))
        missing = level - len(counters)
        if missing > 0:
            # Going deeper: open new counters, starting from zero.
            counters.extend([0] * missing)
        else:
            # Same level or shallower: forget the counters of deeper levels.
            del counters[level:]
        counters[level - 1] += 1
        fmx_tbl.attrib["NO.SEQ"] = u".".join(u"{:04d}".format(counter) for counter in counters)
def parse_table(self, w_tbl):
    """
    Convert an Office Open XML ``<w:tbl>`` element into a table object.

    The element is walked with ``iterwalk``; each ``tbl``, ``gridCol``,
    ``tr`` and ``tc`` start event is dispatched to the matching
    ``parse_*`` method, and the missing cells of a row are filled when
    the row ends. Tables nested inside ``w_tbl`` are skipped.

    :type  w_tbl: etree._Element
    :param w_tbl: Office Open XML element.

    :rtype: benker.table.Table
    :return: The table built in the parser state (``self._state.table``).

    :raises NotImplementedError: if a start event is received for a tag
        which has no handler.
    """
    state = self._state
    state.reset()

    # Qualified tag names, computed once instead of on every event.
    tbl_tag = w('tbl')
    tbl_grid_tag = w('tblGrid')
    grid_col_tag = w('gridCol')
    tr_tag = w('tr')
    tc_tag = w('tc')
    elements = {tbl_tag, tbl_grid_tag, grid_col_tag, tr_tag, tc_tag}

    context = iterwalk(w_tbl, events=('start', 'end'), tag=elements)

    depth = 0
    for action, elem in context:
        elem_tag = elem.tag
        if elem_tag == tbl_tag:
            if action == 'start':
                depth += 1
            else:
                depth -= 1
        if depth > 1:
            # .. note:: context.skip_subtree() is not available for all version of lxml
            # This <tbl> element is inside the table:
            # it is not handled by this call.
            continue
        if action == 'start':
            if elem_tag == tbl_tag:
                self.parse_tbl(elem)
            elif elem_tag == tbl_grid_tag:
                # this element has no specific data
                pass
            elif elem_tag == grid_col_tag:
                state.next_col()
                self.parse_grid_col(elem)
            elif elem_tag == tr_tag:
                state.next_row()
                self.parse_tr(elem)
            elif elem_tag == tc_tag:
                state.next_col()
                self.parse_tc(elem)
            else:
                raise NotImplementedError(elem_tag)
        elif elem_tag == tr_tag:
            # End of a row: add the missing entries of that row.
            bounding_box = Box(1, state.row_pos, len(state.table.cols), state.row_pos)
            state.table.fill_missing(bounding_box, None, nature=state.row.nature)
    return state.table
def test_iterwalk_root(events, tag, expected):
    """Check the ``(event, tag)`` pairs yielded by ``iterwalk`` on a tiny tree."""
    tree = etree.XML("<root><a/></root>")
    actual = []
    for event, node in iterwalk(tree, events=events, tag=tag):
        actual.append((event, node.tag))
    assert expected == actual
def parse_table(self, fmx_corpus):
    """
    Build a table object from a Formex ``<CORPUS>`` element.

    The element is walked with ``iterwalk``: start events are dispatched
    to the ``parse_fmx_*`` methods, and end events fill the missing cells.
    When the ``CORPUS`` ends, each preceding ``GR.NOTES`` sibling is
    converted into a row and removed from its parent.

    :type  fmx_corpus: ElementType
    :param fmx_corpus: Formex element.

    :return: The table built in the parser state (``self._state.table``).

    :raises NotImplementedError: for ``MARGIN`` elements, or for a tag
        which has no handler.
    """
    state = self._state
    state.reset()

    # -- Formex elements
    qualify = self.get_formex_qname
    corpus_tag = qualify("CORPUS").text
    row_tag = qualify("ROW").text
    cell_tag = qualify("CELL").text
    margin_tag = qualify("MARGIN").text
    blk_tag = qualify("BLK").text
    ti_blk_tag = qualify("TI.BLK").text
    sti_blk_tag = qualify("STI.BLK").text

    # -- CALS-like elements
    colspec_tag = self.get_cals_qname("colspec").text

    # Elements which produce a row in the table.
    row_level_tags = {row_tag, ti_blk_tag, sti_blk_tag}
    walked_tags = row_level_tags | {corpus_tag, cell_tag, margin_tag, blk_tag, colspec_tag}

    depth = 0
    for event, node in iterwalk(fmx_corpus, events=("start", "end"), tag=walked_tags):
        tag = node.tag
        starting = event == "start"
        if tag == corpus_tag:
            depth += 1 if starting else -1
        if depth > 1:
            # .. note:: context.skip_subtree() is not available for all version of lxml
            # This element belongs to a nested table: it is not handled by this call.
            continue

        if starting:
            # tags sorted by frequency:
            if tag == cell_tag:
                state.next_col()
                self.parse_fmx_cell(node)
            elif tag == row_tag:
                state.next_row()
                self.parse_fmx_row(node)
            elif tag == blk_tag:
                # only a container
                pass
            elif tag == ti_blk_tag:
                state.next_row()
                self.parse_fmx_ti_blk(node)
            elif tag == sti_blk_tag:
                state.next_row()
                self.parse_fmx_sti_blk(node)
            elif tag == colspec_tag:
                state.next_col()
                self.parse_fmx_colspec(node)
            elif tag == corpus_tag:
                self.parse_fmx_corpus(node)
            elif tag == margin_tag:
                raise NotImplementedError("MARGIN is not supported yet")
            else:
                raise NotImplementedError(tag)
            continue

        # -- "end" events
        if tag in row_level_tags:
            # Fill the missing cells of the row which has just ended.
            row_box = Box(1, state.row_pos, len(state.table.cols), state.row_pos)
            state.table.fill_missing(row_box, None, nature=state.row.nature)
        elif tag == corpus_tag:
            # NOTE(review): the end of a nested CORPUS brings the depth back
            # to 1, so this branch runs for it too -- confirm this is intended.
            # if there is a GR.NOTES, we create a row for it with the nature "footer"
            if self.formex_ns:
                gr_notes_nodes = node.xpath(
                    "preceding-sibling::fmx:GR.NOTES",
                    namespaces={"fmx": self.formex_ns},
                )
            else:
                gr_notes_nodes = node.xpath("preceding-sibling::GR.NOTES")
            for fmx_gr_notes in gr_notes_nodes:
                # Convert the GR.NOTES and remove it
                state.next_row()
                self.parse_gr_notes(fmx_gr_notes)
                fmx_gr_notes.getparent().remove(fmx_gr_notes)
            state.table.fill_missing(state.table.bounding_box, None)
    return state.table
def parse_table(self, cals_table):
    """
    Build a table object from a CALS ``<table>`` element.

    The element is walked with ``iterwalk``: start events are dispatched
    to the ``parse_cals_*`` methods, and end events fill the missing cells
    of the row, then of the whole table.

    :type  cals_table: ElementType
    :param cals_table: CALS element.

    :rtype: benker.table.Table
    :return: Table.

    :raises NotImplementedError: if a start event is received for a tag
        which has no handler.
    """
    state = self._state
    state.reset()

    # -- CALS elements (``titles`` and ``entrytbl`` are not supported)
    qualify = self.get_cals_qname
    table_tag = qualify("table").text
    tgroup_tag = qualify("tgroup").text
    colspec_tag = qualify("colspec").text
    thead_tag = qualify("thead").text
    tfoot_tag = qualify("tfoot").text
    tbody_tag = qualify("tbody").text
    row_tag = qualify("row").text
    entry_tag = qualify("entry").text

    # Row groups are plain containers during the walk.
    row_group_tags = {tbody_tag, tfoot_tag, thead_tag}
    walked_tags = row_group_tags | {table_tag, tgroup_tag, colspec_tag, row_tag, entry_tag}

    depth = 0
    for event, node in iterwalk(cals_table, events=("start", "end"), tag=walked_tags):
        tag = node.tag
        starting = event == "start"
        if tag == table_tag:
            depth += 1 if starting else -1
        if depth > 1:
            # .. note:: context.skip_subtree() is not available for all version of lxml
            # This element belongs to a nested table: it is not handled by this call.
            continue

        if starting:
            # tags sorted by frequency:
            if tag == entry_tag:
                state.next_col()
                self.parse_cals_entry(node)
            elif tag == row_tag:
                state.next_row()
                self.parse_cals_row(node)
            elif tag in row_group_tags:
                # Nothing to do here. The original comment says everything is
                # done in parse_fmx_row(); presumably parse_cals_row() is
                # meant -- TODO confirm.
                pass
            elif tag == colspec_tag:
                state.next_col()
                self.parse_cals_colspec(node)
            elif tag == tgroup_tag:
                self.parse_cals_tgroup(node)
            elif tag == table_tag:
                self.parse_cals_table(node)
            else:
                raise NotImplementedError(tag)
            continue

        # -- "end" events
        if tag == row_tag:
            # Fill the missing cells of the row which has just ended.
            row_box = Box(1, state.row_pos, len(state.table.cols), state.row_pos)
            state.table.fill_missing(row_box, None, nature=state.row.nature)
        elif tag == table_tag:
            # NOTE(review): the end of a nested table brings the depth back
            # to 1, so this branch runs for it too -- confirm this is intended.
            state.table.fill_missing(state.table.bounding_box, None)
    return state.table