def notes_extraction(file_name):
    """Collect the tables of a .docx file together with their caption text.

    The document is flattened with ``iter_block_items`` and scanned from the
    third block onwards.  For every ``Table`` block, the two blocks in front
    of it decide how the table is recorded:

    * an empty string preceded by a table: the entry reuses the name of the
      most recently recorded table;
    * a string preceded by a table: that string alone is the name;
    * two strings: both strings, in document order, form the name.

    A table matching none of these patterns is not recorded.

    Args:
        file_name: Path of the .docx file, passed to ``Document``.

    Returns:
        A list of ``(file_name, table_name, df)`` tuples, where ``table_name``
        is a list of strings and ``df`` is the value returned by
        ``trim_table_to_df``.
    """
    document = Document(file_name)
    # len() and indexing are used below, so make sure we hold a real list
    # even when iter_block_items is implemented as a generator.
    block_list = list(iter_block_items(document))
    result_list = []

    for index in range(2, len(block_list)):
        table = block_list[index]
        if not isinstance(table, Table):
            continue

        previous = block_list[index - 1]
        before_previous = block_list[index - 2]
        df = trim_table_to_df(read_table(table))

        # An earlier variant of this check also accepted a '(续)' caption
        # as the continuation marker.
        if previous == '' and isinstance(before_previous, Table):
            # NOTE(review): the entry carries the data of the *preceding*
            # table rather than of the current one - confirm this is intended.
            df = trim_table_to_df(read_table(before_previous))
            # Reuse the latest recorded name; fall back to an empty name
            # when nothing has been recorded yet instead of raising IndexError.
            inherited_name = result_list[-1][1] if result_list else []
            result_list.append((file_name, inherited_name, df))
        elif isinstance(previous, str) and isinstance(before_previous, Table):
            result_list.append((file_name, [previous], df))
        elif isinstance(previous, str) and isinstance(before_previous, str):
            result_list.append((file_name, [before_previous, previous], df))

    return result_list
def iter_block_items(document):
    """Yield the body-level paragraphs and tables of *document*.

    Every ``CT_P`` child of ``document.element.body`` is wrapped in a
    ``Paragraph`` and every ``CT_Tbl`` child in a ``Table``; children of any
    other type are skipped.  Items come out in the order ``iterchildren()``
    produces them.
    """
    for node in document.element.body.iterchildren():
        if isinstance(node, CT_P):
            yield Paragraph(node, document)
            continue
        if isinstance(node, CT_Tbl):
            yield Table(node, document)
def row_cells_fixture(self, _cells_, _column_count_):
    """Fixture for row-cell lookup on a bare ``Table``.

    Configures ``_cells_`` to return the integers 0..8 and
    ``_column_count_`` to return 3, then returns the table, the row index
    under test (1) and the cells expected for that row.
    """
    table = Table(None, None)
    _column_count_.return_value = 3
    _cells_.return_value = list(range(9))
    return table, 1, [3, 4, 5]
def col_cells_fixture(self, _cells_, _column_count_):
    """Fixture for column-cell lookup on a bare ``Table``.

    Configures ``_cells_`` to return the integers 0..8 and
    ``_column_count_`` to return 3, then returns the table, the column index
    under test (1) and the cells expected for that column.
    """
    table = Table(None, None)
    _column_count_.return_value = 3
    _cells_.return_value = list(range(9))
    return table, 1, [1, 4, 7]
def iter_block_items(self, parent):
    """Yield each paragraph and table directly inside *parent*.

    See https://github.com/python-openxml/python-docx/issues/40

    *parent* may be a ``docx_Document`` (its body element is walked) or a
    ``_Cell`` (its ``_tc`` element is walked).  ``CT_P`` children are yielded
    as ``Paragraph`` and ``CT_Tbl`` children as ``Table``; anything else is
    skipped.  The original author notes that this walk is used to learn
    where tables sit relative to paragraphs in a document.

    Raises:
        ValueError: if *parent* is neither a ``docx_Document`` nor a ``_Cell``.
    """
    if isinstance(parent, docx_Document):
        container = parent.element.body
    else:
        if not isinstance(parent, _Cell):
            raise ValueError("Something is not right")
        container = parent._tc

    for node in container.iterchildren():
        if isinstance(node, CT_P):
            yield Paragraph(node, parent)
            continue
        if isinstance(node, CT_Tbl):
            yield Table(node, parent)
def read_item_block(parent):
    """Yield the paragraphs and tables directly inside *parent* as tuples.

    *parent* may be a ``_Document``, a ``_Cell`` or a ``_Row``.

    Yields:
        ``(paragraph, flag)`` for each ``CT_P`` child.  ``flag`` is ``1`` only
        when the paragraph text is empty and its XML contains at least one
        ``pic:blipFill`` element; in every other case (including a failure
        while inspecting the XML) it is ``0``.
        ``(table,)`` - a one-element tuple - for each ``CT_Tbl`` child.

    Raises:
        ValueError: if *parent* is none of the supported types.
    """
    if isinstance(parent, _Document):
        container = parent.element.body
    elif isinstance(parent, _Cell):
        container = parent._tc
    else:
        if not isinstance(parent, _Row):
            raise ValueError("something's not right")
        container = parent._tr

    with_picture, without_picture = 1, 0

    for node in container.iterchildren():
        if isinstance(node, CT_P):
            paragraph = Paragraph(node, parent)
            if paragraph.text != '':
                yield (paragraph, without_picture)
                continue
            try:
                # Look for an inline picture inside the otherwise empty paragraph.
                from xml.dom.minidom import parseString
                root = parseString(node.xml).documentElement
                fills = root.getElementsByTagName('pic:blipFill')
                print('*nodelist' * 9, fills)
                yield (paragraph, with_picture if fills else without_picture)
            except Exception as e:
                print('*' * 9, e)
                yield (paragraph, without_picture)
        elif isinstance(node, CT_Tbl):
            yield (Table(node, parent), )
def iter_block_items(parent):
    """Yield each paragraph and table directly inside *parent*.

    *parent* may be a ``_Document`` (body element), a ``_Cell`` (``_tc``
    element) or a ``_Row`` (``_tr`` element).  ``CT_P`` children are yielded
    as ``Paragraph`` and ``CT_Tbl`` children as ``Table``; other children
    are skipped.

    Raises:
        ValueError: if *parent* is none of the supported types.
    """
    if isinstance(parent, _Document):
        root = parent.element.body
    elif isinstance(parent, _Cell):
        root = parent._tc
    else:
        if not isinstance(parent, _Row):
            raise ValueError("something's not right")
        root = parent._tr

    # TODO: support floating tables.  Per the original author's note they do
    # not necessarily appear in the XML in the same order as they do visually;
    # the suggested workaround is to set the table's text wrapping to "None"
    # in Word's table properties and then move the table to the right place.
    for node in root.iterchildren():
        block = None
        if isinstance(node, CT_P):
            block = Paragraph(node, parent)
        elif isinstance(node, CT_Tbl):
            block = Table(node, parent)
        if block is not None:
            yield block
def add_column_fixture(self):
    """Fixture for adding a column.

    Builds a ``Table`` from the first 'add-row-col' snippet and returns it
    with a 1.5 inch width and the third snippet as the expected XML.
    """
    snippets = snippet_seq('add-row-col')
    table = Table(parse_xml(snippets[0]), None)
    return table, Inches(1.5), snippets[2]
def _iter_block_items(parent):
    """Yield each paragraph and table directly inside *parent*.

    *parent* may be a ``DocType`` (its body element is walked) or a ``_Cell``
    (its ``_tc`` element is walked).  ``CT_P`` children are yielded as
    ``Paragraph`` and ``CT_Tbl`` children as ``Table``; others are skipped.

    Credit: @scanny,
    https://github.com/python-openxml/python-docx/issues/276#issuecomment-199502885

    Raises:
        ValueError: if *parent* is neither a ``DocType`` nor a ``_Cell``.
    """
    from docx.table import _Cell
    from docx.oxml import CT_P, CT_Tbl

    if isinstance(parent, DocType):
        container = parent.element.body
    else:
        if not isinstance(parent, _Cell):
            raise ValueError("something's not right")
        container = parent._tc

    for node in container.iterchildren():
        if isinstance(node, CT_P):
            yield Paragraph(node, parent)
            continue
        if isinstance(node, CT_Tbl):
            yield Table(node, parent)
def iter_block_rpd_items(self, parent):
    """Yield the text of each paragraph and table directly inside *parent*.

    *parent* may be a ``Document`` (body element) or a ``_Cell`` (``_tc``
    element).  A paragraph is yielded as its ``.text``.  A table is yielded
    as one string: the prefix ``"Таблица: "``, then for every row the text
    of each cell followed by ``'~'``, and ``'@'`` at the end of the row.

    If reading the cells of a row fails, ``'out of range'`` is printed and
    whatever was collected for that row so far is kept (best effort).

    Raises:
        ValueError: if *parent* is neither a ``Document`` nor a ``_Cell``.
    """
    if isinstance(parent, Document):
        parent_elm = parent.element.body
    elif isinstance(parent, _Cell):
        parent_elm = parent._tc
    else:
        raise ValueError("something's not right")

    for child in parent_elm.iterchildren():
        if isinstance(child, CT_P):
            yield Paragraph(child, parent).text
        elif isinstance(child, CT_Tbl):
            pieces = ["Таблица: "]
            for row in Table(child, parent).rows:
                try:
                    for cell in row.cells:
                        pieces.append(cell.text)
                        pieces.append('~')
                except Exception:
                    # Still best effort, but no longer swallows
                    # KeyboardInterrupt / SystemExit as a bare except did.
                    print('out of range')
                pieces.append('@')
            yield ''.join(pieces)
def docx_to_text(document_path, event_handler):
    """Extract the text of a .docx file.

    First attempt: open the file with python-docx and walk the body.  Each
    paragraph contributes its text; each table contributes one line per row
    with the cell texts joined by ``' | '``.  Blocks are joined with a blank
    line and the result is stripped.

    Fallback: if that attempt raises, the exception is logged (using
    ``event_handler.key`` in the message) and ``word/document.xml`` is read
    straight from the zip archive; the non-empty ``w:t`` texts of every
    ``w:p`` element are concatenated per paragraph and the paragraphs are
    joined with a blank line (this path does not strip the result).

    Args:
        document_path: Path of the .docx file.
        event_handler: Object whose ``key`` attribute identifies the document
            in the log message.

    Returns:
        The extracted text.
    """
    from docx import Document
    from docx.table import Table
    from docx.text.paragraph import Paragraph
    from docx.oxml.table import CT_Tbl
    from docx.oxml.text.paragraph import CT_P

    try:
        doc = Document(document_path)
        doc_body = doc.element.body
        blocks = []
        for child in doc_body.iterchildren():
            if isinstance(child, CT_P):
                blocks.append(Paragraph(child, doc_body).text)
            elif isinstance(child, CT_Tbl):
                rows = Table(child, doc_body).rows
                blocks.append('\n'.join(
                    ' | '.join(cell.text for cell in row.cells)
                    for row in rows))
        return '\n\n'.join(blocks).strip()
    except Exception:
        logger.exception('Exception while parsing <{}>.'.format(
            event_handler.key))

    # Fallback: extract the text from the raw XML.
    with ZipFile(document_path) as document_zipfile:
        xml_content = document_zipfile.read('word/document.xml')

    try:
        from xml.etree.cElementTree import XML
    except ImportError:
        from xml.etree.ElementTree import XML

    DOCX_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    DOCX_PARA = DOCX_NAMESPACE + 'p'
    DOCX_TEXT = DOCX_NAMESPACE + 't'

    tree = XML(xml_content)
    paragraphs = []
    # Element.iter() replaces getiterator(), which was removed in Python 3.9.
    for paragraph in tree.iter(DOCX_PARA):
        texts = [node.text for node in paragraph.iter(DOCX_TEXT) if node.text]
        if texts:
            paragraphs.append(''.join(texts))
    return '\n\n'.join(paragraphs)
def table_style_fixture(self):
    """Fixture: a ``Table`` whose ``tblPr`` carries a ``tblStyle`` of 'foobar'.

    Returns the table and the style value used to build it.
    """
    style = 'foobar'
    tbl_builder = a_tbl().with_nsdecls()
    tbl_pr = a_tblPr().with_child(a_tblStyle().with_val(style))
    tbl = tbl_builder.with_child(tbl_pr).element
    return Table(tbl), style
def iterate_paragraphs_and_tables(docx_document):
    """Yield the body-level paragraphs and tables of *docx_document*.

    ``CT_P`` children of the body are yielded as ``Paragraph`` and ``CT_Tbl``
    children as ``Table``; other children are skipped.

    Raises:
        ValueError: if *docx_document* is not a ``_Document``.
    """
    if not isinstance(docx_document, _Document):
        raise ValueError('ошибка при итерации по блокам docx')

    for node in docx_document.element.body.iterchildren():
        if isinstance(node, CT_P):
            yield Paragraph(node, docx_document)
            continue
        if isinstance(node, CT_Tbl):
            yield Table(node, docx_document)
def iter_block_docx(parent):
    """Yield the body-level paragraphs and tables of the document *parent*.

    ``CT_P`` children of the body are yielded as ``Paragraph`` and ``CT_Tbl``
    children as ``Table``; other children are skipped.

    Raises:
        ValueError: if *parent* is not a ``_Document``.
    """
    if not isinstance(parent, _Document):
        # The previous message ("Something went right") described the
        # opposite of the failure; say what was actually wrong.
        raise ValueError(
            "expected a docx Document, got {}".format(type(parent).__name__))

    for child in parent.element.body.iterchildren():
        if isinstance(child, CT_P):
            yield Paragraph(child, parent)
        elif isinstance(child, CT_Tbl):
            yield Table(child, parent)
def table_nested_parsing(cell, current_row, current_col):
    """Return the text of the first paragraph found directly inside *cell*.

    The children of ``cell._element`` are visited in order.  The first
    ``CT_P`` child ends the walk and its paragraph text is returned.  Every
    ``CT_Tbl`` child met before that is descended into, cell by cell.

    *current_row* and *current_col* are accepted but not used here.

    Returns:
        The paragraph text, or ``None`` when *cell* has no paragraph child.
    """
    for element in cell._element:
        if isinstance(element, CT_P):
            paragraph = Paragraph(element, cell)
            return paragraph.text
        if isinstance(element, CT_Tbl):
            nested = Table(element, cell)
            for row_idx in range(len(nested.rows)):
                for col_idx in range(len(nested.columns)):
                    # NOTE(review): the value returned for nested cells is
                    # discarded - confirm whether it should be collected.
                    table_nested_parsing(
                        nested.cell(row_idx, col_idx), row_idx, col_idx)
def iter_block_items(parent):
    """Yield each paragraph and table directly inside *parent*.

    *parent* may be a ``_Document`` (body element), a ``_Cell`` (``_tc``
    element) or a ``_Row`` (``_tr`` element).  ``CT_P`` children are yielded
    as ``Paragraph`` and ``CT_Tbl`` children as ``Table``; other children
    are skipped.

    Raises:
        ValueError: if *parent* is none of the supported types.
    """
    if isinstance(parent, _Document):
        parent_elm = parent.element.body
    elif isinstance(parent, _Cell):
        parent_elm = parent._tc
    elif isinstance(parent, _Row):
        parent_elm = parent._tr
    else:
        # Without this branch an unsupported parent left parent_elm unbound
        # and surfaced as an UnboundLocalError on the loop below.
        raise ValueError("something's not right")

    for child in parent_elm.iterchildren():
        if isinstance(child, CT_P):
            yield Paragraph(child, parent)
        elif isinstance(child, CT_Tbl):
            yield Table(child, parent)
def iter_block_items(parent):
    """Yield each paragraph and table directly inside *parent*.

    *parent* may be a ``_Document`` (body element) or a ``_Cell`` (``_tc``
    element).  ``CT_P`` children are yielded as ``Paragraph`` and ``CT_Tbl``
    children as ``Table``; other children are skipped.

    Raises:
        ValueError: if *parent* is neither a ``_Document`` nor a ``_Cell``.
    """
    if isinstance(parent, _Document):
        container = parent.element.body
    else:
        if not isinstance(parent, _Cell):
            raise ValueError("something's not right")
        container = parent._tc

    for node in container.iterchildren():
        # isinstance() reports whether the node is of the given element class.
        if isinstance(node, CT_P):
            yield Paragraph(node, parent)
            continue
        if isinstance(node, CT_Tbl):
            yield Table(node, parent)
def iter_cell_items(self, parent):
    """Yield every paragraph inside the cell *parent*, descending into tables.

    ``CT_P`` children of ``parent._tc`` are yielded as ``Paragraph``.  For a
    ``CT_Tbl`` child, each cell of each row is walked recursively, so only
    paragraphs are ever yielded.  Other children are skipped.
    """
    for node in parent._tc.iterchildren():
        if isinstance(node, CT_P):
            yield Paragraph(node, parent)
            continue
        if not isinstance(node, CT_Tbl):
            continue
        nested = Table(node, parent)
        for row in nested.rows:
            for cell in row.cells:
                yield from self.iter_cell_items(cell)
def table_style_set_fixture(self):
    """Fixture for setting a table style.

    Returns a ``Table`` built from a ``tbl`` with an empty ``tblPr``, the
    style name 'foobar', and the XML expected once that style is present as
    a ``tblStyle`` child of ``tblPr``.
    """
    style_name = 'foobar'

    # Table under test: tblPr present but without a style.
    table = Table(a_tbl().with_nsdecls().with_child(a_tblPr()).element)

    # Expected XML: same structure plus the tblStyle child.
    styled_tbl_pr = a_tblPr().with_child(a_tblStyle().with_val(style_name))
    expected_xml = a_tbl().with_nsdecls().with_child(styled_tbl_pr).xml()

    return table, style_name, expected_xml
def doc_parsing(doc):
    """Build a list of ``TableInfo`` objects from the tables of *doc*.

    The body of *doc* is walked in order.  A paragraph containing
    ``<table_name>...</table_name>`` sets the pending file name (the enclosed
    text plus ``'.java'``) and the description (the text in front of the
    opening tag).  The next table that is met while a file name is pending
    and that ``doc_mytable`` does not reject is converted: every row except
    the first becomes a ``WordModel`` whose attributes are read with
    ``getCellText`` using the column entries of the module-level ``dict``
    mapping.  After a table is converted the pending file name is cleared.

    Args:
        doc: The document whose ``element.body`` is iterated.

    Returns:
        The list of ``TableInfo`` objects, in document order.
    """
    open_tag = '<table_name>'
    close_tag = '</table_name>'
    attribute_names = ("field", "fieldName", "fieldType", "comment", "must")

    listTable = []
    fieldName = ''
    fileDesc = ''

    for doc_part in doc.element.body:
        if isinstance(doc_part, CT_P):
            pg = Paragraph(doc_part, doc).text
            start = pg.find(open_tag)
            end = pg.find(close_tag)
            if start >= 0 and end > 0:
                fieldName = pg[start + len(open_tag):end] + '.java'
                fileDesc = pg[:start]
        elif isinstance(doc_part, CT_Tbl) and fieldName != '':
            tableinfo = TableInfo()
            tableinfo.fileName = fieldName
            tableinfo.fileDesc = fileDesc
            tb1 = Table(doc_part, doc)

            # Compared with == on purpose: doc_mytable's return type is not
            # visible here, and only a False-equal result skips the table.
            if doc_mytable(tb1) == False:  # noqa: E712
                continue

            listField = []
            # Row 0 is skipped, as in the original loop.
            for row in range(1, len(tb1.rows)):
                w2 = WordModel()
                for attribute in attribute_names:
                    # `dict` is the module-level column mapping (it shadows
                    # the builtin).  .get(key, '') replaces the Python-2-only
                    # has_key() test with identical results.
                    column = dict.get(attribute, '')
                    setattr(w2, attribute, getCellText(tb1, row, column))
                w2.fieldType = dataConvert(w2.fieldType)
                listField.append(w2)

            tableinfo.listField = listField
            listTable.append(tableinfo)
            fieldName = ''

    return listTable
def read_item(parent):
    """Yield each paragraph and table directly inside *parent*.

    *parent* may be a ``_Document`` (body element), a ``_Cell`` (``_tc``
    element) or a ``_Row`` (``_tr`` element).  ``CT_P`` children are yielded
    as ``Paragraph`` and ``CT_Tbl`` children as ``Table``; other children
    are skipped.

    Raises:
        ValueError: if *parent* is none of the supported types.
    """
    if isinstance(parent, _Document):
        source = parent.element.body
    elif isinstance(parent, _Cell):
        source = parent._tc
    else:
        if not isinstance(parent, _Row):
            raise ValueError("something's not right")
        source = parent._tr

    for element in source.iterchildren():
        if isinstance(element, CT_P):
            yield Paragraph(element, parent)
            continue
        if isinstance(element, CT_Tbl):
            yield Table(element, parent)
def iter_block_items(parent):
    """Yield each paragraph and table directly inside *parent*.

    *parent* may be a ``docx.document.Document`` (body element), a
    ``docx.table._Cell`` (``_tc`` element) or a ``docx.table._Row`` (``_tr``
    element).  ``CT_P`` children are yielded as ``Paragraph`` and ``CT_Tbl``
    children as ``Table``; other children are skipped.

    Raises:
        ValueError: if *parent* is none of the supported types.
    """
    if isinstance(parent, docx.document.Document):
        container = parent.element.body
    elif isinstance(parent, docx.table._Cell):
        container = parent._tc
    else:
        if not isinstance(parent, docx.table._Row):
            raise ValueError("something's not right")
        container = parent._tr

    for node in container.iterchildren():
        if isinstance(node, docx.oxml.text.paragraph.CT_P):
            yield docx.text.paragraph.Paragraph(node, parent)
            continue
        if isinstance(node, docx.oxml.table.CT_Tbl):
            yield Table(node, parent)
def paragraphs_tables(docx):
    """Return the paragraphs and tables of *docx* as one list, in body order.

    Children of ``docx._body._body`` tagged as WordprocessingML ``p`` become
    ``Paragraph`` objects and those tagged ``tbl`` become ``Table`` objects.
    The tag of any other child is printed and the child is left out.
    """
    w_ns = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    paragraph_tag = w_ns + 'p'
    table_tag = w_ns + 'tbl'

    blocks = []
    for node in docx._body._body.getchildren():
        if node.tag == paragraph_tag:
            blocks.append(Paragraph(node, docx._body))
        elif node.tag == table_tag:
            blocks.append(Table(node, docx._body))
        else:
            print(node.tag)
    return blocks
def iterate_items(parent):
    """Walk the paragraphs and tables directly inside *parent*.

    *parent* may be a ``DocumentType`` (body element) or a ``_Cell`` (``_tc``
    element).  ``CT_P`` children are yielded as ``Paragraph`` and ``CT_Tbl``
    children as ``Table``; other children are skipped.

    Raises:
        ValueError: if *parent* is neither a ``DocumentType`` nor a ``_Cell``.
    """
    if isinstance(parent, DocumentType):
        container = parent.element.body
    else:
        if not isinstance(parent, _Cell):
            raise ValueError('Oops')
        container = parent._tc  # pylint: disable=protected-access

    for node in container.iterchildren():
        if isinstance(node, CT_P):
            yield Paragraph(node, parent)
            continue
        if isinstance(node, CT_Tbl):
            yield Table(node, parent)
def iter_block_items(file):
    """Return the paragraphs and tables of a Word document in body order.

    Each child of ``file.element.body`` is inspected: for a ``CT_P`` child
    the paragraph *text* is stored, for a ``CT_Tbl`` child the ``Table``
    object itself is stored (its values have to be read cell by cell by the
    caller).  Other children are skipped.

    Args:
        file: The opened document object (not a path).

    Returns:
        A list mixing ``str`` (paragraph text) and ``Table`` items.
    """
    res = []
    for child in file.element.body:
        if isinstance(child, CT_P):
            # The document instance is the proxy's parent; previously the
            # `Document` factory itself was passed here by mistake.
            res.append(Paragraph(child, file).text)
        elif isinstance(child, CT_Tbl):
            res.append(Table(child, file))
    return res
def _get_docx_part_as_text(doc_part):
    """Return the text of *doc_part*, one line per paragraph or table cell.

    The children of ``doc_part._element`` are visited in order.  A paragraph
    contributes its text followed by a newline.  A table contributes, for
    every cell of every row, the recursively extracted text of that cell
    followed by a newline.  Other children contribute nothing.
    """
    chunks = []
    for element in doc_part._element:
        if isinstance(element, CT_Tbl):
            for row in Table(element, doc_part).rows:
                for cell in row.cells:
                    chunks.append(_get_docx_part_as_text(cell) + "\n")
        elif isinstance(element, CT_P):
            chunks.append(Paragraph(element, doc_part).text + "\n")
    return "".join(chunks)
def iter_block_items(self, parent):
    """Split *parent* into its paragraph and table blocks.

    *parent* may be a ``Document`` (its ``w:body`` element is walked) or a
    ``_Cell`` (its ``w:tc`` element is walked).  ``CT_P`` children are
    yielded as ``Paragraph`` and ``CT_Tbl`` children as ``Table``; other
    children are skipped.

    Raises:
        ValueError: if *parent* is neither a ``Document`` nor a ``_Cell``.
    """
    if isinstance(parent, Document):
        # A document: walk the content of its body (w:body).
        container = parent.element.body
    else:
        if not isinstance(parent, _Cell):
            raise ValueError("something's not right")
        # A table cell: walk the content of the cell (w:tc).
        container = parent._tc

    # Iterate over the child elements, one block at a time.
    for node in container.iterchildren():
        if isinstance(node, CT_P):
            # w:p - hand out a paragraph proxy for this block.
            yield Paragraph(node, parent)
            continue
        if isinstance(node, CT_Tbl):
            # w:tbl - hand out a table proxy for this block.
            yield Table(node, parent)
def iter_block_items(parent):
    """Yield each paragraph and table directly inside *parent*.

    *parent* may be a ``dc`` instance (body element) or a ``_Cell`` (``_tc``
    element).  ``CT_P`` children are yielded as ``Paragraph`` and ``CT_Tbl``
    children as ``Table``; other children are skipped.

    Raises:
        ValueError: if *parent* is neither a ``dc`` nor a ``_Cell``.
    """
    if isinstance(parent, dc):
        container = parent.element.body
    else:
        if not isinstance(parent, _Cell):
            raise ValueError("[TypeError] Document in insuitable type.")
        container = parent._tc

    for node in container.iterchildren():
        block = None
        if isinstance(node, CT_P):
            block = Paragraph(node, parent)
        elif isinstance(node, CT_Tbl):
            block = Table(node, parent)
        if block is not None:
            yield block
def iter_block_items(parent):
    """Yield each paragraph, picture and table directly inside *parent*.

    *parent* may be a ``Document`` (body element) or a ``_Cell`` (``_tc``
    element).  ``CT_P`` children are yielded as ``Paragraph``, ``CT_Picture``
    children as ``InlineShape`` and ``CT_Tbl`` children as ``Table``; other
    children are skipped.

    Raises:
        ValueError: if *parent* is neither a ``Document`` nor a ``_Cell``.
    """
    if isinstance(parent, Document):
        container = parent.element.body
    else:
        if not isinstance(parent, _Cell):
            raise ValueError("sth's wrong")
        container = parent._tc

    for node in container.iterchildren():
        if isinstance(node, CT_P):
            yield Paragraph(node, parent)
            continue
        if isinstance(node, CT_Picture):
            # NOTE(review): if InlineShape is python-docx's own class, its
            # constructor may take a single argument - confirm this call.
            yield InlineShape(node, parent)
            continue
        if isinstance(node, CT_Tbl):
            yield Table(node, parent)
def iter_block_items(parent):
    """Yield each paragraph and table directly inside *parent*.

    *parent* may be a ``_Document`` (body element) or a ``_Cell`` (``_tc``
    element).  ``CT_P`` children are yielded as ``Paragraph`` and ``CT_Tbl``
    children as ``Table``; other children are skipped.

    Raises:
        ValueError: if *parent* is neither a ``_Document`` nor a ``_Cell``.
    """
    if isinstance(parent, _Document):
        # Whole document: iterate the children of its body element.
        container = parent.element.body
    else:
        if not isinstance(parent, _Cell):
            raise ValueError("something's not right")
        # Table cell: iterate the children of the cell element.
        container = parent._tc

    for node in container.iterchildren():
        if isinstance(node, CT_P):
            yield Paragraph(node, parent)
            continue
        if isinstance(node, CT_Tbl):
            yield Table(node, parent)