Beispiel #1
0
def notes_extraction(file_name):
    """Collect the tables of a .docx file together with their captions.

    The document is flattened with ``iter_block_items``.  Judging by the
    comparisons below, paragraphs are expected as plain strings and tables
    as ``Table`` objects.  Every table from the third block onwards is
    classified by the two blocks that precede it:

    * empty string preceded by a table: treated as a continuation; the entry
      reuses the name of the most recently recorded table;
    * string preceded by a table: the string is the table name;
    * two strings: both strings together form the table name.

    Tables matching none of these patterns are skipped.

    Args:
        file_name: Value handed to ``Document`` to open the file.

    Returns:
        A list of ``(file_name, table_name, df)`` tuples, where
        ``table_name`` is a list of caption strings and ``df`` is whatever
        ``trim_table_to_df`` returns.
    """
    document = Document(file_name)
    # Materialise the blocks so they can be indexed and measured, whether
    # iter_block_items hands back a list or a generator.
    block_list = list(iter_block_items(document))

    result_list = []
    for index in range(2, len(block_list)):
        table = block_list[index]
        if not isinstance(table, Table):
            continue

        df = trim_table_to_df(read_table(table))
        prev = block_list[index - 1]
        before_prev = block_list[index - 2]

        if prev == '' and isinstance(before_prev, Table):
            # NOTE(review): as in the original code, the frame is rebuilt
            # from the *preceding* table rather than the current one --
            # confirm this is intended for continuation tables.
            df = trim_table_to_df(read_table(before_prev))
            # Inherit the name of the last recorded table.  Indexing by a
            # separate table counter broke as soon as one table had been
            # skipped, and on an empty result list.
            inherited_name = result_list[-1][1] if result_list else []
            result_list.append((file_name, inherited_name, df))
        elif isinstance(prev, str) and isinstance(before_prev, Table):
            result_list.append((file_name, [prev], df))
        elif isinstance(prev, str) and isinstance(before_prev, str):
            result_list.append((file_name, [before_prev, prev], df))

    return result_list
Beispiel #2
0
def iter_block_items(document):
    """Yield the paragraphs and tables of *document* in body order.

    Walks the direct children of ``document.element.body``.  ``CT_P``
    children are wrapped in ``Paragraph`` and ``CT_Tbl`` children in
    ``Table``, each with *document* as parent; any other child is skipped.
    """
    for node in document.element.body.iterchildren():
        proxy_cls = None
        if isinstance(node, CT_P):
            proxy_cls = Paragraph
        elif isinstance(node, CT_Tbl):
            proxy_cls = Table
        if proxy_cls is not None:
            yield proxy_cls(node, document)
Beispiel #3
0
 def row_cells_fixture(self, _cells_, _column_count_):
     """Build the inputs for a row-cells test on a 3-column table.

     ``_cells_`` and ``_column_count_`` only need a settable
     ``return_value`` (presumably mocks patched over the table's cell list
     and column count -- confirm against the test class).

     Returns:
         ``(table, row_idx, expected_cells)`` where the expected cells are
         the second group of three out of the nine stubbed cells.
     """
     table = Table(None, None)
     columns = 3
     row_idx = 1
     stub_cells = list(range(9))
     _cells_.return_value = stub_cells
     _column_count_.return_value = columns
     # Row 1 of a 3-wide grid covers flat indices 3..5.
     expected_cells = stub_cells[row_idx * columns:(row_idx + 1) * columns]
     return table, row_idx, expected_cells
Beispiel #4
0
 def col_cells_fixture(self, _cells_, _column_count_):
     """Build the inputs for a column-cells test on a 3-column table.

     ``_cells_`` and ``_column_count_`` only need a settable
     ``return_value`` (presumably mocks patched over the table's cell list
     and column count -- confirm against the test class).

     Returns:
         ``(table, column_idx, expected_cells)`` where the expected cells
         are every third stubbed cell starting at index 1.
     """
     table = Table(None, None)
     columns = 3
     column_idx = 1
     stub_cells = list(range(9))
     _cells_.return_value = stub_cells
     _column_count_.return_value = columns
     # Column 1 of a 3-wide grid covers flat indices 1, 4 and 7.
     expected_cells = stub_cells[column_idx::columns]
     return table, column_idx, expected_cells
Beispiel #5
0
    def iter_block_items(self, parent):
        '''Yield each paragraph and table directly under *parent*, in order.

        See https://github.com/python-openxml/python-docx/issues/40

        *parent* is either a ``docx_Document`` (its body is walked) or a
        ``_Cell`` (its ``_tc`` element is walked); a cell can itself contain
        paragraphs and tables.  Every yielded value is a ``Paragraph`` or a
        ``Table`` created with *parent* as its parent.

        Commentary:
           Cascade uses this function to walk through the Paragraphs and
           Tables of a document in order.  It is (currently) the only way
           in the python-docx API to determine the physical location of
           tables within a document.

        Raises ValueError for any other kind of *parent* (on first
        iteration, because this is a generator).
        '''
        if not isinstance(parent, (docx_Document, _Cell)):
            raise ValueError("Something is not right")

        if isinstance(parent, docx_Document):
            container = parent.element.body
        else:
            container = parent._tc

        for node in container.iterchildren():
            if isinstance(node, CT_P):
                yield Paragraph(node, parent)
                continue
            if isinstance(node, CT_Tbl):
                yield Table(node, parent)
def read_item_block(parent):
    """Yield the paragraphs and tables directly under *parent*, with a picture flag.

    *parent* may be a ``_Document`` (its body is walked), a ``_Cell``
    (``_tc``) or a ``_Row`` (``_tr``).

    Yields:
        ``(Paragraph, flag)`` for every ``CT_P`` child.  ``flag`` is ``1``
        only when the paragraph text is empty and its XML contains at least
        one ``pic:blipFill`` element; in every other case, including a
        failure while inspecting the XML, it is ``0``.
        ``(Table,)``, a one-element tuple, for every ``CT_Tbl`` child.

    Raises:
        ValueError: if *parent* is none of the supported types.

    Note: the function prints diagnostic output while probing text-less
    paragraphs.
    """
    def _container_of(owner):
        if isinstance(owner, _Document):
            return owner.element.body
        if isinstance(owner, _Cell):
            return owner._tc
        if isinstance(owner, _Row):
            return owner._tr
        raise ValueError("something's not right")

    with_picture, without_picture = 1, 0

    for node in _container_of(parent).iterchildren():
        if isinstance(node, CT_P):
            para = Paragraph(node, parent)
            if para.text != '':
                yield (para, without_picture)
                continue
            try:
                # Text-less paragraph: try to find an inline picture in it.
                from xml.dom.minidom import parseString
                root = parseString(node.xml).documentElement
                blips = root.getElementsByTagName('pic:blipFill')
                print('*nodelist' * 9, blips)
                yield (para, without_picture if len(blips) < 1 else with_picture)
            except Exception as e:
                print('*' * 9, e)
                yield (para, without_picture)
        elif isinstance(node, CT_Tbl):
            yield (Table(node, parent), )
Beispiel #7
0
def iter_block_items(parent):
    """
    Yield every paragraph and table that is a direct child of *parent*,
    in document order, as ``Paragraph`` or ``Table`` objects.

    *parent* is usually the main ``_Document`` object, but a ``_Cell``
    (which can itself hold paragraphs and tables) or a ``_Row`` is accepted
    as well.  Any other type raises ``ValueError`` on first iteration.
    """
    def _container_of(owner):
        if isinstance(owner, _Document):
            return owner.element.body
        if isinstance(owner, _Cell):
            return owner._tc
        if isinstance(owner, _Row):
            return owner._tr
        raise ValueError("something's not right")

    # TODO make this work for floating tables: they do not necessarily
    # appear in the same order in the document as they do visually.
    # Floating tables can be fixed in the Word document by right-clicking in
    # the table, choosing table properties, selecting None for text wrapping
    # and clicking OK, then moving the table to the correct place.
    for node in _container_of(parent).iterchildren():
        proxy_cls = None
        if isinstance(node, CT_P):
            proxy_cls = Paragraph
        elif isinstance(node, CT_Tbl):
            proxy_cls = Table
        if proxy_cls is not None:
            yield proxy_cls(node, parent)
Beispiel #8
0
 def add_column_fixture(self):
     """Build the inputs for an add-column test.

     Loads the 'add-row-col' snippet sequence, parses its first snippet
     into the table element and uses its third snippet as the expected XML.

     Returns:
         ``(table, width, expected_xml)`` with a width of 1.5 inches.
     """
     snippets = snippet_seq('add-row-col')
     source_xml = snippets[0]
     table = Table(parse_xml(source_xml), None)
     width = Inches(1.5)
     return table, width, snippets[2]
Beispiel #9
0
def _iter_block_items(parent):
    """
    Yield every paragraph and table that is a direct child of *parent*,
    in document order, as ``Paragraph`` or ``Table`` objects.

    *parent* is usually the main document (``DocType``), but a ``_Cell``,
    which can itself contain paragraphs and tables, works too.  Anything
    else raises ``ValueError`` on first iteration.

    Author @scanny: https://github.com/python-openxml/python-docx/issues/276#issuecomment-199502885
    """
    from docx.oxml import CT_P
    from docx.oxml import CT_Tbl
    from docx.table import _Cell

    if not isinstance(parent, (DocType, _Cell)):
        raise ValueError("something's not right")

    container = parent.element.body if isinstance(parent, DocType) else parent._tc

    for node in container.iterchildren():
        for element_cls, proxy_cls in ((CT_P, Paragraph), (CT_Tbl, Table)):
            if isinstance(node, element_cls):
                yield proxy_cls(node, parent)
                break
    def iter_block_rpd_items(self, parent):
        """Yield the text of each paragraph and table directly under *parent*.

        *parent* is either a ``Document`` (its body is walked) or a
        ``_Cell`` (its ``_tc`` element is walked).

        Yields:
            For a paragraph: its ``text``.
            For a table: one string that starts with ``"Таблица: "``,
            followed by every cell text terminated by ``'~'`` and every row
            terminated by ``'@'``.

        Raises:
            ValueError: if *parent* is neither a ``Document`` nor a
                ``_Cell`` (raised on first iteration).
        """
        # NOTE(review): if `Document` here is the `docx.Document` factory
        # function rather than a class, this isinstance check cannot work --
        # confirm which name the module imports.
        if isinstance(parent, Document):
            parent_elm = parent.element.body
        elif isinstance(parent, _Cell):
            parent_elm = parent._tc
        else:
            raise ValueError("something's not right")

        for child in parent_elm.iterchildren():
            if isinstance(child, CT_P):
                yield Paragraph(child, parent).text
            elif isinstance(child, CT_Tbl):
                pieces = ["Таблица: "]
                for row in Table(child, parent).rows:
                    try:
                        for cell in row.cells:
                            pieces.append(cell.text)
                            pieces.append('~')
                    except Exception:
                        # Best effort: a row whose cells cannot be read is
                        # reported and skipped.  Unlike a bare `except:`,
                        # this lets KeyboardInterrupt/SystemExit through.
                        print('out of range')
                    pieces.append('@')
                yield ''.join(pieces)
def docx_to_text(document_path, event_handler):
    """Extract the plain text of a .docx file.

    First tries python-docx: body paragraphs contribute their text, and
    tables are rendered row by row with cells joined by ``' | '``.  Blocks
    are joined with blank lines and the result is stripped.

    If that attempt raises, the exception is logged (tagged with
    ``event_handler.key``) and the text is recovered from
    ``word/document.xml`` directly: the non-empty ``w:t`` runs of each
    ``w:p`` are concatenated and the paragraphs joined with blank lines
    (this fallback result is not stripped).

    Args:
        document_path: Path or file object accepted by ``Document`` and
            ``ZipFile``.
        event_handler: Object whose ``key`` attribute identifies the
            document in the log message.

    Returns:
        The extracted text as a single string.
    """
    from docx import Document
    from docx.table import Table
    from docx.text.paragraph import Paragraph
    from docx.oxml.table import CT_Tbl
    from docx.oxml.text.paragraph import CT_P

    try:
        doc = Document(document_path)
        doc_body = doc.element.body
        blocks = []
        for child in doc_body.iterchildren():
            if isinstance(child, CT_P):
                blocks.append(Paragraph(child, doc_body).text)
            elif isinstance(child, CT_Tbl):
                blocks.append('\n'.join(
                    ' | '.join(cell.text for cell in row.cells)
                    for row in Table(child, doc_body).rows))

        return '\n\n'.join(blocks).strip()

    except Exception:
        # Deliberate best effort: log and fall through to the raw-XML path.
        logger.exception('Exception while parsing <{}>.'.format(
            event_handler.key))

    # Fallback: extract the text straight from the XML part.
    with ZipFile(document_path) as document_zipfile:
        xml_content = document_zipfile.read('word/document.xml')

    try:
        from xml.etree.cElementTree import XML
    except ImportError:
        # cElementTree was removed in Python 3.9.
        from xml.etree.ElementTree import XML

    tree = XML(xml_content)

    DOCX_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    DOCX_PARA = DOCX_NAMESPACE + 'p'
    DOCX_TEXT = DOCX_NAMESPACE + 't'

    paragraphs = []
    # Element.getiterator() was removed in Python 3.9; iter() is the
    # equivalent that exists on every supported version.
    for paragraph in tree.iter(DOCX_PARA):
        texts = [node.text for node in paragraph.iter(DOCX_TEXT) if node.text]
        if texts:
            paragraphs.append(''.join(texts))

    return '\n\n'.join(paragraphs)
Beispiel #12
0
 def table_style_fixture(self):
     """Build a table whose properties carry the style value 'foobar'.

     The element is assembled with the ``a_tbl`` / ``a_tblPr`` /
     ``a_tblStyle`` builders: a table with namespace declarations holding a
     table-properties child that holds the style child.

     Returns:
         ``(table, style)``.
     """
     style = 'foobar'
     style_bldr = a_tblStyle().with_val(style)
     properties_bldr = a_tblPr().with_child(style_bldr)
     tbl = a_tbl().with_nsdecls().with_child(properties_bldr).element
     return Table(tbl), style
Beispiel #13
0
def iterate_paragraphs_and_tables(docx_document):
    """Yield the body-level paragraphs and tables of *docx_document* in order.

    ``CT_P`` children of the document body are wrapped in ``Paragraph`` and
    ``CT_Tbl`` children in ``Table``; other children are skipped.

    Raises:
        ValueError: if *docx_document* is not a ``_Document`` (raised on
            first iteration).
    """
    if not isinstance(docx_document, _Document):
        raise ValueError('ошибка при итерации по блокам docx')

    for node in docx_document.element.body.iterchildren():
        if isinstance(node, CT_P):
            yield Paragraph(node, docx_document)
            continue
        if isinstance(node, CT_Tbl):
            yield Table(node, docx_document)
def iter_block_docx(parent):
    """Yield the body-level paragraphs and tables of a document in order.

    ``CT_P`` children of ``parent.element.body`` are wrapped in
    ``Paragraph`` and ``CT_Tbl`` children in ``Table``, each with *parent*
    as parent; other children are skipped.

    Args:
        parent: The ``_Document`` to walk.

    Raises:
        ValueError: if *parent* is not a ``_Document`` (raised on first
            iteration, because this is a generator).
    """
    if not isinstance(parent, _Document):
        # The previous message read "Something went right", which is the
        # opposite of what this failure path means.
        raise ValueError(
            "Something went wrong: expected a _Document, got {}".format(
                type(parent).__name__))

    for child in parent.element.body.iterchildren():
        if isinstance(child, CT_P):
            yield Paragraph(child, parent)
        elif isinstance(child, CT_Tbl):
            yield Table(child, parent)
Beispiel #15
0
def table_nested_parsing(cell, current_row, current_col):
    """Return the text of the first paragraph found among *cell*'s children.

    Children of ``cell._element`` are visited in order.  A ``CT_Tbl`` met
    before any paragraph is wrapped in ``Table`` and each of its cells is
    visited recursively, but the results of those recursive calls are
    discarded.  Returns ``None`` when no paragraph child exists.

    ``current_row`` and ``current_col`` are accepted but not used.

    NOTE(review): because of the early return, anything after the first
    paragraph (including nested tables) is never visited -- confirm that
    this is the intended behaviour.
    """
    for element in cell._element:
        if isinstance(element, CT_P):
            return Paragraph(element, cell).text
        if not isinstance(element, CT_Tbl):
            continue
        nested = Table(element, cell)
        for row_idx in range(len(nested.rows)):
            for col_idx in range(len(nested.columns)):
                table_nested_parsing(nested.cell(row_idx, col_idx), row_idx, col_idx)
Beispiel #16
0
 def iter_block_items(parent):
     """Yield each paragraph and table directly under *parent*, in order.

     *parent* may be a ``_Document`` (its body is walked), a ``_Cell``
     (``_tc``) or a ``_Row`` (``_tr``).  ``CT_P`` children are wrapped in
     ``Paragraph`` and ``CT_Tbl`` children in ``Table``; other children are
     skipped.

     Raises:
         ValueError: if *parent* is none of the supported types (raised on
             first iteration, because this is a generator).
     """
     if isinstance(parent, _Document):
         parent_elm = parent.element.body
     elif isinstance(parent, _Cell):
         parent_elm = parent._tc
     elif isinstance(parent, _Row):
         parent_elm = parent._tr
     else:
         # Without this branch an unsupported parent surfaced as an
         # UnboundLocalError on `parent_elm` below.
         raise ValueError("something's not right")
     for child in parent_elm.iterchildren():
         if isinstance(child, CT_P):
             yield Paragraph(child, parent)
         elif isinstance(child, CT_Tbl):
             yield Table(child, parent)
Beispiel #17
0
def iter_block_items(parent):
    """Yield each paragraph and table directly under *parent*, in order.

    *parent* is a ``_Document`` (its body is walked) or a ``_Cell`` (its
    ``_tc`` element is walked); anything else raises ``ValueError`` on
    first iteration.
    """
    if not isinstance(parent, (_Document, _Cell)):
        raise ValueError("something's not right")

    container = parent.element.body if isinstance(parent, _Document) else parent._tc

    for node in container.iterchildren():
        # isinstance() reports whether the element is of the given type.
        proxy_cls = None
        if isinstance(node, CT_P):
            proxy_cls = Paragraph
        elif isinstance(node, CT_Tbl):
            proxy_cls = Table
        if proxy_cls is not None:
            yield proxy_cls(node, parent)
Beispiel #18
0
    def iter_cell_items(self, parent):
        """Yield every paragraph reachable from the cell *parent*, depth first.

        Direct ``CT_P`` children of ``parent._tc`` are yielded as
        ``Paragraph`` objects.  A ``CT_Tbl`` child is not yielded itself;
        each of its cells is walked recursively instead, so paragraphs of
        nested tables appear in place.
        """
        for node in parent._tc.iterchildren():
            if isinstance(node, CT_P):
                yield Paragraph(node, parent)
                continue
            if not isinstance(node, CT_Tbl):
                continue
            for nested_row in Table(node, parent).rows:
                for nested_cell in nested_row.cells:
                    yield from self.iter_cell_items(nested_cell)
Beispiel #19
0
 def table_style_set_fixture(self):
     """Build the inputs for a test that assigns a table style.

     The table starts with an empty table-properties child; the expected
     XML is the same structure with a style child valued 'foobar' inside
     the table properties.

     Returns:
         ``(table, style_name, expected_xml)``.
     """
     style_name = 'foobar'

     # Starting point: a table with bare table properties.
     table = Table(a_tbl().with_nsdecls().with_child(a_tblPr()).element)

     # Target: the same table with the style recorded in its properties.
     styled_properties = a_tblPr().with_child(a_tblStyle().with_val(style_name))
     expected_xml = a_tbl().with_nsdecls().with_child(styled_properties).xml()

     return table, style_name, expected_xml
Beispiel #20
0
def doc_parsing(doc):
    """Turn the tagged tables of a document into ``TableInfo`` objects.

    The body of *doc* is scanned in order.  A paragraph containing
    ``<table_name>...</table_name>`` sets the pending file name (the tagged
    text plus ``'.java'``) and description (the text before the opening
    tag).  The next table for which ``doc_mytable`` does not return False
    consumes that pending name: every row except the first becomes a
    ``WordModel`` whose attributes are read with ``getCellText``, and the
    rows are stored on a ``TableInfo``.

    Args:
        doc: Document object exposing ``element.body``.

    Returns:
        The list of ``TableInfo`` objects, in document order.
    """
    open_tag = '<table_name>'
    close_tag = '</table_name>'
    row_attributes = ("field", "fieldName", "fieldType", "comment", "must")

    def column_of(key):
        # `dict` is resolved from the enclosing module, presumably a mapping
        # from attribute name to column that shadows the builtin -- TODO
        # confirm.  `.get(key, '')` replaces the Python-2-only
        # `has_key(...)` test with the same result.
        return dict.get(key, '')

    listTable = []
    listField = []
    fieldName = ''
    fileDesc = ''
    for doc_part in doc.element.body:
        if isinstance(doc_part, CT_P):
            pg = Paragraph(doc_part, doc).text
            start = pg.find(open_tag)
            end = pg.find(close_tag)
            if start >= 0 and end > 0:
                fieldName = pg[start + len(open_tag):end] + '.java'
                fileDesc = pg[0:start]

        if isinstance(doc_part, CT_Tbl) and fieldName != '':
            tableinfo = TableInfo()
            tableinfo.fileName = fieldName
            tableinfo.fileDesc = fileDesc
            tb1 = Table(doc_part, doc)
            isMytable = doc_mytable(tb1)
            # Kept as an equality test on purpose: only an explicit False
            # (or 0) skips the table, exactly as before.
            if isMytable == False:
                continue
            # Row 0 is skipped (presumably the header row).
            for row in range(1, len(tb1.rows)):
                w2 = WordModel()
                for attribute in row_attributes:
                    setattr(w2, attribute,
                            getCellText(tb1, row, column_of(attribute)))
                w2.fieldType = dataConvert(w2.fieldType)
                listField.append(w2)
            tableinfo.listField = listField
            listTable.append(tableinfo)
            # The pending name is consumed; start a fresh row list.
            fieldName = ''
            listField = []
    return listTable
def read_item(parent):
    """Yield each paragraph and table directly under *parent*, in order.

    *parent* may be a ``_Document`` (its body is walked), a ``_Cell``
    (``_tc``) or a ``_Row`` (``_tr``); any other type raises ``ValueError``
    on first iteration.  ``CT_P`` children become ``Paragraph`` objects and
    ``CT_Tbl`` children become ``Table`` objects.
    """
    def _container_of(owner):
        if isinstance(owner, _Document):
            return owner.element.body
        if isinstance(owner, _Cell):
            return owner._tc
        if isinstance(owner, _Row):
            return owner._tr
        raise ValueError("something's not right")

    for node in _container_of(parent).iterchildren():
        for element_cls, proxy_cls in ((CT_P, Paragraph), (CT_Tbl, Table)):
            if isinstance(node, element_cls):
                yield proxy_cls(node, parent)
                break
Beispiel #22
0
def iter_block_items(parent):
    """Yield each paragraph and table directly under *parent*, in order.

    *parent* may be a ``docx.document.Document`` (its body is walked), a
    ``docx.table._Cell`` (``_tc``) or a ``docx.table._Row`` (``_tr``); any
    other type raises ``ValueError`` on first iteration.
    """
    def _container_of(owner):
        if isinstance(owner, docx.document.Document):
            return owner.element.body
        if isinstance(owner, docx.table._Cell):
            return owner._tc
        if isinstance(owner, docx.table._Row):
            return owner._tr
        raise ValueError("something's not right")

    for node in _container_of(parent).iterchildren():
        if isinstance(node, docx.oxml.text.paragraph.CT_P):
            yield docx.text.paragraph.Paragraph(node, parent)
            continue
        if isinstance(node, docx.oxml.table.CT_Tbl):
            # `Table` is used unqualified here, unlike the other docx names.
            yield Table(node, parent)
Beispiel #23
0
def paragraphs_tables(docx):
    """
    Return the paragraphs and tables of *docx* as one list, in body order.

    This keeps the relative order of text and tables in the document.
    Children of ``docx._body._body`` tagged as WordprocessingML ``p`` become
    ``Paragraph`` objects and those tagged ``tbl`` become ``Table`` objects;
    the tag of any other child is printed and the child is left out.
    """
    paragraph_tag = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}p'
    table_tag = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}tbl'

    ordered_items = []
    for element in docx._body._body.getchildren():
        tag = element.tag
        if tag == paragraph_tag:
            item = Paragraph(element, docx._body)
        elif tag == table_tag:
            item = Table(element, docx._body)
        else:
            print(tag)
            continue
        ordered_items.append(item)
    return ordered_items
Beispiel #24
0
def iterate_items(parent):
    """Walk the paragraphs and tables directly under *parent*, in order.

    *parent* is a ``DocumentType`` (its body is walked) or a ``_Cell`` (its
    ``_tc`` element is walked); anything else raises ``ValueError`` on
    first iteration.
    """
    if not isinstance(parent, (DocumentType, _Cell)):
        raise ValueError('Oops')

    if isinstance(parent, DocumentType):
        container = parent.element.body
    else:
        container = parent._tc  # pylint: disable=protected-access

    for node in container.iterchildren():
        proxy_cls = None
        if isinstance(node, CT_P):
            proxy_cls = Paragraph
        elif isinstance(node, CT_Tbl):
            proxy_cls = Table
        if proxy_cls is not None:
            yield proxy_cls(node, parent)
Beispiel #25
0
def iter_block_items(file):
    """Return the paragraphs and tables of a Word document in body order.

    Every child of ``file.element.body`` is visited.  For a paragraph
    (``CT_P``) its text is stored; for a table (``CT_Tbl``) the ``Table``
    object itself is stored, so its values still have to be read cell by
    cell by the caller.

    Args:
        file: The opened document object (exposes ``element.body``).

    Returns:
        A list mixing ``str`` (paragraph text) and ``Table`` items.
    """
    res = []
    for child in file.element.body:
        # The proxies get the document *instance* as their parent; the
        # original passed the `Document` name itself, which is not a
        # document and cannot resolve anything for the proxy.
        if isinstance(child, CT_P):
            res.append(Paragraph(child, file).text)
        elif isinstance(child, CT_Tbl):
            res.append(Table(child, file))
    return res
Beispiel #26
0
def _get_docx_part_as_text(doc_part):
    """
    Get all text components from the given BlockItemContainer.
    """
    res_text = ""
    for ele in doc_part._element:
        if isinstance(ele, CT_Tbl):
            tbl = Table(ele, doc_part)
            for row in tbl.rows:
                for cell in row.cells:
                    res_text += _get_docx_part_as_text(cell) + "\n"
        elif isinstance(ele, CT_P):
            res_text += Paragraph(ele, doc_part).text + "\n"
    return res_text
Beispiel #27
0
    def iter_block_items(self, parent):
        """Split a document (or a table cell) into its block items.

        Yields a ``Paragraph`` for every ``CT_P`` child and a ``Table`` for
        every ``CT_Tbl`` child of *parent*, in order.  Raises ``ValueError``
        on first iteration when *parent* is neither a ``Document`` nor a
        ``_Cell``.
        """
        # NOTE(review): if `Document` is the `docx.Document` factory
        # function rather than a class, this isinstance check cannot work --
        # confirm which name the module imports.
        if not isinstance(parent, (Document, _Cell)):
            raise ValueError("something's not right")

        if isinstance(parent, Document):
            # A document: walk its body element (w:body).
            container = parent.element.body
        else:
            # A table cell: walk the cell element (w:tc).
            container = parent._tc

        # Each child element is one block of the container.
        for node in container.iterchildren():
            if isinstance(node, CT_P):
                # w:p element: hand out a paragraph proxy.
                yield Paragraph(node, parent)
                continue
            if isinstance(node, CT_Tbl):
                # w:tbl element: hand out a table proxy.
                yield Table(node, parent)
def iter_block_items(parent):
    """Yield each paragraph and table directly under *parent*, in order.

    *parent* is an instance of ``dc`` (its body is walked) or a ``_Cell``
    (its ``_tc`` element is walked); anything else raises ``ValueError`` on
    first iteration.
    """
    def _container_of(owner):
        if isinstance(owner, dc):
            return owner.element.body
        if isinstance(owner, _Cell):
            return owner._tc
        raise ValueError("[TypeError] Document in insuitable type.")

    for node in _container_of(parent).iterchildren():
        for element_cls, proxy_cls in ((CT_P, Paragraph), (CT_Tbl, Table)):
            if isinstance(node, element_cls):
                yield proxy_cls(node, parent)
                break
def iter_block_items(parent):
    """Yield paragraphs, pictures and tables directly under *parent*, in order.

    *parent* is a ``Document`` (its body is walked) or a ``_Cell`` (its
    ``_tc`` element is walked); anything else raises ``ValueError`` on
    first iteration.  Children are wrapped by type: ``CT_P`` in
    ``Paragraph``, ``CT_Picture`` in ``InlineShape``, ``CT_Tbl`` in
    ``Table``.

    NOTE(review): pictures may never occur as *direct* children of a body
    or cell, and ``InlineShape`` may not accept a second argument -- verify
    that the picture branch is reachable and works.
    """
    if not isinstance(parent, (Document, _Cell)):
        raise ValueError("sth's wrong")

    container = parent.element.body if isinstance(parent, Document) else parent._tc

    for node in container.iterchildren():
        wrappers = (
            (CT_P, Paragraph),
            (CT_Picture, InlineShape),
            (CT_Tbl, Table),
        )
        for element_cls, proxy_cls in wrappers:
            if isinstance(node, element_cls):
                yield proxy_cls(node, parent)
                break
Beispiel #30
0
def iter_block_items(parent):
    """Yield each paragraph and table directly under *parent*, in order.

    *parent* is a ``_Document`` or a ``_Cell``; anything else raises
    ``ValueError`` on first iteration.  ``CT_P`` children become
    ``Paragraph`` objects and ``CT_Tbl`` children become ``Table`` objects.
    """
    if not isinstance(parent, (_Document, _Cell)):
        raise ValueError("something's not right")

    if isinstance(parent, _Document):
        # A document is walked through its body element.
        container = parent.element.body
    else:
        # A table cell is walked through its underlying `_tc` element.
        container = parent._tc

    for node in container.iterchildren():
        if isinstance(node, CT_P):
            yield Paragraph(node, parent)
            continue
        if isinstance(node, CT_Tbl):
            yield Table(node, parent)