Example #1
0
def testmakeelement():
    '''Check that _makeelement yields the expected tag, attributes and text'''
    expected_tag = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}testname'
    expected_attrib = {
        '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}testattribute': 'testvalue',
    }
    element = Docx()._makeelement(
        'testname',
        attributes={'testattribute': 'testvalue'},
        tagtext='testtagtext',
    )
    assert element.tag == expected_tag
    assert element.attrib == expected_attrib
    assert element.text == 'testtagtext'
Example #2
0
def testtable():
    '''Ensure tables make sense

    Builds a table from three rows of two cells and checks that the text
    found at row 2, cell 2 is 'B2'.
    '''
    docx = Docx()
    testtable = docx.table([['A1', 'A2'], ['B1', 'B2'], ['C1', 'C2']])
    # XPath cannot use a default namespace, so the 'ns0' prefix is mapped explicitly.
    namespaces = {'ns0': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
    matches = testtable.xpath('/ns0:tbl/ns0:tr[2]/ns0:tc[2]/ns0:p/ns0:r/ns0:t',
                              namespaces=namespaces)
    # Fail with a readable message instead of an IndexError when nothing matches.
    assert matches, 'no text element found at row 2, cell 2'
    assert matches[0].text == 'B2'
Example #3
0
def testnewdocument():
    '''Create a document, set its core properties and save it to TEST_FILE'''
    properties = (
        'Python docx testnewdocument',
        'A short example of making docx from Python',
        'Alan Brooks',
        ['python', 'Office Open XML', 'Word'],
    )
    document = Docx()
    document.coreproperties(*properties)
    document.savedocx(TEST_FILE)
Example #4
0
def testunsupportedpagebreak():
    '''Check that pagebreak raises ValueError for an unknown break type'''
    document = Docx()
    rejected = False
    try:
        document.pagebreak(type='unsup')
    except ValueError:
        rejected = True  # the expected outcome
    assert rejected
Example #5
0
def testunsupportedpagebreak():
    '''Verify that an unknown page break type is refused with ValueError'''
    document = Docx()
    try:
        document.pagebreak(type='unsup')
    except ValueError:
        pass  # passed
    else:
        assert False  # failed: no ValueError was raised
Example #6
0
def testtable():
    '''Ensure tables make sense

    Creates a table from three two-cell rows and verifies that the text
    located at row 2, cell 2 reads 'B2'.
    '''
    docx = Docx()
    testtable = docx.table([['A1', 'A2'], ['B1', 'B2'], ['C1', 'C2']])
    # The 'ns0' prefix used in the XPath expression has to be bound to the
    # WordprocessingML namespace URI.
    found = testtable.xpath(
        '/ns0:tbl/ns0:tr[2]/ns0:tc[2]/ns0:p/ns0:r/ns0:t',
        namespaces={
            'ns0':
            'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
        })
    # An empty result should fail as an assertion, not as an IndexError.
    assert found, 'no text element found at row 2, cell 2'
    assert found[0].text == 'B2'
Example #7
0
def testmakeelement():
    '''Verify the tag, attributes and text of an element built by _makeelement'''
    document = Docx()
    attributes = {'testattribute': 'testvalue'}
    made = document._makeelement('testname', attributes=attributes, tagtext='testtagtext')

    assert made.tag == '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}testname'
    assert made.attrib == {
        '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}testattribute': 'testvalue',
    }
    assert made.text == 'testtagtext'
Example #8
0
def testopendocx():
    '''Ensure an etree element is returned

    Opens TEST_FILE and checks that the loaded ``_document`` attribute is an
    lxml element.
    '''
    docx = Docx(TEST_FILE)
    # A direct assert replaces the if/pass/else ladder and reports the
    # offending type when the check fails.
    assert isinstance(docx._document, lxml.etree._Element), (
        'expected an lxml element, got %r' % type(docx._document))
 def from_word(self, file):
     """Append the English paragraphs of a Word file to ``self.paragraphs``.

     Three extraction strategies are tried in order, each only when the
     previous one raised an exception:

     1. open ``file`` with ``Docx`` and iterate its ``paragraphs``;
     2. run ``antiword -t`` and split the decoded output on blank lines;
     3. run ``textutil -convert txt`` and read the converted file line by line.

     A candidate is kept only when ``detect`` returns ``'en'`` for it, and is
     wrapped in ``Text`` before being stored.  Results are gathered in a local
     list and added at the end, so a strategy that fails halfway leaves no
     partial (and later duplicated) entries behind.

     Args:
         file: Path of the Word document; it is passed to the external tools
             as a command-line argument.

     Raises:
         Whatever the last strategy raises (for example ``OSError`` when the
         converted text file cannot be opened); it is not caught here.
     """
     import os  # local import: only the last-resort branch needs it

     try:
         # Docx
         # NOTE(review): if Docx paragraphs are objects rather than strings,
         # detect()/Text() may need ``paragraph.text`` here - confirm.
         english = [
             Text(paragraph)
             for paragraph in Docx(file).paragraphs
             if detect(paragraph) == 'en'
         ]
     except Exception:
         try:
             # Doc
             output = subprocess.check_output(['antiword', '-t', file])
             # Decode and split by paragraphs
             english = [
                 Text(paragraph)
                 for paragraph in output.decode('utf-8').split('\n\n')
                 if detect(paragraph) == 'en'
             ]
         except Exception:
             # If Antiword does not work, convert to txt
             subprocess.run(['textutil', '-convert', 'txt', file])
             # Swap only the real extension: str.replace('.doc', '.txt') turned
             # 'x.docx' into 'x.txtx' and also rewrote '.doc' inside directory
             # names.
             txt_path = os.path.splitext(file)[0] + '.txt'
             with open(txt_path, 'r') as f:
                 english = [Text(line) for line in f if detect(line) == 'en']
     self.paragraphs.extend(english)
Example #10
0
 def get_lines_from_source(self):
     """ Return the lines (or docx paragraph texts) of the source as a tuple """
     file_type = self.get_doc_file_extension()

     # Plain text: every raw line is decoded as UTF-8.
     if file_type in ('txt', ''):
         raw_lines = self.doc_file.readlines()
         return tuple(raw.decode('utf-8') for raw in raw_lines)

     # Word document: one entry per paragraph.
     if file_type == 'docx':
         document = Docx(BytesIO(self.doc_file.read()))
         return tuple(paragraph.text for paragraph in document.paragraphs)

     if file_type == 'pdf':
         raise NotImplementedError()

     raise ValueError("file_format not supported")
Example #11
0
def testnewdocument():
    '''Check that a fresh document accepts core properties and can be saved'''
    document = Docx()
    tags = ['python', 'Office Open XML', 'Word']
    document.coreproperties(
        'Python docx testnewdocument',
        'A short example of making docx from Python',
        'Alan Brooks',
        tags,
    )
    document.savedocx(TEST_FILE)
Example #12
0
    def get_full_text_from_source(self):
        """ Return the whole text of the source as a single string """
        file_type = self.get_doc_file_extension()

        # Plain text: decode the raw bytes as UTF-8.
        if file_type in ('txt', ''):
            raw = self.doc_file.read()
            return raw.decode("utf-8")

        # Word document: paragraph texts joined by newlines.
        if file_type == 'docx':
            document = Docx(BytesIO(self.doc_file.read()))
            paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
            return "\n".join(paragraph_texts)

        if file_type == 'pdf':
            raise NotImplementedError()

        raise ValueError("file_format not supported")
Example #13
0
"""
This file makes a .docx (Word 2007) file from scratch, showing off most of the
features of python-docx.

If you need to make documents from scratch, you can use this file as a basis
for your work.

Part of Python's docx module - http://github.com/mikemaccana/python-docx
See LICENSE for licensing information.
"""

from docx import Docx

if __name__ == '__main__':
    # Start with an empty document; everything below is appended to it
    docx = Docx()

    # Two headings, then an introductory paragraph
    docx.heading("Welcome to Python's docx module", 1)
    docx.heading('Make and edit docx in 200 lines of pure Python', 2)
    intro = (
        'The module was created when I was looking for a '
        'Python support for MS Word .doc files on PyPI and Stackoverflow. '
        'Unfortunately, the only solutions I could find used:'
    )
    docx.paragraph(intro)

    # Each entry becomes one paragraph styled as a numbered list item
    points = [
        'COM automation',
        '.net or Java',
        'Automating OpenOffice or MS Office',
    ]
    for point in points:
        docx.paragraph(point, style='ListNumber')
If you need to extract text from documents, use this file as a basis for your
work.

Part of Python's docx module - http://github.com/mikemaccana/python-docx
See LICENSE for licensing information.
"""

import sys

from docx import Docx

if __name__ == '__main__':
    # Script entry point: argv[1] is the document to read, argv[2] the text
    # file to write.
    docx = None
    try:
        docx = Docx(sys.argv[1])
        newfile = open(sys.argv[2], 'w')
    except:
        # The bare except means any failure above (missing arguments, a file
        # that cannot be opened, ...) ends in this usage message and exit.
        print(
            "Please supply an input and output file. For example:\n"
            "  example-extracttext.py 'My Office 2007 document.docx'"
            " 'outputfile.txt'"
        )
        exit()

    # Fetch all the text out of the document we just opened
    paratextlist = docx.getdocumenttext()

    # Make explicit unicode version
    newparatextlist = []
    for paratext in paratextlist:
Example #15
0
def simpledoc():
    '''Build and return a sample docx with headings, a list, breaks, a table and a picture'''
    list_items = ['List Item 1', 'List Item 2', 'List Item 3']
    table_rows = [['A1', 'A2', 'A3'], ['B1', 'B2', 'B3'], ['C1', 'C2', 'C3']]

    document = Docx()
    document.heading('Heading 1', 1)
    document.heading('Heading 2', 2)
    document.paragraph('Paragraph 1')
    for item in list_items:
        document.paragraph(item, style='ListNumber')

    document.pagebreak(type='page')
    document.paragraph('Paragraph 2')
    document.table(table_rows)

    document.pagebreak(type='section', orient='portrait')
    document.picture(IMAGE1_FILE, 'This is a test description')

    document.pagebreak(type='section', orient='landscape')
    document.paragraph('Paragraph 3')
    return document
Example #16
0
def testparagraph():
    '''Check that paragraph() returns an element with the WordprocessingML p tag'''
    expected_tag = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}p'
    paragraph = Docx().paragraph('paratext', style='BodyText')
    assert paragraph.tag == expected_tag
Example #17
0
def testtextextraction():
    '''Check that getdocumenttext returns at least one entry for TEST_FILE'''
    extracted = Docx(TEST_FILE).getdocumenttext()
    assert len(extracted) > 0
Example #18
0
def testtextextraction():
    '''Verify that text extraction from TEST_FILE yields a non-empty result'''
    document = Docx(TEST_FILE)
    texts = document.getdocumenttext()
    assert 0 < len(texts)
Example #19
0
def testparagraph():
    '''Verify that a created paragraph carries the WordprocessingML p tag'''
    document = Docx()
    created = document.paragraph('paratext', style='BodyText')
    assert '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}p' == created.tag
Example #20
0
def simpledoc():
    '''Assemble a small docx (headings, list, table, picture, breaks) and return it'''
    document = Docx()

    # Headings and opening paragraph
    document.heading('Heading 1', 1)
    document.heading('Heading 2', 2)
    document.paragraph('Paragraph 1')

    # Numbered list
    for entry in ('List Item 1', 'List Item 2', 'List Item 3'):
        document.paragraph(entry, style='ListNumber')

    # Page break, then a paragraph and a table of three rows
    document.pagebreak(type='page')
    document.paragraph('Paragraph 2')
    document.table([
        ['A1', 'A2', 'A3'],
        ['B1', 'B2', 'B3'],
        ['C1', 'C2', 'C3'],
    ])

    # Section breaks around the picture
    document.pagebreak(type='section', orient='portrait')
    document.picture(IMAGE1_FILE, 'This is a test description')
    document.pagebreak(type='section', orient='landscape')
    document.paragraph('Paragraph 3')

    return document
If you need to extract text from documents, use this file as a basis for your
work.

Part of Python's docx module - http://github.com/mikemaccana/python-docx
See LICENSE for licensing information.
"""

import sys

from docx import Docx

if __name__ == '__main__':
    docx = None
    try:
        docx = Docx(sys.argv[1])
        newfile = open(sys.argv[2], 'w')
    except:
        print("Please supply an input and output file. For example:\n"
              "  example-extracttext.py 'My Office 2007 document.docx'"
              " 'outputfile.txt'")
        exit()

    # Fetch all the text out of the document we just created
    paratextlist = docx.getdocumenttext()

    # Make explicit unicode version
    newparatextlist = []
    for paratext in paratextlist:
        newparatextlist.append(paratext.encode("utf-8"))