See LICENSE for licensing information. """ import sys from docx import Docx if __name__ == '__main__': docx = None try: docx = Docx(sys.argv[1]) newfile = open(sys.argv[2], 'w') except: print( "Please supply an input and output file. For example:\n" " example-extracttext.py 'My Office 2007 document.docx'" " 'outputfile.txt'" ) exit() # Fetch all the text out of the document we just created paratextlist = docx.getdocumenttext() # Make explicit unicode version newparatextlist = [] for paratext in paratextlist: newparatextlist.append(paratext.encode("utf-8")) # Print out text of document with two newlines under each paragraph newfile.write('\n\n'.join(newparatextlist))
def testtextextraction():
    """Ensure getdocumenttext() returns a non-empty result for TEST_FILE."""
    document = Docx(TEST_FILE)
    extracted = document.getdocumenttext()
    assert len(extracted) > 0
"""
Part of Python's docx module - http://github.com/mikemaccana/python-docx
See LICENSE for licensing information.
"""
import sys

from docx import Docx

if __name__ == '__main__':
    # Both paths are required. Checking the argument count up front (rather
    # than wrapping everything in a bare ``except:``) lets genuine failures
    # such as an unreadable document or an unwritable output path surface
    # with their own traceback instead of a misleading usage message.
    if len(sys.argv) < 3:
        print("Please supply an input and output file. For example:\n"
              " example-extracttext.py 'My Office 2007 document.docx'"
              " 'outputfile.txt'")
        # Non-zero status so callers can tell the invocation was invalid.
        sys.exit(1)

    # Fetch all the text out of the input document
    paratextlist = Docx(sys.argv[1]).getdocumenttext()

    # Encode every paragraph explicitly as UTF-8 bytes
    encodedparatextlist = [
        paratext.encode("utf-8") for paratext in paratextlist
    ]

    # Write the text with two newlines between paragraphs. The data is
    # already encoded, so the file is opened in binary mode and joined with a
    # bytes separator (mixing bytes with a str separator / text-mode file
    # raises TypeError on Python 3). The with-block guarantees the file is
    # flushed and closed.
    with open(sys.argv[2], 'wb') as newfile:
        newfile.write(b'\n\n'.join(encodedparatextlist))