def testExportJSON(self):
     doc1 = Document('testdocs/test.docx')
     self.assertEqual(doc1.part_by_name['/word/document.xml'].to_reference(),
                      'Part [/word/document.xml]')
     self.assertEqual(doc1.part_by_name['/word/document.xml'].relationships_out()[2].to_reference(),
                      'Relationship [rId7] (source Part [/word/document.xml])')
     self.assertEqual(doc1.part_by_name['/word/document.xml'].to_json()[0:30],
                      '{\n    "content-type": "applica')
     self.assertEqual(doc1.relationships[0].to_json()[0:32],
                      '{\n    "source": "Part [RootPart]')
     self.assertEqual(doc1.to_json()[0:20], '{\n    "document": [\n')
     self.assertEqual(doc1.to_json(include_stream=True)[285:325],
                      '     "stream_b64": "PD94bWwgdmVyc2lvbj0i')
Exemple #2
0
 def testExportJSON(self):
     doc1 = Document('testdocs/test.docx')
     self.assertEqual(
         doc1.part_by_name['/word/document.xml'].to_reference(),
         'Part [/word/document.xml]')
     self.assertEqual(
         doc1.part_by_name['/word/document.xml'].relationships_out()
         [2].to_reference(),
         'Relationship [rId7] (source Part [/word/document.xml])')
     self.assertEqual(
         doc1.part_by_name['/word/document.xml'].to_json()[0:30],
         '{\n    "content-type": "applica')
     self.assertEqual(doc1.relationships[0].to_json()[0:32],
                      '{\n    "source": "Part [RootPart]')
     self.assertEqual(doc1.to_json()[0:20], '{\n    "document": [\n')
     self.assertEqual(
         doc1.to_json(include_stream=True)[285:325],
         '     "stream_b64": "PD94bWwgdmVyc2lvbj0i')
    def testDocOpen(self):
        corpus_path = [u'govdocs/', u'fraunhoferlibrary/']
        files = []
        for dir_ in corpus_path:
            for f in os.listdir(dir_):
                file_ = os.path.join(dir_, f)
                if os.path.isfile(file_):
                    files.append(file_)

        log = codecs.open('smoke_tests.log', 'w', encoding='utf-8')
        errorlog = codecs.open('smoke_tests_error.log', 'w', encoding='utf-8')

        # Write error log header
        errorlog.write('Three files from govdocs (govdocs/641559.docx, govdocs/500968.xlsx, and\n'
                       'govdocs/974690.xlsx) are apparently corrupt and do not open in Microsoft\n'
                       'Office 2010; hence, error messages should appear for them. (Note that govdocs\n'
                       'is a random sample and explicitly includes bad or corrupt files.)\n'
                       'No other error messages should appear here.\n\n\n')

        for docfile in files:
            print '\nTesting %s...' % docfile
            log.write(u'\nTesting %s...\n' % docfile)
            try:
                doc1 = Document(docfile)
            except ZipCRCError:
                msg = 'Error: Bad CRC for file: %s\n' % docfile
                print msg
                log.write(msg)
                errorlog.write(msg)
                continue
            except Exception as e:
                msg = 'Error: File: %s: %s - %s\n' % (docfile, sys.exc_info()[0].__name__, e)
                print msg
                log.write(msg)
                errorlog.write(msg)
                continue

            log.write('  Document type is: %s\n' % doc1.type)
            log.write('  Document is macro enabled: %s\n' % doc1.is_macro_enabled)
            log.write('  Document is a template: %s\n' % doc1.is_template)

            print '  Testing zip.part_info method...'
            log.write('  Testing zip.part_info method...\n')
            log.write('    zip.part_info([Content_Types].xml).file_size: %s\n' %
                      doc1.zip().part_info('[Content_Types].xml').file_size)
            log.write('    zip.part_info([Content_Types].xml).compress_size: %s\n' %
                      doc1.zip().part_info('[Content_Types].xml').compress_size)
            print '  Done.'
            log.write('  Done.\n')

            second_part = doc1.parts[1]
            print '  Testing Part: %s' % second_part.name
            log.write('  Testing Part: %s\n' % second_part.name)

            doc_stream = doc1.part_by_name[second_part.name].stream().read(10)
            print '  Part stream successfully captured.'
            log.write('  Part stream successfully captured.\n')
            partxml = doc1.part_by_name[second_part.name].xml()
            print '  Part XML successfully parsed.'
            log.write('  Part XML successfully parsed.\n')

            print '  Checking doc.xpath method...'
            log.write('  Checking doc.xpath method...\n')
            log.write('    XPath Result: %s\n' %
                      doc1.part_by_name['/[Content_Types].xml'].xpath('*/@ContentType')[0])
            print '  Done.'
            log.write('  Done.\n')

            print '  Checking that all Parts can get Content_Type...'
            log.write('  Checking that all Parts can get Content_Type...\n')
            for part in doc1.parts:
                ct = part.content_type()
                log.write('    Part %s is Content_Type: %s\n' % (part.name, ct))
            print '  Done.'
            log.write('  Done.\n')

            print '  Checking that Document has main_part...'
            log.write('  Checking that Document has main_part...\n')
            doc_main = doc1.main_part()
            log.write('    Main Part: %s\n' % doc_main.name)
            print '  Done.'
            log.write('  Done.\n')

            print '  Checking all source and target Relationships for each part...'
            log.write('  Checking all source and target Relationships for each part...\n')
            for part in doc1.parts:
                rel_in = part.relationships_in()
                rel_out = part.relationships_out()
                log.write('    Part %s: Relationships in source name: %s\n' %
                          (part.name, [r.source.name for r in rel_in]))
                log.write('    Part %s: Relationships out: %s\n' %
                          (part.name, [r.target for r in rel_out]))
            print '  Done.'
            log.write('  Done.\n')

            print '  Testing Document methods to find by Part or Relationship...'
            log.write('  Testing Document methods to find by Part or Relationship...\n')
            log.write('    doc.parts_by_content_type(application/xml): %s\n' %
                      doc1.parts_by_content_type('application/xml')[0])
            log.write('    doc.parts_by_content_type_regex(ation/xm): %s\n' %
                      doc1.parts_by_content_type_regex('ation/xm')[0])
            log.write('    doc.parts_by_relationship_type(/relationships/officeDocument: %s\n' %
                      doc1.parts_by_relationship_type('/relationships/officeDocument')[0].name)
            log.write('    doc.find_relationship_by_type(/relationships/officeDocument).source: %s\n' %
                      doc1.find_relationships_by_type('/relationships/officeDocument')[0].source)
            print '  Done.'
            log.write('  Done.\n')

            print '  Checking for all Features...'
            log.write('  Checking for Features...\n')
            for image in doc1.features.images:
                log.write('    Image: %s\n' % image.name)
            for video in doc1.features.videos:
                log.write('    Video: %s\n' % video.name)
            for sound in doc1.features.sounds:
                log.write('    Sound: %s\n' % sound.name)
            for font in doc1.features.fonts:
                log.write('    Font: %s\n' % font.name)
            for macro in doc1.features.macros:
                log.write('    Macro content: %s\n' % macro.name)
            for comment in doc1.features.comments:
                log.write('    Comment content: %s\n' % comment.name)
            for customX in doc1.features.custom_xml:
                log.write('    Custom XML content: %s\n' % customX.name)
            for embedded_control in doc1.features.embedded_controls:
                log.write('    Embedded Control content: %s\n' % embedded_control.name)
            for embedded_object in doc1.features.embedded_objects:
                log.write('    Embedded Object content: %s\n' % embedded_object.name)
            for embedded_package in doc1.features.embedded_packages:
                log.write('    Embedded Package content: %s\n' % embedded_package.name)
            for digital_signature in doc1.features.digital_signatures:
                log.write('    Digital Signature content: %s\n' % digital_signature.name)
            print '  Done.'
            log.write('  Done.\n')

            print '  Checking Core Properties...'
            log.write('  Checking Core Properties...\n')
            log.write('    Category: %s\n' % doc1.core_properties.category)
            log.write('    Content status: %s\n' % doc1.core_properties.content_status)
            log.write('    Created: %s\n' % doc1.core_properties.created)
            log.write('    Creator: %s\n' % doc1.core_properties.creator)
            log.write('    Description: %s\n' % doc1.core_properties.description)
            log.write('    Identifier: %s\n' % doc1.core_properties.identifier)
            log.write('    Keywords: %s\n' % doc1.core_properties.keywords)
            log.write('    Language: %s\n' % doc1.core_properties.language)
            log.write('    Last modified by: %s\n' % doc1.core_properties.last_modified_by)
            log.write('    Last printed: %s\n' % doc1.core_properties.last_printed)
            log.write('    Modified: %s\n' % doc1.core_properties.modified)
            log.write('    Revision: %s\n' % doc1.core_properties.revision)
            log.write('    Subject: %s\n' % doc1.core_properties.subject)
            log.write('    Title: %s\n' % doc1.core_properties.title)
            log.write('    Version: %s\n' % doc1.core_properties.version)
            print '  Done.'
            log.write('  Done.\n')

            print '  Checking export to JSON...'
            log.write('  Checking export to JSON...\n')
            doc_json = doc1.to_json()
            log.write('    Beginning of JSON: %s\n' % doc_json[0:50])
            print '  Done.'
            log.write('  Done.\n')

            print 'Done.'
            log.write('Done.\n')

        log.close()
        errorlog.close()
Exemple #4
0
    def testDocOpen(self):
        cur_dir = os.path.dirname(os.path.realpath(__file__))
        corpus_path = [
            os.path.join(cur_dir, 'govdocs'),
            os.path.join(cur_dir, 'fraunhoferlibrary')
        ]
        files = []
        for dir_ in corpus_path:
            for f in os.listdir(dir_):
                file_ = os.path.join(dir_, f)
                if os.path.isfile(file_):
                    files.append(file_)

        log = codecs.open('smoke_tests.log',
                          'w',
                          encoding='utf-8',
                          errors="surrogateescape")
        errorlog = codecs.open('smoke_tests_error.log',
                               'w',
                               encoding='utf-8',
                               errors="surrogateescape")

        # Write error log header
        errorlog.write(
            'Three files from govdocs (govdocs/641559.docx, govdocs/500968.xlsx, and\n'
            'govdocs/974690.xlsx) are apparently corrupt and do not open in Microsoft\n'
            'Office 2010; hence, error messages should appear for them. (Note that govdocs\n'
            'is a random sample and explicitly includes bad or corrupt files.)\n'
            'No other error messages should appear here.\n\n\n')

        for docfile in files:
            if (sys.version_info < (3, 0)):
                to_print = docfile.decode('utf8', 'replace')
            else:
                to_print = docfile
            print('\nTesting %s...' % to_print)
            log.write('\nTesting %s...\n' % to_print)
            try:
                doc1 = Document(docfile)
            except ZipCRCError:
                msg = 'Error: Bad CRC for file: %s\n' % to_print
                print(msg)
                log.write(msg)
                errorlog.write(msg)
                continue
            except Exception as e:
                msg = 'Error: File: %s: %s - %s\n' % (
                    to_print, sys.exc_info()[0].__name__, e)
                print(msg)
                log.write(msg)
                errorlog.write(msg)
                continue

            log.write('  Document type is: %s\n' % doc1.type)
            log.write('  Document is macro enabled: %s\n' %
                      doc1.is_macro_enabled)
            log.write('  Document is a template: %s\n' % doc1.is_template)

            print('  Testing zip.part_info method...')
            log.write('  Testing zip.part_info method...\n')
            log.write(
                '    zip.part_info([Content_Types].xml).file_size: %s\n' %
                doc1.zip().part_info('[Content_Types].xml').file_size)
            log.write(
                '    zip.part_info([Content_Types].xml).compress_size: %s\n' %
                doc1.zip().part_info('[Content_Types].xml').compress_size)
            print('  Done.')
            log.write('  Done.\n')

            second_part = doc1.parts[1]
            print('  Testing Part: %s' % second_part.name)
            log.write('  Testing Part: %s\n' % second_part.name)

            doc_stream = doc1.part_by_name[second_part.name].stream().read(10)
            print('  Part stream successfully captured.')
            log.write('  Part stream successfully captured.\n')
            partxml = doc1.part_by_name[second_part.name].xml()
            print('  Part XML successfully parsed.')
            log.write('  Part XML successfully parsed.\n')

            print('  Checking doc.xpath method...')
            log.write('  Checking doc.xpath method...\n')
            log.write('    XPath Result: %s\n' %
                      doc1.part_by_name['/[Content_Types].xml'].xpath(
                          '*/@ContentType')[0])
            print('  Done.')
            log.write('  Done.\n')

            print('  Checking that all Parts can get Content_Type...')
            log.write('  Checking that all Parts can get Content_Type...\n')
            for part in doc1.parts:
                ct = part.content_type()
                log.write('    Part %s is Content_Type: %s\n' %
                          (part.name, ct))
            print('  Done.')
            log.write('  Done.\n')

            print('  Checking that Document has main_part...')
            log.write('  Checking that Document has main_part...\n')
            doc_main = doc1.main_part()
            log.write('    Main Part: %s\n' % doc_main.name)
            print('  Done.')
            log.write('  Done.\n')

            print(
                '  Checking all source and target Relationships for each part...'
            )
            log.write(
                '  Checking all source and target Relationships for each part...\n'
            )
            for part in doc1.parts:
                rel_in = part.relationships_in()
                rel_out = part.relationships_out()
                log.write('    Part %s: Relationships in source name: %s\n' %
                          (part.name, [r.source.name for r in rel_in]))
                log.write('    Part %s: Relationships out: %s\n' %
                          (part.name, [r.target for r in rel_out]))
            print('  Done.')
            log.write('  Done.\n')

            print(
                '  Testing Document methods to find by Part or Relationship...'
            )
            log.write(
                '  Testing Document methods to find by Part or Relationship...\n'
            )
            log.write('    doc.parts_by_content_type(application/xml): %s\n' %
                      doc1.parts_by_content_type('application/xml')[0])
            log.write('    doc.parts_by_content_type_regex(ation/xm): %s\n' %
                      doc1.parts_by_content_type_regex('ation/xm')[0])
            log.write(
                '    doc.parts_by_relationship_type(/relationships/officeDocument: %s\n'
                % doc1.parts_by_relationship_type(
                    '/relationships/officeDocument')[0].name)
            log.write(
                '    doc.find_relationship_by_type(/relationships/officeDocument).source: %s\n'
                % doc1.find_relationships_by_type(
                    '/relationships/officeDocument')[0].source)
            print('  Done.')
            log.write('  Done.\n')

            print('  Checking for all Features...')
            log.write('  Checking for Features...\n')
            for image in doc1.features.images:
                log.write('    Image: %s\n' % image.name)
            for video in doc1.features.videos:
                log.write('    Video: %s\n' % video.name)
            for sound in doc1.features.sounds:
                log.write('    Sound: %s\n' % sound.name)
            for font in doc1.features.fonts:
                log.write('    Font: %s\n' % font.name)
            for macro in doc1.features.macros:
                log.write('    Macro content: %s\n' % macro.name)
            for comment in doc1.features.comments:
                log.write('    Comment content: %s\n' % comment.name)
            for customX in doc1.features.custom_xml:
                log.write('    Custom XML content: %s\n' % customX.name)
            for embedded_control in doc1.features.embedded_controls:
                log.write('    Embedded Control content: %s\n' %
                          embedded_control.name)
            for embedded_object in doc1.features.embedded_objects:
                log.write('    Embedded Object content: %s\n' %
                          embedded_object.name)
            for embedded_package in doc1.features.embedded_packages:
                log.write('    Embedded Package content: %s\n' %
                          embedded_package.name)
            for digital_signature in doc1.features.digital_signatures:
                log.write('    Digital Signature content: %s\n' %
                          digital_signature.name)
            print('  Done.')
            log.write('  Done.\n')

            print('  Checking Core Properties...')
            log.write('  Checking Core Properties...\n')
            log.write('    Category: %s\n' % doc1.core_properties.category)
            log.write('    Content status: %s\n' %
                      doc1.core_properties.content_status)
            log.write('    Created: %s\n' % doc1.core_properties.created)
            log.write('    Creator: %s\n' % doc1.core_properties.creator)
            log.write('    Description: %s\n' %
                      doc1.core_properties.description)
            log.write('    Identifier: %s\n' % doc1.core_properties.identifier)
            log.write('    Keywords: %s\n' % doc1.core_properties.keywords)
            log.write('    Language: %s\n' % doc1.core_properties.language)
            log.write('    Last modified by: %s\n' %
                      doc1.core_properties.last_modified_by)
            log.write('    Last printed: %s\n' %
                      doc1.core_properties.last_printed)
            log.write('    Modified: %s\n' % doc1.core_properties.modified)
            log.write('    Revision: %s\n' % doc1.core_properties.revision)
            log.write('    Subject: %s\n' % doc1.core_properties.subject)
            log.write('    Title: %s\n' % doc1.core_properties.title)
            log.write('    Version: %s\n' % doc1.core_properties.version)
            print('  Done.')
            log.write('  Done.\n')

            print('  Checking export to JSON...')
            log.write('  Checking export to JSON...\n')
            doc_json = doc1.to_json()
            log.write('    Beginning of JSON: %s\n' % doc_json[0:50])
            print('  Done.')
            log.write('  Done.\n')

            print('Done.')
            log.write('Done.\n')

        log.close()
        errorlog.close()