Example #1
0
    def _build_data(self, path, *args, **kwargs):
        """
        Read the docx archive at ``path`` and populate the parser state.

        The raw text of the main document and of its relationships part is
        read directly from the archive. Styles, font table, numbering and
        comments are fetched through ``self._extract_xml``. Every archive
        member whose name starts with ``word/media/`` is copied into
        ``self._image_data`` keyed by its archive filename. Finally the
        document (and, when their text is truthy, the numbering and
        comments) are parsed into element trees.

        Extra positional and keyword arguments are accepted and ignored.
        """
        with ZipFile(path) as zf:
            # Required parts: the docx is not valid without them.
            self.document_text = zf.read('word/document.xml')
            self.relationship_text = zf.read('word/_rels/document.xml.rels')

            # Optional parts.
            self.styles_text = self._extract_xml(zf, 'word/styles.xml')
            self.fonts = self._extract_xml(zf, 'word/fontTable.xml')
            self.numbering_text = self._extract_xml(zf, 'word/numbering.xml')
            self.comment_text = self._extract_xml(zf, 'word/comments.xml')

            # Keep the raw bytes of every embedded media file.
            for entry in zf.infolist():
                name = entry.filename
                if name.startswith('word/media/'):
                    self._image_data[name] = zf.read(name)

        self.root = parse_xml_from_string(self.document_text)

        # The optional trees stay ``None`` unless their text is truthy.
        self.numbering_root = None
        if self.numbering_text:
            self.numbering_root = parse_xml_from_string(self.numbering_text)
        self.comment_root = None
        if self.comment_text:
            self.comment_root = parse_xml_from_string(self.comment_text)
Example #2
0
    def _build_data(
            self,
            path,
            document_xml=None,
            rels_dict=None,
            numbering_dict=None,
            styles_dict=None,
            *args, **kwargs):
        """
        Populate the parser state from in-memory fixtures.

        ``path`` is accepted but not used; the parent implementation is
        intentionally not called. Each value of ``rels_dict`` is registered
        in ``self._image_data`` under ``'word/<value>'`` (key and value are
        the same string). ``self.numbering_root`` is built from
        ``DXB.numbering(numbering_dict)`` unless ``numbering_dict`` is
        ``None``, and ``self.root`` is only assigned when ``document_xml``
        is given.
        """
        self._test_rels_dict = rels_dict
        for target in (rels_dict or {}).values():
            media_path = 'word/%s' % target
            self._image_data[media_path] = media_path

        self.numbering_root = None
        if numbering_dict is not None:
            numbering_xml = DXB.numbering(numbering_dict)
            self.numbering_root = parse_xml_from_string(numbering_xml)
        self.numbering_dict = numbering_dict

        # Intentionally not calling super
        if document_xml is not None:
            self.root = parse_xml_from_string(document_xml)
        self.zip_path = ''

        # This is the standard page width for a word document; it is also
        # the page width that the tests look for.
        self.page_width = 612

        self.styles_dict = styles_dict
Example #3
0
    def _build_data(self, path, *args, **kwargs):
        """
        Load the parts of the docx archive at ``path`` onto the instance.

        Sets ``document_text`` and ``relationship_text`` from the archive,
        obtains ``styles_text``, ``fonts``, ``numbering_text`` and
        ``comment_text`` through ``self._extract_xml``, stores the bytes of
        every ``word/media/`` member in ``self._image_data``, and parses
        the document into ``self.root``. ``numbering_root`` and
        ``comment_root`` are parsed only when the matching text is truthy
        and are ``None`` otherwise.

        Additional positional and keyword arguments are ignored.
        """
        with ZipFile(path) as pkg:
            # These must be in the ZIP in order for the docx to be valid.
            self.document_text = pkg.read('word/document.xml')
            self.relationship_text = pkg.read('word/_rels/document.xml.rels')

            # These are all optional.
            self.styles_text = self._extract_xml(pkg, 'word/styles.xml')
            self.fonts = self._extract_xml(pkg, 'word/fontTable.xml')
            self.numbering_text = self._extract_xml(pkg, 'word/numbering.xml')
            self.comment_text = self._extract_xml(pkg, 'word/comments.xml')

            # Names of the embedded media files, in archive order.
            media_names = (
                info.filename
                for info in pkg.infolist()
                if info.filename.startswith('word/media/')
            )
            for media_name in media_names:
                self._image_data[media_name] = pkg.read(media_name)

        self.root = parse_xml_from_string(self.document_text)

        self.numbering_root = None
        if self.numbering_text:
            self.numbering_root = parse_xml_from_string(self.numbering_text)

        self.comment_root = None
        if self.comment_text:
            self.comment_root = parse_xml_from_string(self.comment_text)
Example #4
0
 def _parse_rels_root(self):
     """
     Return a dict mapping each relationship ``Id`` to its ``Target``.

     The mapping is built from the children of the tree parsed out of
     ``self.relationship_text``; when an ``Id`` repeats, the last child
     wins.
     """
     rels_root = parse_xml_from_string(self.relationship_text)
     return {
         child.get("Id"): child.get("Target")
         for child in rels_root
     }
Example #5
0
 def _parse_rels_root(self):
     """
     Parse ``self.relationship_text`` into an ``Id`` -> ``Target`` dict.

     Each child of the parsed root contributes one entry taken from its
     ``Id`` and ``Target`` attributes.
     """
     return dict(
         (rel.get('Id'), rel.get('Target'))
         for rel in parse_xml_from_string(self.relationship_text)
     )
Example #6
0
 def _parse_styles(self):
     """
     Build a lookup of style definitions keyed by ``styleId``.

     Returns an empty dict when ``self.styles_text`` is ``None``. Otherwise
     every ``style`` element found in the parsed styles XML contributes one
     entry holding the ``val`` attribute of its ``name`` child and the
     result of ``self._parse_run_properties`` for its ``rPr`` child.
     """
     if self.styles_text is None:
         return {}

     styles = {}
     styles_root = parse_xml_from_string(self.styles_text)
     for style_el in find_all(styles_root, "style"):
         name_el = find_first(style_el, "name")
         entry = {"style_name": name_el.attrib["val"]}
         entry["default_run_properties"] = self._parse_run_properties(
             find_first(style_el, "rPr"),
         )
         styles[style_el.attrib["styleId"]] = entry
     return styles
Example #7
0
 def _parse_styles(self):
     """
     Return style information from ``self.styles_text`` keyed by style id.

     With no styles text (``None``) the result is an empty dict. Each
     value is a dict with the keys ``style_name`` (the ``val`` attribute
     of the style's ``name`` child) and ``default_run_properties`` (what
     ``self._parse_run_properties`` returns for the style's ``rPr``
     child).
     """
     if self.styles_text is None:
         return {}

     by_style_id = {}
     for style in find_all(parse_xml_from_string(self.styles_text), 'style'):
         name = find_first(style, 'name').attrib['val']
         defaults = self._parse_run_properties(find_first(style, 'rPr'))
         by_style_id[style.attrib['styleId']] = dict(
             style_name=name,
             default_run_properties=defaults,
         )
     return by_style_id
Example #8
0
 def _parse_styles(self):
     """
     Collect the document's styles into a dict keyed by ``styleId``.

     Nothing is parsed when ``self.styles_text`` is ``None``; the result is
     then empty. For every ``style`` element the entry records its name
     (``val`` attribute of the ``name`` child) and its default run
     properties as produced by ``self._parse_run_properties``.
     """
     parsed = {}
     if self.styles_text is not None:
         document_styles = parse_xml_from_string(self.styles_text)
         for node in find_all(document_styles, 'style'):
             name_node = find_first(node, 'name')
             style_name = name_node.attrib['val']
             run_defaults = self._parse_run_properties(
                 find_first(node, 'rPr'),
             )
             parsed[node.attrib['styleId']] = {
                 'style_name': style_name,
                 'default_run_properties': run_defaults,
             }
     return parsed
Example #9
0
    def _load(self):
        """
        Assemble an in-memory package from the fixtures and start parsing.

        A ``WordprocessingDocument`` is created without a path and given a
        ``/word/document.xml`` part backed by ``self.document_xml``. Each
        entry of ``self.relationships`` (if any) becomes a relationship of
        that part; entries carrying ``data`` additionally get their own
        part and stream under ``word/``. ``self.numbering_root`` is built
        from ``self.numbering_dict`` unless that is ``None``. Parsing is
        then kicked off on the main document part's root element.
        """
        self.document = WordprocessingDocument(path=None)
        package = self.document.package
        document_part = package.create_part(uri='/word/document.xml')

        for rel in self.relationships or ():
            mode = 'External' if rel['external'] else 'Internal'
            target = rel['target_path']
            if 'data' in rel:
                # Back the target with a real part so it can be resolved.
                part_uri = posixpath.join(package.uri, 'word', target)
                package.streams[part_uri] = BytesIO(rel['data'])
                package.create_part(uri=part_uri)
            document_part.create_relationship(
                target_uri=target,
                target_mode=mode,
                relationship_type=rel['relationship_type'],
                relationship_id=rel['relationship_id'],
            )

        package.streams[document_part.uri] = BytesIO(self.document_xml)
        package.create_relationship(
            target_uri=document_part.uri,
            target_mode='Internal',
            relationship_type=MainDocumentPart.relationship_type,
        )

        self.numbering_root = None
        if self.numbering_dict is not None:
            numbering_xml = DXB.numbering(self.numbering_dict)
            self.numbering_root = parse_xml_from_string(numbering_xml)

        # This is the standard page width for a word document (in points);
        # it is also the page width that the tests look for.
        self.page_width = 612

        self.parse_begin(self.document.main_document_part.root_element)
Example #10
0
 def test_get_image_sizes(self):
     # Every ``drawing`` and ``pict`` element of the fixture document is
     # passed to ``_get_image_size``; the results are compared as sets, so
     # order is ignored.
     parser = XMLDocx2Html(
         document_xml=self.get_xml(),
         relationships=self.relationships,
     )
     root = parse_xml_from_string(self.get_xml())
     image_els = list(find_all(root, 'drawing'))
     image_els += find_all(root, 'pict')
     sizes = [parser._get_image_size(el) for el in image_els]
     self.assertEqual(
         set(sizes),
         {('40px', '20px'), ('41pt', '21pt')},
     )
Example #11
0
 def test_get_image_sizes(self):
     # Collect the size reported for each ``drawing`` / ``pict`` element
     # and check the distinct values, regardless of order.
     parser = XMLDocx2Html(
         document_xml=self.get_xml(),
         rels_dict=self.relationship_dict,
     )
     document_root = parse_xml_from_string(self.get_xml())
     candidates = [
         node
         for tag in ('drawing', 'pict')
         for node in find_all(document_root, tag)
     ]
     found_sizes = set(parser._get_image_size(node) for node in candidates)
     wanted_sizes = set([
         ('40px', '20px'),
         ('41pt', '21pt'),
     ])
     self.assertEqual(found_sizes, wanted_sizes)
Example #12
0
 def test_get_image_id(self):
     # The relationship ids extracted from the ``drawing`` and ``pict``
     # elements must be exactly rId0 and rId1 (order is not checked).
     parser = XMLDocx2Html(
         document_xml=self.get_xml(),
         rels_dict=self.relationship_dict,
     )
     root = parse_xml_from_string(self.get_xml())
     image_els = list(find_all(root, 'drawing'))
     image_els += find_all(root, 'pict')
     found_ids = [parser._get_image_id(el) for el in image_els]
     self.assertEqual(
         set(found_ids),
         {'rId0', 'rId1'},
     )
Example #13
0
 def test_get_image_id(self):
     # Gather the id returned by ``_get_image_id`` for each ``drawing`` /
     # ``pict`` element and compare the distinct values with the expected
     # relationship ids.
     parser = XMLDocx2Html(
         document_xml=self.get_xml(),
         relationships=self.relationships,
     )
     document_root = parse_xml_from_string(self.get_xml())
     candidates = [
         node
         for tag in ('drawing', 'pict')
         for node in find_all(document_root, tag)
     ]
     actual_ids = set(parser._get_image_id(node) for node in candidates)
     wanted_ids = set(['rId0', 'rId1'])
     self.assertEqual(actual_ids, wanted_ids)
Example #14
0
 def root_element(self):
     """
     Return the parsed root element, parsing ``self.stream`` on first use.

     The result is cached in ``self._root_element``; while that attribute
     is not ``None`` the stream is not read again.
     """
     if self._root_element is not None:
         return self._root_element
     self._root_element = parse_xml_from_string(self.stream.read())
     return self._root_element
Example #15
0
 def root_element(self):
     """
     Lazily parse and cache the root element read from ``self.stream``.

     When ``self._root_element`` already holds a value other than ``None``
     it is returned as is and the stream is left untouched.
     """
     element = self._root_element
     if element is None:
         element = parse_xml_from_string(self.stream.read())
         self._root_element = element
     return element