def _build_data(self, path, *args, **kwargs):
    """Load the raw parts of the docx archive at ``path`` onto ``self``.

    The main document and its relationships part are read straight from
    the archive. Styles, font table, numbering and comments are fetched
    through ``self._extract_xml``. Every archive member whose name starts
    with ``word/media/`` is stored in ``self._image_data`` under its
    archive filename.

    Afterwards the document XML is parsed into ``self.root``; the
    numbering and comment XML are parsed into ``self.numbering_root`` and
    ``self.comment_root`` only when their text is truthy, and are left as
    ``None`` otherwise.
    """
    with ZipFile(path) as archive:
        # Required parts: a docx is not valid without them, so they are
        # read directly rather than through the optional-part helper.
        self.document_text = archive.read('word/document.xml')
        self.relationship_text = archive.read(
            'word/_rels/document.xml.rels',
        )

        # Optional parts.
        self.styles_text = self._extract_xml(archive, 'word/styles.xml')
        self.fonts = self._extract_xml(archive, 'word/fontTable.xml')
        self.numbering_text = self._extract_xml(
            archive,
            'word/numbering.xml',
        )
        self.comment_text = self._extract_xml(archive, 'word/comments.xml')

        # Keep the raw bytes of every embedded media file.
        for member in archive.infolist():
            member_name = member.filename
            if not member_name.startswith('word/media/'):
                continue
            self._image_data[member_name] = archive.read(member_name)

    self.root = parse_xml_from_string(self.document_text)

    # The optional roots default to None and are only replaced when the
    # corresponding XML text is available.
    self.numbering_root = None
    if self.numbering_text:
        self.numbering_root = parse_xml_from_string(self.numbering_text)

    self.comment_root = None
    if self.comment_text:
        self.comment_root = parse_xml_from_string(self.comment_text)
def _build_data(
        self,
        path,
        document_xml=None,
        rels_dict=None,
        numbering_dict=None,
        styles_dict=None,
        *args, **kwargs):
    """Populate parser state from in-memory fixtures instead of a file.

    ``path`` is accepted but not used here. ``rels_dict`` is stored as
    ``self._test_rels_dict`` and each of its values is registered in
    ``self._image_data`` under a ``word/``-prefixed key that maps to
    itself. ``numbering_dict`` (when not ``None``) is rendered through
    ``DXB.numbering`` and parsed into ``self.numbering_root``.
    ``document_xml`` (when not ``None``) is parsed into ``self.root``.
    ``self.zip_path``, ``self.page_width`` and ``self.styles_dict`` are
    always set.
    """
    self._test_rels_dict = rels_dict
    # A falsy rels_dict (None or empty) contributes no image entries.
    for target in (rels_dict or {}).values():
        image_key = 'word/%s' % target
        self._image_data[image_key] = image_key

    self.numbering_root = None
    if numbering_dict is not None:
        numbering_xml = DXB.numbering(numbering_dict)
        self.numbering_root = parse_xml_from_string(numbering_xml)
    self.numbering_dict = numbering_dict

    # Intentionally not calling super
    if document_xml is not None:
        self.root = parse_xml_from_string(document_xml)

    self.zip_path = ''

    # The standard page width for a word document, and also the page
    # width that the tests look for.
    self.page_width = 612

    self.styles_dict = styles_dict
def _parse_rels_root(self):
    """Return a dict mapping each relationship ``Id`` to its ``Target``.

    ``self.relationship_text`` is parsed, and every child of the parsed
    root contributes one entry built from its ``Id`` and ``Target``
    attributes.
    """
    rels_root = parse_xml_from_string(self.relationship_text)
    return {
        child.get("Id"): child.get("Target")
        for child in rels_root
    }
def _parse_rels_root(self):
    """Map relationship ids to targets from ``self.relationship_text``.

    The relationship XML is parsed and each child element of the root is
    turned into an ``Id`` -> ``Target`` entry of the returned dict.
    """
    relationships = parse_xml_from_string(self.relationship_text)
    return {
        relationship.get('Id'): relationship.get('Target')
        for relationship in relationships
    }
def _parse_styles(self):
    """Build a lookup of style definitions keyed by ``styleId``.

    Returns an empty dict when ``self.styles_text`` is ``None``.
    Otherwise every ``style`` element found in the parsed styles XML
    yields one entry holding the ``val`` of its ``name`` element under
    ``"style_name"`` and the result of ``self._parse_run_properties`` on
    its ``rPr`` element under ``"default_run_properties"``.
    """
    if self.styles_text is None:
        return {}

    styles_root = parse_xml_from_string(self.styles_text)
    parsed = {}
    for style_el in find_all(styles_root, "style"):
        style_name = find_first(style_el, "name").attrib["val"]
        r_pr = find_first(style_el, "rPr")
        entry = {
            "style_name": style_name,
            "default_run_properties": self._parse_run_properties(r_pr),
        }
        parsed[style_el.attrib["styleId"]] = entry
    return parsed
def _parse_styles(self):
    """Return per-style information extracted from ``self.styles_text``.

    With no styles text (``None``) the result is an empty dict. Otherwise
    the text is parsed and, for each ``style`` element, the returned dict
    maps the element's ``styleId`` attribute to a dict with two keys:
    ``'style_name'`` (the ``val`` attribute of the ``name`` element) and
    ``'default_run_properties'`` (what ``self._parse_run_properties``
    returns for the ``rPr`` element).
    """
    if self.styles_text is None:
        return {}

    result = {}
    xml_root = parse_xml_from_string(self.styles_text)
    for node in find_all(xml_root, 'style'):
        name_value = find_first(node, 'name').attrib['val']
        run_props_node = find_first(node, 'rPr')
        defaults = self._parse_run_properties(run_props_node)
        result[node.attrib['styleId']] = {
            'style_name': name_value,
            'default_run_properties': defaults,
        }
    return result
def _parse_styles(self):
    """Collect style names and default run properties by style id.

    An empty dict is returned when ``self.styles_text`` is ``None``.
    Otherwise each ``style`` element of the parsed styles XML is recorded
    under its ``styleId`` attribute with its name (``'style_name'``) and
    the parsed form of its ``rPr`` element
    (``'default_run_properties'``).
    """
    if self.styles_text is None:
        return {}

    document = parse_xml_from_string(self.styles_text)
    by_style_id = {}
    for definition in find_all(document, 'style'):
        display_name = find_first(definition, 'name').attrib['val']
        properties_el = find_first(definition, 'rPr')
        info = {
            'style_name': display_name,
            'default_run_properties': self._parse_run_properties(
                properties_el,
            ),
        }
        by_style_id[definition.attrib['styleId']] = info
    return by_style_id
def _load(self):
    """Assemble a document from fixture attributes and start parsing.

    A ``WordprocessingDocument`` is created with ``path=None`` and given
    a ``/word/document.xml`` part. Each entry of ``self.relationships``
    becomes a relationship on that part; entries carrying ``'data'`` also
    get a stream and a part of their own under ``word/``. The document
    part's stream is filled from ``self.document_xml`` and linked from
    the package. ``self.numbering_root`` is built from
    ``self.numbering_dict`` when that is not ``None``. Finally
    ``self.parse_begin`` is called with the main document part's root
    element.
    """
    self.document = WordprocessingDocument(path=None)
    pkg = self.document.package
    main_part = pkg.create_part(uri='/word/document.xml')

    # A falsy relationships value (None or empty) adds nothing.
    for rel in self.relationships or ():
        mode = 'External' if rel['external'] else 'Internal'
        target = rel['target_path']

        if 'data' in rel:
            # Relationships with payload data need a backing stream and
            # part so the target can be resolved inside the package.
            part_uri = posixpath.join(pkg.uri, 'word', target)
            pkg.streams[part_uri] = BytesIO(rel['data'])
            pkg.create_part(uri=part_uri)

        main_part.create_relationship(
            target_uri=target,
            target_mode=mode,
            relationship_type=rel['relationship_type'],
            relationship_id=rel['relationship_id'],
        )

    pkg.streams[main_part.uri] = BytesIO(self.document_xml)
    pkg.create_relationship(
        target_uri=main_part.uri,
        target_mode='Internal',
        relationship_type=MainDocumentPart.relationship_type,
    )

    self.numbering_root = None
    if self.numbering_dict is not None:
        numbering_xml = DXB.numbering(self.numbering_dict)
        self.numbering_root = parse_xml_from_string(numbering_xml)

    # The standard page width for a word document (in points), and also
    # the page width that the tests look for.
    self.page_width = 612

    self.parse_begin(self.document.main_document_part.root_element)
def test_get_image_sizes(self):
    """``_get_image_size`` returns the expected size for each image."""
    parser = XMLDocx2Html(
        document_xml=self.get_xml(),
        relationships=self.relationships,
    )
    tree = parse_xml_from_string(self.get_xml())

    # Images appear either as 'drawing' or as 'pict' elements.
    image_els = (
        list(find_all(tree, 'drawing')) + list(find_all(tree, 'pict'))
    )
    sizes = [parser._get_image_size(image_el) for image_el in image_els]

    expected_sizes = [
        ('40px', '20px'),
        ('41pt', '21pt'),
    ]
    # Compared as sets, so element order does not matter.
    self.assertEqual(set(sizes), set(expected_sizes))
def test_get_image_sizes(self):
    """Every image element yields its expected (width, height) pair."""
    parser = XMLDocx2Html(
        document_xml=self.get_xml(),
        rels_dict=self.relationship_dict,
    )
    root = parse_xml_from_string(self.get_xml())

    # Gather both kinds of image element before measuring any of them.
    drawings = list(find_all(root, 'drawing'))
    picts = list(find_all(root, 'pict'))
    found_sizes = [
        parser._get_image_size(node) for node in drawings + picts
    ]

    wanted_sizes = [
        ('40px', '20px'),
        ('41pt', '21pt'),
    ]
    # Order-insensitive comparison.
    self.assertEqual(set(found_sizes), set(wanted_sizes))
def test_get_image_id(self):
    """Every image element yields its expected relationship id."""
    parser = XMLDocx2Html(
        document_xml=self.get_xml(),
        rels_dict=self.relationship_dict,
    )
    root = parse_xml_from_string(self.get_xml())

    # Gather both kinds of image element before looking up any id.
    drawings = list(find_all(root, 'drawing'))
    picts = list(find_all(root, 'pict'))
    found_ids = [parser._get_image_id(node) for node in drawings + picts]

    wanted_ids = ['rId0', 'rId1']
    # Order-insensitive comparison.
    self.assertEqual(set(found_ids), set(wanted_ids))
def test_get_image_id(self):
    """``_get_image_id`` returns the expected id for each image."""
    parser = XMLDocx2Html(
        document_xml=self.get_xml(),
        relationships=self.relationships,
    )
    tree = parse_xml_from_string(self.get_xml())

    # Images appear either as 'drawing' or as 'pict' elements.
    image_els = (
        list(find_all(tree, 'drawing')) + list(find_all(tree, 'pict'))
    )
    image_ids = [parser._get_image_id(image_el) for image_el in image_els]

    expected_ids = ['rId0', 'rId1']
    # Compared as sets, so element order does not matter.
    self.assertEqual(set(image_ids), set(expected_ids))
def root_element(self):
    """Return the parsed root element, parsing the stream on first use.

    The result is cached on ``self._root_element``; while that attribute
    is not ``None`` the stream is not read again.
    """
    cached = self._root_element
    if cached is not None:
        return cached

    # First access: read the whole stream and remember the parsed tree.
    self._root_element = parse_xml_from_string(self.stream.read())
    return self._root_element