def _get_image_size(self, el):
    """
    Return the ``(width, height)`` pair for the image described by `el`.

    If we can't find a height or width, return 0 for whichever is not
    found, then rely on the `image` handler to strip those attributes. This
    functionality can change once we integrate PIL.

    Two sources are consulted, in order:

    * the `cx`/`cy` attributes of the first `ext` tag, passed through
      `_convert_image_size` and formatted as ``"<n>px"`` strings;
    * the `width:`/`height:` declarations found in the `style` attribute
      of the first `shape` tag, returned verbatim.
    """
    sizes = find_first(el, "ext")
    if sizes is not None and sizes.get("cx"):
        x = "%dpx" % self._convert_image_size(int(sizes.get("cx")))
        # `cy` may be missing even though `cx` is present. Default to 0 so
        # the caller can strip the attribute instead of this method failing
        # on an unbound local.
        y = 0
        if sizes.get("cy"):
            y = "%dpx" % self._convert_image_size(int(sizes.get("cy")))
        return x, y

    shape = find_first(el, "shape")
    if shape is not None and shape.get("style") is not None:
        # If either of these are not set, rely on the method `image` to not
        # use either of them.
        x = 0
        y = 0
        for declaration in shape.get("style").split(";"):
            if declaration.startswith("height:"):
                y = declaration.split(":")[1]
            if declaration.startswith("width:"):
                x = declaration.split(":")[1]
        return x, y
    return 0, 0
def _get_image_size(self, el):
    """
    Return the ``(width, height)`` pair for the image described by `el`.

    If we can't find a height or width, return 0 for whichever is not
    found, then rely on the `image` handler to strip those attributes. This
    functionality can change once we integrate PIL.

    Two sources are consulted, in order:

    * the `cx`/`cy` attributes of the first `ext` tag, passed through
      `_convert_image_size` and formatted as ``'<n>px'`` strings;
    * the `width:`/`height:` declarations found in the `style` attribute
      of the first `shape` tag, returned verbatim.
    """
    sizes = find_first(el, 'ext')
    if sizes is not None and sizes.get('cx'):
        x = '%dpx' % self._convert_image_size(int(sizes.get('cx')))
        # `cy` may be missing even though `cx` is present. Default to 0 so
        # the caller can strip the attribute instead of this method failing
        # on an unbound local.
        y = 0
        if sizes.get('cy'):
            y = '%dpx' % self._convert_image_size(int(sizes.get('cy')))
        return (
            x,
            y,
        )

    shape = find_first(el, 'shape')
    if shape is not None and shape.get('style') is not None:
        # If either of these are not set, rely on the method `image` to not
        # use either of them.
        x = 0
        y = 0
        for declaration in shape.get('style').split(';'):
            if declaration.startswith('height:'):
                y = declaration.split(':')[1]
            if declaration.startswith('width:'):
                x = declaration.split(':')[1]
        return x, y
    return 0, 0
def __init__(
        self,
        path,
        convert_root_level_upper_roman=False,
        *args,
        **kwargs):
    """
    Build the document data for `path` and start parsing it.

    `convert_root_level_upper_roman` is stored on the instance unchanged.
    Any extra positional and keyword arguments are forwarded to
    `_build_data` together with `path`.
    """
    self._parsed = ''
    self.block_text = ''
    self.page_width = 0
    self.convert_root_level_upper_roman = convert_root_level_upper_roman
    self._image_data = {}
    self._build_data(path, *args, **kwargs)
    self.pre_processor = None

    # Divide by 20 to get to pt (Office works in 20th's of a point), see
    # http://msdn.microsoft.com/en-us/library/documentformat.openxml.wordprocessing.indentation.aspx
    # Look the element up once and reuse it rather than searching twice.
    page_size = find_first(self.root, 'pgSz')
    if page_size is not None:
        self.page_width = int(page_size.attrib['w']) / 20

    # All blank when we init.
    self.comment_store = None
    self.visited = set()
    self.list_depth = 0
    self.rels_dict = self._parse_rels_root()
    self.styles_dict = self._parse_styles()
    self.parse_begin(self.root)  # begin to parse
def __init__(self, path, convert_root_level_upper_roman=False, *args, **kwargs):
    """
    Build the document data for `path` and start parsing it.

    `convert_root_level_upper_roman` is stored on the instance unchanged.
    Any extra positional and keyword arguments are forwarded to
    `_build_data` together with `path`.
    """
    self._parsed = ''
    self.block_text = ''
    self.page_width = 0
    self.convert_root_level_upper_roman = convert_root_level_upper_roman
    self._image_data = {}
    self._build_data(path, *args, **kwargs)
    self.pre_processor = None

    # Divide by 20 to get to pt (Office works in 20th's of a point), see
    # http://msdn.microsoft.com/en-us/library/documentformat.openxml.wordprocessing.indentation.aspx
    # Look the element up once and reuse it rather than searching twice.
    page_size = find_first(self.root, 'pgSz')
    if page_size is not None:
        self.page_width = int(page_size.attrib['w']) / 20

    # All blank when we init.
    self.comment_store = None
    self.visited = set()
    self.list_depth = 0
    self.rels_dict = self._parse_rels_root()
    self.styles_dict = self._parse_styles()
    self.parse_begin(self.root)  # begin to parse
def _get_image_id(self, el):
    """
    Return the image id found under `el`, or None when there is none.

    The first `blip` tag is checked before the first `imagedata` tag; the
    id is read from the attribute paired with whichever tag is found.
    """
    lookups = (
        # Drawings: on drawing tags the id is actually whatever is returned
        # from the embed attribute on the blip tag. Thanks a lot Microsoft.
        ('blip', 'embed'),
        # Picts
        ('imagedata', 'id'),
    )
    for tag, attribute in lookups:
        node = find_first(el, tag)
        if node is not None:
            return node.get(attribute)
    return None
def _parse_styles(self):
    """
    Build a lookup of style definitions keyed by their `styleId`.

    Returns an empty dict when `self.styles_text` is None. Otherwise each
    `style` tag contributes one entry holding the style's name and the
    result of `_parse_run_properties` for its first `rPr` tag.
    """
    if self.styles_text is None:
        return {}

    styles_by_id = {}
    for style_el in find_all(parse_xml_from_string(self.styles_text), "style"):
        name = find_first(style_el, "name").attrib["val"]
        defaults = self._parse_run_properties(find_first(style_el, "rPr"))
        styles_by_id[style_el.attrib["styleId"]] = {
            "style_name": name,
            "default_run_properties": defaults,
        }
    return styles_by_id
def test_find_first(self):
    """Check what `find_first` returns for the root and for nested tags."""
    root = make_xml(b'<one><two><three v="1"/><three v="2"/></two></one>')

    # Can't find the root element
    self.assertEqual(find_first(root, 'one'), None)

    # Of the two `three` tags, the one carrying v="1" is expected.
    three = find_first(root, 'three')
    self.assertEqual(three.tag, 'three')
    self.assertEqual(three.get('v'), '1')

    two = find_first(root, 'two')
    self.assertEqual(two.tag, 'two')
def test_find_first(self):
    """Check what `find_first` returns for the root and for nested tags."""
    root = make_xml(b'<one><two><three v="1"/><three v="2"/></two></one>')

    # Can't find the root element
    self.assertEqual(find_first(root, "one"), None)

    # Of the two `three` tags, the one carrying v="1" is expected.
    three = find_first(root, "three")
    self.assertEqual(three.tag, "three")
    self.assertEqual(three.get("v"), "1")

    two = find_first(root, "two")
    self.assertEqual(two.tag, "two")
def _parse_styles(self):
    """
    Build a lookup of style definitions keyed by their `styleId`.

    Returns an empty dict when `self.styles_text` is None. Otherwise each
    `style` tag contributes one entry holding the style's name and the
    result of `_parse_run_properties` for its first `rPr` tag.
    """
    if self.styles_text is None:
        return {}

    styles_by_id = {}
    for style_el in find_all(parse_xml_from_string(self.styles_text), 'style'):
        name = find_first(style_el, 'name').attrib['val']
        defaults = self._parse_run_properties(find_first(style_el, 'rPr'))
        styles_by_id[style_el.attrib['styleId']] = {
            'style_name': name,
            'default_run_properties': defaults,
        }
    return styles_by_id
def _parse_styles(self):
    """
    Collect the document's style definitions into a dict.

    Keys are `styleId` attribute values; each value records the style's
    name and its default run properties as produced by
    `_parse_run_properties`. When `self.styles_text` is None the result is
    an empty dict.
    """
    if self.styles_text is None:
        return {}

    collected = {}
    styles_tree = parse_xml_from_string(self.styles_text)
    for node in find_all(styles_tree, 'style'):
        style_name = find_first(node, 'name').attrib['val']
        run_defaults = self._parse_run_properties(find_first(node, 'rPr'))
        collected[node.attrib['styleId']] = {
            'style_name': style_name,
            'default_run_properties': run_defaults,
        }
    return collected
def _parse_styles(self):
    """
    Build a lookup of style definitions keyed by their `styleId`.

    Styles are read from the root element of the main document part's
    style definitions part. If that part is falsy an empty dict is
    returned. Each entry holds the style's name and the result of
    `_parse_run_properties` for its first `rPr` tag.
    """
    styles_part = self.document.main_document_part.style_definitions_part
    if not styles_part:
        return {}

    styles_by_id = {}
    for style_el in find_all(styles_part.root_element, 'style'):
        name = find_first(style_el, 'name').attrib['val']
        defaults = self._parse_run_properties(find_first(style_el, 'rPr'))
        styles_by_id[style_el.attrib['styleId']] = {
            'style_name': name,
            'default_run_properties': defaults,
        }
    return styles_by_id
def _parse_styles(self):
    """
    Collect the document's style definitions into a dict.

    The definitions come from the root element of the main document part's
    style definitions part; a falsy part yields an empty dict. Keys are
    `styleId` attribute values and each value records the style's name and
    its default run properties as produced by `_parse_run_properties`.
    """
    definitions_part = (
        self.document.main_document_part.style_definitions_part
    )
    if not definitions_part:
        return {}

    collected = {}
    definitions_root = definitions_part.root_element
    for node in find_all(definitions_root, 'style'):
        style_name = find_first(node, 'name').attrib['val']
        run_defaults = self._parse_run_properties(find_first(node, 'rPr'))
        collected[node.attrib['styleId']] = {
            'style_name': style_name,
            'default_run_properties': run_defaults,
        }
    return collected
def parse_table_cell(self, el, text):
    """
    Render the table cell `el` with the already-parsed `text`.

    A cell that has a `vMerge` tag whose `val` is anything other than
    'restart' produces an empty string. Otherwise the cell is handed to
    `table_cell` with its colspan and, when greater than 1, its rowspan
    as a string.
    """
    v_merge = find_first(el, 'vMerge')
    merged_without_restart = (
        v_merge is not None and v_merge.get('val', '') != 'restart'
    )
    if merged_without_restart:
        return ''

    colspan = self.get_colspan(el)
    rowspan = self._get_rowspan(el, v_merge)
    # A rowspan of 1 or less is passed as an empty string.
    rowspan = str(rowspan) if rowspan > 1 else ''
    return self.table_cell(text, colspan, rowspan)
def parse_table_cell(self, el, text):
    """
    Render the table cell `el` with the already-parsed `text`.

    A cell that has a `vMerge` tag whose `val` is anything other than
    "restart" produces an empty string. Otherwise the cell is handed to
    `table_cell` with its colspan and, when greater than 1, its rowspan
    as a string.
    """
    v_merge = find_first(el, "vMerge")
    merged_without_restart = (
        v_merge is not None and v_merge.get("val", "") != "restart"
    )
    if merged_without_restart:
        return ""

    colspan = self.get_colspan(el)
    rowspan = self._get_rowspan(el, v_merge)
    # A rowspan of 1 or less is passed as an empty string.
    rowspan = str(rowspan) if rowspan > 1 else ""
    return self.table_cell(text, colspan, rowspan)
def parse_table_cell(self, el, text):
    """
    Produce the output for one table cell.

    Returns '' for a cell whose `vMerge` tag exists and whose `val` is not
    'restart'. For every other cell the result of `table_cell` is
    returned, called with `text`, the cell's colspan and its rowspan
    (stringified when above 1, '' otherwise).
    """
    v_merge = find_first(el, 'vMerge')
    if v_merge is not None:
        if v_merge.get('val', '') != 'restart':
            return ''

    colspan = self.get_colspan(el)
    span_count = self._get_rowspan(el, v_merge)
    rowspan = ''
    if span_count > 1:
        rowspan = str(span_count)
    return self.table_cell(text, colspan, rowspan)
def _load(self):
    """
    Open the document at `self.path`, cache its roots and start parsing.

    Raises `MalformedDocxException` when the document has no main document
    part. `self.numbering_root` stays None unless a numbering definitions
    part is present, and `self.page_width` is only assigned when a `pgSz`
    tag is found.
    """
    self.document = WordprocessingDocument(path=self.path)
    main_part = self.document.main_document_part
    if main_part is None:
        raise MalformedDocxException
    self.root_element = main_part.root_element

    self.numbering_root = None
    numbering_definitions = main_part.numbering_definitions_part
    if numbering_definitions:
        self.numbering_root = numbering_definitions.root_element

    page_size_el = find_first(self.root_element, 'pgSz')
    if page_size_el is not None:
        # pgSz is defined in twips, convert to points
        self.page_width = int(page_size_el.attrib['w']) / TWIPS_PER_POINT

    self.styles_dict = self._parse_styles()
    self.parse_begin(self.root_element)
def parse_r(self, el, parsed):
    """
    Parse the running text.

    `parsed` is the text already produced for the run `el`. It is returned
    wrapped by every inline style handler that applies; an empty or falsy
    `parsed` yields ''.

    Run properties are gathered in two layers: the defaults registered in
    `self.styles_dict` for the enclosing paragraph's style, then the run's
    own `rPr` tag, which overrides those defaults.
    """
    text = parsed
    if not text:
        return ''

    run_properties = {}

    # Get the rPr for the current style, they are the defaults.
    p = find_ancestor_with_tag(self.pre_processor, el, 'p')
    paragraph_style = find_first(p, 'pStyle')
    if paragraph_style is not None:
        style = paragraph_style.get('val')
        style_defaults = self.styles_dict.get(style, {})
        run_properties.update(
            style_defaults.get('default_run_properties', {}),
        )

    # Get the rPr for the current r tag, they are overrides.
    run_properties_element = el.find('rPr')
    # Compare against None explicitly: the truth value of an element
    # depends on whether it has children, not on whether it was found.
    if run_properties_element is not None:
        local_run_properties = self._parse_run_properties(
            run_properties_element,
        )
        run_properties.update(local_run_properties)

    inline_tag_handlers = {
        'b': self.bold,
        'i': self.italics,
        'u': self.underline,
        'caps': self.caps,
        'smallCaps': self.small_caps,
        'strike': self.strike,
        'dstrike': self.strike,
        'vanish': self.hide,
        'webHidden': self.hide,
    }
    styles_needing_application = []
    for property_name, property_value in run_properties.items():
        # These tags are a little different, handle them separately
        # from the rest.
        # This could be a superscript or a subscript
        if property_name == 'vertAlign':
            if property_value == 'superscript':
                styles_needing_application.append(self.superscript)
            elif property_value == 'subscript':
                styles_needing_application.append(self.subscript)
        elif (
            property_name in inline_tag_handlers and
            self._is_style_on(property_value)
        ):
            styles_needing_application.append(
                inline_tag_handlers[property_name],
            )

    # Apply all the handlers.
    for func in styles_needing_application:
        text = func(text)

    return text
def get_colspan(self, el):
    """
    Return the `val` attribute of the first `gridSpan` tag under `el`.

    An empty string is returned when there is no `gridSpan` tag.
    """
    span_el = find_first(el, "gridSpan")
    return "" if span_el is None else span_el.attrib["val"]
def get_colspan(self, el):
    """
    Return the `val` attribute of the first `gridSpan` tag under `el`.

    An empty string is returned when there is no `gridSpan` tag.
    """
    span_el = find_first(el, 'gridSpan')
    if span_el is not None:
        return span_el.attrib['val']
    return ''