def test_find_first(self):
    """Check what `find_first` returns for the root, nested and child tags."""
    document = make_xml(
        b'<one><two><three v="1"/><three v="2"/></two></one>'
    )

    # Can't find the root element: searching for the tag of the element
    # we start from yields nothing.
    self.assertEqual(find_first(document, 'one'), None)

    # Two <three> tags exist; the one carrying v="1" must be returned.
    first_three = find_first(document, 'three')
    self.assertEqual(first_three.tag, 'three')
    self.assertEqual(first_three.get('v'), '1')

    # A tag nested directly under the starting element is found too.
    self.assertEqual(find_first(document, 'two').tag, 'two')
def test_find_first(self):
    """Check what `find_first` returns for the root, nested and child tags."""
    document = make_xml(
        b'<one><two><three v="1"/><three v="2"/></two></one>'
    )

    # Can't find the root element: searching for the tag of the element
    # we start from yields nothing.
    self.assertEqual(find_first(document, 'one'), None)

    # Two <three> tags exist; the one carrying v="1" must be returned.
    first_three = find_first(document, 'three')
    self.assertEqual(first_three.tag, 'three')
    self.assertEqual(first_three.get('v'), '1')

    # A tag nested directly under the starting element is found too.
    self.assertEqual(find_first(document, 'two').tag, 'two')
def _get_image_id(self, el):
    """
    Return the id of the image referenced by `el`, or None.

    A `blip` tag (drawings) takes precedence over an `imagedata` tag
    (picts). When a `blip` is present its `embed` attribute is used, with
    `link` as the fallback; `imagedata` is not consulted in that case.
    """
    blip = find_first(el, 'blip')
    if blip is None:
        # Picts: the id is read straight off the imagedata tag.
        imagedata = find_first(el, 'imagedata')
        if imagedata is None:
            return None
        return imagedata.get('id')

    # Drawings: on drawing tags the id is actually whatever is returned
    # from the embed attribute on the blip tag.
    embedded_id = blip.get('embed')
    if embedded_id is not None:
        return embedded_id
    return blip.get('link')
def _get_image_id(self, el):
    """
    Return the id of the image referenced by `el`, or None.

    A `blip` tag (drawings) takes precedence over an `imagedata` tag
    (picts). When a `blip` is present its `embed` attribute is used, with
    `link` as the fallback; `imagedata` is not consulted in that case.
    """
    blip = find_first(el, 'blip')
    if blip is None:
        # Picts: the id is read straight off the imagedata tag.
        imagedata = find_first(el, 'imagedata')
        if imagedata is None:
            return None
        return imagedata.get('id')

    # Drawings: on drawing tags the id is actually whatever is returned
    # from the embed attribute on the blip tag.
    embedded_id = blip.get('embed')
    if embedded_id is not None:
        return embedded_id
    return blip.get('link')
def _get_image_size(self, el): """ If we can't find a height or width, return 0 for whichever is not found, then rely on the `image` handler to strip those attributes. This functionality can change once we integrate PIL. """ sizes = el.find('./*/graphic/graphicData/pic/spPr/xfrm/ext') if sizes is not None and sizes.get('cx'): if sizes.get('cx'): x = self._convert_image_size(int(sizes.get('cx'))) if sizes.get('cy'): y = self._convert_image_size(int(sizes.get('cy'))) return ( '%dpx' % x, '%dpx' % y, ) shape = find_first(el, 'shape') if shape is not None and shape.get('style') is not None: # If either of these are not set, rely on the method `image` to not # use either of them. x = 0 y = 0 styles = shape.get('style').split(';') for s in styles: if s.startswith('height:'): y = s.split(':')[1] if s.startswith('width:'): x = s.split(':')[1] return x, y return 0, 0
def perform_pre_processing(self, root, *args, **kwargs):
    """
    Annotate the tree under `root` before it is converted.

    Registers `find_first` for memoization, records parent links, list and
    table attributes, then marks the first/last list items and the
    headings among the `p` tags of the document body. Extra positional and
    keyword arguments are accepted and ignored.
    """
    self.populate_memoization({'find_first': find_first})
    self._add_parent(root)

    # If we don't have a numbering root there cannot be any lists.
    if self.numbering_root is not None:
        self._set_list_attributes(root)
    self._set_table_attributes(root)
    self._set_is_in_table(root)

    body = find_first(root, 'body')
    self._set_next(body)

    paragraphs = list(find_all(body, 'p'))
    list_items = [p for p in paragraphs if self.is_list_item(p)]

    # Find the first and last li elements
    num_ids = {self.num_id(item) for item in list_items}
    ilvls = {self.ilvl(item) for item in list_items}
    self._set_first_list_item(num_ids, ilvls, list_items)
    self._set_last_list_item(num_ids, list_items)

    self._set_headers(paragraphs)
    self._convert_upper_roman(body)
def _get_image_size(self, el): """ If we can't find a height or width, return 0 for whichever is not found, then rely on the `image` handler to strip those attributes. This functionality can change once we integrate PIL. """ sizes = el.find('./*/graphic/graphicData/pic/spPr/xfrm/ext') if sizes is not None and sizes.get('cx'): if sizes.get('cx'): x = self._convert_image_size(int(sizes.get('cx'))) if sizes.get('cy'): y = self._convert_image_size(int(sizes.get('cy'))) return ( '%dpx' % x, '%dpx' % y, ) shape = find_first(el, 'shape') if shape is not None and shape.get('style') is not None: # If either of these are not set, rely on the method `image` to not # use either of them. x = 0 y = 0 styles = shape.get('style').split(';') for s in styles: if s.startswith('height:'): y = s.split(':')[1] if s.startswith('width:'): x = s.split(':')[1] return x, y return 0, 0
def perform_pre_processing(self, root, *args, **kwargs):
    """
    Annotate the tree under `root` before it is converted.

    Registers `find_first` for memoization, records parent links, list and
    table attributes, then marks the first/last list items and the
    headings among the `p` tags of the document body. Extra positional and
    keyword arguments are accepted and ignored.
    """
    self.populate_memoization({'find_first': find_first})
    self._add_parent(root)

    # If we don't have a numbering root there cannot be any lists.
    if self.numbering_root is not None:
        self._set_list_attributes(root)
    self._set_table_attributes(root)
    self._set_is_in_table(root)

    body = find_first(root, 'body')
    self._set_next(body)

    paragraphs = list(find_all(body, 'p'))
    list_items = [p for p in paragraphs if self.is_list_item(p)]

    # Find the first and last li elements
    num_ids = {self.num_id(item) for item in list_items}
    ilvls = {self.ilvl(item) for item in list_items}
    self._set_first_list_item(num_ids, ilvls, list_items)
    self._set_last_list_item(num_ids, list_items)

    self._set_headers(paragraphs)
    self._convert_upper_roman(body)
def get_properties_with_no_font_size():
    """
    Resolve the properties of a copy of `el` with its `sz` tag removed.

    Reads `properties`, `el`, `stack` and `self` from the enclosing scope.
    Returns None when `properties` has no size, or when the copy has no
    `rPr` tag or no `sz` tag inside it.
    """
    # Only set paragraph_properties if properties has a size.
    if not properties.size:
        return None

    # Strip the size from a deep copy so `el` itself is not modified.
    el_copy = copy.deepcopy(el)
    run_props = find_first(el_copy, 'rPr')
    size_el = None if run_props is None else find_first(run_props, 'sz')
    if size_el is None:
        return None

    run_props.remove(size_el)
    resolve = self.styles_manager.get_resolved_properties_for_element
    return resolve(el_copy, stack)
def get_properties_with_no_font_size():
    """
    Resolve the properties of a copy of `el` with its `sz` tag removed.

    Reads `properties`, `el`, `stack` and `self` from the enclosing scope.
    Returns None when `properties` has no size, or when the copy has no
    `rPr` tag or no `sz` tag inside it.
    """
    # Only set paragraph_properties if properties has a size.
    if not properties.size:
        return None

    # Strip the size from a deep copy so `el` itself is not modified.
    el_copy = copy.deepcopy(el)
    run_props = find_first(el_copy, 'rPr')
    size_el = None if run_props is None else find_first(run_props, 'sz')
    if size_el is None:
        return None

    run_props.remove(size_el)
    resolve = self.styles_manager.get_resolved_properties_for_element
    return resolve(el_copy, stack)
def parse_table_cell(self, el, text, stack):
    """
    Render the table cell `el` with content `text` via `table_cell`.

    Returns '' when the cell has a `vMerge` tag whose `val` is anything
    other than 'restart'. The rowspan is passed as a string only when it
    is greater than 1, otherwise as ''. `stack` is not used here.
    """
    v_merge = find_first(el, 'vMerge')
    merged_away = (
        v_merge is not None and v_merge.get('val', '') != 'restart'
    )
    if merged_away:
        return ''

    colspan = self.get_colspan(el)
    row_count = self._get_rowspan(el, v_merge)
    rowspan = str(row_count) if row_count > 1 else ''
    return self.table_cell(text, colspan, rowspan)
def parse_table_cell(self, el, text, stack):
    """
    Render the table cell `el` with content `text` via `table_cell`.

    Returns '' when the cell has a `vMerge` tag whose `val` is anything
    other than 'restart'. The rowspan is passed as a string only when it
    is greater than 1, otherwise as ''. `stack` is not used here.
    """
    v_merge = find_first(el, 'vMerge')
    merged_away = (
        v_merge is not None and v_merge.get('val', '') != 'restart'
    )
    if merged_away:
        return ''

    colspan = self.get_colspan(el)
    row_count = self._get_rowspan(el, v_merge)
    rowspan = str(row_count) if row_count > 1 else ''
    return self.table_cell(text, colspan, rowspan)
def _set_table_attributes(self, el):
    """
    Record position and vertical-merge metadata for every table cell.

    For each `tc` in each `tr` of each `tbl` under `el`, stores
    `row_index` and `column_index` in `self.meta_data`, and sets
    `vmerge_continue` when the cell has a `vMerge` tag that either has
    val 'continue' or carries no attributes at all.
    """
    for table in find_all(el, 'tbl'):
        rows = filter_children(table, ['tr'])
        if rows is None:
            continue
        for row_index, row in enumerate(rows):
            cells = filter_children(row, ['tc'])
            for column_index, cell in enumerate(cells):
                self.meta_data[cell]['row_index'] = row_index
                self.meta_data[cell]['column_index'] = column_index

                v_merge = find_first(cell, 'vMerge')
                if v_merge is None:
                    continue
                # A vMerge with no attributes is treated the same as an
                # explicit 'continue'.
                continues_merge = (
                    'continue' == v_merge.get('val', '') or
                    v_merge.attrib == {}
                )
                if continues_merge:
                    self.meta_data[cell]['vmerge_continue'] = True
def _set_table_attributes(self, el):
    """
    Record position and vertical-merge metadata for every table cell.

    For each `tc` in each `tr` of each `tbl` under `el`, stores
    `row_index` and `column_index` in `self.meta_data`, and sets
    `vmerge_continue` when the cell has a `vMerge` tag that either has
    val 'continue' or carries no attributes at all.
    """
    for table in find_all(el, 'tbl'):
        rows = filter_children(table, ['tr'])
        if rows is None:
            continue
        for row_index, row in enumerate(rows):
            cells = filter_children(row, ['tc'])
            for column_index, cell in enumerate(cells):
                self.meta_data[cell]['row_index'] = row_index
                self.meta_data[cell]['column_index'] = column_index

                v_merge = find_first(cell, 'vMerge')
                if v_merge is None:
                    continue
                # A vMerge with no attributes is treated the same as an
                # explicit 'continue'.
                continues_merge = (
                    'continue' == v_merge.get('val', '') or
                    v_merge.attrib == {}
                )
                if continues_merge:
                    self.meta_data[cell]['vmerge_continue'] = True
def _generate_num_id(self, el):
    """
    Build a `NamespacedNumId` from the `numId` under `el`.

    Fun fact: a list in the root can hold a table that holds a list, and
    both lists can share the same numId. Namespacing the nested list with
    the number of tables it sits in makes it count as a new list;
    otherwise all sorts of terrible html gets generated.
    """
    num_id = find_first(el, 'numId').attrib['val']

    # Walk up through the parents until there is none left, counting the
    # `tbl` tags passed on the way. The topmost element (the one without a
    # parent) is not inspected.
    num_tables = 0
    node = el
    parent = self.parent(node)
    while parent is not None:
        if node.tag == 'tbl':
            num_tables += 1
        node = parent
        parent = self.parent(node)

    return NamespacedNumId(num_id=num_id, num_tables=num_tables)
def _generate_num_id(self, el):
    """
    Build a `NamespacedNumId` from the `numId` under `el`.

    Fun fact: a list in the root can hold a table that holds a list, and
    both lists can share the same numId. Namespacing the nested list with
    the number of tables it sits in makes it count as a new list;
    otherwise all sorts of terrible html gets generated.
    """
    num_id = find_first(el, 'numId').attrib['val']

    # Walk up through the parents until there is none left, counting the
    # `tbl` tags passed on the way. The topmost element (the one without a
    # parent) is not inspected.
    num_tables = 0
    node = el
    parent = self.parent(node)
    while parent is not None:
        if node.tag == 'tbl':
            num_tables += 1
        node = parent
        parent = self.parent(node)

    return NamespacedNumId(num_id=num_id, num_tables=num_tables)
def _set_headers(self, elements):
    """
    Flag every element in `elements` that uses a heading paragraph style.

    Heading styles first lose their run properties. Each element whose
    `pStyle` resolves to a heading style then has its list-item flags
    cleared and its `heading_level` set in `self.meta_data`.
    """
    # These are the styles for headers and what the html tag should be if
    # we have one. Levels above 6 all map to h6.
    headers = {
        'heading 1': 'h1',
        'heading 2': 'h2',
        'heading 3': 'h3',
        'heading 4': 'h4',
        'heading 5': 'h5',
        'heading 6': 'h6',
        'heading 7': 'h6',
        'heading 8': 'h6',
        'heading 9': 'h6',
        'heading 10': 'h6',
    }
    list_item_flags = (
        'is_list_item',
        'is_first_list_item',
        'is_last_list_item_in_root',
    )

    # Remove the rPr from the styles since all the styling will be done
    # with the heading.
    for style in self.styles.styles:
        if style.name.lower() in headers:
            style.run_properties = None

    for element in elements:
        p_style = find_first(element, 'pStyle')
        if p_style is None:
            # This element is using the default style which is not a
            # heading.
            continue

        style_id = p_style.attrib.get('val', '')
        paragraph_styles = self.styles.get_styles_by_type('paragraph')
        style = paragraph_styles.get(style_id)
        if not style:
            continue

        # Check to see if this element is actually a header.
        style_name = style.name.lower()
        if style_name not in headers:
            continue

        # Set all the list item variables to false.
        for flag in list_item_flags:
            self.meta_data[element][flag] = False
        # Prime the heading_level
        self.meta_data[element]['heading_level'] = headers[style_name]
def _set_headers(self, elements):
    """
    Flag every element in `elements` that uses a heading paragraph style.

    Heading styles first lose their run properties. Each element whose
    `pStyle` resolves to a heading style then has its list-item flags
    cleared and its `heading_level` set in `self.meta_data`.
    """
    # These are the styles for headers and what the html tag should be if
    # we have one. Levels above 6 all map to h6.
    headers = {
        'heading 1': 'h1',
        'heading 2': 'h2',
        'heading 3': 'h3',
        'heading 4': 'h4',
        'heading 5': 'h5',
        'heading 6': 'h6',
        'heading 7': 'h6',
        'heading 8': 'h6',
        'heading 9': 'h6',
        'heading 10': 'h6',
    }
    list_item_flags = (
        'is_list_item',
        'is_first_list_item',
        'is_last_list_item_in_root',
    )

    # Remove the rPr from the styles since all the styling will be done
    # with the heading.
    for style in self.styles.styles:
        if style.name.lower() in headers:
            style.run_properties = None

    for element in elements:
        p_style = find_first(element, 'pStyle')
        if p_style is None:
            # This element is using the default style which is not a
            # heading.
            continue

        style_id = p_style.attrib.get('val', '')
        paragraph_styles = self.styles.get_styles_by_type('paragraph')
        style = paragraph_styles.get(style_id)
        if not style:
            continue

        # Check to see if this element is actually a header.
        style_name = style.name.lower()
        if style_name not in headers:
            continue

        # Set all the list item variables to false.
        for flag in list_item_flags:
            self.meta_data[element][flag] = False
        # Prime the heading_level
        self.meta_data[element]['heading_level'] = headers[style_name]
def _get_page_width(self, root_element):
    """
    Return the page width in points, or None if there is no `pgSz` tag.

    The width is read from the `w` attribute of the first `pgSz` tag found
    under `root_element`.
    """
    page_size = find_first(root_element, 'pgSz')
    if page_size is None:
        return None

    # pgSz is defined in twips, convert to points
    width_in_twips = int(page_size.attrib['w'])
    return width_in_twips / TWIPS_PER_POINT
def get_colspan(self, el):
    """
    Return the `val` of the first `gridSpan` tag under `el`.

    Returns '' when `el` has no `gridSpan` tag.
    """
    grid_span = find_first(el, 'gridSpan')
    return '' if grid_span is None else grid_span.attrib['val']
def get_colspan(self, el):
    """
    Return the `val` of the first `gridSpan` tag under `el`.

    Returns '' when `el` has no `gridSpan` tag.
    """
    grid_span = find_first(el, 'gridSpan')
    return '' if grid_span is None else grid_span.attrib['val']
def _get_page_width(self, root_element):
    """
    Return the page width in points, or None if there is no `pgSz` tag.

    The width is read from the `w` attribute of the first `pgSz` tag found
    under `root_element`.
    """
    page_size = find_first(root_element, 'pgSz')
    if page_size is None:
        return None

    # pgSz is defined in twips, convert to points
    width_in_twips = int(page_size.attrib['w'])
    return width_in_twips / TWIPS_PER_POINT