def test_find_all(self):
    # find_all should report matching descendants, but not the element
    # the search starts from.
    document = make_xml(b'<one><two><three/><three/></two></one>')

    # Can't find the root element
    self.assertEqual(find_all(document, 'one'), [])

    # Both <three/> elements are reported.
    threes = find_all(document, 'three')
    self.assertEqual(
        list(elements_to_tags(threes)),
        ['three', 'three'],
    )

    twos = find_all(document, 'two')
    self.assertEqual(list(elements_to_tags(twos)), ['two'])
def perform_pre_processing(self, root, *args, **kwargs):
    """Run every pre-processing pass over ``root``, in a fixed order.

    Registers ``find_first`` for memoized lookups, then delegates to the
    private ``_add_parent`` / ``_set_*`` / ``_convert_upper_roman``
    helpers. ``*args`` and ``**kwargs`` are accepted but not used here.
    """
    self.populate_memoization({'find_first': find_first})
    self._add_parent(root)

    # If we don't have a numbering root there cannot be any lists.
    has_numbering = self.numbering_root is not None
    if has_numbering:
        self._set_list_attributes(root)
    self._set_table_attributes(root)
    self._set_is_in_table(root)

    body = find_first(root, 'body')
    self._set_next(body)

    paragraphs = list(find_all(body, 'p'))
    list_items = [p for p in paragraphs if self.is_list_item(p)]

    # Find the first and last li elements
    num_ids = {self.num_id(item) for item in list_items}
    ilvls = {self.ilvl(item) for item in list_items}
    self._set_first_list_item(num_ids, ilvls, list_items)
    self._set_last_list_item(num_ids, list_items)

    self._set_headers(paragraphs)
    self._convert_upper_roman(body)
def _set_next(self, body):
    """Link neighbouring content elements via ``self.meta_data``.

    For the content-bearing children of ``body``, and separately for the
    content-bearing children of every ``tc`` element found under
    ``body``, each element's ``'next'`` entry is set to the sibling that
    follows it and its ``'previous'`` entry to the sibling before it.

    The first element of a run gets no ``'previous'`` entry and the last
    gets no ``'next'`` entry.
    """
    def _get_children_with_content(el):
        # We only care about children if they have text in them.
        return [
            child
            for child in filter_children(el, TAGS_HOLDING_CONTENT_TAGS)
            if any(
                has_descendant_with_tag(child, tag)
                for tag in TAGS_CONTAINING_CONTENT
            )
        ]

    def _assign_next(children):
        # Walk consecutive pairs rather than indexing with i - 1 / i + 1:
        # a negative index does not raise IndexError, so index -1 would
        # silently link the first child back to the last one.
        for current, following in zip(children, children[1:]):
            if following is not None:
                self.meta_data[current]['next'] = following
            if current is not None:
                self.meta_data[following]['previous'] = current

    # Assign next for everything in the root.
    _assign_next(_get_children_with_content(body))

    # In addition set next for everything in table cells.
    for tc in find_all(body, 'tc'):
        _assign_next(_get_children_with_content(tc))
def perform_pre_processing(self, root, *args, **kwargs):
    """Run every pre-processing pass over ``root``, in a fixed order.

    Delegates to the private ``_add_parent`` / ``_set_*`` /
    ``_convert_upper_roman`` helpers. ``*args`` and ``**kwargs`` are
    accepted but not used here.
    """
    self._add_parent(root)

    # If we don't have a numbering root there cannot be any lists.
    has_numbering = self.numbering_root is not None
    if has_numbering:
        self._set_list_attributes(root)
    self._set_table_attributes(root)
    self._set_is_in_table(root)

    body = root.find('./body')
    self._set_next(body)

    paragraphs = list(find_all(body, 'p'))
    list_items = [p for p in paragraphs if self.is_list_item(p)]

    # Find the first and last li elements
    num_ids = {self.num_id(item) for item in list_items}
    ilvls = {self.ilvl(item) for item in list_items}
    self._set_first_list_item(num_ids, ilvls, list_items)
    self._set_last_list_item(num_ids, list_items)

    self._set_headers(paragraphs)
    self._convert_upper_roman(body)
def _set_next(self, body):
    """Record each content element's neighbours in ``self.meta_data``.

    Applied to the content-bearing children of ``body`` and, separately,
    to the content-bearing children of every ``tc`` element found under
    ``body``. An element's ``'next'`` entry is the sibling after it and
    its ``'previous'`` entry is the sibling before it; the first element
    of a run has no ``'previous'`` and the last has no ``'next'``.
    """
    def _get_children_with_content(el):
        # We only care about children if they have text in them.
        children = []
        for child in filter_children(el, TAGS_HOLDING_CONTENT_TAGS):
            holds_content = any(
                has_descendant_with_tag(child, tag)
                for tag in TAGS_CONTAINING_CONTENT
            )
            if holds_content:
                children.append(child)
        return children

    def _assign_next(children):
        # Populate the `next` and `previous` attributes for all the
        # child elements. Bounds are checked explicitly: index -1 is a
        # valid index (the last item) and never raises IndexError, so
        # relying on the exception would give the first child a bogus
        # `previous`.
        last_index = len(children) - 1
        for index, child in enumerate(children):
            if index < last_index and children[index + 1] is not None:
                self.meta_data[child]['next'] = children[index + 1]
            if index > 0 and children[index - 1] is not None:
                self.meta_data[child]['previous'] = children[index - 1]

    # Assign next for everything in the root.
    _assign_next(_get_children_with_content(body))

    # In addition set next for everything in table cells.
    for tc in find_all(body, 'tc'):
        _assign_next(_get_children_with_content(tc))
def test_get_image_id(self):
    # _get_image_id should yield 'rId0' and 'rId1' across the drawing
    # and pict elements of the fixture document.
    parser = XMLDocx2Html(
        document_xml=self.get_xml(),
        relationships=self.relationships,
    )
    tree = parse_xml_from_string(self.get_xml(), remove_namespaces=True)

    image_elements = [
        element
        for tag in ('drawing', 'pict')
        for element in find_all(tree, tag)
    ]
    found_ids = [parser._get_image_id(el) for el in image_elements]

    # Compared as sets, so element order does not matter.
    self.assertEqual(set(found_ids), {'rId0', 'rId1'})
def test_get_image_sizes(self):
    # _get_image_size should yield one size pair for the drawing and
    # one for the pict element of the fixture document.
    parser = XMLDocx2Html(
        document_xml=self.get_xml(),
        relationships=self.relationships,
    )
    tree = parse_xml_from_string(self.get_xml())

    image_elements = [
        element
        for tag in ('drawing', 'pict')
        for element in find_all(tree, tag)
    ]
    found_sizes = [parser._get_image_size(el) for el in image_elements]

    # Compared as sets, so element order does not matter.
    self.assertEqual(
        set(found_sizes),
        {('40px', '20px'), ('41pt', '21pt')},
    )
def _set_list_attributes(self, el):
    """Flag the paragraphs under ``el`` that are list items.

    For every ``numId`` element whose ``p`` ancestor also contains an
    ``ilvl`` element, the paragraph's metadata gets ``is_list_item``,
    ``num_id`` and ``ilvl`` (the ``val`` attribute of the ``ilvl``
    element).
    """
    for num_id_el in find_all(el, 'numId'):
        paragraph = find_ancestor_with_tag(self, num_id_el, 'p')
        if paragraph is None:
            continue

        # Deleted text in a list will have a numId but no ilvl.
        ilvl_el = self.memod_tree_op('find_first', paragraph, 'ilvl')
        if ilvl_el is None:
            continue

        self.meta_data[paragraph]['is_list_item'] = True
        self.meta_data[paragraph]['num_id'] = self._generate_num_id(
            paragraph,
        )
        self.meta_data[paragraph]['ilvl'] = ilvl_el.attrib['val']
def _set_list_attributes(self, el):
    """Flag the paragraphs under ``el`` that are list items.

    For every ``numId`` element whose ``p`` ancestor has a
    ``./pPr/numPr/ilvl`` element, the paragraph's metadata gets
    ``is_list_item``, ``num_id`` and ``ilvl`` (the ``val`` attribute of
    that ``ilvl`` element).
    """
    for num_id_el in find_all(el, 'numId'):
        paragraph = find_ancestor_with_tag(self, num_id_el, 'p')
        if paragraph is None:
            continue

        # Deleted text in a list will have a numId but no ilvl.
        ilvl_el = paragraph.find('./pPr/numPr/ilvl')
        if ilvl_el is None:
            continue

        self.meta_data[paragraph]['is_list_item'] = True
        self.meta_data[paragraph]['num_id'] = self._generate_num_id(
            paragraph,
        )
        self.meta_data[paragraph]['ilvl'] = ilvl_el.attrib['val']
def _set_table_attributes(self, el):
    """Record grid position and vertical-merge state for table cells.

    Every ``tc`` child of every ``tr`` child of each ``tbl`` under ``el``
    gets ``row_index`` and ``column_index`` metadata. A cell whose
    ``vMerge`` element has ``val="continue"`` or no attributes at all is
    additionally flagged with ``vmerge_continue``.
    """
    for table in find_all(el, 'tbl'):
        rows = filter_children(table, ['tr'])
        if rows is None:
            continue
        for row_index, row in enumerate(rows):
            cells = filter_children(row, ['tc'])
            for column_index, cell in enumerate(cells):
                self.meta_data[cell]['row_index'] = row_index
                self.meta_data[cell]['column_index'] = column_index

                v_merge = find_first(cell, 'vMerge')
                if v_merge is None:
                    continue
                if (v_merge.get('val', '') == 'continue'
                        or v_merge.attrib == {}):
                    self.meta_data[cell]['vmerge_continue'] = True
def _set_table_attributes(self, el):
    """Record grid position and vertical-merge state for table cells.

    Every ``tc`` child of every ``tr`` child of each ``tbl`` under ``el``
    gets ``row_index`` and ``column_index`` metadata. A cell whose
    ``./tcPr/vMerge`` element has ``val="continue"`` or no attributes at
    all is additionally flagged with ``vmerge_continue``.
    """
    for table in find_all(el, 'tbl'):
        rows = filter_children(table, ['tr'])
        if rows is None:
            continue
        for row_index, row in enumerate(rows):
            cells = filter_children(row, ['tc'])
            for column_index, cell in enumerate(cells):
                self.meta_data[cell]['row_index'] = row_index
                self.meta_data[cell]['column_index'] = column_index

                v_merge = cell.find('./tcPr/vMerge')
                if v_merge is None:
                    continue
                if (v_merge.get('val', '') == 'continue'
                        or v_merge.attrib == {}):
                    self.meta_data[cell]['vmerge_continue'] = True
def _convert_upper_roman(self, body):
    """Turn root-level upper-roman list items into headings.

    Does nothing unless ``self.convert_root_level_upper_roman`` is set.
    Otherwise, for each distinct list (by ``num_id``) that starts at a
    direct child of ``body`` and whose style is ``'upperRoman'``, every
    paragraph of that list at the list's lowest ``ilvl`` has its list
    flags cleared and ``heading_level`` set to
    ``UPPER_ROMAN_TO_HEADING_VALUE``.
    """
    if not self.convert_root_level_upper_roman:
        return

    # Only root level elements, and only first_list_items. The element
    # is iterated directly because Element.getchildren() is deprecated
    # and no longer exists in xml.etree.ElementTree on Python 3.9+.
    first_root_list_items = [
        el for el in body if self.is_first_list_item(el)
    ]
    if not first_root_list_items:
        # Nothing to convert; skip the paragraph scan entirely.
        return

    visited_num_ids = []
    all_p_tags_in_body = find_all(body, 'p')
    for root_list_item in first_root_list_items:
        # Loop-invariant for this item; looked up once instead of once
        # per paragraph in the comprehensions below.
        num_id = self.num_id(root_list_item)
        if num_id in visited_num_ids:
            continue
        visited_num_ids.append(num_id)

        lst_style = get_list_style(
            self.numbering_root,
            num_id.num_id,
            self.ilvl(root_list_item),
        )
        if lst_style != 'upperRoman':
            continue

        # Lowest ilvl value used by any paragraph of this list.
        ilvl = min(
            self.ilvl(el) for el in all_p_tags_in_body
            if self.num_id(el) == num_id
        )
        root_upper_roman_list_items = [
            el for el in all_p_tags_in_body
            if self.num_id(el) == num_id and self.ilvl(el) == ilvl
        ]
        for list_item in root_upper_roman_list_items:
            item_meta = self.meta_data[list_item]
            item_meta['is_list_item'] = False
            item_meta['is_first_list_item'] = False
            item_meta['is_last_list_item_in_root'] = False
            item_meta['heading_level'] = UPPER_ROMAN_TO_HEADING_VALUE
def _convert_upper_roman(self, body):
    """Convert root-level upper-roman list items into headings.

    A no-op unless ``self.convert_root_level_upper_roman`` is truthy.
    Each list (identified by ``num_id``) that begins at a direct child
    of ``body`` and has the ``'upperRoman'`` style is processed once:
    its paragraphs at the list's lowest ``ilvl`` lose their list flags
    and receive ``heading_level = UPPER_ROMAN_TO_HEADING_VALUE``.
    """
    if not self.convert_root_level_upper_roman:
        return

    first_root_list_items = [
        # Only root level elements. Iterating the element yields its
        # direct children; Element.getchildren() is deprecated and was
        # removed from xml.etree.ElementTree in Python 3.9.
        el for el in body
        # And only first_list_items
        if self.is_first_list_item(el)
    ]
    if not first_root_list_items:
        # No candidate lists, so there is no need to collect paragraphs.
        return

    visited_num_ids = []
    all_p_tags_in_body = find_all(body, 'p')
    for root_list_item in first_root_list_items:
        root_num_id = self.num_id(root_list_item)
        if root_num_id in visited_num_ids:
            continue
        visited_num_ids.append(root_num_id)

        lst_style = get_list_style(
            self.numbering_root,
            root_num_id.num_id,
            self.ilvl(root_list_item),
        )
        if lst_style != 'upperRoman':
            continue

        # Lowest ilvl value used by any paragraph of this list.
        ilvl = min(
            self.ilvl(el) for el in all_p_tags_in_body
            if self.num_id(el) == root_num_id
        )
        root_upper_roman_list_items = [
            el for el in all_p_tags_in_body
            if self.num_id(el) == root_num_id and
            self.ilvl(el) == ilvl
        ]
        for list_item in root_upper_roman_list_items:
            self.meta_data[list_item]['is_list_item'] = False
            self.meta_data[list_item]['is_first_list_item'] = False
            self.meta_data[list_item]['is_last_list_item_in_root'] = False  # noqa
            self.meta_data[list_item]['heading_level'] = UPPER_ROMAN_TO_HEADING_VALUE  # noqa
def _set_is_in_table(self, el):
    """Flag every ``p`` under ``el`` that has a ``tc`` ancestor.

    Matching paragraphs get ``is_in_table = True`` in ``self.meta_data``;
    other paragraphs are left untouched.
    """
    for paragraph in find_all(el, 'p'):
        enclosing_cell = find_ancestor_with_tag(self, paragraph, 'tc')
        if enclosing_cell is None:
            continue
        self.meta_data[paragraph]['is_in_table'] = True