Beispiel #1
0
    def test_find_all(self):
        """Check ``find_all`` results on a small three-level document."""
        root = make_xml(b'<one><two><three/><three/></two></one>')

        # Can't find the root element
        self.assertEqual(find_all(root, 'one'), [])

        three_tags = list(elements_to_tags(find_all(root, 'three')))
        self.assertEqual(three_tags, ['three', 'three'])

        two_tags = list(elements_to_tags(find_all(root, 'two')))
        self.assertEqual(two_tags, ['two'])
Beispiel #2
0
    def test_find_all(self):
        """Check ``find_all`` results on a small three-level document."""
        root = make_xml(b'<one><two><three/><three/></two></one>')

        # Can't find the root element
        self.assertEqual(find_all(root, 'one'), [])

        matches_for_three = find_all(root, 'three')
        self.assertEqual(
            list(elements_to_tags(matches_for_three)),
            ['three', 'three'],
        )

        matches_for_two = find_all(root, 'two')
        self.assertEqual(list(elements_to_tags(matches_for_two)), ['two'])
Beispiel #3
0
    def perform_pre_processing(self, root, *args, **kwargs):
        """Run every pre-processing pass over ``root`` in a fixed order.

        Registers ``find_first`` for memoization, then runs the parent, list,
        table and in-table passes on ``root``, followed by the sibling,
        first/last list item, header and upper-roman passes on its ``body``.
        """
        self.populate_memoization({'find_first': find_first})
        self._add_parent(root)
        # If we don't have a numbering root there cannot be any lists.
        if self.numbering_root is not None:
            self._set_list_attributes(root)
        self._set_table_attributes(root)
        self._set_is_in_table(root)

        body = find_first(root, 'body')
        self._set_next(body)
        paragraphs = list(find_all(body, 'p'))
        list_items = [p for p in paragraphs if self.is_list_item(p)]
        # Find the first and last li elements
        num_ids = {self.num_id(item) for item in list_items}
        ilvls = {self.ilvl(item) for item in list_items}
        self._set_first_list_item(num_ids, ilvls, list_items)
        self._set_last_list_item(num_ids, list_items)

        self._set_headers(paragraphs)
        self._convert_upper_roman(body)
Beispiel #4
0
    def _set_next(self, body):
        """Record ``next``/``previous`` neighbours in ``self.meta_data``.

        The linking is applied to the content-bearing children of ``body``
        and then, separately, to those of every ``tc`` found under ``body``.
        """
        def _get_children_with_content(el):
            # Keep only the children that have a descendant carrying one of
            # the TAGS_CONTAINING_CONTENT tags.
            return [
                child
                for child in filter_children(el, TAGS_HOLDING_CONTENT_TAGS)
                if any(
                    has_descendant_with_tag(child, tag)
                    for tag in TAGS_CONTAINING_CONTENT
                )
            ]

        def _assign_next(children):
            # Populate the `next` and `previous` attributes for all the
            # child elements. Bounds are checked explicitly: a negative
            # index does not raise IndexError but wraps around to the end
            # of the list, which would give the first child the last child
            # as its `previous`.
            last_index = len(children) - 1
            for index, child in enumerate(children):
                if index < last_index:
                    following = children[index + 1]
                    if following is not None:
                        self.meta_data[child]['next'] = following
                if index > 0:
                    preceding = children[index - 1]
                    if preceding is not None:
                        self.meta_data[child]['previous'] = preceding

        # Assign next for everything in the root.
        _assign_next(_get_children_with_content(body))

        # In addition set next for everything in table cells.
        for tc in find_all(body, 'tc'):
            _assign_next(_get_children_with_content(tc))
Beispiel #5
0
    def perform_pre_processing(self, root, *args, **kwargs):
        """Run every pre-processing pass over ``root`` in a fixed order.

        Runs the parent, list, table and in-table passes on ``root``, then
        the sibling, first/last list item, header and upper-roman passes on
        the ``body`` child of ``root``.
        """
        self._add_parent(root)
        # If we don't have a numbering root there cannot be any lists.
        if self.numbering_root is not None:
            self._set_list_attributes(root)
        self._set_table_attributes(root)
        self._set_is_in_table(root)

        body = root.find('./body')
        self._set_next(body)
        paragraphs = list(find_all(body, 'p'))
        list_items = [p for p in paragraphs if self.is_list_item(p)]
        # Find the first and last li elements
        num_ids = {self.num_id(item) for item in list_items}
        ilvls = {self.ilvl(item) for item in list_items}
        self._set_first_list_item(num_ids, ilvls, list_items)
        self._set_last_list_item(num_ids, list_items)

        self._set_headers(paragraphs)
        self._convert_upper_roman(body)
Beispiel #6
0
    def _set_next(self, body):
        """Record ``next``/``previous`` neighbours in ``self.meta_data``.

        The linking is applied to the content-bearing children of ``body``
        and then, separately, to those of every ``tc`` found under ``body``.
        """
        def _get_children_with_content(el):
            # Keep only the children that have a descendant carrying one of
            # the TAGS_CONTAINING_CONTENT tags.
            return [
                child
                for child in filter_children(el, TAGS_HOLDING_CONTENT_TAGS)
                if any(
                    has_descendant_with_tag(child, tag)
                    for tag in TAGS_CONTAINING_CONTENT
                )
            ]

        def _assign_next(children):
            # Populate the `next` and `previous` attributes for all the
            # child elements. Bounds are checked explicitly: a negative
            # index does not raise IndexError but wraps around to the end
            # of the list, which would give the first child the last child
            # as its `previous`.
            last_index = len(children) - 1
            for index, child in enumerate(children):
                if index < last_index:
                    following = children[index + 1]
                    if following is not None:
                        self.meta_data[child]['next'] = following
                if index > 0:
                    preceding = children[index - 1]
                    if preceding is not None:
                        self.meta_data[child]['previous'] = preceding

        # Assign next for everything in the root.
        _assign_next(_get_children_with_content(body))

        # In addition set next for everything in table cells.
        for tc in find_all(body, 'tc'):
            _assign_next(_get_children_with_content(tc))
Beispiel #7
0
 def test_get_image_id(self):
     """Image ids of all ``drawing`` and ``pict`` elements are resolved."""
     parser = XMLDocx2Html(
         document_xml=self.get_xml(),
         relationships=self.relationships,
     )
     tree = parse_xml_from_string(self.get_xml(), remove_namespaces=True)
     image_elements = list(find_all(tree, 'drawing'))
     image_elements += find_all(tree, 'pict')
     found_ids = [parser._get_image_id(el) for el in image_elements]
     expected_ids = ['rId0', 'rId1']
     # Order is irrelevant, so compare as sets.
     self.assertEqual(set(found_ids), set(expected_ids))
Beispiel #8
0
 def test_get_image_sizes(self):
     """Sizes of all ``drawing`` and ``pict`` elements are resolved."""
     parser = XMLDocx2Html(
         document_xml=self.get_xml(),
         relationships=self.relationships,
     )
     tree = parse_xml_from_string(self.get_xml())
     image_elements = list(find_all(tree, 'drawing'))
     image_elements += find_all(tree, 'pict')
     found_sizes = [parser._get_image_size(el) for el in image_elements]
     expected_sizes = [('40px', '20px'), ('41pt', '21pt')]
     # Order is irrelevant, so compare as sets.
     self.assertEqual(set(found_sizes), set(expected_sizes))
Beispiel #9
0
 def _set_list_attributes(self, el):
     """Flag list-item paragraphs under ``el`` in ``self.meta_data``.

     For each ``numId`` element whose ``p`` ancestor also has an ``ilvl``,
     the paragraph's ``is_list_item``, ``num_id`` and ``ilvl`` entries are
     set.
     """
     for num_id_el in find_all(el, 'numId'):
         paragraph = find_ancestor_with_tag(self, num_id_el, 'p')
         if paragraph is None:
             continue
         ilvl_el = self.memod_tree_op('find_first', paragraph, 'ilvl')
         # Deleted text in a list will have a numId but no ilvl.
         if ilvl_el is None:
             continue
         self.meta_data[paragraph]['is_list_item'] = True
         self.meta_data[paragraph]['num_id'] = self._generate_num_id(
             paragraph,
         )
         self.meta_data[paragraph]['ilvl'] = ilvl_el.attrib['val']
Beispiel #10
0
 def _set_list_attributes(self, el):
     """Flag list-item paragraphs under ``el`` in ``self.meta_data``.

     For each ``numId`` element whose ``p`` ancestor has a
     ``./pPr/numPr/ilvl`` element, the paragraph's ``is_list_item``,
     ``num_id`` and ``ilvl`` entries are set.
     """
     for num_id_el in find_all(el, 'numId'):
         paragraph = find_ancestor_with_tag(self, num_id_el, 'p')
         if paragraph is None:
             continue
         ilvl_el = paragraph.find('./pPr/numPr/ilvl')
         # Deleted text in a list will have a numId but no ilvl.
         if ilvl_el is None:
             continue
         self.meta_data[paragraph]['is_list_item'] = True
         self.meta_data[paragraph]['num_id'] = self._generate_num_id(
             paragraph,
         )
         self.meta_data[paragraph]['ilvl'] = ilvl_el.attrib['val']
Beispiel #11
0
 def _set_table_attributes(self, el):
     """Store row/column indices and vertical-merge flags for table cells.

     Every ``tc`` inside a ``tr`` of each ``tbl`` under ``el`` receives
     ``row_index`` and ``column_index`` entries in ``self.meta_data``. A
     cell whose ``vMerge`` has ``val="continue"`` or no attributes at all
     is additionally flagged with ``vmerge_continue``.
     """
     for table in find_all(el, 'tbl'):
         rows = filter_children(table, ['tr'])
         if rows is None:
             continue
         for row_index, row in enumerate(rows):
             cells = filter_children(row, ['tc'])
             for column_index, cell in enumerate(cells):
                 self.meta_data[cell]['row_index'] = row_index
                 self.meta_data[cell]['column_index'] = column_index
                 v_merge = find_first(cell, 'vMerge')
                 if v_merge is None:
                     continue
                 continues_merge = (
                     v_merge.get('val', '') == 'continue'
                     or v_merge.attrib == {}
                 )
                 if continues_merge:
                     self.meta_data[cell]['vmerge_continue'] = True
Beispiel #12
0
 def _set_table_attributes(self, el):
     """Store row/column indices and vertical-merge flags for table cells.

     Every ``tc`` inside a ``tr`` of each ``tbl`` under ``el`` receives
     ``row_index`` and ``column_index`` entries in ``self.meta_data``. A
     cell whose ``./tcPr/vMerge`` has ``val="continue"`` or no attributes
     at all is additionally flagged with ``vmerge_continue``.
     """
     for table in find_all(el, 'tbl'):
         rows = filter_children(table, ['tr'])
         if rows is None:
             continue
         for row_index, row in enumerate(rows):
             cells = filter_children(row, ['tc'])
             for column_index, cell in enumerate(cells):
                 self.meta_data[cell]['row_index'] = row_index
                 self.meta_data[cell]['column_index'] = column_index
                 v_merge = cell.find('./tcPr/vMerge')
                 if v_merge is None:
                     continue
                 continues_merge = (
                     v_merge.get('val', '') == 'continue'
                     or v_merge.attrib == {}
                 )
                 if continues_merge:
                     self.meta_data[cell]['vmerge_continue'] = True
Beispiel #13
0
    def _convert_upper_roman(self, body):
        """Re-tag root-level upper-roman list items as headings.

        Does nothing unless ``self.convert_root_level_upper_roman`` is set.
        For each direct child of ``body`` that is a first list item and whose
        list style is ``upperRoman``, every ``p`` under ``body`` that shares
        its num id and sits at the lowest ilvl used with that num id has its
        list flags cleared and its ``heading_level`` set to
        ``UPPER_ROMAN_TO_HEADING_VALUE``.
        """
        if not self.convert_root_level_upper_roman:
            return
        first_root_list_items = [
            # Only root level elements. The element is iterated directly
            # because ``getchildren()`` is deprecated and no longer exists
            # on ``xml.etree.ElementTree`` elements since Python 3.9.
            el for el in body
            # And only first_list_items
            if self.is_first_list_item(el)
        ]
        visited_num_ids = []
        all_p_tags_in_body = find_all(body, 'p')
        for root_list_item in first_root_list_items:
            # Looked up once per item, before the metadata of any list item
            # is rewritten further down.
            root_num_id = self.num_id(root_list_item)
            if root_num_id in visited_num_ids:
                continue
            visited_num_ids.append(root_num_id)
            lst_style = get_list_style(
                self.numbering_root,
                root_num_id.num_id,
                self.ilvl(root_list_item),
            )
            if lst_style != 'upperRoman':
                continue
            # Lowest indentation level used by paragraphs of this list.
            ilvl = min(
                self.ilvl(el) for el in all_p_tags_in_body
                if self.num_id(el) == root_num_id
            )
            root_upper_roman_list_items = [
                el for el in all_p_tags_in_body
                if self.num_id(el) == root_num_id
                and self.ilvl(el) == ilvl
            ]
            for list_item in root_upper_roman_list_items:
                item_meta = self.meta_data[list_item]
                item_meta['is_list_item'] = False
                item_meta['is_first_list_item'] = False
                item_meta['is_last_list_item_in_root'] = False

                item_meta['heading_level'] = UPPER_ROMAN_TO_HEADING_VALUE
Beispiel #14
0
    def _convert_upper_roman(self, body):
        """Re-tag root-level upper-roman list items as headings.

        Does nothing unless ``self.convert_root_level_upper_roman`` is set.
        For each direct child of ``body`` that is a first list item and whose
        list style is ``upperRoman``, every ``p`` under ``body`` that shares
        its num id and sits at the lowest ilvl used with that num id has its
        list flags cleared and its ``heading_level`` set to
        ``UPPER_ROMAN_TO_HEADING_VALUE``.
        """
        if not self.convert_root_level_upper_roman:
            return
        first_root_list_items = [
            # Only root level elements. The element is iterated directly
            # because ``getchildren()`` is deprecated and no longer exists
            # on ``xml.etree.ElementTree`` elements since Python 3.9.
            el for el in body
            # And only first_list_items
            if self.is_first_list_item(el)
        ]
        visited_num_ids = []
        all_p_tags_in_body = find_all(body, 'p')
        for root_list_item in first_root_list_items:
            # Looked up once per item, before the metadata of any list item
            # is rewritten further down.
            root_num_id = self.num_id(root_list_item)
            if root_num_id in visited_num_ids:
                continue
            visited_num_ids.append(root_num_id)
            lst_style = get_list_style(
                self.numbering_root,
                root_num_id.num_id,
                self.ilvl(root_list_item),
            )
            if lst_style != 'upperRoman':
                continue
            # Lowest indentation level used by paragraphs of this list.
            ilvl = min(
                self.ilvl(el) for el in all_p_tags_in_body
                if self.num_id(el) == root_num_id
            )
            root_upper_roman_list_items = [
                el for el in all_p_tags_in_body
                if self.num_id(el) == root_num_id and
                self.ilvl(el) == ilvl
            ]
            for list_item in root_upper_roman_list_items:
                item_meta = self.meta_data[list_item]
                item_meta['is_list_item'] = False
                item_meta['is_first_list_item'] = False
                item_meta['is_last_list_item_in_root'] = False

                item_meta['heading_level'] = UPPER_ROMAN_TO_HEADING_VALUE
Beispiel #15
0
 def _set_is_in_table(self, el):
     """Flag every ``p`` under ``el`` that has a ``tc`` ancestor."""
     for paragraph in find_all(el, 'p'):
         enclosing_cell = find_ancestor_with_tag(self, paragraph, 'tc')
         if enclosing_cell is None:
             continue
         self.meta_data[paragraph]['is_in_table'] = True
Beispiel #16
0
 def _set_is_in_table(self, el):
     """Flag every ``p`` under ``el`` that has a ``tc`` ancestor."""
     for paragraph in find_all(el, 'p'):
         enclosing_cell = find_ancestor_with_tag(self, paragraph, 'tc')
         if enclosing_cell is None:
             continue
         self.meta_data[paragraph]['is_in_table'] = True