コード例 #1
0
ファイル: test_xml.py プロジェクト: OmniaSolutions/pydocx
    def test_find_first(self):
        """Check which element `find_first` hands back for a given tag."""
        tree = make_xml(b'<one><two><three v="1"/><three v="2"/></two></one>')

        # The root element itself can't be found, so looking up its own tag
        # gives back None.
        self.assertEqual(find_first(tree, 'one'), None)

        # Two <three> tags exist; the one carrying v="1" is expected.
        first_three = find_first(tree, 'three')
        self.assertEqual(first_three.tag, 'three')
        self.assertEqual(first_three.get('v'), '1')

        only_two = find_first(tree, 'two')
        self.assertEqual(only_two.tag, 'two')
コード例 #2
0
ファイル: test_xml.py プロジェクト: madevelopers/pydocx
    def test_find_first(self):
        """Check which element `find_first` hands back for a given tag."""
        tree = make_xml(b'<one><two><three v="1"/><three v="2"/></two></one>')

        # The root element itself can't be found, so looking up its own tag
        # gives back None.
        self.assertEqual(find_first(tree, 'one'), None)

        # Two <three> tags exist; the one carrying v="1" is expected.
        first_three = find_first(tree, 'three')
        self.assertEqual(first_three.tag, 'three')
        self.assertEqual(first_three.get('v'), '1')

        only_two = find_first(tree, 'two')
        self.assertEqual(only_two.tag, 'two')
コード例 #3
0
ファイル: DocxParser.py プロジェクト: mfraezz/pydocx
 def _get_image_id(self, el):
     """
     Return the image id found under `el`.

     Drawings are looked up first through their `blip` tag, then picts
     through their `imagedata` tag. None is returned when neither tag is
     found.
     """
     drawing_blip = find_first(el, 'blip')
     if drawing_blip is None:
         # Not a drawing, so try the pict representation instead.
         pict_data = find_first(el, 'imagedata')
         if pict_data is None:
             return None
         return pict_data.get('id')
     # On drawing tags the id is whatever the `embed` attribute of the blip
     # holds; the `link` attribute is consulted only when `embed` is absent.
     embed_id = drawing_blip.get('embed')
     return drawing_blip.get('link') if embed_id is None else embed_id
コード例 #4
0
ファイル: DocxParser.py プロジェクト: TJtrack99/pydocx
 def _get_image_id(self, el):
     """
     Return the image id found under `el`.

     Drawings are looked up first through their `blip` tag, then picts
     through their `imagedata` tag. None is returned when neither tag is
     found.
     """
     drawing_blip = find_first(el, 'blip')
     if drawing_blip is None:
         # Not a drawing, so try the pict representation instead.
         pict_data = find_first(el, 'imagedata')
         if pict_data is None:
             return None
         return pict_data.get('id')
     # On drawing tags the id is whatever the `embed` attribute of the blip
     # holds; the `link` attribute is consulted only when `embed` is absent.
     embed_id = drawing_blip.get('embed')
     return drawing_blip.get('link') if embed_id is None else embed_id
コード例 #5
0
ファイル: DocxParser.py プロジェクト: mfraezz/pydocx
    def _get_image_size(self, el):
        """
        Return a `(width, height)` pair for the image described by `el`.

        If we can't find a height or width, return 0 for whichever is not
        found, then rely on the `image` handler to strip those attributes. This
        functionality can change once we integrate PIL.

        Drawings carry their size in the `cx`/`cy` attributes of an `ext` tag;
        each value is passed through `self._convert_image_size` and formatted
        as a `'<n>px'` string. Otherwise the `style` attribute of a `shape`
        tag is scanned for `width:` and `height:` entries, and the text after
        the colon is returned as-is.
        """
        sizes = el.find('./*/graphic/graphicData/pic/spPr/xfrm/ext')
        if sizes is not None and sizes.get('cx'):
            x = '%dpx' % self._convert_image_size(int(sizes.get('cx')))
            # `cy` can be missing even though `cx` is present. That case used
            # to leave `y` unbound and raise UnboundLocalError; report 0 for
            # the missing dimension instead, as documented above.
            y = 0
            cy = sizes.get('cy')
            if cy:
                y = '%dpx' % self._convert_image_size(int(cy))
            return x, y
        shape = find_first(el, 'shape')
        if shape is not None and shape.get('style') is not None:
            # If either of these are not set, rely on the method `image` to not
            # use either of them.
            x = 0
            y = 0
            styles = shape.get('style').split(';')

            for s in styles:
                if s.startswith('height:'):
                    y = s.split(':')[1]
                if s.startswith('width:'):
                    x = s.split(':')[1]
            return x, y
        return 0, 0
コード例 #6
0
ファイル: preprocessor.py プロジェクト: AaronWan/pydocx
    def perform_pre_processing(self, root, *args, **kwargs):
        """
        Run the pre-processing passes over the tree rooted at `root`.

        The passes run in a fixed order: memoization setup, parent links,
        list attributes (only when a numbering root exists), table
        attributes, the in-table flags, then the passes that work on the
        `body` element and its `p` elements. `*args` and `**kwargs` are
        accepted but not used here.
        """
        self.populate_memoization({'find_first': find_first})
        self._add_parent(root)
        # If we don't have a numbering root there cannot be any lists.
        if self.numbering_root is not None:
            self._set_list_attributes(root)
        self._set_table_attributes(root)
        self._set_is_in_table(root)

        body = find_first(root, 'body')
        self._set_next(body)
        paragraphs = list(find_all(body, 'p'))
        list_items = [p for p in paragraphs if self.is_list_item(p)]
        # Find the first and last li elements
        num_ids = set(self.num_id(item) for item in list_items)
        ilvls = set(self.ilvl(item) for item in list_items)
        self._set_first_list_item(num_ids, ilvls, list_items)
        self._set_last_list_item(num_ids, list_items)

        self._set_headers(paragraphs)
        self._convert_upper_roman(body)
コード例 #7
0
ファイル: DocxParser.py プロジェクト: TJtrack99/pydocx
    def _get_image_size(self, el):
        """
        Return a `(width, height)` pair for the image described by `el`.

        If we can't find a height or width, return 0 for whichever is not
        found, then rely on the `image` handler to strip those attributes. This
        functionality can change once we integrate PIL.

        Drawings carry their size in the `cx`/`cy` attributes of an `ext` tag;
        each value is passed through `self._convert_image_size` and formatted
        as a `'<n>px'` string. Otherwise the `style` attribute of a `shape`
        tag is scanned for `width:` and `height:` entries, and the text after
        the colon is returned as-is.
        """
        sizes = el.find('./*/graphic/graphicData/pic/spPr/xfrm/ext')
        if sizes is not None and sizes.get('cx'):
            x = '%dpx' % self._convert_image_size(int(sizes.get('cx')))
            # `cy` can be missing even though `cx` is present. That case used
            # to leave `y` unbound and raise UnboundLocalError; report 0 for
            # the missing dimension instead, as documented above.
            y = 0
            cy = sizes.get('cy')
            if cy:
                y = '%dpx' % self._convert_image_size(int(cy))
            return x, y
        shape = find_first(el, 'shape')
        if shape is not None and shape.get('style') is not None:
            # If either of these are not set, rely on the method `image` to not
            # use either of them.
            x = 0
            y = 0
            styles = shape.get('style').split(';')

            for s in styles:
                if s.startswith('height:'):
                    y = s.split(':')[1]
                if s.startswith('width:'):
                    x = s.split(':')[1]
            return x, y
        return 0, 0
コード例 #8
0
ファイル: preprocessor.py プロジェクト: madevelopers/pydocx
    def perform_pre_processing(self, root, *args, **kwargs):
        """
        Run the pre-processing passes over the tree rooted at `root`.

        The passes run in a fixed order: memoization setup, parent links,
        list attributes (only when a numbering root exists), table
        attributes, the in-table flags, then the passes that work on the
        `body` element and its `p` elements. `*args` and `**kwargs` are
        accepted but not used here.
        """
        self.populate_memoization({'find_first': find_first})
        self._add_parent(root)
        # If we don't have a numbering root there cannot be any lists.
        if self.numbering_root is not None:
            self._set_list_attributes(root)
        self._set_table_attributes(root)
        self._set_is_in_table(root)

        body = find_first(root, 'body')
        self._set_next(body)
        paragraphs = list(find_all(body, 'p'))
        list_items = [p for p in paragraphs if self.is_list_item(p)]
        # Find the first and last li elements
        num_ids = set(self.num_id(item) for item in list_items)
        ilvls = set(self.ilvl(item) for item in list_items)
        self._set_first_list_item(num_ids, ilvls, list_items)
        self._set_last_list_item(num_ids, list_items)

        self._set_headers(paragraphs)
        self._convert_upper_roman(body)
コード例 #9
0
ファイル: DocxParser.py プロジェクト: mfraezz/pydocx
        def get_properties_with_no_font_size():
            """
            Resolve the properties of a copy of `el` with its `sz` tag removed.

            Works on a deep copy, so `el` itself is left untouched. Returns
            None when `properties` has no size, or when the copy has no `rPr`
            tag or no `sz` tag inside that `rPr`.
            """
            # Only set paragraph_properties if properties has a size.
            if not properties.size:
                return None

            el_without_size = copy.deepcopy(el)
            run_props = find_first(el_without_size, 'rPr')
            if run_props is None:
                return None
            size_el = find_first(run_props, 'sz')
            if size_el is None:
                return None
            run_props.remove(size_el)

            resolve = self.styles_manager.get_resolved_properties_for_element
            return resolve(el_without_size, stack)
コード例 #10
0
ファイル: DocxParser.py プロジェクト: TJtrack99/pydocx
        def get_properties_with_no_font_size():
            """
            Resolve the properties of a copy of `el` with its `sz` tag removed.

            Works on a deep copy, so `el` itself is left untouched. Returns
            None when `properties` has no size, or when the copy has no `rPr`
            tag or no `sz` tag inside that `rPr`.
            """
            # Only set paragraph_properties if properties has a size.
            if not properties.size:
                return None

            el_without_size = copy.deepcopy(el)
            run_props = find_first(el_without_size, 'rPr')
            if run_props is None:
                return None
            size_el = find_first(run_props, 'sz')
            if size_el is None:
                return None
            run_props.remove(size_el)

            resolve = self.styles_manager.get_resolved_properties_for_element
            return resolve(el_without_size, stack)
コード例 #11
0
ファイル: DocxParser.py プロジェクト: TJtrack99/pydocx
 def parse_table_cell(self, el, text, stack):
     """
     Render the table cell `el` with content `text` via `self.table_cell`.

     A cell that has a `vMerge` tag whose `val` is anything other than
     'restart' produces an empty string. The rowspan is passed on as a
     string only when it is greater than 1, otherwise as ''. `stack` is not
     used here.
     """
     v_merge = find_first(el, 'vMerge')
     if v_merge is not None:
         if v_merge.get('val', '') != 'restart':
             return ''
     colspan = self.get_colspan(el)
     span = self._get_rowspan(el, v_merge)
     rowspan = str(span) if span > 1 else ''
     return self.table_cell(text, colspan, rowspan)
コード例 #12
0
ファイル: DocxParser.py プロジェクト: mfraezz/pydocx
 def parse_table_cell(self, el, text, stack):
     """
     Render the table cell `el` with content `text` via `self.table_cell`.

     A cell that has a `vMerge` tag whose `val` is anything other than
     'restart' produces an empty string. The rowspan is passed on as a
     string only when it is greater than 1, otherwise as ''. `stack` is not
     used here.
     """
     v_merge = find_first(el, 'vMerge')
     if v_merge is not None:
         if v_merge.get('val', '') != 'restart':
             return ''
     colspan = self.get_colspan(el)
     span = self._get_rowspan(el, v_merge)
     rowspan = str(span) if span > 1 else ''
     return self.table_cell(text, colspan, rowspan)
コード例 #13
0
ファイル: preprocessor.py プロジェクト: madevelopers/pydocx
 def _set_table_attributes(self, el):
     """
     Record row/column positions and vertical-merge state for table cells.

     For every `tc` child of every `tr` child of each `tbl` found under
     `el`, `row_index` and `column_index` are stored in `self.meta_data`.
     `vmerge_continue` is additionally set to True when the cell has a
     `vMerge` tag whose `val` is 'continue' or which has no attributes.
     """
     for table in find_all(el, 'tbl'):
         rows = filter_children(table, ['tr'])
         if rows is None:
             continue
         for row_index, row in enumerate(rows):
             cells = filter_children(row, ['tc'])
             for column_index, cell in enumerate(cells):
                 self.meta_data[cell]['row_index'] = row_index
                 self.meta_data[cell]['column_index'] = column_index
                 v_merge = find_first(cell, 'vMerge')
                 if v_merge is None:
                     continue
                 is_continuation = (
                     v_merge.get('val', '') == 'continue'
                     or v_merge.attrib == {}
                 )
                 if is_continuation:
                     self.meta_data[cell]['vmerge_continue'] = True
コード例 #14
0
ファイル: preprocessor.py プロジェクト: AaronWan/pydocx
 def _set_table_attributes(self, el):
     """
     Record row/column positions and vertical-merge state for table cells.

     For every `tc` child of every `tr` child of each `tbl` found under
     `el`, `row_index` and `column_index` are stored in `self.meta_data`.
     `vmerge_continue` is additionally set to True when the cell has a
     `vMerge` tag whose `val` is 'continue' or which has no attributes.
     """
     for table in find_all(el, 'tbl'):
         rows = filter_children(table, ['tr'])
         if rows is None:
             continue
         for row_index, row in enumerate(rows):
             cells = filter_children(row, ['tc'])
             for column_index, cell in enumerate(cells):
                 self.meta_data[cell]['row_index'] = row_index
                 self.meta_data[cell]['column_index'] = column_index
                 v_merge = find_first(cell, 'vMerge')
                 if v_merge is None:
                     continue
                 is_continuation = (
                     v_merge.get('val', '') == 'continue'
                     or v_merge.attrib == {}
                 )
                 if is_continuation:
                     self.meta_data[cell]['vmerge_continue'] = True
コード例 #15
0
ファイル: preprocessor.py プロジェクト: madevelopers/pydocx
    def _generate_num_id(self, el):
        '''
        Build the NamespacedNumId for `el`.

        Fun fact: a list in the root can hold a table that holds another list,
        and both lists can share the same numId. To make sure the nested list
        is treated as a new list, its numId is namespaced with the number of
        tables it sits in. Otherwise all sorts of terrible html gets generated.
        '''
        num_id = find_first(el, 'numId').attrib['val']

        # Climb the parent chain until there is no parent left, counting the
        # `tbl` elements passed on the way. The element that has no parent is
        # not itself inspected.
        table_depth = 0
        node = el
        while self.parent(node) is not None:
            if node.tag == 'tbl':
                table_depth += 1
            node = self.parent(node)
        return NamespacedNumId(num_id=num_id, num_tables=table_depth)
コード例 #16
0
ファイル: preprocessor.py プロジェクト: AaronWan/pydocx
    def _generate_num_id(self, el):
        '''
        Build the NamespacedNumId for `el`.

        Fun fact: a list in the root can hold a table that holds another list,
        and both lists can share the same numId. To make sure the nested list
        is treated as a new list, its numId is namespaced with the number of
        tables it sits in. Otherwise all sorts of terrible html gets generated.
        '''
        num_id = find_first(el, 'numId').attrib['val']

        # Climb the parent chain until there is no parent left, counting the
        # `tbl` elements passed on the way. The element that has no parent is
        # not itself inspected.
        table_depth = 0
        node = el
        while self.parent(node) is not None:
            if node.tag == 'tbl':
                table_depth += 1
            node = self.parent(node)
        return NamespacedNumId(num_id=num_id, num_tables=table_depth)
コード例 #17
0
ファイル: preprocessor.py プロジェクト: madevelopers/pydocx
    def _set_headers(self, elements):
        """
        Flag every element in `elements` whose paragraph style is a heading.

        Run properties are first cleared on all heading styles. Each element
        using a heading style then has its list-item flags in
        `self.meta_data` set to False and its `heading_level` set to the
        matching html tag.
        """
        # Heading style name -> html tag: 'heading 1' through 'heading 10',
        # with every level past 6 capped at 'h6'.
        headers = dict(
            ('heading %d' % level, 'h%d' % min(level, 6))
            for level in range(1, 11)
        )
        # Remove the rPr from the styles dict since all the styling will be
        # done with the heading.
        for known_style in self.styles.styles:
            if known_style.name.lower() in headers:
                known_style.run_properties = None

        for element in elements:
            p_style = find_first(element, 'pStyle')
            if p_style is None:
                # This element is using the default style which is not a
                # heading.
                continue
            style_id = p_style.attrib.get('val', '')
            style = self.styles.get_styles_by_type('paragraph').get(style_id)
            if not style:
                continue
            style_name = style.name.lower()
            # Check to see if this element is actually a header.
            if style_name not in headers:
                continue
            # Set all the list item variables to false.
            self.meta_data[element]['is_list_item'] = False
            self.meta_data[element]['is_first_list_item'] = False
            self.meta_data[element]['is_last_list_item_in_root'] = False
            # Prime the heading_level
            self.meta_data[element]['heading_level'] = headers[style_name]
コード例 #18
0
ファイル: preprocessor.py プロジェクト: AaronWan/pydocx
    def _set_headers(self, elements):
        """
        Flag every element in `elements` whose paragraph style is a heading.

        Run properties are first cleared on all heading styles. Each element
        using a heading style then has its list-item flags in
        `self.meta_data` set to False and its `heading_level` set to the
        matching html tag.
        """
        # Heading style name -> html tag: 'heading 1' through 'heading 10',
        # with every level past 6 capped at 'h6'.
        headers = dict(
            ('heading %d' % level, 'h%d' % min(level, 6))
            for level in range(1, 11)
        )
        # Remove the rPr from the styles dict since all the styling will be
        # done with the heading.
        for known_style in self.styles.styles:
            if known_style.name.lower() in headers:
                known_style.run_properties = None

        for element in elements:
            p_style = find_first(element, 'pStyle')
            if p_style is None:
                # This element is using the default style which is not a
                # heading.
                continue
            style_id = p_style.attrib.get('val', '')
            style = self.styles.get_styles_by_type('paragraph').get(style_id)
            if not style:
                continue
            style_name = style.name.lower()
            # Check to see if this element is actually a header.
            if style_name not in headers:
                continue
            # Set all the list item variables to false.
            self.meta_data[element]['is_list_item'] = False
            self.meta_data[element]['is_first_list_item'] = False
            self.meta_data[element]['is_last_list_item_in_root'] = False
            # Prime the heading_level
            self.meta_data[element]['heading_level'] = headers[style_name]
コード例 #19
0
ファイル: DocxParser.py プロジェクト: mfraezz/pydocx
 def _get_page_width(self, root_element):
     """
     Return the page width in points, read from the first `pgSz` tag under
     `root_element`. None is returned when there is no such tag.
     """
     page_size = find_first(root_element, 'pgSz')
     if page_size is None:
         return None
     # pgSz is defined in twips, convert to points
     width_in_twips = int(page_size.attrib['w'])
     return width_in_twips / TWIPS_PER_POINT
コード例 #20
0
ファイル: DocxParser.py プロジェクト: TJtrack99/pydocx
 def get_colspan(self, el):
     """
     Return the `val` attribute of the first `gridSpan` tag under `el`, or
     an empty string when there is no such tag.
     """
     span_el = find_first(el, 'gridSpan')
     return '' if span_el is None else span_el.attrib['val']
コード例 #21
0
ファイル: DocxParser.py プロジェクト: mfraezz/pydocx
 def get_colspan(self, el):
     """
     Return the `val` attribute of the first `gridSpan` tag under `el`, or
     an empty string when there is no such tag.
     """
     span_el = find_first(el, 'gridSpan')
     return '' if span_el is None else span_el.attrib['val']
コード例 #22
0
ファイル: DocxParser.py プロジェクト: TJtrack99/pydocx
 def _get_page_width(self, root_element):
     """
     Return the page width in points, read from the first `pgSz` tag under
     `root_element`. None is returned when there is no such tag.
     """
     page_size = find_first(root_element, 'pgSz')
     if page_size is None:
         return None
     # pgSz is defined in twips, convert to points
     width_in_twips = int(page_size.attrib['w'])
     return width_in_twips / TWIPS_PER_POINT