Esempio n. 1
0
    def _get_image_size(self, el):
        """
        Return a ``(width, height)`` pair describing the image inside `el`.

        Two sources are tried in order:

        1. The first ``ext`` element that carries a ``cx`` attribute. Its
           ``cx`` / ``cy`` values are passed through `_convert_image_size`
           and formatted as ``"<n>px"`` strings.
        2. The ``style`` attribute of the first ``shape`` element, whose
           ``width:`` / ``height:`` declarations are returned verbatim.

        If we can't find a height or width, return 0 for whichever is not
        found, then rely on the `image` handler to strip those attributes. This
        functionality can change once we integrate PIL.
        """
        sizes = find_first(el, "ext")
        if sizes is not None and sizes.get("cx"):
            x = "%dpx" % self._convert_image_size(int(sizes.get("cx")))
            # `cy` can be missing even though `cx` is present. The previous
            # code left `y` unbound in that case and raised
            # UnboundLocalError; report 0 for the missing dimension instead.
            y = 0
            if sizes.get("cy"):
                y = "%dpx" % self._convert_image_size(int(sizes.get("cy")))
            return x, y
        shape = find_first(el, "shape")
        if shape is not None and shape.get("style") is not None:
            # If either of these are not set, rely on the method `image` to not
            # use either of them.
            x = 0
            y = 0
            for declaration in shape.get("style").split(";"):
                # Tolerate whitespace around the `;` separators.
                declaration = declaration.strip()
                if declaration.startswith("height:"):
                    y = declaration.split(":")[1]
                if declaration.startswith("width:"):
                    x = declaration.split(":")[1]
            return x, y
        return 0, 0
Esempio n. 2
0
    def _get_image_size(self, el):
        """
        Return a ``(width, height)`` pair describing the image inside `el`.

        The first ``ext`` element with a ``cx`` attribute is preferred: its
        ``cx`` / ``cy`` values go through `_convert_image_size` and come back
        as ``'<n>px'`` strings. Otherwise the ``style`` attribute of the first
        ``shape`` element is scanned for ``width:`` / ``height:`` declarations,
        whose values are returned as written.

        If we can't find a height or width, return 0 for whichever is not
        found, then rely on the `image` handler to strip those attributes. This
        functionality can change once we integrate PIL.
        """
        sizes = find_first(el, 'ext')
        if sizes is not None and sizes.get('cx'):
            x = '%dpx' % self._convert_image_size(int(sizes.get('cx')))
            # A missing `cy` used to leave `y` unbound, so the return below
            # raised UnboundLocalError. Fall back to 0 for that dimension.
            y = 0
            if sizes.get('cy'):
                y = '%dpx' % self._convert_image_size(int(sizes.get('cy')))
            return x, y
        shape = find_first(el, 'shape')
        if shape is not None and shape.get('style') is not None:
            # If either of these are not set, rely on the method `image` to not
            # use either of them.
            x = 0
            y = 0
            for declaration in shape.get('style').split(';'):
                # Tolerate whitespace around the `;` separators.
                declaration = declaration.strip()
                if declaration.startswith('height:'):
                    y = declaration.split(':')[1]
                if declaration.startswith('width:'):
                    x = declaration.split(':')[1]
            return x, y
        return 0, 0
Esempio n. 3
0
    def __init__(
            self,
            path,
            convert_root_level_upper_roman=False,
            *args,
            **kwargs):
        """
        Initialise parser state and immediately parse the document.

        `path`, `*args` and `**kwargs` are forwarded to `_build_data`;
        `convert_root_level_upper_roman` is stored on the instance unchanged.
        After the bookkeeping attributes are reset, parsing is started with
        `parse_begin(self.root)`.

        NOTE(review): `self.root` is not assigned here; presumably
        `_build_data` sets it -- confirm.
        """
        self._parsed = ''
        self.block_text = ''
        self.page_width = 0
        self.convert_root_level_upper_roman = convert_root_level_upper_roman
        self._image_data = {}
        self._build_data(path, *args, **kwargs)
        self.pre_processor = None

        # Divide by 20 to get to pt (Office works in 20th's of a point), see
        # http://msdn.microsoft.com/en-us/library/documentformat
        # .openxml.wordprocessing.indentation.aspx
        # The element is looked up once rather than searching the tree twice.
        page_size = find_first(self.root, 'pgSz')
        if page_size is not None:
            self.page_width = int(page_size.attrib['w']) / 20

        # All blank when we init.
        self.comment_store = None
        self.visited = set()
        self.list_depth = 0
        self.rels_dict = self._parse_rels_root()
        self.styles_dict = self._parse_styles()
        self.parse_begin(self.root)  # begin to parse
Esempio n. 4
0
    def _get_image_size(self, el):
        """
        Work out the ``(width, height)`` of the image contained in `el`.

        Lookup order:

        * the first ``ext`` element having a ``cx`` attribute -- ``cx`` and
          ``cy`` are run through `_convert_image_size` and returned as
          ``'<n>px'`` strings;
        * the ``style`` attribute of the first ``shape`` element -- the
          values of its ``width:`` / ``height:`` declarations are returned
          as they appear.

        If we can't find a height or width, return 0 for whichever is not
        found, then rely on the `image` handler to strip those attributes. This
        functionality can change once we integrate PIL.
        """
        sizes = find_first(el, 'ext')
        if sizes is not None and sizes.get('cx'):
            x = '%dpx' % self._convert_image_size(int(sizes.get('cx')))
            # `cy` is optional here: without this default a missing `cy`
            # left `y` unbound and the return raised UnboundLocalError.
            y = 0
            if sizes.get('cy'):
                y = '%dpx' % self._convert_image_size(int(sizes.get('cy')))
            return x, y
        shape = find_first(el, 'shape')
        if shape is not None and shape.get('style') is not None:
            # If either of these are not set, rely on the method `image` to not
            # use either of them.
            x = 0
            y = 0
            for declaration in shape.get('style').split(';'):
                # Tolerate whitespace around the `;` separators.
                declaration = declaration.strip()
                if declaration.startswith('height:'):
                    y = declaration.split(':')[1]
                if declaration.startswith('width:'):
                    x = declaration.split(':')[1]
            return x, y
        return 0, 0
Esempio n. 5
0
    def __init__(self,
                 path,
                 convert_root_level_upper_roman=False,
                 *args,
                 **kwargs):
        """
        Reset the parser's state and parse the document right away.

        `path` plus any extra positional/keyword arguments are handed to
        `_build_data`. `convert_root_level_upper_roman` is kept on the
        instance as given. The constructor finishes by calling
        `parse_begin(self.root)`.

        NOTE(review): `self.root` is read but never assigned in this method;
        presumably `_build_data` provides it -- confirm.
        """
        self._parsed = ''
        self.block_text = ''
        self.page_width = 0
        self.convert_root_level_upper_roman = convert_root_level_upper_roman
        self._image_data = {}
        self._build_data(path, *args, **kwargs)
        self.pre_processor = None

        # Divide by 20 to get to pt (Office works in 20th's of a point), see
        # http://msdn.microsoft.com/en-us/library/documentformat
        # .openxml.wordprocessing.indentation.aspx
        # Search for the element once and reuse the result.
        page_size = find_first(self.root, 'pgSz')
        if page_size is not None:
            self.page_width = int(page_size.attrib['w']) / 20

        # All blank when we init.
        self.comment_store = None
        self.visited = set()
        self.list_depth = 0
        self.rels_dict = self._parse_rels_root()
        self.styles_dict = self._parse_styles()
        self.parse_begin(self.root)  # begin to parse
Esempio n. 6
0
 def _get_image_id(self, el):
     """
     Return the identifier of the image referenced inside `el`.

     The ``embed`` attribute of the first ``blip`` element wins; failing
     that, the ``id`` attribute of the first ``imagedata`` element is
     used. ``None`` is returned when neither element is present.
     """
     drawing_blip = find_first(el, 'blip')
     if drawing_blip is None:
         # Not a drawing: fall back to the pict representation.
         pict_data = find_first(el, 'imagedata')
         if pict_data is None:
             return None
         return pict_data.get('id')
     # On drawing tags the id is whatever the blip's embed attribute holds.
     return drawing_blip.get('embed')
Esempio n. 7
0
 def _get_image_id(self, el):
     """
     Look up the image identifier for `el`.

     Drawings are checked first (``embed`` attribute of the first ``blip``
     element), then picts (``id`` attribute of the first ``imagedata``
     element). Returns ``None`` if `el` contains neither.
     """
     blip_el = find_first(el, 'blip')
     if blip_el is not None:
         # For drawings the id lives in the blip's embed attribute.
         return blip_el.get('embed')

     imagedata_el = find_first(el, 'imagedata')
     return None if imagedata_el is None else imagedata_el.get('id')
Esempio n. 8
0
 def _parse_styles(self):
     if self.styles_text is None:
         return {}
     tree = parse_xml_from_string(self.styles_text)
     styles_dict = {}
     for style in find_all(tree, "style"):
         style_val = find_first(style, "name").attrib["val"]
         run_properties = find_first(style, "rPr")
         styles_dict[style.attrib["styleId"]] = {
             "style_name": style_val,
             "default_run_properties": self._parse_run_properties(run_properties),
         }
     return styles_dict
Esempio n. 9
0
    def test_find_first(self):
        # find_first should hand back the first matching element below the
        # root, and nothing when asked for the root's own tag.
        tree = make_xml(b'<one><two><three v="1"/><three v="2"/></two></one>')

        # Can't find the root element
        self.assertEqual(find_first(tree, 'one'), None)

        # Of the two <three> siblings, the one with v="1" comes first.
        first_three = find_first(tree, 'three')
        self.assertEqual(first_three.tag, 'three')
        self.assertEqual(first_three.get('v'), '1')

        only_two = find_first(tree, 'two')
        self.assertEqual(only_two.tag, 'two')
Esempio n. 10
0
    def test_find_first(self):
        # Exercises find_first against a small three-level document.
        document = make_xml(
            b'<one><two><three v="1"/><three v="2"/></two></one>'
        )

        # Can't find the root element
        root_lookup = find_first(document, "one")
        self.assertEqual(root_lookup, None)

        # The earlier of the two <three> elements is the one returned.
        three_lookup = find_first(document, "three")
        self.assertEqual(three_lookup.tag, "three")
        self.assertEqual(three_lookup.get("v"), "1")

        two_lookup = find_first(document, "two")
        self.assertEqual(two_lookup.tag, "two")
Esempio n. 11
0
 def _parse_styles(self):
     if self.styles_text is None:
         return {}
     tree = parse_xml_from_string(self.styles_text)
     styles_dict = {}
     for style in find_all(tree, 'style'):
         style_val = find_first(style, 'name').attrib['val']
         run_properties = find_first(style, 'rPr')
         styles_dict[style.attrib['styleId']] = {
             'style_name':
             style_val,
             'default_run_properties':
             self._parse_run_properties(run_properties, ),
         }
     return styles_dict
Esempio n. 12
0
 def _parse_styles(self):
     if self.styles_text is None:
         return {}
     tree = parse_xml_from_string(self.styles_text)
     styles_dict = {}
     for style in find_all(tree, 'style'):
         style_val = find_first(style, 'name').attrib['val']
         run_properties = find_first(style, 'rPr')
         styles_dict[style.attrib['styleId']] = {
             'style_name': style_val,
             'default_run_properties': self._parse_run_properties(
                 run_properties,
             ),
         }
     return styles_dict
Esempio n. 13
0
 def _parse_styles(self):
     styles_part = self.document.main_document_part.style_definitions_part
     if not styles_part:
         return {}
     styles_root = styles_part.root_element
     styles_dict = {}
     for style in find_all(styles_root, 'style'):
         style_val = find_first(style, 'name').attrib['val']
         run_properties = find_first(style, 'rPr')
         styles_dict[style.attrib['styleId']] = {
             'style_name':
             style_val,
             'default_run_properties':
             self._parse_run_properties(run_properties, ),
         }
     return styles_dict
Esempio n. 14
0
 def _parse_styles(self):
     styles_part = self.document.main_document_part.style_definitions_part
     if not styles_part:
         return {}
     styles_root = styles_part.root_element
     styles_dict = {}
     for style in find_all(styles_root, 'style'):
         style_val = find_first(style, 'name').attrib['val']
         run_properties = find_first(style, 'rPr')
         styles_dict[style.attrib['styleId']] = {
             'style_name': style_val,
             'default_run_properties': self._parse_run_properties(
                 run_properties,
             ),
         }
     return styles_dict
Esempio n. 15
0
 def parse_table_cell(self, el, text):
     """
     Render a table cell via `table_cell`, or return '' for skipped cells.

     A cell is skipped when it has a ``vMerge`` element whose ``val`` is
     anything other than ``'restart'``. Otherwise the colspan from
     `get_colspan` and the rowspan from `_get_rowspan` are passed along;
     the rowspan is sent as a string when greater than 1 and as '' when
     not.
     """
     merge_marker = find_first(el, 'vMerge')
     if merge_marker is not None:
         if merge_marker.get('val', '') != 'restart':
             return ''
     colspan = self.get_colspan(el)
     span = self._get_rowspan(el, merge_marker)
     rowspan = str(span) if span > 1 else ''
     return self.table_cell(text, colspan, rowspan)
Esempio n. 16
0
 def parse_table_cell(self, el, text):
     """
     Produce the output for one table cell.

     Returns "" when the cell carries a ``vMerge`` element whose ``val``
     is not ``"restart"``. In every other case the call is delegated to
     `table_cell` with the cell's colspan and rowspan, where a rowspan of
     1 or less is passed as "".
     """
     v_merge_el = find_first(el, "vMerge")
     skip_cell = (
         v_merge_el is not None
         and v_merge_el.get("val", "") != "restart"
     )
     if skip_cell:
         return ""
     col_count = self.get_colspan(el)
     row_count = self._get_rowspan(el, v_merge_el)
     if row_count > 1:
         row_attr = str(row_count)
     else:
         row_attr = ""
     return self.table_cell(text, col_count, row_attr)
Esempio n. 17
0
 def parse_table_cell(self, el, text):
     """
     Convert a table cell element into its rendered form.

     Cells with a ``vMerge`` element that is not marked ``'restart'``
     produce ''. All other cells are rendered by `table_cell`, which
     receives the text, the colspan and the rowspan. The rowspan is
     stringified when it exceeds 1 and is '' otherwise.
     """
     vertical_merge = find_first(el, 'vMerge')
     if vertical_merge is None:
         merged_away = False
     else:
         merged_away = vertical_merge.get('val', '') != 'restart'
     if merged_away:
         return ''
     colspan = self.get_colspan(el)
     rows = self._get_rowspan(el, vertical_merge)
     rowspan = '' if not rows > 1 else str(rows)
     return self.table_cell(text, colspan, rowspan)
Esempio n. 18
0
    def _load(self):
        """
        Open the document at ``self.path``, cache its key parts and parse it.

        Sets ``document``, ``root_element``, ``numbering_root`` and
        ``styles_dict`` on the instance, and ``page_width`` when a ``pgSz``
        element is found, then calls `parse_begin` on the root element.

        Raises MalformedDocxException when the document has no main
        document part.
        """
        self.document = WordprocessingDocument(path=self.path)
        main_part = self.document.main_document_part
        if main_part is None:
            raise MalformedDocxException

        self.root_element = main_part.root_element

        # The numbering root stays None when the numbering part is falsy.
        self.numbering_root = None
        numbering = main_part.numbering_definitions_part
        if numbering:
            self.numbering_root = numbering.root_element

        page_size = find_first(self.root_element, 'pgSz')
        if page_size is not None:
            # pgSz is defined in twips, convert to points
            width_in_twips = int(page_size.attrib['w'])
            self.page_width = width_in_twips / TWIPS_PER_POINT

        self.styles_dict = self._parse_styles()
        self.parse_begin(self.root_element)
Esempio n. 19
0
    def _load(self):
        """
        Load the document found at ``self.path`` and start parsing.

        The main document part supplies ``root_element``; if it is missing
        a MalformedDocxException is raised. ``numbering_root`` stays None
        when the numbering definitions part is falsy, and ``page_width`` is
        only assigned here when the root contains a ``pgSz`` element.
        """
        self.document = WordprocessingDocument(path=self.path)
        document_part = self.document.main_document_part
        if document_part is None:
            raise MalformedDocxException

        self.root_element = document_part.root_element
        self.numbering_root = None
        numbering_definitions = document_part.numbering_definitions_part
        if numbering_definitions:
            self.numbering_root = numbering_definitions.root_element

        size_element = find_first(self.root_element, 'pgSz')
        if size_element is not None:
            # pgSz is defined in twips, convert to points
            twips = int(size_element.attrib['w'])
            self.page_width = twips / TWIPS_PER_POINT

        self.styles_dict = self._parse_styles()
        self.parse_begin(self.root_element)
Esempio n. 20
0
    def parse_r(self, el, parsed):
        """
        Parse the running text.

        `parsed` is the already-rendered text of the run `el`. If it is
        empty, '' is returned at once. Otherwise the run properties are
        collected -- the defaults of the enclosing paragraph's style first,
        then the run's own ``rPr`` overrides -- and the handler for every
        enabled property is applied to the text, which is then returned.
        """
        text = parsed
        if not text:
            return ''

        run_properties = {}

        # Get the rPr for the current style, they are the defaults.
        p = find_ancestor_with_tag(self.pre_processor, el, 'p')
        paragraph_style = find_first(p, 'pStyle')
        if paragraph_style is not None:
            style = paragraph_style.get('val')
            style_defaults = self.styles_dict.get(style, {})
            run_properties.update(
                style_defaults.get('default_run_properties', {}),
            )

        # Get the rPr for the current r tag, they are overrides.
        run_properties_element = el.find('rPr')
        # Compare against None explicitly: the truth value of an element
        # reflects whether it has children, and relying on it is deprecated.
        if run_properties_element is not None:
            local_run_properties = self._parse_run_properties(
                run_properties_element,
            )
            run_properties.update(local_run_properties)

        inline_tag_handlers = {
            'b': self.bold,
            'i': self.italics,
            'u': self.underline,
            'caps': self.caps,
            'smallCaps': self.small_caps,
            'strike': self.strike,
            'dstrike': self.strike,
            'vanish': self.hide,
            'webHidden': self.hide,
        }
        styles_needing_application = []
        for property_name, property_value in run_properties.items():
            # These tags are a little different, handle them separately
            # from the rest.
            # This could be a superscript or a subscript
            if property_name == 'vertAlign':
                if property_value == 'superscript':
                    styles_needing_application.append(self.superscript)
                elif property_value == 'subscript':
                    styles_needing_application.append(self.subscript)
            elif (
                    property_name in inline_tag_handlers and
                    self._is_style_on(property_value)
            ):
                styles_needing_application.append(
                    inline_tag_handlers[property_name],
                )

        # Apply all the handlers.
        for func in styles_needing_application:
            text = func(text)

        return text
Esempio n. 21
0
 def get_colspan(self, el):
     """
     Return the ``val`` attribute of the first ``gridSpan`` element found
     in `el`, or an empty string when there is none.
     """
     span_element = find_first(el, "gridSpan")
     if span_element is not None:
         return span_element.attrib["val"]
     return ""
Esempio n. 22
0
 def get_colspan(self, el):
     """
     Look up the column span of `el`.

     The value is the ``val`` attribute of the first ``gridSpan`` element;
     '' is returned when no such element exists.
     """
     span_el = find_first(el, 'gridSpan')
     return '' if span_el is None else span_el.attrib['val']
Esempio n. 23
0
 def get_colspan(self, el):
     """
     Give the ``val`` of `el`'s first ``gridSpan`` element as the colspan,
     falling back to '' if `el` has no ``gridSpan``.
     """
     grid_span_el = find_first(el, 'gridSpan')
     has_span = grid_span_el is not None
     if not has_span:
         return ''
     return grid_span_el.attrib['val']