def read_properties_core(xml_source):
    """Build a DocumentProperties object from the core-properties XML.

    Elements that are absent fall back to defaults: an empty string for the
    creator and the last-modified-by name, the current time for the creation
    date, and the creation date for the modification date.
    """
    properties = DocumentProperties()
    root = fromstring(xml_source)

    def lookup(prefix, tag):
        # Find a namespaced child of the root; None when it is absent.
        return root.find(QName(NAMESPACES[prefix], tag).text)

    creator = lookup('dc', 'creator')
    properties.creator = '' if creator is None else creator.text

    editor = lookup('cp', 'lastModifiedBy')
    properties.last_modified_by = '' if editor is None else editor.text

    created = lookup('dcterms', 'created')
    if created is None:
        properties.created = datetime.datetime.now()
    else:
        properties.created = W3CDTF_to_datetime(created.text)

    modified = lookup('dcterms', 'modified')
    if modified is None:
        # No explicit modification date: reuse the creation date.
        properties.modified = properties.created
    else:
        properties.modified = W3CDTF_to_datetime(modified.text)

    return properties
def get_text(xmlns, rich_node):
    """Return the text of the ``t`` child of *rich_node*, without formatting.

    Surrounding whitespace is stripped unless the text element carries the
    ``xml:space="preserve"`` attribute.  A node that has no ``t`` child, or
    whose ``t`` child is empty, yields an empty string.
    """
    text_node = rich_node.find(QName(xmlns, 't').text)
    if text_node is None:
        # Nothing to read: avoid an AttributeError on the missing element.
        return unicode('')
    partial_text = text_node.text or unicode('')
    if text_node.get(QName(NAMESPACES['xml'], 'space').text) != 'preserve':
        partial_text = partial_text.strip()
    return unicode(partial_text)
def parse_custom_num_formats(root, xmlns):
    """Collect the custom number formats declared in the shared style table.

    Returns a dict mapping each integer ``numFmtId`` to its lower-cased
    ``formatCode``.  The dict is empty when there is no ``numFmts`` element.
    """
    formats = {}
    container = root.find(QName(xmlns, 'numFmts').text)
    if container is None:
        return formats

    for entry in container.findall(QName(xmlns, 'numFmt').text):
        code = entry.get('formatCode').lower()
        formats[int(entry.get('numFmtId'))] = code
    return formats
def parse_fonts(root, xmlns, color_index):
    """Read the fonts declared in the style sheet.

    *root* is the parsed style sheet, *xmlns* the namespace its elements
    live in, and *color_index* the list of indexed colors used to resolve
    ``indexed`` color references.  Returns a list with one Font per ``font``
    element, in document order; the list is empty when there is no ``fonts``
    element.
    """
    font_list = []
    fonts = root.find(QName(xmlns, 'fonts').text)
    if fonts is None:
        return font_list

    for font_node in fonts.findall(QName(xmlns, 'font').text):
        font = Font()

        # Guard against missing ``sz`` / ``name`` children instead of
        # failing with an AttributeError; the Font keeps its initial value.
        size_node = font_node.find(QName(xmlns, 'sz').text)
        if size_node is not None:
            font.size = size_node.get('val')
        name_node = font_node.find(QName(xmlns, 'name').text)
        if name_node is not None:
            font.name = name_node.get('val')

        # NOTE(review): only the presence of the element is checked; a
        # ``val`` attribute on ``b`` / ``i`` is not inspected.
        font.bold = font_node.find(QName(xmlns, 'b').text) is not None
        font.italic = font_node.find(QName(xmlns, 'i').text) is not None

        underline_node = font_node.find(QName(xmlns, 'u').text)
        if underline_node is not None:
            # An underline element without a value means 'single'.
            font.underline = underline_node.get('val') or 'single'

        color = font_node.find(QName(xmlns, 'color').text)
        if color is not None:
            indexed = color.get('indexed')
            theme = color.get('theme')
            if indexed is not None and 0 <= int(indexed) < len(color_index):
                font.color.index = color_index[int(indexed)]
            elif theme is not None:
                # prefix color with theme
                tint = color.get('tint')
                if tint is not None:
                    font.color.index = 'theme:%s:%s' % (theme, tint)
                else:
                    font.color.index = 'theme:%s:' % theme
            elif color.get('rgb'):
                font.color.index = color.get('rgb')

        font_list.append(font)
    return font_list
def parse_color_index(root, xmlns):
    """Return the ``rgb`` values of the indexed colors, in document order.

    The result is an empty list when the style sheet has no ``colors``
    element or no ``indexedColors`` element inside it.
    """
    colors = root.find(QName(xmlns, 'colors').text)
    if colors is None:
        return []

    indexed = colors.find(QName(xmlns, 'indexedColors').text)
    if indexed is None:
        return []

    rgb_tag = QName(xmlns, 'rgbColor').text
    return [node.get('rgb') for node in indexed.findall(rgb_tag)]
def parse_fills(root, xmlns, color_index):
    """Read the fills declared in the style sheet.

    *root* is the parsed style sheet, *xmlns* the namespace its elements
    live in, and *color_index* the list of indexed colors used to resolve
    ``indexed`` color references.  Returns a list with one Fill per ``fill``
    element, in document order; the list is empty when there is no ``fills``
    element.
    """
    fill_list = []
    fills = root.find(QName(xmlns, 'fills').text)
    if fills is None:
        return fill_list

    # Foreground and background colors are resolved by the same rules.
    color_targets = (('fgColor', 'start_color'), ('bgColor', 'end_color'))

    for fill_node in fills.findall(QName(xmlns, 'fill').text):
        new_fill = Fill()
        # Rotation is unset
        pattern_fill = fill_node.find(QName(xmlns, 'patternFill').text)
        if pattern_fill is not None:
            new_fill.fill_type = pattern_fill.get('patternType')
            for tag, target in color_targets:
                color = pattern_fill.find(QName(xmlns, tag).text)
                if color is None:
                    continue
                indexed = color.get('indexed')
                theme = color.get('theme')
                if indexed is not None and 0 <= int(indexed) < len(color_index):
                    resolved = color_index[int(indexed)]
                elif theme is not None:
                    # prefix color with theme
                    tint = color.get('tint')
                    if tint is not None:
                        resolved = 'theme:%s:%s' % (theme, tint)
                    else:
                        resolved = 'theme:%s:' % theme
                elif color.get('rgb'):
                    resolved = color.get('rgb')
                else:
                    # No usable color information: keep the Fill's initial
                    # color rather than overwriting its index with None.
                    continue
                getattr(new_fill, target).index = resolved
        fill_list.append(new_fill)
    return fill_list
def read_sheets_titles(xml_source):
    """Read titles for all sheets.

    Returns the ``name`` attribute of every child of the workbook's
    ``sheets`` element, in document order.
    """
    root = fromstring(xml_source)
    titles_root = root.find(
        QName('http://schemas.openxmlformats.org/spreadsheetml/2006/main',
              'sheets').text)
    # Iterate the element directly: Element.getchildren() is deprecated and
    # no longer exists in recent versions of xml.etree.ElementTree.
    return [sheet.get('name') for sheet in titles_root]
def read_excel_base_date(xml_source):
    """Return the calendar constant selected by the workbook XML.

    ``CALENDAR_MAC_1904`` is returned when the ``workbookPr`` element has a
    ``date1904`` attribute equal to ``'1'`` or ``'true'``; in every other
    case, including a workbook with no ``workbookPr`` element at all,
    ``CALENDAR_WINDOWS_1900`` is returned.
    """
    root = fromstring(text=xml_source)
    wbPr = root.find(
        QName('http://schemas.openxmlformats.org/spreadsheetml/2006/main',
              'workbookPr').text)
    # Guard against a missing element instead of failing on None.
    if wbPr is not None and wbPr.get('date1904') in ('1', 'true'):
        return CALENDAR_MAC_1904
    return CALENDAR_WINDOWS_1900
def parse_borders(root, xmlns, color_index):
    """Read the borders declared in the style sheet.

    *root* is the parsed style sheet, *xmlns* the namespace its elements
    live in, and *color_index* the list of indexed colors used to resolve
    ``indexed`` color references.  Returns a list with one Borders object
    per ``border`` element, in document order; the list is empty when there
    is no ``borders`` element.
    """
    border_list = []
    borders = root.find(QName(xmlns, 'borders').text)
    if borders is None:
        return border_list

    enabled = ('1', 'true')
    for border_node in borders.findall(QName(xmlns, 'border').text):
        new_border = Borders()

        # XML attribute names are case-sensitive and their values are
        # strings, so test 'diagonalUp' / 'diagonalDown' against the
        # textual boolean forms rather than the integer 1.
        diagonal_up = border_node.get('diagonalUp') in enabled
        diagonal_down = border_node.get('diagonalDown') in enabled
        if diagonal_up and diagonal_down:
            new_border.diagonal_direction = new_border.DIAGONAL_BOTH
        elif diagonal_up:
            new_border.diagonal_direction = new_border.DIAGONAL_UP
        elif diagonal_down:
            new_border.diagonal_direction = new_border.DIAGONAL_DOWN

        for side in ('left', 'right', 'top', 'bottom', 'diagonal'):
            node = border_node.find(QName(xmlns, side).text)
            if node is None:
                continue
            border_side = getattr(new_border, side)
            if node.get('style') is not None:
                border_side.border_style = node.get('style')
            color = node.find(QName(xmlns, 'color').text)
            if color is None:
                continue
            # Ignore 'auto'
            indexed = color.get('indexed')
            theme = color.get('theme')
            if indexed is not None and 0 <= int(indexed) < len(color_index):
                border_side.color.index = color_index[int(indexed)]
            elif theme is not None:
                # prefix color with theme
                tint = color.get('tint')
                if tint is not None:
                    border_side.color.index = 'theme:%s:%s' % (theme, tint)
                else:
                    border_side.color.index = 'theme:%s:' % theme
            elif color.get('rgb'):
                border_side.color.index = color.get('rgb')

        border_list.append(new_border)
    return border_list
def get_string(xmlns, string_index_node):
    """Return the text held by one string-index node.

    When the node contains rich-text runs (``r`` children), the text of
    every run is concatenated in document order; otherwise the text is read
    from the node itself.
    """
    runs = string_index_node.findall(QName(xmlns, 'r').text)
    if not runs:
        return get_text(xmlns, string_index_node)

    pieces = [get_text(xmlns, run) for run in runs]
    return unicode(''.join(pieces))
def read_named_ranges(xml_source, workbook):
    """Read named ranges, excluding poorly defined ranges.

    Hidden names, names containing an entry of ``DISCARDED_RANGES`` and
    definitions containing an entry of ``BUGGY_NAMED_RANGES`` are skipped.
    Returns a list of NamedRange / NamedRangeContainingValue objects; the
    list is empty when the workbook has no ``definedNames`` element.
    """
    named_ranges = []
    root = fromstring(xml_source)
    names_root = root.find(
        QName('http://schemas.openxmlformats.org/spreadsheetml/2006/main',
              'definedNames').text)
    if names_root is None:
        return named_ranges

    # Iterate the element directly: Element.getchildren() is deprecated and
    # no longer exists in recent versions of xml.etree.ElementTree.
    for name_node in names_root:
        range_name = name_node.get('name')
        if name_node.get("hidden", '0') == '1':
            continue

        definition = name_node.text
        if any(discarded in range_name for discarded in DISCARDED_RANGES):
            continue
        if any(bad in definition for bad in BUGGY_NAMED_RANGES):
            continue

        if refers_to_range(definition):
            destinations = []
            for sheet_title, cells_range in split_named_range(definition):
                # it can happen that a valid named range references
                # a missing worksheet, when Excel didn't properly maintain
                # the named range list
                #
                # we just ignore them here
                worksheet = workbook.get_sheet_by_name(sheet_title)
                if worksheet:
                    destinations.append((worksheet, cells_range))
            named_range = NamedRange(range_name, destinations)
        else:
            named_range = NamedRangeContainingValue(range_name, definition)

        # A local sheet id limits the name's scope to that worksheet.
        location_id = name_node.get("localSheetId")
        if location_id:
            named_range.scope = workbook.worksheets[int(location_id)]

        named_ranges.append(named_range)
    return named_ranges
def read_string_table(xml_source):
    """Build the shared-string table from its XML source.

    Returns a dict mapping the zero-based position of each ``si`` element
    found by ``root.findall`` to its text.
    """
    xmlns = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'
    root = fromstring(text=xml_source)
    entries = root.findall(QName(xmlns, 'si').text)
    # Removing 'x005F_' fixes the XML escaping sequence for '_x'.
    return dict(
        (position, get_string(xmlns, entry).replace('x005F_', ''))
        for position, entry in enumerate(entries))
def read_style_table(xml_source):
    """Read styles from the shared style table.

    Returns a dict mapping the position of each ``xf`` element inside
    ``cellXfs`` to a Style built from it.  The dict is empty when there is
    no ``cellXfs`` element.

    Raises MissingNumberFormat when an ``xf`` refers to a number format id
    of 164 or above that is not declared among the custom number formats.
    """
    table = {}
    xmlns = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'
    root = fromstring(xml_source)
    custom_num_formats = parse_custom_num_formats(root, xmlns)
    color_index = parse_color_index(root, xmlns)
    font_list = parse_fonts(root, xmlns, color_index)
    fill_list = parse_fills(root, xmlns, color_index)
    border_list = parse_borders(root, xmlns, color_index)
    builtin_formats = NumberFormat._BUILTIN_FORMATS

    cell_xfs = root.find(QName(xmlns, 'cellXfs').text)
    if cell_xfs is None:
        # can happen on bad OOXML writers (e.g. Gnumeric)
        return table

    # Textual forms of an enabled boolean attribute.
    enabled = ('1', 'true')
    border_sides = ('left', 'right', 'top', 'bottom', 'diagonal')

    cell_xfs_nodes = cell_xfs.findall(QName(xmlns, 'xf').text)
    for index, cell_xfs_node in enumerate(cell_xfs_nodes):
        new_style = Style()

        number_format_id = int(cell_xfs_node.get('numFmtId'))
        if number_format_id < 164:
            # Ids below 164 are looked up in the built-in table.
            new_style.number_format.format_code = \
                builtin_formats.get(number_format_id, 'General')
        elif number_format_id in custom_num_formats:
            new_style.number_format.format_code = \
                custom_num_formats[number_format_id]
        else:
            raise MissingNumberFormat('%s' % number_format_id)

        if cell_xfs_node.get('applyAlignment') == '1':
            alignment = cell_xfs_node.find(QName(xmlns, 'alignment').text)
            if alignment is not None:
                if alignment.get('horizontal') is not None:
                    new_style.alignment.horizontal = alignment.get(
                        'horizontal')
                if alignment.get('vertical') is not None:
                    new_style.alignment.vertical = alignment.get('vertical')
                # Compare against the enabled values: a plain truthiness
                # test would also accept the non-empty string '0'.
                if alignment.get('wrapText') in enabled:
                    new_style.alignment.wrap_text = True
                if alignment.get('shrinkToFit') in enabled:
                    new_style.alignment.shrink_to_fit = True
                if alignment.get('indent') is not None:
                    new_style.alignment.indent = int(alignment.get('indent'))
                if alignment.get('textRotation') is not None:
                    new_style.alignment.text_rotation = int(
                        alignment.get('textRotation'))
                # ignore justifyLastLine option when horizontal = distributed

        if cell_xfs_node.get('applyFont') == '1':
            source_font = font_list[int(cell_xfs_node.get('fontId'))]
            new_style.font = deepcopy(source_font)
            new_style.font.color = deepcopy(source_font.color)

        if cell_xfs_node.get('applyFill') == '1':
            source_fill = fill_list[int(cell_xfs_node.get('fillId'))]
            new_style.fill = deepcopy(source_fill)
            new_style.fill.start_color = deepcopy(source_fill.start_color)
            new_style.fill.end_color = deepcopy(source_fill.end_color)

        if cell_xfs_node.get('applyBorder') == '1':
            source_borders = border_list[int(cell_xfs_node.get('borderId'))]
            new_style.borders = deepcopy(source_borders)
            for side in border_sides:
                source_side = getattr(source_borders, side)
                setattr(new_style.borders, side, deepcopy(source_side))
                getattr(new_style.borders, side).color = deepcopy(
                    source_side.color)

        if cell_xfs_node.get('applyProtection') == '1':
            protection = cell_xfs_node.find(QName(xmlns, 'protection').text)
            # Ignore if there are no protection sub-nodes
            if protection is not None:
                if protection.get('locked') is not None:
                    if protection.get('locked') == '1':
                        new_style.protection.locked = \
                            Protection.PROTECTION_PROTECTED
                    else:
                        new_style.protection.locked = \
                            Protection.PROTECTION_UNPROTECTED
                if protection.get('hidden') is not None:
                    if protection.get('hidden') == '1':
                        new_style.protection.hidden = \
                            Protection.PROTECTION_PROTECTED
                    else:
                        new_style.protection.hidden = \
                            Protection.PROTECTION_UNPROTECTED

        table[index] = new_style
    return table
def fast_parse(ws, xml_source, string_table, style_table):
    """Populate worksheet *ws* from the worksheet XML in *xml_source*.

    Reads, in this order: merged cells, cell values and styles, column
    dimensions, print options, page margins, page setup and the odd
    header / footer.  *string_table* resolves shared-string indexes for
    cells of type ``Cell.TYPE_STRING`` and *style_table* resolves the style
    index of each cell; both are queried with ``.get(int(...))``.
    """
    xmlns = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'
    root = fromstring(xml_source)

    merge_cells = root.find(QName(xmlns, 'mergeCells').text)
    if merge_cells is not None:
        for merge_cell in merge_cells.findall(QName(xmlns, 'mergeCell').text):
            ws.merge_cells(merge_cell.get('ref'))

    # Cells are read through an incremental parser over the same source.
    source = _get_xml_iter(xml_source)
    value_tag = QName(xmlns, 'v').text
    for _event, element in filter(filter_cells, iterparse(source)):
        value = element.findtext(value_tag)
        coordinate = element.get('r')
        style_id = element.get('s')
        if style_id is not None:
            ws._styles[coordinate] = style_table.get(int(style_id))
        if value is not None:
            data_type = element.get('t', 'n')
            if data_type == Cell.TYPE_STRING:
                value = string_table.get(int(value))
            ws.cell(coordinate).value = value
        # to avoid memory exhaustion, clear the item after use
        element.clear()

    cols = root.find(QName(xmlns, 'cols').text)
    if cols is not None:
        for col in cols.findall(QName(xmlns, 'col').text):
            # One ``col`` element describes the inclusive column range
            # first_col..last_col; each bound defaults to 1 when absent.
            first_col = int(col.get('min')) if col.get('min') else 1
            last_col = int(col.get('max')) if col.get('max') else 1
            for col_id in range(first_col, last_col + 1):
                column = get_column_letter(col_id)
                if column not in ws.column_dimensions:
                    ws.column_dimensions[column] = ColumnDimension(column)
                dimension = ws.column_dimensions[column]
                if col.get('width') is not None:
                    dimension.width = float(col.get('width'))
                if col.get('bestFit') == '1':
                    dimension.auto_size = True
                if col.get('hidden') == '1':
                    dimension.visible = False
                if col.get('outlineLevel') is not None:
                    dimension.outline_level = int(col.get('outlineLevel'))
                if col.get('collapsed') == '1':
                    dimension.collapsed = True
                if col.get('style') is not None:
                    dimension.style_index = col.get('style')

    print_options = root.find(QName(xmlns, 'printOptions').text)
    if print_options is not None:
        for attribute in ('horizontalCentered', 'verticalCentered'):
            if print_options.get(attribute) is not None:
                setattr(ws.page_setup, attribute,
                        print_options.get(attribute))

    page_margins = root.find(QName(xmlns, 'pageMargins').text)
    if page_margins is not None:
        for attribute in ('left', 'right', 'top', 'bottom',
                          'header', 'footer'):
            if page_margins.get(attribute) is not None:
                setattr(ws.page_margins, attribute,
                        float(page_margins.get(attribute)))

    page_setup = root.find(QName(xmlns, 'pageSetup').text)
    if page_setup is not None:
        # Every attribute is copied verbatim (as a string) onto the
        # page-setup attribute of the same name; 'scale' included, which
        # was previously stored under the unrelated name 'top'.
        for attribute in ('orientation', 'paperSize', 'scale', 'fitToPage',
                          'fitToHeight', 'fitToWidth', 'firstPageNumber',
                          'useFirstPageNumber'):
            if page_setup.get(attribute) is not None:
                setattr(ws.page_setup, attribute, page_setup.get(attribute))

    header_footer = root.find(QName(xmlns, 'headerFooter').text)
    if header_footer is not None:
        odd_header = header_footer.find(QName(xmlns, 'oddHeader').text)
        if odd_header is not None:
            ws.header_footer.setHeader(odd_header.text)
        odd_footer = header_footer.find(QName(xmlns, 'oddFooter').text)
        if odd_footer is not None:
            ws.header_footer.setFooter(odd_footer.text)