def read_in_styles(styles, content, transclusions):
    """Collect page geometry, header/footer and all styles into a `DocStys`.

    `styles` and `content` are element trees that are searched with
    `.find`; presumably the parsed styles.xml and content.xml of an ODT
    file (TODO confirm against the caller). `transclusions` is handed
    through unchanged to `parse_body`.

    The returned `DocStys` has `textwidth` (page width minus left/right
    margins and paddings, in cm), `header` and `footer` set; `header` and
    `footer` are `False` when the master page does not define them.
    """
    doc_styles = DocStys()

    # Page properties
    master_page = ns.office('master-styles/') + ns.style('master-page/')
    raw_header = styles.find(master_page + ns.style('header'))
    raw_footer = styles.find(master_page + ns.style('footer'))
    page_layout = styles.find(
        ns.office('automatic-styles/') + ns.style('page-layout/'))

    def layout_cm(key):
        # missing layout attributes count as zero width
        return in_cm(page_layout.get(ns.fo(key), '0cm'))

    page_width = layout_cm('page-width')
    lmargin = layout_cm('margin-left')
    rmargin = layout_cm('margin-right')
    lpad = layout_cm('padding-left')
    rpad = layout_cm('padding-right')
    doc_styles.textwidth = float(
        page_width - (lmargin + rmargin + lpad + rpad))
    log.warn('Textwidth: %fcm', doc_styles.textwidth)

    # the actual meat: common styles first, then the automatic ones
    for section in ('styles', 'automatic-styles'):
        for stylebit in styles.find(ns.office(section)):
            doc_styles.add_odt_style(stylebit)

    def parse_page_part(raw):
        if raw is None:
            return False
        return list(parse_body(raw, ParseContext(doc_styles), transclusions))

    # both parts are parsed before either attribute is assigned
    header, footer = parse_page_part(raw_header), parse_page_part(raw_footer)
    doc_styles.header, doc_styles.footer = header, footer

    # NB: the parsing of the content styles has to come *after* we've got the
    # header and footer, because they'll clobber the automatic styles defined
    # in styles.xml (I'm not sure if that's spec conformant), which apply to
    # the header and footer text
    for stylebit in content.find(ns.office('automatic-styles')):
        doc_styles.add_odt_style(stylebit)
    return doc_styles
def read_in_styles(styles, content, transclusions):
    """Collect page geometry, header/footer and all styles into a `DocStys`.

    `styles` and `content` are element trees that are searched with
    `.find`; presumably the parsed styles.xml and content.xml of an ODT
    file (TODO confirm against the caller). `transclusions` is handed
    through unchanged to `parse_body`.

    The returned `DocStys` has `textwidth` (page width minus left/right
    margins and paddings, in cm), `header` and `footer` set; `header` and
    `footer` are `False` when the master page does not define them.
    """
    doc_styles = DocStys()

    # Page properties
    master_page = ns.office('master-styles/') + ns.style('master-page/')
    raw_header = styles.find(master_page + ns.style('header'))
    raw_footer = styles.find(master_page + ns.style('footer'))
    page_layout = styles.find(
        ns.office('automatic-styles/') + ns.style('page-layout/'))

    def layout_cm(key):
        # missing layout attributes count as zero width
        return in_cm(page_layout.get(ns.fo(key), '0cm'))

    page_width = layout_cm('page-width')
    lmargin = layout_cm('margin-left')
    rmargin = layout_cm('margin-right')
    lpad = layout_cm('padding-left')
    rpad = layout_cm('padding-right')
    doc_styles.textwidth = float(
        page_width - (lmargin + rmargin + lpad + rpad))
    log.warn('Textwidth: %fcm', doc_styles.textwidth)

    # the actual meat: common styles first, then the automatic ones
    for section in ('styles', 'automatic-styles'):
        for stylebit in styles.find(ns.office(section)):
            doc_styles.add_odt_style(stylebit)

    def parse_page_part(raw):
        if raw is None:
            return False
        return list(parse_body(raw, ParseContext(doc_styles), transclusions))

    # both parts are parsed before either attribute is assigned
    header, footer = parse_page_part(raw_header), parse_page_part(raw_footer)
    doc_styles.header, doc_styles.footer = header, footer

    # NB: the parsing of the content styles has to come *after* we've got the
    # header and footer, because they'll clobber the automatic styles defined
    # in styles.xml (I'm not sure if that's spec conformant), which apply to
    # the header and footer text
    for stylebit in content.find(ns.office('automatic-styles')):
        doc_styles.add_odt_style(stylebit)
    return doc_styles
def style_id(style):
    """Return the identifying `(family, name)` pair of a style element.

    According to OpenDocument-v1.1.pdf, p.480, a style is identified by
    (style:family, style:name):

        The style:name attribute identifies the name of the style. This
        attribute, combined with the style:family attribute, uniquely
        identifies a style. The <office:styles>, <office:automatic-styles>
        and <office:master-styles> elements each must not contain two
        styles with the same family and the same name.

    Google docs seems to interpret this differently: it refers to
    ('text', 'Standard') when only ('paragraph', 'Standard') exists, even
    though a parent style cannot be an automatic style and has to exist.
    Well, tough.

    `text:list-style` elements get a family of `None`; every other element
    must carry a style:family attribute (`KeyError` otherwise).
    """
    is_list_style = style.tag == ns.text('list-style')
    family = None if is_list_style else style.attrib[ns.style('family')]
    #FIXME verify name defaults to display-name
    name = style.get(ns.style('name')) or style.get(ns.style('display-name'))
    return (family, name)
def style_id(style):
    """Return the identifying `(family, name)` pair of a style element.

    According to OpenDocument-v1.1.pdf, p.480, a style is identified by
    (style:family, style:name):

        The style:name attribute identifies the name of the style. This
        attribute, combined with the style:family attribute, uniquely
        identifies a style. The <office:styles>, <office:automatic-styles>
        and <office:master-styles> elements each must not contain two
        styles with the same family and the same name.

    Google docs seems to interpret this differently: it refers to
    ('text', 'Standard') when only ('paragraph', 'Standard') exists, even
    though a parent style cannot be an automatic style and has to exist.
    Well, tough.

    `text:list-style` elements get a family of `None`; every other element
    must carry a style:family attribute (`KeyError` otherwise).
    """
    is_list_style = style.tag == ns.text('list-style')
    family = None if is_list_style else style.attrib[ns.style('family')]
    #FIXME verify name defaults to display-name
    name = style.get(ns.style('name')) or style.get(ns.style('display-name'))
    return (family, name)
def ensure_minimal_styles(styles, required_styles):
    """Append every style in `required_styles` that `styles` lacks.

    `styles` is searched for its `office:styles` child; for each name in
    `required_styles` that has no `style:style` element with a matching
    `style:name` there, a definition is generated and appended in place.

    The definition comes from `ODT_MINIMAL_STYLES` or, failing that, from
    `create_color_style` applied to the groups `COLOR_SPAN_REX` extracts
    from the name. NOTE(review): a name that is in neither will presumably
    fail with an AttributeError on the `None` match result -- confirm that
    callers only pass known names.
    """
    common_styles = styles.find(ns.office('styles'))
    for name in required_styles:
        existing = common_styles.find(
            ns.style('style') + '[@' + ns.style('name') + '="%s"]' % name)
        # Compare against None explicitly: an lxml element without children
        # is falsy, so a plain truth test would treat an existing (but
        # childless) style as missing and append a duplicate definition.
        if existing is not None:
            continue
        style = (ODT_MINIMAL_STYLES.get(name) or
                 create_color_style(*COLOR_SPAN_REX.match(name).groups()))
        # the 'Image' entry is a callable producing the complete fragment;
        # everything else gets wrapped in the text style template
        templated = (style(name) if name == 'Image' else
                     ODT_TEXT_STYLE_TEMPLATE(name, style))
        lxmlutil.extend(common_styles, parse_odt_frags(templated))
def ensure_minimal_styles(styles, required_styles):
    """Append every style in `required_styles` that `styles` lacks.

    `styles` is searched for its `office:styles` child; for each name in
    `required_styles` that has no `style:style` element with a matching
    `style:name` there, a definition is generated and appended in place.

    The definition comes from `ODT_MINIMAL_STYLES` or, failing that, from
    `create_color_style` applied to the groups `COLOR_SPAN_REX` extracts
    from the name. NOTE(review): a name that is in neither will presumably
    fail with an AttributeError on the `None` match result -- confirm that
    callers only pass known names.
    """
    common_styles = styles.find(ns.office('styles'))
    for name in required_styles:
        existing = common_styles.find(
            ns.style('style') + '[@' + ns.style('name') + '="%s"]' % name)
        # Compare against None explicitly: an lxml element without children
        # is falsy, so a plain truth test would treat an existing (but
        # childless) style as missing and append a duplicate definition.
        if existing is not None:
            continue
        style = (ODT_MINIMAL_STYLES.get(name) or
                 create_color_style(*COLOR_SPAN_REX.match(name).groups()))
        # the 'Image' entry is a callable producing the complete fragment;
        # everything else gets wrapped in the text style template
        templated = (style(name) if name == 'Image' else
                     ODT_TEXT_STYLE_TEMPLATE(name, style))
        lxmlutil.extend(common_styles, parse_odt_frags(templated))
def _get_list_style(cls, style):  # pylint: disable=R0914
    """Describe every level of an ODT list style.

    Returns `('list', levels)` where `levels` maps each `text:level` found
    in `style` to a dict with the keys `list_type` ('ol' or 'ul'),
    `list_style_type` (a CSS list-style-type name, or `None` when the value
    is unknown or equals the default for that depth), `depth` (as computed
    by `cls._rel_depths`) and `start` (int, or the raw falsy attribute
    value when no start value is given).
    """
    OO_ENUMERATIONS = '1aiAI'
    CSS_ENUMERATIONS = [
        'decimal', 'lower-alpha', 'lower-roman', 'upper-alpha', 'upper-roman']
    # The ODT 1.1 spec lists these as common values
    #   • BULLET  ✔ HEAVY CHECK MARK  ✗ BALLOT X
    #   ➔ HEAVY WIDE-HEADED RIGHTWARDS ARROW
    #   ➢ THREE-D TOP-LIGHTED RIGHTWARDS ARROWHEAD
    # FIXME: allow for all of these
    #   'none', 'asterisks', 'box', 'check', 'circle', 'diamond',
    #   'disc', 'hyphen', 'square',
    #
    # XXX: As an additional hack we map several visually close unicode
    # characters to the same CSS bullets. The first 4 are the canonical
    # values as per spec, IIRC, but the second 4 occur frequently in odt
    # documents and at least 'o' has also been observed in a google doc
    # that came from word
    OO_BULLETS = u'●○■-' u'•◦▪–' u'·o⬛—'  # −‐ might also be relevant
    CSS_BULLETS = ['disc', 'circle', 'square', 'hyphen'] * 3
    LISTS = [
        ('ol', ns.text('list-level-style-number'), ns.style('num-format'),
         OO_ENUMERATIONS, CSS_ENUMERATIONS),
        ('ul', ns.text('list-level-style-bullet'), ns.text('bullet-char'),
         OO_BULLETS, CSS_BULLETS),
    ]
    level_attr = ns.text('level')
    start_attr = ns.text('start-value')
    ans = {}
    for list_type, what, oo_attr, oo, css in LISTS:
        oo2css = dict(zip(oo, css))
        per_level = [(int(props.get(level_attr)), props)
                     for props in style.findall(what)]
        rel_depth = cls._rel_depths([level for level, _ in per_level])
        for level, props in per_level:
            oo_value = props.get(oo_attr)
            prop_name = oo_attr.split('}')[1]
            depth = rel_depth[level]
            # the first three CSS values double as the per-depth defaults
            depth_default = css[depth % 3]
            list_style_type = oo2css.get(oo_value)
            if list_style_type is None:
                log.warn("Don't know list-style %s value '%s'",
                         prop_name, oo_value)
            elif list_style_type == depth_default:
                # no need to spell out what is the default anyway
                list_style_type = None
            start = props.get(start_attr)
            if start:
                start = int(start)
                assert list_type == 'ol'
            ans[level] = {
                'list_type': list_type,
                'list_style_type': list_style_type,
                'depth': depth,
                'start': start,
            }
    assert ans
    return 'list', ans
def _get_list_style(cls, style):  # pylint: disable=R0914
    """Describe every level of an ODT list style.

    Returns `('list', levels)` where `levels` maps each `text:level` found
    in `style` to a dict with the keys `list_type` ('ol' or 'ul'),
    `list_style_type` (a CSS list-style-type name, or `None` when the value
    is unknown or equals the default for that depth), `depth` (as computed
    by `cls._rel_depths`) and `start` (int, or the raw falsy attribute
    value when no start value is given).
    """
    OO_ENUMERATIONS = '1aiAI'
    CSS_ENUMERATIONS = [
        'decimal', 'lower-alpha', 'lower-roman', 'upper-alpha', 'upper-roman']
    # The ODT 1.1 spec lists these as common values
    #   • BULLET  ✔ HEAVY CHECK MARK  ✗ BALLOT X
    #   ➔ HEAVY WIDE-HEADED RIGHTWARDS ARROW
    #   ➢ THREE-D TOP-LIGHTED RIGHTWARDS ARROWHEAD
    # FIXME: allow for all of these
    #   'none', 'asterisks', 'box', 'check', 'circle', 'diamond',
    #   'disc', 'hyphen', 'square',
    #
    # XXX: As an additional hack we map several visually close unicode
    # characters to the same CSS bullets. The first 4 are the canonical
    # values as per spec, IIRC, but the second 4 occur frequently in odt
    # documents and at least 'o' has also been observed in a google doc
    # that came from word
    OO_BULLETS = u'●○■-' u'•◦▪–' u'·o⬛—'  # −‐ might also be relevant
    CSS_BULLETS = ['disc', 'circle', 'square', 'hyphen'] * 3
    LISTS = [
        ('ol', ns.text('list-level-style-number'), ns.style('num-format'),
         OO_ENUMERATIONS, CSS_ENUMERATIONS),
        ('ul', ns.text('list-level-style-bullet'), ns.text('bullet-char'),
         OO_BULLETS, CSS_BULLETS),
    ]
    level_attr = ns.text('level')
    start_attr = ns.text('start-value')
    ans = {}
    for list_type, what, oo_attr, oo, css in LISTS:
        oo2css = dict(zip(oo, css))
        per_level = [(int(props.get(level_attr)), props)
                     for props in style.findall(what)]
        rel_depth = cls._rel_depths([level for level, _ in per_level])
        for level, props in per_level:
            oo_value = props.get(oo_attr)
            prop_name = oo_attr.split('}')[1]
            depth = rel_depth[level]
            # the first three CSS values double as the per-depth defaults
            depth_default = css[depth % 3]
            list_style_type = oo2css.get(oo_value)
            if list_style_type is None:
                log.warn("Don't know list-style %s value '%s'",
                         prop_name, oo_value)
            elif list_style_type == depth_default:
                # no need to spell out what is the default anyway
                list_style_type = None
            start = props.get(start_attr)
            if start:
                start = int(start)
                assert list_type == 'ol'
            ans[level] = {
                'list_type': list_type,
                'list_style_type': list_style_type,
                'depth': depth,
                'start': start,
            }
    assert ans
    return 'list', ans
def parse_body(xml, context, normalize_transclusion):
    # pylint: disable=R0912,R0915,R0914
    """Recursively convert the children of `xml` into the internal tree.

    This is a generator. For each child element of `xml` it yields strings
    and/or elements built with `mkel(head, attrs, body)`, followed by the
    child's tail text when that is non-empty.

    `context` supplies the known styles (`context.stys`) and tracks list
    nesting via `bump_list_level`. `normalize_transclusion` is applied to
    the href of every image element.

    Elements with an unrecognized tag are logged and skipped; their tail
    text is still yielded, since it belongs to the parent.
    """
    for e in xml:
        text = (e.text or '')
        tail = (e.tail or '')
        # some style properties should be promoted to tags, e.g. underlining
        # and bolding
        tags_from_style = []
        stys_dealt_with = []
        if e.tag in (S_TAG, TAB_TAG):
            yield ' \t'[e.tag == TAB_TAG] * int(
                e.attrib.get(ns.text('c'), '1'))
            if tail:
                yield tail
            continue
        if e.tag == LINEBREAK_TAG:
            yield mkel('br', {}, [])
            # the text following the break lives in its tail; don't lose it
            if tail:
                yield tail
            continue
        sty = context.stys.get(
            e.get(STYLE_NAME_ATTR) or e.get(TABLE_STYLE_NAME_ATTR))
        # handle page breaks
        if sty and sty.par_break:
            assert e.tag in (H_TAG, P_TAG), \
                "Unexpected page-break in %r" % e.tag
            yield mkel('.pagebreak', {}, [])
            stys_dealt_with.append('par_break')
        # Handle lists specially
        if e.tag == LIST_TAG:
            new_context = context.bump_list_level(sty)
            stys_dealt_with.append('sub_list_styles')
        else:
            new_context = context
        body = list(parse_body(e, new_context, normalize_transclusion))
        assert type(body) is list and not body or type(body[0]) is not list
        attrs = {}
        if text:
            body = [text] + body
        if sty and sty.type.endswith('title'):
            head = sty.type
            body = [plaintextify(body)]
            sty = None
        elif e.tag == H_TAG:
            # skip empty headings; NB: this *must* happen
            # after we extracted eventual page-breaks, which are the only
            # useful information empty headings can contain
            if blank(body):
                continue
            head = sty.type
            # FIXME(alexander): keep track of the headings breadcrumbs in
            # context for two reasons
            #
            # 1. to associate errors with specific headings
            # 2. to warn about bad structure e.g. h1 followed by h4,
            #    rather than h2
        elif e.tag == LIST_TAG:
            head = new_context.list_type
            assert head in ('ol', 'ul')
            list_start = new_context.list_start
            if list_start is not None:
                assert head == 'ol'
                attrs['start'] = str(list_start)
            id_ = e.attrib.get(ns.xml('id'))  # pylint: disable=E1101
            if id_ is not None:
                attrs['id'] = id_
            continues = e.attrib.get(ns.text('continue-list'))
            if continues is not None:
                # make this a data attrib, so we can stuff it
                # into the html, which doesn't have direct support
                attrs['data-continue-list'] = continues
        elif e.tag == LIST_ITEM_TAG:
            head = 'li'
        elif e.tag == ANNOTATION_TAG:
            head = 'aside'
        elif e.tag in (CREATOR_TAG, NOTE_CITATION_TAG, BOOKMARK_END_TAG):
            #FIXME: extract content
            if text:
                log.warning('Hey, someone actually specified a %s: %s',
                            e.tag, text)
            if tail:
                yield tail
            continue
        elif e.tag == NOTE_TAG:
            # other valid option is 'endnote'
            assert e.attrib[ns.text('note-class')] == 'footnote'
            # skip ahead and exit early; we only represent the note-body
            assert len(e) == 2 and e[1].tag == NOTE_BODY_TAG
            assert len(body) == 1
            yield body[0]
            if tail:
                yield tail
            continue
        elif e.tag == NOTE_BODY_TAG:
            head = '.footnote'
            # FIXME(alexander): sucky hack to strip the bogus whitespace
            # google docs enters at the beginning of a footnote for some
            # reason. I should really write a more generic whitespace
            # stripping mechanism in the postprocess module that can recognize
            # consecutive whitespace even if separated-by/wrapped-in inline
            # tags.
            _, B1, B2, = map(Var, '_, B1, B2'.split(', '))
            SPACED_STR = Var(
                'SPACED_STR',
                lambda s: (isinstance(s, basestring) and re.match(r'\s+', s)))
            if body == Seq[('p', _, Seq[SPACED_STR, B2:]), B1:]:
                body[0][2][0] = SPACED_STR.val.lstrip()
        # FIXME(alexander): add anchors for all paras
        elif e.tag == P_TAG:
            margin = sty.margin_left or sty.text_indent if sty else None
            indent_level = in_indents(margin) if margin else 0
            if indent_level:
                head = '.block'
                attrs['indent'] = indent_level
            else:
                head = 'p'
        #FIXME styled links etc. gdocs might not use that...
        #... but we should be able to handle non-span bolding etc.
        elif e.tag == SPAN_TAG:
            # XXX: order can matter; we need
            #   <b><u>command</u><b>
            # not
            #   <u><b>command</b><u>
            #
            # but more generally the minimal coalescing of abutting partially
            # overlapping styles is something that needs to be thought about
            # properly at some point.
            for attr, on_values, html_tags in [
                    ('underline', [True], ['u']),
                    ('font_weight', ['bold'], ['b']),
                    ('font_style', ['italic'], ['i']),
                    ('line_through', [True], ['s']),
                    ('text_position', ['sub', 'super'], ['sub', 'sup'])]:
                value = getattr(sty, attr, None)
                if value:
                    if value not in on_values:
                        log.error("Bad value for %s: %s in %s",
                                  attr, value, e.tag)
                        continue
                    tags_from_style.append(html_tags[on_values.index(value)])
                    stys_dealt_with.append(attr)
            # a span need not reference a (known) style at all
            if sty and is_code_font(sty.font_family):
                tags_from_style.append('code')
                stys_dealt_with.append('font_family')
            head = 'span'
        elif e.tag == A_TAG:
            assert e.attrib[ns.xlink('type')] == 'simple'
            head = 'a'
            attrs = dict(href=e.attrib[HREF_ATTR])
            # FIXME the in 'span' check is a bit too general, should use
            # something else to markup textcolor
            body = tidy(whack(lambda x: x in ('span', 'u'), body))
        elif e.tag == BOOKMARK_START_TAG:
            head = 'a'
            attrs = dict(name=e.attrib[TEXT_NAME_ATTR])
            assert (blank(text) and blank(tail) and
                    next(e.itersiblings()).tag == BOOKMARK_END_TAG)
        elif e.tag == TABLE_TAG:
            head = 'table'
            body = parse_table_body(body)
        elif e.tag == TABLE_ROW_TAG:
            head = 'tr'
        elif e.tag == TABLE_CELL_TAG:
            head = 'td'
        #FIXME repetition via table:number-columns-repeated
        #FIXME handle column-groups
        elif e.tag == TABLE_COLUMN_TAG:
            head = 'col'
            sty = context.stys.get(e.attrib.get(ns.table('style-name')))
            if sty and sty.width is not None:
                # XXX this isn't really the column width
                # since google moronically saves this even
                # if set column width is turned off thank you google!
                attrs = dict(style=OrderedDict(width=sty.width))
                stys_dealt_with.append('width')
        elif e.tag == FRAME_TAG:
            # XXX: try to find caption
            # FIXME(alexander): keep figures/tables with captions in context,
            # so that we can produce a lot/loi; add an id for all of them
            inline = e.attrib[ns.text('anchor-type')] == 'as-char'
            width = (e.attrib.get(ns.svg('width'))  # pylint: disable=E1101
                     or e.attrib[ns.style('rel-width')])
            # FIXME(alexander): should handle all these, in theory:
            # <http://www.w3.org/TR/SVG11/struct.html#SVGElementWidthAttribute>
            # ("em" | "ex" | "px" | "in" | "cm" | "mm" | "pt" | "pc" )
            assert width.endswith('cm'), \
                'Expected figure width in cm, got %s' % width
            relwidth = float(width[:-2]) / context.stys.textwidth
            # Drop whitespace-only strings *before* looking for the image, so
            # that leading whitespace in the frame can't end up at index 0.
            figure_body = [x for x in body
                           if not (isinstance(x, basestring) and blank(x))]
            head, attrs, body = make_figure(
                relwidth=relwidth, inline=inline,
                body=figure_body,
                src=figure_body[0][1]['src'],
                original_href=e.find(ns.draw('image')).get(ns.xlink('href')))
        elif e.tag == IMAGE_TAG:
            head = 'img'
            attrs = dict(src=normalize_transclusion(e.attrib[HREF_ATTR]))
        else:
            log.warning('Ignoring tag %s', e.tag)
            # the tail is text of the parent, not of the ignored element
            if tail:
                yield tail
            continue
            # FIXME raise RuntimeError('Unexpected tag: %s' % e.tag)
        # wrap the body in the tags promoted from the style, innermost first
        sty_tagged = reduce(lambda parsed, tag: [mkel(tag, {}, parsed)],
                            tags_from_style, tidy(body))
        if sty:
            if sty.text_align:
                stys_dealt_with.append('text_align')
                attrs = add_class(attrs, sty.text_align)
            if sty.background_color:
                stys_dealt_with.append('background_color')
                iadd_style(attrs, 'background-color', sty.background_color)
            if sty.color:
                stys_dealt_with.append('color')
                iadd_style(attrs, 'color', sty.color)
        if e.tag == LIST_TAG:
            if new_context.list_style_type:
                attrs = add_class(attrs, new_context.list_style_type)
        # FIXME additional tidy
        parsed = mkel(head, attrs, sty_tagged)
        if head == 'span' and 'style' in attrs:
            # turn a styled span around a lone code element inside out
            B = Var('B')
            if parsed == ('span', attrs, [('code', {}, B)]):
                parsed = mkel('code', {}, [('span', attrs, B.val)])
        leftover_styles = sty and set(
            sty.active_props()) - set(stys_dealt_with)
        if leftover_styles:
            log.warn('Ignoring style elements: %r in %r "%s"',
                     ([(k, getattr(sty, k)) for k in leftover_styles]),
                     head, plaintextify(body))
        preprocess.maybe_anchorize_id(head, attrs, sty_tagged)
        yield parsed
        if tail:
            yield tail
def from_odt_style(cls, styles, style):  # pylint: disable=R0912
    """Create an instance from an ODT style element and register it.

    `styles` is the name -> instance mapping built so far; the new instance
    is stored in it under its name (a warning is logged when that replaces
    an existing entry) and returned. A referenced parent style must already
    be present in `styles`.

    Returns `None` without registering anything for the element kinds that
    are deliberately skipped. Raises `TypeError` for an element that is
    neither a `text:list-style` nor a `style:style`.
    """
    skipped_tags = (ns.style('default-style'),
                    ns.style('page-layout'),
                    ns.text('outline-style'),
                    ns.text('notes-configuration'),
                    ns.text('linenumbering-configuration'))
    if style.tag in skipped_tags:
        # default-style appears to serve no real purpose in google docs
        # page-style probably doesn't really matter anyway
        # notes-configuration also doesn't really look that useful
        log.warn('Skipping %s', style.tag.split('}')[1])
        return None
    family, name = style_id(style)
    assert name
    parent = style.get(ns.style('parent-style-name'))  #parent_id(style)
    display_name = style.get(ns.style('display-name'))
    if parent:
        parent_style = styles[parent]  #FIXME: verify no forward decls

    def first_props(*local_names):
        # lxml elements without children are falsy, so test against None
        # explicitly; an empty dict stands in for "no such properties"
        for local_name in local_names:
            found = style.find(ns.style(local_name))
            if found is not None:
                return found
        return {}

    par_props = first_props('paragraph-properties')
    text_props = first_props('text-properties')
    # FIXME below assumes table style never sets table-column (table-row)
    # stuff
    table_props = first_props('table-column-properties',
                              'table-row-properties',
                              'table-cell-properties',
                              'table-properties')

    if style.tag == ns.text('list-style'):
        my_type, sub_list_styles = cls._get_list_style(style)
    elif style.tag == ns.style('style'):
        sub_list_styles = None
        # XXX look for a more robust way to do this
        if display_name and display_name.startswith('Heading'):
            my_type = 'h' + display_name.split()[-1]
        elif parent in ('Title', 'Subtitle'):
            my_type = parent.lower()
        elif name in ('Title', 'Subtitle'):
            my_type = name.lower()
        elif parent and parent_style.type:
            my_type = parent_style.type
        elif family == 'paragraph':
            my_type = 'p'
        elif family.startswith('table'):
            #FIXME actually implement something for these
            table_types = {'table': 'table',
                           'table-cell': 'td',
                           "table-row": "tr",
                           "table-column": "col"}
            my_type = table_types[family]
        else:
            # In reality all the following are valid:
            # paragraph, text, section, table, table-column, table-row,
            # table-cell, table-page, chart, default, drawing-page,
            # graphic, presentation, control and ruby.
            if family != 'text':
                log.warn("Unexpected style family: %r", family)
            my_type = 'span'
    else:
        raise TypeError('%r is not at style' % style)

    text_color = (text_props.get(ns.fo('color')) or
                  table_props.get(ns.fo('color')))
    fill_color = (text_props.get(ns.fo('background-color')) or
                  table_props.get(ns.fo('background-color')))
    ans = cls(
        styles,
        name=name,  #FIXME
        type=my_type,
        parent=parent,
        font_family=text_props.get(ns.style('font-name')),
        font_size=text_props.get(ns.fo('font-size')),
        font_weight=text_props.get(ns.fo('font-weight')),
        font_style=text_props.get(ns.fo('font-style')),
        color=text_color,
        background_color=fill_color,
        # in css: text-decoration [underline] [line-through]
        underline=text_props.get(ns.style('text-underline-style')),
        line_through=text_props.get(ns.style('text-line-through-style')),
        # in css: vertical-align:sub; font-size:smaller;
        text_position=text_props.get(ns.style('text-position')),
        text_align=par_props.get(ns.fo('text-align')),
        line_height=par_props.get(ns.fo('line-height')),
        margin_left=par_props.get(ns.fo('margin-left')),
        par_break=par_props.get(ns.fo('break-before')),
        text_indent=par_props.get(ns.fo('text-indent')),
        sub_list_styles=sub_list_styles,
        width=table_props.get(ns.style('column-width')),
        min_height=table_props.get(ns.style('min-row-height')),
    )
    if name in styles:
        log.warn('Overwriting old Sty %r %r => %r', name, styles[name], ans)
    styles[name] = ans
    return ans
def parent_id(style):
    """Return `(family, parent name)` for the parent of `style`.

    The family is the style's own style:family attribute (`None` when
    absent). When `style` names no parent, the falsy attribute value
    itself (`None` or an empty string) is returned instead of a tuple.
    """
    parent_name = style.get(ns.style('parent-style-name'))
    if not parent_name:
        return parent_name
    return (style.attrib.get(ns.style('family')), parent_name)
def parse_body(xml, context, normalize_transclusion):
    # pylint: disable=R0912,R0915,R0914
    """Recursively convert the children of `xml` into the internal tree.

    This is a generator. For each child element of `xml` it yields strings
    and/or elements built with `mkel(head, attrs, body)`, followed by the
    child's tail text when that is non-empty.

    `context` supplies the known styles (`context.stys`) and tracks list
    nesting via `bump_list_level`. `normalize_transclusion` is applied to
    the href of every image element.

    Elements with an unrecognized tag are logged and skipped; their tail
    text is still yielded, since it belongs to the parent.
    """
    for e in xml:
        text = (e.text or '')
        tail = (e.tail or '')
        # some style properties should be promoted to tags, e.g. underlining
        # and bolding
        tags_from_style = []
        stys_dealt_with = []
        if e.tag in (S_TAG, TAB_TAG):
            yield ' \t'[e.tag == TAB_TAG] * int(
                e.attrib.get(ns.text('c'), '1'))
            if tail:
                yield tail
            continue
        if e.tag == LINEBREAK_TAG:
            yield mkel('br', {}, [])
            # the text following the break lives in its tail; don't lose it
            if tail:
                yield tail
            continue
        sty = context.stys.get(
            e.get(STYLE_NAME_ATTR) or e.get(TABLE_STYLE_NAME_ATTR))
        # handle page breaks
        if sty and sty.par_break:
            assert e.tag in (H_TAG, P_TAG), \
                "Unexpected page-break in %r" % e.tag
            yield mkel('.pagebreak', {}, [])
            stys_dealt_with.append('par_break')
        # Handle lists specially
        if e.tag == LIST_TAG:
            new_context = context.bump_list_level(sty)
            stys_dealt_with.append('sub_list_styles')
        else:
            new_context = context
        body = list(parse_body(e, new_context, normalize_transclusion))
        assert type(body) is list and not body or type(body[0]) is not list
        attrs = {}
        if text:
            body = [text] + body
        if sty and sty.type.endswith('title'):
            head = sty.type
            body = [plaintextify(body)]
            sty = None
        elif e.tag == H_TAG:
            # skip empty headings; NB: this *must* happen
            # after we extracted eventual page-breaks, which are the only
            # useful information empty headings can contain
            if blank(body):
                continue
            head = sty.type
            # FIXME(alexander): keep track of the headings breadcrumbs in
            # context for two reasons
            #
            # 1. to associate errors with specific headings
            # 2. to warn about bad structure e.g. h1 followed by h4,
            #    rather than h2
        elif e.tag == LIST_TAG:
            head = new_context.list_type
            assert head in ('ol', 'ul')
            list_start = new_context.list_start
            if list_start is not None:
                assert head == 'ol'
                attrs['start'] = str(list_start)
            id_ = e.attrib.get(ns.xml('id'))  # pylint: disable=E1101
            if id_ is not None:
                attrs['id'] = id_
            continues = e.attrib.get(ns.text('continue-list'))
            if continues is not None:
                # make this a data attrib, so we can stuff it
                # into the html, which doesn't have direct support
                attrs['data-continue-list'] = continues
        elif e.tag == LIST_ITEM_TAG:
            head = 'li'
        elif e.tag == ANNOTATION_TAG:
            head = 'aside'
        elif e.tag in (CREATOR_TAG, NOTE_CITATION_TAG, BOOKMARK_END_TAG):
            #FIXME: extract content
            if text:
                log.warning('Hey, someone actually specified a %s: %s',
                            e.tag, text)
            if tail:
                yield tail
            continue
        elif e.tag == NOTE_TAG:
            # other valid option is 'endnote'
            assert e.attrib[ns.text('note-class')] == 'footnote'
            # skip ahead and exit early; we only represent the note-body
            assert len(e) == 2 and e[1].tag == NOTE_BODY_TAG
            assert len(body) == 1
            yield body[0]
            if tail:
                yield tail
            continue
        elif e.tag == NOTE_BODY_TAG:
            head = '.footnote'
            # FIXME(alexander): sucky hack to strip the bogus whitespace
            # google docs enters at the beginning of a footnote for some
            # reason. I should really write a more generic whitespace
            # stripping mechanism in the postprocess module that can recognize
            # consecutive whitespace even if separated-by/wrapped-in inline
            # tags.
            _, B1, B2, = map(Var, '_, B1, B2'.split(', '))
            SPACED_STR = Var(
                'SPACED_STR',
                lambda s: (isinstance(s, basestring) and re.match(r'\s+', s)))
            if body == Seq[('p', _, Seq[SPACED_STR, B2:]), B1:]:
                body[0][2][0] = SPACED_STR.val.lstrip()
        # FIXME(alexander): add anchors for all paras
        elif e.tag == P_TAG:
            margin = sty.margin_left or sty.text_indent if sty else None
            indent_level = in_indents(margin) if margin else 0
            if indent_level:
                head = '.block'
                attrs['indent'] = indent_level
            else:
                head = 'p'
        #FIXME styled links etc. gdocs might not use that...
        #... but we should be able to handle non-span bolding etc.
        elif e.tag == SPAN_TAG:
            # XXX: order can matter; we need
            #   <b><u>command</u><b>
            # not
            #   <u><b>command</b><u>
            #
            # but more generally the minimal coalescing of abutting partially
            # overlapping styles is something that needs to be thought about
            # properly at some point.
            for attr, on_values, html_tags in [
                    ('underline', [True], ['u']),
                    ('font_weight', ['bold'], ['b']),
                    ('font_style', ['italic'], ['i']),
                    ('line_through', [True], ['s']),
                    ('text_position', ['sub', 'super'], ['sub', 'sup'])]:
                value = getattr(sty, attr, None)
                if value:
                    if value not in on_values:
                        log.error("Bad value for %s: %s in %s",
                                  attr, value, e.tag)
                        continue
                    tags_from_style.append(html_tags[on_values.index(value)])
                    stys_dealt_with.append(attr)
            # a span need not reference a (known) style at all
            if sty and is_code_font(sty.font_family):
                tags_from_style.append('code')
                stys_dealt_with.append('font_family')
            head = 'span'
        elif e.tag == A_TAG:
            assert e.attrib[ns.xlink('type')] == 'simple'
            head = 'a'
            attrs = dict(href=e.attrib[HREF_ATTR])
            # FIXME the in 'span' check is a bit too general, should use
            # something else to markup textcolor
            body = tidy(whack(lambda x: x in ('span', 'u'), body))
        elif e.tag == BOOKMARK_START_TAG:
            head = 'a'
            attrs = dict(name=e.attrib[TEXT_NAME_ATTR])
            assert (blank(text) and blank(tail) and
                    next(e.itersiblings()).tag == BOOKMARK_END_TAG)
        elif e.tag == TABLE_TAG:
            head = 'table'
            body = parse_table_body(body)
        elif e.tag == TABLE_ROW_TAG:
            head = 'tr'
        elif e.tag == TABLE_CELL_TAG:
            head = 'td'
        #FIXME repetition via table:number-columns-repeated
        #FIXME handle column-groups
        elif e.tag == TABLE_COLUMN_TAG:
            head = 'col'
            sty = context.stys.get(e.attrib.get(ns.table('style-name')))
            if sty and sty.width is not None:
                # XXX this isn't really the column width
                # since google moronically saves this even
                # if set column width is turned off thank you google!
                attrs = dict(style=OrderedDict(width=sty.width))
                stys_dealt_with.append('width')
        elif e.tag == FRAME_TAG:
            # XXX: try to find caption
            # FIXME(alexander): keep figures/tables with captions in context,
            # so that we can produce a lot/loi; add an id for all of them
            inline = e.attrib[ns.text('anchor-type')] == 'as-char'
            width = (e.attrib.get(ns.svg('width'))  # pylint: disable=E1101
                     or e.attrib[ns.style('rel-width')])
            # FIXME(alexander): should handle all these, in theory:
            # <http://www.w3.org/TR/SVG11/struct.html#SVGElementWidthAttribute>
            # ("em" | "ex" | "px" | "in" | "cm" | "mm" | "pt" | "pc" )
            assert width.endswith('cm'), \
                'Expected figure width in cm, got %s' % width
            relwidth = float(width[:-2]) / context.stys.textwidth
            # Drop whitespace-only strings *before* looking for the image, so
            # that leading whitespace in the frame can't end up at index 0.
            figure_body = [x for x in body
                           if not (isinstance(x, basestring) and blank(x))]
            head, attrs, body = make_figure(
                relwidth=relwidth, inline=inline,
                body=figure_body,
                src=figure_body[0][1]['src'],
                original_href=e.find(ns.draw('image')).get(ns.xlink('href')))
        elif e.tag == IMAGE_TAG:
            head = 'img'
            attrs = dict(src=normalize_transclusion(e.attrib[HREF_ATTR]))
        else:
            log.warning('Ignoring tag %s', e.tag)
            # the tail is text of the parent, not of the ignored element
            if tail:
                yield tail
            continue
            # FIXME raise RuntimeError('Unexpected tag: %s' % e.tag)
        # wrap the body in the tags promoted from the style, innermost first
        sty_tagged = reduce(lambda parsed, tag: [mkel(tag, {}, parsed)],
                            tags_from_style, tidy(body))
        if sty:
            if sty.text_align:
                stys_dealt_with.append('text_align')
                attrs = add_class(attrs, sty.text_align)
            if sty.background_color:
                stys_dealt_with.append('background_color')
                iadd_style(attrs, 'background-color', sty.background_color)
            if sty.color:
                stys_dealt_with.append('color')
                iadd_style(attrs, 'color', sty.color)
        if e.tag == LIST_TAG:
            if new_context.list_style_type:
                attrs = add_class(attrs, new_context.list_style_type)
        # FIXME additional tidy
        parsed = mkel(head, attrs, sty_tagged)
        if head == 'span' and 'style' in attrs:
            # turn a styled span around a lone code element inside out
            B = Var('B')
            if parsed == ('span', attrs, [('code', {}, B)]):
                parsed = mkel('code', {}, [('span', attrs, B.val)])
        leftover_styles = sty and set(
            sty.active_props()) - set(stys_dealt_with)
        if leftover_styles:
            log.warn('Ignoring style elements: %r in %r "%s"',
                     ([(k, getattr(sty, k)) for k in leftover_styles]),
                     head, plaintextify(body))
        preprocess.maybe_anchorize_id(head, attrs, sty_tagged)
        yield parsed
        if tail:
            yield tail
def from_odt_style(cls, styles, style):  # pylint: disable=R0912
    """Create an instance from an ODT style element and register it.

    `styles` is the name -> instance mapping built so far; the new instance
    is stored in it under its name (a warning is logged when that replaces
    an existing entry) and returned. A referenced parent style must already
    be present in `styles`.

    Returns `None` without registering anything for the element kinds that
    are deliberately skipped. Raises `TypeError` for an element that is
    neither a `text:list-style` nor a `style:style`.
    """
    skipped_tags = (ns.style('default-style'),
                    ns.style('page-layout'),
                    ns.text('outline-style'),
                    ns.text('notes-configuration'),
                    ns.text('linenumbering-configuration'))
    if style.tag in skipped_tags:
        # default-style appears to serve no real purpose in google docs
        # page-style probably doesn't really matter anyway
        # notes-configuration also doesn't really look that useful
        log.warn('Skipping %s', style.tag.split('}')[1])
        return None
    family, name = style_id(style)
    assert name
    parent = style.get(ns.style('parent-style-name'))  #parent_id(style)
    display_name = style.get(ns.style('display-name'))
    if parent:
        parent_style = styles[parent]  #FIXME: verify no forward decls

    def first_props(*local_names):
        # lxml elements without children are falsy, so test against None
        # explicitly; an empty dict stands in for "no such properties"
        for local_name in local_names:
            found = style.find(ns.style(local_name))
            if found is not None:
                return found
        return {}

    par_props = first_props('paragraph-properties')
    text_props = first_props('text-properties')
    # FIXME below assumes table style never sets table-column (table-row)
    # stuff
    table_props = first_props('table-column-properties',
                              'table-row-properties',
                              'table-cell-properties',
                              'table-properties')

    if style.tag == ns.text('list-style'):
        my_type, sub_list_styles = cls._get_list_style(style)
    elif style.tag == ns.style('style'):
        sub_list_styles = None
        # XXX look for a more robust way to do this
        if display_name and display_name.startswith('Heading'):
            my_type = 'h' + display_name.split()[-1]
        elif parent in ('Title', 'Subtitle'):
            my_type = parent.lower()
        elif name in ('Title', 'Subtitle'):
            my_type = name.lower()
        elif parent and parent_style.type:
            my_type = parent_style.type
        elif family == 'paragraph':
            my_type = 'p'
        elif family.startswith('table'):
            #FIXME actually implement something for these
            table_types = {'table': 'table',
                           'table-cell': 'td',
                           "table-row": "tr",
                           "table-column": "col"}
            my_type = table_types[family]
        else:
            # In reality all the following are valid:
            # paragraph, text, section, table, table-column, table-row,
            # table-cell, table-page, chart, default, drawing-page,
            # graphic, presentation, control and ruby.
            if family != 'text':
                log.warn("Unexpected style family: %r", family)
            my_type = 'span'
    else:
        raise TypeError('%r is not at style' % style)

    text_color = (text_props.get(ns.fo('color')) or
                  table_props.get(ns.fo('color')))
    fill_color = (text_props.get(ns.fo('background-color')) or
                  table_props.get(ns.fo('background-color')))
    ans = cls(
        styles,
        name=name,  #FIXME
        type=my_type,
        parent=parent,
        font_family=text_props.get(ns.style('font-name')),
        font_size=text_props.get(ns.fo('font-size')),
        font_weight=text_props.get(ns.fo('font-weight')),
        font_style=text_props.get(ns.fo('font-style')),
        color=text_color,
        background_color=fill_color,
        # in css: text-decoration [underline] [line-through]
        underline=text_props.get(ns.style('text-underline-style')),
        line_through=text_props.get(ns.style('text-line-through-style')),
        # in css: vertical-align:sub; font-size:smaller;
        text_position=text_props.get(ns.style('text-position')),
        text_align=par_props.get(ns.fo('text-align')),
        line_height=par_props.get(ns.fo('line-height')),
        margin_left=par_props.get(ns.fo('margin-left')),
        par_break=par_props.get(ns.fo('break-before')),
        text_indent=par_props.get(ns.fo('text-indent')),
        sub_list_styles=sub_list_styles,
        width=table_props.get(ns.style('column-width')),
        min_height=table_props.get(ns.style('min-row-height')),
    )
    if name in styles:
        log.warn('Overwriting old Sty %r %r => %r', name, styles[name], ans)
    styles[name] = ans
    return ans
def parent_id(style):
    """Return `(family, parent name)` for the parent of `style`.

    The family is the style's own style:family attribute (`None` when
    absent). When `style` names no parent, the falsy attribute value
    itself (`None` or an empty string) is returned instead of a tuple.
    """
    parent_name = style.get(ns.style('parent-style-name'))
    if not parent_name:
        return parent_name
    return (style.attrib.get(ns.style('family')), parent_name)