def _coalesce_blocks(attrs, blocks):
    """Coalesce consecutive '.block' bodies into <pre>/<blockquote> runs.

    Consumes `blocks` (an iterable of elements) front-to-back: maximal runs
    of bodies that consist of a single <code> element are joined into one
    <pre> element (one plaintext line per body); any other run of bodies is
    wrapped into a single <blockquote>.  If `attrs` carries the 'right'
    class, blockquote contents are treated as citations and wrapped in
    <footer><cite>.  Yields the resulting elements in order.
    """
    B = Var('B')
    _ = Var('_')
    blocks = list(blocks)
    _debug = blocks[:]

    def next_body():
        # pop the body (third component) of the next block, or [] when done
        return blocks.pop(0)[2] if blocks else []

    while True:
        body = next_body()
        if not body:
            break
        pre_block = []
        # run of single-<code> bodies -> one <pre>, one line per body
        while body and body == [('code', {}, B)]:
            pre_block.append(plaintextify(B.val) + '\n')
            body = next_body()
        if pre_block:
            pre_block = mkel('pre', {}, pre_block)
            yield pre_block
        non_pre_block = []
        # run of non-<code> bodies -> one <blockquote>
        while body and body != [('code', {}, B)]:
            is_citation = 'right' in attrs.get('class', [])
            if is_citation:
                non_pre_block.append(
                    mkel('footer', {}, [mkel('cite', {}, body)]))
            else:
                if needs_wrapping_in_p(body):
                    body = [mkel('p', {}, body)]
                non_pre_block.extend(body)
            body = next_body()
        if non_pre_block:
            yield mkel('blockquote', {}, tidy(non_pre_block))
def needs_wrapping_in_p(body):
    """Return False iff `body` is exactly one real block-level element.

    Anything else gets wrapped: if we're too eager to wrap things in p's
    then hopefully a subsequent tidy pass will remove them.
    """
    REAL_BLOCK_TAG = Var('REAL_BLOCK_TAG',
                         lambda e: e in BLOCK_TAGS and e != '.footnote')
    _ = Var('_')
    return not (body == [(REAL_BLOCK_TAG, _, _)])
def hacky_flatten_block(block):
    """Collapse a '.block' whose body is a lone <p> into a single '.block'.

    The <p>'s attributes are merged into the block's; anything else is
    returned untouched.
    """
    # XXX(ash): move to postprocess
    # pylint: disable=C0103
    BLOCK_ATTRS = Var('BLOCK_ATTRS')
    P_ATTRS = Var('P_ATTRS')
    BODY = Var('BODY')
    if not (block == ('.block', BLOCK_ATTRS, [('p', P_ATTRS, BODY)])):
        return block
    return mkel('.block', merge_attrs(BLOCK_ATTRS.val, P_ATTRS.val), BODY.val)
def _pop_dl_meta(body, head):
    """Pops ``<dl>`` encoded metadata from `body` and stuffs it into `head`."""
    DL_BODY = Var('DL_BODY')
    # only a leading <dl id="document-properties"> counts as metadata
    if body == Seq[('dl', {'id': 'document-properties'}, DL_BODY), :]:
        del body[0]
        dl_body = space_normalize(DL_BODY.val)
        DD_BODY, ATTRS = map(Var, 'DD_BODY, ATTRS'.split(', '))
        DT = ('dt', Var('_'), Var('_'))
        DD = ('dd', ATTRS, DD_BODY)
        # walk the strictly alternating <dt>/<dd> pairs; the <dd>'s single
        # class names the metadata key, its data-value (or plaintext body)
        # is the value
        for dt_dd in zip(dl_body[::2], dl_body[1::2]):
            assert (DT, DD) == dt_dd
            c, = ATTRS.val['class']
            head[c] = ATTRS.val.get('data-value', plaintextify(DD_BODY.val))
def _tidy_heading(tag, attrs, body):
    """Yield a cleaned heading, or just its salvageable contents.

    Strips elements that can't occur in a heading; a heading with real
    textual content is re-yielded (minus its style attribute), otherwise
    only the non-anchor, non-string contents (e.g. misplaced figures)
    are yielded.
    """
    # is there any actual textual content in the heading?
    cleansed = tidy(whack(lambda e: e not in CAN_OCCUR_IN_H, body))
    if not blank(whack(lambda e: e in ('img', 'figure'), cleansed)):
        yield tag, {k: v
                    for k, v in attrs.iteritems() if k != 'style'}, cleansed
        return
    # no, so it's not really a heading
    # but maybe it contains some misformatted figures or similar
    # so yield the contents that aren't anchors or whitespace strings
    _, _STRING = Var('_'), Var('_STRING', isinstance, basestring)
    for x in cleansed:
        if x != ('a', {'name': _}, []) and x != _STRING:
            yield x
def test_make_cover():
    """Golden test for the EPUB cover page XHTML and the OPF manifest items."""
    dummy_image = literal.Image('', 'image/jpeg', OrderedDict())
    assert make_cover_page(src='SOME_HASH.jpg', title='Dummy Title') == \
        '''<?xml version='1.0' encoding='UTF-8'?>
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml">
  <head>
    <title>Dummy Title</title>
    <link href="css/stylesheet.css" rel="stylesheet" type="text/css"/>
  </head>
  <body>
    <div class="cover-page">
      <img alt="Dummy Title" src="SOME_HASH.jpg" title="Dummy Title"/>
    </div>
  </body>
</html>
'''
    # the cover image href is content-hashed, so only its extension is pinned
    assert make_cover_opf(dummy_image, src='SOME_HASH.jpg') == [
        ('item', {'href': Var('_', re.compile(r'.*\.jpg').match),
                  'id': 'cover-image',
                  'media-type': 'image/jpeg',
                  'properties': 'cover-image'}, []),
        ('item', {'href': 'cover.xhtml',
                  'id': 'cover',
                  'media-type': 'application/xhtml+xml'}, [])]
def parse_cites(parsed_body, bib_entries, collect_cite):
    """Recursively replace Zotero citation links with cite elements.

    Walks `parsed_body`; <a> elements whose href is a Zotero item URL and
    whose text looks like a citation key are replaced by the result of
    `cite(...)` (or an error element for malformed/unknown citations).
    `collect_cite` is called with each citation key encountered.  Leading
    and trailing spaces inside the link text are moved out of the citation;
    any strings that become adjacent as a result are coalesced at the end.

    Fix vs. previous version: the coalescing loop used to advance `i`
    unconditionally after an in-place merge, so the merged string was never
    compared against its next neighbour and runs of three or more adjacent
    strings were only partially coalesced.  We now only advance when no
    merge happened.
    """
    # pylint: disable=R0914
    ZURL = Var('ZURL', ZOTERO_ITEM_URL_REX.match)
    ZBODY = Var('ZBODY', lambda s: REF_KEY_REX.match(plaintextify(s)))
    ans = []
    coalesce_strings = False
    for e in parsed_body:
        if isinstance(e, basestring):
            ans.append(e)
        elif e == ('a', {'href': ZURL}, ZBODY):
            link_text = ZBODY.match.group(0).strip()
            # sp1 and sp2 are potential leading and trailing spaces which we
            # tolerate and move out of the link
            sp1, paren, key, field, post, sp2 = ZBODY.match.groups()
            # an opening '[' must be matched by a closing ']' and vice versa
            if bool(paren) != link_text.endswith(']'):
                ans.append(
                    mkerr([e], 'malformed citation, unmatched %s' %
                          ('[]'[not paren])))
                continue
            zotero_id = 'http:' + ZURL.match.group(0).split(':', 1)[1]
            collect_cite(key)
            fields = bib_entries[key].fields if key in bib_entries else {}
            # XXX(alexander): cleanse_post_citation kinda takes rich-text, we
            # only do plaintext for now
            post_text, = cleanse_post_citation([post])
            if 'zoteroid' not in fields or fields.get('zoteroid') == zotero_id:
                if sp1:
                    ans.append(sp1)
                    coalesce_strings = True
                ans.append(cite(key, post_text, textual=not paren,
                                field=field))
                if sp2:
                    ans.append(sp2)
                    coalesce_strings = True
            else:
                ans.append(mkerr([e], 'bad citation key'))
        else:
            # non-citation element: recurse into its body
            ans.append(
                mkel(e[0], e[1], parse_cites(e[2], bib_entries, collect_cite)))
    if coalesce_strings:
        i = 1
        while i < len(ans):
            if (isinstance(ans[i], basestring)
                    and isinstance(ans[i - 1], basestring)):
                # merge in place and do *not* advance, so that the merged
                # string is compared with its new right neighbour next --
                # this fully coalesces runs of 3+ adjacent strings
                ans[i - 1:i + 1] = [ans[i - 1] + ans[i]]
            else:
                i += 1
    return ans
def handle_emphasis(self, emph, body):
    r"""Boldens italicizes or strikes-through latex text.

    Harder than it sounds: The problem being that \textbf and \textit
    don't work across paragraphs and \bfseries and \itshape don't do
    italic correction (i.e. the end of the emphasized text juts into what
    follows it, because the space is not widened as necessary).

    >>> writer = LatexWriter()
    >>> print writer.handle_emphasis('b', ['some bold text'])
    \textbf{some bold text}
    >>> print writer.handle_emphasis(
    ...     'b', [('p', {}, [('i', {}, ['some bold italic'])]), 'text'])
    {\bfseries{}\textit{some bold italic}
    <BLANKLINE>
    text\/}

    With strikethrough and underline the problem is even worse. TeX itself
    has no underline/strikethrough at all and the default LaTeX \underline
    command is broken (e.g. makes the text un(line)breakable). All
    replacements like soul's \ul and ulem's \uline have weird limitations
    that cause random breakage, so we push these styles down into the body
    recursively.

    >>> print writer.handle_emphasis(
    ...     'u', [('p', {}, [('i', {},
    ...                       [('b', {}, ['ul bold italic'])])]), 'text'])
    {\itshape{}{\bfseries{}\uline{ul bold italic}\/}\/}
    <BLANKLINE>
    \uline{text}
    """
    # can safely use \textit/\textbf etc.
    INLINE_TEXT = Var(
        'INLINE_TEXT',  # pylint: disable=C0103
        lambda x: isinstance(x, basestring) and '\n' not in x)
    if body == [INLINE_TEXT]:
        return cmd(self.INLINE_EMPH_TO_LATEX[emph], [],
                   [self.latexify(body)])
    else:
        if emph in ('b', 'i'):
            # need to use itshape/bfseries and do italic correction (r'\/')
            return texcmd(
                dict(b='bfseries', i='itshape')[emph],
                join(self.latexify(body), r'\/'))
        else:
            assert emph in ('u', 's')
            # XXX: it might be better to have latexify as the outmost call
            # here rather than join individually converted parts. That would
            # allow for further rewrite logic in other parts of the latex
            # converter.
            # push the u/s style down into each child element recursively
            return join(*(self.handle_emphasis(emph, [e]) if isinstance(
                e, basestring) else self.latexify(
                    mkel(*e[:2], body=[
                        mkel(emph, {}, [subbody_part])
                        for subbody_part in e[2]
                    ])) for e in body))
def extract_labels(body):
    """Split named anchors out of `body`.

    Returns ``(labels, newbody)`` where `labels` holds the (de-'#'-ed)
    names of all empty ``<a name=...>`` elements and `newbody` is `body`
    with those anchors removed, order preserved.
    """
    HREF = Var('HREF')
    anchor = ('a', {'name': HREF}, [])
    labels, remainder = [], []
    for elem in body:
        if elem == anchor:
            labels.append(HREF.val.lstrip('#'))
        else:
            remainder.append(elem)
    return labels, remainder
def _space_normalize(es, lstrip=False, rstrip=False, parent_was_block_el=False):
    """Whitespace-normalize the elements `es`, threading strip state.

    Returns ``(normalized_elements, lstrip)`` where the returned `lstrip`
    tells the caller whether the *next* sibling should strip its leading
    whitespace.  Elements normalized to falsy are dropped.
    """
    REAL_BLOCK_TAG = Var('REAL_BLOCK_TAG',
                         lambda e: e in BLOCK_TAGS and e != '.footnote')
    _ = Var('_')
    ans = []
    n = len(es)
    for i, e in enumerate(es):
        new_e, lstrip = _space_normalize1(
            e,
            # NB: the parenthesization difference is intentional
            # lstrip: only the *first* child of a block element strips left;
            # rstrip: the last child strips right, as does any element
            # immediately followed by a real block-level sibling
            lstrip=lstrip or parent_was_block_el and i == 0,
            rstrip=(rstrip or parent_was_block_el) and i == n - 1
            or (es[i + 1:i + 2] == [(REAL_BLOCK_TAG, _, _)]))
        if new_e:
            ans.append(new_e)
    return ans, lstrip
def unwrap_figures(body):
    """Yield `body` with standalone/embedded block figures unwrapped.

    Standalone figures (bare, or sole content of a <p>) are forced to
    display:block; a <p> containing a block figure is split into
    paragraph / figure / paragraph.
    """
    # XXX: this currently only operates at the toplevel, both looking for
    # paragraphs and also looking for block figures in paragraphs. Strictly
    # speaking we should probably descend for both. As an additional hack, we
    # descend, up to the the <td> level, into tables.
    FATTRS, PATTRS, FBODY = map(Var, 'FATTRS, PATTRS, FBODY'.split(', '))
    BLOCK_STYLE_ATTR = Var('BLOCK_STYLE_ATTR',
                           lambda a: a['style']['display'] == 'block')
    BLOCK_FIG = ('figure', BLOCK_STYLE_ATTR, FBODY)
    PBODY_WITH_BLOCKFIG = Var('PBODY_WITH_BLOCKFIG', list.__contains__,
                              BLOCK_FIG)
    for elem in body:
        if elem and elem[0] in ('table', 'tr', 'td', 'blockquote'):
            # recurse into table-ish containers
            yield mkel(elem[0], elem[1], list(unwrap_figures(elem[-1])))
        elif elem in (('p', {}, [('figure', FATTRS, FBODY)]),
                      ('figure', FATTRS, FBODY)):
            # override style of standalone figures
            new_fattrs = copy.deepcopy(FATTRS.val)
            new_fattrs['style']['display'] = 'block'
            yield mkel('figure', new_fattrs, FBODY.val)
        # Split a <p> that contains a block figure into
        # two paragraphs separated by a figure.
        # This case can only arise due to the
        # large inline image heuristic; if the paragraph
        # has an id attribute (shouldn't happen yet),
        # we put it into the first half of the split. We throw away
        # empty <p>s.
        elif elem == ('p', PATTRS, Seq[PBODY_WITH_BLOCKFIG:]):
            body = PBODY_WITH_BLOCKFIG.val
            i_fig = body.index(BLOCK_FIG)
            if body[:i_fig]:
                yield mkel('p', PATTRS.val, body[:i_fig])
                # id (if any) went into the first half; clone the rest
                cloned_attrs = dict(
                    (k, v) for (k, v) in PATTRS.val.items() if k != 'id')
            else:
                cloned_attrs = PATTRS.val
            yield body[i_fig]
            if cloned_attrs or body[i_fig + 1:]:
                yield ('p', cloned_attrs, body[i_fig + 1:])
        else:
            yield elem
def _as(format, node):
    """Syntax-highlight the code in a <pre> `node` as `format` markup.

    `format` must be 'html' or 'latex'; the language is guessed from the
    code itself.  Raises RuntimeError for any other format.
    """
    PRE = Var('PRE')
    assert node == ('pre', {}, [PRE])
    source = PRE.val
    lang = _guess_lang(source)
    factories = {'html': HtmlFormatter, 'latex': LatexFormatter}
    if format not in factories:
        raise RuntimeError('Not a valid output format: %r' % format)
    return highlight(source, get_lexer_by_name(lang), factories[format]())
def coalesce(es):
    """Coalesce adjacent strings and like-tagged elements, culling junk.

    Groups `es` by (tag, attrs) (all strings group together and are joined
    and NFC-normalized); each group is dispatched to the appropriate
    coalescing helper and obviously-bogus elements (empty non-void
    elements, blank block elements, nameless empty links) are dropped.
    """
    # pylint: disable=R0912,R0914
    def grouper(thing):
        # strings all share one group key; elements group by (tag, attrs)
        if isinstance(thing, basestring):
            return basestring
        else:
            return thing[:2]

    EMPTY_NON_VOID_ELEMENT = (Var('_', lambda tag: tag not in FULLY_VOID_TAGS),
                              {}, [])
    EMPTY_BLOCK_ELEMENT = (Var('_', NON_EMPTY_BLOCK_TAGS.__contains__),
                           Var('_'), [Var('_', blank)])
    EMPTY_LINK = ('a', Var('ATTRS', lambda a: 'name' not in a), [])
    BOGUS_ELEMENTS = (EMPTY_NON_VOID_ELEMENT, EMPTY_BLOCK_ELEMENT, EMPTY_LINK)
    for (tag_attrs, group) in groupby(es, grouper):
        if tag_attrs is basestring:
            yield nfc("".join(group))
        else:
            tag, attrs = tag_attrs
            if tag in INLINE_TAG or tag == 'blockquote':
                for x in _coalesce_siblings(tag, attrs, group):
                    if x not in BOGUS_ELEMENTS:
                        yield x
            elif tag == '.block':
                for x in _coalesce_blocks(attrs, group):
                    yield x
            # FIXME(alexander): don't simplify CMD and LIT contents for now...
            # ... this is needed because of the stupid representation of
            # citations, in particular
            elif tag in ('LIT', 'CMD'):
                for x in group:
                    yield x
            else:
                for x in (_coalesce_parent_child(parent) for parent in group):
                    if x in BOGUS_ELEMENTS:
                        continue
                    if tag in H_TAGS:
                        # headings get extra cleanup (may expand to contents)
                        for y in _tidy_heading(*x):
                            yield y
                    else:
                        yield x
def _coalesce_parent_child(parent):
    """Simplify a single element by merging it with redundant children.

    Unwraps bogus leading <p>s in list items/footnotes, hoists a
    pagebreak/blockquote that is the sole child of a bare <p>, and lifts
    liftable span styles (color/background-color) onto the parent.
    """
    tag, attrs, raw_body = parent
    # NOTE(review): the original comment here was truncated ("the tidy below
    # is"); presumably the tidy is needed so the patterns below see a
    # canonical body -- confirm
    body = tidy(raw_body)
    B = Var('B')
    # rationale:
    # <li>
    #   <p>a</p>
    #   <ul>...</ul>
    # </li>
    # should be transformed to:
    # <li>
    #   a
    #   <ul>...</ul>
    # </li>
    DOES_NOT_START_WITH_P = Var(
        'DOES_NOT_START_WITH_P',
        lambda elts: not any(is_p(elt) for elt in elts))
    BODY_WITH_BOGUS_P = Seq[('p', {}, B), DOES_NOT_START_WITH_P:]
    # google docs inserts paragraphs at the darnest places
    # unwrap singleton paragraphs where they don't belong
    # XXX(alexander): consider lifting p attributes
    # like justify class in comprehensive-test
    if tag in ('li', 'dt', 'dd', '.footnote') and body == BODY_WITH_BOGUS_P:
        body = B.val + DOES_NOT_START_WITH_P.val
    elif (tag, attrs) == ('p', {}) and body in ([('.pagebreak', {}, [])],
                                                [('blockquote', {}, B)]):
        # a bare <p> wrapping just a pagebreak/blockquote becomes that child
        (tag, attrs, body), = body
    else:
        # a span style is liftable if it only sets color/background-color
        # that the parent doesn't already set itself
        LIFTABLE_SPAN_STYLE = Var(
            'LIFTABLE_SPAN_STYLE',
            lambda d: not (set(d) - ({'color', 'background-color'} - set(
                attrs.get('style', {})))))
        if body == [('span', {'style': LIFTABLE_SPAN_STYLE}, B)]:
            body = B.val
            attrs = _style_merge(attrs, LIFTABLE_SPAN_STYLE.val)
    return mkel(tag, attrs, body)
def lift_code(para):
    """Turn a <p> consisting solely of <code> elements into one <code>.

    The bodies of the individual <code> children are concatenated; attrs
    on the children are dropped (with a warning).  Anything that isn't an
    all-code paragraph is returned unchanged.
    """
    def is_code(element):
        return element[:1] == ('code', )

    # pylint: disable=C0103
    ALL_CODE = Var('ALL_CODE', lambda xs: all(is_code(x) for x in xs))
    if not (para == ('p', {}, ALL_CODE)):
        return para
    # XXX(ash): maybe should do this coalescing of adjacent `code`
    # bodies in postprocess?
    merged = []
    for code_el in ALL_CODE.val:
        _, attrs, child_body = code_el
        if attrs:
            log.warn('ignoring attrs on code tag %r', code_el)
        merged.extend(child_body)
    return mkel('code', {}, merged)
def _pop_title_and_subtitle(body, head):
    """Pops (sub)titles from `body` and stuffs them into ``head``.

    Accepts either a dedicated <title>/<subtitle> element or an <h1>/<h2>
    carrying the corresponding class; blank gunk following each is culled.
    """
    _, BODY, REST = map(Var, '_, BODY, REST'.split(', '))
    for tag, alt_h in [('title', 'h1'), ('subtitle', 'h2')]:
        if body in (Seq[(tag, {}, Seq[BODY:]), REST:],
                    Seq[(alt_h, {'class': tag}, Seq[BODY:]), REST:]):
            # XXX(alexander): plaintextification of (sub)titles
            title_str = space_normalize(plaintextify(BODY.val))
            if title_str:
                head[tag] = title_str
            del body[0]
        # skip empty paragraphs between title and subtitle and subtitle and
        # meta
        while body and body[0] in [
                Var('_', lambda x: isinstance(x, basestring) and blank(x)),
                ('p', _, Seq(blank_flat_body)[:])
        ]:
            log.warn('Killing blank gunk before metadata')
            del body[0]
def handle_run(self, r):
    """Convert a docx run element `r` into a list of parsed elements.

    Text, tabs, breaks, hyphens, drawings and footnote/endnote references
    are translated; run properties (rPr) are then applied to the result,
    except around footnotes.
    """
    # XXX(ash): pylint is right about this being too complex
    # pylint: disable=R0912
    _ = Var('_')
    ans = []
    rPr = first_of_tag(r, RUN_PROPS_TAG)
    # run properties, when present, are the first child; the actual run
    # content is everything after them
    content = rPr.itersiblings() if rPr is not None else iter(r)
    for e in content:
        # pylint: disable=W0622
        type = e.attrib.get(ns.w('type'))
        if e.tag == TEXT_TAG:
            ans.append(e.text)
        elif e.tag == TAB_TAG:
            # XXX(alexander): this can also work like a '_' or '…' \dotfill
            ans.append('\t')
        elif e.tag in (FOOTNOTE_REF_TAG, ENDNOTE_REF_TAG):
            # XXX(ash): what is going on here
            pass
        elif e.tag == BREAK_TAG and type in ('page', 'column'):
            ans.append(mkel('.pagebreak', {}, []))
        elif e.tag == BREAK_TAG or e.tag == CR_TAG:
            assert (type is None) or (type == 'textWrapping')
            ans.append(mkel('br', {}, []))
        # FIXME, tags below untested
        elif e.tag == SOFT_HYPHEN_TAG:
            ans.append(SOFT_HYPHEN)
        elif e.tag == NON_BREAKING_HYPHEN_TAG:
            ans.append(NON_BREAKING_HYPHEN)
        elif e.tag == ns.w('drawing'):
            ans.extend(
                flatmap(self.transclude,
                        e.xpath(self.IMAGE_XPATH, namespaces=ns.dict)))
        elif e.tag in (FOOTNOTE_REFERENCE_TAG, ENDNOTE_REFERENCE_TAG):
            ans.append(self.make_footnote(e))
        else:
            # movie,
            # rt, ruby, rubyAlign etc. for ruby stuff
            # sym, with special handling for wingdings I guess...
            log.warn('Unknown tag %r', e.tag)
    # don't style runs that lead with a footnote
    if rPr is not None and ans != Seq[Seq['.footnote', _:], _:]:
        ans = self.apply_rpr(rPr, ans)
    return ans
def build_list(cls, tree):
    """Recursively convert a nested list `tree` into ol/ul elements.

    `tree` interleaves (tag-attrs, item) tuples with plain sub-lists;
    consecutive entries with the same (tag, attr) become one list element,
    sub-lists either nest into the preceding <li> or become '.block's.
    Returns None when `tree` is not a list (leaf case).
    """
    _ = Var('_')
    if isinstance(tree, list):
        ans = []
        # sub-lists get the sentinel key (_, _) so they never group with
        # real (tag, attr) entries
        for (tag, attr), body in itertools.groupby(
                tree, lambda x: (_, _) if isinstance(x, list) else x[0]):
            this_body = []
            if tag is _:
                body, = body
                ans.append(mkel('.block', {}, cls.build_list(body)))
            else:
                for x in body:
                    if isinstance(x, list):
                        # nested list: splice into the previous <li>
                        # NOTE(review): this assumes a nested list never
                        # appears before any <li> in its group -- confirm
                        item = cls.build_list(x)
                        this_body[-1][2].extend(item)
                    else:
                        item = [x[1]]
                        this_body.append(mkel('li', {}, item))
                ans.append(mkel(tag, attr, this_body))
        return ans
def extract_header(elems):
    """Decide whether the table cells `elems` form a header row.

    Returns ``(True, attrs_per_cell)`` if every cell is either blank or a
    bolded paragraph, else ``(False, [])``.
    """
    attrs = []
    for elem in elems:
        _TAG, TATTRS, PATTRS, _SATTRS, _BATTRS, TBODY = map(
            Var, "_TAG, TATTRS, PATTRS, _SATTRS, _BATTRS, TBODY".split(', '))
        _BLANK_BODY = Var('_BLANK', blank)
        if elem == (_TAG, TATTRS, _BLANK_BODY):
            # empty cell - accept, but do not propagate attrs, apart
            # from background-color
            bg = TATTRS.val.get('style', {}).get('background-color')
            attrs.append({})
            if bg:
                iadd_style(attrs[-1], 'background-color', bg)
        elif elem in ((_TAG, TATTRS, [
                ('p', PATTRS, [('span', _SATTRS, [('b', _BATTRS, TBODY)])])
        ]), (_TAG, TATTRS, [('p', PATTRS, [('b', _BATTRS, TBODY)])])):
            # bold cell (with or without an intervening span)
            attrs.append(merge_attrs(TATTRS.val, PATTRS.val))
        else:
            # not header set
            return False, []
    return True, attrs
def handle_fragment(fragment, indent, transclusions, h_shift, epub_clean,
                    bibliography):
    """Render one parsed fragment (string or (tag, attrs, body)) as HTML.

    Strings are escaped; elements are normalized (figures, tables, titles,
    commands, headings, pseudo '[tag].class' tags, errors) and then
    serialized with a template chosen by tag kind (void/inline/block).
    """
    # pylint: disable=R0911,R0914,R0912,R0913,R0915
    # FIXME(alexander): clean this up a bit, and get rid of pylint muffles
    if isinstance(fragment, basestring):
        return cgi.escape(fragment)
    (tag, attrs, content) = fragment
    if tag in ['script', 'style'] and content:
        content_str, = content
        return NOT_INLINE_TEMPLATE % dict(
            indent=indent,
            tag=tag,
            attrs_str=encode_attrs(attrs, transclusions, epub_clean),
            content_str=_indent(
                '\n' + maybe_cdatafy(_indent(content_str.strip('\n'), ' ')),
                indent))
    if tag == 'pre':
        return '\n' + highlight.as_html(fragment)
    # special case figures and tables
    if tag == 'figure':
        style = attrs['style'].copy()
        width = style.pop('width', '100%')
        attrs = dict(attrs.items(), style=style)
        # FIXME(alexander): dirty hacks to fixup caption & width
        img = content[-1]
        assert img[0] == 'img'
        img[1].setdefault('style', OrderedDict())['width'] = width
        # put figcaption towards end
        if content[0][0] == 'figcaption':
            content[0], content[-1] = content[-1], content[0]
        if style['display'] == 'inline':
            ATTRS = Var('ATTRS')  # pylint: disable=C0103
            assert content[:1] == [('img', ATTRS, [])], \
                "figure does not begin with an img"
            attrs = add_class(ATTRS.val, 'margin')
            # peel of the figure tag for inlined stuff
            # as a hack to make epub/html validate
            # (figures can't occur in all contexts imgs can)
            return handle_fragments([('img', attrs, [])],
                                    bibliography=bibliography,
                                    indent=indent,
                                    transclusions=transclusions,
                                    h_shift=h_shift,
                                    epub_clean=epub_clean)
    elif tag == 'table':
        colgroups = [el for el in content if el[0] == 'colgroup']
        COLS = Var("COLS")  # pylint: disable=C0103
        assert colgroups == [('colgroup', {}, COLS)], \
            "Expected single colgroup in table %s" % content
        # FIXME(alexander): this deepcopy is a lazy hack so we can mutate away
        # imperatively propagate table cell alignment down
        # this is a pretty horrible hack and would blow
        # up nastily if there is attribute aliasing,
        # but deepcopying should kinda make it work
        content = copy.deepcopy(content)
        _propagate_alignment(content, COLS.val)
    elif tag == 'col':
        if not epub_clean:
            # epub doesn't allow style on <col>, html wants width directly
            attrs = attrs.copy()
            attrs['width'] = attrs['style']['width']
            del attrs['style']
    # cull
    ## return handle_fragments(content, indent)
    # FIXME(alexander): might make more sense to filter (or h-ify) these out
    # elsewhere, but for now this seems not unreasonable
    elif tag == 'title':
        tag = 'h1'
        attrs = add_class(attrs, 'title')
    elif tag == 'subtitle':
        tag = 'h2'
        attrs = add_class(attrs, 'subtitle')
    elif tag in ('CMD', 'LIT'):
        bad_command = None
        cmd_type, = attrs['class']
        # FIXME(alexander): convert tex to html for non-math;
        # convert tex math to MML for epub
        if cmd_type in ('$', 'tex'):
            tex, = content
            if cmd_type == '$':
                tex = r'\(%s\)' % tex
            return '<span class="tex2jax_process">%s</span>' % cgi.escape(tex)
        elif CITE_REX.match(cmd_type):
            if bibliography:
                bibliography.cited.add(content[0])
                # post = ('[%s]' % content[1] if len(content) > 1 and
                #         content[1] else '')
                # Post is ignored for the moment
                return _format_citation(cmd_type, content[0], bibliography)
            else:
                docerror.docproblem(
                    'Citation exists, but bibliography is missing')
        else:
            bad_command = cmd_type + (':' if content else '')
            docerror.docproblem('Unknown command type:%s' % cmd_type)
    elif epub_clean:
        if tag == 'a' and 'name' in attrs:
            assert len(attrs) == 1
            attrs = {'id': attrs['name']}
        elif tag == 'img':
            # epub disallows width/height attrs on img
            attrs = {k: attrs[k]
                     for k in attrs if k not in ('width', 'height')}
    # FIXME(alexander): support continued-list properly in html, by keeping
    # track of numbers of items per list-id and translating it to start
    if tag in H_TAGS:
        if h_shift:
            tag = 'h%d' % min(len(H_TAGS), max(1, int(tag[1]) + h_shift))
    # generic [tagname].class tags
    if '.' in tag:
        if tag == '.pagebreak':
            tag = 'div.pagebreak'  # for whitespace sanitization
        tagname, classname = tag.split('.', 1)
        tag = tagname or 'span'
        attrs = add_class(attrs, classname)
    if tag == 'CMD' and bad_command:
        tag = 'span'
        attrs = {'class': ['bad-command']}
        content = [('u', {}, [bad_command])] + content
    elif tag == 'ERR':
        tag = 'span'
        attrs = {'class': ['err'], 'title': attrs['info'][0]}
    content_str = handle_fragments(content,
                                   indent=' ' + indent,
                                   transclusions=transclusions,
                                   h_shift=h_shift,
                                   epub_clean=epub_clean,
                                   bibliography=bibliography)
    if tag in VOID_TAGS:
        assert not content
        template = "<%(tag)s%(attrs_str)s/>"
    elif tag in INLINE:
        template = "<%(tag)s%(attrs_str)s>%(content_str)s</%(tag)s>"
    elif '\n' in content_str:
        template = NOT_INLINE_TEMPLATE
    else:
        template = COMPACT_NOT_INLINE_TEMPLATE
    # FIXME(alexander): disgusting hack; fix this properly and
    # use a set representation to start with!
    classes = attrs.get('class')
    if classes:
        attrs = attrs.copy()
        attrs['class'] = sorted(set(classes))
    return template % dict(indent=indent,
                           tag=tag,
                           attrs_str=encode_attrs(attrs, transclusions,
                                                  epub_clean),
                           content_str=content_str)
def latexify(self, ast):
    """Recursively convert a parsed document `ast` into LaTeX source.

    `ast` is either a list of nodes, a plain string (quoted) or a
    (tag, attrs, body) tuple dispatched on below.
    """
    # pylint: disable=E0102,R0914,R0915,R0911,R0912
    if isinstance(ast, list):
        # collapse a trailing paragraph gap
        return re.sub('\n\n$', '\n', join(*map(self.latexify, ast)))
    else:
        node = ast
        if isinstance(node, basestring):
            return quote(node)
        else:
            assert isinstance(node, tuple)
            h, a, b = node
            if h == 'div':
                # canonicalize pseudo-elements
                h = a['class'].pop()
                assert not a['class']
                del a['class']
            if h[:-1] == 'h':
                # headings h1..hN
                if self.am_inside('list') or self.am_inside('table'):
                    return docwarn(
                        self.latexify(b),
                        'Cannot have sections inside lists or tables: %r' %
                        postprocess.plaintextify(b))
                else:
                    with self.inside('section'):
                        if a:
                            log.warn('heading w/ attr %r', a)
                        labels, b = extract_labels(b)
                        return self.section(h, b, labels)
            elif h == 'p':
                ans = nl(self.latexify(b))
                if self.am_inside('.footnote') and self.am_inside('table'):
                    return docwarn(
                        ans, 'Multi-paragraph footnotes in tables are'
                        ' unsupported')
                return nl(ans)
            elif h == 'span':
                return self.latexify(b)  # XXX
            elif h in ('ol', 'ul'):
                ol = partial(self.enumerate_,
                             start=a.get('start'),
                             series=a.get('id'),
                             resume=a.get('data-continue-list'))
                with self.inside('list'):
                    return nl(
                        freshline({'ol': ol,
                                   'ul': itemize}[h](self.latexify(b))))
            elif h == 'li':
                labels, b = extract_labels(b)
                labelling = (join(*(map(mklabel, labels) + [' ']))
                             if labels else '')
                return join(freshline(cmd('item')), labelling,
                            self.latexify(b))
            elif h == 'table':
                nested_table = self.am_inside('table')
                with self.inside('table'):
                    # pylint: disable=C0103
                    CLASS_TO_SPEC = {'left': 'P', 'center': 'C',
                                     'right': 'R', 'justify': 'N'}
                    b = b[:]
                    tablecaption = None
                    if b[0][0] == 'caption':
                        with self.inside('caption'):
                            tablecaption = self.latexify(b[0][2])
                        del b[0]
                    colgroup = [el for el in b if el[0] == 'colgroup']
                    rows = [el for el in b if el[0] == 'tr']
                    assert len(colgroup) == 1, \
                        "Expected single colgroup in table %s" % b
                    cols = colgroup[0][2]
                    colspecs = []
                    for col_h, col_a, col_b in cols:
                        if col_h != 'col':
                            break
                        assert not col_b
                        coltype = 'P'
                        for cls in CLASS_TO_SPEC:
                            if cls in col_a.get('class', []):
                                coltype = CLASS_TO_SPEC[cls]
                        # every column spec carries an explicit width
                        coltype = "%s{%s}" % (coltype, textwidth_percent(
                            col_a['style']['width']))
                        colspecs.append(coltype)
                    rows = "\\tabularnewline\n".join(
                        map(self.latexify, rows))
                    if nested_table and tablecaption:
                        docproblem(
                            "Tables within tables can't have captions;"
                            " outputing caption as normal text",
                            level='warning')
                        ans = join(nl(table(colspecs, rows)), tablecaption)
                    else:
                        ans = table(colspecs, rows, tablecaption)
                # flush deferred footnotetexts once we're out of the float
                if self.post_float_yuck and not self.am_inside('table'):
                    ans = join(ans, *self.post_float_yuck)
                    del self.post_float_yuck[:]
                return ans
            elif h == 'col':
                # FIXME
                assert False, "Unexpected col"
            elif h == 'tr':
                return " & ".join(map(self.latexify, b))
            elif h == 'td':
                if 'headcol' in a.get('class', []):
                    return colh(self.latexify(b))
                return self.latexify(b)
            elif h == 'th':
                if 'headcol' in a.get('class', []):
                    return rowh(colh(self.latexify(b)))
                return rowh(self.latexify(b))
            elif h == 'figure':
                b = b[:]
                if b[0][0] == 'figcaption':
                    with self.inside('caption'):
                        figcaption = self.latexify(b[0][2])
                    del b[0]
                else:
                    figcaption = None
                assert len(b) == 1 and b[0][0] == 'img'
                img = b[0][1]['src']
                inline = False
                warns = []
                if a['style']['display'] == 'inline':
                    if self.am_inside('table'):
                        warns.append([
                            'Margin figures not supported in tables, '
                            'inserting into table cell'
                        ])
                    else:
                        inline = True
                if inline:
                    if figcaption:
                        warns.append([
                            'Ignoring figcaption for inline figure:'
                            ' "%s"', figcaption
                        ])
                    ans = marginfigure(img=img)
                else:
                    fakecaption = figcaption and self.am_inside('table')
                    if fakecaption:
                        warns.append([
                            "Figures in tables can't have captions; "
                            "outputing caption as normal text"
                        ])
                    # inside blockquotes more complicated figure
                    # environments don't seem to work reliably
                    rawincludegraphics = self.am_inside('blockquote')
                    ans = figure(img=img,
                                 classes=a.get('class', []),
                                 width=a['style']['width'],
                                 figcaption=figcaption,
                                 fakecaption=fakecaption,
                                 rawincludegraphics=rawincludegraphics)
                if self.post_float_yuck and not self.am_inside('table'):
                    ans = join(ans, *self.post_float_yuck)
                    del self.post_float_yuck[:]
                return ans if not warns else docwarns(ans, *warns)
            elif h == 'img':
                assert False, 'unexpected image'
            elif h == 'a':
                if 'name' in a:
                    # we can't do that blindly, because we want to
                    # generate labels for things like lists and headings
                    # this is only a fallback for anchors outside of
                    # 'labelled' envs
                    return cmd('hypertarget', [],
                               [a['name'].lstrip('#'), ''])
                elif 'href' in a:
                    if a['href'].startswith('#'):
                        return cmd('hyperref',
                                   [latexify_href(a['href'][1:])],
                                   [self.latexify(b)])
                    ##
                    # XXX(alexander): handle bare urls specially, because
                    # we want more relaxed linebreaking rules for them.
                    # Note that we're not using \url directly, because
                    # it's not robust and also can't cope with certain
                    # arguments, such as unbalanced '{'/'}'s. Also, even
                    # with fairly aggressive hyphenization params, this is
                    # in in itself not enough to resolve all overfull hbox
                    # issues with urls, although it's not 100% clear to me
                    # why.
                    elif b and a['href'] in (b[0], url_fix(b[0])):
                        # XXX(alexander): use url_fixed version here?
                        return urldef(a['href'], self.urldefs)
                    else:
                        ans = cmd(
                            'href', [],
                            [latexify_href(a['href']),
                             self.latexify(b)])
                        if b[0].startswith('http'):
                            ans = docwarn(
                                ans, 'Suspicious link with body/href'
                                ' mismatch: %r != %r' %
                                (a['href'].encode('utf-8'), b[0]))
                        return ans
                else:
                    assert False, 'Malformed link: %s' % ((h, a, b), )
            elif h == 'aside':
                return cmd('comment', [], [self.latexify(b)])
            elif h in ('b', 'i', 'u', 's'):
                assert not a, 'unexpected <%s %r' % (h, a)
                return self.handle_emphasis(h, b)
            elif h == 'code':
                #FIXME: write something more specialized
                return cmd('texttt', [], [self.latexify(b)])
            elif h == 'sup':
                return cmd('textsuperscript', [], [self.latexify(b)])
            elif h == 'sub':
                return cmd('textsubscript', [], [self.latexify(b)])
            elif h == '.footnote':
                with self.inside('.footnote'):
                    if self.am_inside('caption'):
                        # footnotes can't live inside floats; defer the
                        # \footnotetext until after the float is closed
                        self.post_float_yuck.append(
                            cmd('footnotetext', [], [self.latexify(b)]))
                        return cmd(r'protect\footnotemark', [], [])
                    else:
                        return cmd('footnote', [], [self.latexify(b)])
            elif h == '.pagebreak':
                return nl(cmd('clearpage', [], [self.latexify(b)]))
            elif h == 'br':
                assert a == {}
                assert b == []
                return nl(cmd('newline'))
            elif h == 'blockquote':
                with self.inside('blockquote'):
                    return blockquote(self.latexify(b))
            elif (h == 'footer' and b == [Seq['cite', :]]
                  and self.am_inside('blockquote')):
                return nl(cmd('attrib', [], [self.latexify(b[0][2])]))
            elif node == ('CMD', {'class': ['$']}, b):
                return join('$', b[0], '$')
            elif node == ('CMD', {
                    'class': [Var('CITE', CITE_REX.match)]
            }, b):
                return self.munge_cite(node, b)
            elif node == ('CMD', {'class': ['tex']}, b):
                return b[0]
            elif h in ('CMD', 'LIT'):
                return self.bad_command(*node)
            elif h == 'pre':
                return highlight.as_latex(node)
            elif h == 'wbr':
                return '{}'
            else:
                #FIXME(alexander): set 1 as error-code?
                log.error('Unexpected tag: %s %r %r', h, a, b)
                return join("")
def parse_body(xml, context, normalize_transclusion):
    """Recursively parse ODF body elements, yielding parsed elements.

    Each child of `xml` is mapped to a (tag, attrs, body) element (or a
    plain string); presentation styles resolved via `context.stys` are
    promoted to tags (b/i/u/s/sub/sup/code) or attrs where possible, and
    anything left over is logged as ignored.  Images are rewritten via
    `normalize_transclusion`.
    """
    # pylint: disable=R0912,R0915,R0914
    for e in xml:
        text = (e.text or '')
        tail = (e.tail or '')
        # some style properties should be promoted to tags, e.g. underlining
        # and bolding
        tags_from_style = []
        stys_dealt_with = []
        if e.tag in (S_TAG, TAB_TAG):
            # <text:s>/<text:tab> expand to c repeated spaces/tabs
            yield ' \t'[e.tag == TAB_TAG] * int(e.attrib.get(
                ns.text('c'), '1'))
            if tail:
                yield tail
            continue
        if e.tag == LINEBREAK_TAG:
            yield mkel('br', {}, [])
            continue
        sty = context.stys.get(
            e.get(STYLE_NAME_ATTR) or e.get(TABLE_STYLE_NAME_ATTR))
        # handle page breaks
        if sty and sty.par_break:
            assert e.tag in (H_TAG, P_TAG), \
                "Unexpected page-break in %r" % e.tag
            yield mkel('.pagebreak', {}, [])
            stys_dealt_with.append('par_break')
        # Handle lists specially
        if e.tag == LIST_TAG:
            new_context = context.bump_list_level(sty)
            stys_dealt_with.append('sub_list_styles')
        else:
            new_context = context
        body = list(parse_body(e, new_context, normalize_transclusion))
        assert type(body) is list and not body or type(body[0]) is not list
        attrs = {}
        if text:
            body = [text] + body
        if sty and sty.type.endswith('title'):
            # title/subtitle styles force plaintext content
            head = sty.type
            body = [plaintextify(body)]
            sty = None
        elif e.tag == H_TAG:
            # skip empty headings; NB: this *must* happen
            # after we extracted eventual page-breaks, which are the only
            # useful information empty headings can contain
            if blank(body):
                continue
            head = sty.type
            # FIXME(alexander): keep track of the headings breadcrumbs in
            # context for two reasons
            #
            # 1. to associate errors with specific headings
            # 2. to warn about bad structure e.g. h1 followed by h4,
            #    rather than h2
        elif e.tag == LIST_TAG:
            head = new_context.list_type
            assert head in ('ol', 'ul')
            list_start = new_context.list_start
            if list_start is not None:
                assert head == 'ol'
                attrs['start'] = str(list_start)
            id_ = e.attrib.get(ns.xml('id'))  # pylint: disable=E1101
            if id_ is not None:
                attrs['id'] = id_
            continues = e.attrib.get(ns.text('continue-list'))
            if continues is not None:
                # make this a data attrib, so we can stuff it
                # into the html, which doesn't have direct support
                attrs['data-continue-list'] = continues
        elif e.tag == LIST_ITEM_TAG:
            head = 'li'
        elif e.tag == ANNOTATION_TAG:
            head = 'aside'
        elif e.tag in (CREATOR_TAG, NOTE_CITATION_TAG, BOOKMARK_END_TAG):
            #FIXME: extract content
            if text:
                log.warning('Hey, someone actually specified a %s: %s',
                            e.tag, text)
            if tail:
                yield tail
            continue
        elif e.tag == NOTE_TAG:
            # other valid option is 'endnote'
            assert e.attrib[ns.text('note-class')] == 'footnote'
            # skip ahead and exit early; we only represent the note-body
            assert len(e) == 2 and e[1].tag == NOTE_BODY_TAG
            assert len(body) == 1
            yield body[0]
            if tail:
                yield tail
            continue
        elif e.tag == NOTE_BODY_TAG:
            head = '.footnote'
            # FIXME(alexander): sucky hack to strip the bogus whitespace
            # google docs enters at the beginning of a footnote for some
            # reason. I should really write a more generic whitespace
            # stripping mechanism in the postprocess module that can
            # recognize consecutive whitespace even if seperated-by/wrapped-in
            # inline tags.
            _, B1, B2, = map(Var, '_, B1, B2'.split(', '))
            SPACED_STR = Var(
                'SPACED_STR',
                lambda s: (isinstance(s, basestring)
                           and re.match(r'\s+', s)))
            if body == Seq[('p', _, Seq[SPACED_STR, B2:]), B1:]:
                body[0][2][0] = SPACED_STR.val.lstrip()
        # FIXME(alexander): add anchors for all paras
        elif e.tag == P_TAG:
            # indented paragraphs become '.block's
            margin = sty.margin_left or sty.text_indent if sty else None
            indent_level = in_indents(margin) if margin else 0
            if indent_level:
                head = '.block'
                attrs['indent'] = indent_level
            else:
                head = 'p'
        #FIXME styled links etc. gdocs might not use that...
        #... but we should be able to handle non-span bolding etc.
        elif e.tag == SPAN_TAG:
            # XXX: order can matter; we need
            # <b><u>command</u><b>
            # not
            # <u><b>command</b><u>
            #
            # but more generally the minimal coalescing of abutting partially
            # overlapping styles is something that needs to be thought about
            # properly at some point.
            for attr, on_values, html_tags in [
                    ('underline', [True], ['u']),
                    ('font_weight', ['bold'], ['b']),
                    ('font_style', ['italic'], ['i']),
                    ('line_through', [True], ['s']),
                    ('text_position', ['sub', 'super'], ['sub', 'sup'])
            ]:
                value = getattr(sty, attr, None)
                if value:
                    if value not in on_values:
                        log.error("Bad value for %s: %s in %s", attr, value,
                                  e.tag)
                        continue
                    tags_from_style.append(
                        html_tags[on_values.index(value)])
                    stys_dealt_with.append(attr)
            if is_code_font(sty.font_family):
                tags_from_style.append('code')
                stys_dealt_with.append('font_family')
            head = 'span'
        elif e.tag == A_TAG:
            assert e.attrib[ns.xlink('type')] == 'simple'
            head = 'a'
            attrs = dict(href=e.attrib[HREF_ATTR])
            # FIXME the in 'span' check is a bit too general, should use
            # something else to markup textcolor
            body = tidy(whack(lambda x: x in ('span', 'u'), body))
        elif e.tag == BOOKMARK_START_TAG:
            head = 'a'
            attrs = dict(name=e.attrib[TEXT_NAME_ATTR])
            assert (blank(text) and blank(tail)
                    and next(e.itersiblings()).tag == BOOKMARK_END_TAG)
        elif e.tag == TABLE_TAG:
            head = 'table'
            body = parse_table_body(body)
        elif e.tag == TABLE_ROW_TAG:
            head = 'tr'
        elif e.tag == TABLE_CELL_TAG:
            head = 'td'
        #FIXME repetition via table:number-columns-repeated
        #FIXME handle column-groups
        elif e.tag == TABLE_COLUMN_TAG:
            head = 'col'
            sty = context.stys.get(e.attrib.get(ns.table('style-name')))
            if sty and sty.width is not None:
                # XXX this isn't really the column width
                # since google moronically saves this even
                # if set column width is turned off thank you google!
                attrs = dict(style=OrderedDict(width=sty.width))
                stys_dealt_with.append('width')
        elif e.tag == FRAME_TAG:
            # XXX: try to find caption
            # FIXME(alexander): keep figures/tables with captions in context,
            # so that we can produce a lot/loi; add an id for all of them
            inline = e.attrib[ns.text('anchor-type')] == 'as-char'
            width = (
                e.attrib.get(ns.svg('width'))  # pylint: disable=E1101
                or e.attrib[ns.style('rel-width')])
            # FIXME(alexander): should handle all these, in theory:
            # <http://www.w3.org/TR/SVG11/struct.html#SVGElementWidthAttribute>
            # ("em" | "ex" | "px" | "in" | "cm" | "mm" | "pt" | "pc" )
            assert width.endswith('cm'), \
                'Expected figure width in cm, got %s' % width
            relwidth = float(width[:-2]) / context.stys.textwidth
            head, attrs, body = make_figure(
                relwidth=relwidth,
                inline=inline,
                # FIXME(alexander): the body[0][1] to access the image
                # will blow up on leading whitespace in the body
                body=list(x for x in body
                          if not (isinstance(x, basestring) and blank(x))),
                src=body[0][1]['src'],
                original_href=e.find(ns.draw('image')).get(ns.xlink('href')))
        elif e.tag == IMAGE_TAG:
            head = 'img'
            attrs = dict(src=normalize_transclusion(e.attrib[HREF_ATTR]))
        else:
            log.warning('Ignoring tag %s', e.tag)
            continue
            # FIXME raise RuntimeError('Unexpected tag: %s' % e.tag)
        # wrap the body in the tags promoted from styles, innermost first
        sty_tagged = reduce(lambda parsed, tag: [mkel(tag, {}, parsed)],
                            tags_from_style, tidy(body))
        if sty:
            if sty.text_align:
                stys_dealt_with.append('text_align')
                attrs = add_class(attrs, sty.text_align)
            if sty.background_color:
                stys_dealt_with.append('background_color')
                iadd_style(attrs, 'background-color', sty.background_color)
            if sty.color:
                stys_dealt_with.append('color')
                iadd_style(attrs, 'color', sty.color)
        if e.tag == LIST_TAG:
            if new_context.list_style_type:
                attrs = add_class(attrs, new_context.list_style_type)
        # FIXME additional tidy
        parsed = mkel(head, attrs, sty_tagged)
        if head == 'span' and 'style' in attrs:
            # prefer <code><span style=...>...</span></code> nesting
            B = Var('B')
            if parsed == ('span', attrs, [('code', {}, B)]):
                parsed = mkel('code', {}, [('span', attrs, B.val)])
        leftover_styles = sty and set(
            sty.active_props()) - set(stys_dealt_with)
        if leftover_styles:
            log.warn('Ignoring style elements: %r in %r "%s"',
                     ([(k, getattr(sty, k)) for k in leftover_styles]), head,
                     plaintextify(body))
        preprocess.maybe_anchorize_id(head, attrs, sty_tagged)
        yield parsed
        if tail:
            yield tail
def parse_chunk(s, handle_data_url=None):
    """Parse the HTML fragment `s` and return its parsed <body> contents."""
    BODY = Var('BODY')
    tree = parse_body([parse_html_frag(s).find('body')], handle_data_url)
    assert [('body', {}, BODY)] == tree, 'No body in %r' % (tree, )
    return BODY.val