def handle_starttag(self, tag, attrs):
    """
    Build a DocNode for an opening HTML tag.

    Tags contained in IGNORE_TAGS are dropped. A tag matching
    headline_tag_re becomes a "headline" node whose level is the integer
    value of the first regex group. "ul" and "ol" increment the list
    nesting counter, and "li"/"ul"/"ol" nodes receive that counter as
    their level. Every node created here becomes the current node, except
    for "img" and "br" nodes.
    """
    self.debug_msg("starttag", "%r atts: %s" % (tag, attrs))

    if tag in IGNORE_TAGS:
        return

    headline_match = headline_tag_re.match(tag)
    if headline_match:
        heading_level = int(headline_match.group(1))
        self.cur = DocNode("headline", self.cur, level=heading_level)
        return

    if tag in ("img", "br"):
        # Workaround for img/br tags that are not marked as start-end tags:
        #   wrong: <img src="/image.jpg">   (no closing </img> will follow)
        #   right: <img src="/image.jpg" />
        # The node is attached, but it does not become the current node.
        DocNode(tag, self.cur, None, attrs)
        return

    if tag in ("ul", "ol"):
        # Entering a new list: one nesting level deeper.
        self.__list_level += 1

    if tag in ("li", "ul", "ol"):
        self.cur = DocNode(
            tag, self.cur, None, attrs, level=self.__list_level
        )
    else:
        self.cur = DocNode(tag, self.cur, None, attrs)
def _image_repl(self, groups):
    """
    Handle images and attachments included in the page.

    Creates an "image" node whose content is the stripped image target and
    gives it a "text" child. The child holds the stripped image text, or
    the image node's content when no text was given.
    """
    image_target = groups.get("image_target", "").strip()

    alt_text = groups.get("image_text", "") or ""
    alt_text = alt_text.strip()

    image_node = DocNode("image", self.cur, image_target)
    DocNode("text", image_node, alt_text or image_node.content)

    # Following characters must start a fresh text node.
    self.text = None
def _item_repl(self, groups):
    """
    Handle a single list item.

    The last character of the bullet ("item_head") selects the list kind
    ("#" means numbered, anything else bulleted) and the bullet length
    determines the nesting level. An existing list of the same kind and
    level is reused; otherwise a new list node is opened. The item text is
    then parsed as inline markup inside a new "list_item" node.
    """
    bullet = groups.get("item_head", "")
    item_text = groups.get("item_text", "")

    kind = "number_list" if bullet[-1] == "#" else "bullet_list"
    level = len(bullet) - 1

    list_kinds = ("number_list", "bullet_list")
    boundary_kinds = ("document", "section", "blockquote")

    # Walk up the tree until a list of the wanted level or a block
    # boundary is reached (or the tree runs out).
    candidate = self.cur
    while candidate:
        if candidate.kind in list_kinds and candidate.level == level:
            break
        if candidate.kind in boundary_kinds:
            break
        candidate = candidate.parent

    if candidate and candidate.kind == kind:
        # Continue the list that is already open.
        self.cur = candidate
    else:
        # Open a new level of list.
        anchor = self._upto(
            self.cur, ("list_item", "document", "section", "blockquote")
        )
        new_list = DocNode(kind, anchor)
        new_list.level = level
        self.cur = new_list

    item_node = DocNode("list_item", self.cur)
    item_node.level = level + 1
    self.cur = item_node

    self.parse_inline(item_text)
    self.text = None
def _add_macro(self, groups, macro_type, name_key, args_key, text_key=None):
    """
    Generic macro handler shared by all variants: inline, inline-tag, block.

    Creates a node of kind ``macro_type`` ("macro_inline" or "macro_block")
    below the current node. Its content is the stripped value of
    ``groups[text_key]`` when ``text_key`` is given, else None. The macro
    name (``groups[name_key]``) and the stripped arguments
    (``groups[args_key]``) are stored on the node, and the name is also
    added to ``self.root.used_macros``.
    """
    assert macro_type in ("macro_inline", "macro_block")

    macro_text = groups.get(text_key, "").strip() if text_key else None
    macro_node = DocNode(macro_type, self.cur, macro_text)

    name = groups[name_key]
    macro_node.macro_name = name
    self.root.used_macros.add(name)

    macro_node.macro_args = groups.get(args_key, "").strip()

    self.text = None
def _text_repl(self, groups):
    """
    Handle a run of plain text.

    Leaves table and list containers first, opens a "paragraph" node when
    the current node is a block container, then parses the text as inline
    markup. When the "break" group matched and the current node is one of
    paragraph/emphasis/strong/pre_inline, a "break" node is appended and
    remembered in ``self.last_text_break``.
    """
    container_kinds = ("table", "table_row", "bullet_list", "number_list")
    block_kinds = ("document", "section", "blockquote")
    breakable_kinds = ("paragraph", "emphasis", "strong", "pre_inline")

    if self.cur.kind in container_kinds:
        self._upto_block()

    if self.cur.kind in block_kinds:
        self.cur = DocNode("paragraph", self.cur)

    line = groups.get("text", "")
    if groups.get("space"):
        # Wikipedia-style line breaks: separate a new line with one space.
        line = " " + line

    self.parse_inline(line)

    wants_break = groups.get("break")
    if wants_break and self.cur.kind in breakable_kinds:
        self.last_text_break = DocNode("break", self.cur, "")

    self.text = None
def _pre_block_repl(self, groups):
    """
    Handle a preformatted block.

    Each match of ``self.pre_escape_re`` in the block text is replaced by
    its "indent" and "rest" groups joined together. The result is stored
    in a new "pre_block" node, whose ``sect`` attribute is the block kind
    or an empty string.
    """
    self._upto_block()

    block_kind = groups.get("pre_block_kind", None)
    raw_text = groups.get("pre_block_text", "")

    # Keep only the indent and the rest of each escaped match.
    cleaned_text = self.pre_escape_re.sub(
        lambda match: match.group("indent") + match.group("rest"),
        raw_text,
    )

    pre_node = DocNode("pre_block", self.cur, cleaned_text)
    pre_node.sect = block_kind or ""

    self.text = None
def _url_repl(self, groups):
    """
    Handle raw URLs in text.

    An escaped URL is appended to the current text node as plain text
    (the text node is created first if there is none). An unescaped URL
    becomes a "link" node with a "text" child that repeats the target.
    """
    if groups.get("escaped_url"):
        # Escaped URL: render it as ordinary text.
        if self.text is None:
            self.text = DocNode("text", self.cur, "")
        self.text.content += groups.get("url_target")
        return

    # Unescaped URL: build a link whose label is the URL itself.
    link_node = DocNode("link", self.cur)
    link_node.content = groups.get("url_target", "")
    DocNode("text", link_node, link_node.content)
    self.text = None
def handle_startendtag(self, tag, attrs):
    """
    Build a DocNode for a self-closing HTML tag.

    For the block/inline placeholder tags the node kind is
    "<tag>_<type attribute>" and the content is the ``self.blockdata``
    entry selected by the integer "id" attribute. Any other tag becomes a
    plain node carrying its attributes. The current node is never changed.
    """
    self.debug_msg("startendtag", "%r atts: %s" % (tag, attrs))
    attr_dict = dict(attrs)

    placeholders = (self._block_placeholder, self._inline_placeholder)
    if tag not in placeholders:
        DocNode(tag, self.cur, None, attrs)
        return

    data_index = int(attr_dict["id"])
    node_kind = "%s_%s" % (tag, attr_dict["type"])
    DocNode(node_kind, self.cur, content=self.blockdata[data_index])
def _inline_mark(self, groups, key):
    """
    Handle an inline markup element of kind ``key``.

    Opens a node of that kind, parses ``groups["<key>_text"]`` as inline
    markup inside it, then moves the current node to the parent of the
    nearest ``key`` node found by ``self._upto``.
    """
    self.cur = DocNode(key, self.cur)
    self.text = None

    inner_text = groups["%s_text" % key]
    self.parse_inline(inner_text)

    # Close the element: step out to the parent of the opened node.
    opened_node = self._upto(self.cur, (key,))
    self.cur = opened_node.parent
    self.text = None
def _link_repl(self, groups):
    """
    Handle all kinds of links.

    Creates a "link" node whose content is the link target, makes it the
    current node while the stripped link text is run through
    ``self.link_re`` / ``self._replace``, then restores the previous
    current node.
    """
    link_target = groups.get("link_target", "")
    link_text = groups.get("link_text", "") or ""
    link_text = link_text.strip()

    previous_node = self.cur

    link_node = DocNode("link", previous_node)
    link_node.content = link_target
    self.cur = link_node
    self.text = None

    # Called for the side effects of self._replace; the substituted
    # string itself is not used.
    re.sub(self.link_re, self._replace, link_text)

    self.cur = previous_node
    self.text = None
def _table_repl(self, groups):
    """
    Handle one table row.

    Reuses the enclosing "table" node or opens a new one, adds a
    "table_row" to it and creates one "table_cell" or "table_head" node
    per match of ``self.cell_re``. Each cell's text is parsed as inline
    markup. Afterwards the table node is the current node again.
    """
    row_markup = groups.get("table", "|").strip()

    self.cur = self._upto(
        self.cur, ("table", "document", "section", "blockquote")
    )
    if self.cur.kind != "table":
        self.cur = DocNode("table", self.cur)

    table_node = self.cur
    row_node = DocNode("table_row", table_node)

    for cell_match in self.cell_re.finditer(row_markup):
        cell_markup = cell_match.group("cell")
        if cell_markup:
            cell_text = cell_markup.strip()
            self.cur = DocNode("table_cell", row_node)
            self.text = None
        else:
            # Header cell: drop the surrounding "=" markers and spaces.
            cell_text = cell_match.group("head").strip("= ")
            self.cur = DocNode("table_head", row_node)
            self.text = DocNode("text", self.cur, "")
        self.parse_inline(cell_text)

    self.cur = table_node
    self.text = None
def __init__(self, debug=False):
    """
    Set up the HTML parser state.

    With ``debug`` enabled a warning is issued and ``self.result`` is a
    DebugList; otherwise it is a plain list. The document tree starts
    with a single "document" root node, which is also the current node.
    """
    HTMLParser.__init__(self)

    self.debugging = debug
    if not self.debugging:
        self.result = []
    else:
        warnings.warn(
            message="Html2Creole debug is on! warn every data append.")
        self.result = DebugList(self)

    self.blockdata = []

    document_root = DocNode("document", None)
    self.root = document_root
    self.cur = document_root

    # Nesting depth of the list currently being parsed.
    self.__list_level = 0
def __init__(self, raw, block_rules=None, blog_line_breaks=True):
    """
    Set up the markup parser state.

    ``raw`` must be an instance of TEXT_TYPE. When ``block_rules`` is not
    given, a BlockRules object is created with ``blog_line_breaks``. The
    block regex is compiled from the rules joined with "|" using the
    rules' ``re_flags``.
    """
    assert isinstance(raw, TEXT_TYPE)
    self.raw = raw

    rules = block_rules
    if rules is None:
        rules = BlockRules(blog_line_breaks=blog_line_breaks)

    # Set up the block element rules.
    block_pattern = "|".join(rules.rules)
    self.block_re = re.compile(block_pattern, rules.re_flags)

    self.blog_line_breaks = blog_line_breaks

    document_root = DocNode("document", None)
    self.root = document_root
    self.cur = document_root      # The most recent document node
    self.text = None              # The node to add inline characters to
    self.last_text_break = None   # Last break node, inserted by _text_repl()

    # Filled with the names of all macros used in the text.
    self.root.used_macros = set()
def _line_repl(self, groups):
    """
    Transfer a newline from the original markup into the HTML code.

    Calls ``self._upto_block()`` and then adds a "line" node with empty
    content below the current node. ``groups`` is not used.
    """
    self._upto_block()
    parent_node = self.cur
    DocNode("line", parent_node, "")
def _separator_repl(self, groups):
    """
    Add a "separator" node below the current node.

    ``self._upto_block()`` is called first. ``groups`` is not used.
    """
    self._upto_block()
    parent_node = self.cur
    DocNode("separator", parent_node)
def handle_entityref(self, name):
    """
    Add an "entityref" node holding the entity name to the current node.
    """
    debug_text = "%r" % name
    self.debug_msg("entityref", debug_text)
    DocNode("entityref", self.cur, content=name)
def handle_data(self, data):
    """
    Add a "data" node holding the text content to the current node.

    Data that is an instance of BINARY_TYPE is passed through
    ``unicode()`` before it is stored.
    """
    self.debug_msg("data", "%r" % data)

    # NOTE(review): `unicode` is not a Python 3 builtin; presumably it is
    # provided by a compatibility import elsewhere in the file - confirm.
    content = unicode(data) if isinstance(data, BINARY_TYPE) else data

    DocNode("data", self.cur, content=content)
def _head_repl(self, groups):
    """
    Handle a headline.

    Adds a "header" node containing the stripped "head_text" group. Its
    level is the length of the "head_head" group.
    """
    self._upto_block()

    title = groups["head_text"].strip()
    header_node = DocNode("header", self.cur, title)
    header_node.level = len(groups["head_head"])

    self.text = None
def _char_repl(self, groups):
    """
    Append the matched "char" group to the current text node.

    A new, empty "text" node is created below the current node when no
    text node is open.
    """
    text_node = self.text
    if text_node is None:
        text_node = DocNode("text", self.cur, "")
        self.text = text_node

    text_node.content += groups.get("char", "")
def _linebreak_repl(self, groups):
    """
    Add a "break" node (content None) below the current node.

    The open text node is dropped afterwards. ``groups`` is not used.
    """
    parent_node = self.cur
    DocNode("break", parent_node, None)
    self.text = None
def _pre_inline_repl(self, groups):
    """
    Add a "pre_inline" node holding the "pre_inline_text" group.

    The open text node is dropped afterwards.
    """
    inline_text = groups.get("pre_inline_text", "")
    DocNode("pre_inline", self.cur, inline_text)
    self.text = None