def parse(source, prefixes=None, model=None, encoding=None, use_xhtml_ns=False):
    '''
    Build an Amara Bindery tree from an input source containing HTML text.

    Warning: a string argument must be a byte string, not a Unicode object.
    If the string is not obviously XML or HTML (for instance, it could be
    mistaken for a file name), consider wrapping it with
    amara.lib.inputsource.text first.

    source - the HTML input; it is wrapped with inputsource() and its
             stream is handed to the html5lib parser
    prefixes - optional namespace prefix information; when truthy it is
               applied to the result with set_namespaces()
    model - optional model; when truthy its clone method creates the
            document entity, otherwise the module-level entity is used
    encoding - forwarded unchanged to the html5lib parser's parse() call
    use_xhtml_ns - boolean, whether or not to use
                   http://www.w3.org/1999/xhtml; only passed to the tree
                   builder when PRE_0_90 is false

    Returns the parsed document.
    '''
    from amara.lib.util import set_namespaces

    # Both html5lib code paths create the document entity the same way.
    entity_factory = model.clone if model else entity

    if PRE_0_90:
        # Under PRE_0_90 the tree factory takes no arguments.
        def make_builder():
            return treebuilder(entity_factory)
    else:
        # namespaceHTMLElements is accepted but never read; the namespace
        # choice is driven by use_xhtml_ns instead.
        def make_builder(namespaceHTMLElements, use_xhtml_ns=use_xhtml_ns):
            return treebuilder(entity_factory, use_xhtml_ns)

    html_parser = html5lib.HTMLParser(tree=make_builder)
    stream = inputsource(source, None).stream
    doc = html_parser.parse(stream, encoding=encoding)
    if prefixes:
        set_namespaces(doc, prefixes)
    return doc
def parse(source, prefixes=None, model=None, encoding=None, use_xhtml_ns=False):
    '''
    Parse HTML from source with html5lib and return the resulting document.

    source - the input; it is wrapped with inputsource() and its stream is
             what the parser actually reads
    prefixes - if truthy, passed together with the document to
               set_namespaces() before returning
    model - if truthy, model.clone is used to create the document entity;
            otherwise the module-level entity is used
    encoding - passed through as the encoding keyword of the parser's
               parse() call
    use_xhtml_ns - passed to the tree builder when PRE_0_90 is false;
                   unused when PRE_0_90 is true
    '''
    from amara.lib.util import set_namespaces

    def pick_entity_factory():
        # Resolved each time a tree builder is requested.
        if model:
            return model.clone
        return entity

    if PRE_0_90:
        def tree_factory():
            return treebuilder(pick_entity_factory())
    else:
        def tree_factory(namespaceHTMLElements, use_xhtml_ns=use_xhtml_ns):
            # namespaceHTMLElements is part of the expected call signature
            # but is not consulted here.
            return treebuilder(pick_entity_factory(), use_xhtml_ns)

    html_parser = html5lib.HTMLParser(tree=tree_factory)
    doc = html_parser.parse(inputsource(source, None).stream, encoding=encoding)
    if not prefixes:
        return doc
    set_namespaces(doc, prefixes)
    return doc
def parse(obj, uri=None, entity_factory=None, standalone=False, validate=False, prefixes=None, model=None):
    '''
    Parse obj with tree.parse and return the resulting document.

    obj - the input handed to tree.parse
    uri - forwarded positionally to tree.parse
    entity_factory - callable used to create the document entity; replaced
                     by model.clone when model is truthy, and by
                     nodes.entity_base when nothing truthy remains
    standalone - forwarded to tree.parse
    validate - forwarded to tree.parse
    prefixes - if truthy, applied to the document with set_namespaces()
    model - if truthy, its clone method takes precedence over
            entity_factory
    '''
    # A model overrides any explicitly supplied factory.
    chosen_factory = model.clone if model else entity_factory
    # Fall back to the base entity class when no usable factory was given.
    chosen_factory = chosen_factory or nodes.entity_base

    doc = tree.parse(
        obj,
        uri,
        entity_factory=chosen_factory,
        standalone=standalone,
        validate=validate,
    )
    if not prefixes:
        return doc
    set_namespaces(doc, prefixes)
    return doc