def _process_url(self, url):
    self._processed_urls.append(url)
    tree = html5parser.parse(url)
    node = HTMLNode(tree.getroot())
    self._process_node(url, node)
    if self._urls_to_process:
        self._process_url(self._urls_to_process.pop())
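# Hypothetical surrounding state for _process_url above: the method assumes a
# crawler class that tracks a pending-URL queue and a visited list, plus an
# HTMLNode wrapper and a _process_node hook; a minimal sketch might be:
class Crawler(object):
    def __init__(self, start_url):
        self._processed_urls = []             # every URL already parsed
        self._urls_to_process = [start_url]   # pending queue consumed by pop()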
def parse_reveal_html5(fname):
    lp = html5parser.parse(fname)
    root = lp.getroot()
    # Descend html -> body -> first child -> first child, presumably the
    # reveal.js .reveal > .slides wrapper; its children are the slides.
    root = root[1][0][0]
    return root.getchildren()
def doDemo():
    root = lxml.html.parse(request.args['foruri']).getroot()
    root2 = html5parser.parse(request.args['blog']).getroot()
    tree2 = root2.getroottree()
    if tree2.docinfo.doctype == '':
        # Strip the XHTML namespace that html5parser adds, so the plain
        # (un-namespaced) XPath expressions below can match.
        lxml.html.xhtml_to_html(root2)
    root.make_links_absolute(request.args['foruri'], resolve_base_href=True)
    root.xpath(request.args['xpath'])[0].addnext(
        root2.xpath(request.args['bxpath'])[0])
    return lxml.html.tostring(root)
def fetch_dom(url, guestpass=None):
    request = Request(url)
    request.add_header('Accept-Language', 'sv')
    if guestpass is not None:
        # The cookie value was redacted in the source; presumably it is the
        # guestpass argument itself.
        request.add_header('Cookie', 'dv_guestpass=' + guestpass)
    request.add_header('User-Agent',
                       'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                       '(KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.36')
    response = urlopen(request)
    parser = HTMLParserUTF8(namespaceHTMLElements=False)
    parsed = html5parser.parse(response, guess_charset=False, parser=parser)
    return parsed
def xml2etree(f, xml=True, html5=False):
    if xml:
        element_tree = etree.parse(f)
    elif html5 and html5parser:
        element_tree = html5parser.parse(f)
    elif html5parser:
        element_tree = html.parse(f)
    else:
        # html5lib's parser returns an Element, so we must convert it into an
        # ElementTree
        element_tree = ElementTree(html.parse(f))
    return element_tree
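# A minimal usage sketch for xml2etree above, assuming the surrounding module
# imports etree/html/html5parser from lxml as the branches imply (the file
# names here are hypothetical):
feed_tree = xml2etree(open('feed.xml', 'rb'))                       # strict XML
page_tree = xml2etree(open('page.html', 'rb'), xml=False, html5=True)
print(page_tree.getroot().tag)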
def read_xml(filename, mangle_entities=False):
    """
    Read in a document, returning the ElementTree doc node.
    """
    tree = treebuilders.getTreeBuilder('lxml')
    parser = html5lib.HTMLParser(strict=False, tree=tree)
    doc = html5parser.parse(filename, parser=parser)
    if parser.errors:
        sys.stderr.write('errors in {0}\n'.format(filename))
        for e in parser.errors:
            sys.stderr.write('  {0}\n'.format(e))
    return doc
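# Hedged sketch of calling read_xml above: with strict=False, parse errors
# are only logged to stderr and a usable lxml tree is still returned (the
# file name is hypothetical). html5lib namespaces elements by default, hence
# the XHTML-qualified root tag.
doc = read_xml('posts/example.html')
root = doc.getroot()
print(root.tag)  # e.g. '{http://www.w3.org/1999/xhtml}html'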
def save_feed():
    coll = g.db['feed']
    d = {}
    d['about'] = request.form['about']
    d['blog'] = request.form['blog']
    d['bxpath'] = request.form['bxpath']
    d['xpath'] = request.form['xpath']
    d['author'] = request.form['author']
    d['type'] = request.form['type']
    d['lang'] = request.form['lang']
    d['location'] = request.form['location']
    coll.insert(d)
    if d['type'] == '5el':
        collection = g.db['post']
        root = html5parser.parse(d['blog']).getroot()
        tree = root.getroottree()
        if tree.docinfo.doctype == '':
            lxml.html.xhtml_to_html(root)
        d['data'] = lxml.html.tostring(root.xpath(d['bxpath'])[0])
        # TODO: implement a function like lxml.html.make_links_absolute
        collection.insert(d)
    response = make_response()
    response.data = repr(request.form['blog'])
    response.headers['Access-Control-Allow-Origin'] = '*'
    return response
def call_it(self, *args, **kwargs):
    from lxml.html.html5parser import parse
    return parse(*args, **kwargs)
def main(argv, input, output):
    h = html5.parse(input)
    for link in links(h, url=argv[1]):
        print_link(link, file=output)
def parse_html_post(blogpost_path):
    """Given a path, parse an html file."""
    doc = html5parser.parse(blogpost_path).getroot()
    logging.info("HTML document parsed")
    return doc

def parse_raw_post(raw_post_path):
    """Given a path, parse an html file."""
    doc = html5parser.parse(raw_post_path).getroot()
    logging.info("parserrawpost: HTML document parsed")
    return doc

def parserawpost(rawpostpath):
    '''Given a path, parse an html file.'''
    doc = html5parser.parse(rawpostpath).getroot()
    # TODO: check if the file contains all the required information.
    logging.info("parserrawpost: HTML document parsed")
    return doc
import sys
from lxml.etree import _Element
from lxml.html import html5parser
from html5lib import HTMLParser

# namespaceHTMLElements=False keeps tag names plain ('p' rather than
# '{http://www.w3.org/1999/xhtml}p'), so un-namespaced lookups work.
root = html5parser.parse('data/article.html',
                         parser=HTMLParser(namespaceHTMLElements=False)).getroot()

def find_paragraphs(elem: _Element):
    # findall('.//p') collects every <p> in the subtree; find('p') would
    # return only the first direct child.
    ps = elem.findall('.//p')
    with open('input.txt', mode='w') as f:
        for p in ps:
            f.write((p.text or '') + '\n')

find_paragraphs(root)
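# For contrast, a sketch of the same lookup with the default (namespaced)
# parser: without namespaceHTMLElements=False, every element lives in the
# XHTML namespace and must be qualified explicitly.
ns_root = html5parser.parse('data/article.html').getroot()
ns_paragraphs = ns_root.findall('.//{http://www.w3.org/1999/xhtml}p')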
from sys import stdin, stdout
from csv import writer
from lxml.html import html5parser

doc = html5parser.parse(stdin)
ns = {'h': 'http://www.w3.org/1999/xhtml'}
w = writer(stdout)
for item in doc.xpath('//h:div[@class="items-container"]/h:li/h:a',
                      namespaces=ns):
    isaacId = item.getparent().get('data-sid')
    classes = item.xpath('string(h:div/@class)', namespaces=ns)
    iconId = next(
        c.split('r-itm', 1)[1]
        for c in classes.split() if c.startswith('r-itm'))
    name = item.xpath('string(h:span/h:p[@class="item-title"]/text())',
                      namespaces=ns)
    description = item.xpath('string(h:span/h:p[@class="pickup"]/text())',
                             namespaces=ns)
    w.writerow([isaacId, name, description, iconId, '500', '0'])
def pipe_xpathfetchpage(context=None, _INPUT=None, conf=None, **kwargs):
    """A source that fetches the content of a given website as DOM nodes or a
    string. Loopable.

    context : pipe2py.Context object
    _INPUT : pipeforever pipe or an iterable of items or fields
    conf : dict
        URL -- url object containing the URL to download
        xpath -- xpath to extract
        html5 -- use html5 parser?
        useAsString -- emit items as string?

    TODOS:
        - don't retrieve pages larger than 1.5MB
        - don't retrieve if page is not indexable.

    Yields
    ------
    _OUTPUT : items
    """
    conf = DotDict(conf)
    urls = utils.listize(conf['URL'])

    for item in _INPUT:
        for item_url in urls:
            url = utils.get_value(DotDict(item_url), DotDict(item), **kwargs)
            url = utils.get_abspath(url)
            f = urlopen(url)

            # TODO: it seems that Yahoo! converts relative links to absolute.
            # This needs to be done on the content, but that seems to be a
            # non-trivial task in Python.
            content = unicode(f.read(), 'utf-8')

            if context and context.verbose:
                print '............Content .................'
                print content
                print '...............EOF...................'

            xpath = conf.get('xpath', **kwargs)
            html5 = conf.get('html5', **kwargs) == 'true'
            use_as_string = conf.get('useAsString', **kwargs) == 'true'

            # f was already consumed by f.read() above, so parse the decoded
            # content rather than the exhausted file object (assumes
            # `from StringIO import StringIO` at module level).
            source = StringIO(content.encode('utf-8'))
            tree = html5parser.parse(source) if html5 else html.parse(source)
            root = tree.getroot()
            items = root.xpath(xpath)

            if context and context.verbose:
                print 'XPathFetchPage: found count items:', len(items)

            for etree in items:
                i = utils.etree_to_dict(etree)

                if context and context.verbose:
                    print '--------------item data --------------------'
                    print i
                    print '--------------EOF item data ----------------'

                if use_as_string:
                    yield {'content': unicode(i)}
                else:
                    yield i

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break
def pipe_xpathfetchpage(context=None, _INPUT=None, conf=None, **kwargs):
    """XPath Fetch Page module

    _INPUT -- not used since this does not have inputs.
    conf:
        URL -- url object containing the URL to download
        xpath -- xpath to extract
        html5 -- use html5 parser?
        useAsString -- emit items as string?

    Description: http://pipes.yahoo.com/pipes/docs?doc=sources#XPathFetchPage

    TODOS:
        - don't retrieve pages larger than 1.5MB
        - don't retrieve if page is not indexable.
    """
    conf = DotDict(conf)
    urls = util.listize(conf['URL'])

    for item in _INPUT:
        for item_url in urls:
            url = util.get_value(DotDict(item_url), DotDict(item), **kwargs)
            url = util.get_abspath(url)
            f = urlopen(url)

            # TODO: it seems that Yahoo! converts relative links to absolute.
            # This needs to be done on the content, but that seems to be a
            # non-trivial task in Python.
            content = unicode(f.read(), 'utf-8')

            if context and context.verbose:
                print '............Content .................'
                print content
                print '...............EOF...................'

            xpath = conf.get('xpath', **kwargs)
            html5 = conf.get('html5', **kwargs) == 'true'
            use_as_string = conf.get('useAsString', **kwargs) == 'true'

            # f was already consumed by f.read() above, so parse the decoded
            # content rather than the exhausted file object (assumes
            # `from StringIO import StringIO` at module level).
            source = StringIO(content.encode('utf-8'))
            tree = html5parser.parse(source) if html5 else html.parse(source)
            root = tree.getroot()
            items = root.xpath(xpath)

            if context and context.verbose:
                print 'XPathFetchPage: found count items:', len(items)

            for etree in items:
                i = util.etree_to_dict(etree)

                if context and context.verbose:
                    print '--------------item data --------------------'
                    print i
                    print '--------------EOF item data ----------------'

                if use_as_string:
                    yield {'content': unicode(i)}
                else:
                    yield i

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break
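# Hypothetical conf sketch for pipe_xpathfetchpage above. The keys follow the
# docstring, but the exact value shapes, URL, and xpath are made up for
# illustration:
conf = {
    'URL': {'type': 'url', 'value': 'http://example.com/index.html'},
    'xpath': {'value': '//a[@href]'},
    'html5': {'value': 'true'},        # parse with html5parser, not lxml.html
    'useAsString': {'value': 'false'},
}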
for doc in documents.getchildren():
    # Build a non-wanky data structure from the MySQL dump output:
    attrs = dict(
        (i.attrib['name'], unicode(i) if i else i) for i in doc.getchildren()
    )
    attrs['ID'] = int(attrs['ID'])

    if attrs['ID'] <= 2:
        continue  # Skip the containers

    assert attrs['Parent'] == "2"

    post, created = Post.objects.get_or_create(pk=attrs['ID'])

    full_html = html5parser.parse(
        "http://improbable.org/chris/?ID=%d" % attrs['ID'])

    post.markup = "none"  # Yes, really
    post.title = unicode(
        full_html.xpath("/html/head/title")[0].text_content()
    ).strip().replace(u"Chris Adams: ", u"")
    post.created = parse_date(attrs['Created'])
    post.modified = parse_date(attrs['Modified'])
    post.publish = max(post.created, post.modified)
    post.body = u""

    blog_entry_el = full_html.find('//div[@class="BlogEntry"]')

    # TODO: Deal with local media files and page links
    blog_entry_el.make_links_absolute(base_url="http://improbable.org/chris/")
    blog_entry_el.rewrite_links(link_launderer)