Example #1
def _process_url(self, url):
    self._processed_urls.append(url)
    tree = html5parser.parse(url)
    node = HTMLNode(tree.getroot())
    self._process_node(url, node)
    if self._urls_to_process:
        self._process_url(self._urls_to_process.pop())
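The trailing recursive call can hit Python's recursion limit on a long URL queue. A loop-based sketch (the process_all name is hypothetical, and assumes the recursive call is dropped from _process_url):

def process_all(self):
    # drain the queue iteratively instead of recursing once per URL
    while self._urls_to_process:
        self._process_url(self._urls_to_process.pop())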
Example #2
def parse_reveal_html5(fname):
    lp = html5parser.parse(fname)
    root = lp.getroot()
    # html -> body -> div.reveal -> div.slides, at fixed positions in a
    # reveal.js export
    root = root[1][0][0]
    return list(root)  # getchildren() is deprecated in lxml; list() is equivalent
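A usage sketch, assuming a local reveal.js export (slides.html is a hypothetical filename); with the default parser the returned section elements carry the XHTML namespace:

from lxml.html import html5parser  # needed by parse_reveal_html5 above

sections = parse_reveal_html5('slides.html')
for section in sections:
    print(section.get('id'))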
Example #3
def doDemo():
    root = lxml.html.parse(request.args['foruri']).getroot()
    root2 = html5parser.parse(request.args['blog']).getroot()
    tree2 = root2.getroottree()
    if tree2.docinfo.doctype == '':
        lxml.html.xhtml_to_html(root2)
    root.make_links_absolute(request.args['foruri'], resolve_base_href=True)
    target = root.xpath(request.args['xpath'])[0]
    target.addnext(root2.xpath(request.args['bxpath'])[0])
    return lxml.html.tostring(root)
Example #4
def fetch_dom(url, guestpass=None):
    request = Request(url)
    request.add_header('Accept-Language', 'sv')
    if guestpass is not None:
        # the cookie value was masked ('******') in the source; passing the
        # guestpass argument through is the reconstruction
        request.add_header('Cookie', 'dv_guestpass=' + guestpass)
    request.add_header(
        'User-Agent',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.36'
    )
    response = urlopen(request)
    parser = HTMLParserUTF8(namespaceHTMLElements=False)
    parsed = html5parser.parse(response, guess_charset=False, parser=parser)
    return parsed
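A call sketch (the URL and guestpass value are hypothetical; HTMLParserUTF8 appears to be a project-local html5lib parser subclass, and Request/urlopen come from urllib):

tree = fetch_dom('https://example.com/article', guestpass='abc123')
root = tree.getroot()
# namespaceHTMLElements=False means plain tag names work in lookups
print(root.findtext('.//title'))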
Example #5
def xml2etree(f, xml=True, html5=False):
    if xml:
        element_tree = etree.parse(f)
    elif html5 and html5parser:
        # lxml is available and an html5 parse was requested
        element_tree = html5parser.parse(f)
    elif html5parser:
        # lxml is available; use its regular HTML parser
        element_tree = html.parse(f)
    else:
        # without lxml, `html` is html5lib, whose parser returns an Element,
        # so we must convert it into an ElementTree
        element_tree = ElementTree(html.parse(f))

    return element_tree
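The elif chain only reads sensibly with an import guard along these lines; this is an assumption about the surrounding module, which the snippet does not show:

try:
    from lxml import etree, html
    from lxml.html import html5parser
except ImportError:
    # without lxml, `html` is html5lib and html5parser stays None
    from xml.etree.ElementTree import ElementTree
    import html5lib as html
    html5parser = None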
Example #8
def read_xml(filename, mangle_entities=False):
    """
    Read in a document, returning the ElementTree doc node.
    """
    tree = treebuilders.getTreeBuilder('lxml')
    parser = html5lib.HTMLParser(strict=False, tree=tree)
    doc = html5parser.parse(filename, parser=parser)

    if parser.errors:
        sys.stderr.write('errors in {0}\n'.format(filename))
        for e in parser.errors:
            sys.stderr.write('    {0}\n'.format(e))

    return doc
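A call sketch (the filename is hypothetical); parse errors are reported to stderr but do not abort:

doc = read_xml('page.html')
root = doc.getroot()
print(root.tag)  # '{http://www.w3.org/1999/xhtml}html' with the lxml tree builder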
Example #10
def save_feed():
    coll = g.db['feed']
    d = {}
    d['about'] = request.form['about']
    d['blog'] = request.form['blog']
    d['bxpath'] = request.form['bxpath']
    d['xpath'] = request.form['xpath']
    d['author'] = request.form['author']
    d['type'] = request.form['type']
    d['lang'] = request.form['lang']
    d['location'] = request.form['location']
    coll.insert(d)
    if d['type'] == '5el':
        collection = g.db['post']
        root = html5parser.parse(d['blog']).getroot()
        tree = root.getroottree()
        if tree.docinfo.doctype == '':
            lxml.html.xhtml_to_html(root)
        # TODO: implement a function like lxml.html.make_links_absolute
        d['data'] = lxml.html.tostring(root.xpath(d['bxpath'])[0])
        collection.insert(d)
    response = make_response()
    response.data = repr(request.form['blog'])
    response.headers['Access-Control-Allow-Origin'] = '*'
    return response
Example #11
def call_it(self, *args, **kwargs):
    from lxml.html.html5parser import parse
    return parse(*args, **kwargs)
Example #12
def main(argv, input, output):
    h = html5.parse(input)
    for link in links(h, url=argv[1]):
        print_link(link, file=output)
Example #13
def parse_html_post(blogpost_path):
    """Given a path, parse an html file."""
    doc = html5parser.parse(blogpost_path).getroot()
    logging.info("HTML document parsed")
    return doc
Example #14
def parse_raw_post(raw_post_path):
    """Given a path, parse an html file."""
    doc = html5parser.parse(raw_post_path).getroot()
    logging.info("parserrawpost: HTML document parsed")
    return doc
Example #15
def parserawpost(rawpostpath):
    '''Given a path, parse an html file.'''
    doc = html5parser.parse(rawpostpath).getroot()
    # TODO: check if the file contains all the required information.
    logging.info("parserrawpost: HTML document parsed")
    return doc
Example #17
from lxml.etree import _Element
from lxml.html import html5parser
from html5lib import HTMLParser

# namespaceHTMLElements=False keeps tag names plain, so 'p' below matches
# without an XHTML namespace prefix
root = html5parser.parse('data/article.html',
                         parser=HTMLParser(namespaceHTMLElements=False))


def find_paragraphs(elem: _Element):
    # iter() walks the whole tree; find('p') would only return the first
    # direct child, which is not iterable
    with open('input.txt', mode='w') as f:
        for p in elem.iter('p'):
            f.write((p.text or '') + '\n')


find_paragraphs(root)
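For contrast, with the default parser html5lib places every element in the XHTML namespace, so lookups need namespaced tag names; a sketch:

from lxml.html import html5parser

XHTML = '{http://www.w3.org/1999/xhtml}'
tree = html5parser.parse('data/article.html')
for p in tree.iter(XHTML + 'p'):
    print(p.text)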
Example #20
from sys import stdin, stdout
from csv import writer
from lxml.html import html5parser

doc = html5parser.parse(stdin)
ns = {'h': 'http://www.w3.org/1999/xhtml'}
w = writer(stdout)
for item in doc.xpath('//h:div[@class="items-container"]/h:li/h:a',
                      namespaces=ns):
    isaacId = item.getparent().get('data-sid')
    classes = item.xpath('string(h:div/@class)', namespaces=ns)
    iconId = next(
        c.split('r-itm', 1)[1] for c in classes.split()
        if c.startswith('r-itm'))
    name = item.xpath('string(h:span/h:p[@class="item-title"]/text())',
                      namespaces=ns)
    description = item.xpath('string(h:span/h:p[@class="pickup"]/text())',
                             namespaces=ns)
    w.writerow([isaacId, name, description, iconId, '500', '0'])
Example #21
def pipe_xpathfetchpage(context=None, _INPUT=None, conf=None, **kwargs):
    """A source that fetches the content of a given website as DOM nodes or a
    string. Loopable.

    context : pipe2py.Context object
    _INPUT : pipeforever pipe or an iterable of items or fields
    conf : dict
       URL -- url object contain the URL to download
       xpath -- xpath to extract
       html5 -- use html5 parser?
       useAsString -- emit items as string?

       TODOS:
        - don't retrieve pages larger than 1.5MB
        - don't retrieve if page is not indexable.

    Yields
    ------
    _OUTPUT : items
    """
    conf = DotDict(conf)
    urls = utils.listize(conf['URL'])

    for item in _INPUT:
        for item_url in urls:
            url = utils.get_value(DotDict(item_url), DotDict(item), **kwargs)
            url = utils.get_abspath(url)
            f = urlopen(url)

            # TODO: it seems that Yahoo! converts relative links to
            # absolute. This needs to be done on the content, but it seems
            # to be a non-trivial task in Python.
            content = unicode(f.read(), 'utf-8')

            if context and context.verbose:
                print '............Content .................'
                print content
                print '...............EOF...................'

            xpath = conf.get('xpath', **kwargs)
            html5 = conf.get('html5', **kwargs) == 'true'
            use_as_string = conf.get('useAsString', **kwargs) == 'true'
            # parse the already-downloaded content; `f` was exhausted by
            # the read() above
            root = (html5parser.fromstring(content) if html5
                    else html.fromstring(content))
            items = root.xpath(xpath)

            if context and context.verbose:
                print 'XPathFetchPage: found count items:', len(items)

            for etree in items:
                i = utils.etree_to_dict(etree)

                if context and context.verbose:
                    print '--------------item data --------------------'
                    print i
                    print '--------------EOF item data ----------------'

                if use_as_string:
                    yield {'content': unicode(i)}
                else:
                    yield i

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break
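A conf dict of the shape the docstring describes (all values hypothetical; note the flags are the strings 'true'/'false', matching the == 'true' checks above):

conf = {
    'URL': 'http://example.com/listing.html',
    'xpath': '//div[@class="entry"]',
    'html5': 'true',
    'useAsString': 'false',
}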
Example #22
def pipe_xpathfetchpage(context=None, _INPUT=None, conf=None, **kwargs):
    """XPath Fetch Page module

    _INPUT -- not used since this does not have inputs.

    conf:
       URL -- url object contain the URL to download
       xpath -- xpath to extract
       html5 -- use html5 parser?
       useAsString -- emit items as string?

       Description: http://pipes.yahoo.com/pipes/docs?doc=sources#XPathFetchPage

       TODOS:
        - don't retrieve pages larger than 1.5MB
        - don't retrieve if page is not indexable.
    """
    conf = DotDict(conf)
    urls = util.listize(conf['URL'])

    for item in _INPUT:
        for item_url in urls:
            url = util.get_value(DotDict(item_url), DotDict(item), **kwargs)
            url = util.get_abspath(url)
            f = urlopen(url)

            # TODO: it seems that Yahoo! converts relative links to
            # absolute. This needs to be done on the content, but it seems
            # to be a non-trivial task in Python.
            content = unicode(f.read(), 'utf-8')

            if context and context.verbose:
                print '............Content .................'
                print content
                print '...............EOF...................'

            xpath = conf.get('xpath', **kwargs)
            html5 = conf.get('html5', **kwargs) == 'true'
            use_as_string = conf.get('useAsString', **kwargs) == 'true'
            # parse the already-downloaded content; `f` was exhausted by
            # the read() above
            root = (html5parser.fromstring(content) if html5
                    else html.fromstring(content))
            items = root.xpath(xpath)

            if context and context.verbose:
                print 'XPathFetchPage: found count items:', len(items)

            for etree in items:
                i = util.etree_to_dict(etree)

                if context and context.verbose:
                    print '--------------item data --------------------'
                    print i
                    print '--------------EOF item data ----------------'

                if use_as_string:
                    yield {'content': unicode(i)}
                else:
                    yield i

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break
Example #23
for doc in documents.getchildren():
    # Build a non-wanky data structure from the MySQL dump output:
    attrs = dict(
        (i.attrib['name'], unicode(i) if i else i) for i in doc.getchildren()
    )

    attrs['ID'] = int(attrs['ID'])
    if attrs['ID'] <= 2:
        continue # Skip the containers

    assert attrs['Parent'] == "2"

    post, created      = Post.objects.get_or_create(pk=attrs['ID'])

    full_html          = html5parser.parse("http://improbable.org/chris/?ID=%d" % attrs['ID'])

    post.markup        = "none" # Yes, really
    post.title         = unicode(full_html.xpath("/html/head/title")[0].text_content()).strip().replace(u"Chris Adams: ", u"")
    post.created       = parse_date(attrs['Created'])
    post.modified      = parse_date(attrs['Modified'])
    post.publish       = max(post.created, post.modified)

    post.body          = u""

    blog_entry_el      = full_html.find('//div[@class="BlogEntry"]')

    # TODO: Deal with local media files and page links

    blog_entry_el.make_links_absolute(base_url="http://improbable.org/chris/")
    blog_entry_el.rewrite_links(link_launderer)