Example #1
def pisaParser(src, context, default_css="", xhtml=False, encoding=None, xml_output=None):
    """
    - Parse HTML and get miniDOM
    - Extract CSS information, add default CSS, parse CSS
    - Handle the document DOM itself and build reportlab story
    - Return Context object
    """

    global CSSAttrCache
    CSSAttrCache = {}

    if xhtml:
        #TODO: XHTMLParser doesn't seem to exist...
        parser = html5lib.XHTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    else:
        parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))

    if type(src) in StringTypes:
        if type(src) is unicode:
            # If an encoding was provided, do not change it.
            if not encoding:
                encoding = "utf-8"
            src = src.encode(encoding)
        src = pisaTempFile(src, capacity=context.capacity)

    # Test for the restrictions of html5lib
    if encoding:
        # Workaround for html5lib<0.11.1
        if hasattr(inputstream, "isValidEncoding"):
            if encoding.strip().lower() == "utf8":
                encoding = "utf-8"
            if not inputstream.isValidEncoding(encoding):
                log.error("%r is not a valid encoding e.g. 'utf8' is not valid but 'utf-8' is!", encoding)
        else:
            if inputstream.codecName(encoding) is None:
                log.error("%r is not a valid encoding", encoding)
    document = parser.parse(
        src,
        encoding=encoding)

    if xml_output:
        if encoding:
            xml_output.write(document.toprettyxml(encoding=encoding))
        else:
            xml_output.write(document.toprettyxml(encoding="utf8"))


    if default_css:
        context.addDefaultCSS(default_css)

    pisaPreLoop(document, context)
    #try:
    context.parseCSS()
    #except:
    #    context.cssText = DEFAULT_CSS
    #    context.parseCSS()
    # context.debug(9, pprint.pformat(context.css))

    pisaLoop(document, context)
    return context
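For orientation, a minimal call sketch for the pisaParser() above. This is hypothetical: pisaContext and DEFAULT_CSS are assumed names inferred from the calls the function makes (context.capacity, context.addDefaultCSS, context.parseCSS) and are not shown in this listing.

# Hypothetical usage sketch; the real context class and default CSS live
# in the surrounding xhtml2pdf-style module.
context = pisaContext(path="")
context = pisaParser(u"<p>Hello <b>world</b></p>", context,
                     default_css=DEFAULT_CSS, encoding="utf-8")
# pisaLoop() has now populated the context with the reportlab story.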
Example #2
def pisaParser(src, context, default_css="", xhtml=False, encoding=None, xml_output=None):
    """
    - Parse HTML and get miniDOM
    - Extract CSS information, add default CSS, parse CSS
    - Handle the document DOM itself and build reportlab story
    - Return Context object
    """

    global CSSAttrCache
    CSSAttrCache = {}

    if xhtml:
        #TODO: XHTMLParser doesn't seem to exist...
        parser = html5lib.XHTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    else:
        parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))

    if type(src) in StringTypes:
        if type(src) is unicode:
            # If an encoding was provided, do not change it.
            if not encoding:
                encoding = "utf-8"
            src = src.encode(encoding)
        src = pisaTempFile(src, capacity=context.capacity)

    # Test for the restrictions of html5lib
    if encoding:
        # Workaround for html5lib<0.11.1
        if hasattr(inputstream, "isValidEncoding"):
            if encoding.strip().lower() == "utf8":
                encoding = "utf-8"
            if not inputstream.isValidEncoding(encoding):
                log.error("%r is not a valid encoding e.g. 'utf8' is not valid but 'utf-8' is!", encoding)
        else:
            if inputstream.codecName(encoding) is None:
                log.error("%r is not a valid encoding", encoding)
    document = parser.parse(
        src,
        encoding=encoding)

    if xml_output:
        if encoding:
            xml_output.write(document.toprettyxml(encoding=encoding))
        else:
            xml_output.write(document.toprettyxml(encoding="utf8"))


    if default_css:
        context.addDefaultCSS(default_css)

    pisaPreLoop(document, context)
    #try:
    context.parseCSS()
    #except:
    #    context.cssText = DEFAULT_CSS
    #    context.parseCSS()
    # context.debug(9, pprint.pformat(context.css))

    pisaLoop(document, context)
    return context
Example #3
def pisaParser(src, c, default_css="", xhtml=False, encoding=None, xml_output=None):
    """    
    - Parse HTML and get miniDOM
    - Extract CSS information, add default CSS, parse CSS
    - Handle the document DOM itself and build reportlab story
    - Return Context object     
    """
    
    if xhtml:
        parser = html5lib.XHTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    else:
        parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))

    if type(src) in types.StringTypes:
        if type(src) is types.UnicodeType:
            encoding = "utf8"
            src = src.encode(encoding)
        #src = pisaTempFile(src, capacity=c.capacity)    

    # Test for the restrictions of html5lib
    if encoding:
        # Workaround for html5lib<0.11.1        
        if hasattr(inputstream, "isValidEncoding"):
            if encoding.strip().lower() == "utf8":
                encoding = "utf-8"
            if not inputstream.isValidEncoding(encoding):
                log.error("%r is not a valid encoding e.g. 'utf8' is not valid but 'utf-8' is!", encoding)
        else:
            if inputstream.codecName(encoding) is None:
                log.error("%r is not a valid encoding", encoding)
    
    
    #encoding = 'utf-8'
    document = parser.parse(
        src,
        encoding=encoding)
        
    if xml_output:
        xml_output.write(document.toprettyxml(encoding="utf8"))    

    if default_css:
        c.addCSS(default_css)
    
    #from html5lib import treewalkers, serializer
    #walker = treewalkers.getTreeWalker("dom")
    #stream = walker(document)
    #s = serializer.htmlserializer.HTMLSerializer(omit_optional_tags=False)
    #output_generator = s.serialize(stream)
    #for item in output_generator:
    #    print item
    
    #pisaPreLoop(document, c)
    #try:
    c.parseCSS()        
    #except:
    #    c.cssText = DEFAULT_CSS
    #    c.parseCSS()        
    # c.debug(9, pprint.pformat(c.css))
    pisaLoop(document, c)
    return c
Example #4
def pisaParser(src, c, default_css="", xhtml=False, encoding=None):
    """    
    - Parse HTML and get miniDOM
    - Extract CSS information, add default CSS, parse CSS
    - Handle the document DOM itself and build reportlab story
    - Return Context object     
    """
    
    if xhtml:
        parser = html5lib.XHTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    else:
        parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))

    if type(src) in types.StringTypes:
        if type(src) is types.UnicodeType:
            encoding = "utf8"
            src = src.encode(encoding)
        src = StringIO.StringIO(src)    

    # Test for the restrictions of html5lib
    if encoding:
        # Workaround for html5lib<0.11.1        
        if hasattr(inputstream, "isValidEncoding"):
            if encoding.strip().lower() == "utf8":
                encoding = "utf-8"
            if not inputstream.isValidEncoding(encoding):
                log.error("%r is not a valid encoding e.g. 'utf8' is not valid but 'utf-8' is!", encoding)
        else:
            if inputstream.codecName(encoding) is None:
                log.error("%r is not a valid encoding", encoding)
            
    document = parser.parse(
        src, 
        encoding=encoding)
    # print document.toprettyxml()    

    if default_css:
        c.addCSS(default_css)
        
    pisaPreLoop(document, c)    
    #try:
    c.parseCSS()        
    #except:
    #    c.cssText = DEFAULT_CSS
    #    c.parseCSS()        
    # c.debug(9, pprint.pformat(c.css))        
    pisaLoop(document, c)
    return c
Example #5
def pisaParser(src, c, default_css="", xhtml=False, encoding=None):
    """    
    - Parse HTML and get miniDOM
    - Extract CSS information, add default CSS, parse CSS
    - Handle the document DOM itself and build reportlab story
    - Return Context object     
    """

    if xhtml:
        parser = html5lib.XHTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    else:
        parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))

    if type(src) in types.StringTypes:
        if type(src) is types.UnicodeType:
            encoding = "utf8"
            src = src.encode(encoding)
        src = StringIO.StringIO(src)

    # Test for the restrictions of html5lib
    if encoding:
        # Workaround for html5lib<0.11.1
        if hasattr(inputstream, "isValidEncoding"):
            if encoding.strip().lower() == "utf8":
                encoding = "utf-8"
            if not inputstream.isValidEncoding(encoding):
                log.error(
                    "%r is not a valid encoding e.g. 'utf8' is not valid but 'utf-8' is!",
                    encoding)
        else:
            if inputstream.codecName(encoding) is None:
                log.error("%r is not a valid encoding", encoding)

    document = parser.parse(src, encoding=encoding)
    # print document.toprettyxml()

    if default_css:
        c.addCSS(default_css)

    pisaPreLoop(document, c)
    #try:
    c.parseCSS()
    #except:
    #    c.cssText = DEFAULT_CSS
    #    c.parseCSS()
    # c.debug(9, pprint.pformat(c.css))
    pisaLoop(document, c)
    return c
Example #6
    def __init__(self, alive_event, work_event, email, queue, index_queue,
                 passed, robots, unavailable_urls,
                 agent_name=DEFAULT_AGENT_NAME, headers=DEFAULT_HEADERS,
                 url_filter=lambda x: True):
        Process.__init__(self)
        self.alive_event = alive_event
        self.work_event = work_event
        self.email = email
        self.queue = queue
        self.index_queue = index_queue
        self.passed = passed
        self.robots = robots
        self.unavailable_urls = unavailable_urls
        self.agent_name = agent_name
        self.url_filter = url_filter
        self.is_working = False

        self.handler = request.build_opener(RobotHandler(agent_name, robots))
        handler_headers = [(k, v) for k, v in copy.copy(headers).items()]
        handler_headers.append(("User-Agent", agent_name))
        handler_headers.append(("From", email))
        self.handler.addheaders = handler_headers

        self.html_parser = HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
        self.connection = None
Example #7
def parse_text(text):
	t1 = time.clock()
	parser = html5lib.HTMLParser(
			tree=treebuilders.getTreeBuilder('etree'),
			tokenizer=MySanitiser)
	t2 = time.clock()

	text = text.replace('\r', '')
	text = text.replace('\n', '<br>')
	t3 = time.clock()

	for search,replace in SMILEY_REPLACEMENTS:
		text = text.replace(search, replace)

	for regex,replace in BBCODE_REGEXES:
		text = regex.sub(replace, text)

	for search,replace in BBCODE_REPLACEMENTS:
		text = text.replace(search, replace)

	t4 = time.clock()
	doc = parser.parse(text)
	t5 = time.clock()

	walker = treewalkers.getTreeWalker('etree')
	stream = walker(doc)
	s = serializer.htmlserializer.HTMLSerializer()
	output_generator = s.serialize(stream)
	t6 = time.clock()

	done = Markup(''.join(list(output_generator)))
	t7 = time.clock()
	print('Init:%f, BR:%f, Regex:%f, Parse:%f, Serial:%f, Join:%f, All:%f' % (t2-t1, t3-t2, t4-t3, t5-t4, t6-t5, t7-t6, t7-t1))
	return done
Example #8
def truncate_html(*args):
    document = truncate(*args)
    parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    document = parser.parse(document)

    xml = document.getElementsByTagName('body')[0].childNodes[0].toxml()
    return xml
Example #9
def _get_default_parser():
    if settings.TEXT_HTML_SANITIZE:
        parser_classes = []
        for parser_class in settings.ALLOW_TOKEN_PARSERS:
            parser_classes.append(import_string(parser_class))
        TextSanitizer.allow_token_parsers = parser_classes
    return html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
Example #10
    def get_favicon_url(self, html):
        """
        Parses *html* looking for a favicon URL.  Returns a tuple of:
            (<url>, <mimetime>)

        If no favicon can be found, returns:
            (None, None)
        """
        p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
        dom_tree = p.parse(html)
        walker = treewalkers.getTreeWalker("dom")
        stream = walker(dom_tree)
        fetch_url = None
        mimetype = None
        icon = False
        found_token = None
        for token in stream:
            if 'name' in token:
                if token['name'] == 'link':
                    for attr in token['data']:
                        if attr[0] == 'rel':
                            if 'shortcut icon' in attr[1].lower():
                                found_token = token
                                icon = True
                        elif attr[0] == 'href':
                            fetch_url = attr[1]
                        elif attr[0] == 'type':
                            mimetype = attr[1]
                    if fetch_url and icon:
                        if not mimetype:
                            mimetype = "image/x-icon"
                        if mimetype in self.favicon_mimetypes:
                            return (fetch_url, mimetype)
        return (None, None)
Example #11
def clean_html(buf):
    """Cleans HTML of dangerous tags and content."""
    buf = buf.strip()
    if not buf:
        return buf

    html_parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"),
                                      tokenizer=HTMLSanitizer)
    dom_tree = html_parser.parseFragment(buf)

    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)

    s = serializer.htmlserializer.HTMLSerializer(omit_optional_tags=False,
                                                 quote_attr_values=True)
    output = s.render(stream, 'utf-8')

    while 'toberemoved' in output:
        oldoutput = output
        matches = re.findall(r'&lt;toberemoved.*?&gt;.*?&lt;/toberemoved&gt;',
                             output, re.DOTALL)
        for s in matches:
            output = output.replace(s, '')
        matches = re.findall(r'&lt;/toberemoved&gt;', output, re.DOTALL)
        for s in matches:
            output = output.replace(s, '')
        matches = re.findall(r'&lt;toberemoved.*?&gt;', output, re.DOTALL)
        for s in matches:
            output = output.replace(s, '')
        if output == oldoutput:
            break

    return output
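A short usage sketch for clean_html() above. Illustrative only: the exact output depends on the HTMLSanitizer whitelist and the serializer settings.

# Illustrative: the sanitizing tokenizer escapes or strips disallowed
# markup such as script tags and event-handler attributes.
safe = clean_html('<p onclick="evil()">Hi<script>alert(1)</script></p>')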
Example #12
def get_spaces_available(dept_abbr, course_num):
	# define
	post_data = {
		'classyear' : '2008', #don't know WHY!?!
		'subj': dept_abbr,
		'crsenum': course_num,
	}
	url = 'http://oracle-www.dartmouth.edu/dart/groucho/timetable.course_quicksearch'

	# get the html
	cj = cookielib.LWPCookieJar()
	opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
	urllib2.install_opener(opener)
	headers =  {'User-agent' : 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
	request = urllib2.Request(url, urllib.urlencode(post_data), headers)
	handle = urllib2.urlopen(request)
	html = handle.read()

	# parse the html
	parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
	soup = parser.parse(html)
	tbody = soup.find('th', text='Term').parent.parent.parent
	cells = tbody.findAll('tr')[2]('td')
	enrolled = int(cells[-2].contents[0])
	capacity = int(cells[-3].contents[0])

	print "%i spaces left (capacity of %i with %i enrolled)" % (capacity-enrolled, capacity, enrolled)
Example #13
    def get_toc(self, path):
        # Only have TOC on tutorial pages. Don't do work for others.
        if not (re.search('/tutorials', path) or re.search('/mobile', path)):
            return ''

        toc = memcache.get('%s|toc|%s' % (settings.MEMCACHE_KEY_PREFIX, path))
        if toc is None or not self.request.cache:
            template_text = render_to_string(path, {})

            parser = html5lib.HTMLParser(
                tree=treebuilders.getTreeBuilder("dom"))
            dom_tree = parser.parse(template_text)
            walker = treewalkers.getTreeWalker("dom")
            stream = walker(dom_tree)
            toc = []
            current = None
            for element in stream:
                if element['type'] == 'StartTag':
                    if element['name'] in ['h2', 'h3', 'h4']:
                        for attr in element['data']:
                            if attr[0] == 'id':
                                current = {
                                    'level': int(element['name'][-1:]) - 1,
                                    'id': attr[1]
                                }
                elif element['type'] == 'Characters' and current is not None:
                    current['text'] = element['data']
                elif element['type'] == 'EndTag' and current is not None:
                    toc.append(current)
                    current = None
            memcache.set('%s|toc|%s' % (settings.MEMCACHE_KEY_PREFIX, path),
                         toc, 3600)

        return toc
Example #14
def to_doc(text, parser=scraper.LXML_HTML, whole_doc=True):
    """Parse an HTML text. Return value: lxml.html.HtmlElement document.

    parser: which parser to use.
    whole_doc: parse to complete HTML document (with <html> around), or parse just a fragment of HTML."""
    doc = None

    if parser == scraper.LXML_HTML:
        if whole_doc:
            doc = html.document_fromstring(text)
        else:
            doc = html.fromstring(text)
    elif parser == scraper.HTML5PARSER:
        # html5parser was broken for me, bug report is here: https://bugs.launchpad.net/lxml/+bug/780642
        #if whole_doc:
        #    doc = html5parser.document_fromstring(text)
        #else:
        #    doc = html5parser.fromstring(text)
        # Here is my workaround:
        parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("lxml"),
                                     namespaceHTMLElements=False)
        etree_doc = parser.parse(text)  # returns an ElementTree
        doc = html.document_fromstring(elementtree_to_string(etree_doc))
        # ^ this double conversion makes it slow ^
    elif parser == scraper.BEAUTIFULSOUP:
        # soupparser has no document_fromstring method
        doc = soupparser.fromstring(text)
    else:
        print >> sys.stderr, "Warning: you want to use an unknown parser in lx.py."
        # doc is None

    return doc  # lxml.html.HtmlElement
Example #15
def hmtl2text(html):
    p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    dom_tree = p.parse(html.decode("utf-8"))
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)

    in_script = False
    outbuf = []
    current_line = []
    for token in stream:
        token_name = token.get('name', "").lower()

        if token_name in ['script', 'style', 'noscript']:
            in_script = token.get('type', None) == 'StartTag'
        if in_script:
            continue

        if token_name in block_level_elements or token_name == "br":
            if current_line:
                outbuf.append(u"".join(current_line))
                current_line = []

        if token.get(u'type', None) == u'Characters':
            current_line.append(token['data'])
        if token.get(u'type', None) == u'SpaceCharacters':
            if current_line and current_line[-1] != u" ":
                current_line.append(u" ")

    if current_line:
        outbuf.append(u"".join(current_line))
    return clean_whitespace("\n".join(outbuf))
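A usage sketch for the function above. Note that it expects raw bytes, since it calls html.decode("utf-8") itself.

# Takes UTF-8 encoded bytes; block-level elements and <br> become line
# breaks, and script/style/noscript content is skipped.
text = hmtl2text("<p>Hello<br>world</p><script>skipped()</script>")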
Example #16
    def parseXml(self, xmlStr):
        '''
        Takes in a string of XML extracted from the XFA and finds any script elements.
        Returns a tuple containing two elements:
            1) the root of the parsed XML tree structure
            2) a list of any script elements found in the tree
        '''

        # parse XML and look for 'script' elements
        #self.logger.debug("Parsing XML...")

        # all JavaScript executions here are to build the DOM
        is_dom = True

        try:
            # The XML must be wrapped in a root element because there may not be just 1 root
            # after I smashed together all the xml ;]
            xml = "<xfa>%s</xfa>" % (xmlStr)
            #logger.info("xml: %s" % repr(xml))
            xmlTree = etree.fromstring(xml)
        except Exception, e:
            #self.logger.warn("[lxml] exception from parsing XML: %s" % e)
            #logger.info(" [lxml] going to try with html5lib..")
            return (None, None)
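            # NOTE: the early return above makes the html5lib fallback below
            # unreachable as written; it is kept exactly as the author left it.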
            import html5lib
            from html5lib import treebuilders
            parser = html5lib.HTMLParser(
                tree=treebuilders.getTreeBuilder("lxml"))
            xmlTree = parser.parse(xml)
Example #17
def _get_default_parser():
    opts = {}

    sanitizer.HTMLSanitizer.acceptable_elements.extend(['cms-plugin'])

    if settings.TEXT_HTML_SANITIZE:
        sanitizer.HTMLSanitizer.acceptable_elements.extend(
            settings.TEXT_ADDITIONAL_TAGS)
        sanitizer.HTMLSanitizer.acceptable_attributes.extend(
            settings.TEXT_ADDITIONAL_ATTRIBUTES)
        sanitizer.HTMLSanitizer.allowed_elements = (
            sanitizer.HTMLSanitizer.acceptable_elements +
            sanitizer.HTMLSanitizer.mathml_elements +
            sanitizer.HTMLSanitizer.svg_elements)
        sanitizer.HTMLSanitizer.allowed_attributes = (
            sanitizer.HTMLSanitizer.acceptable_attributes +
            sanitizer.HTMLSanitizer.mathml_attributes +
            sanitizer.HTMLSanitizer.svg_attributes)
        sanitizer.HTMLSanitizer.allowed_protocols = (
            sanitizer.HTMLSanitizer.acceptable_protocols +
            list(settings.TEXT_ADDITIONAL_PROTOCOLS))
        parser_classes = []
        for parser_class in settings.ALLOW_TOKEN_PARSERS:
            parser_classes.append(import_string(parser_class))

        TextSanitizer.allow_token_parsers = parser_classes
        opts['tokenizer'] = TextSanitizer

    return html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"), **opts)
Example #18
    def _parse_wayback_page(self, page_year):
        """
		Paser all recored web page URLs in specific year.
		"""
        his_urls = []
        wholepage = self.open_url(page_year)
        if wholepage == None: return his_urls

        parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))

        try:
            html_doc = parser.parse(wholepage)
        except ValueError:
            wholepage_clean = ''.join(c for c in wholepage
                                      if _valid_XML_char_ordinal(ord(c)))
            html_doc = parser.parse(wholepage_clean)

        body = html_doc.find("./{*}body")
        position_div = body.find("./{*}div[@id='position']")
        wayback_cal = position_div.find("./{*}div[@id='wbCalendar']")
        calOver = wayback_cal.find("./{*}div[@id='calOver']")
        for month in calOver.findall("./{*}div[@class='month']"):
            for day in month.findall(".//{*}td"):
                day_div = day.find("./{*}div[@class='date tooltip']")
                if day_div != None:
                    for snapshot in day_div.findall(
                            "./{*}div[@class='pop']/{*}ul/{*}li"):
                        his_urls.append(snapshot[0].get('href'))

        year = self.extract_year(his_urls[0]) if len(his_urls) > 0 else None

        return (year, his_urls)
Example #19
    def get_favicon_url(self, html):
        """
        Parses *html* looking for a favicon URL.  Returns a tuple of:
            (<url>, <mimetime>)

        If no favicon can be found, returns:
            (None, None)
        """
        p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
        dom_tree = p.parse(html)
        walker = treewalkers.getTreeWalker("dom")
        stream = walker(dom_tree)
        fetch_url = None
        mimetype = None
        icon = False
        found_token = None
        for token in stream:
            if 'name' in token:
                if token['name'] == 'link':
                    for attr in token['data']:
                        if attr[0] == 'rel':
                            if 'shortcut icon' in attr[1].lower():
                                found_token = token
                                icon = True
                        elif attr[0] == 'href':
                            fetch_url = attr[1]
                        elif attr[0] == 'type':
                            mimetype = attr[1]
                    if fetch_url and icon:
                        if not mimetype:
                            mimetype = "image/x-icon"
                        if mimetype in self.favicon_mimetypes:
                            return (fetch_url, mimetype)
        return (None, None)
Example #20
def _normalize(html):
    """
    Normalize the given string of HTML, collapsing whitespace.
    """

    # This is taken from the "Serialization of Streams" section of
    # http://code.google.com/p/html5lib/wiki/UserDocumentation.
    p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    dom_tree = p.parse(html)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)
    s = serializer.htmlserializer.HTMLSerializer(omit_optional_tags=False)
    output_generator = s.serialize(stream)

    # TODO: We're not actually collapsing *all* whitespace; only
    # entire chunks of whitespace that the serializer gives us. Currently,
    # this seems "good enough" to pass our unit tests, which are
    # based on use cases of comparing pre-sanitized HTML to sanitized HTML,
    # but we may need to change this in the future.
    parts = []
    last_item_was_whitespace = False
    for item in output_generator:
        # Is it empty whitespace?
        if item.strip() != '':
            parts.append(item)
            last_item_was_whitespace = False
        elif not last_item_was_whitespace:
            # Collapse whitespace.
            parts.append(' ')
            last_item_was_whitespace = True
    return ''.join(parts)
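A usage sketch for _normalize() above. Illustrative: because parse() rather than parseFragment() is used, html5lib wraps the input in the implied <html><head><body> scaffolding before serialization.

# Runs of whitespace coming out of the serializer collapse to single spaces.
normalized = _normalize("<p>some\n\n   spaced     text</p>")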
Example #21
 def get_dom(self, buf):
     buf = buf.strip()
     if not buf:
         return None
     p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"),
                             tokenizer=self.token_sanitizer())
     return p.parseFragment(buf)
Example #22
    def _make_dom(self, input_string):
        """Given an input_string containing [X]HTML, return a tuple of
        (dom, options).  If input_string is valid XML, xml.dom.minidom is 
        used to perform the parsing.  If input_string is not valid XML,
        fall back to using html5lib for creating the DOM.

        input_string is wrapped in StringIO so we can easily reset in the
        event of errors."""

        options = pyRdfa.Options()

        try:
            # try to parse as XML
            dom = xml.dom.minidom.parse(StringIO(input_string))

        except:
            # fall back to html5lib
            parser = html5lib.HTMLParser(
                tree=treebuilders.getTreeBuilder("dom"))

            dom = parser.parse(input_string, encoding='utf-8')

            # The host language has changed
            options.host_language = pyRdfa.HTML5_RDFA

        return dom, options
Example #23
 def __init__(self, api='etree'):
     # if no default implementation is defined for this api, set it to None
     # to let getTreeBuilder() use the corresponding implementation.
     implementation = self.defaults.get(api, None)
     HTMLParser.__init__(self,
                         tree=treebuilders.getTreeBuilder(
                             api, implementation))
Example #24
def TrusteeImage(personName, withQuotes = True):
    # TODO
    # Make option to search for names without quotes as well.
    opener = urllib2.build_opener(urllib2.HTTPRedirectHandler)
    headers = {'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0. 6) Gecko/2009020911 Ubuntu/8.04 (hardy) Firefox/3.0.6'}
    if (withQuotes):
        url = "http://images.google.com/images?hl=en&q=" + urllib.quote("\"" + personName + "\"")
    else:
        url = "http://images.google.com/images?hl=en&q=" + urllib.quote(personName)

    request = urllib2.Request(url, None, headers)
    response = opener.open(request)
    results = response.read()
    parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
    soup = parser.parse(results)

    # check for meta tag, if we have it we need to find the redirect
    # From 3 March 2011, just need the regex
    #meta = soup.findAll("div", id="tphdr")
    #contents = meta[0].contents[0]
    metaRE = re.compile("0;url=(.+)\"")
    #m = metaRE.search(str(contents))
    m = metaRE.search(str(results))

    # If we have a match, do the redirect
    if (len(m.groups()) > 0):
        redirectURL = m.groups()[0]
        print redirectURL

        request = urllib2.Request(redirectURL, None, headers)
        response = opener.open(request)
        results = response.read()
        soup = parser.parse(results)

    # Find the div with the images
    div = soup.findAll("div", id="ImgCont")
    
    # If there's nothing found, return None
    if (len(div) == 0):
        return None

    # Get the links in the div
    links = div[0].findAll("a")

    # If there're no links, return None
    if (len(links) == 0):
        return None

    # If we're here, then we're probably okay
    # Get the first link
    link = links[0]

    # Create our regex
    value = re.compile("imgurl=(.+)&imgrefurl")
    m = value.search(link["href"])
    
    # Our link should be the first returned result
    imageLink = m.groups()[0]
    print imageLink
    return imageLink
Example #25
	def parseXml(self,xmlStr):
		'''
		Takes in a string of XML extracted from the XFA and finds any script elements. 
		Returns a tuple containing two elements:
			1) the root of the parsed XML tree structure
			2) a list of any script elements found in the tree
		'''
		
		# parse XML and look for 'script' elements
		#self.logger.debug("Parsing XML...")

		# all JavaScript executions here are to build the DOM
		is_dom = True

		try:
			# The XML must be wrapped in a root element because there may not be just 1 root 
			# after I smashed together all the xml ;]
			xml = "<xfa>%s</xfa>" % (xmlStr)
			#logger.info("xml: %s" % repr(xml))
			xmlTree = etree.fromstring(xml)
		except Exception, e:
			#self.logger.warn("[lxml] exception from parsing XML: %s" % e)
			#logger.info(" [lxml] going to try with html5lib..")
			return (None, None)
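			# NOTE: the early return above makes the html5lib fallback below
			# unreachable as written; it is kept exactly as the author left it.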
			import html5lib
			from html5lib import treebuilders
			parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
			xmlTree = parser.parse(xml)
Example #26
 def get_dom(self, buf):
     buf = buf.strip()
     if not buf:
         return None
     p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"),
                             tokenizer=self.token_sanitizer())
     return p.parseFragment(buf)
Example #27
    def GetMarginBlotterDataSet(self,UserID,PWD):
        u=urllib.urlopen('http://api.efxnow.com/DEMOWebServices2.8/Service.asmx/GetMarginBlotterDataSet?UserID='+UserID+'&PWD='+PWD)
        str=u.read()
        
        parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
        tree=parser.parse(str)
        root= tree.getroot()

        PostedMargin=root.find('.//{http://www.w3.org/1999/xhtml}postedmargin').text
        RealizedProfit=root.find('.//{http://www.w3.org/1999/xhtml}realizedprofit').text
        UnrealizedProfit=root.find('.//{http://www.w3.org/1999/xhtml}unrealizedprofit').text
        MarginFactor=root.find('.//{http://www.w3.org/1999/xhtml}marginfactor').text
        MarginBalance=root.find('.//{http://www.w3.org/1999/xhtml}marginbalance').text
        TotalAvailable=root.find('.//{http://www.w3.org/1999/xhtml}totalavailable').text
        OpenPosiiton=root.find('.//{http://www.w3.org/1999/xhtml}openposiiton').text
        MaxDeal=root.find('.//{http://www.w3.org/1999/xhtml}maxdeal').text
        USDPostedMargin=root.find('.//{http://www.w3.org/1999/xhtml}usdpostedmargin').text
        USDRealizedProfit=root.find('.//{http://www.w3.org/1999/xhtml}usdrealizedprofit').text

        return {'PostedMargin':PostedMargin,
                'RealizedProfit':RealizedProfit,
                'UnrealizedProfit':UnrealizedProfit,
                'MarginFactor':MarginFactor,
                'MarginBalance':MarginBalance,
                'TotalAvailable':TotalAvailable,
                'OpenPosiiton':OpenPosiiton,
                'MaxDeal':MaxDeal,
                'USDPostedMargin':USDPostedMargin,
                'USDRealizedProfit':USDRealizedProfit}
Example #28
 def __init__(self, url):
     page = urllib2.urlopen(url)
     parser = html5lib.HTMLParser(
         tokenizer=sanitizer.HTMLSanitizer,
         tree=treebuilders.getTreeBuilder("beautifulsoup"))
     self.document = parser.parse(page, encoding="iso8859-15")
     self.type_scrutin = 0
Example #29
        def load_deployment_servername_ref( self, *args, **kwargs ):

                if self._is_DEPLOYMENT_SERVERNAME_in_debug: print green( '\t%s(...)\t|%s| <-' ) % ( self.load_deployment_servername_ref.func_name, 'load_deployment_servername_ref' )

                result = None

                self._httpconn.set_debuglevel( 0 )

                get_params = None
                post_params = None

                self._httpconn.request(
                        'GET',
                        u'/carto/serveurs/ServeurListe.php',
                        u'%s'.encode( 'UTF-8' ) % ( post_params ),
                        self._headers
                )

                resp = self._httpconn.getresponse().read()

                result = dict(
                        map(
                                lambda e: ( e.text, re.match( 'ServeurVoir.php\?id=([0-9]*)', e.attrib[ 'href' ] ).group( 1 ) ),
                                [
                                        e for e in HTMLParser( tree = treebuilders.getTreeBuilder( 'lxml' ) ).parse( resp ).xpath(
                                                '*//html:table/html:tbody//html:a', namespaces=self._d_namespaces
                                        ) if e.text is not None
                                ]
                        )
                )

                if self._is_DEPLOYMENT_SERVERNAME_in_debug: print green( '\t%s(...)\t|%s| -> %s' ) % ( self.load_deployment_servername_ref.func_name, 'load_deployment_servername_ref', result )

                return result
Example #30
def pisaParser(src, c, default_css=""):
    """    
    - Parse HTML and get miniDOM
    - Extract CSS information, add default CSS, parse CSS
    - Handle the document DOM itself and build reportlab story
    - Return Context object     
    """
    parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))

    # XXX Bugfix for HTML5lib
    src = src.read().replace("<![CDATA[", "\n").replace("]]>", "\n")
    src = cStringIO.StringIO(src)

    document = parser.parse(src)
    # print document.toprettyxml()

    if default_css:
        c.addCSS(default_css)

    pisaPreLoop(document, c)
    # try:
    c.parseCSS()
    # except:
    #    c.cssText = DEFAULT_CSS
    #    c.parseCSS()
    c.debug(9, pprint.pformat(c.css))
    pisaLoop(document, c)
    return c
Example #31
def wasp():
    wx = {}
    try:
        p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
        doc = p.parse(urllib2.urlopen("http://swaspgateway.suth/",
                                      timeout=1).read())
        t = doc.getElementsByTagName("table")[0]
        tds = t.getElementsByTagName("td")
        wx["Temp"] = float(tds[7].firstChild.nodeValue)
        if tds[10].firstChild.nodeValue == "RAIN":
            wx["Sky"] = "Rain"
            wx["Sky Temp"] = wx["Temp"]
        else:
            sky, stemp = tds[10].firstChild.nodeValue.split('(')
            stemp = stemp[0:-1]
            wx["Sky"] = sky
            wx["Sky Temp"] = stemp
        wx["T - DP"] = float(tds[9].firstChild.nodeValue)
        wx["RH"] = float(tds[8].firstChild.nodeValue)
        tds[6].normalize()
        wx["Wind Dir"] = tds[6].firstChild.nodeValue[1:]
        wx["Wind Speed"] = float(tds[5].firstChild.nodeValue)
        rain = tds[4].firstChild.nodeValue
        if rain == "DRY":
            wx["Raining"] = False
        else:
            wx["Raining"] = True
        wx["UT"] = tds[3].firstChild.nodeValue.strip()
        tds[31].normalize()
        wx["Status"] = tds[31].firstChild.nodeValue.strip()
        return wx
    except:
        return False
Example #32
	def login(self, username, password):
		"""
		Login to o2online.ie

		Returns true if successful or false if fails.
		"""
		if self.resumable():
			self.logger.info("Resuming from login.")
			return True
		else:
			self.logger.info("Unable to resume, running connect from login.")
			self.connect()

		post = [
			('IDButton', 'Go'),
			('org', 'o2ext'),
			('CONNECTFORMGET', 'TRUE'),
			('IDToken1', username),
			('IDToken2', password)
		]

		handle = self.post('https://www.o2online.ie/amserver/UI/Login', post)
		from html5lib import HTMLParser, treebuilders
		parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
		soup = parser.parse(handle)

		if unicode(soup.html.head.title.string).strip() == u"LoginCheck":
			self.logger.info("login has correct HTML title.")
			return True
		return False
Example #33
def scraper(request):
    post_data = {
            'classyear' : '2008', # why??
            'subj': 'COSC',
            'crsenum': '50'
        }
    url = 'http://oracle-www.dartmouth.edu/dart/groucho/timetable.course_quicksearch'

    
    # scrape the html
    cj = cookielib.LWPCookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)
    headers = {'User-agent' : 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    request = urllib2.Request(url, urllib.urlencode(post_data), headers)
    handle = urllib2.urlopen(request)
    html = handle.read()

    # parse for the dept and course number
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
    soup = parser.parse(html)
    #tbody = soup.find('th', text='Term').parent.parent.parent
    #soup = tbody.findAll('tr')[2]('td')
    

    return render_to_response("scraper.html", {'soup': soup})
Example #34
    def download(self, directory, filename=None, force=None):
        url = self.baseurl + self.action
        info("Downloading from tmpfile: %s" % url)
        debug("robot code: %s" % self.robot_code)
        values = {'robot_code': self.robot_code}
        data = urllib.urlencode(values)
        req = urllib2.Request(url, data)
        response = urllib2.urlopen(req)

        page = response.read()
        page = page.replace('xml:lang="ru"', '') 
        eparser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("lxml", ElementTree))
        doc = eparser.parse(page)
        url = doc.xpath("//xhtml:div[@id='cntMain']//xhtml:center//xhtml:a", namespaces={'xhtml':'http://www.w3.org/1999/xhtml'})[0].text
        response = urllib2.urlopen(url)

        inf = response.info()
        if filename is None:
            match = re.search('filename="(.*)"', inf['Content-Disposition'])
            filename = match.group(1)

        debug("Destination directory: %s" % directory)
        debug("Destination file: %s" % filename)
        dstpath = os.path.join(directory, filename)
        if os.path.exists(dstpath) and not force:
            warn("File %s already exists, skipping")
            return
        self.save(response, dstpath)
Example #35
def sanitize_html_fragment(unsafe, allowed_elements=None,
        allowed_attributes_map=None, allowed_styles_map=None,
        rename_elements=None, encoding='UTF-8'):
    # TODO: make this more simple / understandable and factor out from
    # plugins.html_to_template_text
    if not allowed_elements:
        allowed_elements = sanitizer.HTMLSanitizer.allowed_elements

    tokenizer = custom_sanitizer(allowed_elements, allowed_attributes_map,
                               allowed_styles_map, rename_elements)
    p = html5lib.HTMLParser(
        tree=treebuilders.getTreeBuilder("lxml"),
        tokenizer=tokenizer,
        namespaceHTMLElements=False
    )
    top_level_elements = p.parseFragment(unsafe, encoding=encoding)
    # put top level elements in container
    container = etree.Element('div')
    if top_level_elements and not hasattr(top_level_elements[0], 'tag'):
        container.text = top_level_elements.pop(0)
    container.extend(top_level_elements)

    html_bits = [etree.tostring(elem, method='html', encoding=encoding)
                     for elem in container]

    return ''.join([escape(container.text or '').encode(encoding)] + html_bits)
Example #36
	def _parse_wayback_page(self, page_year):
		"""
		Parse all recorded web page URLs for a specific year.
		"""
		his_urls = []
		wholepage = self.open_url(page_year)
		if wholepage == None: return his_urls

		parser = html5lib.HTMLParser(tree = treebuilders.getTreeBuilder("lxml"))

		try:
			html_doc = parser.parse(wholepage)
		except ValueError:
			wholepage_clean = ''.join(c for c in wholepage if _valid_XML_char_ordinal(ord(c)))
			html_doc = parser.parse(wholepage_clean)

		body = html_doc.find("./{*}body")
		position_div = body.find("./{*}div[@id='position']")
		wayback_cal = position_div.find("./{*}div[@id='wbCalendar']")
		calOver = wayback_cal.find("./{*}div[@id='calOver']")
		for month in calOver.findall("./{*}div[@class='month']"):
			for day in month.findall(".//{*}td"):
				day_div = day.find("./{*}div[@class='date tooltip']")
				if day_div != None:
					for snapshot in day_div.findall("./{*}div[@class='pop']/{*}ul/{*}li"):
						his_urls.append(snapshot[0].get('href'))

		year =  self.extract_year(his_urls[0]) if len(his_urls) > 0 else None

		return (year, his_urls)
Example #37
def pisaParser(src, c, default_css=""):
    """    
    - Parse HTML and get miniDOM
    - Extract CSS information, add default CSS, parse CSS
    - Handle the document DOM itself and build reportlab story
    - Return Context object     
    """
    parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))

    # XXX Bugfix for HTML5lib
    src = src.read().replace("<![CDATA[", "\n").replace("]]>", "\n")
    src = cStringIO.StringIO(src)

    document = parser.parse(src)
    # print document.toprettyxml()

    if default_css:
        c.addCSS(default_css)

    pisaPreLoop(document, c)
    #try:
    c.parseCSS()
    #except:
    #    c.cssText = DEFAULT_CSS
    #    c.parseCSS()
    c.debug(9, pprint.pformat(c.css))
    pisaLoop(document, c)
    return c
Example #38
def html_parser(html):
    try:
        soup = BeautifulSoup(html)
    except:
        parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
        soup = parser.parse(html)
    return soup
Example #39
def sanitize_html_fragment(unsafe, allowed_elements=None,
        allowed_attributes_map=None, allowed_styles_map=None,
        rename_elements=None, encoding='UTF-8'):
    # TODO: make this more simple / understandable and factor out from
    # plugins.html_to_template_text
    if not allowed_elements:
        allowed_elements = sanitizer.HTMLSanitizer.allowed_elements

    tokenizer = custom_sanitizer(allowed_elements, allowed_attributes_map,
                               allowed_styles_map, rename_elements)
    p = html5lib.HTMLParser(
        tree=treebuilders.getTreeBuilder("lxml"),
        tokenizer=tokenizer,
        namespaceHTMLElements=False
    )
    top_level_elements = p.parseFragment(unsafe, encoding=encoding)
    # put top level elements in container
    container = etree.Element('div')
    if top_level_elements and not hasattr(top_level_elements[0], 'tag'):
        container.text = top_level_elements.pop(0)
    container.extend(top_level_elements)

    html_bits = [etree.tostring(elem, method='html', encoding=encoding)
                     for elem in container]

    return ''.join([escape(container.text or '').encode(encoding)] + html_bits)
Example #40
def to_doc(text, parser=scraper.LXML_HTML, whole_doc=True):
    """Parse an HTML text. Return value: lxml.html.HtmlElement document.
    
    parser: which parser to use. 
    whole_doc: parse to complete HTML document (with <html> around), or parse just a fragment of HTML."""
    doc = None
    
    if parser == scraper.LXML_HTML:
        if whole_doc:
            doc = html.document_fromstring(text)
        else:
            doc = html.fromstring(text)
    elif parser == scraper.HTML5PARSER:
        # html5parser was broken for me, bug report is here: https://bugs.launchpad.net/lxml/+bug/780642
        #if whole_doc:
        #    doc = html5parser.document_fromstring(text)
        #else:
        #    doc = html5parser.fromstring(text)
        # Here is my workaround:
        parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("lxml"), namespaceHTMLElements=False)
        etree_doc = parser.parse(text)  # returns an ElementTree
        doc = html.document_fromstring(elementtree_to_string(etree_doc))
        # ^ this double conversion makes it slow ^
    elif parser == scraper.BEAUTIFULSOUP:
        # soupparser has no document_fromstring method
        doc = soupparser.fromstring(text)
    else:
        print >>sys.stderr, "Warning: you want to use an unknown parser in lx.py."
        # doc is None
        
    return doc  # lxml.html.HtmlElement
Example #41
def _get_default_parser():
    opts = {}

    sanitizer.HTMLSanitizer.acceptable_elements.extend(['cms-plugin'])

    if settings.TEXT_HTML_SANITIZE:
        sanitizer.HTMLSanitizer.acceptable_elements.extend(
            settings.TEXT_ADDITIONAL_TAGS)
        sanitizer.HTMLSanitizer.acceptable_attributes.extend(
            settings.TEXT_ADDITIONAL_ATTRIBUTES)
        sanitizer.HTMLSanitizer.allowed_elements = (
            sanitizer.HTMLSanitizer.acceptable_elements +
            sanitizer.HTMLSanitizer.mathml_elements +
            sanitizer.HTMLSanitizer.svg_elements)
        sanitizer.HTMLSanitizer.allowed_attributes = (
            sanitizer.HTMLSanitizer.acceptable_attributes +
            sanitizer.HTMLSanitizer.mathml_attributes +
            sanitizer.HTMLSanitizer.svg_attributes)
        sanitizer.HTMLSanitizer.allowed_protocols = (
            sanitizer.HTMLSanitizer.acceptable_protocols +
            list(settings.TEXT_ADDITIONAL_PROTOCOLS))
        parser_classes = []
        for parser_class in settings.ALLOW_TOKEN_PARSERS:
            parser_classes.append(import_string(parser_class))

        TextSanitizer.allow_token_parsers = parser_classes
        opts['tokenizer'] = TextSanitizer

    return html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"),
                               **opts)
Example #42
def clean_html(input, sanitize=False):
    """
    Takes an HTML fragment and processes it using html5lib to ensure that the HTML is well-formed.

    :param sanitize: Remove unwanted HTML tags and attributes.

    >>> clean_html("<p>Foo<b>bar</b></p>")
    u'<p>Foo<b>bar</b></p>'
    >>> clean_html("<p>Foo<b>bar</b><i>Ooops!</p>")
    u'<p>Foo<b>bar</b><i>Ooops!</i></p>'
    >>> clean_html('<p>Foo<b>bar</b>& oops<a href="#foo&bar">This is a <>link</a></p>')
    u'<p>Foo<b>bar</b>&amp; oops<a href=#foo&amp;bar>This is a &lt;&gt;link</a></p>'
    """
    parser_kwargs = {}
    serializer_kwargs = {}
    if sanitize:
        if HTMLSanitizer is None:
            # new syntax as of 0.99999999/1.0b9 (Released on July 14, 2016)
            serializer_kwargs['sanitize'] = True
        else:
            parser_kwargs['tokenizer'] = HTMLSanitizer

    p = HTMLParser(tree=treebuilders.getTreeBuilder("dom"), **parser_kwargs)
    dom_tree = p.parseFragment(input)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)

    s = HTMLSerializer(omit_optional_tags=False, **serializer_kwargs)
    return "".join(s.serialize(stream))
Example #43
    def savePage(self, page, client):
        self.retries = 0
        print "Got page:", self.PAGE_NUMBER, "with status", client.status, client.message
        if int(client.status) < self.ERROR_CODE and self.PAGE_NUMBER < self.NPAGES:
            print "Trying to schedule getting the next page..."
            filename = self.FILENAME_PATTERN % (self.PAGE_NUMBER,)
            f = open(filename, "w")
            f.write(page)
            f.close()

            if self.PAGE_NUMBER == 0:
                # only do this for the first page:
                f = open(filename, "r")
                parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
                bs = parser.parse(f)
                pagingDiv = bs.find("div", {"class": "paging"})
                pageLinks = pagingDiv.findAll("a")
                # Magic offset ( - 1 ) to adjust for zero indexing
                self.NPAGES = len(pageLinks) - 1
                print "Setting number of pages in query to:", self.NPAGES

            self.PAGE_NUMBER += 1
            d = self.getNextPage()
            return d
        else:
            return page, client
Example #44
    def __init__(self, *a, **kw):
        super(HTMLParser, self).__init__(*a, **kw)

        self._parser = html5parser.HTMLParser(
            tree=treebuilders.getTreeBuilder("dom"),
        )
        self._treewalker = None
Example #45
    def __init__(self, html):
        """Create a parse tree from the given HTML."""
        def really_parse_fragment(parser, html):
            """Parse a possibly multi-rooted HTML fragment, wrapping it in a
            <div> to make it easy to query later.

            As far as I can tell, this is what parseFragment is supposed to do
            (but doesn't). See
            http://code.google.com/p/html5lib/issues/detail?id=161.

            """
            top_level_elements = parser.parseFragment(html)
            container = Element(self.CONTAINER_TAG)

            # Why lxml couldn't just have text nodes, I'll never understand.
            # Text nodes that come other than first are automatically stuffed
            # into the tail attrs of the preceding elements by html5lib.
            if top_level_elements and isinstance(top_level_elements[0],
                                                 basestring):
                container.text = top_level_elements.pop(0)

            container.extend(top_level_elements)
            return container

        p = HTMLParser(tree=getTreeBuilder(self.TREEBUILDER))
        self._root = really_parse_fragment(p, html)
Example #46
def prepare_project_stream(stream, base_url, metadata):
    """ Sanitizes a butter HTML export
     - Picks the plug-in required from the stream.
    """
    stream = force_unicode(stream) if stream else u""
    tree = treebuilders.getTreeBuilder("lxml")
    parser = html5lib.HTMLParser(tree=tree, namespaceHTMLElements=False)
    document_tree = parser.parse(stream)
    # plugins are relative
    scripts = document_tree.xpath("//script[@src]")
    plugins = [s.get("src") for s in scripts if not urlparse(s.get("src")).netloc]
    # styles are relative
    styles = document_tree.xpath("//link[@href]")
    css = [s.get("href") for s in styles if not urlparse(s.get("href")).netloc]
    # inline css
    inline_css = []
    for inline in document_tree.xpath("//style"):
        inline_css.append(strip_tags(inline.text))
        inline.getparent().remove(inline)
    # remove script tags
    for inline in document_tree.xpath("//script"):
        inline.getparent().remove(inline)
    popcorn = prepare_popcorn_string_from_project_data(json.loads(metadata)) if metadata else ""
    body = [clean(tostring(b)) + popcorn for b in document_tree.xpath("//body")]
    context = {"styles": css, "scripts": plugins, "inline_css": inline_css, "body": body}

    return render_to_string("project/skeleton.html", context)
Example #47
    def _make_dom(self, input_string):
        """Given an input_string containing [X]HTML, return a tuple of
        (dom, options).  If input_string is valid XML, xml.dom.minidom is 
        used to perform the parsing.  If input_string is not valid XML,
        fall back to using html5lib for creating the DOM.

        input_string is wrapped in StringIO so we can easily reset in the
        event of errors."""

        options = pyRdfa.Options()

        try:
            # try to parse as XML
            dom = xml.dom.minidom.parse(StringIO(input_string))

        except:
            # fall back to html5lib
            parser = html5lib.HTMLParser(
                tree=treebuilders.getTreeBuilder("dom"))

            dom = parser.parse(input_string, encoding='utf-8')

            # The host language has changed
            options.host_language = pyRdfa.HTML5_RDFA

        return dom, options
Example #48
def summary_scrape(urn):
    print " - summary"
    url = "http://www.edubase.gov.uk/establishment/summary.xhtml?urn=" + urn
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
    page = parser.parse(scrape(url))

    keyvaluepairs = table_extract(page)

    raw_address = [x.strip() for x in keyvaluepairs.pop("").split(" / ")]
    if postcode.match(raw_address[-1]):
        keyvaluepairs["Postcode"] = raw_address[-1]
        raw_address = raw_address[:-1]
    keyvaluepairs["Address"] = " / ".join(raw_address)

    for t in page.findall(
            path([
                "body", "div", "div", "div", "div", "table", "tbody", "tr",
                "td", "h1"
            ], pre)):
        x = t.text.split(": ")
        keyvaluepairs[x[0]] = x[1]

    for t in page.findall(
            path([
                "body", "div", "div", "div", "div", "table", "tbody", "tr",
                "td", "div", "p", "b"
            ], pre)):
        keyvaluepairs[t.text.strip().strip(":")] = (t.tail or "").strip()

    return keyvaluepairs
Example #49
    def extract_html_urls(self, html):
        """
        Take all ``<img src="..">`` from the HTML
        """
        p = HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
        dom = p.parse(html)
        urls = []

        for img in dom.getElementsByTagName("img"):
            src = img.getAttribute("src")
            if src:
                urls.append(unquote_utf8(src))

            srcset = img.getAttribute("srcset")
            if srcset:
                urls += self.extract_srcset(srcset)

        for source in dom.getElementsByTagName("source"):
            srcset = source.getAttribute("srcset")
            if srcset:
                urls += self.extract_srcset(srcset)

        for source in dom.getElementsByTagName("a"):
            href = source.getAttribute("href")
            if href:
                urls.append(unquote_utf8(href))

        return urls
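The method above delegates srcset handling to an extract_srcset() helper that this example does not show. A plausible sketch, assuming the standard srcset syntax of comma-separated "URL [descriptor]" candidates (hypothetical, not the author's code):

    def extract_srcset(self, srcset):
        # Hypothetical helper: each candidate looks like "img-640.png 640w"
        # or "img.png 2x"; keep only the URL part of each.
        urls = []
        for candidate in srcset.split(','):
            parts = candidate.strip().split()
            if parts:
                urls.append(unquote_utf8(parts[0]))
        return urls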
Example #50
    def __init__(self, html):
        """Create a parse tree from the given HTML."""
        def really_parse_fragment(parser, html):
            """Parse a possibly multi-rooted HTML fragment, wrapping it in a
            <div> to make it easy to query later.

            As far as I can tell, this is what parseFragment is supposed to do
            (but doesn't). See
            http://code.google.com/p/html5lib/issues/detail?id=161.

            """
            top_level_elements = parser.parseFragment(html)
            container = Element(self.CONTAINER_TAG)

            # Why lxml couldn't just have text nodes, I'll never understand.
            # Text nodes that come other than first are automatically stuffed
            # into the tail attrs of the preceding elements by html5lib.
            if top_level_elements and isinstance(top_level_elements[0],
                                                 basestring):
                container.text = top_level_elements.pop(0)

            container.extend(top_level_elements)
            return container

        p = HTMLParser(tree=getTreeBuilder(self.TREEBUILDER))
        self._root = really_parse_fragment(p, html)
Example #51
 def test_sanitizer_without_token_parsers(self):
     sanitizer.TextSanitizer.allow_token_parsers = ()
     parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"),
                                  tokenizer=sanitizer.TextSanitizer)
     body = '<span data-one="1" data-two="2">some text</span>'
     body = html.clean_html(body, full=False, parser=parser)
     self.assertEqual('<span>some text</span>', body)
Example #52
def wasp():
    wx = {}
    try:
        p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
        doc = p.parse(
            urllib2.urlopen("http://swaspgateway.suth/", timeout=1).read())
        t = doc.getElementsByTagName("table")[0]
        tds = t.getElementsByTagName("td")
        wx["Temp"] = float(tds[7].firstChild.nodeValue)
        if tds[10].firstChild.nodeValue == "RAIN":
            wx["Sky"] = "Rain"
            wx["Sky Temp"] = wx["Temp"]
        else:
            sky, stemp = tds[10].firstChild.nodeValue.split('(')
            stemp = stemp[0:-1]
            wx["Sky"] = sky
            wx["Sky Temp"] = stemp
        wx["T - DP"] = float(tds[9].firstChild.nodeValue)
        wx["RH"] = float(tds[8].firstChild.nodeValue)
        tds[6].normalize()
        wx["Wind Dir"] = tds[6].firstChild.nodeValue[1:]
        wx["Wind Speed"] = float(tds[5].firstChild.nodeValue)
        wx["Raining"] = tds[4].firstChild.nodeValue != "DRY"
        wx["UT"] = tds[3].firstChild.nodeValue.strip()
        tds[31].normalize()
        wx["Status"] = tds[31].firstChild.nodeValue.strip()
        return wx
    except Exception:
        # Any network or parse failure means no usable weather data.
        return False
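The repeated normalize-then-read pattern above could be factored into a helper; a small sketch (not part of the original):

def cell_text(td):
    # Merge adjacent DOM text nodes, then read the cell's text safely.
    td.normalize()
    return td.firstChild.nodeValue.strip() if td.firstChild else ""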
Example #53
0
	def load_env_ref( self, *args, **kwargs ):

		if self._is_ENV_in_debug: print red( '\t\t\t%s(...)\t|%s| <-' ) % ( self.load_env_ref.func_name, 'load_env_ref' )

		result = None

		self._httpconn.set_debuglevel( 0 )

		get_params = urlencode( 
			{ 
				u'Code'.encode( 'UTF-8' ): u''.encode( 'UTF-8' ) 
			} 
		)
		post_params = None

		self._httpconn.request(
			'GET', 
			u'/carto/lib/Div_AddEnvironnement.php?%s'.encode( 'UTF-8' ) % ( get_params ),
			post_params,  # None: no request body for a GET
			self._headers 
		)

		resp = self._httpconn.getresponse().read()

		result = dict( 
				map( lambda e: ( e.text, e.attrib['value'] ), 
				HTMLParser( tree = treebuilders.getTreeBuilder( 'lxml' ) ).parse( resp ).xpath(
					'*//html:select[@id="AddEnvironnementIdTypeEnv"]/html:option', namespaces=self._d_namespaces ) 
				) 
		)

		if self._is_ENV_in_debug: print red( '\t\t\t%s(...)\t|%s| -> %s' ) % ( self.load_env_ref.func_name, 'load_env_ref', result )
		return result
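self._d_namespaces is not defined in this snippet; since html5lib's lxml tree puts every element in the XHTML namespace, it is presumably something like:

# Assumed mapping for the html: prefix used in the XPath above.
_d_namespaces = {'html': 'http://www.w3.org/1999/xhtml'}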
Example #54
0
def do_year(y, url):
    pagetext = urllib2.urlopen(url)
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"),
                        tokenizer=sanitizer.HTMLSanitizer)
    page = parser.parse(pagetext)

    for section in page.findall(
            "body/div/div/div/div/div/div/div/div/table[@class='fixture']"):

        matchtype = section.find("caption").text

        for match in section.findall("tbody/tr"):

            cells = list(match.getchildren())
            d = {}
            d["Match type"] = matchtype
            d["Match number"] = cells[0].text
            d["Date"] = make_date(cells[1].text, y)
            d["Team 1"] = flatten_refs(cells[3])
            d["Team 2"] = flatten_refs(cells[5])
            a = cells[4].find("a")
            d["Score"] = a.text
            d["Report"] = "http://www.fifa.com" + a.get("href")
            print "%d (%s) %s - %s" % (y, d["Match type"], d["Team 1"],
                                       d["Team 2"])
            datastore.save(unique_keys=["Date", "Team 1", "Team 2"], data=d)
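flatten_refs and make_date are not shown. flatten_refs presumably collapses a cell whose team name may sit inside <a> elements into plain text; a sketch using lxml's itertext():

def flatten_refs(cell):
    # Concatenate all text inside the cell, including text inside <a> tags.
    return "".join(cell.itertext()).strip()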
Example #55
0
  def get_toc(self, path):
    # Only have TOC on tutorial pages. Don't do work for others.
    if not (re.search('/tutorials', path) or re.search('/mobile', path)):
      return ''

    toc = memcache.get('%s|toc|%s' % (settings.MEMCACHE_KEY_PREFIX, path))
    if toc is None or not self.request.cache:
      template_text = render_to_string(path, {})

      parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
      dom_tree = parser.parse(template_text)
      walker = treewalkers.getTreeWalker("dom")
      stream = walker(dom_tree)
      toc = []
      current = None
      for element in stream:
        if element['type'] == 'StartTag':
          if element['name'] in ['h2', 'h3', 'h4']:
            for attr in element['data']:
              if attr[0] == 'id':
                current = {
                  'level' : int(element['name'][-1:]) - 1,
                  'id' : attr[1]
                }
        elif element['type'] == 'Characters' and current is not None:
          current['text'] = element['data']
        elif element['type'] == 'EndTag' and current is not None:
          toc.append(current)
          current = None
      memcache.set('%s|toc|%s' % (settings.MEMCACHE_KEY_PREFIX, path), toc, 3600)

    return toc
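The attribute handling above assumes an older html5lib where a StartTag token's data is a list of (name, value) pairs; on html5lib >= 1.0 it is a dict keyed by (namespace, name) tuples, so the id lookup would instead be (a sketch):

element_id = element['data'].get((None, 'id'))  # html5lib >= 1.0 token format
if element_id is not None:
    current = {'level': int(element['name'][-1]) - 1, 'id': element_id}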
Example #56
0
        def validate_url(self, url, use_w3c=True, quiet=True):
            'Validate URLs with the W3C validator. Needs an Internet connection.'

            client = Client()
            response = client.get(url, follow=True)
            if response.status_code == 200:
                src = response.content
                treebuilder = treebuilders.getTreeBuilder("etree")
                parser = HTMLParser(tree=treebuilder, strict=True)
                try:
                    parser.parse(src)
                except Exception:
                    # Strict-mode parse errors are collected in parser.errors.
                    pass

                if not parser.errors and use_w3c:
                    #uploading to w3c
                    w3c = w3c_client(src)
                    if w3c and not w3c[0]:
                        print('%s: %s' % (
                            url,
                            w3c[1],
                        ))
                        if not quiet:
                            for i in w3c[2]['messages']:
                                print(i['messageid'])
                                print('\t%s' % (i['message'], ))
                        #self.assertTrue(w3c[0])
            else:
                print('skipping html check (status %s)' % response.status_code)
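With strict=True the parser raises on the first violation but records each reported error in parser.errors as (position, error_code, datavars) tuples, where position is a (line, col) pair; a sketch of dumping whatever was collected:

for (line, col), code, datavars in parser.errors:
    # Each entry pinpoints where in the source the violation occurred.
    print('%d:%d %s %r' % (line, col, code, datavars))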
Example #57
0
def html_parser(html):
    try:
        soup = BeautifulSoup(html)
    except Exception:
        parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
        soup = parser.parse(html)
    return soup
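Recent html5lib releases dropped the "beautifulsoup" treebuilder, so the fallback above no longer works there; with BeautifulSoup 4 the dependency runs the other way round (a sketch):

from bs4 import BeautifulSoup

def html_parser(html):
    # bs4 drives html5lib itself when the "html5lib" feature is requested.
    return BeautifulSoup(html, "html5lib")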
Example #58
0
def getitems(params):
    http = GET('http://www.planeta-online.tv/')
    if http is not None:
        DT = html5lib.HTMLParser(
            tree=treebuilders.getTreeBuilder('dom')).parse(http)
        for div0 in DT.getElementsByTagName('div'):
            if div0.getAttribute('id') == 'mainChannelList':
                for link in div0.getElementsByTagName('a'):
                    if link.getAttribute('class') == 'tip_trigger chA':
                        title = None
                        img = None
                        for child in link.childNodes:
                            if child.nodeType == child.TEXT_NODE:
                                title = child.data.encode('utf8')
                            else:
                                for imgs in child.getElementsByTagName('img'):
                                    img = 'http://www.planeta-online.tv%s' % imgs.getAttribute(
                                        'src').encode('utf8')
                        if title and img:
                            uri = '%s?%s' % (sys.argv[0],
                                             urllib.urlencode({
                                                 'func': 'play',
                                                 'href': link.getAttribute('href')
                                             }))
                            i = xbmcgui.ListItem(title,
                                                 iconImage=img,
                                                 thumbnailImage=img)
                            i.setProperty('IsPlayable', 'true')
                            xbmcplugin.addDirectoryItem(h, uri, i)
        xbmcplugin.endOfDirectory(h)
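GET is not part of this snippet; presumably a small urllib2 wrapper that returns None on failure, along these lines (an assumption, not the original):

import urllib2

def GET(url, timeout=10):
    # Hypothetical helper: fetch a URL, swallowing errors into None.
    try:
        return urllib2.urlopen(url, timeout=timeout).read()
    except Exception:
        return None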