def pisaParser(src, context, default_css="", xhtml=False, encoding=None, xml_output=None):
    """
    - Parse HTML and get miniDOM
    - Extract CSS information, add default CSS, parse CSS
    - Handle the document DOM itself and build reportlab story
    - Return Context object
    """
    global CSSAttrCache
    CSSAttrCache = {}

    if xhtml:
        # TODO: html5lib.XHTMLParser doesn't seem to exist...
        parser = html5lib.XHTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    else:
        parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))

    if type(src) in StringTypes:
        if type(src) is unicode:
            # If an encoding was provided, do not change it.
            if not encoding:
                encoding = "utf-8"
            src = src.encode(encoding)
        src = pisaTempFile(src, capacity=context.capacity)

    # Test for the restrictions of html5lib
    if encoding:
        # Workaround for html5lib < 0.11.1
        if hasattr(inputstream, "isValidEncoding"):
            if encoding.strip().lower() == "utf8":
                encoding = "utf-8"
            if not inputstream.isValidEncoding(encoding):
                log.error("%r is not a valid encoding e.g. 'utf8' is not valid but 'utf-8' is!", encoding)
        else:
            if inputstream.codecName(encoding) is None:
                log.error("%r is not a valid encoding", encoding)

    document = parser.parse(src, encoding=encoding)

    if xml_output:
        if encoding:
            xml_output.write(document.toprettyxml(encoding=encoding))
        else:
            xml_output.write(document.toprettyxml(encoding="utf8"))

    if default_css:
        context.addDefaultCSS(default_css)

    pisaPreLoop(document, context)
    #try:
    context.parseCSS()
    #except:
    #    context.cssText = DEFAULT_CSS
    #    context.parseCSS()
    # context.debug(9, pprint.pformat(context.css))

    pisaLoop(document, context)
    return context
def pisaParser(src, c, default_css="", xhtml=False, encoding=None, xml_output=None):
    """
    - Parse HTML and get miniDOM
    - Extract CSS information, add default CSS, parse CSS
    - Handle the document DOM itself and build reportlab story
    - Return Context object
    """
    if xhtml:
        parser = html5lib.XHTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    else:
        parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))

    if type(src) in types.StringTypes:
        if type(src) is types.UnicodeType:
            encoding = "utf8"
            src = src.encode(encoding)
        #src = pisaTempFile(src, capacity=c.capacity)

    # Test for the restrictions of html5lib
    if encoding:
        # Workaround for html5lib < 0.11.1
        if hasattr(inputstream, "isValidEncoding"):
            if encoding.strip().lower() == "utf8":
                encoding = "utf-8"
            if not inputstream.isValidEncoding(encoding):
                log.error("%r is not a valid encoding e.g. 'utf8' is not valid but 'utf-8' is!", encoding)
        else:
            if inputstream.codecName(encoding) is None:
                log.error("%r is not a valid encoding", encoding)
        #encoding = 'utf-8'

    document = parser.parse(src, encoding=encoding)

    if xml_output:
        xml_output.write(document.toprettyxml(encoding="utf8"))

    if default_css:
        c.addCSS(default_css)

    #from html5lib import treewalkers, serializer
    #walker = treewalkers.getTreeWalker("dom")
    #stream = walker(document)
    #s = serializer.htmlserializer.HTMLSerializer(omit_optional_tags=False)
    #output_generator = s.serialize(stream)
    #for item in output_generator:
    #    print item

    #pisaPreLoop(document, c)
    #try:
    c.parseCSS()
    #except:
    #    c.cssText = DEFAULT_CSS
    #    c.parseCSS()
    # c.debug(9, pprint.pformat(c.css))

    pisaLoop(document, c)
    return c
def pisaParser(src, c, default_css="", xhtml=False, encoding=None):
    """
    - Parse HTML and get miniDOM
    - Extract CSS information, add default CSS, parse CSS
    - Handle the document DOM itself and build reportlab story
    - Return Context object
    """
    if xhtml:
        parser = html5lib.XHTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    else:
        parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))

    if type(src) in types.StringTypes:
        if type(src) is types.UnicodeType:
            encoding = "utf8"
            src = src.encode(encoding)
        src = StringIO.StringIO(src)

    # Test for the restrictions of html5lib
    if encoding:
        # Workaround for html5lib < 0.11.1
        if hasattr(inputstream, "isValidEncoding"):
            if encoding.strip().lower() == "utf8":
                encoding = "utf-8"
            if not inputstream.isValidEncoding(encoding):
                log.error("%r is not a valid encoding e.g. 'utf8' is not valid but 'utf-8' is!", encoding)
        else:
            if inputstream.codecName(encoding) is None:
                log.error("%r is not a valid encoding", encoding)

    document = parser.parse(src, encoding=encoding)
    # print document.toprettyxml()

    if default_css:
        c.addCSS(default_css)

    pisaPreLoop(document, c)
    #try:
    c.parseCSS()
    #except:
    #    c.cssText = DEFAULT_CSS
    #    c.parseCSS()
    # c.debug(9, pprint.pformat(c.css))

    pisaLoop(document, c)
    return c
def __init__(self, alive_event, work_event, email, queue, index_queue, passed,
             robots, unavailable_urls, agent_name=DEFAULT_AGENT_NAME,
             headers=DEFAULT_HEADERS, url_filter=lambda x: True):
    Process.__init__(self)
    self.alive_event = alive_event
    self.work_event = work_event
    self.email = email
    self.queue = queue
    self.index_queue = index_queue
    self.passed = passed
    self.robots = robots
    self.unavailable_urls = unavailable_urls
    self.agent_name = agent_name
    self.url_filter = url_filter
    self.is_working = False

    self.handler = request.build_opener(RobotHandler(agent_name, robots))
    handler_headers = [(k, v) for k, v in copy.copy(headers).items()]
    handler_headers.append(("User-Agent", agent_name))
    handler_headers.append(("From", email))
    self.handler.addheaders = handler_headers

    self.html_parser = HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    self.connection = None
def parse_text(text):
    t1 = time.clock()
    parser = html5lib.HTMLParser(
        tree=treebuilders.getTreeBuilder('etree'), tokenizer=MySanitiser)
    t2 = time.clock()

    text = text.replace('\r', '')
    text = text.replace('\n', '<br>')

    t3 = time.clock()
    for search, replace in SMILEY_REPLACEMENTS:
        text = text.replace(search, replace)
    for regex, replace in BBCODE_REGEXES:
        text = regex.sub(replace, text)
    for search, replace in BBCODE_REPLACEMENTS:
        text = text.replace(search, replace)

    t4 = time.clock()
    doc = parser.parse(text)
    t5 = time.clock()

    walker = treewalkers.getTreeWalker('etree')
    stream = walker(doc)
    s = serializer.htmlserializer.HTMLSerializer()
    output_generator = s.serialize(stream)

    t6 = time.clock()
    done = Markup(''.join(list(output_generator)))
    t7 = time.clock()

    print('Init:%f, BR:%f, Regex:%f, Parse:%f, Serial:%f, Join:%f, All:%f' %
          (t2-t1, t3-t2, t4-t3, t5-t4, t6-t5, t7-t6, t7-t1))
    return done
def truncate_html(*args):
    document = truncate(*args)
    parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    document = parser.parse(document)
    xml = document.getElementsByTagName('body')[0].childNodes[0].toxml()
    return xml
def _get_default_parser():
    if settings.TEXT_HTML_SANITIZE:
        parser_classes = []
        for parser_class in settings.ALLOW_TOKEN_PARSERS:
            parser_classes.append(import_string(parser_class))
        TextSanitizer.allow_token_parsers = parser_classes
    return html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
def get_favicon_url(self, html):
    """
    Parses *html* looking for a favicon URL.  Returns a tuple of:

        (<url>, <mimetype>)

    If no favicon can be found, returns:

        (None, None)
    """
    p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    dom_tree = p.parse(html)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)
    fetch_url = None
    mimetype = None
    icon = False
    found_token = None
    for token in stream:
        if 'name' in token:
            if token['name'] == 'link':
                for attr in token['data']:
                    if attr[0] == 'rel':
                        if 'shortcut icon' in attr[1].lower():
                            found_token = token
                            icon = True
                    elif attr[0] == 'href':
                        fetch_url = attr[1]
                    elif attr[0] == 'type':
                        mimetype = attr[1]
    if fetch_url and icon:
        if not mimetype:
            mimetype = "image/x-icon"
        if mimetype in self.favicon_mimetypes:
            return (fetch_url, mimetype)
    return (None, None)
def clean_html(buf):
    """Cleans HTML of dangerous tags and content."""
    buf = buf.strip()
    if not buf:
        return buf

    html_parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"),
                                      tokenizer=HTMLSanitizer)
    dom_tree = html_parser.parseFragment(buf)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)

    s = serializer.htmlserializer.HTMLSerializer(omit_optional_tags=False,
                                                 quote_attr_values=True)
    output = s.render(stream, 'utf-8')

    while 'toberemoved' in output:
        oldoutput = output
        matches = re.findall(r'<toberemoved.*?>.*?</toberemoved>', output, re.DOTALL)
        for s in matches:
            output = output.replace(s, '')
        matches = re.findall(r'</toberemoved>', output, re.DOTALL)
        for s in matches:
            output = output.replace(s, '')
        matches = re.findall(r'<toberemoved.*?>', output, re.DOTALL)
        for s in matches:
            output = output.replace(s, '')
        if output == oldoutput:
            break

    return output
def get_spaces_available(dept_abbr, course_num):
    # define
    post_data = {
        'classyear': '2008',  # don't know WHY!?!
        'subj': dept_abbr,
        'crsenum': course_num,
    }
    url = 'http://oracle-www.dartmouth.edu/dart/groucho/timetable.course_quicksearch'

    # get the html
    cj = cookielib.LWPCookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)
    headers = {'User-agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    request = urllib2.Request(url, urllib.urlencode(post_data), headers)
    handle = urllib2.urlopen(request)
    html = handle.read()

    # parse the html
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
    soup = parser.parse(html)
    tbody = soup.find('th', text='Term').parent.parent.parent
    cells = tbody.findAll('tr')[2]('td')
    enrolled = int(cells[-2].contents[0])
    capacity = int(cells[-3].contents[0])
    print "%i spaces left (capacity of %i with %i enrolled)" % (capacity-enrolled, capacity, enrolled)
def get_toc(self, path):
    # Only have TOC on tutorial pages. Don't do work for others.
    if not (re.search('/tutorials', path) or re.search('/mobile', path)):
        return ''

    toc = memcache.get('%s|toc|%s' % (settings.MEMCACHE_KEY_PREFIX, path))
    if toc is None or not self.request.cache:
        template_text = render_to_string(path, {})

        parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
        dom_tree = parser.parse(template_text)
        walker = treewalkers.getTreeWalker("dom")
        stream = walker(dom_tree)
        toc = []
        current = None
        for element in stream:
            if element['type'] == 'StartTag':
                if element['name'] in ['h2', 'h3', 'h4']:
                    for attr in element['data']:
                        if attr[0] == 'id':
                            current = {
                                'level': int(element['name'][-1:]) - 1,
                                'id': attr[1]
                            }
            elif element['type'] == 'Characters' and current is not None:
                current['text'] = element['data']
            elif element['type'] == 'EndTag' and current is not None:
                toc.append(current)
                current = None
        memcache.set('%s|toc|%s' % (settings.MEMCACHE_KEY_PREFIX, path), toc, 3600)
    return toc
def to_doc(text, parser=scraper.LXML_HTML, whole_doc=True):
    """Parse an HTML text. Return value: lxml.html.HtmlElement document.

    parser: which parser to use.
    whole_doc: parse to complete HTML document (with <html> around),
               or parse just a fragment of HTML."""
    doc = None
    if parser == scraper.LXML_HTML:
        if whole_doc:
            doc = html.document_fromstring(text)
        else:
            doc = html.fromstring(text)
    elif parser == scraper.HTML5PARSER:
        # html5parser was broken for me, bug report is here:
        # https://bugs.launchpad.net/lxml/+bug/780642
        #if whole_doc:
        #    doc = html5parser.document_fromstring(text)
        #else:
        #    doc = html5parser.fromstring(text)
        # Here is my workaround:
        parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("lxml"),
                                     namespaceHTMLElements=False)
        etree_doc = parser.parse(text)  # returns an ElementTree
        doc = html.document_fromstring(elementtree_to_string(etree_doc))
        # ^ this double conversion makes it slow ^
    elif parser == scraper.BEAUTIFULSOUP:
        # soupparser has no document_fromstring method
        doc = soupparser.fromstring(text)
    else:
        print >> sys.stderr, "Warning: you want to use an unknown parser in lx.py."
        # doc is None

    return doc  # lxml.html.HtmlElement
def html2text(html):  # renamed from the original typo "hmtl2text"
    p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    dom_tree = p.parse(html.decode("utf-8"))
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)

    in_script = False
    outbuf = []
    current_line = []
    for token in stream:
        token_name = token.get('name', "").lower()
        if token_name in ['script', 'style', 'noscript']:
            in_script = token.get('type', None) == 'StartTag'
        if in_script:
            continue
        if token_name in block_level_elements or token_name == "br":
            if current_line:
                outbuf.append(u"".join(current_line))
                current_line = []
        if token.get(u'type', None) == u'Characters':
            current_line.append(token['data'])
        if token.get(u'type', None) == u'SpaceCharacters':
            if current_line and current_line[-1] != u" ":
                current_line.append(u" ")
    if current_line:
        outbuf.append(u"".join(current_line))
    return clean_whitespace("\n".join(outbuf))
def parseXml(self, xmlStr):
    '''
    Takes in a string of XML extracted from the XFA and finds any script elements.
    Returns a tuple containing two elements:
        1) the root of the parsed XML tree structure
        2) a list of any script elements found in the tree
    '''
    # parse XML and look for 'script' elements
    #self.logger.debug("Parsing XML...")
    # all JavaScript executions here are to build the DOM
    is_dom = True
    try:
        # The XML must be wrapped in a root element because there may not be
        # just 1 root after I smashed together all the xml ;]
        xml = "<xfa>%s</xfa>" % (xmlStr)
        #logger.info("xml: %s" % repr(xml))
        xmlTree = etree.fromstring(xml)
    except Exception, e:
        #self.logger.warn("[lxml] exception from parsing XML: %s" % e)
        #logger.info(" [lxml] going to try with html5lib..")
        return (None, None)
        # html5lib fallback, disabled by the return above:
        import html5lib
        from html5lib import treebuilders
        parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
        xmlTree = parser.parse(xml)
def _get_default_parser():
    opts = {}
    sanitizer.HTMLSanitizer.acceptable_elements.extend(['cms-plugin'])
    if settings.TEXT_HTML_SANITIZE:
        sanitizer.HTMLSanitizer.acceptable_elements.extend(
            settings.TEXT_ADDITIONAL_TAGS)
        sanitizer.HTMLSanitizer.acceptable_attributes.extend(
            settings.TEXT_ADDITIONAL_ATTRIBUTES)
        sanitizer.HTMLSanitizer.allowed_elements = (
            sanitizer.HTMLSanitizer.acceptable_elements +
            sanitizer.HTMLSanitizer.mathml_elements +
            sanitizer.HTMLSanitizer.svg_elements)
        sanitizer.HTMLSanitizer.allowed_attributes = (
            sanitizer.HTMLSanitizer.acceptable_attributes +
            sanitizer.HTMLSanitizer.mathml_attributes +
            sanitizer.HTMLSanitizer.svg_attributes)
        sanitizer.HTMLSanitizer.allowed_protocols = (
            sanitizer.HTMLSanitizer.acceptable_protocols +
            list(settings.TEXT_ADDITIONAL_PROTOCOLS))
        parser_classes = []
        for parser_class in settings.ALLOW_TOKEN_PARSERS:
            parser_classes.append(import_string(parser_class))
        TextSanitizer.allow_token_parsers = parser_classes
        opts['tokenizer'] = TextSanitizer
    return html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"), **opts)
def _parse_wayback_page(self, page_year):
    """
    Parse all recorded web page URLs in a specific year.
    """
    his_urls = []
    wholepage = self.open_url(page_year)
    if wholepage == None:
        return his_urls

    parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
    try:
        html_doc = parser.parse(wholepage)
    except ValueError:
        wholepage_clean = ''.join(c for c in wholepage
                                  if _valid_XML_char_ordinal(ord(c)))
        html_doc = parser.parse(wholepage_clean)

    body = html_doc.find("./{*}body")
    position_div = body.find("./{*}div[@id='position']")
    wayback_cal = position_div.find("./{*}div[@id='wbCalendar']")
    calOver = wayback_cal.find("./{*}div[@id='calOver']")
    for month in calOver.findall("./{*}div[@class='month']"):
        for day in month.findall(".//{*}td"):
            day_div = day.find("./{*}div[@class='date tooltip']")
            if day_div != None:
                for snapshot in day_div.findall(
                        "./{*}div[@class='pop']/{*}ul/{*}li"):
                    his_urls.append(snapshot[0].get('href'))

    year = self.extract_year(his_urls[0]) if len(his_urls) > 0 else None
    return (year, his_urls)
def _normalize(html):
    """
    Normalize the given string of HTML, collapsing whitespace.
    """
    # This is taken from the "Serialization of Streams" section of
    # http://code.google.com/p/html5lib/wiki/UserDocumentation.
    p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    dom_tree = p.parse(html)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)
    s = serializer.htmlserializer.HTMLSerializer(omit_optional_tags=False)
    output_generator = s.serialize(stream)

    # TODO: We're not actually collapsing *all* whitespace; only
    # entire chunks of whitespace that the serializer gives us. Currently,
    # this seems "good enough" to pass our unit tests, which are
    # based on use cases of comparing pre-sanitized HTML to sanitized HTML,
    # but we may need to change this in the future.
    parts = []
    last_item_was_whitespace = False
    for item in output_generator:
        # Is it empty whitespace?
        if item.strip() != '':
            parts.append(item)
            last_item_was_whitespace = False
        elif not last_item_was_whitespace:
            # Collapse whitespace.
            parts.append(' ')
            last_item_was_whitespace = True
    return ''.join(parts)
def get_dom(self, buf):
    buf = buf.strip()
    if not buf:
        return None
    p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"),
                            tokenizer=self.token_sanitizer())
    return p.parseFragment(buf)
def _make_dom(self, input_string):
    """Given an input_string containing [X]HTML, return a tuple of
    (dom, options).

    If input_string is valid XML, xml.dom.minidom is used to perform
    the parsing. If input_string is not valid XML, fall back to using
    html5lib for creating the DOM.

    input_string is wrapped in StringIO so we can easily reset in the
    event of errors."""
    options = pyRdfa.Options()
    try:
        # try to parse as XML
        dom = xml.dom.minidom.parse(StringIO(input_string))
    except:
        # fall back to html5lib
        parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
        dom = parser.parse(input_string, encoding='utf-8')
        # The host language has changed
        options.host_language = pyRdfa.HTML5_RDFA
    return dom, options
def __init__(self, api='etree'):
    # If no default implementation is defined for this api, set it to None
    # to let getTreeBuilder() use the corresponding implementation.
    implementation = self.defaults.get(api, None)
    HTMLParser.__init__(self, tree=treebuilders.getTreeBuilder(
        api, implementation))
def TrusteeImage(personName, withQuotes=True):
    # TODO
    # Make option to search for names without quotes as well.
    opener = urllib2.build_opener(urllib2.HTTPRedirectHandler)
    headers = {'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.6) Gecko/2009020911 Ubuntu/8.04 (hardy) Firefox/3.0.6'}
    if (withQuotes):
        url = "http://images.google.com/images?hl=en&q=" + urllib.quote("\"" + personName + "\"")
    else:
        url = "http://images.google.com/images?hl=en&q=" + urllib.quote(personName)
    request = urllib2.Request(url, None, headers)
    response = opener.open(request)
    results = response.read()

    parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
    soup = parser.parse(results)

    # check for meta tag, if we have it we need to find the redirect
    # From 3 March 2011, just need the regex
    #meta = soup.findAll("div", id="tphdr")
    #contents = meta[0].contents[0]
    metaRE = re.compile("0;url=(.+)\"")
    #m = metaRE.search(str(contents))
    m = metaRE.search(str(results))

    # If we have a match, do the redirect
    if (len(m.groups()) > 0):
        redirectURL = m.groups()[0]
        print redirectURL
        request = urllib2.Request(redirectURL, None, headers)
        response = opener.open(request)
        results = response.read()
        soup = parser.parse(results)

    # Find the div with the images
    div = soup.findAll("div", id="ImgCont")
    # If there's nothing found, return None
    if (len(div) == 0):
        return None
    # Get the links in the div
    links = div[0].findAll("a")
    # If there're no links, return None
    if (len(links) == 0):
        return None
    # If we're here, then we're probably okay
    # Get the first link
    link = links[0]
    # Create our regex
    value = re.compile("imgurl=(.+)&imgrefurl")
    m = value.search(link["href"])
    # Our link should be the first returned result
    imageLink = m.groups()[0]
    print imageLink
    return imageLink
def GetMarginBlotterDataSet(self, UserID, PWD):
    u = urllib.urlopen('http://api.efxnow.com/DEMOWebServices2.8/Service.asmx/GetMarginBlotterDataSet?UserID=' + UserID + '&PWD=' + PWD)
    data = u.read()  # renamed from 'str' to avoid shadowing the builtin
    parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
    tree = parser.parse(data)
    root = tree.getroot()
    PostedMargin = root.find('.//{http://www.w3.org/1999/xhtml}postedmargin').text
    RealizedProfit = root.find('.//{http://www.w3.org/1999/xhtml}realizedprofit').text
    UnrealizedProfit = root.find('.//{http://www.w3.org/1999/xhtml}unrealizedprofit').text
    MarginFactor = root.find('.//{http://www.w3.org/1999/xhtml}marginfactor').text
    MarginBalance = root.find('.//{http://www.w3.org/1999/xhtml}marginbalance').text
    TotalAvailable = root.find('.//{http://www.w3.org/1999/xhtml}totalavailable').text
    OpenPosiiton = root.find('.//{http://www.w3.org/1999/xhtml}openposiiton').text
    MaxDeal = root.find('.//{http://www.w3.org/1999/xhtml}maxdeal').text
    USDPostedMargin = root.find('.//{http://www.w3.org/1999/xhtml}usdpostedmargin').text
    USDRealizedProfit = root.find('.//{http://www.w3.org/1999/xhtml}usdrealizedprofit').text
    return {'PostedMargin': PostedMargin,
            'RealizedProfit': RealizedProfit,
            'UnrealizedProfit': UnrealizedProfit,
            'MarginFactor': MarginFactor,
            'MarginBalance': MarginBalance,
            'TotalAvailable': TotalAvailable,
            'OpenPosiiton': OpenPosiiton,  # sic: matches the 'openposiiton' element name in the feed
            'MaxDeal': MaxDeal,
            'USDPostedMargin': USDPostedMargin,
            'USDRealizedProfit': USDRealizedProfit}
def __init__(self, url):
    page = urllib2.urlopen(url)
    parser = html5lib.HTMLParser(
        tokenizer=sanitizer.HTMLSanitizer,
        tree=treebuilders.getTreeBuilder("beautifulsoup"))
    self.document = parser.parse(page, encoding="iso8859-15")
    self.type_scrutin = 0
def load_deployment_servername_ref(self, *args, **kwargs):
    if self._is_DEPLOYMENT_SERVERNAME_in_debug:
        print green('\t%s(...)\t|%s| <-') % (self.load_deployment_servername_ref.func_name, 'load_deployment_servername_ref')
    result = None
    self._httpconn.set_debuglevel(0)
    get_params = None
    post_params = None
    self._httpconn.request('GET', u'/carto/serveurs/ServeurListe.php',
                           u'%s'.encode('UTF-8') % (post_params), self._headers)
    resp = self._httpconn.getresponse().read()
    result = dict(map(
        lambda e: (e.text,
                   re.match('ServeurVoir.php\?id=([0-9]*)', e.attrib['href']).group(1)),
        [e for e in HTMLParser(tree=treebuilders.getTreeBuilder('lxml')).parse(resp).xpath(
             '*//html:table/html:tbody//html:a', namespaces=self._d_namespaces)
         if e.text is not None]))
    if self._is_DEPLOYMENT_SERVERNAME_in_debug:
        print green('\t%s(...)\t|%s| -> %s') % (self.load_deployment_servername_ref.func_name, 'load_deployment_servername_ref', result)
    return result
def pisaParser(src, c, default_css=""): """ - Parse HTML and get miniDOM - Extract CSS informations, add default CSS, parse CSS - Handle the document DOM itself and build reportlab story - Return Context object """ parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom")) # XXX Bugfix for HTML5lib src = src.read().replace("<![CDATA[", "\n").replace("]]>", "\n") src = cStringIO.StringIO(src) document = parser.parse(src) # print document.toprettyxml() if default_css: c.addCSS(default_css) pisaPreLoop(document, c) # try: c.parseCSS() # except: # c.cssText = DEFAULT_CSS # c.parseCSS() c.debug(9, pprint.pformat(c.css)) pisaLoop(document, c) return c
def wasp():
    wx = {}
    try:
        p = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
        doc = p.parse(urllib2.urlopen("http://swaspgateway.suth/", timeout=1).read())
        t = doc.getElementsByTagName("table")[0]
        tds = t.getElementsByTagName("td")
        wx["Temp"] = float(tds[7].firstChild.nodeValue)
        if tds[10].firstChild.nodeValue == "RAIN":
            wx["Sky"] = "Rain"
            wx["Sky Temp"] = wx["Temp"]
        else:
            sky, stemp = tds[10].firstChild.nodeValue.split('(')
            stemp = stemp[0:-1]
            wx["Sky"] = sky
            wx["Sky Temp"] = stemp
        wx["T - DP"] = float(tds[9].firstChild.nodeValue)
        wx["RH"] = float(tds[8].firstChild.nodeValue)
        tds[6].normalize()
        wx["Wind Dir"] = tds[6].firstChild.nodeValue[1:]
        wx["Wind Speed"] = float(tds[5].firstChild.nodeValue)
        rain = tds[4].firstChild.nodeValue
        if rain == "DRY":
            wx["Raining"] = False
        else:
            wx["Raining"] = True
        wx["UT"] = tds[3].firstChild.nodeValue.strip()
        tds[31].normalize()
        wx["Status"] = tds[31].firstChild.nodeValue.strip()
        return wx
    except:
        return False
def login(self, username, password):
    """
    Login to o2online.ie

    Returns True if successful or False if it fails.
    """
    if self.resumable():
        self.logger.info("Resuming from login.")
        return True
    else:
        self.logger.info("Unable to resume, running connect from login.")
        self.connect()

    post = [
        ('IDButton', 'Go'),
        ('org', 'o2ext'),
        ('CONNECTFORMGET', 'TRUE'),
        ('IDToken1', username),
        ('IDToken2', password)
    ]
    handle = self.post('https://www.o2online.ie/amserver/UI/Login', post)

    from html5lib import HTMLParser, treebuilders
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
    soup = parser.parse(handle)
    if unicode(soup.html.head.title.string).strip() == u"LoginCheck":
        self.logger.info("login has correct HTML title.")
        return True
    return False
def scraper(request):
    post_data = {
        'classyear': '2008',  # why??
        'subj': 'COSC',
        'crsenum': '50'
    }
    url = 'http://oracle-www.dartmouth.edu/dart/groucho/timetable.course_quicksearch'

    # scrape the html
    cj = cookielib.LWPCookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)
    headers = {'User-agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    request = urllib2.Request(url, urllib.urlencode(post_data), headers)
    handle = urllib2.urlopen(request)
    html = handle.read()

    # parse for the dept and course number
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
    soup = parser.parse(html)
    #tbody = soup.find('th', text='Term').parent.parent.parent
    #soup = tbody.findAll('tr')[2]('td')
    return render_to_response("scraper.html", {'soup': soup})
def download(self, directory, filename=None, force=None):
    url = self.baseurl + self.action
    info("Downloading from tmpfile: %s" % url)
    debug("robot code: %s" % self.robot_code)

    values = {'robot_code': self.robot_code}
    data = urllib.urlencode(values)
    req = urllib2.Request(url, data)
    response = urllib2.urlopen(req)
    page = response.read()
    page = page.replace('xml:lang="ru"', '')

    eparser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("lxml", ElementTree))
    doc = eparser.parse(page)
    url = doc.xpath("//xhtml:div[@id='cntMain']//xhtml:center//xhtml:a",
                    namespaces={'xhtml': 'http://www.w3.org/1999/xhtml'})[0].text

    response = urllib2.urlopen(url)
    inf = response.info()
    if filename is None:
        match = re.search('filename="(.*)"', inf['Content-Disposition'])
        filename = match.group(1)

    debug("Destination directory: %s" % directory)
    debug("Destination file: %s" % filename)
    dstpath = os.path.join(directory, filename)
    if os.path.exists(dstpath) and not force:
        warn("File %s already exists, skipping" % dstpath)
        return
    self.save(response, dstpath)
def sanitize_html_fragment(unsafe, allowed_elements=None,
                           allowed_attributes_map=None,
                           allowed_styles_map=None,
                           rename_elements=None, encoding='UTF-8'):
    # TODO: make this more simple / understandable and factor out from
    # plugins.html_to_template_text
    if not allowed_elements:
        allowed_elements = sanitizer.HTMLSanitizer.allowed_elements

    tokenizer = custom_sanitizer(allowed_elements, allowed_attributes_map,
                                 allowed_styles_map, rename_elements)
    p = html5lib.HTMLParser(
        tree=treebuilders.getTreeBuilder("lxml"),
        tokenizer=tokenizer,
        namespaceHTMLElements=False
    )
    top_level_elements = p.parseFragment(unsafe, encoding=encoding)
    # put top level elements in container
    container = etree.Element('div')
    if top_level_elements and not hasattr(top_level_elements[0], 'tag'):
        container.text = top_level_elements.pop(0)
    container.extend(top_level_elements)

    html_bits = [etree.tostring(elem, method='html', encoding=encoding)
                 for elem in container]

    return ''.join([escape(container.text or '').encode(encoding)] + html_bits)
def pisaParser(src, c, default_css=""): """ - Parse HTML and get miniDOM - Extract CSS informations, add default CSS, parse CSS - Handle the document DOM itself and build reportlab story - Return Context object """ parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom")) # XXX Bugfix for HTML5lib src = src.read().replace("<![CDATA[", "\n").replace("]]>", "\n") src = cStringIO.StringIO(src) document = parser.parse(src) # print document.toprettyxml() if default_css: c.addCSS(default_css) pisaPreLoop(document, c) #try: c.parseCSS() #except: # c.cssText = DEFAULT_CSS # c.parseCSS() c.debug(9, pprint.pformat(c.css)) pisaLoop(document, c) return c
def html_parser(html):
    try:
        soup = BeautifulSoup(html)
    except:
        parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
        soup = parser.parse(html)
    return soup
def clean_html(input, sanitize=False):
    """
    Takes an HTML fragment and processes it using html5lib to
    ensure that the HTML is well-formed.

    :param sanitize: Remove unwanted HTML tags and attributes.

    >>> clean_html("<p>Foo<b>bar</b></p>")
    u'<p>Foo<b>bar</b></p>'
    >>> clean_html("<p>Foo<b>bar</b><i>Ooops!</p>")
    u'<p>Foo<b>bar</b><i>Ooops!</i></p>'
    >>> clean_html('<p>Foo<b>bar</b>& oops<a href="#foo&bar">This is a <>link</a></p>')
    u'<p>Foo<b>bar</b>&amp; oops<a href=#foo&amp;bar>This is a &lt;&gt;link</a></p>'
    """
    parser_kwargs = {}
    serializer_kwargs = {}
    if sanitize:
        if HTMLSanitizer is None:
            # new syntax as of 0.99999999/1.0b9 (released on July 14, 2016)
            serializer_kwargs['sanitize'] = True
        else:
            parser_kwargs['tokenizer'] = HTMLSanitizer

    p = HTMLParser(tree=treebuilders.getTreeBuilder("dom"), **parser_kwargs)
    dom_tree = p.parseFragment(input)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)
    s = HTMLSerializer(omit_optional_tags=False, **serializer_kwargs)
    return "".join(s.serialize(stream))
def savePage(self, page, client):
    self.retries = 0
    print "Got page:", self.PAGE_NUMBER, "with status", client.status, client.message
    if int(client.status) < self.ERROR_CODE and self.PAGE_NUMBER < self.NPAGES:
        print "Trying to schedule getting the next page..."
        filename = self.FILENAME_PATTERN % (self.PAGE_NUMBER,)
        f = open(filename, "w")
        f.write(page)
        f.close()
        if self.PAGE_NUMBER == 0:
            # only do this for the first page:
            f = open(filename, "r")
            parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
            bs = parser.parse(f)
            pagingDiv = bs.find("div", {"class": "paging"})
            pageLinks = pagingDiv.findAll("a")
            # Magic offset ( - 1 ) to adjust for zero indexing
            self.NPAGES = len(pageLinks) - 1
            print "Setting number of pages in query to:", self.NPAGES
        self.PAGE_NUMBER += 1
        d = self.getNextPage()
        return d
    else:
        return page, client
def __init__(self, *a, **kw):
    super(HTMLParser, self).__init__(*a, **kw)
    self._parser = html5parser.HTMLParser(
        tree=treebuilders.getTreeBuilder("dom"),
    )
    self._treewalker = None
def __init__(self, html):
    """Create a parse tree from the given HTML."""
    def really_parse_fragment(parser, html):
        """Parse a possibly multi-rooted HTML fragment, wrapping it in a
        <div> to make it easy to query later.

        As far as I can tell, this is what parseFragment is supposed to do
        (but doesn't). See
        http://code.google.com/p/html5lib/issues/detail?id=161.
        """
        top_level_elements = parser.parseFragment(html)
        container = Element(self.CONTAINER_TAG)

        # Why lxml couldn't just have text nodes, I'll never understand.
        # Text nodes that come other than first are automatically stuffed
        # into the tail attrs of the preceding elements by html5lib.
        if top_level_elements and isinstance(top_level_elements[0], basestring):
            container.text = top_level_elements.pop(0)

        container.extend(top_level_elements)
        return container

    p = HTMLParser(tree=getTreeBuilder(self.TREEBUILDER))
    self._root = really_parse_fragment(p, html)
def prepare_project_stream(stream, base_url, metadata):
    """
    Sanitizes a butter HTML export
    - Picks the plug-in required from the stream.
    """
    stream = force_unicode(stream) if stream else u""
    tree = treebuilders.getTreeBuilder("lxml")
    parser = html5lib.HTMLParser(tree=tree, namespaceHTMLElements=False)
    document_tree = parser.parse(stream)

    # plugins are relative
    scripts = document_tree.xpath("//script[@src]")
    plugins = [s.get("src") for s in scripts
               if not urlparse(s.get("src")).netloc]

    # styles are relative
    styles = document_tree.xpath("//link[@href]")
    css = [s.get("href") for s in styles
           if not urlparse(s.get("href")).netloc]

    # inline css
    inline_css = []
    for inline in document_tree.xpath("//style"):
        inline_css.append(strip_tags(inline.text))
        inline.getparent().remove(inline)

    # remove script tags
    for inline in document_tree.xpath("//script"):
        inline.getparent().remove(inline)

    popcorn = (prepare_popcorn_string_from_project_data(json.loads(metadata))
               if metadata else "")
    body = [clean(tostring(b)) + popcorn
            for b in document_tree.xpath("//body")]
    context = {
        "styles": css,
        "scripts": plugins,
        "inline_css": inline_css,
        "body": body,
    }
    return render_to_string("project/skeleton.html", context)
def summary_scrape(urn):
    print " - summary"
    url = "http://www.edubase.gov.uk/establishment/summary.xhtml?urn=" + urn
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
    page = parser.parse(scrape(url))
    keyvaluepairs = table_extract(page)

    raw_address = [x.strip() for x in keyvaluepairs.pop("").split(" / ")]
    if postcode.match(raw_address[-1]):
        keyvaluepairs["Postcode"] = raw_address[-1]
        raw_address = raw_address[:-1]
    keyvaluepairs["Address"] = " / ".join(raw_address)

    for t in page.findall(path(["body", "div", "div", "div", "div", "table",
                                "tbody", "tr", "td", "h1"], pre)):
        x = t.text.split(": ")
        keyvaluepairs[x[0]] = x[1]

    for t in page.findall(path(["body", "div", "div", "div", "div", "table",
                                "tbody", "tr", "td", "div", "p", "b"], pre)):
        keyvaluepairs[t.text.strip().strip(":")] = (t.tail or "").strip()

    return keyvaluepairs
def extract_html_urls(self, html):
    """
    Take all ``<img src="..">``, ``<source srcset="..">`` and
    ``<a href="..">`` URLs from the HTML.
    """
    p = HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    dom = p.parse(html)
    urls = []

    for img in dom.getElementsByTagName("img"):
        src = img.getAttribute("src")
        if src:
            urls.append(unquote_utf8(src))
        srcset = img.getAttribute("srcset")
        if srcset:
            urls += self.extract_srcset(srcset)

    for source in dom.getElementsByTagName("source"):
        srcset = source.getAttribute("srcset")
        if srcset:
            urls += self.extract_srcset(srcset)

    for source in dom.getElementsByTagName("a"):
        href = source.getAttribute("href")
        if href:
            urls.append(unquote_utf8(href))

    return urls
def test_sanitizer_without_token_parsers(self):
    sanitizer.TextSanitizer.allow_token_parsers = ()
    parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"),
                                 tokenizer=sanitizer.TextSanitizer)
    body = '<span data-one="1" data-two="2">some text</span>'
    body = html.clean_html(body, full=False, parser=parser)
    self.assertEqual('<span>some text</span>', body)
def load_env_ref(self, *args, **kwargs):
    if self._is_ENV_in_debug:
        print red('\t\t\t%s(...)\t|%s| <-') % (self.load_env_ref.func_name, 'load_env_ref')
    result = None
    self._httpconn.set_debuglevel(0)
    get_params = urlencode({u'Code'.encode('UTF-8'): u''.encode('UTF-8')})
    post_params = None
    self._httpconn.request('GET',
                           u'/carto/lib/Div_AddEnvironnement.php?%s'.encode('UTF-8') % (get_params),
                           u'%s'.encode('UTF-8') % (post_params),
                           self._headers)
    resp = self._httpconn.getresponse().read()
    result = dict(map(
        lambda e: (e.text, e.attrib['value']),
        HTMLParser(tree=treebuilders.getTreeBuilder('lxml')).parse(resp).xpath(
            '*//html:select[@id="AddEnvironnementIdTypeEnv"]/html:option',
            namespaces=self._d_namespaces)))
    if self._is_ENV_in_debug:
        print red('\t\t\t%s(...)\t|%s| -> %s') % (self.load_env_ref.func_name, 'load_env_ref', result)
    return result
def do_year(y, url):
    pagetext = urllib2.urlopen(url)
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"),
                        tokenizer=sanitizer.HTMLSanitizer)
    page = parser.parse(pagetext)
    for section in page.findall(
            "body/div/div/div/div/div/div/div/div/table[@class='fixture']"):
        matchtype = section.find("caption").text
        for match in section.findall("tbody/tr"):
            l = list(match.getchildren())
            d = {}
            d["Match type"] = matchtype
            d["Match number"] = l[0].text
            d["Date"] = make_date(l[1].text, y)
            d["Team 1"] = flatten_refs(l[3])
            d["Team 2"] = flatten_refs(l[5])
            a = l[4].find("a")
            d["Score"] = a.text
            d["Report"] = "http://www.fifa.com" + a.get("href")
            print "%d (%s) %s - %s" % (y, d["Match type"], d["Team 1"], d["Team 2"])
            datastore.save(unique_keys=["Date", "Team 1", "Team 2"], data=d)
def validate_url(self, url, use_w3c=True, quiet=True):
    'Validate URLs with the W3C validator. Needs an Internet connection.'
    client = Client()
    response = client.get(url, follow=True)
    if response.status_code == 200:
        src = response.content
        treebuilder = treebuilders.getTreeBuilder("etree")
        parser = HTMLParser(tree=treebuilder, strict=True)
        try:
            parser.parse(src)
        except Exception as ex:
            pass
        if not parser.errors and use_w3c:
            # uploading to w3c
            w3c = w3c_client(src)
            if w3c and not w3c[0]:
                print('%s: %s' % (url, w3c[1]))
                if not quiet:
                    for i in w3c[2]['messages']:
                        print(i['messageid'])
                        print('\t%s' % (i['message'],))
                #self.assertTrue(w3c[0])
    else:
        print('skipping html check %s' % (response.status_code,))
def getitems(params):
    http = GET('http://www.planeta-online.tv/')
    if http != None:
        DT = html5lib.HTMLParser(
            tree=treebuilders.getTreeBuilder('dom')).parse(http)
        for div0 in DT.getElementsByTagName('div'):
            if div0.getAttribute('id') == 'mainChannelList':
                for div1 in div0.getElementsByTagName('a'):
                    if div1.getAttribute('class') == 'tip_trigger chA':
                        title = None
                        img = None
                        for child in div1.childNodes:
                            if child.nodeType == child.TEXT_NODE:
                                title = child.data.encode('utf8')
                            else:
                                for imgs in child.getElementsByTagName('img'):
                                    img = 'http://www.planeta-online.tv%s' % imgs.getAttribute('src').encode('utf8')
                        if title and img:
                            uri = '%s?%s' % (sys.argv[0], urllib.urlencode({
                                'func': 'play',
                                'href': div1.getAttribute('href')
                            }))
                            i = xbmcgui.ListItem(title, iconImage=img, thumbnailImage=img)
                            i.setProperty('IsPlayable', 'true')
                            xbmcplugin.addDirectoryItem(h, uri, i)
    xbmcplugin.endOfDirectory(h)