def html_to_pango(html):
    """Strip HTML tags/entities from *html*, returning a pango-safe UTF-8 string."""
    stripper = MLStripper()
    stripper.feed(html)
    plain = stripper.get_data()
    # BeautifulStoneSoup converts any remaining HTML entities to characters.
    soup = BeautifulStoneSoup(
        plain, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
    return soup.encode("UTF-8").strip(" \n")
def prepare_str(s, html=False):
    """
    Clean string of any mark-up so that the string can be shown by
    other applications (WWT/XMP).  Returns a UTF-8 encoded bytestring.
    """
    s = force_unicode(s)
    if html:
        s = strip_tags(s)
    # Collapse all newline flavours into single spaces (CRLF first so the
    # pair does not become two spaces).
    for linebreak in ("\r\n", "\n", "\r"):
        s = s.replace(linebreak, " ")
    if s:
        # Decode HTML entities into their literal characters.
        s = BeautifulStoneSoup(
            s, convertEntities=BeautifulStoneSoup.HTML_ENTITIES
        ).contents[0]
    return s.encode('utf8')
def extractContentWashingtonPost(soup): print 'extracting content: Washington Post' article = soup('div', {'class': 'article_body'}) articletext = '' if (len(article) > 0): for elem in article: articletext += elem.text + ' ' else: article = soup('div', {'id': 'entrytext'}) for elem in article: articletext += elem.text + ' ' cleanarticle = BeautifulStoneSoup(articletext, convertEntities = BeautifulStoneSoup.ALL_ENTITIES).text cleanarticle = cleanarticle.encode('cp949', errors='replace') return cleanarticle
def element(self, ): name = None attrName = None value = None token = None try: try: # xmlTreeParser.g:33:5: ( ^( ELEMENT name= GENERIC_ID ( ^( ATTRIBUTE attrName= GENERIC_ID value= ATTR_VALUE ) )* ( element | ^( SPACE_TOKEN token= PCDATA_TOKEN ) | ^( TEXT_TOKEN token= PCDATA_TOKEN ) )* ) ) # xmlTreeParser.g:33:7: ^( ELEMENT name= GENERIC_ID ( ^( ATTRIBUTE attrName= GENERIC_ID value= ATTR_VALUE ) )* ( element | ^( SPACE_TOKEN token= PCDATA_TOKEN ) | ^( TEXT_TOKEN token= PCDATA_TOKEN ) )* ) pass self.match(self.input, ELEMENT, self.FOLLOW_ELEMENT_in_element68) self.match(self.input, DOWN, None) name=self.match(self.input, GENERIC_ID, self.FOLLOW_GENERIC_ID_in_element72) #action start self.current_el = name.getText() #action end # xmlTreeParser.g:36:13: ( ^( ATTRIBUTE attrName= GENERIC_ID value= ATTR_VALUE ) )* while True: #loop1 alt1 = 2 LA1_0 = self.input.LA(1) if (LA1_0 == ATTRIBUTE) : alt1 = 1 if alt1 == 1: # xmlTreeParser.g:37:17: ^( ATTRIBUTE attrName= GENERIC_ID value= ATTR_VALUE ) pass self.match(self.input, ATTRIBUTE, self.FOLLOW_ATTRIBUTE_in_element121) self.match(self.input, DOWN, None) attrName=self.match(self.input, GENERIC_ID, self.FOLLOW_GENERIC_ID_in_element125) value=self.match(self.input, ATTR_VALUE, self.FOLLOW_ATTR_VALUE_in_element129) self.match(self.input, UP, None) #action start if(attrName.text.lower()=="notice.num"): self.instance_id=value.getText() elif(self.current_el=="lang" and attrName.text.lower=="police" and value=="betagr"): self.is_beta_greek = True #action end else: break #loop1 # xmlTreeParser.g:45:13: ( element | ^( SPACE_TOKEN token= PCDATA_TOKEN ) | ^( TEXT_TOKEN token= PCDATA_TOKEN ) )* while True: #loop2 alt2 = 4 LA2 = self.input.LA(1) if LA2 == ELEMENT: alt2 = 1 elif LA2 == SPACE_TOKEN: alt2 = 2 elif LA2 == TEXT_TOKEN: alt2 = 3 if alt2 == 1: # xmlTreeParser.g:45:14: element pass self._state.following.append(self.FOLLOW_element_in_element179) self.element() self._state.following.pop() elif alt2 == 2: # xmlTreeParser.g:46:15: 
^( SPACE_TOKEN token= PCDATA_TOKEN ) pass self.match(self.input, SPACE_TOKEN, self.FOLLOW_SPACE_TOKEN_in_element196) self.match(self.input, DOWN, None) token=self.match(self.input, PCDATA_TOKEN, self.FOLLOW_PCDATA_TOKEN_in_element200) self.match(self.input, UP, None) elif alt2 == 3: # xmlTreeParser.g:48:15: ^( TEXT_TOKEN token= PCDATA_TOKEN ) pass self.match(self.input, TEXT_TOKEN, self.FOLLOW_TEXT_TOKEN_in_element235) self.match(self.input, DOWN, None) token=self.match(self.input, PCDATA_TOKEN, self.FOLLOW_PCDATA_TOKEN_in_element239) self.match(self.input, UP, None) #action start if(self.current_el.lower()=="resume"): if(token.token.text.find(self.NBSP)== -1): self.logger.debug("%s in %s #%s"% (token.token, self.current_el,self.instance_id)); #self.logger.debug("%s"% (dir(token.token))); tok = {} tok["start"]=token.token.start tok["end"]=token.token.stop tok["otext"]=token.token.text from BeautifulSoup import BeautifulStoneSoup temp=BeautifulStoneSoup(tok["otext"],convertEntities=BeautifulStoneSoup.ALL_ENTITIES) tok["utext"]=temp.encode("utf-8") self.tokens.append(tok); # TODO: add a check for self.current_el="lang" elif(token.token.text.find(self.NBSP)!= -1): newtok={} while(token.token.text.find(self.NBSP)!= -1): # cycle over the token until all the NBSP entities have been removed self.logger.debug("The token %s contains %s"%(token.token,self.NBSP)) idx = token.token.text.find(self.NBSP) newtok["start"]=token.token.start newtok["end"]=token.token.start + (idx-1) newtok["otext"]=token.token.text[:idx] from BeautifulSoup import BeautifulStoneSoup newtok["utext"]=BeautifulStoneSoup(newtok["otext"],convertEntities=BeautifulStoneSoup.ALL_ENTITIES).encode("utf-8") before = token.token.text[:idx] self.logger.debug(before) after = token.token.text[idx+len(self.NBSP):] self.logger.debug(after) # REPAIR token.token.text = token.token.text[idx+len(self.NBSP):] token.token.start = token.token.start+idx+len(self.NBSP) self.tokens.append(newtok) newtok={} # this is the last 
"after" bit newtok["start"]=token.token.start newtok["end"]=token.token.start+len(token.token.text) newtok["otext"] = token.token.text newtok["utext"]= BeautifulStoneSoup(newtok["otext"],convertEntities=BeautifulStoneSoup.ALL_ENTITIES).encode("utf-8") self.tokens.append(newtok) #action end else: break #loop2 #action start if(self.current_el.lower()=="resume" and len(self.tokens) > 0): self.instances[self.instance_id] = self.tokens #self.logger.debug("%s",self.instances) self.tokens = [] #action end self.match(self.input, UP, None) except RecognitionException, re: self.reportError(re) self.recover(self.input, re) finally: pass return
#!/usr/bin/python import urllib2, time from BeautifulSoup import BeautifulSoup, BeautifulStoneSoup # for local testing, as to not hit the web server #html = open('page.html').read() #soup = BeautifulSoup(html) # 674 pages last time I checked. Oddly enough, their pages seem zero-based. Additionally, if you # substitute an arbitrary number, outside of the range of pages, you'll get data back instead # of 404. I'm not sure why they're doing this. for page_num in range(0,674): url = 'http://www.chucknorrisfacts.com/all-chuck-norris-facts?page=%d' % page_num html = urllib2.urlopen(url) soup = BeautifulSoup(html) entries = soup.findAll("li","views-row") for entry in entries: # use BeautifulStoneSoup to remove any HTML-escaped text that BS returns. the_quote = BeautifulStoneSoup(entry.div.text, convertEntities=BeautifulStoneSoup.HTML_ENTITIES).contents[0] # print it to stdout. I just redirect the program's output to a file. print the_quote.encode('utf-8') # be a good citizen and wait a few seconds before visiting the next page time.sleep(6)