Ejemplo n.º 1
0
def html_to_pango (html):	
	"""Strip HTML tags and entities from *html*, returning pango-safe text.

	Tags are removed by MLStripper; remaining HTML entities are decoded
	by BeautifulStoneSoup, and the result is returned as a UTF-8 string
	with surrounding spaces/newlines trimmed.
	"""
	stripper = MLStripper()
	stripper.feed(html)
	tagless = stripper.get_data()
	soup = BeautifulStoneSoup(tagless, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
	return soup.encode("UTF-8").strip(" \n")
Ejemplo n.º 2
0
def prepare_str( s, html=False ):
    """
    Clean string for any mark-up so that string can be shown by other applications (WWT/XMP).

    When *html* is true, tags are stripped, line breaks are flattened to
    spaces, and HTML entities are decoded. The result is returned UTF-8
    encoded.
    """
    s = force_unicode( s )
    if html:
        s = strip_tags(s)
        # Collapse every line-break variant into a single space.
        for linebreak in ("\r\n", "\n", "\r"):
            s = s.replace(linebreak, " ")
        if s:
            s = BeautifulStoneSoup( s, convertEntities=BeautifulStoneSoup.HTML_ENTITIES ).contents[0]
    return s.encode('utf8')
Ejemplo n.º 3
0
def extractContentWashingtonPost(soup):
  print 'extracting content: Washington Post'
  article = soup('div', {'class': 'article_body'})
  articletext = ''
  if (len(article) > 0):
      for elem in article:
        articletext += elem.text + ' '
  else:
      article = soup('div', {'id': 'entrytext'})
      for elem in article:
        articletext += elem.text + ' '
  cleanarticle = BeautifulStoneSoup(articletext, convertEntities = BeautifulStoneSoup.ALL_ENTITIES).text
  cleanarticle = cleanarticle.encode('cp949', errors='replace')
  return cleanarticle
Ejemplo n.º 4
0
    def element(self, ):

        name = None
        attrName = None
        value = None
        token = None

        try:
            try:
                # xmlTreeParser.g:33:5: ( ^( ELEMENT name= GENERIC_ID ( ^( ATTRIBUTE attrName= GENERIC_ID value= ATTR_VALUE ) )* ( element | ^( SPACE_TOKEN token= PCDATA_TOKEN ) | ^( TEXT_TOKEN token= PCDATA_TOKEN ) )* ) )
                # xmlTreeParser.g:33:7: ^( ELEMENT name= GENERIC_ID ( ^( ATTRIBUTE attrName= GENERIC_ID value= ATTR_VALUE ) )* ( element | ^( SPACE_TOKEN token= PCDATA_TOKEN ) | ^( TEXT_TOKEN token= PCDATA_TOKEN ) )* )
                pass 
                self.match(self.input, ELEMENT, self.FOLLOW_ELEMENT_in_element68)

                self.match(self.input, DOWN, None)
                name=self.match(self.input, GENERIC_ID, self.FOLLOW_GENERIC_ID_in_element72)
                #action start
                self.current_el = name.getText()
                            
                #action end
                # xmlTreeParser.g:36:13: ( ^( ATTRIBUTE attrName= GENERIC_ID value= ATTR_VALUE ) )*
                while True: #loop1
                    alt1 = 2
                    LA1_0 = self.input.LA(1)

                    if (LA1_0 == ATTRIBUTE) :
                        alt1 = 1


                    if alt1 == 1:
                        # xmlTreeParser.g:37:17: ^( ATTRIBUTE attrName= GENERIC_ID value= ATTR_VALUE )
                        pass 
                        self.match(self.input, ATTRIBUTE, self.FOLLOW_ATTRIBUTE_in_element121)

                        self.match(self.input, DOWN, None)
                        attrName=self.match(self.input, GENERIC_ID, self.FOLLOW_GENERIC_ID_in_element125)
                        value=self.match(self.input, ATTR_VALUE, self.FOLLOW_ATTR_VALUE_in_element129)

                        self.match(self.input, UP, None)
                        #action start
                                         
                        if(attrName.text.lower()=="notice.num"):
                          self.instance_id=value.getText()
                        elif(self.current_el=="lang" and attrName.text.lower=="police" and value=="betagr"):
                          self.is_beta_greek = True
                                        
                        #action end


                    else:
                        break #loop1


                # xmlTreeParser.g:45:13: ( element | ^( SPACE_TOKEN token= PCDATA_TOKEN ) | ^( TEXT_TOKEN token= PCDATA_TOKEN ) )*
                while True: #loop2
                    alt2 = 4
                    LA2 = self.input.LA(1)
                    if LA2 == ELEMENT:
                        alt2 = 1
                    elif LA2 == SPACE_TOKEN:
                        alt2 = 2
                    elif LA2 == TEXT_TOKEN:
                        alt2 = 3

                    if alt2 == 1:
                        # xmlTreeParser.g:45:14: element
                        pass 
                        self._state.following.append(self.FOLLOW_element_in_element179)
                        self.element()

                        self._state.following.pop()


                    elif alt2 == 2:
                        # xmlTreeParser.g:46:15: ^( SPACE_TOKEN token= PCDATA_TOKEN )
                        pass 
                        self.match(self.input, SPACE_TOKEN, self.FOLLOW_SPACE_TOKEN_in_element196)

                        self.match(self.input, DOWN, None)
                        token=self.match(self.input, PCDATA_TOKEN, self.FOLLOW_PCDATA_TOKEN_in_element200)

                        self.match(self.input, UP, None)


                    elif alt2 == 3:
                        # xmlTreeParser.g:48:15: ^( TEXT_TOKEN token= PCDATA_TOKEN )
                        pass 
                        self.match(self.input, TEXT_TOKEN, self.FOLLOW_TEXT_TOKEN_in_element235)

                        self.match(self.input, DOWN, None)
                        token=self.match(self.input, PCDATA_TOKEN, self.FOLLOW_PCDATA_TOKEN_in_element239)

                        self.match(self.input, UP, None)
                        #action start
                                     
                        if(self.current_el.lower()=="resume"):
                          if(token.token.text.find(self.NBSP)== -1):
                           self.logger.debug("%s in %s #%s"% (token.token, self.current_el,self.instance_id));
                           #self.logger.debug("%s"% (dir(token.token)));
                           tok = {}
                           tok["start"]=token.token.start
                           tok["end"]=token.token.stop
                           tok["otext"]=token.token.text
                           from BeautifulSoup import BeautifulStoneSoup
                           temp=BeautifulStoneSoup(tok["otext"],convertEntities=BeautifulStoneSoup.ALL_ENTITIES)
                           tok["utext"]=temp.encode("utf-8")
                           self.tokens.append(tok);
                            # TODO: add a check for self.current_el="lang"
                          elif(token.token.text.find(self.NBSP)!= -1):
                              newtok={}
                              while(token.token.text.find(self.NBSP)!= -1):
                          	    # cycle over the token until all the NBSP entities have been removed
                                self.logger.debug("The token %s contains %s"%(token.token,self.NBSP))
                                idx = token.token.text.find(self.NBSP)
                                newtok["start"]=token.token.start
                                newtok["end"]=token.token.start + (idx-1)
                                newtok["otext"]=token.token.text[:idx]
                                from BeautifulSoup import BeautifulStoneSoup
                                newtok["utext"]=BeautifulStoneSoup(newtok["otext"],convertEntities=BeautifulStoneSoup.ALL_ENTITIES).encode("utf-8")
                                before = token.token.text[:idx]
                                self.logger.debug(before)
                                after = token.token.text[idx+len(self.NBSP):]
                                self.logger.debug(after)
                                # REPAIR
                                token.token.text = token.token.text[idx+len(self.NBSP):]
                                token.token.start = token.token.start+idx+len(self.NBSP)
                                self.tokens.append(newtok)
                                newtok={}
                            # this is the last "after" bit
                              newtok["start"]=token.token.start
                              newtok["end"]=token.token.start+len(token.token.text)
                              newtok["otext"] = token.token.text
                              newtok["utext"]= BeautifulStoneSoup(newtok["otext"],convertEntities=BeautifulStoneSoup.ALL_ENTITIES).encode("utf-8")
                              self.tokens.append(newtok)
                                        
                        #action end


                    else:
                        break #loop2


                #action start
                             
                if(self.current_el.lower()=="resume" and len(self.tokens) > 0):
                  self.instances[self.instance_id] = self.tokens
                  #self.logger.debug("%s",self.instances)
                self.tokens = []
                            
                #action end

                self.match(self.input, UP, None)




            except RecognitionException, re:
                self.reportError(re)
                self.recover(self.input, re)
        finally:

            pass

        return 
Ejemplo n.º 5
0
#!/usr/bin/python
import urllib2, time
from BeautifulSoup import BeautifulSoup, BeautifulStoneSoup

# for local testing, as to not hit the web server
#html = open('page.html').read()
#soup = BeautifulSoup(html)

# 674 pages last time I checked. Oddly enough, their pages seem zero-based. Additionally, if you 
# substitute an arbitrary number, outside of the range of pages, you'll get data back instead
# of 404. I'm not sure why they're doing this.
for page_num in range(0,674):
	url = 'http://www.chucknorrisfacts.com/all-chuck-norris-facts?page=%d' % page_num
	html = urllib2.urlopen(url)
	soup = BeautifulSoup(html)

	entries = soup.findAll("li","views-row")
	for entry in entries:
		
		# use BeautifulStoneSoup to remove any HTML-escaped text that BS returns.
		the_quote = BeautifulStoneSoup(entry.div.text, 
		                   convertEntities=BeautifulStoneSoup.HTML_ENTITIES).contents[0]
		
		# print it to stdout. I just redirect the program's output to a file.
		print the_quote.encode('utf-8')
	# be a good citizen and wait a few seconds before visiting the next page
	time.sleep(6)