def html(value):
    """Sanitize an HTML fragment and turn its newlines into ``<br />`` tags.

    Steps, in order:

    1. Normalize ``\\r\\n`` line endings to ``\\n``.
    2. Remove HTML comments.
    3. Hide every tag whose name is not in ``valid_tags`` (the tag markup is
       dropped, its children are kept).
    4. Keep only attributes whose name is in ``valid_attrs`` and whose value
       passes ``url(val)``.
    5. Add ``rel="nofollow"`` to every ``<a>`` tag.
    6. Replace each non-empty ``<pre>`` with a text node holding its first
       child, with ``<`` escaped and newlines protected from step 7.
    7. Append ``<br />`` before every remaining newline.

    ``valid_tags``, ``valid_attrs`` and ``url`` are defined outside this
    function; ``url`` is presumably a URL validator -- confirm against its
    definition.

    Args:
        value: The markup to clean, as a string.

    Returns:
        The cleaned markup as a unicode string.
    """
    value = value.replace("\r\n", "\n")
    soup = BeautifulSoup(value)

    # remove HTML comments
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()

    # limit tags and attributes, 'nofollow' links, escape <pre> contents
    for tag in soup.findAll(True):
        if tag.name not in valid_tags:
            tag.hidden = True
        tag.attrs = [(attr, val) for attr, val in tag.attrs
                     if attr in valid_attrs and url(val)]
        if tag.name == 'a':
            tag.attrs.append(['rel', 'nofollow'])
        # An empty <pre></pre> has no first child to escape; leave it as is
        # instead of failing with IndexError.
        if tag.name == 'pre' and tag.contents:
            # Convert '<' into '&lt;' and park every '\n' as '\r' so that the
            # line-breaking step below does not add <br />s inside <pre>.
            # Only the first child is kept; it is assumed to be a text node.
            pre_text = tag.contents[0].replace('<', '&lt;').replace('\n', '\r')
            tag.replaceWith('<pre>%s</pre>' % pre_text)

    rendered = soup.renderContents().decode('utf8')
    # every newline outside <pre> becomes a visible line break
    rendered = rendered.replace("\n", "<br />\n")
    # restore the newlines that were parked as '\r' inside <pre>
    rendered = rendered.replace('\r', '\n')
    return rendered
def fetch_burger_string(self):
    """Fetch the HTML from TASTY_URL and scrape the burger offered this Friday.

    The page is searched for text nodes containing 'burger' (preceded by a
    non-letter character); nodes that also match 'veggie' or 'neu' are
    ignored. When several nodes qualify, the last one wins.

    Returns:
        The concatenated text of the ``<p>`` element enclosing the chosen
        text node, or the text node itself when it has no ``<p>`` ancestor.
        An empty string is returned when the response status is not 200 or
        when anything goes wrong (the error is logged, not raised).
    """
    # return value
    burger_string = ''

    try:
        # fetch html from the given url
        url_fetch_response = urlfetch.fetch(TASTY_URL)

        if url_fetch_response.status_code == 200:
            # parse html document
            soup = BeautifulSoup(url_fetch_response.content)

            # find all elements with the text 'burger' in it
            burgers = soup.findAll(
                text=re.compile('[^A-Za-z]burger', re.IGNORECASE))

            # keep the last element which contains neither 'veggie' nor 'neu'
            burger_element = None
            for item in burgers:
                if re.search(r'veggie|neu', item, re.IGNORECASE) is None:
                    burger_element = item

            # stop here if there is no valid element; the handler below
            # logs the message and the empty default is returned
            if burger_element is None:
                raise Exception('Could not find a burger element')

            # find the parent 'p' element
            parent_p_burger_element = burger_element.findParent(name='p')

            if parent_p_burger_element:
                # get all text without the html tags
                burger_string = ''.join(
                    parent_p_burger_element.findAll(text=True))
            else:
                burger_string = burger_element

            # TODO: look at the next p element if there is a second half
    except Exception as e:
        # Best effort: never let a scraping failure propagate to the caller.
        # Lazy %s formatting keeps a failing str(e) from escaping the handler.
        logging.error('UpdateHandler::fetch_burger_string() - %s', e)

    return burger_string
# Scrape the Tasty website and print the current non-veggie burger offer.
import contextlib
import re
import urllib

from vendor.BeautifulSoup import BeautifulSoup

URL = "http://www.tasty-babelsberg.de"

# Close the HTTP response once the document is parsed.
# NOTE(review): assumes BeautifulSoup reads the whole file-like object inside
# its constructor -- confirm for the vendored version.
with contextlib.closing(urllib.urlopen(URL)) as response:
    soup = BeautifulSoup(response)

# all text nodes mentioning 'burger' (preceded by a non-letter character)
burgerArray = soup.findAll(text=re.compile('[^A-Za-z]burger', re.IGNORECASE))
# single-argument print(...) prints the same thing under Python 2
print(burgerArray)

# keep the last text node that mentions neither 'veggie' nor 'neu'
burger_element = None
for item in burgerArray:
    if re.search(r'veggie|neu', item, re.IGNORECASE) is None:
        burger_element = item

if burger_element:
    parent_burger_p = burger_element.findParent(name='p')
    if parent_burger_p:
        # text of the enclosing <p> without tags, flattened onto one line
        string = ''.join(parent_burger_p.findAll(text=True))
        print(unicode(string.replace('\n', ' ').strip()))
    else:
        print(unicode(burger_element))