def get_lyrics_from_url(self, url):
    page = geturl(url, referer=self.baseurl)
    soup = BeautifulSoup(page)
    content = soup.find('div', attrs={'id': 'content'})
    # strip navigation divs, links and scripts embedded in the lyrics block
    for tag in content.findAll(['div', 'a', 'script']):
        tag.extract()
    data = ''.join(str(line) for line in content.contents)
    data = self._newline.sub('', data)
    data = self._leadbreak.sub('', data)
    data = self._endbreak.sub('', data)
    lines = self._break.split(data)
    # group lines into verses, using blank lines as separators
    verses = []
    while True:
        try:
            i = lines.index('')
            verse, lines = lines[:i], lines[i + 1:]
            verses.append(verse)
        except ValueError:
            verses.append(lines)
            break
    for i, verse in enumerate(verses):
        verse = ' / '.join(verse)
        verse = whitespace.sub(' ', verse)
        verses[i] = verse
    if self._spam in verses:
        verses.remove(self._spam)
    return verses
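# A minimal, self-contained sketch of the tag-stripping technique used above,
# against the BeautifulSoup 3 API these modules target. The sample HTML and
# the strip_chrome name are illustrative, not part of the original module.
from BeautifulSoup import BeautifulSoup

def strip_chrome(html):
    soup = BeautifulSoup(html)
    content = soup.find('div', attrs={'id': 'content'})
    # extract() detaches each matched node from the tree in place
    for tag in content.findAll(['div', 'a', 'script']):
        tag.extract()
    return ''.join(str(node) for node in content.contents)

# >>> strip_chrome('<div id="content">la la<script>x()</script></div>')
# 'la la'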
def clock(self, query):
    """Use google to look up time in a given location"""
    try:
        doc = self.ua.open(self.search, {'q': 'time in %s' % query})
        soup = BeautifulSoup(doc)
        time = soup.find('img', src=self.clock_re).findNext('td')
        try:
            time.find('table').extract()
        except AttributeError:
            pass
        return stripHTML(time.renderContents().decode('utf-8')).strip()
    except:
        pass
def bodycount(self):
    try:
        doc = geturl(IraqWar._bodycount_url)
        soup = BeautifulSoup(doc)
        data = soup.find('td', attrs={'class': 'main-num'})
        data = data.find('a')
        data = str(data.contents[0])
        data = stripHTML(data)
        data = IraqWar._re_whitespace.sub(' ', data)
        data = data.strip()
        return data
    except Exception, e:
        log.warn('error in %s: %s' % (self.__module__, e))
        log.exception(e)
        return 'UNKNOWN'
def response(self, nick, args, kwargs):
    try:
        doc = geturl(self.random)
        soup = BeautifulSoup(doc)
        confs = soup.findAll('div', attrs={'class': 'content'})[3:]
        conf = random.choice(confs)
        conf = ' '.join(str(p) for p in conf.findAll('p'))
        conf = stripHTML(conf)
        conf = conf.strip()
        return conf
    except Exception, e:
        log.warn('error in %s: %s' % (self.__module__, e))
        log.exception(e)
        return '%s: I had some issues with that..' % nick
def get_comment(self):
    page = geturl(self.url)

    # remove high ascii since this is going to IRC
    page = self.utf8.sub('', page)

    # create BeautifulSoup document tree
    soup = BeautifulSoup(page)
    table = soup.find('table')
    rows = table.findAll('tr')
    row = rows[1]
    cells = row.findAll('td')
    source = cells[1].string
    comment = cells[2].string
    author = cells[3].string
    return '<%s@%s> %s' % (author, source, comment)
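# A quick interpreter-style sketch of the table navigation above, using
# illustrative sample markup (BeautifulSoup 3 returns NavigableStrings
# from .string):
# >>> soup = BeautifulSoup('<table><tr><td>hdr</td></tr>'
# ...                      '<tr><td>1</td><td>src</td><td>hi</td><td>me</td></tr></table>')
# >>> cells = soup.find('table').findAll('tr')[1].findAll('td')
# >>> cells[1].string, cells[2].string, cells[3].string
# (u'src', u'hi', u'me')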
def response(self, nick, args, kwargs):
    try:
        doc = geturl(self.random, add_headers={'Accept': '*/*'})
        soup = BeautifulSoup(doc)
        main = soup.find(u'div', attrs={u'id': u'main'})
        confs = main.findAll(u'div', attrs={u'class': u'content'})
        conf = random.choice(confs)
        conf = u' '.join(unicode(p) for p in conf.findAll(u'p'))
        conf = stripHTML(conf)
        conf = conf.strip()
        return conf
    except Exception, error:
        log.warn(u'error in module %s' % self.__module__)
        log.exception(error)
        return u'%s: I had some issues with that..' % nick
def response(self, nick, args, kwargs):
    try:
        fail = BeautifulSoup(geturl(self.url)).h1
        return self.spaces_re.sub(" ", stripHTML(
            u"%s: %s: %s %s: %s" % (
                nick,
                self.col("red", text="FAIL"),
                self.fail_re.search(fail.renderContents()).group(1),
                self.col("green", text="FIX"),
                self.fail_re.search(
                    fail.findNext("h1").renderContents()).group(1))))
    except Exception, error:
        log.warn("error in module %s" % self.__module__)
        log.exception(error)
        return u"%s: Too much fail for technobabble" % nick
def rate_rt(self, name):
    """Rating from rotten tomatoes"""
    page = geturl(self.rt_search, {'search': name}, referer=self.rt_url)
    soup = BeautifulSoup(page)
    for table in soup.body('table'):
        if table.caption.renderContents() == 'Movies':
            break
    else:
        raise ValueError('no movies found in search results')
    name = self.normalize(name)
    for row in table.tbody('tr'):
        link = row.a
        if self.normalize(link.renderContents()) == name:
            url = urljoin(self.rt_url, link['href'])
            break
    else:
        raise ValueError('no exact matches')
    soup = BeautifulSoup(geturl(url, referer=self.rt_search))
    info = soup.body.find('div', 'movie_info_area')
    return stripHTML(info.h1.renderContents()), info.a['title']
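# rate_rt leans on Python's for/else idiom twice: the else clause runs only
# when the loop finishes without hitting break. A tiny self-contained
# illustration (find_exact and the sample data are hypothetical):
def find_exact(rows, wanted):
    for row in rows:
        if row == wanted:
            break
    else:
        raise ValueError('no exact matches')
    return row

# >>> find_exact(['alien', 'aliens'], 'aliens')
# 'aliens'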
def get_soup(self, query):
    if isinstance(query, (list, tuple)):
        query = u' '.join(query)

    # load page
    if query == u'random':
        opts = {}
        url = urljoin(self.base_url, self.random_path)
    else:
        opts = {u'search': query, u'go': u'Go'}
        url = urljoin(self.base_url, self.search_path)
    page = geturl(url, referer=self.base_url, opts=opts,
                  size=self.sample_size)

    # create BeautifulSoup document tree
    soup = BeautifulSoup(page)

    # extract title minus WP advert
    title = soup.title.string.replace(self.advert, u'')

    # remove all tabular data/sidebars
    for table in soup.findAll(u'table'):
        table.extract()

    # remove disambiguation links
    for dablink in soup.findAll(u'div', attrs={u'class': u'dablink'}):
        dablink.extract()

    # remove latitude/longitude metadata for places
    for coord in soup.findAll(u'span', attrs={u'id': u'coordinates'}):
        coord.extract()

    # strip non-english content wrappers
    for span in soup.findAll(u'span', attrs={u'lang': True}):
        span.extract()

    # remove IPA pronunciation guidelines
    for span in soup.findAll(u'span', attrs={u'class': u'IPA'}):
        span.extract()
    for link in soup.findAll(u'a', text=u'IPA'):
        link.extract()
    for span in soup.findAll(u'span', attrs={u'class': Wiki._audio}):
        span.extract()

    return soup, title
def _getpage(self, url, opts=None):
    page = geturl(url, referer=self.baseurl, opts=opts)

    # HTMLParser doesn't handle this very well.. see:
    # http://www.crummy.com/software/BeautifulSoup/3.1-problems.html
    page = self.scripts_re.sub('', page)
    soup = BeautifulSoup(page)

    # get page title
    title = soup.title.string
    if self.advert and self.advert in title:
        title = title.replace(self.advert, '')

    # remove all tabular data/sidebars
    for table in soup.findAll('table'):
        table.extract()

    # remove disambiguation links
    for div in soup.findAll('div', 'dablink'):
        div.extract()

    # remove latitude/longitude metadata for places
    for span in soup.findAll('span', id='coordinates'):
        span.extract()

    # strip non-english content wrappers
    for span in soup.findAll('span', lang=True):
        span.extract()

    # remove IPA pronunciation guidelines
    for span in soup.findAll('span', 'IPA'):
        span.extract()
    for a in soup.findAll('a', text='IPA'):
        a.extract()
    for span in soup.findAll('span', 'audiolink'):
        span.extract()

    return soup, title
def forecast(self, location):
    page = geturl(url=self.search, opts={'query': location},
                  referer=self.baseurl)
    soup = BeautifulSoup(page)

    # disambiguation page: look for an exact city match
    if 'Search Results' in str(soup):
        table = soup.find('table', attrs={'class': 'boxB full'})
        rows = table.findAll('tr')
        results = []
        match = None
        for row in rows:
            cells = row.findAll('td', attrs={'class': 'sortC'})
            for cell in cells:
                link = cell.find('a')
                if link is None or 'addfav' in str(link['href']):
                    continue
                city = str(link.contents[0])
                href = urljoin(self.baseurl, str(link['href']))
                results.append(city)
                if city.lower() == location.lower():
                    match = href
                    break
            if match:
                break
        if match:
            page = geturl(url=match)
            soup = BeautifulSoup(page)
        else:
            return 'Multiple results found: %s' % ', '.join(results)

    rss_url = soup.find('link', attrs=self._rss_link)['href']
    rss = rssparser.parse(rss_url)
    title = str(soup.find('h1').string).strip()

    # conditions come as pipe-delimited "key: value" fields
    conditions = stripHTML(rss['items'][0]['description'])
    fields = self._bar.split(conditions)
    data = {}
    for field in fields:
        try:
            key, val = self._keyval.search(field).groups()
            data[key] = val
        except:
            pass

    # colorize the temperature with mIRC color codes
    try:
        temp = float(self._tempF.search(data['Temperature']).group(1))
        blink = False
        if temp < 0:
            color = 6
        elif temp < 40:
            color = 2
        elif temp < 60:
            color = 10
        elif temp < 80:
            color = 3
        elif temp < 90:
            color = 7
        elif temp < 100:
            color = 5
        else:
            color = 5
            blink = True
        data['Temperature'] = '\x03%s\x16\x16%s\x0F' % (
                color, data['Temperature'])
        if blink:
            data['Temperature'] = '\x1b[5m' + data['Temperature'] + '\x1b[0m'
    except:
        pass

    output = ['%s: %s' % (key, val) for key, val in data.items()]
    return '%s: %s' % (title, ' | '.join(output))
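# The \x03 escapes above are mIRC color codes: \x03 starts a color, the
# following number selects the foreground, and \x0F resets formatting. A
# small standalone sketch of the temperature bucketing (colorize_temp is a
# hypothetical name; color numbers follow the module's own mapping):
def colorize_temp(text, temp):
    if temp < 0:
        color = 6    # purple
    elif temp < 40:
        color = 2    # blue
    elif temp < 60:
        color = 10   # teal
    elif temp < 80:
        color = 3    # green
    elif temp < 90:
        color = 7    # orange
    else:
        color = 5    # red/maroon
    # \x16\x16 (reverse video toggled twice) keeps a leading digit in text
    # from being read as part of the color number
    return '\x03%d\x16\x16%s\x0f' % (color, text)

# >>> colorize_temp('72F', 72.0)
# '\x033\x16\x1672F\x0f'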
import sys
from BeautifulSoup import BeautifulSoup


class PageProcessor(object):

    def __init__(self, html):
        self.html = html
        self.soup = BeautifulSoup(self.html)

    def isFirstPage(self):
        return self.soup.find('div', {'class': 'userMsg',
                                      'id': 'firstPostText'}) is not None

    def getTitle(self):
        if self.isFirstPage():
            title = self.soup.find('div', {'class': 'post_title'}).findAll(
                    lambda tag: tag.name == 'a', text=True)
            title = ''.join(title)
            title = title.replace('google_ad_region_start=title', '')
            title = title.replace('google_ad_region_end=title', '')
            title = title.replace('Archived From: Hot Deals', '')
            title = title.replace('&', '')
            title = title.replace('\n', '')
            return title.strip()
        else:
            print >> sys.stderr, 'it is not the first page'

    def getRating(self):
        pass

    def getReplyNum(self):
        pass

    def getViewNum(self):
        pass

    def getPostTime(self):
        if self.isFirstPage():
            time = self.soup.find('div', {'class': 'post_date'}).findAll(
                    lambda tag: tag.name != 'b', text=True)
            time = ''.join(time)
            time = time.replace('posted:', '')
            time = time.replace('updated:', '')
            time = time.replace('\n', '')
            return time.strip()
        else:
            print >> sys.stderr, 'it is not the first page'

    def getDescription(self):
        if self.isFirstPage():
            content = self.soup.find('div', {'class': 'userMsg',
                                             'id': 'firstPostText'}).findAll(
                    lambda tag: tag.name == 'table', text=True)
            return ''.join(content[1:-1]).strip()
        else:
            print >> sys.stderr, 'it is not the first page'

    def getCategory(self):
        pass

    def getFeedback(self):
        pass

    def getUser(self):
        if self.isFirstPage():
            username = self.soup.find('li', {'class': 'user_name'}).findAll(
                    lambda tag: tag.name != 'span', text=True)
            return ''.join(username).strip()
        else:
            print >> sys.stderr, 'it is not the first page'
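# Hypothetical usage sketch for PageProcessor, with stand-in HTML shaped like
# the forum pages it expects (div.post_title, div#firstPostText, etc.):
if __name__ == '__main__':
    html = '''<div class="post_title"><a>Sample deal title</a></div>
              <div class="userMsg" id="firstPostText">...</div>'''
    proc = PageProcessor(html)
    if proc.isFirstPage():
        print proc.getTitle()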
def forecast(self, location):
    page = geturl(url=self.search, opts={u'query': location},
                  referer=self.baseurl)
    soup = BeautifulSoup(page)

    # disambiguation page: pick an exact city match, else the first hit
    if u'Search Results' in unicode(soup):
        table = soup.find(u'table', attrs={u'class': u'dataTable'})
        tbody = soup.find(u'tbody')
        results = [row.findAll(u'td')[0].find(u'a')
                   for row in tbody.findAll(u'tr')]
        results = [(normalize(unicode(result.contents[0])),
                    urljoin(Weather.baseurl, unicode(result[u'href'])))
                   for result in results]
        match = None
        for result in results:
            if result[0] == normalize(location):
                match = result[1]
                break
        if match is None:
            match = results[0][1]
        page = geturl(url=match, referer=self.search)
        soup = BeautifulSoup(page)

    title = soup.find(u'h1').string.strip()
    rss_url = soup.find(u'link', attrs=self._rss_link)[u'href']
    rss = feedparser.parse(rss_url)
    conditions = rss.entries[0].description

    # XXX ok, here's the deal. this page has raw utf-8 bytes encoded
    # as html entities, and in some cases latin1. this demonstrates a
    # total misunderstanding of how unicode works on the part of the
    # authors, so we need to jump through some hoops to make it work
    conditions = conditions.encode(u'raw-unicode-escape')
    conditions = stripHTML(conditions)
    conditions = encoding.convert(conditions)

    fields = self._bar.split(conditions)
    data = {}
    for field in fields:
        try:
            key, val = self._keyval.search(field).groups()
            data[key] = val
        except:
            pass

    try:
        temp = float(self._tempF.search(data[u'Temperature']).group(1))
        blink = False
        if temp < 0:
            color = u'magenta'
        elif temp < 40:
            color = u'blue'
        elif temp < 60:
            color = u'cyan'
        elif temp < 80:
            color = u'green'
        elif temp < 90:
            color = u'yellow'
        elif temp < 100:
            color = u'red'
        else:
            color = u'red'
            blink = True
        data[u'Temperature'] = self.colorlib.get_color(
                color, text=data[u'Temperature'])
        # XXX this seems ill-conceived
        if blink:
            data[u'Temperature'] = (u'\x1b[5m' + data[u'Temperature'] +
                                    u'\x1b[0m')
    except Exception, error:
        log.exception(error)

    # build the final "key: value" output line
    output = [u'%s: %s' % (key, val) for key, val in data.items()]
    return u'%s: %s' % (title, u' | '.join(output))
def get_quote(self, symbol):
    url = Yahoo._quote_url.replace('SYMBOL', symbol)
    page = geturl(url)
    soup = BeautifulSoup(page)
    company = ' '.join(str(item) for item in soup.find('h1').contents)
    company = stripHTML(company)
    table = soup.findAll('table')[0]
    rows = table.findAll('tr')
    data = {}
    current_value = 0.0
    open_value = 0.0
    for row in rows:
        key, val = row.findAll('td')
        key = str(key.contents[0])
        if key == 'Change:':
            try:
                img = val.find('img')
                alt = str(img['alt'])
                val = alt + stripHTML(str(val.contents[0]))
            except:
                val = '0.00%'
        elif key == 'Ask:':
            continue
        else:
            val = stripHTML(str(val.contents[0]))
        val = val.replace(',', '')
        if Yahoo._isfloat.search(val):
            val = float(val)
        data[key] = val
        if key in ('Last Trade:', 'Index Value:'):
            current_value = val
        elif key == 'Prev Close:':
            open_value = val

    # see if we can calculate percentage
    try:
        change = 100 * (current_value - open_value) / open_value
        data['Change:'] += ' (%.2f%%)' % change
    except:
        pass

    # try and colorize the change field
    try:
        if 'Up' in data['Change:']:
            data['Change:'] = self._green + data['Change:'] + self._reset
        elif 'Down' in data['Change:']:
            data['Change:'] = self._red + data['Change:'] + self._reset
    except:
        pass

    # build friendly output
    output = []
    for key, val in data.items():
        if isinstance(val, float):
            val = '%.2f' % val
        output.append('%s %s' % (key, val))
    return '%s - %s' % (company, ' | '.join(output))
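# The percentage above is plain (current - open) / open * 100; a quick worked
# check with illustrative numbers:
# >>> current_value, open_value = 102.5, 100.0
# >>> '%.2f%%' % (100 * (current_value - open_value) / open_value)
# '2.50%'
# The bare except around it covers the open_value == 0.0 case (division by
# zero when 'Prev Close:' was never seen), in which case no percentage is
# appended.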