def update_bill(self, bill):
    """Check if a bill exists in the datastore, and update its stats.

    Creates the Bill entity when it is not present yet, then scrapes the
    bill's OpenCongress info page for its current status fields.  The
    entity is queued on self.save for a later batch put; newly created
    bills additionally trigger e-mail updates.
    """
    this_bill = Bill.get_by_key_name(bill['title'])
    logging.info(bill['title'])
    if this_bill is None:
        this_bill = self.create_bill(bill)
        is_new_bill = True
    else:
        is_new_bill = False
    this_bill.rank = bill['rank']
    import urllib
    self.request_args = {'bill_id': bill['id']}
    self.formatted_args = urllib.urlencode(self.request_args)
    from google.appengine.api import urlfetch
    fetch_page = urlfetch.fetch(
        url=OPENCONGRESS_INFO_URL + self.formatted_args,
        method=urlfetch.GET)
    from utils.BeautifulSoup import BeautifulSoup
    document = BeautifulSoup(fetch_page.content)

    # Parse the <li> list once instead of re-running findAll('li') for
    # every field as the original did.
    items = document.findAll('li')

    def li_text(index):
        # Each property renders as "<li><strong>Label:</strong> value</li>";
        # slice out the value between the closing tags.
        return str(items[index]).split('</strong> ')[1].split('</li>')[0]

    property_count = 0
    this_bill.introduction_date = li_text(property_count)
    this_bill.status = li_text(property_count + 1)
    if this_bill.status == "This Bill Has Become Law":
        property_count = -1  # enacted bills have no "next step" entry
    else:
        this_bill.next_step = li_text(property_count + 2)
    this_bill.latest_action = li_text(property_count + 3)
    if len(this_bill.latest_action) > 68:
        # Truncate long action descriptions to roughly one display line.
        this_bill.latest_action = " ".join(
            this_bill.latest_action.split(' ')[:9]) + "..."
    this_bill.sponsor = li_text(property_count + 4).decode('utf-8')
    this_bill.sponsor_name = this_bill.sponsor.split("[")[0]
    self.save.append(this_bill)
    if is_new_bill:
        self.send_email_updates(this_bill)
    return
def _get_video_links(self, html_data):
    """Return the href of the first anchor inside each 420px-wide table cell."""
    markup = ''.join(html_data)
    soup = BeautifulSoup(markup,
                         convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
    return [cell.find('a')['href']
            for cell in soup.findAll('td', width='420')]
def _GetPostingPage(page_num):
    """Fetch one listings page and log each posting's description and link.

    Args:
        page_num: page offset appended to settings.SOURCE_ALL_URL.
    """
    url = urllib.urlopen(settings.SOURCE_ALL_URL + str(page_num))
    data = BeautifulSoup(url.read())
    postings = data.findAll('li', {'class': re.compile('hlisting')})
    for posting in postings:
        posting_object = models.Posting()
        # Navigate the already-parsed tag tree directly.  The original
        # re-wrapped each Tag in BeautifulSoup(), which raises TypeError
        # (it relied on catching that to skip), and called .get('href')
        # on the wrong object so the logged URL was always None.
        descr_divs = posting.findAll('div', {'class': re.compile('description')})
        if not descr_divs:
            continue
        try:
            anchor = descr_divs[0].findAll('h3')[0].findAll('a')[0]
        except IndexError:
            # Malformed posting with no <h3>/<a>; skip it, as before.
            continue
        description = anchor
        link = anchor.get('href')  # the posting URL lives on the <a> tag
        posting_object.content = posting
        logging.critical(description)
        logging.critical(link)
def get_recent_updates(self):
    """Return [{'topic_path', 'topic_name'}, ...] for the hottest Wikipedia topics.

    Scrapes the WikiRage RSS feed; the topic slug is the second-to-last
    path component of each item's <guid>.
    """
    WIKIRAGE_URL = "http://www.wikirage.com/rss/1/"
    from google.appengine.api import urlfetch
    fetch_page = urlfetch.fetch(WIKIRAGE_URL, follow_redirects=False)
    from utils.BeautifulSoup import BeautifulSoup
    soup = BeautifulSoup(fetch_page.content)
    import urllib
    updates = []
    for guid in soup.findAll('guid'):
        raw_slug = guid.contents[0].split('/')[-2]
        path = urllib.unquote(raw_slug)
        updates.append({
            'topic_path': path,
            'topic_name': path.replace('_', ' '),
        })
    return updates
def get_soup(self, page):
    """Fetch the SemanticProxy (Calais) analysis for page.url.

    Returns:
        The parsed BeautifulSoup document, or False when the fetch fails
        or Calais reports an <exception> element in its response.
    """
    # In case we need to meet the 100k limit, truncate page here.
    soup_url = SEMANTICPROXY_URL + str(page.url)
    try:
        logging.debug('Fetching calais response')
        fetch_page = urlfetch.fetch(soup_url)  # perform semantic analysis
    except Exception:
        # urlfetch raises several DownloadError subclasses; treat any
        # fetch failure as "no analysis available" (was a bare except).
        logging.debug('Unable to fetch calais response')
        return False
    soup = BeautifulSoup(fetch_page.content)  # whole page
    # Calais signals errors with an <exception> element in the body.
    # Check explicitly instead of using IndexError as control flow, and
    # log the message rather than print-ing it (was: print exception).
    error_nodes = soup.findAll('exception')
    if error_nodes and error_nodes[0].contents:
        logging.debug(error_nodes[0].contents[0])
        return False
    return soup
def parse_summary(self, summary, link):
    """Sanitize a feed entry's HTML summary and localize its images.

    Strips active content (scripts, onload handlers, iframes), removes the
    configured attributes and tags, and rewrites <img> sources to locally
    downloaded copies (up to self.max_images - 1 images; NOTE(review):
    img_count starts at 1, so the cap appears to be max_images - 1 —
    confirm intent).  Returns the cleaned markup as UTF-8 bytes.
    """
    #summary = escape.utf8(summary)
    soup = BeautifulSoup(summary)
    # Remove executable content first.  list(...) copies the result set
    # because extract() mutates the tree while we iterate.
    for script in list(soup.findAll('script')):
        script.extract()
    for o in soup.findAll(onload=True):
        del o['onload']
    for script in list(soup.findAll('noscript')):
        script.extract()
    # Strip configured attributes (e.g. style/class) wherever they occur.
    for attr in self.remove_attributes:
        for x in soup.findAll(attrs={attr:True}):
            del x[attr]
    # Strip configured tags; tag['name'] suggests entries are dicts with a
    # 'name' key -- TODO confirm against where remove_tags is defined.
    for tag in self.remove_tags:
        for x in soup.findAll(tag['name']):
            x.extract()
    for base in list(soup.findAll(['base', 'iframe'])):
        base.extract()
    #for p in list(soup.findAll(['p', 'div'])):
    #    p['style'] = 'text-indent:2em'
    img_count = 1
    for img in list(soup.findAll('img')):
        if self.noimage or img_count >= self.max_images:
            # Images disabled or over the cap: drop the tag entirely.
            img.extract()
        else:
            # Resolve the (possibly relative) src against the article link,
            # download it, and point the tag at the local copy; drop the
            # tag when the download fails.
            image_url = absolute_path(img['src'], link)
            image = self.down_image(image_url, link)
            if image:
                img['src'] = image
            else:
                img.extract()
            img_count = img_count + 1
    return soup.renderContents('utf-8')
def convertToASS(self, script):
    """Convert a Crunchyroll-style XML subtitle script to ASS format.

    Reads the <subtitle_script> header plus every <style> and <event>
    element and assembles the [Script Info], [V4+ Styles] and [Events]
    sections of an Advanced SubStation Alpha file.

    Note: the original source contained a duplicated, truncated copy of
    the style-line concatenation (a paste error that is a syntax error as
    written); this version keeps the live comma-separated output.
    """
    soup = BeautifulSoup(script, convertEntities=BeautifulSoup.HTML_ENTITIES)
    header_tag = soup.find('subtitle_script')
    header = ("[Script Info]\nTitle: " + header_tag['title'] +
              "\nScriptType: v4.00+\nWrapStyle: " + header_tag['wrap_style'] +
              "\nPlayResX: 624\nPlayResY: 366\n\n")
    styles = "[V4+ Styles]\nFormat: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding\n"
    events = "[Events]\nFormat: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n"
    # Attribute order matches the Format: line above.
    style_fields = ('name', 'font_name', 'font_size', 'primary_colour',
                    'secondary_colour', 'outline_colour', 'back_colour',
                    'bold', 'italic', 'underline', 'strikeout', 'scale_x',
                    'scale_y', 'spacing', 'angle', 'border_style', 'outline',
                    'shadow', 'alignment', 'margin_l', 'margin_r', 'margin_v',
                    'encoding')
    for style in soup.findAll('style'):
        styles += "Style: " + ",".join(style[f] for f in style_fields) + "\n"
    for event in soup.findAll('event'):
        events += ("Dialogue: 0," + event['start'] + "," + event['end'] + "," +
                   event['style'] + "," + event['name'] + "," +
                   event['margin_l'] + "," + event['margin_r'] + "," +
                   event['margin_v'] + "," + event['effect'] + "," +
                   event['text'] + "\n")
    formattedSubs = header + styles + events
    return formattedSubs