Example #1
0
  def update_bill(self, bill):
	"""
	Check if a bill exists in datastore, and update its stats.

	Looks the bill up by title (the datastore key name), creating it when
	missing, then scrapes its current stats from its OpenCongress info
	page and queues the entity for a batch save.

	Args:
		bill: dict with at least 'title', 'rank' and 'id' keys
			(presumably one entry of an OpenCongress listing -- TODO confirm).

	Side effects: appends the entity to self.save (batch put happens
	elsewhere) and sends email updates when the bill is new.
	"""
	# Entities are keyed by title, so bill titles are assumed unique.
	this_bill = Bill.get_by_key_name(bill['title']) 
	logging.info(bill['title']) 
	if this_bill is None: 
		this_bill = self.create_bill(bill)
		is_new_bill = True
	else: is_new_bill = False
	this_bill.rank = bill['rank']
	import urllib
	self.request_args = {'bill_id':  bill['id']}
	self.formatted_args = urllib.urlencode(self.request_args)
	from google.appengine.api import urlfetch
	# Fetch the bill's info page (HTML) from OpenCongress.
	fetch_page = urlfetch.fetch( url = OPENCONGRESS_INFO_URL + self.formatted_args,
								method = urlfetch.GET) 
	from utils.BeautifulSoup import BeautifulSoup	
	document = BeautifulSoup(fetch_page.content)
	# The info page lists properties as <li><strong>Label:</strong> value</li>
	# in a fixed order: introduction date, status, next step, latest action,
	# sponsor.  property_count shifts the later indices down by one when the
	# "next step" item is absent (bill already became law).
	property_count = 0	 
	this_bill.introduction_date = str(document.findAll('li')[property_count]).split('</strong> ')[1].split('</li>')[0]
	this_bill.status = str(document.findAll('li')[property_count + 1]).split('</strong> ')[1].split('</li>')[0]
	if this_bill.status == "This Bill Has Become Law":  property_count = -1 # no next step
	else: this_bill.next_step = str(document.findAll('li')[property_count + 2]).split('</strong> ')[1].split('</li>')[0]
	this_bill.latest_action = str(document.findAll('li')[property_count + 3]).split('</strong> ')[1].split('</li>')[0]
	# Long action texts are truncated to the first 9 words for display.
	if len( this_bill.latest_action ) > 68: this_bill.latest_action = " ".join(this_bill.latest_action.split(' ')[:9]) + "..."
	this_bill.sponsor = str(document.findAll('li')[property_count + 4]).split('</strong> ')[1].split('</li>')[0].decode('utf-8')
	# Sponsor looks like "Name [party-state]"; keep just the name part.
	this_bill.sponsor_name = this_bill.sponsor.split("[")[0] 
	# Queue for batch save; the actual datastore put happens elsewhere.
	self.save.append(this_bill)
	if is_new_bill: self.send_email_updates(this_bill)
	return
 def _get_video_links(self,html_data):
     """Return the href of the first <a> inside each 420px-wide <td>."""
     markup = ''.join(html_data)
     document = BeautifulSoup(markup, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
     return [cell.find('a')['href']
             for cell in document.findAll('td', width='420')]
Example #3
0
def _GetPostingPage(page_num):
  """Fetch one listing page and extract the postings on it.

  Args:
    page_num: page index appended to settings.SOURCE_ALL_URL.

  NOTE(review): Posting objects are built but never persisted here; the
  extracted description/url are currently only logged.
  """
  page = urllib.urlopen(settings.SOURCE_ALL_URL + str(page_num))
  data = BeautifulSoup(page.read())
  postings = data.findAll('li', {'class': re.compile('hlisting')})

  for posting in postings:
    posting_object = models.Posting()
    try:
      # Tags are navigable directly -- no need to re-parse them through
      # BeautifulSoup (the original re-wrapping is what raised TypeError).
      # IndexError covers postings with no description div at all.
      descr_div = posting.findAll('div', {'class': re.compile('description')})[0]
      heading = descr_div.findAll('h3')[0]
      description = heading.findAll('a')[0]
    except (TypeError, IndexError):
      continue
    # BUG FIX: the href attribute lives on the <a> anchor itself, not on
    # the <h3> wrapper the original queried.
    url = description.get('href')
    posting_object.content = posting
    logging.critical(description)
    logging.critical(url)
Example #4
0
 def get_recent_updates(self):
 	"""Return recently-hot Wikipedia topics scraped from the WikiRage feed.

 	Each entry is a dict with 'topic_path' (URL path segment) and
 	'topic_name' (human-readable, underscores replaced by spaces).
 	"""
 	WIKIRAGE_URL = "http://www.wikirage.com/rss/1/"
 	from google.appengine.api import urlfetch
 	feed = urlfetch.fetch(WIKIRAGE_URL, follow_redirects=False)
 	from utils.BeautifulSoup import BeautifulSoup
 	document = BeautifulSoup(feed.content)
 	import urllib
 	recent = []
 	# Each <guid> holds a topic URL; the next-to-last path segment is the
 	# URL-encoded topic slug.
 	for guid in document.findAll('guid'):
 		slug = urllib.unquote(guid.contents[0].split('/')[-2])
 		recent.append({'topic_path': slug, 'topic_name': slug.replace('_', ' ')})
 	return recent
Example #5
0
  def get_soup(self, page):
	#in case we need to meet 100k limit, truncate page.
	soup_url = SEMANTICPROXY_URL +  str(page.url)    # + TRUNCATE URL + 
	# timeout for fetch_page (and all fetch pages)
	try: 
	    logging.debug('Fetching calais response')
	    fetch_page = urlfetch.fetch(soup_url)              # perform semantic analysis
	except:
		logging.debug('Unable to fetch calais response')
		return False 
	
	soup = BeautifulSoup(fetch_page.content) #whole page
	try: # look for error
		exception = soup.findAll('exception')[0].contents[0]
		print exception
		return False
	except: return soup 
Example #6
0
    def parse_summary(self, summary, link):
        """Sanitize a feed entry summary and localize its images.

        Strips scripts, onload handlers, blacklisted attributes/tags and
        base/iframe elements, then rewrites <img> sources to locally
        downloaded copies.  Returns the cleaned markup as UTF-8 bytes.
        """
        soup = BeautifulSoup(summary)

        # Drop <script> elements entirely.
        for node in list(soup.findAll('script')):
            node.extract()

        # Strip inline onload handlers.
        for node in soup.findAll(onload=True):
            del node['onload']

        # Drop <noscript> elements.
        for node in list(soup.findAll('noscript')):
            node.extract()

        # Remove every occurrence of each blacklisted attribute.
        for attr_name in self.remove_attributes:
            for node in soup.findAll(attrs={attr_name: True}):
                del node[attr_name]

        # Remove blacklisted tags wholesale.
        for tag_spec in self.remove_tags:
            for node in soup.findAll(tag_spec['name']):
                node.extract()

        # <base> and <iframe> are never kept.
        for node in list(soup.findAll(['base', 'iframe'])):
            node.extract()

        # Localize images; drop any beyond the configured limit, any when
        # images are disabled, and any that fail to download.
        seen = 1
        for img in list(soup.findAll('img')):
            if self.noimage or seen >= self.max_images:
                img.extract()
            else:
                local = self.down_image(absolute_path(img['src'], link), link)
                if local:
                    img['src'] = local
                else:
                    img.extract()
            seen += 1

        return soup.renderContents('utf-8')
	def convertToASS(self, script):
		"""
		Convert an XML subtitle script to the ASS subtitle format.

		Args:
			script: XML markup whose <subtitle_script> root carries title
				and wrap_style attributes, with <style> and <event> children.

		Returns:
			The complete ASS document (Script Info, V4+ Styles and Events
			sections) as one string.
		"""
		# BUG FIX: the original body contained a garbled, mis-indented
		# duplicate fragment of the style-line expression (a syntax error)
		# plus dead commented-out code; both removed here.
		soup = BeautifulSoup(script, convertEntities=BeautifulSoup.HTML_ENTITIES)
		header = soup.find('subtitle_script')
		header = "[Script Info]\nTitle: "+header['title']+"\nScriptType: v4.00+\nWrapStyle: "+header['wrap_style']+"\nPlayResX: 624\nPlayResY: 366\n\n"
		styles = "[V4+ Styles]\nFormat: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding\n"
		events = "[Events]\nFormat: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n"

		# One "Style:" line per <style> element, attributes in V4+ order.
		style_fields = ('name', 'font_name', 'font_size', 'primary_colour',
			'secondary_colour', 'outline_colour', 'back_colour', 'bold',
			'italic', 'underline', 'strikeout', 'scale_x', 'scale_y',
			'spacing', 'angle', 'border_style', 'outline', 'shadow',
			'alignment', 'margin_l', 'margin_r', 'margin_v', 'encoding')
		for style in soup.findAll('style'):
			styles += "Style: " + ",".join(style[f] for f in style_fields) + "\n"

		# One "Dialogue:" line per <event> element (layer fixed at 0).
		event_fields = ('start', 'end', 'style', 'name', 'margin_l',
			'margin_r', 'margin_v', 'effect', 'text')
		for event in soup.findAll('event'):
			events += "Dialogue: 0," + ",".join(event[f] for f in event_fields) + "\n"

		formattedSubs = header + styles + events
		return formattedSubs