def download_images(soup, dest_folder, href_prefix, base_href = None):
	"""
	Download all images referenced by <img src=...> tags in {soup} to the
	{dest_folder} folder, and rewrite each downloaded image's src attribute
	to the URL-quoted value of {href_prefix} + output_filename.

	soup        -- parsed document exposing findAll(); modified in place
	dest_folder -- folder passed to download_file as base_path; it is
	               created (via ensure_dir_exists) only when the document
	               contains at least one image
	href_prefix -- string prepended to the downloaded file's name to form
	               the new src value (no separator is inserted)
	base_href   -- passed to absolute_url() to resolve each image's src

	Images whose src already starts with the configured 'resources_path'
	are skipped. If download_file returns None, the src is left unchanged.

	Returns True when no download raised a StandardError, False otherwise
	(failures are logged and the remaining images are still processed).

		>>> from lib.mock import Mock
		>>> ensure_dir_exists = Mock()
		>>> import process
		>>> process.download_file = Mock()
		>>> process.download_file.return_value = "image.jpg"
		
		>>> soup = BeautifulSoup('<img src="http://google.com/image.jpg?a=b&c=d"/>')
		>>> process.download_images(soup, 'dest_folder', 'local_folder/')
		True
		>>> soup
		<img src="local_folder/image.jpg" />
	
		# (make sure the file was downloaded from the correct URL:)
		>>> process.download_file.call_args
		((u'http://google.com/image.jpg?a=b&c=d', 'image.jpg'), {'base_path': 'dest_folder'})
	"""
	images = soup.findAll('img',{'src':True})
	success = True
	total = len(images)
	
	# only create the destination folder when there is something to put in it
	if total > 0:
		ensure_dir_exists(dest_folder)
	
	# count from 1 so the progress message reads "image 1 of N" .. "image N of N"
	for img_num, img in enumerate(images, 1):
		debug("processing image %s of %s" % (img_num, total))
		if img['src'].startswith(app_globals.CONFIG['resources_path']):
			continue
		href = absolute_url(img['src'], base_href)
		
		filename = get_filename(img['src'])
		try:
			# download_file returns the filename that was actually used
			filename = download_file(href, filename, base_path=dest_folder)
			if filename is not None:
				img['src'] = urllib2.quote(href_prefix + filename)
		except StandardError as e:
			# a single failed image must not abort the rest of the document
			info("Image %s failed to download: %s" % (img['src'], e))
			success = False
		
		# since this is a long running process; let the thread know we're still alive
		thread_pool.ping()
	
	return success
Example #2
0
	def process(self):
		"""
		Run the processing steps for this item, in order:
		soup_setup(), process.insert_alt_text() on self.soup,
		self.download_images(), then soup_teardown().

		thread_pool.ping() is called after each step (presumably a
		keep-alive signal for the worker thread -- confirm against
		thread_pool).
		"""
		debug("item %s -> process()" % self.title)
		self.soup_setup()
		thread_pool.ping()
		
		# process
		debug("item %s -> insert_alt_text()" % self.title)
		# `process` here is the module-level name, not this method
		process.insert_alt_text(self.soup)
		thread_pool.ping()
		
		# NOTE(review): need_soup = False presumably because soup_setup() has
		# already run above -- confirm against self.download_images
		self.download_images(need_soup = False)
		thread_pool.ping()
		
		# save changes back as content
		self.soup_teardown()