def download_images(soup, dest_folder, href_prefix, base_href = None):
    """
    Download every image referenced by `soup` into `dest_folder` and point
    each <img> tag's `src` attribute at the local copy.

    The new `src` is the URL-quoted concatenation `href_prefix + filename`
    (no separator is inserted, so `href_prefix` should normally end in "/").
    Images whose `src` already starts with the configured resources path
    are left untouched.

    Arguments:
      soup        -- parsed document; must provide findAll()
      dest_folder -- folder the files are downloaded into (created on
                     demand when there is at least one image)
      href_prefix -- string prepended to the downloaded filename
      base_href   -- optional base used to resolve relative image URLs

    Returns True when no download raised an error, False otherwise.

    >>> from lib.mock import Mock
    >>> ensure_dir_exists = Mock()
    >>> import process
    >>> process.download_file = Mock()
    >>> process.download_file.return_value = "image.jpg"
    >>> soup = BeautifulSoup('<img src="http://google.com/image.jpg?a=b&c=d"/>')
    >>> process.download_images(soup, 'dest_folder', 'local_folder/')
    True
    >>> soup
    <img src="local_folder/image.jpg" />

    # (make sure the file was downloaded from the correct URL:)
    >>> process.download_file.call_args
    ((u'http://google.com/image.jpg?a=b&c=d', 'image.jpg'), {'base_path': 'dest_folder'})
    """
    images = soup.findAll('img', {'src': True})
    success = True

    # only create the destination folder when there is something to put in it
    if images:
        ensure_dir_exists(dest_folder)

    total = len(images)
    # count from 1 so the progress message reads "1 of N" .. "N of N"
    for img_num, img in enumerate(images, 1):
        debug("processing image %s of %s" % (img_num, total))

        # already a local resource - nothing to download
        if img['src'].startswith(app_globals.CONFIG['resources_path']):
            continue

        href = absolute_url(img['src'], base_href)
        filename = get_filename(img['src'])
        try:
            # download_file returns the filename it used; a None result
            # leaves the tag's src as it was
            filename = download_file(href, filename, base_path=dest_folder)
            if filename is not None:
                img['src'] = urllib2.quote(href_prefix + filename)
        except StandardError as e:
            # best effort: log the failure and carry on with the other images
            info("Image %s failed to download: %s" % (img['src'], e))
            success = False

        # since this is a long running process; let the thread know we're still alive
        thread_pool.ping()

    return success
def process(self):
    """
    Run the processing pipeline for this item.

    The stages run in order: set up the soup, insert alt text, download
    images, then tear the soup down (which, per the original comment,
    saves the changes back as content). The thread pool is pinged after
    each of the first three stages.
    """
    def add_alt_text():
        debug("item %s -> insert_alt_text()" % self.title)
        # `process` here is the module-level name, not this method
        process.insert_alt_text(self.soup)

    def fetch_images():
        self.download_images(need_soup = False)

    debug("item %s -> process()" % self.title)
    for stage in (self.soup_setup, add_alt_text, fetch_images):
        stage()
        # signal liveness after every stage
        thread_pool.ping()

    # save changes back as content
    self.soup_teardown()