def _crawl(self, options):
    # ToDo: allow more than just the images that are initially loaded on imgur
    starting_page_url = urlparse.urljoin(self.IMGUR_BASE_URL, self.url_path)
    starting_page_response = requests.get(starting_page_url)
    starting_page = StartingPage(starting_page_response)

    if not starting_page.links:
        sys.exit('No posts found on the specified starting page')

    # batch the links, so that not too many async request calls are made at once
    for link_batch in iterable_helper.batch(starting_page.links, 5):
        # ToDo: replace the set membership testing with a bloom filter when urls are persisted between runs
        new_links = [link for link in link_batch if link not in self.visited_links]
        rs = (grequests.get(link) for link in new_links)
        responses = grequests.map(rs)
        while responses:
            response = responses.pop()
            if response is None:  # grequests.map yields None for requests that failed
                continue
            post = Post(response)
            self.visited_links.add(post.url)
            if self._meets_criteria(post):
                self.posts.append(post)
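# The crawler throttles its async calls by chunking links with iterable_helper.batch,
# whose implementation is not shown above. The sketch below is an assumption about
# its behavior (yield successive lists of at most `size` items), not the project's
# actual helper.
from itertools import islice

def batch(iterable, size):
    """Yield successive lists of at most `size` items from `iterable`."""
    iterator = iter(iterable)
    while True:
        chunk = list(islice(iterator, size))
        if not chunk:
            return
        yield chunk

# Example: list(batch(range(7), 3)) -> [[0, 1, 2], [3, 4, 5], [6]]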
def download_images(file_names, progress_reporter=None):
    mkdir_p(DOWNLOAD_PATH)

    # skip files that are already present on disk
    existing_files = set()
    for name in file_names:
        if os.path.isfile(generate_file_path(name)):
            existing_files.add(name)

    urls_to_download = ['http://i.imgur.com/' + name
                        for name in set(file_names) - existing_files]

    downloaded = 0
    # batch the image requests, so that not too many async request calls are made at once
    for url_batch in iterable_helper.batch(urls_to_download, 5):
        reqs = (grequests.get(url) for url in url_batch)
        responses = grequests.map(reqs)
        for response in responses:
            if response is None:  # grequests.map yields None for requests that failed
                continue
            path = generate_file_path(response.url.split('/')[-1])
            # write in binary mode so the image bytes are not corrupted
            with open(path, 'wb') as f:
                f.write(response.content)
            downloaded += 1
            if progress_reporter:
                progress_reporter(downloaded)
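# DOWNLOAD_PATH, mkdir_p and generate_file_path are defined elsewhere in the
# project. A minimal sketch of plausible implementations is given below as an
# assumption (directory location and naming are illustrative, not the real code).
import errno
import os

DOWNLOAD_PATH = os.path.join(os.getcwd(), 'downloads')  # assumed download directory

def mkdir_p(path):
    """Create `path` (and any missing parents), like `mkdir -p`; ignore if it exists."""
    try:
        os.makedirs(path)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

def generate_file_path(file_name):
    """Map an imgur file name (e.g. 'abc123.jpg') to its path under DOWNLOAD_PATH."""
    return os.path.join(DOWNLOAD_PATH, file_name)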