def random_article():
    """Return the URL of Wikipedia's random-article redirect."""
    return 'http://en.wikipedia.org/wiki/Special:Random'


def images_in_article(url):
    """Yield absolute image URLs for the Wikipedia article at *url*.

    Fetches the page, prints the parsed article title, and yields one URL
    per thumbnail link (``<a class="image">`` wrapper) in the article.
    """
    html = urlopen(url).read().decode('utf-8')
    # Name the parser explicitly: relying on auto-detection emits a
    # GuessedAtParserWarning and can produce different trees depending on
    # which parsers are installed.
    soup = BeautifulSoup(html, 'html.parser')
    title = safe_characters(soup.find('h1', id='firstHeading').string)
    print('Parse article:', title)
    for wrapper in soup.find_all('a', class_='image'):
        img = wrapper.find('img', recursive=False)
        # Guard against wrappers without a direct <img> child or without a
        # src attribute; concatenating None to 'http:' raises TypeError.
        if img is None:
            continue
        src = img.get('src')
        if src:
            # src values are protocol-relative ("//upload.wikimedia.org/...").
            yield 'http:' + src


if __name__ == '__main__':
    parser = ArgumentParser(
        description='Download random images from random Wikipedia articles.',
        formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument('-n', '--count', type=int, default=20,
        help='Rough amount of images to download')
    parser.add_argument('-d', '--directory', default='data/wikipedia',
        help='Directory to download images into; gets created if not exists')
    args = parser.parse_args()
    ensure_directory(args.directory)
    count = 0
    # Keep pulling random articles until roughly args.count images have
    # been downloaded; download_files returns how many were fetched.
    while count < args.count:
        urls = images_in_article(random_article())
        count += download_files(urls, args.directory)
# NOTE(review): this line looks like a whitespace-mangled DUPLICATE of the
# tail of google_images() plus its __main__ block — it is byte-identical to
# the end of the full copy that follows. The leading statements belong to a
# function whose `def` header is not visible here (the `yield` cannot stand
# at module level), so the code is left byte-identical rather than guessed
# at. TODO: confirm against the original file and remove the duplicate.
response = urlopen(request).readall().decode('utf-8') content = json.loads(response) results = content['responseData']['results'] # Return urls for result in results: yield result['url'] if __name__ == '__main__': parser = ArgumentParser(description='Download Google image search \ results.', formatter_class=ArgumentDefaultsHelpFormatter) parser.add_argument('-q', '--query', required=True, help='Search term') parser.add_argument('-p', '--pages', type=int, default=8, help='Amount of result pages to fetch; up to 8') parser.add_argument( '-d', '--directory', default='data/google/<query>', help='Directory to download images into; gets created if not exists') args = parser.parse_args() args.directory = args.directory.replace('<query>', args.query) ensure_directory(args.directory) urls = google_images(args.query, args.pages) download_files(urls, args.directory)
def google_images(query, pages):
    """Yield image-result URLs for *query* from the Google image search API.

    Fetches up to *pages* result pages (the legacy AJAX API serves at most
    8) and yields the URL of every result on each page.
    """
    for page in range(pages):
        request = google_images_request(query, page)
        print('Query page', page + 1)
        # Fetch results. Use read(), not readall(): readall() is not part of
        # the buffered HTTPResponse API on current Python 3 (it raises
        # AttributeError), and the sibling Wikipedia script already uses
        # read().
        response = urlopen(request).read().decode('utf-8')
        content = json.loads(response)
        # responseData is null when the (deprecated) API throttles or
        # rejects a request; treat that as an empty page instead of
        # crashing with a TypeError on subscripting None.
        results = (content.get('responseData') or {}).get('results') or []
        # Return urls
        for result in results:
            yield result['url']


if __name__ == '__main__':
    parser = ArgumentParser(
        description='Download Google image search results.',
        formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument('-q', '--query', required=True, help='Search term')
    parser.add_argument('-p', '--pages', type=int, default=8,
        help='Amount of result pages to fetch; up to 8')
    parser.add_argument(
        '-d', '--directory', default='data/google/<query>',
        help='Directory to download images into; gets created if not exists')
    args = parser.parse_args()
    # Substitute the search term into the default download location.
    args.directory = args.directory.replace('<query>', args.query)
    ensure_directory(args.directory)
    urls = google_images(args.query, args.pages)
    download_files(urls, args.directory)