    response = urlopen(request).read().decode('utf-8')
    content = json.loads(response)
    results = content['responseData']['results']
    # Yield the url of each result image
    for result in results:
        yield result['url']


if __name__ == '__main__':
    parser = ArgumentParser(description='Download Google image search \
results.', formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument('-q', '--query', required=True, help='Search term')
    parser.add_argument('-p', '--pages', type=int, default=8,
        help='Amount of result pages to fetch; up to 8')
    parser.add_argument(
        '-d', '--directory', default='data/google/<query>',
        help='Directory to download images into; gets created if not exists')
    args = parser.parse_args()
    args.directory = args.directory.replace('<query>', args.query)
    ensure_directory(args.directory)
    urls = google_images(args.query, args.pages)
    download_files(urls, args.directory)
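The listing above shows only the tail of the google_images generator; the part that builds request is not included here. The responseData.results structure it parses matches the old Google AJAX image search API, which has since been retired, so the following is only a rough sketch of how such a request might have been constructed. The endpoint, parameters, and the build_request name are assumptions, not the original code.

from urllib.parse import urlencode
from urllib.request import Request


def build_request(query, page):
    # Assumed endpoint of the retired Google AJAX image search API; it served
    # eight results per page and at most eight pages, matching the --pages
    # limit in the listing above.
    params = urlencode({'v': '1.0', 'q': query, 'rsz': '8', 'start': 8 * page})
    url = 'https://ajax.googleapis.com/ajax/services/search/images?' + params
    # The API asked clients to send a referer header with their requests.
    return Request(url, headers={'Referer': 'http://example.com'})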
    parser.add_argument('-c', '--count', type=int, default=100,
        help='Amount of query result images to fetch; images without \
metadata will be skipped so fewer images may be downloaded')
    parser.add_argument('-u', '--uris',
        help='Download images and metadata from existing list of Wikimedia \
Commons uris rather than querying them first')
    parser.add_argument('-d', '--directory',
        default='data/fetch/<timestamp>-commons',
        help='Directory to download images into; gets created if not exists')
    args = parser.parse_args()
    timestamp = datetime.now().strftime('%y-%m-%d-%H-%M')
    directory = args.directory.replace('<timestamp>', timestamp)
    uris = []
    if args.uris:
        name = os.path.basename(args.uris)
        uris = read_lines(args.uris)
    elif args.metadata_query:
        name = args.metadata_query[0]
        uris = fetch_uris_from_metadata(args.metadata_query, args.count)
    elif args.article_query:
        name = args.article_query[0]
        uris = fetch_uris_from_articles(args.article_query, args.count)
    else:
        assert False, ('One of parameters --uris, --metadata-query and ' +
            '--article-query is required')
    ensure_directory(directory)
    print('Download', len(uris), 'images and metadata into', directory)
    images_and_metadata(uris, directory)
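The --uris branch calls a read_lines helper that is not part of this listing. A minimal sketch of what it presumably does, inferred from the call site where it turns a text file of Commons uris into a list; the body is an assumption, not the project's actual implementation.

def read_lines(filename):
    # Return the non-empty lines of a text file, stripped of whitespace.
    with open(filename) as file_:
        return [line.strip() for line in file_ if line.strip()]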
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
from urllib.request import urlopen

from bs4 import BeautifulSoup

# safe_characters, ensure_directory and download_files are shared helpers
# defined elsewhere in the project.


def random_article():
    return 'http://en.wikipedia.org/wiki/Special:Random'


def images_in_article(url):
    html = urlopen(url).read().decode('utf-8')
    soup = BeautifulSoup(html, 'html.parser')
    title = safe_characters(soup.find('h1', id='firstHeading').string)
    print('Parse article:', title)
    # Image thumbnails are <img> tags wrapped in <a class="image"> links
    for wrapper in soup.find_all('a', class_='image'):
        img = wrapper.find('img', recursive=False)
        yield 'http:' + img.get('src')


if __name__ == '__main__':
    parser = ArgumentParser(description='Download random images from random \
Wikipedia articles.', formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument('-n', '--count', type=int, default=20,
        help='Rough amount of images to download')
    parser.add_argument('-d', '--directory', default='data/wikipedia',
        help='Directory to download images into; gets created if not exists')
    args = parser.parse_args()
    ensure_directory(args.directory)
    count = 0
    while count < args.count:
        urls = images_in_article(random_article())
        count += download_files(urls, args.directory)
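All three scripts rely on ensure_directory and download_files. Their behaviour can be inferred from the call sites: ensure_directory creates the target directory if it does not exist, and download_files fetches every url into a directory and returns how many files were saved (the Wikipedia script adds that return value to its running count). The sketches below are assumptions along those lines, not the project's actual helpers.

import os
from urllib.parse import urlparse
from urllib.request import urlopen


def ensure_directory(directory):
    # Create the directory (and any missing parents) if it does not exist yet.
    os.makedirs(directory, exist_ok=True)


def download_files(urls, directory):
    # Download each url into the directory; return the number of saved files.
    count = 0
    for url in urls:
        filename = os.path.basename(urlparse(url).path)
        if not filename:
            continue
        try:
            data = urlopen(url).read()
        except OSError:
            # Skip urls that cannot be fetched rather than aborting the run.
            continue
        with open(os.path.join(directory, filename), 'wb') as file_:
            file_.write(data)
        count += 1
    return count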