        response = urlopen(request).read().decode('utf-8')
        content = json.loads(response)
        results = content['responseData']['results']
        # Yield the URL of each image result on this page
        for result in results:
            yield result['url']


if __name__ == '__main__':
    parser = ArgumentParser(description='Download Google image search \
        results.',
                            formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument('-q', '--query', required=True, help='Search term')
    parser.add_argument('-p',
                        '--pages',
                        type=int,
                        default=8,
                        help='Number of result pages to fetch; up to 8')
    parser.add_argument(
        '-d',
        '--directory',
        default='data/google/<query>',
        help='Directory to download images into; gets created if not exists')
    args = parser.parse_args()

    args.directory = args.directory.replace('<query>', args.query)

    ensure_directory(args.directory)
    urls = google_images(args.query, args.pages)
    download_files(urls, args.directory)
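
These snippets call shared helpers, ensure_directory and download_files, that are defined elsewhere in the project and not shown on this page. The sketch below is an assumption reconstructed purely from the call sites: ensure_directory(path) creates the directory if needed, and download_files(urls, directory) must return the number of files it saved, because the Wikipedia example further down adds its return value to a counter.

import os
from urllib.parse import urlsplit
from urllib.request import urlopen


def ensure_directory(directory):
    # Create the target directory (and any parents) if it does not exist.
    os.makedirs(directory, exist_ok=True)


def download_files(urls, directory):
    # Save each URL into the directory, named after its last path
    # component, and return how many files were actually written.
    count = 0
    for url in urls:
        filename = os.path.basename(urlsplit(url).path)
        if not filename:
            continue  # URL has no file component to name the download after
        try:
            data = urlopen(url).read()
        except OSError:
            continue  # skip unreachable files instead of aborting the run
        with open(os.path.join(directory, filename), 'wb') as file:
            file.write(data)
        count += 1
    return count
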
Example #3
def random_article():
    return 'http://en.wikipedia.org/wiki/Special:Random'


def images_in_article(url):
    html = urlopen(url).read().decode('utf-8')
    soup = BeautifulSoup(html, 'html.parser')
    title = safe_characters(soup.find('h1', id='firstHeading').string)
    print('Parse article:', title)
    for wrapper in soup.find_all('a', class_='image'):
        img = wrapper.find('img', recursive=False)
        yield 'http:' + img.get('src')


if __name__ == '__main__':
    parser = ArgumentParser(description='Download random images from random \
        Wikipedia articles.',
        formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument('-n', '--count', type=int, default=20,
        help='Approximate number of images to download')
    parser.add_argument('-d', '--directory', default='data/wikipedia',
        help='Directory to download images into; gets created if not exists')
    args = parser.parse_args()

    ensure_directory(args.directory)
    count = 0
    while count < args.count:
        urls = images_in_article(random_article())
        count += download_files(urls, args.directory)
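
images_in_article above also relies on a safe_characters helper that this page omits. A minimal sketch, assuming it merely whitelists characters that are safe for filenames and terminal output:

def safe_characters(text):
    # Keep letters, digits and a few harmless punctuation characters;
    # the exact whitelist is an assumption, not the original definition.
    return ''.join(c for c in text if c.isalnum() or c in ' ._-')

Saved as, say, wikipedia_images.py (a hypothetical filename), the script would be run as python3 wikipedia_images.py --count 50 --directory data/wikipedia.
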
Example #4
    parser.add_argument('-c', '--count', type=int, default=100,
        help='Number of query result images to fetch; images without \
        metadata will be skipped, so fewer images may be downloaded')
    parser.add_argument('-u', '--uris',
        help='Download images and metadata from existing list of Wikimedia \
        Commons uris rather than querying them first')
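    # The branches below also read args.metadata_query and args.article_query,
    # so the original script defines these two options as well; the flags,
    # nargs and help texts here are reconstructed assumptions (the list
    # indexing below implies nargs='+').
    parser.add_argument('-m', '--metadata-query', nargs='+',
        help='Search terms to match against Commons image metadata')
    parser.add_argument('-a', '--article-query', nargs='+',
        help='Search terms to match against Wikipedia article titles')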
    parser.add_argument('-d', '--directory',
        default='data/fetch/<timestamp>-commons',
        help='Directory to download images into; gets created if not exists')
    args = parser.parse_args()

    timestamp = datetime.now().strftime('%y-%m-%d-%H-%M')
    directory = args.directory.replace('<timestamp>', timestamp)
    uris = []
    if args.uris:
        name = os.path.basename(args.uris)
        uris = read_lines(args.uris)
    elif args.metadata_query:
        name = args.metadata_query[0]
        uris = fetch_uris_from_metadata(args.metadata_query, args.count)
    elif args.article_query:
        name = args.article_query[0]
        uris = fetch_uris_from_articles(args.article_query, args.count)
    else:
        parser.error('One of --uris, --metadata-query and '
            '--article-query is required')

    ensure_directory(directory)
    print('Downloading', len(uris), 'images and metadata into', directory)
    images_and_metadata(uris, directory)
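
As with the other examples, read_lines, fetch_uris_from_metadata, fetch_uris_from_articles and images_and_metadata are project helpers that this page does not show. Only read_lines is simple enough to reconstruct with confidence; a sketch, assuming the input file holds one Wikimedia Commons URI per line:

def read_lines(path):
    # Return the non-empty lines of a text file with surrounding
    # whitespace stripped; assumes one Commons URI per line.
    with open(path) as file:
        return [line.strip() for line in file if line.strip()]

With an existing URI list, a run might look like python3 fetch_commons.py --uris uris.txt (the script name is hypothetical); without --uris, one of the two query options selects the images instead.
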