def test_domain_filter(self):
    """Check that a ``domain`` argument is folded into the query as a ``site:`` term."""
    generated = download.generate_search_url("wibble", domain='example.com')
    endpoint, query = generated.split('?')
    self.assertEqual(endpoint, "https://www.google.com/search")

    # The query string is compared parameter by parameter, so the order
    # in which the URL lists its parameters does not matter.
    if 'q=wibble+site%3Aexample.com' in query.split('&'):
        return
    self.fail("Domain filtering not set appropriately")
def test_google_url_gen(self):
    """Check the base URL and the mandatory query parameters of a plain search."""
    generated = download.generate_search_url('wibble')
    endpoint, query = generated.split('?')
    self.assertEqual(endpoint, "https://www.google.com/search")

    query_parts = query.split('&')
    # (expected parameter, failure message), checked in this order.
    expectations = (
        ('tbm=isch', "Image search type not set appropriately"),
        ('q=wibble', "Search keyword not set appropriately"),
    )
    for token, complaint in expectations:
        if token not in query_parts:
            self.fail(complaint)
def test_image_style(self):
    """Check that ``style`` maps to a ``tbs=itp:<style>`` parameter and rejects unknown styles."""
    generated = download.generate_search_url("wibble", style='clipart')
    endpoint, query = generated.split('?')
    self.assertEqual(endpoint, "https://www.google.com/search")

    query_parts = query.split('&')
    # (expected parameter, failure message), checked in this order.
    expectations = (
        ('tbm=isch', "Image search type not set appropriately"),
        ('q=wibble', "Search keyword not set appropriately"),
        ('tbs=itp%3Aclipart', "Image style not set appropriately"),
    )
    for token, complaint in expectations:
        if token not in query_parts:
            self.fail(complaint)

    # An unrecognised style name must be rejected with ValueError.
    self.assertRaises(
        ValueError, download.generate_search_url, "wibble", style='broken'
    )
def test_download(self):
    """Download three "squirrel" images and verify each one exists on disk.

    The test works in a scratch directory (``./unittest-images/``) that is
    recreated before the download and removed afterwards, even on failure.
    For every index 0..2 it expects a file named ``squirrel.NNN.jpeg`` or
    ``squirrel.NNN.png``; either extension satisfies the check.
    """
    dl_dir = "./unittest-images/"

    # Start from an empty scratch directory.
    if os.path.exists(dl_dir):
        shutil.rmtree(dl_dir)
    os.mkdir(dl_dir)

    try:
        url = download.generate_search_url("squirrel")
        download.download_images(url, dl_dir + 'squirrel', 3)

        for i in range(3):
            paths = (
                '%ssquirrel.%03i.jpeg' % (dl_dir, i),
                '%ssquirrel.%03i.png' % (dl_dir, i),
            )
            # Stop at the first candidate that exists. The ``else`` clause
            # runs only when no candidate was found, which is the failure
            # case. (Previously the condition was inverted, so the test
            # passed when files were missing.)
            for path in paths:
                if os.path.exists(path):
                    break
            else:
                self.fail("Image file '%s' does not exist" % path)
    finally:
        shutil.rmtree(dl_dir)
def prepare_imageset(dataset, base_search_term, search_opts, output_dir, download_count=100):
    '''
    pipeline.prepare_imageset(...) -> imagedir

    Downloads a base set of images corresponding to the entries in
    'dataset', if they're not already cached, then filters the image set
    and generates image transforms for it.

    The cache key is generate_hash(dataset, base_search_term, search_opts);
    images live in '<output_dir>/<hash>/base/<label>/'. If that hash
    directory already exists, nothing is downloaded or regenerated.

    Parameters:
      dataset          -- table supporting dataset["Label"].unique() and
                          dataset.itertuples(), whose rows expose
                          'Character' and 'Label' attributes
                          (presumably a pandas DataFrame; confirm with caller)
      base_search_term -- text prepended to each row's Character to form
                          the search term
      search_opts      -- dict of keyword arguments forwarded to
                          download.generate_search_url
      output_dir       -- root directory for cached image sets; created
                          (including missing parents) if absent
      download_count   -- count passed to download.download_images for
                          each row

    Returns the directory containing the ImageFolder-structured tree of
    images.
    '''
    hashvalue = generate_hash(dataset, base_search_term, search_opts)
    imgdir = '%s/%s' % (output_dir, hashvalue)

    if not os.path.exists(imgdir):
        # makedirs (rather than mkdir) so that an output_dir with missing
        # parent directories does not raise.
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        # Create directory structure: one sub-directory per distinct label.
        # NOTE(review): imgdir is created before any download happens, so
        # if a download raises partway through, later calls will see the
        # directory and treat the incomplete set as cached -- confirm
        # whether cleanup on failure is wanted.
        os.mkdir(imgdir)
        os.mkdir("%s/base/" % imgdir)
        for label in dataset["Label"].unique():
            os.mkdir("%s/base/%s" % (imgdir, label))

        # Download images
        for row in dataset.itertuples():
            name = row.Character
            search_term = "%s %s" % (base_search_term, name)
            pattern = "%s/base/%s/%s" % (imgdir, row.Label, download.pathify(name))
            url = download.generate_search_url(search_term, **search_opts)
            download.download_images(url, pattern, download_count)

        filter_imageset(imgdir)
        generate_image_transforms(imgdir)

    return imgdir