Exemple #1
0
    def test_domain_filter(self):
        # Passing domain= must fold a URL-encoded "site:<domain>" restriction
        # into the same 'q' parameter that carries the search keyword.
        search_url = download.generate_search_url("wibble",
                                                  domain='example.com')
        endpoint, query_string = search_url.split('?')

        self.assertEqual(endpoint, "https://www.google.com/search")

        query_args = query_string.split('&')
        site_filter_present = 'q=wibble+site%3Aexample.com' in query_args
        if not site_filter_present:
            self.fail("Domain filtering not set appropriately")
Exemple #2
0
    def test_google_url_gen(self):
        # A bare keyword search must point at the Google search endpoint,
        # carry tbm=isch and carry the keyword in 'q'.
        search_url = download.generate_search_url('wibble')
        endpoint, query_string = search_url.split('?')

        self.assertEqual(endpoint, "https://www.google.com/search")

        query_args = query_string.split('&')
        expectations = (
            ('tbm=isch', "Image search type not set appropriately"),
            ('q=wibble', "Search keyword not set appropriately"),
        )
        # Checked in order; the first missing parameter fails the test.
        for param, complaint in expectations:
            if param not in query_args:
                self.fail(complaint)
Exemple #3
0
    def test_image_style(self):
        # style='clipart' must add the URL-encoded "itp:clipart" value in
        # 'tbs' on top of the usual parameters; an unknown style must raise
        # ValueError.
        search_url = download.generate_search_url("wibble", style='clipart')
        endpoint, query_string = search_url.split('?')

        self.assertEqual(endpoint, "https://www.google.com/search")

        query_args = query_string.split('&')
        expectations = (
            ('tbm=isch', "Image search type not set appropriately"),
            ('q=wibble', "Search keyword not set appropriately"),
            ('tbs=itp%3Aclipart', "Image style not set appropriately"),
        )
        # Checked in order; the first missing parameter fails the test.
        for param, complaint in expectations:
            if param not in query_args:
                self.fail(complaint)

        with self.assertRaises(ValueError):
            download.generate_search_url("wibble", style='broken')
Exemple #4
0
    def test_download(self):
        # Downloads three "squirrel" images into a scratch directory and
        # checks that a file was written for each index 000..002.
        # NOTE(review): presumably needs network access, since it goes
        # through download.download_images() with a search URL -- confirm.
        dl_dir = "./unittest-images/"
        prefix = os.path.join(dl_dir, "squirrel")

        # Start from an empty scratch directory, even if an earlier run
        # left one behind.
        if os.path.exists(dl_dir):
            shutil.rmtree(dl_dir)

        os.mkdir(dl_dir)
        try:
            url = download.generate_search_url("squirrel")
            download.download_images(url, prefix, 3)
            for i in range(3):
                # Either a .jpeg or a .png is accepted for each index.
                candidates = (
                    '%s.%03i.jpeg' % (prefix, i),
                    '%s.%03i.png' % (prefix, i),
                )
                # Fail only when NONE of the candidates exists. The previous
                # for/else had this inverted: it passed when a file was
                # missing and failed when both were present.
                if not any(os.path.exists(path) for path in candidates):
                    self.fail("Image file '%s' does not exist"
                              % "' or '".join(candidates))
        finally:
            # Always remove the scratch directory, pass or fail.
            shutil.rmtree(dl_dir)
def prepare_imageset(dataset,
                     base_search_term,
                     search_opts,
                     output_dir,
                     download_count=100):
    ''' pipeline.prepare_imageset(...) -> imagedir

    Downloads a base set of images corresponding to the entries in
    'dataset', if they're not already cached. Returns the directory
    containing the ImageFolder-structured tree of images.

    The cache directory is '<output_dir>/<hash>', where the hash comes
    from generate_hash(dataset, base_search_term, search_opts). If that
    directory already exists it is returned untouched. Otherwise the
    tree '<imgdir>/base/<Label>/' is created, images are downloaded for
    every row of 'dataset', and filter_imageset() and
    generate_image_transforms() are run on the result.

    Arguments:
      dataset          -- table with 'Label' and 'Character' columns; it
                          is indexed as dataset["Label"] and iterated
                          with itertuples() (presumably a pandas
                          DataFrame -- confirm against callers)
      base_search_term -- text prepended to each row's Character to form
                          the search term
      search_opts      -- mapping of keyword arguments forwarded to
                          download.generate_search_url()
      output_dir       -- parent directory of the cache; created, with
                          any missing parents, if it does not exist
      download_count   -- forwarded to download.download_images() for
                          every row

    If anything fails while the cache is being built, the partially
    populated directory is removed before the exception propagates, so
    a later call cannot mistake it for a complete cache.
    '''
    # Imported locally: only the failure path needs it, and this keeps
    # the function independent of the module's import list.
    import shutil

    hashvalue = generate_hash(dataset, base_search_term, search_opts)
    imgdir = '%s/%s' % (output_dir, hashvalue)

    # Cache hit: the directory's existence is the only completeness marker.
    if os.path.exists(imgdir):
        return imgdir

    if not os.path.exists(output_dir):
        # makedirs (not mkdir) so a nested output_dir works too.
        os.makedirs(output_dir)

    # Created outside the try block: if this mkdir fails, the directory
    # is not ours to clean up.
    os.mkdir(imgdir)
    try:
        # Create directory structure
        os.mkdir("%s/base/" % imgdir)
        for label in dataset["Label"].unique():
            os.mkdir("%s/base/%s" % (imgdir, label))

        # Download images
        for row in dataset.itertuples():
            name = row.Character
            search_term = "%s %s" % (base_search_term, name)
            pattern = "%s/base/%s/%s" % (imgdir, row.Label,
                                         download.pathify(name))
            url = download.generate_search_url(search_term, **search_opts)
            download.download_images(url, pattern, download_count)

        filter_imageset(imgdir)
        generate_image_transforms(imgdir)
    except BaseException:
        # Includes KeyboardInterrupt: an interrupted build must not leave
        # a half-built tree that the next call would treat as a valid
        # cache.
        shutil.rmtree(imgdir, ignore_errors=True)
        raise

    return imgdir