def CASAS_download(directory, datasets):
    """Download CASAS datasets to directory

    Args:
        directory (:obj:`str`): path to directory to store the downloaded
        datasets (:obj:`tuple` of :obj:`str`): list of datasets to download
    """
    for dataset in datasets:
        filename = dataset_dict.get(dataset, None)
        if filename is None:
            print('Cannot find dataset %s' % dataset)
            print('Here are the available datasets:')
            for key in dataset_dict.keys():
                print(' * %s' % key)
        else:
            # Download zipped files
            default_downloader(directory=directory,
                               urls=[master_url + filename],
                               filenames=[filename], clear=False)
            # Expand it in place
            file_path = os.path.join(directory, filename)
            if os.path.exists(file_path):
                with zipfile.ZipFile(file_path, 'r') as zip_ref:
                    zip_ref.extractall(directory)

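# Hypothetical usage sketch; dataset_dict (mapping dataset names to zip
# filenames) and master_url are assumed to be module-level globals, and the
# dataset names below are placeholders:
# CASAS_download('/tmp/casas', ('hh101', 'hh102'))
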
def test_convert(self):
    tempdir = self.tempdir
    with remember_cwd():
        os.chdir(tempdir)
        assert_raises(IOError, adult.convert_adult,
                      directory=tempdir, output_directory=tempdir)
        default_downloader(
            directory=tempdir,
            urls=['https://archive.ics.uci.edu/ml/'
                  'machine-learning-databases/adult/adult.data',
                  'https://archive.ics.uci.edu/ml/'
                  'machine-learning-databases/adult/adult.test'],
            filenames=['adult.data', 'adult.test'])
        adult.convert_adult(directory=tempdir, output_directory=tempdir)
    output_file = os.path.join(tempdir, "adult.hdf5")
    with h5py.File(output_file, 'r') as h5:
        assert h5['features'].shape == (30162 + 15060, 104)
        assert h5['targets'].shape[0] == h5['features'].shape[0]

def test_default_downloader_clear(self):
    file_path = os.path.join(self.tempdir, 'tmp.data')
    open(file_path, 'a').close()
    args = dict(directory=self.tempdir, clear=True,
                urls=[None], filenames=['tmp.data'])
    default_downloader(**args)
    assert not os.path.isfile(file_path)

def test_default_downloader_save_no_filename(self):
    args = dict(directory=self.tempdir, clear=False,
                urls=[mock_url], filenames=[None])
    default_downloader(**args)
    with open(self.filepath, 'rb') as f:
        assert_equal(f.read(), mock_content)

def svhn_downloader(which_format, directory, clear=False):
    suffix = {1: '.tar.gz', 2: '_32x32.mat'}[which_format]
    sets = ['train', 'test', 'extra']
    default_downloader(
        directory=directory,
        urls=[None for f in sets],
        filenames=['{}{}'.format(s, suffix) for s in sets],
        url_prefix='http://ufldl.stanford.edu/housenumbers/',
        clear=clear)

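# Hypothetical call: with urls=[None], default_downloader joins each filename
# onto url_prefix to build the download URL, so this would fetch the
# train/test/extra splits in the cropped-digit .mat format (which_format=2):
# svhn_downloader(which_format=2, directory='/tmp/svhn')
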
def silhouettes_downloader(size, **kwargs):
    if size not in (16, 28):
        raise ValueError("size must be 16 or 28")
    actual_filename = FILENAME.format(size)
    actual_url = BASE_URL + actual_filename
    default_downloader(urls=[actual_url], filenames=[actual_filename],
                       **kwargs)

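# Hypothetical call, assuming FILENAME is a module-level filename template
# taking the size and BASE_URL is the dataset's host URL:
# silhouettes_downloader(28, directory='/tmp/silhouettes', clear=False)
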
def test_default_downloader_save_no_url_url_prefix(self):
    iris_path = os.path.join(self.tempdir, 'iris.data')
    args = dict(directory=self.tempdir, clear=False,
                urls=[None], filenames=['iris.data'],
                url_prefix=iris_url[:-9])
    default_downloader(**args)
    with open(iris_path, 'r') as f:
        assert hashlib.md5(
            f.read().encode('utf-8')).hexdigest() == iris_hash

def test_default_downloader_save_no_filename(self):
    iris_path = os.path.join(self.tempdir, 'iris.data')
    args = DummyArgs(directory=self.tempdir, clear=False,
                     urls=[iris_url], filenames=[None])
    default_downloader(args)
    with open(iris_path, 'r') as f:
        assert hashlib.md5(
            f.read().encode('utf-8')).hexdigest() == iris_hash
    os.remove(iris_path)

def ensure_dataset_ready(basename, version, url_dir, kerosenedir):
    # setup names
    filename, url = paths_from_metadata(basename, version, url_dir)
    filetarget = os.path.join(kerosenedir, filename)

    # if the file is not present, download it (this also creates
    # directories if needed)
    if not os.path.isfile(filetarget):
        default_downloader(kerosenedir, [url], [filename])

    # override fuel's centralized location temporarily
    fuel.config.data_path = kerosenedir
    return filename

def ensure_dataset_ready(basename, version, url_dir):
    # setup names
    filename, url = paths_from_metadata(basename, version, url_dir)
    kerosenedir = os.path.expanduser(
        os.path.join('~', '.kerosene', 'datasets'))
    filetarget = os.path.join(kerosenedir, filename)

    # if the file is not present, download it (this also creates
    # directories if needed)
    if not os.path.isfile(filetarget):
        default_downloader(kerosenedir, [url], [filename])

    # override fuel's centralized location temporarily
    fuel.config.data_path = kerosenedir
    return filename

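# Hypothetical usage; paths_from_metadata is assumed to return a
# (filename, url) pair for the requested version. After the call,
# fuel.config.data_path points at ~/.kerosene/datasets, so Fuel can resolve
# the returned filename:
# filename = ensure_dataset_ready('mnist', '1.0',
#                                 'https://example.org/datasets')
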
def download_model(model_name):
    # see if this is a known model
    if model_name not in model_download_table:
        print("Failure: unknown model {}".format(model_name))
        sys.exit(1)

    # resolve url
    model_url = model_download_table[model_name]
    platzoo_dir = get_platzoo_dir()
    local_gz_filename = model_url.split("/")[-1]
    temp_dir = tempfile.mkdtemp()

    # download
    default_downloader(temp_dir, [model_url], [local_gz_filename])

    if local_gz_filename.endswith(".gz"):
        local_filename = local_gz_filename[:-3]
    else:
        local_filename = "{}.2".format(local_gz_filename)

    # convert to absolute paths
    final_local_filepath = os.path.join(platzoo_dir, local_filename)
    final_local_linkpath = os.path.join(platzoo_dir, model_name)
    temp_gz_filepath = os.path.join(temp_dir, local_gz_filename)
    temp_filepath = os.path.join(temp_dir, local_filename)

    # decompress the file to a temporary location
    print("Decompressing {}".format(model_name))
    with open(temp_filepath, 'wb') as f_out, \
            gzip.open(temp_gz_filepath, 'rb') as f_in:
        shutil.copyfileobj(f_in, f_out)

    # atomic rename (prevents half-downloaded files)
    print("Installing {}".format(model_name))
    os.rename(temp_filepath, final_local_filepath)

    # symlink, removing the old link first if necessary
    if os.path.exists(final_local_linkpath):
        os.remove(final_local_linkpath)
    os.symlink(local_filename, final_local_linkpath)

    # cleanup temp directory
    # TODO: try/catch the download for failure cleanup
    shutil.rmtree(temp_dir)

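# Design note: the model is downloaded and decompressed in a scratch
# directory, renamed into the zoo directory, and only then exposed via a
# symlink named after the model, so readers of the link never observe a
# partially written file. Hypothetical call (valid names depend on
# model_download_table):
# download_model('vgg16')
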
def test_download_and_convert(self):
    tempdir = self.tempdir
    cwd = os.getcwd()
    os.chdir(tempdir)
    assert_raises(IOError, iris.convert_iris,
                  directory=tempdir, output_directory=tempdir)
    default_downloader(
        directory=tempdir,
        urls=['https://archive.ics.uci.edu/ml/machine-learning-databases/'
              'iris/iris.data'],
        filenames=['iris.data'])
    classes = {b'Iris-setosa': 0, b'Iris-versicolor': 1,
               b'Iris-virginica': 2}
    data = numpy.loadtxt(os.path.join(tempdir, 'iris.data'),
                         converters={4: lambda x: classes[x]},
                         delimiter=',')
    features = data[:, :-1].astype('float32')
    targets = data[:, -1].astype('uint8').reshape((-1, 1))
    iris.convert_iris(directory=tempdir, output_directory=tempdir)
    os.chdir(cwd)
    output_file = os.path.join(tempdir, "iris.hdf5")
    with h5py.File(output_file, 'r') as h5:
        assert numpy.allclose(h5['features'], features)
        assert numpy.allclose(h5['targets'], targets)

def test_download_and_convert(self):
    tempdir = self.tempdir
    with remember_cwd():
        os.chdir(tempdir)
        assert_raises(IOError, iris.convert_iris,
                      directory=tempdir, output_directory=tempdir)
        default_downloader(
            directory=tempdir,
            urls=['https://archive.ics.uci.edu/ml/'
                  'machine-learning-databases/iris/iris.data'],
            filenames=['iris.data'])
        classes = {b'Iris-setosa': 0, b'Iris-versicolor': 1,
                   b'Iris-virginica': 2}
        data = numpy.loadtxt(
            os.path.join(tempdir, 'iris.data'),
            converters={4: lambda x: classes[x]},
            delimiter=',')
        features = data[:, :-1].astype('float32')
        targets = data[:, -1].astype('uint8').reshape((-1, 1))
        iris.convert_iris(directory=tempdir, output_directory=tempdir)
    output_file = os.path.join(tempdir, "iris.hdf5")
    with h5py.File(output_file, 'r') as h5:
        assert numpy.allclose(h5['features'], features)
        assert numpy.allclose(h5['targets'], targets)

def downloader_wrapper(format, directory, **kwargs):
    # add the right format file to the front of the download list
    files.insert(0, "{}.tgz".format(resolve_filename(format)))
    urls = ['http://vis-www.cs.umass.edu/lfw/' + s for s in files]
    default_downloader(directory, urls=urls, filenames=files, **kwargs)

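# Note: `files` appears to be a module-level list that this wrapper mutates
# in place, so each call prepends another archive to the download list.
# Hypothetical call (resolve_filename is assumed to map a format name to an
# LFW archive basename):
# downloader_wrapper('deepfunneled', '/tmp/lfw', clear=False)
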
def test_default_downloader_save_no_url_url_prefix(self):
    args = dict(directory=self.tempdir, clear=False,
                urls=[None], filenames=[mock_filename],
                url_prefix=mock_url[:-9])
    default_downloader(**args)
    with open(self.filepath, 'rb') as f:
        assert_equal(f.read(), mock_content)

@classmethod
def download(cls, directory=None):
    if not directory:
        directory = os.getcwd()
    return default_downloader(directory, cls.urls, cls.filenames)

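# Hypothetical usage, assuming this method lives on a dataset class exposing
# parallel `urls` and `filenames` class attributes:
# SomeDataset.download()           # fetch into the current directory
# SomeDataset.download('/tmp/d')   # or into an explicit directory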