def download(self):
    """Download every URL in ``self.urls`` into ``self.root`` and unzip archives.

    No-op when ``self.check_exists()`` reports the data is already present.
    ``.zip`` files are extracted into ``self.root`` and the archive is then
    deleted; any other file type is kept as downloaded.
    """
    # BUG FIX: a bare ``import urllib`` does not load the ``request``
    # submodule on Python 3, so ``urllib.request.urlopen`` would raise
    # AttributeError unless another module had imported it first.
    import urllib.request
    import zipfile

    if self.check_exists():
        return

    # Create the target directory; tolerate it already existing.
    try:
        os.makedirs(self.root)
    except OSError as e:
        if e.errno == errno.EEXIST:
            pass
        else:
            raise

    for url in self.urls:
        print('Downloading ' + url)
        data = urllib.request.urlopen(url)
        filename = url.rpartition('/')[2]
        file_path = os.path.join(self.root, filename)
        ext = os.path.splitext(file_path)[1]
        with open(file_path, 'wb') as f:
            f.write(data.read())
        if ext == '.zip':
            # Unzip in place, then drop the archive to save space.
            with zipfile.ZipFile(file_path) as zip_f:
                zip_f.extractall(self.root)
            os.unlink(file_path)
    print('Done!')
def download(self):
    """Download the EMNIST data if it doesn't exist in processed_folder already."""
    from six.moves import urllib
    import gzip
    import shutil
    import zipfile

    # Nothing to do when the processed files are already cached.
    if self._check_exists():
        return

    # download files
    try:
        os.makedirs(os.path.join(self.root, self.raw_folder))
        os.makedirs(os.path.join(self.root, self.processed_folder))
    except OSError as e:
        # Directories may already exist from an earlier partial run.
        if e.errno == errno.EEXIST:
            pass
        else:
            raise

    # EMNIST ships as a single zip that itself contains a 'gzip' folder of
    # per-split .gz idx files.
    print('Downloading ' + self.url)
    data = urllib.request.urlopen(self.url)
    filename = self.url.rpartition('/')[2]
    raw_folder = os.path.join(self.root, self.raw_folder)
    file_path = os.path.join(raw_folder, filename)
    with open(file_path, 'wb') as f:
        f.write(data.read())
    print('Extracting zip archive')
    with zipfile.ZipFile(file_path) as zip_f:
        zip_f.extractall(raw_folder)
    # The outer zip is no longer needed once extracted.
    os.unlink(file_path)
    # Decompress every .gz idx file next to the raw folder, then drop the
    # intermediate 'gzip' directory entirely.
    gzip_folder = os.path.join(raw_folder, 'gzip')
    for gzip_file in os.listdir(gzip_folder):
        if gzip_file.endswith('.gz'):
            print('Extracting ' + gzip_file)
            with open(os.path.join(raw_folder, gzip_file.replace('.gz', '')), 'wb') as out_f, \
                    gzip.GzipFile(os.path.join(gzip_folder, gzip_file)) as zip_f:
                out_f.write(zip_f.read())
    shutil.rmtree(gzip_folder)

    # process and save as torch files
    # One (train, test) tensor pair is cached per EMNIST split (byclass,
    # bymerge, ... — whatever self.splits contains).
    for split in self.splits:
        print('Processing ' + split)
        training_set = (
            read_image_file(os.path.join(raw_folder, 'emnist-{}-train-images-idx3-ubyte'.format(split))),
            read_label_file(os.path.join(raw_folder, 'emnist-{}-train-labels-idx1-ubyte'.format(split)))
        )
        test_set = (
            read_image_file(os.path.join(raw_folder, 'emnist-{}-test-images-idx3-ubyte'.format(split))),
            read_label_file(os.path.join(raw_folder, 'emnist-{}-test-labels-idx1-ubyte'.format(split)))
        )
        with open(os.path.join(self.root, self.processed_folder, self._training_file(split)), 'wb') as f:
            torch.save(training_set, f)
        with open(os.path.join(self.root, self.processed_folder, self._test_file(split)), 'wb') as f:
            torch.save(test_set, f)
    print('Done!')
def save_dataset(self):
    """Download the raw MNIST-style idx archives and cache train/test tensors.

    Skips all work when ``self._check_exists(self.processed_folder)`` reports
    the processed files are already present.
    """
    # FIX: urllib was used below without any local import, unlike every
    # sibling download method in this file; import it the same way they do
    # so the method does not depend on an incidental module-level import.
    from six.moves import urllib
    import gzip

    if self._check_exists(self.processed_folder):
        return

    # download files
    try:
        os.makedirs(os.path.join(self.root, self.raw_folder))
        os.makedirs(os.path.join(self.root, self.processed_folder))
    except OSError as e:
        if e.errno == errno.EEXIST:
            pass
        else:
            raise

    for url in self.urls:
        print('Downloading ' + url)
        data = urllib.request.urlopen(url)
        filename = url.rpartition('/')[2]
        file_path = os.path.join(self.root, self.raw_folder, filename)
        with open(file_path, 'wb') as f:
            f.write(data.read())
        # Gunzip next to the archive, then remove the .gz file.
        with open(file_path.replace('.gz', ''), 'wb') as out_f, \
                gzip.GzipFile(file_path) as zip_f:
            out_f.write(zip_f.read())
        os.unlink(file_path)

    # process and save as torch files
    print('Processing...')
    training_set = (read_image_file(
        os.path.join(self.root, self.raw_folder, 'train-images-idx3-ubyte')),
                    read_label_file(
        os.path.join(self.root, self.raw_folder, 'train-labels-idx1-ubyte')))
    test_set = (read_image_file(
        os.path.join(self.root, self.raw_folder, 't10k-images-idx3-ubyte')),
                read_label_file(
        os.path.join(self.root, self.raw_folder, 't10k-labels-idx1-ubyte')))
    with open(
            os.path.join(self.root, self.processed_folder,
                         self.training_file), 'wb') as f:
        torch.save(training_set, f)
    with open(
            os.path.join(self.root, self.processed_folder, self.test_file),
            'wb') as f:
        torch.save(test_set, f)
    print('Done!')
def download(self):
    """Download the Moving MNIST data if it doesn't exist in processed_folder already."""
    from six.moves import urllib
    import gzip

    if self._check_exists():
        return

    # download files
    try:
        os.makedirs(os.path.join(self.root, self.raw_folder))
        os.makedirs(os.path.join(self.root, self.processed_folder))
    except OSError as e:
        if e.errno == errno.EEXIST:
            pass
        else:
            raise

    for url in self.urls:
        print('Downloading ' + url)
        data = urllib.request.urlopen(url)
        filename = url.rpartition('/')[2]
        file_path = os.path.join(self.root, self.raw_folder, filename)
        with open(file_path, 'wb') as f:
            f.write(data.read())
        with open(file_path.replace('.gz', ''), 'wb') as out_f, \
                gzip.GzipFile(file_path) as zip_f:
            out_f.write(zip_f.read())
        os.unlink(file_path)

    # process and save as torch files
    print('Processing...')
    # PERF FIX: the original loaded the (large) sequence file twice, once
    # per split; load it once and slice the same array instead.  The first
    # (time) and second (sequence) axes are swapped so indexing by sequence
    # comes first, then the last ``self.split`` sequences are held out for
    # testing.
    sequences = np.load(
        os.path.join(self.root, self.raw_folder,
                     'mnist_test_seq.npy')).swapaxes(0, 1)
    training_set = torch.from_numpy(sequences[:-self.split])
    test_set = torch.from_numpy(sequences[-self.split:])
    with open(
            os.path.join(self.root, self.processed_folder,
                         self.training_file), 'wb') as f:
        torch.save(training_set, f)
    with open(
            os.path.join(self.root, self.processed_folder, self.test_file),
            'wb') as f:
        torch.save(test_set, f)
    print('Done!')
def _download_files(self):
    """Fetch every URL in ``self.urls`` into the raw folder, skipping any
    file that is already present (as a .zip or already unzipped)."""
    for _, url in self.urls.items():
        name = url.rpartition('/')[2]
        target = join(self.root, self.raw_folder, name)
        already_present = (os.path.exists(target)
                           or os.path.exists(target.replace('.zip', '')))
        if already_present:
            continue
        print('Downloading ' + url)
        response = urllib.request.urlopen(url)
        with open(target, 'wb') as f:
            f.write(response.read())
def download(self):
    """Download the MusicNet data if it doesn't exist in ``raw_folder`` already."""
    from six.moves import urllib
    import gzip  # NOTE(review): gzip appears unused in this method — confirm before removing

    if self._check_exists():
        return

    # download files
    try:
        os.makedirs(os.path.join(self.root, self.raw_folder))
    except OSError as e:
        if e.errno == errno.EEXIST:
            pass
        else:
            raise

    filename = self.url.rpartition('/')[2]
    file_path = os.path.join(self.root, self.raw_folder, filename)
    if not os.path.exists(file_path):
        print('Downloading ' + self.url)
        data = urllib.request.urlopen(self.url)
        with open(file_path, 'wb') as f:
            # stream the download to disk (it might not fit in memory!)
            while True:
                chunk = data.read(16*1024)
                if not chunk:
                    break
                f.write(chunk)

    # Extract only when at least one expected folder is missing; extraction
    # shells out to the system ``tar`` and strips the top-level directory.
    if not all(map(lambda f: os.path.exists(os.path.join(self.root, f)),
                   self.extracted_folders)):
        print('Extracting ' + filename)
        if call(["tar", "-xf", file_path, '-C', self.root, '--strip', '1']) != 0:
            raise OSError("Failed tarball extraction")

    # process and save as torch files
    # NOTE(review): despite the MusicNet docstring, the processing step
    # pickles label *trees* via process_labels — confirm the docstring
    # matches this dataset's actual contents.
    print('Processing...')
    self.process_data(self.test_data)
    trees = self.process_labels(self.test_labels)
    with open(os.path.join(self.root, self.test_labels, self.test_tree), 'wb') as f:
        pickle.dump(trees, f)
    self.process_data(self.train_data)
    trees = self.process_labels(self.train_labels)
    with open(os.path.join(self.root, self.train_labels, self.train_tree), 'wb') as f:
        pickle.dump(trees, f)
    print('Download Complete')
def download(self):
    """Download the REDE data if it doesn't exist in `processed_folder` already."""
    from six.moves import urllib

    if self._check_exists():
        return

    # Ensure both target directories exist, tolerating pre-existing ones.
    for sub in (self.raw_folder, self.processed_folder):
        try:
            os.makedirs(os.path.join(self.root, sub))
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise

    # Fetch every source file into the raw folder.
    for url in self.urls:
        print('Downloading ' + url)
        response = urllib.request.urlopen(url)
        target = os.path.join(self.root, self.raw_folder,
                              url.rpartition('/')[2])
        with open(target, 'wb') as f:
            f.write(response.read())

    # Process and save as torch files.
    print('Processing...')
    images = read_file(
        os.path.join(self.root, self.raw_folder, '1848-62-111-images.pkl'))
    parameters = read_file(
        os.path.join(self.root, self.raw_folder, '1848-5-parameters.pkl'))
    full_set = (images, parameters)
    with open(
            os.path.join(self.root, self.processed_folder, self.full_file),
            'wb') as f:
        torch.save(full_set, f)
    print('Done!')
def download(self):
    """Download the Vinyals split files and the image archives, unzipping
    the archives into ``processed_folder``.

    No-op when ``self._check_exists()`` is true.
    """
    from six.moves import urllib
    import zipfile

    if self._check_exists():
        return

    try:
        os.makedirs(os.path.join(self.root, self.splits_folder))
        os.makedirs(os.path.join(self.root, self.raw_folder))
        os.makedirs(os.path.join(self.root, self.processed_folder))
    except OSError as e:
        if e.errno == errno.EEXIST:
            pass
        else:
            raise

    # Split definition files go straight into the splits folder.
    for k, url in self.vinyals_split_sizes.items():
        print('== Downloading ' + url)
        data = urllib.request.urlopen(url)
        filename = url.rpartition('/')[-1]
        file_path = os.path.join(self.root, self.splits_folder, filename)
        with open(file_path, 'wb') as f:
            f.write(data.read())

    # Image archives are downloaded to raw and extracted into processed.
    for url in self.urls:
        print('== Downloading ' + url)
        data = urllib.request.urlopen(url)
        filename = url.rpartition('/')[2]
        file_path = os.path.join(self.root, self.raw_folder, filename)
        with open(file_path, 'wb') as f:
            f.write(data.read())
        file_processed = os.path.join(self.root, self.processed_folder)
        print("== Unzip from " + file_path + " to " + file_processed)
        # IDIOM: context manager guarantees the archive handle is closed
        # even if extraction raises (original closed it manually).
        with zipfile.ZipFile(file_path, 'r') as zip_ref:
            zip_ref.extractall(file_processed)
    print("Download finished.")
def download(self):
    """Download the notMNIST-small .mat file and cache it as a torch tensor
    pair (images, labels); does nothing when already processed."""
    from six.moves import urllib

    if self._check_exists():
        return

    # Create target directories, tolerating pre-existing ones.
    try:
        os.makedirs(os.path.join(self.root, self.raw_folder))
        os.makedirs(os.path.join(self.root, self.processed_folder))
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    mat_path = os.path.join(self.root, self.raw_folder, 'notMNIST_small.mat')
    if not os.path.exists(mat_path):
        for url in self.urls:
            print('Downloading ' + url)
            response = urllib.request.urlopen(url)
            target = os.path.join(self.root, self.raw_folder,
                                  url.rpartition('/')[2])
            with open(target, 'wb') as f:
                f.write(response.read())

    # process and save as torch files
    print('Processing...')
    import scipy.io as sio
    mat = sio.loadmat(mat_path)
    # The data is stored as HxWxN, need to permute!
    images = torch.ByteTensor(mat['images']).permute(2, 0, 1)
    labels = torch.LongTensor(mat['labels'])
    data_set = (images, labels)
    with open(
            os.path.join(self.root, self.processed_folder,
                         self.training_file), 'wb') as f:
        torch.save(data_set, f)
    print('Done!')
def download_url(url, folder, log=True):
    """Download ``url`` into ``folder`` and return the local file path.

    The file keeps the name of the last URL path segment; ``folder`` is
    created if needed.  Set ``log=False`` to suppress the progress print.
    """
    if log:
        print('Downloading', url)
    makedirs(folder)
    response = urllib.request.urlopen(url)
    name = url.rpartition('/')[2]
    out_path = osp.join(folder, name)
    with open(out_path, 'wb') as f:
        f.write(response.read())
    return out_path
def download(self):
    """Download the CamVid data if it doesn't exist in processed_folder already."""
    self.raw_folder.mkdir(exist_ok=True, parents=True)
    self.processed_folder.mkdir(exist_ok=True, parents=True)

    print(f'Downloading {self.urls["raw"]}')
    data = urllib.request.urlopen(self.urls["raw"])
    # BUG FIX: the archive is binary, so the temp file must be opened in
    # 'wb' — text mode ('w') raises TypeError on bytes.  Also flush before
    # zipfile re-opens the file by name so all bytes are on disk.
    with tempfile.NamedTemporaryFile('wb') as tmp:
        tmp.write(data.read())
        tmp.flush()
        with zipfile.ZipFile(tmp.name) as zip_f:
            zip_f.extractall(self.raw_folder)

    print(f'Downloading {self.urls["labels"]}')
    data = urllib.request.urlopen(self.urls["labels"])
    with tempfile.NamedTemporaryFile('wb') as tmp:
        tmp.write(data.read())
        tmp.flush()
        with zipfile.ZipFile(tmp.name) as zip_f:
            zip_f.extractall(self.raw_folder / 'LabeledApproved_full')

    print(f'Downloading {self.urls["classes"]}')
    data = urllib.request.urlopen(self.urls["classes"])
    with open(self.processed_folder / 'label_colors.txt', 'wb') as class_list:
        class_list.write(data.read())

    # process and save as torch files
    print('Processing...')
    self.class_to_idx, colours = self.read_label_file(
        self.processed_folder / 'label_colors.txt')
    with h5py.File(self.training_file, 'w') as f_train, \
            h5py.File(self.test_file, 'w') as f_test:
        self.process_raw_image_files(
            self.raw_folder / '701_StillsRaw_full', f_train, f_test)
        self.process_label_image_files(
            self.raw_folder / 'LabeledApproved_full', colours, f_train, f_test)
    print('Done!')
def download(self):
    """Download the rar files data if it doesn't exist in processed_folder already."""
    from six.moves import urllib
    import rarfile

    if self._check_exists():
        return

    # download files
    try:
        os.makedirs(os.path.join(self.root, self.raw_folder))
        os.makedirs(os.path.join(self.root, self.processed_folder))
    except OSError as e:
        if e.errno == errno.EEXIST:
            pass
        else:
            raise

    for url in self.urls:
        print('Downloading ' + url)
        data = urllib.request.urlopen(url)
        filename = url.rpartition('/')[-1]
        file_path = os.path.join(self.root, self.raw_folder, filename)
        with open(file_path, 'wb') as f:
            f.write(data.read())
        # BUG FIX: extract into root/raw_folder like every other path in
        # this method — the bare ``self.raw_folder`` resolved relative to
        # the current working directory, not to ``self.root``.
        with rarfile.RarFile(file_path) as rar_f:
            rar_f.extractall(os.path.join(self.root, self.raw_folder))
        # NOTE: the archives are deliberately kept — read_rar_file below
        # reads them directly.

    # process and save as torch files
    print('Processing...')
    train_set = read_rar_file(
        rarfile.RarFile(
            os.path.join(self.root, self.raw_folder,
                         'experimental_dataset_2013.rar')))
    test_set = read_rar_file(
        rarfile.RarFile(
            os.path.join(self.root, self.raw_folder,
                         'icdar2013_benchmarking_dataset.rar')))
    with open(
            os.path.join(self.root, self.processed_folder,
                         self.training_file), 'wb') as f:
        torch.save(train_set, f)
    with open(
            os.path.join(self.root, self.processed_folder, self.test_file),
            'wb') as f:
        torch.save(test_set, f)
    print('Done!')
def load_l8(path, use_hr, use_mr, use_lr):
    """Read the selected Landsat-8 band groups from ``path`` as float32.

    Band indices are gathered from the HR/MR/LR constants according to the
    flags, sorted, read in one pass, clipped to [0, 1] and normalized.
    NOTE(review): the 0..1 clip assumes reflectance-scaled input — confirm.
    """
    selected = []
    if use_hr:
        selected += L8_BANDS_HR
    if use_mr:
        selected += L8_BANDS_MR
    if use_lr:
        selected += L8_BANDS_LR
    with rasterio.open(path) as src:
        img = src.read(sorted(selected))
    img = np.clip(img.astype(np.float32), 0, 1)
    return normalize_L8(img)
def load_s2(path, use_hr, use_mr, use_lr):
    """Read the selected Sentinel-2 band groups from ``path`` as float32.

    Band indices are gathered from the HR/MR/LR constants according to the
    flags, sorted, read in one pass, clipped to [0, 10000] and normalized.
    """
    selected = []
    if use_hr:
        selected += S2_BANDS_HR
    if use_mr:
        selected += S2_BANDS_MR
    if use_lr:
        selected += S2_BANDS_LR
    with rasterio.open(path) as src:
        img = src.read(sorted(selected))
    img = np.clip(img.astype(np.float32), 0, 10000)
    return normalize_S2(img)
def load_s1(path, imgTransform):
    """Read the two Sentinel-1 bands from ``path`` and stack them.

    Each band becomes float32 with NaNs replaced by 0 and values clipped to
    [-25, 0].  When no transform is supplied the bands are additionally
    rescaled into [0, 1] (divide by 25, then add 1).
    """
    with rasterio.open(path) as src:
        raw_bands = [src.read(1), src.read(2)]

    processed = []
    for band in raw_bands:
        band = np.clip(np.nan_to_num(band.astype(np.float32)), -25, 0)
        if not imgTransform:
            band /= 25
            band += 1
        processed.append(band)
    return np.stack(processed)
def download_data(url, save_path, filename='DD.zip'):
    """Download ``url`` into ``save_path``/``filename``.

    Utility that fetches raw data when it does not exist locally
    (docstring translated from the original Chinese).  ``save_path`` is
    created if it is missing.

    :param url: source URL to fetch.
    :param save_path: directory the file is written into.
    :param filename: name of the saved file; defaults to ``'DD.zip'`` for
        backward compatibility with existing callers.
    :return: True on completion.
    """
    print('downloading data from {}'.format(url))
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    data = urllib.request.urlopen(url)
    with open(os.path.join(save_path, filename), 'wb') as f:
        f.write(data.read())
    return True
def download(self):
    """Download, extract and cache the PhotoTour subset named ``self.name``."""
    from six.moves import urllib
    import zipfile

    print('\n-- Loading PhotoTour dataset: {}'.format(self.name))

    # Cached torch file already present — nothing to do.
    if self._check_exists():
        print('Found cached data {}'.format(self.data_file))
        return

    # download files
    try:
        os.makedirs(self.root)
    except OSError as e:
        if e.errno == errno.EEXIST:
            pass
        else:
            raise

    # Download + unzip only when the raw archive/data is missing.
    if not self._check_downloaded():
        url = self.urls[self.name]
        data = urllib.request.urlopen(url)
        filename = url.rpartition('/')[2]
        file_path = os.path.join(self.root, filename)
        print('Downloading {}\nDownloading {}\n\nIt might take while. '
              'Please grab yourself a coffee and relax.\n'.format(
                  url, file_path))
        with open(file_path, 'wb') as f:
            f.write(data.read())
        print('Extracting data {}\n'.format(self.data_down))
        with zipfile.ZipFile(file_path, 'r') as z:
            z.extractall(self.data_dir)
        # The archive is deleted once extracted.
        os.unlink(file_path)

    # process and save as torch files
    # The cached file bundles (images, info, match pairs) as one tuple.
    print('Caching data {}'.format(self.data_file))
    data_set = (read_image_file(self.data_dir, self.image_ext, self.size,
                                self.lens[self.name]),
                read_info_file(self.data_dir, self.info_file),
                read_matches_files(self.data_dir, self.matches_files))
    with open(self.data_file, 'wb') as f:
        torch.save(data_set, f)
def __init__(self, root, json, vocab, transform=None):
    """Set the path for images, captions and vocabulary wrapper.

    Args:
        root: image directory.
        json: coco annotation file path.  Parsed with ``ast.literal_eval``
            (a Python-literal dict) rather than the ``json`` module.
        vocab: vocabulary wrapper.
        transform: image transformer.
    """
    self.root = root
    with open(json, "r") as handle:
        annotations = ast.literal_eval(handle.read())
    self.coco = annotations
    self.ids = list(self.coco.keys())
    self.vocab = vocab
    self.transform = transform
def data_download(self):
    """Download the VCTK data if it doesn't exist in processed_folder already.

    Returns:
        (raw_abs_dir, dset_abs_path, processed_abs_dirs, splits) — the same
        tuple is returned both on the early-exit (cached) path and after a
        fresh download, so callers can rely on these paths either way.
    """
    from six.moves import urllib
    import tarfile

    raw_abs_dir = os.path.join(self.root, self.raw_folder)
    splits = ['train', 'valid', 'test']
    processed_abs_dirs = [os.path.join(self.root, self.processed_folder,
                                       split) for split in splits]
    dset_abs_path = os.path.join(self.root, self.raw_folder, self.dset_path)

    if self._check_exists():
        return raw_abs_dir, dset_abs_path, processed_abs_dirs, splits

    # download files
    try:
        os.makedirs(os.path.join(self.root, self.raw_folder))
    except OSError as e:
        if e.errno == errno.EEXIST:
            pass
        else:
            raise
    try:
        for processed_abs_dir in processed_abs_dirs:
            os.makedirs(processed_abs_dir)
    except OSError as e:
        if e.errno == errno.EEXIST:
            pass
        else:
            raise

    url = self.url
    print('Downloading ' + url)
    filename = url.rpartition('/')[2]
    file_path = os.path.join(self.root, self.raw_folder, filename)
    # Re-use a previously downloaded tarball when present.
    if not os.path.isfile(file_path):
        data = urllib.request.urlopen(url)
        with open(file_path, 'wb') as f:
            f.write(data.read())
    # Extract only when the dataset folder is missing.
    if not os.path.exists(dset_abs_path):
        with tarfile.open(file_path) as zip_f:
            zip_f.extractall(raw_abs_dir)
    else:
        print("Using existing raw folder")
    # Outside dev mode the tarball is removed after extraction.
    if not self.dev_mode:
        os.unlink(file_path)
    return raw_abs_dir, dset_abs_path, processed_abs_dirs, splits
def download(self):
    """Fetch the MNIST archives, unpack them, and cache train/test tensors.

    Does nothing when the processed files already exist.
    """
    from six.moves import urllib
    import gzip

    if self._check_exists():
        return

    raw_dir = os.path.join(self.root, self.raw_folder)
    processed_dir = os.path.join(self.root, self.processed_folder)
    try:
        os.makedirs(raw_dir)
        os.makedirs(processed_dir)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    # Download each archive, gunzip it next to itself, then drop the .gz.
    for url in self.urls:
        print('Downloading ' + url)
        response = urllib.request.urlopen(url)
        archive = os.path.join(raw_dir, url.rpartition('/')[2])
        with open(archive, 'wb') as f:
            f.write(response.read())
        with open(archive.replace('.gz', ''), 'wb') as out_f, \
                gzip.GzipFile(archive) as gz_f:
            out_f.write(gz_f.read())
        os.unlink(archive)

    # process and save as torch files
    print('Processing...')
    training_set = (
        read_image_file(os.path.join(raw_dir, 'train-images-idx3-ubyte')),
        read_label_file(os.path.join(raw_dir, 'train-labels-idx1-ubyte'))
    )
    test_set = (
        read_image_file(os.path.join(raw_dir, 't10k-images-idx3-ubyte')),
        read_label_file(os.path.join(raw_dir, 't10k-labels-idx1-ubyte'))
    )
    with open(os.path.join(processed_dir, self.training_file), 'wb') as f:
        torch.save(training_set, f)
    with open(os.path.join(processed_dir, self.test_file), 'wb') as f:
        torch.save(test_set, f)
    print('Done!')
def maybe_download(root):
    """Ensure the Omniglot background/evaluation splits exist under ``root``.

    Downloads and unzips the two archives when missing.

    Returns:
        dict mapping 'meta_train'/'meta_test' to the extracted directories.
    """
    from six.moves import urllib
    import zipfile

    processed_path = os.path.join(root, 'processed')
    splits_dirs = {
        'meta_train': os.path.join(processed_path, 'images_background'),
        'meta_test': os.path.join(processed_path, 'images_evaluation')
    }
    if check_exists(splits_dirs):
        return splits_dirs

    # download files
    data_urls = [
        'https://github.com/brendenlake/omniglot/raw/master/python/images_background.zip',
        'https://github.com/brendenlake/omniglot/raw/master/python/images_evaluation.zip'
    ]
    raw_folder = 'raw'
    processed_folder = 'processed'
    try:
        os.makedirs(os.path.join(root, raw_folder))
        os.makedirs(os.path.join(root, processed_folder))
    except OSError as e:
        if e.errno == errno.EEXIST:
            pass
        else:
            raise

    for url in data_urls:
        print('== Downloading ' + url)
        data = urllib.request.urlopen(url)
        filename = url.rpartition('/')[2]
        file_path = os.path.join(root, raw_folder, filename)
        with open(file_path, 'wb') as f:
            f.write(data.read())
        file_processed = os.path.join(root, processed_folder)
        print("== Unzip from " + file_path + " to " + file_processed)
        # IDIOM: context manager closes the archive even if extraction
        # raises (original closed it manually).
        with zipfile.ZipFile(file_path, 'r') as zip_ref:
            zip_ref.extractall(file_processed)
    print("Download finished.")
    return splits_dirs
def load_lc(path, no_savanna=False, igbp=True):
    """Load a land-cover raster and remap it to zero-based DFC2020 labels.

    With ``igbp`` the raw values index into ``DFC2020_CLASSES``; otherwise
    they are used as-is (cast to int64).  With ``no_savanna`` class 3 is
    folded into class 0 and higher classes shift down by one.  Finally the
    labels become zero-based, with former class 0 marked as ignore (255).
    """
    # load labels
    with rasterio.open(path) as src:
        labels = src.read(1)

    # convert IGBP to dfc2020 classes
    labels = np.take(DFC2020_CLASSES, labels) if igbp else labels.astype(np.int64)

    # adjust class scheme to ignore class savanna
    if no_savanna:
        labels[labels == 3] = 0
        labels[labels > 3] -= 1

    # convert to zero-based labels and set ignore mask
    labels -= 1
    labels[labels == -1] = 255
    return labels
def download(self):
    """Download the .mat file into ``self.root`` (no-op when already present)."""
    from six.moves import urllib
    import errno  # local import so the fix below cannot depend on module state

    if self._check_exists():
        return

    # download files
    try:
        os.makedirs(self.root)
    except OSError as e:
        # BUG FIX: was ``os.errno.EEXIST`` — the ``os.errno`` alias was
        # removed in Python 3.7; use the errno module directly, matching
        # the sibling download methods.
        if e.errno == errno.EEXIST:
            pass
        else:
            raise
    print('Downloading ' + self.url)
    data = urllib.request.urlopen(self.url)
    file_path = os.path.join(self.root, self.mat_file)
    with open(file_path, 'wb') as f:
        f.write(data.read())
    print('Done!')
def download(self):
    """Download the raw archive and cache (scaled images, latent classes)
    as a torch file; does nothing when already processed."""
    from six.moves import urllib
    import numpy as np

    if self._check_exists():
        return

    # Download files
    try:
        os.makedirs(os.path.join(self.root, self.raw_folder))
        os.makedirs(os.path.join(self.root, self.processed_folder))
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    print('Downloading ' + self.url)
    response = urllib.request.urlopen(self.url)
    archive_path = os.path.join(self.root, self.raw_folder,
                                os.path.basename(self.url))
    with open(archive_path, 'wb') as f:
        f.write(response.read())

    # Process and save as torch files
    print('Processing...')
    with open(archive_path, 'rb') as f:
        raw_dataset = np.load(f)
        imgs = torch.ByteTensor(raw_dataset['imgs'])
        classes = torch.ByteTensor(raw_dataset['latents_classes'])
    training_set = (255 * imgs, classes)
    with open(
            os.path.join(self.root, self.processed_folder,
                         self.training_file), 'wb') as f:
        torch.save(training_set, f)
    print('Done!')
def download(self):
    """Fetch every URL in ``self.urls`` into the raw folder; no-op when
    the data already exists."""
    from six.moves import urllib

    if self._check_exists():
        return

    # Create the raw folder, tolerating a pre-existing one.
    try:
        os.makedirs(os.path.join(self.root, self.raw_folder))
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    for url in self.urls:
        print('Downloading ' + url)
        response = urllib.request.urlopen(url)
        target = os.path.join(self.root, self.raw_folder,
                              url.rpartition('/')[2])
        with open(target, 'wb') as f:
            f.write(response.read())
def __download(self):
    """
    Downloads the KMNIST dataset from the web if dataset
    hasn't already been downloaded.
    """
    from six.moves import urllib

    if self.__check_exists():
        return

    print("Downloading KMNIST dataset")
    urls = [
        'http://codh.rois.ac.jp/kmnist/dataset/kmnist/train-images-idx3-ubyte.gz',
        'http://codh.rois.ac.jp/kmnist/dataset/kmnist/train-labels-idx1-ubyte.gz',
        'http://codh.rois.ac.jp/kmnist/dataset/kmnist/t10k-images-idx3-ubyte.gz',
        'http://codh.rois.ac.jp/kmnist/dataset/kmnist/t10k-labels-idx1-ubyte.gz',
    ]

    # Create the target directory, tolerating a pre-existing one.
    try:
        os.makedirs(self.__path)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    for url in urls:
        print('Downloading ' + url)
        response = urllib.request.urlopen(url)
        target = os.path.join(self.__path, url.rpartition('/')[2])
        with open(target, 'wb') as f:
            f.write(response.read())
    print('Done!')
def download(self):
    """Download the FGVC-Aircraft data if it doesn't exist already."""
    from six.moves import urllib
    import tarfile

    if self._check_exists():
        return

    # prepare to download data to PARENT_DIR/fgvc-aircraft-2013.tar.gz
    print('Downloading %s ... (may take a few minutes)' % self.url)
    parent_dir = os.path.abspath(os.path.join(self.root, os.pardir))
    tar_name = self.url.rpartition('/')[-1]
    tar_path = os.path.join(parent_dir, tar_name)
    data = urllib.request.urlopen(self.url)

    # download .tar.gz file
    with open(tar_path, 'wb') as f:
        f.write(data.read())

    # extract .tar.gz to PARENT_DIR/fgvc-aircraft-2013b
    # BUG FIX: ``tar_path.strip('.tar.gz')`` strips a *character set* from
    # both ends, not the suffix — it only worked by accident for this
    # archive name.  Remove the suffix explicitly instead.
    if tar_path.endswith('.tar.gz'):
        data_folder = tar_path[:-len('.tar.gz')]
    else:
        data_folder = tar_path
    print('Extracting %s to %s ... (may take a few minutes)' % (tar_path, data_folder))
    # Context manager closes the tarball even if extraction fails.
    with tarfile.open(tar_path) as tar:
        tar.extractall(parent_dir)

    # if necessary, rename data folder to self.root
    if not os.path.samefile(data_folder, self.root):
        print('Renaming %s to %s ...' % (data_folder, self.root))
        os.rename(data_folder, self.root)

    # delete .tar.gz file
    print('Deleting %s ...' % tar_path)
    os.remove(tar_path)
    print('Done!')
def download(self):
    """Download CODH char shapes data if it doesn't exist in processed_folder already."""
    from six.moves import urllib
    import zipfile

    # download files
    try:
        os.makedirs(os.path.join(self.root, self.raw_folder))
    except OSError as e:
        if e.errno == errno.EEXIST:
            pass
        else:
            raise

    for book_id in self.book_ids:
        url = self.download_url_format.format(book_id, book_id)
        filename = url.rpartition('/')[2]
        file_path = os.path.join(self.root, self.raw_folder, filename)
        if self._check_integrity(file_path, self.zips_md5[filename]):
            print('File already downloaded and verified: ' + filename)
            continue
        # BUG FIX: open the network connection only *after* the integrity
        # check — previously urlopen was called for every book, even when
        # the file was already cached and verified.
        print('Downloading ' + url)
        data = urllib.request.urlopen(url)
        with open(file_path, 'wb') as f:
            f.write(data.read())
        print('Extracting data: ' + filename)
        with zipfile.ZipFile(file_path, 'r') as zip_ref:
            # each zip extracts into a directory named after itself
            target_dir = file_path.replace('.zip', '')
            zip_ref.extractall(target_dir)
        # remove download zip file
        os.unlink(file_path)
def download(self):
    """Download the MNIST data if it doesn't exist in processed_folder already."""
    from six.moves import urllib
    import gzip
    print("download: trying to download")
    if self._check_exists():
        print("download: already exists so exiting")
        return

    # download files
    try:
        os.makedirs(os.path.join(self.root, self.raw_folder))
        os.makedirs(os.path.join(self.root, self.processed_folder))
    except OSError as e:
        if e.errno == errno.EEXIST:
            pass
        else:
            raise

    for url in self.urls:
        print('Downloading ' + url)
        data = urllib.request.urlopen(url)
        filename = url.rpartition('/')[2]
        file_path = os.path.join(self.root, self.raw_folder, filename)
        with open(file_path, 'wb') as f:
            f.write(data.read())
        with open(file_path.replace('.gz', ''), 'wb') as out_f, \
                gzip.GzipFile(file_path) as zip_f:
            out_f.write(zip_f.read())
        os.unlink(file_path)

    # process and save as torch files
    # The label readers also report which sample indices belong (or don't
    # belong) to the held-out few-shot class; the image readers filter on
    # those index lists.
    print('Processing...')
    train_label, train_non_few_shot_ids, train_few_shot_ids = read_label_file(
        os.path.join(self.root, self.raw_folder, 'train-labels-idx1-ubyte'),
        self.few_shot_class)
    train_img = read_image_file(os.path.join(self.root, self.raw_folder,
                                             'train-images-idx3-ubyte'),
                                non_few_shot_ids=train_non_few_shot_ids)
    training_set = (train_img, train_label)
    test_label, test_non_few_shot_ids, test_few_shot_ids = read_label_file(
        os.path.join(self.root, self.raw_folder, 't10k-labels-idx1-ubyte'),
        self.few_shot_class)
    test_img = read_image_file(os.path.join(self.root, self.raw_folder,
                                            't10k-images-idx3-ubyte'),
                               few_shot_ids=test_few_shot_ids)
    if self.test_emnist:
        # Optional: replace the MNIST test split with EMNIST letter samples.
        print("Download: Entering Emnist test")
        from emnist import extract_test_samples
        images, labels = extract_test_samples('letters')
        print(images.shape)
        print(labels.shape)
        #randomly grab a letter
        import random
        # NOTE(review): rand_letter_idx is never used below — the actual
        # selection keeps all labels < 10; confirm which was intended.
        rand_letter_idx = random.randint(0, 25)  #idx for selected letter clas
        test_sample_ids = np.where(labels < 10)[0]
        # Fixed seed so the shuffled sample order is reproducible.
        np.random.seed(10)
        np.random.shuffle(test_sample_ids)
        print('test_sample_ids_len', len(test_sample_ids))
        #grab labels and images from that class
        labels = labels[test_sample_ids]
        images = images[test_sample_ids]
        print("After selecting one class")
        print(images.shape)
        print(labels.shape)
        #assert(self.few_shot_class not in labels)
        if self.max_test_sample:
            # NOTE(review): these braces build a *set* of two tensors, so
            # the (images, labels) order is not preserved in the saved test
            # file — this was probably meant to be a tuple; confirm.
            test_set = {
                torch.ByteTensor(list(images[:self.max_test_sample])).view(
                    -1, 28, 28),
                torch.LongTensor(list(labels[:self.max_test_sample]))
            }
        else:
            test_set = {
                torch.ByteTensor(list(images)).view(-1, 28, 28),
                torch.LongTensor(list(labels))
            }
    else:
        # test_label, test_non_few_shot_ids, test_few_shot_ids= read_label_file(os.path.join(self.root, self.raw_folder, 't10k-labels-idx1-ubyte'), self.few_shot_class)
        # test_img = read_image_file(os.path.join(self.root, self.raw_folder, 't10k-images-idx3-ubyte'), few_shot_ids=test_few_shot_ids)
        if (self.max_test_sample):
            print('testing max test sample')
            test_set = (test_img[:self.max_test_sample],
                        test_label[:self.max_test_sample])
        else:
            test_set = (test_img, test_label)
    print('confirming test size')
    #print(len(test_set[0]), len(test_set[1]))
    with open(
            os.path.join(self.root, self.processed_folder,
                         self.training_file), 'wb') as f:
        torch.save(training_set, f)
    with open(
            os.path.join(self.root, self.processed_folder, self.test_file),
            'wb') as f:
        torch.save(test_set, f)
    print('Done!')
def download(self): """Download the MNISTM data.""" # import essential packages from six.moves import urllib import gzip import pickle from torchvision import datasets # check if dataset already exists if self._check_exists(): return # make data dirs try: os.makedirs(os.path.join(self.root, self.raw_folder)) os.makedirs(os.path.join(self.root, self.processed_folder)) except OSError as e: if e.errno == errno.EEXIST: pass else: raise # download pkl files logging.info("Downloading " + self.url) filename = self.url.rpartition("/")[2] file_path = os.path.join(self.root, self.raw_folder, filename) if not os.path.exists(file_path.replace(".gz", "")): data = urllib.request.urlopen(self.url) with open(file_path, "wb") as f: f.write(data.read()) with open(file_path.replace(".gz", ""), "wb") as out_f, gzip.GzipFile( file_path ) as zip_f: out_f.write(zip_f.read()) os.unlink(file_path) # process and save as torch files logging.info("Processing...") # load MNIST-M images from pkl file with open(file_path.replace(".gz", ""), "rb") as f: mnist_m_data = pickle.load(f, encoding="bytes") mnist_m_train_data = torch.ByteTensor(mnist_m_data[b"train"]) mnist_m_test_data = torch.ByteTensor(mnist_m_data[b"test"]) # get MNIST labels mnist_train_labels = datasets.MNIST( root=self.mnist_root, train=True, download=True ).targets mnist_test_labels = datasets.MNIST( root=self.mnist_root, train=False, download=True ).targets # save MNIST-M dataset training_set = (mnist_m_train_data, mnist_train_labels) test_set = (mnist_m_test_data, mnist_test_labels) with open( os.path.join(self.root, self.processed_folder, self.training_file), "wb" ) as f: torch.save(training_set, f) with open( os.path.join(self.root, self.processed_folder, self.test_file), "wb" ) as f: torch.save(test_set, f) logging.info("[DONE]")
def download(self):
    """Download the MNIST data if it doesn't exist in processed_folder already.

    After extracting the raw idx files, examples whose label equals
    self.get_rid_of are masked out of both splits (via read_label_file /
    read_image_file) before the (images, labels) tuples are saved as
    torch files.
    """
    from six.moves import urllib
    import gzip

    if self._check_exists():
        return

    # Create each data dir independently: the original combined try-block
    # swallowed EEXIST from the first makedirs and silently skipped
    # creating processed_folder whenever raw_folder already existed.
    for folder in (self.raw_folder, self.processed_folder):
        try:
            os.makedirs(os.path.join(self.root, folder))
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise

    # download files
    for url in self.urls:
        print('Downloading ' + url)
        data = urllib.request.urlopen(url)
        filename = url.rpartition('/')[2]
        file_path = os.path.join(self.root, self.raw_folder, filename)
        with open(file_path, 'wb') as f:
            f.write(data.read())
        # Decompress next to the archive, then remove the .gz file.
        with open(file_path.replace('.gz', ''), 'wb') as out_f, \
                gzip.GzipFile(file_path) as zip_f:
            out_f.write(zip_f.read())
        os.unlink(file_path)

    # process and save as torch files
    print('Processing...')

    train_label, train_remove_mask = read_label_file(
        os.path.join(self.root, self.raw_folder, 'train-labels-idx1-ubyte'),
        remove_label=self.get_rid_of)
    train_data = read_image_file(
        os.path.join(self.root, self.raw_folder, 'train-images-idx3-ubyte'),
        remove_mask=train_remove_mask)
    training_set = (train_data, train_label)

    test_label, test_remove_mask = read_label_file(
        os.path.join(self.root, self.raw_folder, 't10k-labels-idx1-ubyte'),
        remove_label=self.get_rid_of)
    test_data = read_image_file(
        os.path.join(self.root, self.raw_folder, 't10k-images-idx3-ubyte'),
        remove_mask=test_remove_mask)
    test_set = (test_data, test_label)

    with open(os.path.join(self.root, self.processed_folder,
                           self.training_file), 'wb') as f:
        torch.save(training_set, f)
    with open(os.path.join(self.root, self.processed_folder,
                           self.test_file), 'wb') as f:
        torch.save(test_set, f)

    print('Done!')
def download(self):
    """Download the EMNIST data if it doesn't exist in processed_folder already.

    Downloads the single EMNIST zip archive, extracts the gzipped idx
    files from its inner 'gzip' folder into raw_folder, then converts
    every split in self.splits to (images, labels) torch files under
    processed_folder.
    """
    from six.moves import urllib
    import gzip
    import shutil
    import zipfile

    if self._check_exists():
        return

    # Create each data dir independently: the original combined try-block
    # swallowed EEXIST from the first makedirs and silently skipped
    # creating processed_folder whenever raw_folder already existed.
    for folder in (self.raw_folder, self.processed_folder):
        try:
            os.makedirs(os.path.join(self.root, folder))
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise

    # download the archive
    print('Downloading ' + self.url)
    data = urllib.request.urlopen(self.url)
    filename = self.url.rpartition('/')[2]
    raw_folder = os.path.join(self.root, self.raw_folder)
    file_path = os.path.join(raw_folder, filename)
    with open(file_path, 'wb') as f:
        f.write(data.read())

    print('Extracting zip archive')
    with zipfile.ZipFile(file_path) as zip_f:
        zip_f.extractall(raw_folder)
    os.unlink(file_path)

    # The zip contains a 'gzip' folder of per-split .gz idx files;
    # decompress them into raw_folder and drop the folder afterwards.
    gzip_folder = os.path.join(raw_folder, 'gzip')
    for gzip_file in os.listdir(gzip_folder):
        if gzip_file.endswith('.gz'):
            print('Extracting ' + gzip_file)
            with open(os.path.join(raw_folder,
                                   gzip_file.replace('.gz', '')), 'wb') as out_f, \
                    gzip.GzipFile(os.path.join(gzip_folder, gzip_file)) as zip_f:
                out_f.write(zip_f.read())
    shutil.rmtree(gzip_folder)

    # process and save as torch files
    for split in self.splits:
        print('Processing ' + split)
        training_set = (
            read_image_file(os.path.join(
                raw_folder, 'emnist-{}-train-images-idx3-ubyte'.format(split))),
            read_label_file(os.path.join(
                raw_folder, 'emnist-{}-train-labels-idx1-ubyte'.format(split))))
        test_set = (
            read_image_file(os.path.join(
                raw_folder, 'emnist-{}-test-images-idx3-ubyte'.format(split))),
            read_label_file(os.path.join(
                raw_folder, 'emnist-{}-test-labels-idx1-ubyte'.format(split))))
        with open(os.path.join(self.root, self.processed_folder,
                               self._training_file(split)), 'wb') as f:
            torch.save(training_set, f)
        with open(os.path.join(self.root, self.processed_folder,
                               self._test_file(split)), 'wb') as f:
            torch.save(test_set, f)

    print('Done!')
def download(self):
    """Download the MNIST-M data if it doesn't exist already.

    Fetches the gzipped pickle of MNIST-M images, pairs them with the
    standard MNIST labels (MNIST-M reuses MNIST's label order), and saves
    (images, labels) train/test tuples as torch files in processed_folder.
    """
    # import essential packages
    from six.moves import urllib
    import gzip
    import pickle
    from torchvision import datasets

    # check if dataset already exists
    if self._check_exists():
        return

    # Create each data dir independently: the original combined try-block
    # swallowed EEXIST from the first makedirs and silently skipped
    # creating processed_folder whenever raw_folder already existed.
    for folder in (self.raw_folder, self.processed_folder):
        try:
            os.makedirs(os.path.join(self.root, folder))
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise

    # download pkl files
    print('Downloading ' + self.url)
    filename = self.url.rpartition('/')[2]
    file_path = os.path.join(self.root, self.raw_folder, filename)
    if not os.path.exists(file_path.replace('.gz', '')):
        data = urllib.request.urlopen(self.url)
        with open(file_path, 'wb') as f:
            f.write(data.read())
        # Decompress next to the archive, then drop the .gz file.
        with open(file_path.replace('.gz', ''), 'wb') as out_f, \
                gzip.GzipFile(file_path) as zip_f:
            out_f.write(zip_f.read())
        os.unlink(file_path)

    # process and save as torch files
    print('Processing...')

    # load MNIST-M images from pkl file (keys are bytes: b'train'/b'test')
    with open(file_path.replace('.gz', ''), "rb") as f:
        mnist_m_data = pickle.load(f, encoding='bytes')
    mnist_m_train_data = torch.ByteTensor(mnist_m_data[b'train'])
    mnist_m_test_data = torch.ByteTensor(mnist_m_data[b'test'])

    # get MNIST labels; .targets replaces the deprecated
    # .train_labels/.test_labels accessors (removed in recent torchvision)
    mnist_train_labels = datasets.MNIST(root=self.mnist_root, train=True,
                                        download=True).targets
    mnist_test_labels = datasets.MNIST(root=self.mnist_root, train=False,
                                       download=True).targets

    # save MNIST-M dataset
    training_set = (mnist_m_train_data, mnist_train_labels)
    test_set = (mnist_m_test_data, mnist_test_labels)
    with open(os.path.join(self.root, self.processed_folder,
                           self.training_file), 'wb') as f:
        torch.save(training_set, f)
    with open(os.path.join(self.root, self.processed_folder,
                           self.test_file), 'wb') as f:
        torch.save(test_set, f)

    print('Done!')
def download(self):
    """Download and unpack the lensing training set; index its contents.

    Populates self.data (per-ID metadata parsed from train.csv) and
    self.files (per-sample tuples of the four band FITS paths). Every
    step checks for its output on disk first, so an interrupted run can
    be resumed by calling this again.
    """
    if not os.path.isdir(self.root):
        os.makedirs(self.root)

    # 1) metadata CSV
    log_path = os.path.join(self.root, "train.csv")
    if not os.path.isfile(log_path):
        print("Download log...", flush=True)
        data = urllib.request.urlopen(self.url_train_log)
        with open(log_path, 'wb') as f:
            f.write(data.read())

    keys = [
        '', 'ID', 'x_crit', 'y_crit', 'source_ID', 'z_source', 'z_lens',
        'mag_source', 'ein_area', 'n_crit', 'r_source', 'crit_area',
        'n_pix_source', 'source_flux', 'n_pix_lens', 'lens_flux',
        'n_source_im', 'mag_eff', 'sb_contrast', 'color_diff', 'n_gal_3',
        'n_gal_5', 'n_gal_10', 'halo_mass', 'star_mass', 'mag_lens',
        'n_sources'
    ]
    assert len(keys) == 27
    with open(log_path, 'rt') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        # Keep only complete data rows; rows containing 'ID' are headers.
        data = [x for x in reader if len(x) == 27 and 'ID' not in x]
        # Empty fields become NaN.
        data = [{k: float(x) if x else math.nan for k, x in zip(keys, xs)}
                for xs in data]
        self.data = {x['ID']: x for x in data}

    # 2) gzipped tarball
    gz_path = os.path.join(self.root, "datapack2.0train.tar.gz")
    if not os.path.isfile(gz_path):
        print("Download...", flush=True)
        data = urllib.request.urlopen(self.url_train)
        with open(gz_path, 'wb') as f:
            f.write(data.read())

    # 3) decompress to a plain tar (streamed, so the archive is never
    # fully held in memory)
    tar_path = os.path.join(self.root, "datapack2.0train.tar")
    if not os.path.isfile(tar_path):
        print("Decompress...", flush=True)
        import gzip
        import shutil
        with gzip.open(gz_path, 'rb') as f_in:
            with open(tar_path, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)

    # 4) extract the tar
    dir_path = os.path.join(self.root, "datapack2.0train")
    if not os.path.isdir(dir_path):
        print("Extract...", flush=True)
        import tarfile
        # NOTE(review): extractall performs no member-path sanitization;
        # acceptable only because the archive comes from the project's
        # own URL and is trusted.
        with tarfile.open(tar_path) as tar:
            tar.extractall(dir_path)

    # 5) index the FITS files, grouping each sample's four bands together
    self.files = list(
        zip(*(sorted(
            glob.glob(
                os.path.join(dir_path, "Public/{}/*.fits".format(band))))
              for band in ("EUC_VIS", "EUC_J", "EUC_Y", "EUC_H"))))
    # Sanity check: the four band files of a sample share one ID suffix.
    assert all(
        len({x.split('-')[-1] for x in fs}) == 1 for fs in self.files)