def download(self) -> None:
    """Download the MNIST data if it doesn't exist in processed_folder already."""
    if self._check_exists():
        return

    os.makedirs(self.raw_folder, exist_ok=True)
    os.makedirs(self.processed_folder, exist_ok=True)

    # download files
    for url, md5 in self.resources:
        filename = url.rpartition('/')[2]
        download_and_extract_archive(url, download_root=self.raw_folder, filename=filename, md5=md5)

    # process and save as torch files
    print('Processing...')

    # NOTE: these tensors come from fixed local paths rather than from the
    # files downloaded above; parameterize these paths before reuse.
    training_set = (
        torch.load('/home/saeid/data/X_train.pt'),
        torch.load('/home/saeid/data/y_train.pt'),
    )
    test_set = (
        torch.load('/home/saeid/data/X_test.pt'),
        torch.load('/home/saeid/data/y_test.pt'),
    )
    with open(os.path.join(self.processed_folder, self.training_file), 'wb') as f:
        torch.save(training_set, f)
    with open(os.path.join(self.processed_folder, self.test_file), 'wb') as f:
        torch.save(test_set, f)

    print('Done!')
def __init__(self, years, symmetric=False):
    self.url_dir = 'http://www.bic.mni.mcgill.ca/~vfonov/nihpd/obj1/'
    sym_string = 'sym' if symmetric else 'asym'
    if not isinstance(years, tuple) or years not in SUPPORTED_YEARS:
        message = f'Years must be a tuple in {SUPPORTED_YEARS}'
        raise ValueError(message)
    a, b = years
    file_id = f'{sym_string}_{format_age(a)}-{format_age(b)}'
    self.name = f'nihpd_{file_id}_nifti'
    self.filename = f'{self.name}.zip'
    self.url = urllib.parse.urljoin(self.url_dir, self.filename)
    download_root = get_torchio_cache_dir() / self.name
    if download_root.is_dir():
        print(f'Using cache found in {download_root}')
    else:
        download_and_extract_archive(
            self.url,
            download_root=download_root,
            filename=self.filename,
        )
    super().__init__(
        t1=Image(download_root / f'nihpd_{file_id}_t1w.nii'),
        t2=Image(download_root / f'nihpd_{file_id}_t2w.nii'),
        pd=Image(download_root / f'nihpd_{file_id}_pdw.nii'),
        # NOTE: the mask reuses the PD-weighted volume; verify this is
        # intended and not a copy-paste slip.
        mask=Image(download_root / f'nihpd_{file_id}_pdw.nii', type=LABEL),
    )
def download(self):
    if self._check_integrity():
        print("Files already downloaded and verified")
        return
    download_and_extract_archive(self.url, self.root)
    os.rename(os.path.join(self.root, self.basedir), os.path.join(self.root, self.name))
def download(self, cleanup):
    if self._check_integrity():
        print('Files already downloaded and verified')
        return
    download_and_extract_archive(self.images_url, os.path.join(self.root, "raw_data"), filename=self.raw_ims_file)
    download_url(self.labels_url, os.path.join(self.root, "raw_data"), self.label_file)
    download_url(self.splits_url, os.path.join(self.root, "raw_data"), self.splits_file)

    mat = scipy.io.loadmat(os.path.join(self.root, 'raw_data', self.label_file))
    labels = mat['labels'][0]
    classes = [str(x) for x in set(list(labels))]

    mat = scipy.io.loadmat(os.path.join(self.root, 'raw_data', self.splits_file))
    train = mat['trnid'][0]
    val = mat['valid'][0]
    test = mat['tstid'][0]

    def create_split(split_name, split_indices):
        os.mkdir(os.path.join(self.root, split_name))
        for label_name in classes:
            os.mkdir(os.path.join(self.root, split_name, label_name))
        for sample_idx in split_indices:
            file_name = f'image_{sample_idx:05d}.jpg'
            src = os.path.join(self.root, 'raw_data', 'jpg', file_name)
            # split indices are 1-based (MATLAB convention), hence the -1
            dest = os.path.join(self.root, split_name, str(labels[sample_idx - 1]), file_name)
            shutil.copy(src, dest)

    create_split('train', train)
    create_split('val', val)
    create_split('test', test)

    if cleanup:
        shutil.rmtree(os.path.join(self.root, "raw_data"))
def download(dataroot: str):
    DOWNLOAD_URL = 'http://groups.csail.mit.edu/vision/LabelMe/NewImages/indoorCVPR_09.tar'
    with tempfile.TemporaryDirectory() as tempdir:
        download_and_extract_archive(DOWNLOAD_URL, tempdir, extract_root=dataroot, remove_finished=True)
def download(self):
    if not self._check_integrity():
        download_and_extract_archive(self.url, self.root)
        os.rename(
            os.path.join(self.root, self.basedir),
            os.path.join(self.root, self.name),
        )
def download(root: str, file_name: str, archive_name: str, url_link: str):
    """
    Download file from internet url link.

    :param root: (string) The directory to put downloaded files.
    :param file_name: (string) The name of the unzipped file.
    :param archive_name: (string) The name of the archive (zipped file) to download.
    :param url_link: (string) The url link to download data.
    :return: None

    .. note:: If `file_name` already exists under path `root`, then it is not downloaded again.
        Otherwise `archive_name` will be downloaded from `url_link` and extracted to `file_name`.
    """
    if not os.path.exists(os.path.join(root, file_name)):
        print("Downloading {}".format(file_name))
        # remove any stale, possibly partial archive before downloading again
        if os.path.exists(os.path.join(root, archive_name)):
            os.remove(os.path.join(root, archive_name))
        try:
            download_and_extract_archive(url_link, download_root=root, filename=archive_name, remove_finished=True)
        except Exception:
            print("Failed to download {} from url link {}".format(archive_name, url_link))
            print("Please check your internet connection or "
                  "reinstall DALIB by 'pip install --upgrade dalib'")
            exit(0)
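# A minimal usage sketch for the helper above; the directory, file name,
# archive name, and URL are hypothetical placeholders, not values shipped
# with the library:
#
#   download(root='./data/office31', file_name='images',
#            archive_name='images.tgz',
#            url_link='https://example.com/office31/images.tgz')
#
# If ./data/office31/images already exists the call is a no-op; otherwise
# images.tgz is fetched, extracted, and deleted (remove_finished=True).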
def download(self: 'HPatchesSequencesStereoPairs') -> None:
    if not self._check_raw_exists():
        os.makedirs(self.raw_folder, exist_ok=True)
        tv_data.download_and_extract_archive(
            HPatchesSequencesStereoPairs.url,
            download_root=self.raw_folder,
            remove_finished=True)

    if not self._check_processed_exists():
        hpatches_folders = [
            os.path.join(self.raw_extracted_folder, seq_folder)
            for seq_folder in self._split_sequences
        ]
        sequences = [
            HPatchesPairGenerator.read_raw_folder(seq_folder, self.convert_to_grayscale)
            for seq_folder in hpatches_folders
        ]

        # IMIPS downscales large images
        if self.downsample_large_images:
            for seq in sequences:
                while np.any(np.array(seq.images[0].shape) > 1000):
                    seq.downsample_in_place()

        os.makedirs(self.processed_folder, exist_ok=True)
        with open(self.processed_file, 'wb') as pickle_file:
            pickle.dump(sequences, pickle_file)
def _download(self):
    if self._check_exists():
        return

    os.makedirs(self.raw_folder, exist_ok=True)
    os.makedirs(self.processed_folder, exist_ok=True)

    # download files
    for url, md5 in self.resources:
        filename = url.rpartition('/')[2]
        download_and_extract_archive(url, download_root=self.raw_folder, filename=filename, md5=md5)

    # process and save as torch files
    logging.info('Processing...')

    training_set = (
        read_image_file(os.path.join(self.raw_folder, 'train-images-idx3-ubyte')),
        read_label_file(os.path.join(self.raw_folder, 'train-labels-idx1-ubyte')),
    )
    test_set = (
        read_image_file(os.path.join(self.raw_folder, 't10k-images-idx3-ubyte')),
        read_label_file(os.path.join(self.raw_folder, 't10k-labels-idx1-ubyte')),
    )
    with open(os.path.join(self.processed_folder, 'train.pt'), 'wb') as f:
        torch.save(training_set, f)
    with open(os.path.join(self.processed_folder, 'test.pt'), 'wb') as f:
        torch.save(test_set, f)

    logging.info('Done downloading!')
    self._create_val_split()
def __init__(self, root, train, transform, download=True):
    self.url = "http://cs231n.stanford.edu/tiny-imagenet-200.zip"
    self.root = root
    if download:
        if os.path.exists(f'{self.root}/tiny-imagenet-200/'):
            print('File already downloaded')
        else:
            download_and_extract_archive(self.url, root, filename="tiny-imagenet-200.zip")
    self.root = os.path.join(self.root, "tiny-imagenet-200")
    self.train = train
    self.transform = transform
    self.ids_string = np.sort(np.loadtxt(f"{self.root}/wnids.txt", dtype=str))
    self.ids = {class_string: i for i, class_string in enumerate(self.ids_string)}
    if train:
        self.paths = glob.glob(f"{self.root}/train/*/images/*")
        # the wnid is the third-from-last path component: train/<wnid>/images/<file>
        self.label = [self.ids[path.split("/")[-3]] for path in self.paths]
    else:
        self.val_annotations = np.loadtxt(f"{self.root}/val/val_annotations.txt", dtype=str)
        self.paths = [f"{self.root}/val/images/{sample[0]}" for sample in self.val_annotations]
        self.label = [self.ids[sample[1]] for sample in self.val_annotations]
def __init__(self, root, split='train', task='all', download=True, **kwargs):
    if download:
        if not osp.exists(osp.join(root, "training")) or not osp.exists(osp.join(root, "evaluation")):
            download_and_extract_archive(
                "https://lmb.informatik.uni-freiburg.de/data/freihand/FreiHAND_pub_v2.zip",
                download_root=root,
                filename="FreiHAND_pub_v2.zip",
                remove_finished=False,
                extract_root=root)

    assert split in ['train', 'test', 'all']
    self.split = split

    assert task in ['all', 'gs', 'auto', 'sample', 'hom']
    self.task = task

    if task == 'all':
        samples = (self.get_samples(root, 'gs') + self.get_samples(root, 'auto')
                   + self.get_samples(root, 'sample') + self.get_samples(root, 'hom'))
    else:
        samples = self.get_samples(root, task)

    # deterministic shuffle so the train/test split is reproducible
    random.seed(42)
    random.shuffle(samples)

    samples_len = len(samples)
    samples_split = min(int(samples_len * 0.2), 3200)
    if self.split == 'train':
        samples = samples[samples_split:]
    elif self.split == 'test':
        samples = samples[:samples_split]

    super(FreiHand, self).__init__(root, samples, **kwargs)
def download(self):
    if self._check_integrity():
        print("Files already downloaded and verified")
        return
    root = self.root
    download_and_extract_archive(self.url, root, root, self.filename, self.zip_md5, remove_finished=True)
    os.rename(join(root, "Graph Wavelets Demo"), join(root, "minnesota-usc"))

    # keep only the files we need, plus the Datasets folder for now
    files2keep = list(self.files2keep)  # copy so the attribute is not mutated
    files2keep.append("Datasets")
    dirs = os.listdir(join(root, self.top_dir))
    for file_or_dir in dirs:
        if file_or_dir not in files2keep:
            remove_file_or_dir(join(root, self.top_dir, file_or_dir))

    # pull the .mat files up one level, then drop the Datasets folder
    for mat_file, md5sub in self.mat_list:
        abs_path = join(root, self.top_dir, "Datasets", mat_file)
        check_integrity(abs_path, md5sub)
        move(abs_path, join(root, self.top_dir, mat_file))
    remove_file_or_dir(join(root, self.top_dir, "Datasets"))
def download(self, data_dir: str) -> None:
    """Download dataset

    Parameters
    ----------
    data_dir : str
        Path to base dataset directory

    Returns
    -------
    None
    """
    if not os.path.exists(data_dir):
        os.mkdir(data_dir)
    if not os.path.exists(self.root_dir):
        os.makedirs(self.root_dir)

    download_and_extract_archive(self.url, data_dir, remove_finished=True)

    # Tidy up: move each split out of the extracted BSR tree and drop the
    # Windows thumbnail cache files that ship with the archive
    for d in ['train', 'val', 'test']:
        shutil.move(src=os.path.join(data_dir, 'BSR/BSDS500/data/images', d), dst=self.root_dir)
        os.remove(os.path.join(self.root_dir, d, 'Thumbs.db'))
    shutil.rmtree(os.path.join(data_dir, 'BSR'))
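# A minimal usage sketch; BSDSDataset and its constructor are hypothetical
# stand-ins, since the method above only assumes self.url and self.root_dir:
#
#   ds = BSDSDataset(root_dir='./data/bsds500')   # hypothetical constructor
#   ds.download(data_dir='./data')
#   # -> ./data/bsds500/{train,val,test} now hold the BSDS500 images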
def download_dtd_dataset(root: str):
    """
    Download the DTD (Describable Textures Dataset) archive and expand it
    in the folder provided as parameter
    """
    url = "https://www.robots.ox.ac.uk/~vgg/data/dtd/download/dtd-r1.0.1.tar.gz"
    download_and_extract_archive(url, root)
def prepare_data(self):
    """Download images and prepare images datasets."""
    if not self.data_path.is_dir():
        download_and_extract_archive(url=self.data_url, download_root=self.dl_path, remove_finished=True)
def prepare_data(self):
    """Download images and prepare images datasets."""
    # 1. Download the images
    download_and_extract_archive(url=DATA_URL, download_root=self.dl_path, remove_finished=True)
    data_path = Path(self.dl_path).joinpath('cats_and_dogs_filtered')

    # 2. Load the data + preprocessing & data augmentation
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    train_dataset = ImageFolder(root=data_path.joinpath('train'),
                                transform=transforms.Compose([
                                    transforms.Resize((224, 224)),
                                    transforms.RandomHorizontalFlip(),
                                    transforms.ToTensor(),
                                    normalize,
                                ]))
    valid_dataset = ImageFolder(root=data_path.joinpath('validation'),
                                transform=transforms.Compose([
                                    transforms.Resize((224, 224)),
                                    transforms.ToTensor(),
                                    normalize,
                                ]))
    self.train_dataset = train_dataset
    self.valid_dataset = valid_dataset
def load_word_embedding(embedding_name, embedding_dim: int = 300) -> WordEmbedding:
    assert Sent140.WORD_EMBEDDING is None, 'can only be loaded once'
    extract_root = os.path.join(EMBEEDING_PREFIX, embedding_name)
    if embedding_name.startswith('glove'):
        target_filename = os.path.join(extract_root, embedding_name + '.{}d.txt'.format(embedding_dim))
    else:
        raise ValueError('cannot load data for {}'.format(embedding_name))
    if not os.path.exists(target_filename):
        tmp_root = os.path.join(EMBEEDING_PREFIX, 'tmp')
        download_and_extract_archive(url=Sent140.WORD_EMBEDDING_URLS[embedding_name],
                                     download_root=tmp_root,
                                     extract_root=extract_root,
                                     remove_finished=True)

    # Load the word vectors; GloVe is used by default
    print('Using word embedding file: {}'.format(target_filename))
    with open(target_filename, 'r') as inf:
        lines = inf.readlines()
    # each line holds the word first, then one entry per vector dimension
    lines = [l.split() for l in lines]
    # all words in the vocabulary
    index2word = [l[0] for l in lines]
    # the vector for each word
    emb_floats = [np.asarray([float(n) for n in l[1:]], dtype=np.float32) for l in lines]
    # the last row serves as UNK: the embedding matrix gains one extra row,
    # so it has one more row than the vocabulary
    emb_floats.append(np.zeros([embedding_dim], dtype=np.float32))  # for unknown word
    embedding = np.stack(emb_floats, axis=0)
    word2index = {v: k for k, v in enumerate(index2word)}
    return WordEmbedding(word2index=word2index, index2word=index2word, embedding=embedding,
                         unk_index=len(index2word), num_vocabulary=len(index2word))
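# A minimal usage sketch; 'glove.6B' is an illustrative key that is assumed
# to exist in Sent140.WORD_EMBEDDING_URLS (the file naming above matches the
# GloVe convention glove.6B.300d.txt):
#
#   emb = load_word_embedding('glove.6B', embedding_dim=300)
#   emb.embedding.shape                          # (len(emb.index2word) + 1, 300); last row is UNK
#   emb.word2index.get('hello', emb.unk_index)   # falls back to UNK for unknown words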
def download(dataroot: str):
    DOWNLOAD_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz'
    with tempfile.TemporaryDirectory() as tempdir:
        download_and_extract_archive(DOWNLOAD_URL, tempdir, extract_root=dataroot, remove_finished=True)
def download(self):
    """Download the MNIST data if it doesn't exist in processed_folder already."""
    if self._check_exists():
        return

    makedir_exist_ok(self.raw_folder)
    makedir_exist_ok(self.processed_folder)

    # download files
    for url in self.urls:
        filename = url.rpartition('/')[2]
        download_and_extract_archive(url, download_root=self.raw_folder, filename=filename)

    # process and save as torch files
    print('Processing...')

    training_set = (
        read_image_file(os.path.join(self.raw_folder, 'train-images-idx3-ubyte')),
        read_label_file(os.path.join(self.raw_folder, 'train-labels-idx1-ubyte'))
    )
    test_set = (
        read_image_file(os.path.join(self.raw_folder, 't10k-images-idx3-ubyte')),
        read_label_file(os.path.join(self.raw_folder, 't10k-labels-idx1-ubyte'))
    )
    with open(os.path.join(self.processed_folder, self.training_file), 'wb') as f:
        torch.save(training_set, f)
    with open(os.path.join(self.processed_folder, self.test_file), 'wb') as f:
        torch.save(test_set, f)

    print('Done!')
def download(self):
    if not check_integrity(self.meta_file):
        tmp_dir = tempfile.mkdtemp()

        archive_dict = self.archive_dict['devkit']
        download_and_extract_archive(archive_dict['url'], self.root,
                                     extract_root=tmp_dir,
                                     md5=archive_dict['md5'])
        devkit_folder = _splitexts(os.path.basename(archive_dict['url']))[0]
        meta = parse_devkit(os.path.join(tmp_dir, devkit_folder))
        self._save_meta_file(*meta)

        shutil.rmtree(tmp_dir)

    if not os.path.isdir(self.split_folder):
        archive_dict = self.archive_dict[self.split]
        download_and_extract_archive(archive_dict['url'], self.root,
                                     extract_root=self.split_folder,
                                     md5=archive_dict['md5'])

        if self.split == 'train':
            prepare_train_folder(self.split_folder)
        elif self.split == 'val':
            val_wnids = self._load_meta_file()[1]
            prepare_val_folder(self.split_folder, val_wnids)
    else:
        msg = ("You set download=True, but a folder '{}' already exists in "
               "the root directory. If you want to re-download or re-extract the "
               "archive, delete the folder.")
        print(msg.format(self.split))
def download(self):
    if self._check_integrity():
        print('Files already downloaded and verified')
        return
    utils.download_and_extract_archive(self.url, self.root, filename=self.filename, md5=self.tgz_md5)
def __init__(self, root: str, task: str, download: Optional[bool] = False, **kwargs):
    if download:
        for dir in self.directories.values():
            if not os.path.exists(os.path.join(root, dir)):
                download_and_extract_archive(
                    url="https://cloud.tsinghua.edu.cn/f/e93f2e07d93243d6b57e/?dl=1",
                    download_root=os.path.join(root, 'download'),
                    filename="officecaltech.tgz",
                    remove_finished=False,
                    extract_root=root)
                break
    else:
        list(map(lambda dir: check_exits(root, dir), self.directories.values()))
    super(OfficeCaltech, self).__init__(os.path.join(root, self.directories[task]),
                                        default_loader, extensions=IMG_EXTENSIONS, **kwargs)
    self.classes = OfficeCaltech.CLASSES
    # CLASSES is a flat list of class names, so map each name to its index
    self.class_to_idx = {cls: idx for idx, cls in enumerate(self.classes)}
def download(self):
    if self._check_integrity():
        print("Files already downloaded and verified")
        return
    download_and_extract_archive(self.url, self.root, filename=self.filename, md5=None)
def download(self):
    url = "https://github.com/chiayewken/sutd-materials/releases/download/v0.1.0/numberbatch-en-19.08.txt.gz"
    # Path(url).stem drops only the final ".gz", leaving the extracted .txt name
    path = self.cache_dir / Path(url).stem
    if not path.exists():
        download_and_extract_archive(url, str(self.cache_dir))
    assert path.exists()
    return path
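# A minimal usage sketch (the owning object is assumed to expose cache_dir as
# a pathlib.Path; 'loader' is a hypothetical instance):
#
#   path = loader.download()
#   # path -> <cache_dir>/numberbatch-en-19.08.txt, extracted from the .gz archive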
def __init__(self, train=True, rootDir=None, transform=None, test=False):
    super(TinyImageNet, self).__init__()
    datasetURL = "http://cs231n.stanford.edu/tiny-imagenet-200.zip"
    if rootDir is None:
        rootDir = os.path.abspath(os.path.join(os.getcwd(), "../data"))
    # the archive extracts into rootDir/tiny-imagenet-200, so check that path
    if not os.path.exists(os.path.join(rootDir, "tiny-imagenet-200")):
        print(f"Downloading TinyImageNet data to {rootDir}")
        download_and_extract_archive(datasetURL, rootDir)
        print("...done")
    self.rootDir = os.path.abspath(os.path.join(rootDir, "tiny-imagenet-200"))
    self.train = train
    self.test = test
    self.transforms = transform

    trainDataset = ImageFolder(os.path.join(self.rootDir, "train"), transform)
    testDataset = ImageFolder(os.path.join(self.rootDir, "test"), transform)
    validDataset = TinyImagenetVal(self.rootDir, transform)

    if not self.test:
        if self.train:
            self._dataset = trainDataset
        else:
            self._dataset = validDataset
        self.targets = self._dataset.targets
    else:
        self._dataset = testDataset
        self.targets = None
def download(self) -> None:
    """Download data if it doesn't exist already."""
    if self._check_exists():
        return

    os.makedirs(self.raw_folder, exist_ok=True)

    # download files, falling back through the mirror list
    for filename, md5 in self.resources:
        for mirror in self.mirrors:
            url = "{}{}".format(mirror, filename)
            try:
                print("Downloading {}".format(url))
                download_and_extract_archive(
                    url, download_root=self.raw_folder,
                    filename=filename,
                    md5=md5
                )
            except URLError as error:
                print("Failed to download (trying next):\n{}".format(error))
                continue
            finally:
                print()
            break
        else:
            # the for-else fires only when no mirror succeeded (no break)
            raise RuntimeError("Error downloading {}".format(filename))
def download_mnist(data_dir):
    """
    Adapted from torchvision.datasets.mnist.
    Downloads MNIST from the official site, reshapes the arrays as images,
    normalizes them, and saves them as tensors.
    """
    raw_folder = os.path.join(data_dir, 'raw')
    if not os.path.exists(raw_folder):
        os.makedirs(raw_folder, exist_ok=True)
        # download files
        train_imgs_url = ("http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz",
                          "f68b3c2dcbeaaa9fbdd348bbdeb94873")
        test_imgs_url = ("http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz",
                         "9fb629c4189551a2d022fa330f9573f3")
        for url, md5 in [train_imgs_url, test_imgs_url]:
            filename = url.rpartition('/')[2]
            download_and_extract_archive(url, download_root=raw_folder, filename=filename, md5=md5)

    if not os.path.exists(os.path.join(data_dir, 'train_data.pt')):
        # process and save as torch files
        print('Processing...')
        training_set = read_image_file(os.path.join(raw_folder, 'train-images-idx3-ubyte'))
        test_set = read_image_file(os.path.join(raw_folder, 't10k-images-idx3-ubyte'))

        # preprocess: reshape and normalize from [0, 255] to [-1, 1]
        training_set = training_set.reshape(-1, 1, MNIST_WORKING_DIM, MNIST_WORKING_DIM) / 127.5 - 1
        test_set = test_set.reshape(-1, 1, MNIST_WORKING_DIM, MNIST_WORKING_DIM) / 127.5 - 1

        with open(os.path.join(data_dir, 'train_data.pt'), 'wb') as f:
            torch.save(training_set, f)
        with open(os.path.join(data_dir, 'test_data.pt'), 'wb') as f:
            torch.save(test_set, f)
        print('Done!')
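# A minimal usage sketch, assuming MNIST_WORKING_DIM = 28 (the native MNIST
# resolution) and read_image_file from torchvision.datasets.mnist:
#
#   download_mnist('./mnist')
#   train = torch.load('./mnist/train_data.pt')
#   train.shape   # torch.Size([60000, 1, 28, 28]), values in [-1, 1]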
def download(self):
    if os.path.exists(self.sonycust_folder):
        return
    os.makedirs(self.sonycust_folder, exist_ok=True)

    # Download files
    print("Downloading files")
    for url, filename, md5 in self.resources[4:]:
        filename = url.rpartition('/')[2]  # the name from the url overrides the listed one
        download_url(url, root=self.sonycust_folder, filename=filename, md5=md5)
    for url, filename, md5 in self.resources[0:4]:
        download_and_extract_archive(url, download_root=self.sonycust_folder,
                                     filename=filename + ".tar.gz", md5=md5,
                                     remove_finished=True)

    # Moving evaluation files to audio directory
    print("Moving files from eval to audio")
    for eval_num in range(3):
        for f in os.listdir(self.file_path_dict['audio-eval-' + str(eval_num)]):
            if f.endswith(".wav"):
                shutil.move(
                    os.path.join(self.file_path_dict['audio-eval-' + str(eval_num)], f),
                    self.file_path_dict['audio'])
def download(self):
    if self._check_integrity():
        return
    download_and_extract_archive(self.url, self.root, filename=self.filename, md5=self.tgz_md5)
def __init__(self, root, split, bands=None, transform=None, target_transform=None,
             download=False, use_new_labels=True):
    self.root = Path(root)
    self.split = split
    self.bands = bands if bands is not None else RGB_BANDS
    self.transform = transform
    self.target_transform = target_transform
    self.use_new_labels = use_new_labels

    if download:
        download_and_extract_archive(self.url, self.root)
        download_url(self.list_file[self.split], self.root, f'{self.split}.txt')
        for url in self.bad_patches:
            download_url(url, self.root)

    # collect the ids of patches known to be bad so they can be skipped
    bad_patches = set()
    for url in self.bad_patches:
        filename = Path(url).name
        with open(self.root / filename) as f:
            bad_patches.update(f.read().splitlines())

    self.samples = []
    with open(self.root / f'{self.split}.txt') as f:
        for patch_id in f.read().splitlines():
            if patch_id not in bad_patches:
                self.samples.append(self.root / self.subdir / patch_id)