def get_movielens(root, version='ml-20m'):
    """Download the MovieLens data if it doesn't exist."""
    urls = {
        'ml-latest': 'http://files.grouplens.org/datasets/movielens/ml-latest.zip',
        'ml-100k': 'http://files.grouplens.org/datasets/movielens/ml-100k.zip',
        'ml-1m': 'http://files.grouplens.org/datasets/movielens/ml-1m.zip',
        'ml-10m': 'http://files.grouplens.org/datasets/movielens/ml-10m.zip',
        'ml-20m': 'http://files.grouplens.org/datasets/movielens/ml-20m.zip'
    }
    # note: the default must be a key of `urls` ('ml-20m', with a hyphen),
    # otherwise the assert below fails for the default call.
    assert version in urls, f"version must be one of {set(urls)}"

    raw_folder = os.path.join(root, version, 'raw')
    processed_folder = os.path.join(root, version, 'processed')
    makedir_exist_ok(raw_folder)
    makedir_exist_ok(processed_folder)

    # download files and extract
    filename = urls[version].rpartition('/')[2]
    print('Downloading...')
    download_url(urls[version], root=raw_folder, filename=filename, md5=None)
    print('Extracting...')
    extract_file(os.path.join(raw_folder, filename), processed_folder)
    print('Done!')
    return Path(os.path.join(processed_folder, version))
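# Usage (a minimal sketch; assumes `download_url`, `extract_file`, and
# `makedir_exist_ok` are importable from the surrounding module and that
# `./data` is writable):
#
#     data_dir = get_movielens('./data', version='ml-100k')
#     print(data_dir)  # ./data/ml-100k/processed/ml-100k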
def __init__(self, root, transform=None, target_transform=None, download=False):
    super(Caltech256, self).__init__(os.path.join(root, 'caltech256'))
    makedir_exist_ok(self.root)
    self.transform = transform
    self.target_transform = target_transform

    if download:
        self.download()

    if not self._check_integrity():
        raise RuntimeError('Dataset not found or corrupted.'
                           ' You can use download=True to download it')

    self.categories = sorted(
        os.listdir(os.path.join(self.root, "256_ObjectCategories")))
    self.index = []
    self.y = []
    for (i, c) in enumerate(self.categories):
        n = len(os.listdir(os.path.join(self.root, "256_ObjectCategories", c)))
        self.index.extend(range(1, n + 1))
        self.y.extend(n * [i])
def download(self): """Download the MNIST data if it doesn't exist in processed_folder already.""" if self._check_exists(): return makedir_exist_ok(self.raw_folder) makedir_exist_ok(self.processed_folder) # download files for url in self.urls: filename = url.rpartition('/')[2] download_and_extract_archive(url, download_root=self.raw_folder, filename=filename) # process and save as torch files print('Processing...') training_set = ( read_image_file(os.path.join(self.raw_folder, 'train-images-idx3-ubyte')), read_label_file(os.path.join(self.raw_folder, 'train-labels-idx1-ubyte')) ) test_set = ( read_image_file(os.path.join(self.raw_folder, 't10k-images-idx3-ubyte')), read_label_file(os.path.join(self.raw_folder, 't10k-labels-idx1-ubyte')) ) with open(os.path.join(self.processed_folder, self.training_file), 'wb') as f: torch.save(training_set, f) with open(os.path.join(self.processed_folder, self.test_file), 'wb') as f: torch.save(test_set, f) print('Done!')
def download(self): """Downloads the amazon reviews dataset from the internet and preprocess it.""" if self.check_exists(): return makedir_exist_ok(self.raw_folder) makedir_exist_ok(self.processed_folder) filename = 'amazon_cells_labelled.txt' download_url(self.resource, self.raw_folder, filename=filename) print('Processing...') train_data, test_data, train_targets, test_targets = self.read_csv( filename) # Converts array data into pytorch tensors train_tensor = self.vectorizer(train_data) train_targets = torch.from_numpy(np.array(train_targets)).long() test_tensor = self.vectorizer(test_data) test_targets = torch.from_numpy(np.array(test_targets)).long() with open(os.path.join(self.processed_folder, self.training_file), 'wb') as f: torch.save((train_tensor, train_targets, self.vectorizer), f) with open(os.path.join(self.processed_folder, self.test_file), 'wb') as f: torch.save((test_tensor, test_targets, self.vectorizer), f) print('Done!')
def download(self): """Download 2DShapesStructure data if it doesn't exist in processed_folder already.""" if self._check_raw_exists(): return makedir_exist_ok(self.raw_folder) makedir_exist_ok(self.processed_folder) # download files for url_title, url in self.urls.items(): filename = url.rpartition('/')[2] file_path = os.path.join(self.raw_folder, filename) download_url(url, root=self.raw_folder) if filename.endswith('.zip'): self.extract_zip(zip_path=file_path, remove_finished=True) print(f'Fetched {filename}.') # training_set = ( # self.read_image_file(os.path.join(self.raw_folder, 'train-images-idx3-ubyte')), # self.read_label_file(os.path.join(self.raw_folder, 'train-labels-idx1-ubyte')) # ) # test_set = ( # self.read_image_file(os.path.join(self.raw_folder, 't10k-images-idx3-ubyte')), # self.read_label_file(os.path.join(self.raw_folder, 't10k-labels-idx1-ubyte')) # ) # with open(os.path.join(self.processed_folder, self.training_file), 'wb') as f: # torch.save(training_set, f) # with open(os.path.join(self.processed_folder, self.test_file), 'wb') as f: # torch.save(test_set, f) print('Done!')
def download(self):
    if self.check_exists():
        return True
    makedir_exist_ok(self.directory)
    for url, md5, filename in self.resources_url:
        download_and_extract_archive(url, download_root=self.directory,
                                     filename=filename, md5=md5)
    return False
def download(self):
    if self._check_exists():
        return

    makedir_exist_ok(self.root)
    print('Downloading Synthetic Digits...')
    gdown.download(self.url, os.path.join(self.root, self.filename), quiet=False)

    # unpack the compressed archive
    unpack_archive(os.path.join(self.root, self.filename), self.root)
def __init__(self, root, target_type="category", train=True, transform=None,
             target_transform=None, download=False):
    super(Caltech101, self).__init__(os.path.join(root, 'caltech101'))
    self.train = train
    self.dir_name = ('101_ObjectCategories_split/train' if self.train
                     else '101_ObjectCategories_split/test')
    makedir_exist_ok(self.root)
    if isinstance(target_type, list):
        self.target_type = target_type
    else:
        self.target_type = [target_type]
    self.transform = transform
    self.target_transform = target_transform

    if download:
        self.download()

    if not self._check_integrity():
        raise RuntimeError('Dataset not found or corrupted.'
                           ' You can use download=True to download it')

    self.categories = sorted(
        os.listdir(os.path.join(self.root, "101_ObjectCategories")))
    self.categories.remove("BACKGROUND_Google")  # this is not a real class

    # For some reason, the category names in "101_ObjectCategories" and
    # "Annotations" do not always match. This is a manual map between the
    # two. Defaults to using same name, since most names are fine.
    name_map = {
        "Faces": "Faces_2",
        "Faces_easy": "Faces_3",
        "Motorbikes": "Motorbikes_16",
        "airplanes": "Airplanes_Side_2"
    }
    self.annotation_categories = list(
        map(lambda x: name_map[x] if x in name_map else x, self.categories))

    self.index = []
    self.y = []
    for (i, c) in enumerate(self.categories):
        file_names = os.listdir(os.path.join(self.root, self.dir_name, c))
        n = len(file_names)
        self.index.extend(file_names)
        self.y.extend(n * [i])
    print(self.train, len(self.index))
def download_url(url, root, filename=None, md5=None):
    """Download a file from a url and place it in root.

    Args:
        url (str): URL to download file from
        root (str): Directory to place downloaded file in
        filename (str, optional): Name to save the file under.
            If None, use the basename of the URL
        md5 (str, optional): MD5 checksum of the download.
            If None, do not check
    """
    from six.moves import urllib

    root = os.path.expanduser(root)
    if not filename:
        filename = os.path.basename(url)
    fpath = os.path.join(root, filename)

    makedir_exist_ok(root)

    # check if file is already present locally
    if check_integrity(fpath, md5):
        print('Using downloaded and verified file: ' + fpath)
    else:
        # download the file
        try:
            if 'dropbox' in url:
                # Dropbox links need a wget-like user agent to serve the raw file
                import requests
                headers = {'user-agent': 'Wget/1.16 (linux-gnu)'}
                r = requests.get(url, stream=True, headers=headers)
                with open(fpath, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=1024):
                        if chunk:
                            f.write(chunk)
            elif 'Manual' in url:
                raise urllib.error.URLError(url)
            else:
                print('Downloading ' + url + ' to ' + fpath)
                urllib.request.urlretrieve(url, fpath,
                                           reporthook=gen_bar_updater())
        except (urllib.error.URLError, IOError) as e:
            if url[:5] == 'https':
                url = url.replace('https:', 'http:')
                print('Failed download. Trying https -> http instead.'
                      ' Downloading ' + url + ' to ' + fpath)
                urllib.request.urlretrieve(url, fpath,
                                           reporthook=gen_bar_updater())
            else:
                raise e

    # check integrity of downloaded file
    if not check_integrity(fpath, md5):
        raise RuntimeError("File not found or corrupted.")
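# Usage (a minimal sketch; md5=None skips verification, pass the known
# hex digest instead to verify the download):
#
#     download_url('http://files.grouplens.org/datasets/movielens/ml-100k.zip',
#                  root='./data/raw', filename='ml-100k.zip', md5=None)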
def download(self): """Download the COIL20 data if it doesn't exist already.""" # download files if self._check_exists(): return makedir_exist_ok(self.unprocessed_folder) makedir_exist_ok(self.processed_folder) # download files url, filename = self.type_list['processed'] download_url(url, root=self.processed_folder, filename=filename) url, filename = self.type_list['unprocessed'] download_url(url, root=self.unprocessed_folder, filename=filename)
def download(self): """Download the MNIST data if it doesn't exist in processed_folder already.""" if self._check_exists(): return makedir_exist_ok(self.raw_folder) makedir_exist_ok(self.processed_folder) # download files for url in self.resources: filename = url.rpartition('/')[2] download_and_extract_archive(url, download_root=self.raw_folder, filename=filename, md5=None) print('Downloaded!')
def download(self): """Download the weather data if it doesn't exist in data folder already.""" if self._check_exists(): return makedir_exist_ok(self.raw_folder) makedir_exist_ok(self.processed_folder) # download files for url in self.urls: filename = self.__class__.__name__.lower() + '.csv' download_url(url, root=self.raw_folder, filename=filename) # process and save as torch files print('Processing...') filepath = os.path.join(self.raw_folder, filename) df = self.process(filepath) # Get dataset statistics mean_per_day = df.groupby(df.index.dayofyear).mean() std_per_day = df.groupby(df.index.dayofyear).std() # Split into training and testing train, test = {}, {} for variable, column in self.variables.items(): df_variable = df[[column, 'dayofyear', 'year']].dropna() train_var = df_variable.groupby( 'dayofyear', as_index=False).apply(lambda x: x.sample( min(self.num_years_train, len(x)))).droplevel(0) test_var = df_variable.drop(train_var.index) train[variable] = train_var test[variable] = test_var # Save data training_set = (train, mean_per_day, std_per_day) test_set = (test, mean_per_day, std_per_day) with open(os.path.join(self.processed_folder, self.training_file), 'wb') as f: torch.save(training_set, f) with open(os.path.join(self.processed_folder, self.test_file), 'wb') as f: torch.save(test_set, f) print('Done!')
def get_criteo(root):
    """Download the Criteo data if it doesn't exist."""
    url = 'https://s3-eu-west-1.amazonaws.com/kaggle-display-advertising-challenge-dataset/dac.tar.gz'
    raw_folder = os.path.join(root, 'criteo', 'raw')
    processed_folder = os.path.join(root, 'criteo', 'processed')
    makedir_exist_ok(raw_folder)
    makedir_exist_ok(processed_folder)

    # download files and extract
    filename = url.rpartition('/')[2]
    print('Downloading...')
    download_url(url, root=raw_folder, filename=filename, md5=None)
    print('Extracting...')
    extract_file(os.path.join(raw_folder, filename), processed_folder)
    print('Done!')
    return Path(processed_folder)
def download(self):
    if self._check_exists():
        return

    makedir_exist_ok(os.path.join(self.root, self.raw_folder))
    makedir_exist_ok(os.path.join(self.root, self.processed_folder))

    print('Downloading ' + self.url)
    filename = self.url.rpartition('/')[2]
    file_path = os.path.join(self.root, self.raw_folder, filename)
    download_and_extract_archive(
        self.url,
        download_root=os.path.join(self.root, self.raw_folder),
        filename=filename)

    # process and save as torch files
    print('Processing...')

    # load MNIST-M images from the extracted pickle file
    with open(file_path.replace('.gz', ''), "rb") as f:
        mnist_m_data = pickle.load(f, encoding='bytes')
    mnist_m_train_data = torch.ByteTensor(mnist_m_data[b'train'])
    mnist_m_test_data = torch.ByteTensor(mnist_m_data[b'test'])

    # get MNIST labels
    mnist_train_labels = datasets.MNIST(root=self.mnist_root, train=True,
                                        download=True).train_labels
    mnist_test_labels = datasets.MNIST(root=self.mnist_root, train=False,
                                       download=True).test_labels

    # save MNIST-M dataset
    training_set = (mnist_m_train_data, mnist_train_labels)
    test_set = (mnist_m_test_data, mnist_test_labels)
    with open(os.path.join(self.root, self.processed_folder, self.training_file), 'wb') as f:
        torch.save(training_set, f)
    with open(os.path.join(self.root, self.processed_folder, self.test_file), 'wb') as f:
        torch.save(test_set, f)
    print('Done!')
def download(self): """Download the FEMNIST data if it doesn't exist in processed_folder already.""" import shutil if self._check_exists(): return utils.makedir_exist_ok(self.raw_folder) utils.makedir_exist_ok(self.processed_folder) # download files for url, md5 in self.resources: filename = url.rpartition('/')[2] utils.download_and_extract_archive(url, download_root=self.raw_folder, filename=filename, md5=md5) # process and save as torch files print('Processing...') shutil.move(os.path.join(self.raw_folder, self.training_file), self.processed_folder) shutil.move(os.path.join(self.raw_folder, self.test_file), self.processed_folder)
def load_dataset(self):
    img_filename = ("train-images-idx3-ubyte.gz" if self.train
                    else "t10k-images-idx3-ubyte.gz")
    lbl_filename = ("train-labels-idx1-ubyte.gz" if self.train
                    else "t10k-labels-idx1-ubyte.gz")

    # download data if it does not exist
    makedir_exist_ok(self.root)
    img_filepath = self.download(img_filename)
    lbl_filepath = self.download(lbl_filename)

    # load image data
    with gzip.open(img_filepath, "rb") as f:
        self.img_data = np.frombuffer(f.read(), np.uint8, offset=16)
    self.img_data = self.img_data.reshape(-1, 1, 28, 28)

    # load label data
    with gzip.open(lbl_filepath, "rb") as f:
        self.lbl_data = np.frombuffer(f.read(), np.uint8, offset=8)
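# The offsets above come from the IDX file format used by MNIST-style
# archives: image files begin with a 16-byte header (magic number, image
# count, rows, cols) and label files with an 8-byte header (magic number,
# label count), so np.frombuffer skips 16 and 8 bytes respectively before
# reading the raw uint8 pixel/label values.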
def download_and_process_data(self):
    # if we need to download all_data, create the root dir first.
    if self.download:
        data_utils.makedir_exist_ok(self.root)
        self.all_data_path = os.path.join(self.root, "all_data")
        self.all_data_tgz_file = self.all_data_path + ".tgz"

        if not os.path.exists(self.all_data_tgz_file):
            warnings.warn(
                "The compressed file is missing. It will take a while "
                "(at least hours) to download, uncompress and process the data.")

            # download and uncompress the data.
            print("download and extract archive.")
            for name, url in self.download_urls.items():
                torchvision.datasets.utils.download_and_extract_archive(
                    url=url, download_root=self.root, filename=name, md5=None)

            # process the data.
            self._process_data(self.all_data_path)
        else:
            print("Files already downloaded.")
            if not os.path.exists(self.all_data_path):
                tar_decompress_folder(self.all_data_tgz_file)

    # sample from all_data and remove invalid information.
    data_path = self._sample_data()
    # data_path = self._remove_invalid_user(data_path)

    # split the dataset.
    splitted_data_paths = self._split_data(data_path)

    # display the stats of the train/test data.
    (
        self.data_path,
        self.meta_data_path,
        self.meta_data,
    ) = self._load_meta_data_and_display_stat(splitted_data_paths)
def download(self): "Download MNIST-M if it does not exists and put into processed folder" # import packages import gzip import pickle from torchvision import datasets if self._check_exists(): return makedir_exist_ok(self.raw_folder) makedir_exist_ok(self.processed_folder) # download files filename = self.url.rpartition('/')[2] file_path = os.path.join(self.raw_folder, filename) download_and_extract_archive(self.url, download_root=self.raw_folder) # process and save as torch files print('Processing...') # load MNIST-M images from pkl file with open(file_path.replace('.gz', ''), 'rb') as f: mnistm_m_data = pickle.load(f, encoding='bytes') mnistm_m_train_data = torch.ByteTensor(mnistm_m_data[b"train"]) mnistm_m_test_data = torch.ByteTensor(mnistm_m_data[b"test"]) # get MNIST labels mnist_train_labels = MNIST(root=self.mnist_root, train=True, download=True).train_labels mnist_test_labels = MNIST(root=self.mnist_root, train=False, download=True).test_labels # save MNIST-M dataset training_set = (mnistm_m_train_data, mnist_train_labels) test_set = (mnistm_m_test_data, mnist_test_labels) with open(os.path.join(self.processed_folder, self.training_file), 'wb') as f: torch.save(training_set, f) with open(os.path.join(self.processed_folder, self.test_file), 'wb') as f: torch.save(test_set, f) print('Done!')
def __init__(self, root, transform, download):
    self.root = root
    self.transform = transform
    self.download = download
    self.training_file = 'training.pt'
    self.training_dir_path = os.path.join(self.root, self.__class__.__name__)
    self.training_file_path = os.path.join(self.training_dir_path,
                                           self.training_file)

    if not os.path.exists(self.training_file_path):
        print("generating extended training data...")
        makedir_exist_ok(self.training_dir_path)
        self.data, self.targets = self.generate_extended_data()
        with open(self.training_file_path, 'wb') as f:
            torch.save((self.data, self.targets), f)
    else:
        print("loading extended training data from file...")
        self.data, self.targets = torch.load(self.training_file_path)
def load_dataset(self, train):
    img_filename = ("train-images-idx3-ubyte.gz" if train
                    else "t10k-images-idx3-ubyte.gz")
    lbl_filename = ("train-labels-idx1-ubyte.gz" if train
                    else "t10k-labels-idx1-ubyte.gz")

    # download data if it does not exist
    makedir_exist_ok(self.root)
    img_filepath = self.download(self.root, img_filename)
    lbl_filepath = self.download(self.root, lbl_filename)

    # load image data and scale pixel values to [0, 1]
    with gzip.open(img_filepath, "rb") as f:
        img_data = np.frombuffer(f.read(), np.uint8, offset=16)
    img_data = img_data.reshape(-1, 1, 28, 28)
    img_data = img_data.astype(np.float64) / 255.  # np.float was removed in NumPy 1.24

    # load label data
    with gzip.open(lbl_filepath, "rb") as f:
        lbl_data = np.frombuffer(f.read(), np.uint8, offset=8)
    lbl_data = lbl_data.astype(np.int64)  # np.int was removed in NumPy 1.24

    return img_data, lbl_data
def load_dataset(root, training=True):
    img_filename = ("train-images-idx3-ubyte.gz" if training
                    else "t10k-images-idx3-ubyte.gz")
    lbl_filename = ("train-labels-idx1-ubyte.gz" if training
                    else "t10k-labels-idx1-ubyte.gz")

    # download data if it does not exist
    makedir_exist_ok(root)
    img_filepath = download(root, img_filename)
    lbl_filepath = download(root, lbl_filename)

    # load image data
    with gzip.open(img_filepath, "rb") as f:
        imgs = np.frombuffer(f.read(), np.uint8, offset=16)
    imgs = imgs.reshape(-1, 1, 28, 28)

    # load label data
    with gzip.open(lbl_filepath, "rb") as f:
        lbls = np.frombuffer(f.read(), np.uint8, offset=8)

    print('Loaded %d labels from %s' % (len(lbls), lbl_filename))
    print('Loaded %d images from %s' % (len(imgs), img_filename))
    return imgs, lbls
def download(self):
    makedir_exist_ok(self.root)
    for url in self.urls:
        filename = url.rpartition('/')[-1]
        download_url(url, root=self.root, filename=filename, md5=None)
def prepare_colored_mnist(self):
    colored_mnist_dir = os.path.join(self.root, 'ColoredMNIST')
    if os.path.exists(os.path.join(colored_mnist_dir, 'train1.pt')) \
            and os.path.exists(os.path.join(colored_mnist_dir, 'train2.pt')) \
            and os.path.exists(os.path.join(colored_mnist_dir, 'test.pt')):
        print('Colored MNIST dataset already exists')
        return

    print('Preparing Colored MNIST')
    train_mnist = datasets.mnist.MNIST(self.root, train=True, download=True)

    train1_set = []
    train2_set = []
    test_set = []
    for idx, (im, label) in enumerate(train_mnist):
        if idx % 10000 == 0:
            print(f'Converting image {idx}/{len(train_mnist)}')
        im_array = np.array(im)

        # assign a binary label y to the image based on the digit
        binary_label = 0 if label < 5 else 1

        # flip the label with 25% probability
        if np.random.uniform() < 0.25:
            binary_label = binary_label ^ 1

        # color the image either red or green according to its possibly flipped label
        color_red = binary_label == 0

        # flip the color with a probability e that depends on the environment
        if idx < 20000:
            # 20% in the first training environment
            if np.random.uniform() < 0.2:
                color_red = not color_red
        elif idx < 40000:
            # 10% in the second training environment
            if np.random.uniform() < 0.1:
                color_red = not color_red
        else:
            # 90% in the test environment
            if np.random.uniform() < 0.9:
                color_red = not color_red

        colored_arr = color_grayscale_arr(im_array, red=color_red)

        if idx < 20000:
            train1_set.append((Image.fromarray(colored_arr), binary_label))
        elif idx < 40000:
            train2_set.append((Image.fromarray(colored_arr), binary_label))
        else:
            test_set.append((Image.fromarray(colored_arr), binary_label))

    dataset_utils.makedir_exist_ok(colored_mnist_dir)
    torch.save(train1_set, os.path.join(colored_mnist_dir, 'train1.pt'))
    torch.save(train2_set, os.path.join(colored_mnist_dir, 'train2.pt'))
    torch.save(test_set, os.path.join(colored_mnist_dir, 'test.pt'))
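# `color_grayscale_arr` is referenced above but not defined in this snippet.
# A minimal sketch of what it plausibly does (this exact implementation is
# an assumption, not the author's code): place the grayscale digit into the
# red or green channel of an RGB array.
import numpy as np

def color_grayscale_arr(arr, red=True):
    """Convert a (28, 28) grayscale array to (28, 28, 3), colored red or green."""
    h, w = arr.shape
    colored = np.zeros((h, w, 3), dtype=np.uint8)
    channel = 0 if red else 1  # channel 0 = red, channel 1 = green
    colored[:, :, channel] = arr
    return colored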
def _split_data(self, data_path):
    print(f"split the data for path={data_path}, "
          f"split_by_sample={self.split_by_sample}.")

    if not self.split_by_sample:
        # i.e. we will split by user.
        # one pass through all the json files to build a list containing
        # all possible (user, [hierarchy,] num_samples, .json file name) tuples
        user_files = []
        for f in os.listdir(data_path):
            file_dir = os.path.join(data_path, f)
            with open(file_dir, "r") as inf:
                # Load data into an OrderedDict, to prevent ordering changes
                # and enable reproducibility
                data = json.load(inf, object_pairs_hook=collections.OrderedDict)
            include_hierarchy = "hierarchies" in data
            if include_hierarchy:
                user_files.extend([
                    (u, h, ns, f)
                    for (u, h, ns) in zip(data["users"], data["hierarchies"],
                                          data["num_samples"])
                ])
            else:
                user_files.extend([
                    (u, ns, f)
                    for (u, ns) in zip(data["users"], data["num_samples"])
                ])

        # randomly sample from user_files to pick training-set users
        num_users = len(user_files)
        num_train_users = int(self.train_split_ratio * num_users)
        indices = list(range(num_users))
        train_indices = self.rng_state.sample(indices, num_train_users)
        train_blist = [False] * num_users
        for i in train_indices:
            train_blist[i] = True

        train_user_files = []
        test_user_files = []
        for i in range(num_users):
            if train_blist[i]:
                train_user_files.append(user_files[i])
            else:
                test_user_files.append(user_files[i])

        # the by-user split is not implemented beyond this point.
        assert False, "TODO..."
    else:
        train_data_path = data_path + "_train"
        test_data_path = data_path + "_test"
        train_meta_data_path = train_data_path + "_meta.json"
        test_meta_data_path = test_data_path + "_meta.json"

        is_finished_splitting = (
            os.path.exists(train_data_path) and os.path.exists(test_data_path)
            and len(os.listdir(train_data_path)) == len(os.listdir(test_data_path))
            and len(os.listdir(train_data_path)) > 100
            and os.path.exists(train_meta_data_path)
            and os.path.exists(test_meta_data_path))
        if is_finished_splitting:
            print("the split files already exist (exit).")
            return train_data_path, test_data_path

        print("\tsplitting the dataset into train/test by sample.")
        data_utils.makedir_exist_ok(train_data_path)
        data_utils.makedir_exist_ok(test_data_path)
        meta_train = collections.defaultdict(list)
        meta_test = collections.defaultdict(list)

        for f in os.listdir(data_path):
            file_dir = os.path.join(data_path, f)
            with open(file_dir, "r") as inf:
                # Load data into an OrderedDict, to prevent ordering changes
                # and enable reproducibility
                data = json.load(inf, object_pairs_hook=collections.OrderedDict)
            print(f'\twe have {len(data["users"])} users.')

            for i, u in enumerate(data["users"]):
                curr_num_samples = len(data["user_data"][u]["y"])
                if curr_num_samples >= 2:
                    # ensure the numbers of train and test samples are both >= 1
                    num_train_samples = max(
                        1, int(self.train_split_ratio * curr_num_samples))
                    if curr_num_samples == 2:
                        num_train_samples = 1

                    indices = list(range(curr_num_samples))
                    train_indices = self.rng_state.sample(indices, num_train_samples)
                    test_indices = [j for j in range(curr_num_samples)
                                    if j not in train_indices]

                    # if we have a valid train/test split.
                    if len(train_indices) >= 1 and len(test_indices) >= 1:
                        user_data_train = {"x": [], "y": []}
                        user_data_test = {"x": [], "y": []}
                        train_blist = [False] * curr_num_samples
                        test_blist = [False] * curr_num_samples
                        for j in train_indices:
                            train_blist[j] = True
                        for j in test_indices:
                            test_blist[j] = True

                        for j in range(curr_num_samples):
                            if train_blist[j]:
                                user_data_train["x"].append(data["user_data"][u]["x"][j])
                                user_data_train["y"].append(data["user_data"][u]["y"][j])
                            elif test_blist[j]:
                                user_data_test["x"].append(data["user_data"][u]["x"][j])
                                user_data_test["y"].append(data["user_data"][u]["y"][j])

                        # save the data to disk.
                        all_data_train = {
                            "user_data": user_data_train,
                            "hierarchies": data["hierarchies"][i]
                            if "hierarchies" in data else None,
                        }
                        all_data_test = {
                            "user_data": user_data_test,
                            "hierarchies": data["hierarchies"][i]
                            if "hierarchies" in data else None,
                        }
                        meta_train["users"].append(u)
                        meta_test["users"].append(u)
                        meta_train["num_samples"].append(len(user_data_train["x"]))
                        meta_test["num_samples"].append(len(user_data_test["x"]))

                        # save to path.
                        jump_json(all_data_train,
                                  os.path.join(train_data_path, f"{u}.json"))
                        jump_json(all_data_test,
                                  os.path.join(test_data_path, f"{u}.json"))
            print(f"\tsplit {f}. processed {len(meta_train['users'])} users.")

        # save the meta data to disk.
        jump_json(meta_train, train_meta_data_path)
        jump_json(meta_test, test_meta_data_path)
        return train_data_path, test_data_path
def preprocess(self, fold_list):
    makedir_exist_ok(self.audio_folder)
    makedir_exist_ok(self.processed_folder)
    self._read_metadata(fold_list)

    # pre-process
    file_names = self.metadata['file_names']
    labels = self.metadata['labels']
    folders = self.metadata['folders']

    data = []
    targets = []
    start = time.time()
    for idx, (file_name, label, folder) in enumerate(zip(file_names, labels, folders)):
        wav_file_path = os.path.join(self.audio_folder,
                                     "fold{}".format(folder), file_name)
        sound, sr = librosa.load(wav_file_path, mono=True, res_type='kaiser_fast')

        # # pad/crop to a temporary 4-second buffer (4 sec * 8 kHz)
        # tempSound = torch.zeros(4 * 8000)
        # if len(sound) < 4 * 8000:
        #     tempSound[:len(sound)] = torch.FloatTensor(sound[:])
        # else:
        #     tempSound[:] = torch.FloatTensor(sound[:4 * 8000])
        # sound = tempSound

        target = label
        X, sample_rate = sound, sr

        # compute a 40-band log-mel spectrogram and average it over time.
        # (alternative features were also tried: MFCCs, chroma, mel,
        # spectral contrast and tonnetz, stacked via np.hstack.)
        melspec = librosa.feature.melspectrogram(y=X, sr=sample_rate, n_mels=40)
        logspec = librosa.power_to_db(melspec, ref=np.max)
        log_mel = np.mean(logspec.T, axis=0)
        S = np.hstack([log_mel])
        S = torch.FloatTensor(S)

        data.append(S)
        targets.append(target)

        end = time.time()
        if idx % 100 == 0:
            print("(%s) %04d/%04d processed. (%.4f (sec.))"
                  % ("train" if self.train else "test",
                     idx + 1, len(file_names), end - start))

    if self.train:
        training_set = (torch.stack(data), torch.LongTensor(targets))
        with open(os.path.join(self.processed_folder, self.training_file), 'wb') as f:
            torch.save(training_set, f)
    else:
        test_set = (torch.stack(data), torch.LongTensor(targets))
        with open(os.path.join(self.processed_folder, self.test_file), 'wb') as f:
            torch.save(test_set, f)
    print('Done!')
def _remove_invalid_user(self, data_path):
    if self.min_samples_per_user == 0:
        print("skip the filtering due to min_samples_per_user=0.")
        return data_path

    # build the folder and filter users.
    folder_name = os.path.join(self.root, "filtered_" + data_path)
    folder_tgz = folder_name + ".tgz"

    # init.
    print(f"filter sampled data and will save to {folder_name}")
    data_utils.makedir_exist_ok(folder_name)
    is_finished_filtering = len(os.listdir(folder_name)) == len(
        os.listdir(self.all_data_path))
    if os.path.exists(folder_tgz) and is_finished_filtering:
        print("already finished filtering and compressing the sampled data.")
        return folder_name

    # start filtering.
    if not is_finished_filtering:
        for f in [f for f in os.listdir(data_path) if f.endswith(".json")]:
            users = []
            hierarchies = []
            num_samples = []
            user_data = {}

            # load the data.
            file_dir = os.path.join(data_path, f)
            data = load_json(file_dir)

            num_users = len(data["users"])
            for i in range(num_users):
                curr_user = data["users"][i]
                curr_hierarchy = None
                if "hierarchies" in data:
                    curr_hierarchy = data["hierarchies"][i]
                curr_num_samples = data["num_samples"][i]

                if curr_num_samples >= self.min_samples_per_user:
                    user_data[curr_user] = data["user_data"][curr_user]
                    users.append(curr_user)
                    if curr_hierarchy is not None:
                        hierarchies.append(curr_hierarchy)
                    num_samples.append(data["num_samples"][i])

            # save the valid data.
            all_data = {}
            all_data["users"] = users
            if len(hierarchies) == len(users):
                all_data["hierarchies"] = hierarchies
            all_data["num_samples"] = num_samples
            all_data["user_data"] = user_data

            file_path = os.path.join(folder_name, f)
            print(f"\tsave filtered and sampled json file to path={file_path}.")
            jump_json(all_data, file_path=file_path)

    print("save data to tgz file.")
    tar_compress_folder(folder_name)
    return folder_name
def _process_data(self, final_data_path):
    # create the intermediate dir.
    intermediate_path = os.path.join(self.root, "intermediate")
    if not os.path.exists(intermediate_path):
        data_utils.makedir_exist_ok(intermediate_path)

    # extract file directories of images (by class).
    class_file_dirs_path = os.path.join(intermediate_path, "class_file_dirs.pickle")
    if not os.path.exists(class_file_dirs_path):
        print("extract file directories of images by class.")
        class_files = []  # (class, file directory)

        # init dir.
        class_dir = os.path.join(self.root, "by_class")
        classes = [c for c in os.listdir(class_dir) if len(c) == 2]

        # extract files.
        for cl in classes:
            cldir = os.path.join(class_dir, cl)
            subcls = [s for s in os.listdir(cldir)
                      if ("hsf" in s) and ("mit" not in s)]
            for subcl in subcls:
                subcldir = os.path.join(cldir, subcl)
                image_dirs = [os.path.join(subcldir, i)
                              for i in os.listdir(subcldir)]
                for image_dir in image_dirs:
                    class_files.append((cl, image_dir))
        print(f"extract file by class: # of samples={len(class_files)}. "
              f"saving to path={class_file_dirs_path}.")
        save_obj(class_files, class_file_dirs_path)

    # extract file directories of images (by writer).
    writer_file_dirs_path = os.path.join(intermediate_path, "writer_file_dirs.pickle")
    if not os.path.exists(writer_file_dirs_path):
        print("extract file directories of images by writer.")
        writer_files = []  # (writer, file directory)
        writer_dir = os.path.join(self.root, "by_write")
        writer_parts = os.listdir(writer_dir)

        # init dir.
        for writer_part in writer_parts:
            writers_dir = os.path.join(writer_dir, writer_part)
            writers = os.listdir(writers_dir)
            for writer in writers:
                _writer_dir = os.path.join(writers_dir, writer)
                wtypes = os.listdir(_writer_dir)
                for wtype in wtypes:
                    type_dir = os.path.join(_writer_dir, wtype)
                    images = os.listdir(type_dir)
                    image_dirs = [os.path.join(type_dir, i) for i in images]
                    for image_dir in image_dirs:
                        writer_files.append((writer, image_dir))
        print(f"extract file by writer: # of samples={len(writer_files)}. "
              f"saving to path={writer_file_dirs_path}.")
        save_obj(writer_files, writer_file_dirs_path)

    # get the hash for each class image.
    class_file_hashes_path = os.path.join(intermediate_path, "class_file_hashes.pickle")
    if not os.path.exists(class_file_hashes_path):
        # init.
        count = 0
        class_file_hashes = []
        class_file_dirs = load_obj(class_file_dirs_path)
        print("get the image hashes (by class).")

        # get the hashes.
        for (cclass, cfile) in class_file_dirs:
            if count % 100000 == 0:
                print("\thashed %d class images" % count)
            chash = hashlib.md5(open(cfile, "rb").read()).hexdigest()
            class_file_hashes.append((cclass, cfile, chash))
            count += 1
        save_obj(class_file_hashes, class_file_hashes_path)

    # get the hash for each writer image.
    writer_file_hashes_path = os.path.join(intermediate_path, "writer_file_hashes.pickle")
    if not os.path.exists(writer_file_hashes_path):
        # init.
        count = 0
        writer_file_hashes = []
        writer_file_dirs = load_obj(writer_file_dirs_path)
        print("get the image hashes (by writer).")

        for (cwriter, cfile) in writer_file_dirs:
            if count % 100000 == 0:
                print("hashed %d writer images" % count)
            chash = hashlib.md5(open(cfile, "rb").read()).hexdigest()
            writer_file_hashes.append((cwriter, cfile, chash))
            count += 1
        save_obj(writer_file_hashes, writer_file_hashes_path)

    # check the hash and assign class labels to writers.
    class_file_hashes = load_obj(class_file_hashes_path)    # each elem is (class, file dir, hash)
    writer_file_hashes = load_obj(writer_file_hashes_path)  # each elem is (writer, file dir, hash)

    writer_with_class_path = os.path.join(intermediate_path, "writer_with_class.pickle")
    if not os.path.exists(writer_with_class_path):
        print("assigning class labels to writer images.")
        class_hash_dict = {}
        for i in range(len(class_file_hashes)):
            (c, f, h) = class_file_hashes[len(class_file_hashes) - i - 1]
            class_hash_dict[h] = (c, f)

        writer_classes = []
        for (w, f, h) in writer_file_hashes:
            writer_classes.append((w, f, class_hash_dict[h][0]))
        save_obj(writer_classes, writer_with_class_path)

    # group images by writer.
    writer_class = load_obj(writer_with_class_path)
    images_by_writer_path = os.path.join(intermediate_path, "images_by_writer.pickle")
    if not os.path.exists(images_by_writer_path):
        print("write images_by_writer")
        # each entry is a (writer, [list of (file, class)]) tuple
        writers = []
        cimages = []
        (cw, _, _) = writer_class[0]
        for (w, f, c) in writer_class:
            if w != cw:
                # new writer: flush the previous one and start a fresh list
                # (the else avoids appending the first image twice).
                writers.append((cw, cimages))
                cw = w
                cimages = [(f, c)]
            else:
                cimages.append((f, c))
        writers.append((cw, cimages))

        # save obj.
        save_obj(writers, images_by_writer_path)

    # create the final data json.
    # converts a list of (writer, [list of (file, class)]) tuples into a json object
    # of the form:
    #   {users: [bob, etc.], num_samples: [124, etc.],
    #    user_data: {bob: {x: [img1, img2, etc.], y: [class1, class2, etc.]}, etc.}}
    # where 'img_' is a vectorized representation of the corresponding image.
    def relabel_class(c):
        """Map a hexadecimal class value (string) to a decimal number.

        Returns:
            - 0 through 9 for classes representing the respective digits
            - 10 through 35 for classes representing the respective uppercase letters
            - 36 through 61 for classes representing the respective lowercase letters
        """
        if c.isdigit() and int(c) < 40:
            return int(c) - 30
        elif int(c, 16) <= 90:  # uppercase
            return int(c, 16) - 55
        else:
            return int(c, 16) - 61

    def write_to_json_file(users, num_samples, user_data, json_index):
        all_data = {}
        all_data["users"] = users
        all_data["num_samples"] = num_samples
        all_data["user_data"] = user_data

        file_name = "all_data_%d.json" % json_index
        file_path = os.path.join(final_data_path, file_name)
        print("writing %s" % file_name)
        jump_json(all_data, file_path)

    def write_to_json_files(all_writers, max_writers):
        writer_count = 0
        json_index = 0
        users = []
        num_samples = []
        user_data = {}

        for (w, l) in all_writers:
            users.append(w)
            num_samples.append(len(l))
            user_data[w] = {"x": [], "y": []}

            size = self.img_size, self.img_size  # original image size is 128 x 128
            for (f, c) in l:
                img = Image.open(f)
                gray = img.convert("L")
                gray.thumbnail(size, Image.LANCZOS)  # Image.ANTIALIAS was removed in Pillow 10
                arr = np.asarray(gray).copy()
                vec = arr.flatten()
                vec = vec / 255  # scale all pixel values to between 0 and 1
                vec = vec.tolist()

                nc = relabel_class(c)
                user_data[w]["x"].append(vec)
                user_data[w]["y"].append(nc)
            writer_count += 1

            # start a new json file once max_writers is reached.
            if writer_count == max_writers:
                write_to_json_file(users, num_samples, user_data, json_index)
                # reinit.
                writer_count = 0
                json_index += 1
                users[:] = []
                num_samples[:] = []
                user_data.clear()

        # in case we have something left.
        if writer_count > 0:
            write_to_json_file(users, num_samples, user_data, json_index)

    # start the final processing.
    if not os.path.exists(final_data_path):
        data_utils.makedir_exist_ok(final_data_path)

    MAX_WRITERS = 100  # max number of writers per json file.
    writers = load_obj(images_by_writer_path)
    num_json_files = int(math.ceil(len(writers) / MAX_WRITERS))
    if len([x for x in os.listdir(final_data_path) if "json" in x]) != num_json_files:
        print(f"final step for creating all data 1: save the json files to disk. "
              f"we have {num_json_files} json files.")
        write_to_json_files(writers, MAX_WRITERS)

    if not os.path.exists(self.all_data_tgz_file):
        print("final step for creating all data 2: save them to a tgz file.")
        tar_compress_folder(final_data_path)
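# A quick worked example of `relabel_class` above: NIST stores classes as
# hex ASCII codes, so '37' -> int('37') - 30 = 7 (digit),
# '41' -> int('41', 16) - 55 = 65 - 55 = 10 (i.e. 'A'), and
# '61' -> int('61', 16) - 61 = 97 - 61 = 36 (i.e. 'a'),
# giving 62 classes in total.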
from torchvision.datasets.utils import download_url, makedir_exist_ok

raw_folder = './emnist_data'
url = 'http://www.itl.nist.gov/iaui/vip/cs_links/EMNIST/gzip.zip'
md5 = "58c8d27c78d21e728a6bc7b3cc06412e"

makedir_exist_ok(raw_folder)

# download files
print('Downloading zip archive')
download_url(url, root=raw_folder, filename="emnist.zip", md5=md5)
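# Note: `makedir_exist_ok` was removed from torchvision.datasets.utils in
# later releases. A minimal drop-in shim, assuming only the
# directory-creation behavior is needed:
import os

def makedir_exist_ok(dirpath):
    """Create a directory (and parents), ignoring it if it already exists."""
    os.makedirs(dirpath, exist_ok=True)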
def _sample_data(self):
    new_user_count = 0  # for the iid case
    folder_name = os.path.join(
        self.root,
        f"sampled_{'iid' if self.is_iid_sample else 'niid'}_data_fraction-{self.data_fraction}"
        f"{f'_user_fraction-{self.user_fraction}' if self.is_iid_sample else ''}",
    )
    folder_tgz = folder_name + ".tgz"

    # build or extract.
    print(f"sample data and will save to {folder_name}")
    data_utils.makedir_exist_ok(folder_name)
    is_finished_sampling = len(os.listdir(folder_name)) == len(
        os.listdir(self.all_data_path))
    if os.path.exists(folder_tgz) and is_finished_sampling:
        print("already finished sampling and compressing the sampled data.")
        return folder_name
    elif os.path.exists(folder_tgz):
        print("the sampling has not been finished (but we have the tgz file), "
              "so let's decompress the tgz file.")
        tar_decompress_folder(folder_tgz)
        return folder_name

    # (rough) check of the number of files in folder_name.
    if not is_finished_sampling:
        print("the number of sampled json files is incorrect. sample again.")
        for f in os.listdir(self.all_data_path):
            file_dir = os.path.join(self.all_data_path, f)
            with open(file_dir, "r") as inf:
                # Load data into an OrderedDict, to prevent ordering changes
                # and enable reproducibility
                data = json.load(inf, object_pairs_hook=collections.OrderedDict)

            # get some meta info.
            num_users = len(data["users"])
            tot_num_samples = sum(data["num_samples"])
            num_new_samples = int(self.data_fraction * tot_num_samples)
            hierarchies = None

            if self.is_iid_sample:
                # iid sampling: pool all samples, subsample, then divide evenly.
                raw_list = list(data["user_data"].values())
                raw_x = [elem["x"] for elem in raw_list]
                raw_y = [elem["y"] for elem in raw_list]
                x_list = [item for sublist in raw_x for item in sublist]  # flatten raw_x
                y_list = [item for sublist in raw_y for item in sublist]  # flatten raw_y

                # get new users and new indices.
                num_new_users = max(int(round(self.user_fraction * num_users)), 1)
                indices = list(range(tot_num_samples))
                new_indices = self.rng_state.sample(indices, num_new_samples)
                users = [str(i + new_user_count) for i in range(num_new_users)]

                # get the new data and divide it (iid).
                user_data = dict(
                    (user, collections.defaultdict(list)) for user in users)
                all_x_samples = [x_list[i] for i in new_indices]
                all_y_samples = [y_list[i] for i in new_indices]
                x_groups = iid_divide(all_x_samples, num_new_users)
                y_groups = iid_divide(all_y_samples, num_new_users)

                # assign the info.
                for i in range(num_new_users):
                    user_data[users[i]]["x"] = x_groups[i]
                    user_data[users[i]]["y"] = y_groups[i]
                num_samples = [len(user_data[u]["y"]) for u in users]
                new_user_count += num_new_users
            else:
                # non-iid sampling: keep whole users until the sample budget is met.
                ctot_num_samples = 0
                users = data["users"]
                users_and_hiers = None
                if "hierarchies" in data:
                    users_and_hiers = list(zip(users, data["hierarchies"]))
                    self.rng_state.shuffle(users_and_hiers)
                    hierarchies = []
                else:
                    self.rng_state.shuffle(users)

                # init for the sampling (by user).
                user_i = 0
                num_samples = []
                user_data = {}

                # sample by user.
                while ctot_num_samples < num_new_samples:
                    if users_and_hiers is not None:
                        user, hier = users_and_hiers[user_i]
                        hierarchies.append(hier)
                    else:
                        user = users[user_i]

                    cdata = data["user_data"][user]
                    cnum_samples = len(data["user_data"][user]["y"])

                    # if taking the whole user would overshoot the budget,
                    # subsample this user's data.
                    if ctot_num_samples + cnum_samples > num_new_samples:
                        cnum_samples = num_new_samples - ctot_num_samples
                        indices = list(range(cnum_samples))
                        new_indices = self.rng_state.sample(indices, cnum_samples)
                        x = []
                        y = []
                        for i in new_indices:
                            x.append(data["user_data"][user]["x"][i])
                            y.append(data["user_data"][user]["y"][i])
                        cdata = {"x": x, "y": y}

                    num_samples.append(cnum_samples)
                    user_data[user] = cdata
                    ctot_num_samples += cnum_samples
                    user_i += 1

                if "hierarchies" in data:
                    users = [u for u, h in users_and_hiers][:user_i]
                else:
                    users = users[:user_i]

            # create the .json file.
            all_data = {}
            all_data["users"] = users
            if hierarchies is not None:
                all_data["hierarchies"] = hierarchies
            all_data["num_samples"] = num_samples
            all_data["user_data"] = user_data

            # save to json file.
            file_path = os.path.join(folder_name, f)
            print(f"\tsave sampled json file to path={file_path}.")
            jump_json(all_data, file_path=file_path)

    print("save data to tgz file.")
    tar_compress_folder(folder_name)
    return folder_name
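# `iid_divide` is used above but not defined in this snippet. A sketch of
# the behavior its call site implies, splitting a list into g roughly
# equal groups; the exact implementation is an assumption:
def iid_divide(l, g):
    """Divide list `l` into `g` groups whose sizes differ by at most 1."""
    num_elems = len(l)
    group_size = num_elems // g
    num_big_groups = num_elems - g * group_size  # groups that get one extra element
    glist = []
    bi = 0
    for i in range(g):
        size = group_size + (1 if i < num_big_groups else 0)
        glist.append(l[bi:bi + size])
        bi += size
    return glist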