def download_dataset(data_version: str):
    """Download the speech-commands archive to its canonical local path.

    :param data_version: specifies the version of the data to use (str {"0.01", "0.02"})
    """
    archive_url = "http://download.tensorflow.org/data/speech_commands_v{}.tar.gz".format(data_version)
    destination = get_dataset_filepath(data_version=data_version)
    urllib.request.urlretrieve(archive_url, destination)
def decompress_dataset(data_version: str):
    """Decompress the previously downloaded dataset archive into the training-data folder.

    :param data_version: specifies the version of the data to use (str {"0.01", "0.02"})
    :raises FileNotFoundError: if the archive has not been downloaded yet
    """
    fname = get_dataset_filepath(data_version=data_version)
    # Raise explicitly instead of `assert`: asserts are silently stripped
    # when Python runs with optimizations (`python -O`).
    if not os.path.exists(fname):
        raise FileNotFoundError(fname)
    # Context manager guarantees the tar handle is closed even if
    # extractall raises (the original leaked the handle on error).
    with tarfile.open(fname, "r:gz") as tar:
        tar.extractall(path=get_training_data_path(data_version=data_version))
def setUp(self):
    """Prepare fixture paths and make sure the dataset is available locally."""
    self.wav_filepath = "./tests/examples/testaudio.wav"
    self.data_version = "0.02"
    archive_path = get_dataset_filepath(data_version=self.data_version)
    # Fetch and unpack the dataset only when the archive is missing.
    if not os.path.exists(archive_path):
        download_dataset(data_version=self.data_version)
        decompress_dataset(data_version=self.data_version)
    self.known_commands = [
        "zero", "one", "two", "three", "four", "five", "six", "seven",
        "eight", "nine", "marvin", "sheila", "forward", "backward", "bed",
        "bird", "cat", "dog", "down", "follow", "go", "happy", "house",
        "learn", "no", "yes", "off", "on", "right", "left", "stop", "tree",
        "up", "visual", "wow"
    ]
def test_download_and_decompress_data(self):
    """Download and extract the dataset, unless a cached archive already exists."""
    data_version = "0.02"
    filepath = get_dataset_filepath(data_version=data_version)
    if os.path.exists(filepath):
        # Report an honest skip instead of the vacuous `assertTrue(True)`:
        # the test runner then shows this case as skipped, not passed.
        self.skipTest("dataset archive already present locally")
    # This branch runs on CI (e.g. Travis), where nothing is cached.
    download_dataset(data_version=data_version)
    self.assertTrue(os.path.exists(filepath))
    decompress_dataset(data_version=data_version)
    # A successful extraction leaves well over 10 entries in the data folder.
    self.assertLess(
        10,
        len(os.listdir(get_training_data_path(data_version=data_version))))
def test_get_dataset_filepath(self):
    """The dataset path must live in an existing directory and end in .tar.gz."""
    archive_path = get_dataset_filepath(data_version="unit_testing")
    parent_dir = os.path.split(archive_path)[0]
    self.assertTrue(os.path.exists(parent_dir))
    self.assertTrue(archive_path.endswith(".tar.gz"))
# Build a unique alias identifying this run (task + model + data version).
alias = f"{task}_m-{model_alias}_d-{data_version}"
assert task in available_tasks
known_commands = commands[task][:]  # copy so later mutation cannot alter the shared list
include_unknown = unknown_class_addition[task]
n_jobs = multiprocessing.cpu_count()

# Load the experiment settings ONCE under a context manager. The original
# called `json.load(open(...))` six times: six opens, six parses, and six
# file handles left for the garbage collector to close.
with open(experiment_settings_filepath) as settings_file:
    experiment_settings = json.load(settings_file)
n_epochs = experiment_settings["n_epochs"]
batch_size = experiment_settings["batch_size"]
run_in_gpu = experiment_settings["run_in_gpu"]
n_augmentations = experiment_settings["n_augmentations"]
bn_momentum = experiment_settings["bn_momentum"]
weight_decay = experiment_settings["weight_decay"]

# Download and decompress the data if necessary
if not os.path.exists(get_dataset_filepath(data_version)):
    download_dataset(data_version)
    decompress_dataset(data_version)

# Seed every RNG involved so augmentation generation is reproducible
random.seed(random_seed)
np.random.seed(random_seed)
torch.random.manual_seed(random_seed)

# Generate data augmentations if required
if n_augmentations > 0:
    train_files, _, _ = get_list_of_wav_paths(data_version=data_version)
    for i in range(n_augmentations):
        # Only populate augmentation folders that are still empty
        if len(os.listdir(get_augmented_data_folder(data_version, str(i)))) == 0:
            print("Generating augmentation no. {}".format(i))
            batch_augment_files(data_version=data_version,
                                list_of_files=train_files,
                                folder_name=str(i),
                                n_jobs=n_jobs)