parser.add_argument('--data_location', type=str, default="/Users/quentin/Computer/DataSet/Music/speech_music_detection/", help='the location of the data') parser.add_argument('--model', type=str, default="trained/model.hdf5", help='path of the model to load when the starting is resumed') parser.add_argument('--mean_path', type=str, default="trained/mean.npy", help='path of the mean of the normalization applied with the model') parser.add_argument('--std_path', type=str, default="trained/std.npy", help='path of the std of the normalization applied with the model') args = parser.parse_args() experiments = utils.load_json('experiments.json') cfg = experiments[args.config] print("Creating the dataset..") datasets_config = utils.load_json("datasets.json") dataset = DatasetLoader( cfg["dataset"], args.data_location, datasets_config) print("Creating the data generator..") val_set = DataGenerator(dataset.get_val_set(), cfg["batch_size"], cfg["target_seq_length"], validation_data_processing, dataset.get_training_mean(),
def __init__(self, datasets, dataset_folder, datasets_config, verify=False):
    """Index the preprocessed datasets and set up the training mean/std.

    For every name in ``datasets`` the file-list folder and the data
    folder are located (both carry a ``_<SAMPLING_RATE>_<AUDIO_MAX_LENGTH>
    _<N_MELS>`` suffix taken from ``config``).  Each split file list found
    there is handed to ``self.load_list`` together with the matching
    train/val/test dictionary, and the frame counts stored in
    ``info.json`` are summed into the ``n_frame*`` entries.

    The normalization statistics are read from
    ``checkpoint/mean<key>.npy`` and ``checkpoint/std<key>.npy`` when both
    files exist.  Otherwise the per-dataset ``mean.npy`` / ``var.npy``
    files are merged with ``self.combine_means`` / ``self.combine_var``
    and the result is written to those two paths.  ``<key>`` is the
    concatenation of ``"_" + name`` for every dataset name.

    Args:
        datasets: dataset names; each one is used as a key of
            ``datasets_config``.
        dataset_folder: folder that the configured ``filelists_folder``
            and ``data_folder`` entries are joined onto.
        datasets_config: mapping ``name -> {"filelists_folder": ...,
            "data_folder": ...}``.
        verify: stored as ``self.verify``; not used further in this
            method.

    Raises:
        ValueError: if the suffixed file-list folder or data folder of a
            dataset does not exist.
    """
    self.cfg = datasets_config
    self.verify = verify

    self.train_set = {
        "mixed": [],
        "music": [],
        "speech": [],
        "noise": [],
        "n_frame": 0,
        "n_frame_mixed": 0,
        "n_frame_speech": 0,
        "n_frame_music": 0,
        "n_frame_noise": 0,
    }
    self.val_set = {
        "mixed": [],
        "speech": [],
        "music": [],
        "noise": [],
        "n_frame": 0,
    }
    self.test_set = {"mixed": [], "n_frame": 0}

    # NOTE(review): the cache key only contains the dataset names, so
    # statistics cached under other preprocessing hyper-parameters would be
    # picked up again -- confirm this is intended.
    dataset_str = "".join("_" + dataset for dataset in datasets)
    mean_cache = "checkpoint/mean" + dataset_str + ".npy"
    std_cache = "checkpoint/std" + dataset_str + ".npy"

    cached_mean_std = os.path.isfile(mean_cache) and os.path.isfile(std_cache)
    if cached_mean_std:
        self.train_mean = np.load(mean_cache)
        self.train_std = np.load(std_cache)

    # Per-dataset frame counts (from info.json) and statistics; ms/vs stay
    # empty when the cached statistics are used.
    ns, ns_val, ns_test = [], [], []
    ns_mixed, ns_speech, ns_music, ns_noise = [], [], [], []
    ms, vs = [], []

    # (marker in the file name, audio type, destination set), checked in
    # this order.
    split_lists = (
        ("mixed_train", "mixed", self.train_set),
        ("music_train", "music", self.train_set),
        ("speech_train", "speech", self.train_set),
        ("noise_train", "noise", self.train_set),
        ("noise_val", "noise", self.val_set),
        ("mixed_val", "mixed", self.val_set),
        ("speech_val", "speech", self.val_set),
        ("music_val", "music", self.val_set),
        ("mixed_test", "mixed", self.test_set),
    )

    # The suffix does not depend on the dataset, so build it once.
    suffix = '_' + str(config.SAMPLING_RATE) + '_' + str(
        config.AUDIO_MAX_LENGTH) + '_' + str(config.N_MELS)

    for dataset in datasets:
        filelist_path = os.path.join(
            dataset_folder, self.cfg[dataset]["filelists_folder"] + suffix)
        data_path = os.path.join(
            dataset_folder, self.cfg[dataset]["data_folder"] + suffix)
        if not os.path.exists(filelist_path) or not os.path.exists(data_path):
            raise ValueError(
                "The datatset " + dataset +
                " is unfound or has not been preprocessed for the chosen hyper-parameters."
            )

        for file in glob.glob(filelist_path + "/*"):
            # Match on the file name only: a marker appearing in one of the
            # parent directory names must not decide how the file is used.
            name = os.path.basename(file)
            split = next(
                (entry for entry in split_lists if entry[0] in name), None)
            if split is not None:
                _, audio_type, target_set = split
                self.load_list(file, audio_type, target_set, data_path)
            elif "info.json" in name:
                info = utils.load_json(file)
                ns.append(info["N_FRAME_TRAIN"])
                ns_val.append(info["N_FRAME_VAL"])
                ns_test.append(info["N_FRAME_TEST"])
                ns_mixed.append(info["N_FRAME_TRAIN_MIXED"])
                ns_speech.append(info["N_FRAME_TRAIN_SPEECH"])
                ns_music.append(info["N_FRAME_TRAIN_MUSIC"])
                ns_noise.append(info["N_FRAME_TRAIN_NOISE"])
            elif "mean.npy" in name:
                if not cached_mean_std:
                    ms.append(np.load(file))
            elif "var.npy" in name:
                if not cached_mean_std:
                    vs.append(np.load(file))

    if not cached_mean_std:
        self.train_mean = self.combine_means(ms, ns)
        self.train_std = np.sqrt(
            self.combine_var(vs, ns, ms, self.train_mean))
        # The cache folder may not exist yet (e.g. fresh checkout); without
        # this np.save raises after all the loading work has been done.
        os.makedirs(os.path.dirname(mean_cache), exist_ok=True)
        np.save(mean_cache, self.train_mean)
        np.save(std_cache, self.train_std)

    self.train_set["n_frame"] = np.sum(ns)
    self.val_set["n_frame"] = np.sum(ns_val)
    self.test_set["n_frame"] = np.sum(ns_test)
    self.train_set["n_frame_mixed"] = np.sum(ns_mixed)
    self.train_set["n_frame_speech"] = np.sum(ns_speech)
    self.train_set["n_frame_music"] = np.sum(ns_music)
    self.train_set["n_frame_noise"] = np.sum(ns_noise)
def data():
    """Build the training and validation data generators.

    Loads ``datasets.json``, creates a ``DatasetLoader`` for the datasets
    listed in the local ``cfg`` and wraps its train and validation sets in
    two ``DataGenerator`` objects, both normalized with the training
    mean/std reported by the loader.

    Returns:
        tuple: ``(train_set, val_set)``.
    """
    # Not read anywhere inside this function.
    # NOTE(review): confirm whether external tooling relies on it before
    # removing.
    n_eval = 0

    cfg = {
        "dataset": ["ofai", "muspeak", "esc-50"],
        "data_location": "/Users/quentin/Computer/DataSet/Music/speech_music_detection/",
        "target_seq_length": 270,
        "batch_size": 16
    }

    def training_data_processing(spec_file,
                                 annotation_file,
                                 mean,
                                 std,
                                 spec_file2=None,
                                 annotation_file2=None):
        """Load, augment and normalize one training example.

        The spectrogram stored in ``spec_file`` goes through
        ``pitch_time_deformation_spec``, ``random_filter_spec`` and
        ``random_loudness_spec``; its label is built from
        ``annotation_file`` with the stretching rate returned by the
        deformation.  When ``spec_file2`` is given, the second example is
        processed the same way and both are combined by
        ``block_mixing_spec``.

        Returns:
            tuple: ``(mels, label)`` where ``mels`` is the output of
            ``preprocessing.get_scaled_mel_bands`` followed by
            ``preprocessing.normalize(mels, mean, std)``.
        """

        def load_augmented(spec_path, annotation_path):
            # Same augmentation chain, in the same order, for either input.
            loaded = np.load(spec_path)
            deformed, rate = pitch_time_deformation_spec(loaded)
            deformed = random_loudness_spec(random_filter_spec(deformed))
            target = preprocessing.get_label(annotation_path,
                                             deformed.shape[1],
                                             stretching_rate=rate)
            return deformed, target

        spec, label = load_augmented(spec_file, annotation_file)
        if spec_file2 is not None:
            spec2, label2 = load_augmented(spec_file2, annotation_file2)
            spec, label = block_mixing_spec(spec, spec2, label, label2)

        scaled = preprocessing.get_scaled_mel_bands(spec)
        return preprocessing.normalize(scaled, mean, std), label

    def validation_data_processing(spec_file, annotation_file, mean, std):
        """Load and normalize one validation example (no augmentation).

        Returns:
            tuple: ``(mels, label)``; the label is built for
            ``mels.shape[1]`` frames with a stretching rate of 1.
        """
        scaled = preprocessing.get_scaled_mel_bands(np.load(spec_file))
        mels = preprocessing.normalize(scaled, mean, std)
        label = preprocessing.get_label(annotation_file,
                                        mels.shape[1],
                                        stretching_rate=1)
        return mels, label

    datasets_config = utils.load_json("datasets.json")
    dataset = DatasetLoader(cfg["dataset"], cfg["data_location"],
                            datasets_config)

    train_set = DataGenerator(
        dataset.get_train_set(),
        cfg["batch_size"],
        cfg["target_seq_length"],
        training_data_processing,
        dataset.get_training_mean(),
        dataset.get_training_std(),
        set_type="train",
    )
    val_set = DataGenerator(
        dataset.get_val_set(),
        cfg["batch_size"],
        cfg["target_seq_length"],
        validation_data_processing,
        dataset.get_training_mean(),
        dataset.get_training_std(),
        set_type="val",
    )

    return train_set, val_set
def resample_dataset(dataset_folder, dataset):
    """Resample one dataset and write its spectrograms, file lists and stats.

    Every ``*.WAV`` / ``*.wav`` / ``*.mp3`` file of the dataset's data
    folder is passed to ``run_sox``; each file it returns that belongs to a
    file list (``<audio type>_<set type>``) is turned into a spectrogram
    with ``savespec`` / ``savespec_and_get_bands`` and appended, with its
    length, to the file list of the same name in the new file-list folder.
    For training files a running per-band mean and variance are
    accumulated.  Finally ``mean.npy``, ``var.npy`` and ``info.json`` are
    written to the new file-list folder.

    Both output folders are named after the original ones with a
    ``_<SAMPLING_RATE>_<AUDIO_MAX_LENGTH>_<N_MELS>`` suffix.

    Args:
        dataset_folder: folder that the configured ``data_folder`` and
            ``filelists_folder`` entries are joined onto.
        dataset: key of the dataset in ``../datasets.json``.

    Raises:
        ValueError: if either output folder already exists.  Both folders
            are checked before anything is created, so a failure leaves no
            half-created output behind.
    """
    cfg = utils.load_json('../datasets.json')

    suffix = "_" + \
        str(audio_config.SAMPLING_RATE) + "_" + \
        str(audio_config.AUDIO_MAX_LENGTH) + "_" + \
        str(audio_config.N_MELS)
    data_path = os.path.join(dataset_folder, cfg[dataset]["data_folder"])
    new_data_path = data_path + suffix
    filelists_folder = os.path.join(dataset_folder,
                                    cfg[dataset]["filelists_folder"])
    new_filelists_folder = filelists_folder + suffix

    # Validate first, create afterwards: creating the data folder and then
    # failing on the file-list folder would block the next run.
    output_folders = (new_data_path, new_filelists_folder)
    for folder in output_folders:
        if os.path.isdir(folder):
            raise ValueError(folder + " already exists.")
    for folder in output_folders:
        os.makedirs(folder)
        print("Output folder created: " + folder)

    audio_files = glob.glob(data_path + "/*.WAV")
    audio_files += glob.glob(data_path + "/*.wav")
    audio_files += glob.glob(data_path + "/*.mp3")

    filelists = utils.read_filelists(filelists_folder)

    n, n_val, n_test, n_tot = 0, 0, 0, 0
    n_train_by_type = {"mixed": 0, "speech": 0, "music": 0, "noise": 0}
    mean = np.zeros(audio_config.N_MELS)
    var = np.zeros(audio_config.N_MELS)  # sum of squared deviations until the end

    for file in tqdm(audio_files):
        basename = os.path.basename(file)
        # Swap only the trailing extension; a plain str.replace would also
        # rewrite the same text elsewhere in the path.
        new_file = os.path.join(new_data_path,
                                os.path.splitext(basename)[0] + '.wav')
        new_files = run_sox(file, new_file, audio_config.SAMPLING_RATE,
                            audio_config.AUDIO_MAX_LENGTH)

        for chunk_file in new_files:
            for key in filelists.keys():
                if basename not in filelists[key]:
                    continue
                audio_type, set_type = key.split('_')
                # Reset per entry so an unknown set type can never reuse
                # the length of a previous file.
                length = 0

                if set_type == 'train':
                    length, bands = savespec_and_get_bands(chunk_file)
                    if length > 0:
                        n += length
                        n_tot += length
                        # Batched running mean / sum of squared deviations.
                        # Assumes bands is (N_MELS, length) -- TODO confirm.
                        delta1 = bands - mean[:, None]
                        mean += np.sum(delta1, axis=1) / n
                        delta2 = bands - mean[:, None]
                        var += np.sum(delta1 * delta2, axis=1)
                        if audio_type in n_train_by_type:
                            n_train_by_type[audio_type] += length
                elif set_type == 'val':
                    length = savespec(chunk_file)
                    if length > 0:
                        n_tot += length
                        n_val += length
                elif set_type == 'test':
                    length = savespec(chunk_file)
                    if length > 0:
                        n_tot += length
                        n_test += length

                if length > 0:
                    with open(os.path.join(new_filelists_folder, key),
                              'a') as f:
                        f.write(
                            os.path.basename(chunk_file).replace(".wav", '') +
                            '\t' + str(length) + '\n')

    # Unbiased variance; with fewer than two training frames the accumulator
    # is still zero and dividing by (n - 1) would give -0.0 or NaN.
    if n > 1:
        var /= (n - 1)

    infos = {
        "N_FRAME_TOT": n_tot,
        "N_FRAME_TRAIN": n,
        "N_FRAME_VAL": n_val,
        "N_FRAME_TEST": n_test,
        "N_FRAME_TRAIN_MIXED": n_train_by_type["mixed"],
        "N_FRAME_TRAIN_SPEECH": n_train_by_type["speech"],
        "N_FRAME_TRAIN_MUSIC": n_train_by_type["music"],
        "N_FRAME_TRAIN_NOISE": n_train_by_type["noise"],
        "N_MELS": audio_config.N_MELS,
        "SAMPLING_RATE": audio_config.SAMPLING_RATE,
        "FFT_WINDOW_SIZE": audio_config.FFT_WINDOW_SIZE,
        "HOP_LENGTH": audio_config.HOP_LENGTH,
        "F_MIN": audio_config.F_MIN,
        "F_MAX": audio_config.F_MAX,
        "AUDIO_MAX_LENGTH": audio_config.AUDIO_MAX_LENGTH
    }

    np.save(os.path.join(new_filelists_folder, "mean.npy"), mean)
    np.save(os.path.join(new_filelists_folder, "var.npy"), var)
    with open(os.path.join(new_filelists_folder, "info.json"), 'w') as f:
        json.dump(infos, f)