def create_rules_for_mixing_speech_with_noises(workspace, speech_dir,
                                               noise_dir, data_type,
                                               magnification):
    """Create a csv containing mixture information.

    Each row in the .csv file contains
    [speech_filename, noise_filename, noise_onset, noise_offset].

    Args:
      workspace: str, path of workspace.
      speech_dir: str, path of speech data.
      noise_dir: str, path of noise data.
      data_type: str, 'train' | 'test'.
      magnification: int, only used when data_type='train'; number of noises
        selected to mix with each speech. E.g., with magnification=3,
        4620 speech files will create 4620*3 mixtures. magnification should
        not be larger than the number of noise files.
    """
    time_start = time.time()
    random_state = np.random.RandomState(42)

    rules_dir = os.path.join(workspace, "mixing_rules")
    create_directory(rules_dir)
    rules_filename = os.path.join(rules_dir, "{}.csv".format(data_type))

    noise_paths = glob.glob(os.path.join(noise_dir, "*.wav"))
    speech_paths = glob.glob(os.path.join(speech_dir, "*.wav"))

    with open(rules_filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["speech_file_name", "noise_file_name",
                         "noise_begin", "noise_end"])

        for speech_path in speech_paths:
            (speech_audio, _) = read_audio(speech_path)

            # For training data, mix each speech with `magnification`
            # randomly picked noises. For test data, mix each speech with
            # all noises.
            if data_type == "train":
                # Sample from the full list each time; do not overwrite
                # noise_paths, or later speech files would sample from an
                # already reduced set.
                selected_noise_paths = random_state.choice(
                    noise_paths, size=magnification, replace=False)
            else:
                selected_noise_paths = noise_paths

            for noise_path in selected_noise_paths:
                (noise_audio, _) = read_audio(noise_path)

                if noise_audio.shape[0] <= speech_audio.shape[0]:
                    noise_begin = 0
                    noise_end = noise_audio.shape[0]
                else:
                    # If the noise is longer than the speech, randomly
                    # select a segment of the noise.
                    noise_begin = random_state.randint(
                        0, noise_audio.shape[0] - speech_audio.shape[0],
                        size=1)[0]
                    noise_end = noise_begin + speech_audio.shape[0]

                writer.writerow([os.path.basename(speech_path),
                                 os.path.basename(noise_path),
                                 noise_begin, noise_end])

    print()
    print("Mixing clean {} speech with noises time: {}".format(
        data_type, time.time() - time_start))
    print()
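# Usage sketch (the workspace layout and paths below are placeholders, not
# part of the original code): generate training rules with three noises per
# clean utterance.
#
#     create_rules_for_mixing_speech_with_noises(
#         workspace="workspace",
#         speech_dir="mini_data/train_speech",
#         noise_dir="mini_data/train_noise",
#         data_type="train",
#         magnification=3)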
def extract_features(workspace, speech_to_enhance_dir, snr):
    """Calculate complex spectrograms for the audios to be enhanced and
    write them to disk."""
    time_start = time.time()
    sample_rate = cfg.sample_rate

    audios_to_enhance_dir = os.path.join(workspace, speech_to_enhance_dir)

    for audio_path in glob.glob(os.path.join(audios_to_enhance_dir,
                                             "*.wav")):
        speech_audio = read_audio(audio_path, target_fs=sample_rate)[0]
        speech_audio_complex_spectrogram = calculate_spectrogram(
            speech_audio,
            mode="complex",
            window_size=cfg.n_window,
            n_overlap=cfg.n_overlap)

        # Save features.
        audio_name = os.path.basename(audio_path).split(".")[0]
        features = [speech_audio_complex_spectrogram, audio_name]
        features_filename = "{}.pickle".format(audio_name)
        features_dir = os.path.join(workspace, "data", "speech_to_enhance",
                                    "features", "spectrogram",
                                    "{}db".format(int(snr)))
        create_directory(features_dir)
        features_path = os.path.join(features_dir, features_filename)
        with open(features_path, "wb") as f:
            pickle.dump(features, f, protocol=pickle.HIGHEST_PROTOCOL)

    print()
    print("Extracting features time: %s" % (time.time() - time_start))
    print()
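# `calculate_spectrogram` is called above but not defined in this file. A
# minimal sketch of what it is assumed to do, based on the call sites
# (mode 'complex' | 'magnitude', frame size cfg.n_window, overlap
# cfg.n_overlap); illustrative only, not the original implementation:
import numpy as np
from scipy import signal


def calculate_spectrogram_sketch(audio, mode, window_size, n_overlap):
    """Return an STFT of `audio` with shape (n_frames, n_freq_bins)."""
    ham_win = np.hamming(window_size)
    _, _, spec = signal.stft(audio, window=ham_win, nperseg=window_size,
                             noverlap=n_overlap)
    spec = spec.T  # (n_frames, n_freq_bins)
    if mode == 'complex':
        return spec
    elif mode == 'magnitude':
        return np.abs(spec)
    raise ValueError("mode must be 'complex' or 'magnitude'")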
def calculate_mixture_features(workspace, speech_dir, noise_dir, data_type,
                               snr):
    """Calculate spectrograms for the mixed, speech and noise audio, then
    write the features to disk.

    Args:
      workspace: str, path of workspace.
      speech_dir: str, path of speech data.
      noise_dir: str, path of noise data.
      data_type: str, 'train' | 'test'.
      snr: float, signal-to-noise ratio of the mixture.
    """
    time_start = time.time()
    fs = cfg.sample_rate

    # Open mixture csv.
    rules_filename = os.path.join(workspace, "mixing_rules",
                                  "{}.csv".format(data_type))
    with open(rules_filename, "r", encoding="utf-8") as f:
        rules_reader = csv.reader(f)
        next(rules_reader, None)  # Skip the header.

        for i, rule in enumerate(rules_reader):
            [speech_filename, noise_filename, noise_begin, noise_end] = rule

            speech_path = os.path.join(speech_dir, speech_filename)
            speech_audio = read_audio(speech_path, target_fs=fs)[0]

            noise_path = os.path.join(noise_dir, noise_filename)
            noise_audio = read_audio(noise_path, target_fs=fs)[0]

            if noise_audio.shape[0] < speech_audio.shape[0]:
                # Repeat the noise to cover the entire clean speech sample.
                n_repeat = int(np.ceil(speech_audio.shape[0] /
                                       noise_audio.shape[0]))
                noise_audio = np.tile(noise_audio,
                                      n_repeat)[:speech_audio.shape[0]]
            else:
                # Truncate the noise to the segment chosen in the mixing
                # rules, which has the same length as the speech.
                noise_audio = noise_audio[int(noise_begin):int(noise_end)]

            # Scale speech to the given SNR.
            scaler = get_amplitude_scaling_factor(speech_audio, noise_audio,
                                                  snr=snr)
            speech_audio *= scaler

            # Get normalized mixture, speech and noise.
            mixed_audio, speech_audio, noise_audio, alpha = additive_mixing(
                speech_audio, noise_audio)

            rule_name = "{}.{}".format(speech_filename.split(".")[0],
                                       noise_filename.split(".")[0])

            # Save mixed audio.
            mixed_audio_filename = "{}.wav".format(rule_name)
            mixed_audio_dir = os.path.join(workspace, "mixed_audios",
                                           "spectrogram", data_type,
                                           "{}db".format(int(snr)))
            create_directory(mixed_audio_dir)
            write_audio(os.path.join(mixed_audio_dir, mixed_audio_filename),
                        mixed_audio, fs)

            # Extract spectrograms.
            mixed_audio_complex_spectrogram = calculate_spectrogram(
                mixed_audio, mode='complex', window_size=cfg.n_window,
                n_overlap=cfg.n_overlap)
            speech_spectrogram = calculate_spectrogram(
                speech_audio, mode='magnitude', window_size=cfg.n_window,
                n_overlap=cfg.n_overlap)
            noise_spectrogram = calculate_spectrogram(
                noise_audio, mode='magnitude', window_size=cfg.n_window,
                n_overlap=cfg.n_overlap)

            # Save features.
            features_filename = "{}.pickle".format(rule_name)
            features_dir = os.path.join(workspace, "features", "spectrogram",
                                        data_type, "{}db".format(int(snr)))
            create_directory(features_dir)
            features = [mixed_audio_complex_spectrogram, speech_spectrogram,
                        noise_spectrogram, alpha, rule_name]
            feature_path = os.path.join(features_dir, features_filename)
            with open(feature_path, "wb") as f_out:
                pickle.dump(features, f_out,
                            protocol=pickle.HIGHEST_PROTOCOL)

            if (i + 1) % 100 == 0:
                print("Iteration # {}".format(i + 1))

    print()
    print("Extracting features time: %s" % (time.time() - time_start))
    print()
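# `get_amplitude_scaling_factor` and `additive_mixing` are helpers that are
# not defined in this file. A plausible sketch of the math they are assumed
# to implement (illustrative only): the speech is scaled so that
# 20*log10(rms(speech)/rms(noise)) equals the target SNR, and the mixture is
# normalized to unit peak amplitude, with the factor `alpha` kept so the
# scaling can be undone later.


def rms_sketch(y):
    # Root-mean-square energy of a signal.
    return np.sqrt(np.mean(np.abs(y) ** 2))


def get_amplitude_scaling_factor_sketch(speech, noise, snr):
    # Factor that brings the speech/noise RMS ratio to the target SNR.
    original_ratio = rms_sketch(speech) / rms_sketch(noise)
    target_ratio = 10. ** (float(snr) / 20.)
    return target_ratio / original_ratio


def additive_mixing_sketch(speech, noise):
    # Mix, then normalize so the mixture's peak amplitude is 1.
    mixed = speech + noise
    alpha = 1. / np.max(np.abs(mixed))
    return mixed * alpha, speech * alpha, noise * alpha, alpha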
def test(args):
    if not os.path.exists('experiments'):
        os.makedirs('experiments')

    transfs = transforms.Compose([
        prepro.DB_Spec(sr=11025, n_fft=400, hop_t=0.010, win_t=0.025)
    ])

    # Load the trained model weights onto the CPU.
    if args.model_type == 'vae_g_l':
        model = vae_g_l.VAE(args)
    elif args.model_type == 'vae_l':
        model = vae_l.VAE(args)
    else:
        raise Exception('No valid model type provided (use --model_type)')
    model.load_state_dict(
        torch.load('experiments/' + args.model_name,
                   map_location=lambda storage, loc: storage))
    model.eval()

    if args.dataset == "VCTK":
        # Male example:
        # data, sr = prepro.read_audio('/work/invx030/datasets/VCTK-Corpus/wav48/p245/p245_002.wav')
        # Female example:
        data, sr = prepro.read_audio(
            '/work/invx030/datasets/VCTK-Corpus/wav48/p233/p233_003.wav')
    elif args.dataset == "LibriSpeech":
        # Male example:
        # data, sr = prepro.read_audio('/work/invx030/datasets/LibriSpeech/test-clean/1089/134686/1089-134686-0001.flac')
        # Female example:
        data, sr = prepro.read_audio(
            '/work/invx030/datasets/LibriSpeech/test-clean/4507/16021/4507-16021-0001.flac')
    else:
        raise Exception('No valid dataset provided (use --dataset)')

    hop_length = int(sr * 0.010)
    n_fft = 400
    win_length = int(sr * 0.025)

    data = transfs(data)
    # Normalize the dB spectrogram to [0, 1]; torch.min(data) is the most
    # negative dB value.
    data = data / torch.min(data)
    data = Variable(data)
    data = data.unsqueeze(0)
    data = data.transpose(1, 2)
    original = data

    if args.predictive:
        # Shift the input by one frame so the model predicts the next frame.
        data = F.pad(data, (0, 0, 1, 0), "constant", 1.)
        original = F.pad(original, (0, 0, 0, 1), "constant", 1.)

    outs = model(data)
    reconstruction = outs.decoder_out

    # Undo the normalization: scale both spectrograms back to dB.
    reconstruction = reconstruction.transpose(1, 2).squeeze(0)
    reconstruction = reconstruction.data.cpu().numpy() * -80.

    original = original.transpose(1, 2).squeeze(0).squeeze(0)
    original = original.data.cpu().numpy() * -80.

    # Plot the original and reconstructed dB spectrograms.
    librosa.display.specshow(original, sr=sr, hop_length=hop_length,
                             x_axis='time', y_axis='linear', cmap='viridis')
    plt.colorbar(format='%+2.0f dB')
    plt.title('Original DB spectrogram')
    pylab.savefig('experiments/original_spec.png')
    plt.clf()

    librosa.display.specshow(reconstruction, sr=sr, hop_length=hop_length,
                             x_axis='time', y_axis='linear', cmap='viridis')
    plt.colorbar(format='%+2.0f dB')
    plt.title('Reconstruction DB spectrogram')
    pylab.savefig('experiments/reconstruction_spec.png')

    # Invert both spectrograms back to waveforms and save them.
    inverse = to_audio(original, sr=sr, n_fft=n_fft, hop_t=0.010,
                       win_t=0.025)
    librosa.output.write_wav('experiments/original.wav', inverse, sr,
                             norm=True)
    inverse = to_audio(reconstruction, sr, n_fft=n_fft, hop_t=0.010,
                       win_t=0.025)
    librosa.output.write_wav('experiments/reconstruction.wav', inverse, sr,
                             norm=True)
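# `to_audio` inverts a dB spectrogram back to a waveform. A minimal sketch,
# assuming Griffin-Lim phase reconstruction with librosa >= 0.7
# (illustrative; the original helper may differ):
def to_audio_sketch(spec_db, sr, n_fft=400, hop_t=0.010, win_t=0.025):
    # dB -> linear amplitude, then iterative phase reconstruction.
    spec = librosa.db_to_amplitude(spec_db)
    return librosa.griffinlim(spec,
                              hop_length=int(sr * hop_t),
                              win_length=int(sr * win_t))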
def __init__(self, root, downsample=True, transform=None,
             target_transform=None, dev_mode=False, preprocessed=False,
             person_filter=None, filter_mode='exclude', max_len=201,
             split='train'):
    self.person_filter = person_filter
    self.filter_mode = filter_mode
    self.root = os.path.expanduser(root)
    self.downsample = downsample
    self.transform = transform
    self.target_transform = target_transform
    self.dev_mode = dev_mode
    self.num_samples = 0
    self.max_len = max_len
    self.split = split

    if preprocessed:
        self.root_dir = os.path.expanduser('librispeech_preprocessed/')
        if self.split == 'train':
            self.data_paths = os.listdir(os.path.join(self.root_dir,
                                                      'train'))
            self.root_dir = os.path.join(self.root_dir, 'train/')
        elif self.split == 'test':
            self.data_paths = os.listdir(os.path.join(self.root_dir,
                                                      'test'))
            self.root_dir = os.path.join(self.root_dir, 'test/')

        if person_filter:
            if self.filter_mode == 'include':
                self.data_paths = [
                    sample for sample in self.data_paths
                    if any(sample.startswith(pers + '-')
                           for pers in self.person_filter)
                ]
            elif self.filter_mode == 'exclude':
                self.data_paths = [
                    sample for sample in self.data_paths
                    if not any(sample.startswith(pers + '-')
                               for pers in self.person_filter)
                ]

        self.num_samples = len(self.data_paths)
    else:
        paths = make_manifest(self.root)
        os.mkdir('librispeech_preprocessed')
        os.mkdir('librispeech_preprocessed/train')
        os.mkdir('librispeech_preprocessed/test')

        # Read the train/test split definitions.
        with open("librispeech_splits/test_split.txt") as test_splits:
            test_data = [r[0] for r in csv.reader(test_splits)]
        with open("librispeech_splits/train_split.txt") as train_splits:
            train_data = [r[0] for r in csv.reader(train_splits)]

        # Build a speaker-id -> [sex] lookup from the LibriSpeech
        # SPEAKERS.TXT metadata (the first 12 lines are comments).
        with open(os.path.join(self.root, "SPEAKERS.TXT")) as csvfile:
            csvreader = csv.reader(csvfile, delimiter='|')
            for _ in range(12):
                next(csvreader)
            speaker_dict = {x[0].strip(): [x[1].strip()]
                            for x in csvreader}

        for z, path in enumerate(paths):
            # Paths look like
            # .../train-clean-100/<speaker>/<chapter>/<utterance>.flac
            _, _, rest = path.partition('train-clean-100/')
            pers, _, rest = rest.partition('/')
            _, _, rest = rest.partition('/')
            utterance_id, _, _ = rest.partition('.flac')

            sig = read_audio(path)
            if self.transform is not None:
                sig = self.transform(sig[0])
            else:
                sig = sig[0]

            try:
                data = (sig.tolist(), speaker_dict[pers] + [pers])
            except KeyError:
                # Skip speakers missing from the metadata.
                continue

            if utterance_id in train_data:
                with open("librispeech_preprocessed/train/{}.json".format(
                        utterance_id), 'w') as f:
                    ujson.dump(data, f)
            elif utterance_id in test_data:
                with open("librispeech_preprocessed/test/{}.json".format(
                        utterance_id), 'w') as f:
                    ujson.dump(data, f)

            if z % 100 == 0:
                print("{} iterations".format(z))

        self.train_data_paths = os.listdir(
            os.path.expanduser('librispeech_preprocessed/train/'))
        self.test_data_paths = os.listdir(
            os.path.expanduser('librispeech_preprocessed/test/'))
        self.num_samples = len(self.train_data_paths)
        print("{} samples processed".format(self.num_samples))
def __init__(self, root, downsample=True, transform=None,
             target_transform=None, dev_mode=False, preprocessed=False,
             person_filter=None, filter_mode='exclude', max_len=201):
    self.person_filter = person_filter
    self.filter_mode = filter_mode
    self.root = os.path.expanduser(root)
    self.downsample = downsample
    self.transform = transform
    self.target_transform = target_transform
    self.dev_mode = dev_mode
    self.num_samples = 0
    self.max_len = max_len

    if preprocessed:
        self.root_dir = os.path.expanduser('vctk_preprocessed/')
        self.data_paths = os.listdir(self.root_dir)

        if person_filter:
            if self.filter_mode == 'include':
                self.data_paths = [
                    sample for sample in self.data_paths
                    if any(pers in sample for pers in self.person_filter)
                ]
            elif self.filter_mode == 'exclude':
                self.data_paths = [
                    sample for sample in self.data_paths
                    if not any(pers in sample
                               for pers in self.person_filter)
                ]

        self.num_samples = len(self.data_paths)
    else:
        paths = make_manifest(self.root)
        os.mkdir('vctk_preprocessed/')

        # Build a speaker-id -> [age, gender, region] lookup from the VCTK
        # speaker-info.txt metadata (skipping the header row).
        with open(os.path.join(self.root, "speaker-info.txt")) as csvfile:
            csvreader = csv.reader(csvfile, delimiter=' ')
            next(csvreader)
            speaker_dict = {x[0]: [x[4], x[2], x[8]] for x in csvreader}

        for z, path in enumerate(paths):
            # Paths look like .../wav48/<speaker>/<speaker>_<utt>.wav
            _, _, after_keyword = path.partition('wav48/')
            pers = after_keyword[1:4]

            sig = read_audio(path)
            if self.transform is not None:
                sig = self.transform(sig[0])
            else:
                sig = sig[0]

            try:
                self.data = (sig.tolist(), speaker_dict[pers] + [pers])
            except KeyError:
                # Skip speakers missing from the metadata.
                continue

            with open("vctk_preprocessed/{}.json".format(
                    after_keyword[5:13]), 'w') as f:
                ujson.dump(self.data, f)

            if z % 100 == 0:
                print("{} iterations".format(z))

        self.data_paths = os.listdir(
            os.path.expanduser('vctk_preprocessed/'))
        self.num_samples = len(self.data_paths)
        print("{} samples processed".format(self.num_samples))
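# Usage sketch (the enclosing class name, speaker IDs and paths are
# assumptions, not shown in this file): hold two speakers out of training.
#
#     train_set = VCTK(root='~/datasets/VCTK-Corpus',
#                      transform=prepro.DB_Spec(sr=48000, n_fft=400,
#                                               hop_t=0.010, win_t=0.025),
#                      preprocessed=True,
#                      person_filter=['p225', 'p226'],
#                      filter_mode='exclude')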