def main(args):
    # MODEL
    num_features = [args.features*i for i in range(1, args.levels+1)] if args.feature_growth == "add" else \
                   [args.features*2**i for i in range(0, args.levels)]
    target_outputs = int(args.output_size * args.sr)
    model = Waveunet(args.channels, num_features, args.channels, args.instruments,
                     kernel_size=args.kernel_size, target_output_size=target_outputs,
                     depth=args.depth, strides=args.strides, conv_type=args.conv_type,
                     res=args.res, separate=args.separate)

    if args.cuda:
        model = utils.DataParallel(model)
        print("move model to gpu")
        model.cuda()

    print("Loading model from checkpoint " + str(args.load_model))
    state = utils.load_model(model, None, args.load_model)
    print('Step', state['step'])

    preds = predict_song(args, args.input, model)

    output_folder = os.path.dirname(args.input) if args.output is None else args.output
    for inst in preds.keys():
        utils.write_wav(os.path.join(output_folder,
                                     os.path.basename(args.input) + "_" + inst + ".wav"),
                        preds[inst], args.sr)
def write_wav(self, file_path, track_order=None, bit_depth=32):
    """Writes impulse responses to a WAV file

    Args:
        file_path: Path to output WAV file
        track_order: List of speaker-side names for the order of impulse responses in the output file
        bit_depth: Number of bits per sample. 16, 24 or 32

    Returns:
        None
    """
    # Duplicate speaker names as left and right side impulse response names
    if track_order is None:
        track_order = HEXADECAGONAL_TRACK_ORDER

    # Add all impulse responses to a list and save channel names
    irs = []
    ir_order = []
    for speaker, pair in self.irs.items():
        for side, ir in pair.items():
            irs.append(ir.data)
            ir_order.append(f'{speaker}-{side}')

    # Add silent tracks for channels without an impulse response
    for ch in track_order:
        if ch not in ir_order:
            irs.append(np.zeros(len(irs[0])))
            ir_order.append(ch)
    irs = np.vstack(irs)

    # Sort to output order
    irs = irs[[ir_order.index(ch) for ch in track_order], :]

    # Write to file
    write_wav(file_path, self.fs, irs, bit_depth=bit_depth)
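# The method above delegates to a module-level write_wav(file_path, fs, data, bit_depth)
# helper that is not shown in this snippet. A minimal sketch of such a helper, assuming
# the `soundfile` package and `data` shaped as (tracks, samples); treating bit_depth=32
# as 32-bit float is an assumption, not confirmed by the source:
import numpy as np
import soundfile as sf

def write_wav(file_path, fs, data, bit_depth=32):
    """Sketch: write a (tracks, samples) array to a multichannel WAV file."""
    subtypes = {16: 'PCM_16', 24: 'PCM_24', 32: 'FLOAT'}
    if bit_depth not in subtypes:
        raise ValueError(f'Unsupported bit depth {bit_depth}')
    # soundfile expects (frames, channels), so transpose the track-major layout
    sf.write(file_path, np.transpose(data), fs, subtype=subtypes[bit_depth])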
def record_target(file_path, length, fs, channels=2, append=False):
    """Records audio and writes it to a file.

    Args:
        file_path: Path to output file
        length: Audio recording length in samples
        fs: Sampling rate
        channels: Number of channels in the recording
        append: Add track(s) to an existing file? Silence will be added to the end of
                each track to make all tracks equal in length

    Returns:
        None
    """
    recording = sd.rec(length, samplerate=fs, channels=channels, blocking=True)
    recording = np.transpose(recording)
    max_gain = 20 * np.log10(np.max(np.abs(recording)))
    if append and os.path.isfile(file_path):
        # Adding to existing file, read the file
        _fs, data = read_wav(file_path, expand=True)
        # Zero pad the shorter one to the length of the longer
        if recording.shape[1] > data.shape[1]:
            n = recording.shape[1] - data.shape[1]
            data = np.pad(data, [(0, 0), (0, n)])
        elif data.shape[1] > recording.shape[1]:
            # Pad the recording (the original mistakenly padded `data` here)
            recording = np.pad(recording, [(0, 0), (0, data.shape[1] - recording.shape[1])])
        # Add recording to the end of the existing data
        recording = np.vstack([data, recording])
    write_wav(file_path, fs, recording)
    print(f'Headroom: {-1.0*max_gain:.1f} dB')
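# A hypothetical invocation of record_target (file name and durations are illustrative
# only; `sd` is the sounddevice module imported by the host script):
fs = 48000
# Record 5 seconds of stereo audio at 48 kHz and append it to an existing take
record_target('data/take.wav', length=5 * fs, fs=fs, channels=2, append=True)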
def stop_enroll_record(self):
    self.stop_record()
    print(self.recordData[:300])
    signal = np.array(self.recordData, dtype=NPDtype)
    self.enrollWav = (Main.FS, signal)

    # TODO To Delete
    write_wav('enroll.wav', *self.enrollWav)
def reco_do_predict(self, fs, signal):
    label = self.backend.predict(fs, signal)
    if not label:
        label = "Nobody"
    print(label)
    self.recoUsername.setText(label)
    self.Alading.setPixmap(QPixmap(u"image/a_result.png"))
    self.recoUserImage.setPixmap(self.get_avatar(label))

    # TODO To Delete
    write_wav('reco.wav', fs, signal)
def speaker_diarization(fs, signal, mt_size=2.0, mt_step=0.2, st_win=0.05):
    """ unsupervised speaker count """
    st_step = st_win
    [mid_term_features, short_term_features] = mt_feature_extraction(signal, fs,
                                                                     mt_size * fs,
                                                                     mt_step * fs,
                                                                     round(fs * st_win))
    [mid_term_features_norm, _, _] = normalize_features([mid_term_features.T])
    mid_term_features_norm = mid_term_features_norm[0].T
    num_of_windows = mid_term_features.shape[1]

    # VAD:
    reserved_time = 1
    segment_limits = vad(short_term_features, st_step, smooth_window=0.5, weight=0.3)
    i_vad = ivad(segment_limits, mt_step, reserved_time, num_of_windows)
    mid_term_features_norm = mid_term_features_norm[:, i_vad]

    # Remove outliers:
    distances_all = numpy.sum(distance.squareform(distance.pdist(mid_term_features_norm.T)),
                              axis=0)
    m_distances_all = numpy.mean(distances_all)
    i_non_outliers = numpy.nonzero(distances_all < 1.2 * m_distances_all)[0]
    mid_term_features_norm = mid_term_features_norm[:, i_non_outliers]

    i_features_select = [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
                         41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]
    mid_term_features_norm = mid_term_features_norm[i_features_select, :]

    num_range = range(2, 10)  # candidate speaker counts in [2, 10)
    [n_speakers_final, imax, num_speaker_cls] = \
        kmeans_silhouette(mid_term_features_norm, num_range)

    cls = numpy.zeros((num_of_windows,)) - 1
    valid_pos = i_vad[i_non_outliers]
    for i in range(num_of_windows):
        if i in valid_pos:
            j = numpy.argwhere(valid_pos == i)[0][0]
            cls[i] = num_speaker_cls[imax][j]

    # Median filtering:
    cls = scipy.signal.medfilt(cls, 11)

    start = 0
    end = 0
    for i in range(1, len(cls)):
        if cls[i] == cls[i - 1]:
            end = i
        else:
            write_wav(os.path.join(os.path.pardir, "result", "result_wav",
                                   str(cls[i - 1]) + "-" + str(start * mt_step) + "-" +
                                   str(end * mt_step) + ".wav"),
                      fs, signal[int(start * mt_step * fs):int(end * mt_step * fs)])
            start = i
    return n_speakers_final, cls
def run(args):
    mix_input = WaveReader(args.input, sample_rate=args.fs)
    computer = NnetComputer(args.checkpoint, args.gpu)
    for key, mix_samps in mix_input:
        logger.info("Compute on utterance {}...".format(key))
        spks = computer.compute(mix_samps)
        norm = np.linalg.norm(mix_samps, np.inf)
        for idx, samps in enumerate(spks):
            samps = samps[:mix_samps.size]
            # norm: rescale each separated source to the mixture's peak level
            samps = samps * norm / np.max(np.abs(samps))
            write_wav(os.path.join(args.dump_dir, "spk{}/{}.wav".format(idx + 1, key)),
                      samps, fs=args.fs)
    logger.info("Compute over {:d} utterances".format(len(mix_input)))
def main():
    # Open HRIR
    estimator = ImpulseResponseEstimator.from_pickle(TEST_SIGNAL)
    hrir = HRIR(estimator)
    hrir.open_recording(os.path.join(DIR_PATH, 'FL,FR.wav'), speakers=['FL', 'FR'])
    hrir.crop_heads()
    hrir.crop_tails()

    # Create test signal sequence
    speakers = ['FL', 'FR']
    seq_data = estimator.sweep_sequence(speakers, 'stereo')
    fig, ax = plot_stereo_track(seq_data, estimator.fs)
    fig.suptitle('Sweep sequence')

    left = np.vstack([
        hrir.irs['FL']['left'].convolve(seq_data[0]),
        hrir.irs['FL']['right'].convolve(seq_data[0])
    ])
    right = np.vstack([
        hrir.irs['FR']['left'].convolve(seq_data[1]),
        hrir.irs['FR']['right'].convolve(seq_data[1])
    ])
    virtualized = left + right
    fig, ax = plot_stereo_track(virtualized, estimator.fs)
    fig.suptitle('Sweep sequence convolved with HRIR')
    plt.show()

    # Virtualize sine sweep sequence with HRIR
    # virtualized = []
    # for i, speaker in enumerate(speakers):
    #     track = seq_data[i, :]
    #     virtualized.append(np.sum([
    #         hrir.irs[speaker]['left'].convolve(track),
    #         hrir.irs[speaker]['right'].convolve(track)
    #     ], axis=0))
    virtualized = np.vstack(virtualized)  # no-op for the 2-D array above; only needed with the commented loop

    # Normalize to 0 dB
    virtualized /= np.max(np.abs(virtualized))

    # Write virtualized sequence to disk
    file_path = os.path.join(
        DIR_PATH,
        f'headphones-sweep-seq-{",".join(speakers)}-stereo-{estimator.file_name(32)}.wav')
    write_wav(file_path, estimator.fs, virtualized, bit_depth=32)
def run(args):
    voice_spliter = VoiceSpliter(args.voiced_threshold, args.tolerated_size)
    wave_reader = WaveReader(args.wav_scp)
    L, S = args.frame_length, args.frame_shift
    samp_rate = args.sample_rate
    for key, wave in wave_reader:
        voice_spliter.reset()
        num_frames = (wave.size - L) // S + 1
        for idx in range(num_frames):
            voice_spliter.run(wave[idx * S:idx * S + L])
        segments = voice_spliter.segments
        if len(segments) % 2:
            segments.append(num_frames)
        logger.info("{} segments: {}".format(key, segments))
        for idx in range(len(segments) // 2):
            beg, end = segments[idx * 2:idx * 2 + 2]
            if (end - beg) * S / samp_rate < args.min_dur:
                continue
            voiced_segment = wave[beg * S:end * S]
            write_wav(
                os.path.join(args.dump_dir, "{}-{:d}-{:d}.wav".format(key, beg, end)),
                voiced_segment, samp_rate)
def getMUSDBHQ(database_path):
    subsets = list()

    for subset in ["train", "test"]:
        print("Loading " + subset + " set...")
        tracks = glob.glob(os.path.join(database_path, subset, "*"))
        samples = list()

        # Go through tracks
        for track_folder in sorted(tracks):
            example = dict()
            for stem in ["mix", "bass", "drums", "other", "vocals"]:
                filename = stem if stem != "mix" else "mixture"
                audio_path = os.path.join(track_folder, filename + ".wav")
                example[stem] = audio_path

            # Add other instruments to form accompaniment
            acc_path = os.path.join(track_folder, "accompaniment.wav")
            if not os.path.exists(acc_path):
                print("Writing accompaniment to " + track_folder)
                stem_audio = []
                for stem in ["bass", "drums", "other"]:
                    audio, sr = load(example[stem], sr=None, mono=False)
                    stem_audio.append(audio)
                acc_audio = np.clip(sum(stem_audio), -1.0, 1.0)
                write_wav(acc_path, acc_audio, sr)
            example["accompaniment"] = acc_path
            samples.append(example)

        subsets.append(samples)

    return subsets
def evaluate_for_enhanced(args, dataset, model):
    dB_list_pesq = dict()
    dB_list_name_pesq = dict()
    dB_list_stoi = dict()
    dB_list_name_stoi = dict()
    if args.outside_test:
        for i in ['-7.5', '-2.5', '2.5', '7.5']:
            dB_list_pesq[i] = list()
            dB_list_name_pesq[i] = list()
            dB_list_stoi[i] = list()
            dB_list_name_stoi[i] = list()
        test_noise_file = "outside_test/noise"
    else:
        for i in ['-10', '-5', '0', '5', '10']:
            dB_list_pesq[i] = list()
            dB_list_name_pesq[i] = list()
            dB_list_stoi[i] = list()
            dB_list_name_stoi[i] = list()
        test_noise_file = "test/noise"

    noise_dir = os.path.join(args.dataset_dir, test_noise_file)
    noise_file = os.listdir(noise_dir)
    dB_noise_pesq = {}
    for i in noise_file:
        dB_noise_pesq[os.path.splitext(i)[0]] = list()

    model.eval()
    with torch.no_grad():
        with tqdm(total=len(dataset)) as pbar:
            for example in dataset:
                # Load source references in their original sr and channel number
                target_sources = utils.load(example['target'], sr=16000, mono=True)[0].flatten()

                # Predict using mixture
                pred_sources = predict_song(args, example["input"], model).flatten()

                # Write wav
                file_name = os.path.basename(example['input'])
                if args.write_to_wav:
                    utils.write_wav(os.path.join(args.output, 'enhance_' + file_name),
                                    pred_sources.T, args.sr)

                fname, ext = os.path.splitext(file_name)
                text = fname.split("_", 4)

                # Evaluate PESQ
                enhance_pesq = pesq(target_sources, pred_sources, 16000)
                # Evaluate STOI
                enhance_stoi = stoi(target_sources, pred_sources, 16000, extended=False)

                filename = os.path.basename(example['input'])
                noise_name = filename.split("_")[0]
                dB_noise_pesq[noise_name].append([enhance_pesq])
                dB_list_pesq[text[4]].append(enhance_pesq)
                dB_list_name_pesq[text[4]].append([enhance_pesq, filename])
                dB_list_stoi[text[4]].append(enhance_stoi)
                dB_list_name_stoi[text[4]].append([enhance_stoi, filename])
                pbar.update(1)

    dB_list_name_pesq['avg'] = 0
    dB_list_name_stoi['avg'] = 0
    num = len(dB_list_pesq)
    for key, value in dB_list_pesq.items():
        avg_pesq = np.mean(value, 0)
        dB_list_name_pesq[key].append([avg_pesq, "avg_pesq"])
        dB_list_name_pesq['avg'] += avg_pesq / num
    for key, value in dB_list_stoi.items():
        avg_stoi = np.mean(value, 0)
        dB_list_name_stoi[key].append([avg_stoi, "avg_stoi"])
        dB_list_name_stoi['avg'] += avg_stoi / num

    noise_avg = list()
    for key, value in dB_noise_pesq.items():
        avg_pesq = np.mean(value, 0)
        noise_avg.append([key, avg_pesq])
    print(noise_avg)

    pesq_avg = dB_list_name_pesq['avg']
    stoi_avg = dB_list_name_stoi['avg']
    print(f'pesq_avg:{pesq_avg} stoi_avg:{stoi_avg}')
    return {
        'pesq': dB_list_name_pesq,
        'stoi': dB_list_name_stoi,
        'noise': noise_avg
    }
def train_fn(args):
    device = torch.device("cuda" if args.use_cuda else "cpu")
    upsample_factor = int(args.frame_shift_ms / 1000 * args.sample_rate)

    model = create_model(args)
    model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)
    for state in optimizer.state.values():
        for key, value in state.items():
            if torch.is_tensor(value):
                state[key] = value.to(device)

    if args.resume is not None:
        print("Resume checkpoint from: {}:".format(args.resume))
        checkpoint = torch.load(args.resume,
                                map_location=lambda storage, loc: storage)
        if torch.cuda.device_count() > 1:
            model.module.load_state_dict(checkpoint['model'])
        else:
            model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint["optimizer"])
        global_step = checkpoint['steps']
    else:
        global_step = 0

    print("receptive field: {0} ({1:.2f}ms)".format(
        model.receptive_field,
        model.receptive_field / args.sample_rate * 1000))

    if args.feature_type == "mcc":
        # mfccs have already been scaled for Ryan
        # scaler = StandardScaler()
        # scaler.mean_ = np.load(os.path.join(args.data_dir, 'mean.npy'))
        # scaler.scale_ = np.load(os.path.join(args.data_dir, 'scale.npy'))
        # feat_transform = transforms.Compose([lambda x: scaler.transform(x)])
        feat_transform = None
    else:
        feat_transform = None

    dataset = FilterbankDataset(data_dir=args.data_dir,
                                receptive_field=model.receptive_field,
                                sample_size=args.sample_size,
                                upsample_factor=upsample_factor,
                                quantization_channels=args.quantization_channels,
                                use_local_condition=args.use_local_condition,
                                noise_injecting=args.noise_injecting,
                                feat_transform=feat_transform)

    dataloader = DataLoader(dataset, batch_size=args.batch_size,
                            shuffle=True, num_workers=args.num_workers,
                            pin_memory=True)

    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)

    criterion = nn.CrossEntropyLoss()

    ema = ExponentialMovingAverage(args.ema_decay)
    for name, param in model.named_parameters():
        if param.requires_grad:
            ema.register(name, param.data)

    while global_step < args.training_steps:
        for i, data in enumerate(dataloader, 0):
            audio, target, local_condition = data
            target = target.squeeze(-1)
            local_condition = local_condition.transpose(1, 2)
            audio, target, h = audio.to(device), target.to(device), local_condition.to(device)

            optimizer.zero_grad()
            output = model(audio[:, :-1, :], h[:, :, 1:])
            loss = criterion(output, target)
            print('step [%3d]: loss: %.3f' % (global_step, loss.item()))
            loss.backward()
            optimizer.step()

            # Update moving average
            if ema is not None:
                apply_moving_average(model, ema)

            global_step += 1

            if global_step % args.checkpoint_interval == 0:
                save_checkpoint(device, args, model, optimizer, global_step,
                                args.checkpoint_dir, ema)
                out = output[1, :, :]
                samples = out.argmax(0)
                waveform = mu_law_decode(
                    np.asarray(samples[model.receptive_field:]),
                    args.quantization_channels)
                write_wav(waveform, args.sample_rate,
                          os.path.join(args.checkpoint_dir,
                                       "train_eval_{}.wav".format(global_step)))
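# train_fn calls a mu_law_decode helper that is not defined in this snippet. The
# standard mu-law expansion used by WaveNet-style vocoders looks roughly like this
# (a sketch, assuming integer class indices in [0, quantization_channels - 1]):
import numpy as np

def mu_law_decode(samples, quantization_channels=256):
    """Sketch: invert mu-law companding, mapping class indices to a waveform in [-1, 1]."""
    mu = quantization_channels - 1
    # Map class indices to [-1, 1]
    signal = 2.0 * np.asarray(samples, dtype=np.float64) / mu - 1.0
    # Invert the mu-law curve: x = sign(y) * ((1 + mu)**|y| - 1) / mu
    return np.sign(signal) * ((1.0 + mu) ** np.abs(signal) - 1.0) / mu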
def getMUSDB(database_path):
    mus = musdb.DB(root=database_path, is_wav=False)

    subsets = list()

    for subset in ["train", "test"]:
        tracks = mus.load_mus_tracks(subset)
        samples = list()

        # Go through tracks
        for track in sorted(tracks):
            # Skip track if mixture is already written, assuming this track is done already
            track_path = track.path[:-4]
            mix_path = track_path + "_mix.wav"
            acc_path = track_path + "_accompaniment.wav"
            if os.path.exists(mix_path):
                print("WARNING: Skipping track " + mix_path + " since it exists already")

                # Add paths and then skip
                paths = {"mix": mix_path, "accompaniment": acc_path}
                paths.update({key: track_path + "_" + key + ".wav"
                              for key in ["bass", "drums", "other", "vocals"]})

                samples.append(paths)
                continue

            rate = track.rate

            # Go through each instrument
            paths = dict()
            stem_audio = dict()
            for stem in ["bass", "drums", "other", "vocals"]:
                path = track_path + "_" + stem + ".wav"
                audio = track.targets[stem].audio
                write_wav(path, audio, rate)
                stem_audio[stem] = audio
                paths[stem] = path

            # Add other instruments to form accompaniment
            acc_audio = np.clip(
                sum([stem_audio[key] for key in list(stem_audio.keys()) if key != "vocals"]),
                -1.0, 1.0)
            write_wav(acc_path, acc_audio, rate)
            paths["accompaniment"] = acc_path

            # Create mixture
            mix_audio = track.audio
            write_wav(mix_path, mix_audio, rate)
            paths["mix"] = mix_path

            # Check if acc + vocals = mix
            diff_signal = np.abs(mix_audio - acc_audio - stem_audio["vocals"])
            print("Maximum absolute deviation from source additivity constraint: " +
                  str(np.max(diff_signal)))
            print("Mean absolute deviation from source additivity constraint: " +
                  str(np.mean(diff_signal)))

            samples.append(paths)

        subsets.append(samples)

    print("DONE preparing dataset!")
    return subsets
def reverberate_and_mix(out_folder, sources_folder, rir_folder,
                        mix_info, scale_rirs=10.0, part=0, nparts=8,
                        num_mics=1, chat=True, output_align='causal'):
    """Reverberate and mix sources.

    Args:
      out_folder: Output folder to write reverberated sources and mixtures.
      sources_folder: Sources folder to read sources from.
      rir_folder: RIR folder to read rirs from.
      mix_info: A dictionary: mix_file_name -> (sources, rirs) where sources and
        rirs are paired lists of relative paths to source and rir signal wav
        files used in the reverberate and mix operation to be performed.
      scale_rirs: A value to scale the RIR signals (float).
      part: Integer value indicating which part of parallel jobs to run (int).
      nparts: Number of parts considered for parallel runs (int).
      num_mics: Number of mics to use at the output (int).
      chat: If True, display more messages (bool).
      output_align: Output signal alignment type.
        'causal': Uses causal RIR filtering with no additional shift.
        'align_sources': Find the average peak index of the RIR(s) corresponding
          to each source and advance each source by that index. This has the
          effect of aligning each source with its non-reverberated version.
    Returns:
      None, but writes reverberated sources and mixtures into files.
    """
    list_mix = sorted(mix_info.keys())
    list_len = len(list_mix)
    partsize = list_len // nparts
    assert part < nparts
    start = part * partsize
    end = list_len if part == nparts - 1 else (part + 1) * partsize
    if start == end:
        raise ValueError('Not enough mixtures to generate. Part {} of {} to '
                         'generate a total of {} mixtures.'.format(part, nparts, list_len))
    print('Reverberating and mixing from {} to {} out of {}.'.format(start, end, list_len))
    for mix in list_mix[start:end]:
        sources, rirs = mix_info[mix]
        mix_to_data = []
        rir_peak_delays = []
        max_src_len = -1
        if chat:
            print('--\n{} ='.format(mix))
        for source, rir in zip(sources, rirs):
            source_path = os.path.join(sources_folder, source)
            src_data, samplerate_src = read_wav(source_path, always_2d=True)
            rir_path = os.path.join(rir_folder, rir)
            rir_data, samplerate_rir = read_wav(rir_path, always_2d=True)
            assert samplerate_src == samplerate_rir
            # Pick channel 0 of src_data.
            src_data = src_data[:, 0]
            # Pick num_mics channels of rirs and scale them.
            if len(rir_data.shape) == 2:
                rir_mics = np.shape(rir_data)[1]
                if rir_mics < num_mics:
                    raise ValueError(f'The rir {rir_path} has only {rir_mics} channel '
                                     f'data where the specified num_mics={num_mics}')
                rir_data = rir_data[:, :num_mics]
            else:
                if num_mics > 1:
                    raise ValueError(f'The rir {rir_path} has only single channel data '
                                     f'but specified num_mics={num_mics}')
                rir_data = np.reshape(rir_data, [-1, 1])
            rir_data = scale_rirs * rir_data
            rir_len = len(rir_data[:, 0])
            src_len = len(src_data)
            rir_max = np.max(np.abs(rir_data))
            rir_peaks = np.argmax(np.abs(rir_data), axis=0)
            src_max = np.max(np.abs(src_data))
            max_src_len = np.maximum(src_len, max_src_len)
            if chat:
                print('+ {} [{}, {:1.2f}] * {} [{}, {:1.2f}, {}]'.format(
                    source, src_len, src_max, rir, rir_len, rir_max, rir_peaks))
            mix_to_data.append([src_data, rir_data, source, rir, rir_peaks])
        mix_rev_sources = []
        rir_paths_used = []
        for data in mix_to_data:
            src_data, rir_data, source_relpath, rir_relpath, rir_peaks = data
            rir_paths_used.append(rir_relpath)
            src_len = len(src_data)
            if src_len < max_src_len:
                print('WARNING: original source data has {} samples '
                      'for source file {}, zero padding '
                      'to size {}.'.format(src_len, source_relpath, max_src_len))
                src_data = np.concatenate((src_data,
                                           np.zeros(max_src_len - src_len)), axis=0)
            if output_align == 'align_sources':
                output_advance = np.round(np.mean(np.asarray(rir_peaks))).astype(np.int32)
            elif output_align == 'causal':
                output_advance = 0
            else:
                raise ValueError(f'Unknown output_align={output_align}')
            if chat and output_advance != 0:
                print(f'Source {source_relpath} advanced by {output_advance} samples.')
            rev_src_data = multimic_convolve(src_data, rir_data,
                                             output_advance=output_advance)
            # Write reverberated source data.
            rev_src_path = os.path.join(out_folder, source_relpath)
            os.makedirs(os.path.dirname(rev_src_path), exist_ok=True)
            write_wav(rev_src_path, rev_src_data, samplerate_src)
            mix_rev_sources.append(rev_src_data)
        mixed_rev_data = np.sum(np.stack(mix_rev_sources, axis=0), axis=0)
        mix_wav_path = os.path.join(out_folder, mix)
        # NOTE: rstrip strips a character set, not a suffix; this assumes file names
        # do not end in the characters [.wav] before the extension.
        mix_wav_base = mix_wav_path.rstrip('.wav')
        write_wav(mix_wav_path, mixed_rev_data, samplerate_src)
        in_wav_path = os.path.join(sources_folder, mix)
        in_wav_base = in_wav_path.rstrip('.wav')
        if os.path.exists(in_wav_base + '.jams'):
            shutil.copyfile(in_wav_base + '.jams', mix_wav_base + '.jams')
        if os.path.exists(in_wav_base + '.txt'):
            with open(in_wav_base + '.txt', 'r') as f:
                lines = f.readlines()
            with open(mix_wav_base + '.txt', 'w') as f:
                f.write(''.join(lines))
                f.write('\nroom impulse responses used:\n{}'.format(
                    '\n'.join(rir_paths_used)))
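# multimic_convolve is external to this snippet. A minimal sketch of per-mic FFT
# convolution, assuming scipy and interpreting output_advance as the number of
# leading samples to drop (compensating RIR delay) before truncating to the source
# length; the exact semantics in the original code base may differ:
import numpy as np
from scipy.signal import fftconvolve

def multimic_convolve(src_data, rir_data, output_advance=0):
    """Sketch: convolve a mono source with a (samples, mics) RIR array."""
    num_mics = rir_data.shape[1]
    out = np.zeros((len(src_data), num_mics))
    for m in range(num_mics):
        full = fftconvolve(src_data, rir_data[:, m])  # length: len(src) + len(rir) - 1
        shifted = full[output_advance:output_advance + len(src_data)]
        out[:len(shifted), m] = shifted
    return out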
def run(args):
    target_reader = WaveReader(args.target_spk)
    others_reader = [WaveReader(spk_scp) for spk_scp in args.disturb_spks]
    bg_noise_scp, fg_noise_scp = args.bg_noise, args.fg_noise
    bg_noise_reader = WaveReader(bg_noise_scp) if bg_noise_scp else None
    fg_noise_reader = WaveReader(fg_noise_scp) if fg_noise_scp else None
    # For each iteration
    for it in tqdm(range(args.iters)):
        # For each target utterance
        for key, target in target_reader:
            noise = np.zeros_like(target)
            # Add noise if a reader exists
            for index, noise_reader in enumerate([bg_noise_reader, fg_noise_reader]):
                if noise_reader:
                    # Sample noise (randint is inclusive: [a, b])
                    noise_index = random.randint(0, len(noise_reader) - 1)
                    bg_or_fg_noise = noise_reader[noise_index]
                    # Sample SNR
                    snr = random.uniform(args.min_snr, args.max_snr)
                    # Add noise
                    noise_seg = add_noise(target, bg_or_fg_noise, snr,
                                          period=(index == 0))
                    # Accumulate noise
                    noise = noise + noise_seg
            if len(others_reader):
                # Sample interfering speakers
                num_samp_spk = random.randint(args.min_spk, args.max_spk)
                samp_reader = random.sample(others_reader, num_samp_spk)
                # For each interference speaker
                for spk_noise_reader in samp_reader:
                    # Sample an interference utterance
                    utt_index = random.randint(0, len(spk_noise_reader) - 1)
                    spk_noise = spk_noise_reader[utt_index]
                    # Sample SDR
                    sdr = random.uniform(args.min_sdr, args.max_sdr)
                    # Add interference
                    noise_seg = add_noise(target, spk_noise, sdr)
                    # Accumulate noise
                    noise = noise + noise_seg
            # Sample the output norm
            sample_norm = random.uniform(0.6, 0.9)
            coef = sample_norm / np.maximum(np.linalg.norm(noise, np.inf),
                                            np.linalg.norm(target, np.inf))
            write_wav(os.path.join(args.target_dump_dir, '{}_{:d}.wav'.format(key, it)),
                      target * coef)
            write_wav(os.path.join(args.noise_dump_dir, '{}_{:d}.wav'.format(key, it)),
                      noise * coef)
            mixture = (target + noise) * coef
            mixture = sample_norm * mixture / np.linalg.norm(mixture, np.inf)
            write_wav(os.path.join(args.noisy_dump_dir, '{}_{:d}.wav'.format(key, it)),
                      mixture)
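# add_noise is an external helper here. The usual construction scales the noise so
# that the target-to-noise power ratio matches the sampled SNR/SDR in dB; a sketch
# under that assumption (period=True tiles a short background clip over the target):
import numpy as np

def add_noise(target, noise, snr_db, period=False):
    """Sketch: return a noise segment scaled to the requested SNR against target."""
    if period:
        # Tile short background noise to cover the whole target
        noise = np.tile(noise, int(np.ceil(target.size / noise.size)))
    if noise.size >= target.size:
        # Random crop to the target length
        beg = np.random.randint(0, noise.size - target.size + 1)
        noise = noise[beg:beg + target.size]
    else:
        noise = np.pad(noise, (0, target.size - noise.size))
    # Scale so that 10*log10(P_target / P_noise) == snr_db
    pow_target = np.mean(target ** 2)
    pow_noise = np.mean(noise ** 2) + 1e-10
    return noise * np.sqrt(pow_target / (pow_noise * 10 ** (snr_db / 10)))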
start = 0
end = 0
# fig = plt.figure(figsize=(15, 4))
# imageCoordinate = 100 + 10*n_speakers_final + 1
# i = 0
# times = numpy.arange(len(cls))/float(fs)
for i in range(1, len(cls)):
    if cls[i] == cls[i - 1]:
        end = i
    else:
        newpath = "result_wav/" + audioname
        if not os.path.exists(newpath):
            os.makedirs(newpath)
        audiofile = str(cls[i - 1]) + ":" + str(start * mt_step) + "-" + \
            str(end * mt_step) + ".wav"
        write_wav(os.path.join(newpath, audiofile), fs,
                  signal[int(start * mt_step * fs):int(end * mt_step * fs)])
        start = i
        # Next steps:
        # check whether the speaker is known
        # determine a GMM for audiofile
        # compare it with previously available GMM models
        # produce the result
        # cls[i] = speaker number and name matched with the GMM model
print(n_speakers_final, cls)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--file', type=str, required=True,
                        help='Path to HRIR or HeSuVi file.')
    parser.add_argument('--track_order', type=str, required=True,
                        help='Track order in HRIR file. "hesuvi" or "hexadecagonal"')
    parser.add_argument('--reverb', type=str, default=argparse.SUPPRESS,
                        help='Reverberation times for different channels in milliseconds. During this time the '
                             'reverberation tail will be reduced by 100 dB. A comma separated list of channel name and '
                             'reverberation time pairs, separated by a colon. If only a single numeric value is given, '
                             'it is used for all channels. When some channel names are given but not all, the missing '
                             'channels are not affected. Must be at least 3 ms smaller than the HRIR length. '
                             'For example "--reverb=300" or '
                             '"--reverb=FL:500,FC:100,FR:500,SR:700,BR:700,BL:700,SL:700" or '
                             '"--reverb=FC:100".')
    args = parser.parse_args()
    file_path = args.file
    track_order = args.track_order

    reverb = dict()
    try:
        # Single float value
        reverb = {ch: float(args.reverb) / 1000 for ch in SPEAKER_NAMES}
    except ValueError:
        # Channels separated
        for ch_t in args.reverb.split(','):
            reverb[ch_t.split(':')[0].upper()] = float(ch_t.split(':')[1]) / 1000

    fs, data = read_wav(file_path)

    for ch, t in reverb.items():
        print(f'{ch}: {t*1000:.0f}ms')
        n_ones = int(fs * 0.003)
        n_win = int(fs * t)
        win = np.concatenate([
            np.ones(n_ones),
            signal.windows.hann(n_win * 2)[n_win:],
            np.zeros(data.shape[1] - n_ones - n_win)
        ]) - 1.0
        win *= 100  # 100 dB
        win = 10**(win / 20)  # Linear scale
        if track_order == 'hesuvi':
            tracks = [i for i in range(len(HESUVI_TRACK_ORDER))
                      if ch in HESUVI_TRACK_ORDER[i]]
        elif track_order == 'hexadecagonal':
            tracks = [i for i in range(len(HEXADECAGONAL_TRACK_ORDER))
                      if ch in HEXADECAGONAL_TRACK_ORDER[i]]
        else:
            raise ValueError(f'Invalid track_order "{track_order}", '
                             f'allowed values are "hesuvi" and "hexadecagonal"')
        for i in tracks:
            data[i, :] *= win

    # Write WAV
    write_wav(os.path.join(DIR_PATH, 'cropped.wav'), fs, data)
def main(args):
    # MODEL (header and "add" growth branch reconstructed from the identical
    # function earlier in this collection)
    num_features = [args.features*i for i in range(1, args.levels+1)] if args.feature_growth == "add" else \
                   [args.features*2**i for i in range(0, args.levels)]
    target_outputs = int(args.output_size * args.sr)
    model = Waveunet(args.channels, num_features, args.channels, INSTRUMENTS,
                     kernel_size=args.kernel_size, target_output_size=target_outputs,
                     depth=args.depth, strides=args.strides, conv_type=args.conv_type,
                     res=args.res, separate=args.separate)

    if args.cuda:
        model = utils.DataParallel(model)
        print("move model to gpu")
        model.cuda()

    print("Loading model from checkpoint " + str(args.load_model))
    state = utils.load_model(model, None, args.load_model)

    preds = predict_song(args, args.input, model)

    output_folder = os.path.dirname(args.input) if args.output is None else args.output
    for inst in preds.keys():
        utils.write_wav(os.path.join(output_folder,
                                     os.path.basename(args.input) + "_" + inst + ".wav"),
                        preds[inst], args.sr)
def run(args):
    min_sdr, max_sdr = list(map(float, args.sdr.split(",")))
    wav_reader = WaveReader(args.wav_scp, sample_rate=args.fs)
    logger.info("Start simulating {:d} utterances from {}, with sdr = {} ...".format(
        args.num_utts, args.wav_scp, args.sdr))
    statsf = open(args.simu_stats, "w") if args.simu_stats else None
    # e.g. 640 = 0.04 * 16000
    frame_shift = int(args.fs * args.shift)
    for _ in tqdm.trange(args.num_utts):
        # List of dict objects
        min_dur, spks = sample_spks(wav_reader, args.num_spks, args.min_dur)
        mixture = np.zeros(min_dur)
        # Treat the first speaker as the target
        ref_pow = spks[0]["pow"]
        ref_dur = spks[0]["dur"]
        ref_spk = spks[0]["wav"]
        stats = []
        # Shift for target video
        shift = random.randint(0, (ref_dur - min_dur) // frame_shift)
        stats.append((spks[0]["key"], shift))
        # Target segment
        segment = ref_spk[shift * frame_shift:shift * frame_shift + min_dur]
        mixture += segment
        # Interference speakers
        sdrs = []
        infs = []
        for spk in spks[1:]:
            sdr_db = random.uniform(min_sdr, max_sdr)
            scaler = np.sqrt(ref_pow / spk["pow"] * 10**(-sdr_db / 10))
            # Video shift
            shift = random.randint(0, (spk["dur"] - min_dur) // frame_shift)
            stats.append((spk["key"], shift))
            # Mixture
            spkseg = spk["wav"][shift * frame_shift:shift * frame_shift + min_dur]
            mixture += scaler * spkseg
            infs.append(scaler * spkseg)
            sdrs.append("{:+.2f}".format(sdr_db))
        uttid = "{0}_{1}".format("_".join([d["key"] for d in spks]), "_".join(sdrs))
        scaler = random.uniform(0.6, 0.9) / np.linalg.norm(mixture, np.inf)
        write_wav(os.path.join(args.dump_dir, "mix/{}.wav".format(uttid)),
                  mixture * scaler, fs=args.fs)
        write_wav(os.path.join(args.dump_dir, "spk1/{}.wav".format(uttid)),
                  segment * scaler, fs=args.fs)
        if not args.target_only:
            for idx, spk in enumerate(infs):
                write_wav(os.path.join(args.dump_dir,
                                       "spk{}/{}.wav".format(idx + 2, uttid)),
                          spk * scaler, fs=args.fs)
        if statsf:
            record = uttid
            for pair in stats:
                record += " {0} {1}".format(pair[0], pair[1])
            statsf.write("{}\n".format(record))
    if statsf:
        statsf.close()
    logger.info("Done simulating {:d} utterances from {}, with sdr = {}".format(
        args.num_utts, args.wav_scp, args.sdr))
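# The interference gain above follows directly from the SDR definition. A quick
# standalone check (not part of the original script; numbers are arbitrary):
import numpy as np

# For a requested SDR of s dB, run() scales the interference by
# sqrt(ref_pow / spk_pow * 10**(-s / 10)), which makes
# 10*log10(ref_pow / (scale**2 * spk_pow)) == s:
ref_pow, spk_pow, s = 0.02, 0.05, 5.0
scale = np.sqrt(ref_pow / spk_pow * 10 ** (-s / 10))
assert np.isclose(10 * np.log10(ref_pow / (scale ** 2 * spk_pow)), s)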
def reverberate_and_mix(out_folder, sources_folder, rir_folder,
                        mix_info, scale_rirs=10.0, part=0, nparts=8,
                        num_mics=1, chat=True):
    """Reverberate and mix sources."""
    list_mix = sorted(mix_info.keys())
    list_len = len(list_mix)
    partsize = list_len // nparts
    assert part < nparts
    start = part * partsize
    end = list_len if part == nparts - 1 else (part + 1) * partsize
    if start == end:
        raise ValueError('Not enough mixtures to generate. Part {} of {} to '
                         'generate a total of {} mixtures.'.format(part, nparts, list_len))
    print('Reverberating and mixing from {} to {} out of {}.'.format(start, end, list_len))
    for mix in list_mix[start:end]:
        sources, rirs = mix_info[mix]
        mix_to_data = []
        max_src_len = -1
        if chat:
            print('--\n{} ='.format(mix))
        for source, rir in zip(sources, rirs):
            source_path = os.path.join(sources_folder, source)
            src_data, samplerate_src = read_wav(source_path, always_2d=True)
            rir_path = os.path.join(rir_folder, rir)
            rir_data, samplerate_rir = read_wav(rir_path, always_2d=True)
            assert samplerate_src == samplerate_rir
            # Pick channel 0 of src_data.
            src_data = src_data[:, 0]
            # Pick num_mics channels of rirs and scale them.
            if len(rir_data.shape) == 2:
                rir_mics = np.shape(rir_data)[1]
                if rir_mics < num_mics:
                    raise ValueError(f'The rir {rir_path} has only {rir_mics} channel '
                                     f'data but specified num_mics={num_mics}')
                rir_data = rir_data[:, :num_mics]
            else:
                if num_mics > 1:
                    raise ValueError(f'The rir {rir_path} has only single channel data '
                                     f'but specified num_mics={num_mics}')
                rir_data = np.reshape(rir_data, [-1, 1])
            rir_data = scale_rirs * rir_data
            rir_len = len(rir_data[:, 0])
            src_len = len(src_data)
            rir_max = np.max(np.abs(rir_data))
            src_max = np.max(np.abs(src_data))
            max_src_len = np.maximum(src_len, max_src_len)
            if chat:
                print('+ {} [{}, {:1.2f}] * {} [{}, {:1.2f}]'.format(
                    source, src_len, src_max, rir, rir_len, rir_max))
            mix_to_data.append([src_data, rir_data, source, rir])
        mix_rev_sources = []
        rir_paths_used = []
        for data in mix_to_data:
            src_data, rir_data, source_relpath, rir_relpath = data
            rir_paths_used.append(rir_relpath)
            src_len = len(src_data)
            if src_len < max_src_len:
                print('WARNING: original source data has {} samples '
                      'for source file {}, zero padding '
                      'to size {}.'.format(src_len, source_relpath, max_src_len))
                src_data = np.concatenate((src_data,
                                           np.zeros(max_src_len - src_len)), axis=0)
            rev_src_data = multimic_convolve(src_data, rir_data, 'same')
            # Write reverberated source data.
            rev_src_path = os.path.join(out_folder, source_relpath)
            os.makedirs(os.path.dirname(rev_src_path), exist_ok=True)
            write_wav(rev_src_path, rev_src_data, samplerate_src)
            mix_rev_sources.append(rev_src_data)
        mixed_rev_data = np.sum(np.stack(mix_rev_sources, axis=0), axis=0)
        mix_wav_path = os.path.join(out_folder, mix)
        mix_wav_base = mix_wav_path.rstrip('.wav')
        write_wav(mix_wav_path, mixed_rev_data, samplerate_src)
        in_wav_path = os.path.join(sources_folder, mix)
        in_wav_base = in_wav_path.rstrip('.wav')
        if os.path.exists(in_wav_base + '.jams'):
            shutil.copyfile(in_wav_base + '.jams', mix_wav_base + '.jams')
        if os.path.exists(in_wav_base + '.txt'):
            with open(in_wav_base + '.txt', 'r') as f:
                lines = f.readlines()
            with open(mix_wav_base + '.txt', 'w') as f:
                f.write(''.join(lines))
                f.write('\nroom impulse responses used:\n{}'.format(
                    '\n'.join(rir_paths_used)))
def writeSignal(signal, filename):
    write_wav(signal, filename, sr=SAMPLE_RATE)
def wav_to_vad(wav_file, vad_file, sr=8000):
    audio, rate = librosa.load(wav_file, sr=sr)
    v = VoiceActivityDetector()
    write_wav(vad_file, v.get_speech(audio), rate)
def evaluate(args, dataset, model):
    dB_list_pesq = dict()
    dB_list_name_pesq = dict()
    dB_list_stoi = dict()
    dB_list_name_stoi = dict()
    dB_list_SISDR = dict()
    dB_list_name_SISDR = dict()
    # Both test sets use the same SNR buckets; only the noise folder differs
    for i in ['-7.5', '-2.5', '2.5', '7.5']:
        dB_list_pesq[i] = list()
        dB_list_name_pesq[i] = list()
        dB_list_stoi[i] = list()
        dB_list_name_stoi[i] = list()
        dB_list_SISDR[i] = list()
        dB_list_name_SISDR[i] = list()
    test_noise_file = "outside_test/noise" if args.outside_test else "test/noise"

    noise_dir = os.path.join(args.dataset_dir, test_noise_file)
    noise_file = os.listdir(noise_dir)
    dB_noise_pesq = {}
    for i in noise_file:
        dB_noise_pesq[os.path.splitext(i)[0]] = list()

    model.eval()
    with torch.no_grad():
        with tqdm(total=len(dataset)) as pbar:
            for example in dataset:
                # Load source references in their original sr and channel number
                input_data = nussl.AudioSignal(example['input'])
                target_data = nussl.AudioSignal(example['target'])

                # Predict using mixture
                pred_sources = predict_song(args, example["input"], model).flatten()
                file_name = os.path.basename(example['input'])
                utils.write_wav(os.path.join(args.output, 'enhance_' + file_name),
                                pred_sources.T, args.sr)
                fname, ext = os.path.splitext(file_name)
                text = fname.split("_", 4)

                # Evaluate PESQ
                input_sources = input_data.audio_data.flatten()
                target_sources = target_data.audio_data.flatten()
                input_pesq = pesq(target_sources, input_sources, 16000)
                enhance_pesq = pesq(target_sources, pred_sources, 16000)

                # Evaluate STOI
                input_stoi = stoi(target_sources, input_sources, 16000, extended=False)
                enhance_stoi = stoi(target_sources, pred_sources, 16000, extended=False)

                # Evaluate SI-SDR: scores[target_sources.path_to_input_file]['SI-SDR'][0]
                enhance_data = nussl.AudioSignal(audio_data_array=pred_sources,
                                                 sample_rate=16000)
                evaluator = nussl.evaluation.BSSEvalScale(target_data, input_data)
                scores = evaluator.evaluate()
                input_SISDR = scores[target_data.path_to_input_file]['SI-SDR'][0]
                evaluator = nussl.evaluation.BSSEvalScale(target_data, enhance_data)
                scores = evaluator.evaluate()
                enhance_SISDR = scores[target_data.path_to_input_file]['SI-SDR'][0]

                filename = os.path.basename(example['input'])
                noise_name = filename.split("_")[0]
                dB_noise_pesq[noise_name].append([input_pesq, enhance_pesq,
                                                  enhance_pesq - input_pesq,
                                                  enhance_SISDR,
                                                  enhance_SISDR - input_SISDR])
                dB_list_pesq[text[4]].append(
                    [input_pesq, enhance_pesq, enhance_pesq - input_pesq])
                dB_list_name_pesq[text[4]].append(
                    [[input_pesq, enhance_pesq, enhance_pesq - input_pesq], file_name])
                dB_list_stoi[text[4]].append(
                    [input_stoi, enhance_stoi, enhance_stoi - input_stoi])
                dB_list_name_stoi[text[4]].append(
                    [[input_stoi, enhance_stoi, enhance_stoi - input_stoi], file_name])
                dB_list_SISDR[text[4]].append(
                    [input_SISDR, enhance_SISDR, enhance_SISDR - input_SISDR])
                dB_list_name_SISDR[text[4]].append(
                    [[input_SISDR, enhance_SISDR, enhance_SISDR - input_SISDR], file_name])
                pbar.update(1)

    num = len(dB_list_pesq)
    dB_list_name_pesq['avg'] = 0
    dB_list_name_stoi['avg'] = 0
    dB_list_name_SISDR['avg'] = 0
    improve_pesq = 0
    for key, value in dB_list_pesq.items():
        avg_pesq = np.mean(value, 0)
        pesq_list = [[avg_pesq[0], avg_pesq[1], avg_pesq[2]], "avg_pesq"]
        dB_list_name_pesq[key].append([pesq_list])
        dB_list_name_pesq['avg'] += avg_pesq[1] / num
        improve_pesq += avg_pesq[2] / num
    for key, value in dB_list_stoi.items():
        avg_stoi = np.mean(value, 0)
        stoi_list = [[avg_stoi[0], avg_stoi[1], avg_stoi[2]], "avg_stoi"]
        dB_list_name_stoi[key].append([stoi_list])
        dB_list_name_stoi['avg'] += avg_stoi[1] / num
    for key, value in dB_list_SISDR.items():
        avg_SISDR = np.mean(value, 0)
        SISDR_list = [[avg_SISDR[0], avg_SISDR[1], avg_SISDR[2]], "avg_SISDR"]
        dB_list_name_SISDR[key].append([SISDR_list])
        dB_list_name_SISDR['avg'] += avg_SISDR[1] / num

    noise_avg = list()
    for key, value in dB_noise_pesq.items():
        avg_pesq = np.mean(value, 0)
        noise_avg.append([key, np.round(avg_pesq, decimals=3)])
    print(noise_avg)

    dB_list_name_pesq['avg'] = round(dB_list_name_pesq['avg'], 3)
    dB_list_name_stoi['avg'] = round(dB_list_name_stoi['avg'], 3)
    dB_list_name_SISDR['avg'] = round(dB_list_name_SISDR['avg'], 3)
    pesq_avg = dB_list_name_pesq['avg']
    stoi_avg = dB_list_name_stoi['avg']
    SISDR_avg = dB_list_name_SISDR['avg']
    print(f'pesq_avg:{pesq_avg} stoi_avg:{stoi_avg} '
          f'improve_pesq:{round(improve_pesq, 3)} SISDR:{SISDR_avg}')
    return {
        'pesq': dB_list_name_pesq,
        'stoi': dB_list_name_stoi,
        'SISDR': dB_list_name_SISDR,
        'noise': noise_avg
    }
def seg_detail(fid, trained_model, mt_size, mt_step, st_win):
    """Segment in detail"""
    st_step = st_win
    results = {}
    fs, signal = read_wav(fid)
    [_, st_features] = mt_feature_extraction(signal, fs, mt_size * fs,
                                             mt_step * fs, round(fs * st_win))
    # VAD:
    segments = vad(st_features, st_step, smooth_window=0.5, weight=0)
    i = 0
    delta_t = 0.4
    for seg in segments:
        if seg[1] - seg[0] > 2 * delta_t:
            # Long segment: classify in sliding delta_t chunks
            start_seg = seg[0]
            end_seg = seg[0] + delta_t
            while start_seg < end_seg:
                label = trained_model.predict(fs, signal[int(start_seg * fs):int(end_seg * fs)])
                print(fid, '--', [start_seg, end_seg], '->', label)
                # write_wav(os.path.join(os.path.pardir, "result", "result_wav",
                #                        os.path.basename(fid)[:-3] + "-" + str(start_seg) + "-" +
                #                        str(end_seg) + "-" + label + ".wav"),
                #           fs, signal[int(start_seg * fs):int(end_seg * fs)])
                results[i] = {"label": label, "start": start_seg, "end": end_seg}
                i = i + 1
                start_seg = end_seg
                end_seg = start_seg + delta_t if start_seg + 2 * delta_t < seg[1] else seg[1]
        else:
            label = trained_model.predict(fs, signal[int(seg[0] * fs):int(seg[1] * fs)])
            print(fid, '--', seg, '->', label)
            results[i] = {"label": label, "start": seg[0], "end": seg[1]}
            i = i + 1

    # Merge consecutive chunks with the same label into speaker turns
    data = {"video_info": {}, "results": []}
    min_duration = 0.5
    start_seg = results[0]["start"]
    end_seg = results[0]["end"]
    label = results[0]["label"]
    for j in range(1, i - 1):
        if results[j]["start"] - end_seg < min_duration \
                and results[j]["label"] == label:
            end_seg = results[j]["end"]
        else:
            if end_seg - start_seg >= 2 * min_duration:
                data["results"].append({"start": start_seg, "end": end_seg,
                                        "speaker_id": label})
            start_seg = results[j]["start"]
            end_seg = results[j]["end"]
            label = results[j]["label"]
    data["results"].append({"start": start_seg, "end": end_seg, "speaker_id": label})
    write_wav(os.path.join(os.path.pardir, "result", "result_wav",
                           os.path.basename(fid)[:-4] + "-" + str(start_seg) + "-" +
                           str(end_seg) + "-" + label + ".wav"),
              fs, signal[int(start_seg * fs):int(end_seg * fs)])
    with open(os.path.join(os.path.pardir, "result", "test_json",
                           os.path.basename(fid)[:-3] + "json"),
              'w', encoding='utf-8') as json_file:
        print("..\\result\\test_json\\" + os.path.basename(fid)[:-3] + "json -> Generated")
        json.dump(data, json_file, ensure_ascii=False)