def cut_audio(output_path, sample_rate, mono, dilate, strip_prefix, audio_backend, add_sub_paths, audio_transcripts):
    audio_path_res = []
    prev_audio_path = ''
    for t in audio_transcripts:
        audio_path = t['audio_path']
        # reuse the previously read signal when consecutive segments come from the same file
        signal = audio.read_audio(audio_path, sample_rate, backend=audio_backend)[0] if audio_path != prev_audio_path else signal
        if signal.numel() == 0:
            # bug with empty audio files which produce an empty cut file
            print('Empty audio_path ', audio_path)
            return []

        t['channel'] = 0 if len(signal) == 1 else None if mono else t.get('channel')
        segment = signal[slice(t['channel'], 1 + t['channel']) if t['channel'] is not None else ...,
                         int(max(t['begin'] - dilate, 0) * sample_rate):int((t['end'] + dilate) * sample_rate)]

        segment_file_name = os.path.basename(audio_path) + '.{channel}-{begin:.06f}-{end:.06f}.wav'.format(**t)
        # optionally spread segments over md5-derived sub-directories to avoid one huge flat directory
        digest = hashlib.md5(segment_file_name.encode('utf-8')).hexdigest()
        sub_path = [digest[-1:], digest[:2], segment_file_name] if add_sub_paths else [segment_file_name]
        segment_path = os.path.join(output_path, *sub_path)

        os.makedirs(os.path.dirname(segment_path), exist_ok=True)
        audio.write_audio(segment_path, segment, sample_rate, mono=True)

        if strip_prefix:
            segment_path = segment_path[len(strip_prefix):] if segment_path.startswith(strip_prefix) else segment_path
            t['audio_path'] = t['audio_path'][len(strip_prefix):] if t['audio_path'].startswith(strip_prefix) else t['audio_path']

        t = dict(
            audio_path=segment_path,
            audio_name=os.path.basename(segment_path),
            channel=0 if len(signal) == 1 else None,
            begin=0.0,
            end=segment.shape[-1] / sample_rate,
            speaker=t.pop('speaker', None),
            ref=t.pop('ref', None),
            hyp=t.pop('hyp', None),
            cer=t.pop('cer', None),
            wer=t.pop('wer', None),
            alignment=t.pop('alignment', []),
            words=t.pop('words', []),
            meta=t)

        prev_audio_path = audio_path
        audio_path_res.append(t)
    return audio_path_res
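# Illustrative sketch only (not from the original code): a worked example of the segment
# naming and md5-based sub-path layout used by cut_audio() above; every value here is
# invented for the example.
import hashlib
import os

segment_file_name = 'call.wav' + '.{channel}-{begin:.06f}-{end:.06f}.wav'.format(channel=0, begin=1.5, end=3.25)
digest = hashlib.md5(segment_file_name.encode('utf-8')).hexdigest()
# with add_sub_paths=True the segment lands under <last hex digit>/<first two hex digits>/<name>
print(os.path.join('output', digest[-1:], digest[:2], segment_file_name))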
def main(args):
    with open(args.diarization_dataset) as data_file:
        for line in tqdm(data_file):
            example = json.loads(line)
            mask = transcripts.intervals_to_mask(example.pop('intervals'), example['sample_rate'], example['duration']).numpy()
            path, ext = os.path.splitext(example['audio_path'])
            signal, sample_rate = audio.read_audio(path + '_mix' + ext, mono=True)
            speaker_1 = signal[:, :mask.shape[-1]] * mask[1, :signal.shape[-1]]
            audio.write_audio(path + '_s1' + ext, speaker_1.T, sample_rate)
            speaker_2 = signal[:, :mask.shape[-1]] * mask[2, :signal.shape[-1]]
            audio.write_audio(path + '_s2' + ext, speaker_2.T, sample_rate)
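# Illustrative sketch only (not from the original code): the per-speaker masking idea used in
# main() above, on toy data. main() indexes mask[1] and mask[2] for the two speakers, so this
# sketch assumes the same layout (row 0 reserved, rows 1 and 2 per speaker); multiplying the
# mixed signal by a speaker's row zeroes out every sample outside that speaker's intervals.
import numpy as np

signal = np.arange(1, 9, dtype=np.float32)[None, :]   # toy mono mix, shape (1, 8)
mask = np.zeros((3, 8), dtype=np.float32)
mask[1, 0:4] = 1.0                                     # speaker 1 talks in the first half
mask[2, 4:8] = 1.0                                     # speaker 2 talks in the second half
speaker_1 = signal * mask[1]                           # [[1. 2. 3. 4. 0. 0. 0. 0.]]
speaker_2 = signal * mask[2]                           # [[0. 0. 0. 0. 5. 6. 7. 8.]]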
def audio_data_uri(audio_path, sample_rate=None):
    if isinstance(audio_path, str):
        wav_bytes = open(audio_path, 'rb').read()
    else:
        wav_bytes = audio.write_audio(io.BytesIO(), audio_path, sample_rate).getvalue()
    return 'data:audio/wav;base64,' + base64.b64encode(wav_bytes).decode()
def audio_data_uri(audio_path, sample_rate=None, audio_backend='scipy', audio_format='wav'):
    data_uri = lambda audio_format, audio_bytes: f'data:audio/{audio_format};base64,' + base64.b64encode(audio_bytes).decode()
    if isinstance(audio_path, str):
        assert audio_path.endswith('.wav')
        audio_bytes, audio_format = open(audio_path, 'rb').read(), 'wav'
    else:
        audio_bytes = audio.write_audio(io.BytesIO(), audio_path, sample_rate, backend=audio_backend, format=audio_format).getvalue()
    return data_uri(audio_format=audio_format, audio_bytes=audio_bytes)
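# Illustrative sketch only (independent of the project's audio module): a data URI like the
# one returned by audio_data_uri() can be embedded directly in an HTML <audio> element,
# which is the usual reason for inlining audio as base64. Here a short silent wav is
# synthesized with the standard-library wave module just to have bytes to encode.
import base64
import io
import wave

buf = io.BytesIO()
with wave.open(buf, 'wb') as w:
    w.setnchannels(1)
    w.setsampwidth(2)
    w.setframerate(8000)
    w.writeframes(b'\x00\x00' * 8000)               # one second of silence
uri = 'data:audio/wav;base64,' + base64.b64encode(buf.getvalue()).decode()
html = f'<audio controls src="{uri}"></audio>'      # ready to drop into a report page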
def synth():
    tokens_ph = tf.placeholder(tf.int32, [None], "tokens_ph")
    tokens_op = tf.expand_dims(tokens_ph, 0)
    token_lengths = tf.expand_dims(tf.shape(tokens_ph)[0], 0)
    tacotron = model.Model(hyperparams, is_training=False, inputs=tokens_op, input_lengths=token_lengths)
    melspectrum_op = tacotron.decoder.mel_outputs
    spectrum_op = tacotron.decoder.linear_outputs
    alignments_op = tacotron.decoder.alignments
    saver = tf.train.Saver()

    with tf.Session() as sess:
        restore_path = input('Restore path: ')
        saver.restore(sess, restore_path)
        while True:
            sentence = input('Input: ')
            if sentence == '':
                sentence = "In the beginning God created the heavens and the earth."
            tokens = text.encode(sentence)
            melspectrum, spectrum, alignments = sess.run([melspectrum_op, spectrum_op, alignments_op], {tokens_ph: tokens})

            plt.figure()
            plt.imshow(melspectrum[0])
            plt.figure()
            plt.imshow(spectrum[0])
            plt.figure()
            plt.imshow(alignments[0])
            plt.show()

            signal = audio.reconstruct(hyperparams, spectrum[0].T)
            audio.write_audio(sentence + ".wav", signal, hyperparams.sample_rate)
def test_audio_conv(audio_path):
    # Audio file
    wav_path = os.path.join(hyperparams.dataset_path, audio_path + '.wav')
    wave = audio.read_audio(wav_path, hyperparams.sample_rate)
    audio_length = wave.shape[0] / hyperparams.sample_rate

    # Calculate spectrum
    mel, linear = audio.spectrogram(hyperparams, wave)
    #plt.imshow(mel)
    from_mel = audio.mel_to_linear(mel, (hyperparams.num_freq - 1) * 2, hyperparams.sample_rate, hyperparams.num_mels)
    plt.imshow(from_mel)
    plt.show()
    plt.imshow(linear)
    plt.show()

    signal = audio.reconstruct(hyperparams, linear)
    audio.write_audio('test.wav', signal, hyperparams.sample_rate)
    signal = audio.reconstruct(hyperparams, mel, from_mel=True)
    audio.write_audio('test_mel.wav', signal, hyperparams.sample_rate)
def generate_utterances(audio_path: str, output_path: str, sample_rate: int, vad, utterance_duration: float, stride: int, min_utterance_score: float):
    signal, _ = audio.read_audio(audio_path, sample_rate=sample_rate, mono=False, dtype=vad.input_dtype, __array_wrap__=vad.input_type)
    speaker_masks = vad.detect(signal, allow_overlap=True)
    if vad.input_type == torch.tensor:
        speaker_masks = speaker_masks.cpu().numpy()

    utterance_duration = math.ceil(utterance_duration * sample_rate)
    assert utterance_duration % stride == 0

    ## https://habr.com/ru/post/489734/#1d
    # sliding-window view over the per-speaker masks; as_strided strides are expressed in bytes,
    # hence the scaling by the element stride
    sliding_window = np.lib.stride_tricks.as_strided(
        speaker_masks,
        shape=(
            speaker_masks.shape[0],
            int((speaker_masks.shape[-1] - utterance_duration) / stride) + 1,
            utterance_duration,
        ),
        strides=(speaker_masks.strides[0], stride * speaker_masks.strides[-1], speaker_masks.strides[-1]))
    n_samples_by_speaker = sliding_window.sum(-1)
    # speakers ratio in range [0;1] - silence ratio in range [0;1]
    utterance_scores = n_samples_by_speaker[1:].min(0) / (n_samples_by_speaker[1:].max(0) + 1) - n_samples_by_speaker[0] / utterance_duration

    n = 0
    audio_name, extension = os.path.splitext(os.path.basename(audio_path))
    while utterance_scores.max() > min_utterance_score:
        i = np.argmax(utterance_scores)
        utterance = signal[:, i * stride:i * stride + utterance_duration]
        # suppress scores of overlapping windows so the next pick does not reuse these samples
        utterance_scores[max(0, i - int(utterance_duration / stride) + 1):i + int(utterance_duration / stride)] = 0.0
        audio.write_audio(os.path.join(output_path, 'mix', f'{audio_name}.{n}{extension}'), utterance.T, sample_rate, mono=True)
        audio.write_audio(os.path.join(output_path, 'spk1', f'{audio_name}.{n}{extension}'), utterance[0:1, :].T, sample_rate, mono=True)
        audio.write_audio(os.path.join(output_path, 'spk2', f'{audio_name}.{n}{extension}'), utterance[1:2, :].T, sample_rate, mono=True)
        n += 1
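# Illustrative sketch only (not from the original code): the as_strided sliding-window trick
# used in generate_utterances() above (see the linked habr post), shown on a toy 1D mask so
# the window size, stride and byte-stride scaling are explicit.
import numpy as np

mask = np.array([0, 0, 1, 1, 1, 0, 1, 1], dtype=np.uint8)
window, stride = 4, 2
n_windows = (mask.shape[-1] - window) // stride + 1
windows = np.lib.stride_tricks.as_strided(
    mask,
    shape=(n_windows, window),
    strides=(stride * mask.strides[-1], mask.strides[-1]))   # as_strided strides are in bytes
print(windows.sum(-1))   # speech samples per window: [2 3 3]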
def __getitem__(self, index):
    waveform_transform_debug = (
        lambda audio_path, sample_rate, signal: audio.write_audio(
            os.path.join(self.waveform_transform_debug_dir, os.path.basename(audio_path) + '.wav'),
            signal,
            sample_rate)) if self.waveform_transform_debug_dir else None
    audio_path = self.audio_path[index]

    transcript = self.load_example(index)

    signal, sample_rate = audio.read_audio(
        audio_path,
        sample_rate=self.sample_rate,
        mono=self.mono,
        backend=self.audio_backend,
        duration=self.max_duration,
        dtype=self.audio_dtype) if self.frontend is None or self.frontend.read_audio else (audio_path, self.sample_rate)

    #TODO: support forced mono even if transcript is given
    #TODO: subsample speaker labels according to features

    some_segments_have_not_begin_end = any(
        t['begin'] == transcripts.time_missing and t['end'] == transcripts.time_missing for t in transcript)
    some_segments_have_ref = any(bool(t['ref']) for t in transcript)
    replace_transcript = self.join_transcript or (not transcript) or (some_segments_have_not_begin_end and some_segments_have_ref)

    if replace_transcript:
        assert len(signal) == 1, 'only mono supported for now'
        ref_full = [t['ref'] for t in transcript]
        speaker = torch.cat([
            torch.full((len(ref) + 1, ), t['speaker'], dtype=torch.int64)
                .scatter_(0, torch.tensor(len(ref)), transcripts.speaker_missing)
            for t, ref in zip(transcript, ref_full)
        ])[:-1].unsqueeze(0)
        transcript = [
            dict(
                audio_path=audio_path,
                ref=' '.join(ref_full),
                example_id=self.example_id(dict(audio_path=audio_path)),
                channel=0,
                begin_samples=0,
                end_samples=None)
        ]
    else:
        transcript = [
            dict(
                audio_path=audio_path,
                ref=t['ref'],
                example_id=self.example_id(t),
                channel=channel,
                begin_samples=int(t['begin'] * sample_rate) if t['begin'] != transcripts.time_missing else 0,
                end_samples=1 + int(t['end'] * sample_rate) if t['end'] != transcripts.time_missing else signal.shape[1],
                speaker=t['speaker'])
            for t in sorted(transcript, key=transcripts.sort_key)
            for channel in ([t['channel']] if t['channel'] != transcripts.channel_missing else range(len(signal)))
        ]
        speaker = torch.LongTensor([t.pop('speaker') for t in transcript]).unsqueeze(-1)

    ## TODO check logic
    features = []
    for t in transcript:
        channel = t.pop('channel')
        time_slice = slice(t.pop('begin_samples'), t.pop('end_samples'))  # pop is required independent of segmented
        if self.segmented and not self.debug_short_long_records_features_from_whole_normalized_signal:
            segment = signal[None, channel, time_slice]
        else:
            segment = signal[None, channel, :]  # begin, end meta could be corrupted, that's why we don't use it here
        if self.frontend is not None:
            if self.debug_short_long_records_features_from_whole_normalized_signal:
                segment_features = self.frontend(segment)
                hop_length = self.frontend.hop_length
                segment_features = segment_features[:, :, time_slice.start // hop_length:time_slice.stop // hop_length]
                features.append(segment_features.squeeze(0))
            else:
                features.append(self.frontend(segment, waveform_transform_debug=waveform_transform_debug).squeeze(0))
        else:
            features.append(segment)

    targets = []
    for pipeline in self.text_pipelines:
        encoded_transcripts = []
        for t in transcript:
            processed = pipeline.preprocess(t['ref'])
            tokens = torch.tensor(pipeline.encode([processed])[0], dtype=torch.long, device='cpu')
            encoded_transcripts.append(tokens)
        targets.append(encoded_transcripts)

    # not batch mode
    if not self.segmented:
        transcript, speaker, features = transcript[0], speaker[0], features[0]
        targets = [target[0] for target in targets]

    return [transcript, speaker, features] + targets
def ref(input_path, output_path, sample_rate, window_size, device, max_duration, debug_audio, html, ext):
    os.makedirs(output_path, exist_ok=True)
    audio_source = ([(input_path, audio_name) for audio_name in os.listdir(input_path)]
                    if os.path.isdir(input_path) else
                    [(os.path.dirname(input_path), os.path.basename(input_path))])
    for i, (input_path, audio_name) in enumerate(audio_source):
        print(i, '/', len(audio_source), audio_name)
        audio_path = os.path.join(input_path, audio_name)
        noextname = audio_name[:-len(ext)]
        transcript_path = os.path.join(output_path, noextname + '.json')
        rttm_path = os.path.join(output_path, noextname + '.rttm')

        signal, sample_rate = audio.read_audio(audio_path, sample_rate=sample_rate, mono=False, dtype='float32', duration=max_duration)

        speaker_id_ref, speaker_id_ref_ = select_speaker(
            signal.to(device),
            silence_absolute_threshold=0.05,
            silence_relative_threshold=0.2,
            kernel_size_smooth_signal=128,
            kernel_size_smooth_speaker=4096,
            kernel_size_smooth_silence=4096)

        transcript = [
            dict(
                audio_path=audio_path,
                begin=float(begin) / sample_rate,
                end=(float(begin) + float(duration)) / sample_rate,
                speaker=speaker,
                speaker_name=transcripts.default_speaker_names[speaker])
            for speaker in range(1, len(speaker_id_ref_))
            for begin, duration, mask in zip(*models.rle1d(speaker_id_ref_[speaker]))
            if mask == 1
        ]
        #transcript = [dict(audio_path = audio_path, begin = float(begin) / sample_rate, end = (float(begin) + float(duration)) / sample_rate, speaker_name = str(int(speaker)), speaker = int(speaker)) for begin, duration, speaker in zip(*models.rle1d(speaker_id_ref.cpu()))]

        transcript_without_speaker_missing = [t for t in transcript if t['speaker'] != transcripts.speaker_missing]
        transcripts.save(transcript_path, transcript_without_speaker_missing)
        print(transcript_path)

        transcripts.save(rttm_path, transcript_without_speaker_missing)
        print(rttm_path)

        if debug_audio:
            audio.write_audio(
                transcript_path + '.wav',
                torch.cat([
                    signal[..., :speaker_id_ref.shape[-1]],
                    convert_speaker_id(speaker_id_ref[..., :signal.shape[-1]], to_bipole=True).unsqueeze(0).cpu() * 0.5,
                    speaker_id_ref_[..., :signal.shape[-1]].cpu() * 0.5
                ]),
                sample_rate,
                mono=False)
            print(transcript_path + '.wav')

        if html:
            html_path = os.path.join(output_path, audio_name + '.html')
            vis.transcript(html_path, sample_rate=sample_rate, mono=True, transcript=transcript, duration=max_duration)
params.grain_spacing = (sample_rate * params.grain_spacing_ms) / 1000

[grain_groups, event_list, event_groups, features] = grp.group_events(source_audio, params)

if params.debug > 0:
    stats.num_events = len(event_list)

if params.mode == 'loop':
    streams = gen.group_loop(sample_rate, params, grain_groups, features, stats)
elif params.mode == 'block':
    streams = gen.block_generator(sample_rate, params, grain_groups, features, stats)

print("Mixing down..")
output_audio = au.post_process(streams, params)
au.write_audio(params.outfile, sample_rate, output_audio)

if params.debug > 0:
    print("Run stats:")
    print(" Number of events: %d" % stats.num_events)
    print(" Number of grains: %d" % stats.num_grains)
    print(" Number of effect convolutions: %d" % stats.convolutions)
    print(" Number of filter uses: %d" % stats.filterings)

if params.debug > 1:
    import plotting as pl
    print("Plotting..")
    pl.plot_features(event_groups, features, params.num_groups)
    pl.plot_source_audio(source_audio, sample_rate, event_list, event_groups)
    pl.plot_generated_audio(output_audio, sample_rate)
    pl.show()
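# Illustrative arithmetic only: the ms-to-samples conversion at the top of the block above,
# with a hypothetical 44100 Hz sample rate and 20 ms grain spacing.
print((44100 * 20) / 1000)   # 882.0 samples per grain spacing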
def __getitem__(self, index):
    waveform_transform_debug = (
        lambda audio_path, sample_rate, signal: audio.write_audio(
            os.path.join(self.waveform_transform_debug_dir, os.path.basename(audio_path) + '.wav'),
            signal,
            sample_rate)) if self.waveform_transform_debug_dir else None
    audio_path = self.audio_path[index]

    transcript = self.load_example(index)

    signal, sample_rate = audio.read_audio(
        audio_path,
        sample_rate=self.sample_rate,
        mono=self.mono,
        backend=self.audio_backend,
        duration=self.max_duration) if self.frontend is None or self.frontend.read_audio else (audio_path, self.sample_rate)

    #TODO: support forced mono even if transcript is given
    #TODO: subsample speaker labels according to features

    some_segments_have_not_begin_end = any(
        t['begin'] == self.time_missing and t['end'] == self.time_missing for t in transcript)
    some_segments_have_ref = any(bool(t['ref']) for t in transcript)
    replace_transcript = self.join_transcript or (not transcript) or (some_segments_have_not_begin_end and some_segments_have_ref)

    if replace_transcript:
        assert len(signal) == 1, 'only mono supported for now'
        # replacing ref by normalizing only with default preprocessor
        ref_full = [self.labels[0].normalize_text(t['ref']) for t in transcript]
        speaker = torch.cat([
            torch.full((len(ref) + 1, ), t['speaker'], dtype=torch.int64)
                .scatter_(0, torch.tensor(len(ref)), self.speaker_missing)
            for t, ref in zip(transcript, ref_full)
        ])[:-1]
        transcript = [
            dict(
                audio_path=audio_path,
                ref=' '.join(ref_full),
                example_id=self.example_id(dict(audio_path=audio_path)),
                channel=0,
                begin_samples=0,
                end_samples=None)
        ]
        normalize_text = False
    else:
        transcript = [
            dict(
                audio_path=audio_path,
                ref=t['ref'],
                example_id=self.example_id(t),
                channel=channel,
                begin_samples=int(t['begin'] * sample_rate) if t['begin'] != self.time_missing else 0,
                end_samples=1 + int(t['end'] * sample_rate) if t['end'] != self.time_missing else signal.shape[1],
                speaker=t['speaker'])
            for t in sorted(transcript, key=transcripts.sort_key)
            for channel in ([t['channel']] if t['channel'] != self.channel_missing else range(len(signal)))
        ]
        speaker = torch.LongTensor([t.pop('speaker') for t in transcript]).unsqueeze(-1)
        normalize_text = True

    features = [
        self.frontend(segment.unsqueeze(0), waveform_transform_debug=waveform_transform_debug)
        if self.frontend is not None else segment.unsqueeze(0)
        for t in transcript
        for segment in [signal[t.pop('channel'), t.pop('begin_samples'):t.pop('end_samples')]]
    ]

    targets = [[labels.encode(t['ref'], normalize=normalize_text)[1] for t in transcript] for labels in self.labels]

    # not batch mode
    if not self.segmented:
        transcript, speaker, features = transcript[0], speaker[0], features[0][0]
        targets = [target[0] for target in targets]

    return [transcript, speaker, features] + targets