Example no. 1
def cut_audio(output_path, sample_rate, mono, dilate, strip_prefix,
              audio_backend, add_sub_paths, audio_transcripts):
    audio_path_res = []
    prev_audio_path = ''
    for t in audio_transcripts:
        audio_path = t['audio_path']
        if audio_path != prev_audio_path:
            signal = audio.read_audio(audio_path,
                                      sample_rate,
                                      backend=audio_backend)[0]

        if signal.numel() == 0:
            # empty audio files would otherwise produce an empty cut file
            print('Empty audio_path ', audio_path)
            return []

        t['channel'] = (0 if len(signal) == 1 else
                        None if mono else t.get('channel'))
        channel_slice = (slice(t['channel'], t['channel'] + 1)
                         if t['channel'] is not None else ...)
        segment = signal[channel_slice,
                         int(max(t['begin'] - dilate, 0) * sample_rate):
                         int((t['end'] + dilate) * sample_rate)]

        segment_file_name = (os.path.basename(audio_path) +
                             '.{channel}-{begin:.06f}-{end:.06f}.wav'.format(**t))
        digest = hashlib.md5(segment_file_name.encode('utf-8')).hexdigest()
        sub_path = ([digest[-1:], digest[:2], segment_file_name]
                    if add_sub_paths else [segment_file_name])

        segment_path = os.path.join(output_path, *sub_path)
        os.makedirs(os.path.dirname(segment_path), exist_ok=True)
        audio.write_audio(segment_path, segment, sample_rate, mono=True)

        if strip_prefix:
            if segment_path.startswith(strip_prefix):
                segment_path = segment_path[len(strip_prefix):]
            if t['audio_path'].startswith(strip_prefix):
                t['audio_path'] = t['audio_path'][len(strip_prefix):]

        t = dict(audio_path=segment_path,
                 audio_name=os.path.basename(segment_path),
                 channel=0 if len(signal) == 1 else None,
                 begin=0.0,
                 end=segment.shape[-1] / sample_rate,
                 speaker=t.pop('speaker', None),
                 ref=t.pop('ref', None),
                 hyp=t.pop('hyp', None),
                 cer=t.pop('cer', None),
                 wer=t.pop('wer', None),
                 alignment=t.pop('alignment', []),
                 words=t.pop('words', []),
                 meta=t)

        prev_audio_path = audio_path
        audio_path_res.append(t)
    return audio_path_res
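
For comparison, here is a minimal, self-contained sketch of the same segment-cutting idea built on the soundfile package instead of the repository's audio helpers; the function name, the dilation semantics, and the output naming scheme are illustrative assumptions, not part of the original code.

import os
import soundfile as sf  # assumption: soundfile is available


def cut_segment(audio_path, begin, end, output_dir, dilate=0.0):
    # read as (samples, channels) and keep the sample rate reported by the file
    signal, sample_rate = sf.read(audio_path, always_2d=True)
    start = int(max(begin - dilate, 0.0) * sample_rate)
    stop = int((end + dilate) * sample_rate)
    segment = signal[start:stop]
    segment_name = '{}.{:.6f}-{:.6f}.wav'.format(os.path.basename(audio_path), begin, end)
    os.makedirs(output_dir, exist_ok=True)
    segment_path = os.path.join(output_dir, segment_name)
    sf.write(segment_path, segment, sample_rate)
    return segment_path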
Example no. 2
def main(args):
    with open(args.diarization_dataset) as data_file:
        for line in tqdm(data_file):
            example = json.loads(line)
            mask = transcripts.intervals_to_mask(example.pop('intervals'),
                                                 example['sample_rate'],
                                                 example['duration']).numpy()
            path, ext = os.path.splitext(example['audio_path'])
            signal, sample_rate = audio.read_audio(path + '_mix' + ext,
                                                   mono=True)
            speaker_1 = signal[:, :mask.shape[-1]] * mask[1, :signal.shape[-1]]
            audio.write_audio(path + '_s1' + ext, speaker_1.T, sample_rate)
            speaker_2 = signal[:, :mask.shape[-1]] * mask[2, :signal.shape[-1]]
            audio.write_audio(path + '_s2' + ext, speaker_2.T, sample_rate)
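
For reference, a minimal sketch of what an intervals-to-mask helper along the lines of transcripts.intervals_to_mask could look like; the interval layout (speaker, begin, end) in seconds and the fixed number of mask rows are assumptions, not the repository's actual implementation.

import numpy as np


def intervals_to_mask(intervals, sample_rate, duration, num_speakers=3):
    # one row per speaker index; True where that speaker is active
    mask = np.zeros((num_speakers, int(duration * sample_rate)), dtype=bool)
    for speaker, begin, end in intervals:
        mask[speaker, int(begin * sample_rate):int(end * sample_rate)] = True
    return mask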
Example no. 3
def audio_data_uri(audio_path, sample_rate=None):
    if isinstance(audio_path, str):
        with open(audio_path, 'rb') as f:
            wav_bytes = f.read()
    else:
        wav_bytes = audio.write_audio(io.BytesIO(), audio_path,
                                      sample_rate).getvalue()

    return 'data:audio/wav;base64,' + base64.b64encode(wav_bytes).decode()
Example no. 4
def audio_data_uri(audio_path, sample_rate = None, audio_backend = 'scipy', audio_format = 'wav'):
	data_uri = lambda audio_format, audio_bytes: f'data:audio/{audio_format};base64,' + base64.b64encode(audio_bytes).decode()
	
	if isinstance(audio_path, str):
		assert audio_path.endswith('.wav')
		audio_bytes, audio_format = open(audio_path, 'rb').read(), 'wav'
	else:
		audio_bytes = audio.write_audio(io.BytesIO(), audio_path, sample_rate, backend = audio_backend, format = audio_format).getvalue()
		
	return data_uri(audio_format = audio_format, audio_bytes = audio_bytes)
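
A typical use of the returned value is embedding it directly in an HTML report, for example (illustrative snippet; 'example.wav' is a placeholder path):

uri = audio_data_uri('example.wav')
html_snippet = f'<audio controls src="{uri}"></audio>'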
Example no. 5
def synth():
    tokens_ph = tf.placeholder(tf.int32, [None], "tokens_ph")

    tokens_op = tf.expand_dims(tokens_ph, 0)
    token_lengths = tf.expand_dims(tf.shape(tokens_ph)[0], 0)

    tacotron = model.Model(hyperparams,
                           is_training=False,
                           inputs=tokens_op,
                           input_lengths=token_lengths)

    melspectrum_op = tacotron.decoder.mel_outputs
    spectrum_op = tacotron.decoder.linear_outputs
    alignments_op = tacotron.decoder.alignments

    saver = tf.train.Saver()

    with tf.Session() as sess:

        restore_path = input('Restore path: ')
        saver.restore(sess, restore_path)

        while True:
            sentence = input('Input: ')
            if sentence == '':
                sentence = "In the beginning God created the heavens and the earth."
            tokens = text.encode(sentence)
            melspectrum, spectrum, alignments = sess.run(
                [melspectrum_op, spectrum_op, alignments_op],
                {tokens_ph: tokens})
            plt.figure()
            plt.imshow(melspectrum[0])
            plt.figure()
            plt.imshow(spectrum[0])
            plt.figure()
            plt.imshow(alignments[0])
            plt.show()
            signal = audio.reconstruct(hyperparams, spectrum[0].T)
            audio.write_audio(sentence + ".wav", signal,
                              hyperparams.sample_rate)
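
Note that writing to sentence + ".wav" can fail for long sentences or ones containing path separators; a safer output name could be derived with a small helper like the following (illustrative sketch, not part of the original script):

import re


def safe_wav_name(sentence, max_len=64):
    # keep only word characters and dashes, cap the length, fall back to a default
    slug = re.sub(r'[^\w\-]+', '_', sentence).strip('_')[:max_len]
    return (slug or 'utterance') + '.wav'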
Example no. 6
def test_audio_conv(audio_path):
    # Audio file
    wav_path = os.path.join(hyperparams.dataset_path, audio_path + '.wav')
    wave = audio.read_audio(wav_path, hyperparams.sample_rate)
    audio_length = wave.shape[0] / hyperparams.sample_rate

    # Calculate spectrum
    mel, linear = audio.spectrogram(hyperparams, wave)

    #plt.imshow(mel)
    from_mel = audio.mel_to_linear(mel, (hyperparams.num_freq - 1) * 2,
                                   hyperparams.sample_rate,
                                   hyperparams.num_mels)
    plt.imshow(from_mel)
    plt.show()
    plt.imshow(linear)
    plt.show()

    signal = audio.reconstruct(hyperparams, linear)
    audio.write_audio('test.wav', signal, hyperparams.sample_rate)

    signal = audio.reconstruct(hyperparams, mel, from_mel=True)
    audio.write_audio('test_mel.wav', signal, hyperparams.sample_rate)
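
A rough, self-contained equivalent of the mel-to-linear round trip above can be written with librosa; the frame parameters below are illustrative defaults, not the repository's hyperparams.

import librosa
import soundfile as sf


def mel_round_trip(wav_path, sr=22050, n_fft=1024, hop_length=256, n_mels=80):
    y, _ = librosa.load(wav_path, sr=sr)
    mel = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=n_fft,
                                         hop_length=hop_length, n_mels=n_mels)
    # invert the mel filterbank, then reconstruct the phase with Griffin-Lim
    linear = librosa.feature.inverse.mel_to_stft(mel, sr=sr, n_fft=n_fft)
    signal = librosa.griffinlim(linear, hop_length=hop_length)
    sf.write('test_mel_roundtrip.wav', signal, sr)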
Example no. 7
def generate_utterances(audio_path: str, output_path: str, sample_rate: int,
                        vad, utterance_duration: float, stride: int,
                        min_utterance_score: float):
    signal, _ = audio.read_audio(audio_path,
                                 sample_rate=sample_rate,
                                 mono=False,
                                 dtype=vad.input_dtype,
                                 __array_wrap__=vad.input_type)
    speaker_masks = vad.detect(signal, allow_overlap=True)
    if vad.input_type == torch.tensor:
        speaker_masks = speaker_masks.cpu().numpy()
    utterance_duration = math.ceil(utterance_duration * sample_rate)
    assert utterance_duration % stride == 0

    ## https://habr.com/ru/post/489734/#1d
    # as_strided expects strides in bytes, hence the multiplication by the item stride
    item_stride = speaker_masks.strides[-1]
    sliding_window = np.lib.stride_tricks.as_strided(
        speaker_masks,
        shape=(
            speaker_masks.shape[0],
            (speaker_masks.shape[-1] - utterance_duration) // stride + 1,
            utterance_duration,
        ),
        strides=(speaker_masks.strides[0], stride * item_stride, item_stride))
    n_samples_by_speaker = sliding_window.sum(-1)
    # score = speaker balance ratio in [0;1] minus silence ratio in [0;1]
    utterance_scores = n_samples_by_speaker[1:].min(0) / (
        n_samples_by_speaker[1:].max(0) +
        1) - n_samples_by_speaker[0] / utterance_duration

    n = 0
    audio_name, extension = os.path.splitext(os.path.basename(audio_path))
    while utterance_scores.max() > min_utterance_score:
        i = np.argmax(utterance_scores)
        utterance = signal[:, i * stride:i * stride + utterance_duration]
        utterance_scores[max(0, i - int(utterance_duration / stride) + 1):i +
                         int(utterance_duration / stride)] = 0.0
        audio.write_audio(os.path.join(output_path, 'mix',
                                       f'{audio_name}.{n}{extension}'),
                          utterance.T,
                          sample_rate,
                          mono=True)
        audio.write_audio(os.path.join(output_path, 'spk1',
                                       f'{audio_name}.{n}{extension}'),
                          utterance[0:1, :].T,
                          sample_rate,
                          mono=True)
        audio.write_audio(os.path.join(output_path, 'spk2',
                                       f'{audio_name}.{n}{extension}'),
                          utterance[1:2, :].T,
                          sample_rate,
                          mono=True)
        n += 1
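
The hand-built strided view above can also be expressed with NumPy's sliding_window_view (NumPy >= 1.20), which takes care of byte strides automatically; a sketch of the scoring step under the same mask layout (row 0 = silence, rows 1+ = speakers), with illustrative names:

import numpy as np


def window_scores(speaker_masks, utterance_duration, stride):
    # speaker_masks: (num_speakers + 1, num_samples) boolean mask, row 0 marks silence
    windows = np.lib.stride_tricks.sliding_window_view(
        speaker_masks, utterance_duration, axis=-1)[:, ::stride, :]
    n_samples_by_speaker = windows.sum(-1)
    balance = n_samples_by_speaker[1:].min(0) / (n_samples_by_speaker[1:].max(0) + 1)
    silence_ratio = n_samples_by_speaker[0] / utterance_duration
    return balance - silence_ratio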
Example no. 8
    def __getitem__(self, index):
        waveform_transform_debug = (
            lambda audio_path, sample_rate, signal: audio.write_audio(
                os.path.join(self.waveform_transform_debug_dir,
                             os.path.basename(audio_path) + '.wav'), signal,
                sample_rate)) if self.waveform_transform_debug_dir else None

        audio_path = self.audio_path[index]

        transcript = self.load_example(index)

        signal, sample_rate = audio.read_audio(
            audio_path,
            sample_rate=self.sample_rate,
            mono=self.mono,
            backend=self.audio_backend,
            duration=self.max_duration,
            dtype=self.audio_dtype
        ) if self.frontend is None or self.frontend.read_audio else (
            audio_path, self.sample_rate)

        #TODO: support forced mono even if transcript is given
        #TODO: subsample speaker labels according to features

        some_segments_have_not_begin_end = any(
            t['begin'] == transcripts.time_missing
            and t['end'] == transcripts.time_missing for t in transcript)
        some_segments_have_ref = any(bool(t['ref']) for t in transcript)
        replace_transcript = self.join_transcript or (not transcript) or (
            some_segments_have_not_begin_end and some_segments_have_ref)

        if replace_transcript:
            assert len(signal) == 1, 'only mono supported for now'
            ref_full = [t['ref'] for t in transcript]
            speaker = torch.cat([
                torch.full(
                    (len(ref) + 1, ), t['speaker'],
                    dtype=torch.int64).scatter_(0, torch.tensor(len(ref)),
                                                transcripts.speaker_missing)
                for t, ref in zip(transcript, ref_full)
            ])[:-1].unsqueeze(0)

            transcript = [
                dict(audio_path=audio_path,
                     ref=' '.join(ref_full),
                     example_id=self.example_id(dict(audio_path=audio_path)),
                     channel=0,
                     begin_samples=0,
                     end_samples=None)
            ]
        else:
            transcript = [
                dict(audio_path=audio_path,
                     ref=t['ref'],
                     example_id=self.example_id(t),
                     channel=channel,
                     begin_samples=int(t['begin'] * sample_rate)
                     if t['begin'] != transcripts.time_missing else 0,
                     end_samples=1 + int(t['end'] * sample_rate) if
                     t['end'] != transcripts.time_missing else signal.shape[1],
                     speaker=t['speaker'])
                for t in sorted(transcript, key=transcripts.sort_key)
                for channel in ([t['channel']]
                                if t['channel'] != transcripts.channel_missing
                                else range(len(signal)))
            ]
            speaker = torch.LongTensor([t.pop('speaker')
                                        for t in transcript]).unsqueeze(-1)
        ## TODO check logic
        features = []
        for t in transcript:
            channel = t.pop('channel')
            # pop is required regardless of whether the dataset is segmented
            time_slice = slice(t.pop('begin_samples'), t.pop('end_samples'))
            if self.segmented and not self.debug_short_long_records_features_from_whole_normalized_signal:
                segment = signal[None, channel, time_slice]
            else:
                # begin/end metadata could be corrupted, so it is not used here
                segment = signal[None, channel, :]
            if self.frontend is not None:
                if self.debug_short_long_records_features_from_whole_normalized_signal:
                    segment_features = self.frontend(segment)
                    hop_length = self.frontend.hop_length
                    segment_features = segment_features[
                        :, :,
                        time_slice.start // hop_length:time_slice.stop // hop_length]
                    features.append(segment_features.squeeze(0))
                else:
                    features.append(
                        self.frontend(
                            segment,
                            waveform_transform_debug=waveform_transform_debug
                        ).squeeze(0))
            else:
                features.append(segment)

        targets = []
        for pipeline in self.text_pipelines:
            encoded_transcripts = []
            for t in transcript:
                processed = pipeline.preprocess(t['ref'])
                tokens = torch.tensor(pipeline.encode([processed])[0],
                                      dtype=torch.long,
                                      device='cpu')
                encoded_transcripts.append(tokens)
            targets.append(encoded_transcripts)

        # not batch mode
        if not self.segmented:
            transcript, speaker, features = (transcript[0], speaker[0],
                                             features[0])
            targets = [target[0] for target in targets]
        return [transcript, speaker, features] + targets
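
Since __getitem__ returns variable-length feature tensors, batching usually goes through a padding collate function; below is a minimal sketch assuming features laid out as (num_features, num_frames). The function name and layout are assumptions, not the repository's actual collate.

import torch


def collate_features(batch_features, pad_value=0.0):
    # batch_features: list of (num_features, num_frames) tensors of varying length
    max_frames = max(f.shape[-1] for f in batch_features)
    padded = torch.full((len(batch_features), batch_features[0].shape[0], max_frames),
                        pad_value)
    lengths = torch.tensor([f.shape[-1] for f in batch_features])
    for i, f in enumerate(batch_features):
        padded[i, :, :f.shape[-1]] = f
    return padded, lengths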
Example no. 9
def ref(input_path, output_path, sample_rate, window_size, device,
        max_duration, debug_audio, html, ext):
    os.makedirs(output_path, exist_ok=True)
    audio_source = ([
        (input_path, audio_name) for audio_name in os.listdir(input_path)
    ] if os.path.isdir(input_path) else [(os.path.dirname(input_path),
                                          os.path.basename(input_path))])
    for i, (input_path, audio_name) in enumerate(audio_source):
        print(i, '/', len(audio_source), audio_name)
        audio_path = os.path.join(input_path, audio_name)
        noextname = audio_name[:-len(ext)]
        transcript_path = os.path.join(output_path, noextname + '.json')
        rttm_path = os.path.join(output_path, noextname + '.rttm')

        signal, sample_rate = audio.read_audio(audio_path,
                                               sample_rate=sample_rate,
                                               mono=False,
                                               dtype='float32',
                                               duration=max_duration)

        speaker_id_ref, speaker_id_ref_ = select_speaker(
            signal.to(device),
            silence_absolute_threshold=0.05,
            silence_relative_threshold=0.2,
            kernel_size_smooth_signal=128,
            kernel_size_smooth_speaker=4096,
            kernel_size_smooth_silence=4096)

        transcript = [
            dict(audio_path=audio_path,
                 begin=float(begin) / sample_rate,
                 end=(float(begin) + float(duration)) / sample_rate,
                 speaker=speaker,
                 speaker_name=transcripts.default_speaker_names[speaker])
            for speaker in range(1, len(speaker_id_ref_))
            for begin, duration, mask in zip(
                *models.rle1d(speaker_id_ref_[speaker])) if mask == 1
        ]

        #transcript = [dict(audio_path = audio_path, begin = float(begin) / sample_rate, end = (float(begin) + float(duration)) / sample_rate, speaker_name = str(int(speaker)), speaker = int(speaker)) for begin, duration, speaker in zip(*models.rle1d(speaker_id_ref.cpu()))]

        transcript_without_speaker_missing = [
            t for t in transcript
            if t['speaker'] != transcripts.speaker_missing
        ]
        transcripts.save(transcript_path, transcript_without_speaker_missing)
        print(transcript_path)

        transcripts.save(rttm_path, transcript_without_speaker_missing)
        print(rttm_path)

        if debug_audio:
            audio.write_audio(
                transcript_path + '.wav',
                torch.cat([
                    signal[..., :speaker_id_ref.shape[-1]],
                    convert_speaker_id(speaker_id_ref[..., :signal.shape[-1]],
                                       to_bipole=True).unsqueeze(0).cpu() *
                    0.5, speaker_id_ref_[..., :signal.shape[-1]].cpu() * 0.5
                ]),
                sample_rate,
                mono=False)
            print(transcript_path + '.wav')

        if html:
            html_path = os.path.join(output_path, audio_name + '.html')
            vis.transcript(html_path,
                           sample_rate=sample_rate,
                           mono=True,
                           transcript=transcript,
                           duration=max_duration)
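
For reference, a possible implementation of the rle1d-style run-length encoding used above, returning parallel (begin, length, value) sequences; the real models.rle1d may differ from this sketch.

import torch


def rle1d(x):
    # run-length encode a 1D tensor: start index, run length, and value per run
    x = x.flatten()
    change = torch.ones(len(x), dtype=torch.bool)
    change[1:] = x[1:] != x[:-1]
    begins = change.nonzero(as_tuple=False).squeeze(1)
    lengths = torch.diff(torch.cat([begins, torch.tensor([len(x)])]))
    return begins, lengths, x[begins]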
Example no. 10
params.grain_spacing = (sample_rate * params.grain_spacing_ms) / 1000
grain_groups, event_list, event_groups, features = grp.group_events(
    source_audio, params)
if params.debug > 0:
    stats.num_events = len(event_list)

if params.mode == 'loop':
    streams = gen.group_loop(sample_rate, params, grain_groups, features,
                             stats)
elif params.mode == 'block':
    streams = gen.block_generator(sample_rate, params, grain_groups, features,
                                  stats)

print "Mixing down.."
output_audio = au.post_process(streams, params)
au.write_audio(params.outfile, sample_rate, output_audio)

if params.debug > 0:
    print "Run stats:"
    print "  Number of events: %d" % stats.num_events
    print "  Number of grains: %d" % stats.num_grains
    print "  Number of effect convolutions: %d" % stats.convolutions
    print "  Number of filter uses: %d" % stats.filterings
    if params.debug > 1:
        import plotting as pl
        print "Plotting.."
        pl.plot_features(event_groups, features, params.num_groups)
        pl.plot_source_audio(source_audio, sample_rate, event_list,
                             event_groups)
        pl.plot_generated_audio(output_audio, sample_rate)
        pl.show()
Example no. 11
    def __getitem__(self, index):
        waveform_transform_debug = (
            lambda audio_path, sample_rate, signal: audio.write_audio(
                os.path.join(self.waveform_transform_debug_dir,
                             os.path.basename(audio_path) + '.wav'), signal,
                sample_rate)) if self.waveform_transform_debug_dir else None

        audio_path = self.audio_path[index]

        transcript = self.load_example(index)

        signal, sample_rate = audio.read_audio(
            audio_path,
            sample_rate=self.sample_rate,
            mono=self.mono,
            backend=self.audio_backend,
            duration=self.max_duration
        ) if self.frontend is None or self.frontend.read_audio else (
            audio_path, self.sample_rate)

        #TODO: support forced mono even if transcript is given
        #TODO: subsample speaker labels according to features

        some_segments_have_not_begin_end = any(
            t['begin'] == self.time_missing and t['end'] == self.time_missing
            for t in transcript)
        some_segments_have_ref = any(bool(t['ref']) for t in transcript)
        replace_transcript = self.join_transcript or (not transcript) or (
            some_segments_have_not_begin_end and some_segments_have_ref)

        if replace_transcript:
            assert len(signal) == 1, 'only mono supported for now'
            # when replacing the transcript, normalize refs with the default label preprocessor only
            ref_full = [
                self.labels[0].normalize_text(t['ref']) for t in transcript
            ]
            speaker = torch.cat([
                torch.full((len(ref) + 1, ), t['speaker'],
                           dtype=torch.int64).scatter_(0,
                                                       torch.tensor(len(ref)),
                                                       self.speaker_missing)
                for t, ref in zip(transcript, ref_full)
            ])[:-1]
            transcript = [
                dict(audio_path=audio_path,
                     ref=' '.join(ref_full),
                     example_id=self.example_id(dict(audio_path=audio_path)),
                     channel=0,
                     begin_samples=0,
                     end_samples=None)
            ]
            normalize_text = False
        else:
            transcript = [
                dict(audio_path=audio_path,
                     ref=t['ref'],
                     example_id=self.example_id(t),
                     channel=channel,
                     begin_samples=int(t['begin'] * sample_rate)
                     if t['begin'] != self.time_missing else 0,
                     end_samples=1 + int(t['end'] * sample_rate)
                     if t['end'] != self.time_missing else signal.shape[1],
                     speaker=t['speaker'])
                for t in sorted(transcript, key=transcripts.sort_key)
                for channel in ([t['channel']]
                                if t['channel'] != self.channel_missing
                                else range(len(signal)))
            ]
            speaker = torch.LongTensor([t.pop('speaker')
                                        for t in transcript]).unsqueeze(-1)
            normalize_text = True

        features = [
            self.frontend(segment.unsqueeze(0),
                          waveform_transform_debug=waveform_transform_debug)
            if self.frontend is not None else segment.unsqueeze(0)
            for t in transcript for segment in [
                signal[t.pop('channel'),
                       t.pop('begin_samples'):t.pop('end_samples')]
            ]
        ]
        targets = [[
            labels.encode(t['ref'], normalize=normalize_text)[1]
            for t in transcript
        ] for labels in self.labels]

        # not batch mode
        if not self.segmented:
            transcript, speaker, features = (transcript[0], speaker[0],
                                             features[0][0])
            targets = [target[0] for target in targets]
        return [transcript, speaker, features] + targets