Example #1
def _gen_data():
    for clean_wav_path in self.data_paths:
        noisy_wav_path = clean_wav_path.replace(self.clean_dir, self.noisy_dir)
        clean_wav = read_raw_audio(clean_wav_path, sample_rate=self.speech_featurizer.sample_rate)
        noisy_wav = read_raw_audio(noisy_wav_path, sample_rate=self.speech_featurizer.sample_rate)
        clean_slices, noisy_slices = self.parse(clean_wav, noisy_wav)
        for clean, noisy in zip(clean_slices, noisy_slices):
            yield clean, noisy
Example #2
def _gen_data():
    for clean_wav_path in self.data_paths:
        clean_wav = read_raw_audio(
            clean_wav_path,
            sample_rate=self.speech_config["sample_rate"])
        noisy_wav_path = clean_wav_path.replace(
            self.clean_dir, self.noisy_dir)
        noisy_wav = read_raw_audio(
            noisy_wav_path,
            sample_rate=self.speech_config["sample_rate"])
        clean_slices, noisy_slices = self.parse(clean_wav, noisy_wav)
        yield clean_wav_path, clean_slices, noisy_slices
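Both generators above are plain Python generators meant to feed a tf.data pipeline. A minimal sketch of how the first one (which yields (clean, noisy) slice pairs) might be wrapped is shown below; the output shapes, dtypes, and batch size are illustrative assumptions, not the repository's actual dataset code:

import tensorflow as tf

# Hypothetical wrapper: each yielded item is assumed to be a pair of
# variable-length 1-D float32 waveform slices (clean, noisy).
dataset = tf.data.Dataset.from_generator(
    _gen_data,
    output_signature=(
        tf.TensorSpec(shape=[None], dtype=tf.float32),  # clean slice
        tf.TensorSpec(shape=[None], dtype=tf.float32),  # noisy slice
    ),
)
dataset = dataset.padded_batch(8, padded_shapes=([None], [None]))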
Example #3
def main(
    filename: str,
    tflite: str = None,
    blank: int = 0,
    num_rnns: int = 1,
    nstates: int = 2,
    statesize: int = 320,
):
    tflitemodel = tf.lite.Interpreter(model_path=tflite)

    signal = read_raw_audio(filename)

    input_details = tflitemodel.get_input_details()
    output_details = tflitemodel.get_output_details()
    tflitemodel.resize_tensor_input(input_details[0]["index"], signal.shape)
    tflitemodel.allocate_tensors()
    tflitemodel.set_tensor(input_details[0]["index"], signal)
    tflitemodel.set_tensor(input_details[1]["index"],
                           tf.constant(blank, dtype=tf.int32))
    tflitemodel.set_tensor(
        input_details[2]["index"],
        tf.zeros([num_rnns, nstates, 1, statesize], dtype=tf.float32))
    tflitemodel.invoke()
    hyp = tflitemodel.get_tensor(output_details[0]["index"])

    print("".join([chr(u) for u in hyp]))
Example #4
def main(argv):
    speech_file = argv[1]
    feature_type = argv[2]
    speech_conf = {
        "sample_rate": 16000,
        "frame_ms": 25,
        "stride_ms": 10,
        "feature_type": feature_type,
        "preemphasis": 0.97,
        "normalize_signal": True,
        "normalize_feature": True,
        "normalize_per_feature": False,
        "num_feature_bins": 80,
    }
    signal = read_raw_audio(speech_file, speech_conf["sample_rate"])

    nsf = NumpySpeechFeaturizer(speech_conf)
    sf = TFSpeechFeaturizer(speech_conf)
    ft = nsf.stft(signal)
    print(ft.shape, np.mean(ft))
    ft = sf.stft(signal).numpy()
    print(ft.shape, np.mean(ft))
    ft = sf.extract(signal)

    plt.figure(figsize=(16, 2.5))
    ax = plt.gca()
    ax.set_title(f"{feature_type}", fontweight="bold")
    librosa.display.specshow(ft.T, cmap="magma")
    v1 = np.linspace(ft.min(), ft.max(), 8, endpoint=True)
    plt.colorbar(pad=0.01, fraction=0.02, ax=ax, format="%.2f", ticks=v1)
    plt.tight_layout()
    plt.show()
Example #5
def _gen_data():
    for clean_wav_path in self.data_paths:
        clean_wav = read_raw_audio(
            clean_wav_path, sample_rate=self.speech_featurizer.sample_rate)
        clean_slices, noisy_slices = self.parse(clean_wav)
        for clean, noisy in zip(clean_slices, noisy_slices):
            yield clean, noisy
Example #6
def main(
    saved_model: str = None,
    filename: str = None,
):
    tf.keras.backend.clear_session()

    module = tf.saved_model.load(export_dir=saved_model)

    signal = read_raw_audio(filename)
    transcript = module.pred(signal)

    print("Transcript: ", "".join([chr(u) for u in transcript]))
Example #7
    def preprocess(self, path, transcript):
        with tf.device("/CPU:0"):
            signal = read_raw_audio(path.decode("utf-8"), self.speech_featurizer.sample_rate)

            features = self.speech_featurizer.extract(signal)
            features = tf.convert_to_tensor(features, tf.float32)
            input_length = tf.cast(tf.shape(features)[0], tf.int32)

            label = self.text_featurizer.extract(transcript.decode("utf-8"))
            label = tf.convert_to_tensor(label, dtype=tf.int32)

            return path, features, input_length, label
Example #8
            def fn(_path: bytes, _audio: bytes, _indices: bytes):
                signal = read_raw_audio(_audio, sample_rate=self.speech_featurizer.sample_rate)
                signal = self.augmentations.signal_augment(signal)
                features = self.speech_featurizer.extract(signal.numpy())
                features = self.augmentations.feature_augment(features)
                features = tf.convert_to_tensor(features, tf.float32)
                input_length = tf.cast(tf.shape(features)[0], tf.int32)

                label = tf.strings.to_number(tf.strings.split(_indices), out_type=tf.int32)
                label_length = tf.cast(tf.shape(label)[0], tf.int32)

                prediction = self.text_featurizer.prepand_blank(label)
                prediction_length = tf.cast(tf.shape(prediction)[0], tf.int32)

                return _path, features, input_length, label, label_length, prediction, prediction_length
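Because fn receives raw bytes for the path, audio, and label indices, it is presumably applied through tf.numpy_function inside a tf.data map. A hedged sketch of that wiring follows; the dataset variable, output dtypes, and parallelism setting are assumptions for illustration, not the repository's exact pipeline:

# Hypothetical sketch: running the numpy-based fn inside a tf.data pipeline.
def map_fn(path, audio, indices):
    return tf.numpy_function(
        fn,
        inp=[path, audio, indices],
        Tout=[tf.string, tf.float32, tf.int32, tf.int32, tf.int32, tf.int32, tf.int32],
    )

dataset = dataset.map(map_fn, num_parallel_calls=tf.data.AUTOTUNE)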
Example #9
def main(argv):
    speech_file = argv[1]
    feature_type = argv[2]
    augments = {
        # "after": {
        #     "time_masking": {
        #         "num_masks": 10,
        #         "mask_factor": 100,
        #         "p_upperbound": 0.05
        #     },
        #     "freq_masking": {
        #         "mask_factor": 27
        #     }
        # },
    }
    au = UserAugmentation(augments)
    speech_conf = {
        "sample_rate": 16000,
        "frame_ms": 25,
        "stride_ms": 10,
        "feature_type": feature_type,
        "preemphasis": 0.97,
        "normalize_signal": True,
        "normalize_feature": True,
        "normalize_per_feature": False,
        "num_feature_bins": 80,
    }
    signal = read_raw_audio(speech_file, speech_conf["sample_rate"])

    sf = NumpySpeechFeaturizer(speech_conf)
    ft = sf.extract(signal)
    ft = au["after"].augment(ft)[:, :, 0]

    plt.figure(figsize=(16, 2.5))
    ax = plt.gca()
    ax.set_title(f"{feature_type}", fontweight="bold")
    librosa.display.specshow(ft.T, cmap="magma")
    v1 = np.linspace(ft.min(), ft.max(), 8, endpoint=True)
    plt.colorbar(pad=0.01, fraction=0.02, ax=ax, format="%.2f", ticks=v1)
    plt.tight_layout()
    # plt.savefig(argv[3])
    plt.show()
Example #10
    print("Loading subwords ...")
    text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config,
                                                       args.subwords)
else:
    text_featurizer = CharFeaturizer(config.decoder_config)
text_featurizer.decoder_config.beam_width = args.beam_width

# build model
conformer = Conformer(**config.model_config,
                      vocabulary_size=text_featurizer.num_classes)
conformer.make(speech_featurizer.shape)
conformer.load_weights(args.saved, by_name=True, skip_mismatch=True)
conformer.summary(line_length=120)
conformer.add_featurizers(speech_featurizer, text_featurizer)

signal = read_raw_audio(args.filename)
features = speech_featurizer.tf_extract(signal)
input_length = math_util.get_reduced_length(
    tf.shape(features)[0], conformer.time_reduction_factor)

if args.beam_width:
    transcript = conformer.recognize_beam(features[None, ...],
                                          input_length[None, ...])
    print("Transcript:", transcript[0].numpy().decode("UTF-8"))
elif args.timestamp:
    transcript, stime, etime, _, _ = conformer.recognize_tflite_with_timestamp(
        signal, tf.constant(text_featurizer.blank, dtype=tf.int32),
        conformer.predict_net.get_initial_state())
    print("Transcript:", transcript)
    print("Start time:", stime)
    print("End time:", etime)
Example #11
def main():
    parser = argparse.ArgumentParser(prog="SelfAttentionDS2 Histogram")

    parser.add_argument("--config", type=str, default=None,
                        help="Config file")

    parser.add_argument("--audio", type=str, default=None,
                        help="Audio file")

    parser.add_argument("--saved_model", type=str, default=None,
                        help="Saved model")

    parser.add_argument("--from_weights", type=bool, default=False,
                        help="Load from weights")

    parser.add_argument("--output", type=str, default=None,
                        help="Output dir storing histograms")

    args = parser.parse_args()

    config = UserConfig(args.config, args.config, learning=False)
    speech_featurizer = SpeechFeaturizer(config["speech_config"])
    text_featurizer = CharFeaturizer(config["decoder_config"])
    text_featurizer.add_scorer(Scorer(**text_featurizer.decoder_config["lm_config"],
                                      vocabulary=text_featurizer.vocab_array))

    f, c = speech_featurizer.compute_feature_dim()
    satt_ds2_model = SelfAttentionDS2(input_shape=[None, f, c],
                                      arch_config=config["model_config"],
                                      num_classes=text_featurizer.num_classes)
    satt_ds2_model._build([1, 50, f, c])

    if args.from_weights:
        satt_ds2_model.load_weights(args.saved_model)
    else:
        saved_model = tf.keras.models.load_model(args.saved_model)
        satt_ds2_model.set_weights(saved_model.get_weights())

    satt_ds2_model.summary(line_length=100)

    satt_ds2_model.add_featurizers(speech_featurizer, text_featurizer)

    signal = read_raw_audio(args.audio, speech_featurizer.sample_rate)
    features = speech_featurizer.extract(signal)
    decoded = satt_ds2_model.recognize_beam(tf.expand_dims(features, 0), lm=True)
    print(bytes_to_string(decoded.numpy()))

    for i in range(1, len(satt_ds2_model.base_model.layers)):
        func = tf.keras.backend.function([satt_ds2_model.base_model.input],
                                         [satt_ds2_model.base_model.layers[i].output])
        data = func([np.expand_dims(features, 0), 1])[0][0]
        print(data.shape)
        data = data.flatten()
        plt.hist(data, 200, color='green', histtype="stepfilled")
        plt.title(f"Output of {satt_ds2_model.base_model.layers[i].name}", fontweight="bold")
        plt.savefig(os.path.join(
            args.output, f"{i}_{satt_ds2_model.base_model.layers[i].name}.png"))
        plt.clf()
        plt.cla()
        plt.close()

    fc = satt_ds2_model(tf.expand_dims(features, 0), training=False)
    plt.hist(fc[0].numpy().flatten(), 200, color="green", histtype="stepfilled")
    plt.title(f"Output of {satt_ds2_model.layers[-1].name}", fontweight="bold")
    plt.savefig(os.path.join(args.output, f"{satt_ds2_model.layers[-1].name}.png"))
    plt.clf()
    plt.cla()
    plt.close()
    fc = tf.nn.softmax(fc)
    plt.hist(fc[0].numpy().flatten(), 10, color="green", histtype="stepfilled")
    plt.title("Output of softmax", fontweight="bold")
    plt.savefig(os.path.join(args.output, "softmax_hist.png"))
    plt.clf()
    plt.cla()
    plt.close()
    plt.hist(features.flatten(), 200, color="green", histtype="stepfilled")
    plt.title("Log Mel Spectrogram", fontweight="bold")
    plt.savefig(os.path.join(args.output, "log_mel_spectrogram.png"))
    plt.clf()
    plt.cla()
    plt.close()
Example #12
model.add_featurizers(
    speech_featurizer=speech_featurizer,
    text_featurizer=text_featurizer
)

# features = tf.zeros(shape=[5, 50, 80, 1], dtype=tf.float32)
# pred = model.recognize(features)
# print(pred)
# pred = model.recognize_beam(features)
# print(pred)

# stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
# logdir = '/tmp/logs/func/%s' % stamp
# writer = tf.summary.create_file_writer(logdir)
#
signal = read_raw_audio(sys.argv[1], speech_featurizer.sample_rate)
#
# tf.summary.trace_on(graph=True, profiler=True)
# hyps = model.recognize_tflite(signal, 0, tf.zeros([1, 2, 1, 320], dtype=tf.float32))
# with writer.as_default():
#     tf.summary.trace_export(
#         name="recognize_tflite",
#         step=0,
#         profiler_outdir=logdir)
#
# print(hyps[0])
#
# # hyps = model.recognize_beam(features)
#
#
Example #13
    return extract_from_mel(features)


@tf.function(input_signature=[
    tf.TensorSpec(shape=[None, speech_config['n_mels'], 1],
                  dtype=tf.float32,
                  name="signal")
])
def extract_from_mel(features):
    with tf.device('/cpu:0'):
        encoded = conformer.encoder_inference(features)
    return encoded


suffix = '.wav'
mel_query = '_mel.npy'
feature_query = '_conformer_enc16.npy'
audio_files = sorted(find_files(args.dataset, '*' + suffix))
print('files:', len(audio_files), audio_files[0])

for filename in tqdm(audio_files):
    mel = filename.replace(suffix, mel_query)
    if os.path.exists(mel):
        features = np.load(mel).reshape([-1, speech_config['n_mels'], 1])
        encoded = extract_from_mel(features)
    else:
        signal = read_raw_audio(filename)
        encoded = extract_from_audio(signal)

    np.save(filename.replace(suffix, feature_query), encoded)
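The loop above falls back to extract_from_audio(signal) when no cached mel file exists, but that function is not part of the excerpt. A plausible definition is sketched below; the speech_featurizer object and its tf_extract call are assumptions, borrowed from the Conformer usage in Example #10:

# Hypothetical sketch of the missing helper: turn the raw waveform into mel
# features, then reuse extract_from_mel defined above.
@tf.function(input_signature=[
    tf.TensorSpec(shape=[None], dtype=tf.float32, name="signal")
])
def extract_from_audio(signal):
    features = speech_featurizer.tf_extract(signal)  # assumed featurizer
    return extract_from_mel(features)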
Example #14
def main():
    parser = argparse.ArgumentParser(prog="SelfAttentionDS2 Histogram")

    parser.add_argument("--config", type=str, default=None, help="Config file")

    parser.add_argument("--audio", type=str, default=None, help="Audio file")

    parser.add_argument("--saved_model",
                        type=str,
                        default=None,
                        help="Saved model")

    parser.add_argument("--from_weights",
                        type=bool,
                        default=False,
                        help="Load from weights")

    parser.add_argument("--output",
                        type=str,
                        default=None,
                        help="Output dir storing histograms")

    args = parser.parse_args()

    config = UserConfig(args.config, args.config, learning=False)
    speech_featurizer = SpeechFeaturizer(config["speech_config"])
    text_featurizer = CharFeaturizer(config["decoder_config"])
    text_featurizer.add_scorer(
        Scorer(**text_featurizer.decoder_config["lm_config"],
               vocabulary=text_featurizer.vocab_array))

    f, c = speech_featurizer.compute_feature_dim()
    satt_ds2_model = SelfAttentionDS2(input_shape=[None, f, c],
                                      arch_config=config["model_config"],
                                      num_classes=text_featurizer.num_classes)
    satt_ds2_model._build([1, 50, f, c])

    if args.from_weights:
        satt_ds2_model.load_weights(args.saved_model)
    else:
        saved_model = tf.keras.models.load_model(args.saved_model)
        satt_ds2_model.set_weights(saved_model.get_weights())

    satt_ds2_model.summary(line_length=100)

    satt_ds2_model.add_featurizers(speech_featurizer, text_featurizer)

    signal = read_raw_audio(args.audio, speech_featurizer.sample_rate)
    features = speech_featurizer.extract(signal)
    decoded = satt_ds2_model.recognize_beam(tf.expand_dims(features, 0),
                                            lm=True)
    print(bytes_to_string(decoded.numpy()))

    # for i in range(1, len(satt_ds2_model.base_model.layers)):
    #     func = tf.keras.backend.function([satt_ds2_model.base_model.input],
    #                                      [satt_ds2_model.base_model.layers[i].output])
    #     data = func([np.expand_dims(features, 0), 1])[0][0]
    #     print(data.shape)
    #     plt.figure(figsize=(16, 5))
    #     ax = plt.gca()
    #     im = ax.imshow(data.T, origin="lower", aspect="auto")
    #     ax.set_title(f"{satt_ds2_model.base_model.layers[i].name}", fontweight="bold")
    #     divider = make_axes_locatable(ax)
    #     cax = divider.append_axes("right", size="5%", pad=0.05)
    #     plt.colorbar(im, cax=cax)
    #     plt.savefig(os.path.join(
    #         args.output, f"{i}_{satt_ds2_model.base_model.layers[i].name}.png"))
    #     plt.clf()
    #     plt.cla()
    #     plt.close()

    fc = satt_ds2_model(tf.expand_dims(features, 0), training=False)
    plt.figure(figsize=(16, 5))
    ax = plt.gca()
    ax.set_title(f"{satt_ds2_model.layers[-1].name}", fontweight="bold")
    im = ax.imshow(fc[0].numpy().T, origin="lower", aspect="auto")
    divider = make_axes_locatable(ax)
    cax = divider.append_axes("right", size="5%", pad=0.05)
    plt.colorbar(im, cax=cax)
    plt.savefig(
        os.path.join(args.output, f"{satt_ds2_model.layers[-1].name}.png"))
    plt.clf()
    plt.cla()
    plt.close()
    fc = tf.nn.softmax(fc)
    plt.figure(figsize=(16, 5))
    ax = plt.gca()
    ax.set_title("Softmax", fontweight="bold")
    im = ax.imshow(fc[0].numpy().T, origin="lower", aspect="auto")
    divider = make_axes_locatable(ax)
    cax = divider.append_axes("right", size="5%", pad=0.05)
    plt.colorbar(im, cax=cax)
    plt.savefig(os.path.join(args.output, "softmax.png"))
    plt.clf()
    plt.cla()
    plt.close()
    plt.figure(figsize=(16, 5))
    ax = plt.gca()
    ax.set_title("Log Mel Spectrogram", fontweight="bold")
    im = ax.imshow(features[:, :, 0].T, origin="lower", aspect="auto")
    divider = make_axes_locatable(ax)
    cax = divider.append_axes("right", size="5%", pad=0.05)
    plt.colorbar(im, cax=cax)
    plt.savefig(os.path.join(args.output, "features.png"))
    plt.clf()
    plt.cla()
    plt.close()
Example #15
import librosa

import matplotlib.pyplot as plt
import librosa.display
import numpy as np
import soundfile


def visual(title, audio, sample_rate):
    plt.figure(figsize=(8, 4))
    librosa.display.waveplot(audio, sr=sample_rate)
    plt.title(title)
    plt.tight_layout()
    #plt.show()
    plt.savefig("./audio_b.jpg")


config_dir = "tests/config_aishell.yml"
config = Config(config_dir, learning=True)

aug = config.learning_config.augmentations

sampling_rate = 16000
audio = '/tsdata/ASR/aishell-1//wav/train/S0002/BAC009S0002W0123.wav'
signal = read_raw_audio(audio, sampling_rate)

#visual('Original', signal, sampling_rate)

signal = aug.before.augment(signal)
visual('Augmented', signal, sampling_rate)
soundfile.write('./test_d.wav', signal, 16000)
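Every example above treats read_raw_audio as a loader that returns a mono float32 waveform resampled to the requested rate. A minimal librosa-based stand-in under that assumption is sketched below; it is not the library's actual implementation, and it does not cover the raw-bytes input seen in Example #8:

import librosa
import numpy as np

# Hypothetical stand-in for read_raw_audio: load, downmix to mono, resample,
# and return a float32 waveform.
def read_raw_audio_sketch(path: str, sample_rate: int = 16000) -> np.ndarray:
    wave, _ = librosa.load(path, sr=sample_rate, mono=True)
    return wave.astype(np.float32)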