Code example #1
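Extracts a feature (selected by `feature_type`, e.g. a log-mel spectrogram) from an audio file with NumpySpeechFeaturizer and plots it with librosa; a SpecAugment-style masking config is included but commented out. Arguments: audio file, feature type, output image path.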
import sys

import librosa.display
import matplotlib.pyplot as plt
import numpy as np

# Project-specific names (UserAugmentation, NumpySpeechFeaturizer,
# read_raw_audio) must be imported from the surrounding project.


def main(argv):
    speech_file = argv[1]
    feature_type = argv[2]
    # SpecAugment-style masking config; uncomment the "after" block to
    # enable time/frequency masking after feature extraction.
    augments = {
        # "after": {
        #     "time_masking": {
        #         "num_masks": 10,
        #         "mask_factor": 100,
        #         "p_upperbound": 0.05
        #     },
        #     "freq_masking": {
        #         "mask_factor": 27
        #     }
        # },
    }
    au = UserAugmentation(augments)
    speech_conf = {
        "sample_rate": 16000,
        "frame_ms": 25,
        "stride_ms": 10,
        "feature_type": feature_type,
        "preemphasis": 0.97,
        "normalize_signal": True,
        "normalize_feature": True,
        "normalize_per_feature": False,
        "num_feature_bins": 80,
    }
    signal = read_raw_audio(speech_file, speech_conf["sample_rate"])

    # Extract features, apply the post-extraction ("after") augmentation,
    # and drop the trailing channel axis for plotting.
    sf = NumpySpeechFeaturizer(speech_conf)
    ft = sf.extract(signal)
    ft = au["after"].augment(ft)[:, :, 0]

    plt.figure(figsize=(16, 2.5))
    ax = plt.gca()
    ax.set_title(f"{feature_type}", fontweight="bold")
    librosa.display.specshow(ft.T, cmap="magma")
    v1 = np.linspace(ft.min(), ft.max(), 8, endpoint=True)
    plt.colorbar(pad=0.01, fraction=0.02, ax=ax, format="%.2f", ticks=v1)
    plt.tight_layout()
    plt.savefig(argv[3])
    plt.show()


if __name__ == "__main__":
    main(sys.argv)
Code example #2
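A dataset `preprocess` method (from a transducer training pipeline) that converts one (audio, transcript) pair into the tensors a training step consumes.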
    def preprocess(self, audio, transcript):
        """Convert one (audio, transcript) pair into training tensors:
        two feature views (lms, lgs), the input length, the label and
        its length, and the blank-prepended prediction input."""
        with tf.device("/CPU:0"):
            signal = read_raw_audio(audio,
                                    self.speech_featurizer_lms.sample_rate)

            signal = self.augmentations["before"].augment(signal)

            lms = self.speech_featurizer_lms.extract(signal)
            lgs = self.speech_featurizer_lgs.extract(signal)

            lms = self.augmentations["after"].augment(lms)
            lgs = self.augmentations["after"].augment(lgs)

            label = self.text_featurizer.extract(transcript.decode("utf-8"))
            label_length = tf.cast(tf.shape(label)[0], tf.int32)
            # `prepand_blank` (the library's spelling) prepends the blank
            # token to build the prediction-network input.
            pred_inp = self.text_featurizer.prepand_blank(label)
            lms = tf.convert_to_tensor(lms, tf.float32)
            lgs = tf.convert_to_tensor(lgs, tf.float32)
            input_length = tf.cast(tf.shape(lms)[0], tf.int32)

            return lms, lgs, input_length, label, label_length, pred_inp
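
A minimal sketch (an assumption, not from the original source) of how such a `preprocess` is wired into a tf.data pipeline: the `transcript.decode("utf-8")` call above implies it receives byte strings via `tf.numpy_function`. The `parse` name and the exact `Tout` dtypes are illustrative.

    def parse(self, audio, transcript):
        # Hypothetical wrapper: run `preprocess` eagerly from graph code.
        # Tout mirrors the return order of `preprocess`:
        # lms, lgs, input_length, label, label_length, pred_inp
        return tf.numpy_function(
            self.preprocess,
            inp=[audio, transcript],
            Tout=[tf.float32, tf.float32, tf.int32,
                  tf.int32, tf.int32, tf.int32])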
Code example #3
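A slimmer variant of example #1: extracts a feature with TFSpeechFeaturizer and displays it with plain `plt.imshow`, without augmentation.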
import sys

import matplotlib.pyplot as plt

# Project-specific names (TFSpeechFeaturizer, read_raw_audio) must be
# imported from the surrounding project.


def main(argv):
    speech_file = argv[1]
    feature_type = argv[2]
    speech_conf = {
        "sample_rate": 16000,
        "frame_ms": 25,
        "stride_ms": 10,
        "feature_type": feature_type,
        "preemphasis": 0.97,
        "normalize_signal": True,
        "normalize_feature": True,
        "normalize_per_feature": False,
        "num_feature_bins": 80,
    }
    signal = read_raw_audio(speech_file, speech_conf["sample_rate"])

    sf = TFSpeechFeaturizer(speech_conf)
    ft = sf.extract(signal)[:, :, 0]  # drop the trailing channel axis

    plt.figure(figsize=(15, 5))
    plt.imshow(ft.T, origin="lower")
    plt.colorbar()
    plt.tight_layout()
    plt.show()


if __name__ == "__main__":
    main(sys.argv)
Code example #4
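Runs a converted Conformer TFLite model on a raw audio file. The interpreter takes the signal, the blank-token index, and zero-initialized prediction-network states; the output is a sequence of unicode code points.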
parser.add_argument("--blank",
                    type=int,
                    default=0,
                    help="Path to conformer tflite")

parser.add_argument("--statesize",
                    type=int,
                    default=320,
                    help="Path to conformer tflite")

args = parser.parse_args()

tflitemodel = tf.lite.Interpreter(model_path=args.tflite)

signal = read_raw_audio(args.filename)

# The converted model takes three inputs: the raw signal, the previous
# predicted token (blank at the start), and the prediction-network states.
input_details = tflitemodel.get_input_details()
output_details = tflitemodel.get_output_details()
tflitemodel.resize_tensor_input(input_details[0]["index"], signal.shape)
tflitemodel.allocate_tensors()
tflitemodel.set_tensor(input_details[0]["index"], signal)
tflitemodel.set_tensor(input_details[1]["index"],
                       tf.constant(args.blank, dtype=tf.int32))
tflitemodel.set_tensor(input_details[2]["index"],
                       tf.zeros([1, 2, 1, args.statesize], dtype=tf.float32))
tflitemodel.invoke()
hyp = tflitemodel.get_tensor(output_details[0]["index"])

# The output is a sequence of unicode code points; join them into text.
print("".join(chr(u) for u in hyp))
Code example #5
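Saves a transducer's weights, attaches featurizers, and smoke-tests greedy and beam decoding on dummy features; a TensorBoard tracing block for `recognize_tflite` is left commented out.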
model.save_weights("/tmp/transducer.h5")

model.add_featurizers(speech_featurizer=speech_featurizer,
                      text_featurizer=text_featurizer)

# Smoke-test greedy and beam decoding on dummy features.
features = tf.zeros(shape=[5, 50, 80, 1], dtype=tf.float32)
pred = model.recognize(features)
print(pred)
pred = model.recognize_beam(features)
print(pred)

# Optional: trace `recognize_tflite` to TensorBoard.
# stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
# logdir = '/tmp/logs/func/%s' % stamp
# writer = tf.summary.create_file_writer(logdir)

signal = read_raw_audio("/home/nlhuy/Desktop/test/11003.wav",
                        speech_featurizer.sample_rate)

# tf.summary.trace_on(graph=True, profiler=True)
# hyps = model.recognize_tflite(signal, 0, tf.zeros([1, 2, 1, 320], dtype=tf.float32))
# with writer.as_default():
#     tf.summary.trace_export(
#         name="recognize_tflite",
#         step=0,
#         profiler_outdir=logdir)
# print(hyps[0])
Code example #6
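Loads a SelfAttentionDS2 model (from a SavedModel or from weights), decodes one audio file with beam search plus a language-model scorer, and saves histograms of every layer's activations, the softmax output, and the input features.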
import argparse
import os

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

# Project-specific names (UserConfig, SpeechFeaturizer, TextFeaturizer,
# Scorer, SelfAttentionDS2, read_raw_audio, bytes_to_string) must be
# imported from the surrounding project.


def main():
    parser = argparse.ArgumentParser(prog="SelfAttentionDS2 Histogram")

    parser.add_argument("--config", type=str, default=None,
                        help="Config file")

    parser.add_argument("--audio", type=str, default=None,
                        help="Audio file")

    parser.add_argument("--saved_model", type=str, default=None,
                        help="Saved model")

    parser.add_argument("--from_weights", type=bool, default=False,
                        help="Load from weights")

    parser.add_argument("--output", type=str, default=None,
                        help="Output dir storing histograms")

    args = parser.parse_args()

    config = UserConfig(args.config, args.config, learning=False)
    speech_featurizer = SpeechFeaturizer(config["speech_config"])
    text_featurizer = TextFeaturizer(config["decoder_config"])
    text_featurizer.add_scorer(Scorer(**text_featurizer.decoder_config["lm_config"],
                                      vocabulary=text_featurizer.vocab_array))

    f, c = speech_featurizer.compute_feature_dim()
    satt_ds2_model = SelfAttentionDS2(input_shape=[None, f, c],
                                      arch_config=config["model_config"],
                                      num_classes=text_featurizer.num_classes)
    satt_ds2_model._build([1, 50, f, c])

    if args.from_weights:
        satt_ds2_model.load_weights(args.saved_model)
    else:
        saved_model = tf.keras.models.load_model(args.saved_model)
        satt_ds2_model.set_weights(saved_model.get_weights())

    satt_ds2_model.summary(line_length=100)

    satt_ds2_model.add_featurizers(speech_featurizer, text_featurizer)

    signal = read_raw_audio(args.audio, speech_featurizer.sample_rate)
    features = speech_featurizer.extract(signal)
    decoded = satt_ds2_model.recognize_beam(tf.expand_dims(features, 0), lm=True)
    print(bytes_to_string(decoded.numpy()))

    # Histogram the activations of every layer in the base model.
    for i in range(1, len(satt_ds2_model.base_model.layers)):
        func = tf.keras.backend.function([satt_ds2_model.base_model.input],
                                         [satt_ds2_model.base_model.layers[i].output])
        data = func([np.expand_dims(features, 0), 1])[0][0]
        print(data.shape)
        data = data.flatten()
        plt.hist(data, 200, color='green', histtype="stepfilled")
        plt.title(f"Output of {satt_ds2_model.base_model.layers[i].name}", fontweight="bold")
        plt.savefig(os.path.join(
            args.output, f"{i}_{satt_ds2_model.base_model.layers[i].name}.png"))
        plt.clf()
        plt.cla()
        plt.close()

    # Histogram the final-layer output (logits), then its softmax.
    fc = satt_ds2_model(tf.expand_dims(features, 0), training=False)
    plt.hist(fc[0].numpy().flatten(), 200, color="green", histtype="stepfilled")
    plt.title(f"Output of {satt_ds2_model.layers[-1].name}", fontweight="bold")
    plt.savefig(os.path.join(args.output, f"{satt_ds2_model.layers[-1].name}.png"))
    plt.clf()
    plt.cla()
    plt.close()
    fc = tf.nn.softmax(fc)
    plt.hist(fc[0].numpy().flatten(), 10, color="green", histtype="stepfilled")
    plt.title("Output of softmax", fontweight="bold")
    plt.savefig(os.path.join(args.output, "softmax_hist.png"))
    plt.clf()
    plt.cla()
    plt.close()
    plt.hist(features.flatten(), 200, color="green", histtype="stepfilled")
    plt.title("Log Mel Spectrogram", fontweight="bold")
    plt.savefig(os.path.join(args.output, "log_mel_spectrogram.png"))
    plt.clf()
    plt.cla()
    plt.close()


if __name__ == "__main__":
    main()
Code example #7
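A variant of example #6 that renders the final-layer output, the softmax, and the input features as images instead of histograms; the per-layer loop is kept but commented out.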
import argparse
import os

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from mpl_toolkits.axes_grid1 import make_axes_locatable

# Project-specific names (UserConfig, SpeechFeaturizer, CharFeaturizer,
# Scorer, SelfAttentionDS2, read_raw_audio, bytes_to_string) must be
# imported from the surrounding project.


def main():
    parser = argparse.ArgumentParser(prog="SelfAttentionDS2 Histogram")

    parser.add_argument("--config", type=str, default=None, help="Config file")

    parser.add_argument("--audio", type=str, default=None, help="Audio file")

    parser.add_argument("--saved_model",
                        type=str,
                        default=None,
                        help="Saved model")

    parser.add_argument("--from_weights",
                        type=bool,
                        default=False,
                        help="Load from weights")

    parser.add_argument("--output",
                        type=str,
                        default=None,
                        help="Output dir storing histograms")

    args = parser.parse_args()

    config = UserConfig(args.config, args.config, learning=False)
    speech_featurizer = SpeechFeaturizer(config["speech_config"])
    text_featurizer = CharFeaturizer(config["decoder_config"])
    text_featurizer.add_scorer(
        Scorer(**text_featurizer.decoder_config["lm_config"],
               vocabulary=text_featurizer.vocab_array))

    f, c = speech_featurizer.compute_feature_dim()
    satt_ds2_model = SelfAttentionDS2(input_shape=[None, f, c],
                                      arch_config=config["model_config"],
                                      num_classes=text_featurizer.num_classes)
    satt_ds2_model._build([1, 50, f, c])

    if args.from_weights:
        satt_ds2_model.load_weights(args.saved_model)
    else:
        saved_model = tf.keras.models.load_model(args.saved_model)
        satt_ds2_model.set_weights(saved_model.get_weights())

    satt_ds2_model.summary(line_length=100)

    satt_ds2_model.add_featurizers(speech_featurizer, text_featurizer)

    signal = read_raw_audio(args.audio, speech_featurizer.sample_rate)
    features = speech_featurizer.extract(signal)
    decoded = satt_ds2_model.recognize_beam(tf.expand_dims(features, 0),
                                            lm=True)
    print(bytes_to_string(decoded.numpy()))

    # for i in range(1, len(satt_ds2_model.base_model.layers)):
    #     func = tf.keras.backend.function([satt_ds2_model.base_model.input],
    #                                      [satt_ds2_model.base_model.layers[i].output])
    #     data = func([np.expand_dims(features, 0), 1])[0][0]
    #     print(data.shape)
    #     plt.figure(figsize=(16, 5))
    #     ax = plt.gca()
    #     im = ax.imshow(data.T, origin="lower", aspect="auto")
    #     ax.set_title(f"{satt_ds2_model.base_model.layers[i].name}", fontweight="bold")
    #     divider = make_axes_locatable(ax)
    #     cax = divider.append_axes("right", size="5%", pad=0.05)
    #     plt.colorbar(im, cax=cax)
    #     plt.savefig(os.path.join(
    #         args.output, f"{i}_{satt_ds2_model.base_model.layers[i].name}.png"))
    #     plt.clf()
    #     plt.cla()
    #     plt.close()

    # Plot the final-layer output (logits), then its softmax.
    fc = satt_ds2_model(tf.expand_dims(features, 0), training=False)
    plt.figure(figsize=(16, 5))
    ax = plt.gca()
    ax.set_title(f"{satt_ds2_model.layers[-1].name}", fontweight="bold")
    im = ax.imshow(fc[0].numpy().T, origin="lower", aspect="auto")
    divider = make_axes_locatable(ax)
    cax = divider.append_axes("right", size="5%", pad=0.05)
    plt.colorbar(im, cax=cax)
    plt.savefig(
        os.path.join(args.output, f"{satt_ds2_model.layers[-1].name}.png"))
    plt.clf()
    plt.cla()
    plt.close()
    fc = tf.nn.softmax(fc)
    plt.figure(figsize=(16, 5))
    ax = plt.gca()
    ax.set_title("Softmax", fontweight="bold")
    im = ax.imshow(fc[0].numpy().T, origin="lower", aspect="auto")
    divider = make_axes_locatable(ax)
    cax = divider.append_axes("right", size="5%", pad=0.05)
    plt.colorbar(im, cax=cax)
    plt.savefig(os.path.join(args.output, "softmax.png"))
    plt.clf()
    plt.cla()
    plt.close()
    plt.figure(figsize=(16, 5))
    ax = plt.gca()
    ax.set_title("Log Mel Spectrogram", fontweight="bold")
    im = ax.imshow(features[:, :, 0].T, origin="lower", aspect="auto")
    divider = make_axes_locatable(ax)
    cax = divider.append_axes("right", size="5%", pad=0.05)
    plt.colorbar(im, cax=cax)
    plt.savefig(os.path.join(args.output, "features.png"))
    plt.clf()
    plt.cla()
    plt.close()


if __name__ == "__main__":
    main()