Ejemplo n.º 1
0
def smoke_test():
    """Run a quick end-to-end sanity pass of Wav2Letter on random data.

    Random inputs and random label sequences are generated, ``model.fit``
    is called with ``epoch=1``, and the first sample is then pushed through
    ``model.eval`` and ``GreedyDecoder``. The only goal is to confirm the
    pipeline executes without raising; predictions are expected to be poor.
    """
    # Problem dimensions: everything except the alphabet size is arbitrary.
    alphabet_size = 26 + 1  # 26 english letters plus the blank token
    frames_per_sample = 500
    num_samples = 50
    n_mfcc = 13  # keep 13 mfcc features (coefficients 13 - 29 are discarded)
    samples_per_batch = 25
    labels_per_sample = 20  # max target sequence length

    print("Randomly generating input and output data...")

    # Dummy X data laid out as (num_samples, frames_per_sample, n_mfcc)
    features = torch.randn(num_samples, frames_per_sample, n_mfcc)

    # Dummy Y data: class labels drawn from 1 - 26 (0 is reserved for blank)
    label_shape = (num_samples, labels_per_sample)
    labels = torch.randint(1, alphabet_size, label_shape)

    print("inputs shape", features.shape)
    print("target shape", labels.shape)

    model = Wav2Letter(n_mfcc, alphabet_size)
    print(model.layers)

    ctc_loss = nn.CTCLoss()
    optimizer = optim.Adam(model.parameters())

    # Conv1d expects channels before length, and each mfcc feature is a
    # channel (https://pytorch.org/docs/stable/nn.html#torch.nn.Conv1d), so
    # swap (num_samples, frames_per_sample, n_mfcc)
    # into (num_samples, n_mfcc, frames_per_sample).
    features = features.transpose(1, 2)

    model.fit(
        features,
        labels,
        optimizer,
        ctc_loss,
        samples_per_batch,
        epoch=1,
        print_every=1,
    )

    first_log_probs = model.eval(features[0])
    decoded = GreedyDecoder(first_log_probs)

    # Decoder output for the first sample, followed by its true labels
    print("output labels", decoded)
    print("true", labels[0])
Ejemplo n.º 2
0
def train(batch_size, epochs, data_dir):
    """Fit a Wav2Letter model on vectors loaded from ``data_dir``.

    The vectors come from ``ImageCommand(data_dir).load_vectors()``. Once
    ``model.fit`` returns, the first sample is run through ``model.eval``
    and ``GreedyDecoder`` and printed next to its target.

    Args:
        batch_size: forwarded to ``model.fit``.
        epochs: forwarded to ``model.fit`` as ``epoch``.
        data_dir: passed to ``ImageCommand``.
    """
    # Load the previously saved arrays
    dataset = ImageCommand(data_dir)
    raw_inputs, raw_targets = dataset.load_vectors()

    # Model parameters
    n_features = 3  # number of input channels handed to Wav2Letter
    n_graphemes = dataset.intencode.grapheme_count

    print("training google speech dataset")
    print("data size", len(raw_inputs))
    print("batch_size", batch_size)
    print("epochs", epochs)
    print("num_mfcc_features", n_features)
    print("grapheme_count", n_graphemes)

    # Wrap the loaded arrays as torch tensors
    features = torch.Tensor(raw_inputs)
    labels = torch.IntTensor(raw_targets)

    print("input shape", features.shape)
    print("target shape", labels.shape)

    # Model, loss and optimizer
    model = Wav2Letter(n_features, n_graphemes)
    print(model.layers)

    ctc_loss = nn.CTCLoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-4)

    # Conv1d expects channels before length, and each feature is a channel
    # (https://pytorch.org/docs/stable/nn.html#torch.nn.Conv1d), so swap
    # (sample_size, in_frame_len, features)
    # into (sample_size, features, in_frame_len).
    features = features.transpose(1, 2)
    print("transposed input", features.shape)

    model.fit(
        features,
        labels,
        optimizer,
        ctc_loss,
        batch_size,
        epoch=epochs,
    )

    # Decode the first sample and show it beside its target
    first_sample, first_target = features[0], labels[0]
    decoded = GreedyDecoder(model.eval(first_sample))

    print("sample target", first_target)
    print("predicted", decoded)
Ejemplo n.º 3
0
def infer(opt):
    """Load a trained Wav2Letter checkpoint and print the WER on a sample.

    The last (up to) 1000 entries of the loaded dataset are run through the
    model, greedily decoded, and compared against their targets with
    ``decoder.wer``.

    Args:
        opt: options object. Attributes read here: ``mfcc_features``,
            ``datasets_path``, ``output_path`` (directory holding
            ``model.pth``) and ``int_encoder`` (path to a pickle file with
            an ``"index2char"`` entry).
    """
    mfcc_features = opt.mfcc_features
    datasets_path = opt.datasets_path
    models_path = opt.output_path

    # load saved numpy arrays for google speech command
    gs = GoogleSpeechCommand()
    _inputs, _targets = gs.load_vectors(datasets_path)
    grapheme_count = gs.intencode.grapheme_count

    inputs = flow.Tensor(_inputs).to("cuda")
    targets = flow.tensor(_targets, dtype=flow.int).to("cuda")

    model = Wav2Letter(mfcc_features, grapheme_count)
    model.to("cuda")
    model.load_state_dict(flow.load(os.path.join(models_path, "model.pth")))

    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # point opt.int_encoder at a trusted, locally produced encoder file.
    with open(opt.int_encoder, "rb") as f:
        int_to_char = pickle.load(f)["index2char"]

    decoder = GreedyDecoder(int_to_char)

    # The model is fed (sample, feature, frame), so swap the last two axes.
    inputs = inputs.transpose(1, 2)

    sample = inputs[-1000:]
    sample_target = targets[-1000:]

    # Pure inference: skip autograd bookkeeping so no graph is kept alive
    # for the whole 1000-sample batch.
    with flow.no_grad():
        log_probs = model(sample)
    output = decoder.decode(log_probs)

    pred_strings, output = decoder.convert_to_strings(output)
    sample_target_strings = decoder.convert_to_strings(
        sample_target, remove_repetitions=False, return_offsets=False)
    wer = decoder.wer(sample_target_strings, pred_strings)

    print("wer", wer)
Ejemplo n.º 4
0
def train(batch_size, epochs):
    """Fit Wav2Letter on the vectors under ``./speech_data`` and save it.

    Args:
        batch_size: forwarded to ``model.fit``.
        epochs: forwarded to ``model.fit`` as ``epoch``.
    """
    dataset = GoogleSpeechCommand()
    inputs, targets, input_lengths = dataset.load_vectors("./speech_data")

    n_mfcc = 13
    n_graphemes = dataset.intencode.grapheme_count
    idx_to_char = dataset.intencode.index2char

    # Print the run parameters
    print("training google speech dataset")
    run_parameters = (
        ("data size", len(inputs)),
        ("batch_size", batch_size),
        ("epochs", epochs),
        ("num_mfcc_features", n_mfcc),
        ("grapheme_count", n_graphemes),
        ("index2char", idx_to_char),
    )
    for label, value in run_parameters:
        print(label, value)

    print("input shape", inputs.shape)
    print("target shape", targets.shape)

    model = Wav2Letter(n_mfcc, n_graphemes)

    model.fit(inputs, targets, input_lengths, batch_size, epoch=epochs)
    model.save("./model")
Ejemplo n.º 5
0
def train(opt):
    """Train Wav2Letter with CTC loss and report loss and WER every epoch.

    The loaded dataset is split by position: the first ``rate`` fraction is
    used for training, and everything after it except the final 1000
    entries is used for evaluation. Only full batches are processed;
    a trailing partial batch in either split is skipped. A checkpoint is
    written to ``opt.output_path`` every 100 epochs.

    Args:
        opt: options object. Attributes read here: ``batch_size``,
            ``epochs``, ``mfcc_features``, ``rate``, ``datasets_path``,
            ``lr``, ``pretrained_model`` (checkpoint path or ``None``),
            ``int_encoder`` (path to a pickle file with an
            ``"index2char"`` entry) and ``output_path``.
    """
    batch_size = opt.batch_size
    epochs = opt.epochs
    mfcc_features = opt.mfcc_features
    rate = opt.rate
    datasets_path = opt.datasets_path

    # load saved numpy arrays for google speech command
    gs = GoogleSpeechCommand()
    _inputs, _targets = gs.load_vectors(datasets_path)
    grapheme_count = gs.intencode.grapheme_count

    print("training google speech dataset")
    print("data size", len(_inputs))
    print("batch_size", batch_size)
    print("epochs", epochs)
    print("num_mfcc_features", mfcc_features)
    print("grapheme_count", grapheme_count)

    inputs = flow.Tensor(_inputs).to("cuda")
    targets = flow.tensor(_targets, dtype=flow.int).to("cuda")

    # split train, eval, test (the last 1000 entries are left untouched here)
    data_size = len(_inputs)
    split_index = int(rate * data_size)
    train_inputs = inputs[0:split_index]
    train_targets = targets[0:split_index]
    eval_inputs = inputs[split_index:-1000]
    eval_targets = targets[split_index:-1000]

    # Initialize model, loss, optimizer
    model = Wav2Letter(mfcc_features, grapheme_count)
    model.to("cuda")

    ctc_loss = nn.CTCLoss()
    optimizer = optim.Adam(model.parameters(), lr=opt.lr)

    # load pretrained model
    if opt.pretrained_model is not None:
        model.load_state_dict(flow.load(opt.pretrained_model))

    # The decoder does not change between epochs, so build it once up front;
    # a missing or broken encoder file then fails before any training time
    # is spent.
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # point opt.int_encoder at a trusted, locally produced encoder file.
    with open(opt.int_encoder, "rb") as f:
        int_to_char = pickle.load(f)["index2char"]
    decoder = GreedyDecoder(int_to_char)

    # Floor division: only full batches are used.
    train_total_steps = int(train_inputs.size(0) // batch_size)
    eval_total_steps = int(eval_inputs.size(0) // batch_size)

    for epoch in range(epochs):
        avg_epoch_loss = 0

        for step in range(train_total_steps):
            start = step * batch_size
            stop = start + batch_size

            # The model is fed (batch, feature, frame).
            train_data_batch = train_inputs[start:stop].transpose(1, 2)

            log_probs = model(train_data_batch)
            # Reorder the model output so the first axis is the one whose
            # size is passed to the loss as the input length.
            log_probs = log_probs.transpose(1, 2).transpose(0, 1)

            batch_targets = train_targets[start:stop]

            # Every sample in the batch uses the full output length.
            input_lengths = flow.tensor(np.full((batch_size, ),
                                                log_probs.shape[0]),
                                        dtype=flow.int).to("cuda")
            target_lengths = flow.tensor(
                [target.shape[0] for target in batch_targets],
                dtype=flow.int).to("cuda")

            loss = ctc_loss(log_probs, batch_targets, input_lengths,
                            target_lengths)

            avg_epoch_loss += loss.numpy().item()

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        # evaluate; no gradients are needed, so skip autograd bookkeeping
        wer = 0
        with flow.no_grad():
            for step in range(eval_total_steps):
                start = step * batch_size
                stop = start + batch_size

                eval_data_batch = eval_inputs[start:stop].transpose(1, 2)
                eval_targets_batch = eval_targets[start:stop]
                eval_log_props = model(eval_data_batch)

                output = decoder.decode(eval_log_props)
                pred_strings, output = decoder.convert_to_strings(output)
                eval_target_strings = decoder.convert_to_strings(
                    eval_targets_batch,
                    remove_repetitions=False,
                    return_offsets=False)
                wer += decoder.wer(eval_target_strings, pred_strings)

        # A split smaller than one batch yields zero steps; report NaN
        # instead of dying with ZeroDivisionError.
        nan = float("nan")
        mean_loss = (avg_epoch_loss / train_total_steps
                     if train_total_steps else nan)
        mean_wer = wer / eval_total_steps if eval_total_steps else nan

        print(
            "epoch",
            epoch + 1,
            "average epoch loss",
            mean_loss,
            "wer",
            mean_wer,
        )

        # save models
        if (epoch + 1) % 100 == 0:
            flow.save(
                model.state_dict(),
                os.path.join(opt.output_path,
                             "model_{}.pth".format(epoch + 1)),
            )