Example #1
import torch
import torch.nn as nn
from astropy.modeling import ParameterError  # import path as used in kospeech's model_builder
from kospeech.models import SpeechTransformer


def build_transformer(
    num_classes: int,
    d_model: int,
    d_ff: int,
    num_heads: int,
    input_dim: int,
    num_encoder_layers: int,
    num_decoder_layers: int,
    extractor: str,
    dropout_p: float,
    device: torch.device,
    pad_id: int = 0,
    sos_id: int = 1,
    eos_id: int = 2,
    joint_ctc_attention: bool = False,
    max_length: int = 400,
) -> nn.DataParallel:
    if dropout_p < 0.0:
        raise ParameterError("dropout probability should be non-negative")
    if input_dim <= 0:
        raise ParameterError("input_dim should be greater than 0")
    if num_encoder_layers <= 0:
        raise ParameterError("num_encoder_layers should be greater than 0")
    if num_decoder_layers <= 0:
        raise ParameterError("num_decoder_layers should be greater than 0")
    return nn.DataParallel(
        SpeechTransformer(
            input_dim=input_dim,
            num_classes=num_classes,
            extractor=extractor,
            d_model=d_model,
            d_ff=d_ff,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            num_heads=num_heads,
            encoder_dropout_p=dropout_p,
            decoder_dropout_p=dropout_p,
            pad_id=pad_id,
            sos_id=sos_id,
            eos_id=eos_id,
            max_length=max_length,
            joint_ctc_attention=joint_ctc_attention,
        )).to(device)
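A minimal sketch of calling the builder above. The hyperparameter values and the 'vgg' extractor name are illustrative assumptions, not library defaults:

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = build_transformer(
    num_classes=10,            # assumed vocabulary size
    d_model=16,
    d_ff=32,
    num_heads=2,               # must divide d_model evenly
    input_dim=80,              # e.g. 80-dim filterbank features
    num_encoder_layers=3,
    num_decoder_layers=2,
    extractor='vgg',           # assumed extractor name; check kospeech for valid values
    dropout_p=0.1,
    device=device,
)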
Example #2
import torch
import torch.nn as nn
from astropy.modeling import ParameterError  # import path as used in kospeech's model_builder
from kospeech.models import SpeechTransformer


def build_transformer(num_classes: int, pad_id: int, d_model: int,
                      num_heads: int, input_size: int, num_encoder_layers: int,
                      num_decoder_layers: int, dropout_p: float,
                      ffnet_style: str, device: str,
                      eos_id: int) -> nn.DataParallel:
    if ffnet_style not in {'ff', 'conv'}:
        raise ParameterError(
            "Unsupported ffnet_style: {0}".format(ffnet_style))

    return nn.DataParallel(
        SpeechTransformer(num_classes=num_classes,
                          pad_id=pad_id,
                          d_model=d_model,
                          num_heads=num_heads,
                          num_encoder_layers=num_encoder_layers,
                          num_decoder_layers=num_decoder_layers,
                          dropout_p=dropout_p,
                          ffnet_style=ffnet_style,
                          input_dim=input_size,
                          eos_id=eos_id)).to(device)
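The older variant above validates ffnet_style before building anything, so its failure path can be exercised without touching the model itself. A quick sketch, with placeholder argument values:

try:
    build_transformer(num_classes=10, pad_id=0, d_model=16, num_heads=2,
                      input_size=80, num_encoder_layers=3,
                      num_decoder_layers=2, dropout_p=0.1,
                      ffnet_style='rnn',  # deliberately unsupported
                      device='cpu', eos_id=2)
except ParameterError as err:
    print(err)  # Unsupported ffnet_style: rnn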
Example #3
import torch
from kospeech.models import SpeechTransformer

batch_size = 4
seq_length = 200
target_length = 20
input_size = 80

transformer = SpeechTransformer(num_classes=10,
                                d_model=16,
                                d_ff=32,
                                input_dim=input_size,  # match the 80-dim inputs below
                                num_encoder_layers=3,
                                num_decoder_layers=2)

inputs = torch.rand(batch_size, seq_length, input_size)  # random features; torch.FloatTensor(...) alone would be uninitialized
input_lengths = torch.LongTensor(
    [seq_length, seq_length - 10, seq_length - 20, seq_length - 30])
targets = torch.randint(0,
                        10,
                        size=(batch_size, target_length),
                        dtype=torch.long)

output = transformer(inputs, input_lengths, targets)
print(output)
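The forward pass above runs with teacher forcing, since the gold targets are passed in. For evaluation, the same call is typically wrapped in torch.no_grad() to skip gradient tracking; this is plain PyTorch, nothing kospeech-specific:

with torch.no_grad():
    eval_output = transformer(inputs, input_lengths, targets)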
Example #4
import torch
from kospeech.models import SpeechTransformer

batch_size = 4
seq_length = 200
input_size = 80

transformer = SpeechTransformer(num_classes=10,
                                d_model=16,
                                d_ff=32,
                                input_dim=input_size,  # match the 80-dim inputs below
                                num_encoder_layers=3,
                                num_decoder_layers=2)

inputs = torch.rand(batch_size, seq_length, input_size)  # random features; torch.FloatTensor(...) alone would be uninitialized
input_lengths = torch.LongTensor(
    [seq_length, seq_length - 10, seq_length - 20, seq_length - 30])

device = 'cuda' if torch.cuda.is_available() else 'cpu'  # fall back to CPU when no GPU is available
output = transformer.greedy_search(inputs, input_lengths, device=device)
print(output)
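Since greedy_search decodes without targets, it is normally run in evaluation mode. A minimal sketch using only standard PyTorch calls:

transformer.eval()
with torch.no_grad():
    output = transformer.greedy_search(inputs, input_lengths, device=device)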