def test_sequential():
    import torch
    from espnet.nets.pytorch_backend.transformer.repeat import MultiSequential

    class Masked(torch.nn.Module):
        """Dummy module that simply forwards its two inputs."""

        def forward(self, x, m):
            return x, m

    # Two chained modules, each taking and returning the pair (x, m).
    f = MultiSequential(Masked(), Masked())
    x = torch.randn(2, 3)
    m = torch.randn(2, 3) > 0
    assert len(f(x, m)) == 2

    if torch.cuda.is_available():
        # The multi-argument forward also works under DataParallel.
        f = torch.nn.DataParallel(f)
        f.cuda()
        assert len(f(x.cuda(), m.cuda())) == 2
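For context, MultiSequential behaves like a multi-input, multi-output torch.nn.Sequential, which is why the (x, m) pair above survives the whole chain. A minimal sketch of that idea (an approximation, not the ESPnet source verbatim):

import torch


class MultiSequentialSketch(torch.nn.Sequential):
    """Multi-input/multi-output Sequential: illustrative sketch only."""

    def forward(self, *args):
        # Feed all positional arguments through each submodule in order,
        # using the previous module's tuple output as the next module's input.
        for module in self:
            args = module(*args)
        return args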
Example #2
def build_blocks(
    net_part,
    idim,
    input_layer,
    blocks_arch,
    repeat_block=0,
    self_attn_type="self_attn",
    positional_encoding_type="abs_pos",
    positionwise_layer_type="linear",
    positionwise_activation_type="relu",
    conv_mod_activation_type="relu",
    dropout_rate_embed=0.0,
    padding_idx=-1,
):
    """Build block for transformer-based models.

    Args:
        net_part (str): either 'encoder' or 'decoder'
        idim (int): dimension of inputs
        input_layer (str): input layer type
        blocks_arch (list): list of blocks for network part (type and parameters)
        repeat_block (int): repeat provided blocks N times if N > 1
        self_attn_type (str): self-attention module type
        positional_encoding_type (str): positional encoding layer type
        positionwise_layer_type (str): positionwise layer type
        positionwise_activation_type (str): positionwise activation type
        conv_mod_activation_type (str): convolutional module activation type
        dropout_rate_embed (float): dropout rate for embedding
        padding_idx (int): padding index for embedding input layer (if specified)

    Returns:
        in_layer (torch.nn.*): input layer
        all_blocks (MultiSequential): all blocks for network part
        out_dim (int): dimension of last block output

    """
    fn_modules = []

    (
        input_layer,
        input_layer_odim,
        input_dropout_rate,
        input_pos_dropout_rate,
        out_dim,
    ) = check_and_prepare(net_part, blocks_arch, input_layer)

    pos_enc_class, self_attn_class = get_pos_enc_and_att_class(
        net_part, positional_encoding_type, self_attn_type)

    in_layer = build_input_layer(
        input_layer,
        idim,
        input_layer_odim,
        pos_enc_class,
        dropout_rate_embed,
        input_dropout_rate,
        input_pos_dropout_rate,
        padding_idx,
    )

    for i in range(len(blocks_arch)):
        block_type = blocks_arch[i]["type"]

        if block_type == "tdnn":
            module = build_tdnn_block(blocks_arch[i])
        elif block_type == "transformer":
            module = build_transformer_block(
                net_part,
                blocks_arch[i],
                positionwise_layer_type,
                positionwise_activation_type,
            )
        elif block_type == "conformer":
            module = build_conformer_block(
                blocks_arch[i],
                self_attn_class,
                pos_enc_class,
                positionwise_layer_type,
                positionwise_activation_type,
                conv_mod_activation_type,
            )
        elif block_type == "causal-conv1d":
            module = build_causal_conv1d_block(blocks_arch[i])

        fn_modules.append(module)

    if repeat_block > 1:
        fn_modules = fn_modules * repeat_block

    return in_layer, MultiSequential(*[fn() for fn in fn_modules]), out_dim
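Note that fn_modules collects factories rather than module instances: each builder returns a callable, and the final fn() call in the return statement instantiates it. Repeating the list therefore yields independent parameters for every copied block. A self-contained sketch of that pattern, with a made-up make_block factory:

import torch


def make_block():
    # Factory: returns a fresh module (fresh parameters) on every call.
    return torch.nn.Linear(4, 4)


fn_modules = [make_block]
fn_modules = fn_modules * 3  # repeat the factories, not module instances

blocks = torch.nn.Sequential(*[fn() for fn in fn_modules])

# Every repeated block owns its own, independently initialized weights.
assert blocks[0].weight.data_ptr() != blocks[1].weight.data_ptr()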
Example #3
def build_blocks(
    net_part: str,
    idim: int,
    input_layer_type: str,
    blocks: List[Dict[str, Any]],
    repeat_block: int = 0,
    self_attn_type: str = "self_attn",
    positional_encoding_type: str = "abs_pos",
    positionwise_layer_type: str = "linear",
    positionwise_activation_type: str = "relu",
    conv_mod_activation_type: str = "relu",
    input_layer_dropout_rate: float = 0.0,
    input_layer_pos_enc_dropout_rate: float = 0.0,
    padding_idx: int = -1,
) -> Tuple[Union[Conv2dSubsampling, VGG2L, torch.nn.Sequential],
           MultiSequential, int, int]:
    """Build custom model blocks.

    Args:
        net_part: Network part, either 'encoder' or 'decoder'.
        idim: Input dimension.
        input_layer_type: Input layer type.
        blocks: Blocks parameters for network part.
        repeat_block: Number of times provided blocks are repeated.
        self_attn_type: Self-attention module type.
        positional_encoding_type: Positional encoding layer type.
        positionwise_layer_type: Positionwise layer type.
        positionwise_activation_type: Positionwise activation type.
        conv_mod_activation_type: Convolutional module activation type.
        input_layer_dropout_rate: Dropout rate for input layer.
        input_layer_pos_enc_dropout_rate: Dropout rate for input layer pos. enc.
        padding_idx: Padding symbol ID for embedding layer.

    Returns:
        input_layer: Input layer.
        all_blocks: Encoder/Decoder network.
        out_dim: Network output dimension.
        conv_subsampling_factor: Subsampling factor in frontend CNN.

    """
    fn_modules = []

    pos_enc_class, self_attn_class = get_pos_enc_and_att_class(
        net_part, positional_encoding_type, self_attn_type)

    input_block = prepare_input_layer(
        input_layer_type,
        idim,
        blocks,
        input_layer_dropout_rate,
        input_layer_pos_enc_dropout_rate,
    )

    out_dim = prepare_body_model(net_part, blocks)

    input_layer, conv_subsampling_factor = build_input_layer(
        input_block,
        pos_enc_class,
        padding_idx,
    )

    for i in range(len(blocks)):
        block_type = blocks[i]["type"]

        if block_type in ("causal-conv1d", "conv1d"):
            module = build_conv1d_block(blocks[i], block_type)
        elif block_type == "conformer":
            module = build_conformer_block(
                blocks[i],
                self_attn_class,
                positionwise_layer_type,
                positionwise_activation_type,
                conv_mod_activation_type,
            )
        elif block_type == "transformer":
            module = build_transformer_block(
                net_part,
                blocks[i],
                positionwise_layer_type,
                positionwise_activation_type,
            )

        fn_modules.append(module)

    if repeat_block > 1:
        fn_modules = fn_modules * repeat_block

    return (
        input_layer,
        MultiSequential(*[fn() for fn in fn_modules]),
        out_dim,
        conv_subsampling_factor,
    )
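Assuming the build succeeds, the returned input layer and block stack are consumed in that order by the encoder/decoder forward pass, with the padding mask threaded alongside the features. A hedged sketch of that wiring using stand-in modules (the PassThrough class and the tensor shapes below are illustrative, not taken from this file):

import torch


class PassThrough(torch.nn.Module):
    """Stand-in for the built input layer or a single block (shapes preserved)."""

    def forward(self, xs, masks):
        return xs, masks


input_layer = PassThrough()              # stands in for the returned input layer
blocks = [PassThrough(), PassThrough()]  # stands in for the MultiSequential stack

xs = torch.randn(2, 10, 8)                      # (batch, time, feature)
masks = torch.ones(2, 1, 10, dtype=torch.bool)  # (batch, 1, time)

# Typical consumption: run the input layer first, then every block in order,
# threading the padding mask alongside the features.
xs, masks = input_layer(xs, masks)
for block in blocks:
    xs, masks = block(xs, masks)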
Example #4
def build_blocks(
    net_part: str,
    idim: int,
    input_layer_type: str,
    blocks_arch: List,
    repeat_block: int = 0,
    self_attn_type: str = "self_attn",
    positional_encoding_type: str = "abs_pos",
    positionwise_layer_type: str = "linear",
    positionwise_activation_type: str = "relu",
    conv_mod_activation_type: str = "relu",
    dropout_rate_embed: float = 0.0,
    padding_idx: int = -1,
) -> Tuple[
    Union[Conv2dSubsampling, VGG2L, torch.nn.Sequential], MultiSequential, int, int
]:
    """Build block for customizable architecture.

    Args:
        net_part: Network part, either 'encoder' or 'decoder'.
        idim: Input dimension.
        input_layer_type: Input layer type.
        blocks_arch: Block architecture (types and parameters) for network part.
        repeat_block: Number of times blocks_arch is repeated.
        self_attn_type: Self-attention module type.
        positional_encoding_type: Positional encoding layer type.
        positionwise_layer_type: Positionwise layer type.
        positionwise_activation_type: Positionwise activation type.
        conv_mod_activation_type: Convolutional module activation type.
        dropout_rate_embed: Dropout rate for embedding layer.
        padding_idx: Padding symbol ID for embedding layer.

    Returns:
        in_layer: Input layer.
        all_blocks: (Encoder or Decoder) network.
        out_dim: Network output dimension.
        conv_subsampling_factor: Subsampling factor in frontend CNN.

    """
    fn_modules = []

    (
        input_layer_type,
        input_layer_odim,
        input_dropout_rate,
        input_pos_dropout_rate,
        out_dim,
    ) = check_and_prepare(net_part, blocks_arch, input_layer_type)

    pos_enc_class, self_attn_class = get_pos_enc_and_att_class(
        net_part, positional_encoding_type, self_attn_type
    )

    in_layer, conv_subsampling_factor = build_input_layer(
        input_layer_type,
        idim,
        input_layer_odim,
        pos_enc_class,
        dropout_rate_embed,
        input_dropout_rate,
        input_pos_dropout_rate,
        padding_idx,
    )

    for i in range(len(blocks_arch)):
        block_type = blocks_arch[i]["type"]

        if block_type == "tdnn":
            module = build_tdnn_block(blocks_arch[i])
        elif block_type == "transformer":
            module = build_transformer_block(
                net_part,
                blocks_arch[i],
                positionwise_layer_type,
                positionwise_activation_type,
            )
        elif block_type == "conformer":
            module = build_conformer_block(
                blocks_arch[i],
                self_attn_class,
                positionwise_layer_type,
                positionwise_activation_type,
                conv_mod_activation_type,
            )
        elif block_type == "causal-conv1d":
            module = build_causal_conv1d_block(blocks_arch[i])

        fn_modules.append(module)

    if repeat_block > 1:
        fn_modules = fn_modules * repeat_block

    return (
        in_layer,
        MultiSequential(*[fn() for fn in fn_modules]),
        out_dim,
        conv_subsampling_factor,
    )
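For reference, blocks_arch is a list of per-block dictionaries whose "type" key drives the dispatch in the loop above. The sketch below shows only that dispatch shape; each dict would additionally carry the block's hyperparameters, whose key names are not documented in this snippet and are therefore omitted:

# Illustrative config shape only: the "type" values match the dispatch in
# build_blocks; real configs add each block's hyperparameters to these dicts.
blocks_arch = [
    {"type": "tdnn"},           # handled by build_tdnn_block
    {"type": "transformer"},    # handled by build_transformer_block
    {"type": "conformer"},      # handled by build_conformer_block
    {"type": "causal-conv1d"},  # handled by build_causal_conv1d_block
]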