Example #1
def __power_to_db(S, ref=1.0, amin=1e-10, top_db=80.0):
    """
    Code from https://github.com/librosa/librosa.
    This fragment is used instead of importing the librosa package
    because our server has a problem importing librosa.
    """
    S = np.asarray(S)

    if amin <= 0:
        raise ParameterError('amin must be strictly positive')

    if np.issubdtype(S.dtype, np.complexfloating):
        warnings.warn('power_to_db was called on complex input so phase '
                      'information will be discarded. To suppress this warning, '
                      'call power_to_db(np.abs(D)**2) instead.')
        magnitude = np.abs(S)
    else:
        magnitude = S

    if callable(ref):
        # User supplied a function to calculate reference power
        ref_value = ref(magnitude)
    else:
        ref_value = np.abs(ref)

    log_spec = 10.0 * np.log10(np.maximum(amin, magnitude))
    log_spec -= 10.0 * np.log10(np.maximum(amin, ref_value))

    if top_db is not None:
        if top_db < 0:
            raise ParameterError('top_db must be non-negative')
        log_spec = np.maximum(log_spec, log_spec.max() - top_db)

    return log_spec
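A minimal usage sketch (not from the original source), assuming numpy is imported as np and stft_matrix is a precomputed complex STFT:

# Hypothetical usage: convert a power spectrogram to decibels.
power_spec = np.abs(stft_matrix) ** 2              # power spectrogram
log_spec = __power_to_db(power_spec, ref=np.max)   # dB relative to peak power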
Example #2
def build_deepspeech2(input_size: int, num_classes: int, rnn_type: str,
                      num_rnn_layers: int, rnn_hidden_dim: int,
                      dropout_p: float, bidirectional: bool, activation: str,
                      device: torch.device) -> nn.DataParallel:
    if dropout_p < 0.0:
        raise ParameterError("dropout probability should be non-negative")
    if input_size <= 0:
        raise ParameterError("input_size should be greater than 0")
    if rnn_hidden_dim <= 0:
        raise ParameterError("hidden_dim should be greater than 0")
    if num_rnn_layers <= 0:
        raise ParameterError("num_layers should be greater than 0")
    if rnn_type.lower() not in BaseRNN.supported_rnns.keys():
        raise ParameterError("Unsupported RNN Cell: {0}".format(rnn_type))

    return nn.DataParallel(
        DeepSpeech2(
            input_size=input_size,
            num_classes=num_classes,
            rnn_type=rnn_type,
            num_rnn_layers=num_rnn_layers,
            rnn_hidden_dim=rnn_hidden_dim,
            dropout_p=dropout_p,
            bidirectional=bidirectional,
            activation=activation,
            device=device,
        )).to(device)
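A hedged construction sketch; the hyperparameter values are illustrative, and 'gru' is assumed to be among BaseRNN.supported_rnns:

# Hypothetical usage: build a DataParallel-wrapped DeepSpeech2 model.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = build_deepspeech2(input_size=80, num_classes=2000, rnn_type='gru',
                          num_rnn_layers=5, rnn_hidden_dim=512, dropout_p=0.1,
                          bidirectional=True, activation='hardtanh', device=device)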
Example #3
def build_listener(input_size, hidden_dim, dropout_p, num_layers,
                   bidirectional, rnn_type, extractor, activation, device):
    """ Various encoder dispatcher function. """
    if not isinstance(input_size, int):
        raise ParameterError("input_size should be integer type")
    if not isinstance(hidden_dim, int):
        raise ParameterError("hidden_dim should be integer type")
    if not isinstance(num_layers, int):
        raise ParameterError("num_layers should be integer type")
    if dropout_p < 0.0:
        raise ParameterError("dropout probability should be non-negative")
    if input_size <= 0:
        raise ParameterError("input_size should be greater than 0")
    if hidden_dim <= 0:
        raise ParameterError("hidden_dim should be greater than 0")
    if num_layers <= 0:
        raise ParameterError("num_layers should be greater than 0")
    if extractor.lower() not in {'vgg', 'ds2'}:
        raise ParameterError("Unsupported extractor: {0}".format(extractor))
    if rnn_type.lower() not in BaseRNN.supported_rnns.keys():
        raise ParameterError("Unsupported RNN Cell: {0}".format(rnn_type))

    return Listener(input_size=input_size,
                    hidden_dim=hidden_dim,
                    dropout_p=dropout_p,
                    num_layers=num_layers,
                    bidirectional=bidirectional,
                    rnn_type=rnn_type,
                    extractor=extractor,
                    device=device,
                    activation=activation)
Example #4
def build_seq2seq_decoder(num_classes: int, max_len: int, hidden_dim: int,
                          sos_id: int, eos_id: int, attn_mechanism: str, num_layers: int,
                          rnn_type: str, dropout_p: float, num_heads: int, device: str) -> Seq2seqDecoder:
    """ Various decoder dispatcher function. """
    if dropout_p < 0.0:
        raise ParameterError("dropout probability should be non-negative")
    if num_heads <= 0:
        raise ParameterError("num_heads should be greater than 0")
    if hidden_dim <= 0:
        raise ParameterError("hidden_dim should be greater than 0")
    if num_layers <= 0:
        raise ParameterError("num_layers should be greater than 0")
    if max_len <= 0:
        raise ParameterError("max_len should be greater than 0")
    if num_classes <= 0:
        raise ParameterError("num_classes should be greater than 0")
    if hidden_dim % num_heads != 0:
        raise ParameterError("hidden_dim ({0}) should be divisible by num_heads ({1})".format(hidden_dim, num_heads))
    if rnn_type.lower() not in BaseRNN.supported_rnns.keys():
        raise ParameterError("Unsupported RNN Cell: {0}".format(rnn_type))
    if device is None:
        raise ParameterError("device is None")

    return Seq2seqDecoder(num_classes=num_classes, max_length=max_len,
                          hidden_dim=hidden_dim, sos_id=sos_id, eos_id=eos_id,
                          attn_mechanism=attn_mechanism, num_heads=num_heads,
                          num_layers=num_layers, rnn_type=rnn_type,
                          dropout_p=dropout_p, device=device)
Example #5
def build_conformer(
    num_classes: int,
    input_size: int,
    encoder_dim: int,
    decoder_dim: int,
    num_encoder_layers: int,
    num_decoder_layers: int,
    decoder_rnn_type: str,
    num_attention_heads: int,
    feed_forward_expansion_factor: int,
    conv_expansion_factor: int,
    input_dropout_p: float,
    feed_forward_dropout_p: float,
    attention_dropout_p: float,
    conv_dropout_p: float,
    decoder_dropout_p: float,
    conv_kernel_size: int,
    half_step_residual: bool,
    device: torch.device,
    decoder: str,
) -> nn.DataParallel:
    if input_dropout_p < 0.0:
        raise ParameterError("input dropout probability should be non-negative")
    if feed_forward_dropout_p < 0.0:
        raise ParameterError("feed-forward dropout probability should be non-negative")
    if attention_dropout_p < 0.0:
        raise ParameterError("attention dropout probability should be non-negative")
    if conv_dropout_p < 0.0:
        raise ParameterError("conv dropout probability should be non-negative")
    if input_size <= 0:
        raise ParameterError("input_size should be greater than 0")
    assert conv_expansion_factor == 2, "currently, conformer conv expansion factor only supports 2"

    return nn.DataParallel(
        Conformer(
            num_classes=num_classes,
            input_dim=input_size,
            encoder_dim=encoder_dim,
            decoder_dim=decoder_dim,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            decoder_rnn_type=decoder_rnn_type,
            num_attention_heads=num_attention_heads,
            feed_forward_expansion_factor=feed_forward_expansion_factor,
            conv_expansion_factor=conv_expansion_factor,
            input_dropout_p=input_dropout_p,
            feed_forward_dropout_p=feed_forward_dropout_p,
            attention_dropout_p=attention_dropout_p,
            conv_dropout_p=conv_dropout_p,
            decoder_dropout_p=decoder_dropout_p,
            conv_kernel_size=conv_kernel_size,
            half_step_residual=half_step_residual,
            device=device,
            decoder=decoder,
        )).to(device)
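A hedged construction sketch with illustrative values; conv_expansion_factor must be 2 to satisfy the assertion above, and decoder='lstm' is a guess at the supported decoder names:

# Hypothetical usage: build a DataParallel-wrapped Conformer model.
model = build_conformer(num_classes=2000, input_size=80,
                        encoder_dim=512, decoder_dim=640,
                        num_encoder_layers=17, num_decoder_layers=1,
                        decoder_rnn_type='lstm', num_attention_heads=8,
                        feed_forward_expansion_factor=4,
                        conv_expansion_factor=2,   # only 2 is supported
                        input_dropout_p=0.1, feed_forward_dropout_p=0.1,
                        attention_dropout_p=0.1, conv_dropout_p=0.1,
                        decoder_dropout_p=0.1, conv_kernel_size=31,
                        half_step_residual=True,
                        device=torch.device('cuda'), decoder='lstm')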
Example #6
def build_las(listener, speller, device, init_uniform=True):
    """ Various Listen, Attend and Spell dispatcher function. """
    if listener is None:
        raise ParameterError("listener should not be None")
    if speller is None:
        raise ParameterError("speller should not be None")

    model = ListenAttendSpell(listener, speller)
    model.flatten_parameters()
    model = nn.DataParallel(model).to(device)

    if init_uniform:
        for param in model.parameters():
            param.data.uniform_(-0.08, 0.08)

    return model
Example #7
def __rms(y=None,
          S=None,
          frame_length=2048,
          hop_length=512,
          center=True,
          pad_mode='reflect'):
    """
    Code from https://github.com/librosa/librosa.
    This fragment is used instead of importing the librosa package
    because our server has a problem importing librosa.
    """
    if y is not None:
        y = __to_mono(y)
        if center:
            y = np.pad(y, int(frame_length // 2), mode=pad_mode)

        x = __frame(y, frame_length=frame_length, hop_length=hop_length)

        # Calculate power
        power = np.mean(np.abs(x)**2, axis=0, keepdims=True)
    elif S is not None:
        # Check the frame length
        if S.shape[0] != frame_length // 2 + 1:
            raise ParameterError('Since S.shape[0] is {}, '
                                 'frame_length is expected to be {} or {}; '
                                 'found {}'.format(S.shape[0],
                                                   S.shape[0] * 2 - 2,
                                                   S.shape[0] * 2 - 1,
                                                   frame_length))

        # power spectrogram
        x = np.abs(S)**2

        # Adjust the DC and sr/2 component
        x[0] *= 0.5
        if frame_length % 2 == 0:
            x[-1] *= 0.5

        # Calculate power
        power = 2 * np.sum(x, axis=0, keepdims=True) / frame_length**2
    else:
        raise ParameterError('Either `y` or `S` must be input.')

    return np.sqrt(power)
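A minimal usage sketch (not from the original source), assuming y is a mono floating-point waveform and stft_matrix a matching STFT:

# Hypothetical usage: frame-wise RMS energy, shape (1, n_frames).
rms = __rms(y=y, frame_length=2048, hop_length=512)
# Equivalent from a spectrogram; S.shape[0] must equal frame_length // 2 + 1.
rms_from_spec = __rms(S=np.abs(stft_matrix), frame_length=2048)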
Example #8
def build_transformer(
    num_classes: int,
    d_model: int,
    d_ff: int,
    num_heads: int,
    input_dim: int,
    num_encoder_layers: int,
    num_decoder_layers: int,
    extractor: str,
    dropout_p: float,
    device: torch.device,
    pad_id: int = 0,
    sos_id: int = 1,
    eos_id: int = 2,
    joint_ctc_attention: bool = False,
    max_length: int = 400,
) -> nn.DataParallel:
    if dropout_p < 0.0:
        raise ParameterError("dropout probability should be non-negative")
    if input_dim <= 0:
        raise ParameterError("input_dim should be greater than 0")
    if num_encoder_layers <= 0:
        raise ParameterError("num_encoder_layers should be greater than 0")
    if num_decoder_layers <= 0:
        raise ParameterError("num_decoder_layers should be greater than 0")
    return nn.DataParallel(
        SpeechTransformer(
            input_dim=input_dim,
            num_classes=num_classes,
            extractor=extractor,
            d_model=d_model,
            d_ff=d_ff,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            num_heads=num_heads,
            encoder_dropout_p=dropout_p,
            decoder_dropout_p=dropout_p,
            pad_id=pad_id,
            sos_id=sos_id,
            eos_id=eos_id,
            max_length=max_length,
            joint_ctc_attention=joint_ctc_attention,
        )).to(device)
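A hedged construction sketch with illustrative hyperparameters; the remaining arguments keep their defaults:

# Hypothetical usage: build a DataParallel-wrapped SpeechTransformer.
model = build_transformer(num_classes=2000, d_model=512, d_ff=2048,
                          num_heads=8, input_dim=80,
                          num_encoder_layers=12, num_decoder_layers=6,
                          extractor='vgg', dropout_p=0.1,
                          device=torch.device('cuda'))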
Example #9
def build_transformer(num_classes: int, pad_id: int, d_model: int, num_heads: int, input_size: int,
                      num_encoder_layers: int, num_decoder_layers: int,
                      dropout_p: float, ffnet_style: str, device: str, eos_id: int) -> nn.DataParallel:
    if ffnet_style not in {'ff', 'conv'}:
        raise ParameterError("Unsupported ffnet_style: {0}".format(ffnet_style))

    model = Transformer(num_classes=num_classes, pad_id=pad_id, d_model=d_model, num_heads=num_heads,
                        num_encoder_layers=num_encoder_layers, num_decoder_layers=num_decoder_layers,
                        dropout_p=dropout_p, ffnet_style=ffnet_style, input_dim=input_size, eos_id=eos_id)

    return nn.DataParallel(model).to(device)
Example #10
def valid_audio(y, mono=True):
    if not isinstance(y, np.ndarray):
        raise ParameterError('Audio data must be of type numpy.ndarray')

    if not np.issubdtype(y.dtype, np.floating):
        raise ParameterError('Audio data must be floating-point')

    if mono and y.ndim != 1:
        raise ParameterError('Invalid shape for monophonic audio: '
                             'ndim={:d}, shape={}'.format(y.ndim, y.shape))

    if y.ndim > 2 or y.ndim == 0:
        raise ParameterError(
            'Audio data must have shape (samples,) or (channels, samples). '
            'Received shape={}'.format(y.shape))

    if not np.isfinite(y).all():
        raise ParameterError('Audio buffer is not finite everywhere')

    if not y.flags["F_CONTIGUOUS"]:
        raise ParameterError(
            'Audio buffer is not Fortran-contiguous. '
            'Use numpy.asfortranarray to ensure Fortran contiguity.')

    return True
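A minimal usage sketch; the checks above require a float, Fortran-contiguous buffer, so np.asfortranarray is used here:

# Hypothetical usage: validate an audio buffer before processing.
y = np.asfortranarray(np.zeros(16000, dtype=np.float32))
valid_audio(y, mono=True)   # returns True or raises ParameterError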
Example #11
def build_seq2seq_encoder(input_size: int, hidden_dim: int, dropout_p: float,
                          num_layers: int, bidirectional: bool,
                          rnn_type: str, extractor: str,
                          activation: str, device: str, mask_conv: bool) -> SpeechEncoderRNN:
    """ Various encoder dispatcher function. """
    if dropout_p < 0.0:
        raise ParameterError("dropout probability should be non-negative")
    if input_size <= 0:
        raise ParameterError("input_size should be greater than 0")
    if hidden_dim <= 0:
        raise ParameterError("hidden_dim should be greater than 0")
    if num_layers <= 0:
        raise ParameterError("num_layers should be greater than 0")
    if extractor.lower() not in {'vgg', 'ds2'}:
        raise ParameterError("Unsupported extractor: {0}".format(extractor))
    if rnn_type.lower() not in BaseRNN.supported_rnns.keys():
        raise ParameterError("Unsupported RNN Cell: {0}".format(rnn_type))

    return SpeechEncoderRNN(
        input_size=input_size,
        hidden_dim=hidden_dim,
        dropout_p=dropout_p,
        num_layers=num_layers,
        mask_conv=mask_conv,
        bidirectional=bidirectional,
        rnn_type=rnn_type,
        extractor=extractor,
        device=device,
        activation=activation
    )
Example #12
def build_listener(input_size: int = 80,
                   hidden_dim: int = 512,
                   dropout_p: float = 0.2,
                   num_layers: int = 3,
                   bidirectional: bool = True,
                   rnn_type: str = 'lstm',
                   extractor: str = 'vgg',
                   activation: str = 'hardtanh',
                   device: str = 'cuda',
                   mask_conv: bool = False) -> Listener:
    """ Various encoder dispatcher function. """
    if dropout_p < 0.0:
        raise ParameterError("dropout probability should be non-negative")
    if input_size <= 0:
        raise ParameterError("input_size should be greater than 0")
    if hidden_dim <= 0:
        raise ParameterError("hidden_dim should be greater than 0")
    if num_layers <= 0:
        raise ParameterError("num_layers should be greater than 0")
    if extractor.lower() not in {'vgg', 'ds2'}:
        raise ParameterError("Unsupported extractor: {0}".format(extractor))
    if rnn_type.lower() not in BaseRNN.supported_rnns.keys():
        raise ParameterError("Unsupported RNN Cell: {0}".format(rnn_type))

    return Listener(input_size=input_size,
                    hidden_dim=hidden_dim,
                    dropout_p=dropout_p,
                    num_layers=num_layers,
                    mask_conv=mask_conv,
                    bidirectional=bidirectional,
                    rnn_type=rnn_type,
                    extractor=extractor,
                    device=device,
                    activation=activation)
Example #13
def build_transformer(num_classes: int, pad_id: int, d_model: int,
                      num_heads: int, input_size: int, num_encoder_layers: int,
                      num_decoder_layers: int, dropout_p: float,
                      ffnet_style: str, device: torch.device, eos_id: int,
                      joint_ctc_attention: bool) -> nn.DataParallel:
    if ffnet_style not in {'ff', 'conv'}:
        raise ParameterError(
            "Unsupported ffnet_style: {0}".format(ffnet_style))

    return nn.DataParallel(
        SpeechTransformer(
            num_classes=num_classes,
            pad_id=pad_id,
            d_model=d_model,
            num_heads=num_heads,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dropout_p=dropout_p,
            ffnet_style=ffnet_style,
            input_dim=input_size,
            eos_id=eos_id,
            joint_ctc_attention=joint_ctc_attention,
        )).to(device)
Example #14
def __frame(x, frame_length=2048, hop_length=512, axis=-1):
    """
    Code from https://github.com/librosa/librosa.
    This fragment is used instead of importing the librosa package
    because our server has a problem importing librosa.
    """
    if not isinstance(x, np.ndarray):
        raise ParameterError('Input must be of type numpy.ndarray, '
                             'given type(x)={}'.format(type(x)))

    if x.shape[axis] < frame_length:
        raise ParameterError('Input is too short (n={:d})'
                             ' for frame_length={:d}'.format(
                                 x.shape[axis], frame_length))

    if hop_length < 1:
        raise ParameterError('Invalid hop_length: {:d}'.format(hop_length))

    n_frames = 1 + (x.shape[axis] - frame_length) // hop_length
    strides = np.asarray(x.strides)

    new_stride = np.prod(strides[strides > 0] // x.itemsize) * x.itemsize

    if axis == -1:
        if not x.flags['F_CONTIGUOUS']:
            raise ParameterError('Input array must be F-contiguous '
                                 'for framing along axis={}'.format(axis))

        shape = list(x.shape)[:-1] + [frame_length, n_frames]
        strides = list(strides) + [hop_length * new_stride]

    elif axis == 0:
        if not x.flags['C_CONTIGUOUS']:
            raise ParameterError('Input array must be C-contiguous '
                                 'for framing along axis={}'.format(axis))

        shape = [n_frames, frame_length] + list(x.shape)[1:]
        strides = [hop_length * new_stride] + list(strides)
    else:
        raise ParameterError(
            'Frame axis={} must be either 0 or -1'.format(axis))

    return as_strided(x, shape=shape, strides=strides)
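A minimal usage sketch (not from the original source); the result is a strided view of the input, so no data is copied:

# Hypothetical usage: slice a 1-D signal into overlapping frames.
y = np.arange(10000, dtype=np.float32)   # 1-D arrays are F-contiguous
frames = __frame(y, frame_length=2048, hop_length=512)
print(frames.shape)   # (2048, 16): frame_length x n_frames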
Example #15
def build_speller(num_classes, max_len, hidden_dim, sos_id, eos_id,
                  attn_mechanism, num_layers, rnn_type, dropout_p, num_heads,
                  device):
    """ Various decoder dispatcher function. """
    if not isinstance(num_classes, int):
        raise ParameterError("num_classes should be integer type")
    if not isinstance(num_layers, int):
        raise ParameterError("num_layers should be integer type")
    if not isinstance(hidden_dim, int):
        raise ParameterError("hidden_dim should be integer type")
    if not isinstance(sos_id, int):
        raise ParameterError("sos_id should be integer type")
    if not isinstance(eos_id, int):
        raise ParameterError("eos_id should be integer type")
    if not isinstance(num_heads, int):
        raise ParameterError("num_heads should be integer type")
    if not isinstance(max_len, int):
        raise ParameterError("max_len should be integer type")
    if not isinstance(dropout_p, float):
        raise ParameterError("dropout_p should be float type")
    if dropout_p < 0.0:
        raise ParameterError("dropout probability should be non-negative")
    if num_heads <= 0:
        raise ParameterError("num_heads should be greater than 0")
    if hidden_dim <= 0:
        raise ParameterError("hidden_dim should be greater than 0")
    if num_layers <= 0:
        raise ParameterError("num_layers should be greater than 0")
    if max_len <= 0:
        raise ParameterError("max_len should be greater than 0")
    if num_classes <= 0:
        raise ParameterError("num_classes should be greater than 0")
    if hidden_dim % num_heads != 0:
        raise ParameterError("hidden_dim ({0}) should be divisible by num_heads ({1})".format(
            hidden_dim, num_heads))
    if rnn_type.lower() not in BaseRNN.supported_rnns.keys():
        raise ParameterError("Unsupported RNN Cell: {0}".format(rnn_type))
    if device is None:
        raise ParameterError("device is None")

    return Speller(num_classes=num_classes,
                   max_length=max_len,
                   hidden_dim=hidden_dim,
                   sos_id=sos_id,
                   eos_id=eos_id,
                   attn_mechanism=attn_mechanism,
                   num_heads=num_heads,
                   num_layers=num_layers,
                   rnn_type=rnn_type,
                   dropout_p=dropout_p,
                   device=device)
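A hedged end-to-end sketch wiring build_listener (Example #12), build_speller, and build_las (Example #6) together; the hyperparameter values and the attn_mechanism name are illustrative assumptions:

# Hypothetical usage: assemble a Listen, Attend and Spell model.
listener = build_listener(input_size=80, hidden_dim=512, dropout_p=0.2,
                          num_layers=3, bidirectional=True, rnn_type='lstm',
                          extractor='vgg', activation='hardtanh', device='cuda')
speller = build_speller(num_classes=2000, max_len=400, hidden_dim=512,
                        sos_id=1, eos_id=2, attn_mechanism='multi-head',
                        num_layers=2, rnn_type='lstm', dropout_p=0.2,
                        num_heads=4, device='cuda')
model = build_las(listener, speller, device='cuda')   # DataParallel-wrapped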