def __power_to_db(S, ref=1.0, amin=1e-10, top_db=80.0):
    """
    Convert power values to decibel (dB) units.

    Adapted from https://github.com/librosa/librosa. The fragment is copied
    here instead of importing the librosa package, because importing librosa
    fails on our server.

    Args:
        S: input power values; converted with ``np.asarray``. For complex
            input a warning is issued and the absolute value is used.
        ref: reference power. A callable is applied to the magnitude to
            obtain the reference; otherwise ``np.abs(ref)`` is used.
        amin: strictly positive floor applied to both the magnitude and the
            reference before the logarithm is taken.
        top_db: when not None, the output is floored at ``max(output) - top_db``.

    Returns:
        ``10 * log10(max(amin, S)) - 10 * log10(max(amin, ref))``, optionally
        floored as described for ``top_db``.

    Raises:
        ParameterError: if ``amin <= 0`` or ``top_db < 0``.
    """
    S = np.asarray(S)

    if amin <= 0:
        raise ParameterError('amin must be strictly positive')

    complex_input = np.issubdtype(S.dtype, np.complexfloating)
    if complex_input:
        warnings.warn(
            'power_to_db was called on complex input so phase '
            'information will be discarded. To suppress this warning, '
            'call power_to_db(np.abs(D)**2) instead.'
        )
    magnitude = np.abs(S) if complex_input else S

    # A callable reference is evaluated on the magnitude itself.
    ref_value = ref(magnitude) if callable(ref) else np.abs(ref)

    log_spec = 10.0 * np.log10(np.maximum(amin, magnitude))
    reference_db = 10.0 * np.log10(np.maximum(amin, ref_value))
    # In-place subtraction on purpose: it keeps the dtype/shape of log_spec.
    log_spec -= reference_db

    if top_db is None:
        return log_spec

    if top_db < 0:
        raise ParameterError('top_db must be non-negative')
    return np.maximum(log_spec, log_spec.max() - top_db)
def build_deepspeech2(
        input_size: int,
        num_classes: int,
        rnn_type: str,
        num_rnn_layers: int,
        rnn_hidden_dim: int,
        dropout_p: float,
        bidirectional: bool,
        activation: str,
        device: torch.device,
) -> nn.DataParallel:
    """
    Check the hyper-parameters, then build a DeepSpeech2 model.

    Args:
        input_size: forwarded to ``DeepSpeech2``; must not be negative.
        num_classes: forwarded to ``DeepSpeech2`` unchanged.
        rnn_type: RNN cell name; its lower-cased form must be a key of
            ``BaseRNN.supported_rnns``.
        num_rnn_layers: forwarded to ``DeepSpeech2``; must not be negative.
        rnn_hidden_dim: forwarded to ``DeepSpeech2``; must not be negative.
        dropout_p: forwarded to ``DeepSpeech2``; must not be negative.
        bidirectional: forwarded to ``DeepSpeech2`` unchanged.
        activation: forwarded to ``DeepSpeech2`` unchanged.
        device: device the wrapped model is moved to (also passed to the model).

    Returns:
        The ``DeepSpeech2`` instance wrapped in ``nn.DataParallel`` and moved
        to ``device``.

    Raises:
        ParameterError: if any of the numeric checks above fails or the RNN
            cell type is unsupported.
    """
    # Evaluated in order; the first negative value aborts the build.
    lower_bound_checks = (
        (dropout_p, "dropout probability should be positive"),
        (input_size, "input_size should be greater than 0"),
        (rnn_hidden_dim, "hidden_dim should be greater than 0"),
        (num_rnn_layers, "num_layers should be greater than 0"),
    )
    for value, message in lower_bound_checks:
        if value < 0:
            raise ParameterError(message)

    if rnn_type.lower() not in BaseRNN.supported_rnns.keys():
        raise ParameterError("Unsupported RNN Cell: {0}".format(rnn_type))

    model = DeepSpeech2(
        input_size=input_size,
        num_classes=num_classes,
        rnn_type=rnn_type,
        num_rnn_layers=num_rnn_layers,
        rnn_hidden_dim=rnn_hidden_dim,
        dropout_p=dropout_p,
        bidirectional=bidirectional,
        activation=activation,
        device=device,
    )
    return nn.DataParallel(model).to(device)
def build_listener(input_size, hidden_dim, dropout_p, num_layers, bidirectional,
                   rnn_type, extractor, activation, device):
    """
    Encoder dispatcher: validate the arguments and build a ``Listener``.

    Args:
        input_size: must be an ``int`` and not negative.
        hidden_dim: must be an ``int`` and not negative.
        dropout_p: must not be negative.
        num_layers: must be an ``int`` and not negative.
        bidirectional: forwarded to ``Listener`` unchanged.
        rnn_type: RNN cell name; its lower-cased form must be a key of
            ``BaseRNN.supported_rnns``.
        extractor: feature extractor name; its lower-cased form must be
            ``'vgg'`` or ``'ds2'``.
        activation: forwarded to ``Listener`` unchanged.
        device: forwarded to ``Listener`` unchanged.

    Returns:
        The constructed ``Listener``.

    Raises:
        ParameterError: if any check above fails.
    """
    # Type checks come first so the numeric comparisons below are well defined.
    for name, value in (("input_size", input_size),
                        ("hidden_dim", hidden_dim),
                        ("num_layers", num_layers)):
        if not isinstance(value, int):
            raise ParameterError("{0} should be integer type".format(name))

    if dropout_p < 0.0:
        raise ParameterError("dropout probability should be positive")
    if input_size < 0:
        raise ParameterError("input_size should be greater than 0")
    if hidden_dim < 0:
        raise ParameterError("hidden_dim should be greater than 0")
    if num_layers < 0:
        raise ParameterError("num_layers should be greater than 0")

    if extractor.lower() not in {'vgg', 'ds2'}:
        # The original format string had no placeholder, so the offending
        # value never appeared in the message.
        raise ParameterError("Unsupported extractor: {0}".format(extractor))
    if rnn_type.lower() not in BaseRNN.supported_rnns.keys():
        raise ParameterError("Unsupported RNN Cell: {0}".format(rnn_type))

    return Listener(
        input_size=input_size,
        hidden_dim=hidden_dim,
        dropout_p=dropout_p,
        num_layers=num_layers,
        bidirectional=bidirectional,
        rnn_type=rnn_type,
        extractor=extractor,
        device=device,
        activation=activation,
    )
def build_seq2seq_decoder(num_classes: int, max_len: int, hidden_dim: int,
                          sos_id: int, eos_id: int, attn_mechanism: str,
                          num_layers: int, rnn_type: str, dropout_p: float,
                          num_heads: int, device: str) -> Seq2seqDecoder:
    """
    Decoder dispatcher: validate the arguments and build a ``Seq2seqDecoder``.

    Args:
        num_classes: must not be negative.
        max_len: must not be negative; passed to the decoder as ``max_length``.
        hidden_dim: must not be negative and must be divisible by ``num_heads``.
        sos_id: forwarded to ``Seq2seqDecoder`` unchanged.
        eos_id: forwarded to ``Seq2seqDecoder`` unchanged.
        attn_mechanism: forwarded to ``Seq2seqDecoder`` unchanged.
        num_layers: must not be negative.
        rnn_type: RNN cell name; its lower-cased form must be a key of
            ``BaseRNN.supported_rnns``.
        dropout_p: must not be negative.
        num_heads: must be strictly positive.
        device: must not be ``None``.

    Returns:
        The constructed ``Seq2seqDecoder``.

    Raises:
        ParameterError: if any check above fails.
    """
    # Validate num_heads before it is used as a divisor: previously
    # num_heads == 0 escaped as ZeroDivisionError from the modulo below.
    if num_heads <= 0:
        raise ParameterError("num_heads should be greater than 0")
    if hidden_dim % num_heads != 0:
        raise ParameterError("{0} % {1} should be zero".format(hidden_dim, num_heads))
    if dropout_p < 0.0:
        raise ParameterError("dropout probability should be positive")
    if hidden_dim < 0:
        raise ParameterError("hidden_dim should be greater than 0")
    if num_layers < 0:
        raise ParameterError("num_layers should be greater than 0")
    if max_len < 0:
        raise ParameterError("max_len should be greater than 0")
    if num_classes < 0:
        raise ParameterError("num_classes should be greater than 0")
    if rnn_type.lower() not in BaseRNN.supported_rnns.keys():
        raise ParameterError("Unsupported RNN Cell: {0}".format(rnn_type))
    if device is None:
        raise ParameterError("device is None")

    return Seq2seqDecoder(
        num_classes=num_classes,
        max_length=max_len,
        hidden_dim=hidden_dim,
        sos_id=sos_id,
        eos_id=eos_id,
        attn_mechanism=attn_mechanism,
        num_heads=num_heads,
        num_layers=num_layers,
        rnn_type=rnn_type,
        dropout_p=dropout_p,
        device=device,
    )
def build_conformer(
        num_classes: int,
        input_size: int,
        encoder_dim: int,
        decoder_dim: int,
        num_encoder_layers: int,
        num_decoder_layers: int,
        decoder_rnn_type: str,
        num_attention_heads: int,
        feed_forward_expansion_factor: int,
        conv_expansion_factor: int,
        input_dropout_p: float,
        feed_forward_dropout_p: float,
        attention_dropout_p: float,
        conv_dropout_p: float,
        decoder_dropout_p: float,
        conv_kernel_size: int,
        half_step_residual: bool,
        device: torch.device,
        decoder: str,
) -> nn.DataParallel:
    """
    Check the hyper-parameters, then build a Conformer model.

    All arguments are forwarded to ``Conformer`` (``input_size`` is passed as
    ``input_dim``). Only the values listed under "Raises" are validated here.

    Returns:
        The ``Conformer`` instance wrapped in ``nn.DataParallel`` and moved to
        ``device``.

    Raises:
        ParameterError: if any of the five dropout probabilities or
            ``input_size`` is negative.
        AssertionError: if ``conv_expansion_factor`` is not 2.
    """
    # decoder_dropout_p used to be the only dropout argument that was not
    # checked; it is now validated together with the others.
    dropout_probabilities = (
        input_dropout_p,
        feed_forward_dropout_p,
        attention_dropout_p,
        conv_dropout_p,
        decoder_dropout_p,
    )
    for probability in dropout_probabilities:
        if probability < 0.0:
            raise ParameterError("dropout probability should be positive")

    if input_size < 0:
        raise ParameterError("input_size should be greater than 0")

    # NOTE: kept as an assert so callers see the same AssertionError as before;
    # be aware that it is skipped when Python runs with -O.
    assert conv_expansion_factor == 2, "currently, conformer conv expansion factor only supports 2"

    model = Conformer(
        num_classes=num_classes,
        input_dim=input_size,
        encoder_dim=encoder_dim,
        decoder_dim=decoder_dim,
        num_encoder_layers=num_encoder_layers,
        num_decoder_layers=num_decoder_layers,
        decoder_rnn_type=decoder_rnn_type,
        num_attention_heads=num_attention_heads,
        feed_forward_expansion_factor=feed_forward_expansion_factor,
        conv_expansion_factor=conv_expansion_factor,
        input_dropout_p=input_dropout_p,
        feed_forward_dropout_p=feed_forward_dropout_p,
        attention_dropout_p=attention_dropout_p,
        conv_dropout_p=conv_dropout_p,
        decoder_dropout_p=decoder_dropout_p,
        conv_kernel_size=conv_kernel_size,
        half_step_residual=half_step_residual,
        device=device,
        decoder=decoder,
    )
    return nn.DataParallel(model).to(device)
def build_las(listener, speller, device, init_uniform=True):
    """
    Listen, Attend and Spell dispatcher: combine an encoder and a decoder.

    Args:
        listener: encoder module; must not be ``None``.
        speller: decoder module; must not be ``None``.
        device: device the wrapped model is moved to.
        init_uniform: when true, every parameter of the wrapped model is
            re-initialised in place from a uniform distribution on
            [-0.08, 0.08].

    Returns:
        The ``ListenAttendSpell`` model wrapped in ``nn.DataParallel`` and
        moved to ``device``.

    Raises:
        ParameterError: if ``listener`` or ``speller`` is ``None``.
    """
    for name, module in (("listener", listener), ("speller", speller)):
        if module is None:
            raise ParameterError("{0} should not be None".format(name))

    las = ListenAttendSpell(listener, speller)
    las.flatten_parameters()
    model = nn.DataParallel(las).to(device)

    if not init_uniform:
        return model

    for param in model.parameters():
        param.data.uniform_(-0.08, 0.08)
    return model
def __rms(y=None, S=None, frame_length=2048, hop_length=512, center=True, pad_mode='reflect'):
    """
    Compute the root-mean-square (RMS) value per frame.

    Adapted from https://github.com/librosa/librosa. The fragment is copied
    here instead of importing the librosa package, because importing librosa
    fails on our server.

    Args:
        y: audio signal. When given it takes precedence over ``S``; it is
            passed through ``__to_mono`` and framed with ``__frame``.
        S: spectrogram used only when ``y`` is None; ``S.shape[0]`` must equal
            ``frame_length // 2 + 1``.
        frame_length: frame size used for framing ``y`` and for validating
            and normalising ``S``.
        hop_length: hop between frames (only used with ``y``).
        center: when true, ``y`` is padded by ``frame_length // 2`` on both
            sides before framing.
        pad_mode: ``mode`` argument handed to ``np.pad``.

    Returns:
        The square root of the per-frame power, computed with
        ``keepdims=True`` along axis 0.

    Raises:
        ParameterError: if neither ``y`` nor ``S`` is given, or if the first
            dimension of ``S`` does not match ``frame_length``.
    """
    if y is None and S is None:
        raise ParameterError('Either `y` or `S` must be input.')

    if y is not None:
        signal = __to_mono(y)
        if center:
            signal = np.pad(signal, int(frame_length // 2), mode=pad_mode)
        frames = __frame(signal, frame_length=frame_length, hop_length=hop_length)
        # Mean power of every frame.
        return np.sqrt(np.mean(np.abs(frames)**2, axis=0, keepdims=True))

    # Spectrogram path: the bin count must be consistent with frame_length.
    n_bins = S.shape[0]
    if n_bins != frame_length // 2 + 1:
        raise ParameterError(
            'Since S.shape[0] is {}, '
            'frame_length is expected to be {} or {}; '
            'found {}'.format(n_bins, n_bins * 2 - 2, n_bins * 2 - 1, frame_length)
        )

    # np.abs(...)**2 allocates a new array, so the scaling below leaves S intact.
    bin_power = np.abs(S)**2

    # Halve the DC bin, and the last bin too when frame_length is even.
    bin_power[0] *= 0.5
    if frame_length % 2 == 0:
        bin_power[-1] *= 0.5

    return np.sqrt(2 * np.sum(bin_power, axis=0, keepdims=True) / frame_length**2)
def build_transformer(
        num_classes: int,
        d_model: int,
        d_ff: int,
        num_heads: int,
        input_dim: int,
        num_encoder_layers: int,
        num_decoder_layers: int,
        extractor: str,
        dropout_p: float,
        device: torch.device,
        pad_id: int = 0,
        sos_id: int = 1,
        eos_id: int = 2,
        joint_ctc_attention: bool = False,
        max_length: int = 400,
) -> nn.DataParallel:
    """
    Check the hyper-parameters, then build a SpeechTransformer model.

    ``dropout_p`` is used for both ``encoder_dropout_p`` and
    ``decoder_dropout_p``; every other argument is forwarded to
    ``SpeechTransformer`` under its own name.

    Returns:
        The ``SpeechTransformer`` instance wrapped in ``nn.DataParallel`` and
        moved to ``device``.

    Raises:
        ParameterError: if ``dropout_p``, ``input_dim``,
            ``num_encoder_layers`` or ``num_decoder_layers`` is negative.
    """
    # Evaluated in order; the first negative value aborts the build.
    lower_bound_checks = (
        (dropout_p, "dropout probability should be positive"),
        (input_dim, "input_size should be greater than 0"),
        (num_encoder_layers, "num_layers should be greater than 0"),
        (num_decoder_layers, "num_layers should be greater than 0"),
    )
    for value, message in lower_bound_checks:
        if value < 0:
            raise ParameterError(message)

    model = SpeechTransformer(
        input_dim=input_dim,
        num_classes=num_classes,
        extractor=extractor,
        d_model=d_model,
        d_ff=d_ff,
        num_encoder_layers=num_encoder_layers,
        num_decoder_layers=num_decoder_layers,
        num_heads=num_heads,
        encoder_dropout_p=dropout_p,
        decoder_dropout_p=dropout_p,
        pad_id=pad_id,
        sos_id=sos_id,
        eos_id=eos_id,
        max_length=max_length,
        joint_ctc_attention=joint_ctc_attention,
    )
    return nn.DataParallel(model).to(device)
def build_transformer(num_classes: int, pad_id: int, d_model: int, num_heads: int,
                      input_size: int, num_encoder_layers: int, num_decoder_layers: int,
                      dropout_p: float, ffnet_style: str, device: str,
                      eos_id: int) -> nn.DataParallel:
    """
    Check ``ffnet_style``, then build a Transformer model.

    Args:
        num_classes: forwarded to ``Transformer`` unchanged.
        pad_id: forwarded to ``Transformer`` unchanged.
        d_model: forwarded to ``Transformer`` unchanged.
        num_heads: forwarded to ``Transformer`` unchanged.
        input_size: passed to ``Transformer`` as ``input_dim``.
        num_encoder_layers: forwarded to ``Transformer`` unchanged.
        num_decoder_layers: forwarded to ``Transformer`` unchanged.
        dropout_p: forwarded to ``Transformer`` unchanged.
        ffnet_style: must be ``'ff'`` or ``'conv'`` (case-sensitive).
        device: device the wrapped model is moved to.
        eos_id: forwarded to ``Transformer`` unchanged.

    Returns:
        The ``Transformer`` instance wrapped in ``nn.DataParallel`` and moved
        to ``device``. (The return annotation previously claimed a bare
        ``Transformer``, which did not match the returned object.)

    Raises:
        ParameterError: if ``ffnet_style`` is not one of the supported values.
    """
    if ffnet_style not in {'ff', 'conv'}:
        raise ParameterError("Unsupported ffnet_style: {0}".format(ffnet_style))

    model = Transformer(
        num_classes=num_classes,
        pad_id=pad_id,
        d_model=d_model,
        num_heads=num_heads,
        num_encoder_layers=num_encoder_layers,
        num_decoder_layers=num_decoder_layers,
        dropout_p=dropout_p,
        ffnet_style=ffnet_style,
        input_dim=input_size,
        eos_id=eos_id,
    )
    return nn.DataParallel(model).to(device)
def valid_audio(y, mono=True):
    """
    Validate that ``y`` is usable audio data.

    Args:
        y: candidate audio buffer.
        mono: when true, ``y`` must be one-dimensional.

    Returns:
        ``True`` when every check passes.

    Raises:
        ParameterError: if ``y`` is not a ``numpy.ndarray``, is not of a
            floating-point dtype, has an unsupported number of dimensions,
            contains non-finite values, or is not Fortran-contiguous.
    """
    if not isinstance(y, np.ndarray):
        raise ParameterError('Audio data must be of type numpy.ndarray')

    if not np.issubdtype(y.dtype, np.floating):
        raise ParameterError('Audio data must be floating-point')

    n_dims = y.ndim
    if mono and n_dims != 1:
        raise ParameterError(
            'Invalid shape for monophonic audio: '
            'ndim={:d}, shape={}'.format(n_dims, y.shape)
        )

    if n_dims == 0 or n_dims > 2:
        raise ParameterError(
            'Audio data must have shape (samples,) or (channels, samples). '
            'Received shape={}'.format(y.shape)
        )

    if not np.all(np.isfinite(y)):
        raise ParameterError('Audio buffer is not finite everywhere')

    if not y.flags.f_contiguous:
        raise ParameterError(
            'Audio buffer is not Fortran-contiguous. '
            'Use numpy.asfortranarray to ensure Fortran contiguity.'
        )

    return True
def build_seq2seq_encoder(input_size: int, hidden_dim: int, dropout_p: float,
                          num_layers: int, bidirectional: bool, rnn_type: str,
                          extractor: str, activation: str, device: str,
                          mask_conv: bool) -> SpeechEncoderRNN:
    """
    Encoder dispatcher: validate the arguments and build a ``SpeechEncoderRNN``.

    Args:
        input_size: must not be negative.
        hidden_dim: must not be negative.
        dropout_p: must not be negative.
        num_layers: must not be negative.
        bidirectional: forwarded to ``SpeechEncoderRNN`` unchanged.
        rnn_type: RNN cell name; its lower-cased form must be a key of
            ``BaseRNN.supported_rnns``.
        extractor: feature extractor name; its lower-cased form must be
            ``'vgg'`` or ``'ds2'``.
        activation: forwarded to ``SpeechEncoderRNN`` unchanged.
        device: forwarded to ``SpeechEncoderRNN`` unchanged.
        mask_conv: forwarded to ``SpeechEncoderRNN`` unchanged.

    Returns:
        The constructed ``SpeechEncoderRNN``.

    Raises:
        ParameterError: if any check above fails.
    """
    if dropout_p < 0.0:
        raise ParameterError("dropout probability should be positive")
    if input_size < 0:
        raise ParameterError("input_size should be greater than 0")
    if hidden_dim < 0:
        raise ParameterError("hidden_dim should be greater than 0")
    if num_layers < 0:
        raise ParameterError("num_layers should be greater than 0")

    if extractor.lower() not in {'vgg', 'ds2'}:
        # The original format string had no placeholder, so the offending
        # value never appeared in the message.
        raise ParameterError("Unsupported extractor: {0}".format(extractor))
    if rnn_type.lower() not in BaseRNN.supported_rnns.keys():
        raise ParameterError("Unsupported RNN Cell: {0}".format(rnn_type))

    return SpeechEncoderRNN(
        input_size=input_size,
        hidden_dim=hidden_dim,
        dropout_p=dropout_p,
        num_layers=num_layers,
        mask_conv=mask_conv,
        bidirectional=bidirectional,
        rnn_type=rnn_type,
        extractor=extractor,
        device=device,
        activation=activation,
    )
def build_listener(input_size: int = 80, hidden_dim: int = 512, dropout_p: float = 0.2,
                   num_layers: int = 3, bidirectional: bool = True, rnn_type: str = 'lstm',
                   extractor: str = 'vgg', activation: str = 'hardtanh',
                   device: str = 'cuda', mask_conv: bool = False) -> Listener:
    """
    Encoder dispatcher: validate the arguments and build a ``Listener``.

    Args:
        input_size: must not be negative.
        hidden_dim: must not be negative.
        dropout_p: must not be negative.
        num_layers: must not be negative.
        bidirectional: forwarded to ``Listener`` unchanged.
        rnn_type: RNN cell name; its lower-cased form must be a key of
            ``BaseRNN.supported_rnns``.
        extractor: feature extractor name; its lower-cased form must be
            ``'vgg'`` or ``'ds2'``.
        activation: forwarded to ``Listener`` unchanged.
        device: forwarded to ``Listener`` unchanged.
        mask_conv: forwarded to ``Listener`` unchanged.

    Returns:
        The constructed ``Listener``.

    Raises:
        ParameterError: if any check above fails.
    """
    if dropout_p < 0.0:
        raise ParameterError("dropout probability should be positive")
    if input_size < 0:
        raise ParameterError("input_size should be greater than 0")
    if hidden_dim < 0:
        raise ParameterError("hidden_dim should be greater than 0")
    if num_layers < 0:
        raise ParameterError("num_layers should be greater than 0")

    if extractor.lower() not in {'vgg', 'ds2'}:
        # The original format string had no placeholder, so the offending
        # value never appeared in the message.
        raise ParameterError("Unsupported extractor: {0}".format(extractor))
    if rnn_type.lower() not in BaseRNN.supported_rnns.keys():
        raise ParameterError("Unsupported RNN Cell: {0}".format(rnn_type))

    return Listener(
        input_size=input_size,
        hidden_dim=hidden_dim,
        dropout_p=dropout_p,
        num_layers=num_layers,
        mask_conv=mask_conv,
        bidirectional=bidirectional,
        rnn_type=rnn_type,
        extractor=extractor,
        device=device,
        activation=activation,
    )
def build_transformer(
        num_classes: int,
        pad_id: int,
        d_model: int,
        num_heads: int,
        input_size: int,
        num_encoder_layers: int,
        num_decoder_layers: int,
        dropout_p: float,
        ffnet_style: str,
        device: torch.device,
        eos_id: int,
        joint_ctc_attention: bool,
) -> nn.DataParallel:
    """
    Check ``ffnet_style``, then build a SpeechTransformer model.

    ``input_size`` is passed to ``SpeechTransformer`` as ``input_dim``; every
    other argument except ``device`` is forwarded under its own name.

    Returns:
        The ``SpeechTransformer`` instance wrapped in ``nn.DataParallel`` and
        moved to ``device``.

    Raises:
        ParameterError: if ``ffnet_style`` is neither ``'ff'`` nor ``'conv'``.
    """
    supported_styles = {'ff', 'conv'}
    if ffnet_style not in supported_styles:
        raise ParameterError("Unsupported ffnet_style: {0}".format(ffnet_style))

    model = SpeechTransformer(
        num_classes=num_classes,
        pad_id=pad_id,
        d_model=d_model,
        num_heads=num_heads,
        num_encoder_layers=num_encoder_layers,
        num_decoder_layers=num_decoder_layers,
        dropout_p=dropout_p,
        ffnet_style=ffnet_style,
        input_dim=input_size,
        eos_id=eos_id,
        joint_ctc_attention=joint_ctc_attention,
    )
    return nn.DataParallel(model).to(device)
def __frame(x, frame_length=2048, hop_length=512, axis=-1):
    """
    Slice an array into (possibly overlapping) frames.

    Adapted from https://github.com/librosa/librosa. The fragment is copied
    here instead of importing the librosa package, because importing librosa
    fails on our server.

    Args:
        x: ``numpy.ndarray`` to frame.
        frame_length: number of samples per frame.
        hop_length: number of samples between the starts of adjacent frames;
            must be at least 1.
        axis: axis to frame along, either ``-1`` (frames are appended as a
            new last axis; requires F-contiguous input) or ``0`` (frames are
            prepended as a new first axis; requires C-contiguous input).

    Returns:
        The result of ``as_strided`` on ``x`` with the frame shape and strides
        computed below.

    Raises:
        ParameterError: if ``x`` is not an ndarray, is shorter than
            ``frame_length`` along ``axis``, ``hop_length < 1``, the
            contiguity requirement is not met, or ``axis`` is not 0 or -1.
    """
    if not isinstance(x, np.ndarray):
        raise ParameterError(
            'Input must be of type numpy.ndarray, '
            'given type(x)={}'.format(type(x))
        )

    n_samples = x.shape[axis]
    if n_samples < frame_length:
        raise ParameterError(
            'Input is too short (n={:d})'
            ' for frame_length={:d}'.format(n_samples, frame_length)
        )

    if hop_length < 1:
        raise ParameterError('Invalid hop_length: {:d}'.format(hop_length))

    n_frames = 1 + (n_samples - frame_length) // hop_length

    # Byte step for the new frame axis, derived from the positive strides of x.
    byte_strides = np.asarray(x.strides)
    unit_stride = np.prod(byte_strides[byte_strides > 0] // x.itemsize) * x.itemsize
    frame_stride = hop_length * unit_stride

    if axis == -1:
        if not x.flags['F_CONTIGUOUS']:
            raise ParameterError(
                'Input array must be F-contiguous '
                'for framing along axis={}'.format(axis)
            )
        out_shape = list(x.shape)[:-1] + [frame_length, n_frames]
        out_strides = list(byte_strides) + [frame_stride]
    elif axis == 0:
        if not x.flags['C_CONTIGUOUS']:
            raise ParameterError(
                'Input array must be C-contiguous '
                'for framing along axis={}'.format(axis)
            )
        out_shape = [n_frames, frame_length] + list(x.shape)[1:]
        out_strides = [frame_stride] + list(byte_strides)
    else:
        raise ParameterError(
            'Frame axis={} must be either 0 or -1'.format(axis)
        )

    return as_strided(x, shape=out_shape, strides=out_strides)
def build_speller(num_classes, max_len, hidden_dim, sos_id, eos_id, attn_mechanism,
                  num_layers, rnn_type, dropout_p, num_heads, device):
    """
    Decoder dispatcher: validate the arguments and build a ``Speller``.

    Args:
        num_classes: must be an ``int`` and not negative.
        max_len: must be an ``int`` and not negative; passed to the decoder
            as ``max_length``.
        hidden_dim: must be an ``int``, not negative, and divisible by
            ``num_heads``.
        sos_id: must be an ``int``.
        eos_id: must be an ``int``.
        attn_mechanism: forwarded to ``Speller`` unchanged.
        num_layers: must be an ``int`` and not negative.
        rnn_type: RNN cell name; its lower-cased form must be a key of
            ``BaseRNN.supported_rnns``.
        dropout_p: must be a ``float`` and not negative.
        num_heads: must be an ``int`` and strictly positive.
        device: must not be ``None``.

    Returns:
        The constructed ``Speller``.

    Raises:
        ParameterError: if any check above fails.
    """
    # Type checks first, in the same order as before.
    integer_arguments = (
        ("num_classes", num_classes),
        ("num_layers", num_layers),
        ("hidden_dim", hidden_dim),
        ("sos_id", sos_id),
        ("eos_id", eos_id),
        ("num_heads", num_heads),
        ("max_len", max_len),
    )
    for name, value in integer_arguments:
        if not isinstance(value, int):
            raise ParameterError("{0} should be integer type".format(name))
    if not isinstance(dropout_p, float):
        raise ParameterError("dropout_p should be float type")

    # Validate num_heads before it is used as a divisor: previously
    # num_heads == 0 escaped as ZeroDivisionError from the modulo below.
    if num_heads <= 0:
        raise ParameterError("num_heads should be greater than 0")
    if hidden_dim % num_heads != 0:
        raise ParameterError("{0} % {1} should be zero".format(hidden_dim, num_heads))
    if dropout_p < 0.0:
        raise ParameterError("dropout probability should be positive")
    if hidden_dim < 0:
        raise ParameterError("hidden_dim should be greater than 0")
    if num_layers < 0:
        raise ParameterError("num_layers should be greater than 0")
    if max_len < 0:
        raise ParameterError("max_len should be greater than 0")
    if num_classes < 0:
        raise ParameterError("num_classes should be greater than 0")
    if rnn_type.lower() not in BaseRNN.supported_rnns.keys():
        raise ParameterError("Unsupported RNN Cell: {0}".format(rnn_type))
    if device is None:
        raise ParameterError("device is None")

    return Speller(
        num_classes=num_classes,
        max_length=max_len,
        hidden_dim=hidden_dim,
        sos_id=sos_id,
        eos_id=eos_id,
        attn_mechanism=attn_mechanism,
        num_heads=num_heads,
        num_layers=num_layers,
        rnn_type=rnn_type,
        dropout_p=dropout_p,
        device=device,
    )