def __init__(
    self,
    type,
    idim,
    layers,
    units,
    projs,
    dropout,
    nmask=1,
    nonlinear="sigmoid",
):
    """Build the recurrent network and the per-mask linear output layers.

    Args:
        type: Network type string. A leading "vgg" is dropped and a
            trailing "p" selects the projection variant (RNNP); what
            remains is forwarded to the network as ``typ``.
        idim: Input feature dimension; also the output size of every
            mask linear layer.
        layers: Forwarded to RNN/RNNP (presumably the number of
            recurrent layers); also sizes the all-ones subsample array.
        units: Forwarded to RNN/RNNP (presumably hidden units per layer).
        projs: Forwarded to RNN/RNNP; also the input size of every
            mask linear layer.
        dropout: Forwarded to RNN/RNNP.
        nmask: Number of linear output layers to create.
        nonlinear: Name of the mask nonlinearity. One of "sigmoid",
            "relu", "tanh" or "crelu"; stored as a string.

    Raises:
        ValueError: If ``nonlinear`` is not one of the supported names.
    """
    super().__init__()

    # Fail fast, before any network is allocated.
    if nonlinear not in ("sigmoid", "relu", "tanh", "crelu"):
        raise ValueError("Not supporting nonlinear={}".format(nonlinear))

    # ``np.int`` was removed from NumPy (1.24); use an explicit integer dtype.
    subsample = np.ones(layers + 1, dtype=np.int64)

    # ``str.lstrip`` / ``str.rstrip`` strip *sets of characters*, so
    # "gru".lstrip("vgg") would yield "ru". Remove the literal "vgg"
    # prefix and the single "p" suffix explicitly instead.
    use_projection = type.endswith("p")
    typ = type[len("vgg"):] if type.startswith("vgg") else type
    if use_projection:
        typ = typ[:-1]

    if use_projection:
        self.brnn = RNNP(idim, layers, units, projs, subsample, dropout, typ=typ)
    else:
        self.brnn = RNN(idim, layers, units, projs, dropout, typ=typ)

    self.type = type
    self.nmask = nmask
    # One projection back to the input dimension per estimated mask.
    self.linears = torch.nn.ModuleList(
        [torch.nn.Linear(projs, idim) for _ in range(nmask)]
    )
    self.nonlinear = nonlinear
def __init__(
    self,
    input_size: int,
    rnn_type: str = "lstm",
    bidirectional: bool = True,
    use_projection: bool = True,
    num_layers: int = 4,
    hidden_size: int = 320,
    output_size: int = 320,
    dropout: float = 0.0,
    subsample: Optional[Sequence[int]] = (2, 2, 1, 1),
):
    """Build a single-module RNN encoder stack.

    Args:
        input_size: Input dimension handed to RNN/RNNP.
        rnn_type: Either "lstm" or "gru".
        bidirectional: When True, "b" is prepended to ``rnn_type``
            before it is forwarded as ``typ``.
        use_projection: Selects RNNP (True) or RNN (False).
        num_layers: Forwarded to RNN/RNNP; also fixes the length of the
            subsample array (``num_layers + 1``).
        hidden_size: Forwarded to RNN/RNNP.
        output_size: Forwarded to RNN/RNNP and stored as ``_output_size``.
        dropout: Forwarded to RNN/RNNP.
        subsample: Per-layer subsampling factors, or None for all ones.
            Truncated to ``num_layers`` entries and padded with ones.
            Only used by RNNP.

    Raises:
        ValueError: If ``rnn_type`` is neither "lstm" nor "gru".
    """
    assert check_argument_types()
    super().__init__()
    self._output_size = output_size
    self.rnn_type = rnn_type
    self.bidirectional = bidirectional
    self.use_projection = use_projection

    if rnn_type not in {"lstm", "gru"}:
        raise ValueError(f"Not supported rnn_type={rnn_type}")

    if subsample is None:
        factors = np.ones(num_layers + 1, dtype=np.int64)
    else:
        per_layer = subsample[:num_layers]
        missing = num_layers - len(per_layer)
        # A leading 1 is inserted because only the second and later
        # entries are used; missing trailing layers default to 1.
        factors = np.pad(
            np.array(per_layer, dtype=np.int64),
            [1, missing],
            mode="constant",
            constant_values=1,
        )

    typ = ("b" if bidirectional else "") + rnn_type
    if use_projection:
        encoder = RNNP(
            input_size,
            num_layers,
            hidden_size,
            output_size,
            factors,
            dropout,
            typ=typ,
        )
    else:
        encoder = RNN(
            input_size,
            num_layers,
            hidden_size,
            output_size,
            dropout,
            typ=typ,
        )
    self.enc = torch.nn.ModuleList([encoder])
def __init__(
    self,
    input_dim: int,
    rnn_type: str = "blstm",
    num_spk: int = 2,
    nonlinear: str = "tanh",
    layer: int = 2,
    unit: int = 512,
    emb_D: int = 40,
    dropout: float = 0.0,
):
    """Deep Clustering Separator.

    References:
        [1] Deep clustering: Discriminative embeddings for segmentation and
            separation; John R. Hershey. et al., 2016;
            https://ieeexplore.ieee.org/document/7471631
        [2] Manifold-Aware Deep Clustering: Maximizing Angles Between Embedding
            Vectors Based on Regular Simplex; Tanaka, K. et al., 2021;
            https://www.isca-speech.org/archive/interspeech_2021/tanaka21_interspeech.html

    Args:
        input_dim: input feature dimension
        rnn_type: string forwarded to RNN as ``typ``,
            select from 'blstm', 'lstm' etc.
        num_spk: number of speakers
        nonlinear: the nonlinear function applied to the embedding,
            select from 'relu', 'tanh', 'sigmoid'
        layer: int, number of stacked RNN layers. Default is 2.
        unit: int, dimension of the hidden state.
        emb_D: int, dimension of the feature vector for a tf-bin.
        dropout: float, dropout ratio. Default is 0.

    Raises:
        ValueError: if ``nonlinear`` is not one of the supported names.
    """  # noqa: E501
    super().__init__()
    self._num_spk = num_spk

    self.blstm = RNN(
        idim=input_dim,
        elayers=layer,
        cdim=unit,
        hdim=unit,
        dropout=dropout,
        typ=rnn_type,
    )

    # One emb_D-dimensional embedding per input feature bin.
    embedding_size = input_dim * emb_D
    self.linear = torch.nn.Linear(unit, embedding_size)

    supported = ("sigmoid", "relu", "tanh")
    if nonlinear not in supported:
        raise ValueError("Not supporting nonlinear={}".format(nonlinear))
    activation_classes = {
        "sigmoid": torch.nn.Sigmoid,
        "relu": torch.nn.ReLU,
        "tanh": torch.nn.Tanh,
    }
    self.nonlinear = activation_classes[nonlinear]()

    self.D = emb_D
def __init__(
    self,
    n_fft: int = 512,
    win_length: int = None,
    hop_length: int = 128,
    rnn_type: str = "blstm",
    layer: int = 3,
    unit: int = 512,
    dropout: float = 0.0,
    num_spk: int = 2,
    nonlinear: str = "sigmoid",
    utt_mvn: bool = False,
    mask_type: str = "IRM",
    loss_type: str = "mask_mse",
):
    """Build the STFT front-end, the RNN and one mask layer per speaker.

    Args:
        n_fft: FFT size passed to Stft; the number of frequency bins is
            ``n_fft // 2 + 1``.
        win_length: Window length passed to Stft.
        hop_length: Hop length passed to Stft.
        rnn_type: String forwarded to RNN as ``typ``.
        layer: Forwarded to RNN as ``elayers``.
        unit: Forwarded to RNN as both ``cdim`` and ``hdim``; also the
            input size of every mask linear layer.
        dropout: Forwarded to RNN.
        num_spk: Number of speakers, i.e. number of mask linear layers.
        nonlinear: Mask activation, one of "sigmoid", "relu", "tanh".
        utt_mvn: When True, an UtteranceMVN module (mean and variance
            normalisation enabled) is created; otherwise it is None.
        mask_type: Stored as-is on the instance.
        loss_type: One of "mask_mse", "magnitude", "spectrum".

    Raises:
        ValueError: If ``loss_type`` or ``nonlinear`` is unsupported.
    """
    super(TFMaskingNet, self).__init__()

    self.num_spk = num_spk
    self.num_bin = n_fft // 2 + 1
    self.mask_type = mask_type
    self.loss_type = loss_type
    allowed_losses = ("mask_mse", "magnitude", "spectrum")
    if loss_type not in allowed_losses:
        raise ValueError("Unsupported loss type: %s" % loss_type)

    self.stft = Stft(
        n_fft=n_fft,
        win_length=win_length,
        hop_length=hop_length,
    )

    self.utt_mvn = (
        UtteranceMVN(norm_means=True, norm_vars=True) if utt_mvn else None
    )

    self.rnn = RNN(
        idim=self.num_bin,
        elayers=layer,
        cdim=unit,
        hdim=unit,
        dropout=dropout,
        typ=rnn_type,
    )

    # One frequency-mask projection per speaker.
    mask_layers = []
    for _ in range(self.num_spk):
        mask_layers.append(torch.nn.Linear(unit, self.num_bin))
    self.linear = torch.nn.ModuleList(mask_layers)

    allowed_activations = ("sigmoid", "relu", "tanh")
    if nonlinear not in allowed_activations:
        raise ValueError("Not supporting nonlinear={}".format(nonlinear))
    activation_classes = {
        "sigmoid": torch.nn.Sigmoid,
        "relu": torch.nn.ReLU,
        "tanh": torch.nn.Tanh,
    }
    self.nonlinear = activation_classes[nonlinear]()
def __init__(
    self,
    input_dim: int,
    rnn_type: str = "blstm",
    num_spk: int = 2,
    nonlinear: str = "tanh",
    layer: int = 2,
    unit: int = 512,
    emb_D: int = 40,
    dropout: float = 0.0,
):
    """Deep Attractor Network Separator

    Reference:
        DEEP ATTRACTOR NETWORK FOR SINGLE-MICROPHONE SPEAKER SEPARATION;
        Zhuo Chen. et al., 2017;
        https://pubmed.ncbi.nlm.nih.gov/29430212/

    Args:
        input_dim: input feature dimension
        rnn_type: string forwarded to RNN as ``typ``,
            select from 'blstm', 'lstm' etc.
        num_spk: number of speakers
        nonlinear: the nonlinear function for mask estimation,
            select from 'relu', 'tanh', 'sigmoid'
        layer: int, number of stacked RNN layers. Default is 2.
        unit: int, dimension of the hidden state.
        emb_D: int, dimension of the attribute vector for one tf-bin.
        dropout: float, dropout ratio. Default is 0.

    Raises:
        ValueError: if ``nonlinear`` is not one of the supported names.
    """
    super().__init__()
    self._num_spk = num_spk

    rnn = RNN(
        idim=input_dim,
        elayers=layer,
        cdim=unit,
        hdim=unit,
        dropout=dropout,
        typ=rnn_type,
    )
    self.blstm = rnn

    # Project each hidden state to emb_D values for every input bin.
    self.linear = torch.nn.Linear(unit, input_dim * emb_D)

    if nonlinear not in ("sigmoid", "relu", "tanh"):
        raise ValueError("Not supporting nonlinear={}".format(nonlinear))
    if nonlinear == "sigmoid":
        activation = torch.nn.Sigmoid()
    elif nonlinear == "relu":
        activation = torch.nn.ReLU()
    else:
        activation = torch.nn.Tanh()
    self.nonlinear = activation

    self.D = emb_D
def __init__(
    self,
    input_dim: int,
    rnn_type: str = "blstm",
    num_spk: int = 2,
    predict_noise: bool = False,
    nonlinear: str = "sigmoid",
    layer: int = 3,
    unit: int = 512,
    dropout: float = 0.0,
):
    """RNN Separator

    Args:
        input_dim: input feature dimension
        rnn_type: string forwarded to RNN as ``typ``,
            select from 'blstm', 'lstm' etc.
        num_spk: number of speakers
        predict_noise: whether to output the estimated noise signal;
            when True one extra output linear layer is created.
        nonlinear: the nonlinear function for mask estimation,
            select from 'relu', 'tanh', 'sigmoid'
        layer: int, number of stacked RNN layers. Default is 3.
        unit: int, dimension of the hidden state.
        dropout: float, dropout ratio. Default is 0.

    Raises:
        ValueError: if ``nonlinear`` is not one of the supported names.
    """
    super().__init__()

    self._num_spk = num_spk
    self.predict_noise = predict_noise

    self.rnn = RNN(
        idim=input_dim,
        elayers=layer,
        cdim=unit,
        hdim=unit,
        dropout=dropout,
        typ=rnn_type,
    )

    # One output head per speaker, plus one for noise when requested.
    num_outputs = self.num_spk
    if self.predict_noise:
        num_outputs = num_outputs + 1

    heads = []
    for _ in range(num_outputs):
        heads.append(torch.nn.Linear(unit, input_dim))
    self.linear = torch.nn.ModuleList(heads)

    supported = ("sigmoid", "relu", "tanh")
    if nonlinear not in supported:
        raise ValueError("Not supporting nonlinear={}".format(nonlinear))
    activation_classes = {
        "sigmoid": torch.nn.Sigmoid,
        "relu": torch.nn.ReLU,
        "tanh": torch.nn.Tanh,
    }
    self.nonlinear = activation_classes[nonlinear]()
def __init__(
    self,
    input_size: int,
    rnn_type: str = "lstm",
    bidirectional: bool = True,
    use_projection: bool = True,
    num_layers: int = 4,
    hidden_size: int = 320,
    output_size: int = 320,
    dropout: float = 0.0,
    in_channel: int = 1,
):
    """Build a VGG2L front-end followed by an RNN or RNNP stack.

    Args:
        input_size: Input dimension; converted to the RNN input
            dimension with ``get_vgg2l_odim``.
        rnn_type: Either "lstm" or "gru".
        bidirectional: When True, "b" is prepended to ``rnn_type``
            before it is forwarded as ``typ``.
        use_projection: Selects RNNP (True) or RNN (False).
        num_layers: Forwarded to RNN/RNNP.
        hidden_size: Forwarded to RNN/RNNP.
        output_size: Forwarded to RNN/RNNP and stored as ``_output_size``.
        dropout: Forwarded to RNN/RNNP.
        in_channel: Passed to VGG2L and to ``get_vgg2l_odim``.

    Raises:
        ValueError: If ``rnn_type`` is neither "lstm" nor "gru".
    """
    assert check_argument_types()
    super().__init__()
    self._output_size = output_size
    self.rnn_type = rnn_type
    self.bidirectional = bidirectional
    self.use_projection = use_projection

    if rnn_type not in {"lstm", "gru"}:
        raise ValueError(f"Not supported rnn_type={rnn_type}")

    # Subsampling is not used for VGGRNN, so every factor is 1.
    unit_subsample = np.ones(num_layers + 1, dtype=np.int64)
    typ = ("b" if bidirectional else "") + rnn_type

    # The front-end is created first, then the recurrent stack.
    frontend = VGG2L(in_channel)
    rnn_input_size = get_vgg2l_odim(input_size, in_channel=in_channel)
    if use_projection:
        recurrent = RNNP(
            rnn_input_size,
            num_layers,
            hidden_size,
            output_size,
            unit_subsample,
            dropout,
            typ=typ,
        )
    else:
        recurrent = RNN(
            rnn_input_size,
            num_layers,
            hidden_size,
            output_size,
            dropout,
            typ=typ,
        )
    self.enc = torch.nn.ModuleList([frontend, recurrent])
def __init__(self, type, idim, layers, units, projs, dropout, nmask=1):
    """Build the recurrent network and the per-mask linear output layers.

    Args:
        type: Network type string. A leading "vgg" is dropped and a
            trailing "p" selects the projection variant (RNNP); what
            remains is forwarded to the network as ``typ``.
        idim: Input feature dimension; also the output size of every
            mask linear layer.
        layers: Forwarded to RNN/RNNP (presumably the number of
            recurrent layers); also sizes the all-ones subsample array.
        units: Forwarded to RNN/RNNP (presumably hidden units per layer).
        projs: Forwarded to RNN/RNNP; also the input size of every
            mask linear layer.
        dropout: Forwarded to RNN/RNNP.
        nmask: Number of linear output layers to create.
    """
    super().__init__()

    # ``np.int`` was removed from NumPy (1.24); use an explicit integer dtype.
    subsample = np.ones(layers + 1, dtype=np.int64)

    # ``str.lstrip`` / ``str.rstrip`` strip *sets of characters*, so
    # "gru".lstrip("vgg") would yield "ru". Remove the literal "vgg"
    # prefix and the single "p" suffix explicitly instead.
    use_projection = type.endswith("p")
    typ = type[len("vgg"):] if type.startswith("vgg") else type
    if use_projection:
        typ = typ[:-1]

    if use_projection:
        self.brnn = RNNP(idim, layers, units, projs, subsample, dropout, typ=typ)
    else:
        self.brnn = RNN(idim, layers, units, projs, dropout, typ=typ)

    self.type = type
    self.nmask = nmask
    # One projection back to the input dimension per estimated mask.
    self.linears = torch.nn.ModuleList(
        [torch.nn.Linear(projs, idim) for _ in range(nmask)]
    )
def __init__(
    self,
    input_dim: int,
    rnn_type: str = "blstm",
    num_spk: int = 2,
    nonlinear: str = "tanh",
    layer: int = 2,
    unit: int = 512,
    emb_D: int = 40,
    dropout: float = 0.0,
    alpha: float = 5.0,
    max_iteration: int = 500,
    threshold: float = 1.0e-05,
):
    """Deep Clustering End-to-End Separator

    References:
        Single-Channel Multi-Speaker Separation using Deep Clustering;
        Yusuf Isik. et al., 2016;
        https://www.isca-speech.org/archive/interspeech_2016/isik16_interspeech.html

    Args:
        input_dim: input feature dimension
        rnn_type: string forwarded to both RNNs as ``typ``,
            select from 'blstm', 'lstm' etc.
        num_spk: number of speakers
        nonlinear: the nonlinear function for mask estimation,
            select from 'relu', 'tanh', 'sigmoid'
        layer: int, number of stacked RNN layers. Default is 2.
        unit: int, dimension of the hidden state.
        emb_D: int, dimension of the feature vector for a tf-bin.
        dropout: float, dropout ratio. Default is 0.
        alpha: float, the clustering hardness parameter.
        max_iteration: int, the max iterations of soft kmeans.
        threshold: float, the threshold to end the soft k-means process.

    Raises:
        ValueError: if ``nonlinear`` is not one of the supported names.
    """
    super().__init__()
    self._num_spk = num_spk

    # Embedding network.
    self.blstm = RNN(
        idim=input_dim,
        elayers=layer,
        cdim=unit,
        hdim=unit,
        dropout=dropout,
        typ=rnn_type,
    )
    embedding_size = input_dim * emb_D
    self.linear = torch.nn.Linear(unit, embedding_size)

    supported = ("sigmoid", "relu", "tanh")
    if nonlinear not in supported:
        raise ValueError("Not supporting nonlinear={}".format(nonlinear))
    activation_classes = {
        "sigmoid": torch.nn.Sigmoid,
        "relu": torch.nn.ReLU,
        "tanh": torch.nn.Tanh,
    }
    self.nonlinear = activation_classes[nonlinear]()

    # Single-layer enhancement network: its input is (num_spk + 1) times
    # the feature size and its output is one feature vector per speaker.
    enh_input_size = input_dim * (num_spk + 1)
    enh_output_size = input_dim * num_spk
    self.enh_blstm = RNN(
        idim=enh_input_size,
        elayers=1,
        cdim=unit,
        hdim=unit,
        dropout=dropout,
        typ=rnn_type,
    )
    self.enh_linear = torch.nn.Linear(unit, enh_output_size)

    # Soft k-means hyper-parameters.
    self.D = emb_D
    self.alpha = alpha
    self.max_iteration = max_iteration
    self.threshold = threshold