def __init__(self, input_size=(36, 256), conv_dim=64, repeat_num=5, num_speakers=10):
    """Create the discriminator's layers.

    Args:
        input_size: Accepted for interface compatibility; not read here.
        conv_dim: Accepted for interface compatibility; not read here.
        repeat_num: Accepted for interface compatibility; not read here.
        num_speakers: Speaker count; the projection layer takes
            ``2 * num_speakers`` input features.
    """
    super(Discriminator, self).__init__()
    self.num_speakers = num_speakers
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    def make_stem():
        # 1 -> 128 channels; the GLU over dim 1 halves that to 64.
        conv = nn.Conv2d(in_channels=1, out_channels=128, kernel_size=(3, 3),
                         stride=(1, 1), padding=1)
        return nn.Sequential(conv, nn.GLU(dim=1))

    # Initial layers.
    self.conv_layer_1 = make_stem()
    self.conv_gated_1 = make_stem()

    # Down-sampling layers: (attribute, dim_in, dim_out, kernel, stride, padding).
    down_plan = (
        ("down_sample_1", 64, 256, (3, 3), (2, 2), 1),
        ("down_sample_2", 128, 512, (3, 3), (2, 2), 1),
        ("down_sample_3", 256, 1024, (3, 3), (2, 2), 1),
        ("down_sample_4", 512, 1024, (1, 5), (1, 1), (0, 2)),
    )
    for attr, dim_in, dim_out, kernel, stride, pad in down_plan:
        setattr(self, attr, DownsampleBlock(dim_in=dim_in,
                                            dim_out=dim_out,
                                            kernel_size=kernel,
                                            stride=stride,
                                            padding=pad,
                                            bias=False))

    # Fully connected layer.
    self.fully_connected = nn.Linear(in_features=512, out_features=1)

    # Projection.
    self.projection = nn.Linear(self.num_speakers * 2, 512)
def __init__(self, inputdim, outputdim, **kwargs):
    """Build the convolutional feature network and the output head.

    Args:
        inputdim: Size of the last axis of the dummy input used to probe
            the feature network's output shape.
        outputdim: Number of output channels of the final 1x1 convolution.
        **kwargs: Optional settings ``filtersizes``, ``filter``, ``pooling``,
            ``lineardim`` and ``cmvn``; the defaults are the literals below.
    """
    super().__init__()
    self._filtersizes = kwargs.get('filtersizes', [3, 3, 3, 3, 3])
    self._filter = [1] + kwargs.get('filter', [16, 24, 32, 16, 16])
    self._pooling = kwargs.get('pooling', [2, 2, 2, 2, 2])
    self._linear_dim = kwargs.get('lineardim', 128)
    # Whether to apply rolling-window standardization to the input.
    self._cmvn = kwargs.get('cmvn', True)
    if self._cmvn:
        self.norm = MovingAvgNorm(100)
    else:
        self.norm = nn.Sequential()

    stages = []
    channel_pairs = zip(self._filter, self._filter[1:])
    per_stage = zip(channel_pairs, self._filtersizes, self._pooling)
    for idx, ((c_in, c_out), ksize, pool) in enumerate(per_stage):
        block = []
        conv_in = c_in
        if idx > 0:
            # Every stage after the first starts with a gated 1x1 convolution.
            block += [
                nn.GroupNorm(1, c_in),
                nn.Conv2d(c_in, c_out * 2, kernel_size=1, padding=0, stride=1),
                nn.GLU(1),
            ]
            conv_in = c_out
        block += [
            nn.GroupNorm(1, conv_in),
            nn.Conv2d(conv_in,
                      c_out * 2,
                      kernel_size=ksize,
                      padding=ksize // 2,
                      stride=1),
            nn.GLU(1),
        ]
        stages.append(nn.Sequential(*block))
        stages.append(nn.MaxPool2d(kernel_size=pool, ceil_mode=True))
    self.network = nn.Sequential(*stages)

    # Probe the network once with a dummy batch to learn its output shape.
    with torch.no_grad():
        probe_shape = self.network(torch.randn(1, 1, 300, inputdim)).shape
    # Product of output axes 1 and 3 (channels x leftover feature dim).
    flat_features = probe_shape[1] * probe_shape[3]

    self.timepool = nn.AdaptiveAvgPool2d((1, None))
    head = [
        nn.Conv1d(flat_features, self._linear_dim * 2, kernel_size=1),
        nn.GLU(1),
        nn.Dropout(0.5),
        nn.Conv1d(self._linear_dim, outputdim, kernel_size=1, groups=1),
    ]
    self.outputlayer = nn.Sequential(*head)

    for part in (self.network, self.outputlayer):
        part.apply(init_weights)
def __init__(self, input_dim, output_dim, residual_dim, gate_dim, skip_dim,
             kernel_size, down_sample_factor=2, dilation_rate=None):
    """Set up the encoder's sub-modules.

    Args:
        input_dim (int): Number of channels of the input tensor.
        output_dim (int): Number of channels of the output tensor.
        residual_dim (int): Channel width after the input layer's GLU;
            also forwarded to every WNCell.
        gate_dim (int): Forwarded to every WNCell.
        skip_dim (int): Forwarded to every WNCell; input width of the
            output layer.
        kernel_size (int): Kernel size for the WNCells and the first
            output convolution.
        down_sample_factor (int): Values above 1 must be even;
            ``down_sample_factor // 2`` stride-2 blocks are stacked.
        dilation_rate (list, optional): One WNCell is created per entry.
            Defaults to ``[1, 2, 4, 8, 16, 32]``.
    """
    super().__init__()
    self.down_sample_factor = down_sample_factor
    dilations = [1, 2, 4, 8, 16, 32] if dilation_rate is None else dilation_rate

    self.input_layer = nn.Sequential(
        ConvNorm(input_dim, 2 * residual_dim, kernel_size=15),
        nn.GLU(dim=1))

    if self.down_sample_factor > 1:
        assert down_sample_factor % 2 == 0
        reducers = []
        for _ in range(down_sample_factor // 2):
            # Each block doubles the channels, then the GLU halves them back.
            reducers += [
                ConvNorm(residual_dim, 2 * residual_dim, kernel_size=8, stride=2),
                nn.InstanceNorm1d(2 * residual_dim, momentum=0.8),
                nn.GLU(dim=1),
            ]
        self.down_sample = nn.Sequential(*reducers)

    self.WN = nn.ModuleList([
        WNCell(residual_dim=residual_dim,
               gate_dim=gate_dim,
               skip_dim=skip_dim,
               kernel_size=kernel_size,
               dilation=rate)
        for rate in dilations
    ])

    tail = [
        ConvNorm(skip_dim, 2 * output_dim, kernel_size=kernel_size),
        nn.InstanceNorm1d(2 * output_dim, momentum=0.8),
        nn.GLU(dim=1),
        ConvNorm(output_dim, output_dim, kernel_size=1),
    ]
    self.output_layer = nn.Sequential(*tail)
def GatedLinear(in_features, out_features, dropout=0.0, bias=True):
    """Stack three ``Linear`` projections with a GLU after the first two.

    Each default ``nn.GLU()`` halves the last axis, so the widths run
    ``4 * out -> 2 * out -> out``. The original docstring describes the
    input as B x T x C and the layers as weight-normalized.
    """
    widths = (
        (in_features, out_features * 4),
        (out_features * 2, out_features * 2),
        (out_features, out_features),
    )
    stack = []
    for fan_in, fan_out in widths:
        if stack:
            # Gate the previous projection before the next one.
            stack.append(nn.GLU())
        stack.append(Linear(fan_in, fan_out, dropout, bias))
    return nn.Sequential(*stack)
def __init__(self, num_speakers=10):
    """Create the discriminator's layers.

    Args:
        num_speakers: Input width of the projection layer.
    """
    super(Discriminator, self).__init__()
    self.num_speakers = num_speakers
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    def down_block(c_in, c_out, kernel, stride, pad):
        # Conv -> InstanceNorm -> GLU; the GLU halves the channel count.
        return nn.Sequential(
            nn.Conv2d(in_channels=c_in, out_channels=c_out, kernel_size=kernel,
                      stride=stride, padding=pad, bias=False),
            nn.InstanceNorm2d(num_features=c_out, affine=True,
                              track_running_stats=True),
            nn.GLU(dim=1))

    # Initial layer.
    self.conv_layer_1 = nn.Sequential(
        nn.Conv2d(in_channels=1, out_channels=128, kernel_size=(3, 3),
                  stride=(1, 1), padding=1),
        nn.GLU(dim=1))

    # Down-sampling layers.
    self.down_sample_1 = down_block(64, 256, (3, 3), (2, 2), (1, 1))
    self.down_sample_2 = down_block(128, 512, (3, 3), (2, 2), (1, 1))
    self.down_sample_3 = down_block(256, 1024, (3, 3), (2, 2), (1, 1))
    self.down_sample_4 = down_block(512, 1024, (1, 5), (1, 1), (0, 2))

    # Fully connected layer.
    self.fully_connected = nn.Linear(in_features=512, out_features=1)

    # Projection layer.
    self.projection = nn.Linear(self.num_speakers, 512)
def __init__(self, kernel_size, in_ch, out_ch, bottlececk_dim=0, dropout=0.):
    """Build the gated convolution block.

    Args:
        kernel_size: Kernel height of the main convolution; also sets the
            amount of leading padding in ``pad_left``.
        in_ch: Input channels.
        out_ch: Output channels (the convolutions emit ``2 * out_ch``).
        bottlececk_dim: 0 for a single convolution, a positive value for a
            1x1 / (kernel_size x 1) / 1x1 bottleneck. A negative value
            leaves ``self.layers`` empty.
        dropout: Dropout probability used throughout.
    """
    super().__init__()

    def wn_conv(c_in, c_out, height):
        # Weight-normalized Conv2d with a (height x 1) kernel.
        conv = nn.Conv2d(in_channels=c_in, out_channels=c_out,
                         kernel_size=(height, 1))
        return nn.utils.weight_norm(conv, name='weight', dim=0)

    self.conv_residual = None
    if in_ch != out_ch:
        self.conv_residual = wn_conv(in_ch, out_ch, 1)
        self.dropout_residual = nn.Dropout(p=dropout)

    self.pad_left = nn.ConstantPad2d((0, 0, kernel_size - 1, 0), 0)

    # NOTE(review): nn.GLU() defaults to dim=-1, not the channel axis — confirm intended.
    if bottlececk_dim == 0:
        # TODO(hirofumi0810): padding?
        named = [
            ('conv', wn_conv(in_ch, out_ch * 2, kernel_size)),
            ('dropout', nn.Dropout(p=dropout)),
            ('glu', nn.GLU()),
        ]
    elif bottlececk_dim > 0:
        named = [
            ('conv_in', wn_conv(in_ch, bottlececk_dim, 1)),
            ('dropout_in', nn.Dropout(p=dropout)),
            ('conv_bottleneck', wn_conv(bottlececk_dim, bottlececk_dim, kernel_size)),
            ('dropout', nn.Dropout(p=dropout)),
            ('glu', nn.GLU()),
            ('conv_out', wn_conv(bottlececk_dim, out_ch * 2, 1)),
            ('dropout_out', nn.Dropout(p=dropout)),
        ]
    else:
        named = []
    self.layers = nn.Sequential(OrderedDict(named))
def __init__(self, num_speakers=10):
    """Create the classifier-style discriminator's layers.

    Args:
        num_speakers: Output width of the final fully connected layer.
    """
    super(Discriminator, self).__init__()
    self.num_speakers = num_speakers

    def gated(conv, norm=None):
        # Conv (+ optional norm) followed by a channel-wise GLU.
        stages = [conv]
        if norm is not None:
            stages.append(norm)
        stages.append(nn.GLU(dim=1))
        return nn.Sequential(*stages)

    # Initial layers.
    self.conv_layer_1 = gated(
        nn.Conv2d(in_channels=1, out_channels=256, kernel_size=(3, 3),
                  stride=(1, 2), padding=1))

    # Down-sampling layers.
    self.down_sample_1 = gated(
        nn.Conv2d(in_channels=128, out_channels=512, kernel_size=(3, 3),
                  stride=(2, 2), padding=(1, 1), bias=False),
        nn.InstanceNorm2d(512, affine=True))
    self.down_sample_2 = gated(
        nn.Conv2d(in_channels=256, out_channels=1024, kernel_size=(3, 3),
                  stride=(2, 2), padding=(1, 1), bias=False),
        nn.InstanceNorm2d(1024, affine=True))
    self.down_sample_3 = gated(
        nn.Conv2d(in_channels=512, out_channels=2048, kernel_size=(3, 3),
                  stride=(2, 2), padding=(1, 1), bias=False),
        nn.InstanceNorm2d(2048, affine=True))
    # The last stage has no normalization layer.
    self.down_sample_4 = gated(
        nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=(1, 5),
                  stride=(1, 2), padding=(0, 2), bias=False))

    # Fully connected layer.
    self.fully_connected = nn.Linear(in_features=512, out_features=num_speakers)
def __init__(self, latent_dim, num_classes=0, embedding_dim=EMBEDDING_DIM):
    """Build the initial block, plus class conditioning when requested.

    Args:
        latent_dim: Channel width of the latent input.
        num_classes: When 0, only ``c1`` is created; otherwise an embedding
            table and the conditioning block ``c2`` are added.
        embedding_dim: Width of the class embedding.
    """
    super().__init__()
    doubled = latent_dim * 2
    self.c1 = nn.Sequential(
        nn.ConvTranspose2d(latent_dim, doubled, 4),
        norm_class(doubled),
        nn.GLU(dim=1))
    self.num_classes = num_classes

    if num_classes != 0:
        self.embed = nn.Embedding(num_classes, embedding_dim)
        # 1x1 convolution over the latent and embedding channels combined.
        self.c2 = nn.Sequential(
            nn.Conv2d(latent_dim + embedding_dim, doubled, 1),
            norm_class(doubled),
            nn.GLU(dim=1))
def __init__(self, in_chan, out_chan, kernel=3, stride=1, pad=1, dil=1,
             dropout=0., groups=1, k=1, **kwargs):
    """Build a Conv1d -> BatchNorm1d -> activation -> Dropout stack.

    Args:
        in_chan: Input channels.
        out_chan: Channels produced by the convolution.
        kernel, stride, pad, dil, groups: Forwarded to ``nn.Conv1d``.
        dropout: Dropout probability.
        k: 1 selects an in-place ReLU, 2 selects a channel-wise GLU.
        **kwargs: Accepted and ignored.
    """
    super().__init__()
    assert k in [1, 2], 'Handle only k = 1 or 2'

    stages = [
        nn.Conv1d(in_chan, out_chan, kernel, stride=stride, padding=pad,
                  dilation=dil, groups=groups),
        nn.BatchNorm1d(out_chan),
    ]
    if k == 1:
        stages.append(nn.ReLU(inplace=True))
    else:
        stages.append(nn.GLU(dim=1))
    stages.append(nn.Dropout(dropout))
    self.conv = nn.Sequential(*stages)
def __init__(self, config: Config, embed_dim: int) -> None:
    """Create one gated convolution (and optional projection) per kernel size.

    Args:
        config: Supplies ``cnn.kernel_num``, ``cnn.kernel_sizes`` and
            ``dropout``.
        embed_dim: Channel width fed to the first convolution.
    """
    super().__init__(config)
    out_channels = config.cnn.kernel_num
    kernel_sizes = config.cnn.kernel_sizes

    projections = []
    convolutions = []
    width_in = embed_dim
    for ksize in kernel_sizes:
        # Only odd kernels are accepted, so the padding below is symmetric.
        assert (ksize - 1) % 2 == 0
        if width_in != out_channels:
            projections.append(nn.Linear(width_in, out_channels))
        else:
            # No projection needed when the widths already match.
            projections.append(None)
        convolutions.append(
            nn.Conv1d(width_in, 2 * out_channels, ksize,
                      padding=int((ksize - 1) / 2)))
        width_in = out_channels

    self.convs = nn.ModuleList(convolutions)
    self.projections = nn.ModuleList(projections)
    self.glu = nn.GLU(dim=1)

    self.representation_dim = out_channels
    self.dropout = nn.Dropout(p=config.dropout)
def __init__(self, n_inputs, n_outputs, kernel_size, stride, dilation,
             padding, dropout=0.2, model='Gen'):
    """Build a weight-normalized gated 1-D convolution layer.

    Args:
        n_inputs: Input channels.
        n_outputs: Output channels after the GLU (the conv emits twice this).
        kernel_size, stride, dilation, padding: Forwarded to ``nn.Conv1d``.
        dropout: Dropout probability.
        model: Stored on the instance as ``self.model``.
    """
    super(ConvLayer, self).__init__()
    gated_conv = nn.Conv1d(n_inputs, n_outputs * 2, kernel_size,
                           stride=stride, padding=padding, dilation=dilation)
    self.conv = weight_norm(gated_conv)

    self.dilation = dilation
    self.padding = padding
    self.model = model

    self.glu = nn.GLU(dim=1)
    self.dropout = nn.Dropout(dropout)

    # A 1x1 convolution is created only when the channel counts differ.
    if n_inputs != n_outputs:
        self.trans = nn.Conv1d(n_inputs, n_outputs, 1)
    else:
        self.trans = None

    self.init_weights()
def __init__(self, in_channel, channel, kernel_size, conv='wnconv2d',
             activation=nn.ELU, dropout=0.1, auxiliary_channel=0,
             condition_dim=0):
    """Build a gated residual block.

    Args:
        in_channel: Channels of the block input; ``conv2`` emits twice this
            so the GLU can gate back down to ``in_channel``.
        channel: Hidden channels produced by ``conv1``.
        kernel_size: Kernel size for both convolutions.
        conv: One of ``'wnconv2d'``, ``'causal_downright'`` or ``'causal'``.
        activation: Activation class, instantiated with ``inplace=True``.
        dropout: Dropout probability.
        auxiliary_channel: When positive, adds a 1x1 ``aux_conv``.
        condition_dim: When positive, adds a 1x1 ``condition`` convolution.

    Raises:
        ValueError: If ``conv`` is not one of the supported names.
    """
    super(GatedResBlock, self).__init__()

    if conv == 'wnconv2d':
        # The original built a TensorFlow/Keras layer here, which cannot be
        # called as conv_module(in, out, k) nor used inside an nn.Module.
        # kernel_size // 2 gives "same" padding for odd kernels.
        conv_module = partial(WNConv2d, padding=kernel_size // 2)
    elif conv == 'causal_downright':
        conv_module = partial(CausalConv2d, padding='downright')
    elif conv == 'causal':
        conv_module = partial(CausalConv2d, padding='causal')
    else:
        raise ValueError(
            "conv must be 'wnconv2d', 'causal_downright' or 'causal', "
            "got {!r}".format(conv))

    self.activation = activation(inplace=True)
    self.conv1 = conv_module(in_channel, channel, kernel_size)

    if auxiliary_channel > 0:
        self.aux_conv = WNConv2d(auxiliary_channel, channel, 1)

    self.dropout = nn.Dropout(dropout)
    self.conv2 = conv_module(channel, in_channel * 2, kernel_size)

    if condition_dim > 0:
        # self.condition = nn.Linear(condition_dim, in_channel * 2, bias=False)
        self.condition = WNConv2d(condition_dim, in_channel * 2, 1, bias=False)

    self.gate = nn.GLU(1)
def __init__(self, embed_dim, conv_dim, num_heads, kernel_size,
             weight_dropout=0.1, dropout=0.3, input_dropout=0.0,
             weight_softmax=True, encoder_glu=False, normalize_before=False):
    """Build a dynamic-convolution encoder layer.

    Args:
        embed_dim: Model width at the layer's input and output.
        conv_dim: Width seen by the dynamic convolution.
        num_heads, kernel_size, weight_dropout, weight_softmax: Forwarded
            to ``DynamicConv``.
        dropout, input_dropout: Stored as plain rates on the instance.
        encoder_glu: When true, the input projection is doubled and gated.
        normalize_before: Stored on the instance.
    """
    super().__init__()
    self.embed_dim = embed_dim
    self.conv_dim = conv_dim

    # With a GLU the projection emits twice conv_dim so gating halves it back.
    proj_width = 2 * self.conv_dim if encoder_glu else self.conv_dim
    self.linear1 = Linear(self.embed_dim, proj_width)
    self.act = nn.GLU() if encoder_glu else None

    self.conv = DynamicConv(
        self.conv_dim,
        kernel_size,
        padding_l=kernel_size - 1,
        weight_softmax=weight_softmax,
        num_heads=num_heads,
        weight_dropout=weight_dropout,
    )
    self.linear2 = Linear(self.conv_dim, self.embed_dim)

    self.dropout = dropout
    self.input_dropout = input_dropout
    self.normalize_before = normalize_before
    self.conv_layer_norm = LayerNorm(self.embed_dim)
def __init__(self, args, kernel_size=1):
    """Build a TaLKConv encoder layer from parsed arguments.

    Args:
        args: Namespace providing the ``encoder_*`` sizes and dropout rates.
        kernel_size: Used as both ``min_len_left`` and ``min_len_right``.
    """
    super().__init__()
    self.embed_dim = args.encoder_embed_dim
    self.conv_dim = args.encoder_conv_dim

    gated = bool(args.encoder_glu)
    # With a GLU the projection emits twice conv_dim so gating halves it back.
    proj_width = 2 * self.conv_dim if gated else self.conv_dim
    self.linear1 = Linear(self.embed_dim, proj_width)
    self.act = nn.GLU() if gated else None

    self.conv = TaLKConv(
        self.conv_dim,
        offsets_dropout=args.weight_dropout,
        decode=False,
        num_heads=args.encoder_attention_heads,
        min_len_left=kernel_size,
        min_len_right=kernel_size,
    )
    self.linear2 = Linear(self.conv_dim, self.embed_dim)

    self.dropout = args.dropout
    self.relu_dropout = args.relu_dropout
    self.input_dropout = args.input_dropout
    self.normalize_before = args.encoder_normalize_before

    # Feed-forward sub-layer.
    self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim)
    self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim)

    norms = []
    for _ in range(2):
        norms.append(LayerNorm(self.embed_dim))
    self.layer_norms = nn.ModuleList(norms)
def __init__(self, latent_dim=32, hidden_dim=256, input_dim=784, nb_layers=2,
             dropout_p=0.):
    """
    A simple MLP with gated (GLU) activations.

    :param latent_dim: input features
    :param hidden_dim: hidden features
    :param input_dim: nb output features OR parameters for the likelihood distribution
    :param nb_layers: number of gated hidden layers, excluding the output projection
    :param dropout_p: dropout probability applied after every gated layer
    """
    super().__init__()
    stack = []
    width_in = latent_dim
    for _ in range(nb_layers):
        # The linear layer emits 2 * hidden_dim; the GLU halves it back.
        stack.extend((
            nn.Linear(width_in, hidden_dim * 2),
            nn.GLU(dim=1),
            nn.Dropout(dropout_p),
        ))
        width_in = hidden_dim
    stack.append(nn.Linear(hidden_dim, input_dim))
    self.layers = nn.Sequential(*stack)
def __init__(self, in_feature=1):
    """Assemble the discriminator as a single sequential block.

    Args:
        in_feature: Channel count of the input.

    The shape comments are the original author's notes, not checked here.
    """
    super().__init__()
    stages = [
        # input: (1 x 1 x 24 x 128)
        ConvLayer2D(in_filter=in_feature, out_filter=256, kernel=(3, 3),
                    stride=(1, 2)),
        nn.GLU(dim=1),
        # input: (1 x 128 x 24 x 64)
        Downsample2D(in_filter=128, out_filter=256, kernel=(3, 3),
                     stride=(2, 2)),
        # input: (1 x 256 x 12 x 32)
        Downsample2D(in_filter=256, out_filter=512, kernel=(3, 3),
                     stride=(2, 2)),
        # input: (1 x 512 x 6 x 16)
        Downsample2D(in_filter=512, out_filter=1024, kernel=(6, 3),
                     stride=(1, 2)),
        # input: (1 x 1024 x 1 x 1)
        PermuteBlock(),
        # input?: (1 x 1024)
        nn.Linear(1024, 1),
    ]
    self.block = nn.Sequential(*stages)
def __init__(self, args, kernel_size=0):
    """Build a lightweight/dynamic convolution encoder layer.

    Args:
        args: Namespace providing the ``encoder_*`` sizes, the convolution
            type and the dropout rates.
        kernel_size: Convolution kernel size; also determines the padding.

    Raises:
        NotImplementedError: If ``args.encoder_conv_type`` is neither
            ``'lightweight'`` nor ``'dynamic'``.
    """
    super().__init__()
    self.embed_dim = args.encoder_embed_dim
    self.conv_dim = args.encoder_conv_dim

    if kernel_size % 2 == 1:
        padding_l = kernel_size // 2
    else:
        # Even kernels get an asymmetric (left, right) padding pair.
        padding_l = ((kernel_size - 1) // 2, kernel_size // 2)

    gated = bool(args.encoder_glu)
    proj_width = 2 * self.conv_dim if gated else self.conv_dim
    self.linear1 = Linear(self.embed_dim, proj_width)
    self.act = nn.GLU() if gated else None

    conv_type = args.encoder_conv_type
    if conv_type == 'lightweight':
        conv_cls = LightweightConv1dTBC
    elif conv_type == 'dynamic':
        conv_cls = DynamicConv1dTBC
    else:
        raise NotImplementedError
    self.conv = conv_cls(
        self.conv_dim,
        kernel_size,
        padding_l=padding_l,
        weight_softmax=args.weight_softmax,
        num_heads=args.encoder_attention_heads,
        weight_dropout=args.weight_dropout,
    )
    self.linear2 = Linear(self.conv_dim, self.embed_dim)

    self.dropout = args.dropout
    self.relu_dropout = args.relu_dropout
    self.input_dropout = args.input_dropout
    self.normalize_before = args.encoder_normalize_before

    # Feed-forward sub-layer.
    self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim)
    self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim)

    norms = []
    for _ in range(2):
        norms.append(LayerNorm(self.embed_dim))
    self.layer_norms = nn.ModuleList(norms)
def __init__(self, channels, kernel_size=31):
    """Build the pointwise / depthwise / pointwise convolution stack.

    Args:
        channels: Channel count at the module's input and output.
        kernel_size: Depthwise kernel size; must be odd.
    """
    super(ConvModule, self).__init__()
    assert (kernel_size - 1) % 2 == 0
    # An odd kernel with this padding keeps the sequence length unchanged.
    same_pad = (kernel_size - 1) // 2

    # Emits 2 * channels for the GLU to gate.
    self.pointwise_conv1 = nn.Conv1d(channels, 2 * channels, kernel_size=1,
                                     stride=1, padding=0)
    # groups == channels makes this convolution depthwise.
    self.depthwise_conv = nn.Conv1d(channels, channels,
                                    kernel_size=kernel_size, stride=1,
                                    padding=same_pad, groups=channels)
    self.batch_norm = nn.BatchNorm1d(channels)
    self.pointwise_conv2 = nn.Conv1d(channels, channels, kernel_size=1,
                                     stride=1, padding=0)
    self.glu_act = nn.GLU(dim=1)
    self.swish_act = Swish()
def __init__(self):
    """Register one instance of each activation module, all with defaults."""
    super(NNActivationModule, self).__init__()
    # Left out, as in the original: nn.MultiheadAttention(),
    # nn.Threshold(0.1, 20), nn.AdaptiveLogSoftmaxWithLoss().
    elementwise = [
        nn.ELU(),
        nn.Hardshrink(),
        nn.Hardsigmoid(),
        nn.Hardtanh(),
        nn.Hardswish(),
        nn.LeakyReLU(),
        nn.LogSigmoid(),
        nn.PReLU(),
        nn.ReLU(),
        nn.ReLU6(),
        nn.RReLU(),
        nn.SELU(),
        nn.CELU(),
        nn.GELU(),
        nn.Sigmoid(),
        nn.SiLU(),
        nn.Mish(),
        nn.Softplus(),
        nn.Softshrink(),
        nn.Softsign(),
        nn.Tanh(),
        nn.Tanhshrink(),
    ]
    gated_and_softmax = [
        nn.GLU(),
        nn.Softmin(),
        nn.Softmax(),
        nn.Softmax2d(),
        nn.LogSoftmax(),
    ]
    self.activations = nn.ModuleList(elementwise + gated_and_softmax)
def __init__(
    self,
    wshare,
    n_feat,
    dropout_rate,
    kernel_size,
    use_kernel_mask=False,
    use_bias=False,
):
    """Construct Dynamic Convolution layer.

    Args:
        wshare: Number of shared kernels; must divide ``n_feat``.
        n_feat: Feature width.
        dropout_rate: Stored on the instance as ``self.dropout_rate``.
        kernel_size: Convolution kernel size.
        use_kernel_mask: Stored on the instance.
        use_bias: When true, a bias parameter of size ``n_feat`` is created.
    """
    super(DynamicConvolution, self).__init__()

    assert n_feat % wshare == 0
    self.wshare = wshare
    self.use_kernel_mask = use_kernel_mask
    self.dropout_rate = dropout_rate
    self.kernel_size = kernel_size
    self.attn = None

    # linear -> GLU -- -> lightconv -> linear
    #               \        /
    #                 Linear
    self.linear1 = nn.Linear(n_feat, n_feat * 2)
    self.linear2 = nn.Linear(n_feat, n_feat)
    self.linear_weight = nn.Linear(n_feat, self.wshare * 1 * kernel_size)
    # xavier_uniform_ is the non-deprecated in-place initializer.
    nn.init.xavier_uniform_(self.linear_weight.weight)
    self.act = nn.GLU()

    # dynamic conv related
    self.use_bias = use_bias
    if self.use_bias:
        # torch.Tensor(n) is uninitialized memory; start from zeros instead.
        self.bias = nn.Parameter(torch.zeros(n_feat))
def __init__(self, dim_domain=16):
    """Build the discriminator with a projection and a speaker classifier.

    Args:
        dim_domain: Input width of the projection layer.
    """
    super(DiscriminatorV2, self).__init__()

    # Two parallel input convolutions and a channel-wise GLU.
    self.conv_in = nn.Conv2d(in_channels=1, out_channels=128, kernel_size=3,
                             stride=1, padding=1)
    self.conv_in_gate = nn.Conv2d(in_channels=1, out_channels=128,
                                  kernel_size=3, stride=1, padding=1)
    self.glu = nn.GLU(dim=1)

    height, width = 36, 128
    num_downsamples = 4
    channels = 128

    stages = []
    for _ in range(num_downsamples - 1):
        # Stride-2 stages that double the channel count each time.
        stages.append(DownSample_Block(in_channels=channels,
                                       out_channels=channels * 2,
                                       kernel_size=3, stride=2, padding=1))
        channels *= 2
    # Final stride-1 stage keeps the channel count.
    stages.append(DownSample_Block(in_channels=channels,
                                   out_channels=channels,
                                   kernel_size=(1, 5), stride=1,
                                   padding=(0, 2)))
    self.down_samples = nn.Sequential(*stages)

    # TODO check output shape [5, 32]
    self.fc = nn.Linear(channels, 1)
    self.projection = nn.Linear(dim_domain, channels)

    # PatchGAN classifier
    # TODO floor or ceil
    reduction = 2 ** (num_downsamples - 1)
    kernel_size_0 = math.ceil(height / reduction)  # ceil(36 / 8) = 5
    kernel_size_1 = math.ceil(width / reduction)  # ceil(128 / 8) = 16
    # make it a single value; 4 output channels (original note: "for num_speaker")
    self.conv_clf_spks = nn.Conv2d(channels, 4,
                                   kernel_size=(kernel_size_0, kernel_size_1),
                                   stride=1, padding=0, bias=False)
def forward(self, x):
    """Run the forward pass and return sigmoid-activated outputs.

    Args:
        x: Input tensor accepted by ``self.conv1``.

    Returns:
        Tensor: Output of ``self.fc`` passed through a sigmoid.
    """
    x = self.conv1(x)
    x = nn.GLU(dim=1)(x)

    # Sub-modules are registered as downsample_block1..3.
    for index in range(1, 4):
        x = getattr(self, "downsample_block" + str(index))(x)

    x = self.conv2(x)
    # Built with defaults, so this norm has no learnable or running state.
    x = nn.InstanceNorm2d(x.shape[1])(x)
    x = nn.GLU(dim=1)(x)

    x = self.conv3(x)

    # Flatten everything but the batch axis for the linear layer.
    x = x.view(x.shape[0], -1)
    x = self.fc(x)
    return nn.Sigmoid()(x)
def forward(self, x):
    """Run the downsample / residual / upsample forward pass.

    Args:
        x: 4-D input tensor; axes 2 and 3 are read as ``time`` and
            ``num_features`` to size the intermediate reshapes.

    Returns:
        The result of ``self.output`` applied to the reshaped ``conv4``
        activations.
    """
    time, num_features = x.shape[2], x.shape[3]
    # Same value as the original int(int((n + 1) / 2 + 1) / 2), computed in
    # integer arithmetic; presumably the axis length after two stride-2
    # stages — confirm against the downsample blocks.
    time_ds = ((time + 1) // 2 + 1) // 2
    feat_ds = ((num_features + 1) // 2 + 1) // 2

    x = self.conv1(x)
    x = nn.GLU(dim=1)(x)

    for index in (1, 2):
        x = getattr(self, "downsample_block" + str(index))(x)

    # Fold everything except the reduced time axis into channels.
    x = x.reshape((x.shape[0], -1, time_ds, 1))
    x = self.conv2(x)
    x = nn.InstanceNorm2d(x.shape[1])(x)

    for index in range(1, 7):
        x = getattr(self, "residual_block" + str(index))(x)

    x = self.conv3(x)
    x = nn.InstanceNorm2d(x.shape[1])(x)

    # Restore a 2-D (time, feature) layout.
    x = x.reshape(x.shape[0], -1, time_ds, feat_ds)

    for index in (1, 2):
        x = getattr(self, "upsample_block" + str(index))(x)

    x = self.conv4(x)

    # NOTE(review): reshape reinterprets memory, it does not transpose axes;
    # if a (B, W, H, C) transpose was intended this should be permute — confirm.
    x = x.reshape(x.shape[0], x.shape[3], x.shape[2], x.shape[1])
    return self.output(x)
def __init__(self, in_channel, channel, kernel_size, conv='wnconv2d',
             dropout=0.1, condition_dim=0, aux_channels=0):
    """Build a gated residual block with batch normalization.

    Args:
        in_channel: Channels of the block input; ``conv2`` emits twice this
            for the GLU to gate.
        channel: Hidden channels produced by ``conv1``.
        kernel_size: Kernel size for both convolutions.
        conv: One of ``'wnconv2d'``, ``'causal_downright'`` or ``'causal'``.
        dropout: Dropout probability.
        condition_dim: When positive, adds the 1x1 ``convc`` convolution.
        aux_channels: When positive, adds the 1x1 ``aux_conv`` convolution.
    """
    super().__init__()

    assert conv in [
        'wnconv2d', 'causal_downright', 'causal'
    ], "Invalid conv argument [wnconv2d, causal_downright, causal]"

    # Bind the padding behaviour now so both convolutions share it.
    if conv == 'wnconv2d':
        make_conv = partial(WNConv2d, padding=kernel_size // 2)
    elif conv == 'causal_downright':
        make_conv = partial(CausalConv2d, padding='downright')
    elif conv == 'causal':
        make_conv = partial(CausalConv2d, padding='causal')

    gated_width = in_channel * 2

    self.conv1 = make_conv(in_channel, channel, kernel_size)
    self.bn1 = nn.BatchNorm2d(channel)
    self.conv2 = make_conv(channel, gated_width, kernel_size)
    self.bn2 = nn.BatchNorm2d(gated_width)

    self.drop1 = nn.Dropout(dropout)

    if aux_channels > 0:
        self.aux_conv = WNConv2d(aux_channels, channel, 1)

    if condition_dim > 0:
        self.convc = WNConv2d(condition_dim, gated_width, 1, bias=False)

    # Original trailing note: "0 -> 1 === ReZero -> Residual".
    self.gate = nn.GLU(1)
def __init__(self, conv, p):
    """Wrap a convolution with weight norm, a GLU and in-place dropout.

    Args:
        conv: Convolution module exposing a ``weight`` tensor; it is
            re-initialized and weight-normalized in place.
        p: Dropout probability.
    """
    super().__init__()
    # Initialize before weight_norm re-parameterizes the weight.
    nn.init.kaiming_normal_(conv.weight)
    self.conv = weight_norm(conv)
    self.act = nn.GLU(1)
    self.dropout = nn.Dropout(p, inplace=True)
def __init__(
    self,
    *,
    chan_in,
    chan_out=3,
    num_upsamples=4,
    end_glu=True,
):
    """Build a chain of upsample -> conv -> GLU stages.

    Args:
        chan_in: Channels of the input feature map.
        chan_out: Channels of the final output.
        num_upsamples: Number of upsampling stages.
        end_glu: When true, every stage halves the channels and a plain
            convolution maps to ``chan_out`` at the end. When false, the
            last stage itself emits ``2 * chan_out`` for its GLU.
    """
    super().__init__()

    self.layers = nn.ModuleList([])
    final_chan = chan_out
    width = chan_in
    last_index = num_upsamples - 1

    for step in range(num_upsamples):
        keeps_width = end_glu or step != last_index
        conv_out = width if keeps_width else final_chan * 2
        stage = nn.Sequential(
            upsample(),
            nn.Conv2d(width, conv_out, 3, padding=1),
            nn.GLU(dim=1),
        )
        self.layers.append(stage)
        # The GLU halved the channels of this stage's output.
        width //= 2

    if end_glu:
        self.layers.append(nn.Conv2d(width, final_chan, 3, padding=1))
def __init__(self, d_model, kernel_size, num_heads, dropout,
             weight_softmax=True):
    """Build a lightweight-convolution block.

    Args:
        d_model: Model width.
        kernel_size: Convolution kernel size; also determines the padding.
        num_heads: Forwarded to ``LightweightConv``.
        dropout: Forwarded to ``LightweightConv`` as ``weight_dropout``.
        weight_softmax: Forwarded to ``LightweightConv``.
    """
    super(LConvBlock, self).__init__()
    self.embed_dim = d_model

    if kernel_size % 2 == 1:
        padding_l = kernel_size // 2
    else:
        # Even kernels get an asymmetric (left, right) padding pair.
        padding_l = ((kernel_size - 1) // 2, kernel_size // 2)

    # Emits 2 * embed_dim for the GLU to gate.
    self.act_linear = LinearNorm(self.embed_dim, 2 * self.embed_dim, bias=True)
    self.act = nn.GLU()

    self.conv_layer = LightweightConv(
        self.embed_dim,
        kernel_size,
        padding_l=padding_l,
        weight_softmax=weight_softmax,
        num_heads=num_heads,
        weight_dropout=dropout,
    )

    # Feed-forward sub-layer with a 4x expansion.
    ffn_width = 4 * self.embed_dim
    self.fc1 = LinearNorm(self.embed_dim, ffn_width, bias=True)
    self.fc2 = LinearNorm(ffn_width, self.embed_dim, bias=True)
    self.layer_norm = nn.LayerNorm(self.embed_dim)
def __init__(self, args, tgt_dict):
    """Build the feature extractor, encoder and projection heads.

    Args:
        args: Namespace with the model hyper-parameters read below.
        tgt_dict: Target dictionary; its length sets the output width of
            ``phone_proj``.
    """
    super().__init__()
    self.args = args

    # NOTE(review): eval() executes whatever expression the config string
    # holds; only safe when args.conv_feature_layers is trusted.
    feature_enc_layers = eval(args.conv_feature_layers)
    self.embed = feature_enc_layers[-1][0]

    self.feature_extractor = ConvFeatureExtractionModel(
        conv_layers=feature_enc_layers,
        dropout=0.0,
        mode=args.extractor_mode,
        conv_bias=args.conv_bias,
    )

    # Project extractor features only when the widths differ.
    if self.embed != args.encoder_embed_dim:
        self.post_extract_proj = nn.Linear(self.embed, args.encoder_embed_dim)
    else:
        self.post_extract_proj = None

    # Masking settings are copied across under the same names.
    for name in (
        "mask_prob",
        "mask_selection",
        "mask_other",
        "mask_length",
        "no_mask_overlap",
        "mask_min_space",
        "mask_channel_prob",
        "mask_channel_selection",
        "mask_channel_other",
        "mask_channel_length",
        "no_mask_channel_overlap",
        "mask_channel_min_space",
    ):
        setattr(self, name, getattr(args, name))

    self.dropout_input = nn.Dropout(args.dropout_input)
    self.dropout_features = nn.Dropout(args.dropout_features)

    self.feature_grad_mult = args.feature_grad_mult

    self.n_negatives = args.num_negatives
    self.cross_sample_negatives = args.cross_sample_negatives
    self.negatives_from_everywhere = args.negatives_from_everywhere

    self.logit_temp = args.logit_temp

    # A non-positive final_dim falls back to the encoder width.
    if args.final_dim > 0:
        final_dim = args.final_dim
    else:
        final_dim = args.encoder_embed_dim

    self.project_q = nn.Linear(self.embed, final_dim)

    self.mask_emb = nn.Parameter(
        torch.FloatTensor(args.encoder_embed_dim).uniform_())

    self.encoder = TransformerEncoder(args)
    self.layer_norm = LayerNorm(self.embed)

    if args.target_glu:
        self.target_glu = nn.Sequential(
            nn.Linear(final_dim, final_dim * 2), nn.GLU())
    else:
        self.target_glu = None

    self.final_proj = nn.Linear(args.encoder_embed_dim, final_dim)
    self.phone_proj = nn.Linear(args.encoder_embed_dim, len(tgt_dict))
def __init__(
    self,
    attention_dropout,
    decoder_attention_heads,
    self_attention_heads,
    decoder_conv_dim,
    # ARBABU: need to remove these two type parameters
    decoder_conv_type,
    attention_type,
    self_attention_type,
    decoder_embed_dim,
    decoder_ffn_embed_dim,
    decoder_glu,
    decoder_normalize_before,
    dropout,
    input_dropout,
    relu_dropout,
    need_attention,
    convolution_type,
    conv=None,
    self_attention=None,
    attention=None,
):
    """Assemble a decoder layer from pre-built convolution/attention modules.

    Only the arguments read below take effect; the rest are accepted for
    interface compatibility. When ``attention`` or ``self_attention`` is
    ``None`` a placeholder module is installed in its place.
    """
    super().__init__()
    self.embed_dim = decoder_embed_dim
    self.conv_dim = decoder_conv_dim

    # With a GLU the projection emits twice conv_dim so gating halves it back.
    proj_width = 2 * self.conv_dim if decoder_glu else self.conv_dim
    self.linear1 = Linear(self.embed_dim, proj_width)
    self.act = nn.GLU() if decoder_glu else PlaceholderIdentity()

    self.conv = conv
    self.linear2 = Linear(self.conv_dim, self.embed_dim)

    self.dropout = dropout
    self.relu_dropout = relu_dropout
    self.input_dropout = input_dropout
    self.normalize_before = decoder_normalize_before
    self.conv_layer_norm = LayerNorm(self.embed_dim)

    # Encoder-decoder attention (placeholders when absent).
    self.no_encoder_attn = attention is None
    if self.no_encoder_attn:
        self.encoder_attn = PlaceholderAttentionIdentity()
        self.encoder_attn_layer_norm = PlaceholderIdentity()
    else:
        self.encoder_attn = attention
        self.encoder_attn_layer_norm = LayerNorm(self.embed_dim)

    # Self-attention (placeholder when absent).
    self.has_self_attn = self_attention is not None
    if self.has_self_attn:
        self.self_attn = self_attention
    else:
        self.self_attn = PlaceholderAttentionIdentity()

    # Feed-forward sub-layer.
    self.fc1 = Linear(self.embed_dim, decoder_ffn_embed_dim)
    self.fc2 = Linear(decoder_ffn_embed_dim, self.embed_dim)

    self.final_layer_norm = LayerNorm(self.embed_dim)
    self.need_attn = need_attention
def __init__(self, init_channels, cond_dim, z_dim, n_upsample=4):
    """Init.

    Args:
        init_channels(int): # channels of initial representation.
        cond_dim(int): dimension of conditioning variable.
        z_dim(int): dimension of noise.
        n_upsample(int, optional): # upsampling blocks, default=`4`.

    Raises:
        AssertionError: halving init_channels n_upsample times reaches 0.
    """
    super().__init__()

    # Project the input to 4 x 4 x init_channels; the factor of 2 is
    # consumed by the GLU. The reshape to a feature map happens in forward().
    fc_width = 4 * 4 * init_channels * 2
    self.fc = nn.Sequential(
        nn.Linear(cond_dim + z_dim, fc_width, bias=False),
        nn.BatchNorm1d(fc_width),
        nn.GLU(1),
    )

    # Halve the channel count with each block.
    channels = []
    for level in range(n_upsample + 1):
        channels.append(init_channels // 2**level)
    assert channels[-1] > 0, 'Too many upsampling blocks / Too few channels'

    blocks = [
        UpsamplingBlock(in_ch, out_ch, 3)
        for in_ch, out_ch in zip(channels, channels[1:])
    ]
    self.upsampling_blocks = nn.Sequential(*blocks)