def __init__(self, n_fft=2048, hop_length=None, win_length=None,
             window='hann', center=True, pad_mode='reflect',
             freeze_parameters=True):
    """Build an inverse STFT whose inverse DFT lives in Conv1d weights.

    Two bias-free ``nn.Conv1d`` layers with kernel size 1 receive the real
    and imaginary parts of the windowed, ``1 / n_fft``-scaled inverse DFT
    matrix as their weights.

    Args:
        n_fft: FFT size; also the in/out channel count of both convolutions.
        hop_length: Hop between frames. Defaults to ``win_length // 4``.
        win_length: Window length. Defaults to ``n_fft``.
        window: Window specification forwarded to
            ``librosa.filters.get_window``.
        center: Stored on the instance; not used by this constructor.
        pad_mode: Either ``'constant'`` or ``'reflect'``; stored on the
            instance.
        freeze_parameters: When true, every parameter of the module gets
            ``requires_grad = False``.

    Raises:
        AssertionError: If ``pad_mode`` is not one of the accepted values.
    """
    super(ISTFT, self).__init__()

    assert pad_mode in ['constant', 'reflect']

    # By default, use the entire frame.
    if win_length is None:
        win_length = n_fft

    # Set the default hop, if it's not already specified.
    if hop_length is None:
        hop_length = int(win_length // 4)

    self.n_fft = n_fft
    # Keep the resolved values (never None) so that code reading these
    # attributes later can do arithmetic with them.
    self.hop_length = hop_length
    self.win_length = win_length
    self.window = window
    self.center = center
    self.pad_mode = pad_mode

    ifft_window = librosa.filters.get_window(window, win_length, fftbins=True)

    # Pad the window out to n_fft size. `size` is passed by keyword because
    # newer librosa releases made it keyword-only; older ones accept it too.
    ifft_window = librosa.util.pad_center(ifft_window, size=n_fft)

    # Inverse DFT matrix, scaled by 1 / n_fft.
    self.W = self.idft_matrix(n_fft) / n_fft

    self.conv_real = nn.Conv1d(in_channels=n_fft, out_channels=n_fft,
                               kernel_size=1, stride=1, padding=0,
                               dilation=1, groups=1, bias=False)

    self.conv_imag = nn.Conv1d(in_channels=n_fft, out_channels=n_fft,
                               kernel_size=1, stride=1, padding=0,
                               dilation=1, groups=1, bias=False)

    # Weights are (n_fft, n_fft, 1), assuming idft_matrix returns an
    # (n_fft, n_fft) array — TODO confirm against idft_matrix.
    windowed_idft = self.W * ifft_window[None, :]
    self.conv_real.weight.data = torch.Tensor(
        np.real(windowed_idft).T)[:, :, None]
    self.conv_imag.weight.data = torch.Tensor(
        np.imag(windowed_idft).T)[:, :, None]

    if freeze_parameters:
        for param in self.parameters():
            param.requires_grad = False
def __init__(
        self,
        learn_pooling: bool = True,
        learn_filters: bool = True,
        conv1d_cls=convolution.GaborConv1D,
        activation=activations.SquaredModulus(),
        pooling_cls=pooling.GaussianLowpass,
        n_filters: int = 40,
        sample_rate: int = 16000,
        window_len: float = 25.,
        window_stride: float = 10.,
        compression_fn = postprocessing.PCEN(
            alpha=0.96,
            smooth_coef=0.04,
            delta=2.0,
            floor=1e-12,
            trainable=True,
            learn_smooth_coef=True,
            per_channel_smooth_coef=True),
        preemp: bool = False,
        preemp_init = initializers.PreempInit,
        complex_conv_init = initializers.GaborInit(
            sample_rate=16000, min_freq=60.0, max_freq=7800.0),
        pooling_init = initializers.ConstInit(0.4),
        regularizer_fn = None,
        mean_var_norm: bool = False,
        spec_augment: bool = False):
    """Assemble the frontend: optional pre-emphasis, filters, pooling, compression.

    Args:
        learn_pooling: Passed as ``trainable`` to the pooling layer; also
            decides whether ``regularizer_fn`` is handed to it.
        learn_filters: Passed as ``trainable`` to the filter convolution and
            used as ``requires_grad`` for the pre-emphasis convolution.
        conv1d_cls: Class used to build the filter convolution.
        activation: Callable stored as the activation.
        pooling_cls: Class used to build the pooling layer.
        n_filters: Number of filters; the convolution gets ``2 * n_filters``.
        sample_rate: Sample rate used to convert the window sizes to samples.
        window_len: Window length (presumably milliseconds, given the
            division by 1000 — confirm).
        window_stride: Window stride, same unit as ``window_len``.
        compression_fn: Callable applied as compression; ``torch.clone`` is
            used when it is falsy.
        preemp: Whether to create the pre-emphasis convolution.
        preemp_init: Accepted but not used by this constructor.
        complex_conv_init: Kernel initializer for the filter convolution.
        pooling_init: Kernel initializer for the pooling layer.
        regularizer_fn: Optional kernel regularizer.
        mean_var_norm: Whether to create an ``InstanceNorm1d`` layer.
        spec_augment: Whether to use ``postprocessing.SpecAugment``;
            otherwise ``torch.clone`` is stored.

    Note:
        The default ``activation``, ``compression_fn``, ``complex_conv_init``
        and ``pooling_init`` objects are created once, when the function is
        defined, and are therefore shared by every instance that relies on
        the defaults.
    """
    super(Leaf, self).__init__()

    window_size = int(sample_rate * window_len // 1000 + 1)
    window_stride = int(sample_rate * window_stride // 1000)

    # TODO: All tf 'SAME' paddings are set to 0, check if it's ok
    if preemp:
        self._preemp_conv = nn.Conv1d(
            in_channels=1,
            out_channels=1,
            kernel_size=2,
            stride=1,
            padding=0,
            bias=False,
        )
        # `parameters` is a method and must be called to obtain the iterator.
        for parameter in self._preemp_conv.parameters():
            parameter.requires_grad = learn_filters

    self._complex_conv = conv1d_cls(
        filters=2 * n_filters,
        kernel_size=window_size,
        strides=1,
        padding=0,
        use_bias=False,
        # input_shape=(None, None, 1),
        kernel_initializer=complex_conv_init,
        kernel_regularizer=regularizer_fn if learn_filters else None,
        trainable=learn_filters)

    self._activation = activation

    self._pooling = pooling_cls(
        kernel_size=window_size,
        strides=window_stride,
        padding=0,
        use_bias=False,
        kernel_initializer=pooling_init,
        kernel_regularizer=regularizer_fn if learn_pooling else None,
        trainable=learn_pooling)

    if mean_var_norm:
        self._instance_norm = nn.InstanceNorm1d(n_filters, affine=True, eps=1e-6)

    # torch.clone serves as the pass-through when a stage is disabled.
    self._compress_fn = compression_fn if compression_fn else torch.clone
    self._spec_augment_fn = postprocessing.SpecAugment() if spec_augment else torch.clone

    self._preemp = preemp
def __init__(self, input_size, output_size):
    """Create the positional encoding and the two kernel-3 convolutions.

    Args:
        input_size: Channel count of the input; also the width of the
            positional encoding and of the first convolution's output.
        output_size: The second convolution emits ``2 * output_size``
            channels.
    """
    super().__init__()

    # Both convolutions share kernel size 3 with padding 1.
    conv_kwargs = dict(kernel_size=3, padding=1)

    self.encoding = PositionalEncoding(input_size)
    self.input_conv = nn.Conv1d(input_size, input_size, **conv_kwargs)
    self.output_conv = nn.Conv1d(input_size, 2 * output_size, **conv_kwargs)

    self.reset_parameters()
def __init__(self, input_size, use_stn=False, use_attention=False):
    """Build the encoder/decoder stack and the optional STN and attention parts.

    Args:
        input_size: Used to size the STN fully connected layer (via
            ``get_stn_conv_out``) and the attention linear layers.
        use_stn: When true, create the ``stn_conv`` / ``stn_fc`` modules and
            the ``restriction_const`` buffer.
        use_attention: When true, create the ``attn`` module and
            ``attn_len``.
    """
    super(PPG2ECG, self).__init__()
    self.use_stn = use_stn
    self.use_attention = use_attention

    # (in_channels, out_channels, stride); every layer uses kernel 31, padding 15.
    encoder_spec = [(1, 32, 2), (32, 64, 1), (64, 128, 2),
                    (128, 256, 1), (256, 512, 2)]
    decoder_spec = [(512, 256, 2), (256, 128, 1), (128, 64, 2),
                    (64, 32, 1), (32, 1, 2)]

    stages = []
    # encoder
    for c_in, c_out, step in encoder_spec:
        stages.append(nn.Conv1d(c_in, c_out, kernel_size=31, stride=step,
                                padding=15))
        stages.append(nn.PReLU(c_out))
    # decoder: stride-2 layers use output_padding=1, stride-1 layers use 0
    for c_in, c_out, step in decoder_spec:
        stages.append(nn.ConvTranspose1d(c_in, c_out, kernel_size=31,
                                         stride=step, padding=15,
                                         output_padding=step - 1))
        # The final single-channel layer is followed by Tanh instead of PReLU.
        stages.append(nn.Tanh() if c_out == 1 else nn.PReLU(c_out))
    self.main = nn.Sequential(*stages)

    # build stn (optional)
    if use_stn:
        # pylint: disable=not-callable
        self.restriction = torch.tensor(
            [1, 0, 0, 0], dtype=torch.float, requires_grad=False)
        self.register_buffer('restriction_const', self.restriction)

        self.stn_conv = nn.Sequential(
            nn.Conv1d(1, 8, kernel_size=7, stride=1),
            nn.MaxPool1d(kernel_size=2, stride=2),
            nn.Conv1d(8, 10, kernel_size=5, stride=1),
            nn.MaxPool1d(kernel_size=2, stride=2),
        )

        n_stn_conv = self.get_stn_conv_out(input_size)
        hidden_fc = nn.Linear(n_stn_conv, 32)
        output_fc = nn.Linear(32, 4)
        self.stn_fc = nn.Sequential(Flatten(), hidden_fc, nn.ReLU(True),
                                    output_fc)
        # Final layer starts with zero weights and a fixed bias.
        output_fc.weight.data.zero_()
        output_fc.bias.data = torch.FloatTensor([1, 0, 1, 0])

    # build attention network (optional)
    if use_attention:
        self.attn = nn.Sequential(
            nn.Linear(input_size, input_size),
            nn.ReLU(),
            nn.Linear(input_size, input_size),
        )
        self.attn_len = input_size
def __init__(self, in_c, out_c, ks=3, stride=1, padding=1, bias=False):
    """Create a Conv1d followed by a LeakyReLU and record the channel counts.

    Args:
        in_c: Input channels; stored as ``in_size``.
        out_c: Output channels; stored as ``out_size``.
        ks: Kernel size.
        stride: Convolution stride.
        padding: Convolution padding.
        bias: Whether the convolution has a bias term.
    """
    super(Conv, self).__init__()
    self.in_size, self.out_size = in_c, out_c
    self.conv = nn.Conv1d(in_c, out_c, ks,
                          stride=stride, padding=padding, bias=bias)
    self.act = nn.LeakyReLU()
def __init__(self):
    """Create a 51-to-1 channel 1x1 Conv1d, a 2053-to-128 Linear and a Tanh."""
    super(ImageNet, self).__init__()
    self.conv1 = nn.Conv1d(51, 1, kernel_size=1)
    self.fc1 = nn.Linear(in_features=2053, out_features=128)
    self.tanh = torch.nn.Tanh()
def __init__(self, latent_caps_size, latent_vec_size, num_classes):
    """Store the sizes and create the 1x1 segmentation convolution.

    Args:
        latent_caps_size: Stored on the instance.
        latent_vec_size: The convolution takes ``latent_vec_size + 16``
            input channels.
        num_classes: Output channels of the convolution; stored on the
            instance.
    """
    super(CapsSegNet, self).__init__()
    self.latent_caps_size = latent_caps_size
    self.num_classes = num_classes

    seg_in_channels = latent_vec_size + 16
    self.seg_convs = nn.Conv1d(in_channels=seg_in_channels,
                               out_channels=num_classes,
                               kernel_size=1)
def __init__(self,
             sources=4,
             audio_channels=2,
             channels=64,
             depth=6,
             rewrite=True,
             glu=True,
             upsample=False,
             rescale=0.1,
             kernel_size=8,
             stride=4,
             growth=2.,
             lstm_layers=2,
             context=3):
    """Build the encoder/decoder stacks, the optional LSTM and final conv.

    Every argument is printed to stdout when the module is constructed.

    Args:
        sources (int): number of sources to separate.
        audio_channels (int): number of audio channels of the input.
        channels (int): output channels of the first encoder convolution.
        depth (int): number of encoder/decoder layers.
        rewrite (bool): add a 1x1 convolution to each encoder layer and a
            convolution of kernel size `context` to each decoder layer.
        glu (bool): use GLU instead of ReLU after the rewrite convolutions.
        upsample (bool): use stride-1 convolutions in the decoder plus a
            final 1x1 convolution instead of transposed convolutions.
        rescale (float): if truthy, passed as `reference` to
            `rescale_module` once all layers exist.
        kernel_size (int): kernel size of the main convolutions.
        stride (int): stride of the main convolutions (forced to 1 for the
            layers when `upsample` is set).
        growth (float): channel multiplier from one layer to the next.
        lstm_layers (int): number of LSTM layers, 0 disables the LSTM.
        context (int): kernel size of the decoder rewrite convolution.
    """
    super().__init__()

    # Echo the configuration, one "name value" entry per print call.
    labels = [
        'sources', 'audio_channels', 'channels', 'depth', 'rewrite', 'glu',
        'upsample', 'rescale', 'kernel_size', 'stride', 'growth',
        'lstm_layers', 'context'
    ]
    values = [
        sources, audio_channels, channels, depth, rewrite, glu, upsample,
        rescale, kernel_size, stride, growth, lstm_layers, context
    ]
    for label, value in zip(labels, values):
        print(label, value, '\n')

    self.audio_channels = audio_channels
    self.sources = sources
    self.kernel_size = kernel_size
    self.context = context
    self.stride = stride
    self.depth = depth
    self.upsample = upsample
    self.channels = channels

    self.encoder = nn.ModuleList()
    self.decoder = nn.ModuleList()

    self.final = None
    if upsample:
        self.final = nn.Conv1d(channels + audio_channels,
                               sources * audio_channels, 1)
        # self.stride keeps the requested value; only the layers use 1.
        stride = 1

    # GLU halves the channel dimension, so its convolution emits twice as many.
    activation, ch_scale = (nn.GLU(dim=1), 2) if glu else (nn.ReLU(), 1)

    in_channels = audio_channels
    for index in range(depth):
        encode = [
            nn.Conv1d(in_channels, channels, kernel_size, stride),
            nn.ReLU(),
        ]
        if rewrite:
            encode.append(nn.Conv1d(channels, ch_scale * channels, 1))
            encode.append(activation)
        self.encoder.append(nn.Sequential(*encode))

        if index > 0:
            out_channels = in_channels
        elif upsample:
            out_channels = channels
        else:
            out_channels = sources * audio_channels

        decode = []
        if rewrite:
            decode.append(nn.Conv1d(channels, ch_scale * channels, context))
            decode.append(activation)
        if upsample:
            decode.append(
                nn.Conv1d(channels, out_channels, kernel_size, stride=1))
        else:
            decode.append(
                nn.ConvTranspose1d(channels, out_channels, kernel_size,
                                   stride))
        if index > 0:
            decode.append(nn.ReLU())
        # Decoder layers are stored in reverse order of the encoder layers.
        self.decoder.insert(0, nn.Sequential(*decode))

        in_channels = channels
        channels = int(growth * channels)

    # Channel count produced by the innermost encoder layer.
    channels = in_channels

    self.lstm = BLSTM(channels, lstm_layers) if lstm_layers else None

    if rescale:
        rescale_module(self, reference=rescale)
def __init__(self, num_classes: int = 40, input_type: str = "waveform", num_features: int = 1) -> None:
    """Build the convolutional acoustic model, plus a waveform front-end if needed.

    Args:
        num_classes: Output channels of the last convolution.
        input_type: One of ``"waveform"``, ``"power_spectrum"`` or
            ``"mfcc"``. For ``"waveform"`` an extra strided convolution is
            placed in front of the acoustic model.
        num_features: Input channels of the first convolution that sees the
            data.

    Raises:
        ValueError: If ``input_type`` is not one of the supported values.
            Without this check ``self.acoustic_model`` would be left unset.
    """
    super(Wav2Letter, self).__init__()

    supported_input_types = ("waveform", "power_spectrum", "mfcc")
    if input_type not in supported_input_types:
        raise ValueError(
            "input_type must be one of {}, got {!r}".format(
                supported_input_types, input_type))

    # With a waveform input the front-end convolution already emits 250 channels.
    acoustic_num_features = 250 if input_type == "waveform" else num_features

    layers = [
        nn.Conv1d(in_channels=acoustic_num_features, out_channels=250,
                  kernel_size=48, stride=2, padding=23),
        nn.ReLU(inplace=True),
    ]
    # Seven identical 250-channel, kernel-7 convolutions.
    for _ in range(7):
        layers.append(nn.Conv1d(in_channels=250, out_channels=250,
                                kernel_size=7, stride=1, padding=3))
        layers.append(nn.ReLU(inplace=True))
    layers += [
        nn.Conv1d(in_channels=250, out_channels=2000, kernel_size=32,
                  stride=1, padding=16),
        nn.ReLU(inplace=True),
        nn.Conv1d(in_channels=2000, out_channels=2000, kernel_size=1,
                  stride=1, padding=0),
        nn.ReLU(inplace=True),
        nn.Conv1d(in_channels=2000, out_channels=num_classes, kernel_size=1,
                  stride=1, padding=0),
        nn.ReLU(inplace=True),
    ]
    acoustic_model = nn.Sequential(*layers)

    if input_type == "waveform":
        waveform_model = nn.Sequential(
            nn.Conv1d(in_channels=num_features, out_channels=250,
                      kernel_size=250, stride=160, padding=45),
            nn.ReLU(inplace=True),
        )
        self.acoustic_model = nn.Sequential(waveform_model, acoustic_model)
    else:
        # "power_spectrum" and "mfcc" feed the acoustic model directly.
        self.acoustic_model = acoustic_model
def __init__(self):
    """Create all layers of the generator.

    The layers are, in creation order: a 2D convolution with a parallel
    gate convolution, two 2D down-sampling blocks, a 2D-to-1D projection,
    six residual blocks, a 1D-to-2D projection, two up-sampling blocks and
    the last 2D convolution.
    """
    super(Generator, self).__init__()

    # 2D Conv Layer
    self.conv1 = nn.Conv2d(in_channels=1,  # TODO 1 ?
                           out_channels=128,
                           kernel_size=(5, 15),
                           stride=(1, 1),
                           padding=(2, 7))
    self.conv1_gates = nn.Conv2d(in_channels=1,  # TODO 1 ?
                                 out_channels=128,
                                 kernel_size=(5, 15),
                                 stride=1,
                                 padding=(2, 7))

    # 2D Downsample Layer
    self.downSample1 = downSample_Generator(
        in_channels=128, out_channels=256, kernel_size=5, stride=2, padding=2)
    self.downSample2 = downSample_Generator(
        in_channels=256, out_channels=256, kernel_size=5, stride=2, padding=2)

    # 2D -> 1D Conv
    self.conv2dto1dLayer = nn.Sequential(
        nn.Conv1d(in_channels=2304, out_channels=256, kernel_size=1,
                  stride=1, padding=0),
        nn.InstanceNorm1d(num_features=256, affine=True),
    )

    # Residual Blocks: residualLayer1 .. residualLayer6, identically configured
    for block_no in range(1, 7):
        setattr(self, 'residualLayer{}'.format(block_no),
                ResidualLayer(in_channels=256, out_channels=512,
                              kernel_size=3, stride=1, padding=1))

    # 1D -> 2D Conv
    self.conv1dto2dLayer = nn.Sequential(
        nn.Conv1d(in_channels=256, out_channels=2304, kernel_size=1,
                  stride=1, padding=0),
        nn.InstanceNorm1d(num_features=2304, affine=True),
    )

    # UpSample Layer
    self.upSample1 = self.upSample(
        in_channels=256, out_channels=1024, kernel_size=5, stride=1, padding=2)
    self.upSample2 = self.upSample(
        in_channels=256, out_channels=512, kernel_size=5, stride=1, padding=2)

    self.lastConvLayer = nn.Conv2d(in_channels=128,
                                   out_channels=1,
                                   kernel_size=(5, 15),
                                   stride=(1, 1),
                                   padding=(2, 7))
def __init__(self, dropout=0.5):
    """Create a dropout layer and a 256-channel, kernel-5 Conv1d (padding 2).

    Args:
        dropout: Drop probability of the dropout layer.
    """
    super(Conv, self).__init__()
    self.dropout = nn.Dropout(p=dropout)
    self.conv = nn.Conv1d(in_channels=256, out_channels=256,
                          kernel_size=5, padding=2)
def __init__(self, args):
    """Create every convolution, bilateral convolution and correlation layer.

    Args:
        args: Configuration object. This constructor reads
            ``scales_filter_map`` (must have 5 entries), ``dim``,
            ``DEVICE``, ``use_leaky``, ``bcn_use_bias``, ``bcn_use_norm``
            and ``last_relu`` from it.
    """
    super(HPLFlowNetShallow, self).__init__()
    self.scales_filter_map = args.scales_filter_map
    assert len(self.scales_filter_map) == 5

    conv_module = Conv1dReLU
    # Recurring channel offset: args.dim + 1.
    extra = args.dim + 1

    def conv_stack(widths):
        """Chain conv_module layers through consecutive channel widths."""
        return nn.Sequential(*[
            conv_module(c_in, c_out, use_leaky=args.use_leaky)
            for c_in, c_out in zip(widths[:-1], widths[1:])
        ])

    def bilateral(level, in_channels, out_channels, splat):
        """BilateralConvFlex at a scale level; it either splats or slices."""
        return BilateralConvFlex(
            args.dim, self.scales_filter_map[level][1],
            in_channels, [out_channels],
            args.DEVICE,
            use_bias=args.bcn_use_bias,
            use_leaky=args.use_leaky,
            use_norm=args.bcn_use_norm,
            do_splat=splat,
            do_slice=not splat,
            last_relu=args.last_relu)

    def correlation(level, prev_corr_dim):
        """BilateralCorrelationFlex at a scale level."""
        return BilateralCorrelationFlex(
            args.dim,
            self.scales_filter_map[level][2], self.scales_filter_map[level][3],
            64, [32], [32],
            args.DEVICE,
            use_bias=args.bcn_use_bias,
            use_leaky=args.use_leaky,
            use_norm=args.bcn_use_norm,
            prev_corr_dim=prev_corr_dim,
            last_relu=args.last_relu)

    self.conv1 = conv_stack([args.dim, 32, 32, 64])

    self.bcn1 = bilateral(0, 64 + extra, 64, splat=True)
    self.bcn1_ = bilateral(0, extra + 64 + 64, 128, splat=False)

    self.bcn2 = bilateral(1, 64 + extra, 64, splat=True)
    self.bcn2_ = bilateral(1, extra + 64 + 64, 64, splat=False)

    self.bcn3 = bilateral(2, 64 + extra, 64, splat=True)
    self.bcn3_ = bilateral(2, extra + 64 * 2 + 64, 64, splat=False)

    self.corr1 = correlation(2, prev_corr_dim=0)
    self.corr1_refine = conv_stack([32 + extra, 64, 64, 64])

    self.bcn4 = bilateral(3, 64 + extra, 64, splat=True)
    self.bcn4_ = bilateral(3, extra + 64 * 2 + 64, 64, splat=False)

    self.corr2 = correlation(3, prev_corr_dim=64)
    self.corr2_refine = conv_stack([32 + extra, 64, 64, 64])

    self.bcn5 = bilateral(4, 64 + extra, 64, splat=True)
    self.bcn5_ = bilateral(4, 64 + 64, 64, splat=False)

    self.corr3 = correlation(4, prev_corr_dim=64)
    self.corr3_refine = conv_stack([32, 64, 64, 64])

    self.conv2 = conv_module(128, 1024, use_leaky=args.use_leaky)
    self.conv3 = conv_module(1024, 512, use_leaky=args.use_leaky)
    self.conv4 = nn.Conv1d(512, 3, kernel_size=1)
def __init__(self, input_channel, output_size, batch_norm, use_pooling,
             pooling_method, conv1_kernel_size, conv1_num_kernels,
             conv1_stride, conv1_dropout, pool1_kernel_size, pool1_stride,
             conv2_kernel_size, conv2_num_kernels, conv2_stride,
             conv2_dropout, pool2_kernel_size, pool2_stride, fcs_hidden_size,
             fcs_num_hidden_layers, fcs_dropout):
    """Build two conv/pool stages followed by a fully connected network.

    The per-channel input length is ``output_size / input_channel``. The
    length after each unpadded convolution or pooling layer is tracked as
    ``(length - kernel_size) // stride + 1`` so that the fully connected
    input size is an integer.

    Args:
        input_channel: Input channels of the first convolution.
        output_size: Output size of the fully connected network; must be a
            multiple of ``input_channel``.
        batch_norm: When ``True``, create a BatchNorm1d after each
            convolution; also forwarded to ``FullyConnectedNet``.
        use_pooling: When ``False``, both pooling layers use kernel size 1
            and stride 1.
        pooling_method: Accepted but not used here; max pooling is always
            created.
        conv1_kernel_size, conv1_num_kernels, conv1_stride, conv1_dropout:
            Settings of the first convolution and its dropout.
        pool1_kernel_size, pool1_stride: Settings of the first pooling layer.
        conv2_kernel_size, conv2_num_kernels, conv2_stride, conv2_dropout:
            Settings of the second convolution and its dropout.
        pool2_kernel_size, pool2_stride: Settings of the second pooling
            layer.
        fcs_hidden_size, fcs_num_hidden_layers, fcs_dropout: Forwarded to
            ``FullyConnectedNet``.

    Raises:
        ValueError: If ``output_size`` is not divisible by ``input_channel``.
    """
    super(LeNet_1D, self).__init__()

    # Instance attributes for use in self.forward() later.
    self.input_channel = input_channel
    self.batch_norm = batch_norm

    input_size = output_size / self.input_channel
    if input_size.is_integer():
        input_size = int(input_size)
    else:
        raise ValueError(
            'output_size / input_channel = {} / {} = {}'.format(
                output_size, input_channel, input_size))

    # If not using pooling, set all pooling operations to 1 by 1.
    if use_pooling is False:
        pool1_kernel_size = 1
        pool1_stride = 1
        pool2_kernel_size = 1
        pool2_stride = 1

    # Conv1
    conv1_length = (input_size - conv1_kernel_size) // conv1_stride + 1
    self.conv1 = nn.Conv1d(input_channel, conv1_num_kernels,
                           conv1_kernel_size, stride=conv1_stride)
    nn.init.kaiming_normal_(self.conv1.weight.data)
    self.conv1.bias.data.fill_(0)
    self.conv1_drop = nn.Dropout2d(p=conv1_dropout)
    if batch_norm is True:
        self.batch_norm1 = nn.BatchNorm1d(conv1_num_kernels)

    # Pool1
    pool1_length = (conv1_length - pool1_kernel_size) // pool1_stride + 1
    self.pool1 = nn.MaxPool1d(pool1_kernel_size, stride=pool1_stride)

    # Conv2
    conv2_length = (pool1_length - conv2_kernel_size) // conv2_stride + 1
    self.conv2 = nn.Conv1d(conv1_num_kernels, conv2_num_kernels,
                           conv2_kernel_size, stride=conv2_stride)
    nn.init.kaiming_normal_(self.conv2.weight.data)
    self.conv2.bias.data.fill_(0)
    self.conv2_drop = nn.Dropout2d(p=conv2_dropout)
    if batch_norm is True:
        self.batch_norm2 = nn.BatchNorm1d(conv2_num_kernels)

    # Pool2
    pool2_length = (conv2_length - pool2_kernel_size) // pool2_stride + 1
    self.pool2 = nn.MaxPool1d(pool2_kernel_size, stride=pool2_stride)

    # FCs: flattened size is channels times remaining length.
    fcs_input_size = conv2_num_kernels * pool2_length
    self.fcs = FullyConnectedNet(fcs_input_size, output_size, fcs_dropout,
                                 batch_norm, fcs_hidden_size,
                                 fcs_num_hidden_layers)
def __init__(self, class_size, style_size, hidden_size=128, n_out=1, emb_style=0):
    """Build the 1D CNN, the mean/std parameters and the optional style embedder.

    Args:
        class_size: Together with ``style_size`` gives the CNN input channels.
        style_size: Style vector width; also the width of the style
            embedding linear layers.
        hidden_size: Channels of the first convolution; later layers use a
            half and a quarter of it.
        n_out: Output channels of the final convolution.
        emb_style: If greater than 0, ``int(emb_style)`` linear blocks are
            created for the style embedding; the dropout rate is 0.125 when
            the value is exactly of type ``float`` and 0.5 otherwise.
    """
    super(CountCNN, self).__init__()

    half = hidden_size // 2
    quarter = hidden_size // 4
    conv3 = dict(kernel_size=3, stride=1, padding=1)

    self.cnn = nn.Sequential(
        nn.Conv1d(class_size + style_size, hidden_size, **conv3),
        nn.GroupNorm(getGroupSize(hidden_size), hidden_size),
        nn.Dropout2d(0.1),
        nn.ReLU(inplace=True),
        nn.Conv1d(hidden_size, half, **conv3),
        nn.GroupNorm(getGroupSize(half), half),
        nn.Dropout2d(0.1),
        nn.ReLU(inplace=True),
        nn.Conv1d(half, quarter, **conv3),
        nn.GroupNorm(getGroupSize(quarter), quarter),
        nn.ReLU(inplace=True),
        nn.Conv1d(quarter, n_out, kernel_size=1, stride=1, padding=0),
    )

    if n_out == 1 or n_out > 2:
        mean_init = torch.FloatTensor(1, n_out).fill_(2)
        std_init = torch.FloatTensor(1, n_out).fill_(1)
    else:
        # These are educated guesses to give the net a good place to start
        mean_init = torch.FloatTensor([2.0, 0.0])
        std_init = torch.FloatTensor([1.5, 0.5])
    self.mean = nn.Parameter(mean_init)
    self.std = nn.Parameter(std_init)

    if emb_style > 0:
        drop = 0.125 if type(emb_style) is float else 0.5
        layers = [PixelNorm()]
        for _ in range(int(emb_style)):
            layers += [
                nn.Linear(style_size, style_size),
                nn.Dropout(drop, True),
                nn.LeakyReLU(0.2, True),
            ]
        self.emb_style = nn.Sequential(*layers)
    else:
        self.emb_style = None
def __init__(self, cin, cout, groups, act):
    """Create a grouped 1x1 Conv1d and look up the activation by name.

    Args:
        cin: Input channels.
        cout: Output channels.
        groups: Number of convolution groups.
        act: Key into ``ACT2FN`` selecting the activation.
    """
    super().__init__()
    self.conv1d = nn.Conv1d(cin, cout, 1, groups=groups)
    self.act = ACT2FN[act]
def __init__(
    self,
    embedding_dim: int,
    vocab_size: int,
    num_filters: int,
    filter_sizes: list,
    hidden_dim: int,
    dropout_p: float,
    num_classes: int,
    padding_idx: int = 0,
) -> None:
    """Create the embedding, one Conv1d per filter size, and two FC layers.

    Args:
        embedding_dim (int): Embedding dimension for tokens; also the input
            channel count of every convolution.
        vocab_size (int): Number of rows in the embedding table.
        num_filters (int): Output channels of each convolution.
        filter_sizes (list): Kernel sizes; one convolution is created per
            entry.
        hidden_dim (int): Width of the first fully connected layer.
        dropout_p (float): Dropout probability.
        num_classes (int): Output size of the second fully connected layer.
        padding_idx (int, optional): Padding index passed to the embedding.
            Defaults to 0.
    """
    super().__init__()

    # Initialize embeddings
    self.embeddings = nn.Embedding(
        num_embeddings=vocab_size,
        embedding_dim=embedding_dim,
        padding_idx=padding_idx,
    )

    # Conv weights
    self.filter_sizes = filter_sizes
    convs = []
    for width in filter_sizes:
        convs.append(
            nn.Conv1d(in_channels=embedding_dim,
                      out_channels=num_filters,
                      kernel_size=width))
    self.conv = nn.ModuleList(convs)

    # FC weights: the first layer takes num_filters values per filter size.
    self.dropout = nn.Dropout(dropout_p)
    fc_in_features = num_filters * len(filter_sizes)
    self.fc1 = nn.Linear(fc_in_features, hidden_dim)
    self.fc2 = nn.Linear(hidden_dim, num_classes)
def __init__(self, n_mel_channels, max_seq_len, n_symbols, padding_idx,
             symbols_embedding_dim, in_fft_n_layers, in_fft_n_heads,
             in_fft_d_head, in_fft_conv1d_kernel_size,
             in_fft_conv1d_filter_size, in_fft_output_size,
             p_in_fft_dropout, p_in_fft_dropatt, p_in_fft_dropemb,
             out_fft_n_layers, out_fft_n_heads, out_fft_d_head,
             out_fft_conv1d_kernel_size, out_fft_conv1d_filter_size,
             out_fft_output_size, p_out_fft_dropout, p_out_fft_dropatt,
             p_out_fft_dropemb, dur_predictor_kernel_size,
             dur_predictor_filter_size, p_dur_predictor_dropout,
             dur_predictor_n_layers, pitch_predictor_kernel_size,
             pitch_predictor_filter_size, p_pitch_predictor_dropout,
             pitch_predictor_n_layers, pitch_embedding_kernel_size,
             n_speakers, speaker_emb_weight):
    """Create the encoder, decoder, predictors, embeddings and output projection.

    The ``in_fft_*`` / ``p_in_fft_*`` arguments configure the encoder
    ``FFTransformer`` and the ``out_fft_*`` / ``p_out_fft_*`` arguments the
    decoder. ``dur_predictor_*`` and ``pitch_predictor_*`` configure the two
    ``TemporalPredictor`` modules, both of which take ``in_fft_output_size``
    as input size. ``max_seq_len`` is accepted but unused. A speaker
    embedding is only created when ``n_speakers > 1``.
    """
    super(FastPitch, self).__init__()
    del max_seq_len  # unused

    encoder_config = dict(
        n_layer=in_fft_n_layers, n_head=in_fft_n_heads,
        d_model=symbols_embedding_dim,
        d_head=in_fft_d_head,
        d_inner=in_fft_conv1d_filter_size,
        kernel_size=in_fft_conv1d_kernel_size,
        dropout=p_in_fft_dropout,
        dropatt=p_in_fft_dropatt,
        dropemb=p_in_fft_dropemb,
        embed_input=True,
        d_embed=symbols_embedding_dim,
        n_embed=n_symbols,
        padding_idx=padding_idx,
    )
    self.encoder = FFTransformer(**encoder_config)

    self.speaker_emb = (nn.Embedding(n_speakers, symbols_embedding_dim)
                        if n_speakers > 1 else None)
    self.speaker_emb_weight = speaker_emb_weight

    self.duration_predictor = TemporalPredictor(
        in_fft_output_size,
        filter_size=dur_predictor_filter_size,
        kernel_size=dur_predictor_kernel_size,
        dropout=p_dur_predictor_dropout,
        n_layers=dur_predictor_n_layers,
    )

    decoder_config = dict(
        n_layer=out_fft_n_layers, n_head=out_fft_n_heads,
        d_model=symbols_embedding_dim,
        d_head=out_fft_d_head,
        d_inner=out_fft_conv1d_filter_size,
        kernel_size=out_fft_conv1d_kernel_size,
        dropout=p_out_fft_dropout,
        dropatt=p_out_fft_dropatt,
        dropemb=p_out_fft_dropemb,
        embed_input=False,
        d_embed=symbols_embedding_dim,
    )
    self.decoder = FFTransformer(**decoder_config)

    self.pitch_predictor = TemporalPredictor(
        in_fft_output_size,
        filter_size=pitch_predictor_filter_size,
        kernel_size=pitch_predictor_kernel_size,
        dropout=p_pitch_predictor_dropout,
        n_layers=pitch_predictor_n_layers,
    )

    pitch_padding = int((pitch_embedding_kernel_size - 1) / 2)
    self.pitch_emb = nn.Conv1d(
        1, symbols_embedding_dim,
        kernel_size=pitch_embedding_kernel_size,
        padding=pitch_padding)

    # Store values precomputed for training data within the model
    for stat_name in ('pitch_mean', 'pitch_std'):
        self.register_buffer(stat_name, torch.zeros(1))

    self.proj = nn.Linear(out_fft_output_size, n_mel_channels, bias=True)
def __init__(
    self,
    device,
    num_nodes,
    dropout=0.3,
    supports=None,
    gcn_bool=True,
    att_bool=True,
    addaptadj=True,
    aptinit=None,
    in_dim=2,
    out_dim=12,
    residual_channels=32,
    dilation_channels=32,
    skip_channels=256,
    end_channels=512,
    kernel_size=2,
    blocks=4,
    layers=2,
):
    """Create the dilated-convolution, graph-convolution and attention layers.

    Args:
        device: Accepted but not used by this constructor.
        num_nodes: Number of rows of the learnable node embedding.
        dropout: Stored and passed to the ``AVWGCN`` layers.
        supports: Optional sequence; its length is passed as ``support_len``.
        gcn_bool, att_bool, addaptadj: Accepted, but the attributes of the
            same name are always set to ``True`` (see note in the body).
        aptinit: Accepted but not used by this constructor.
        in_dim: Input channels of ``start_conv``.
        out_dim: Output channels of ``end_conv_2``.
        residual_channels, dilation_channels, skip_channels, end_channels:
            Channel widths of the respective convolutions.
        kernel_size: Temporal kernel size of the filter/gate convolutions.
        blocks: Number of blocks; dilation restarts at 1 in each block.
        layers: Layers per block; dilation doubles with each layer.
    """
    super(gwnet, self).__init__()
    self.dropout = dropout
    self.blocks = blocks
    self.layers = layers
    # NOTE(review): these flags are hard-coded and ignore the constructor
    # arguments of the same name; kept as-is to preserve behaviour.
    self.gcn_bool = True
    self.att_bool = True
    self.addaptadj = True
    self.supports = supports

    self.filter_convs = nn.ModuleList()
    self.gate_convs = nn.ModuleList()
    self.residual_convs = nn.ModuleList()
    self.skip_convs = nn.ModuleList()
    self.bn = nn.ModuleList()
    self.avwgconv = nn.ModuleList()
    self.att_conv = nn.ModuleList()

    self.start_conv = nn.Conv2d(in_channels=in_dim,
                                out_channels=residual_channels,
                                kernel_size=(1, 1))

    self.supports_len = 0
    if supports is not None:
        self.supports_len += len(supports)

    receptive_field = 1

    if self.gcn_bool and self.addaptadj:
        self.node_embedding = nn.Parameter(torch.randn(num_nodes, 10),
                                           requires_grad=True)

    for b in range(blocks):
        additional_scope = kernel_size - 1
        new_dilation = 1
        for i in range(layers):
            # dilated convolutions
            self.filter_convs.append(
                nn.Conv2d(
                    in_channels=residual_channels,
                    out_channels=dilation_channels,
                    kernel_size=(1, kernel_size),
                    dilation=new_dilation,
                ))

            # The gate, residual and skip layers have two-dimensional
            # kernels, so they are built as Conv2d. A Conv1d given a 2-tuple
            # kernel has the same 4-D parameter shapes but is rejected on
            # 4-D input by recent PyTorch releases.
            self.gate_convs.append(
                nn.Conv2d(
                    in_channels=residual_channels,
                    out_channels=dilation_channels,
                    kernel_size=(1, kernel_size),
                    dilation=new_dilation,
                ))

            # 1x1 convolution for residual connection
            self.residual_convs.append(
                nn.Conv2d(
                    in_channels=dilation_channels,
                    out_channels=residual_channels,
                    kernel_size=(1, 1),
                ))

            # 1x1 convolution for skip connection
            self.skip_convs.append(
                nn.Conv2d(
                    in_channels=dilation_channels,
                    out_channels=skip_channels,
                    kernel_size=(1, 1),
                ))
            self.bn.append(nn.BatchNorm2d(residual_channels))

            new_dilation *= 2
            receptive_field += additional_scope
            additional_scope *= 2

            # NOTE(review): the nesting of the branches below was
            # reconstructed from flattened source — confirm that the `else`
            # belongs to the odd/even test and that the attention layer is
            # only added for odd layers.
            if self.gcn_bool:
                if (i + 1) % 2 == 1:
                    self.avwgconv.append(
                        AVWGCN(
                            dilation_channels,
                            residual_channels,
                            dropout,
                            support_len=self.supports_len,
                        ))
                    self.att_conv.append(
                        ST_Attention(2, residual_channels, residual_channels))
                else:
                    self.avwgconv.append(
                        AVWGCN(dilation_channels, residual_channels, dropout))

    self.end_conv_1 = nn.Conv2d(
        in_channels=skip_channels,
        out_channels=end_channels,
        kernel_size=(1, 1),
        bias=True,
    )

    self.end_conv_2 = nn.Conv2d(
        in_channels=end_channels,
        out_channels=out_dim,
        kernel_size=(1, 1),
        bias=True,
    )

    self.receptive_field = receptive_field
def __init__(self, options):
    """Build the SincNet convolution stack and the 2D-conv / FC head.

    Args:
        options: Mapping that provides ``batch_size``, ``cnn_N_filt``,
            ``cnn_len_filt``, ``cnn_max_pool_len``, ``cnn_act``,
            ``cnn_drop``, ``cnn_use_laynorm``, ``cnn_use_batchnorm``,
            ``cnn_use_laynorm_inp``, ``cnn_use_batchnorm_inp``,
            ``input_dim`` and ``sampling_rate``.
    """
    super(SincNet, self).__init__()

    # Option key -> attribute name (the sample rate is stored as `fs`).
    option_to_attr = (
        ('batch_size', 'batch_size'),
        ('cnn_N_filt', 'cnn_N_filt'),
        ('cnn_len_filt', 'cnn_len_filt'),
        ('cnn_max_pool_len', 'cnn_max_pool_len'),
        ('cnn_act', 'cnn_act'),
        ('cnn_drop', 'cnn_drop'),
        ('cnn_use_laynorm', 'cnn_use_laynorm'),
        ('cnn_use_batchnorm', 'cnn_use_batchnorm'),
        ('cnn_use_laynorm_inp', 'cnn_use_laynorm_inp'),
        ('cnn_use_batchnorm_inp', 'cnn_use_batchnorm_inp'),
        ('input_dim', 'input_dim'),
        ('sampling_rate', 'fs'),
    )
    for key, attr in option_to_attr:
        setattr(self, attr, options[key])

    self.N_cnn_lay = len(options['cnn_N_filt'])

    self.conv = nn.ModuleList([])
    self.bn = nn.ModuleList([])
    self.ln = nn.ModuleList([])
    self.act = nn.ModuleList([])
    self.drop = nn.ModuleList([])

    if self.cnn_use_laynorm_inp:
        self.ln0 = LayerNorm(self.input_dim)

    if self.cnn_use_batchnorm_inp:
        self.bn0 = nn.BatchNorm1d([self.input_dim], momentum=0.05)

    current_input = self.input_dim

    for i in range(self.N_cnn_lay):
        N_filt = int(self.cnn_N_filt[i])

        # dropout
        self.drop.append(nn.Dropout(p=self.cnn_drop[i]))

        # activation
        self.act.append(act_fun(self.cnn_act[i]))

        # Length after this layer's unpadded convolution and pooling.
        pooled_len = int((current_input - self.cnn_len_filt[i] + 1) /
                         self.cnn_max_pool_len[i])

        # layer norm initialization
        self.ln.append(LayerNorm([N_filt, pooled_len]))

        # NOTE(review): BatchNorm1d's second positional parameter is `eps`,
        # so the sequence length ends up as epsilon. Kept unchanged to
        # preserve existing numerics — confirm whether this is intended.
        self.bn.append(nn.BatchNorm1d(N_filt, eps=pooled_len, momentum=0.05))

        if i == 0:
            layer = SincConv_fast(self.cnn_N_filt[0], self.cnn_len_filt[0],
                                  self.fs)
        else:
            layer = nn.Conv1d(self.cnn_N_filt[i - 1], self.cnn_N_filt[i],
                              self.cnn_len_filt[i])
        self.conv.append(layer)

        current_input = pooled_len

    self.out_dim = current_input * N_filt

    # 2D convolutional head
    self.conv1 = nn.Conv2d(1, 30, (2, 3), 2)
    self.bn1 = nn.BatchNorm2d(30)
    self.pool1 = nn.MaxPool2d((1, 2), 2)

    self.conv2 = nn.Conv2d(30, 30, (2, 3), 2)
    self.bn2 = nn.BatchNorm2d(30)
    self.pool2 = nn.MaxPool2d((1, 2), 1)

    # Fully connected head
    self.fc1 = nn.Linear(55650, 4096)
    self.ffn_bn1 = nn.BatchNorm1d(4096)
    self.drp1 = nn.Dropout(0.3)

    self.fc2 = nn.Linear(4096, 512)
    self.ffn_bn2 = nn.BatchNorm1d(512)
    self.drp2 = nn.Dropout(0.3)

    self.fc3 = nn.Linear(512, 1)
def __init__(self, input_dim, conv_dim=64):
    """Create the two-layer 1-D convolutional feature extractor.

    Args:
        input_dim: Number of input channels of the first convolution.
        conv_dim: Number of output channels of both convolutions.
    """
    super(Inception3, self).__init__()

    # A width-1 convolution maps input_dim -> conv_dim channels; the
    # following width-3 convolution keeps the channel count.
    stages = [
        nn.Conv1d(input_dim, conv_dim, kernel_size=1),
        nn.ReLU(),
        nn.Conv1d(conv_dim, conv_dim, kernel_size=3),
        nn.ReLU(),
    ]
    self.cnn = nn.Sequential(*stages)
# Compose the operator layer with fully connected layers on either side.
# `layer_before` and `op_layer` are defined earlier in the script.
layer_after = nn.Linear(2, 2)
model1 = nn.Sequential(layer_before, op_layer, layer_after)
print('Composed model 1:')
print(model1)
print('')

# Add a leading batch axis: shape (1, 3)
inp = autograd.Variable(torch.ones(3))[None, ...]
print('Model 1 evaluated on a 1x3 tensor variable:')
print(model1(inp))

# We can also use convolutional layers with extra channel axes. Since
# convolutions without padding reduce the size of the input by
# `kernel_size - 1`, the input has to have size 4 here, and the output
# will have size 1.
layer_before = nn.Conv1d(1, 2, 2)
layer_after = nn.Conv1d(2, 1, 2)
model2 = nn.Sequential(layer_before, op_layer, layer_after)
print('Composed model 2:')
print(model2)
print('')

# Add extra batch and channel axes: shape (1, 1, 4)
inp = autograd.Variable(torch.ones(4))[None, None, ...]
print('Model 2 evaluated on a 1x1x4 tensor variable:')
print(model2(inp))

# --- Backward --- #

# Define a loss function and targets to compare against
def __init__(self, in_depth, out_depth, kernel_size, dilation=1, stride=1,
             groups=1):
    """Set up the convolution and record the padding amount.

    Args:
        in_depth: Number of input channels.
        out_depth: Number of output channels.
        kernel_size: Width of the convolution kernel.
        dilation: Dilation of the kernel taps.
        stride: Stride of the convolution.
        groups: Number of channel groups.
    """
    super(CausalConv1d, self).__init__()

    # The padding is only stored here; the convolution itself is built
    # without padding. Presumably forward() applies it — confirm there.
    span = kernel_size - 1
    self.padding = span * dilation

    self.conv = nn.Conv1d(
        in_channels=in_depth,
        out_channels=out_depth,
        kernel_size=kernel_size,
        stride=stride,
        dilation=dilation,
        groups=groups,
    )
def __init__(self):
    """Create the layers of the small convolutional encoder.

    The layers are a 1 -> 5 channel convolution of width 3, an adaptive
    max-pool to 10 positions, a leaky ReLU with slope 0.3 and a
    50 -> 3 linear output layer.
    """
    super(SimpleConvolutionalEncoder, self).__init__()

    self.c1 = nn.Conv1d(in_channels=1, out_channels=5, kernel_size=3)
    self.pool = nn.AdaptiveMaxPool1d(output_size=10)
    self.act = nn.LeakyReLU(negative_slope=0.3)
    # 50 presumably equals 5 channels x 10 pooled positions after
    # flattening — confirm against forward().
    self.out = nn.Linear(in_features=50, out_features=3)
def __init__(self):
    """Assemble the encoder and decoder of the 1-D convolutional autoencoder.

    The encoder is nine strided ``Conv1d`` layers, each followed by an
    in-place ReLU. The decoder is nine strided ``ConvTranspose1d`` layers,
    with an in-place ReLU after all but the last one.
    """
    super(ConvAE, self).__init__()

    # Every (transposed) convolution in the model shares these settings.
    common = dict(stride=2, padding=1, dilation=1)

    # (in_channels, out_channels, kernel_size) per encoder stage.
    encoder_plan = (
        (1, 32, 3),
        (32, 64, 5),
        (64, 64, 4),
        (64, 128, 9),
        (128, 128, 9),
        (128, 256, 9),
        (256, 256, 9),
        (256, 512, 9),
        (512, 512, 9),
    )
    encoder_modules = []
    for n_in, n_out, width in encoder_plan:
        encoder_modules.append(nn.Conv1d(n_in, n_out, width, **common))
        encoder_modules.append(nn.ReLU(True))
    self.encoder = nn.Sequential(*encoder_modules)

    # (in_channels, out_channels, kernel_size) per decoder stage; the last
    # entry is the output layer and gets no activation.
    decoder_plan = (
        (512, 512, 9),
        (512, 256, 9),
        (256, 256, 9),
        (256, 128, 10),
        (128, 64, 20),
        (64, 64, 20),
        (64, 32, 30),
        (32, 32, 40),
        (32, 1, 40),
    )
    decoder_modules = []
    last_stage = len(decoder_plan) - 1
    for position, (n_in, n_out, width) in enumerate(decoder_plan):
        decoder_modules.append(
            nn.ConvTranspose1d(n_in, n_out, width, **common))
        if position != last_stage:
            decoder_modules.append(nn.ReLU(True))
    self.decoder = nn.Sequential(*decoder_modules)
def __init__(self, args_dict=wavenet_default_settings):
    """Read the hyper-parameters and create the 1x1 convolutions.

    Args:
        args_dict: Mapping providing ``"layers"``, ``"blocks"``,
            ``"dilation_channels"``, ``"residual_channels"``,
            ``"skip_channels"``, ``"end_channels"`` (an iterable of channel
            counts), ``"output_channels"``, ``"output_length"``,
            ``"kernel_size"``, ``"dilation_factor"``, ``"dtype"`` and
            ``"bias"``.
    """
    super(WaveNetModel, self).__init__()

    cfg = args_dict
    self.layers = cfg["layers"]
    self.blocks = cfg["blocks"]
    self.dilation_channels = cfg["dilation_channels"]
    self.residual_channels = cfg["residual_channels"]
    self.skip_channels = cfg["skip_channels"]
    self.end_channels = cfg["end_channels"]
    self.output_channels = cfg["output_channels"]
    self.output_length = cfg["output_length"]
    self.kernel_size = cfg["kernel_size"]
    self.dilation_factor = cfg["dilation_factor"]
    self.dtype = cfg["dtype"]
    self.use_bias = cfg["bias"]

    def pointwise(n_in, n_out, with_bias):
        # Every convolution created in this constructor has kernel size 1.
        return nn.Conv1d(in_channels=n_in,
                         out_channels=n_out,
                         kernel_size=1,
                         bias=with_bias)

    # build model
    self.dilations = []
    self.residual_convs = nn.ModuleList()
    self.skip_convs = nn.ModuleList()
    self.end_layers = nn.ModuleList()

    # 1x1 convolution to create channels (single input channel)
    self.start_conv = pointwise(1, self.residual_channels, self.use_bias)

    field = 1
    previous_dilation = 1  # carried across blocks, never reset
    for _block in range(self.blocks):
        # Both the dilation and the scope increment restart in each block.
        scope_step = self.kernel_size - 1
        dilation = 1
        for _layer in range(self.layers):
            # (dilation of this layer, dilation of the preceding layer)
            self.dilations.append((dilation, previous_dilation))

            # 1x1 convolution for residual connection
            self.residual_convs.append(
                pointwise(self.dilation_channels,
                          self.residual_channels,
                          self.use_bias))

            # 1x1 convolution for skip connection
            self.skip_convs.append(
                pointwise(self.dilation_channels,
                          self.skip_channels,
                          self.use_bias))

            field += scope_step
            scope_step *= self.dilation_factor
            previous_dilation = dilation
            dilation *= self.dilation_factor

    # Chain skip_channels -> each end channel -> output_channels; these
    # layers always carry a bias, independent of the "bias" setting.
    widths = [self.skip_channels]
    widths.extend(self.end_channels)
    widths.append(self.output_channels)
    for n_in, n_out in zip(widths[:-1], widths[1:]):
        self.end_layers.append(pointwise(n_in, n_out, True))

    self.receptive_field = field
    self.activation_unit_init()
def __init__(self, in_planes, out_planes):
    """Create a bias-free 1x1 convolution, batch norm and in-place ReLU.

    The three modules are handed, in that order, to the parent
    constructor (presumably ``nn.Sequential`` — the class header is not
    visible here).

    Args:
        in_planes: Number of input channels.
        out_planes: Number of output channels.
    """
    projection = nn.Conv1d(in_planes, out_planes, kernel_size=1, bias=False)
    normalise = nn.BatchNorm1d(out_planes)
    rectify = nn.ReLU(inplace=True)
    super(Conv1dBNReLU, self).__init__(projection, normalise, rectify)
def __init__(self, d_in, d_hid, dropout=0.1):
    """Create the two width-1 convolutions, layer norm and dropout.

    Args:
        d_in: Channel count of the input, and of the second convolution's
            output.
        d_hid: Channel count between the two convolutions.
        dropout: Probability passed to ``nn.Dropout``.
    """
    super().__init__()

    # Kernel size 1 means each position is transformed on its own.
    self.w_1 = nn.Conv1d(in_channels=d_in, out_channels=d_hid, kernel_size=1)
    self.w_2 = nn.Conv1d(in_channels=d_hid, out_channels=d_in, kernel_size=1)

    self.layer_norm = LayerNorm(d_in)
    self.dropout = nn.Dropout(p=dropout)
def __init__(self, cin, cout, groups, dropout_prob):
    """Create a grouped width-1 convolution, a layer norm and dropout.

    Args:
        cin: Number of input channels.
        cout: Number of output channels; also the size given to the
            layer norm.
        groups: Number of groups of the convolution.
        dropout_prob: Probability passed to ``nn.Dropout``.
    """
    super().__init__()

    self.conv1d = nn.Conv1d(cin, cout, 1, groups=groups)
    self.layernorm = SqueezeBertLayerNorm(cout)
    self.dropout = nn.Dropout(p=dropout_prob)
def __init__(self, num_classes=50, points=2048, embed_dim=128, normal_channel=True, pre_blocks=[2, 2, 2, 2], pos_blocks=[2, 2, 2, 2], k_neighbors=[32, 32, 32, 32], reducers=[2, 2, 2, 2], **kwargs):
    """Create the embedding, encoder stages, propagation layers and head.

    Args:
        num_classes: Number of output channels of the final convolution.
        points: Number of input points; the four encoder stages use
            ``points // 4``, ``// 8``, ``// 16`` and ``// 32`` anchors.
        embed_dim: Output width of the embedding layers.
        normal_channel: If true the input has 6 channels, otherwise 3.
        pre_blocks, pos_blocks, k_neighbors, reducers, **kwargs: Accepted
            but not read by this constructor; the stage settings below
            are fixed.
    """
    super(get_model, self).__init__()
    self.num_classes = num_classes
    self.points = points

    if normal_channel:
        input_channel = 6
    else:
        input_channel = 3

    self.embedding = nn.Sequential(
        FCBNReLU1D(input_channel, embed_dim),
        FCBNReLU1D(embed_dim, embed_dim),
    )

    # Settings shared by all four encoder stages.
    stage_common = dict(pre_blocks=3, pos_blocks=3, k_neighbor=32)
    self.encoder_stage1 = encoder_stage(
        anchor_points=points // 4, channel=128, reduce=False, **stage_common)
    self.encoder_stage2 = encoder_stage(
        anchor_points=points // 8, channel=256, reduce=True, **stage_common)
    self.encoder_stage3 = encoder_stage(
        anchor_points=points // 16, channel=256, reduce=False, **stage_common)
    self.encoder_stage4 = encoder_stage(
        anchor_points=points // 32, channel=512, reduce=True, **stage_common)

    # Feature propagation layers, created from the deepest level upwards.
    self.fp4 = PointNetFeaturePropagation(in_channel=512 + 512,
                                          mlp=[512, 256, 256])
    self.fp3 = PointNetFeaturePropagation(in_channel=256 + 256,
                                          mlp=[512, 256, 256])
    self.fp2 = PointNetFeaturePropagation(in_channel=256 + 256,
                                          mlp=[256, 256])
    self.fp1 = PointNetFeaturePropagation(in_channel=256 + 128 + 128,
                                          mlp=[256, 256])

    info_width = 16 + 3 + input_channel
    self.info_encoder = nn.Sequential(
        FCBNReLU1D(info_width, 128),
        FCBNReLU1D(128, 128),
    )
    self.global_encoder = nn.Sequential(
        FCBNReLU1D(512, 256),
        FCBNReLU1D(256, 128),
    )

    # Head: width-1 convolutions 256 -> 256 -> 128 -> num_classes.
    self.conv0 = nn.Conv1d(256, 256, kernel_size=1)
    self.bn0 = nn.BatchNorm1d(256)
    self.drop0 = nn.Dropout(p=0.4)
    self.conv1 = nn.Conv1d(256, 128, kernel_size=1)
    self.bn1 = nn.BatchNorm1d(128)
    self.drop1 = nn.Dropout(p=0.4)
    self.conv2 = nn.Conv1d(128, num_classes, kernel_size=1)
def get_layer(in_size, out_size, conv=False):
    """Return a layer mapping ``in_size`` features to ``out_size`` features.

    Args:
        in_size: Number of input features (or channels when ``conv`` is set).
        out_size: Number of output features (or channels).
        conv: When truthy, return a kernel-size-1 ``nn.Conv1d``; otherwise
            return an ``nn.Linear``.

    Returns:
        The newly created ``nn.Conv1d`` or ``nn.Linear`` module.
    """
    if not conv:
        return nn.Linear(in_size, out_size)
    return nn.Conv1d(in_size, out_size, kernel_size=1)