def conv__gru_3x368_drop02(num_outputs) -> nn.Module:
    model = SequentialSequential(*[
        ConvExtractor(),
        LambdaModule(lambda seq, seq_len: (F.relu(seq), seq_len)),
        RNNEncoder(dropout=0.2, rnn_type="GRU", num_layers=3, hidden_size=368, input_size=512),
        SequentialLinear(368 * 2, num_outputs, pre_activation=True),
    ])
    return model
def conv_instnorm__gru_2x256_drop02(num_outputs) -> nn.Module:
    model = SequentialSequential(*[
        ConvExtractor(norm=nn.InstanceNorm2d),
        LambdaModule(lambda seq, seq_len: (F.relu(seq), seq_len)),
        RNNEncoder(dropout=0.2, rnn_type="GRU", num_layers=2, hidden_size=256, input_size=512),
        SequentialLinear(256 * 2, num_outputs, pre_activation=True),
    ])
    return model
def conv__gru_2x256_drop02__transf_2x4x128x256_drop01(num_outputs) -> nn.Module:
    model = SequentialSequential(*[
        ConvExtractor(),
        LambdaModule(lambda seq, seq_len: (F.relu(seq), seq_len)),
        RNNEncoder(dropout=0.2, rnn_type="GRU", num_layers=2, hidden_size=256, input_size=512),
        SequentialLinear(256 * 2, 128, pre_activation=True),
        TransformerEncoder(dropout=0.1, num_layers=2, num_heads=4, dim_model=128, dim_feedforward=256),
        SequentialLinear(128, num_outputs, pre_activation=True),
    ])
    return model
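# Hedged usage sketch (not part of the original module): how these factory
# functions are presumably consumed. The LambdaModule signatures above suggest
# the SequentialSequential wrapper threads (sequence, sequence_lengths) tuples
# through the pipeline; the shapes and the num_outputs value below are
# illustrative assumptions only.
#
#   model = conv__gru_2x256_drop02__transf_2x4x128x256_drop01(num_outputs=29)
#   spectrograms = torch.randn(8, 3, 128, 1024)          # Bx3x128xL input
#   lengths = torch.full((8,), 1024, dtype=torch.long)
#   logits, out_lengths = model(spectrograms, lengths)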
def __init__(self, num_outputs, dropout=0.2, n_rnn=2, rnn_type="GRU", rnn_dim=256):
    super().__init__()
    # input: Bx3x128xL
    left_context = 19
    right_context = 19 + 4
    self.encoder = nn.Sequential(*[
        nn.ReplicationPad2d([left_context, right_context, 0, 0]),
        nn.BatchNorm2d(3),
        nn.Conv2d(3, 64, kernel_size=(3, 3), padding=[1, 0]),  # L - 2
        nn.ReLU(),
        nn.BatchNorm2d(64),
        nn.MaxPool2d(kernel_size=(4, 2), stride=2),  # / 2
        nn.Conv2d(64, 128, kernel_size=(3, 3), padding=[1, 0]),  # -2
        nn.ReLU(),
        nn.BatchNorm2d(128),
        nn.MaxPool2d(kernel_size=(4, 2), stride=2),  # / 2
        nn.Conv2d(128, 256, (3, 3), padding=[1, 0]),  # -2
        nn.ReLU(),
        nn.BatchNorm2d(256),
        nn.Conv2d(256, 256, (3, 3), padding=[1, 0]),  # -2
        nn.ReLU(),
        nn.BatchNorm2d(256),
        nn.ZeroPad2d([0, 0, 2, 1]),  # "same" padding for the max-pool below
        nn.MaxPool2d(kernel_size=(4, 1), padding=0),  # pool_4
        nn.Conv2d(256, 512, (3, 3), padding=[1, 0]),  # -2
        nn.ReLU(),
        nn.BatchNorm2d(512),
        nn.Conv2d(512, 512, (3, 3), padding=[1, 0]),  # -2
        nn.ReLU(),
        nn.BatchNorm2d(512),
        nn.ZeroPad2d([0, 0, 1, 2]),  # "same" padding for the max-pool below
        nn.MaxPool2d(kernel_size=(4, 1), padding=0),
        nn.Conv2d(512, 512, (2, 2)),  # 512x1x255 CxHxW  # -1
        nn.ReLU(),
        LambdaModule(lambda x: x.squeeze(dim=2).permute(2, 0, 1)),  # LxBxC
    ])
    self.rnn_dropout = nn.Dropout(dropout)
    rnn_type = getattr(nn, rnn_type)
    self.n_rnn = n_rnn
    if n_rnn > 0:
        self.rnn = rnn_type(input_size=512, hidden_size=rnn_dim, bidirectional=True,
                            dropout=dropout, batch_first=False, num_layers=n_rnn)
    else:
        self.rnn = nn.Identity()
    self.final = nn.Linear(rnn_dim * 2, num_outputs)
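# Hedged sketch of the matching forward pass (the original forward method is
# not shown in this section; this is an assumption based on the attributes
# defined above):
#
#   def forward(self, x):            # x: Bx3x128xL
#       x = self.encoder(x)          # LxBx512
#       x = self.rnn_dropout(x)
#       if self.n_rnn > 0:
#           x, _ = self.rnn(x)       # LxBx(2 * rnn_dim), bidirectional
#       return self.final(x)         # LxBxnum_outputs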
def __init__(self, num_outputs, dropout=0.2, n_rnn=2, rnn_type="GRU"):
    super().__init__()
    self.num_outputs = num_outputs
    left_context = 62
    right_context = 62
    self.encoder = nn.Sequential(*[
        nn.ReplicationPad2d([left_context, right_context, 0, 0]),
        nn.Conv2d(3, 64, kernel_size=(5, 5), padding=(2, 0)),  # 128, time -4
        nn.MaxPool2d(kernel_size=(2, 2)),  # to 64, time / 2 // (24+2) * 2
        ResBlock(64, 64, stride_h=2),  # to 32, time -4 // 24+2
        nn.MaxPool2d(kernel_size=(2, 2)),  # to 16, time / 2 // 12*2
        ResBlock(64, 128, stride_h=2),  # to 8, time -4
        ResBlock(128, 128),  # 8, time -4
        ResBlock(128, 256, stride_h=2),  # to 4, time -4
        ResBlock(256, 256),  # 4, time -4
        ResBlock(256, 512, stride_h=2),  # to 2, time -4
        ResBlock(512, 512, stride_h=2),  # to 1, time -4
        ResBlock(512, 512),  # 1, time -4
        LambdaModule(lambda x: x.squeeze(2)),  # BxCx1xL -> BxCxL
        nn.BatchNorm1d(512),
        nn.ReLU(),
        nn.Conv1d(512, 512, kernel_size=1),
        nn.BatchNorm1d(512),
        nn.ReLU(),
        LambdaModule(lambda x: x.permute(2, 0, 1)),  # LxBxC
    ])
    self.rnn_dropout = nn.Dropout(dropout)
    rnn_type = getattr(nn, rnn_type)
    self.n_rnn = n_rnn
    if n_rnn > 0:
        self.rnn = rnn_type(input_size=512, hidden_size=256, bidirectional=True,
                            dropout=dropout, batch_first=False, num_layers=n_rnn)
    else:
        self.rnn = nn.Identity()
    self.final = nn.Linear(512, num_outputs)
def __init__(self, num_outputs, dropout=0.1, n_layers=2, n_head=4, dim_feedforward=512):
    super().__init__()
    # input: Bx3x128xL
    left_context = 19
    right_context = 19 + 4
    self.encoder = nn.Sequential(*[
        nn.ReplicationPad2d([left_context, right_context, 0, 0]),
        nn.BatchNorm2d(3),
        nn.Conv2d(3, 64, kernel_size=(3, 3), padding=[1, 0]),  # L - 2
        nn.ReLU(),
        nn.BatchNorm2d(64),
        nn.MaxPool2d(kernel_size=(4, 2), stride=2),  # / 2
        nn.Conv2d(64, 128, kernel_size=(3, 3), padding=[1, 0]),  # -2
        nn.ReLU(),
        nn.BatchNorm2d(128),
        nn.MaxPool2d(kernel_size=(4, 2), stride=2),  # / 2
        nn.Conv2d(128, 256, (3, 3), padding=[1, 0]),  # -2
        nn.ReLU(),
        nn.BatchNorm2d(256),
        nn.Conv2d(256, 256, (3, 3), padding=[1, 0]),  # -2
        nn.ReLU(),
        nn.BatchNorm2d(256),
        nn.ZeroPad2d([0, 0, 2, 1]),  # "same" padding for the max-pool below
        nn.MaxPool2d(kernel_size=(4, 1), padding=0),  # pool_4
        nn.Conv2d(256, 512, (3, 3), padding=[1, 0]),  # -2
        nn.ReLU(),
        nn.BatchNorm2d(512),
        nn.Conv2d(512, 512, (3, 3), padding=[1, 0]),  # -2
        nn.ReLU(),
        nn.BatchNorm2d(512),
        nn.ZeroPad2d([0, 0, 1, 2]),  # "same" padding for the max-pool below
        nn.MaxPool2d(kernel_size=(4, 1), padding=0),
        nn.Conv2d(512, 512, (2, 2)),  # 512x1x255 CxHxW  # -1
        nn.ReLU(),
        LambdaModule(lambda x: x.squeeze(dim=2).permute(2, 0, 1)),  # LxBxC
    ])
    self.reduce_dim = nn.Linear(512, 128)
    self.pos_encoder = PositionalEncoding(128, dropout=dropout)
    encoder_layers = nn.TransformerEncoderLayer(128, n_head, dim_feedforward=dim_feedforward, dropout=dropout)
    encoder_norm = nn.LayerNorm(128)
    self.transformer_encoder = nn.TransformerEncoder(encoder_layers, n_layers, encoder_norm)
    self.final = nn.Linear(128, num_outputs)
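# Hedged sketch of the matching forward pass for the transformer variant
# (assumed, not shown in this section): conv features are projected down to
# the 128-dim model width, positionally encoded, then encoded and classified.
#
#   def forward(self, x):                     # x: Bx3x128xL
#       x = self.encoder(x)                   # LxBx512
#       x = self.reduce_dim(x)                # LxBx128
#       x = self.pos_encoder(x)
#       x = self.transformer_encoder(x)       # LxBx128
#       return self.final(x)                  # LxBxnum_outputs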