def forward(self, input):
    """Run the six-block CNN and produce event / elevation / azimuth outputs.

    Args:
        input: tensor laid out as
            (channels_num, batch_size, times_steps, freq_bins).

    Returns:
        dict with the keys 'event', 'elevation' and 'azimuth'. Each head
        yields (batch_size, time_steps, classes_num) and is then passed
        through ``interpolate`` with a ratio of 32. Only the 'event' head
        goes through a sigmoid; the other two are returned as raw linear
        outputs.
    """
    upsample_ratio = 32

    # Bring the batch axis to the front:
    # (batch_size, channels_num, times_steps, freq_bins)
    feats = input.transpose(0, 1)

    # The first five blocks are called with (2, 2) average pooling,
    # the last one with (1, 1).
    stages = (
        (self.conv_block1, (2, 2)),
        (self.conv_block2, (2, 2)),
        (self.conv_block3, (2, 2)),
        (self.conv_block4, (2, 2)),
        (self.conv_block5, (2, 2)),
        (self.conv_block6, (1, 1)),
    )
    for block, pool in stages:
        feats = block(feats, pool_size=pool, pool_type='avg')

    # Average over the last (frequency) axis, then put time before features:
    # (batch_size, feature_maps, time_steps)
    #   -> (batch_size, time_steps, feature_maps)
    frame_feats = torch.mean(feats, dim=3).transpose(1, 2)

    # All heads share the same per-frame features.
    heads = {
        'event': torch.sigmoid(self.event_fc(frame_feats)),
        'elevation': self.elevation_fc(frame_feats),
        'azimuth': self.azimuth_fc(frame_feats),
    }

    # NOTE(review): `interpolate` is defined outside this block; presumably
    # it stretches the time axis back by the pooling factor -- confirm.
    return {
        name: interpolate(prediction, upsample_ratio)
        for name, prediction in heads.items()
    }
def forward(self, input):
    """Compute framewise and clipwise predictions with the six-block CNN.

    Args:
        input: tensor laid out as (batch_size, times_steps, freq_bins).

    Returns:
        dict with:
            'framewise_output': sigmoid of ``self.fc`` applied per frame,
                then passed through ``interpolate`` with a ratio of 32;
                (batch_size, frames_num, classes_num).
            'clipwise_output': one prediction per clip. When
                ``self.strong_target_training`` is truthy this is the
                maximum of the framewise predictions over dim 1; otherwise
                ``self.fc`` + sigmoid is applied to the frame features
                max-pooled over dim 2.
    """
    interpolate_ratio = 32

    # Insert a singleton channel axis:
    # (batch_size, 1, times_steps, freq_bins)
    x = input[:, None, :, :]

    x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg')
    x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg')
    x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg')
    x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg')
    x = self.conv_block5(x, pool_size=(2, 2), pool_type='avg')
    # Time-frequency maps:
    # (batch_size, channels_num, times_steps, freq_bins)
    tf_maps = self.conv_block6(x, pool_size=(1, 1), pool_type='avg')

    # Max over the last (frequency) axis:
    # (batch_size, feature_maps, frames_num)
    (framewise_vector, _) = torch.max(tf_maps, dim=3)

    output_dict = {}

    # Framewise prediction, evaluated exactly once per call.
    framewise_output = torch.sigmoid(
        self.fc(framewise_vector.transpose(1, 2)))
    # NOTE(review): `interpolate` is defined outside this block; presumably
    # it upsamples along the frame axis -- confirm.
    framewise_output = interpolate(framewise_output, interpolate_ratio)
    # (batch_size, frames_num, classes_num)
    output_dict['framewise_output'] = framewise_output

    # Clipwise prediction
    if self.strong_target_training:
        # Obtained by taking the maximum framewise predictions
        (output_dict['clipwise_output'], _) = torch.max(
            framewise_output, dim=1)
    else:
        # Obtained by applying fc layer on aggregated framewise_vector
        (aggregation, _) = torch.max(framewise_vector, dim=2)
        output_dict['clipwise_output'] = torch.sigmoid(
            self.fc(aggregation))

    return output_dict
def forward(self, input):
    """Run the four-layer CNN and produce event / elevation / azimuth outputs.

    Args:
        input: tensor laid out as
            (channels_num, batch_size, times_steps, freq_bins).

    Returns:
        dict with the keys 'event', 'elevation' and 'azimuth'. Each head
        yields (batch_size, time_steps, classes_num) and is then passed
        through ``interpolate`` with a ratio of 8. Only the 'event' head
        goes through a sigmoid.
    """
    upsample_ratio = 8

    # Bring the batch axis to the front:
    # (batch_size, channels_num, times_steps, freq_bins)
    feats = input.transpose(0, 1)

    # conv -> batch norm -> in-place ReLU -> average pooling, four times.
    # The last layer pools with a (1, 1) kernel.
    layers = (
        (self.conv1, self.bn1, (2, 2)),
        (self.conv2, self.bn2, (2, 2)),
        (self.conv3, self.bn3, (2, 2)),
        (self.conv4, self.bn4, (1, 1)),
    )
    for conv, norm, kernel in layers:
        feats = F.avg_pool2d(F.relu_(norm(conv(feats))), kernel_size=kernel)
    # (batch_size, feature_maps, time_steps, freq_bins)

    # Average over the last (frequency) axis, then put time before features:
    # (batch_size, feature_maps, time_steps)
    #   -> (batch_size, time_steps, feature_maps)
    frame_feats = torch.mean(feats, dim=3).transpose(1, 2)

    # All heads share the same per-frame features.
    heads = {
        'event': torch.sigmoid(self.event_fc(frame_feats)),
        'elevation': self.elevation_fc(frame_feats),
        'azimuth': self.azimuth_fc(frame_feats),
    }

    # NOTE(review): `interpolate` is defined outside this block; presumably
    # it stretches the time axis back by the pooling factor -- confirm.
    return {
        name: interpolate(prediction, upsample_ratio)
        for name, prediction in heads.items()
    }
def forward(self, input):
    """Compute framewise and clipwise predictions with the four-layer CNN.

    Args:
        input: tensor laid out as (batch_size, times_steps, freq_bins).

    Returns:
        dict with:
            'framewise_output': sigmoid of ``self.fc`` applied per frame,
                then passed through ``interpolate`` with a ratio of 8;
                (batch_size, frames_num, classes_num).
            'clipwise_output': one prediction per clip. When
                ``self.strong_target_training`` is truthy this is the
                maximum of the framewise predictions over dim 1; otherwise
                ``self.fc`` + sigmoid is applied to the frame features
                max-pooled over dim 2.
    """
    upsample_ratio = 8

    # Insert a singleton channel axis:
    # (batch_size, 1, times_steps, freq_bins)
    feats = input[:, None, :, :]

    # conv -> batch norm -> in-place ReLU -> average pooling, four times.
    # The last layer pools with a (1, 1) kernel.
    layers = (
        (self.conv1, self.bn1, (2, 2)),
        (self.conv2, self.bn2, (2, 2)),
        (self.conv3, self.bn3, (2, 2)),
        (self.conv4, self.bn4, (1, 1)),
    )
    for conv, norm, kernel in layers:
        feats = F.avg_pool2d(F.relu_(norm(conv(feats))), kernel_size=kernel)

    # Time-frequency maps:
    # (batch_size, channels_num, times_steps, freq_bins)
    tf_maps = feats

    # Mean over the last (frequency) axis:
    # (batch_size, feature_maps, frames_num)
    frame_vectors = torch.mean(tf_maps, dim=3)

    # Framewise prediction: (batch_size, frames_num, classes_num)
    # NOTE(review): `interpolate` is defined outside this block; presumably
    # it upsamples along the frame axis -- confirm.
    frame_probs = interpolate(
        torch.sigmoid(self.fc(frame_vectors.transpose(1, 2))),
        upsample_ratio)

    # Clipwise prediction
    if self.strong_target_training:
        # Maximum of the framewise predictions over frames.
        clip_probs = torch.max(frame_probs, dim=1)[0]
    else:
        # fc layer applied to the frame vectors max-pooled over frames.
        pooled = torch.max(frame_vectors, dim=2)[0]
        clip_probs = torch.sigmoid(self.fc(pooled))

    return {'framewise_output': frame_probs, 'clipwise_output': clip_probs}
def forward_orig(self, x, mixup_lambda=None):
    """Return framewise predictions for the given input.

    Args:
        x: input handed to ``self.spectrogram_extractor``.
            NOTE(review): presumably a raw waveform batch -- confirm.
        mixup_lambda: optional mixup coefficients forwarded to ``do_mixup``.
            Only used while ``self.training`` is true.

    Returns:
        Sigmoid outputs of ``self.fc_audioset`` after ``interpolate`` with
        ``self.interpolate_ratio``. Padding back to the input frame count
        (``pad_framewise_output``) is currently disabled, so the number of
        output frames is whatever ``interpolate`` produces.
    """
    x = self.spectrogram_extractor(x)  # (batch_size, 1, time_steps, freq_bins)
    x = self.logmel_extractor(x)  # (batch_size, 1, time_steps, mel_bins)

    # bn0 is applied with axes 1 and 3 swapped, i.e. with the mel axis in
    # the channel position; the swap is undone right afterwards.
    x = x.transpose(1, 3)
    x = self.bn0(x)
    x = x.transpose(1, 3)

    if self.training:
        x = self.spec_augmenter(x)

    # Mixup on spectrogram
    if self.training and mixup_lambda is not None:
        x = do_mixup(x, mixup_lambda)

    x = self.features(x)
    x = torch.mean(x, dim=3)  # collapse the last axis

    # Sum of max- and average-pooling along the remaining (time) axis.
    # kernel 3 / stride 1 / padding 1 keeps the length unchanged.
    x1 = F.max_pool1d(x, kernel_size=3, stride=1, padding=1)
    x2 = F.avg_pool1d(x, kernel_size=3, stride=1, padding=1)
    x = x1 + x2

    x = F.dropout(x, p=0.5, training=self.training)
    x = x.transpose(1, 2)
    x = F.relu_(self.fc1(x))
    x = F.dropout(x, p=0.5, training=self.training)
    segmentwise_output = torch.sigmoid(self.fc_audioset(x))

    # Only the framewise output is returned, so no clip-level maximum is
    # computed here.
    # NOTE(review): `interpolate` is defined outside this block; presumably
    # it upsamples along the time axis -- confirm.
    framewise_output = interpolate(segmentwise_output, self.interpolate_ratio)

    return framewise_output
def forward_test(self, x):
    """Return the hidden features after ``fc1`` for the given input.

    Experimental variant of the forward pass: no spec augmentation or mixup,
    and the 1-D pooling runs without padding.

    Args:
        x: input handed to ``self.spectrogram_extractor``.
            NOTE(review): presumably a raw waveform batch -- confirm.

    Returns:
        The activations after ``fc1`` + ReLU + dropout, with the time axis
        in position 1. The classification head (``fc_audioset``) and the
        interpolation step are not evaluated because their results are not
        part of the return value.
    """
    x = self.spectrogram_extractor(x)  # (batch_size, 1, time_steps, freq_bins)
    x = self.logmel_extractor(x)  # (batch_size, 1, time_steps, mel_bins)

    # bn0 is applied with axes 1 and 3 swapped, i.e. with the mel axis in
    # the channel position; the swap is undone right afterwards.
    x = x.transpose(1, 3)
    x = self.bn0(x)
    x = x.transpose(1, 3)

    x = self.features(x)
    x = torch.mean(x, dim=3)  # collapse the last axis

    # Deliberately unpadded: with kernel 3 / stride 1 the pooled axis is
    # two steps shorter than its input.
    x1 = F.max_pool1d(x, kernel_size=3, stride=1)
    x2 = F.avg_pool1d(x, kernel_size=3, stride=1)
    x = x1 + x2

    x = F.dropout(x, p=0.5, training=self.training)
    x = x.transpose(1, 2)
    x = F.relu_(self.fc1(x))
    x = F.dropout(x, p=0.5, training=self.training)

    return x
def forward(self, x, mixup_lambda=None):
    """Compute clipwise and framewise predictions plus the log-mel input.

    Args:
        x: input handed to ``self.spectrogram_extractor``.
            NOTE(review): presumably a raw waveform batch -- confirm.
        mixup_lambda: optional mixup coefficients forwarded to ``do_mixup``.
            Only used while ``self.training`` is true.

    Returns:
        Tuple ``(clipwise_output, framewise_output, melspec)``:
            clipwise_output: maximum of the segmentwise sigmoid outputs
                over dim 1.
            framewise_output: segmentwise outputs after ``interpolate``
                with ``self.interpolate_ratio``. Padding back to the input
                frame count (``pad_framewise_output``) is currently
                disabled.
            melspec: output of ``self.logmel_extractor``,
                (batch_size, 1, time_steps, mel_bins).
    """
    # (batch_size, 1, time_steps, freq_bins)
    #   -> (batch_size, 1, time_steps, mel_bins)
    melspec = self.logmel_extractor(self.spectrogram_extractor(x))

    # bn0 is applied with axes 1 and 3 swapped, i.e. with the mel axis in
    # the channel position; the swap is undone right afterwards.
    feats = self.bn0(melspec.transpose(1, 3)).transpose(1, 3)

    if self.training:
        feats = self.spec_augmenter(feats)
        # Mixup on spectrogram
        if mixup_lambda is not None:
            feats = do_mixup(feats, mixup_lambda)

    # The original author noted (1, 1024, 3, 2) after `features` for a
    # sample input; the mean then collapses the last axis.
    feats = torch.mean(self.features(feats), dim=3)

    # Sum of max- and average-pooling along the remaining (time) axis.
    # kernel 3 / stride 1 / padding 1 keeps the length unchanged.
    feats = (F.max_pool1d(feats, kernel_size=3, stride=1, padding=1)
             + F.avg_pool1d(feats, kernel_size=3, stride=1, padding=1))

    feats = F.dropout(feats, p=0.5, training=self.training)
    hidden = F.relu_(self.fc1(feats.transpose(1, 2)))
    hidden = F.dropout(hidden, p=0.5, training=self.training)

    segmentwise_output = torch.sigmoid(self.fc_audioset(hidden))
    clipwise_output = torch.max(segmentwise_output, dim=1)[0]

    # Get framewise output
    # NOTE(review): `interpolate` is defined outside this block; presumably
    # it upsamples along the time axis -- confirm.
    framewise_output = interpolate(segmentwise_output, self.interpolate_ratio)

    return clipwise_output, framewise_output, melspec