Example #1
0
    def forward(self, input):
        """Predict framewise event activity, elevation and azimuth.

        Args:
            input: tensor laid out as
                (channels_num, batch_size, times_steps, freq_bins).

        Returns:
            dict with keys 'event', 'elevation' and 'azimuth'. Each value is
            the matching linear head applied per time step and then passed
            through ``interpolate``. Only the 'event' head is squashed with
            a sigmoid; the two angle heads are returned as raw linear outputs.
        """
        # 2 ** 5: one factor of two per (2, 2)-pooled conv block below.
        # NOTE(review): assumes each such block halves the time axis and that
        # `interpolate` stretches it back by this ratio - confirm.
        interpolate_ratio = 32

        # Swap the two leading axes:
        # (batch_size, channels_num, times_steps, freq_bins)
        features = input.transpose(0, 1)

        pooled_blocks = (
            self.conv_block1,
            self.conv_block2,
            self.conv_block3,
            self.conv_block4,
            self.conv_block5,
        )
        for conv_block in pooled_blocks:
            features = conv_block(features, pool_size=(2, 2), pool_type='avg')
        features = self.conv_block6(features, pool_size=(1, 1), pool_type='avg')

        # Average away the last (frequency) axis, then move time in front of
        # the feature maps: (batch_size, time_steps, feature_maps)
        features = torch.mean(features, dim=3).transpose(1, 2)

        # Per-time-step heads: (batch_size, time_steps, classes_num)
        event = torch.sigmoid(self.event_fc(features))
        elevation = self.elevation_fc(features)
        azimuth = self.azimuth_fc(features)

        return {
            'event': interpolate(event, interpolate_ratio),
            'elevation': interpolate(elevation, interpolate_ratio),
            'azimuth': interpolate(azimuth, interpolate_ratio),
        }
Example #2
0
    def forward(self, input):
        """Compute framewise and clipwise predictions for a batch.

        Args:
            input: tensor of shape (batch_size, times_steps, freq_bins).

        Returns:
            dict with two entries:
              'framewise_output': sigmoid of ``self.fc`` applied to every
                  frame vector, then passed through ``interpolate`` with
                  ratio 32.
              'clipwise_output': when ``self.strong_target_training`` is
                  truthy, the maximum of 'framewise_output' over dim 1;
                  otherwise the sigmoid of ``self.fc`` applied to the
                  maximum of the frame vectors over the frame axis.
        """
        # 2 ** 5: one factor of two per (2, 2)-pooled conv block below.
        # NOTE(review): assumes each such block halves the time axis - confirm.
        interpolate_ratio = 32

        # Insert a singleton channel axis:
        # (batch_size, 1, times_steps, freq_bins)
        x = input[:, None, :, :]

        x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg')
        x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg')
        x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg')
        x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg')
        x = self.conv_block5(x, pool_size=(2, 2), pool_type='avg')
        # Time-frequency maps:
        # (batch_size, channels_num, times_steps, freq_bins)
        tf_maps = self.conv_block6(x, pool_size=(1, 1), pool_type='avg')

        # Max over the last (frequency) axis:
        # (batch_size, feature_maps, frames_num)
        (framewise_vector, _) = torch.max(tf_maps, dim=3)

        output_dict = {}

        # Framewise prediction: (batch_size, frames_num, classes_num).
        # Computed exactly once; an earlier revision ran this identical
        # fc + sigmoid + interpolate pipeline twice and overwrote the result.
        framewise_output = torch.sigmoid(
            self.fc(framewise_vector.transpose(1, 2)))
        framewise_output = interpolate(framewise_output, interpolate_ratio)
        output_dict['framewise_output'] = framewise_output

        # Clipwise prediction
        if self.strong_target_training:
            # Obtained by taking the maximum framewise predictions
            (output_dict['clipwise_output'], _) = torch.max(framewise_output,
                                                            dim=1)
        else:
            # Obtained by applying fc layer on aggregated framewise_vector
            (aggregation, _) = torch.max(framewise_vector, dim=2)
            output_dict['clipwise_output'] = torch.sigmoid(
                self.fc(aggregation))

        return output_dict
Example #3
0
    def forward(self, input):
        """Predict framewise event activity, elevation and azimuth.

        Args:
            input: tensor laid out as
                (channels_num, batch_size, times_steps, freq_bins).

        Returns:
            dict with keys 'event', 'elevation' and 'azimuth'. Each value is
            the matching linear head applied per time step and then passed
            through ``interpolate``. Only the 'event' head is squashed with
            a sigmoid.
        """
        # 2 ** 3: one factor of two per (2, 2) average pooling below.
        # NOTE(review): assumes `interpolate` stretches the time axis back by
        # this ratio - confirm against its definition.
        interpolate_ratio = 8

        # Swap the two leading axes:
        # (batch_size, channels_num, times_steps, freq_bins)
        x = input.transpose(0, 1)

        # (convolution, batch norm, pooling kernel) for each stage, in order.
        stages = (
            (self.conv1, self.bn1, (2, 2)),
            (self.conv2, self.bn2, (2, 2)),
            (self.conv3, self.bn3, (2, 2)),
            (self.conv4, self.bn4, (1, 1)),
        )
        for conv, bn, kernel in stages:
            x = F.avg_pool2d(F.relu_(bn(conv(x))), kernel_size=kernel)
        # (batch_size, feature_maps, time_steps, freq_bins)

        # Average away the last (frequency) axis, then move time in front of
        # the feature maps: (batch_size, time_steps, feature_maps)
        x = torch.mean(x, dim=3).transpose(1, 2)

        # Per-time-step heads: (batch_size, time_steps, classes_num)
        event = torch.sigmoid(self.event_fc(x))
        elevation = self.elevation_fc(x)
        azimuth = self.azimuth_fc(x)

        return {
            'event': interpolate(event, interpolate_ratio),
            'elevation': interpolate(elevation, interpolate_ratio),
            'azimuth': interpolate(azimuth, interpolate_ratio),
        }
Example #4
0
    def forward(self, input):
        """Compute framewise and clipwise predictions for a batch.

        Args:
            input: tensor of shape (batch_size, times_steps, freq_bins).

        Returns:
            dict with two entries:
              'framewise_output': sigmoid of ``self.fc`` applied to every
                  frame vector, then passed through ``interpolate`` with
                  ratio 8.
              'clipwise_output': when ``self.strong_target_training`` is
                  truthy, the maximum of 'framewise_output' over dim 1;
                  otherwise the sigmoid of ``self.fc`` applied to the
                  maximum of the frame vectors over the frame axis.
        """
        # 2 ** 3: one factor of two per (2, 2) average pooling below.
        # NOTE(review): assumes `interpolate` stretches the time axis back by
        # this ratio - confirm against its definition.
        interpolate_ratio = 8

        # Insert a singleton channel axis:
        # (batch_size, 1, times_steps, freq_bins)
        tf_maps = input[:, None, :, :]

        # (convolution, batch norm, pooling kernel) for each stage, in order.
        stages = (
            (self.conv1, self.bn1, (2, 2)),
            (self.conv2, self.bn2, (2, 2)),
            (self.conv3, self.bn3, (2, 2)),
            (self.conv4, self.bn4, (1, 1)),
        )
        for conv, bn, kernel in stages:
            tf_maps = F.avg_pool2d(
                F.relu_(bn(conv(tf_maps))), kernel_size=kernel)
        # Time-frequency maps:
        # (batch_size, channels_num, times_steps, freq_bins)

        # Average over the last (frequency) axis:
        # (batch_size, feature_maps, frames_num)
        framewise_vector = torch.mean(tf_maps, dim=3)

        # Framewise prediction: (batch_size, frames_num, classes_num)
        per_frame = torch.sigmoid(self.fc(framewise_vector.transpose(1, 2)))
        framewise_output = interpolate(per_frame, interpolate_ratio)

        # Clipwise prediction
        if self.strong_target_training:
            # Strongest framewise prediction per class.
            clipwise_output = torch.max(framewise_output, dim=1)[0]
        else:
            # Max-aggregate the frame vectors first, then classify once.
            aggregation = torch.max(framewise_vector, dim=2)[0]
            clipwise_output = torch.sigmoid(self.fc(aggregation))

        output_dict = {
            'framewise_output': framewise_output,
            'clipwise_output': clipwise_output,
        }
        return output_dict
    def forward_orig(self, x, mixup_lambda=None):
        """Return framewise predictions for a batch.

        Args:
            x: input handed straight to ``self.spectrogram_extractor``.
                NOTE(review): presumably a batch of waveforms - confirm.
            mixup_lambda: optional mixup coefficients. Used only when the
                module is in training mode; ``None`` disables mixup.

        Returns:
            The per-segment sigmoid outputs of ``self.fc_audioset`` after
            ``interpolate`` with ``self.interpolate_ratio``.
        """
        x = self.spectrogram_extractor(x)   # (batch_size, 1, time_steps, freq_bins)
        x = self.logmel_extractor(x)    # (batch_size, 1, time_steps, mel_bins)

        # bn0 normalises along axis 1, so temporarily swap the mel-bin axis
        # (3) into that position and swap it back afterwards.
        x = x.transpose(1, 3)
        x = self.bn0(x)
        x = x.transpose(1, 3)

        if self.training:
            x = self.spec_augmenter(x)

        # Mixup on spectrogram
        if self.training and mixup_lambda is not None:
            x = do_mixup(x, mixup_lambda)

        x = self.features(x)

        # Collapse the last (frequency) axis.
        x = torch.mean(x, dim=3)

        # Sum of max- and average-pooling over a 3-wide window; stride 1 with
        # padding 1 keeps the number of time steps unchanged.
        x1 = F.max_pool1d(x, kernel_size=3, stride=1, padding=1)
        x2 = F.avg_pool1d(x, kernel_size=3, stride=1, padding=1)
        x = x1 + x2
        x = F.dropout(x, p=0.5, training=self.training)
        x = x.transpose(1, 2)
        x = F.relu_(self.fc1(x))
        x = F.dropout(x, p=0.5, training=self.training)
        segmentwise_output = torch.sigmoid(self.fc_audioset(x))
        # The clipwise maximum over segments used to be computed here but was
        # never used or returned, so that dead computation has been dropped.

        # Get framewise output
        framewise_output = interpolate(segmentwise_output, self.interpolate_ratio)
        # NOTE: padding back to the input frame count
        # (pad_framewise_output(framewise_output, frames_num)) is temporarily
        # disabled.

        return framewise_output
Example #6
0
    def forward_test(self, x):
        """Experimental forward pass that returns the post-``fc1`` features.

        Differs from the regular forward pass visible in this file in that
        there is no spec augmentation or mixup, and the 1-D poolings run
        without padding.

        Args:
            x: input handed straight to ``self.spectrogram_extractor``.
                NOTE(review): presumably a batch of waveforms - confirm.

        Returns:
            The tensor produced by ``fc1`` + ReLU + dropout, NOT the
            classifier output.
            NOTE(review): the segmentwise / clipwise / framewise outputs are
            still computed below but discarded - confirm that returning the
            intermediate features is intentional.
        """
        spectrogram = self.spectrogram_extractor(x)  # (batch_size, 1, time_steps, freq_bins)
        logmel = self.logmel_extractor(spectrogram)  # (batch_size, 1, time_steps, mel_bins)

        # bn0 normalises along axis 1, so swap the mel-bin axis (3) into that
        # position for the call and swap it back afterwards.
        normed = self.bn0(logmel.transpose(1, 3)).transpose(1, 3)

        # Collapse the last (frequency) axis of the feature maps.
        feats = torch.mean(self.features(normed), dim=3)

        # Unpadded 3-wide poolings: each output has two fewer time steps than
        # `feats` (padding=1 is deliberately left out here).
        pooled_max = F.max_pool1d(feats, kernel_size=3, stride=1)
        pooled_avg = F.avg_pool1d(feats, kernel_size=3, stride=1)
        pooled = F.dropout(pooled_max + pooled_avg, p=0.5,
                           training=self.training)

        x = F.relu_(self.fc1(pooled.transpose(1, 2)))
        x = F.dropout(x, p=0.5, training=self.training)

        # Classifier head - evaluated, but its results are not returned.
        segmentwise_output = torch.sigmoid(self.fc_audioset(x))
        clipwise_output = torch.max(segmentwise_output, dim=1)[0]

        # Get framewise output
        framewise_output = interpolate(segmentwise_output,
                                       self.interpolate_ratio)

        return x
Example #7
0
    def forward(self, x, mixup_lambda=None):
        """Compute clipwise and framewise predictions plus the log-mel input.

        Args:
            x: input handed straight to ``self.spectrogram_extractor``.
                NOTE(review): presumably a batch of waveforms - confirm.
            mixup_lambda: optional mixup coefficients. Used only when the
                module is in training mode; ``None`` disables mixup.

        Returns:
            Tuple ``(clipwise_output, framewise_output, melspec)``:
              clipwise_output: maximum of the per-segment sigmoid outputs
                  over dim 1.
              framewise_output: the per-segment outputs after
                  ``interpolate`` with ``self.interpolate_ratio``.
              melspec: the output of ``self.logmel_extractor``, taken before
                  batch norm and augmentation.
        """
        spectrogram = self.spectrogram_extractor(x)  # (batch_size, 1, time_steps, freq_bins)
        melspec = self.logmel_extractor(spectrogram)  # (batch_size, 1, time_steps, mel_bins)

        # bn0 normalises along axis 1, so swap the mel-bin axis (3) into that
        # position for the call and swap it back afterwards.
        x = self.bn0(melspec.transpose(1, 3)).transpose(1, 3)

        if self.training:
            x = self.spec_augmenter(x)
            # Mixup on spectrogram
            if mixup_lambda is not None:
                x = do_mixup(x, mixup_lambda)

        # Collapse the last (frequency) axis of the feature maps.
        x = torch.mean(self.features(x), dim=3)

        # Sum of max- and average-pooling over a 3-wide window; stride 1 with
        # padding 1 keeps the number of time steps unchanged.
        pooled_max = F.max_pool1d(x, kernel_size=3, stride=1, padding=1)
        pooled_avg = F.avg_pool1d(x, kernel_size=3, stride=1, padding=1)
        x = F.dropout(pooled_max + pooled_avg, p=0.5, training=self.training)

        x = F.relu_(self.fc1(x.transpose(1, 2)))
        x = F.dropout(x, p=0.5, training=self.training)

        segmentwise_output = torch.sigmoid(self.fc_audioset(x))
        clipwise_output = torch.max(segmentwise_output, dim=1)[0]

        # Get framewise output
        framewise_output = interpolate(segmentwise_output,
                                       self.interpolate_ratio)
        # NOTE: padding back to the input frame count
        # (pad_framewise_output(framewise_output, frames_num)) is temporarily
        # disabled.

        return clipwise_output, framewise_output, melspec