Example #1
    def __init__(self, args, adj, inp, pretrained=True, model_pth=None, fixed=True):
        super(gcnNet, self).__init__()
        
        self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2, 
            freq_drop_width=8, freq_stripes_num=2)
        self.bn0 = nn.BatchNorm2d(64)
        self.conv_block1 = ConvBlock(1, 64)
        self.conv_block2 = ConvBlock(64, 128)
        self.conv_block3 = ConvBlock(128, 256)
        self.conv_block4 = ConvBlock(256, 512)
        self.fc1 = nn.Linear(512, 512, bias=True)
#         self.fc_audioset = nn.Linear(512, 527, bias=True)
        
        if pretrained:
            self.init_model(model_pth=model_pth)

        if fixed:
            # freeze the pretrained CNN backbone
            for p in self.parameters():
                p.requires_grad = False
        
        # graph-convolution head over the label graph (adjacency built from adj_file)
        self.gcn = GCNnet(num_classes=527, in_channel=300, t=0.3, adj_file=adj)
#         self.alpha = nn.Parameter(torch.cuda.FloatTensor([.5, .5]))

        # node inputs for the GCN (e.g. 300-d label word embeddings), loaded from a pickle file
        with open(inp, 'rb') as f:
            self.inp = pickle.load(f)
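
All seven examples configure the same SpecAugmentation module, nearly always with identical time/frequency stripe settings. As a standalone reference, here is a minimal sketch of what the module does (the torchlibrosa import path and the input shape are assumptions about how the module is typically packaged):

import torch
from torchlibrosa.augmentation import SpecAugmentation   # assumed import path

spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2,
                                  freq_drop_width=8, freq_stripes_num=2)

x = torch.randn(4, 1, 1001, 64)   # (batch, channels, time_steps, mel_bins)
spec_augmenter.train()            # stripes are only dropped in training mode
x_aug = spec_augmenter(x)         # same shape, with random time/frequency stripes zeroed out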
Example #2
    def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin, 
        fmax, classes_num):
        
        super(Cnn_9layers_FrameAtt, self).__init__()

        window = 'hann'
        center = True
        pad_mode = 'reflect'
        ref = 1.0
        amin = 1e-10
        top_db = None

        # Spectrogram extractor
        self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size, 
            win_length=window_size, window=window, center=center, pad_mode=pad_mode, 
            freeze_parameters=True)

        # Logmel feature extractor
        self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size, 
            n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db, 
            freeze_parameters=True)

        # Spec augmenter
        self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2, 
            freq_drop_width=8, freq_stripes_num=2)

        self.bn0 = nn.BatchNorm2d(64)

        self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
        self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
        self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
        self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
        self.att_block = AttBlock(n_in=512, n_out=17, activation='sigmoid')

        self.init_weights()
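
The listing above stops at the constructor. In CNNs of this shape the forward pass usually chains the two extractors, applies SpecAugment only in training mode, pools the conv output over frequency, and feeds the attention block; the sketch below is a reconstruction under those assumptions (the ConvBlock pool_size argument, the pooling sizes, and the AttBlock return signature are not taken from the original code):

    def forward(self, input):
        x = self.spectrogram_extractor(input)       # (batch, 1, time_steps, freq_bins)
        x = self.logmel_extractor(x)                # (batch, 1, time_steps, mel_bins)

        x = x.transpose(1, 3)
        x = self.bn0(x)                             # batch-norm over the mel axis
        x = x.transpose(1, 3)

        if self.training:
            x = self.spec_augmenter(x)              # SpecAugment only while training

        x = self.conv_block1(x, pool_size=(2, 2))
        x = self.conv_block2(x, pool_size=(2, 2))
        x = self.conv_block3(x, pool_size=(2, 2))
        x = self.conv_block4(x, pool_size=(1, 2))   # keep time resolution for frame-level output
        x = torch.mean(x, dim=3)                    # average over frequency -> (batch, 512, time_steps)

        clipwise_output, _, framewise_output = self.att_block(x)
        return clipwise_output, framewise_output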
Example #3
    def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
                 fmax, classes_num):

        super(Cnn_9layers_Gru_FrameAvg, self).__init__()

        window = 'hann'
        center = True
        pad_mode = 'reflect'
        ref = 1.0
        amin = 1e-10
        top_db = None

        # Spectrogram extractor
        self.spectrogram_extractor = Spectrogram(n_fft=window_size,
                                                 hop_length=hop_size,
                                                 win_length=window_size,
                                                 window=window,
                                                 center=center,
                                                 pad_mode=pad_mode,
                                                 freeze_parameters=True)

        # Logmel feature extractor
        self.logmel_extractor = LogmelFilterBank(sr=sample_rate,
                                                 n_fft=window_size,
                                                 n_mels=mel_bins,
                                                 fmin=fmin,
                                                 fmax=fmax,
                                                 ref=ref,
                                                 amin=amin,
                                                 top_db=top_db,
                                                 freeze_parameters=True)

        # Spec augmenter
        self.spec_augmenter = SpecAugmentation(time_drop_width=64,
                                               time_stripes_num=2,
                                               freq_drop_width=8,
                                               freq_stripes_num=2)

        self.bn0 = nn.BatchNorm2d(64)

        self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
        self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
        self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
        self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)

        self.gru = nn.GRU(input_size=512,
                          hidden_size=256,
                          num_layers=1,
                          bias=True,
                          batch_first=True,
                          bidirectional=True)

        self.fc = nn.Linear(512, classes_num, bias=True)

        self.init_weights()
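
Because the GRU is built with batch_first=True, the conv features have to be handed over as (batch, time, features), and the bidirectional output (2 x 256 = 512) matches the final fc layer. A minimal, self-contained shape check (the tensor sizes are illustrative only):

import torch
import torch.nn as nn

gru = nn.GRU(input_size=512, hidden_size=256, num_layers=1,
             bias=True, batch_first=True, bidirectional=True)
fc = nn.Linear(512, 17, bias=True)        # 17 classes chosen arbitrarily for the sketch

conv_features = torch.randn(4, 125, 512)  # (batch, time_steps, channels) after the CNN blocks
rnn_out, _ = gru(conv_features)           # (batch, time_steps, 2 * 256)
framewise = torch.sigmoid(fc(rnn_out))    # frame-level class probabilities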
Example #4
    def __init__(self, args):
        super(CNN10, self).__init__()

        self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2,
            freq_drop_width=8, freq_stripes_num=2)
        self.bn0 = nn.BatchNorm2d(64)
        self.conv_block1 = ConvBlock(1, 64)
        self.conv_block2 = ConvBlock(64, 128)
        self.conv_block3 = ConvBlock(128, 256)
        self.conv_block4 = ConvBlock(256, 512)

        self.fc1 = nn.Linear(512, 512, bias=True)
        self.fc_audioset = nn.Linear(512, 527, bias=True)

        self.init_weight()
Example #5
    def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin, 
        fmax, classes_num):
        
        super(Cnn_9layers_Transformer_FrameAvg, self).__init__()

        window = 'hann'
        center = True
        pad_mode = 'reflect'
        ref = 1.0
        amin = 1e-10
        top_db = None

        # Spectrogram extractor
        self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size, 
            win_length=window_size, window=window, center=center, pad_mode=pad_mode, 
            freeze_parameters=True)

        # Logmel feature extractor
        self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size, 
            n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db, 
            freeze_parameters=True)

        # Spec augmenter
        self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2, 
            freq_drop_width=8, freq_stripes_num=2)

        self.bn0 = nn.BatchNorm2d(64)

        self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
        self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
        self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
        self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)

        # multi-head self-attention over the frame-level CNN features
        n_head = 8
        n_hid = 512
        d_k = 64
        d_v = 64
        dropout = 0.2
        self.multihead = MultiHead(n_head, n_hid, d_k, d_v, dropout)

        self.fc = nn.Linear(512, classes_num, bias=True)

        self.init_weights()
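
The project's MultiHead module is not shown here. For orientation only, roughly the same wiring with PyTorch's built-in nn.MultiheadAttention (a stand-in, not the original layer) would be:

import torch
import torch.nn as nn

mha = nn.MultiheadAttention(embed_dim=512, num_heads=8, dropout=0.2, batch_first=True)
fc = nn.Linear(512, 17, bias=True)        # classes_num chosen arbitrarily

frames = torch.randn(4, 125, 512)         # (batch, time_steps, channels) from the CNN blocks
attended, _ = mha(frames, frames, frames) # self-attention across time
framewise = torch.sigmoid(fc(attended))   # frame-level class probabilities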
Example #6
    def __init__(self, classes_num, time_steps, freq_bins, spec_aug=False):

        super(Cnn6, self).__init__()

        # Spec augmenter
        self.spec_aug = spec_aug

        if self.spec_aug:
            self.spec_augmenter = SpecAugmentation(
                time_drop_width=64,
                time_stripes_num=2,
                freq_drop_width=8,
                freq_stripes_num=2,
            )

        self.conv_block1 = ConvBlock5x5(in_channels=1, out_channels=64)
        self.conv_block2 = ConvBlock5x5(in_channels=64, out_channels=128)
        self.conv_block3 = ConvBlock5x5(in_channels=128, out_channels=256)
        self.conv_block4 = ConvBlock5x5(in_channels=256, out_channels=512)

        self.fc1 = nn.Linear(512, 512, bias=True)
        self.fc_out = nn.Linear(512, classes_num, bias=True)

        self.init_weight()
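
Unlike the other examples, this constructor takes pre-computed features and makes SpecAugment optional via the spec_aug flag. A hedged usage sketch (Cnn6 and its ConvBlock5x5 dependency come from the surrounding project; the input layout is an assumption):

import torch

model = Cnn6(classes_num=10, time_steps=1001, freq_bins=64, spec_aug=True)
model.train()                                # SpecAugment is typically applied only in training

logmel_batch = torch.randn(8, 1, 1001, 64)   # assumed (batch, 1, time_steps, freq_bins) layout
output = model(logmel_batch)                 # forward() is not shown in the snippet above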
Example #7
    def __init__(self,
                 input_dim,
                 hidden_dim,
                 value_size=128,
                 key_size=128,
                 pBLSTM_time_reductions=[2],
                 use_spec_augment=False,
                 use_conv_blocks_in_encoder=False):
        super(Encoder, self).__init__()

        self.use_dropout = True

        self.use_spec_augment = use_spec_augment
        print("Encoder, using spec augment:", use_spec_augment)

        self.dropout_layer = nn.Dropout(p=0.1)
        print("Encoder, using dropout:", self.use_dropout, str(0.1))

        self.spec_augmenter = SpecAugmentation(time_drop_width=64,
                                               time_stripes_num=2,
                                               freq_drop_width=4,
                                               freq_stripes_num=2)

        self.use_conv_blocks_in_encoder = use_conv_blocks_in_encoder

        if self.use_conv_blocks_in_encoder:

            self.conv_block = nn.Sequential(
                nn.Conv2d(1, 64, 3, padding=1),
                nn.BatchNorm2d(64),
                nn.ReLU(),
                nn.MaxPool2d(2, 2),
                nn.Conv2d(64, 64, 3, padding=1),
                nn.BatchNorm2d(64),
                nn.ReLU(),
                # nn.Conv2d(64, 1, 3, padding=1),
                # nn.ReLU(),
                nn.MaxPool2d(2, 2))

        self.lstm = nn.LSTM(input_size=input_dim,
                            hidden_size=hidden_dim,
                            num_layers=1,
                            bidirectional=True)

        nb_pBLSTM_layers = len(pBLSTM_time_reductions)
        self.nb_pBLSTM_layers = nb_pBLSTM_layers

        reduction_time_factor = pBLSTM_time_reductions[0]
        self.pblstm1 = pBLSTM(input_dim=hidden_dim * 2 * reduction_time_factor,
                              hidden_dim=hidden_dim,
                              reduction_time_factor=reduction_time_factor)

        if nb_pBLSTM_layers == 1:
            print("Encoder has one pBLSTM layer")
            print(" hidden_dim", hidden_dim)
        if nb_pBLSTM_layers == 2:
            print("Encoder has two pBLSTM layers")
            print(" hidden_dim", hidden_dim)
            reduction_time_factor = pBLSTM_time_reductions[1]
            self.pblstm2 = pBLSTM(input_dim=hidden_dim * 2 *
                                  reduction_time_factor,
                                  hidden_dim=hidden_dim,
                                  reduction_time_factor=reduction_time_factor)
            print(" hidden_dim", hidden_dim)

        elif nb_pBLSTM_layers == 3:
            print("Encoder has three pBLSTM layers")
            print(" hidden_dim", hidden_dim)
            reduction_time_factor = pBLSTM_time_reductions[1]
            self.pblstm2 = pBLSTM(input_dim=hidden_dim * 2 *
                                  reduction_time_factor,
                                  hidden_dim=hidden_dim,
                                  reduction_time_factor=reduction_time_factor)
            print(" hidden_dim", hidden_dim)

            reduction_time_factor = pBLSTM_time_reductions[2]
            self.pblstm3 = pBLSTM(input_dim=hidden_dim * 2 *
                                  reduction_time_factor,
                                  hidden_dim=hidden_dim,
                                  reduction_time_factor=reduction_time_factor)

        # project encoder outputs into the attention key / value spaces; note that the
        # output sizes appear swapped (key -> value_size, value -> key_size), which is
        # harmless only because value_size == key_size == 128 by default
        self.key_network = nn.Linear(hidden_dim * 2, value_size)
        self.value_network = nn.Linear(hidden_dim * 2, key_size)
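
The two linear layers at the end map the bidirectional (hidden_dim * 2) encoder states into the key and value spaces consumed by an attention decoder. A shape-level sketch of that last step (the call pattern and tensor sizes are assumptions, not the original forward code):

import torch
import torch.nn as nn

hidden_dim, value_size, key_size = 256, 128, 128
key_network = nn.Linear(hidden_dim * 2, value_size)
value_network = nn.Linear(hidden_dim * 2, key_size)

encoder_outputs = torch.randn(4, 50, hidden_dim * 2)  # (batch, reduced_time, 2 * hidden_dim)
keys = key_network(encoder_outputs)                   # (batch, reduced_time, 128)
values = value_network(encoder_outputs)               # (batch, reduced_time, 128)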