def __init__(self, args, adj, inp, pretrained=True, model_pth=None, fixed=True):
    """Assemble the CNN backbone, optionally load/freeze it, then add the GCN.

    Args:
        args: Not used inside this constructor.
        adj: Forwarded to ``GCNnet`` as ``adj_file``.
        inp: Path of a pickle file; the unpickled object is stored on
            ``self.inp``.
        pretrained: When it compares equal to ``True``,
            ``self.init_model(model_pth=model_pth)`` is called.
        model_pth: Forwarded to ``self.init_model``.
        fixed: When it compares equal to ``True``, every parameter that is
            registered at that point (i.e. before the GCN is created) gets
            ``requires_grad = False``.
    """
    super(gcnNet, self).__init__()

    augment_options = dict(
        time_drop_width=64,
        time_stripes_num=2,
        freq_drop_width=8,
        freq_stripes_num=2,
    )
    self.spec_augmenter = SpecAugmentation(**augment_options)

    self.bn0 = nn.BatchNorm2d(64)

    # Channel widths of the conv stack: 1 -> 64 -> 128 -> 256 -> 512.
    self.conv_block1 = ConvBlock(1, 64)
    self.conv_block2 = ConvBlock(64, 128)
    self.conv_block3 = ConvBlock(128, 256)
    self.conv_block4 = ConvBlock(256, 512)

    self.fc1 = nn.Linear(512, 512, bias=True)

    # The explicit ``== True`` comparisons are deliberate: they keep the
    # exact acceptance rule of the flags (only values equal to True pass),
    # which plain truthiness would widen.
    if pretrained == True:  # noqa: E712
        self.init_model(model_pth=model_pth)

    if fixed == True:  # noqa: E712
        # Runs before ``self.gcn`` exists, so the GCN's own parameters are
        # not touched by this loop.
        for param in self.parameters():
            param.requires_grad = False

    self.gcn = GCNnet(num_classes=527, in_channel=300, t=0.3, adj_file=adj)

    # NOTE(review): pickle.load can execute arbitrary code contained in the
    # file -- only pass paths to trusted files here.
    with open(inp, 'rb') as handle:
        self.inp = pickle.load(handle)
def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin, fmax, classes_num):
    """Build the log-mel front end, the CNN backbone and the attention head.

    Args:
        sample_rate: Passed to ``LogmelFilterBank`` as ``sr``.
        window_size: Used as ``n_fft`` and ``win_length`` of the spectrogram
            extractor and as ``n_fft`` of the log-mel filter bank.
        hop_size: Passed to ``Spectrogram`` as ``hop_length``.
        mel_bins: Passed to ``LogmelFilterBank`` as ``n_mels``.
        fmin: Passed to ``LogmelFilterBank`` as ``fmin``.
        fmax: Passed to ``LogmelFilterBank`` as ``fmax``.
        classes_num: Number of outputs (``n_out``) of the attention block.
    """
    super(Cnn_9layers_FrameAtt, self).__init__()

    window = 'hann'
    center = True
    pad_mode = 'reflect'
    ref = 1.0
    amin = 1e-10
    top_db = None

    # Spectrogram extractor
    self.spectrogram_extractor = Spectrogram(
        n_fft=window_size,
        hop_length=hop_size,
        win_length=window_size,
        window=window,
        center=center,
        pad_mode=pad_mode,
        freeze_parameters=True,
    )

    # Logmel feature extractor
    self.logmel_extractor = LogmelFilterBank(
        sr=sample_rate,
        n_fft=window_size,
        n_mels=mel_bins,
        fmin=fmin,
        fmax=fmax,
        ref=ref,
        amin=amin,
        top_db=top_db,
        freeze_parameters=True,
    )

    # Spec augmenter
    self.spec_augmenter = SpecAugmentation(
        time_drop_width=64,
        time_stripes_num=2,
        freq_drop_width=8,
        freq_stripes_num=2,
    )

    # NOTE(review): 64 is hard-coded here; presumably it has to agree with
    # mel_bins -- confirm against forward().
    self.bn0 = nn.BatchNorm2d(64)

    self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
    self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
    self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
    self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)

    # The head is sized from classes_num. It used to be fixed at 17 outputs
    # no matter what the caller requested; callers passing classes_num=17
    # get exactly the same layer as before.
    self.att_block = AttBlock(n_in=512, n_out=classes_num, activation='sigmoid')

    self.init_weights()
def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin, fmax, classes_num):
    """Build the log-mel front end, CNN backbone, GRU and output layer.

    Args:
        sample_rate: Passed to ``LogmelFilterBank`` as ``sr``.
        window_size: Used as ``n_fft`` and ``win_length`` of the spectrogram
            extractor and as ``n_fft`` of the log-mel filter bank.
        hop_size: Passed to ``Spectrogram`` as ``hop_length``.
        mel_bins: Passed to ``LogmelFilterBank`` as ``n_mels``.
        fmin: Passed to ``LogmelFilterBank`` as ``fmin``.
        fmax: Passed to ``LogmelFilterBank`` as ``fmax``.
        classes_num: Output size of the final linear layer ``fc``.
    """
    super(Cnn_9layers_Gru_FrameAvg, self).__init__()

    # STFT front end: Hann window, centred frames, reflect padding.
    self.spectrogram_extractor = Spectrogram(
        n_fft=window_size,
        hop_length=hop_size,
        win_length=window_size,
        window='hann',
        center=True,
        pad_mode='reflect',
        freeze_parameters=True,
    )

    # Log-mel filter bank on top of the spectrogram.
    self.logmel_extractor = LogmelFilterBank(
        sr=sample_rate,
        n_fft=window_size,
        n_mels=mel_bins,
        fmin=fmin,
        fmax=fmax,
        ref=1.0,
        amin=1e-10,
        top_db=None,
        freeze_parameters=True,
    )

    # Time/frequency masking module.
    self.spec_augmenter = SpecAugmentation(
        time_drop_width=64,
        time_stripes_num=2,
        freq_drop_width=8,
        freq_stripes_num=2,
    )

    # NOTE(review): 64 is hard-coded here; presumably it has to agree with
    # mel_bins -- confirm against forward().
    self.bn0 = nn.BatchNorm2d(64)

    # Channel widths of the conv stack: 1 -> 64 -> 128 -> 256 -> 512.
    self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
    self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
    self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
    self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)

    # Bidirectional with hidden_size=256, so the GRU emits 2 * 256 = 512
    # features per step, which is the input width of ``fc`` below.
    self.gru = nn.GRU(
        input_size=512,
        hidden_size=256,
        num_layers=1,
        bias=True,
        batch_first=True,
        bidirectional=True,
    )

    self.fc = nn.Linear(512, classes_num, bias=True)

    self.init_weights()
def __init__(self, args):
    """Create the CNN10 layers and run the weight initialisation.

    Args:
        args: Not used inside this constructor.
    """
    super(CNN10, self).__init__()

    self.spec_augmenter = SpecAugmentation(
        time_drop_width=64,
        time_stripes_num=2,
        freq_drop_width=8,
        freq_stripes_num=2,
    )
    self.bn0 = nn.BatchNorm2d(64)

    # Registers conv_block1 .. conv_block4, in that order, with channel
    # widths 1 -> 64 -> 128 -> 256 -> 512.
    channel_plan = (1, 64, 128, 256, 512)
    stages = zip(channel_plan, channel_plan[1:])
    for block_no, (n_in, n_out) in enumerate(stages, start=1):
        setattr(self, 'conv_block%d' % block_no, ConvBlock(n_in, n_out))

    self.fc1 = nn.Linear(512, 512, bias=True)
    self.fc_audioset = nn.Linear(512, 527, bias=True)

    self.init_weight()
def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin, fmax, classes_num):
    """Build the log-mel front end, CNN backbone, multi-head block and output layer.

    Args:
        sample_rate: Passed to ``LogmelFilterBank`` as ``sr``.
        window_size: Used as ``n_fft`` and ``win_length`` of the spectrogram
            extractor and as ``n_fft`` of the log-mel filter bank.
        hop_size: Passed to ``Spectrogram`` as ``hop_length``.
        mel_bins: Passed to ``LogmelFilterBank`` as ``n_mels``.
        fmin: Passed to ``LogmelFilterBank`` as ``fmin``.
        fmax: Passed to ``LogmelFilterBank`` as ``fmax``.
        classes_num: Output size of the final linear layer ``fc``.
    """
    super(Cnn_9layers_Transformer_FrameAvg, self).__init__()

    # STFT front end: Hann window, centred frames, reflect padding.
    self.spectrogram_extractor = Spectrogram(
        n_fft=window_size,
        hop_length=hop_size,
        win_length=window_size,
        window='hann',
        center=True,
        pad_mode='reflect',
        freeze_parameters=True,
    )

    # Log-mel filter bank on top of the spectrogram.
    self.logmel_extractor = LogmelFilterBank(
        sr=sample_rate,
        n_fft=window_size,
        n_mels=mel_bins,
        fmin=fmin,
        fmax=fmax,
        ref=1.0,
        amin=1e-10,
        top_db=None,
        freeze_parameters=True,
    )

    # Time/frequency masking module.
    self.spec_augmenter = SpecAugmentation(
        time_drop_width=64,
        time_stripes_num=2,
        freq_drop_width=8,
        freq_stripes_num=2,
    )

    # NOTE(review): 64 is hard-coded here; presumably it has to agree with
    # mel_bins -- confirm against forward().
    self.bn0 = nn.BatchNorm2d(64)

    # Channel widths of the conv stack: 1 -> 64 -> 128 -> 256 -> 512.
    self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
    self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
    self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
    self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)

    # Hyper-parameters of the multi-head block, passed positionally in
    # this order.
    n_head, n_hid, d_k, d_v, dropout = 8, 512, 64, 64, 0.2
    self.multihead = MultiHead(n_head, n_hid, d_k, d_v, dropout)

    self.fc = nn.Linear(512, classes_num, bias=True)

    self.init_weights()
def __init__(self, classes_num, time_steps, freq_bins, spec_aug=False):
    """Create the Cnn6 layers and run the weight initialisation.

    Args:
        classes_num: Output size of the final linear layer ``fc_out``.
        time_steps: Not used inside this constructor.
        freq_bins: Not used inside this constructor.
        spec_aug: Stored on ``self.spec_aug``; ``self.spec_augmenter`` is
            only created when this is truthy.
    """
    super(Cnn6, self).__init__()

    self.spec_aug = spec_aug
    if spec_aug:
        # The augmenter attribute exists only when augmentation is enabled.
        augment_options = dict(
            time_drop_width=64,
            time_stripes_num=2,
            freq_drop_width=8,
            freq_stripes_num=2,
        )
        self.spec_augmenter = SpecAugmentation(**augment_options)

    # Registers conv_block1 .. conv_block4, in that order, with channel
    # widths 1 -> 64 -> 128 -> 256 -> 512.
    channel_plan = (1, 64, 128, 256, 512)
    stages = zip(channel_plan, channel_plan[1:])
    for block_no, (n_in, n_out) in enumerate(stages, start=1):
        block = ConvBlock5x5(in_channels=n_in, out_channels=n_out)
        setattr(self, 'conv_block%d' % block_no, block)

    self.fc1 = nn.Linear(512, 512, bias=True)
    self.fc_out = nn.Linear(512, classes_num, bias=True)

    self.init_weight()
def __init__(self, input_dim, hidden_dim, value_size=128, key_size=128,
             pBLSTM_time_reductions=(2,), use_spec_augment=False,
             use_conv_blocks_in_encoder=False):
    """Build the encoder: optional conv front end, BLSTM, pBLSTM stack, key/value heads.

    Args:
        input_dim: ``input_size`` of the first (bidirectional) LSTM.
        hidden_dim: Hidden size of the LSTM and of every pBLSTM layer.
        value_size: Output size of ``value_network``.
        key_size: Output size of ``key_network``.
        pBLSTM_time_reductions: Sequence with one time-reduction factor per
            pBLSTM layer. ``pblstm1`` is always built from element 0;
            ``pblstm2`` and ``pblstm3`` are built only when the sequence
            has exactly 2 or 3 elements. The default is a tuple rather
            than a list so that no mutable object is shared between calls.
        use_spec_augment: Stored on ``self.use_spec_augment``. The
            augmenter module itself is created unconditionally.
        use_conv_blocks_in_encoder: When truthy, ``self.conv_block`` is
            created.

    Raises:
        IndexError: If ``pBLSTM_time_reductions`` is empty.
    """
    super(Encoder, self).__init__()

    dropout_p = 0.1

    self.use_dropout = True
    self.use_spec_augment = use_spec_augment
    print("Encoder, using spec augment:", use_spec_augment)

    self.dropout_layer = nn.Dropout(p=dropout_p)
    print("Encoder, using dropout:", self.use_dropout, str(dropout_p))

    self.spec_augmenter = SpecAugmentation(
        time_drop_width=64,
        time_stripes_num=2,
        freq_drop_width=4,
        freq_stripes_num=2,
    )

    self.use_conv_blocks_in_encoder = use_conv_blocks_in_encoder
    if self.use_conv_blocks_in_encoder:
        # Two conv -> batch-norm -> ReLU -> 2x2 max-pool stages.
        self.conv_block = nn.Sequential(
            nn.Conv2d(1, 64, 3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(64, 64, 3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
        )

    # Bidirectional, so this LSTM emits 2 * hidden_dim features per step.
    self.lstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim,
                        num_layers=1, bidirectional=True)

    nb_pBLSTM_layers = len(pBLSTM_time_reductions)
    self.nb_pBLSTM_layers = nb_pBLSTM_layers

    # Each pBLSTM takes hidden_dim * 2 * factor input features --
    # presumably `factor` consecutive bidirectional frames concatenated;
    # confirm against the pBLSTM implementation.
    reduction_time_factor = pBLSTM_time_reductions[0]
    self.pblstm1 = pBLSTM(input_dim=hidden_dim * 2 * reduction_time_factor,
                          hidden_dim=hidden_dim,
                          reduction_time_factor=reduction_time_factor)

    # Only depths 1-3 are announced; any other depth prints nothing and
    # keeps just pblstm1.
    banners = {
        1: "Encoder has one pBLSTM layers",
        2: "Encoder has two pBLSTM layers",
        3: "Encoder has three pBLSTM layers",
    }
    if nb_pBLSTM_layers in banners:
        print(banners[nb_pBLSTM_layers])
        print(" hidden_dim", hidden_dim)

    if nb_pBLSTM_layers in (2, 3):
        reduction_time_factor = pBLSTM_time_reductions[1]
        self.pblstm2 = pBLSTM(input_dim=hidden_dim * 2 * reduction_time_factor,
                              hidden_dim=hidden_dim,
                              reduction_time_factor=reduction_time_factor)
        print(" hidden_dim", hidden_dim)

    if nb_pBLSTM_layers == 3:
        reduction_time_factor = pBLSTM_time_reductions[2]
        self.pblstm3 = pBLSTM(input_dim=hidden_dim * 2 * reduction_time_factor,
                              hidden_dim=hidden_dim,
                              reduction_time_factor=reduction_time_factor)

    # Keys are projected to key_size and values to value_size. The two
    # sizes used to be swapped between these layers; with equal sizes
    # (as in the defaults) the resulting layers are the same as before.
    self.key_network = nn.Linear(hidden_dim * 2, key_size)
    self.value_network = nn.Linear(hidden_dim * 2, value_size)