def my_cnn14(n_fft, n_mels, n_classes=100, hop_size=320, fmin=160, fmax=10300):
    """Build a Cnn14 backbone and adapt its front-end and head to a new task.

    The model is first constructed with the original AudioSet configuration,
    then its classifier head, STFT/log-mel front-end and input BatchNorm are
    replaced to match the requested feature layout.

    Args:
        n_fft: FFT size (also used as the STFT window length).
        n_mels: number of mel bins for the replacement front-end.
        n_classes: size of the new classification head.
        hop_size: STFT hop length in samples.
        fmin, fmax: mel filter-bank frequency range in Hz.

    Returns:
        The adapted ``Cnn14`` model.
    """
    # Configuration the Cnn14 backbone is instantiated with; its front-end
    # modules are replaced below, so only the trunk dimensions matter here.
    _model_config = {
        "sample_rate": 32000,
        "window_size": 1024,
        "hop_size": 320,
        "mel_bins": 64,
        "fmin": 50,
        "fmax": 14000,
        "classes_num": 527,
    }
    model = Cnn14(**_model_config)
    # Swap the 527-class AudioSet head for a freshly initialised one.
    model.fc_audioset = nn.Linear(2048, n_classes, bias=True)
    init_layer(model.fc_audioset)
    model.spectrogram_extractor = Spectrogram(n_fft=n_fft, hop_length=hop_size, win_length=n_fft)
    # NOTE(review): the sample rate comes from the global SAMPLE_RATE, not
    # from _model_config["sample_rate"] — confirm the two agree.
    model.logmel_extractor = LogmelFilterBank(
        sr=SAMPLE_RATE,
        n_fft=n_fft,
        n_mels=n_mels,
        fmin=fmin,
        fmax=fmax,
    )
    # The input BatchNorm must match the new number of mel bins.
    model.bn0 = nn.BatchNorm2d(n_mels)
    init_bn(model.bn0)
    return model
def __init__(self, backbone, sample_rate, window_size, hop_size, mel_bins, fmin, fmax, classes_num):
    """Classifier for a new task using pretrained Cnn14 as a sub module.

    Args:
        backbone: feature extractor whose ``classifier`` layer is replaced
            with a fresh ``classes_num``-way linear head.
        sample_rate: audio sample rate in Hz.
        window_size: STFT window / FFT size.
        hop_size: STFT hop length in samples.
        mel_bins: number of mel filter-bank bins.
        fmin, fmax: mel filter-bank frequency range in Hz.
        classes_num: number of output classes for the new head.
    """
    super(AudioClassifierHub, self).__init__()
    window = 'hann'
    center = True
    pad_mode = 'reflect'
    ref = 1.0
    amin = 1e-10
    top_db = None
    # 3-channel input BatchNorm — presumably the spectrogram is stacked
    # into a 3-channel image for the backbone; TODO confirm in forward().
    self.bn = nn.BatchNorm2d(3)
    # Spectrogram extractor (frozen STFT front-end).
    self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size, win_length=window_size, window=window, center=center, pad_mode=pad_mode, freeze_parameters=True)
    # Logmel feature extractor (frozen mel filter bank).
    self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size, n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db, freeze_parameters=True)
    # Spec augmenter for training-time regularisation.
    self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2, freq_drop_width=8, freq_stripes_num=2)
    self.backbone = backbone
    # Replace the backbone's classification head with one sized for the
    # target task.
    in_feat = backbone.classifier.in_features
    self.backbone.classifier = nn.Linear(in_feat, classes_num)
def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin, fmax, classes_num):
    """EfficientNet-B0 encoder with frozen, checkpoint-restored sub-modules.

    ``bn0``, ``fc1`` and the feature extractor ``fe`` are restored from
    checkpoint files and frozen; only the attention block stays trainable.
    """
    super(Encoder_B0_Pretrained, self).__init__()
    window = 'hann'
    center = True
    pad_mode = 'reflect'
    ref = 1.0
    amin = 1e-10
    top_db = None

    # Spectrogram extractor (frozen STFT front-end).
    self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size, win_length=window_size, window=window, center=center, pad_mode=pad_mode, freeze_parameters=True)
    # Logmel feature extractor (frozen mel filter bank).
    self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size, n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db, freeze_parameters=True)
    # Spec augmenter (note the narrower freq_drop_width=4 here).
    self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2, freq_drop_width=4, freq_stripes_num=2)

    # Input BatchNorm restored from checkpoint and frozen.
    # NOTE(review): torch.load without map_location assumes the checkpoint's
    # original device is available — confirm for CPU-only environments.
    self.bn0 = nn.BatchNorm2d(mel_bins)
    self.bn0.load_state_dict(torch.load('pretrained_bn0_b0'))
    for p in self.bn0.parameters():
        p.requires_grad = False

    fe = 1280  # encoder feature width (EfficientNet-B0 — TODO confirm)
    fe_features = 2048
    # Frozen projection from encoder features into the attention space.
    self.fc1 = nn.Linear(fe, fe_features, bias=True)
    self.fc1.load_state_dict(torch.load('pretrained_fc1_b0_fold_0'))
    for p in self.fc1.parameters():
        p.requires_grad = False

    # Trainable attention pooling head — the only trainable module here.
    self.att_block = AttBlock(fe_features, classes_num)

    # self.fe = timm.models.resnest50d_4s2x40d(pretrained=True)
    # pretrained=False because the weights are loaded from a local
    # checkpoint immediately below.
    self.fe = timm.models.tf_efficientnet_b0_ns(pretrained=False)
    # Drop the classifier/pooling children to keep only the conv trunk,
    # then restore its weights and freeze it.
    self.fe = nn.Sequential(*list(self.fe.children())[:-2])
    self.fe.load_state_dict(torch.load('pretrained_fe_b0_fold_0'))
    for p in self.fe.parameters():
        p.requires_grad = False
def __init__(self, df: pd.DataFrame, datadir: Path, img_size=224, waveform_transforms=None, period=5, mode=None):
    """Dataset wrapper holding clip metadata plus a frozen log-mel front-end.

    ``CFG`` supplies all spectrogram parameters; both front-end modules are
    created with frozen parameters so they act as fixed transforms.
    """
    self.mode = mode
    self.period = period
    self.img_size = img_size
    self.waveform_transforms = waveform_transforms
    self.datadir = datadir
    self.df = df

    # Frozen STFT front-end configured from the global CFG.
    self.spectrogram_extractor = Spectrogram(
        n_fft=CFG.n_fft,
        hop_length=CFG.hop_length,
        win_length=CFG.n_fft,
        window="hann",
        center=True,
        pad_mode="reflect",
        freeze_parameters=True,
    )

    # Frozen mel filter bank mapping the spectrogram to log-mel features.
    self.logmel_extractor = LogmelFilterBank(
        sr=CFG.sample_rate,
        n_fft=CFG.n_fft,
        n_mels=CFG.n_mels,
        fmin=CFG.fmin,
        fmax=CFG.fmax,
        ref=1.0,
        amin=1e-10,
        top_db=None,
        freeze_parameters=True,
    )
def __init__(self, encoder, sample_rate, window_size, hop_size, mel_bins, fmin, fmax, classes_num):
    """SED model: configurable encoder + attention pooling over frames.

    Args:
        encoder: key into ``encoder_params`` selecting the backbone.
        sample_rate, window_size, hop_size, mel_bins, fmin, fmax: front-end
            STFT / mel filter-bank configuration.
        classes_num: number of output classes.
    """
    super().__init__()
    window = 'hann'
    center = True
    pad_mode = 'reflect'
    ref = 1.0
    amin = 1e-10
    top_db = None
    # Factor by which frame-wise predictions are upsampled back to input
    # resolution (presumably in forward() — TODO confirm).
    self.interpolate_ratio = 30  # Downsampled ratio
    # Spectrogram extractor (frozen STFT front-end).
    self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size, win_length=window_size, window=window, center=center, pad_mode=pad_mode, freeze_parameters=True)
    # Logmel feature extractor (frozen mel filter bank).
    self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size, n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db, freeze_parameters=True)
    # Spec augmenter for training-time regularisation.
    self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2, freq_drop_width=8, freq_stripes_num=2)
    # Model Encoder chosen from the registry.
    self.encoder = encoder_params[encoder]["init_op"]()
    # Project encoder features into a fixed 1024-d space for the head.
    self.fc1 = nn.Linear(encoder_params[encoder]["features"], 1024, bias=True)
    self.att_block = AttBlock(1024, classes_num, activation="sigmoid")
    # Input BatchNorm over the mel-frequency axis.
    self.bn0 = nn.BatchNorm2d(mel_bins)
    self.init_weight()
def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin, fmax, classes_num):
    """PANNs Cnn14 audio tagger.

    Pipeline: waveform -> STFT -> log-mel -> six conv blocks -> 2048-d
    embedding -> ``classes_num`` logits.
    """
    super(Cnn14, self).__init__()

    # Feature normalisation statistics (identity by default).
    self.dataset_mean = 0.0
    self.dataset_std = 1.0

    # Frozen STFT front-end (Hann window, centred, reflect padding).
    self.spectrogram_extractor = Spectrogram(
        n_fft=window_size,
        hop_length=hop_size,
        win_length=window_size,
        window="hann",
        center=True,
        pad_mode="reflect",
        freeze_parameters=True,
    )

    # Frozen log-mel filter bank.
    self.logmel_extractor = LogmelFilterBank(
        sr=sample_rate,
        n_fft=window_size,
        n_mels=mel_bins,
        fmin=fmin,
        fmax=fmax,
        ref=1.0,
        amin=1e-10,
        top_db=None,
        freeze_parameters=True,
    )

    # Training-time SpecAugment.
    self.spec_augmenter = SpecAugmentation(
        time_drop_width=32,
        time_stripes_num=2,
        freq_drop_width=8,
        freq_stripes_num=2,
    )

    self.bn0 = nn.BatchNorm2d(64)

    # Conv trunk: channel width doubles each stage, 1 -> 2048.
    self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
    self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
    self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
    self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
    self.conv_block5 = ConvBlock(in_channels=512, out_channels=1024)
    self.conv_block6 = ConvBlock(in_channels=1024, out_channels=2048)

    # Embedding projection and AudioSet-style classification head.
    self.fc1 = nn.Linear(2048, 2048, bias=True)
    self.fc_audioset = nn.Linear(2048, classes_num, bias=True)

    self.init_weight()
def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin, fmax, classes_num):
    """EfficientNet-B0 encoder + one residual attention block + att head."""
    super(Encoder_Transformer, self).__init__()
    window = 'hann'
    center = True
    pad_mode = 'reflect'
    ref = 1.0
    amin = 1e-10
    top_db = None

    # Spectrogram extractor (frozen STFT front-end).
    self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size, win_length=window_size, window=window, center=center, pad_mode=pad_mode, freeze_parameters=True)
    # Logmel feature extractor (frozen mel filter bank).
    self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size, n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db, freeze_parameters=True)
    # Spec augmenter (note the narrower freq_drop_width=4 here).
    self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2, freq_drop_width=4, freq_stripes_num=2)

    # Input BatchNorm over the mel-frequency axis.
    self.bn0 = nn.BatchNorm2d(mel_bins)

    fe = 1280  # encoder feature width (EfficientNet-B0 — TODO confirm)
    d_model = 512
    # Project encoder features to the transformer model dimension.
    self.fc1 = nn.Linear(fe, d_model, bias=True)

    n_head = 4
    self.multihead = ResidualAttentionBlock(d_model, n_head)
    self.att_block = AttBlock(d_model, classes_num)

    # self.fe = timm.models.resnest50d_4s2x40d(pretrained=True)
    self.fe = timm.models.tf_efficientnet_b0_ns(pretrained=True)
    # Drop classifier/pooling children; keep the convolutional trunk only.
    self.fe = nn.Sequential(*list(self.fe.children())[:-2])

    # Learned positional embedding for up to 32 time steps.
    self.pos_emb = nn.Embedding(32, d_model)
def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin, fmax, classes_num):
    """CNN (9 layers) + bidirectional GRU + frame-level attention head.

    Fixes relative to the original:
      * ``classes_num`` is now actually used for the attention head
        (previously hard-coded to 17, silently ignoring the argument).
      * ``bn0`` is sized from ``mel_bins`` instead of a hard-coded 64, so
        the model works for any mel resolution (identical behaviour for
        the previous ``mel_bins=64`` configuration).
    """
    super(Cnn_9layers_Gru_FrameAtt, self).__init__()

    window = 'hann'
    center = True
    pad_mode = 'reflect'
    ref = 1.0
    amin = 1e-10
    top_db = None

    # Spectrogram extractor (frozen STFT front-end).
    self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size,
        win_length=window_size, window=window, center=center, pad_mode=pad_mode,
        freeze_parameters=True)

    # Logmel feature extractor (frozen mel filter bank).
    self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size,
        n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db,
        freeze_parameters=True)

    # Spec augmenter for training-time regularisation.
    self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2,
        freq_drop_width=8, freq_stripes_num=2)

    # Input BatchNorm over the mel-frequency axis (was hard-coded to 64).
    self.bn0 = nn.BatchNorm2d(mel_bins)

    self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
    self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
    self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
    self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)

    # Bidirectional GRU: 256 hidden units per direction -> 512 features.
    self.gru = nn.GRU(input_size=512, hidden_size=256, num_layers=1,
        bias=True, batch_first=True, bidirectional=True)

    # Frame-level attention head (was hard-coded n_out=17).
    self.att_block = AttBlock(n_in=512, n_out=classes_num, activation='sigmoid')

    self.init_weights()
def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin, fmax, classes_num):
    """CNN (9 layers) + multi-head self-attention + frame-level att head.

    Fixes relative to the original:
      * ``classes_num`` is now actually used for the attention head
        (previously hard-coded to 17, silently ignoring the argument).
      * ``bn0`` is sized from ``mel_bins`` instead of a hard-coded 64, so
        the model works for any mel resolution (identical behaviour for
        the previous ``mel_bins=64`` configuration).
    """
    super(Cnn_9layers_Transformer_FrameAtt, self).__init__()

    window = 'hann'
    center = True
    pad_mode = 'reflect'
    ref = 1.0
    amin = 1e-10
    top_db = None

    # Spectrogram extractor (frozen STFT front-end).
    self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size,
        win_length=window_size, window=window, center=center, pad_mode=pad_mode,
        freeze_parameters=True)

    # Logmel feature extractor (frozen mel filter bank).
    self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size,
        n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db,
        freeze_parameters=True)

    # Spec augmenter for training-time regularisation.
    self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2,
        freq_drop_width=8, freq_stripes_num=2)

    # Input BatchNorm over the mel-frequency axis (was hard-coded to 64).
    self.bn0 = nn.BatchNorm2d(mel_bins)

    self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
    self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
    self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
    self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)

    # Multi-head self-attention over the conv features.
    n_head = 8
    n_hid = 512
    d_k = 64
    d_v = 64
    dropout = 0.2
    self.multihead = MultiHead(n_head, n_hid, d_k, d_v, dropout)

    # Frame-level attention head (was hard-coded n_out=17).
    self.att_block = AttBlock(n_in=512, n_out=classes_num, activation='sigmoid')

    self.init_weights()
def __init__(self, sample_rate: int, window_size: int, hop_size: int, mel_bins: int, fmin: int, fmax: int, classes_num: int, apply_aug: bool, top_db=None):
    """DenseNet-121 SED model with switchable SpecAugment.

    ``apply_aug`` gates the spec augmenter (presumably in forward() —
    TODO confirm); ``top_db`` is exposed as a parameter here, unlike the
    sibling models that fix it to None.
    """
    super().__init__()
    window = 'hann'
    center = True
    pad_mode = 'reflect'
    ref = 1.0
    amin = 1e-10

    # Factor by which frame-wise predictions are upsampled back to input
    # resolution.
    self.interpolate_ratio = 32  # Downsampled ratio
    self.apply_aug = apply_aug

    # Spectrogram extractor (frozen STFT front-end).
    self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size, win_length=window_size, window=window, center=center, pad_mode=pad_mode, freeze_parameters=True)
    # Logmel feature extractor (frozen mel filter bank).
    self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size, n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db, freeze_parameters=True)
    # Spec augmenter for training-time regularisation.
    self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2, freq_drop_width=8, freq_stripes_num=2)

    # Input BatchNorm over the mel-frequency axis.
    self.bn0 = nn.BatchNorm2d(mel_bins)

    # Projection and attention head over the DenseNet-121 feature width.
    self.fc1 = nn.Linear(1024, 1024, bias=True)
    self.att_block = AttBlockV2(1024, classes_num, activation='sigmoid')

    # ImageNet-pretrained DenseNet-121 convolutional trunk.
    self.densenet_features = models.densenet121(pretrained=True).features

    self.init_weight()
def __init__(self, sample_rate: int, window_size: int, hop_size: int, mel_bins: int, fmin: int, fmax: int, classes_num: int):
    """PANNs-style SED network.

    Waveform -> STFT -> log-mel -> six conv blocks -> attention head with
    ``classes_num`` outputs.
    """
    super().__init__()

    # Encoder downsampling factor used to upsample frame predictions.
    self.interpolate_ratio = 32

    # Frozen STFT front-end.
    self.spectrogram_extractor = Spectrogram(
        n_fft=window_size, hop_length=hop_size, win_length=window_size,
        window='hann', center=True, pad_mode='reflect',
        freeze_parameters=True)

    # Frozen log-mel filter bank.
    self.logmel_extractor = LogmelFilterBank(
        sr=sample_rate, n_fft=window_size, n_mels=mel_bins,
        fmin=fmin, fmax=fmax, ref=1.0, amin=1e-10, top_db=None,
        freeze_parameters=True)

    # Training-time SpecAugment.
    self.spec_augmenter = SpecAugmentation(
        time_drop_width=64, time_stripes_num=2,
        freq_drop_width=8, freq_stripes_num=2)

    self.bn0 = nn.BatchNorm2d(mel_bins)

    # Conv trunk: channel width doubles at each stage, 1 -> 2048.
    stage_widths = (64, 128, 256, 512, 1024, 2048)
    in_ch = 1
    for stage, out_ch in enumerate(stage_widths, start=1):
        setattr(self, f'conv_block{stage}', ConvBlock(in_channels=in_ch, out_channels=out_ch))
        in_ch = out_ch

    # Embedding projection and frame-level attention head.
    self.fc1 = nn.Linear(2048, 2048, bias=True)
    self.att_block = AttBlock(2048, classes_num, activation='sigmoid')

    self.init_weight()
def __init__(self, base_model_name: str, pretrained=False, num_classes=24, in_channels=1): super().__init__() # Spectrogram extractor self.spectrogram_extractor = Spectrogram(n_fft=CFG.n_fft, hop_length=CFG.hop_length, win_length=CFG.n_fft, window="hann", center=True, pad_mode="reflect", freeze_parameters=True) # Logmel feature extractor self.logmel_extractor = LogmelFilterBank(sr=CFG.sample_rate, n_fft=CFG.n_fft, n_mels=CFG.n_mels, fmin=CFG.fmin, fmax=CFG.fmax, ref=1.0, amin=1e-10, top_db=None, freeze_parameters=True) # Spec augmenter self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2, freq_drop_width=8, freq_stripes_num=2) self.bn0 = nn.BatchNorm2d(CFG.n_mels) base_model = timm.create_model(base_model_name, pretrained=pretrained, in_chans=in_channels) layers = list(base_model.children())[:-2] self.encoder = nn.Sequential(*layers) if hasattr(base_model, "fc"): in_features = base_model.fc.in_features else: in_features = base_model.classifier.in_features self.fc1 = nn.Linear(in_features, in_features, bias=True) self.att_block = AttBlockV2(in_features, num_classes, activation="sigmoid") self.init_weight()
def get_valid_all_clip_result(fold: int):
    """Run full-clip inference on the validation split of one CV fold.

    Returns:
        DataFrame with one row per validation recording, a ``kfold``
        column, and one prediction column per species (``s0``..``s23``).
    """
    # Load Data
    train_df = pd.read_csv(OUTPUT_DIR / "folds.csv")
    # Keep only true-positive annotations.
    train_df = train_df[train_df["istp"] == 1].reset_index(drop=True)
    # Per-species mel-bin bounds used by test_epoch to mask the spectrogram.
    species_fmin_fmax = pd.read_csv(OUTPUT_DIR / "species_fmin_fmax.csv")
    f_min_mels = torch.tensor(species_fmin_fmax["f_min_mel"].values, dtype=torch.int)
    f_max_mels = torch.tensor(species_fmin_fmax["f_max_mel"].values, dtype=torch.int)

    # Load model weights saved for this fold.
    model = AudioClassifier(CFG.model_param["encoder"], CFG.model_param["classes_num"])
    model.load_state_dict(torch.load(OUTPUT_DIR / f'fold-{fold}.bin'))
    model = model.to(device)

    # Get valid split for this fold.
    valid_fold = train_df[train_df.kfold == fold].reset_index(drop=True)
    test_dataset = TestDataset(
        df=valid_fold,
        period=CFG.period,
        transforms=None,
        data_path="../input/train",
    )
    # Full clips are long, so the batch size is shrunk accordingly.
    test_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=CFG.batch_size//32, shuffle=False, drop_last=False, num_workers=CFG.num_workers
    )

    # Frozen STFT / log-mel front-end, kept outside the model.
    window = 'hann'
    center = True
    pad_mode = 'reflect'
    ref = 1.0
    amin = 1e-10
    top_db = None
    spectrogram_extractor = Spectrogram(n_fft=WINDOW_SIZE, hop_length=HOP_SIZE, win_length=WINDOW_SIZE, window=window, center=center, pad_mode=pad_mode, freeze_parameters=True).to(device)
    logmel_extractor = LogmelFilterBank(sr=SR, n_fft=WINDOW_SIZE, n_mels=N_MELS, fmin=FMIN, fmax=FMAX, ref=ref, amin=amin, top_db=top_db, freeze_parameters=True).to(device)

    test_pred, ids = test_epoch(model, spectrogram_extractor, logmel_extractor, test_loader, f_min_mels, f_max_mels, device, resize=True)

    # Assemble one prediction column per species.
    test_pred_df = pd.DataFrame({
        "recording_id": valid_fold.recording_id.values
    })
    test_pred_df["kfold"] = fold
    for i in range(24):
        test_pred_df[f"s{i}"] = 0
    test_pred_df[[f's{i}' for i in range(24)]] = test_pred

    return test_pred_df
def __init__(self, n_classes, n_fft=1024, hop_length=256, n_mels=128, sr=22050, fc_output=1024):
    """EfficientNet-B3 tagger over log-mel spectrograms.

    Args:
        n_classes: number of output classes for the attention head.
        n_fft: FFT size (also the STFT window length).
        hop_length: STFT hop length in samples.
        n_mels: number of mel bins.
        sr: audio sample rate in Hz.
        fc_output: width of the projection feeding the attention head.
    """
    super().__init__()
    # STFT and mel front-end; library defaults for windowing/centering.
    stft_frontend = Spectrogram(n_fft=n_fft, hop_length=hop_length, win_length=n_fft)
    mel_frontend = LogmelFilterBank(sr=sr, n_fft=n_fft, n_mels=n_mels)
    self.spectrogram = stft_frontend
    self.logmel_extractor = mel_frontend
    # Single-channel EfficientNet-B3 backbone; 1536 is its feature width.
    self.cnn = EfficientNet.from_pretrained('efficientnet-b3', in_channels=1)
    self.fc = nn.Linear(1536, fc_output)
    self.att_block = AttentionBlock(fc_output, n_classes)
    # BatchNorm over the mel-frequency axis of the input features.
    self.bn0 = nn.BatchNorm2d(n_mels)
def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin, fmax, classes_num):
    """CNN (9 layers) with frame-averaged linear classification.

    Fix relative to the original: ``bn0`` is sized from ``mel_bins``
    instead of a hard-coded 64, so the model works for any mel resolution
    (identical behaviour for the previous ``mel_bins=64`` configuration).
    """
    super(Cnn_9layers_FrameAvg, self).__init__()

    window = 'hann'
    center = True
    pad_mode = 'reflect'
    ref = 1.0
    amin = 1e-10
    top_db = None

    # Spectrogram extractor (frozen STFT front-end).
    self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size,
        win_length=window_size, window=window, center=center, pad_mode=pad_mode,
        freeze_parameters=True)

    # Logmel feature extractor (frozen mel filter bank).
    self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size,
        n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db,
        freeze_parameters=True)

    # Spec augmenter for training-time regularisation.
    self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2,
        freq_drop_width=8, freq_stripes_num=2)

    # Input BatchNorm over the mel-frequency axis (was hard-coded to 64).
    self.bn0 = nn.BatchNorm2d(mel_bins)

    self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
    self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
    self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
    self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)

    # Per-frame linear classifier; frame scores are averaged downstream.
    self.fc = nn.Linear(512, classes_num, bias=True)

    self.init_weights()
def __init__(self):
    """ResNeSt-50 encoder + GRU + attention head for 398-class tagging."""
    super(Tmodel, self).__init__()
    SPEC_HEIGHT = 128
    SPEC_WIDTH = 256
    NUM_MELS = SPEC_HEIGHT
    # sample rate * duration / (spec width - 1) == 627
    HOP_LENGTH = int(32000 * 5 / (SPEC_WIDTH - 1))
    FMIN = 500
    FMAX = 12500
    classes_num = 398
    # Factor by which frame-wise predictions are upsampled back to input
    # resolution.
    self.interpolate_ratio = 8
    # Frozen STFT front-end (library defaults for window/centering).
    self.spectrogram_extractor = Spectrogram(n_fft=2048, hop_length=HOP_LENGTH, freeze_parameters=True)
    # Frozen mel filter bank.
    self.logmel_extractor = LogmelFilterBank(sr=32000, n_mels=NUM_MELS, fmin=FMIN, fmax=FMAX, freeze_parameters=True)
    # Spec augmenter for training-time regularisation.
    self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2, freq_drop_width=8, freq_stripes_num=2)
    # Input BatchNorm over the mel-frequency axis (NUM_MELS == 128).
    self.bn0 = nn.BatchNorm2d(128)
    # ResNeSt-50 backbone without its pooling/classifier children.
    base_model = torch.hub.load('zhanghang1989/ResNeSt', 'resnest50', pretrained=False)
    layers = list(base_model.children())[:-2]
    self.encoder = nn.Sequential(*layers)
    # Bidirectional GRU: 1024 hidden units per direction -> 2048 features.
    self.gru = nn.GRU(input_size=2048, hidden_size=1024, num_layers=1, bias=True, batch_first=True, bidirectional=True)
    self.att_block = AttBlockV2(2048, classes_num, activation='sigmoid')
    self.init_weights()
def __init__(self, frames_per_second, classes_num):
    """CRNN regressing onset/offset/frame/velocity targets for piano notes.

    Bug fix: ``momentum`` was previously passed positionally to
    ``nn.BatchNorm2d``, whose second positional parameter is ``eps`` — so
    ``bn0`` silently ran with eps=0.01 and the default momentum. It is now
    passed by keyword, matching the momentum given to the acoustic
    sub-models.
    """
    super(Regress_onset_offset_frame_velocity_CRNN, self).__init__()

    sample_rate = 16000
    window_size = 2048
    hop_size = sample_rate // frames_per_second
    mel_bins = 229
    fmin = 30
    fmax = sample_rate // 2

    window = 'hann'
    center = True
    pad_mode = 'reflect'
    ref = 1.0
    amin = 1e-10
    top_db = None

    midfeat = 1792
    momentum = 0.01

    # Spectrogram extractor (frozen STFT front-end).
    self.spectrogram_extractor = Spectrogram(n_fft=window_size,
        hop_length=hop_size, win_length=window_size, window=window,
        center=center, pad_mode=pad_mode, freeze_parameters=True)

    # Logmel feature extractor (frozen mel filter bank).
    self.logmel_extractor = LogmelFilterBank(sr=sample_rate,
        n_fft=window_size, n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref,
        amin=amin, top_db=top_db, freeze_parameters=True)

    # FIX: pass momentum by keyword — the second positional arg is `eps`.
    self.bn0 = nn.BatchNorm2d(mel_bins, momentum=momentum)

    # One acoustic CRNN per regression target.
    self.frame_model = AcousticModelCRnn8Dropout(classes_num, midfeat, momentum)
    self.reg_onset_model = AcousticModelCRnn8Dropout(classes_num, midfeat, momentum)
    self.reg_offset_model = AcousticModelCRnn8Dropout(classes_num, midfeat, momentum)
    self.velocity_model = AcousticModelCRnn8Dropout(classes_num, midfeat, momentum)

    # Fuses two 88-key streams for the final onset prediction.
    self.reg_onset_gru = nn.GRU(input_size=88 * 2, hidden_size=256, num_layers=1,
        bias=True, batch_first=True, dropout=0., bidirectional=True)
    self.reg_onset_fc = nn.Linear(512, classes_num, bias=True)

    # Fuses three 88-key streams for the final frame prediction.
    self.frame_gru = nn.GRU(input_size=88 * 3, hidden_size=256, num_layers=1,
        bias=True, batch_first=True, dropout=0., bidirectional=True)
    self.frame_fc = nn.Linear(512, classes_num, bias=True)

    self.init_weight()
def __init__(self, encoder, sample_rate, window_size, hop_size, mel_bins, fmin, fmax, classes_num):
    """Clip-level classifier: frozen log-mel front-end, registry-selected
    encoder, global average pooling and a dropout + linear head."""
    super().__init__()

    # Frozen STFT front-end (Hann window, centred, reflect padding).
    self.spectrogram_extractor = Spectrogram(
        n_fft=window_size,
        hop_length=hop_size,
        win_length=window_size,
        window='hann',
        center=True,
        pad_mode='reflect',
        freeze_parameters=True,
    )

    # Frozen mel filter bank producing log-mel features.
    self.logmel_extractor = LogmelFilterBank(
        sr=sample_rate,
        n_fft=window_size,
        n_mels=mel_bins,
        fmin=fmin,
        fmax=fmax,
        ref=1.0,
        amin=1e-10,
        top_db=None,
        freeze_parameters=True,
    )

    # SpecAugment applied at training time.
    self.spec_augmenter = SpecAugmentation(
        time_drop_width=64,
        time_stripes_num=2,
        freq_drop_width=8,
        freq_stripes_num=2,
    )

    # Backbone from the registry, followed by GAP + dropout + linear head.
    self.encoder = encoder_params[encoder]["init_op"]()
    self.avg_pool = AdaptiveAvgPool2d((1, 1))
    self.dropout = Dropout(0.3)
    self.fc = Linear(encoder_params[encoder]['features'], classes_num)
def __init__(self, encoder, sample_rate, window_size, hop_size, mel_bins, fmin, fmax, classes_num):
    """SED model: frozen log-mel front-end, registry-selected encoder and
    an attention head over the encoder's 1000-d output."""
    super().__init__()

    # Encoder temporal downsampling factor, used to upsample frame-wise
    # predictions back to input resolution.
    self.interpolate_ratio = 29

    # Frozen STFT front-end.
    self.spectrogram_extractor = Spectrogram(
        n_fft=window_size, hop_length=hop_size, win_length=window_size,
        window='hann', center=True, pad_mode='reflect',
        freeze_parameters=True,
    )

    # Frozen mel filter bank producing log-mel features.
    self.logmel_extractor = LogmelFilterBank(
        sr=sample_rate, n_fft=window_size, n_mels=mel_bins,
        fmin=fmin, fmax=fmax, ref=1.0, amin=1e-10, top_db=None,
        freeze_parameters=True,
    )

    # SpecAugment applied at training time.
    self.spec_augmenter = SpecAugmentation(
        time_drop_width=64, time_stripes_num=2,
        freq_drop_width=8, freq_stripes_num=2,
    )

    # Input batch-norm over the mel-frequency axis.
    self.batch_norm = nn.BatchNorm2d(mel_bins)

    # Backbone from the registry; its 1000-d logits feed the attention head.
    self.encoder = encoder_params[encoder]["init_op"]()
    self.dropout = Dropout(0.3)
    self.att_head = AttentionHead(1000, classes_num, activation='sigmoid')

    self.init_weight()
def __init__(self, frames_per_second, classes_num):
    """CRNN regressing pedal onset/offset/frame activations.

    Bug fix: ``momentum`` was previously passed positionally to
    ``nn.BatchNorm2d``, whose second positional parameter is ``eps`` — so
    ``bn0`` silently ran with eps=0.01 and the default momentum. It is now
    passed by keyword, matching the momentum given to the acoustic
    sub-models.
    """
    super(Regress_pedal_CRNN, self).__init__()

    sample_rate = 16000
    window_size = 2048
    hop_size = sample_rate // frames_per_second
    mel_bins = 229
    fmin = 30
    fmax = sample_rate // 2

    window = 'hann'
    center = True
    pad_mode = 'reflect'
    ref = 1.0
    amin = 1e-10
    top_db = None

    midfeat = 1792
    momentum = 0.01

    # Spectrogram extractor (frozen STFT front-end).
    self.spectrogram_extractor = Spectrogram(n_fft=window_size,
        hop_length=hop_size, win_length=window_size, window=window,
        center=center, pad_mode=pad_mode, freeze_parameters=True)

    # Logmel feature extractor (frozen mel filter bank).
    self.logmel_extractor = LogmelFilterBank(sr=sample_rate,
        n_fft=window_size, n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref,
        amin=amin, top_db=top_db, freeze_parameters=True)

    # FIX: pass momentum by keyword — the second positional arg is `eps`.
    self.bn0 = nn.BatchNorm2d(mel_bins, momentum=momentum)

    # One single-output acoustic CRNN per pedal regression target.
    self.reg_pedal_onset_model = AcousticModelCRnn8Dropout(1, midfeat, momentum)
    self.reg_pedal_offset_model = AcousticModelCRnn8Dropout(1, midfeat, momentum)
    self.reg_pedal_frame_model = AcousticModelCRnn8Dropout(1, midfeat, momentum)

    self.init_weight()
def train_loop(fold):
    """Train one cross-validation fold end-to-end.

    Builds the loaders and the frozen STFT/log-mel front-end, trains for
    ``CFG.epochs`` epochs with warmup scheduling and focal loss, and saves
    the checkpoint with the best validation F1 to ``fold-{fold}.bin``.
    """
    LOGGER.info(f"========== fold: {fold} training ==========")

    train_df = pd.read_csv(OUTPUT_DIR / 'folds.csv')
    if CFG.debug:
        train_df = train_df.sample(n=1000, random_state=42)
    train_fold = train_df[train_df.kfold != fold]
    valid_fold = train_df[train_df.kfold == fold]

    columns = [
        "recording_id", "species_id", "t_min", "f_min", "t_max", "f_max",
        "istp", "f_min_mel", "f_max_mel", "kfold"
    ]
    train_fold = train_fold[columns]
    print(f"train fold before concat: {train_fold.shape}")

    train_dataset = AudioDataset(
        df=train_fold,
        period=CFG.period,
        time=CFG.duration,
        transforms=augmenter,
        data_path="../input/train",
    )
    valid_dataset = ValidDataset(df=valid_fold, period=CFG.period, transforms=None, data_path="../input/train")

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=CFG.batch_size,
        shuffle=True,
        drop_last=True,
        num_workers=CFG.num_workers,
    )
    # Validation clips are longer, hence the smaller batch size.
    valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=CFG.batch_size // 4, shuffle=False, drop_last=False, num_workers=CFG.num_workers)

    # Frozen STFT / log-mel front-end kept outside the model.
    window = 'hann'
    center = True
    pad_mode = 'reflect'
    ref = 1.0
    amin = 1e-10
    top_db = None
    spectrogram_extractor = Spectrogram(n_fft=WINDOW_SIZE, hop_length=HOP_SIZE, win_length=WINDOW_SIZE, window=window, center=center, pad_mode=pad_mode, freeze_parameters=True).to(device)
    logmel_extractor = LogmelFilterBank(sr=SR, n_fft=WINDOW_SIZE, n_mels=N_MELS, fmin=FMIN, fmax=FMAX, ref=ref, amin=amin, top_db=top_db, freeze_parameters=True).to(device)

    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(optimizer):
        # Maps CFG.scheduler to a torch LR scheduler; currently unused
        # below (a linear-warmup schedule is created instead).
        if CFG.scheduler == 'ReduceLROnPlateau':
            scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=CFG.factor, patience=CFG.patience, verbose=True, eps=CFG.min_lr)
        elif CFG.scheduler == 'CosineAnnealingLR':
            scheduler = CosineAnnealingLR(optimizer, T_max=CFG.T_max, eta_min=CFG.min_lr, last_epoch=-1)
        elif CFG.scheduler == 'CosineAnnealingWarmRestarts':
            scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=CFG.T_0, T_mult=1, eta_min=CFG.min_lr, last_epoch=-1)
        return scheduler

    # ====================================================
    # model & optimizer
    # ====================================================
    model = AudioClassifier(CFG.model_param["encoder"], CFG.model_param["classes_num"])
    model = model.to(device)
    # optimizer = Adam(model.parameters(), lr=CFG.lr, weight_decay=CFG.weight_decay, amsgrad=False)
    # scheduler = get_scheduler(optimizer)
    optimizer = torch.optim.AdamW(model.parameters(), lr=CFG.lr)
    # Linear warmup over the first 10% of total optimisation steps.
    num_train_steps = int(len(train_loader) * CFG.epochs)
    num_warmup_steps = int(0.1 * CFG.epochs * len(train_loader))
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_train_steps)

    # criterion = nn.BCEWithLogitsLoss()
    criterion = BCEFocalLoss()

    best_score = -np.inf

    for epoch in range(CFG.epochs):
        # Mixup is only applied during the first CFG.mixup_epochs epochs.
        if epoch < CFG.mixup_epochs:
            p_mixup = CFG.p_mixup
        else:
            p_mixup = 0.

        start_time = time.time()

        # train
        train_avg, train_loss = train_epoch(model, spectrogram_extractor, logmel_extractor, train_loader, criterion, optimizer, scheduler, epoch, device, p_mixup, spec_aug=True)

        # valid
        valid_avg, valid_loss = valid_epoch(model, spectrogram_extractor, logmel_extractor, valid_loader, criterion, epoch, device)

        # Epoch-level stepping only applies to the torch schedulers from
        # get_scheduler; the warmup schedule is stepped inside train_epoch
        # (presumably — TODO confirm).
        if isinstance(scheduler, ReduceLROnPlateau):
            scheduler.step(valid_loss)
        elif isinstance(scheduler, CosineAnnealingLR):
            scheduler.step()
        elif isinstance(scheduler, CosineAnnealingWarmRestarts):
            scheduler.step()

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {train_loss:.5f} avg_val_loss: {valid_loss:.5f} time: {elapsed:.0f}s')
        LOGGER.info(f"Epoch {epoch+1} - train_LWLRAP:{train_avg['lwlrap']:0.5f} valid_LWLRAP:{valid_avg['lwlrap']:0.5f}")
        LOGGER.info(f"Epoch {epoch+1} - train_F1:{train_avg['f1score']:0.5f} valid_F1:{valid_avg['f1score']:0.5f}")

        # Checkpoint on best validation F1.
        if valid_avg['f1score'] > best_score:
            LOGGER.info(f">>>>>>>> Model Improved From {best_score} ----> {valid_avg['f1score']}")
            torch.save(model.state_dict(), OUTPUT_DIR / f'fold-{fold}.bin')
            best_score = valid_avg['f1score']
def __init__(
    self,
    encoder,
    in_features,
    num_classes,
    n_fft,
    hop_length,
    sample_rate,
    n_mels,
    fmin,
    fmax,
    dropout_rate=0.5,
    freeze_spectrogram_parameters=True,
    freeze_logmel_parameters=True,
    use_spec_augmentation=True,
    time_drop_width=64,
    time_stripes_num=2,
    freq_drop_width=8,
    freq_stripes_num=2,
    spec_augmentation_method=None,
    apply_mixup=False,
    apply_spec_shuffle=False,
    spec_shuffle_prob=0,
    use_gru_layer=False,
    apply_tta=False,
    use_loudness=False,
    use_spectral_centroid=False,
    apply_delta_spectrum=False,
    apply_time_freq_encoding=False,
    min_db=120,
    apply_pcen=False,
    freeze_pcen_parameters=False,
    use_multisample_dropout=False,
    multisample_dropout=0.5,
    num_multisample_dropout=5,
    pooling_kernel_size=3,
    **params,
):
    """Highly configurable SED model around a caller-supplied encoder.

    Bug fix: ``PCENTransform(trainable=~freeze_pcen_parameters)`` used
    bitwise NOT on a bool (``~True == -2``, ``~False == -1`` — both
    truthy), so PCEN was effectively always trainable regardless of the
    flag. It now uses logical ``not``.
    """
    super().__init__()
    self.n_mels = n_mels
    self.dropout_rate = dropout_rate
    self.apply_mixup = apply_mixup
    self.apply_spec_shuffle = apply_spec_shuffle
    self.spec_shuffle_prob = spec_shuffle_prob
    self.use_gru_layer = use_gru_layer
    self.apply_tta = apply_tta
    self.use_loudness = use_loudness
    self.use_spectral_centroid = use_spectral_centroid
    self.apply_delta_spectrum = apply_delta_spectrum
    self.apply_time_freq_encoding = apply_time_freq_encoding
    self.apply_pcen = apply_pcen
    self.use_multisample_dropout = use_multisample_dropout
    self.num_multisample_dropout = num_multisample_dropout
    self.pooling_kernel_size = pooling_kernel_size

    # Spectrogram extractor (STFT front-end).
    self.spectrogram_extractor = Spectrogram(
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=n_fft,
        window="hann",
        center=True,
        pad_mode="reflect",
        freeze_parameters=freeze_spectrogram_parameters,
    )

    # Mel filter bank; is_log=False keeps raw mel power so the dB scaling
    # is handled by power_to_db below.
    self.logmel_extractor = LogmelFilterBank(
        sr=sample_rate,
        n_fft=n_fft,
        n_mels=n_mels,
        fmin=fmin,
        fmax=fmax,
        ref=1.0,
        amin=1e-10,
        top_db=None,
        freeze_parameters=freeze_logmel_parameters,
        is_log=False,
    )
    self.power_to_db = torchaudio.transforms.AmplitudeToDB()

    # Spec augmenter: plain SpecAugmentation unless a specific method is
    # requested, in which case the ++ variant is used.
    self.spec_augmenter = None
    if use_spec_augmentation and (spec_augmentation_method is None):
        self.spec_augmenter = SpecAugmentation(
            time_drop_width=time_drop_width,
            time_stripes_num=time_stripes_num,
            freq_drop_width=freq_drop_width,
            freq_stripes_num=freq_stripes_num,
        )
    elif use_spec_augmentation and (spec_augmentation_method is not None):
        self.spec_augmenter = SpecAugmentationPlusPlus(
            time_drop_width=time_drop_width,
            time_stripes_num=time_stripes_num,
            freq_drop_width=freq_drop_width,
            freq_stripes_num=freq_stripes_num,
            method=spec_augmentation_method,
        )

    # Optional auxiliary loudness feature.
    if self.use_loudness:
        self.loudness_bn = nn.BatchNorm1d(1)
        self.loudness_extractor = Loudness(
            sr=sample_rate,
            n_fft=n_fft,
            min_db=min_db,
        )

    # Optional auxiliary spectral-centroid feature.
    if self.use_spectral_centroid:
        self.spectral_centroid_bn = nn.BatchNorm1d(1)

    # Optional trainable PCEN front-end.
    if self.apply_pcen:
        # FIX: logical `not`, not bitwise `~` (which is truthy for both
        # True and False on ints).
        self.pcen_transform = PCENTransform(
            trainable=not freeze_pcen_parameters,
        )

    self.encoder = encoder

    # Multi-sample dropout shared across the head's repeated passes.
    if self.use_multisample_dropout:
        self.big_dropout = nn.Dropout(p=multisample_dropout)
def __init__(
    self,
    encoder,
    in_features,
    num_classes,
    n_fft,
    hop_length,
    sample_rate,
    n_mels,
    fmin,
    fmax,
    dropout_rate=0.1,
    freeze_spectrogram_parameters=True,
    freeze_logmel_parameters=True,
    use_spec_augmentation=True,
    time_drop_width=64,
    time_stripes_num=2,
    freq_drop_width=8,
    freq_stripes_num=2,
    spec_augmentation_method=None,
    apply_mixup=False,
    apply_spec_shuffle=False,
    spec_shuffle_prob=0,
    use_gru_layer=False,
    apply_tta=False,
    apply_encoder=False,
    **params,
):
    """Conformer-based classifier over (non-log) mel spectrogram frames.

    Note: ``encoder``, ``in_features`` and ``apply_encoder`` are accepted
    for interface compatibility but are not used in this constructor —
    the model relies on the conformer stack plus a linear head.
    """
    super().__init__()
    self.n_mels = n_mels
    self.dropout_rate = dropout_rate
    self.apply_mixup = apply_mixup
    self.apply_spec_shuffle = apply_spec_shuffle
    self.spec_shuffle_prob = spec_shuffle_prob
    self.use_gru_layer = use_gru_layer
    self.apply_tta = apply_tta

    # Spectrogram extractor (STFT front-end).
    self.spectrogram_extractor = Spectrogram(
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=n_fft,
        window="hann",
        center=True,
        pad_mode="reflect",
        freeze_parameters=freeze_spectrogram_parameters,
    )

    # Mel filter bank; is_log=False keeps raw mel power (no dB scaling
    # here — presumably handled downstream; TODO confirm in forward()).
    self.logmel_extractor = LogmelFilterBank(
        sr=sample_rate,
        n_fft=n_fft,
        n_mels=n_mels,
        fmin=fmin,
        fmax=fmax,
        ref=1.0,
        amin=1e-10,
        top_db=None,
        freeze_parameters=freeze_logmel_parameters,
        is_log=False,
    )

    # Spec augmenter: plain SpecAugmentation unless a specific method is
    # requested, in which case the ++ variant is used.
    self.spec_augmenter = None
    if use_spec_augmentation and (spec_augmentation_method is None):
        self.spec_augmenter = SpecAugmentation(
            time_drop_width=time_drop_width,
            time_stripes_num=time_stripes_num,
            freq_drop_width=freq_drop_width,
            freq_stripes_num=freq_stripes_num,
        )
    elif use_spec_augmentation and (spec_augmentation_method is not None):
        self.spec_augmenter = SpecAugmentationPlusPlus(
            time_drop_width=time_drop_width,
            time_stripes_num=time_stripes_num,
            freq_drop_width=freq_drop_width,
            freq_stripes_num=freq_stripes_num,
            method=spec_augmentation_method,
        )

    # encoder: three conformer blocks operating on n_mels-d frame vectors.
    self.conformer = nn.Sequential(*[
        ConformerBlock(
            dim=n_mels,
            dim_head=64,
            heads=8,
            ff_mult=4,
            conv_expansion_factor=2,
            conv_kernel_size=31,
            attn_dropout=dropout_rate,
            ff_dropout=dropout_rate,
            conv_dropout=dropout_rate,
        ) for _ in range(3)
    ])

    # Dropout + linear classification head.
    self.fc = nn.Sequential(
        nn.Dropout(dropout_rate),
        nn.Linear(n_mels, num_classes),
    )