def fasfasa():
    """Smoke-test the DSD100 dataset with a wave-level augmentation pipeline.

    Builds an ``audiomentations.Compose`` chain (project tensor<->numpy
    adapters around stdlib-style time-stretch / pitch-shift augments),
    instantiates the DSD100 dataset from the user's home directory, and
    prints the first 10 samples as a sanity check.

    NOTE(review): relies on ``DSD100`` and ``Path`` being in scope at
    module level — confirm they are imported elsewhere in this file.
    """
    import itertools

    import audiomentations
    from src import core

    crop_size = 98303  # 768 * 128 - 1: one sample short of a power-of-two-ish frame grid

    transforms = audiomentations.Compose([
        core.transforms.ToMono(),
        core.transforms.Squeeze(),
        core.transforms.ToNumpy(),  # audiomentations operates on numpy arrays
        audiomentations.TimeStretch(min_rate=0.7, max_rate=1.3, p=0.5),
        audiomentations.PitchShift(min_semitones=-4, max_semitones=4, p=0.5),
        core.transforms.ToTorch(),  # back to torch tensors for the dataset consumer
    ])

    train_dataset = DSD100(
        root=Path.home() / 'Data' / 'Audio' / 'DSD100',
        crop_size=crop_size,
        transforms=transforms,
    )

    # Print only the first 10 examples; islice avoids the unused index
    # variable of the original `zip(range(10), ...)` idiom.
    for example in itertools.islice(train_dataset, 10):
        print(example)
# NOTE(review): this chunk's whitespace has been destroyed (everything is on
# one physical line) and the final definition, `do_normalize`, is cut off
# mid-body at the chunk boundary (`_img = img.clone()` with no return).
# Leaving the code byte-identical rather than guessing at the missing tail.
#
# What the visible code does, in order:
#   1. Assigns a fold id to each validation index of `kfold.split(X, y)` in
#      `train_gby["kfold"]`, then merges the fold column back onto `train_df`
#      by `recording_id` and writes `folds.csv` / `species_fmin_fmax.csv`
#      into OUTPUT_DIR.
#   2. Defines `augmenter`: an audiomentations Compose of Gaussian noise,
#      pitch shift and gain, each applied with p=0.2.
#   3. `cut_spect(spect, fmin_mel, fmax_mel)` slices a spectrogram along its
#      first (frequency) axis — presumably mel bins; TODO confirm axis order.
#   4. `do_normalize(img)` unpacks a 4-D batch (bs, ch, w, h) and clones it;
#      the rest of the function lies beyond this chunk.
for fold, (t_idx, v_idx) in enumerate(kfold.split(X, y)): train_gby.loc[v_idx, "kfold"] = fold train_df = train_df.merge(train_gby[["recording_id", "kfold"]], on="recording_id", how="left") print(train_df.kfold.value_counts()) train_df.to_csv(OUTPUT_DIR / "folds.csv", index=False) species_fmin_fmax.to_csv(OUTPUT_DIR / "species_fmin_fmax.csv", index=False) ################################################ # audiomentations # ################################################ augmenter = A.Compose([ A.AddGaussianNoise(min_amplitude=0.01, max_amplitude=0.03, p=0.2), A.PitchShift(min_semitones=-3, max_semitones=3, p=0.2), A.Gain(p=0.2) ]) ################################################ # Dataset # ################################################ def cut_spect(spect: torch.Tensor, fmin_mel: int, fmax_mel: int): return spect[fmin_mel:fmax_mel] def do_normalize(img: torch.Tensor): bs, ch, w, h = img.shape _img = img.clone()
# NOTE(review): whitespace-mangled chunk (single physical line); the final
# dict `_base_mel_post_process` is truncated at the chunk boundary — its
# closing brace is not visible. Code left byte-identical; do not reformat
# without the missing tail.
#
# Visible structure:
#   - imports: audiomentations, cv2, and project mixers / crop / transform
#     helpers from `model.*`.
#   - `wave_augmentations`: maps an integer config id to a waveform
#     augmentation pipeline (or None). Variant 1 uses p=0.96 on the Compose
#     so that, per the inline comment, ~90% of samples end up augmented;
#     variant 2 always applies (p=1) with heavy Gaussian noise (p=0.95).
#   - `size_4_sec_750_hop = 256`: presumably the spectrogram width for a
#     4-second clip at hop 750 — TODO confirm against the STFT settings.
#   - `_base_mel_post_process`: named post-processing pipelines for mel
#     images ('none', 3-channel delta stack, single-channel); truncated here.
import audiomentations import cv2 from model.mixers import UseMixerWithProb, RandomMixer, SigmoidConcatMixer, AddMixer, SigmoidVerticalConcatMixer from model.random_resized_crop import RandomResizedCrop, RandomResizedCrop2 from model.transforms import Compose, UseWithProb, SpecAugment, SpectreScale, PreprocessMelImage, GaussNoise, OneOf, \ PadToSize, RandomCrop, PreprocessSingleChannelMelImage wave_augmentations = { 0: None, 1: audiomentations.Compose([ audiomentations.AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5), audiomentations.TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5), audiomentations.PitchShift(min_semitones=-4, max_semitones=4, p=0.5), audiomentations.Shift(min_fraction=-0.5, max_fraction=0.5, p=0.5), ], p=0.96), # (1-(1-0.5)^4)*0.96==0.9 - In total there will be 90% augmented samples 2: audiomentations.Compose([ audiomentations.AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.010, p=0.95), audiomentations.Shift(min_fraction=-0.1, max_fraction=0.1, p=0.3), ], p=1), } size_4_sec_750_hop = 256 _base_mel_post_process = { 'none': [], '3ch_1': [ # Use librosa.feature.delta with order 1 and 2 for creating 2 additional channels then divide by 100 PreprocessMelImage(), ], '1ch_1': [PreprocessSingleChannelMelImage(), ],
def __init__(self, sample_rate, min_semitones=-4, max_semitones=4, p=0.5, **kwargs):
    """Wrap ``audiomentations.PitchShift`` as a fastai-style transform.

    Args:
        sample_rate: Audio sample rate in Hz, bound into the transform call.
        min_semitones: Lower bound of the random pitch shift (default -4).
        max_semitones: Upper bound of the random pitch shift (default 4).
        p: Probability of applying the shift (default 0.5).
        **kwargs: Forwarded to the parent transform's ``__init__``.
    """
    # fastcore's store_attr accepts one comma-separated name string; the
    # original chained three calls in a throwaway tuple expression.
    store_attr('min_semitones,max_semitones,p')
    super().__init__(**kwargs)
    # Pre-bind sample_rate so self.tfm(samples) matches the expected
    # audiomentations call signature `transform(samples, sample_rate)`.
    self.tfm = partial(
        aug.PitchShift(min_semitones=min_semitones, max_semitones=max_semitones, p=p),
        sample_rate=sample_rate,
    )