import hydra
from omegaconf import DictConfig

# NOTE: `aud` is assumed to be a torch-aware audiomentations-style module
# (e.g. torch_audiomentations), since its transforms support `.to(device)`.


def get_waveform_transforms(transforms: DictConfig, device=None):
    """
    Get all necessary waveform transforms from the config.

    :param transforms: transforms section of the config
    :param device: device to move the transforms to, if any
    :return: transforms composed into aud.Compose
    """
    if transforms is None:
        return None
    instantiated = [hydra.utils.instantiate(t) for t in transforms]
    if device is not None:
        instantiated = [t.to(device) for t in instantiated]
    return aud.Compose(instantiated)
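# Usage sketch for get_waveform_transforms. Assumptions (not from the source):
# `aud` is torch_audiomentations, the config is a list of Hydra `_target_`
# entries, and the Gain transform with these parameters is illustrative.
def _demo_waveform_transforms():
    from omegaconf import OmegaConf

    cfg = OmegaConf.create([
        {"_target_": "torch_audiomentations.Gain",
         "min_gain_in_db": -6.0,
         "max_gain_in_db": 6.0,
         "p": 0.5},
    ])
    # Pass e.g. device="cuda:0" to move each transform to the GPU.
    return get_waveform_transforms(cfg, device=None)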
def get_transforms(transforms: DictConfig):
    """
    Get all necessary transforms from the config.

    :param transforms: transforms section of the config
    :return: transforms composed into aud.Compose
    """
    if transforms is None:
        return None
    return aud.Compose(
        [hydra.utils.instantiate(transform) for transform in transforms])
# `audi` and `kvt` are assumed to come from this module's imports, e.g.
# `import audiomentations as audi` and `import kvt.augmentation`.
def get_transform(cfg):
    def get_object(trans):
        params = trans.params if trans.params is not None else {}
        # Container transforms recurse over their members.
        if trans.name in {"Compose", "OneOf"}:
            augs_tmp = [get_object(aug) for aug in trans.member]
            return getattr(kvt.augmentation, trans.name)(augs_tmp, **params)
        # Resolution order: audiomentations first, then kvt.augmentation,
        # then eval() as a last resort. Note eval() only resolves names
        # already visible in this module and should only see trusted configs.
        if hasattr(audi, trans.name):
            return getattr(audi, trans.name)(**params)
        elif hasattr(kvt.augmentation, trans.name):
            return getattr(kvt.augmentation, trans.name)(**params)
        else:
            return eval(trans.name)(**params)

    augs = [get_object(t) for t in cfg]
    return audi.Compose(augs)
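# Sketch of a config that get_transform can consume. Assumptions (not from
# the source): cfg is an OmegaConf list whose entries expose `name`, `params`
# and, for Compose/OneOf containers, `member`; the transform choices below
# are illustrative.
def _demo_get_transform():
    from omegaconf import OmegaConf

    cfg = OmegaConf.create([
        {"name": "AddGaussianNoise",
         "params": {"min_amplitude": 0.001, "max_amplitude": 0.015, "p": 0.5},
         "member": None},
        {"name": "OneOf",
         "params": {"p": 0.5},
         "member": [
             {"name": "TimeStretch",
              "params": {"min_rate": 0.8, "max_rate": 1.25, "p": 1.0},
              "member": None},
             {"name": "PitchShift",
              "params": {"min_semitones": -4, "max_semitones": 4, "p": 1.0},
              "member": None},
         ]},
    ])
    return get_transform(cfg)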
def smoke_test_dsd100():
    """Smoke test: run a few DSD100 samples through the waveform pipeline."""
    from pathlib import Path

    import audiomentations
    from src import core

    # `DSD100` is assumed to be imported from the project's dataset module.
    crop_size = 98303  # 768 * 128 - 1
    transforms = audiomentations.Compose([
        core.transforms.ToMono(),
        core.transforms.Squeeze(),
        core.transforms.ToNumpy(),
        audiomentations.TimeStretch(min_rate=0.7, max_rate=1.3, p=0.5),
        audiomentations.PitchShift(min_semitones=-4, max_semitones=4, p=0.5),
        core.transforms.ToTorch(),
    ])
    train_dataset = DSD100(root=Path.home() / 'Data' / 'Audio' / 'DSD100',
                           crop_size=crop_size,
                           transforms=transforms)
    # Print the first ten samples to verify the pipeline runs end to end.
    for i, e in zip(range(10), train_dataset):
        print(e)
"target" : label, "id" : record['recording_id'] } ############################### #Augmentations ############################### import audiomentations as AA train_audio_transform = AA.Compose([ AA.AddGaussianNoise(p=0.5), AA.AddGaussianSNR(p=0.5), #AA.AddBackgroundNoise("../input/train_audio/", p=1) #AA.AddImpulseResponse(p=0.1), #AA.AddShortNoises("../input/train_audio/", p=1) #AA.FrequencyMask(min_frequency_band=0.0, max_frequency_band=0.2, p=0.1), #AA.TimeMask(min_band_part=0.0, max_band_part=0.2, p=0.1), #AA.PitchShift(min_semitones=-0.5, max_semitones=0.5, p=0.1), #AA.Shift(p=0.1), #AA.Normalize(p=0.1), #AA.ClippingDistortion(min_percentile_threshold=0, max_percentile_threshold=1, p=0.05), #AA.PolarityInversion(p=0.05), #AA.Gain(p=0.2) ]) ############################### #Utils ############################### def _lwlrap_sklearn(truth, scores): """Reference implementation from https://colab.research.google.com/drive/1AgPdhSp7ttY18O3fEoHOQKlt_3HJDLi8""" sample_weight = np.sum(truth > 0, axis=1)
for fold, (t_idx, v_idx) in enumerate(kfold.split(X, y)):
    train_gby.loc[v_idx, "kfold"] = fold

train_df = train_df.merge(train_gby[["recording_id", "kfold"]],
                          on="recording_id", how="left")
print(train_df.kfold.value_counts())
train_df.to_csv(OUTPUT_DIR / "folds.csv", index=False)
species_fmin_fmax.to_csv(OUTPUT_DIR / "species_fmin_fmax.csv", index=False)

################################################
#                audiomentations               #
################################################
import audiomentations as A
import torch

augmenter = A.Compose([
    A.AddGaussianNoise(min_amplitude=0.01, max_amplitude=0.03, p=0.2),
    A.PitchShift(min_semitones=-3, max_semitones=3, p=0.2),
    A.Gain(p=0.2),
])

################################################
#                    Dataset                   #
################################################
def cut_spect(spect: torch.Tensor, fmin_mel: int, fmax_mel: int):
    """Crop a mel spectrogram to the [fmin_mel, fmax_mel) bin range."""
    return spect[fmin_mel:fmax_mel]


def do_normalize(img: torch.Tensor):
    bs, ch, w, h = img.shape
    _img = img.clone()
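# Sketch: cropping a per-species frequency band from a mel spectrogram with
# cut_spect. The tensor shape and bin indices below are illustrative; real
# fmin/fmax mel bins would come from species_fmin_fmax.csv written above.
def _demo_cut_spect():
    spect = torch.randn(128, 938)  # (mel_bins, time_frames)
    band = cut_spect(spect, fmin_mel=20, fmax_mel=90)
    assert band.shape[0] == 70     # only the selected bins remain
    return band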
def __init__(self,
             dir,
             extract_chunks=True,
             sample_rate=16000,
             num_fbanks=40,
             label_delay=100,
             no_augment=False,
             **kwargs):
    self.extract_chunks = extract_chunks
    self.min_length = kwargs["min_chunk_length"]
    self.max_length = kwargs["max_chunk_length"]
    self.sample_rate = sample_rate
    self.num_fbanks = num_fbanks
    self.label_delay = label_delay

    # Kaldi-style scp files: each line maps a recording id to a path.
    reco2wav = {}
    reco2trs = {}
    with open(f"{dir}/wav.scp") as f:
        for l in f:
            ss = l.split()
            reco2wav[ss[0]] = ss[1]
    with open(f"{dir}/reco2trs.scp") as f:
        for l in f:
            ss = l.split()
            reco2trs[ss[0]] = ss[1]

    self.sections = []
    # Sections differ in length; each section is indexed
    # section_length // avg_chunk_len + 1 times, so longer sections
    # are sampled proportionally more often.
    self.index2section = []
    avg_chunk_len = self.max_length - self.min_length
    for reco in tqdm.tqdm(
            reco2trs.keys(),
            desc=f"Loading transcriptions and audios for {dir}"):
        try:
            transcription = trs.Transcription(reco2wav[reco], reco2trs[reco])
            for section in transcription.get_speech_sections():
                self.sections.append(section)
                section_length = section.wav_tensor.shape[0] / sample_rate
                if extract_chunks:
                    self.index2section.extend(
                        [len(self.sections) - 1]
                        * int(section_length // avg_chunk_len + 1))
                else:
                    self.index2section.append(len(self.sections) - 1)
        except Exception:
            logging.warning(f"Cannot load transcription/audio for {reco}",
                            exc_info=True)

    self.augment = None
    if not no_augment:
        augmentations = []
        if kwargs["rir_dir"] != "":
            augmentations.append(
                audiomentations.AddImpulseResponse(
                    ir_path=kwargs["rir_dir"], p=0.3, lru_cache_size=1024))
        if kwargs["noise_dir"] != "":
            augmentations.append(
                audiomentations.AddBackgroundNoise(
                    sounds_path=kwargs["noise_dir"], p=0.3,
                    lru_cache_size=1024))
        if kwargs["short_noise_dir"] != "":
            augmentations.append(
                audiomentations.AddShortNoises(
                    sounds_path=kwargs["short_noise_dir"], p=0.3,
                    lru_cache_size=1024))
        if len(augmentations) > 0:
            self.augment = audiomentations.Compose(augmentations)
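# Sketch of applying self.augment when serving a chunk. Assumptions (not in
# the source): this helper name and the torch round-trip are illustrative;
# audiomentations operates on float32 numpy arrays.
def _augment_chunk(self, wav_tensor):
    import numpy as np
    import torch

    if self.augment is None:
        return wav_tensor
    samples = wav_tensor.numpy().astype(np.float32)
    augmented = self.augment(samples=samples, sample_rate=self.sample_rate)
    return torch.from_numpy(augmented)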
def get_training_augmentation():
    train_transform = [
        # audi.Normalize(),
    ]
    return audi.Compose(train_transform)
def get_test_augmentation():
    """Test-time transforms (paddings to make audio shape divisible by 32
    could go here; currently empty)."""
    test_transform = [
        # audi.Normalize(),
    ]
    return audi.Compose(test_transform)
import audiomentations
import cv2
from model.mixers import UseMixerWithProb, RandomMixer, SigmoidConcatMixer, \
    AddMixer, SigmoidVerticalConcatMixer
from model.random_resized_crop import RandomResizedCrop, RandomResizedCrop2
from model.transforms import Compose, UseWithProb, SpecAugment, SpectreScale, \
    PreprocessMelImage, GaussNoise, OneOf, PadToSize, RandomCrop, \
    PreprocessSingleChannelMelImage

wave_augmentations = {
    0: None,
    1: audiomentations.Compose([
        audiomentations.AddGaussianNoise(min_amplitude=0.001,
                                         max_amplitude=0.015, p=0.5),
        audiomentations.TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),
        audiomentations.PitchShift(min_semitones=-4, max_semitones=4, p=0.5),
        audiomentations.Shift(min_fraction=-0.5, max_fraction=0.5, p=0.5),
    ], p=0.96),  # (1 - (1 - 0.5)**4) * 0.96 == 0.9, i.e. ~90% of samples get at least one augmentation
    2: audiomentations.Compose([
        audiomentations.AddGaussianNoise(min_amplitude=0.001,
                                         max_amplitude=0.010, p=0.95),
        audiomentations.Shift(min_fraction=-0.1, max_fraction=0.1, p=0.3),
    ], p=1),
}

size_4_sec_750_hop = 256

_base_mel_post_process = {
    'none': [],
    '3ch_1': [
        # Uses librosa.feature.delta with order 1 and 2 to create two
        # additional channels, then divides by 100.
        PreprocessMelImage(),
    ],
    '1ch_1': [
        PreprocessSingleChannelMelImage(),
    ],
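# Usage sketch: selecting a pipeline from wave_augmentations and applying it
# to a raw waveform (the synthetic signal and sample rate are illustrative).
def _demo_wave_augmentation(aug_id=1, sample_rate=44100):
    import numpy as np

    augmenter = wave_augmentations[aug_id]
    samples = np.random.uniform(-1.0, 1.0, sample_rate * 2).astype(np.float32)
    if augmenter is None:  # id 0 disables waveform augmentation
        return samples
    return augmenter(samples=samples, sample_rate=sample_rate)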