Example #1
    def __init__(self,
                 train_img_path,
                 batch_size,
                 shuffle=True,
                 validation_split=0.0,
                 num_workers=1,
                 p_augment=0.0,
                 training=True):

        img_transforms = transforms.Compose([
            transforms.Resize((512, 300)),
            transforms.ToTensor(),
            transforms.RandomChoice([
                transforms.RandomApply(
                    [audiotransforms.FrequencyMasking(freq_mask_param=50)],
                    p=p_augment),
                transforms.RandomApply(
                    [audiotransforms.TimeMasking(time_mask_param=100)],
                    p=p_augment)
            ]),
        ])

        self.train_img_path = train_img_path
        self.dataset = ImageFolder(root=self.train_img_path,
                                   transform=img_transforms)
        super().__init__(self.dataset, batch_size, shuffle, validation_split,
                         num_workers)
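
A minimal standalone sketch of the pattern used above (torchaudio masking transforms wrapped in torchvision's RandomApply/RandomChoice); the tensor shape and the p_augment value are illustrative, not taken from the original project:

import torch
import torchvision.transforms as transforms
import torchaudio.transforms as audiotransforms

p_augment = 0.5
maybe_mask = transforms.RandomChoice([
    transforms.RandomApply([audiotransforms.FrequencyMasking(freq_mask_param=50)], p=p_augment),
    transforms.RandomApply([audiotransforms.TimeMasking(time_mask_param=100)], p=p_augment),
])

spec = torch.rand(1, 512, 300)   # (channel, freq, time), the size Resize((512, 300)) would produce
out = maybe_mask(spec)           # one masking is chosen at random, then applied with probability p_augment
print(out.shape)                 # torch.Size([1, 512, 300])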
Example #2
    def train_input_per_sample_transform(self) -> Callable:
        transforms = []
        if self.time_mask_param is not None:
            transforms.append(TAudio.TimeMasking(time_mask_param=self.time_mask_param))

        if self.freq_mask_param is not None:
            transforms.append(TAudio.FrequencyMasking(freq_mask_param=self.freq_mask_param))

        transforms += [T.ToTensor(), T.Resize(self.spectrogram_size)]
        return T.Compose(transforms)
Example #3
    def __init__(self,
                 train_img_path,
                 batch_size,
                 shuffle=True,
                 validation_split=0.0,
                 num_workers=1,
                 p_augment=0.0,
                 training=True):

        alexnet_transforms = transforms.Compose([
            transforms.Resize(256),
            transforms.ToTensor(),
            transforms.RandomChoice([
                transforms.RandomApply(
                    [audiotransforms.FrequencyMasking(freq_mask_param=50)],
                    p=p_augment),
                transforms.RandomApply(
                    [audiotransforms.TimeMasking(time_mask_param=100)],
                    p=p_augment)
            ]),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
        ])
        """
        alexnet_transforms = A.Compose([
                                         A.Resize(256,256,always_apply=True),
                                         A.OneOf([
                                            A.GaussNoise(p=0.4),
                                            A.RandomBrightnessContrast(p=0.4),
                                            A.ShiftScaleRotate(shift_limit_x = 0.1,scale_limit = 0, rotate_limit=0,p=0.4)
                                        ], p = p_augment),
                                         A.Normalize(mean=[0.485, 0.456, 0.406], 
                                                             std=[0.229, 0.224, 0.225],always_apply=True),
                                         ToTensorV2(always_apply=True)
                                        ]) 
       
        alexnet_transforms = transforms.Compose([
                                         transforms.Resize(256),
                                         transforms.ToTensor(),
                                         #transforms.Lambda(lambda x: torch.unsqueeze(x,1)),
                                         #transforms.Lambda(lambda x: torch.cat((x, x, x), 1)),
                                         transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                                             std=[0.229, 0.224, 0.225])
                                        ])
        """

        self.train_img_path = train_img_path
        #self.dataset = ImageFolderAlbumentations(root=self.train_img_path, transform=alexnet_transforms)
        self.dataset = ImageFolder(root=self.train_img_path,
                                   transform=alexnet_transforms)
        super().__init__(self.dataset, batch_size, shuffle, validation_split,
                         num_workers)
Example #4
    @staticmethod
    def spec_augment(spec, max_mask_pct=0.1, n_freq_masks=1, n_time_masks=1):
        # SpecAugment-style masking: hide up to max_mask_pct of the mel bins and of
        # the time steps, filling the masked regions with the spectrogram's mean value.
        _, n_mels, n_steps = spec.shape
        mask_value = spec.mean()
        aug_spec = spec

        freq_mask_param = max_mask_pct * n_mels
        for _ in range(n_freq_masks):
            aug_spec = transforms.FrequencyMasking(freq_mask_param)(aug_spec,
                                                                    mask_value)

        time_mask_param = max_mask_pct * n_steps
        for _ in range(n_time_masks):
            aug_spec = transforms.TimeMasking(time_mask_param)(aug_spec,
                                                               mask_value)

        return aug_spec
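
For reference, the masking-with-mean-value call used in spec_augment can be written directly against torchaudio; a minimal sketch on a fake mel spectrogram, with the 10% mask parameters computed the same way (shape and sizes are illustrative):

import torch
import torchaudio.transforms as transforms

spec = torch.rand(1, 64, 256)          # (channel, n_mels, n_steps), stand-in for a real mel spectrogram
mask_value = spec.mean().item()        # mask with the mean instead of zeros, as above

freq_masking = transforms.FrequencyMasking(freq_mask_param=int(0.1 * 64))
time_masking = transforms.TimeMasking(time_mask_param=int(0.1 * 256))

aug_spec = time_masking(freq_masking(spec, mask_value), mask_value)
print(aug_spec.shape)                  # torch.Size([1, 64, 256]) – only values change, not the shape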
Example #5
def construct_transforms(log_compression,
                         time_cutout,
                         freq_cutout,
                         n_mels=40,
                         **kwargs):
    train_tfs = {
        'audio':
        Compose([
            ToTensor(),
            transforms.MelSpectrogram(sample_rate=8192,
                                      n_fft=512,
                                      hop_length=128,
                                      n_mels=n_mels),
            LogCompress(ratio=log_compression),
            transforms.TimeMasking(time_cutout),
            transforms.FrequencyMasking(freq_cutout),
            TorchUnsqueeze()
        ]),
        'target':
        Compose([
            ShortTermAverageTransform(frame_length=512,
                                      hop_length=128,
                                      threshold=0.5),
            ToTensor()
        ])
    }
    dev_tfs = {
        'audio':
        Compose([
            ToTensor(),
            transforms.MelSpectrogram(sample_rate=8192,
                                      n_fft=512,
                                      hop_length=128,
                                      n_mels=n_mels),
            LogCompress(ratio=log_compression),
            TorchUnsqueeze(),
        ]),
        'target':
        Compose([
            ShortTermAverageTransform(frame_length=512,
                                      hop_length=128,
                                      threshold=0.5),
            ToTensor()
        ]),
    }
    return train_tfs, dev_tfs
Example #6
 def __init__(self, root_dir, transform=None, train=False):
     """
     Args:
         root_dir (string): Directory with all the subdirectory for each speaker and their audio.
         transform (callable, optional): Optional transform to be applied
             on a sample.
     """
     if train:
         self.root_dir = os.path.join(root_dir, "train")
     else:
         self.root_dir = os.path.join(root_dir, "test")
     self.speaker_frame, self.name_dict = self._create_speaker_dataframe()
     self.transform = transform
     self.sample_rate = 16000
     self.resample_trans = torchaudio.transforms.Resample(
         48000, self.sample_rate)
     self.freq_masking = T.FrequencyMasking(freq_mask_param=80,
                                            iid_masks=True)
     self.time_masking = T.TimeMasking(time_mask_param=80, iid_masks=True)
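
Because the transforms above are constructed with iid_masks=True, they draw an independent mask for every example of a 4D batch; a minimal sketch (the batch shape is illustrative, and T refers to torchaudio.transforms as in the snippet):

import torch
import torchaudio.transforms as T

freq_masking = T.FrequencyMasking(freq_mask_param=80, iid_masks=True)
time_masking = T.TimeMasking(time_mask_param=80, iid_masks=True)

batch = torch.rand(8, 1, 128, 400)   # (batch, channel, freq, time)
masked = time_masking(freq_masking(batch))
# With iid_masks=True each of the 8 spectrograms gets its own randomly placed masks;
# with the default iid_masks=False the same mask would be applied to the whole batch.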
Example #7
 def __getitem__(self, index):
     audio, sr = load(self.file_paths[index])
     audio = torch.mean(audio, dim=0, keepdim=True)
     if self.sr != sr:
         audio = transforms.Resample(sr, self.sr)(audio)
     mel_spectrogram = transforms.MelSpectrogram(sample_rate=self.sr,
                                                 n_fft=self.n_fft,
                                                 win_length=self.win_length,
                                                 hop_length=self.hop_length,
                                                 n_mels=self.n_mels,
                                                 f_max=self.sr / 2)(audio)
     if self.log_mel:
         offset = 1e-6
         mel_spectrogram = torch.log(mel_spectrogram + offset)
     else:
         mel_spectrogram = transforms.AmplitudeToDB(
             stype="power", top_db=80)(mel_spectrogram)
      if self.augment:
          # Apply the masking to the spectrogram itself (not the raw waveform),
          # otherwise the augmentation never reaches the returned features.
          mel_spectrogram = transforms.FrequencyMasking(freq_mask_param=20)(mel_spectrogram)
          mel_spectrogram = transforms.TimeMasking(time_mask_param=10)(mel_spectrogram)
     label = self.labels[index]
     return mel_spectrogram, label
Example #8
def train_default_transforms(
        spectrogram_size: Tuple[int, int], time_mask_param: Optional[int],
        freq_mask_param: Optional[int]) -> Dict[str, Callable]:
    """During training we apply the default transforms with optional ``TimeMasking`` and ``Frequency Masking``."""
    augs = []

    if time_mask_param is not None:
        augs.append(
            ApplyToKeys(DefaultDataKeys.INPUT,
                        TAudio.TimeMasking(time_mask_param=time_mask_param)))

    if freq_mask_param is not None:
        augs.append(
            ApplyToKeys(
                DefaultDataKeys.INPUT,
                TAudio.FrequencyMasking(freq_mask_param=freq_mask_param)))

    if len(augs) > 0:
        return merge_transforms(
            default_transforms(spectrogram_size),
            {"post_tensor_transform": nn.Sequential(*augs)})
    return default_transforms(spectrogram_size)
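
Setting aside the ApplyToKeys/DefaultDataKeys plumbing, the function above relies on both maskings being ordinary nn.Module transforms, so they can be stacked in an nn.Sequential and applied to a spectrogram tensor; a minimal sketch with illustrative parameters:

import torch
from torch import nn
import torchaudio.transforms as TAudio

augs = nn.Sequential(
    TAudio.TimeMasking(time_mask_param=30),
    TAudio.FrequencyMasking(freq_mask_param=15),
)

spec = torch.rand(1, 128, 300)   # (channel, freq, time)
augmented = augs(spec)           # both maskings are applied in order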
Example #9
 def test_TimeMasking(self):
     tensor = torch.rand((10, 2, 50, 10, 2))
     self._assert_consistency(
         T.TimeMasking(time_mask_param=30, iid_masks=False), tensor)
Example #10
transform = {
    'val': {
        'base': base
    },
    'test': {
        'base': base
    },
    'train': {
        'base': base
    },
}

# Augmentation
logmel_aug = T.Compose([
    TA.TimeMasking(time_mask_param=30),
    TA.FrequencyMasking(freq_mask_param=15)
])

logmel_A = T.Lambda(lambd=lambda x: torch.cat((x, logmel_aug(x)), dim=2))

augment = {
    'val': {
        'logmel': logmel_A
    },
    'test': {
        'logmel': logmel_A
    },
    'train': {
        'logmel': logmel_A
    },
}
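
The logmel_A lambda above keeps the clean log-mel features and appends a masked copy along the time axis; a minimal self-contained sketch showing the resulting shape (the input dimensions are illustrative):

import torch
import torchvision.transforms as T
import torchaudio.transforms as TA

logmel_aug = T.Compose([
    TA.TimeMasking(time_mask_param=30),
    TA.FrequencyMasking(freq_mask_param=15)
])
logmel_A = T.Lambda(lambd=lambda x: torch.cat((x, logmel_aug(x)), dim=2))

x = torch.rand(1, 64, 200)        # (channel, n_mels, frames)
print(logmel_A(x).shape)          # torch.Size([1, 64, 400]) – original followed by the masked copy in time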
Example #11
rate = 0.9
spec_ = stretch(spec, rate)
plot_spectrogram(torch.abs(spec_[0]), title=f"Stretched x{rate}", aspect='equal', xmax=304)

######################################################################
# TimeMasking
# ~~~~~~~~~~~
#

torch.random.manual_seed(4)

spec = get_spectrogram()
plot_spectrogram(spec[0], title="Original")

masking = T.TimeMasking(time_mask_param=80)
spec = masking(spec)

plot_spectrogram(spec[0], title="Masked along time axis")

######################################################################
# FrequencyMasking
# ~~~~~~~~~~~~~~~~
#


torch.random.manual_seed(4)

spec = get_spectrogram()
plot_spectrogram(spec[0], title="Original")

masking = T.FrequencyMasking(freq_mask_param=80)
spec = masking(spec)

plot_spectrogram(spec[0], title="Masked along frequency axis")
Example #12
    def __init__(
            self,
            wake_words_root_path="D:/Workspace/Projects/Voice2Command/recordings/positive",
            background_sounds_root_path="D:/Storage/UrbanSound8K/audio/fold1",
            max_length=3,
            sampling_rate=44100,  # 44.1 kHz
            testing=False):
        self.wake_words_positive_root_path = wake_words_root_path + "/positive"
        self.wake_words_negative_root_path = wake_words_root_path + "/negative"
        self.sampling_rate = sampling_rate
        self.background_sounds_root_path = background_sounds_root_path

        self.generated_samples = []
        self.sample_size = sampling_rate * max_length

        self.background_noise_sound_paths = list(
            pathlib.Path(background_sounds_root_path).glob('*.wav'))
        self.wake_words = self._load_wake_words(
            self.wake_words_positive_root_path)
        self.wake_words_negative = self._load_wake_words(
            self.wake_words_negative_root_path)

        #Spec Augment transforms
        self.transforms = nn.Sequential(
            transforms.FrequencyMasking(freq_mask_param=2),
            transforms.TimeMasking(time_mask_param=4))

        number_of_samples = 400
        if testing:
            number_of_samples = 50

        for idx, path in enumerate(
                self.background_noise_sound_paths[:number_of_samples]):
            y, sr = librosa.core.load(path, sr=sampling_rate)

            if len(y) < self.sample_size:
                y = np.pad(y, (0, self.sample_size - len(y)))
            else:
                y = y[:self.sample_size]

            y_false = np.array(y, copy=True)
            y_true = y

            #Positive
            wake_word = self.sample_wake_word(self.wake_words)
            interval = self._get_random_time_interval(
                len(wake_word), max_length * sampling_rate)
            self._overlay_wakeword(y_true[interval[0]:interval[1]], wake_word)
            # self._save_sound(y)

            S_true = librosa.feature.melspectrogram(y=y_true,
                                                    sr=sr,
                                                    hop_length=128)
            S_db_true = librosa.core.power_to_db(S_true)
            S_db_true = self.transforms(torch.from_numpy(S_db_true))

            #Negative
            if random.random() > 0.5:
                wake_word = self.sample_wake_word(self.wake_words_negative)
                interval = self._get_random_time_interval(
                    len(wake_word), max_length * sampling_rate)
                self._overlay_wakeword(y_false[interval[0]:interval[1]],
                                       wake_word)
                self._save_sound(y_false)

            S_false = librosa.feature.melspectrogram(y=y_false,
                                                     sr=sr,
                                                     hop_length=128)
            S_db_false = librosa.core.power_to_db(S_false)
            S_db_false = self.transforms(torch.from_numpy(S_db_false))

            # Labels for position detection of the wake word
            # label = np.zeros(sample_size)
            # label[interval[1]:interval[1] + 50] = 1

            self.generated_samples.append((S_db_true.unsqueeze(dim=0).float(),
                                           torch.tensor([1]).float()))

            self.generated_samples.append((S_db_false.unsqueeze(dim=0).float(),
                                           torch.tensor([0]).float()))
Example #13
from convolution_net.learner import Learner
from convolution_net.load import fetch_dataloaders, build_register, train_dev_test

# environment
# torch.manual_seed(0)
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True
torch.set_num_threads(4)
# np.random.seed(0)

train_tfs = {
    'audio': Compose([
        ToTensor(),
        transforms.MelSpectrogram(sample_rate=8192, n_fft=512, hop_length=128, n_mels=40),
        LogCompress(ratio=1),
        transforms.TimeMasking(4),
        transforms.FrequencyMasking(4),
        TorchUnsqueeze()
    ]),
    'target': Compose([
        ShortTermAverageTransform(frame_length=512, hop_length=128, threshold=0.5),
        # ThresholdPoolSequence(0.001),  # was 0.125
        ToTensor()
    ])
}
dev_tfs = {
    'audio': Compose([
        ToTensor(),
        transforms.MelSpectrogram(sample_rate=8192, n_fft=512, hop_length=128, n_mels=40),
        LogCompress(ratio=1),
        TorchUnsqueeze(),
    ]),
    'target': Compose([
        ShortTermAverageTransform(frame_length=512, hop_length=128, threshold=0.5),
        ToTensor()
    ])
}
Example #14
def main(args, config):

    
    model = AudioOnly(8, base_model=args.arch)
 
    import torchaudio.transforms as at

    t = []
    if args.masking_time != 0:
        t.append(at.TimeMasking(args.masking_time))

    if args.masking_freq != 0:
        t.append(at.FrequencyMasking(args.masking_freq))

    transform = transforms.Compose(t)

    dataset = AudioDataSet("train", transform=transform)

    val_transform = transforms.Compose([
       
        ])
 
    sampler = None
    

    train_loader = torch.utils.data.DataLoader(
        dataset,
        sampler=sampler,
        batch_size=args.batch_size, shuffle=True,
        num_workers=args.workers, pin_memory=True, collate_fn=None, drop_last=False)

    val_loader = torch.utils.data.DataLoader(
        AudioDataSet("val",transform=val_transform),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True, collate_fn=None)


    logger = config.get_logger('train')
    logger.info(model)

    criterion_categorical = getattr(module_loss, config['loss'])
    criterion_continuous = getattr(module_loss, config['loss_continuous'])

    metrics = [getattr(module_metric, met) for met in config['metrics']]
    metrics_continuous = [getattr(module_metric, met) for met in config['metrics_continuous']]

    # policies = model.get_optim_policies(lr=args.lr)
    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    lr_scheduler = config.init_obj('lr_scheduler', torch.optim.lr_scheduler, optimizer)

    for param_group in optimizer.param_groups:
        print(param_group['lr'])
    trainer = Trainer(model, criterion_categorical, criterion_continuous, metrics, metrics_continuous, optimizer,
                      categorical=True,
                      continuous=False,
                      config=config,
                      data_loader=train_loader,
                      valid_data_loader=val_loader,
                      lr_scheduler=lr_scheduler)

    trainer.train()


    test_loader = torch.utils.data.DataLoader(
        AudioDataSet("test",transform=val_transform),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True, collate_fn=None)
   

    """ load best model and test """
    cp = torch.load(str(trainer.checkpoint_dir / 'model_best.pth'))

    model.load_state_dict(cp['state_dict'],strict=True)
    print('loaded', str(trainer.checkpoint_dir / 'model_best.pth'), 'best_epoch', cp['epoch'])

    trainer = Trainer(model, criterion_categorical, criterion_continuous, metrics, metrics_continuous, optimizer,
                      categorical=True,
                      continuous=False,
                      config=config,
                      data_loader=train_loader,
                      valid_data_loader=test_loader,
                      lr_scheduler=lr_scheduler)


    trainer.test()