Beispiel #1
0
 def __init__(self,
              filter_length=1024,
              hop_length=256,
              win_length=1024,
              n_mel_channels=80,
              sampling_rate=22050,
              mel_fmin=0.0,
              mel_fmax=8000.0,
              dynamic_range_compression='nvidia'):
     """Set up the STFT helper and register the mel filterbank as a buffer.

     Args:
         filter_length: FFT size passed to ``STFT`` and used as ``n_fft``
             when building the mel filterbank.
         hop_length: Hop size passed to ``STFT``.
         win_length: Window length passed to ``STFT``.
         n_mel_channels: Number of mel bands in the filterbank.
         sampling_rate: Sampling rate used to build the filterbank.
         mel_fmin: Lower frequency edge of the filterbank.
         mel_fmax: Upper frequency edge of the filterbank.
         dynamic_range_compression: Stored unchanged on the instance; it
             is interpreted by code outside this method.
     """
     super(MelTransformer, self).__init__()
     self.n_mel_channels = n_mel_channels
     self.sampling_rate = sampling_rate
     self.dynamic_range_compression = dynamic_range_compression
     self.stft_fn = STFT(filter_length, hop_length, win_length)
     # Keyword arguments on purpose: librosa >= 0.10 made the parameters of
     # librosa.filters.mel keyword-only, so a positional call raises
     # TypeError there (assumes librosa_mel_fn is librosa.filters.mel --
     # TODO confirm against the file's imports).
     mel_basis = librosa_mel_fn(sr=sampling_rate,
                                n_fft=filter_length,
                                n_mels=n_mel_channels,
                                fmin=mel_fmin,
                                fmax=mel_fmax)
     self.register_buffer('mel_basis', torch.from_numpy(mel_basis).float())
Beispiel #2
0
 def __init__(
     self,
     filter_length=1024,
     hop_length=256,
     win_length=1024,
     n_mel_channels=80,
     sampling_rate=22050,
     mel_fmin=0.0,
     mel_fmax=None,
 ):
     """Create the STFT helper and register the mel filterbank buffer.

     Args:
         filter_length: FFT size passed to ``STFT`` and used as ``n_fft``
             for the mel filterbank.
         hop_length: Hop size passed to ``STFT``.
         win_length: Window length passed to ``STFT``.
         n_mel_channels: Number of mel bands in the filterbank.
         sampling_rate: Sampling rate used to build the filterbank.
         mel_fmin: Lower frequency edge of the filterbank.
         mel_fmax: Upper frequency edge of the filterbank; ``None`` is
             forwarded as-is so the filterbank function applies its own
             default.
     """
     super(TacotronSTFT, self).__init__()
     self.n_mel_channels = n_mel_channels
     self.sampling_rate = sampling_rate
     self.stft_fn = STFT(filter_length, hop_length, win_length)
     # Keyword arguments on purpose: librosa >= 0.10 made the parameters of
     # librosa.filters.mel keyword-only, so a positional call raises
     # TypeError there (assumes librosa_mel_fn is librosa.filters.mel --
     # TODO confirm against the file's imports).
     mel_basis = librosa_mel_fn(
         sr=sampling_rate,
         n_fft=filter_length,
         n_mels=n_mel_channels,
         fmin=mel_fmin,
         fmax=mel_fmax,
     )
     self.register_buffer("mel_basis", torch.from_numpy(mel_basis).float())
Beispiel #3
0
    def get_mel(self, y, center=False):
        """Compute a compressed mel spectrogram of ``y``.

        The mel filterbank and the Hann window are cached in
        ``self.mel_basis`` / ``self.hann_window`` and are only built when no
        entry exists for the current settings and device.

        Args:
            y: Waveform tensor. Values outside [-1, 1] only trigger a
                printed warning. NOTE(review): the unsqueeze/pad/squeeze
                sequence suggests a (batch, samples) layout -- confirm
                with callers.
            center: Forwarded to ``torch.stft``.

        Returns:
            The output of ``dynamic_range_compression_torch`` applied to the
            mel-projected magnitude spectrogram.
        """
        n_fft = self.n_fft
        hop_length = self.hop_length
        win_size = self.win_size
        fmax = self.fmax

        y_min = torch.min(y)
        if y_min < -1.:
            print('min value is ', y_min)
        y_max = torch.max(y)
        if y_max > 1.:
            print('max value is ', y_max)

        # The previous test `fmax not in self.mel_basis` compared the raw
        # fmax value against composite string keys, so it did not match the
        # stored entries and the filterbank and window were rebuilt on every
        # call. The keys below include every value the cached tensors depend
        # on, so a hit always returns what a rebuild would have produced.
        device_key = str(y.device)
        basis_key = '_'.join(
            str(part) for part in (self.target_sr, n_fft, self.n_mels,
                                   self.fmin, fmax, device_key))
        window_key = str(win_size) + '_' + device_key

        if basis_key not in self.mel_basis:
            # Keyword arguments: librosa >= 0.10 rejects positional ones
            # (assumes librosa_mel_fn is librosa.filters.mel -- TODO confirm).
            mel = librosa_mel_fn(sr=self.target_sr,
                                 n_fft=n_fft,
                                 n_mels=self.n_mels,
                                 fmin=self.fmin,
                                 fmax=fmax)
            self.mel_basis[basis_key] = torch.from_numpy(mel).float().to(
                y.device)
        if window_key not in self.hann_window:
            self.hann_window[window_key] = torch.hann_window(win_size).to(
                y.device)

        # Reflect-pad both ends by (n_fft - hop_length) / 2 samples.
        pad = int((n_fft - hop_length) / 2)
        y = torch.nn.functional.pad(y.unsqueeze(1), (pad, pad),
                                    mode='reflect')
        y = y.squeeze(1)

        # NOTE(review): the magnitude computation below sums squares over the
        # last dimension, i.e. it expects torch.stft to return real/imag
        # parts in a trailing axis. Newer PyTorch releases want an explicit
        # return_complex argument -- verify against the installed version.
        spec = torch.stft(y,
                          n_fft,
                          hop_length=hop_length,
                          win_length=win_size,
                          window=self.hann_window[window_key],
                          center=center,
                          pad_mode='reflect',
                          normalized=False,
                          onesided=True)

        # The small epsilon keeps the square root away from exactly zero.
        spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))

        spec = torch.matmul(self.mel_basis[basis_key], spec)
        return dynamic_range_compression_torch(spec, clip_val=self.clip_val)
def mel_spectrogram(y,
                    n_fft,
                    num_mels,
                    sampling_rate,
                    hop_size,
                    win_size,
                    fmin,
                    fmax,
                    center=False):
    """Compute a normalized mel spectrogram of ``y``.

    The mel filterbank and Hann window are cached in the module-level
    ``mel_basis`` / ``hann_window`` dicts and only built when no entry exists
    for the requested settings and device.

    Args:
        y: Waveform tensor. Values outside [-1, 1] only trigger a printed
            warning. NOTE(review): the unsqueeze/pad/squeeze sequence
            suggests a (batch, samples) layout -- confirm with callers.
        n_fft: FFT size for ``torch.stft`` and the mel filterbank.
        num_mels: Number of mel bands.
        sampling_rate: Sampling rate used to build the filterbank.
        hop_size: STFT hop length.
        win_size: STFT window length.
        fmin: Lower frequency edge of the filterbank.
        fmax: Upper frequency edge of the filterbank.
        center: Forwarded to ``torch.stft``.

    Returns:
        The output of ``spectral_normalize_torch`` applied to the
        mel-projected magnitude spectrogram.
    """
    y_min = torch.min(y)
    if y_min < -1.0:
        print("min value is ", y_min)
    y_max = torch.max(y)
    if y_max > 1.0:
        print("max value is ", y_max)

    global mel_basis, hann_window
    # The previous test `fmax not in mel_basis` compared the raw fmax value
    # against composite string keys, so it did not match the stored entries
    # and everything was rebuilt on each call. The keys below cover every
    # argument the cached tensors depend on, so a cache hit is always
    # equivalent to a rebuild.
    device_key = str(y.device)
    basis_key = "_".join(
        str(part)
        for part in (sampling_rate, n_fft, num_mels, fmin, fmax, device_key))
    window_key = str(win_size) + "_" + device_key

    if basis_key not in mel_basis:
        # Keyword arguments: librosa >= 0.10 rejects positional ones
        # (assumes librosa_mel_fn is librosa.filters.mel -- TODO confirm).
        mel = librosa_mel_fn(sr=sampling_rate,
                             n_fft=n_fft,
                             n_mels=num_mels,
                             fmin=fmin,
                             fmax=fmax)
        mel_basis[basis_key] = torch.from_numpy(mel).float().to(y.device)
    if window_key not in hann_window:
        hann_window[window_key] = torch.hann_window(win_size).to(y.device)

    # Reflect-pad both ends by (n_fft - hop_size) / 2 samples.
    pad = int((n_fft - hop_size) / 2)
    y = torch.nn.functional.pad(y.unsqueeze(1), (pad, pad), mode="reflect")
    y = y.squeeze(1)

    # NOTE(review): the magnitude computation below sums squares over the
    # last dimension, i.e. it expects torch.stft to return real/imag parts in
    # a trailing axis. Newer PyTorch releases want an explicit return_complex
    # argument -- verify against the installed version.
    spec = torch.stft(
        y,
        n_fft,
        hop_length=hop_size,
        win_length=win_size,
        window=hann_window[window_key],
        center=center,
        pad_mode="reflect",
        normalized=False,
        onesided=True,
    )

    # The small epsilon keeps the square root away from exactly zero.
    spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))

    spec = torch.matmul(mel_basis[basis_key], spec)
    spec = spectral_normalize_torch(spec)

    return spec
Beispiel #5
0
    def __init__(self, n_fft, hop_length, win_length, sample_rate, n_mels,
                 f_min, f_max, preemph):
        """Register the window, mel filterbank and pre-emphasis buffers.

        Args:
            n_fft: FFT size used for the mel filterbank; stored on the
                instance.
            hop_length: Stored on the instance.
            win_length: Length of the Hann window buffer; stored on the
                instance.
            sample_rate: Sampling rate used to build the filterbank.
            n_mels: Number of mel bands.
            f_min: Lower frequency edge of the filterbank.
            f_max: Upper frequency edge of the filterbank.
            preemph: Coefficient placed (negated) in the two-tap
                ``preemph_kernel`` buffer.
        """
        super().__init__()

        self.register_buffer("window", torch.hann_window(win_length).float())

        # Keyword arguments on purpose: librosa >= 0.10 made the parameters
        # of librosa.filters.mel keyword-only, so a positional call raises
        # TypeError there (assumes librosa_mel_fn is librosa.filters.mel --
        # TODO confirm against the file's imports).
        mel_basis = librosa_mel_fn(sr=sample_rate,
                                   n_fft=n_fft,
                                   n_mels=n_mels,
                                   fmin=f_min,
                                   fmax=f_max)
        self.register_buffer("mel_basis",
                             torch.from_numpy(mel_basis).float())

        # Kernel of shape (1, 1, 2) holding [-preemph, 1]; presumably applied
        # as a 1-D convolution elsewhere in the class -- confirm in forward.
        preemph_kernel = torch.FloatTensor([[[-preemph, 1]]])
        self.register_buffer("preemph_kernel", preemph_kernel)

        self.n_fft = n_fft
        self.hop_length = hop_length
        self.win_length = win_length
        self.sample_rate = sample_rate
        self.n_mels = n_mels
Beispiel #6
0
 def __init__(self):
     """Configure the module from the module-level ``hps`` settings.

     Copies the STFT and mel settings from ``hps`` onto the instance,
     builds the ``STFT`` helper and registers the mel filterbank as the
     ``mel_basis`` buffer.
     """
     super(TacotronSTFT, self).__init__()
     self.n_mel_channels = hps.n_mel_channels
     self.sampling_rate = hps.sampling_rate
     self.filter_length = hps.filter_length
     self.hop_length = hps.hop_length
     self.win_length = hps.win_length
     self.mel_fmin = hps.mel_fmin
     self.mel_fmax = hps.mel_fmax
     self.stft_fn = STFT(filter_length=self.filter_length,
                         hop_length=self.hop_length,
                         win_length=self.win_length)
     # The filterbank comes back as a numpy array. Keyword arguments on
     # purpose: librosa >= 0.10 made the parameters of librosa.filters.mel
     # keyword-only, so a positional call raises TypeError there (assumes
     # librosa_mel_fn is librosa.filters.mel -- TODO confirm).
     mel_basis = librosa_mel_fn(sr=self.sampling_rate,
                                n_fft=self.filter_length,
                                n_mels=self.n_mel_channels,
                                fmin=self.mel_fmin,
                                fmax=self.mel_fmax)
     # numpy -> float32 torch tensor, kept as a (non-parameter) buffer
     mel_basis = torch.from_numpy(mel_basis).float()
     self.register_buffer('mel_basis', mel_basis)
Beispiel #7
0
 def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
              n_mel_channels=40, sampling_rate=16000, mel_fmin=0.0,
              mel_fmax=8000.0):
     """Mel feature extraction.

     :param filter_length: number of FFT points
     :param hop_length: hop (stride) between frames
     :param win_length: window length
     :param n_mel_channels: number of mel channels
     :param sampling_rate: sampling rate
     :param mel_fmin: lower cutoff frequency
     :param mel_fmax: upper cutoff frequency
     """
     super(MelSpec, self).__init__()
     self.n_mel_channels = n_mel_channels
     self.sampling_rate = sampling_rate
     self.stft_fn = STFT(filter_length=filter_length, hop_length=hop_length, win_length=win_length)
     # Keyword arguments on purpose: librosa >= 0.10 made the parameters of
     # librosa.filters.mel keyword-only, so a positional call raises
     # TypeError there (assumes librosa_mel_fn is librosa.filters.mel --
     # TODO confirm against the file's imports).
     mel_bias = librosa_mel_fn(sr=sampling_rate, n_fft=filter_length,
                               n_mels=n_mel_channels, fmin=mel_fmin,
                               fmax=mel_fmax)
     # The buffer is registered under the name 'mel_bias'; that name is part
     # of the module's state dict and is therefore left untouched.
     self.register_buffer('mel_bias', torch.from_numpy(mel_bias).float())
Beispiel #8
0
    def __init__(self,
                 filter_length=1024,
                 hop_length=256,
                 win_length=1024,
                 n_mel_channels=80,
                 sampling_rate=22050,
                 mel_fmin=0.0,
                 mel_fmax=8000.0):
        """Create the STFT helper and register the mel filterbank buffer.

        Args:
            filter_length: Number of FFT components; passed to ``STFT`` and
                used as ``n_fft`` for the mel filterbank.
            hop_length: Hop size in samples, passed to ``STFT``.
            win_length: Window length in samples, passed to ``STFT``.
            n_mel_channels: Number of mel bands in the filterbank.
            sampling_rate: Sampling rate used to build the filterbank.
            mel_fmin: Lower frequency edge of the filterbank.
            mel_fmax: Upper frequency edge of the filterbank.
        """
        super(TacotronSTFT, self).__init__()
        self.n_mel_channels = n_mel_channels
        self.sampling_rate = sampling_rate
        self.stft_fn = STFT(filter_length, hop_length, win_length)
        # Keyword arguments on purpose: librosa >= 0.10 made the parameters
        # of librosa.filters.mel keyword-only, so a positional call raises
        # TypeError there (assumes librosa_mel_fn is librosa.filters.mel --
        # TODO confirm against the file's imports).
        mel_basis = librosa_mel_fn(sr=sampling_rate,
                                   n_fft=filter_length,
                                   n_mels=n_mel_channels,
                                   fmin=mel_fmin,
                                   fmax=mel_fmax)

        self.register_buffer('mel_basis',
                             torch.from_numpy(mel_basis).float())
Beispiel #9
0
 def __init__(
     self,
     n_fft=1024,
     hop_length=256,
     win_length=1024,
     sampling_rate=22050,
     n_mel_channels=80,
     mel_fmin=0.0,
     mel_fmax=None,
 ):
     """Register the mel filterbank buffer and create the STFT helper.

     Args:
         n_fft: FFT size for the mel filterbank and ``STFT``.
         hop_length: Hop size passed to ``STFT``.
         win_length: Window length passed to ``STFT``.
         sampling_rate: Sampling rate for the filterbank; also passed to
             ``STFT`` as its fourth positional argument.
         n_mel_channels: Number of mel bands in the filterbank.
         mel_fmin: Lower frequency edge of the filterbank.
         mel_fmax: Upper frequency edge of the filterbank; ``None`` is
             forwarded as-is so the filterbank function applies its own
             default.
     """
     super(Audio2Mel, self).__init__()
     # Keyword arguments on purpose: librosa >= 0.10 made the parameters of
     # librosa.filters.mel keyword-only, so a positional call raises
     # TypeError there (assumes librosa_mel_fn is librosa.filters.mel --
     # TODO confirm against the file's imports).
     mel_basis = librosa_mel_fn(sr=sampling_rate,
                                n_fft=n_fft,
                                n_mels=n_mel_channels,
                                fmin=mel_fmin,
                                fmax=mel_fmax)
     self.register_buffer("mel_basis", torch.from_numpy(mel_basis).float())
     self.stft = STFT(n_fft, hop_length, win_length, sampling_rate)
     self.n_mel_channels = n_mel_channels
Beispiel #10
0
    def __init__(self, hparams: TSTFTHParams, logger: Logger):
        """Build the STFT helper and mel filterbank buffer from ``hparams``.

        Args:
            hparams: Settings object providing ``n_mel_channels``,
                ``sampling_rate``, ``filter_length``, ``hop_length``,
                ``win_length``, ``window``, ``mel_fmin`` and ``mel_fmax``.
            logger: Stored on the instance as ``self.logger``.
        """
        super().__init__()
        self.logger = logger
        self.n_mel_channels = hparams.n_mel_channels
        self.sampling_rate = hparams.sampling_rate

        stft_settings = dict(
            filter_length=hparams.filter_length,
            hop_length=hparams.hop_length,
            win_length=hparams.win_length,
            window=hparams.window,
        )
        self.stft_fn = STFT(**stft_settings)

        filterbank = librosa_mel_fn(
            sr=hparams.sampling_rate,
            n_fft=hparams.filter_length,
            n_mels=hparams.n_mel_channels,
            fmin=hparams.mel_fmin,
            fmax=hparams.mel_fmax,
        )
        # Stored as a float32 buffer rather than a trainable parameter.
        self.register_buffer('mel_basis', torch.from_numpy(filterbank).float())
Beispiel #11
0
    def __init__(self, hparams):
        """Create the per-flow ``WN`` sub-networks and the mel basis.

        Args:
            hparams: Settings object providing ``n_flows``,
                ``n_samples_per_group``, ``n_early_every``, ``n_early_size``,
                ``sr``, ``n_fft``, ``n_mels``, ``mel_fmin``, ``mel_fmax`` and
                ``seed``.
        """
        self.hparams = hparams
        # Module does not support ModuleList yet.
        for i in range(hparams.n_flows):
            setattr(self, f'WN_{i}', WN(hparams))

        # For every flow index k > 0 that is a multiple of n_early_every,
        # n_early_size is subtracted from the channel count. (An unused
        # running `n_half` value that used to be tracked here was dropped.)
        n_remaining_channels = hparams.n_samples_per_group
        for k in range(hparams.n_flows):
            if k % hparams.n_early_every == 0 and k > 0:
                n_remaining_channels -= hparams.n_early_size
        self.n_remaining_channels = n_remaining_channels

        # Keyword arguments on purpose: librosa >= 0.10 made the parameters
        # of librosa.filters.mel keyword-only, so a positional sr/n_fft
        # raises TypeError there (assumes librosa_mel_fn is
        # librosa.filters.mel -- TODO confirm against the file's imports).
        mel_basis = librosa_mel_fn(sr=hparams.sr,
                                   n_fft=hparams.n_fft,
                                   n_mels=hparams.n_mels,
                                   fmin=hparams.mel_fmin,
                                   fmax=hparams.mel_fmax)
        # A leading axis of size 1 is added before wrapping the array.
        self.basis = nn.Variable.from_numpy_array(mel_basis[None, ...])
        self.rng = np.random.RandomState(hparams.seed)
Beispiel #12
0
 def __init__(self,
              filter_length=1024,
              hop_length=256,
              win_length=1024,
              n_mel_channels=80,
              sampling_rate=22050,
              mel_fmin=0.0,
              mel_fmax=8000.0,
              clamp_val=1e-5,
              stft_dtype=torch.float32):
     """Create the STFT helper and register the mel filterbank buffer.

     Args:
         filter_length: FFT size passed to ``STFT`` and used as ``n_fft``
             for the mel filterbank.
         hop_length: Hop size passed to ``STFT``.
         win_length: Window length passed to ``STFT``.
         n_mel_channels: Number of mel bands in the filterbank.
         sampling_rate: Sampling rate used to build the filterbank.
         mel_fmin: Lower frequency edge of the filterbank.
         mel_fmax: Upper frequency edge of the filterbank.
         clamp_val: Stored on the instance as ``self.clip_val``.
         stft_dtype: Passed to ``STFT`` as its ``dtype`` argument.
     """
     super(TacotronSTFT, self).__init__()
     self.n_mel_channels = n_mel_channels
     self.sampling_rate = sampling_rate
     self.clip_val = clamp_val
     self.stft_fn = STFT(filter_length,
                         hop_length,
                         win_length,
                         dtype=stft_dtype)
     # Keyword arguments on purpose: librosa >= 0.10 made the parameters of
     # librosa.filters.mel keyword-only, so a positional call raises
     # TypeError there (assumes librosa_mel_fn is librosa.filters.mel --
     # TODO confirm against the file's imports).
     mel_basis = librosa_mel_fn(sr=sampling_rate,
                                n_fft=filter_length,
                                n_mels=n_mel_channels,
                                fmin=mel_fmin,
                                fmax=mel_fmax)
     self.register_buffer('mel_basis', torch.from_numpy(mel_basis).float())
Beispiel #13
0
    def __init__(self,
                 filter_length=1024,
                 hop_length=256,
                 win_length=1024,
                 n_mel_channels=80,
                 sampling_rate=22050,
                 mel_fmin=0.0,
                 mel_fmax=None,
                 ref_level_db=10.,
                 min_level_db=-100.):
        """Create the STFT helper and register the mel filterbank buffer.

        Args:
            filter_length: FFT size passed to ``STFT`` and used as ``n_fft``
                for the mel filterbank.
            hop_length: Hop size passed to ``STFT``.
            win_length: Window length passed to ``STFT``.
            n_mel_channels: Number of mel bands in the filterbank.
            sampling_rate: Sampling rate used to build the filterbank.
            mel_fmin: Lower frequency edge of the filterbank.
            mel_fmax: Upper frequency edge of the filterbank; ``None`` is
                forwarded as-is so the filterbank function applies its own
                default.
            ref_level_db: Stored on the instance for
                ``mel_spectrogram_dbver``.
            min_level_db: Stored on the instance for
                ``mel_spectrogram_dbver``.
        """
        super(TacotronSTFT, self).__init__()
        self.n_mel_channels = n_mel_channels
        self.sampling_rate = sampling_rate
        self.stft_fn = STFT(filter_length, hop_length, win_length)
        # Keyword arguments on purpose: librosa >= 0.10 made the parameters
        # of librosa.filters.mel keyword-only, so a positional call raises
        # TypeError there (assumes librosa_mel_fn is librosa.filters.mel --
        # TODO confirm against the file's imports).
        mel_basis = librosa_mel_fn(sr=sampling_rate,
                                   n_fft=filter_length,
                                   n_mels=n_mel_channels,
                                   fmin=mel_fmin,
                                   fmax=mel_fmax)
        self.register_buffer('mel_basis',
                             torch.from_numpy(mel_basis).float())

        # to be used for mel_spectrogram_dbver
        self.ref_level_db = ref_level_db
        self.min_level_db = min_level_db
Beispiel #14
0
    def __init__(self, metadata, hparams, shuffle=False, rng=None):
        """Index the corpus listed in ``metadata`` and build the mel basis.

        Args:
            metadata: File name (relative to ``hparams.data_dir``) of a
                ``|``-separated listing. Field 0 is the wave file stem and
                field 2 is the text.
            hparams: Settings object providing ``seed``, ``data_dir``,
                ``save_data_dir``, ``out_variables``, ``vocab``, ``sr``,
                ``n_fft``, ``n_mels``, ``mel_fmin``, ``mel_fmax`` and,
                for distributed runs, ``comm``.
            shuffle: Whether the sample order is permuted.
            rng: Random state; defaults to one seeded with ``hparams.seed``.
        """
        if rng is None:
            rng = np.random.RandomState(hparams.seed)
        super().__init__(shuffle=shuffle, rng=rng)

        # read text and wave files
        data_dir = Path(hparams.data_dir)
        texts, waves = [], []
        with open(data_dir / metadata, encoding='utf-8') as f:
            for line in f:
                fields = line.strip().split('|')
                waves.append(str(data_dir / 'wavs' / f'{fields[0]}.wav'))
                texts.append(fields[2])

        # split data
        n = len(waves)
        index = self._rng.permutation(n) if shuffle else np.arange(n)
        if hasattr(hparams, 'comm'):  # distributed learning
            # Each process takes an equally sized contiguous slice; samples
            # beyond n_procs * num are not assigned to any process.
            num = n // hparams.comm.n_procs
            rank = hparams.comm.rank
            index = index[num * rank:num * (rank + 1)]

        self._waves = [waves[i] for i in index]
        self._texts = [texts[i] for i in index]
        self._path = Path(hparams.save_data_dir)
        self._size = len(self._waves)
        self._variables = hparams.out_variables
        self.hparams = hparams
        self._char2idx = {ch: i for i, ch in enumerate(hparams.vocab)}
        self._idx2char = {i: ch for i, ch in enumerate(hparams.vocab)}

        # compute the mel basis
        # Keyword arguments on purpose: librosa >= 0.10 made the parameters
        # of librosa.filters.mel keyword-only, so a positional sr/n_fft
        # raises TypeError there (assumes librosa_mel_fn is
        # librosa.filters.mel -- TODO confirm against the file's imports).
        self.mel_basis = librosa_mel_fn(sr=hparams.sr,
                                        n_fft=hparams.n_fft,
                                        n_mels=hparams.n_mels,
                                        fmin=hparams.mel_fmin,
                                        fmax=hparams.mel_fmax)
        self.reset()
Beispiel #15
0
def main():
    """Parse the command line and run the selected preprocessing method.

    The output directory is ``<out-dir>/<method>`` and is created if needed.
    ``args.fs`` is fixed to 22050 before the preprocessing function is
    called.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--in-dir",
                        default="data/raw/",
                        help="Path to raw audio.")

    valid_methods = ["mel", "if", "taco"]
    # `choices` makes argparse reject any other value, so no separate
    # validity check is needed after parsing.
    parser.add_argument("--method",
                        default=valid_methods[2],
                        help="Preprocessing method to use.",
                        choices=valid_methods)

    parser.add_argument("--out-dir",
                        default="data/",
                        help="Output base directory")

    args = parser.parse_args()

    args.fs = 22050  # 48000

    args.out_dir = join(args.out_dir, args.method)
    makedirs(args.out_dir, exist_ok=True)

    if args.method == valid_methods[0]:
        preprocess_mel(args.in_dir, args)
    elif args.method == valid_methods[1]:
        # argparse stores "--in-dir" as `in_dir`; the former `args._in_dir`
        # raised AttributeError whenever the "if" method was selected.
        preprocess_if(args.in_dir, args)
    elif args.method == valid_methods[2]:
        # Keyword arguments on purpose: librosa >= 0.10 made the parameters
        # of librosa.filters.mel keyword-only (assumes librosa_mel_fn is
        # librosa.filters.mel -- TODO confirm against the file's imports).
        mel_basis = librosa_mel_fn(sr=22050,
                                   n_fft=1024,
                                   n_mels=80,
                                   fmin=0,
                                   fmax=8000)
        mel_basis = torch.from_numpy(mel_basis).float()
        stft = STFT(1024, 256, 1024)
        embed_taco = partial(taco_mel, mel_basis, stft)
        preprocess_taco = partial(preprocess_wrap, embed_taco, 'taco')
        preprocess_taco(args.in_dir, args)
Beispiel #16
0
 def __init__(
     self,
     n_fft=1024,
     hop_length=256,
     win_length=1024,
     sampling_rate=22050,
     n_mel_channels=80,
     mel_fmin=0.0,
     mel_fmax=None,
 ):
     """Register the mel filterbank and Hann window buffers.

     Args:
         n_fft: FFT size used for the mel filterbank; stored on the
             instance.
         hop_length: Stored on the instance.
         win_length: Length of the Hann window buffer; stored on the
             instance.
         sampling_rate: Sampling rate used to build the filterbank.
         n_mel_channels: Number of mel bands in the filterbank.
         mel_fmin: Lower frequency edge of the filterbank.
         mel_fmax: Upper frequency edge of the filterbank; ``None`` is
             forwarded as-is so the filterbank function applies its own
             default.
     """
     super().__init__()
     window = torch.hann_window(win_length).float()
     # Keyword arguments on purpose: librosa >= 0.10 made the parameters of
     # librosa.filters.mel keyword-only, so a positional call raises
     # TypeError there (assumes librosa_mel_fn is librosa.filters.mel --
     # TODO confirm against the file's imports).
     mel_basis = librosa_mel_fn(sr=sampling_rate,
                                n_fft=n_fft,
                                n_mels=n_mel_channels,
                                fmin=mel_fmin,
                                fmax=mel_fmax)
     self.register_buffer("mel_basis", torch.from_numpy(mel_basis).float())
     self.register_buffer("window", window)
     self.n_fft = n_fft
     self.hop_length = hop_length
     self.win_length = win_length
     self.sampling_rate = sampling_rate
     self.n_mel_channels = n_mel_channels
     # Integer half of the difference between FFT size and hop size.
     self.pad_size = (n_fft - hop_length) // 2
Beispiel #17
0
 def __init__(
     self,
     n_fft: int = 1024,
     hop_length: int = 256,
     win_length: int = 1024,
     sampling_rate: int = 22050,
     n_mel_channels: int = 80,
     mel_fmin: float = 0.0,
     mel_fmax: float = None,
 ):
     """Register the mel filterbank and Hann window buffers.

     Args:
         n_fft: FFT size used for the mel filterbank; stored on the
             instance.
         hop_length: Stored on the instance.
         win_length: Length of the Hann window buffer; stored on the
             instance.
         sampling_rate: Sampling rate used to build the filterbank.
         n_mel_channels: Number of mel bands in the filterbank.
         mel_fmin: Lower frequency edge of the filterbank.
         mel_fmax: Upper frequency edge of the filterbank; ``None`` is
             forwarded as-is so the filterbank function applies its own
             default.
     """
     super().__init__()
     ##############################################
     # FFT Parameters                              #
     ##############################################
     window = torch.hann_window(win_length).float()
     # Keyword arguments on purpose: librosa >= 0.10 made the parameters of
     # librosa.filters.mel keyword-only, so a positional call raises
     # TypeError there (assumes librosa_mel_fn is librosa.filters.mel --
     # TODO confirm against the file's imports).
     mel_basis = librosa_mel_fn(
         sr=sampling_rate,
         n_fft=n_fft,
         n_mels=n_mel_channels,
         fmin=mel_fmin,
         fmax=mel_fmax,
     )
     self.register_buffer("mel_basis", torch.from_numpy(mel_basis).float())
     self.register_buffer("window", window)
     self.n_fft = n_fft
     self.hop_length = hop_length
     self.win_length = win_length
     self.sampling_rate = sampling_rate
     self.n_mel_channels = n_mel_channels
Beispiel #18
0
    def __init__(self, gen, dis, gen_optim, dis_optim, dataloader, hp):
        """Store the training components and prepare the mel basis.

        Args:
            gen: Stored as ``self.gen``.
            dis: Stored as ``self.dis``.
            gen_optim: Stored as ``self.gen_optim``.
            dis_optim: Stored as ``self.dis_optim``.
            dataloader: Mapping with ``'train'`` and ``'valid'`` entries,
                each exposing a ``size`` attribute.
            hp: Settings object providing ``sr``, ``n_fft``, ``n_mels``,
                ``mel_fmin``, ``mel_fmax``, ``batch_size``, ``output_path``,
                ``comm.rank`` and a ``save`` method.
        """
        self.gen = gen
        self.dis = dis
        self.gen_optim = gen_optim
        self.dis_optim = dis_optim
        self.dataloader = dataloader
        self.hp = hp

        # compute mel basis
        # Keyword arguments on purpose: librosa >= 0.10 made the parameters
        # of librosa.filters.mel keyword-only, so a positional sr/n_fft
        # raises TypeError there (assumes librosa_mel_fn is
        # librosa.filters.mel -- TODO confirm against the file's imports).
        mel_basis = librosa_mel_fn(sr=hp.sr,
                                   n_fft=hp.n_fft,
                                   n_mels=hp.n_mels,
                                   fmin=hp.mel_fmin,
                                   fmax=hp.mel_fmax)
        # A leading axis of size 1 is added before wrapping the array.
        self.mel_basis = nn.Variable.from_numpy_array(mel_basis[None, ...])

        # Number of whole batches per epoch; a partial last batch is dropped
        # by the integer division.
        self.one_epoch_train = dataloader['train'].size // hp.batch_size
        self.one_epoch_valid = dataloader['valid'].size // hp.batch_size
        self.placeholder = dict()

        # Only the process with rank 0 is not quiet.
        self.monitor = ProgressMeter(self.one_epoch_train,
                                     hp.output_path,
                                     quiet=hp.comm.rank > 0)
        hp.save(Path(hp.output_path) / 'settings.json')
Beispiel #19
0
def log_mel_spectrogram(wave, sr, window_size, n_mels=80):
    """Return log mel-spectrogram.

    Args:
        wave (nn.Variable): Input waveform of shape (B, 1, L).
        sr (int): Sampling rate.
        window_size (int): Window size; also used as the FFT size of the
            mel filterbank.
        n_mels (int): Number of mel banks.

    Returns:
        nn.Variable: Log mel-spectrogram, computed as
        ``log(mels * 1e4 + 1.0)``.
    """
    linear = spectrogram(wave, window_size)
    # The filterbank covers the fixed band 80 Hz - 7600 Hz. Keyword arguments
    # on purpose: librosa >= 0.10 made the parameters of librosa.filters.mel
    # keyword-only, so a positional sr/n_fft raises TypeError there (assumes
    # librosa_mel_fn is librosa.filters.mel -- TODO confirm).
    mel_basis = librosa_mel_fn(sr=sr,
                               n_fft=window_size,
                               n_mels=n_mels,
                               fmin=80.0,
                               fmax=7600.0)
    # A leading axis of size 1 is added so the basis can enter the batched
    # matrix product with `linear`.
    basis = nn.Variable.from_numpy_array(mel_basis[None, ...])
    mels = F.batch_matmul(basis, linear)
    return F.log(mels * 1e4 + 1.0)
Beispiel #20
0
    def __init__(self,
                 type,
                 sampling_rate,
                 max_wave_value,
                 sfft=None,
                 mel=None):
        """Store the audio settings and build the mel basis and its inverse.

        Args:
            type: Stored as ``self.type``.
            sampling_rate: Sampling rate used to build the mel filterbank.
            max_wave_value: Stored as ``self.max_wave_value``.
            sfft: Mapping with at least ``'filter_length'``. When omitted, a
                fresh dict with ``filter_length=1024``, ``hop_length=256``
                and ``win_length=1024`` is used.
            mel: Mapping with ``'n_mel_channels'``, ``'mel_fmin'`` and
                ``'mel_fmax'``; required. The string ``'None'`` for
                ``'mel_fmax'`` is treated as ``None``.

        Raises:
            AssertionError: If ``mel`` is ``None``.
        """
        if sfft is None:
            # Built per call: a dict literal in the signature would be one
            # shared object stored on every instance that used the default.
            sfft = {
                'filter_length': 1024,
                'hop_length': 256,
                'win_length': 1024
            }
        self.type = type
        self.sampling_rate = sampling_rate
        self.max_wave_value = max_wave_value
        self.sfft = sfft

        assert mel is not None
        mel_fmax = None if mel['mel_fmax'] == 'None' else mel['mel_fmax']
        # Keyword arguments on purpose: librosa >= 0.10 made the parameters
        # of librosa.filters.mel keyword-only, so a positional call raises
        # TypeError there (assumes librosa_mel_fn is librosa.filters.mel --
        # TODO confirm against the file's imports).
        self.mel_basis = librosa_mel_fn(sr=self.sampling_rate,
                                        n_fft=sfft['filter_length'],
                                        n_mels=mel['n_mel_channels'],
                                        fmin=mel['mel_fmin'],
                                        fmax=mel_fmax)
        # Pseudo-inverse of the mel filterbank.
        self.inv_mel_basis = np.linalg.pinv(self.mel_basis)
        self.num_channels = mel['n_mel_channels']
Beispiel #21
0
    def __init__(self,
            n_fft=hp.n_fft,  # filter length
            hop_length=hp.hop_size,
            win_length=hp.win_size,
            sampling_rate=hp.sample_rate,
            n_mel=hp.n_mel,
            mel_fmin=hp.mel_fmin,
            mel_fmax=hp.mel_fmax):
        """Create the STFT helper and the mel basis / pseudo-inverse buffers.

        All defaults are read from the module-level ``hp`` settings when the
        method is defined.

        Args:
            n_fft: FFT size (filter length) for ``STFT`` and the filterbank.
            hop_length: Hop size passed to ``STFT``.
            win_length: Window length passed to ``STFT``.
            sampling_rate: Sampling rate used to build the filterbank.
            n_mel: Number of mel bands; stored as ``self.n_mel_channels``.
            mel_fmin: Lower frequency edge of the filterbank.
            mel_fmax: Upper frequency edge of the filterbank.
        """
        super(MySTFT, self).__init__()

        self.n_mel_channels = n_mel
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.sampling_rate = sampling_rate

        # does reflection padding with n_fft // 2 on both sides of signal
        self.stft = STFT(n_fft, hop_length, win_length, window='hann')

        # Keyword arguments on purpose: librosa >= 0.10 made the parameters
        # of librosa.filters.mel keyword-only, so a positional call raises
        # TypeError there (assumes librosa_mel_fn is librosa.filters.mel --
        # TODO confirm against the file's imports).
        mel_basis = librosa_mel_fn(sr=sampling_rate,
                                   n_fft=n_fft,
                                   n_mels=n_mel,
                                   fmin=mel_fmin,
                                   fmax=mel_fmax)
        mel_basis = torch.from_numpy(mel_basis).float()
        # Pseudo-inverse of the filterbank, registered alongside it.
        mel_inverse = torch.pinverse(mel_basis)
        self.register_buffer('mel_basis', mel_basis)
        self.register_buffer('mel_inverse', mel_inverse)
Beispiel #22
0
    def __init__(self, config):
        """Build the three-stage convolutional network described by ``config``.

        Keys read from ``config``: ``input_shape``, ``n_classes``,
        ``base_channels``, ``block_type`` (``'basic'`` or ``'bottleneck'``),
        ``depth``, ``stage1``/``stage2``/``stage3`` (each with ``maxpool``,
        ``k1s``, ``k2s``) and the optional ``pooling_padding``,
        ``use_raw_spectograms``, ``apply_softmax``, ``n_blocks_per_stage``,
        ``n_channels``, ``grow_a_lot``, ``stop_before_global_avg_pooling``,
        ``weight_init`` and ``use_check_point``.

        Raises:
            AssertionError: If ``block_type`` is unknown or ``depth`` does not
                fit the chosen block type (``6 * k + 2`` for basic,
                ``9 * k + 2`` for bottleneck).
        """
        super(Network, self).__init__()

        input_shape = config['input_shape']
        n_classes = config['n_classes']

        base_channels = config['base_channels']
        block_type = config['block_type']
        depth = config['depth']
        self.pooling_padding = config.get("pooling_padding", 0) or 0
        self.use_raw_spectograms = config.get("use_raw_spectograms") or False
        self.apply_softmax = config.get("apply_softmax") or False

        assert block_type in ['basic', 'bottleneck']
        if self.use_raw_spectograms:
            # Fixed filterbank: sr=22050, n_fft=2048, 256 mel bands. Keyword
            # arguments on purpose: librosa >= 0.10 made the parameters of
            # librosa.filters.mel keyword-only, so a positional call raises
            # TypeError there (assumes librosa_mel_fn is librosa.filters.mel
            # -- TODO confirm against the file's imports).
            mel_basis = librosa_mel_fn(sr=22050, n_fft=2048, n_mels=256)
            mel_basis = torch.from_numpy(mel_basis).float()
            self.register_buffer('mel_basis', mel_basis)
        if block_type == 'basic':
            block = BasicBlock
            n_blocks_per_stage = (depth - 2) // 6
            assert n_blocks_per_stage * 6 + 2 == depth
        else:
            block = BottleneckBlock
            n_blocks_per_stage = (depth - 2) // 9
            assert n_blocks_per_stage * 9 + 2 == depth
        # Same block count for each of the three stages unless overridden.
        n_blocks_per_stage = [n_blocks_per_stage, n_blocks_per_stage, n_blocks_per_stage]

        if config.get("n_blocks_per_stage") is not None:
            shared_globals.console.warning(
                "n_blocks_per_stage is specified ignoring the depth param, nc=" + str(config.get("n_channels")))
            n_blocks_per_stage = config.get("n_blocks_per_stage")

        n_channels = config.get("n_channels")
        if n_channels is None:
            n_channels = [
                base_channels,
                base_channels * 2 * block.expansion,
                base_channels * 4 * block.expansion
            ]
        # NOTE(review): when n_channels comes from config, the item
        # assignments below modify the caller's list in place.
        if config.get("grow_a_lot"):
            n_channels[2] = base_channels * 8 * block.expansion

        # Input stem: 5x5 convolution with stride 2, batch norm, ReLU.
        self.in_c = nn.Sequential(nn.Conv2d(
            input_shape[1],
            n_channels[0],
            kernel_size=5,
            stride=2,
            padding=1,
            bias=False),
            nn.BatchNorm2d(n_channels[0]),
            nn.ReLU(True)
        )
        self.stage1 = self._make_stage(
            n_channels[0], n_channels[0], n_blocks_per_stage[0], block, stride=1, maxpool=config['stage1']['maxpool'],
            k1s=config['stage1']['k1s'], k2s=config['stage1']['k2s'])
        # A stage with zero blocks becomes an empty Sequential and its channel
        # count collapses to that of the preceding stage.
        if n_blocks_per_stage[1] == 0:
            self.stage2 = nn.Sequential()
            n_channels[1] = n_channels[0]
            print("WARNING: stage2 removed")
        else:
            self.stage2 = self._make_stage(
                n_channels[0], n_channels[1], n_blocks_per_stage[1], block, stride=1, maxpool=config['stage2']['maxpool'],
                k1s=config['stage2']['k1s'], k2s=config['stage2']['k2s'])
        if n_blocks_per_stage[2] == 0:
            self.stage3 = nn.Sequential()
            n_channels[2] = n_channels[1]
            print("WARNING: stage3 removed")
        else:
            self.stage3 = self._make_stage(
                n_channels[1], n_channels[2], n_blocks_per_stage[2], block, stride=1, maxpool=config['stage3']['maxpool'],
                k1s=config['stage3']['k1s'], k2s=config['stage3']['k2s'])

        ff_list = []

        # Classification head: 1x1 convolution to n_classes plus batch norm.
        ff_list += [nn.Conv2d(
            n_channels[2],
            n_classes,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=False),
            nn.BatchNorm2d(n_classes),
        ]

        # Global average pooling is appended unless the config asks to stop
        # before it.
        self.stop_before_global_avg_pooling = False
        if config.get("stop_before_global_avg_pooling"):
            self.stop_before_global_avg_pooling = True
        else:
            ff_list.append(nn.AdaptiveAvgPool2d((1, 1)))

        self.feed_forward = nn.Sequential(
            *ff_list
        )

        # initialize weights
        if config.get("weight_init") == "fixup":
            self.apply(initialize_weights)
            # The head convolution is zeroed before the fixup-specific
            # initializer is applied.
            if isinstance(self.feed_forward[0], nn.Conv2d):
                self.feed_forward[0].weight.data.zero_()
            self.apply(initialize_weights_fixup)
        else:
            self.apply(initialize_weights)
        self.use_check_point = config.get("use_check_point") or False
Beispiel #23
0
# limitations under the License.

import multiprocessing
import random
from concurrent.futures import ProcessPoolExecutor, as_completed
from pathlib import Path

import librosa as lr
import numpy as np
from librosa.filters import mel as librosa_mel_fn
from tqdm import tqdm

from hparams import hparams as hp

# Mel filterbank built once at import time from the hyper-parameters.
# Keyword arguments on purpose: librosa >= 0.10 made the parameters of
# librosa.filters.mel keyword-only, so passing sr/n_fft positionally raises
# TypeError there.
mel_basis = librosa_mel_fn(
    sr=hp.sr, n_fft=hp.n_fft, n_mels=hp.n_mels,
    fmin=hp.mel_fmin, fmax=hp.mel_fmax
)


def process(line):
    r"""Read audio waveform and preprocess it.

    Args:
        line (str): A line from metadata.
    """
    path = Path(hp.corpus_path) / 'wavs'
    meta = line.strip().split('|')
    wave = lr.load(path / f'{meta[0]}.wav', sr=hp.sr)[0]
    np.savez(
        Path(hp.precomputed_path) / 'data' / (meta[0] + '.npz'),
        wave=wave
Beispiel #24
0
                    default="mag",
                    required=False,
                    help='data_type')

args = parser.parse_args()
# args.rank is written to the environment unchanged, so it has to be a
# string.
os.environ["CUDA_VISIBLE_DEVICES"] = args.rank
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
## hyperparameters
hp = create_hparams(f"hp_config/{args.hp_config}")

## create logger
logger = prepare_directories_and_logger(
    Logger, output_directory=f'output/{args.output_directory}')

if args.feat_type == 'mel':
    # Keyword arguments on purpose: librosa >= 0.10 made the parameters of
    # librosa.filters.mel keyword-only, so a positional call raises TypeError
    # there (assumes librosa_mel_fn is librosa.filters.mel -- TODO confirm
    # against the file's imports).
    mel_basis = librosa_mel_fn(sr=22050, n_fft=1024, n_mels=80, fmin=0,
                               fmax=None)
    # Pseudo-inverse of the mel filterbank.
    inv_mel_basis = np.linalg.pinv(mel_basis)

####################################################################
"""
   Data Loader Part
"""


def make_inf_iterator(data_iterator):
    """Yield the items of ``data_iterator`` over and over, without end.

    ``data_iterator`` is iterated again from the start each time it is
    exhausted, so it has to be re-iterable (e.g. a list or a data loader).
    """
    while True:
        yield from data_iterator


class AudioLoader(torch.utils.data.Dataset):
Beispiel #25
0
    def __init__(self, config):
        """Build the three-stage convolutional network described by ``config``.

        Keys read from ``config``: ``input_shape``, ``n_classes``,
        ``base_channels``, ``block_type`` (``'basic'`` or ``'bottleneck'``),
        ``depth``, ``stage1``/``stage2``/``stage3`` (each with ``maxpool``,
        ``k1s``, ``k2s``) and the optional ``pooling_padding``,
        ``use_raw_spectograms``, ``features_shift``, ``n_blocks_per_stage``,
        ``n_channels``, ``attention_avg``,
        ``stop_before_global_avg_pooling`` and ``weight_init``.

        Side effect: sets the module-level ``shift_augment`` flag from
        ``config['features_shift']``.

        Raises:
            AssertionError: If ``block_type`` is unknown or ``depth`` does not
                fit the chosen block type (``6 * k + 2`` for basic,
                ``9 * k + 2`` for bottleneck).
        """
        super(Network, self).__init__()

        input_shape = config['input_shape']
        n_classes = config['n_classes']

        base_channels = config['base_channels']
        block_type = config['block_type']
        depth = config['depth']
        self.pooling_padding = config.get("pooling_padding", 0) or 0
        self.use_raw_spectograms = config.get("use_raw_spectograms") or False
        global shift_augment
        shift_augment = config.get("features_shift") or False
        assert block_type in ['basic', 'bottleneck']
        if self.use_raw_spectograms:
            # Fixed filterbank: sr=22050, n_fft=2048, 256 mel bands. Keyword
            # arguments on purpose: librosa >= 0.10 made the parameters of
            # librosa.filters.mel keyword-only, so a positional call raises
            # TypeError there (assumes librosa_mel_fn is librosa.filters.mel
            # -- TODO confirm against the file's imports).
            mel_basis = librosa_mel_fn(sr=22050, n_fft=2048, n_mels=256)
            mel_basis = torch.from_numpy(mel_basis).float()
            self.register_buffer('mel_basis', mel_basis)
        if block_type == 'basic':
            block = BasicBlock
            n_blocks_per_stage = (depth - 2) // 6
            assert n_blocks_per_stage * 6 + 2 == depth
        else:
            block = BottleneckBlock
            n_blocks_per_stage = (depth - 2) // 9
            assert n_blocks_per_stage * 9 + 2 == depth
        # Same block count for each of the three stages unless overridden.
        n_blocks_per_stage = [
            n_blocks_per_stage, n_blocks_per_stage, n_blocks_per_stage
        ]
        if config.get("n_blocks_per_stage") is not None:
            shared_globals.console.warning(
                "n_blocks_per_stage is specified ignoring the depth param")
            n_blocks_per_stage = config.get("n_blocks_per_stage")

        n_channels = config.get("n_channels")
        if n_channels is None:
            n_channels = [
                base_channels, base_channels * 2 * block.expansion,
                base_channels * 4 * block.expansion
            ]

        # Input stem: 5x5 convolution with stride 2, batch norm, ReLU.
        self.in_c = nn.Sequential(
            nn.Conv2d(input_shape[1],
                      n_channels[0],
                      kernel_size=5,
                      stride=2,
                      padding=1,
                      bias=False), nn.BatchNorm2d(n_channels[0]),
            nn.ReLU(True))
        self.stage1 = self._make_stage(n_channels[0],
                                       n_channels[0],
                                       n_blocks_per_stage[0],
                                       block,
                                       stride=1,
                                       maxpool=config['stage1']['maxpool'],
                                       k1s=config['stage1']['k1s'],
                                       k2s=config['stage1']['k2s'])
        self.stage2 = self._make_stage(n_channels[0],
                                       n_channels[1],
                                       n_blocks_per_stage[1],
                                       block,
                                       stride=1,
                                       maxpool=config['stage2']['maxpool'],
                                       k1s=config['stage2']['k1s'],
                                       k2s=config['stage2']['k2s'])
        self.stage3 = self._make_stage(n_channels[1],
                                       n_channels[2],
                                       n_blocks_per_stage[2],
                                       block,
                                       stride=1,
                                       maxpool=config['stage3']['maxpool'],
                                       k1s=config['stage3']['k1s'],
                                       k2s=config['stage3']['k2s'])
        # Classification head: either an AttentionAvg layer or a 1x1
        # convolution to n_classes followed by batch norm.
        ff_list = []
        if config.get("attention_avg"):
            if config.get("attention_avg") == "sum_all":
                ff_list.append(
                    AttentionAvg(n_channels[2], n_classes, sum_all=True))
            else:
                ff_list.append(
                    AttentionAvg(n_channels[2], n_classes, sum_all=False))
        else:
            ff_list += [
                nn.Conv2d(n_channels[2],
                          n_classes,
                          kernel_size=1,
                          stride=1,
                          padding=0,
                          bias=False),
                nn.BatchNorm2d(n_classes),
            ]

        # Global average pooling is appended unless the config asks to stop
        # before it.
        self.stop_before_global_avg_pooling = False
        if config.get("stop_before_global_avg_pooling"):
            self.stop_before_global_avg_pooling = True
        else:
            ff_list.append(nn.AdaptiveAvgPool2d((1, 1)))

        self.feed_forward = nn.Sequential(*ff_list)
        # # compute conv feature size
        # with torch.no_grad():
        #     self.feature_size = self._forward_conv(
        #         torch.zeros(*input_shape)).view(-1).shape[0]
        #
        # self.fc = nn.Linear(self.feature_size, n_classes)

        # initialize weights
        if config.get("weight_init") == "fixup":
            self.apply(initialize_weights)
            # When the head starts with a convolution, it is zeroed before
            # the fixup-specific initializer is applied.
            if isinstance(self.feed_forward[0], nn.Conv2d):
                self.feed_forward[0].weight.data.zero_()
            self.apply(initialize_weights_fixup)
        else:
            self.apply(initialize_weights)