Code Example #1
    def __init__(self,
                 filter_length=1024,
                 hop_length=256,
                 win_length=1024,
                 n_mel_channels=80,
                 sampling_rate=48000,
                 mel_fmin=0.0,
                 mel_fmax=8000.0):
        super(TacotronSTFT, self).__init__()
        self.n_mel_channels = n_mel_channels
        self.sampling_rate = sampling_rate
        self.stft_fn = STFT(filter_length, hop_length, win_length)
        mel_basis = librosa_mel_fn(sampling_rate, filter_length,
                                   n_mel_channels, mel_fmin, mel_fmax)
        mel_basis = torch.from_numpy(mel_basis).float()
        self.register_buffer('mel_basis', mel_basis)
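
All of these snippets call librosa.filters.mel through the alias librosa_mel_fn with positional arguments. librosa 0.10 made that signature keyword-only, so on current librosa the call in the example above would fail; a hedged equivalent:

# Keyword-only form required by librosa >= 0.10 (the positional form above
# works only on older librosa releases):
mel_basis = librosa_mel_fn(sr=sampling_rate, n_fft=filter_length,
                           n_mels=n_mel_channels, fmin=mel_fmin,
                           fmax=mel_fmax)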
Code Example #2
File: melspec.py Project: taalua/speech2singing
    def __init__(self, hp):
        super(MelSpectrogram, self).__init__()
        self.n_mel_channels = hp.n_mel_channels
        self.sampling_rate = hp.sampling_rate

        self.stft_fn = STFT(hp.filter_length, hp.hop_length,
                            hp.win_length).cuda()
        mel_basis = librosa_mel_fn(hp.sampling_rate, hp.filter_length,
                                   hp.n_mel_channels, hp.mel_fmin, None)

        inv_mel_basis = np.linalg.pinv(mel_basis)

        mel_basis = torch.from_numpy(mel_basis).float()
        inv_mel_basis = torch.from_numpy(inv_mel_basis).float().cuda()

        self.register_buffer('mel_basis', mel_basis)
        self.register_buffer('inv_mel_basis', inv_mel_basis)
Code Example #3
    def __init__(self,
                 filter_length: int = 1024,
                 hop_length: int = 256,
                 win_length: int = 1024,
                 n_mel_channels: int = 80,
                 sampling_rate: int = 22050,
                 mel_fmin: float = 0.0,
                 mel_fmax: float = 8000.0):
        super(TCTRN_Stft, self).__init__()
        self.n_mel_channels = n_mel_channels
        self.sampling_rate = sampling_rate
        self.stft_fn = STFT(filter_length, hop_length, win_length)

        mel_basis = librosa_mel_fn(sampling_rate, filter_length,
                                   n_mel_channels, mel_fmin, mel_fmax)
        mel_basis = torch.from_numpy(mel_basis).float()
        self.register_buffer('mel_basis', mel_basis)
Code Example #4
File: melspec.py Project: taalua/speech2singing
class MelSpectrogram(torch.nn.Module):
    def __init__(self, hp):
        super(MelSpectrogram, self).__init__()
        self.n_mel_channels = hp.n_mel_channels
        self.sampling_rate = hp.sampling_rate

        self.stft_fn = STFT(hp.filter_length, hp.hop_length,
                            hp.win_length).cuda()
        mel_basis = librosa_mel_fn(hp.sampling_rate, hp.filter_length,
                                   hp.n_mel_channels, hp.mel_fmin, None)

        inv_mel_basis = np.linalg.pinv(mel_basis)

        mel_basis = torch.from_numpy(mel_basis).float()
        inv_mel_basis = torch.from_numpy(inv_mel_basis).float().cuda()

        self.register_buffer('mel_basis', mel_basis)
        self.register_buffer('inv_mel_basis', inv_mel_basis)

    def _griffin_lim(self, S):
        angles = np.pi * (torch.rand_like(S) - 0.5)  # random initial phases
        y = self.stft_fn.inverse(S, angles)
        y = y.squeeze(1)
        num_samples = y.size(1)
        for i in range(100):
            angles = (self.stft_fn.transform(y))[1]
            angles = angles[:, :, :S.size(2)]
            y = self.stft_fn.inverse(S, angles)
            y = y.squeeze(1)
            y = y[:, :num_samples]
        return y

    def transform(self, y):
        magnitudes, phases = self.stft_fn.transform(y)

        magnitudes = torch.abs(magnitudes)
        mel = torch.matmul(self.mel_basis, magnitudes)
        log_mel_spec = torch.log10(torch.clamp(mel, min=1e-5))

        return log_mel_spec

    def inverse(self, S):
        S = 10**(S)
        S = torch.matmul(self.inv_mel_basis, S)
        wav = self._griffin_lim(S)
        return wav
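
A minimal usage sketch for the class above (it assumes a CUDA device and the repository's STFT implementation; hp is a hypothetical stand-in for the project's hyperparameter object):

from types import SimpleNamespace
import torch

hp = SimpleNamespace(n_mel_channels=80, sampling_rate=22050,
                     filter_length=1024, hop_length=256, win_length=1024,
                     mel_fmin=0.0)
melspec = MelSpectrogram(hp).cuda()    # .cuda() moves the mel_basis buffer too
audio = torch.randn(1, 22050).cuda()   # one second of placeholder audio
log_mel = melspec.transform(audio)     # (1, 80, T) log10-mel spectrogram
recon = melspec.inverse(log_mel)       # waveform via 100 Griffin-Lim iterations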
Code Example #5
    def __init__(self):

        # Initialize node
        rospy.init_node('detect_alpha', anonymous=True)

        # Get ros parameters
        sleep(10)
        fs = rospy.get_param("sampling_rate")
        #fs = 125
        print(fs)
        channel_count = rospy.get_param("eeg_channel_count")
        print(channel_count)

        # Initialize STFT
        self.stft = STFT(fs, 1.0, 0.25, channel_count)
        self.stft.remove_dc()
        self.stft.bandpass(5.0, 15.0)
        self.stft.window('hann')
        self.freq_bins = self.stft.freq_bins
        self.FFT = np.zeros((len(self.freq_bins), channel_count))

        # Choose channels
        self.channel_mask = np.full(channel_count, False, dtype=bool)
        self.channel_mask[7 - 1] = True
        self.channel_mask[8 - 1] = True

        # Define bands
        self.G1_mask = np.logical_and(5 < self.freq_bins, self.freq_bins < 7.5)
        self.Al_mask = np.logical_and(8.5 < self.freq_bins,
                                      self.freq_bins < 11.5)
        self.G2_mask = np.logical_and(12.5 < self.freq_bins,
                                      self.freq_bins < 15)

        # Initialize filters
        self.movavg = MovAvg(4)
        self.ignore = Ignore(0)

        # Setup publishers
        self.pub_guard1 = rospy.Publisher('guard1', Float32, queue_size=1)
        self.pub_alpha = rospy.Publisher('alpha', Float32, queue_size=1)
        self.pub_guard2 = rospy.Publisher('guard2', Float32, queue_size=1)
        self.pub_eyes = rospy.Publisher('eyes_closed', Bool, queue_size=1)

        # Subscribe
        rospy.Subscriber("eeg_channels", BCIuVolts, self.newSample)
Code Example #6
class Denoiser(torch.nn.Module):
    """ Removes model bias from audio produced with waveglow """
    def __init__(self,
                 waveglow,
                 filter_length=1024,
                 n_overlap=4,
                 win_length=1024,
                 mode='zeros',
                 device="cuda"):
        super(Denoiser, self).__init__()

        self.device = torch.device(
            "cpu" if not torch.cuda.is_available() else device)

        self.stft = STFT(filter_length=filter_length,
                         hop_length=int(filter_length / n_overlap),
                         win_length=win_length)
        self.stft.to(self.device)

        if mode == 'zeros':
            mel_input = torch.zeros((1, 80, 88),
                                    dtype=waveglow.upsample.weight.dtype,
                                    device=waveglow.upsample.weight.device)
        elif mode == 'normal':
            mel_input = torch.randn((1, 80, 88),
                                    dtype=waveglow.upsample.weight.dtype,
                                    device=waveglow.upsample.weight.device)
        else:
            raise Exception("Mode {} is not supported".format(mode))

        with torch.no_grad():
            bias_audio = waveglow.infer(mel_input, sigma=0.0).float()
            bias_spec, _ = self.stft.transform(bias_audio)

        self.register_buffer('bias_spec', bias_spec[:, :, 0][:, :, None])

    def forward(self, audio, strength=0.1):
        audio_spec, audio_angles = self.stft.transform(
            audio.to(self.device).float())
        audio_spec_denoised = audio_spec - self.bias_spec * strength
        audio_spec_denoised = torch.clamp(audio_spec_denoised, 0.0)
        audio_denoised = self.stft.inverse(audio_spec_denoised, audio_angles)

        return audio_denoised
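
A hedged usage sketch for the Denoiser above; waveglow stands for an already-loaded WaveGlow model exposing infer(mel, sigma), and mel for a placeholder mel-spectrogram batch on the same device:

denoiser = Denoiser(waveglow, device="cuda")
with torch.no_grad():
    audio = waveglow.infer(mel, sigma=0.666)      # raw vocoder output
    audio_clean = denoiser(audio, strength=0.01)  # subtract the bias spectrum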
Code Example #7
    def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
                 n_mel_channels=40, sampling_rate=16000, mel_fmin=0.0,
                 mel_fmax=8000.0):
        """ Mel feature extraction
        :param filter_length: number of FFT points
        :param hop_length: hop (stride) between frames
        :param win_length: window length
        :param n_mel_channels: number of mel channels
        :param sampling_rate: sampling rate
        :param mel_fmin: minimum cutoff frequency
        :param mel_fmax: maximum cutoff frequency
        """
        super(MelSpec, self).__init__()
        self.n_mel_channels = n_mel_channels
        self.sampling_rate = sampling_rate
        self.stft_fn = STFT(filter_length=filter_length, hop_length=hop_length, win_length=win_length)
        mel_bias = librosa_mel_fn(sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax)
        mel_bias = torch.from_numpy(mel_bias).float()
        self.register_buffer('mel_bias', mel_bias)
Code Example #8
File: layers.py Project: rishabhjain16/tacotron2
class TacotronSTFT(torch.nn.Module):
    def __init__(self,
                 filter_length=1024,
                 hop_length=256,
                 win_length=1024,
                 n_mel_channels=80,
                 sampling_rate=22050,
                 mel_fmin=0.0,
                 mel_fmax=8000.0):
        super(TacotronSTFT, self).__init__()
        self.n_mel_channels = n_mel_channels
        self.sampling_rate = sampling_rate
        self.stft_fn = STFT(filter_length, hop_length, win_length)
        mel_basis = librosa_mel_fn(sampling_rate, filter_length,
                                   n_mel_channels, mel_fmin, mel_fmax)
        mel_basis = torch.from_numpy(mel_basis).float()
        self.register_buffer('mel_basis', mel_basis)

    def spectral_normalize(self, magnitudes):
        output = dynamic_range_compression(magnitudes)
        return output

    def spectral_de_normalize(self, magnitudes):
        output = dynamic_range_decompression(magnitudes)
        return output

    def mel_spectrogram(self, y, ref_level_db=20, magnitude_power=1.5):
        """Computes mel-spectrograms from a batch of waves
        PARAMS
        ------
        y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]

        RETURNS
        -------
        mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)
        """
        assert (torch.min(y.data) >= -1)
        assert (torch.max(y.data) <= 1)

        #print('y' ,y.max(), y.mean(), y.min())
        magnitudes, phases = self.stft_fn.transform(y)
        magnitudes = magnitudes.data
        #print('stft_fn', magnitudes.max(), magnitudes.mean(), magnitudes.min())
        mel_output = torch.matmul(self.mel_basis,
                                  torch.abs(magnitudes)**magnitude_power)
        #print('_linear_to_mel', mel_output.max(), mel_output.mean(), mel_output.min())
        mel_output = self.spectral_normalize(mel_output) - ref_level_db
        #print('_amp_to_db', mel_output.max(), mel_output.mean(), mel_output.min())
        mel_output = mel_normalize(mel_output)
        #print('_normalize', mel_output.max(), mel_output.mean(), mel_output.min())
        #spec = mel_denormalize(mel_output)
        #print('_denormalize', spec.max(), spec.mean(), spec.min())
        #spec = self.spectral_de_normalize(spec + ref_level_db)**(1/magnitude_power)
        #print('db_to_amp', spec.max(), spec.mean(), spec.min())
        return mel_output
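
The spectral_normalize/spectral_de_normalize wrappers above delegate to helpers that none of these snippets define. For reference, the NVIDIA Tacotron 2 codebase implements them as below; individual forks may differ:

def dynamic_range_compression(x, C=1, clip_val=1e-5):
    # log-compress magnitudes; clip_val guards against log(0)
    return torch.log(torch.clamp(x, min=clip_val) * C)


def dynamic_range_decompression(x, C=1):
    # inverse of dynamic_range_compression
    return torch.exp(x) / C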
Code Example #9
    def __init__(self,
                 filter_length=1024,
                 hop_length=256,
                 win_length=1024,
                 n_mel_channels=80,
                 sampling_rate=22050,
                 mel_fmin=0.0,
                 mel_fmax=8000.0):
        super(TacotronSTFT, self).__init__()
        self.n_mel_channels = n_mel_channels
        self.sampling_rate = sampling_rate
        self.stft_fn = STFT(
            filter_length, hop_length,
            win_length)  # hop and window length are in samples.
        mel_basis = librosa_mel_fn(
            sampling_rate, filter_length, n_mel_channels, mel_fmin,
            mel_fmax)  ### filter_length = number of FFT components

        mel_basis = torch.from_numpy(mel_basis).float()
        self.register_buffer('mel_basis', mel_basis)
Code Example #10
File: denoiser.py Project: SortAnon/hifi-gan
    def __init__(
        self, hifigan, filter_length=1024, n_overlap=4, win_length=1024, mode="zeros"
    ):
        super(Denoiser, self).__init__()
        self.stft = STFT(
            filter_length=filter_length,
            hop_length=int(filter_length / n_overlap),
            win_length=win_length,
        ).cuda()
        if mode == "zeros":
            mel_input = torch.zeros((1, 80, 88)).cuda()
        elif mode == "normal":
            mel_input = torch.randn((1, 80, 88)).cuda()
        else:
            raise Exception("Mode {} is not supported".format(mode))

        with torch.no_grad():
            bias_audio = hifigan(mel_input).view(1, -1).float()
            bias_spec, _ = self.stft.transform(bias_audio)

        self.register_buffer("bias_spec", bias_spec[:, :, 0][:, :, None])
Code Example #11
File: mel2samp.py Project: sajattack/waveglow
class Mel2Samp(torch.utils.data.Dataset):
    """
    This is the main class that calculates the spectrogram and returns the
    spectrogram, audio pair.
    """
    def __init__(self, training_files, segment_length, filter_length,
                 hop_length, win_length, sampling_rate, mel_fmin, mel_fmax):
        self.audio_files = files_to_list(training_files)
        random.seed(1234)
        random.shuffle(self.audio_files)
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin,
                                 mel_fmax=mel_fmax)
        self.segment_length = segment_length
        self.sampling_rate = sampling_rate

    def get_mel(self, audio):
        audio_norm = audio / MAX_WAV_VALUE
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)
        return melspec

    def __getitem__(self, index):
        # Read audio
        filename = self.audio_files[index]
        audio, sampling_rate = load_wav_to_torch(filename)
        if sampling_rate != self.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, self.sampling_rate))

        # Take segment
        if audio.size(0) >= self.segment_length:
            max_audio_start = audio.size(0) - self.segment_length
            audio_start = random.randint(0, max_audio_start)
            audio = audio[audio_start:audio_start + self.segment_length]
        else:
            audio = torch.nn.functional.pad(
                audio, (0, self.segment_length - audio.size(0)),
                'constant').data

        mel = self.get_mel(audio)
        audio = audio / MAX_WAV_VALUE

        return (mel, audio)

    def __len__(self):
        return len(self.audio_files)
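
A hedged sketch of how Mel2Samp is typically driven during training; the file-list path and loader settings here are placeholders:

trainset = Mel2Samp('train_files.txt', segment_length=16000,
                    filter_length=1024, hop_length=256, win_length=1024,
                    sampling_rate=22050, mel_fmin=0.0, mel_fmax=8000.0)
train_loader = torch.utils.data.DataLoader(trainset, batch_size=4,
                                           shuffle=True, num_workers=2)
for mel, audio in train_loader:
    pass  # (mel, audio) pairs feed the vocoder training step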
Code Example #12
File: mel2samp.py Project: sajattack/waveglow
    def __init__(self, training_files, segment_length, filter_length,
                 hop_length, win_length, sampling_rate, mel_fmin, mel_fmax):
        self.audio_files = files_to_list(training_files)
        random.seed(1234)
        random.shuffle(self.audio_files)
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin,
                                 mel_fmax=mel_fmax)
        self.segment_length = segment_length
        self.sampling_rate = sampling_rate
Code Example #13
File: layers.py Project: yhgon/NanoFlow
    def __init__(self,
                 filter_length=1024,
                 hop_length=256,
                 win_length=1024,
                 n_mel_channels=80,
                 sampling_rate=22050,
                 mel_fmin=0.0,
                 mel_fmax=None,
                 ref_level_db=10.,
                 min_level_db=-100.):
        super(TacotronSTFT, self).__init__()
        self.n_mel_channels = n_mel_channels
        self.sampling_rate = sampling_rate
        self.stft_fn = STFT(filter_length, hop_length, win_length)
        mel_basis = librosa_mel_fn(sampling_rate, filter_length,
                                   n_mel_channels, mel_fmin, mel_fmax)
        mel_basis = torch.from_numpy(mel_basis).float()
        self.register_buffer('mel_basis', mel_basis)

        # to be used for mel_spectrogram_dbver
        self.ref_level_db = ref_level_db
        self.min_level_db = min_level_db
Code Example #14
File: nnet.py Project: yaoydong/voice-filter
    def __init__(self,
                 frame_len,
                 frame_hop,
                 round_pow_of_two=True,
                 embedding_dim=512,
                 log_mag=False,
                 mvn_mag=False,
                 lstm_dim=400,
                 linear_dim=600,
                 l2_norm=True,
                 bidirectional=False,
                 non_linear="relu"):
        super(VoiceFilter, self).__init__()
        supported_nonlinear = {
            "relu": F.relu,
            "sigmoid": th.sigmoid,
            "tanh": th.tanh
        }
        if non_linear not in supported_nonlinear:
            raise RuntimeError(
                "Unsupported non-linear function: {}".format(non_linear))
        N = 2**math.ceil(
            math.log2(frame_len)) if round_pow_of_two else frame_len
        num_bins = N // 2 + 1

        self.stft = STFT(frame_len,
                         frame_hop,
                         round_pow_of_two=round_pow_of_two)
        self.istft = iSTFT(frame_len,
                           frame_hop,
                           round_pow_of_two=round_pow_of_two)
        self.cnn_f = Conv2dBlock(1, 64, kernel_size=(7, 1))
        self.cnn_t = Conv2dBlock(64, 64, kernel_size=(1, 7))
        blocks = []
        for d in range(5):
            blocks.append(
                Conv2dBlock(64, 64, kernel_size=(5, 5), dilation=(1, 2**d)))
        self.cnn_tf = nn.Sequential(*blocks)
        self.proj = Conv2dBlock(64, 8, kernel_size=(1, 1))
        self.lstm = nn.LSTM(8 * num_bins + embedding_dim,
                            lstm_dim,
                            batch_first=True,
                            bidirectional=bidirectional)
        self.mask = nn.Sequential(
            nn.Linear(lstm_dim * 2 if bidirectional else lstm_dim, linear_dim),
            nn.ReLU(), nn.Linear(linear_dim, num_bins))
        self.non_linear = supported_nonlinear[non_linear]
        self.embedding_dim = embedding_dim
        self.l2_norm = l2_norm
        self.log_mag = log_mag
        self.bn = nn.BatchNorm1d(num_bins) if mvn_mag else None
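
A worked example of the FFT-size rounding above: a 25 ms frame at 16 kHz is 400 samples, which rounds up to the next power of two.

import math

frame_len = 400                            # 25 ms at 16 kHz
N = 2 ** math.ceil(math.log2(frame_len))   # 512
num_bins = N // 2 + 1                      # 257 one-sided frequency bins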
Code Example #15
class TacotronSTFT(torch.nn.Module):
    def __init__(self,
                 filter_length=1024,
                 hop_length=256,
                 win_length=1024,
                 n_mel_channels=80,
                 sampling_rate=22050,
                 mel_fmin=0.0,
                 mel_fmax=8000.0):
        super(TacotronSTFT, self).__init__()
        self.n_mel_channels = n_mel_channels
        self.sampling_rate = sampling_rate
        self.stft_fn = STFT(
            filter_length, hop_length,
            win_length)  # hop and window length are in samples.
        mel_basis = librosa_mel_fn(
            sampling_rate, filter_length, n_mel_channels, mel_fmin,
            mel_fmax)  ### filter_length = number of FFT components

        mel_basis = torch.from_numpy(mel_basis).float()
        self.register_buffer('mel_basis', mel_basis)

    def spectral_normalize(self, magnitudes):
        output = dynamic_range_compression(magnitudes)
        return output

    def spectral_de_normalize(self, magnitudes):
        output = dynamic_range_decompression(magnitudes)
        return output

    def mel_spectrogram(self, y):
        """Computes mel-spectrograms from a batch of waves
        PARAMS
        ------
        y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]
        RETURNS
        -------
        mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)
        """
        assert (torch.min(y.data) >= -1)
        assert (torch.max(y.data) <= 1)
        magnitudes, phases = self.stft_fn.transform(y)
        magnitudes = magnitudes.data
        mel_output = torch.matmul(self.mel_basis, magnitudes)
        mel_output = self.spectral_normalize(mel_output)
        if (FLAGS.OLS or FLAGS.DenseModel):
            return mel_output[:, :, 3].unsqueeze(-1)
            #stft_fn.transform pads sequence with reflection to be twice the original size.
            #hence 5 MFCC frames are produced for the 50ms window. We take the middle one, which should correspond best to the original frame.
        else:
            return mel_output
Code Example #16
class TacotronSTFT(torch.nn.Module):
    def __init__(self,
                 filter_length=1024,
                 hop_length=256,
                 win_length=1024,
                 n_mel_channels=80,
                 sampling_rate=22050,
                 mel_fmin=0.0,
                 mel_fmax=None,
                 n_group=256):
        super(TacotronSTFT, self).__init__()
        self.n_mel_channels = n_mel_channels
        self.sampling_rate = sampling_rate
        self.stft_fn = STFT(filter_length,
                            hop_length,
                            win_length,
                            n_group=n_group)
        mel_basis = librosa_mel_fn(sampling_rate, filter_length,
                                   n_mel_channels, mel_fmin, mel_fmax)
        mel_basis = torch.from_numpy(mel_basis).float()
        self.register_buffer('mel_basis', mel_basis)

    def spectral_normalize(self, magnitudes):
        output = dynamic_range_compression(magnitudes)
        return output

    def spectral_de_normalize(self, magnitudes):
        output = dynamic_range_decompression(magnitudes)
        return output

    def mel_spectrogram(self, y):
        """Computes mel-spectrograms from a batch of waves
        PARAMS
        ------
        y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]

        RETURNS
        -------
        mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)
        """
        assert (torch.min(y.data) >= -1)
        assert (torch.max(y.data) <= 1)

        magnitudes, phases = self.stft_fn.transform(y)
        magnitudes = magnitudes.data
        sq_mag = magnitudes**2  # note: computed but never used below
        mel_output = torch.matmul(self.mel_basis, magnitudes)
        mel_output = self.spectral_normalize(mel_output)
        return mel_output
Code Example #17
class MelSpec(torch.nn.Module):
    """
    这个类负责计算mel特征,并进行特征压缩
    """
    def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
                 n_mel_channels=40, sampling_rate=16000, mel_fmin=0.0,
                 mel_fmax=8000.0):
        """ mel 特征抽取
        :param filter_length: fft采样点数
        :param hop_length:  移动 stride
        :param win_length: 窗长
        :param n_mel_channels: mel channel 个数
        :param sampling_rate: 采样率
        :param mel_fmin:   最小截止频率
        :param mel_fmax:  最大截止频率
        """
        super(MelSpec, self).__init__()
        self.n_mel_channels = n_mel_channels
        self.sampling_rate = sampling_rate
        self.stft_fn = STFT(filter_length=filter_length, hop_length=hop_length, win_length=win_length)
        mel_bias = librosa_mel_fn(sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax)
        mel_bias = torch.from_numpy(mel_bias).float()
        self.register_buffer('mel_bias', mel_bias)

    def spectral_normalize(self, magnitudes):
        output = dynamic_range_compression(magnitudes)
        return output

    def mel_spectrogram(self, y):
        """ mel 特征计算
        :param y: 幅值归一化后的音频数据
        :return: mel 特征
        """
        assert torch.min(y) >= -1 and torch.max(y) <= 1
        magnitudes, phase = self.stft_fn.transform(y)   # short-time Fourier transform
        magnitudes = magnitudes.data
        mel_output = torch.matmul(self.mel_bias, magnitudes) # apply the mel triangular filter bank
        mel_output = self.spectral_normalize(mel_output) # dynamic-range compression / normalization
        return mel_output


# if __name__ == '__main__':
#     from utils import load_wav_to_torch
#     wav = load_wav_to_torch('./dataset/cough/-HG6SJVD3mQ_0.000.wav')[0]
#     mel_fn = MelSpec(filter_length=512,hop_length=160,win_length=400,n_mel_channels=40,sampling_rate=16000, mel_fmin=50,mel_fmax=800)
#     wav_norm = wav / 32768.
#     wav_norm = wav_norm.unsqueeze(0)
#     mels = mel_fn.mel_spectrogram(wav_norm)
#     print(mels)
Code Example #18
File: denoiser.py Project: BaiYunLiu/newPLC
    def __init__(self, waveglow, filter_length=512, n_overlap=2,
                 win_length=320, mode='zeros'):
        super(Denoiser, self).__init__()
        self.stft = STFT(filter_length=filter_length,
                         hop_length=int(filter_length/n_overlap),
                         win_length=win_length).cuda()
        if mode == 'zeros':
            mel_input = torch.zeros(
                (1, 80, 88),
                dtype=waveglow.upsampling.weight.dtype,
                device=waveglow.upsampling.weight.device)
        elif mode == 'normal':
            mel_input = torch.randn(
                (1, 80, 88),
                dtype=waveglow.upsampling.weight.dtype,
                device=waveglow.upsampling.weight.device)
        else:
            raise Exception("Mode {} is not supported".format(mode))

        with torch.no_grad():
            bias_audio = waveglow.infer(mel_input, sigma=0.0).float()
            bias_spec, _ = self.stft.transform(bias_audio)

        self.register_buffer('bias_spec', bias_spec[:, :, 0][:, :, None])
Code Example #19
File: layers.py Project: CookiePPP/codedump
class TacotronSTFT(torch.nn.Module):
    def __init__(self,
                 filter_length=1024,
                 hop_length=256,
                 win_length=1024,
                 n_mel_channels=80,
                 sampling_rate=22050,
                 mel_fmin=0.0,
                 mel_fmax=8000.0):
        super(TacotronSTFT, self).__init__()
        self.n_mel_channels = n_mel_channels
        self.sampling_rate = sampling_rate
        self.stft_fn = STFT(filter_length, hop_length, win_length)
        mel_basis = librosa_mel_fn(sampling_rate, filter_length,
                                   n_mel_channels, mel_fmin, mel_fmax)
        mel_basis = torch.from_numpy(mel_basis).float()
        self.register_buffer('mel_basis', mel_basis)

    def spectral_normalize(self, magnitudes):
        output = dynamic_range_compression(magnitudes)
        return output

    def spectral_de_normalize(self, magnitudes):
        output = dynamic_range_decompression(magnitudes)
        return output

    def mel_spectrogram(self, y, ref_level_db=20, magnitude_power=1.5):
        """Computes mel-spectrograms from a batch of waves
        PARAMS
        ------
        y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]

        RETURNS
        -------
        mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)
        """
        assert (torch.min(y.data) >= -1)
        assert (torch.max(y.data) <= 1)

        magnitudes, phases = self.stft_fn.transform(
            y
        )  # get magnitudes at each (overlapped) window [B, T] ->  # [B, filter_length, T//hop_length + 1]
        magnitudes = magnitudes.data
        mel_output = torch.matmul(self.mel_basis,
                                  magnitudes)  # put into mel bins(?)
        mel_output = self.spectral_normalize(
            mel_output)  # convert magnitudes to log scale
        return mel_output  # [B, n_mel_channels, T//hop_length + 1]
Code Example #20
def wav_to_image(
    filename,
    wlen,
    mindata,
    maxdata,
    save=False,
    name_save=None,
):
    h = wlen // 4  # integer hop size (wlen / 4 is a float on Python 3)
    K = np.sum(hamming(wlen, False)) / wlen

    nfft = int(2**(np.ceil(np.log2(wlen))))
    Fs, data_seq = wavfile.read(filename)
    raw_data = data_seq.astype('float32')
    max_dt = np.amax(np.absolute(raw_data))
    raw_data = raw_data / max_dt
    stft_data, _, _ = STFT(raw_data, wlen, h, nfft, Fs)
    s = abs(stft_data) / wlen / K
    if np.fmod(nfft, 2):
        s[1:, :] *= 2
    else:
        s[1:-1] *= 2  # double every bin except DC and Nyquist
    data_temp = 20 * np.log10(s + 10**-6)
    outdata = data_temp.transpose()
    """Scaling"""
    mindata = np.amin(outdata, axis=0, keepdims=True)  # note: recomputed here,
    maxdata = np.amax(outdata, axis=0, keepdims=True)  # shadowing the arguments
    outdata -= mindata
    outdata /= (maxdata - mindata)
    outdata *= 0.8
    outdata += 0.1
    figmin = np.zeros((5, outdata.shape[1]))
    figmax = np.ones((5, outdata.shape[1]))
    outdata = np.concatenate((outdata, figmin, figmax), axis=0)

    dpi = 96
    a = float(outdata.shape[0]) / dpi
    b = float(outdata.shape[1]) / dpi

    f = plt.figure(figsize=(b, a), dpi=dpi)
    f.figimage(outdata)
    if save:
        f.savefig(name_save, dpi=f.dpi)
    return f
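
A hypothetical call to the function above. Note that mindata and maxdata are recomputed inside the function, so the values passed in are effectively ignored:

fig = wav_to_image('speech.wav', wlen=256, mindata=None, maxdata=None,
                   save=True, name_save='speech_spectrogram.png')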
Code Example #21
File: detect_alpha.py Project: sahabi/shared-Control
	def __init__(self):

		# Initialize node
		rospy.init_node('detect_alpha', anonymous=True)

		# Get ros parameters
		sleep(10)
		fs = rospy.get_param("sampling_rate")
		#fs = 125
		print(fs)
		channel_count = rospy.get_param("eeg_channel_count")
		print(channel_count)

		# Initialize STFT
		self.stft = STFT(fs, 1.0, 0.25, channel_count)
		self.stft.remove_dc()
		self.stft.bandpass(5.0, 15.0)
		self.stft.window('hann')
		self.freq_bins = self.stft.freq_bins
		self.FFT = np.zeros((len(self.freq_bins), channel_count))

		# Choose channels
		self.channel_mask = np.full(channel_count, False, dtype = bool)
		self.channel_mask[7 -1] = True
		self.channel_mask[8 -1] = True

		# Define bands
		self.G1_mask = np.logical_and(5 < self.freq_bins, self.freq_bins < 7.5)
		self.Al_mask = np.logical_and(8.5 < self.freq_bins, self.freq_bins < 11.5)
		self.G2_mask = np.logical_and(12.5 < self.freq_bins, self.freq_bins < 15)

		# Initialize filters
		self.movavg = MovAvg(4)
		self.ignore = Ignore(0)

		# Setup publishers
		self.pub_guard1 = rospy.Publisher('guard1', Float32, queue_size=1)
		self.pub_alpha = rospy.Publisher('alpha', Float32, queue_size=1)
		self.pub_guard2 = rospy.Publisher('guard2', Float32, queue_size=1)
		self.pub_eyes = rospy.Publisher('eyes_closed', Bool, queue_size=1)

		# Subscribe
		rospy.Subscriber("eeg_channels", BCIuVolts, self.newSample)
Code Example #22
def inference(wav, model, sample_length):

    vocal = []
    bgm = []
    print(len(wav))
    print(sample_length)
    batch_size = 2**13
    for i in tqdm.tqdm(range(len(wav) // (sample_length * batch_size))):
        start = i * sample_length * batch_size
        end = min((i + 1) * sample_length, len(wav))  # note: unused below
        small_wavs = np.stack([
            wav[start + j * sample_length:start + (j + 1) * sample_length]
            for j in range(batch_size)
        ])
        #print(small_wavs.shape)
        in_wav = torch.autograd.Variable(torch.FloatTensor(small_wavs),
                                         requires_grad=False).cuda()
        #print(in_wav.shape)
        stft = STFT(input_data=in_wav).cuda()

        magnitude, phase = stft()
        magnitude = torch.squeeze(magnitude)
        phase = torch.squeeze(phase)
        size = [in_wav.size(1) for _ in range(in_wav.size(0))]

        #print(magnitude.shape)
        vocal_recon, noise_recon = model(magnitude.transpose(1, 2))

        #print(vocal_recon.shape)
        #print(noise_recon.shape)

        vocal.append(
            reConstructWav(size,
                           vocal_recon.transpose(1, 2).cpu().detach(),
                           phase.cpu().detach()).view(-1))
        bgm.append(
            reConstructWav(size,
                           noise_recon.transpose(1, 2).cpu().detach(),
                           phase.cpu().detach()).view(-1))

    print(torch.cat(vocal).shape)
    return torch.cat(vocal).numpy(), torch.cat(bgm).numpy()
Code Example #23
def prepareDataFiles(store_data, song_name, mix_path, vocal_path, bgm_path):
    try:
        os.mkdir(os.path.join(store_data, song_name))
        os.mkdir(os.path.join(os.path.join(store_data, song_name), "mixture"))
        os.mkdir(os.path.join(os.path.join(store_data, song_name), "vocal"))
        os.mkdir(os.path.join(os.path.join(store_data, song_name), "noise"))
    except OSError:
        # the directories may already exist
        pass

    mixture, mix_rate = librosa.core.load(mix_path, sr=16000)
    vocal, vocal_rate = librosa.core.load(vocal_path, sr=16000)
    bgm, bgm_rate = librosa.core.load(bgm_path, sr=16000)

    # Loop through wave form and zero out any values that are close to zero so that
    # there are no points that will explode into large values.
    # Need to check effect on Spectrum, since loss is done with the spectrums rather
    # than the waveforms themselves

    for stype, data, rate in zip(["mixture", "vocal", "noise"],
                                 [mixture, vocal, bgm],
                                 [mix_rate, vocal_rate, bgm_rate]):
        path = os.path.join(os.path.join(store_data, song_name), stype)
        filename = song_name

        in_wav = torch.autograd.Variable(torch.FloatTensor(data),
                                         requires_grad=False).unsqueeze(0)
        stft = STFT(input_data=in_wav)
        magnitude, phase = stft()
        magnitude = torch.squeeze(magnitude)
        phase = torch.squeeze(phase)
        size = in_wav.size(1)
        # f, t, Sxx = signal.stft(data,rate,nperseg=1000)
        # magnitude = np.abs(Sxx)
        # phase = np.unwrap(np.angle(Sxx),axis=-2)

        np.save(os.path.join(path, "rate_" + filename), rate)
        # np.save(os.path.join(path,"freq_"+ filename),f)
        # np.save(os.path.join(path,"time_"+ filename),t)
        np.save(os.path.join(path, "magnitude_" + filename), magnitude)
        np.save(os.path.join(path, "phase_" + filename), phase)
        np.save(os.path.join(path, "size_" + filename), size)
Code Example #24
    def __init__(self,
                 model,
                 sound,
                 target,
                 decoder,
                 sample_rate=16000,
                 device="cpu",
                 save=None):
        """
        model: deepspeech model
        sound: raw sound data [-1 to +1] (read from torchaudio.load)
        target: string
        """
        self.sound = sound
        self.sample_rate = sample_rate
        self.target_string = target
        self.target = target
        self.__init_target()

        self.model = model
        self.model.to(device)
        self.model.train()
        self.decoder = decoder
        self.criterion = nn.CTCLoss()
        self.device = device
        n_fft = int(self.sample_rate * 0.02)
        hop_length = int(self.sample_rate * 0.01)
        win_length = int(self.sample_rate * 0.02)
        self.torch_stft = STFT(n_fft=n_fft,
                               hop_length=hop_length,
                               win_length=win_length,
                               window='hamming',
                               center=True,
                               pad_mode='reflect',
                               freeze_parameters=True,
                               device=self.device)
        self.save = save
Code Example #25
class TCTRN_Stft(torch.nn.Module):
    def __init__(self,
                 filter_length: int = 1024,
                 hop_length: int = 256,
                 win_length: int = 1024,
                 n_mel_channels: int = 80,
                 sampling_rate: int = 22050,
                 mel_fmin: float = 0.0,
                 mel_fmax: float = 8000.0):
        super(TCTRN_Stft, self).__init__()
        self.n_mel_channels = n_mel_channels
        self.sampling_rate = sampling_rate
        self.stft_fn = STFT(filter_length, hop_length, win_length)

        mel_basis = librosa_mel_fn(sampling_rate, filter_length,
                                   n_mel_channels, mel_fmin, mel_fmax)
        mel_basis = torch.from_numpy(mel_basis).float()
        self.register_buffer('mel_basis', mel_basis)

    def spectral_normalize(self, magnitudes):
        output = dynamic_range_compression(magnitudes)
        return output

    def spectral_de_normalize(self, magnitudes):
        output = dynamic_range_decompression(magnitudes)
        return output

    def mel_spectrogram(self, y):
        assert (torch.min(y.data) >= -1)
        assert (torch.max(y.data) < 1)

        magnitudes, phase = self.stft_fn.transform(y)
        magnitudes = magnitudes.data
        mel_output = torch.matmul(self.mel_basis, magnitudes)
        mel_output = self.spectral_normalize(mel_output)
        return mel_output
Code Example #26
import torch
import torch.nn as nn
import torch.nn.functional as F

# from torchaudio_stft import ISTFT, STFT

x = torch.rand((64, 1, 8000), requires_grad=True, dtype=torch.float32).cuda()
n_fft = 320
hop_length = 160

# ##### Test torchaudio.transforms.Spectrogram on multi-gpus
# window_fn = torch.hann_window
# power = None
# spectrogram = torchaudio.transforms.Spectrogram(
#     n_fft=n_fft,
#     hop_length=hop_length,
#     window_fn=window_fn,
#     power=power
# )
# spectrogram = nn.DataParallel(spectrogram)
# spectrogram.cuda()
# out = spectrogram(input_data)

##### Test torch.stft and torch.istft
x = F.pad(x, pad=(0, n_fft // 2), mode='constant', value=0)
stft_extractor = STFT(n_fft=n_fft, hop_length=hop_length, window='hann')
stft_extractor = nn.DataParallel(stft_extractor)
stft_extractor.cuda()
x_stft_real, x_stft_imag = stft_extractor(x)

istft_extractor = ISTFT(n_fft=n_fft, hop_length=hop_length, window='hann')
istft_extractor = nn.DataParallel(istft_extractor)
istft_extractor.cuda()
x_reconst = istft_extractor(x_stft_real, x_stft_imag, length=8000)
print(torch.max(torch.abs(x[..., :8000] - x_reconst)))
Code Example #27
File: layers.py Project: yhgon/NanoFlow
class TacotronSTFT(torch.nn.Module):
    def __init__(self,
                 filter_length=1024,
                 hop_length=256,
                 win_length=1024,
                 n_mel_channels=80,
                 sampling_rate=22050,
                 mel_fmin=0.0,
                 mel_fmax=None,
                 ref_level_db=10.,
                 min_level_db=-100.):
        super(TacotronSTFT, self).__init__()
        self.n_mel_channels = n_mel_channels
        self.sampling_rate = sampling_rate
        self.stft_fn = STFT(filter_length, hop_length, win_length)
        mel_basis = librosa_mel_fn(sampling_rate, filter_length,
                                   n_mel_channels, mel_fmin, mel_fmax)
        mel_basis = torch.from_numpy(mel_basis).float()
        self.register_buffer('mel_basis', mel_basis)

        # to be used for mel_spectrogram_dbver
        self.ref_level_db = ref_level_db
        self.min_level_db = min_level_db

    def spectral_normalize(self, magnitudes):
        output = dynamic_range_compression(magnitudes)
        return output

    def spectral_de_normalize(self, magnitudes):
        output = dynamic_range_decompression(magnitudes)
        return output

    def mel_spectrogram(self, y):
        """Computes mel-spectrograms from a batch of waves
        PARAMS
        ------
        y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]

        RETURNS
        -------
        mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)
        """
        assert (torch.min(y.data) >= -1)
        assert (torch.max(y.data) <= 1)

        magnitudes, phases = self.stft_fn.transform(y)
        magnitudes = magnitudes.data
        mel_output = torch.matmul(self.mel_basis, magnitudes)
        mel_output = self.spectral_normalize(mel_output)
        return mel_output

    def mel_spectrogram_dbver(self, y):
        """Alternative mel-spectrograms from a batch of waves with normalized db scale (from r9y9 wavenet_vocoder)
        PARAMS
        ------
        y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]

        RETURNS
        -------
        mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)
        """
        assert (torch.min(y.data) >= -1)
        assert (torch.max(y.data) <= 1)

        magnitudes, phases = self.stft_fn.transform(y)
        magnitudes = magnitudes.data
        mel_output = torch.matmul(self.mel_basis, magnitudes)
        S = audio_processing.amp_to_db(mel_output,
                                       self.min_level_db) - self.ref_level_db
        S = audio_processing.normalize(S, self.min_level_db)
        return S

    def dbmel_to_tacomel(self, mel):
        """
        method that converts the db-normalized melspec back to taco2 version of logmel to be used with WG
        """
        denorm = audio_processing.denormalize(
            mel, self.min_level_db) + self.ref_level_db
        amp = audio_processing.db_to_amp(denorm)
        mel_output = self.spectral_normalize(amp)
        return mel_output
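
A hedged round-trip sketch for the db-normalized variant above, assuming the STFT, librosa_mel_fn, and audio_processing helpers from the same repository:

stft = TacotronSTFT(mel_fmax=8000.0)
y = torch.rand(2, 22050) * 2 - 1          # placeholder batch in [-1, 1]
mel_db = stft.mel_spectrogram_dbver(y)    # normalized-db mel (r9y9 style)
mel_taco = stft.dbmel_to_tacomel(mel_db)  # back to Tacotron-style log mel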
Code Example #28
File: detect_alpha.py Project: sahabi/shared-Control
class DetectAlpha():
	def __init__(self):

		# Initialize node
		rospy.init_node('detect_alpha', anonymous=True)

		# Get ros parameters
		sleep(10)
		fs = rospy.get_param("sampling_rate")
		#fs = 125
		print(fs)
		channel_count = rospy.get_param("eeg_channel_count")
		print(channel_count)

		# Initialize STFT
		self.stft = STFT(fs, 1.0, 0.25, channel_count)
		self.stft.remove_dc()
		self.stft.bandpass(5.0, 15.0)
		self.stft.window('hann')
		self.freq_bins = self.stft.freq_bins
		self.FFT = np.zeros((len(self.freq_bins), channel_count))

		# Choose channels
		self.channel_mask = np.full(channel_count, False, dtype = bool)
		self.channel_mask[7 -1] = True
		self.channel_mask[8 -1] = True

		# Define bands
		self.G1_mask = np.logical_and(5 < self.freq_bins, self.freq_bins < 7.5)
		self.Al_mask = np.logical_and(8.5 < self.freq_bins, self.freq_bins < 11.5)
		self.G2_mask = np.logical_and(12.5 < self.freq_bins, self.freq_bins < 15)

		# Initialize filters
		self.movavg = MovAvg(4)
		self.ignore = Ignore(0)

		# Setup publishers
		self.pub_guard1 = rospy.Publisher('guard1', Float32, queue_size=1)
		self.pub_alpha = rospy.Publisher('alpha', Float32, queue_size=1)
		self.pub_guard2 = rospy.Publisher('guard2', Float32, queue_size=1)
		self.pub_eyes = rospy.Publisher('eyes_closed', Bool, queue_size=1)

		# Subscribe
		rospy.Subscriber("eeg_channels", BCIuVolts, self.newSample)

	def newSample(self, msg):
		newFFT = self.stft.ingestSample(msg.data)
		if newFFT is not None:
			self.FFT = newFFT

			# Mask and average data
			guard1 = np.mean(newFFT[self.G1_mask, :][:, self.channel_mask])
			alpha = np.mean(newFFT[self.Al_mask, :][:, self.channel_mask])
			guard2 = np.mean(newFFT[self.G2_mask, :][:, self.channel_mask])

			detected = self.movavg.step(alpha > (guard1 + guard2)*1.1) > 0.5
			if detected and not self.ignore.test():
				self.movavg.reset()
				self.ignore.reset(4)
			else:
				detected = False
				
			# Publish messages
			msg = Float32()
			msg.data = guard1
			self.pub_guard1.publish(msg)

			msg = Float32()
			msg.data = alpha
			self.pub_alpha.publish(msg)

			msg = Float32()
			msg.data = guard2
			self.pub_guard2.publish(msg)

			msg = Bool()
			msg.data = detected
			self.pub_eyes.publish(msg)

	def updatePlot(self, line):
		line.set_ydata(np.sum(self.FFT[:,self.channel_mask], axis = 1))
		line.figure.canvas.draw()
Code Example #29
File: generate.py Project: ktho22/vctts
if args.gpu is None:
    args.use_gpu = False
    args.gpu = []
else:
    args.use_gpu = True
    torch.cuda.manual_seed(args.seed)
    torch.cuda.set_device(args.gpu[0])

model = Tacotron(args)
if args.init_from:
    checkpoint = torch.load(args.init_from)  # assumed: loaded earlier in the original file
    model.load_state_dict(checkpoint['state_dict'])
    model.reset_decoder_states()
    print('loaded checkpoint %s' % (args.init_from))

stft = STFT(filter_length=args.n_fft)
model = model.eval()
if args.use_gpu:
    model = model.cuda()
    stft = stft.cuda()


def main():
    db = TTSDataset()
    collate = collate_class(use_txt=args.use_txt)
    loader = torch.utils.data.DataLoader(db,
                                         batch_size=1,
                                         shuffle=False,
                                         collate_fn=collate.fn,
                                         drop_last=True)
    model_name = args.init_from.split('/')[-1][:-3]
Code Example #30
class DetectAlpha():
    def __init__(self):

        # Initialize node
        rospy.init_node('detect_alpha', anonymous=True)

        # Get ros parameters
        sleep(10)
        fs = rospy.get_param("sampling_rate")
        #fs = 125
        print(fs)
        channel_count = rospy.get_param("eeg_channel_count")
        print(channel_count)

        # Initialize STFT
        self.stft = STFT(fs, 1.0, 0.25, channel_count)
        self.stft.remove_dc()
        self.stft.bandpass(5.0, 15.0)
        self.stft.window('hann')
        self.freq_bins = self.stft.freq_bins
        self.FFT = np.zeros((len(self.freq_bins), channel_count))

        # Choose channels
        self.channel_mask = np.full(channel_count, False, dtype=bool)
        self.channel_mask[7 - 1] = True
        self.channel_mask[8 - 1] = True

        # Define bands
        self.G1_mask = np.logical_and(5 < self.freq_bins, self.freq_bins < 7.5)
        self.Al_mask = np.logical_and(8.5 < self.freq_bins,
                                      self.freq_bins < 11.5)
        self.G2_mask = np.logical_and(12.5 < self.freq_bins,
                                      self.freq_bins < 15)

        # Initialize filters
        self.movavg = MovAvg(4)
        self.ignore = Ignore(0)

        # Setup publishers
        self.pub_guard1 = rospy.Publisher('guard1', Float32, queue_size=1)
        self.pub_alpha = rospy.Publisher('alpha', Float32, queue_size=1)
        self.pub_guard2 = rospy.Publisher('guard2', Float32, queue_size=1)
        self.pub_eyes = rospy.Publisher('eyes_closed', Bool, queue_size=1)

        # Subscribe
        rospy.Subscriber("eeg_channels", BCIuVolts, self.newSample)

    def newSample(self, msg):
        newFFT = self.stft.ingestSample(msg.data)
        if newFFT is not None:
            self.FFT = newFFT

            # Mask and average data
            guard1 = np.mean(newFFT[self.G1_mask, :][:, self.channel_mask])
            alpha = np.mean(newFFT[self.Al_mask, :][:, self.channel_mask])
            guard2 = np.mean(newFFT[self.G2_mask, :][:, self.channel_mask])

            detected = self.movavg.step(alpha > (guard1 + guard2) * 1.1) > 0.5
            if detected and not self.ignore.test():
                self.movavg.reset()
                self.ignore.reset(4)
            else:
                detected = False

            # Publish messages
            msg = Float32()
            msg.data = guard1
            self.pub_guard1.publish(msg)

            msg = Float32()
            msg.data = alpha
            self.pub_alpha.publish(msg)

            msg = Float32()
            msg.data = guard2
            self.pub_guard2.publish(msg)

            msg = Bool()
            msg.data = detected
            self.pub_eyes.publish(msg)

    def updatePlot(self, line):
        line.set_ydata(np.sum(self.FFT[:, self.channel_mask], axis=1))
        line.figure.canvas.draw()
Code Example #31
def reConstructWav(size, magnitude, phase):
    """the differentiable reconstruction for mixture spectrogram with vocal and noise"""
    stft = STFT(size=size, magnitude=magnitude, phase=phase)
    stft = stft.cuda()
    xrec = stft(inv=True)
    return xrec
Code Example #32
import torch
import torch.nn as nn
import torchaudio
from torch.optim import Adam
# STFT is assumed imported from the same repository as the other examples


class Collate:
    # assumed reconstruction: the original snippet begins mid-method, so the
    # class wrapper and the n_samples / wav_truncated setup are hypothetical
    def __init__(self, n_samples=16384):
        self.n_samples = n_samples

    def __call__(self, batch):
        n_samples = self.n_samples
        wav_truncated = torch.zeros(len(batch), n_samples)
        for i in range(len(batch)):
            wav = batch[i][0]
            assert wav.shape[-1] >= n_samples
            wav_truncated[i, :] = wav[0, :n_samples]
        return wav_truncated


dataset = torchaudio.datasets.LJSPEECH('./data')
dataset = torch.utils.data.Subset(dataset, range(100))

dataloader = torch.utils.data.DataLoader(dataset,
                                         batch_size=8,
                                         shuffle=True,
                                         collate_fn=Collate())

stft_deterministic = STFT(filter_length=256, hop_length=128, win_length=256)
stft_model = STFT(filter_length=256,
                  hop_length=128,
                  win_length=256,
                  trainable=True)

criterion = nn.MSELoss()
optimizer = Adam(stft_model.parameters(), lr=1e-1)

n_epoch = 100
torch.save(stft_model.state_dict(), f'./experiments/trainable_fft_{-1}')
for epoch in range(n_epoch):

    for i, batch in enumerate(dataloader):
        stft_model.zero_grad()
        targ = stft_deterministic(batch)