def pysptk_mfcc(self):
        self.frame_length = 1024
        self.hop_length = 80
        self.pitch = pysptk.swipe(self.audio.astype(np.float64),
                                  fs=self.sr,
                                  hopsize=self.hop_length,
                                  min=60,
                                  max=240,
                                  otype="pitch")
        self.source_excitation = pysptk.excite(self.pitch, self.hop_length)

        # Note that almost all pysptk functions assume the input array is C-contiguous with np.float64 elements
        frames = librosa.util.frame(self.audio,
                                    frame_length=self.frame_length,
                                    hop_length=self.hop_length).astype(
                                        np.float64).T

        # Windowing
        frames *= pysptk.blackman(self.frame_length)

        assert frames.shape[1] == self.frame_length

        # Order of mel-cepstrum
        self.order = 25
        self.alpha = 0.41

        self.mc = pysptk.mcep(frames, self.order, self.alpha)
        logH = pysptk.mgc2sp(self.mc, self.alpha, 0.0, self.frame_length).real
        librosa.display.specshow(logH.T,
                                 sr=self.sr,
                                 hop_length=self.hop_length,
                                 x_axis="time",
                                 y_axis="linear")
Example #2
    def __call__(self, pkg, cached_file=None):
        pkg = format_package(pkg)
        wav = pkg['chunk']
        wav = wav.data.numpy()
        max_frames = wav.shape[0] // self.hop
        if cached_file is not None:
            # load pre-computed data
            proso = torch.load(cached_file)
            beg_i = pkg['chunk_beg_i'] // self.hop
            end_i = pkg['chunk_end_i'] // self.hop
            proso = proso[:, beg_i:end_i]
            pkg[self.name] = proso
        else:
            # first compute logF0 and voiced/unvoiced flag
            # f0 = pysptk.rapt(wav.astype(np.float32),
            #                 fs=self.sr, hopsize=self.hop,
            #                 min=self.f0_min, max=self.f0_max,
            #                 otype='f0')
            f0 = pysptk.swipe(wav.astype(np.float64),
                              fs=self.sr, hopsize=self.hop,
                              min=self.f0_min,
                              max=self.f0_max,
                              otype='f0')
            # sound = pm.Sound(wav.astype(np.float32), self.sr)
            # f0 = sound.to_pitch(self.hop / 16000).selected_array['frequency']
            if len(f0) < max_frames:
                pad = max_frames - len(f0)
                f0 = np.concatenate((f0, f0[-pad:]), axis=0)
            lf0 = np.log(f0 + 1e-10)
            lf0, uv = interpolation(lf0, -1)
            lf0 = torch.tensor(lf0.astype(np.float32)).unsqueeze(0)[:, :max_frames]
            uv = torch.tensor(uv.astype(np.float32)).unsqueeze(0)[:, :max_frames]
            if torch.sum(uv) == 0:
                # if frame is completely unvoiced, make lf0 min val
                lf0 = torch.ones(uv.size()) * np.log(self.f0_min)
            # assert lf0.min() > 0, lf0.data.numpy()
            # secondly obtain zcr
            zcr = librosa.feature.zero_crossing_rate(y=wav,
                                                     frame_length=self.win,
                                                     hop_length=self.hop)
            zcr = torch.tensor(zcr.astype(np.float32))
            zcr = zcr[:, :max_frames]
            # finally obtain energy
            # librosa renamed rmse -> rms in 0.7
            egy = librosa.feature.rms(y=wav, frame_length=self.win,
                                      hop_length=self.hop,
                                      pad_mode='constant')
            egy = torch.tensor(egy.astype(np.float32))
            egy = egy[:, :max_frames]
            proso = torch.cat((lf0, uv, egy, zcr), dim=0)
  
            if self.der_order > 0:
                deltas = [proso]
                for n in range(1, self.der_order + 1):
                    deltas.append(librosa.feature.delta(proso.numpy(), order=n))
                proso = torch.from_numpy(np.concatenate(deltas))

            pkg[self.name] = proso
        # Overwrite resolution to hop length
        pkg['dec_resolution'] = self.hop
        return pkg
Example #3
def get_coefs(wav_file_path):
    sample_rate, x = wavfile.read(wav_file_path)
    # al.play(x.astype(float) / x.max(), fs=sample_rate)

    frames = librosa.util.frame(
        x,
        frame_length=frame_length,
        hop_length=hop_length).astype(np.float64).T
    frames *= pysptk.blackman(frame_length)
    f0 = pysptk.swipe(
        x.astype(np.float64),
        fs=sample_rate,
        hopsize=hop_length,
        min=50,
        max=500)

    # order = 40
    # alpha = 0.41
    mc = np.apply_along_axis(
        pysptk.mcep,
        1,
        frames,
        order,
        alpha)

    return sample_rate, f0, mc  # sample_rate- ?, f0, mel-cepstrum coefs
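get_coefs (and get_synt_wav in Example #9) references module-level frame_length, hop_length, order, and alpha that are not shown; values consistent with Example #1 (an assumption, not the original author's settings) would be:

frame_length = 1024  # samples per analysis frame
hop_length = 80      # 5 ms hop at 16 kHz
order = 25           # mel-cepstrum order
alpha = 0.41         # all-pass constant suited to 16 kHz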
Example #4
    def cal_prosody(self, wav):
        # Input: wav: audio signal as a numpy.array
        # Output: proso: Tensor of shape (1, 4, max_frames)
        max_frames = wav.shape[0] // self.hop

        f0 = pysptk.swipe(wav.astype(np.float64),
                          fs=self.sr, hopsize=self.hop,
                          min=self.f0_min,
                          max=self.f0_max,
                          otype='f0')
        lf0 = np.log(f0 + 1e-10)
        lf0, uv = self.interpolation(lf0, -1)

        lf0 = torch.tensor(lf0.astype(np.float32)).unsqueeze(0)[:, :max_frames]  # (1, num_frames)
        uv = torch.tensor(uv.astype(np.float32)).unsqueeze(0)[:, :max_frames]
        if torch.sum(uv) == 0:
            # if frame is completely unvoiced, make lf0 min val
            lf0 = torch.ones(uv.size()) * np.log(self.f0_min)
        assert lf0.min() > 0, lf0.data.numpy()
        # secondly obtain zcr
        zcr = librosa.feature.zero_crossing_rate(y=wav,
                                                 frame_length=self.win,
                                                 hop_length=self.hop)
        zcr = torch.tensor(zcr.astype(np.float32))
        zcr = zcr[:, :max_frames]
        # finally obtain energy
        egy = librosa.feature.rms(y=wav, frame_length=self.win,
                                  hop_length=self.hop,
                                  pad_mode='constant')
        egy = torch.tensor(egy.astype(np.float32))
        egy = egy[:, :max_frames]
        proso = torch.cat((lf0, uv, egy, zcr), dim=0).unsqueeze(0)  # (1, 4, num_frames)
        return proso
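cal_prosody (like the __call__ methods in Examples #2 and #11) leans on an interpolation helper from ahoproc_tools.interpolate; a hypothetical stand-in with the behavior these examples appear to assume (fill unvoiced log-F0 frames from voiced neighbors and return a voiced/unvoiced mask) is:

import numpy as np

def interpolation(lf0, unvoiced_marker=-1):
    # Hypothetical sketch, not the library's code: frames whose log-F0 falls
    # at or below the marker are treated as unvoiced and linearly
    # interpolated from the surrounding voiced frames.
    lf0 = np.asarray(lf0, dtype=np.float64)
    uv = (lf0 > unvoiced_marker).astype(np.float32)
    voiced_idx = np.where(uv > 0)[0]
    if len(voiced_idx) == 0:
        return lf0, uv  # nothing voiced to interpolate from
    filled = np.interp(np.arange(len(lf0)), voiced_idx, lf0[voiced_idx])
    return filled, uv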
Example #5
def pysptk_featurize(audiofile):
    labels = list()
    features = list()
    fs, x = wavfile.read(audiofile)

    f0_swipe = pysptk.swipe(x.astype(np.float64),
                            fs=fs,
                            hopsize=80,
                            min=60,
                            max=200,
                            otype="f0")
    features = features + stats(f0_swipe)
    labels = stats_labels('f0_swipe', labels)

    f0_rapt = pysptk.rapt(x.astype(np.float32),
                          fs=fs,
                          hopsize=80,
                          min=60,
                          max=200,
                          otype="f0")
    features = features + stats(f0_rapt)
    labels = stats_labels('f0_rapt', labels)

    # `xw` was undefined in the original snippet; presumably Blackman-windowed
    # frames, reconstructed here (frame/hop sizes are assumptions):
    xw = librosa.util.frame(x.astype(np.float64), frame_length=1024,
                            hop_length=80).T * pysptk.blackman(1024)
    mgc = np.apply_along_axis(pysptk.mgcep, 1, xw, 20, 0.0, 0.0)
    features = features + stats(mgc)
    labels = stats_labels('mel-spectrum envelope', labels)

    return features, labels
Example #6
def test_swipe_regression():
    # Ground truth data is generated by:
    #
    # $ wav2raw pysptk/example_audio_data/arctic_a0007.wav
    #
    # $ x2x +sf ./pysptk/example_audio_data/arctic_a0007.raw | \
    #    pitch -a 1 -s 16 -p 80 -L 60 -H 240 -o 0 > \
    #    arctic_a007_p16_L60_H240_o0_swipe.pitch
    #
    # $ dmp +f arctic_a007_p16_L60_H240_o0_swipe.pitch | awk '{print $2}' >\
    #    arctic_a007_p16_L60_H240_o0_swipe.txt
    #
    # $ pitch -h
    #  ...
    #
    # SPTK: version 3.10
    # CVS Info: $Id: pitch.c,v 1.53 2016/12/25 05:00:19 uratec Exp $

    ground_truth_path = join(
        dirname(__file__), "data", "arctic_a007_p16_L60_H240_o0_swipe.txt"
    )
    with open(ground_truth_path) as f:
        ground_truth = np.asarray([float(line) for line in f])
    ground_truth = ground_truth.astype(np.float32)

    fs, x = wavfile.read(pysptk.util.example_audio_file())
    assert fs == 16000

    # Since SPTK might have a memory corruption bug and the result might be
    # non-deterministic, test it multiple times...
    for _ in range(5):
        f0 = pysptk.swipe(
            x.astype(np.float64), fs=fs, hopsize=80, min=60, max=240, otype=0
        )
        assert np.allclose(ground_truth, f0)
Example #7
def source_excitation_generation(np_data, rate):
    pitch = ps.swipe(np_data.astype(np.float64),
                     fs=rate,
                     hopsize=HOP_LENGTH,
                     min=60,
                     max=240,
                     otype="pitch")
    source_excitation = ps.excite(pitch, HOP_LENGTH)
    return source_excitation
Example #8
def F0_swipe(
        waveform,
        hop_length=None,
        sr=None,
        hop_time=None,
        f_min=60,  # default in swipe
        f_max=240,  # default in swipe
        threshold=0.5,  # custom default (0.3 in swipe)
):
    if hop_length is not None:
        hopsize = hop_length
    else:
        hopsize = int(sr * hop_time)

    if waveform.ndim == 1:
        return torch.from_numpy(
            swipe(
                waveform.contiguous().double().numpy(),
                fs=sr,
                hopsize=hopsize,
                min=f_min,
                max=f_max,
                threshold=threshold,
                otype="f0",
            )).float()
    elif waveform.ndim == 2:  # (B, N)
        f0 = []
        for audio in waveform:
            f0.append(
                torch.from_numpy(
                    swipe(
                        audio.contiguous().double().numpy(),
                        fs=sr,
                        hopsize=hopsize,
                        min=f_min,
                        max=f_max,
                        threshold=threshold,
                        otype="f0",
                    )).float())
        return torch.stack(f0)
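A usage sketch (sample rate and hop are assumed, and the input is a placeholder): F0_swipe accepts either a single waveform or a batch:

import torch

wav = torch.randn(16000)                     # 1 s of audio at 16 kHz (placeholder)
f0 = F0_swipe(wav, sr=16000, hop_time=0.01)              # -> (n_frames,)
f0_batch = F0_swipe(wav.unsqueeze(0).repeat(2, 1),
                    sr=16000, hop_time=0.01)             # -> (2, n_frames)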
Example #9
def get_synt_wav(wav_file_path):
    # # Synthesis from mel-cepstrum
    sample_rate, x = wavfile.read(wav_file_path)
    # assert sample_rate == 16000
    # al.play(x.astype(float) / x.max(), fs=sample_rate)  # Audio(x, rate=sample_rate)

    # almost all pysptk functions assume the input array is C-contiguous
    # with np.float64 elements
    frames = librosa.util.frame(
        x,
        frame_length=frame_length,
        hop_length=hop_length).astype(np.float64).T

    # Windowing
    frames *= pysptk.blackman(frame_length)

    # assert frames.shape[1] == frame_length

    # F0 estimation
    f0 = pysptk.swipe(
        x.astype(np.float64),
        fs=sample_rate,
        hopsize=hop_length,
        min=50,
        max=500)

    generator = excite.ExcitePulse(sample_rate, hop_length, False)
    source_excitation = generator.gen(f0)

    # apply mcep to each windowed frame (1-D slices along axis 1)
    mc = np.apply_along_axis(
        pysptk.mcep,
        1,
        frames,
        order,
        alpha)

    # Convert mel-cepstrum to MLSADF coefficients
    b = np.apply_along_axis(pysptk.mc2b, 1, mc, alpha)

    synthesizer = pysptk.synthesis.Synthesizer(
        pysptk.synthesis.MLSADF(
            order=order, alpha=alpha),
        hop_length)

    x_synthesized = synthesizer.synthesis(source_excitation, b)
    # Audio(x_synthesized, rate=sample_rate)
    # al.play(x_synthesized.astype(float) / x_synthesized.max(), fs=sample_rate)
    return x_synthesized
Example #10
def pitch_detect(sndarray, fs, chunk_size):
    """
    pitch_detect(sndarray,fs, chunk_size)
    
			pitch_detect computes the fundamental frequency/pitches of blocks/ of Chunks
		Parameters:sndarray - Discrete Data
				   fs -Sampling frequency
				   chunk_size
		Returns f0 
    """
    new_sndarray = numpy.asarray(numpy.float64(sndarray))
    f0 = pysptk.swipe(numpy.asarray(new_sndarray), fs, chunk_size, 65, 500,
                      0.001, 1)

    return f0
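For readability, the positional swipe call above maps onto pysptk.swipe's keyword signature like this (otype=1 selects F0 in Hz):

f0 = pysptk.swipe(new_sndarray, fs=fs, hopsize=chunk_size,
                  min=65, max=500, threshold=0.001, otype=1)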
Example #11
 def __call__(self, pkg, cached_file=None):
     pkg = format_package(pkg)
     wav = pkg['chunk']
     wav = wav.data.numpy()
     max_frames = wav.shape[0] // self.hop
     if cached_file is not None:
         # load pre-computed data
         proso = torch.load(cached_file)
         beg_i = pkg['chunk_beg_i'] // self.hop
         end_i = pkg['chunk_end_i'] // self.hop
         proso = proso[:, beg_i:end_i]
         pkg['prosody'] = proso
     else:
         # first compute logF0 and voiced/unvoiced flag
         f0 = pysptk.swipe(wav.astype(np.float64),
                           fs=self.sr,
                           hopsize=self.hop,
                           min=self.f0_min,
                           max=self.f0_max,
                           otype='f0')
         lf0 = np.log(f0 + 1e-10)
         lf0, uv = interpolation(lf0, -1)
         lf0 = torch.tensor(lf0.astype(
             np.float32)).unsqueeze(0)[:, :max_frames]
         uv = torch.tensor(uv.astype(
             np.float32)).unsqueeze(0)[:, :max_frames]
         if torch.sum(uv) == 0:
             # if frame is completely unvoiced, make lf0 min val
             lf0 = torch.ones(uv.size()) * np.log(self.f0_min)
         assert lf0.min() > 0, lf0.data.numpy()
         # secondly obtain zcr
         zcr = librosa.feature.zero_crossing_rate(y=wav,
                                                  frame_length=self.win,
                                                  hop_length=self.hop)
         zcr = torch.tensor(zcr.astype(np.float32))
         zcr = zcr[:, :max_frames]
         # finally obtain energy
         # librosa renamed rmse -> rms in 0.7
         egy = librosa.feature.rms(y=wav,
                                   frame_length=self.win,
                                   hop_length=self.hop,
                                   pad_mode='constant')
         egy = torch.tensor(egy.astype(np.float32))
         egy = egy[:, :max_frames]
         proso = torch.cat((lf0, uv, egy, zcr), dim=0)
         pkg['prosody'] = proso
     return pkg
Example #12
def pysptk_features(x):
    import pysptk

    wav_max = 2**15-1
    x = (x * wav_max).astype(np.float64)

    frame_length = 512
    hop_length = 160
    frames = librosa.util.frame(x, frame_length=frame_length, hop_length=hop_length).T
    frames *= pysptk.blackman(frame_length)
    order = 25 # seems to be pretty standard, results in 26 values
    alpha = 0.42 # this value is best for 16kHz sampling according to docs http://ftp.jaist.ac.jp/pub/pkgsrc/distfiles/SPTKref-3.9.pdf
    mcep = pysptk.mcep(frames, order, alpha)

    f0 = pysptk.swipe(x, fs=16000, hopsize=hop_length, min=60, max=240, otype="f0")
    f0 = f0[1:1+mcep.shape[0]] # cut off ends to match mcep lengths

    return np.concatenate([f0[:,np.newaxis], mcep], 1).astype(np.float32)
Example #13
    def process_audio(self, x):
        pitch = pysptk.swipe(x, fs=self.sr, hopsize=self.hop_length, min=self.f0_floor, max=self.f0_ceil, otype="pitch")

        f0, timeaxis = pyworld.dio(x, fs=self.sr, f0_floor=self.f0_floor, f0_ceil=self.f0_ceil, frame_period=self.frame_period)
        f0 = pyworld.stonemask(x, f0, timeaxis, self.sr)

        # x_frame = self.samples_to_frames(x)

        if self.use_mel:
            mel = librosa.feature.melspectrogram(x, sr=self.sr, n_fft=self.fft_length, hop_length=self.hop_length)
            mfcc = librosa.feature.mfcc(S=mel, sr=self.sr, n_mfcc=self.n_mfcc)
            if self.norm_mfcc:
                mfcc = self.standardize_mfcc(mfcc)
            return {'mel': mel.T, 'mfcc': mfcc.T, 'f0': f0, 'pitch': pitch}
        else:
            ap = pyworld.d4c(x, f0, timeaxis, self.sr, fft_size=self.fft_length)  # Aperiodicity
            sp = pyworld.cheaptrick(x, f0, timeaxis, self.sr,
                                    fft_size=self.fft_length)
            return {'sp': sp, 'ap': ap, 'f0': f0, 'pitch': pitch}
Example #14
 def get_MCEP(self, utterance):
     utterance = librosa.util.normalize(utterance)
     utterance = utterance + np.random.normal(
         loc=0, scale=0.0000001, size=utterance.shape[0])
     utterance = librosa.util.normalize(utterance)
     utterance = utterance.astype(np.float64)  # necessary for synthesizer
     frames = librosa.util.frame(utterance,
                                 frame_length=self.frame_length,
                                 hop_length=self.hop_length).astype(
                                     np.float64).T
     # Windowing
     frames *= pysptk.blackman(self.frame_length)
     assert frames.shape[1] == self.frame_length
     # Pitch
     pitch = pysptk.swipe(utterance.astype(np.float64),
                          fs=self.sr,
                          hopsize=self.hop_length,
                          min=60,
                          max=240,
                          otype="pitch")
     mcep = pysptk.mcep(frames, self.order, self.alpha)
     return mcep, pitch
Example #15
f.write("order,time\n")
for order in (0, 4, 9, 14, 24, 49):
    start = time.time()
    # Note that almost all of pysptk functions assume input array is C-contiguous and np.float64 element type
    frames = librosa.util.frame(x,
                                frame_length=frame_length,
                                hop_length=hop_length).astype(np.float64).T

    # Windowing
    frames *= pysptk.blackman(frame_length)

    assert frames.shape[1] == frame_length

    pitch = pysptk.swipe(x.astype(np.float64),
                         fs=sr,
                         hopsize=hop_length,
                         min=60,
                         max=240,
                         otype="pitch")
    source_excitation = pysptk.excite(pitch, hop_length)

    # order of mel-cepstrum is set by the loop variable above
    mc = pysptk.mcep(frames, order, alpha)
    logH = pysptk.mgc2sp(mc, alpha, 0.0, frame_length).real
    print(mc.shape)
    #plt.plot(mc)
    #plotname="x_syn_coefs_" + str(order) + ".png"
    #plt.savefig(plotname)

    # Convert mel-cepstrum to MLSADF coefficients
    b = pysptk.mc2b(mc, alpha)
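    # --- Hedged completion (not in the original snippet): the loop writes an
    # "order,time" header but stops before recording anything; presumably it
    # synthesizes with MLSADF and logs the elapsed time per order, e.g.:
    synthesizer = pysptk.synthesis.Synthesizer(
        pysptk.synthesis.MLSADF(order=order, alpha=alpha), hop_length)
    x_syn = synthesizer.synthesis(source_excitation, b)
    f.write("%d,%f\n" % (order, time.time() - start))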
Example #16
 def __test(x, fs, hopsize, otype):
     pysptk.swipe(x, fs, hopsize, otype=otype)
Example #17
 def __test(x, fs, hopsize, otype):
     f0 = pysptk.swipe(x, fs, hopsize, otype=otype)
     assert np.all(np.isfinite(f0))
     if otype == 1:
         assert np.all(f0 >= 0)
Example #18
OUT_WAVE_FILE = "out.wav"  # analysis/resynthesis output

# Load the audio
fs, x = wavfile.read(IN_WAVE_FILE)
x = x.astype(np.float64)

# Framing and windowing
frames = librosa.util.frame(x,
                            frame_length=FRAME_LENGTH,
                            hop_length=HOP_LENGTH).astype(np.float64).T
frames *= pysptk.blackman(FRAME_LENGTH)  # windowing (Blackman window)

# Pitch extraction
pitch = pysptk.swipe(x,
                     fs=fs,
                     hopsize=HOP_LENGTH,
                     min=MIN_F0,
                     max=MAX_F0,
                     otype="pitch")

# Generate the excitation signal (glottal source)
source_excitation = pysptk.excite(pitch, HOP_LENGTH)

# Extract linear predictive coding (LPC) coefficients
lpc = pysptk.lpc(frames, ORDER)
lpc[:, 0] = np.log(lpc[:, 0])

# Convert the LPC coefficients to PARCOR coefficients
parcor = pysptk.lpc2par(lpc)

# Build the all-pole lattice synthesis filter
synthesizer = Synthesizer(AllPoleLatticeDF(order=ORDER), HOP_LENGTH)
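# Hedged completion (not part of the original snippet): run the excitation
# through the all-pole lattice filter and save the resynthesized audio.
y = synthesizer.synthesis(source_excitation, parcor)
wavfile.write(OUT_WAVE_FILE, fs, np.clip(y, -32768, 32767).astype(np.int16))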
Example #19
def get_target_feats(utterance, utterance_wav, alignments):
    '''Generate target features for an utterance (in this case: duration,
    initial-phone fundamental frequency, final-phone fundamental frequency,
    and energy).'''
    #phone_start = int(alignments[0] * fs)
    first_phone_start = alignments[0][0]
    first_phone_end = alignments[0][1]

    #print("START: " + str(phone_start))
    #phone_end = int(alignments[1] * fs)
    #print("END: " + str(phone_end))
    last_phone_start = alignments[-1][0]
    last_phone_end = alignments[-1][1]
    #print(phone_start)
    #print(phone_end)
    #print(utterance_wav)
    #print(len(utterance_wav))
    duration = last_phone_end - first_phone_start
    first_phone_samples = utterance_wav[first_phone_start:first_phone_end]
    last_phone_samples = utterance_wav[last_phone_start:last_phone_end]
    all_phone_samples = utterance_wav[first_phone_start:last_phone_end]
    #phone_test = utterance_wav[phone_start]

    #try:
    '''
    print(first_phone_start)
    print(first_phone_end)
    print(last_phone_start)
    print(last_phone_end)
    '''
    try:
        f_0_init = np.mean(pysptk.swipe(first_phone_samples.astype(np.float64), 
                                    fs=fs, 
                                    hopsize=100, 
                                    otype='f0'))
        #print(f_0_init)
        f_0_end = np.mean(pysptk.swipe(last_phone_samples.astype(np.float64), 
                                   fs=fs, 
                                   hopsize=100, 
                                   otype='f0'))
    except Exception:
        print("Out of bounds!!")
        print(utterance)
        print(first_phone_start)
        print(first_phone_end)
        print(last_phone_start)
        print(last_phone_end)
        print(alignments)
        raise Exception("Out of bounds")

    #print(f_0_end)
    #except IndexError:
        # For "Index Error: Out of bounds on buffer access (axis 0)


    #mfcc = pysptk.mfcc(samples)

    #pitch = pysptk.swipe(phone_samples.astype(np.float64), fs=fs, hopsize=100, otype='pitch')

    #excitation = pysptk.excite(pitch)
    #excitation_mu = np.mean(excitation)
    #excitation_std = np.std(excitation)
    #print()
    energy = np.sum(np.square(all_phone_samples)) / duration
    return duration, f_0_init, f_0_end, energy
Example #20
def eval_pitch(ted_audio_path, user_audio_path, png_save_path):
    sr, x = wavfile.read(ted_audio_path)  # the TED speaker's audio
    assert sr == 16000
    x = x.astype(np.float64)
    frame_length = 1024
    hop_length = 80
    f_you = pysptk.swipe(x.astype(np.float64),
                         fs=sr,
                         hopsize=hop_length,
                         min=60,
                         max=240,
                         otype="f0")
    sr1, x1 = wavfile.read(user_audio_path)
    assert sr1 == 16000
    x1 = x1.astype(np.float64)
    frame_length = 1024
    hop_length = 80

    # F0 estimation: extract the fundamental frequency at each frame (same process as above)

    f_ted = pysptk.swipe(x1.astype(np.float64),
                         fs=sr1,
                         hopsize=hop_length,
                         min=60,
                         max=240,
                         otype="f0")
    plt.figure(figsize=(20, 5))
    ##############
    width = int(len(f_ted) / 22)  # adjust the window width
    width1 = int(len(f_you) / 22)
    if np.where(f_you >= 60)[0][0] > np.where(
            f_ted >= 60)[0][0]:  # if the TED track starts earlier
        diff = np.where(f_you >= 60)[0][0] - np.where(f_ted >= 60)[0][0]
        zero_ = np.zeros(diff)
        new_f0 = np.r_[zero_, f_ted]
        cal0 = copy.copy(new_f0)
        cal0[np.where(cal0 < 75)] = 0
        new_f0[np.where(new_f0 < 75)] = 0
        value = []
        loc = []
        c3 = 0
        i = 0
        new_f0 = list(new_f0)
        while i < len(new_f0):
            ten = new_f0[i:i + width]
            if ten == []:
                break
            a = max(ten)
            b = ten.index(a)
            value.append(a)
            loc.append(i + b)
            if i + width > len(new_f0):
                break
            else:
                c3 = c3 + 1
                i = width * c3
        base = np.empty(len(new_f0))
        base.fill(np.nan)
        for i in range(len(loc)):
            location = loc[i]
            base[location] = value[i]
        df_blue = pd.DataFrame(base)
        plt.figure(figsize=(20, 5))
        df_blue.interpolate(method='polynomial',
                            order=2,
                            linewidth=2,
                            inplace=True)

        bbb = list(df_blue[0])
        ccc = list(map(lambda x: 0 if x < 0 else x, bbb))
        df_blue[0] = ccc
        df_blue.fillna(0, inplace=True)
        for g in range(len(df_blue)):
            if df_blue[0][g] > max(f_ted) * 1.2:
                df_blue[0][g] = max(f_ted) * 1.2
        # the merged contour is you0
        you0 = f_you
        you0[np.where(you0 < 75)] = 0
        value1 = []
        loc1 = []
        c1 = 0
        i1 = 0
        you0 = list(you0)
        while i1 < len(you0):
            ten1 = you0[i1:i1 + width1]
            if ten1 == []:
                break
            a1 = max(ten1)
            b1 = ten1.index(a1)
            value1.append(a1)
            loc1.append(i1 + b1)
            if i1 + width1 > len(you0):
                break
            else:
                c1 = c1 + 1
                i1 = width1 * c1
        base1 = np.empty(len(you0))
        base1.fill(np.nan)
        for i in range(len(loc1)):
            location1 = loc1[i]
            base1[location1] = value1[i]
        df_red = pd.DataFrame(base1)
        df_red.interpolate(method='polynomial',
                           order=2,
                           linewidth=2,
                           inplace=True)
        bbb1 = list(df_red[0])
        ccc1 = list(map(lambda x: 0 if x < 0 else x, bbb1))
        df_red[0] = ccc1
        df_red.fillna(0, inplace=True)
        for h in range(len(df_red)):
            if df_red[0][h] > max(f_you) * 1.2:
                df_red[0][h] = max(f_you) * 1.2
        df_red[0] = df_red[0] * max(f_ted) / max(f_you)
        area = []
        diff_areas = []
        if len(df_red[0]) > len(df_blue[0]):
            for i in range(diff, len(df_blue[0])):
                area.append(df_blue[0][i])
                diff_areas.append(abs(df_red[0][i] - df_blue[0][i]))
            result = 1 - (sum(diff_areas) / sum(area))
        else:
            for i in range(diff, len(df_red[0])):
                area.append(df_blue[0][i])
                diff_areas.append(abs(df_red[0][i] - df_blue[0][i]))
            result = 1 - (sum(diff_areas) / sum(area))

        ranks = []
        for i in range(1, len(df_blue[0]) - 1, 1):
            if df_blue[0][i] > 60:
                if df_blue[0][i] > df_blue[0][
                        i - 1] and df_blue[0][i] > df_blue[0][i + 1]:
                    ranks.append(df_blue[0][i] / max(df_blue[0]))
        ranks1 = []
        for i in range(1, len(df_red[0]) - 1, 1):
            if df_red[0][i] > 60:
                if df_red[0][i] > df_red[0][
                        i - 1] and df_red[0][i] > df_red[0][i + 1]:
                    ranks1.append(df_red[0][i] / max(df_red[0]))
        diffrent = []
        if len(ranks) > len(ranks1):
            for i in range(len(ranks1)):
                diffrent.append(abs(ranks1[i] - ranks[i]))
        else:
            for i in range(len(ranks)):
                diffrent.append(abs(ranks1[i] - ranks[i]))
        points4 = 1 - (sum(diffrent) / sum(ranks))

    else:  # the TED track starts later
        diff = np.where(f_ted >= 60)[0][0] - np.where(f_you >= 60)[0][0]
        zero_ted = np.zeros(diff)
        new_ted = np.r_[zero_ted, f_you]
        new_ted[np.where(new_ted < 75)] = 0
        value1 = []
        loc1 = []
        c1 = 0
        i1 = 0
        new_ted = list(new_ted)
        while i1 < len(new_ted):
            ten1 = new_ted[i1:i1 + width1]
            if ten1 == []:
                break
            a1 = max(ten1)
            b1 = ten1.index(a1)
            value1.append(a1)
            loc1.append(i1 + b1)
            if i1 + width1 > len(new_ted):
                break
            else:
                c1 = c1 + 1
                i1 = width1 * c1
        base = np.empty(len(new_ted))
        base.fill(np.nan)
        for h in range(len(loc1)):
            location1 = loc1[h]
            base[location1] = value1[h]
        df_blue = pd.DataFrame(base)
        df_blue.interpolate(method='polynomial',
                            order=2,
                            linewidth=2,
                            inplace=True)
        bbb = list(df_blue[0])
        ccc = list(map(lambda x: 0 if x < 0 else x, bbb))
        df_blue[0] = ccc
        df_blue.fillna(0, inplace=True)
        for g in range(len(df_blue)):
            if df_blue[0][g] > max(f_you) * 1.2:
                df_blue[0][g] = max(f_you) * 1.2
        f_ted1 = f_ted
        cal2 = copy.copy(f_ted1)
        cal2[np.where(f_ted1 < 75)] = 0
        f_ted1[np.where(f_ted1 < 75)] = 0
        value2 = []
        loc2 = []
        c2 = 0
        i2 = 0
        f_ted1 = list(f_ted1)
        while i2 < len(f_ted1):
            ten2 = f_ted1[i2:i2 + width]
            if ten2 == []:
                break
            a2 = max(ten2)
            b2 = ten2.index(a2)
            value2.append(a2)
            loc2.append(i2 + b2)
            if i2 + width > len(f_ted1):
                break
            else:
                c2 = c2 + 1
                i2 = width * c2
        base2 = np.empty(len(f_ted1))
        base2.fill(np.nan)
        for i in range(len(loc2)):
            location2 = loc2[i]
            base2[location2] = value2[i]
        df_red = pd.DataFrame(base2)
        df_red.interpolate(method='polynomial',
                           order=2,
                           linewidth=2,
                           inplace=True)
        bbb2 = list(df_red[0])
        ccc2 = list(map(lambda x: 0 if x < 0 else x, bbb2))
        df_red[0] = ccc2
        df_red.fillna(0, inplace=True)
        for g in range(len(df_red)):
            if df_red[0][g] > max(f_ted) * 1.2:
                df_red[0][g] = max(f_ted) * 1.2
        df_blue[0] = df_blue[0] * max(f_ted) / max(f_you)
        area = []
        diff_areas = []
        if len(df_red[0]) > len(df_blue[0]):
            for i in range(diff, len(df_blue[0])):
                area.append(df_red[0][i])
                diff_areas.append(abs(df_red[0][i] - df_blue[0][i]))
            result = 1 - (sum(diff_areas) / sum(area))
        else:
            for i in range(diff, len(df_red[0])):
                area.append(df_red[0][i])
                diff_areas.append(abs(df_red[0][i] - df_blue[0][i]))
            result = 1 - (sum(diff_areas) / sum(area))

        ranks = []
        for i in range(1, len(df_blue[0]) - 1, 1):
            if df_blue[0][i] > 60:
                if df_blue[0][i] > df_blue[0][
                        i - 1] and df_blue[0][i] > df_blue[0][i + 1]:
                    ranks.append(df_blue[0][i] / max(df_blue[0]))
        ranks1 = []
        for i in range(1, len(df_red[0]) - 1, 1):
            if df_red[0][i] > 60:
                if df_red[0][i] > df_red[0][
                        i - 1] and df_red[0][i] > df_red[0][i + 1]:
                    ranks1.append(df_red[0][i] / max(df_red[0]))
        diffrent = []
        if len(ranks) > len(ranks1):
            for i in range(len(ranks1)):
                diffrent.append(abs(ranks1[i] - ranks[i]))
        else:
            for i in range(len(ranks)):
                diffrent.append(abs(ranks1[i] - ranks[i]))
        points4 = 1 - (sum(diffrent) / sum(ranks1))

    result = int(result * 100)
    result1 = int(points4 * 100)
    pitch_result_rate = max(result, result1)
    global pitch_result
    if pitch_result_rate >= 85:
        pitch_result = 'Excellent'
    elif pitch_result_rate >= 65:
        pitch_result = 'Good'
    else:
        pitch_result = 'Bad'
    line1, = plt.plot(df_blue, color='navy', linewidth=5)
    line2, = plt.plot(df_red, color='crimson', linewidth=5)
    plt.title('Pitch Result', fontsize=50)
    plt.legend(handles=(line1, line2), labels=('Ted', 'You'), fontsize=20)
    plt.ylabel('Pitch', fontsize=20)
    plt.tick_params(axis='x',
                    which='both',
                    bottom=False,
                    top=False,
                    labelbottom=False)
    plt.savefig(png_save_path + 'pitch_result.png')
    return pitch_result_rate, pitch_result
Example #21
 def __test(x, fs, hopsize, otype):
     f0 = pysptk.swipe(x, fs, hopsize, otype=otype)
     assert np.all(np.isfinite(f0))
Example #22
 def __call__(self, x):
     pitch = pysptk.swipe(x, fs=self.sr, hopsize=self.hop_length,
                          min=self.f0_floor, max=self.f0_ceil, otype="pitch")
     f0 = pysptk.swipe(x, fs=self.sr, hopsize=self.hop_length,
                       min=self.f0_floor, max=self.f0_ceil, otype="f0")
     return pitch, f0
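For reference, pysptk's "pitch" output type is the pitch period in samples (SPTK's pitch -o 0), so for voiced frames the two arrays above are related through the sample rate:

voiced = f0 > 0
# pitch[voiced] should approximately equal self.sr / f0[voiced];
# unvoiced frames are 0 in both arrays.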
Example #23
def pitch_detect(sndarray,fs, chunk_size):
    new_sndarray = numpy.asarray(sndarray).astype(numpy.float64)
    f0 = pysptk.swipe(new_sndarray, fs, chunk_size, 65, 500, 0.001, 1)
    # Fundamental Frequency 
    return f0
Example #24
        duration = 0
        file = ""
        if (i <= 9):
            file = dir_path + "chunks/chunk-0" + str(i) + ".wav"

        else:
            file = dir_path + "chunks/chunk-" + str(i) + ".wav"

        duration = get_duration(file)

        fs, x = wavfile.read(file)
        assert fs == 16000

        f0_swipe = pysptk.swipe(x.astype(np.float64),
                                fs=fs,
                                hopsize=80,
                                min=60,
                                otype="f0")
        a = []
        f = []
        X_Frequecies_Vector = []
        for w in f0_swipe:
            if w != 0:
                f.append(w)
        if len(f) >= 30:
            f = random.sample(f, 30)
        else:
            f += [0] * (30 - len(f))
        freq_matrix.append(f)
        #a =  np.var(f)
Example #25
def processingVideo():

	for i in range(66,67):
		dir_path = "/home/eduardo/data_base_www/2PhaT6AbH3Q/"
		#move_files()

		num_emphasys = []
		n_chunks = len(glob.glob(dir_path +"chunks/chunk*"))
		freq_matrix = [[0 for i in range(2)] for j in range(n_chunks)]
		for i in range(0, n_chunks):
			duration = 0
			file =""
			if(i <=9):
				file = dir_path+"chunks/chunk-0"+str(i)+".wav"


			else:
				file = dir_path+"chunks/chunk-"+str(i)+".wav"

			duration = get_duration(file)

			fs, x = wavfile.read(file)
			assert fs == 16000

			f0_swipe = pysptk.swipe(x.astype(np.float64), fs=fs, hopsize=20, min=60)
			a = []
			f  = []
			X_Frequecies_Vector = []
			for w in f0_swipe:
				if w != 0:
					f.append(w)


			#if not f:
			#	a = 0
			#else:
			#	a = stats.mode(f)[0][0]
			a = np.mean(f)



			#print(len(f))
			c, Pxx_den = signal.welch(x, fs, nperseg=1024)
			#if(Pxx_den.any()):
			#	v = 0
			##	v = stats.mode(Pxx_den)[0][0]
			v = np.mean(Pxx_den)
			if(~np.isnan(v) and ~np.isnan(a)):
				#num_emphasys.append ( v )
				l = []
				l.append(a)
				l.append(v)
				freq_matrix[i] = l



		weightA = [0 for i in range(n_chunks)]
		weightT = [0 for i in range(n_chunks)]
		transcript_matrix, annotation_matrix, avg_depth = cs.computeMatrix(dir_path, n_chunks)
		best_model_silhouettte = -1000
		iterations_without_improvment = 0
		n_clusters = 32

		model2 = cluster.SpectralClustering(n_clusters, affinity='precomputed', n_init=10000, n_jobs=-1)
		cluster_labels = model2.fit_predict(transcript_matrix)



		for i in range(len(cluster_labels)-1):
			if(cluster_labels[i] != cluster_labels[i+1]):
				weightT[i+1] = np.sqrt(pow(freq_matrix[i+1][0],2) + pow(freq_matrix[i+1][1],2))

		for j in range(len(annotation_matrix) -1):
			if(not set(annotation_matrix[j]).intersection(annotation_matrix[j+1])):
				weightA[j+1] =  float(np.sqrt(pow(freq_matrix[j+1][0],2) + pow(freq_matrix[j+1][1],2)) /abs(avg_depth[j] - avg_depth[j+1]))

		#rankingT = sorted(range(len(weightT)), key=lambda x: weightT[x])[-70:]
		rankingA = sorted(range(len(weightA)), key=lambda x: weightA[x])[-70:]
		#ranking = list(set(rankingT).intersection(rankingA))
		#merged = sorted(list(set(ranking).union(rankingA)))
		#matrixT = np.array(transcript_matrix)
		#matrixA = np.array(annotation_matrix)
		#ranking = getranking(n_chunks, freq_matrix, matrixT, matrixA, 25)
		evaluate_method.evaluate(dir_path, sorted(rankingA), "aas")
Example #26
def get_pitch_pysptk(wav):
    sample_rate, samples = wavfile.read(wav)
    f0_swipe = pysptk.swipe(samples.astype(np.float64), fs=sample_rate, hopsize=80, min=60, max=200, otype="f0")
    return f0_swipe
Example #27
def read_audio_n_process(file, label, base_path, sampling_rate,
                         sample_size_in_seconds, overlap, normalise, method):
    """
    This method is called by the preprocess data method
    :param file:
    :param label:
    :param base_path:
    :param sampling_rate:
    :param sample_size_in_seconds:
    :param overlap:
    :param normalise:
    :return:
    """
    data, out_labels = [], []
    filepath = base_path + file
    if os.path.exists(filepath):
        audio, sr = librosa.load(filepath, sr=sampling_rate)
        # mask = envelope(audio, sr, 0.0005)
        # audio = audio[mask]
        sr = sampling_rate
        # audio = remove_silent_parts(filepath, sr=sampling_rate)
        chunks = cut_audio(audio,
                           sampling_rate=sr,
                           sample_size_in_seconds=sample_size_in_seconds,
                           overlap=overlap)
        for chunk in chunks:
            if method == 'fbank':
                zero_crossing = librosa.feature.zero_crossing_rate(chunk)
                f0 = pysptk.swipe(chunk.astype(np.float64),
                                  fs=sr,
                                  hopsize=510,
                                  min=60,
                                  max=240,
                                  otype="f0").reshape(1, -1)
                pitch = pysptk.swipe(chunk.astype(np.float64),
                                     fs=sr,
                                     hopsize=510,
                                     min=60,
                                     max=240,
                                     otype="pitch").reshape(1, -1)
                f0_pitch_multiplier = 1
                features = mel_filters(chunk, sr, normalise)
                f0 = np.reshape(f0[:, :features.shape[1] *
                                   f0_pitch_multiplier],
                                newshape=(f0_pitch_multiplier, -1))
                pitch = np.reshape(pitch[:, :features.shape[1] *
                                         f0_pitch_multiplier],
                                   newshape=(f0_pitch_multiplier, -1))
                # shimmer_jitter = get_shimmer_jitter_from_opensmile(chunk, time.time(), sr)
                # shimmer_jitter = np.tile(shimmer_jitter, math.ceil(features.shape[-1] / len(shimmer_jitter)))[
                #                  :features.shape[
                #                      -1]]  # Repeating the values to match the features length of filterbanks
                # shimmer_jitter = np.reshape(shimmer_jitter, newshape=(1, -1))
                features = np.concatenate((features, zero_crossing, f0, pitch),
                                          axis=0)  # shimmer_jitter
            elif method == 'mfcc':
                features = mfcc_features(chunk, sr, normalise)
            elif method == 'gaf':
                features = gaf(chunk)
            elif method == 'raw':
                features = chunk
            else:
                raise Exception(
                    'Specify a method to use for pre processing raw audio signal. Available options - {fbank, mfcc, gaf, raw}'
                )
            data.append(features)
            out_labels.append(float(label))
        return data, out_labels, chunks
    else:
        print('File not found ', filepath)
        return [], [], []
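read_audio_n_process depends on a cut_audio helper that is not shown; a minimal sketch consistent with how it is called (name and semantics are assumptions, with overlap taken in seconds) is:

def cut_audio(audio, sampling_rate, sample_size_in_seconds, overlap):
    # Slice the signal into fixed-length windows, advancing by
    # (sample_size_in_seconds - overlap) seconds per step.
    size = int(sample_size_in_seconds * sampling_rate)
    step = max(1, int((sample_size_in_seconds - overlap) * sampling_rate))
    return [audio[i:i + size] for i in range(0, len(audio) - size + 1, step)]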
Example #28
def get_f0(waveform,
           sample_rate,
           hop_length_seconds=0.01,
           method='swipe',
           f0_min=60,
           f0_max=300):
    """Compute the F0 contour using PYSPTK: https://github.com/r9y9/pysptk/.

    Args:
        waveform (np.array, [T, ]): waveform over which to compute f0
        sample_rate (int > 0): number of samples per second in waveform
        hop_length_seconds (float): hop size in seconds; converted to samples and
            passed as the hopsize argument of pysptk.swipe.
        method (str): is one of 'swipe' or 'rapt'. Define which method to use for f0
            calculation. See https://github.com/r9y9/pysptk

    Returns:
        dict: Dictionary containing keys:
            "contour" (np.array, [1, t1]): f0 contour of waveform. Contains unvoiced
                frames.
            "values" (np.array, [1, t2]): nonzero f0 values waveform. Note that this
                discards all unvoiced frames. Use to compute mean, std, and other statistics.
            "mean" (float): mean of the f0 contour.
            "std" (float): standard deviation of the f0 contour.
    """
    assert method in (
        'swipe',
        'rapt'), "The method argument should be one of 'swipe' or 'rapt'."

    hop_length = numseconds_to_numsamples(hop_length_seconds, sample_rate)
    if method == 'swipe':
        f0_contour = swipe(
            waveform.astype(np.float64),
            fs=sample_rate,
            hopsize=hop_length,
            min=f0_min,
            max=f0_max,
            otype="f0",
        )[np.newaxis, :]

    elif method == 'rapt':
        # For this estimation, waveform needs to be in the int PCM format.
        f0_contour = rapt(
            np.round(waveform * 32767).astype(np.float32),
            fs=sample_rate,
            hopsize=hop_length,
            min=f0_min,
            max=f0_max,
            otype="f0",
        )[np.newaxis, :]

    # Remove unvoiced frames.
    f0_values = f0_contour[:, np.where(f0_contour[0, :] != 0)][0]

    f0_mean = np.mean(f0_values[0])
    f0_std = np.std(f0_values[0])
    return {
        "contour": f0_contour,
        "values": f0_values,
        "mean": f0_mean,
        "std": f0_std,
    }
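get_f0 relies on a numseconds_to_numsamples helper that is not shown; a minimal version consistent with its use here (an assumption) is:

def numseconds_to_numsamples(seconds, sample_rate):
    # Express a duration in seconds as a whole number of samples.
    return int(round(seconds * sample_rate))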
Example #29
    def __call__(self, tensor):
        """

        Args:
            tensor (Tensor): Tensor of audio of size (samples x 1)

        """
        # pysptk and interpolate are a MUST in this transform
        import pysptk
        from ahoproc_tools.interpolate import interpolation
        t_npy = tensor.cpu().squeeze(1).numpy()
        #print('t_npy shape: ', t_npy.shape)
        seqlen = t_npy.shape[0]
        T = seqlen // self.hop_length
        # compute LF0 and UV
        f0 = pysptk.swipe(t_npy.astype(np.float64),
                          fs=self.sr,
                          hopsize=self.hop_length,
                          min=60,
                          max=240,
                          otype="f0")[:T]
        lf0 = np.log(f0 + 1e-10)
        lf0, uv = interpolation(lf0, -1)
        if np.any(lf0 == np.log(1e-10)):
            # all lf0 goes to minf0 as a PAD symbol
            lf0 = np.ones(lf0.shape) * np.log(60)
            # all frames are unvoiced
            uv = np.zeros(uv.shape)
        ret = {
            'lf0': torch.FloatTensor(lf0).view(-1, 1),
            'uv': torch.FloatTensor(uv.astype(np.float32)).view(-1, 1)
        }
        tot_frames = T

        # MelSpectrum and MFCCs
        mel = self.mel(tensor).transpose(0, 1).squeeze(2)
        # do compression?
        if self.dynamic_norm_spec:
            mel = torch.log1p(mel * 10000) / torch.log(torch.FloatTensor([10]))
        ret['mel_spec'] = mel[:tot_frames]
        mfcc = librosa.feature.mfcc(y=t_npy,
                                    sr=self.sr,
                                    n_fft=self.n_fft,
                                    hop_length=self.hop_length,
                                    n_mfcc=self.mfcc_order).T
        mfcc = mfcc[:tot_frames]
        ret['mfcc'] = torch.FloatTensor(mfcc)
        # Spectrogram abs magnitude [dB]
        spec = librosa.stft(t_npy,
                            n_fft=self.n_fft,
                            hop_length=self.hop_length,
                            win_length=self.win_length,
                            window=self.window)
        spec_db = librosa.amplitude_to_db(spec).T
        spec_ang = np.angle(spec).T
        spec_db = spec_db[:tot_frames]
        spec_ang = spec_ang[:tot_frames]
        ret['mag'] = torch.FloatTensor(spec_db)
        ret['pha'] = torch.FloatTensor(spec_ang)
        # ZCR, E and lF0
        # librosa renamed rmse -> rms in 0.7
        egy = librosa.feature.rms(y=t_npy,
                                  frame_length=self.win_length,
                                  hop_length=self.hop_length,
                                  pad_mode='constant').T
        egy = egy[:tot_frames]
        zcr = librosa.feature.zero_crossing_rate(y=t_npy,
                                                 frame_length=self.win_length,
                                                 hop_length=self.hop_length).T
        zcr = zcr[:tot_frames]
        ret['egy'] = torch.FloatTensor(egy)
        ret['zcr'] = torch.FloatTensor(zcr)
        ntensor = tensor.clone()
        if hasattr(self, 'chopper'):
            do_chop = random.random() > 0.5
            if do_chop:
                ntensor = self.chopper(ntensor, self.sr)

        if hasattr(self, 'additive'):
            do_add = random.random() > 0.5
            if do_add:
                ntensor = self.additive(ntensor.numpy(), self.sr)

        if hasattr(self, 'clipping'):
            do_clip = random.random() > 0.5
            if do_clip:
                ntensor = self.clipping(ntensor.numpy())
        ret['wav'] = ntensor.view((-1, 1))
        ret['cwav'] = tensor.view((-1, 1))
        return ret