-
Notifications
You must be signed in to change notification settings - Fork 1
/
mainF0Detect_V1.py
254 lines (204 loc) · 10.3 KB
/
mainF0Detect_V1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
# -*- coding: utf-8 -*-
"""
Created on Fri May 22 12:26:59 2015
Description: Used to detect the vocalic and non-vocalic regions of the percussion supressed music signal
Inputs:
Outputs:
@author: Gurunath Reddy M
"""
import numpy as np
from scipy.signal import _savitzky_golay as savgol_filter
import matplotlib.pyplot as plt
import zff as zff
import waveio as io
import voiceRegMod as speechVuv
import detSinusouds as detSin
import ssh as ssh
import getMusicZff as getZFFS
import smooth as smooth
import getSSHVUV as sshVUV
import dftStft as stft
import zffDoubleFilter as doubleFilt
import plotSpect as pltSpec
import scipy.io as matio
#Read the wave file for processing
fileName = '1.7.happy-15'
[fs, x] = io.wavread('../wav/'+fileName+'.wav')
x = np.array(x, np.float64) # Datatype conversion is must for python to compatable with matlab
x = x/(1.01*np.max(np.abs(x)));
lenSig = x.size
timeAxis = np.arange(lenSig)/float(fs)
#TODO: Find the representative F0 for each voiced region and then perform ZFF filteri
detSinusoids, mxLinear, sinPeaksMag, sinPeaksBin = detSin.getSinusoids(x, fs)
H = 80.0 # 5ms = 80 samples hop size for 16kHz
N = 1024 # FFT size
# Find summation of spectral harmonics and get pitch contour which is used as resonant frequency in ZFF filtering
resntFreq, sshVal = ssh.sumSpectHarm(detSinusoids, fs, H, N)
sshVal = np.power(sshVal, 2)
sshVal = sshVal/np.max(sshVal)
# Finding V/UV regions based on the power of the sshVal
begVoic, endVoic = sshVUV.sshVUV(sshVal, H, fs, N)
begVoicIndx = np.ones(begVoic.size)
frame2Time = (np.arange(np.size(sshVal)) * float(H))/fs
begVoicTime = (begVoic*H)/fs
endVoicTime = (endVoic*H)/fs
tempTime = np.arange(np.size(sshVal))
tTime = (tempTime * H)/fs
plt.figure()
plt.plot(timeAxis, x)
plt.plot(tTime, sshVal, 'r')
plt.stem(begVoicTime, begVoicIndx, 'g')
plt.stem(endVoicTime, begVoicIndx, 'r')
plt.title('Salience based voiced and unvoiced classification')
#matio.savemat('/home/gurunath/MS_work/papers/Indicon-2015/happy_synth/code_to _generate_figures/orig/synthtime.mat', mdict={'synthtime':timeAxis})
#matio.savemat('/home/gurunath/MS_work/papers/Indicon-2015/happy_synth/code_to _generate_figures/orig/synx.mat', mdict={'synx':x})
#matio.savemat('/home/gurunath/MS_work/papers/Indicon-2015/happy_synth/code_to _generate_figures/orig/synframe2time.mat', mdict={'synframe2time':frame2Time})
#matio.savemat('/home/gurunath/MS_work/papers/Indicon-2015/happy_synth/code_to _generate_figures/orig/synsshval.mat', mdict={'synsshval':sshVal})
#matio.savemat('/home/gurunath/MS_work/papers/Indicon-2015/happy_synth/code_to _generate_figures/orig/synbegvoictime.mat', mdict={'synbegvoictime':begVoicTime})
#matio.savemat('/home/gurunath/MS_work/papers/Indicon-2015/happy_synth/code_to _generate_figures/orig/synendvoictime.mat', mdict={'synendvoictime':endVoicTime})
# ------------------------------------------------------------------------------
# frame to sample number conversion
timeBegVoicSamp = np.floor(begVoic * H)
timeEndVoicSamp = np.floor(endVoic * H)
# ------------------------------------------------------------------------------
sampOnsets = np.sort(np.concatenate((timeBegVoicSamp, timeEndVoicSamp))) # smapleOnsets are in sample number
#matio.savemat('/home/gurunath/MS_work/papers/Indicon-2015/happy_synth/synthAngOnsets.mat', mdict={'synthAngOnsets':sampOnsets})
zffs = getZFFS.getZFFS(x, fs, H, sampOnsets, resntFreq) # ZFFS for each voiced region
# Once the ZFFS is found, get the instants of zero crossings, strength of excitation and f0 at GCI's
zf1 = np.copy(zffs)
zffs[zffs>0] = 1 # To find zero crossings, place value 1 at all locations zffs>0
zffs[zffs<0] = -1 # Place value -1 at all locations zffs<0
gci = np.where(np.diff(zffs) == 2)[0] # Take difference and look for the positions of value 2
es = np.abs(zf1[gci+1]-zf1[gci-1]) # Positions of value 2 are the instants of zero crossings
T0 = np.diff(gci) # Finding period interms of sample number
T0 = T0/float(fs) # Period in seconds
f0 = 1.0/T0 # Frequency in Hz
f0 = np.append(f0, f0[-1], f0[-1]) # Filling holes created by two difference operation
# Smoothing the melody to remove high frequency contents
f0Smooth = smooth.smooth(f0, 10, 'flat')
# ------------------------------------------------------------------------------- #
# Remove unvoiced portions from F0 contour
tempF0 = np.zeros(lenSig)
tempF0[gci] = f0Smooth
# Get F0 at only voiced regions
voicPitch = np.zeros(np.shape(x)[0])
if(np.shape(timeBegVoicSamp)[0] == np.shape(timeEndVoicSamp)[0]): # Check if number of instants are same
for i in range(np.shape(timeBegVoicSamp)[0]):
begInst = timeBegVoicSamp[i] # Get beg instant
endInst = timeEndVoicSamp[i] # Get end instant
voicPitch[begInst:endInst] = tempF0[begInst:endInst] # Place voiced portion f0
plt.figure()
plt.subplot(211)
plt.plot(timeAxis, x)
plt.xlim([np.min(timeAxis), np.max(timeAxis)])
plt.subplot(212)
plt.plot(timeAxis, voicPitch, 'r*')
plt.xlim([np.min(timeAxis), np.max(timeAxis)])
plt.ylim([0, 400])
plt.title('F0 extraction by adaptive ZFF')
#matio.savemat('/home/gurunath/MS_work/papers/Indicon-2015/happy_synth/code_to _generate_figures/orig/synpitchbefdouble.mat', mdict={'synpitchbefdouble':voicPitch})
#plt.subplot(212)
#plt.plot(timeAxis, zf1)
#plt.figure()
#plt.plot(timeAxis, x)
#plt.plot(frame2Time, sshVal/np.max(sshVal), 'r')
# ----------------------------------------------------------------------------- #
# Performing ZFF double filtering part
# ----------------------------------------------------------------------------- #
# FFT parameter setting
M = (20*fs)/1000.0 # 20ms window
N = 512
H = M/4
window = 'hamming'
w = np.hamming(M)
zffsDoub = doubleFilt.filterZffs(zf1, fs, w, N, H, sampOnsets) # Get the F0 of ZFFS signal
#plt.plot(zfsF0)
# ------------------------------------------------------------------------------------------------------
# ------------------------------------------------------------------------------------------------------
# Filter the zero frequnecy filtered signal one more time with the help of resonant frequency contour obtained from ZFFS signal
#zffsDoub = getZFFS.getZFFS(zf1, fs, H, sampOnsets, resntFreq) # ZFFS for each voiced region
# Once the ZFFS is found, get the instants of zero crossings, strength of excitation and f0 at GCI's
zf1Doub = np.copy(zffs)
zffsDoub[zffsDoub>0] = 1 # To find zero crossings, place value 1 at all locations zffs>0
zffsDoub[zffsDoub<0] = -1 # Place value -1 at all locations zffs<0
gci = np.where(np.diff(zffsDoub) == 2)[0] # Take difference and look for the positions of value 2
es = np.abs(zf1Doub[gci+1]-zf1Doub[gci-1]) # Positions of value 2 are the instants of zero crossings
T0 = np.diff(gci) # Finding period interms of sample number
T0 = T0/float(fs) # Period in seconds
f0 = 1.0/T0 # Frequency in Hz
f0 = np.append(f0, f0[-1], f0[-1]) # Filling holes created by two difference operation
# Smoothing the melody to remove high frequency contents
f0Smooth = smooth.smooth(f0, 10, 'flat')
# ------------------------------------------------------------------------------- #
# Remove unvoiced portions from F0 contour
tempF0 = np.zeros(lenSig)
tempF0[gci] = f0Smooth
# Get F0 at only voiced regions
voicPitch = np.zeros(np.shape(x)[0])
if(np.shape(timeBegVoicSamp)[0] == np.shape(timeEndVoicSamp)[0]): # Check if number of instants are same
for i in range(np.shape(timeBegVoicSamp)[0]):
begInst = timeBegVoicSamp[i] # Get beg instant
endInst = timeEndVoicSamp[i] # Get end instant
voicPitch[begInst:endInst] = tempF0[begInst:endInst] # Place voiced portion f0
plt.figure()
plt.subplot(211)
plt.plot(timeAxis, x)
plt.xlim([np.min(timeAxis), np.max(timeAxis)])
plt.subplot(212)
plt.plot(timeAxis, voicPitch, 'r*')
plt.xlim([np.min(timeAxis), np.max(timeAxis)])
plt.ylim([0, 400])
plt.title('F0 by double filtering')
#matio.savemat('/home/gurunath/MS_work/papers/Indicon-2015/happy_synth/code_to _generate_figures/orig/synvopitchaftdoub.mat', mdict={'synvoicpitchaftdoub':voicPitch})
#------------------------------------------------------- Original ZFF ---------------------------------- #
windSize = 3 # Choose ZFF mean subtraction window emperically as 4ms for v/uv classification
[zffs, gci, vuvEpssp1, vuvF0sp1] = zff.zff(x, fs, windSize) # Compute the zff of x
zf1 = np.copy(zffs)
zffs[zffs>0] = 1 # To find zero crossings, place value 1 at all locations zffs>0
zffs[zffs<0] = -1 # Place value -1 at all locations zffs<0
gci = np.where(np.diff(zffs) == 2)[0] # Take difference and look for the positions of value 2
es = np.abs(zf1[gci+1]-zf1[gci-1]) # Positions of value 2 are the instants of zero crossings
T0 = np.diff(gci) # Finding period interms of sample number
T0 = T0/float(fs) # Period in seconds
f0 = 1.0/T0 # Frequency in Hz
f0 = np.append(f0, f0[-1], f0[-1]) # Filling holes created by two difference operation
f0Smooth = smooth.smooth(f0, 10, 'flat')
# Remove unvoiced portions from F0 contour
tempF0 = np.zeros(lenSig)
tempF0[gci] = f0Smooth
# Get F0 at only voiced regions
voicPitch = np.zeros(np.shape(x)[0])
if(np.shape(timeBegVoicSamp)[0] == np.shape(timeEndVoicSamp)[0]): # Check if number of instants are same
for i in range(np.shape(timeBegVoicSamp)[0]):
begInst = timeBegVoicSamp[i] # Get beg instant
endInst = timeEndVoicSamp[i] # Get end instant
voicPitch[begInst:endInst] = tempF0[begInst:endInst] # Place voiced portion f0
plt.figure()
plt.subplot(211)
plt.plot(timeAxis, x)
plt.xlim([np.min(timeAxis), np.max(timeAxis)])
plt.subplot(212)
plt.plot(timeAxis, voicPitch, 'r*')
plt.xlim([np.min(timeAxis), np.max(timeAxis)])
plt.ylim([0, 400])
plt.title('Original ZFF')
#matio.savemat('/home/gurunath/MS_work/papers/Indicon-2015/happy_synth/code_to _generate_figures/orig/synorigzffpit.mat', mdict={'synorigzffpit':voicPitch})
# -------------------------------------------------------------------------------------------------------- #
#zfsF0Doub = doubleFilt.filterZffs(zf1Doub, fs, w, N, H) # Get the F0 of ZFFS signal
#plt.figure()
#plt.plot(x)
#plt.plot(zf1Doub)
#t = 0.001
#import peakDetectCorrect as pdc
#maxBinFreq = np.round((5000.0 * N)/fs)
#tempMag = mxLinear[512, :maxBinFreq]
#ploc = pdc.peakDetection(tempMag, t)
#plt.figure()
#plt.plot(tempMag)
#plt.stem(ploc, tempMag[ploc], 'r')
#
#plt.figure()
#tempMag = detSinusoids[512, :maxBinFreq]
#ploc = pdc.peakDetection(tempMag, t)
#plt.plot(tempMag)
#plt.stem(ploc, tempMag[ploc], 'r')