/
sidefunctions.py
214 lines (178 loc) · 7.52 KB
/
sidefunctions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
import librosa
from scipy.fftpack import fft
from scipy.signal import hann
import numpy as np
import scipy as sp
from scipy.fftpack import ifft
def getsonglength(path_to_audio):
    """
    Return the duration of an audio file in seconds.

    The file is decoded with librosa.load using its default arguments, and
    the duration is the number of loaded samples divided by the sample rate
    that librosa.load reports.

    Input Parameters
    ----------------
    path_to_audio: path of the audio file to measure

    Returns
    -------
    a float, the length of the loaded signal in seconds
    """
    samples, rate = librosa.load(path_to_audio)
    sample_count = len(samples)
    return sample_count / float(rate)
def combineandsplitsongs(path_to_audio, path_to_voice, path_of_output, path_to_output1, path_to_output2):
    """
    Mix a voice recording into a song, save the mix, then split the mix in two.

    The voice is added onto the beginning of the song. Only the samples that
    both signals have in common are mixed, so a voice clip longer than the
    song no longer raises a shape-mismatch error; the extra voice samples are
    dropped and the mix keeps the song's length.

    Input Parameters
    ----------------
    path_to_audio: path of the song file
    path_to_voice: path of the voice file added onto the start of the song
    path_of_output: path where the mixed signal is written
    path_to_output1: path where splitty writes the first separated part
    path_to_output2: path where splitty writes the second separated part

    Returns
    -------
    the value returned by splitty (the number of samples in the first part)
    """
    song, sr = librosa.load(path_to_audio)
    # sr is rebound to the rate reported for the voice file, exactly as
    # before; both loads are made with librosa.load's default arguments.
    voice, sr = librosa.load(path_to_voice)

    # Mix only over the overlapping region so mismatched lengths are safe.
    overlap = min(len(song), len(voice))
    song[:overlap] += voice[:overlap]

    # NOTE(review): librosa.output.write_wav appears to be unavailable in
    # newer librosa releases -- confirm the pinned librosa version.
    librosa.output.write_wav(path_of_output, song, sr)
    return splitty(song, path_to_output1, path_to_output2, sr)
def combinesongs(path_to_audio, path_to_voice, path_of_output):
    """
    Mix a voice recording into the beginning of a song and save the result.

    Only the samples that both signals have in common are mixed, so a voice
    clip longer than the song no longer raises a shape-mismatch error; the
    extra voice samples are dropped and the mix keeps the song's length.

    Input Parameters
    ----------------
    path_to_audio: path of the song file
    path_to_voice: path of the voice file added onto the start of the song
    path_of_output: path where the mixed signal is written

    Returns
    -------
    path_of_output, unchanged
    """
    song, sr = librosa.load(path_to_audio)
    # sr is rebound to the rate reported for the voice file, exactly as
    # before; both loads are made with librosa.load's default arguments.
    voice, sr = librosa.load(path_to_voice)

    # Mix only over the overlapping region so mismatched lengths are safe.
    overlap = min(len(song), len(voice))
    song[:overlap] += voice[:overlap]

    # NOTE(review): librosa.output.write_wav appears to be unavailable in
    # newer librosa releases -- confirm the pinned librosa version.
    librosa.output.write_wav(path_of_output, song, sr)
    return path_of_output
def _mirror_half_mask(half_mask, num_bins):
    """Expand a mask over the lowest FFT bins to all num_bins bins, pairing bin k with bin num_bins - k."""
    half = half_mask.shape[0]
    full_mask = np.zeros((num_bins, half_mask.shape[1]))
    full_mask[:half, :] = half_mask
    # For the FFT of a real signal, bin k is the conjugate of bin
    # num_bins - k, so the upper bins must reuse the mask of their partner.
    # Partners that fall outside the half mask (the Nyquist bin) borrow the
    # highest available row.
    upper_bins = np.arange(half, num_bins)
    partner_rows = np.minimum(num_bins - upper_bins, half - 1)
    full_mask[half:, :] = half_mask[partner_rows, :]
    return full_mask


def splitty(song, path_to_output1, path_to_output2, sr):
    """
    Split a signal into two parts using a 2-component decomposition and soft masks.

    Steps:
      1. Compute the STFT of the signal (window 2048, hop 1024).
      2. Take the log-magnitude of the lower half of the spectrum and, if it
         contains negative values, shift it so its minimum is zero.
      3. Decompose it with librosa.decompose.decompose into 2 components.
      4. Build one estimate per component and turn the two estimates into
         ratio masks that sum to one.
      5. Mirror each mask onto the full spectrum, apply it to the STFT,
         invert, and write the real part of each result to disk.

    Changes relative to the previous implementation:
      * bins where both estimates are zero get a mask of 0.5 each instead of
        NaN from a 0/0 division;
      * the mask is mirrored so bin k and bin N - k share a value (the
        previous flip paired k with N - 1 - k, which made the masked
        spectrum non-symmetric);
      * the per-component estimates use an outer product directly.

    Input Parameters
    ----------------
    song: 1-D numpy array holding the signal to split
    path_to_output1: path where the first part is written
    path_to_output2: path where the second part is written
    sr: sample rate passed to the spectrogram helper and the wav writer

    Returns
    -------
    the number of samples in the first reconstructed part
    """
    window_size = 2048
    hop_size = 1024

    X_voice = stft(song, window_size, hop_size)
    V_voice = plt_spectrogram(X_voice, window_size, hop_size, sr)

    # The decomposition input is shifted up only when it has negative values.
    lowest = np.min(V_voice)
    if lowest < 0:
        shifted_V = V_voice - lowest
    else:
        shifted_V = V_voice

    comp, act = librosa.decompose.decompose(shifted_V, n_components=2)

    # Rank-1 estimate contributed by each component.
    estimate1 = np.outer(comp[:, 0], act[0, :])
    estimate2 = np.outer(comp[:, 1], act[1, :])

    # Ratio masks; where neither component contributes, split evenly.
    total = estimate1 + estimate2
    has_energy = total != 0
    mask1 = np.full(total.shape, 0.5)
    mask2 = np.full(total.shape, 0.5)
    np.divide(estimate1, total, out=mask1, where=has_energy)
    np.divide(estimate2, total, out=mask2, where=has_energy)

    num_bins = X_voice.shape[0]
    part1 = istft(X_voice * _mirror_half_mask(mask1, num_bins), hop_size)
    part2 = istft(X_voice * _mirror_half_mask(mask2, num_bins), hop_size)

    # NOTE(review): librosa.output.write_wav appears to be unavailable in
    # newer librosa releases -- confirm the pinned librosa version.
    librosa.output.write_wav(path_to_output1, part1.real, sr)
    librosa.output.write_wav(path_to_output2, part2.real, sr)
    return len(part1)
def stft(signal, window_size, hop_size, window_type = 'hann'):
    """
    Computes the short term fourier transform of a 1-D numpy array, where the array
    is windowed into a set of subarrays, each of length window_size. The distance between
    the starts of adjacent windows (in samples) is given by hop_size. The type of window
    applied is determined by window_type. This returns a 2-D numpy array where the ith
    column is the FFT of the ith window. Each column contains an array of complex values.

    Samples at the end of the signal that do not fill a whole window are ignored.

    Input Parameters
    ----------------
    signal: The 1-d (complex or real) numpy array containing the signal
    window_size: an integer scalar specifying the number of samples in a window
    hop_size: an integer specifying the number of samples between the start of adjacent windows
    window_type: a string; "hann" selects a periodic Hann window, any other value
                 selects a rectangular (all ones) window

    Returns
    -------
    a 2D numpy array of complex numbers with window_size rows, where the ith column is
    the FFT of the ith window and the jth element of a column is the jth frequency of analysis.

    Raises
    ------
    AssertionError if window_size is longer than the signal
    """
    # Imported here from the windows submodule so the function does not rely
    # on the scipy.signal.hann alias.
    from scipy.signal.windows import hann as hann_window

    # figure out how many hops
    length_to_cover_with_hops = len(signal) - window_size
    assert (length_to_cover_with_hops >= 0), "window_size cannot be longer than the signal to be windowed"
    # Floor division keeps the hop count an integer on Python 3 as well as Python 2.
    num_hops = 1 + length_to_cover_with_hops // hop_size

    # make our window function
    if window_type == 'hann':
        window = hann_window(window_size, sym=False)
    else:
        window = np.ones(window_size)

    # transform each windowed frame; frames become columns of the result
    frames = []
    for hop in range(num_hops):
        start = hop * hop_size
        windowed_sound = signal[start:start + window_size] * window
        frames.append(fft(windowed_sound, window_size))
    return np.array(frames).T
def plt_spectrogram(X,win_length, hop_size, sample_rate, zoom_x=None, zoom_y=None,tick_labels='time-freq'):
    """
    Returns the log magnitude spectrogram of the lower half of an STFT.

    Despite the name, nothing is plotted: the function only computes
    20*log10(|X|) and keeps the first half of the rows. All parameters other
    than X are accepted for compatibility with existing callers and are not used.

    Input Parameters:
    ------------------
    X: 2D complex numpy array containing the stft values. Rows correspond to frequency bins and columns to time frames.
    win_length: unused
    hop_size: unused
    sample_rate: unused
    zoom_x: unused
    zoom_y: unused
    tick_labels: unused

    Returns:
    ---------
    2D real numpy array holding 20*log10(|X|) for the first Nf // 2 rows of X,
    where Nf is the number of rows of X. Entries of X that are exactly zero
    map to -inf.
    """
    # Find the size of stft (also enforces that X is 2-D)
    num_bins, _ = np.shape(X)

    # Compute the log magnitude spectrogram
    log_magnitude = 20 * np.log10(np.abs(X))

    # Keep the first half of the spectrum for each time frame; floor division
    # keeps the slice bound an integer on Python 3 as well as Python 2.
    return log_magnitude[0:num_bins // 2]
def istft(X, hop_size):
    """
    Rebuilds a 1-D signal from an STFT by inverse-transforming every frame
    and summing the frames at their hop offsets (overlap & add).

    Input Parameters
    ----------------
    X: a 2-D numpy array of complex numbers representing an STFT, where the ith
       column is the FFT of the ith window, and the jth row is the jth frequency of analysis.
    hop_size: an integer specifying the number of samples between the start of adjacent windows.

    Returns
    -------
    a 1-d complex numpy array of length (number of columns - 1) * hop_size + number of rows,
    holding the overlap-added inverse FFTs of the columns of X
    """
    frame_len, frame_count = X.shape

    # output buffer long enough to hold the last frame at its offset
    total_len = (frame_count - 1) * hop_size + frame_len
    output = np.zeros(total_len, dtype='complex')

    # overlap & add each inverse-transformed frame
    for index in range(frame_count):
        offset = index * hop_size
        output[offset:offset + frame_len] += ifft(X[:, index])
    return output