def process_file(input_fname, output_fname, l1, l2, direction):
    """Fine-tune the pretrained autoencoder on one file and write the result.

    The audio in ``input_fname`` is transformed with the module-level
    ``stft``, the autoencoder initialised from ``ae_4x1_poly.pickle`` is
    optimised for ``n_iterations`` steps on that single example, and the
    magnitude it finally produces is combined with the phase of the input
    spectrogram, inverted with ``istft`` and written to ``output_fname``.

    The objective minimised at every step is::

        MSE(output, input) + l1 * ||output||_1 + l2 * ratio

    Args:
        input_fname: Path of the audio file to read.
        output_fname: Path the resynthesised audio is written to.
        l1: Weight of the L1 norm of the network output.
        l2: Weight of the ``get_diffs`` ratio term.
        direction: ``0`` selects ``hdiff / (vdiff + eps)`` as the ratio
            term; any other value selects ``vdiff / (hdiff + eps)``.
    """
    w = Wave.read(input_fname)
    print(w.dtype)
    X = stft.process(w)

    # The checkpoint path is hard-coded and resolved against the current
    # working directory.
    ae = AutoEncoder()
    ae.load_state_dict(torch.load("ae_4x1_poly.pickle"))
    if torch.cuda.is_available():
        ae.cuda()

    criterion = torch.nn.MSELoss()
    optimizer = optim.Adam(ae.parameters(), weight_decay=0.01)
    v = get_features(X)

    # Training mode does not change between iterations, so set it once.
    ae.train()
    for epoch in range(n_iterations):
        optimizer.zero_grad()
        output = ae(v)

        # NOTE(review): hdiff/vdiff are presumably differences along the two
        # axes of the output slice -- confirm against get_diffs.
        hdiff, vdiff = get_diffs(output[0, 0, :, :])
        if direction == 0:
            tsloss = hdiff / (vdiff + eps)
        else:
            tsloss = vdiff / (hdiff + eps)

        mse = criterion(output, v)
        loss1 = torch.norm(output, 1)
        loss2 = tsloss
        print("mse {:>10.10f} loss 1 {:>10.10f} loss 2 {:>10.10f}".format(
            mse.item(), loss1.item(), loss2.item())
        )

        loss = mse + l1 * loss1 + l2 * loss2
        loss.backward()
        optimizer.step()

    # Only the values are needed here; skip building an autograd graph.
    # The module is intentionally left in the mode used during training.
    with torch.no_grad():
        O = ae(v)
    O = O[0, 0, :, :].cpu().numpy()

    # Reuse the phase of the input spectrogram for resynthesis.
    Y = np.abs(O.T) * np.exp(np.angle(X) * 1j)
    y = istft.process(Spectrogram(Y, X.sample_rate))
    y.write(output_fname)
def process_file(input_fname, output_fname, l1):
    """Fine-tune the pretrained mono autoencoder on one file and write it out.

    The audio in ``input_fname`` is transformed with the module-level
    ``stft``, the autoencoder initialised from ``ae_4x1_mono.pickle`` is
    optimised for ``n_iterations`` steps on that single example, and the
    magnitude it finally produces is combined with the phase of the input
    spectrogram, inverted with ``istft`` and written to ``output_fname``.

    The objective minimised at every step is::

        MSE(output, input) + l1 * ||output||_1

    Args:
        input_fname: Path of the audio file to read.
        output_fname: Path the resynthesised audio is written to.
        l1: Weight of the L1 norm of the network output.
    """
    w = Wave.read(input_fname)
    X = stft.process(w)

    # The checkpoint path is hard-coded and resolved against the current
    # working directory.
    ae = AutoEncoder()
    ae.load_state_dict(torch.load("ae_4x1_mono.pickle"))
    if torch.cuda.is_available():
        ae.cuda()

    criterion = torch.nn.MSELoss()
    optimizer = optim.Adam(ae.parameters(), weight_decay=0.01)
    v = get_features(X)

    # Training mode does not change between iterations, so set it once.
    ae.train()
    for epoch in range(n_iterations):
        optimizer.zero_grad()
        output = ae(v)

        mse = criterion(output, v)
        # The printed "loss 1" value already includes the l1 weight.
        loss1 = l1 * torch.norm(output, 1)
        print("mse {:>10.10f} loss 1 {:>10.10f}".format(
            mse.item(), loss1.item()))

        loss = mse + loss1
        loss.backward()
        optimizer.step()

    # Only the values are needed here; skip building an autograd graph.
    # The module is intentionally left in the mode used during training.
    with torch.no_grad():
        O = ae(v)
    O = O[0, 0, :, :].cpu().numpy()

    # Reuse the phase of the input spectrogram for resynthesis.
    Y = np.abs(O.T) * np.exp(np.angle(X) * 1j)
    Y = Spectrogram(Y, X.sample_rate)
    y = istft.process(Y)
    y.write(output_fname)
def get_features(path, fr=None, to=None):
    """Load an audio file and return its STFT magnitude as a network input.

    Column 0 of the wave read from ``path`` is sliced to ``[fr:to]``,
    transformed with the module-level ``stft``, and the transposed magnitude
    is wrapped as a float32 ``Variable`` with two leading singleton axes.
    The result is moved to the GPU when CUDA is available.

    Args:
        path: Path of the audio file to read.
        fr: Start index of the slice taken from the wave (``None`` = start).
        to: End index of the slice taken from the wave (``None`` = end).

    Returns:
        A ``Variable`` wrapping a 4-D float32 tensor of shape
        ``(1, 1, *magnitude.T.shape)``.
    """
    samples = Wave.read(path)[fr:to, 0]
    magnitude = np.abs(stft.process(samples)).T

    # Prepend two singleton axes in front of the transposed magnitude.
    batch = magnitude[np.newaxis, np.newaxis, :, :].astype(np.float32)
    features = Variable(torch.from_numpy(batch))

    return features.cuda() if torch.cuda.is_available() else features
def process_file(input_fname, l1, l2, l3):
    """Split one audio file into "trans" and "steady" parts.

    The autoencoder initialised from ``ae_4x2_poly.pickle`` is optimised for
    ``n_iterations`` steps on the single input example. Its two output
    channels are then turned into ratio masks that are applied to the input
    spectrogram; the two masked spectrograms are inverted with ``istft`` and
    written next to the input as ``<stem>_trans.wav`` (channel 0) and
    ``<stem>_steady.wav`` (channel 1).

    The objective minimised at every step is::

        MSE(ch0 + ch1, input) + l1 * ||ch0 + ch1||_1 + l2 * T + l3 * S

    where, with ``h`` the energy-normalised squared difference along the
    last axis of a channel and ``v`` the same along its first axis,
    ``T = h(ch0) / (v(ch0) + eps)`` and ``S = v(ch1) / (h(ch1) + eps)``.

    Args:
        input_fname: Path of the audio file to read; also the stem of the
            two output file names.
        l1: Weight of the L1 norm of the summed output channels.
        l2: Weight of the channel-0 ratio term ``T``.
        l3: Weight of the channel-1 ratio term ``S``.
    """
    print(l1, l2, l3)
    w = Wave.read(input_fname)
    X = stft.process(w)

    # The checkpoint path is hard-coded and resolved against the current
    # working directory.
    ae = AutoEncoder()
    ae.load_state_dict(torch.load("ae_4x2_poly.pickle"))
    if torch.cuda.is_available():
        ae.cuda()

    criterion = torch.nn.MSELoss()
    optimizer = optim.Adam(ae.parameters(), weight_decay=0.5)
    v = get_features(X)

    # Neither the target nor the training mode changes between iterations.
    tgt = v[0, 0, :, :]
    ae.train()
    for epoch in range(n_iterations):
        optimizer.zero_grad()
        output = ae(v)
        out1 = output[0, 0, :, :]
        out2 = output[0, 1, :, :]
        mix = out1 + out2
        mse = criterion(mix, tgt)

        # Squared L2 norm of each channel, shared by both of its diff terms.
        energy1 = torch.pow(torch.norm(out1, 2), 2) + eps
        energy2 = torch.pow(torch.norm(out2, 2), 2) + eps

        # NOTE(review): which of the two axes is time and which is frequency
        # depends on get_features -- confirm before renaming these.
        hdiff1 = torch.sum(torch.pow(out1[:, 1:] - out1[:, :-1], 2)) / energy1
        vdiff1 = torch.sum(torch.pow(out1[1:, :] - out1[:-1, :], 2)) / energy1
        hdiff2 = torch.sum(torch.pow(out2[:, 1:] - out2[:, :-1], 2)) / energy2
        vdiff2 = torch.sum(torch.pow(out2[1:, :] - out2[:-1, :], 2)) / energy2

        loss1 = torch.norm(mix, 1)
        tloss = hdiff1 / (vdiff1 + eps)
        sloss = vdiff2 / (hdiff2 + eps)
        print(
            "mse {:>10.10f} loss 1 {:>10.10f} T loss {:>10.10f} S loss {:>10.10f}".format(
                mse.item(), loss1.item(), tloss.item(), sloss.item()
            )
        )

        loss = mse + l1 * loss1 + l2 * tloss + l3 * sloss
        loss.backward()
        optimizer.step()

    # Only the values are needed here; skip building an autograd graph.
    # The module is intentionally left in the mode used during training.
    with torch.no_grad():
        O = ae(v)
    O1 = O[0, 0, :, :].cpu().numpy()
    O2 = O[0, 1, :, :].cpu().numpy()

    # Transpose the network outputs before combining them with X.
    T = O1.T
    S = O2.T
    Smask = S / (S + T + eps)
    Tmask = T / (S + T + eps)

    steady = istft.process(Spectrogram(X * Smask, X.sample_rate))
    trans = istft.process(Spectrogram(X * Tmask, X.sample_rate))

    stem = splitext(input_fname)[0]
    steady.write(stem + "_steady.wav")
    trans.write(stem + "_trans.wav")
import numpy as np
import matplotlib.pyplot as plt

from untwist.data import Wave, RatioMask
from untwist.transforms import STFT, ISTFT
from untwist.factorizations import RPCA

# Demo: decompose a mixture spectrogram with RPCA, build a ratio mask from
# the two components, and resynthesise the masked mixture.

stft = STFT()
istft = ISTFT()
rpca = RPCA(iterations=100)

# Try with vocals over repetitive music background
x = Wave.read("mixture.wav")
X = stft.process(x[:, 0])

# this will take some time
L, S = rpca.process(X.magnitude())
M = RatioMask(S, L)

v = istft.process(X * M)
v.write("vocal_estimate.wav")

# One subplot row per object, top to bottom; each entry carries the keyword
# arguments passed to that object's plot() call.
panels = (
    (X, {"label_x": False, "title": "mixture"}),
    (L, {"label_x": False, "title": "L"}),
    (S, {"label_x": False, "title": "S"}),
    (M, {"title": "estimated mask"}),
)
for row, (panel, plot_kwargs) in enumerate(panels, start=1):
    plt.subplot(4, 1, row)
    panel.plot(**plot_kwargs)
plt.show()
def get_spectrogram(path):
    """Return the transposed STFT magnitude of the mono version of ``path``.

    Args:
        path: Path of the audio file to read.

    Returns:
        ``STFT().process(mono).magnitude().T`` for the mono wave read from
        ``path``.
    """
    mono = Wave.read(path).to_mono()
    spectrum = STFT().process(mono)
    return spectrum.magnitude().T
import numpy as np
import matplotlib.pyplot as plt
import theano

from untwist.data import Wave, Dataset, BinaryMask
from untwist.transforms import STFT, ISTFT
from untwist.neuralnetworks import MLP, SGD

# Set-up for training an MLP on mixture magnitudes with a binary mask built
# from the target and background magnitudes.

floatX = theano.config.floatX
n_bins = 513
train_frames = 10000

# Column 0 of each source; the mixture is their sample-wise sum.
target, background = (
    Wave.read(fname)[:, 0] for fname in ("target.wav", "background.wav")
)
mix = target + background

stft = STFT()
istft = ISTFT()

mlp = MLP(n_bins, n_bins, [n_bins, n_bins])
sgd = SGD(
    mlp,
    learning_rate=0.05,
    momentum=0.2,
    batch_size=200,
    iterations=100,
)

# Spectrograms of the mixture, the target and the background, in that order.
X, T, B = (stft.process(sig) for sig in (mix, target, background))

ideal_mask = BinaryMask(T.magnitude(), B.magnitude())
ds = Dataset(n_bins, floatX, n_bins, np.bool_)

# Magnitudes of the first `train_frames` columns of the mixture, transposed.
Xtrain = X[:, :train_frames].magnitude().T
nc /= nc.max() return nc def novelty_seg(ftr, kernel_size): ftr -= ftr.min() ftr /= ftr.max() n = get_novelty_curve(ftr, kernel_size) print("Finding slices") tp, prop = signal.find_peaks(n, height=np.mean(n) + np.std(n), distance=1) return tp.astype(np.int32) stft_file = sys.argv[1] outPath = sys.argv[2] kernel_size = int(sys.argv[3]) iterations = sys.argv[4] net = ae.AutoEncoder(513, 13) print("performing STFT") x = Wave.read(stft_file).to_mono() X = STFT().process(x).magnitude().T print("done") ae.train_ae(net, X) print("Getting feature vectors") features = ae.get_learnt_features(net, X) print("Computing novelty") boundaries = novelty_seg(features, kernel_size) np.savetxt( os.path.expanduser(outPath + '/' + Path(stft_file).name + '.ae_segs.ds'), boundaries)
t2 = -0.2 t3 = 0.5 n_bins = fft_size/2 + 1 in_fname = str(sys.argv[1]) factor = float(sys.argv[2]) rank = float(sys.argv[3]) if len(sys.argv) > 4: t1 = float(sys.argv[4]) if len(sys.argv) > 5: t2 = float(sys.argv[5]) if len(sys.argv) > 6: t3 = float(sys.argv[6]) out_fname = splitext(in_fname)[0]+"_"+str(factor)+".wav" stft = STFT(signal.hann(window_size, sym = False), fft_size,hop_size) istft = ISTFT(signal.hann(window_size, sym = False), fft_size,hop_size) x = Wave.read(in_fname) if len(x.shape) > 1 and x.shape[1] > 1: x = x[:,0] sr = x.sample_rate X = stft.process(x) Xm = np.abs(X) Xp = np.angle(X) radian_bin_freqs = 2 * np.pi * np.arange(Xm.shape[0]) / fft_size phase_increment = radian_bin_freqs * hop_size phase_lock = True lock_active = True # change for envelope preservation if rank < 1: