def audio_descriptor(obj, i, a_dictionary, a_noise):
    """
    Computes audio descriptor of ith sample of object obj.

    The descriptor is the concatenation, in this order, of the RTN values,
    the bag-of-features vector computed from the MFCCs and the coefficients
    returned by ``au.cft``.

    :param obj: string. Object from which descriptor is computed.
    :param i: int. ith sample of object from which descriptor is computed
    :param a_dictionary: dictionary handed to ``bof`` together with the MFCCs.
    :param a_noise: SNR between signal and noise, forwarded to
        ``au.get_noisy_signal``. The original comment states that no noise is
        added when it equals float('inf'); that check is not visible here and
        is presumably done inside the helper.
    :return: list. RTN values, followed by BOF values, followed by the
        ``au.cft`` coefficients.
    """
    filename = globals.path + 'wav/' + obj + '-' + str(i) + '.wav'

    # Reads signal
    (fs, signal) = au.read(filename)

    # Clips signal: keeps the samples before I[k], k being the position of
    # 60 in decay. An explicit list keeps .index() and the helper's input
    # identical on Python 2 and Python 3.
    decay = list(range(10, 110, 10))
    (RTN, signal, I) = au.RTN(signal, fs, 0.01, decay)
    signal = signal[:I[decay.index(60)]]

    # Adds white noise to signal (a_noise is the SNR), then amplifies it
    signal = au.get_noisy_signal(signal, a_noise, noise_type='white')
    signal = au.amplify(signal)

    # Gets RTN descriptor from updated signal. Uses the same au.RTN helper as
    # the clipping step above; the previous spelling, au.rtn, differed only
    # in case and did not match any other call site.
    (RTN, signal, I) = au.RTN(signal, fs, 0.01, decay)

    # Audio descriptor
    a_descriptor = []

    # RTN
    a_descriptor.extend(RTN)

    # Calculates MFCC descriptor from signal and turns it into bag of features
    mfcc = mfcc_descriptor(signal, fs)
    b = bof(mfcc, a_dictionary)
    a_descriptor.extend(b)

    # Calculates FFT descriptor from signal
    fftcoefs = au.cft(signal, fs, 0.01, 0.001)
    a_descriptor.extend(fftcoefs)

    return a_descriptor
# Collects STFT magnitude statistics (linear, natural log and dB) over the
# first n_sample wav files found under src_path.
src_path = '/Users/avin/git/vc/datasets/timit/TIMIT/TRAIN/*/*'
# src_path = '/Users/avin/git/vc/datasets/kate/sense_and_sensibility_split'
# src_path = '/Users/avin/git/vc/datasets/arctic/bdl'
# src_path = '/Users/avin/git/vc/datasets/kate/therese_raquin_split'

# Analysis settings
sr, n_fft = 16000, 512
win_length, hop_length = 400, 80
n_sample = 200

# Flat accumulators, one value per spectrogram bin
amps, log_amps, dbs = [], [], []

wav_paths = glob.glob('{}/*.wav'.format(src_path))
for filepath in wav_paths[:n_sample]:
    wav = read(filepath, sr, mono=True)

    # (n_fft/2+1, t)
    spec = librosa.stft(wav, n_fft=n_fft, win_length=win_length,
                        hop_length=hop_length)

    # The three views of the magnitude are derived first, then stored
    amp = np.abs(spec)
    log_amp = np.log(amp)
    db = librosa.amplitude_to_db(amp)

    amps.extend(amp.flatten())
    log_amps.extend(log_amp.flatten())
    dbs.extend(db.flatten())

# Only amps and log_amps are converted to arrays here; dbs stays a list
amps = np.array(amps)
log_amps = np.array(log_amps)
import librosa.display
import utils
import numpy as np
import matplotlib.pyplot as plt
from audio_utils import read, write

# Loads one wav file, computes its STFT and plots the waveform.
filename = '/Users/avin/git/vc/datasets/timit/TIMIT/TEST/DR1/FAKS0/SA1.wav'
sr = 22050
n_fft = 4096
# hop_length is a number of samples, so it must be an int: floor division
# gives 1024 on both Python 2 and Python 3 ("/" yields 1024.0 on Python 3).
len_hop = n_fft // 4
plot_wav = True
plot_spec = True

# Waveforms
wav = read(filename, sr, mono=True)
# wav = np.where(wav == 0, 1000, wav)
# wav = np.zeros_like(wav)
# wav[0] = np.ones_like(wav[0])

# Spectrogram
spec = librosa.stft(wav, n_fft=n_fft, hop_length=len_hop)

# Plot waveforms
if plot_wav:
    plt.figure(1)
    # NOTE(review): newer librosa releases replace waveplot with waveshow;
    # confirm against the installed version.
    librosa.display.waveplot(wav, sr=sr, color='b')
    plt.title('waveform')
    plt.tight_layout()
def get_descriptors(obj, i, a_dictionary, a_k, a_noise, v_noise, combination):
    """
    Builds the audio descriptor of the ith sample of object obj and requests
    its visual descriptor from the 'masked_base_descriptor_service' service.

    :param obj: string. Object name, used to build the wav and image paths.
    :param i: int. Sample index, used to build the wav and image paths.
    :param a_dictionary: dictionary handed to ``bof`` together with the MFCCs.
    :param a_k: not used in the visible part of this function.
    :param a_noise: forwarded to ``dwt_coefs`` for the wavelet part.
    :param v_noise: when > 0, gaussian noise scaled by this value is added to
        the grayscale rgb image before it is sent to the service.
    :param combination: int. Its 6-digit binary form selects the audio parts,
        from the leftmost digit: RTN, stacked MFCC, BOF, wavelet,
        ``au.fft_iza`` coefficients, ``au.cft`` coefficients.

    NOTE(review): no return statement is visible in this chunk, so
    ``audio_descriptor`` and ``visual_descriptor`` are built but not handed
    back here; confirm against the complete file.
    """
    sample = obj + '-' + str(i)

    # Audio filename
    filename = globals.path + 'wav/' + sample + '.wav'

    # Reads signal
    (fs, signal) = au.read(filename)

    # Clips signal: keeps the samples before I[k], k being the position of
    # 60 in decay, then amplifies it
    decay = range(10, 110, 10)
    (RTN, signal, I) = au.RTN(signal, fs, 0.01, decay)
    signal = signal[:I[decay.index(60)]]
    signal = au.amplify(signal)

    audio_descriptor = []
    # One character per audio part, '1' meaning "include it"
    flags = '{:06b}'.format(combination)

    # RTN
    if int(flags[0]):
        audio_descriptor.extend(RTN)

    # Stacked MFCC: flattened in C order and cut to at most 4056 values
    if int(flags[1]):
        mfcc = mfcc_descriptor(signal, fs)
        mfcc = np.reshape(mfcc, -1, order='C')
        mfcc = mfcc.tolist()
        # m is a list defined outside this function; it records the length
        m.append(len(mfcc))
        audio_descriptor.extend(mfcc[:4056])

    # BOF
    if int(flags[2]):
        mfcc = mfcc_descriptor(signal, fs)
        b = bof(mfcc, a_dictionary)
        audio_descriptor.extend(b)

    # Wavelet: cut to at most 5050 values
    if int(flags[3]):
        wavelet = dwt_coefs(signal, a_noise, 'haar', 6)
        # w is a list defined outside this function; it records the length
        w.append(len(wavelet))
        audio_descriptor.extend(wavelet[:5050])

    # fftIza: cut to at most 25000 values
    if int(flags[4]):
        coefs = au.fft_iza(signal, fs, 0.01, 0.001)
        audio_descriptor.extend(coefs[:25000])

    # fftArt
    if int(flags[5]):
        coefs = au.cft(signal, fs, 0.01, 0.001)
        # cart is a list defined outside this function; it records the length
        cart.append(len(coefs))
        audio_descriptor.extend(coefs)

    # Video
    image_prefix = globals.path + 'img/' + sample
    depth = cv.LoadImage(image_prefix + '-depth.png',
                         cv.CV_LOAD_IMAGE_GRAYSCALE)
    mask = cv.LoadImage(globals.path + 'mask.png', cv.CV_LOAD_IMAGE_GRAYSCALE)

    if v_noise > 0:
        rgb = cv2.imread(image_prefix + '-rgb.png',
                         cv.CV_LOAD_IMAGE_GRAYSCALE)
        noise = np.random.randn(*rgb.shape) * v_noise
        # Add this noise to image
        noisy = rgb + noise
        # The noisy image goes through a file so that it is reloaded with
        # cv.LoadImage, like the images of the other branch
        cv2.imwrite(globals.path + 'noisy.png', noisy)
        rgb = cv.LoadImage(globals.path + 'noisy.png',
                           cv.CV_LOAD_IMAGE_GRAYSCALE)
    else:
        rgb = cv.LoadImage(image_prefix + '-rgb.png',
                           cv.CV_LOAD_IMAGE_GRAYSCALE)

    # Converts the three images to messages for the service request
    bridge = CvBridge()
    rgb_image = bridge.cv_to_imgmsg(rgb)
    depth_image = bridge.cv_to_imgmsg(depth)
    mask_image = bridge.cv_to_imgmsg(mask)

    rospy.wait_for_service('masked_base_descriptor_service')
    try:
        descriptor_request = rospy.ServiceProxy(
            'masked_base_descriptor_service', masked_service)
        response = descriptor_request(rgb=rgb_image, depth=depth_image,
                                      mask=mask_image)
        base_descriptor = bridge.imgmsg_to_cv(response.descriptor)
        visual_descriptor = np.array(base_descriptor)
    except rospy.ServiceException as e:
        # "as" and the parenthesised print are valid on Python 2.6+ and 3.
        # NOTE(review): visual_descriptor is left unbound on this path.
        print("Service call failed: %s" % e)
# Collects STFT magnitude statistics (linear, natural log and dB) over the
# first n_sample wav files found under src_path. Results are three flat 1-D
# arrays: amps, log_amps and dbs.
src_path = '/Users/avin/git/vc/datasets/timit/TIMIT/TRAIN/*/*'
# src_path = '/Users/avin/git/vc/datasets/kate/sense_and_sensibility_split'
# src_path = '/Users/avin/git/vc/datasets/arctic/bdl'
# src_path = '/Users/avin/git/vc/datasets/kate/therese_raquin_split'

sr = 16000
n_fft = 512
win_length = 400
hop_length = 80
n_sample = 200

# One flattened array per file; joined once after the loop instead of
# growing Python lists with one boxed scalar per spectrogram bin.
amp_chunks = []
log_amp_chunks = []
db_chunks = []

for filepath in glob.glob('{}/*.wav'.format(src_path))[:n_sample]:
    wav = read(filepath, sr, mono=True)
    spec = librosa.stft(wav, n_fft=n_fft, win_length=win_length,
                        hop_length=hop_length)  # (n_fft/2+1, t)

    amp = np.abs(spec)
    amp_chunks.append(amp.flatten())

    # NOTE: np.log gives -inf for bins whose magnitude is exactly zero
    log_amp = np.log(amp)
    log_amp_chunks.append(log_amp.flatten())

    db = librosa.amplitude_to_db(amp)
    db_chunks.append(db.flatten())

# np.concatenate rejects an empty sequence, so the no-file case falls back
# to the empty array that np.array([]) produced before.
amps = np.concatenate(amp_chunks) if amp_chunks else np.array([])
log_amps = np.concatenate(log_amp_chunks) if log_amp_chunks else np.array([])
dbs = np.concatenate(db_chunks) if db_chunks else np.array([])