Example #1
def audio_descriptor(obj, i, a_dictionary, a_noise):
    """
    Computes the audio descriptor of the i-th sample of object obj.
    :param obj: string. Object from which the descriptor is computed.
    :param i: int. Index of the object's sample from which the descriptor is computed.
    :param a_dictionary: codebook used for the bag-of-features (bof) encoding.
    :param a_noise: float. SNR between signal and additive noise; float('inf') keeps the signal clean.
    :return: list. Concatenation of the RTN, bag-of-features and FFT coefficients.
    """

    filename = globals.path + 'wav/' + obj + '-' + str(i) + '.wav'

    #Reads signal
    (fs, signal) = au.read(filename)

    #Clips signal
    decay = range(10, 110, 10)
    (RTN, signal, I) = au.RTN(signal, fs, 0.01, decay)

    signal = signal[:I[decay.index(60)]]

    #Adds additive noise to the signal when a_noise != float('inf'); a_noise is the SNR between signal and noise
    signal = au.get_noisy_signal(signal, a_noise, noise_type='white')

    signal = au.amplify(signal)

    #Gets RTN descriptor from updated signal
    (RTN, signal, I) = au.RTN(signal, fs, 0.01, decay)

    #audio descriptor
    a_descriptor = []

    #RTN
    a_descriptor.extend(RTN)

    #Calculates MFCC descriptor from signal
    mfcc = mfcc_descriptor(signal, fs)
    b = bof(mfcc, a_dictionary)
    a_descriptor.extend(b)

    #Calculates FFT descriptor from signal
    fftcoefs = au.cft(signal, fs, 0.01, 0.001)

    a_descriptor.extend(fftcoefs)

    return a_descriptor
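A minimal usage sketch, assuming a codebook built elsewhere; the object name, sample index, and SNR values below are hypothetical placeholders, not values from the original code:

# Clean descriptor: infinite SNR disables the additive noise step.
clean = audio_descriptor('mug', 3, a_dictionary, float('inf'))
# Noisy variant: white noise added at an SNR of 10 between signal and noise.
noisy = audio_descriptor('mug', 3, a_dictionary, 10)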
Example #2
import glob

import librosa
import numpy as np
from audio_utils import read

src_path = '/Users/avin/git/vc/datasets/timit/TIMIT/TRAIN/*/*'
# src_path = '/Users/avin/git/vc/datasets/kate/sense_and_sensibility_split'
# src_path = '/Users/avin/git/vc/datasets/arctic/bdl'
# src_path = '/Users/avin/git/vc/datasets/kate/therese_raquin_split'
sr = 16000
n_fft = 512
win_length = 400
hop_length = 80
n_sample = 200

amps = []
log_amps = []
dbs = []
for filepath in glob.glob('{}/*.wav'.format(src_path))[:n_sample]:
    wav = read(filepath, sr, mono=True)
    spec = librosa.stft(wav,
                        n_fft=n_fft,
                        win_length=win_length,
                        hop_length=hop_length)  # (n_fft/2+1, t)
    amp = np.abs(spec)
    amps.extend(amp.flatten())

    log_amp = np.log(amp)
    log_amps.extend(log_amp.flatten())

    db = librosa.amplitude_to_db(amp)
    dbs.extend(db.flatten())

amps = np.array(amps)
log_amps = np.array(log_amps)
dbs = np.array(dbs)
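A short, hypothetical follow-up (not part of the original snippet) showing one way to inspect the three collected distributions:

# Summarize each distribution; np.log(0) yields -inf, so keep finite values only.
for name, values in [('amp', amps), ('log_amp', log_amps), ('db', dbs)]:
    finite = values[np.isfinite(values)]
    print('{}: mean={:.3f}, min={:.3f}, max={:.3f}'.format(
        name, finite.mean(), finite.min(), finite.max()))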
Example #3
import librosa.display
import utils
import numpy as np
import matplotlib.pyplot as plt
from audio_utils import read, write

filename = '/Users/avin/git/vc/datasets/timit/TIMIT/TEST/DR1/FAKS0/SA1.wav'
sr = 22050
n_fft = 4096
len_hop = n_fft // 4  # integer division: librosa expects an int hop length
plot_wav = True
plot_spec = True

# Waveforms
wav = read(filename, sr, mono=True)
# wav = np.where(wav == 0, 1000, wav)
# wav = np.zeros_like(wav)
# wav[0] = np.ones_like(wav[0])

# Spectrogram
spec = librosa.stft(wav, n_fft=n_fft, hop_length=len_hop)

# Plot waveforms
if plot_wav:
    plt.figure(1)

    librosa.display.waveplot(wav, sr=sr, color='b')  # renamed waveshow in librosa >= 0.10
    plt.title('waveform')

    plt.tight_layout()
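The snippet defines plot_spec but never uses it; a hedged completion of the missing spectrogram branch, assuming a dB-scaled specshow plot was intended:

# Plot spectrogram (assumed continuation; plot_spec is otherwise unused above)
if plot_spec:
    plt.figure(2)
    db = librosa.amplitude_to_db(np.abs(spec))
    librosa.display.specshow(db, sr=sr, hop_length=len_hop,
                             x_axis='time', y_axis='linear')
    plt.title('spectrogram')
    plt.tight_layout()

plt.show()  # assumed; the original snippet ends without a show call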
Example #5
def get_descriptors(obj, i, a_dictionary, a_k, a_noise, v_noise, combination):
    """
    Computes audio and visual descriptors of the i-th sample of object obj.
    combination is read as a 6-bit mask selecting which audio descriptors
    (RTN, stacked MFCC, BOF, wavelet, fftIza, fftArt) to stack; v_noise > 0
    adds Gaussian noise with that standard deviation to the RGB image.
    """

    #Audio filename
    filename = globals.path + 'wav/' + obj + '-' + str(i) + '.wav'

    #Reads signal
    (fs, signal) = au.read(filename)

    #Clips signal
    decay = range(10, 110, 10)
    (RTN, signal, I) = au.RTN(signal, fs, 0.01, decay)

    signal = signal[:I[decay.index(60)]]

    signal = au.amplify(signal)

    audio_descriptor = []

    combination = '{:06b}'.format(combination)
    #RTN
    if (int(combination[0])):
        audio_descriptor.extend(RTN)

    #Stacked MFCC
    if (int(combination[1])):
        mfcc = mfcc_descriptor(signal, fs)
        mfcc = np.reshape(mfcc, -1, order='C')
        mfcc = mfcc.tolist()
        m.append(len(mfcc))  # m (like w and cart below) is assumed to be a module-level list recording lengths
        #mfcc.extend((2938 - len(mfcc))*[0])
        audio_descriptor.extend(mfcc[:4056])

    #BOF
    if (int(combination[2])):
        mfcc = mfcc_descriptor(signal, fs)
        #mfcc = np.reshape(mfcc, -1, order='C')
        #mfcc = mfcc.tolist()
        b = bof(mfcc, a_dictionary)
        audio_descriptor.extend(b)

    #wavelet
    if (int(combination[3])):
        wavelet = dwt_coefs(signal, a_noise, 'haar', 6)
        w.append(len(wavelet))
        #wavelet.extend((5050 - len(wavelet))*[0])
        audio_descriptor.extend(wavelet[:5050])

    #fftIza
    if (int(combination[4])):
        coefs = au.fft_iza(signal, fs, 0.01, 0.001)

        # ciza.append(len(coefs))
        # coefs.extend((18225 - len(coefs))*[0])
        audio_descriptor.extend(coefs[:25000])

    #fftArt
    if (int(combination[5])):
        coefs = au.cft(signal, fs, 0.01, 0.001)

        cart.append(len(coefs))

        audio_descriptor.extend(coefs)

    #Video

    depth = cv.LoadImage(
        globals.path + 'img/' + obj + '-' + str(i) + '-depth.png',
        cv.CV_LOAD_IMAGE_GRAYSCALE)
    mask = cv.LoadImage(globals.path + 'mask.png', cv.CV_LOAD_IMAGE_GRAYSCALE)

    if v_noise > 0:
        rgb = cv2.imread(
            globals.path + 'img/' + obj + '-' + str(i) + '-rgb.png',
            cv.CV_LOAD_IMAGE_GRAYSCALE)
        noise = np.random.randn(*rgb.shape) * v_noise

        # Add this noise to image
        noisy = rgb + noise
        cv2.imwrite(globals.path + 'noisy.png', noisy)
        rgb = cv.LoadImage(globals.path + 'noisy.png',
                           cv.CV_LOAD_IMAGE_GRAYSCALE)
        #rgb = noisy

    else:
        rgb = cv.LoadImage(
            globals.path + 'img/' + obj + '-' + str(i) + '-rgb.png',
            cv.CV_LOAD_IMAGE_GRAYSCALE)

    #cv.fromarray(rgb)

    bridge = CvBridge()
    rgb_image = bridge.cv_to_imgmsg(rgb)
    depth_image = bridge.cv_to_imgmsg(depth)
    mask_image = bridge.cv_to_imgmsg(mask)

    rospy.wait_for_service('masked_base_descriptor_service')
    try:
        descriptor_request = rospy.ServiceProxy(
            'masked_base_descriptor_service', masked_service)
        response = descriptor_request(rgb=rgb_image,
                                      depth=depth_image,
                                      mask=mask_image)
        base_descriptor = bridge.imgmsg_to_cv(response.descriptor)

        visual_descriptor = np.array(base_descriptor)

    except rospy.ServiceException as e:
        print("Service call failed: %s" % e)
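The function ends after the service call without a return statement, so the computed audio_descriptor and visual_descriptor are presumably returned or stored by code not shown. A hypothetical call illustrating the 6-bit combination mask (object name, index, and argument values are placeholders):

# Bits select, most significant first: RTN, stacked MFCC, BOF, wavelet, fftIza, fftArt.
# 0b101000 -> RTN and BOF only.
get_descriptors('mug', 3, a_dictionary, a_k, float('inf'), 0, 0b101000)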