def spectral_loss(expected, actual, mag_weight=1.0, phase_weight=1.0):
    """Weighted sum of STFT magnitude and phase errors between two signals.

    Both inputs are transposed with permutation ``[0, 2, 1]`` so that the
    samples sit on the last axis, jittered with low-amplitude Gaussian noise
    and transformed with a Hamming-windowed STFT (4096-sample frames,
    2048-sample hop, padded at the end).

    Args:
        expected: Reference signal, a rank-3 tensor (assumed to be
            ``[batch, samples, channels]`` -- TODO confirm against callers).
        actual: Signal being scored, same layout as ``expected``.
        mag_weight: Multiplier applied to the mean absolute magnitude error.
        phase_weight: Multiplier applied to the mean ``1 - cos`` phase error.

    Returns:
        The list ``[loss, expected_stft, actual_stft]``. A NaN loss is
        replaced by ``0.``.
    """
    sample_last = [0, 2, 1]
    # Place the samples in the last dimension as required by stft
    ref = tf.transpose(expected, sample_last)
    est = tf.transpose(actual, sample_last)

    # TODO: Tunable params here (window size, window stride, window type)
    # NOTE(review): the jitter presumably keeps bins from being exactly zero
    # (where the phase is ill-defined) -- confirm the intent.
    ref_jitter = tf.random_normal(
        shape=tf.shape(ref), mean=0.0, stddev=0.00001, dtype=tf.float32)
    est_jitter = tf.random_normal(
        shape=tf.shape(est), mean=0.0, stddev=0.00001, dtype=tf.float32)

    ref_stft = stft(ref + ref_jitter, 4096, 2048,
                    window_fn=hamming_window, pad_end=True)
    est_stft = stft(est + est_jitter, 4096, 2048,
                    window_fn=hamming_window, pad_end=True)

    ref_magnitude = tf.abs(ref_stft)
    ref_phase = tf.angle(ref_stft)
    est_magnitude = tf.abs(est_stft)
    est_phase = tf.angle(est_stft)

    magnitude_error = tf.reduce_mean(tf.abs(ref_magnitude - est_magnitude))

    # Cosine-similarity. Also consider replacing tf.cos with 1-tf.sin
    phase_distance = 1.0 - tf.cos(tf.abs(est_phase - ref_phase))
    phase_error = tf.reduce_mean(phase_distance)

    total = mag_weight * magnitude_error + phase_weight * phase_error
    # Swap a NaN loss for zero so a single bad batch does not poison training.
    total = tf.where(tf.is_nan(total), 0., total)
    return [total, ref_stft, est_stft]
def phasor_loss(expected, actual):
    """Mean squared magnitude of the difference between two complex STFTs.

    Args:
        expected: Reference signal, a rank-3 tensor that is transposed with
            permutation ``[0, 2, 1]`` before the transform.
        actual: Signal being scored, same layout as ``expected``.

    Returns:
        Scalar tensor: ``mean(|STFT(expected) - STFT(actual)|^2)``.
    """
    # Place the samples in the last dimension as required by stft
    ref = tf.transpose(expected, [0, 2, 1])
    est = tf.transpose(actual, [0, 2, 1])

    ref_stft = stft(ref, 4096, 2048, window_fn=hamming_window, pad_end=True)
    est_stft = stft(est, 4096, 2048, window_fn=hamming_window, pad_end=True)

    phasor_distance = tf.abs(ref_stft - est_stft)
    return tf.reduce_mean(tf.square(phasor_distance))
def spectralLoss(expected, actual, mag_weight=1.0, phase_weight=1.0):
    """Weighted sum of STFT magnitude and phase errors between two signals.

    Args:
        expected: Reference signal, a rank-3 tensor that is transposed with
            permutation ``[0, 2, 1]`` before the transform.
        actual: Signal being scored, same layout as ``expected``.
        mag_weight: Multiplier applied to the mean absolute magnitude error.
        phase_weight: Multiplier applied to the mean ``1 - cos`` phase error.

    Returns:
        Scalar tensor ``mag_weight * mag_err + phase_weight * ph_err``.
    """
    sample_last = [0, 2, 1]
    # Place the samples in the last dimension as required by stft
    ref = tf.transpose(expected, sample_last)
    est = tf.transpose(actual, sample_last)

    # TODO: Tunable params here (window size, window stride, window type)
    ref_stft = stft(ref, 4096, 2048, window_fn=hamming_window, pad_end=True)
    est_stft = stft(est, 4096, 2048, window_fn=hamming_window, pad_end=True)

    ref_magnitude = tf.abs(ref_stft)
    ref_phase = tf.angle(ref_stft)
    est_magnitude = tf.abs(est_stft)
    est_phase = tf.angle(est_stft)

    magnitude_error = tf.reduce_mean(tf.abs(ref_magnitude - est_magnitude))

    # Cosine-similarity. Also consider replacing tf.cos with 1-tf.sin
    phase_distance = 1.0 - tf.cos(tf.abs(est_phase - ref_phase))
    phase_error = tf.reduce_mean(phase_distance)

    return mag_weight * magnitude_error + phase_weight * phase_error
def phasor_loss(expected, actual):
    """Mean squared distance between magnitude-normalised STFT phasors.

    Each complex STFT bin ``z`` is rescaled to ``z / (|z| + eps) ** pwr``
    before the two spectra are compared. ``pwr`` interpolates between the
    raw phasors (``pwr = 0``) and unit-magnitude, phase-only phasors
    (``pwr = 1``).

    Fixes relative to the previous revision:
      * the normalised ``actual`` phasor was computed from the ``expected``
        spectrum (copy-paste slip);
      * the returned error ignored the normalised phasors altogether and
        compared the raw spectra;
      * the complex spectrum was divided by a real tensor, which TensorFlow
        rejects because both operands of a division must share a dtype.

    Args:
        expected: Reference signal, a rank-3 tensor that is transposed with
            permutation ``[0, 2, 1]`` before the transform.
        actual: Signal being scored, same layout as ``expected``.

    Returns:
        Scalar tensor holding the mean squared magnitude of the difference
        between the normalised phasors.
    """
    eps = 0.0000001  # keeps the denominator away from zero on silent bins
    pwr = 0.5  # hyperparameter on 0..1. 1 = phase ONLY; 0 = unnormalized phasors

    # Place the samples in the last dimension as required by stft
    exp = tf.transpose(expected, [0, 2, 1])
    act = tf.transpose(actual, [0, 2, 1])

    estft = stft(exp, 4096, 2048, window_fn=hamming_window, pad_end=True)
    astft = stft(act, 4096, 2048, window_fn=hamming_window, pad_end=True)

    # Real-valued normalisers |z|^pwr ...
    esn = tf.pow(tf.abs(estft) + eps, pwr)
    asn = tf.pow(tf.abs(astft) + eps, pwr)

    # ... cast to the spectrum's complex dtype so the division type-checks.
    esph = tf.divide(estft, tf.cast(esn, estft.dtype))
    asph = tf.divide(astft, tf.cast(asn, astft.dtype))

    mag_err = tf.reduce_mean(tf.square(tf.abs(esph - asph)))
    return mag_err
def tf_stft(x):
    """Return the STFT of ``x`` as real/imaginary parts stacked on a new last axis.

    The window type and all framing parameters (``frame_length``,
    ``frame_step``, ``fft_length``, ``pad_end``) are read from the enclosing
    scope rather than passed in.
    """
    spectrum = stft(
        x,
        frame_length=frame_length,
        frame_step=frame_step,
        fft_length=fft_length,
        window_fn=choose_window_fn(window_fn_type),
        pad_end=pad_end,
    )
    components = [tf.real(spectrum), tf.imag(spectrum)]
    return tf.stack(components, -1)
def _build_stft_feature(self):
    """Populate the STFT and spectrogram entries of ``self._features``.

    Each entry is only computed when its key is still missing. The STFT is
    taken over the waveform preceded by ``self._frame_length`` rows of zeros;
    the spectrogram is the magnitude of the partitioned STFT, truncated to
    the first ``self._F`` entries of its third axis.
    """
    stft_key = self.stft_name
    spectrogram_key = self.spectrogram_name

    if stft_key not in self._features:

        def periodic_hann(frame_length, dtype):
            return hann_window(frame_length, periodic=True, dtype=dtype)

        # pad input with a frame of zeros
        leading_zeros = tf.zeros((self._frame_length, self._n_channels))
        padded_waveform = tf.concat(
            [leading_zeros, self._features['waveform']], 0)

        spectrum = stft(
            tf.transpose(padded_waveform),
            self._frame_length,
            self._frame_step,
            window_fn=periodic_hann,
            pad_end=True)
        # Move the leading axis of the STFT output to the end.
        self._features[f'{self._mix_name}_stft'] = tf.transpose(
            spectrum, perm=[1, 2, 0])

    if spectrogram_key not in self._features:
        segments = pad_and_partition(self._features[stft_key], self._T)
        self._features[spectrogram_key] = tf.abs(segments)[:, :, :self._F, :]
def compute_spectrogram_tf(waveform,
                           frame_length=2048,
                           frame_step=512,
                           spec_exponent=1.,
                           window_exponent=1.):
    """ Compute a magnitude / power spectrogram from a waveform tensor.

    :param waveform: Input waveform as (times x number of channels) tensor.
    :param frame_length: Length of a STFT frame to use.
    :param frame_step: HOP between successive frames.
    :param spec_exponent: Exponent of the spectrogram (usually 1 for
        magnitude spectrogram, or 2 for power spectrogram).
    :param window_exponent: Exponent applied to the Hann windowing function
        (may be useful for making perfect STFT/iSTFT reconstruction).
    :returns: Computed magnitude / power spectrogram as a
        (T x F x n_channels) tensor.
    """

    def exponentiated_hann(length, dtype):
        # The dtype requested by stft is deliberately ignored: the window is
        # always built in the waveform's own dtype.
        window = hann_window(length, periodic=True, dtype=waveform.dtype)
        return window**window_exponent

    spectrum = stft(
        tf.transpose(waveform),
        frame_length,
        frame_step,
        window_fn=exponentiated_hann)
    # Move the leading (channel) axis of the STFT output to the end.
    spectrum = tf.transpose(spectrum, perm=[1, 2, 0])
    return tf.abs(spectrum)**spec_exponent
def body(i, wave_list, decomp):
    """Loop body: transform entry ``i`` of ``wave_list`` and store it in ``decomp``.

    ``frame_length``, ``frame_step`` and ``fft_length`` come from the
    enclosing scope. Returns the incremented index together with the
    untouched ``wave_list`` and the updated ``decomp``.
    """
    current_wave = tf.transpose(wave_list.read(i))
    spectrum = stft(current_wave, frame_length, frame_step, fft_length)

    # Squeeze the transform to get rid of the channel dimension,
    # and transpose it, so that each frame is a vector
    frames = tf.transpose(tf.squeeze(spectrum))

    updated_decomp = decomp.write(i, frames)
    return i + 1, wave_list, updated_decomp
def get_log_spectrogram(wav):
    """Log-magnitude STFT of ``wav`` with an extra axis inserted at index 3.

    Uses 256-sample frames with a 128-sample hop.
    NOTE(review): earlier comments described these as 25 ms / 10 ms at
    16 kHz, which would be 400 / 160 samples -- confirm which is intended.
    """
    window_samples = 256
    hop_samples = 128

    magnitudes = tf.abs(signal.stft(wav, window_samples, hop_samples))
    # Small offset keeps the logarithm finite on empty bins.
    log_magnitudes = tf.log(magnitudes + 1e-6)
    return tf.expand_dims(log_magnitudes, axis=3)
def tf_melspectrogram(x):
    """Project the exponentiated STFT magnitude of ``x`` onto a mel filterbank.

    All STFT and filterbank settings (``frame_length``, ``power``,
    ``num_mel_bins``, ``sample_rate``, ...) are read from the enclosing scope.
    """
    window = choose_window_fn(window_fn_type)
    spectrum = stft(
        x,
        frame_length=frame_length,
        frame_step=frame_step,
        fft_length=fft_length,
        window_fn=window,
        pad_end=pad_end,
    )
    magnitudes = tf.abs(spectrum)**power

    mel_matrix = linear_to_mel_weight_matrix(
        num_mel_bins=num_mel_bins,
        num_spectrogram_bins=num_spectrogram_bins,
        sample_rate=sample_rate,
        lower_edge_hertz=lower_edge_hertz,
        upper_edge_hertz=upper_edge_hertz)

    mel = tf.tensordot(magnitudes, mel_matrix, 1)
    # Re-attach the static shape with the last axis set to the mel-bin count.
    mel.set_shape(magnitudes.shape[:-1].concatenate(num_mel_bins))
    return mel
def preprocess(x):
    """Turn a waveform batch into stacked log-amplitude / phase features.

    The STFT uses 400-sample frames with a 160-sample hop (25 ms / 10 ms
    if the audio is sampled at 16 kHz). Returns a float32 tensor whose
    axis 3 holds ``[log1p(|STFT|), angle(STFT) / pi]``.
    """
    spectrum = signal.stft(x, 400, 160)

    # specgram is a complex tensor, so split it into abs and phase parts:
    scaled_phase = tf.angle(spectrum) / np.pi
    # log(1 + abs) is a default transformation for energy units
    log_amplitude = tf.log1p(tf.abs(spectrum))

    # shape is [bs, time, freq_bins, 2]
    stacked = tf.stack([log_amplitude, scaled_phase], axis=3)
    return tf.to_float(stacked)
def _build_stft_feature(self):
    """Compute the STFT and spectrogram features of the stored waveform.

    Writes ``'<mix_name>_stft'`` and ``'<mix_name>_spectrogram'`` into
    ``self._features``. The spectrogram is the magnitude of the partitioned
    STFT, truncated to the first ``self._F`` entries of its third axis.
    """

    def periodic_hann(frame_length, dtype):
        return hann_window(frame_length, periodic=True, dtype=dtype)

    spectrum = stft(
        tf.transpose(self._features['waveform']),
        self._frame_length,
        self._frame_step,
        window_fn=periodic_hann,
        pad_end=True)
    # Move the leading axis of the STFT output to the end.
    stft_feature = tf.transpose(spectrum, perm=[1, 2, 0])
    self._features[f'{self._mix_name}_stft'] = stft_feature

    segments = pad_and_partition(stft_feature, self._T)
    self._features[f'{self._mix_name}_spectrogram'] = (
        tf.abs(segments)[:, :, :self._F, :])
def compute_stfts(tensors, hparams):
    """Run an STFT over ``tensors`` using millisecond settings from ``hparams``.

    ``hparams.frame_length_msec`` and ``hparams.frame_step_msec`` are
    converted to whole sample counts (truncated) using
    ``hparams.sample_rate``.
    """
    samples_per_msec = hparams.sample_rate / 1000
    frame_length_samples = int(samples_per_msec * hparams.frame_length_msec)
    frame_step_samples = int(samples_per_msec * hparams.frame_step_msec)

    return signal.stft(
        signals=tensors,
        frame_length=frame_length_samples,
        frame_step=frame_step_samples,
    )
def get_spectrogram(wav):
    """Turn a waveform batch into stacked log-amplitude / phase features.

    Uses 256-sample frames with a 128-sample hop.
    NOTE(review): earlier comments described these as 25 ms / 10 ms at
    16 kHz, which would be 400 / 160 samples -- confirm which is intended.

    Returns a float32 tensor whose axis 3 holds
    ``[log1p(|STFT|), angle(STFT) / pi]``.
    """
    window_samples = 256
    hop_samples = 128
    spectrum = signal.stft(wav, window_samples, hop_samples)

    # log(1 + abs) is a default transformation for energy units
    log_amplitude = tf.log1p(tf.abs(spectrum))
    # specgram is a complex tensor, so split it into abs and phase parts:
    scaled_phase = tf.angle(spectrum) / np.pi

    # shape is [bs, time, freq_bins, 2]
    stacked = tf.stack([log_amplitude, scaled_phase], axis=3)
    # we want to have float32, not float64
    return tf.to_float(stacked)
def _build_stft_feature(self):
    """ Compute the STFT of the waveform and slice it into segments of the
    length expected by the network.

    Stores the STFT under ``'<mix_name>_stft'`` and its partitioned
    magnitude (first ``self._F`` entries of the third axis) under
    ``'<mix_name>_spectrogram'`` in ``self._features``.
    """
    hann = lambda frame_length, dtype: hann_window(  # noqa: E731
        frame_length, periodic=True, dtype=dtype)

    raw_stft = stft(
        tf.transpose(self._features['waveform']),
        self._frame_length,
        self._frame_step,
        window_fn=hann,
        pad_end=True)
    # Move the leading axis of the STFT output to the end.
    stft_feature = tf.transpose(raw_stft, perm=[1, 2, 0])

    stft_key = f'{self._mix_name}_stft'
    self._features[stft_key] = stft_feature

    spectrogram_key = f'{self._mix_name}_spectrogram'
    partitioned = pad_and_partition(stft_feature, self._T)
    self._features[spectrogram_key] = tf.abs(partitioned)[:, :, :self._F, :]
def mag_spectrogram(frames, fft_length=1024, fft_step=512, name=None):
    """Extract magnitude spectrograms from a batch of audio signals.

    Args:
      frames: A `Tensor` of shape `[frames, samples]`.
      fft_length: An integer scalar `Tensor`. The window length in samples
        (passed to the STFT as its frame length).
      fft_step: An integer scalar `Tensor`. The number of samples to step.
      name: `string`, name of the operation.

    Returns:
      A `Tensor` with shape `[frames, spectrogram_bins]`.
    """
    with tf.name_scope(name, "mag_spectrogram"):
        complex_spectrum = contrib_signal.stft(frames, fft_length, fft_step)
        return tf.abs(complex_spectrum)
def preprocess(x):
    """Turn a waveform batch into stacked log-amplitude / phase features.

    The STFT uses 400-sample frames with a 160-sample hop (25 ms / 10 ms
    if the audio is sampled at 16 kHz). Returns a float32 tensor whose
    axis 3 holds ``[log1p(|STFT|), angle(STFT) / pi]``.

    A commented-out MFCC / delta-MFCC pipeline used to follow the feature
    stacking; it was never executed and has been dropped.
    """
    window_samples = 400
    hop_samples = 160
    spectrum = signal.stft(x, window_samples, hop_samples)

    # specgram is a complex tensor, so split it into abs and phase parts:
    scaled_phase = tf.angle(spectrum) / np.pi
    # log(1 + abs) is a default transformation for energy units
    log_amplitude = tf.log1p(tf.abs(spectrum))

    # shape is [bs, time, freq_bins, 2]
    features = tf.stack([log_amplitude, scaled_phase], axis=3)
    features = tf.to_float(features)
    return features
def compute_logmel_spectrograms(audio, sample_rate, frame_length_seconds,
                                frame_step_seconds):
    """Computes the log-mel spectrograms of a batch of audio clips

    Parameters
    ----------
    audio : a two dimensional tensor of audio samples of shape
        (num_samples, num_signals).
        NOTE(review): the returned shape suggests the STFT is taken along
        the last axis, i.e. (num_signals, num_samples) -- confirm with
        callers.
    sample_rate : the sample rate of the audio signals in Hz
    frame_length_seconds : the width of the STFT, in seconds
    frame_step_seconds : the number of seconds the STFTs are shifted from
        each other

    Returns
    -------
    A tensor of spectrograms of shape (num_signals, time_units, mel_bins)
    and dtype tf.float32
    """
    # Mel filterbank layout and the offset that keeps the log finite.
    lower_edge_hertz, upper_edge_hertz, num_mel_bins = 80.0, 7600.0, 40
    log_offset = 1e-6

    # Convert time parameters to (truncated) sample counts.
    frame_length_samples = int(frame_length_seconds * sample_rate)
    frame_step_samples = int(frame_step_seconds * sample_rate)

    # Magnitude of the Short Time Fourier Transform; the FFT size equals
    # the frame length.
    complex_spectrum = contrib_signal.stft(
        audio,
        frame_length=frame_length_samples,
        frame_step=frame_step_samples,
        fft_length=frame_length_samples)
    magnitudes = tf.abs(complex_spectrum)

    # Warp the linear scale, magnitude spectrograms into the mel-scale.
    num_spectrogram_bins = magnitudes.shape[-1].value
    mel_matrix = tf.contrib.signal.linear_to_mel_weight_matrix(
        num_mel_bins, num_spectrogram_bins, sample_rate, lower_edge_hertz,
        upper_edge_hertz)
    mel = tf.tensordot(magnitudes, mel_matrix, 1)
    # tensordot drops the static shape; restore it with the mel-bin count.
    mel.set_shape(magnitudes.shape[:-1].concatenate(mel_matrix.shape[-1:]))

    # Compress the mel spectrogram magnitudes.
    return tf.log(mel + log_offset)
def get_mel_spectrogram(wav):
    """Log-mel spectrogram of ``wav`` with an extra axis inserted at index 3.

    The STFT uses 256-sample frames with a 128-sample hop (16 ms / 8 ms at
    the 16 kHz sample rate used for the filterbank). Magnitudes are mapped
    onto 80 mel bins spanning 80 Hz to 7600 Hz and log-compressed with a
    1e-6 offset.
    """
    window_samples = 256
    hop_samples = 128
    complex_spectrum = signal.stft(wav, window_samples, hop_samples)
    magnitudes = tf.abs(complex_spectrum)

    sample_rate = 16000
    num_spectrogram_bins = complex_spectrum.shape[-1].value
    lower_edge_hertz, upper_edge_hertz, num_mel_bins = 80.0, 7600.0, 80
    mel_matrix = tf.contrib.signal.linear_to_mel_weight_matrix(
        num_mel_bins, num_spectrogram_bins, sample_rate, lower_edge_hertz,
        upper_edge_hertz)

    mel = tf.tensordot(magnitudes, mel_matrix, 1)
    # tensordot drops the static shape; restore it with the mel-bin count.
    mel.set_shape(magnitudes.shape[:-1].concatenate(mel_matrix.shape[-1:]))

    log_mel = tf.log(mel + 1e-6)
    return tf.expand_dims(log_mel, axis=3)
def power(wav):
    """Return the squared STFT magnitude of ``wav``.

    ``win_length`` and ``hop_length`` are read from the enclosing scope.
    """
    magnitudes = tf.abs(stft(wav, win_length, hop_length))
    return tf.square(magnitudes)
# Load the input audio at 44.1 kHz and report its dtype and peak amplitude.
audio_loader = get_default_audio_adapter()
sample_rate = 44100
waveform, _ = audio_loader.load(filename, sample_rate=sample_rate)
print(waveform.dtype)
print("max amplitude: {}".format(np.max(np.abs(waveform))))

# compute spectrogram
# The framing parameters are taken from the separator's own configuration.
print("compute stft")
frame_length = separator._params['frame_length']
frame_step = separator._params['frame_step']

# Build the STFT / spectrogram ops inside the predictor's graph so they can
# be evaluated with the predictor's session.
# NOTE(review): the original formatting was lost; the extent of this `with`
# block is reconstructed -- confirm against the upstream script.
with predictor.graph.as_default():
    # STFT over the transposed waveform with a periodic Hann window; the
    # leading axis of the result is then moved to the end.
    stft_feature = tf.transpose(
        stft(tf.transpose(waveform),
             frame_length,
             frame_step,
             window_fn=lambda frame_length, dtype:
             (hann_window(frame_length, periodic=True, dtype=dtype)),
             pad_end=True),
        perm=[1, 2, 0])
    T = separator._params['T']
    F = separator._params['F']
    # Magnitude of the partitioned STFT, truncated to the first F entries of
    # the third axis.
    spectrogram = tf.abs(pad_and_partition(stft_feature, T))[:, :, :F, :]
    # Evaluate both tensors (two separate session runs).
    stft_np = predictor.session.run(stft_feature)
    spectrogram_np = predictor.session.run(spectrogram)
    print("yes stft")

# compute perturbation
with predictor.graph.as_default():
    print("build graph")
def model_handler(features, labels, mode, params, config):
    """Estimator ``model_fn``: spectrogram features -> ``baseline`` -> spec.

    Args:
        features: Dict holding the waveform batch under ``'wav'`` (and
            ``'sample'``, which is echoed back in PREDICT mode).
        labels: Integer class labels used for the loss and the metric.
        mode: One of ``tf.estimator.ModeKeys``.
        params: Hyper-parameters; ``learning_rate``, ``clip_gradients`` and
            ``num_classes`` are read here, the rest is forwarded to the
            network.
        config: Unused.

    Returns:
        A ``tf.estimator.EstimatorSpec`` matching ``mode``.
    """
    mode_keys = tf.estimator.ModeKeys

    # Im really like to use make_template instead of variable_scopes and re-usage
    extractor = tf.make_template('extractor', baseline, create_scope_now_=True)

    # wav is a waveform signal with shape (16000, )
    wav = features['wav']

    # we want to compute spectograms by means of short time fourier transform:
    # 400-sample frames (16000 samples/s * 0.025 s), 160-sample hop
    # (16000 * 0.010).
    specgram = signal.stft(wav, 400, 160)

    # specgram is a complex tensor, so split it into abs and phase parts:
    phase = tf.angle(specgram) / np.pi
    # log(1 + abs) is a default transformation for energy units
    amp = tf.log1p(tf.abs(specgram))

    # shape is [bs, time, freq_bins, 2]; cast because we want float32,
    # not float64
    x = tf.to_float(tf.stack([amp, phase], axis=3))

    logits = extractor(x, params, mode == mode_keys.TRAIN)

    def mean_cross_entropy():
        # Shared by the TRAIN and EVAL branches.
        per_example = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=labels, logits=logits)
        return tf.reduce_mean(per_example)

    if mode == mode_keys.TRAIN:
        loss = mean_cross_entropy()

        # some lr tuner, you could use move interesting functions
        def learning_rate_decay_fn(learning_rate, global_step):
            return tf.train.exponential_decay(
                learning_rate, global_step, decay_steps=10000, decay_rate=0.99)

        def nesterov_momentum(lr):
            return tf.train.MomentumOptimizer(lr, 0.9, use_nesterov=True)

        train_op = tf.contrib.layers.optimize_loss(
            loss=loss,
            global_step=tf.contrib.framework.get_global_step(),
            learning_rate=params.learning_rate,
            optimizer=nesterov_momentum,
            learning_rate_decay_fn=learning_rate_decay_fn,
            clip_gradients=params.clip_gradients,
            variables=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES))

        specs = {'mode': mode, 'loss': loss, 'train_op': train_op}

    elif mode == mode_keys.EVAL:
        prediction = tf.argmax(logits, axis=-1)
        acc, acc_op = tf.metrics.mean_per_class_accuracy(
            labels, prediction, params.num_classes)
        loss = mean_cross_entropy()

        specs = {
            'mode': mode,
            'loss': loss,
            'eval_metric_ops': {'acc': (acc, acc_op)},
        }

    elif mode == mode_keys.PREDICT:
        predictions = {
            # for probability just take tf.nn.softmax()
            'label': tf.argmax(logits, axis=-1),
            # it's a hack for simplicity
            'sample': features['sample'],
        }
        specs = {'mode': mode, 'predictions': predictions}

    return tf.estimator.EstimatorSpec(**specs)
def __init__( self, architecture, source_seq_len, target_seq_len, rnn_size, # hidden recurrent layer size num_layers, max_gradient_norm, batch_size, learning_rate, learning_rate_decay_factor, summaries_dir, loss_to_use, number_of_actions, one_hot=True, residual_velocities=False, dtype=tf.float32, custom_opt=False, cgru=True, fft=True, window_size=30, step_size=10, window_fun='hann', gaussian_scaling=False): """Create the model. Args: architecture: [basic, tied] whether to tie the decoder and decoder. source_seq_len: lenght of the input sequence. target_seq_len: lenght of the target sequence. rnn_size: number of units in the rnn. num_layers: number of rnns to stack. max_gradient_norm: gradients will be clipped to maximally this norm. batch_size: the size of the batches used during training; the model construction is independent of batch_size, so it can be changed after initialization if this is convenient, e.g., for decoding. learning_rate: learning rate to start with. learning_rate_decay_factor: decay learning rate by this much when needed. summaries_dir: where to log progress for tensorboard. loss_to_use: [supervised, sampling_based]. Whether to use ground truth in each timestep to compute the loss after decoding, or to feed back the prediction from the previous time-step. number_of_actions: number of classes we have. one_hot: whether to use one_hot encoding during train/test (sup models). residual_velocities: whether to use a residual connection that models velocities. dtype: the data type to use to store internal variables. 
""" if fft: assert cgru == True if custom_opt: assert cgru == True self.HUMAN_SIZE = 54 self.input_size = self.HUMAN_SIZE + number_of_actions if one_hot else self.HUMAN_SIZE print("One hot is ", one_hot) print("Input size is %d" % self.input_size) # Summary writers for train and test runs self.train_writer = tf.summary.FileWriter( os.path.normpath(os.path.join(summaries_dir, 'train'))) self.test_writer = tf.summary.FileWriter( os.path.normpath(os.path.join(summaries_dir, 'test'))) self.source_seq_len = source_seq_len self.target_seq_len = target_seq_len self.rnn_size = rnn_size self.batch_size = batch_size self.learning_rate = tf.Variable(float(learning_rate), trainable=False, dtype=dtype) self.learning_rate_decay_op = self.learning_rate.assign( self.learning_rate * learning_rate_decay_factor) self.global_step = tf.Variable(0, trainable=False) # === Transform the inputs === with tf.name_scope("inputs"): enc_in = tf.placeholder( dtype, shape=[None, source_seq_len - 1, self.input_size], name="enc_in") dec_in = tf.placeholder( dtype, shape=[None, target_seq_len, self.input_size], name="dec_in") dec_out = tf.placeholder( dtype, shape=[None, target_seq_len, self.input_size], name="dec_out") self.encoder_inputs = enc_in self.decoder_inputs = dec_in self.decoder_outputs = dec_out enc_in = tf.transpose(enc_in, [1, 0, 2]) dec_in = tf.transpose(dec_in, [1, 0, 2]) dec_out = tf.transpose(dec_out, [1, 0, 2]) enc_in = tf.reshape(enc_in, [-1, self.input_size]) dec_in = tf.reshape(dec_in, [-1, self.input_size]) dec_out = tf.reshape(dec_out, [-1, self.input_size]) enc_in = tf.split(enc_in, source_seq_len - 1, axis=0) dec_in = tf.split(dec_in, target_seq_len, axis=0) dec_out = tf.split(dec_out, target_seq_len, axis=0) if fft: assert cgru == True # if true do centering to avoid boundary problems. 
center = True if center: pad_enc_in = tf.stack(enc_in, axis=-1) pad_amount = 2 * (window_size - step_size) print('padding with', [pad_amount // 2, pad_amount // 2]) # debug_here() pad_enc_in = tf.pad( pad_enc_in, [[0, 0], [0, 0], [pad_amount // 2, pad_amount // 2]], 'REFLECT') else: pad_enc_in = tf.stack(enc_in, axis=-1) # transform input and output. if window_fun == 'hann': w = functools.partial(tf.contrib.signal.hann_window, periodic=True) elif window_fun == 'hamming': w = functools.partial(tf.contrib.signal.hamming_window, periodic=True) elif window_fun == 'None': w = None else: raise ValueError("unknown window function.") fft_enc_in = tfsignal.stft(pad_enc_in, window_size, step_size, window_fn=w) print('fft_enc_in.shape', fft_enc_in.shape) batch_size = tf.shape(fft_enc_in)[0] freq_tensor_shape = fft_enc_in.get_shape().as_list() frames_in = freq_tensor_shape[2] fft_dim_in = freq_tensor_shape[1] * freq_tensor_shape[-1] fft_enc_in = tf.transpose(fft_enc_in, [0, 2, 1, 3]) fft_enc_in = tf.reshape(fft_enc_in, [batch_size, frames_in, fft_dim_in], name='fft_enc_in_reshape') fft_enc_in = tf.unstack(fft_enc_in, axis=1) if center is True: pad_dec_in = tf.stack(dec_in, axis=-1) pad_dec_in = tf.pad( pad_dec_in, [[0, 0], [0, 0], [pad_amount // 2, pad_amount // 2]], 'REFLECT') else: pad_dec_in = tf.stack(dec_in, axis=-1) fft_dec_in = tfsignal.stft(pad_dec_in, window_size, step_size, window_fn=w) print('fft_dec_in.shape', fft_dec_in.shape) batch_size = tf.shape(fft_dec_in)[0] freq_tensor_shape = fft_dec_in.get_shape().as_list() frames_dec = freq_tensor_shape[2] fft_unique_bins_dec = freq_tensor_shape[3] assert self.input_size == freq_tensor_shape[1] fft_dim_out = self.input_size * fft_unique_bins_dec fft_dec_in = tf.transpose(fft_dec_in, [0, 2, 1, 3]) fft_dec_in = tf.reshape(fft_dec_in, [batch_size, frames_dec, fft_dim_out], name='fft_dec_in_reshape') fft_dec_in = tf.unstack(fft_dec_in, axis=1) enc_in = fft_enc_in dec_in = fft_dec_in assert fft_dim_in == fft_dim_out # === 
Create the RNN that will keep the state === print('rnn_size = {0}'.format(rnn_size)) if cgru: if not fft: cell = rnn_cell_extensions.ComplexGatedRecurrentUnit( self.rnn_size) else: # num_proj = self.input_size * (window_size//2+1) cell = rnn_cell_extensions.ComplexGatedRecurrentUnit( self.rnn_size, complex_out=fft, num_proj=fft_dim_in) print(cell.to_string()) else: cell = tf.contrib.rnn.GRUCell(self.rnn_size) if num_layers > 1: cell = tf.contrib.rnn.MultiRNNCell([ tf.contrib.rnn.GRUCell(self.rnn_size) for _ in range(num_layers) ]) # === Add space decoder === if not fft: cell = rnn_cell_extensions.LinearSpaceDecoderWrapper( cell, self.input_size) # Finally, wrap everything in a residual layer if we want to model velocities if residual_velocities: assert fft is False print('using resudial_velocities') cell = rnn_cell_extensions.ResidualWrapper(cell) # Store the outputs here outputs = [] # Define the loss function lf = None if loss_to_use == "sampling_based": def lf(prev, i): # function for sampling_based loss return prev elif loss_to_use == "supervised": pass else: raise (ValueError, "unknown loss: %s" % loss_to_use) # Build the RNN if architecture == "basic": # Basic RNN does not have a loop function in its API, so copying here. with vs.variable_scope("basic_rnn_seq2seq"): _, enc_state = tf.contrib.rnn.static_rnn( cell, enc_in, dtype=tf.float32) # Encoder outputs, self.states = tf.contrib.legacy_seq2seq.rnn_decoder( dec_in, enc_state, cell, loop_function=lf) # Decoder elif architecture == "tied": outputs, self.states = tf.contrib.legacy_seq2seq.tied_rnn_seq2seq( enc_in, dec_in, cell, loop_function=lf) else: raise (ValueError, "Unknown architecture: %s" % architecture) if fft: # compute the inverse fft on the outputs and restore the shape. 
spec_out = tf.reshape(tf.stack(outputs, -1), [ batch_size, self.input_size, fft_unique_bins_dec, len(outputs) ]) spec_out = tf.transpose(spec_out, [0, 1, 3, 2]) if w: iw = tf.contrib.signal.inverse_stft_window_fn( step_size, forward_window_fn=w) else: iw = None outputs = tfsignal.inverse_stft(spec_out, window_size, step_size, window_fn=iw) if center and pad_amount > 0: outputs = outputs[:, :, pad_amount // 2:-pad_amount // 2] outputs.set_shape([None, self.input_size, target_seq_len]) outputs = tf.unstack(outputs, axis=-1, name='result_unstack') self.outputs = outputs with tf.name_scope("loss_angles"): loss_angles = tf.reduce_mean( tf.square(tf.subtract(dec_out, outputs))) self.loss = loss_angles self.loss_summary = tf.summary.scalar('loss/loss', self.loss) # Gradients and SGD update operation for training the model. params = tf.trainable_variables() # Original algorithm. if custom_opt: # Wisdoms modification. opt = RMSpropNatGrad(self.learning_rate, global_step=self.global_step) else: opt = tf.train.GradientDescentOptimizer(self.learning_rate) # Update all the trainable parameters gradients = tf.gradients(self.loss, params) clipped_gradients, norm = tf.clip_by_global_norm( gradients, max_gradient_norm) self.gradient_norms = norm self.updates = opt.apply_gradients(zip(clipped_gradients, params), global_step=self.global_step) # Keep track of the learning rate self.learning_rate_summary = tf.summary.scalar( 'learning_rate/learning_rate', self.learning_rate) # === variables for loss in Euler Angles -- for each action with tf.name_scope("euler_error_walking"): self.walking_err80 = tf.placeholder(tf.float32, name="walking_srnn_seeds_0080") self.walking_err160 = tf.placeholder( tf.float32, name="walking_srnn_seeds_0160") self.walking_err320 = tf.placeholder( tf.float32, name="walking_srnn_seeds_0320") self.walking_err400 = tf.placeholder( tf.float32, name="walking_srnn_seeds_0400") self.walking_err560 = tf.placeholder( tf.float32, name="walking_srnn_seeds_0560") 
self.walking_err1000 = tf.placeholder( tf.float32, name="walking_srnn_seeds_1000") self.walking_err80_summary = tf.summary.scalar( 'euler_error_walking/srnn_seeds_0080', self.walking_err80) self.walking_err160_summary = tf.summary.scalar( 'euler_error_walking/srnn_seeds_0160', self.walking_err160) self.walking_err320_summary = tf.summary.scalar( 'euler_error_walking/srnn_seeds_0320', self.walking_err320) self.walking_err400_summary = tf.summary.scalar( 'euler_error_walking/srnn_seeds_0400', self.walking_err400) self.walking_err560_summary = tf.summary.scalar( 'euler_error_walking/srnn_seeds_0560', self.walking_err560) self.walking_err1000_summary = tf.summary.scalar( 'euler_error_walking/srnn_seeds_1000', self.walking_err1000) with tf.name_scope("euler_error_eating"): self.eating_err80 = tf.placeholder(tf.float32, name="eating_srnn_seeds_0080") self.eating_err160 = tf.placeholder(tf.float32, name="eating_srnn_seeds_0160") self.eating_err320 = tf.placeholder(tf.float32, name="eating_srnn_seeds_0320") self.eating_err400 = tf.placeholder(tf.float32, name="eating_srnn_seeds_0400") self.eating_err560 = tf.placeholder(tf.float32, name="eating_srnn_seeds_0560") self.eating_err1000 = tf.placeholder(tf.float32, name="eating_srnn_seeds_1000") self.eating_err80_summary = tf.summary.scalar( 'euler_error_eating/srnn_seeds_0080', self.eating_err80) self.eating_err160_summary = tf.summary.scalar( 'euler_error_eating/srnn_seeds_0160', self.eating_err160) self.eating_err320_summary = tf.summary.scalar( 'euler_error_eating/srnn_seeds_0320', self.eating_err320) self.eating_err400_summary = tf.summary.scalar( 'euler_error_eating/srnn_seeds_0400', self.eating_err400) self.eating_err560_summary = tf.summary.scalar( 'euler_error_eating/srnn_seeds_0560', self.eating_err560) self.eating_err1000_summary = tf.summary.scalar( 'euler_error_eating/srnn_seeds_1000', self.eating_err1000) with tf.name_scope("euler_error_smoking"): self.smoking_err80 = tf.placeholder(tf.float32, 
name="smoking_srnn_seeds_0080") self.smoking_err160 = tf.placeholder( tf.float32, name="smoking_srnn_seeds_0160") self.smoking_err320 = tf.placeholder( tf.float32, name="smoking_srnn_seeds_0320") self.smoking_err400 = tf.placeholder( tf.float32, name="smoking_srnn_seeds_0400") self.smoking_err560 = tf.placeholder( tf.float32, name="smoking_srnn_seeds_0560") self.smoking_err1000 = tf.placeholder( tf.float32, name="smoking_srnn_seeds_1000") self.smoking_err80_summary = tf.summary.scalar( 'euler_error_smoking/srnn_seeds_0080', self.smoking_err80) self.smoking_err160_summary = tf.summary.scalar( 'euler_error_smoking/srnn_seeds_0160', self.smoking_err160) self.smoking_err320_summary = tf.summary.scalar( 'euler_error_smoking/srnn_seeds_0320', self.smoking_err320) self.smoking_err400_summary = tf.summary.scalar( 'euler_error_smoking/srnn_seeds_0400', self.smoking_err400) self.smoking_err560_summary = tf.summary.scalar( 'euler_error_smoking/srnn_seeds_0560', self.smoking_err560) self.smoking_err1000_summary = tf.summary.scalar( 'euler_error_smoking/srnn_seeds_1000', self.smoking_err1000) with tf.name_scope("euler_error_discussion"): self.discussion_err80 = tf.placeholder( tf.float32, name="discussion_srnn_seeds_0080") self.discussion_err160 = tf.placeholder( tf.float32, name="discussion_srnn_seeds_0160") self.discussion_err320 = tf.placeholder( tf.float32, name="discussion_srnn_seeds_0320") self.discussion_err400 = tf.placeholder( tf.float32, name="discussion_srnn_seeds_0400") self.discussion_err560 = tf.placeholder( tf.float32, name="discussion_srnn_seeds_0560") self.discussion_err1000 = tf.placeholder( tf.float32, name="discussion_srnn_seeds_1000") self.discussion_err80_summary = tf.summary.scalar( 'euler_error_discussion/srnn_seeds_0080', self.discussion_err80) self.discussion_err160_summary = tf.summary.scalar( 'euler_error_discussion/srnn_seeds_0160', self.discussion_err160) self.discussion_err320_summary = tf.summary.scalar( 'euler_error_discussion/srnn_seeds_0320', 
self.discussion_err320) self.discussion_err400_summary = tf.summary.scalar( 'euler_error_discussion/srnn_seeds_0400', self.discussion_err400) self.discussion_err560_summary = tf.summary.scalar( 'euler_error_discussion/srnn_seeds_0560', self.discussion_err560) self.discussion_err1000_summary = tf.summary.scalar( 'euler_error_discussion/srnn_seeds_1000', self.discussion_err1000) with tf.name_scope("euler_error_directions"): self.directions_err80 = tf.placeholder( tf.float32, name="directions_srnn_seeds_0080") self.directions_err160 = tf.placeholder( tf.float32, name="directions_srnn_seeds_0160") self.directions_err320 = tf.placeholder( tf.float32, name="directions_srnn_seeds_0320") self.directions_err400 = tf.placeholder( tf.float32, name="directions_srnn_seeds_0400") self.directions_err560 = tf.placeholder( tf.float32, name="directions_srnn_seeds_0560") self.directions_err1000 = tf.placeholder( tf.float32, name="directions_srnn_seeds_1000") self.directions_err80_summary = tf.summary.scalar( 'euler_error_directions/srnn_seeds_0080', self.directions_err80) self.directions_err160_summary = tf.summary.scalar( 'euler_error_directions/srnn_seeds_0160', self.directions_err160) self.directions_err320_summary = tf.summary.scalar( 'euler_error_directions/srnn_seeds_0320', self.directions_err320) self.directions_err400_summary = tf.summary.scalar( 'euler_error_directions/srnn_seeds_0400', self.directions_err400) self.directions_err560_summary = tf.summary.scalar( 'euler_error_directions/srnn_seeds_0560', self.directions_err560) self.directions_err1000_summary = tf.summary.scalar( 'euler_error_directions/srnn_seeds_1000', self.directions_err1000) with tf.name_scope("euler_error_greeting"): self.greeting_err80 = tf.placeholder( tf.float32, name="greeting_srnn_seeds_0080") self.greeting_err160 = tf.placeholder( tf.float32, name="greeting_srnn_seeds_0160") self.greeting_err320 = tf.placeholder( tf.float32, name="greeting_srnn_seeds_0320") self.greeting_err400 = tf.placeholder( 
tf.float32, name="greeting_srnn_seeds_0400") self.greeting_err560 = tf.placeholder( tf.float32, name="greeting_srnn_seeds_0560") self.greeting_err1000 = tf.placeholder( tf.float32, name="greeting_srnn_seeds_1000") self.greeting_err80_summary = tf.summary.scalar( 'euler_error_greeting/srnn_seeds_0080', self.greeting_err80) self.greeting_err160_summary = tf.summary.scalar( 'euler_error_greeting/srnn_seeds_0160', self.greeting_err160) self.greeting_err320_summary = tf.summary.scalar( 'euler_error_greeting/srnn_seeds_0320', self.greeting_err320) self.greeting_err400_summary = tf.summary.scalar( 'euler_error_greeting/srnn_seeds_0400', self.greeting_err400) self.greeting_err560_summary = tf.summary.scalar( 'euler_error_greeting/srnn_seeds_0560', self.greeting_err560) self.greeting_err1000_summary = tf.summary.scalar( 'euler_error_greeting/srnn_seeds_1000', self.greeting_err1000) with tf.name_scope("euler_error_phoning"): self.phoning_err80 = tf.placeholder(tf.float32, name="phoning_srnn_seeds_0080") self.phoning_err160 = tf.placeholder( tf.float32, name="phoning_srnn_seeds_0160") self.phoning_err320 = tf.placeholder( tf.float32, name="phoning_srnn_seeds_0320") self.phoning_err400 = tf.placeholder( tf.float32, name="phoning_srnn_seeds_0400") self.phoning_err560 = tf.placeholder( tf.float32, name="phoning_srnn_seeds_0560") self.phoning_err1000 = tf.placeholder( tf.float32, name="phoning_srnn_seeds_1000") self.phoning_err80_summary = tf.summary.scalar( 'euler_error_phoning/srnn_seeds_0080', self.phoning_err80) self.phoning_err160_summary = tf.summary.scalar( 'euler_error_phoning/srnn_seeds_0160', self.phoning_err160) self.phoning_err320_summary = tf.summary.scalar( 'euler_error_phoning/srnn_seeds_0320', self.phoning_err320) self.phoning_err400_summary = tf.summary.scalar( 'euler_error_phoning/srnn_seeds_0400', self.phoning_err400) self.phoning_err560_summary = tf.summary.scalar( 'euler_error_phoning/srnn_seeds_0560', self.phoning_err560) self.phoning_err1000_summary = 
tf.summary.scalar( 'euler_error_phoning/srnn_seeds_1000', self.phoning_err1000) with tf.name_scope("euler_error_posing"): self.posing_err80 = tf.placeholder(tf.float32, name="posing_srnn_seeds_0080") self.posing_err160 = tf.placeholder(tf.float32, name="posing_srnn_seeds_0160") self.posing_err320 = tf.placeholder(tf.float32, name="posing_srnn_seeds_0320") self.posing_err400 = tf.placeholder(tf.float32, name="posing_srnn_seeds_0400") self.posing_err560 = tf.placeholder(tf.float32, name="posing_srnn_seeds_0560") self.posing_err1000 = tf.placeholder(tf.float32, name="posing_srnn_seeds_1000") self.posing_err80_summary = tf.summary.scalar( 'euler_error_posing/srnn_seeds_0080', self.posing_err80) self.posing_err160_summary = tf.summary.scalar( 'euler_error_posing/srnn_seeds_0160', self.posing_err160) self.posing_err320_summary = tf.summary.scalar( 'euler_error_posing/srnn_seeds_0320', self.posing_err320) self.posing_err400_summary = tf.summary.scalar( 'euler_error_posing/srnn_seeds_0400', self.posing_err400) self.posing_err560_summary = tf.summary.scalar( 'euler_error_posing/srnn_seeds_0560', self.posing_err560) self.posing_err1000_summary = tf.summary.scalar( 'euler_error_posing/srnn_seeds_1000', self.posing_err1000) with tf.name_scope("euler_error_purchases"): self.purchases_err80 = tf.placeholder( tf.float32, name="purchases_srnn_seeds_0080") self.purchases_err160 = tf.placeholder( tf.float32, name="purchases_srnn_seeds_0160") self.purchases_err320 = tf.placeholder( tf.float32, name="purchases_srnn_seeds_0320") self.purchases_err400 = tf.placeholder( tf.float32, name="purchases_srnn_seeds_0400") self.purchases_err560 = tf.placeholder( tf.float32, name="purchases_srnn_seeds_0560") self.purchases_err1000 = tf.placeholder( tf.float32, name="purchases_srnn_seeds_1000") self.purchases_err80_summary = tf.summary.scalar( 'euler_error_purchases/srnn_seeds_0080', self.purchases_err80) self.purchases_err160_summary = tf.summary.scalar( 'euler_error_purchases/srnn_seeds_0160', 
self.purchases_err160) self.purchases_err320_summary = tf.summary.scalar( 'euler_error_purchases/srnn_seeds_0320', self.purchases_err320) self.purchases_err400_summary = tf.summary.scalar( 'euler_error_purchases/srnn_seeds_0400', self.purchases_err400) self.purchases_err560_summary = tf.summary.scalar( 'euler_error_purchases/srnn_seeds_0560', self.purchases_err560) self.purchases_err1000_summary = tf.summary.scalar( 'euler_error_purchases/srnn_seeds_1000', self.purchases_err1000) with tf.name_scope("euler_error_sitting"): self.sitting_err80 = tf.placeholder(tf.float32, name="sitting_srnn_seeds_0080") self.sitting_err160 = tf.placeholder( tf.float32, name="sitting_srnn_seeds_0160") self.sitting_err320 = tf.placeholder( tf.float32, name="sitting_srnn_seeds_0320") self.sitting_err400 = tf.placeholder( tf.float32, name="sitting_srnn_seeds_0400") self.sitting_err560 = tf.placeholder( tf.float32, name="sitting_srnn_seeds_0560") self.sitting_err1000 = tf.placeholder( tf.float32, name="sitting_srnn_seeds_1000") self.sitting_err80_summary = tf.summary.scalar( 'euler_error_sitting/srnn_seeds_0080', self.sitting_err80) self.sitting_err160_summary = tf.summary.scalar( 'euler_error_sitting/srnn_seeds_0160', self.sitting_err160) self.sitting_err320_summary = tf.summary.scalar( 'euler_error_sitting/srnn_seeds_0320', self.sitting_err320) self.sitting_err400_summary = tf.summary.scalar( 'euler_error_sitting/srnn_seeds_0400', self.sitting_err400) self.sitting_err560_summary = tf.summary.scalar( 'euler_error_sitting/srnn_seeds_0560', self.sitting_err560) self.sitting_err1000_summary = tf.summary.scalar( 'euler_error_sitting/srnn_seeds_1000', self.sitting_err1000) with tf.name_scope("euler_error_sittingdown"): self.sittingdown_err80 = tf.placeholder( tf.float32, name="sittingdown_srnn_seeds_0080") self.sittingdown_err160 = tf.placeholder( tf.float32, name="sittingdown_srnn_seeds_0160") self.sittingdown_err320 = tf.placeholder( tf.float32, name="sittingdown_srnn_seeds_0320") 
self.sittingdown_err400 = tf.placeholder( tf.float32, name="sittingdown_srnn_seeds_0400") self.sittingdown_err560 = tf.placeholder( tf.float32, name="sittingdown_srnn_seeds_0560") self.sittingdown_err1000 = tf.placeholder( tf.float32, name="sittingdown_srnn_seeds_1000") self.sittingdown_err80_summary = tf.summary.scalar( 'euler_error_sittingdown/srnn_seeds_0080', self.sittingdown_err80) self.sittingdown_err160_summary = tf.summary.scalar( 'euler_error_sittingdown/srnn_seeds_0160', self.sittingdown_err160) self.sittingdown_err320_summary = tf.summary.scalar( 'euler_error_sittingdown/srnn_seeds_0320', self.sittingdown_err320) self.sittingdown_err400_summary = tf.summary.scalar( 'euler_error_sittingdown/srnn_seeds_0400', self.sittingdown_err400) self.sittingdown_err560_summary = tf.summary.scalar( 'euler_error_sittingdown/srnn_seeds_0560', self.sittingdown_err560) self.sittingdown_err1000_summary = tf.summary.scalar( 'euler_error_sittingdown/srnn_seeds_1000', self.sittingdown_err1000) with tf.name_scope("euler_error_takingphoto"): self.takingphoto_err80 = tf.placeholder( tf.float32, name="takingphoto_srnn_seeds_0080") self.takingphoto_err160 = tf.placeholder( tf.float32, name="takingphoto_srnn_seeds_0160") self.takingphoto_err320 = tf.placeholder( tf.float32, name="takingphoto_srnn_seeds_0320") self.takingphoto_err400 = tf.placeholder( tf.float32, name="takingphoto_srnn_seeds_0400") self.takingphoto_err560 = tf.placeholder( tf.float32, name="takingphoto_srnn_seeds_0560") self.takingphoto_err1000 = tf.placeholder( tf.float32, name="takingphoto_srnn_seeds_1000") self.takingphoto_err80_summary = tf.summary.scalar( 'euler_error_takingphoto/srnn_seeds_0080', self.takingphoto_err80) self.takingphoto_err160_summary = tf.summary.scalar( 'euler_error_takingphoto/srnn_seeds_0160', self.takingphoto_err160) self.takingphoto_err320_summary = tf.summary.scalar( 'euler_error_takingphoto/srnn_seeds_0320', self.takingphoto_err320) self.takingphoto_err400_summary = tf.summary.scalar( 
'euler_error_takingphoto/srnn_seeds_0400', self.takingphoto_err400) self.takingphoto_err560_summary = tf.summary.scalar( 'euler_error_takingphoto/srnn_seeds_0560', self.takingphoto_err560) self.takingphoto_err1000_summary = tf.summary.scalar( 'euler_error_takingphoto/srnn_seeds_1000', self.takingphoto_err1000) with tf.name_scope("euler_error_waiting"): self.waiting_err80 = tf.placeholder(tf.float32, name="waiting_srnn_seeds_0080") self.waiting_err160 = tf.placeholder( tf.float32, name="waiting_srnn_seeds_0160") self.waiting_err320 = tf.placeholder( tf.float32, name="waiting_srnn_seeds_0320") self.waiting_err400 = tf.placeholder( tf.float32, name="waiting_srnn_seeds_0400") self.waiting_err560 = tf.placeholder( tf.float32, name="waiting_srnn_seeds_0560") self.waiting_err1000 = tf.placeholder( tf.float32, name="waiting_srnn_seeds_1000") self.waiting_err80_summary = tf.summary.scalar( 'euler_error_waiting/srnn_seeds_0080', self.waiting_err80) self.waiting_err160_summary = tf.summary.scalar( 'euler_error_waiting/srnn_seeds_0160', self.waiting_err160) self.waiting_err320_summary = tf.summary.scalar( 'euler_error_waiting/srnn_seeds_0320', self.waiting_err320) self.waiting_err400_summary = tf.summary.scalar( 'euler_error_waiting/srnn_seeds_0400', self.waiting_err400) self.waiting_err560_summary = tf.summary.scalar( 'euler_error_waiting/srnn_seeds_0560', self.waiting_err560) self.waiting_err1000_summary = tf.summary.scalar( 'euler_error_waiting/srnn_seeds_1000', self.waiting_err1000) with tf.name_scope("euler_error_walkingdog"): self.walkingdog_err80 = tf.placeholder( tf.float32, name="walkingdog_srnn_seeds_0080") self.walkingdog_err160 = tf.placeholder( tf.float32, name="walkingdog_srnn_seeds_0160") self.walkingdog_err320 = tf.placeholder( tf.float32, name="walkingdog_srnn_seeds_0320") self.walkingdog_err400 = tf.placeholder( tf.float32, name="walkingdog_srnn_seeds_0400") self.walkingdog_err560 = tf.placeholder( tf.float32, name="walkingdog_srnn_seeds_0560") 
self.walkingdog_err1000 = tf.placeholder( tf.float32, name="walkingdog_srnn_seeds_1000") self.walkingdog_err80_summary = tf.summary.scalar( 'euler_error_walkingdog/srnn_seeds_0080', self.walkingdog_err80) self.walkingdog_err160_summary = tf.summary.scalar( 'euler_error_walkingdog/srnn_seeds_0160', self.walkingdog_err160) self.walkingdog_err320_summary = tf.summary.scalar( 'euler_error_walkingdog/srnn_seeds_0320', self.walkingdog_err320) self.walkingdog_err400_summary = tf.summary.scalar( 'euler_error_walkingdog/srnn_seeds_0400', self.walkingdog_err400) self.walkingdog_err560_summary = tf.summary.scalar( 'euler_error_walkingdog/srnn_seeds_0560', self.walkingdog_err560) self.walkingdog_err1000_summary = tf.summary.scalar( 'euler_error_walkingdog/srnn_seeds_1000', self.walkingdog_err1000) with tf.name_scope("euler_error_walkingtogether"): self.walkingtogether_err80 = tf.placeholder( tf.float32, name="walkingtogether_srnn_seeds_0080") self.walkingtogether_err160 = tf.placeholder( tf.float32, name="walkingtogether_srnn_seeds_0160") self.walkingtogether_err320 = tf.placeholder( tf.float32, name="walkingtogether_srnn_seeds_0320") self.walkingtogether_err400 = tf.placeholder( tf.float32, name="walkingtogether_srnn_seeds_0400") self.walkingtogether_err560 = tf.placeholder( tf.float32, name="walkingtogether_srnn_seeds_0560") self.walkingtogether_err1000 = tf.placeholder( tf.float32, name="walkingtogether_srnn_seeds_1000") self.walkingtogether_err80_summary = tf.summary.scalar( 'euler_error_walkingtogether/srnn_seeds_0080', self.walkingtogether_err80) self.walkingtogether_err160_summary = tf.summary.scalar( 'euler_error_walkingtogether/srnn_seeds_0160', self.walkingtogether_err160) self.walkingtogether_err320_summary = tf.summary.scalar( 'euler_error_walkingtogether/srnn_seeds_0320', self.walkingtogether_err320) self.walkingtogether_err400_summary = tf.summary.scalar( 'euler_error_walkingtogether/srnn_seeds_0400', self.walkingtogether_err400) 
self.walkingtogether_err560_summary = tf.summary.scalar( 'euler_error_walkingtogether/srnn_seeds_0560', self.walkingtogether_err560) self.walkingtogether_err1000_summary = tf.summary.scalar( 'euler_error_walkingtogether/srnn_seeds_1000', self.walkingtogether_err1000) self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=10)
def stft(self):
    """Compute the short-time Fourier transform of ``self.sig``.

    The frame length is ``self.fft_len`` and the frame step is half of that
    (floor division); ``signal.hamming_window`` is passed as the window
    function.

    Returns:
        The value returned by ``signal.stft`` for these arguments.
    """
    frame_length = self.fft_len
    frame_step = frame_length // 2
    return signal.stft(
        self.sig,
        frame_length,
        frame_step,
        window_fn=signal.hamming_window,
    )
def signalProcessBatch(signals, noise_factor=0.1, noise_frac=0.2, window=512,
                       maxamps=1.0, sr=16000, num_mel_bins=64, num_mfccs=13):
    """Run the DSP preprocessing and feature extraction on a batch of signals.

    Returns the MFCCs, log Mel spectrum, ZCR and RMSE.
    Works on a batch of num_files files.

    - Input signals : tensor of shape [num_files, samples]
    - Output : tuple ([num_files, num_windows, num_mfccs],
                      [num_files, num_windows, num_mel_bins],
                      [num_files, num_windows],
                      [num_files, num_windows])

    Args:
        signals: batch of audio signals (see shape above).
        noise_factor: scale applied to the sampled noise before it is added.
        noise_frac: threshold for the per-file noise mask. Noise is added to
            a file when its uniform draw is GREATER than this value.
        window: frame length in samples; also used as the FFT length. The
            hop length is ``window // 4``.
        maxamps: multiplies the added noise and is forwarded to
            ``rms_energy``.
        sr: sample rate handed to ``signal.linear_to_mel_weight_matrix``.
        num_mel_bins: number of Mel bands in the Mel weight matrix.
        num_mfccs: number of leading MFCC coefficients that are kept.

    Returns:
        Tuple ``(mfccs, log_mel_spectrograms, zcr, rmse)``.
    """
    # Get number of signal files
    num_files = tf.shape(signals)[0]

    # Select one random row of the noise matrix per file
    idx = tf.random_uniform((num_files, ), 0, cfg.NOISE_MATRIX.shape[0],
                            dtype=tf.int32)
    noise = tf.cast(tf.gather(cfg.NOISE_MATRIX, idx), tf.float32)
    # Per-file 0/1 mask: 1.0 where the uniform draw exceeds noise_frac
    nf = tf.cast(tf.greater(tf.random_uniform([num_files, 1]), noise_frac),
                 tf.float32)

    # Add noise to signal with a certain noise factor
    signals = signals + noise_factor * maxamps * noise * nf

    # Window the audio signals.
    # Floor division keeps the hop an integer on Python 3 too; the STFT
    # frame step is expected to be an integer number of samples.
    hop_length = window // 4
    signals32 = tf.cast(signals, tf.float32)
    signals_w = windower(signals32, window=window, hop_length=hop_length,
                         rank=2)

    # Calculate Zero Crossing Rate and RMSE
    zcr = zero_crossing(signals_w, rank=3)
    rmse = rms_energy(signals_w, rank=3, maxamps=maxamps)

    # Calculate the Short Time Fourier Transform
    stfts = signal.stft(signals32, frame_length=window,
                        frame_step=hop_length, fft_length=window)
    magnitude_spectrograms = tf.abs(stfts)

    # Define Mel space
    num_spectrogram_bins = magnitude_spectrograms.shape[-1].value
    lower_edge_hertz = 80.0
    upper_edge_hertz = 7600.0
    mel_weight_mat = signal.linear_to_mel_weight_matrix(
        num_mel_bins, num_spectrogram_bins, sr,
        lower_edge_hertz, upper_edge_hertz)

    # Calculate the Mel spectrogram and set its shape
    # (tensordot loses the static shape, so it is restored by hand)
    mel_spectrograms = tf.tensordot(magnitude_spectrograms, mel_weight_mat, 1)
    spec_shape = magnitude_spectrograms \
        .shape[:-1] \
        .concatenate(mel_weight_mat.shape[-1:])
    mel_spectrograms.set_shape(spec_shape)

    # Calculate log of the spectrogram; the offset avoids log(0)
    log_offset = 1e-8
    log_mel_spectrograms = tf.log(mel_spectrograms + log_offset,
                                  name='log_mel_spectrograms')

    # Calculate the MFCCs and keep only the first num_mfccs coefficients
    mfccs = signal.mfccs_from_log_mel_spectrograms(log_mel_spectrograms)
    mfccs = mfccs[..., :num_mfccs]
    mfccs = tf.identity(mfccs, name='mfccs')

    return mfccs, log_mel_spectrograms, zcr, rmse
def tf_spectrogram(x):
    """Return the STFT magnitude of ``x`` raised to ``power``.

    None of the settings are arguments: ``window_fn_type``, ``frame_length``,
    ``frame_step``, ``fft_length``, ``pad_end`` and ``power`` are free names
    resolved from the surrounding (enclosing or module) scope.
    """
    # Resolve the window function first, then run the transform.
    taper = choose_window_fn(window_fn_type)
    complex_spectrum = stft(
        x,
        frame_length=frame_length,
        frame_step=frame_step,
        fft_length=fft_length,
        window_fn=taper,
        pad_end=pad_end,
    )
    magnitude = tf.abs(complex_spectrum)
    return magnitude ** power
def generate_mel_filter_banks(signal, sample_rate_hz, frame_size_s=FRAME_SIZE_S,
                              frame_stride_s=FRAME_STRIDE_S,
                              window_fn=functools.partial(
                                  tf_signal.hamming_window, periodic=True),
                              fft_num_points=STFT_NUM_POINTS,
                              lower_freq_hz=0.0,
                              num_mel_bins=NUM_TRIANGULAR_FILTERS,
                              log_offset=1e-6, should_log_weight=False):
    """Compute mel filter-bank features for an audio signal.

    Args:
        signal: audio samples; converted to a float32 tensor, so an np array
            is accepted as well.
        sample_rate_hz: sample rate of ``signal`` in Hz.
        frame_size_s: frame length in seconds.
        frame_stride_s: step between consecutive frames in seconds.
        window_fn: window function handed to the STFT.
        fft_num_points: FFT length used by the STFT.
        lower_freq_hz: lower edge of the mel filter bank in Hz.
        num_mel_bins: number of mel bands.
        log_offset: constant added before the logarithm to avoid log(0).
        should_log_weight: when true, return the log of the mel spectrogram.

    Returns:
        The mel-scaled magnitude spectrogram, or its logarithm when
        ``should_log_weight`` is true.
    """
    # Convert the signal to a tf tensor in case it is in an np array.
    signal = tf.convert_to_tensor(signal, dtype=tf.float32)

    # Compute the remaining parameters for this calculation.
    # int() truncates any fractional sample count.
    frame_length = int(sample_rate_hz * frame_size_s)
    frame_step = int(sample_rate_hz * frame_stride_s)
    # The upper frequency is bounded by half the sample rate (Nyquist).
    upper_freq_hz = sample_rate_hz / 2.0

    # Package the signal into equally-sized, overlapping subsequences
    # (padded with 0s if necessary).
    frames = tf_signal.frame(signal,
                             frame_length=frame_length,
                             frame_step=frame_step,
                             pad_end=True,
                             pad_value=0)

    # Apply a Short-Term Fourier Transform (STFT) to convert into the
    # frequency domain.
    # NOTE(review): the STFT receives already-framed input. If it frames its
    # input again, each pre-cut frame presumably yields a single STFT frame
    # (an extra axis of length 1) -- confirm callers expect that shape.
    stfts = tf_signal.stft(frames,
                           frame_length=frame_length,
                           frame_step=frame_step,
                           fft_length=fft_num_points,
                           window_fn=window_fn)

    # Only the magnitude spectrogram is needed below; the previously computed
    # power spectrogram was never used and has been dropped.
    magnitude_spectrograms = tf.abs(stfts)

    # Warp the linear-scale spectrograms into the mel-scale.
    num_spectrogram_bins = 1 + int(fft_num_points / 2)

    # Compute the conversion matrix to mel-frequency space.
    linear_to_mel_weight_matrix = tf.contrib.signal.linear_to_mel_weight_matrix(
        num_mel_bins=num_mel_bins,
        num_spectrogram_bins=num_spectrogram_bins,
        sample_rate=sample_rate_hz,
        lower_edge_hertz=lower_freq_hz,
        upper_edge_hertz=upper_freq_hz,
        dtype=tf.float32)

    # Apply the conversion to complete the calculation of the filter-bank.
    mel_spectrograms = tf.tensordot(magnitude_spectrograms,
                                    linear_to_mel_weight_matrix, 1)
    # tensordot loses the static shape, so it is restored by hand.
    mel_spectrograms.set_shape(magnitude_spectrograms.shape[:-1].concatenate(
        linear_to_mel_weight_matrix.shape[-1:]))

    if should_log_weight:
        return tf.log(mel_spectrograms + log_offset)
    return mel_spectrograms