def sdr_improvement(self, s_target, s_approx, with_perm=False):
    """
    Measure how much closer the estimates are to the targets than the mixture.

    For every source the quantity

        10 * log10( 1 / (|t|^2 * |e|^2 / <t, e>^2 - 1) )

    is evaluated twice: once with the estimate `s_approx` as `e` and once
    with the input mixture `self.x_mix` as `e`. The reported value is the
    difference (estimate minus mixture).

    Args:
        s_target: reference signals, [B, S, L], or [B, P, S, L] when
            `with_perm` is True (P being the permutation axis).
        s_approx: estimated signals, same layout as `s_target`.
        with_perm: set when the inputs carry the extra permutation axis.

    Returns:
        A tuple `(val, loss)`:
          * `val`: the difference above averaged over the last two
            remaining axes when `with_perm` is False; otherwise averaged
            over sources and over axis 0, then reduced with a minimum over
            the permutation axis.
          * `loss`: the unreduced ratio |t|^2 * |e|^2 / <t, e>^2 computed
            with the estimates.
    """
    # self.x_mix is tiled once per source -> [B, S, L]
    # (assumes self.x_mix is [B, L] -- TODO confirm against the caller)
    mix = tf.tile(tf.expand_dims(self.x_mix, 1), [1, self.S, 1])
    if with_perm:
        # Inputs are [B, P, S, L]: give the mixture a singleton permutation
        # axis so that it broadcasts as [B, 1, S, L]. Without it the batch
        # axis of the mixture would be lined up with the permutation axis.
        mix = tf.expand_dims(mix, 1)

    # Squared norms along the time axis
    s_target_norm = tf.reduce_sum(tf.square(s_target), axis=-1)
    s_approx_norm = tf.reduce_sum(tf.square(s_approx), axis=-1)
    mix_norm = tf.reduce_sum(tf.square(mix), axis=-1)

    # Squared correlations with the target
    s_target_s_2 = tf.square(tf.reduce_sum(s_target * s_approx, axis=-1))
    s_target_mix_2 = tf.square(tf.reduce_sum(s_target * mix, axis=-1))

    # The same ratio is both the returned loss and the core of the
    # "separated" term, so it is built only once.
    loss = (s_target_norm * s_approx_norm) / s_target_s_2
    mix_ratio = (s_target_norm * mix_norm) / s_target_mix_2

    separated = 10. * log10(1.0 / (loss - 1.0))
    non_separated = 10. * log10(1.0 / (mix_ratio - 1.0))

    val = separated - non_separated
    val = tf.reduce_mean(val, -1)  # Mean over speakers
    if not with_perm:
        val = tf.reduce_mean(val, -1)  # Mean over batches
    else:
        val = tf.reduce_mean(val, 0)  # Mean over batches
        # NOTE(review): the minimum over permutations is kept as in the
        # original; if a higher value means a better separation, a maximum
        # may have been intended -- confirm with the caller.
        val = tf.reduce_min(val, -1)

    return val, loss
def init_separator(self):
    """
    Prepare `self.X` and trigger the construction of the separator.

    Two paths exist:
      * plugged: `self.X` is optionally rectified and normalised, then
        `self.prediction` is requested.
      * standalone (STFT): `self.preprocessing` is requested first, then an
        optional sqrt/log compression, the normalisation, an optional
        silence gate and `self.dilated` are applied before
        `self.prediction` is requested.

    When no `model_folder` is configured, the cost is stored in
    `self.cost_model`, `finish_construction()` is called and
    `self.optimize` is requested.

    Several statements below are bare attribute reads: they are kept
    because the read itself is the intended action (presumably lazily
    evaluated graph-building properties -- confirm on the class).
    """
    def apply_input_normalization():
        # Shared by both paths: pick the normalisation named in the config
        scheme = self.normalize_input
        if scheme == '01':
            self.normalization01
        elif scheme == 'meanstd':
            self.normalization_mean_std

    if not self.plugged:
        # STFT front-end
        self.preprocessing

        # Optional compression of the input
        compression = self.pre_func
        if compression == 'sqrt':
            self.X = tf.sqrt(self.X)
        elif compression == 'log':
            self.X = log10(self.X + 1e-12)

        apply_input_normalization()

        # Optional silence gate: keep the bins lying within
        # silent_threshold / 20 of the per-example maximum
        if self.silent_threshold > 0:
            peak = tf.reduce_max(self.X, [1, 2], keep_dims=True)
            close_to_peak = tf.less(peak - self.X, self.silent_threshold / 20.)
            gate = tf.cast(close_to_peak, tf.float32)
            self.X = gate * self.X

        if self.add_dilated:
            self.dilated

        self.prediction
    else:
        # Input comes from a front layer
        if self.abs_input:
            self.X = tf.abs(self.X)

        apply_input_normalization()

        self.prediction

    # TODO: improve the way training mode is detected (a former variant
    # looked for 'inference' / 'enhance' / 'finetuning' in self.folder)
    if self.args['model_folder'] is None:
        self.cost_model = self.cost
        self.finish_construction()
        self.optimize
def __init__(self, plugged=False, *args, **kwargs):
    """
    Read the separator hyper-parameters and, when plugged, wire the inputs.

    Args:
        plugged: True when the separator consumes the output of a front
            layer that already lives in the default graph; False when it
            works from its own STFT front-end.
        *args: forwarded to the parent constructor.
        **kwargs: configuration dictionary. Keys read here: 'tot_speakers',
            'layer_size', 'embedding_size', 'no_normalize', 'nb_layers',
            'mask_a', 'mask_b', 'normalize_separator', 'abs_input',
            'silence_loss', 'threshold_silence_loss', 'function_mask',
            'beta_kmeans', 'threshold', 'with_silence', 'nb_tries',
            'nb_steps', plus 'filters' (plugged) or 'window_size' and
            'hop_size' (standalone).

    `self.S` is used below but not assigned here; it is presumably set by
    the parent constructor -- confirm.
    """
    super(Separator, self).__init__(plugged, *args, **kwargs)

    # Network parameters
    self.num_speakers = kwargs['tot_speakers']
    self.layer_size = kwargs['layer_size']
    self.embedding_size = kwargs['embedding_size']
    self.normalize = kwargs['no_normalize']
    self.nb_layers = kwargs['nb_layers']
    self.a = kwargs['mask_a']
    self.b = kwargs['mask_b']
    self.normalize_input = kwargs['normalize_separator']
    self.abs_input = kwargs['abs_input']

    # Loss parameters
    self.loss_with_silence = kwargs['silence_loss']
    self.threshold_silence_loss = kwargs['threshold_silence_loss']
    self.function_mask = kwargs['function_mask']

    # K-means parameters
    self.beta = kwargs['beta_kmeans']
    self.threshold = kwargs['threshold']
    self.with_silence = kwargs['with_silence']
    self.nb_tries = kwargs['nb_tries']
    self.nb_steps = kwargs['nb_steps']

    self.graph = tf.get_default_graph()
    self.plugged = plugged

    if not self.plugged:
        # Standalone separator: STFT hyper-parameters
        self.window_size = kwargs['window_size']
        self.hop_size = kwargs['hop_size']
        # Number of frequency bins fed to the network
        self.F = kwargs['window_size']//2 +1
        return

    # From here on the separator sits behind a front layer
    self.F = kwargs['filters']

    with self.graph.as_default():
        lookup = self.graph.get_tensor_by_name
        self.training = lookup('inputs/is_training:0')
        front_output = lookup('front/output:0')
        self.B = tf.shape(lookup('inputs/non_mix_input:0'))[0]

        with tf.name_scope('split_front'):
            # The first B rows of the front output hold the mixtures
            # -> [B, T, N]
            mixed_rows = front_output[:self.B, :, :, :]
            self.X = tf.reshape(mixed_rows, [self.B, -1, self.F])
            self.X_input = tf.identity(self.X)

            # The remaining rows hold the separate sources
            # -> [B, T, N, S]
            source_rows = front_output[self.B:, :, :, :]
            per_source = tf.reshape(source_rows, [self.B, self.S, -1, self.F])
            self.X_non_mix = tf.transpose(per_source, [0,2,3,1])

        with tf.name_scope('create_masks'):
            # One label per bin: the source with the largest magnitude
            # gets `a`, every other source gets `b` -> [B, T, F, S]
            dominant = tf.argmax(tf.abs(self.X_non_mix), axis=3)
            self.y = tf.one_hot(dominant, self.S, self.a, self.b)
            self.y_test_export = tf.reshape(self.y[:, :, :, 0], [self.B, -1])

            # Optional weighting of the labels by the mixture magnitude
            # relative to its per-example maximum
            weighting = self.function_mask
            if weighting in ('linear', 'sqrt', 'square'):
                peak = tf.reduce_max(tf.abs(self.X), [1, 2], keep_dims=True)
                relative = tf.abs(self.X)/peak
                if weighting == 'sqrt':
                    relative = tf.sqrt(relative)
                elif weighting == 'square':
                    relative = tf.square(relative)
                self.y = self.y * tf.expand_dims(relative, 3)

            # Optional removal of the bins lying too far below the maximum
            if self.loss_with_silence:
                peak = tf.reduce_max(tf.abs(self.X), [1, 2], keep_dims=True)
                distance_to_peak = log10(tf.divide(peak, tf.abs(self.X)))
                audible = tf.cast(distance_to_peak < self.threshold_silence_loss, tf.float32)
                tf.summary.image('separator/silence_mask', tf.expand_dims(audible,3), max_outputs=1)
                self.y = self.y * tf.expand_dims(audible, 3)

        # Indices of the speakers present in each mixture
        # -> [B, #speakers]
        self.I = tf.get_default_graph().get_tensor_by_name('inputs/indicies:0')
def cost(self):
    """
    Construct the cost function op for the negative sampling cost

    The returned scalar is the sum of two terms:
      * a reconstruction term: mean squared error between the separate
        sources `self.X_non_mix` and the mixture `self.X_input` weighted by
        sigmoid(<mean embedding of the source, bin embedding>);
      * a source-contrastive term: -log(sigmoid(y * <speaker vector,
        bin embedding>)) averaged over speakers, batch and bins.

    When `self.loss_with_silence` is set, bins whose magnitude lies too far
    below the per-example maximum are removed from the labels.

    Returns:
        Scalar tensor: contrastive cost + reconstruction cost.
    """
    if self.loss_with_silence:
        max_ = tf.reduce_max(tf.abs(self.X), [1, 2], keep_dims=True)
        log_compare = log10(tf.divide(max_, tf.abs(self.X)))
        mask = tf.cast(log_compare < self.threshold_silence_loss, tf.float32)
        tf.summary.image('separator/silence_mask', tf.expand_dims(mask, 3), max_outputs=1)
        y_a_b = self.y * tf.expand_dims(mask, 3)
        y_0_1 = (self.y + 1.0) / 2.0 * tf.expand_dims(mask, 3)
    else:
        y_a_b = self.y
        y_0_1 = (self.y + 1.0) / 2.0
    # y_0_1 rescales the labels with (y + 1) / 2
    # (a 0/1 mask if the labels are -1/+1 -- TODO confirm mask_a / mask_b)

    tf.summary.image('mask/true/1', tf.abs(tf.expand_dims(y_0_1[:, :, :, 0], 3)))
    tf.summary.image('mask/true/2', tf.abs(tf.expand_dims(y_0_1[:, :, :, 1], 3)))

    # Get the embedded T-F vectors from the network
    embedding = self.prediction  # [B, T, F, E]

    embedding_broad = tf.expand_dims(embedding, 4)  # [B, T, F, E, 1]
    y_broad = tf.expand_dims(y_0_1, 3)  # [B, T, F, 1, S]
    # Label-weighted mean embedding of every source; the 1e-12 guards
    # against a source owning no bin at all
    v_mean = tf.reduce_sum(embedding_broad * y_broad, [1, 2]) / (
        1e-12 + tf.expand_dims(tf.reduce_sum(y_0_1, [1, 2]), 1))  # [B, E, S]

    #
    # Reconstruction loss
    #
    with tf.name_scope('reconstruction_loss'):
        v_mean_broad = tf.expand_dims(v_mean, 1)  # [B, 1, E, S]
        v_mean_broad = tf.expand_dims(v_mean_broad, 1)  # [B, 1, 1, E, S]

        assignments = tf.reduce_sum(v_mean_broad * embedding_broad, 3)  # [B, T, F, S]
        assignments = tf.nn.sigmoid(assignments)  # [B, T, F, S]

        masked_input = tf.expand_dims(self.X_input, 3) * assignments

        # X_non_mix [B, T, F, S]
        cost_recons = tf.reduce_mean(tf.square(self.X_non_mix - masked_input), axis=[1, 2])
        cost_recons = tf.reduce_mean(cost_recons, axis=-1)  # Mean among all speakers [B, S]
        cost_recons = tf.reduce_mean(cost_recons)
        tf.summary.scalar('value', cost_recons)

    #
    # Contrastive loss
    #
    with tf.name_scope('source_contrastive_loss'):
        speaker_vectors = tf.nn.l2_normalize(self.speaker_vectors, 1)
        embedding = tf.nn.l2_normalize(embedding, -1)

        I = tf.expand_dims(self.I, axis=2)  # [B, S, 1]
        # Gathering the speaker_vectors [|S|, E]
        Vspeakers = tf.gather_nd(speaker_vectors, I)  # [B, S, E]

        # Expand the dimensions in preparation for broadcasting
        Vspeakers_broad = tf.expand_dims(Vspeakers, 1)
        Vspeakers_broad = tf.expand_dims(Vspeakers_broad, 1)  # [B, 1, 1, S, E]
        embedding_broad = tf.expand_dims(embedding, 3)

        # Compute the dot product between the embedding vectors and speaker
        # vectors
        dot = tf.reduce_sum(Vspeakers_broad * embedding_broad, 4)

        # Compute the cost for every element.
        # -log(sigmoid(x)) == log(1 + exp(-x)) == softplus(-x); the softplus
        # form stays finite when sigmoid(x) underflows to 0, where the
        # explicit log would return inf and poison the gradients.
        sc_cost = tf.nn.softplus(-(y_a_b * dot))

        sc_cost = tf.reduce_mean(sc_cost, 3)  # Average the cost over all speakers in the input
        sc_cost = tf.reduce_mean(sc_cost, 0)  # Average the cost over all batches
        sc_cost = tf.reduce_mean(sc_cost)
        tf.summary.scalar('value', sc_cost)

    cost = sc_cost + cost_recons
    tf.summary.scalar('total', cost)

    return cost
def __init__(self, nb_clusters, centroids_init=None, nb_tries=10, nb_iterations=10, input_tensor=None, latent_space_tensor=None, beta=None, threshold=2.5, assign_at_end=False):
    """
    Build the input side of the K-means graph and request `self.network`.

    Args:
        nb_clusters: number of centroids.
        centroids_init: optional initial centroids; when None,
            `nb_clusters` distinct input vectors are drawn at random for
            every row.
        nb_tries: number of times each example is replicated.
        nb_iterations: stored as `self.nb_iterations`.
        input_tensor: tensor to cluster, [batch, L, E]. When None a new
            graph and a placeholder named 'Kmeans_input' are created;
            otherwise the default graph is used.
        latent_space_tensor: optional tensor reshaped to [batch, L] and
            used to flag the points lying too far below the per-example
            maximum.
        beta: stored as `self.beta`.
        threshold: limit applied to log10(max / value) of
            `latent_space_tensor`; points below it are flagged 1.
        assign_at_end: stored as `self.assign_at_end`.
    """
    self.nb_clusters = nb_clusters
    self.nb_iterations = nb_iterations
    self.nb_tries = nb_tries
    self.latent_space_tensor = latent_space_tensor
    self.beta = beta
    self.assign_at_end = assign_at_end

    owns_graph = input_tensor is None
    self.graph = tf.Graph() if owns_graph else tf.get_default_graph()

    with self.graph.as_default(), tf.name_scope('kmeans'):
        if owns_graph:
            # Embeddings to cluster, fed from outside: [batch, L, E]
            self.X_in = tf.placeholder("float", [None, None, None], name='Kmeans_input')
        else:
            self.X_in = input_tensor

        # Every vector is projected on the unit sphere
        unit_vectors = tf.nn.l2_normalize(self.X_in, axis=-1)
        self.b = tf.shape(unit_vectors)[0]
        self.X_original = tf.identity(unit_vectors)

        # Each example is repeated nb_tries times, then the (example, try)
        # axes are merged into a single leading axis of size B
        replicated = tf.tile(tf.expand_dims(unit_vectors, 1), [1, self.nb_tries, 1, 1])
        self.L = tf.shape(replicated)[-2]
        self.E = tf.shape(replicated)[-1]
        self.X = tf.reshape(replicated, [-1, self.L, self.E])
        self.B = tf.shape(self.X)[0]

        self.ones = tf.ones_like(self.X, tf.float32)

        # Row r holds the constant r * nb_clusters, repeated L times
        row_offsets = tf.expand_dims(tf.range(self.B) * self.nb_clusters, 1)
        self.shifting = tf.tile(row_offsets, [1, self.L])

        if centroids_init is not None:
            provided = tf.identity(centroids_init)
            self.centroids = tf.tile(provided, [self.nb_tries, 1, 1])
        else:
            def random_without_replace(b, l):
                # For each of the b rows, nb_clusters distinct positions
                # drawn in [0, l)
                draws = [
                    np.random.choice(range(l), size=self.nb_clusters, replace=False)
                    for _ in range(b)
                ]
                return np.array(draws).astype(np.int32)

            drawn = tf.py_func(random_without_replace, [self.B, self.L], tf.int32)
            positions = tf.reshape(drawn, [self.B, self.nb_clusters, 1])

            # Pair every drawn position with its row index, then pick the
            # corresponding vectors of X as initial centroids
            row_ids = tf.reshape(tf.range(self.B, dtype=tf.int32), shape=[self.B, 1, 1])
            row_ids = tf.tile(row_ids, [1, self.nb_clusters, 1])
            picks = tf.concat([row_ids, positions], axis=2)
            self.centroid_init = tf.gather_nd(self.X, picks)

        if self.latent_space_tensor is not None:
            flat_latent = tf.reshape(latent_space_tensor, [self.b, self.L])
            peak = tf.reduce_max(flat_latent, [-1], keep_dims=True)
            log_lst = log10(tf.divide(peak, flat_latent))
            audible = tf.cast(log_lst < threshold, tf.float32)
            self.notsilent_notry = tf.reshape(audible, [self.b, self.L, 1])
            # NOTE(review): this tiling repeats the whole batch nb_tries
            # times, whereas self.X keeps the tries of one example
            # adjacent; the rows only line up when the batch holds a
            # single example -- confirm this is intended.
            self.notsilent = tf.tile(self.notsilent_notry, [self.nb_tries, 1, 1])
        else:
            self.notsilent = tf.ones([self.B, self.L, 1])

        # Bare attribute read kept on purpose (presumably builds the
        # clustering ops lazily -- confirm on the class)
        self.network