Example #1
	def sdr_improvement(self, s_target, s_approx, with_perm=False):
		# Shapes: [B, S, L] or, with permutations, [B, P, S, L]
		mix = tf.tile(tf.expand_dims(self.x_mix, 1), [1, self.S, 1])

		s_target_norm = tf.reduce_sum(tf.square(s_target), axis=-1)
		s_approx_norm = tf.reduce_sum(tf.square(s_approx), axis=-1)
		mix_norm = tf.reduce_sum(tf.square(mix), axis=-1)

		s_target_s_2 = tf.square(tf.reduce_sum(s_target*s_approx, axis=-1))
		s_target_mix_2 = tf.square(tf.reduce_sum(s_target*mix, axis=-1))

		# Projection-based SDR: 1 / (||s||^2 * ||s_hat||^2 / <s, s_hat>^2 - 1)
		sep = 1.0/((s_target_norm*s_approx_norm)/s_target_s_2 - 1.0)
		separated = 10. * log10(sep)
		non_separated = 10. * log10(1.0/((s_target_norm*mix_norm)/s_target_mix_2 - 1.0))

		loss = (s_target_norm*s_approx_norm)/s_target_s_2

		# Improvement: SDR of the estimate minus SDR of the raw mixture
		val = separated - non_separated
		val = tf.reduce_mean(val, -1)  # Mean over speakers
		if not with_perm:
			val = tf.reduce_mean(val, -1)  # Mean over batches
		else:
			val = tf.reduce_mean(val, 0)  # Mean over batches
			val = tf.reduce_min(val, -1)  # Keep the best permutation

		return val, loss
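
The expression in sep is the projection-based SDR: splitting the estimate into a component parallel to the target plus a residual gives SDR = 1 / (||s||^2 ||s_hat||^2 / <s, s_hat>^2 - 1). A minimal NumPy sketch of the same computation for a single target/estimate pair (the signals and names here are illustrative, not from the original code):

import numpy as np

def sdr_db(s_target, s_approx):
    # 10 * log10( 1 / (||s||^2 * ||s_hat||^2 / <s, s_hat>^2 - 1) )
    dot2 = np.sum(s_target * s_approx) ** 2
    ratio = np.sum(s_target ** 2) * np.sum(s_approx ** 2) / dot2
    return 10.0 * np.log10(1.0 / (ratio - 1.0))

rng = np.random.default_rng(0)
s = rng.standard_normal(16000)      # clean target
n = rng.standard_normal(16000)      # interference
print(sdr_db(s, s + 0.1 * n))       # good estimate -> roughly 20 dB
print(sdr_db(s, s + n))             # raw mixture -> roughly 0 dB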
Example #2
    def init_separator(self):
        if self.plugged:

            if self.abs_input:
                self.X = tf.abs(self.X)

            # Accessed for their graph-building side effects
            # (see the note after this example)
            if self.normalize_input == '01':
                self.normalization01
            elif self.normalize_input == 'meanstd':
                self.normalization_mean_std

            self.prediction

        else:

            # STFT
            self.preprocessing

            # Apply a certain function
            if self.pre_func == 'sqrt':
                self.X = tf.sqrt(self.X)
            elif self.pre_func == 'log':
                self.X = log10(self.X + 1e-12)

            # Apply normalization
            if self.normalize_input == '01':
                self.normalization01
            elif self.normalize_input == 'meanstd':
                self.normalization_mean_std

            # Silent mask: zero out T-F bins far below the per-example maximum
            if self.silent_threshold > 0:
                max_ = tf.reduce_max(self.X, [1, 2], keep_dims=True)
                mask = tf.cast(
                    tf.less(max_ - self.X, self.silent_threshold / 20.),
                    tf.float32)
                self.X = mask * self.X

            if self.add_dilated:
                self.dilated

            self.prediction
            # TODO: improve this check
            if self.args['model_folder'] is None:
                # if 'inference' not in self.folder and 'enhance' not in self.folder and 'finetuning' not in self.folder:
                self.cost_model = self.cost
                self.finish_construction()
                self.optimize
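
Statements such as self.prediction, self.normalization01 and self.optimize above are bare attribute reads, which only make sense if they are cached properties whose first access builds the corresponding graph ops — a common TF1 "lazy property" pattern. A minimal sketch of such a decorator (an assumption about this repo's convention, not code taken from it):

import functools

def lazy_property(function):
    # Build the wrapped graph ops on first access, then cache the result.
    attribute = '_cache_' + function.__name__

    @property
    @functools.wraps(function)
    def wrapper(self):
        if not hasattr(self, attribute):
            setattr(self, attribute, function(self))
        return getattr(self, attribute)
    return wrapper

class Model:
    @lazy_property
    def prediction(self):
        print('building prediction ops (runs once)')
        return 'prediction_op'

m = Model()
m.prediction  # builds and caches the op
m.prediction  # cached, nothing is rebuilt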
Example #3
	def __init__(self, plugged=False, *args, **kwargs):
		super(Separator, self).__init__(plugged, *args, **kwargs)

		self.num_speakers = kwargs['tot_speakers']
		self.layer_size = kwargs['layer_size']
		self.embedding_size = kwargs['embedding_size']
		self.normalize = kwargs['no_normalize']
		self.nb_layers = kwargs['nb_layers']
		self.a = kwargs['mask_a']
		self.b = kwargs['mask_b']
		self.normalize_input = kwargs['normalize_separator']
		self.abs_input = kwargs['abs_input']

		# Loss Parameters
		self.loss_with_silence = kwargs['silence_loss']
		self.threshold_silence_loss = kwargs['threshold_silence_loss']
		self.function_mask = kwargs['function_mask']

		# Kmeans Parameters
		self.beta = kwargs['beta_kmeans']
		self.threshold = kwargs['threshold']
		self.with_silence = kwargs['with_silence']
		self.nb_tries = kwargs['nb_tries']
		self.nb_steps = kwargs['nb_steps']

		self.graph = tf.get_default_graph()

		self.plugged = plugged
		# If the Separator is not independent but plugged onto a front layer
		if self.plugged:
			self.F = kwargs['filters']

			with self.graph.as_default():

				self.training = self.graph.get_tensor_by_name('inputs/is_training:0')

				front = self.graph.get_tensor_by_name('front/output:0')

				self.B = tf.shape(self.graph.get_tensor_by_name('inputs/non_mix_input:0'))[0]

				with tf.name_scope('split_front'):
					self.X = tf.reshape(front[:self.B, :, :, :], [self.B, -1, self.F]) # Mix input [B, T, N]
					# Non mix input [B, T, N, S]
					self.X_input = tf.identity(self.X)
					self.X_non_mix = tf.transpose(tf.reshape(front[self.B:, :, :, :], [self.B, self.S, -1, self.F]), [0,2,3,1])

				with tf.name_scope('create_masks'):
					# Batch of masks (bin labels)
					# shape = [batch size, T, F, S]
					argmax = tf.argmax(tf.abs(self.X_non_mix), axis=3)
					self.y = tf.one_hot(argmax, self.S, self.a, self.b)
					self.y_test_export = tf.reshape(self.y[:, :, :, 0], [self.B, -1])

					if self.function_mask == 'linear':
						max_ = tf.reduce_max(tf.abs(self.X), [1, 2], keep_dims=True)
						self.y = self.y * tf.expand_dims(tf.abs(self.X)/max_, 3)
					elif self.function_mask == 'sqrt':
						max_ = tf.reduce_max(tf.abs(self.X), [1, 2], keep_dims=True)
						self.y = self.y * tf.expand_dims(tf.sqrt(tf.abs(self.X)/max_), 3)
					elif self.function_mask == 'square':
						max_ = tf.reduce_max(tf.abs(self.X), [1, 2], keep_dims=True)
						self.y = self.y * tf.expand_dims(tf.square(tf.abs(self.X)/max_), 3)

					if self.loss_with_silence:
						max_ = tf.reduce_max(tf.abs(self.X), [1, 2], keep_dims=True)
						log_compare = log10(tf.divide(max_, tf.abs(self.X)))
						mask = tf.cast(log_compare < self.threshold_silence_loss, tf.float32)
						tf.summary.image('separator/silence_mask', tf.expand_dims(mask,3), max_outputs=1)
						self.y = self.y * tf.expand_dims(mask, 3)

				# Speakers indices used in the mixtures
				# shape = [ batch size, #speakers]
				self.I = tf.get_default_graph().get_tensor_by_name('inputs/indicies:0')
		else:
			# STFT hyperparams
			self.window_size = kwargs['window_size']
			self.hop_size = kwargs['hop_size']

			# Network hyperparams
			self.F = kwargs['window_size'] // 2 + 1
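
The create_masks block above builds an ideal binary mask from the per-speaker magnitudes: the dominant source in each T-F bin gets the value mask_a, the others mask_b. A small NumPy illustration of that construction (toy shapes and values, for illustration only):

import numpy as np

B, T, F, S = 1, 3, 4, 2
a, b = 1.0, 0.0                                  # mask_a / mask_b
X_non_mix = np.random.randn(B, T, F, S)

dominant = np.abs(X_non_mix).argmax(axis=3)      # [B, T, F]
one_hot = np.eye(S)[dominant].astype(bool)       # [B, T, F, S]
y = np.where(one_hot, a, b)
print(y[0, 0])                                   # one `a` per bin, rest `b`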
Example #4
    def cost(self):
        """
		Construct the cost function op for the negative sampling cost
		"""

        if self.loss_with_silence:
            max_ = tf.reduce_max(tf.abs(self.X), [1, 2], keep_dims=True)
            log_compare = log10(tf.divide(max_, tf.abs(self.X)))
            mask = tf.cast(log_compare < self.threshold_silence_loss,
                           tf.float32)
            tf.summary.image('separator/silence_mask',
                             tf.expand_dims(mask, 3),
                             max_outputs=1)
            y_a_b = self.y * tf.expand_dims(mask, 3)
            y_0_1 = (self.y + 1.0) / 2.0 * tf.expand_dims(mask, 3)
        else:
            y_a_b = self.y
            y_0_1 = (self.y + 1.0) / 2.0

        tf.summary.image('mask/true/1',
                         tf.abs(tf.expand_dims(y_0_1[:, :, :, 0], 3)))
        tf.summary.image('mask/true/2',
                         tf.abs(tf.expand_dims(y_0_1[:, :, :, 1], 3)))

        # Get the embedded T-F vectors from the network
        embedding = self.prediction  # [B, T, F, E]

        embedding_broad = tf.expand_dims(embedding, 4)  # [B, T, F, E, 1]
        y_broad = tf.expand_dims(y_0_1, 3)  # [B, T, F, 1, S]
        v_mean = tf.reduce_sum(embedding_broad * y_broad, [1, 2]) / (
            1e-12 + tf.expand_dims(tf.reduce_sum(y_0_1, [1, 2]), 1)
        )  # [B, E, S]

        #
        # Reconstruction loss
        #

        with tf.name_scope('reconstruction_loss'):

            v_mean_broad = tf.expand_dims(v_mean, 1)  # [B, 1, E, S]
            v_mean_broad = tf.expand_dims(v_mean_broad, 1)  # [B, 1, 1, E, S]

            assignments = tf.reduce_sum(v_mean_broad * embedding_broad,
                                        3)  # [B, T, F, S]

            assignments = tf.nn.sigmoid(assignments)  # [B, T, F, S]

            masked_input = tf.expand_dims(self.X_input, 3) * assignments

            # X_non_mix: [B, T, F, S]
            cost_recons = tf.reduce_mean(tf.square(self.X_non_mix -
                                                   masked_input),
                                         axis=[1, 2])  # [B, S]
            cost_recons = tf.reduce_mean(
                cost_recons, axis=-1)  # Mean over speakers -> [B]
            cost_recons = tf.reduce_mean(cost_recons)  # Mean over batch
            tf.summary.scalar('value', cost_recons)

        #
        # Source-contrastive loss
        #
        with tf.name_scope('source_contrastive_loss'):

            speaker_vectors = tf.nn.l2_normalize(self.speaker_vectors, 1)
            embedding = tf.nn.l2_normalize(embedding, -1)

            I = tf.expand_dims(self.I, axis=2)  # [B, S, 1]
            # Gathering the speaker_vectors [|S|, E]
            Vspeakers = tf.gather_nd(speaker_vectors, I)  # [B, S, E]

            # Expand the dimensions in preparation for broadcasting
            Vspeakers_broad = tf.expand_dims(Vspeakers, 1)
            Vspeakers_broad = tf.expand_dims(Vspeakers_broad,
                                             1)  # [B, 1, 1, S, E]
            embedding_broad = tf.expand_dims(embedding, 3)

            # Compute the dot product between the embedding vectors and speaker
            # vectors
            dot = tf.reduce_sum(Vspeakers_broad * embedding_broad, 4)

            # Compute the cost for every element

            sc_cost = -tf.log(tf.nn.sigmoid(y_a_b * dot))

            sc_cost = tf.reduce_mean(
                sc_cost, 3)  # Mean over speakers
            sc_cost = tf.reduce_mean(sc_cost,
                                     0)  # Mean over batches
            sc_cost = tf.reduce_mean(sc_cost)  # Mean over remaining T-F dims
            tf.summary.scalar('value', sc_cost)

        cost = sc_cost + cost_recons
        tf.summary.scalar('total', cost)

        return cost
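
As a quick sanity check of the attractor computation above (v_mean as a mask-weighted mean of the embeddings), here is a toy NumPy version with random values; the shapes mirror the comments in the TF code:

import numpy as np

B, T, F, E, S = 2, 5, 4, 3, 2
embedding = np.random.randn(B, T, F, E)
y_0_1 = np.random.randint(0, 2, size=(B, T, F, S)).astype(float)

num = (embedding[..., :, None] * y_0_1[..., None, :]).sum(axis=(1, 2))
den = 1e-12 + y_0_1.sum(axis=(1, 2))[:, None, :]   # [B, 1, S]
v_mean = num / den                                  # [B, E, S]
print(v_mean.shape)                                 # (2, 3, 2)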
Example #5
    def __init__(self,
                 nb_clusters,
                 centroids_init=None,
                 nb_tries=10,
                 nb_iterations=10,
                 input_tensor=None,
                 latent_space_tensor=None,
                 beta=None,
                 threshold=2.5,
                 assign_at_end=False):

        self.nb_clusters = nb_clusters
        self.nb_iterations = nb_iterations
        self.nb_tries = nb_tries
        self.latent_space_tensor = latent_space_tensor
        self.beta = beta
        self.assign_at_end = assign_at_end

        if input_tensor is None:
            self.graph = tf.Graph()
        else:
            self.graph = tf.get_default_graph()

        with self.graph.as_default():
            with tf.name_scope('kmeans'):
                if input_tensor is None:
                    # Spectrogram embeddings
                    # shape = [batch, L, E]
                    self.X_in = tf.placeholder("float", [None, None, None],
                                               name='Kmeans_input')
                else:
                    self.X_in = input_tensor

                # mean, _ = tf.nn.moments(self.X_in, axes=-1, keep_dims=True)
                x_norm = tf.nn.l2_normalize(self.X_in, axis=-1)
                self.b = tf.shape(x_norm)[0]
                self.X_original = tf.identity(x_norm)
                self.X = tf.expand_dims(x_norm, 1)
                self.X = tf.tile(self.X, [1, self.nb_tries, 1, 1])

                self.L = tf.shape(self.X)[-2]
                self.E = tf.shape(self.X)[-1]
                self.X = tf.reshape(self.X, [-1, self.L, self.E])

                self.B = tf.shape(self.X)[0]

                self.ones = tf.ones_like(self.X, tf.float32)

                self.shifting = tf.tile(
                    tf.expand_dims(tf.range(self.B) * self.nb_clusters, 1),
                    [1, self.L])

                if centroids_init is None:

                    def random_without_replace(b, l):
                        a = np.array([
                            np.random.choice(range(l),
                                             size=self.nb_clusters,
                                             replace=False) for _ in range(b)
                        ])
                        return a.astype(np.int32)

                    y = tf.py_func(random_without_replace, [self.B, self.L],
                                   tf.int32)
                    random = tf.reshape(y, [self.B, self.nb_clusters, 1])

                    # Take randomly 'nb_clusters' vectors from X
                    batch_range = tf.tile(
                        tf.reshape(tf.range(self.B, dtype=tf.int32),
                                   shape=[self.B, 1, 1]),
                        [1, self.nb_clusters, 1])
                    indices = tf.concat([batch_range, random], axis=2)
                    self.centroid_init = tf.gather_nd(self.X, indices)
                else:
                    self.centroids = tf.identity(centroids_init)
                    self.centroids = tf.tile(self.centroids,
                                             [self.nb_tries, 1, 1])

                if self.latent_space_tensor is not None:
                    latent_space_tensor = tf.reshape(latent_space_tensor,
                                                     [self.b, self.L])
                    log_lst = log10(
                        tf.divide(
                            tf.reduce_max(latent_space_tensor, [-1],
                                          keep_dims=True),
                            latent_space_tensor))
                    self.notsilent_notry = tf.reshape(
                        tf.cast(log_lst < threshold, tf.float32),
                        [self.b, self.L, 1])
                    self.notsilent = tf.tile(self.notsilent_notry,
                                             [self.nb_tries, 1, 1])
                else:
                    self.notsilent = tf.ones([self.B, self.L, 1])

                self.network  # accessed for its graph-building side effects
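
The centroid initialization above draws, for every batch element (and every try), nb_clusters distinct frames of X to use as starting centroids. The same idea in plain NumPy (toy shapes and names, for illustration only):

import numpy as np

def init_centroids(X, nb_clusters, rng=np.random.default_rng(0)):
    # For each batch element, pick nb_clusters distinct rows of X.
    B, L, E = X.shape
    idx = np.stack([rng.choice(L, size=nb_clusters, replace=False)
                    for _ in range(B)])              # [B, nb_clusters]
    return X[np.arange(B)[:, None], idx]             # [B, nb_clusters, E]

X = np.random.randn(4, 100, 20)                      # [batch, frames, E]
print(init_centroids(X, nb_clusters=2).shape)        # (4, 2, 20)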