Example #1
0
    def __init__(self, sess, batch, bit_depth=2 ** 15, r_constant=0.95, update_method="geom", lowest_bound=None):
        """
        Initialise the per-example bound (tau) variables for the attack.

        :param sess: an open tf.Session, used to run the initial assignment
        :param batch: batch object exposing `.size` and `.audios["n_samples"]`
        :param bit_depth: maximum absolute sample magnitude (2**15 for 16-bit audio)
        :param r_constant: bound-update rate constant, must lie strictly in (0, 1)
        :param update_method: bound update schedule, one of "lin", "geom", "log"
        :param lowest_bound: optional strictly-positive floor for the bounds
        """
        # isinstance is the idiomatic type check (also accepts subclasses),
        # unlike the exact `type(x) == T` comparison.
        assert isinstance(r_constant, (float, np.float32))
        assert 0 < r_constant < 1.0

        if lowest_bound is not None:
            assert lowest_bound > 0
            assert isinstance(lowest_bound, (float, int, np.int16, np.int32, np.float32))
            self.lowest_bound = float(lowest_bound)
        else:
            self.lowest_bound = None

        assert update_method in ["lin", "geom", "log"]

        self.__bit_depth = bit_depth
        self.r_constant = r_constant

        self.tf_run = sess.run
        self.update_method = update_method

        # One scalar bound per batch example; not trainable -- updated
        # manually via assign ops during the attack.
        self.bounds = tf.Variable(
            tf.zeros([batch.size, 1]),
            trainable=False,
            validate_shape=True,
            dtype=tf.float32,
            name='qq_masks'
        )

        # Initial taus derived from each example's true (unpadded) length.
        self.initial_taus = np_arr(
            lcomp(self._gen_tau(batch.audios["n_samples"])),
            np.float32
        )

        self.tf_run(self.bounds.assign(self.initial_taus))
Example #2
0
def create_audio_batch_from_wav_files(batched_file_path_data, dtype="int16"):
    """
    Load a batch of wav files and build the padded batch data structures.

    Returns a dict holding the raw and padded audio, per-example sample
    counts, and the derived feature-frame lengths for each example.
    """
    file_paths = l_map(lambda entry: entry[1], batched_file_path_data)
    names = l_map(lambda entry: entry[2], batched_file_path_data)

    loaded = lcomp([WavFile.load(fp, dtype) for fp in file_paths])

    # Pad every example up to the longest one plus the configured padding.
    longest = max(map(len, loaded))
    padded_length = longest + utils.Audios.padding(longest)

    padded = np_arr(
        lcomp(utils.Audios.gen_padded_audio(loaded, padded_length)),
        np.float32
    )

    # True (unpadded) number of samples per example.
    sample_counts = np_arr(
        l_map(lambda audio: audio.size, loaded),
        np.int32
    )

    # N.B. Remember to use round instead of integer division here!
    max_feat_lengths = np_arr(
        l_map(lambda _: np.round((padded_length - 320) / 320), loaded),
        np.int32
    )

    real_feat_lengths = np_arr(
        l_map(lambda audio: np.round((audio.size - 320) / 320), loaded),
        np.int32
    )

    return {
        "file_paths": file_paths,
        "max_samples": padded_length,
        "max_feats": max_feat_lengths[0],
        "audio": loaded,
        "padded_audio": padded,
        "basenames": names,
        "n_samples": sample_counts,
        "ds_feats": max_feat_lengths,
        "real_feats": real_feat_lengths,
    }
    def __init__(self, sess, batch, hard_constraint, synthesiser, placeholders=None, bit_depth=2 ** 15):
        """
        Build the masked, clipped, constrained delta graph for a synthesiser.

        :param sess: an open tf.Session, used to initialise the masks variable
        :param batch: batch object exposing `.size` and `.audios`
            ("max_samples", "n_samples")
        :param hard_constraint: object whose `.clip()` restricts the deltas
        :param synthesiser: object exposing `.opt_vars` and `.synthesise()`
        :param placeholders: optional pre-built Placeholders; created if None
        :param bit_depth: maximum absolute sample magnitude; valid sample
            range is [-bit_depth, bit_depth - 1] (default 2**15 for 16-bit
            audio, matching the previous hard-coded bounds)
        """
        batch_size = batch.size
        max_len = batch.audios["max_samples"]
        act_lengths = batch.audios["n_samples"]

        if placeholders is not None:
            self.placeholders = placeholders
        else:
            self.placeholders = Placeholders(batch_size, max_len)

        self.masks = tf.Variable(
            tf.zeros([batch_size, max_len]),
            trainable=False,
            validate_shape=True,
            dtype=tf.float32,
            name='qq_masks'
        )

        self.synthesiser = synthesiser
        self.opt_vars = synthesiser.opt_vars

        # Generate the delta synth parameter objects which we will optimise
        deltas = synthesiser.synthesise()

        # Mask deltas first so we zero value *any part of the signal* that is
        # zero value padded in the original audio
        deltas *= self.masks

        # Restrict delta to valid space before applying constraints.
        # Parameterised on bit_depth for consistency with the plain delta
        # graph; defaults reproduce the original [-2**15, 2**15 - 1] range.
        lower = -float(bit_depth)
        upper = float(bit_depth) - 1.0

        valid_deltas = tf.clip_by_value(
            deltas,
            clip_value_min=lower,
            clip_value_max=upper
        )

        self.final_deltas = hard_constraint.clip(valid_deltas)

        # clip example to valid range
        self.adversarial_examples = tf.clip_by_value(
            self.final_deltas + self.placeholders.audios,
            clip_value_min=lower,
            clip_value_max=upper
        )

        # initialise static variables
        initial_masks = np_arr(
            lcomp(self._gen_mask(act_lengths, max_len)),
            np.float32
        )

        sess.run(self.masks.assign(initial_masks))
Example #4
0
    def create_optimiser(self):
        """
        Manage the computation of gradients from the loss and the delta variable.

        Computes gradients of the attack loss w.r.t. the delta graph's
        optimisation variables, builds the training op that applies them, and
        records the optimiser's internal slot variables in `self.variables`.
        """
        grad_var = self.optimizer.compute_gradients(
            self.attack.loss_fn,
            self.attack.delta_graph.opt_vars,
            colocate_gradients_with_ops=True)

        # A None gradient means the loss is disconnected from an opt_var.
        assert None not in lcomp(grad_var, i=0)

        self.train = self.optimizer.apply_gradients(grad_var)

        # Single-optimiser case: keyed on 0 for parity with the
        # multi-optimiser variant. (The previous `self.variables = {}` dead
        # assignment at the top of the method has been removed.)
        self.variables = {0: self.optimizer.variables()}

        # NOTE(review): this stores the first (gradient, variable) *pair*,
        # whereas the multi-optimiser variant stores only the gradient tensor
        # (grad_var[0][0]) -- confirm which is intended by downstream readers.
        self.gradients = grad_var[0]
    def __init__(self, sess, batch, bit_depth=2**15):
        """
        Build a batch of raw perturbation (delta) variables, masked to each
        example's true length and clipped to the valid sample range for the
        given bit depth.
        """
        n_examples = batch.size
        longest = batch.audios["max_samples"]
        true_lengths = batch.audios["n_samples"]

        self.__bit_depth = bit_depth
        self.raw_deltas = None
        self.opt_vars = None

        # Static 0/1 masks, one row per example; filled in below.
        masks = tf.Variable(
            tf.zeros([n_examples, longest]),
            trainable=False,
            validate_shape=True,
            dtype=tf.float32,
            name='qq_masks'
        )

        # Generate a batch of delta variables which will be optimised as a batch
        deltas = self.create_perturbations(n_examples, longest)

        # Zero out any delta values that fall within the zero-padded tail of
        # each original audio example.
        deltas = deltas * masks

        # Keep the perturbation inside the representable sample range before
        # any further constraints are applied.
        self.final_deltas = tf.clip_by_value(
            deltas,
            clip_value_min=-self.__bit_depth,
            clip_value_max=self.__bit_depth - 1
        )

        # Fill in the static masks once, from each example's true length.
        initial_masks = np_arr(
            lcomp(self._gen_mask(true_lengths, longest)),
            np.float32
        )
        sess.run(masks.assign(initial_masks))
Example #6
0
    def create_optimiser(self):
        """
        Manage the computation of gradients from the loss and the delta variable.

        One optimiser per example: each computes gradients of the shared
        attack loss w.r.t. its own delta variable only. The per-example
        training ops are grouped into a single `self.train` op, and the
        per-example gradient tensors are stacked along axis 0.
        """
        train_ops = []
        self.variables = {}
        gradients = []

        for idx, opt in enumerate(self.optimizers):

            grad_var = opt.compute_gradients(
                self.attack.loss_fn, [self.attack.delta_graph.opt_vars[idx]],
                colocate_gradients_with_ops=True)

            # A None gradient means the loss is disconnected from this delta.
            assert None not in lcomp(grad_var, i=0)

            # Leftover debug `print(idx, training_op)` removed.
            train_ops.append(opt.apply_gradients(grad_var))
            gradients.append(grad_var[0][0])

            self.variables[idx] = opt.variables()

        self.train = tf.group(train_ops)
        self.gradients = tf.stack(gradients, axis=0)
Example #7
0
def create_tf_ctc_alignment_search_graph(batch,
                                         q,
                                         use_beam_search_decoder=False):
    """
    Search for a CTC-optimal alignment for each example by gradient descent.

    Builds a small TF graph whose only trainable variable is a per-frame
    alignment logits tensor, then minimises the CTC loss of the batch's
    target transcriptions until every example both decodes exactly to its
    target phrase and has CTC loss below 0.1. On success the argmax
    alignments are put onto `q` as a nested list; after `max_iters` failed
    iterations, "dead" is put onto `q` and the process exits with status 5.

    :param batch: batch object exposing `.size`, `.audios` ("max_feats",
        "real_feats") and `.targets` ("tokens", "indices", "lengths",
        "phrases")
    :param q: queue used to hand results back to the parent process
    :param use_beam_search_decoder: decode with beam search when True,
        greedy best-path otherwise
    """
    with tf.Session() as sess:

        targets = tf.placeholder(tf.int32, [batch.size, None],
                                 name='qq_alignment_targets')
        target_lengths = tf.placeholder(tf.int32, [batch.size],
                                        name='qq_alignment_targets_lengths')

        # [batch, frames, tokens] -- one logit per token per feature frame.
        shape = [
            batch.size, batch.audios["max_feats"],
            len(batch.targets["tokens"])
        ]

        # The only trainable variable: raw alignment logits, starting at zero.
        initial_alignments = tf.Variable(tf.zeros(shape),
                                         dtype=tf.float32,
                                         trainable=True,
                                         name='qq_alignment')

        # Non-trainable mask zeroing logits beyond each example's real
        # feature length; populated below from gen_mask.
        mask = tf.Variable(tf.ones(shape),
                           dtype=tf.float32,
                           trainable=False,
                           name='qq_alignment_mask')

        logits_alignments = initial_alignments * mask
        # ctc_loss expects time-major inputs: [frames, batch, tokens].
        raw_alignments = tf.transpose(logits_alignments, [1, 0, 2])
        softmax_alignments = tf.nn.softmax(logits_alignments, axis=-1)
        target_alignments = tf.argmax(softmax_alignments, axis=2)

        per_logit_lengths = batch.audios["real_feats"]
        maxlen = shape[1]

        def gen_mask(per_logit_len, maxlen):
            # Yield one [maxlen, 29] 0/1 mask per example: ones for frames
            # that should be optimised, zeros for padded frames.
            # NOTE(review): 29 is presumably the token-set size -- confirm it
            # matches len(batch.targets["tokens"]).
            # per actual frame
            for l in per_logit_len:
                # per possible frame
                masks = []
                for f in range(maxlen):
                    if l > f:
                        # if should be optimised
                        mask = np.ones([29])
                    else:
                        # shouldn't be optimised
                        mask = np.zeros([29])
                        #mask[28] = 30.0
                    masks.append(mask)
                yield np.asarray(masks)

        initial_masks = np.asarray(
            [m for m in gen_mask(per_logit_lengths, maxlen)], dtype=np.float32)

        sess.run(mask.assign(initial_masks))

        seq_lens = batch.audios["real_feats"]

        # ctc_loss requires the labels as a SparseTensor.
        ctc_target = tf.keras.backend.ctc_label_dense_to_sparse(
            targets, target_lengths)

        loss_fn = tf.nn.ctc_loss(
            labels=ctc_target,
            inputs=raw_alignments,
            sequence_length=seq_lens,
        )

        # Learning rate 1 is aggressive but only the alignment logits are
        # being optimised, not model weights.
        optimizer = tf.train.AdamOptimizer(1)

        grad_var = optimizer.compute_gradients(loss_fn, initial_alignments)
        assert None not in lcomp(grad_var, i=0)

        train_alignment = optimizer.apply_gradients(grad_var)
        variables = optimizer.variables()

        def tf_beam_decode(sess, logits, features_lengths, tokens):
            # Beam-search decode time-major logits; returns decoded strings
            # (trailing spaces stripped) and their log-probabilities.

            tf_decode, log_probs = tf.nn.ctc_beam_search_decoder(
                logits, features_lengths, merge_repeated=False, beam_width=500)
            dense = tf.sparse.to_dense(tf_decode[0])
            tf_dense = sess.run([dense])
            tf_outputs = [
                ''.join([tokens[int(x)] for x in tf_dense[0][i]])
                for i in range(tf_dense[0].shape[0])
            ]

            tf_outputs = [o.rstrip(" ") for o in tf_outputs]

            probs = sess.run(log_probs)
            probs = [prob[0] for prob in probs]
            return tf_outputs, probs

        def tf_greedy_decode(sess,
                             logits,
                             features_lengths,
                             tokens,
                             merge_repeated=True):
            # Greedy (best-path) decode; same return shape as tf_beam_decode
            # but the second element is negative summed logits.

            tf_decode, log_probs = tf.nn.ctc_greedy_decoder(
                logits,
                features_lengths,
                merge_repeated=merge_repeated,
            )
            dense = tf.sparse.to_dense(tf_decode[0])
            tf_dense = sess.run([dense])
            tf_outputs = [
                ''.join([tokens[int(x)] for x in tf_dense[0][i]])
                for i in range(tf_dense[0].shape[0])
            ]

            tf_outputs = [o.rstrip(" ") for o in tf_outputs]

            neg_sum_logits = sess.run(log_probs)
            neg_sum_logits = [prob[0] for prob in neg_sum_logits]
            return tf_outputs, neg_sum_logits

        # Initialise the alignment variable alongside Adam's slot variables.
        variables.append(initial_alignments)

        sess.run(tf.variables_initializer(variables))

        still_have_work = True
        max_iters = 1000
        c = 0

        while still_have_work:

            train_ops = [
                loss_fn, softmax_alignments, logits_alignments, mask,
                train_alignment
            ]

            feed = {
                targets: batch.targets["indices"],
                target_lengths: batch.targets["lengths"],
            }

            ctc_limit, softmax, raw, m, _ = sess.run(train_ops, feed_dict=feed)

            if use_beam_search_decoder is True:
                decodings, probs = tf_beam_decode(sess, raw_alignments,
                                                  batch.audios["real_feats"],
                                                  TOKENS)
            else:
                decodings, probs = tf_greedy_decode(sess, raw_alignments,
                                                    batch.audios["real_feats"],
                                                    TOKENS)

            target_phrases = batch.targets["phrases"]

            # Success requires exact target decodings *and* per-example CTC
            # loss below 0.1. (The `c` in the genexp is scoped to the genexp
            # and does not clobber the iteration counter.)
            decoding_check = all(
                [d == t for d, t in zip(decodings, target_phrases)])
            ctc_check = all(c < 0.1 for c in ctc_limit)

            if decoding_check and ctc_check:
                s = "Found an alignment for each example:"
                for d, p, t in zip(decodings, probs, target_phrases):
                    s += "\nTarget: {t} | Decoding: {d} | Probs: {p:.3f}".format(
                        t=t,
                        d=d,
                        p=p,
                    )
                log(s, wrap=True)
                still_have_work = False

            elif c >= max_iters:
                # Give up: signal the parent via the queue, then kill this
                # worker process.
                log("Could not find any CTC optimal alignments for you...")
                q.put("dead")
                sys.exit(5)
            else:
                c += 1

        # Hand the integer alignment sequences back to the parent.
        q.put(sess.run(target_alignments).tolist())