Code Example #1
 def next_batch(self):
     '''
     Draw the next batch from the combined switchable queue.
     '''
     source, source_lengths, target, target_lengths = self._queue.dequeue_many(self._model_feeder.ph_batch_size)
     sparse_labels = ctc_label_dense_to_sparse(target, target_lengths, self._model_feeder.ph_batch_size)
     return source, source_lengths, sparse_labels
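The helper demonstrated throughout this page, ctc_label_dense_to_sparse, converts a zero-padded dense label matrix into the tf.SparseTensor that tf.nn.ctc_loss expects; the three-argument variant used in these examples is DeepSpeech's utility, which additionally takes the batch size. A minimal sketch of the same conversion, using the two-argument tf.keras.backend equivalent (an assumption for illustration, not the DeepSpeech helper itself; TF 1.x):

    import numpy as np
    import tensorflow as tf

    # Dense, zero-padded labels for a batch of two transcripts.
    dense_labels = np.array([[1, 2, 3, 0],
                             [4, 5, 0, 0]], dtype=np.int32)
    label_lengths = np.array([3, 2], dtype=np.int32)

    # The Keras backend ships a two-argument version of the conversion.
    sparse = tf.keras.backend.ctc_label_dense_to_sparse(dense_labels, label_lengths)

    with tf.Session() as sess:
        st = sess.run(sparse)
        print(st.indices)  # [[0 0] [0 1] [0 2] [1 0] [1 1]]
        print(st.values)   # [1 2 3 4 5]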
Code Example #2
    def setup_graph(self, input_audio_batch, target_phrase):
        batch_size = input_audio_batch.shape[0]
        weird = (input_audio_batch.shape[1] - 1) // 320
        logits_arg2 = np.tile(weird, batch_size)
        dense_arg1 = np.array(np.tile(target_phrase, (batch_size, 1)),
                              dtype=np.int32)
        dense_arg2 = np.array(np.tile(target_phrase.shape[0], batch_size),
                              dtype=np.int32)

        pass_in = np.clip(input_audio_batch, -2**15, 2**15 - 1)
        seq_len = np.tile(weird, batch_size).astype(np.int32)

        with tf.variable_scope('', reuse=tf.AUTO_REUSE):
            inputs = tf.placeholder(tf.float32, shape=pass_in.shape, name='a')
            len_batch = tf.placeholder(tf.float32, name='b')
            arg2_logits = tf.placeholder(tf.int32,
                                         shape=logits_arg2.shape,
                                         name='c')
            arg1_dense = tf.placeholder(tf.float32,
                                        shape=dense_arg1.shape,
                                        name='d')
            arg2_dense = tf.placeholder(tf.int32,
                                        shape=dense_arg2.shape,
                                        name='e')
            len_seq = tf.placeholder(tf.int32, shape=seq_len.shape, name='f')

            logits = get_logits(inputs, arg2_logits)
            target = ctc_label_dense_to_sparse(arg1_dense, arg2_dense,
                                               len_batch)
            ctcloss = tf.nn.ctc_loss(labels=tf.cast(target, tf.int32),
                                     inputs=logits,
                                     sequence_length=len_seq)
            decoded, _ = tf.nn.ctc_greedy_decoder(logits,
                                                  arg2_logits,
                                                  merge_repeated=True)

            sess = tf.Session()
            saver = tf.train.Saver(tf.global_variables())
            saver.restore(sess, "models/session_dump")

        func1 = lambda a, b, c, d, e, f: sess.run(ctcloss,
                                                  feed_dict={
                                                      inputs: a,
                                                      len_batch: b,
                                                      arg2_logits: c,
                                                      arg1_dense: d,
                                                      arg2_dense: e,
                                                      len_seq: f
                                                  })
        func2 = lambda a, b, c, d, e, f: sess.run(
            [ctcloss, decoded],
            feed_dict={
                inputs: a,
                len_batch: b,
                arg2_logits: c,
                arg1_dense: d,
                arg2_dense: e,
                len_seq: f
            })
        return (func1, func2)
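A hedged sketch of how the pair of closures returned above might be driven; the attack object, audio batch, and phrase below are placeholders for illustration, and the argument order follows the placeholder names a through f:

    # Hypothetical driver for the (func1, func2) pair returned by setup_graph.
    ctc_fn, decode_fn = attack.setup_graph(input_audio_batch, target_phrase)

    batch_size = input_audio_batch.shape[0]
    weird = (input_audio_batch.shape[1] - 1) // 320
    args = (np.clip(input_audio_batch, -2**15, 2**15 - 1),   # a: clipped audio
            batch_size,                                      # b: batch size
            np.tile(weird, batch_size),                      # c: logit lengths
            np.tile(target_phrase, (batch_size, 1)),         # d: dense targets
            np.tile(target_phrase.shape[0], batch_size),     # e: target lengths
            np.tile(weird, batch_size).astype(np.int32))     # f: sequence lengths

    losses = ctc_fn(*args)              # per-example CTC loss
    losses, decoded = decode_fn(*args)  # loss plus a greedy decode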
Code Example #3
File: feeding.py  Project: RawStewage/DeepSpeech
 def next_batch(self):
     '''
     Draw the next batch from the combined switchable queue.
     '''
     source, source_lengths, target, target_lengths = self._queue.dequeue_many(self._model_feeder.ph_batch_size)
     sparse_labels = ctc_label_dense_to_sparse(target, target_lengths, self._model_feeder.ph_batch_size)
     return source, source_lengths, sparse_labels
Code Example #4
File: evaluate.py  Project: revirevy/DeepSpeech
def main(_):
    initialize_globals()

    if not FLAGS.test_files:
        log_error('You need to specify what files to use for evaluation via '
                  'the --test_files flag.')
        exit(1)

    global alphabet
    alphabet = Alphabet(FLAGS.alphabet_config_path)

    scorer = Scorer(FLAGS.lm_weight, FLAGS.valid_word_count_weight,
                    FLAGS.lm_binary_path, FLAGS.lm_trie_path,
                    alphabet)

    # sort examples by length, improves packing of batches and timesteps
    test_data = preprocess(
        FLAGS.test_files.split(','),
        FLAGS.test_batch_size,
        alphabet=alphabet,
        numcep=N_FEATURES,
        numcontext=N_CONTEXT,
        hdf5_cache_path=FLAGS.hdf5_test_set).sort_values(
        by="features_len",
        ascending=False)

    def create_windows(features):
        num_strides = len(features) - (N_CONTEXT * 2)

        # Create a view into the array with overlapping strides of size
        # numcontext (past) + 1 (present) + numcontext (future)
        window_size = 2*N_CONTEXT+1
        features = np.lib.stride_tricks.as_strided(
            features,
            (num_strides, window_size, N_FEATURES),
            (features.strides[0], features.strides[0], features.strides[1]),
            writeable=False)

        return features

    # Create overlapping windows over the features
    test_data['features'] = test_data['features'].apply(create_windows)

    with tf.Session() as session:
        inputs, outputs, layers = create_inference_graph(batch_size=FLAGS.test_batch_size, n_steps=-1)

        # Transpose to batch major for decoder
        transposed = tf.transpose(outputs['outputs'], [1, 0, 2])

        labels_ph = tf.placeholder(tf.int32, [FLAGS.test_batch_size, None], name="labels")
        label_lengths_ph = tf.placeholder(tf.int32, [FLAGS.test_batch_size], name="label_lengths")

        sparse_labels = tf.cast(ctc_label_dense_to_sparse(labels_ph, label_lengths_ph, FLAGS.test_batch_size), tf.int32)
        loss = tf.nn.ctc_loss(labels=sparse_labels,
                              inputs=layers['raw_logits'],
                              sequence_length=inputs['input_lengths'])

        # Create a saver using variables from the above newly created graph
        mapping = {v.op.name: v for v in tf.global_variables() if not v.op.name.startswith('previous_state_')}
        saver = tf.train.Saver(mapping)

        # Restore variables from training checkpoint
        checkpoint = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
        if not checkpoint:
            log_error('Checkpoint directory ({}) does not contain a valid checkpoint state.'.format(FLAGS.checkpoint_dir))
            exit(1)

        checkpoint_path = checkpoint.model_checkpoint_path
        saver.restore(session, checkpoint_path)

        logitses = []
        losses = []

        print('Computing acoustic model predictions...')
        batch_count = len(test_data) // FLAGS.test_batch_size
        bar = progressbar.ProgressBar(max_value=batch_count,
                                      widget=progressbar.AdaptiveETA)

        # First pass, compute losses and transposed logits for decoding
        for batch in bar(split_data(test_data, FLAGS.test_batch_size)):
            session.run(outputs['initialize_state'])

            features = pad_to_dense(batch['features'].values)
            features_len = batch['features_len'].values
            labels = pad_to_dense(batch['transcript'].values)
            label_lengths = batch['transcript_len'].values

            logits, loss_ = session.run([transposed, loss], feed_dict={
                inputs['input']: features,
                inputs['input_lengths']: features_len,
                labels_ph: labels,
                label_lengths_ph: label_lengths
            })

            logitses.append(logits)
            losses.extend(loss_)

        ground_truths = []
        predictions = []
        distances = []

        print('Decoding predictions...')
        bar = progressbar.ProgressBar(max_value=batch_count,
                                      widget=progressbar.AdaptiveETA)

        # Get number of accessible CPU cores for this process
        num_processes = len(os.sched_getaffinity(0))

        # Second pass, decode logits and compute WER and edit distance metrics
        for logits, batch in bar(zip(logitses, split_data(test_data, FLAGS.test_batch_size))):
            seq_lengths = batch['features_len'].values.astype(np.int32)
            decoded = ctc_beam_search_decoder_batch(logits, seq_lengths, alphabet, FLAGS.beam_width,
                                                    num_processes=num_processes, scorer=scorer)

            batch_ground_truths = [alphabet.decode(l) for l in batch['transcript']]
            batch_predictions = [d[0][1] for d in decoded]
            ground_truths.extend(batch_ground_truths)
            predictions.extend(batch_predictions)
            distances.extend(levenshtein(a, b)
                             for a, b in zip(batch_ground_truths, batch_predictions))

    wer, samples = calculate_report(ground_truths, predictions, distances, losses)
    mean_edit_distance = np.mean(distances)
    mean_loss = np.mean(losses)

    # Take only the first report_count items
    report_samples = itertools.islice(samples, FLAGS.report_count)

    print('Test - WER: %f, loss: %f, mean edit distance: %f' %
          (wer, mean_loss, mean_edit_distance))
    print('-' * 80)
    for sample in report_samples:
        print('WER: %f, loss: %f, edit distance: %f' %
              (sample.wer, sample.loss, sample.distance))
        print(' - src: "%s"' % sample.src)
        print(' - res: "%s"' % sample.res)
        print('-' * 80)

    if FLAGS.test_output_file:
        json.dump(samples, open(FLAGS.test_output_file, 'w'), default=lambda x: float(x))
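The create_windows helper above builds overlapping context windows without copying data by reusing the time-axis stride twice. A standalone toy version of the same stride trick (the small N_CONTEXT/N_FEATURES values are assumptions for illustration):

    import numpy as np

    N_CONTEXT, N_FEATURES = 2, 3
    features = np.arange(10 * N_FEATURES, dtype=np.float32).reshape(10, N_FEATURES)

    num_strides = len(features) - 2 * N_CONTEXT
    window_size = 2 * N_CONTEXT + 1  # past + present + future frames

    # Row i of the result is a read-only view of feature rows i .. i+window_size-1.
    windows = np.lib.stride_tricks.as_strided(
        features,
        (num_strides, window_size, N_FEATURES),
        (features.strides[0], features.strides[0], features.strides[1]),
        writeable=False)

    print(windows.shape)     # (6, 5, 3)
    print(windows[0, :, 0])  # [ 0.  3.  6.  9. 12.]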
Code Example #5
 def next_batch(self):
     source, source_lengths, target, target_lengths = self._example_queue.dequeue_many(
         self._batch_size)
     sparse_labels = ctc_label_dense_to_sparse(target, target_lengths,
                                               self._batch_size)
     return source, source_lengths, sparse_labels
Code Example #6
File: evaluate.py  Project: PiotrowskiD/DeepSpeech
def evaluate(test_data, inference_graph, alphabet):
    scorer = Scorer(FLAGS.lm_alpha, FLAGS.lm_beta,
                    FLAGS.lm_binary_path, FLAGS.lm_trie_path,
                    Config.alphabet)


    def create_windows(features):
        num_strides = len(features) - (Config.n_context * 2)

        # Create a view into the array with overlapping strides of size
        # numcontext (past) + 1 (present) + numcontext (future)
        window_size = 2*Config.n_context+1
        features = np.lib.stride_tricks.as_strided(
            features,
            (num_strides, window_size, Config.n_input),
            (features.strides[0], features.strides[0], features.strides[1]),
            writeable=False)

        return features

    # Create overlapping windows over the features
    test_data['features'] = test_data['features'].apply(create_windows)

    with tf.Session(config=Config.session_config) as session:
        inputs, outputs, layers = inference_graph

        # Transpose to batch major for decoder
        transposed = tf.transpose(outputs['outputs'], [1, 0, 2])

        labels_ph = tf.placeholder(tf.int32, [FLAGS.test_batch_size, None], name="labels")
        label_lengths_ph = tf.placeholder(tf.int32, [FLAGS.test_batch_size], name="label_lengths")

        sparse_labels = tf.cast(ctc_label_dense_to_sparse(labels_ph, label_lengths_ph, FLAGS.test_batch_size), tf.int32)
        loss = tf.nn.ctc_loss(labels=sparse_labels,
                              inputs=layers['raw_logits'],
                              sequence_length=inputs['input_lengths'])

        # Create a saver using variables from the above newly created graph
        mapping = {v.op.name: v for v in tf.global_variables() if not v.op.name.startswith('previous_state_')}
        saver = tf.train.Saver(mapping)

        # Restore variables from training checkpoint
        if FLAGS.checkpoint_dir is not None:
            checkpoint = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
            if not checkpoint:
                log_error('Checkpoint directory ({}) does not contain a valid checkpoint state.'.format(FLAGS.checkpoint_dir))
                exit(1)

            checkpoint_path = checkpoint.model_checkpoint_path
            saver.restore(session, checkpoint_path)

        logitses = []
        losses = []

        print('Computing acoustic model predictions...')
        batch_count = len(test_data) // FLAGS.test_batch_size
        bar = progressbar.ProgressBar(max_value=batch_count,
                                      widget=progressbar.AdaptiveETA)

        # First pass, compute losses and transposed logits for decoding
        for batch in bar(split_data(test_data, FLAGS.test_batch_size)):
            session.run(outputs['initialize_state'])

            features = pad_to_dense(batch['features'].values)
            features_len = batch['features_len'].values
            labels = pad_to_dense(batch['transcript'].values)
            label_lengths = batch['transcript_len'].values

            logits, loss_ = session.run([transposed, loss], feed_dict={
                inputs['input']: features,
                inputs['input_lengths']: features_len,
                labels_ph: labels,
                label_lengths_ph: label_lengths
            })

            logitses.append(logits)
            losses.extend(loss_)

    ground_truths = []
    predictions = []

    print('Decoding predictions...')
    bar = progressbar.ProgressBar(max_value=batch_count,
                                  widget=progressbar.AdaptiveETA)

    # Get number of accessible CPU cores for this process
    try:
        num_processes = cpu_count()
    except NotImplementedError:  # cpu_count() can be unavailable on some platforms
        num_processes = 1

    # Second pass, decode logits and compute WER and edit distance metrics
    for logits, batch in bar(zip(logitses, split_data(test_data, FLAGS.test_batch_size))):
        seq_lengths = batch['features_len'].values.astype(np.int32)
        decoded = ctc_beam_search_decoder_batch(logits, seq_lengths, alphabet, FLAGS.beam_width,
                                                num_processes=num_processes, scorer=scorer)

        ground_truths.extend(alphabet.decode(l) for l in batch['transcript'])
        predictions.extend(d[0][1] for d in decoded)

    distances = [levenshtein(a, b) for a, b in zip(ground_truths, predictions)]

    wer, samples = calculate_report(ground_truths, predictions, distances, losses)
    mean_edit_distance = np.mean(distances)
    mean_loss = np.mean(losses)

    # Take only the first report_count items
    report_samples = itertools.islice(samples, FLAGS.report_count)

    print('Test - WER: %f, CER: %f, loss: %f' %
          (wer, mean_edit_distance, mean_loss))
    print('-' * 80)
    for sample in report_samples:
        print('WER: %f, CER: %f, loss: %f' %
              (sample.wer, sample.distance, sample.loss))
        print(' - src: "%s"' % sample.src)
        print(' - res: "%s"' % sample.res)
        print('-' * 80)

    return samples
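The levenshtein used for the distance metric above is imported from DeepSpeech's text utilities; a minimal pure-Python equivalent, for reference only:

    def levenshtein(a, b):
        """Edit distance between sequences a and b (two-row dynamic programming)."""
        if len(a) < len(b):
            a, b = b, a
        prev = list(range(len(b) + 1))
        for i, ca in enumerate(a, 1):
            cur = [i]
            for j, cb in enumerate(b, 1):
                cur.append(min(prev[j] + 1,                # deletion
                               cur[j - 1] + 1,             # insertion
                               prev[j - 1] + (ca != cb)))  # substitution
            prev = cur
        return prev[-1]

    assert levenshtein("kitten", "sitting") == 3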
Code Example #7
    def __init__(self,
                 sess,
                 loss_fn,
                 phrase_length,
                 max_audio_len,
                 learning_rate=10,
                 num_iterations=1000,
                 batch_size=1):
        """
        Set up the attack procedure.

        Here we create the TF graph that we're going to use to
        actually generate the adversarial examples.
        """

        self.sess = sess
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.batch_size = batch_size
        self.phrase_length = phrase_length
        self.max_audio_len = max_audio_len

        # Create all the variables necessary
        # they are prefixed with qq_ just so that we know which
        # ones are ours so when we restore the session we don't
        # clobber them.
        self.delta = delta = tf.Variable(np.zeros((batch_size, max_audio_len),
                                                  dtype=np.float32),
                                         name='qq_delta')
        self.mask = mask = tf.Variable(np.zeros((batch_size, max_audio_len),
                                                dtype=np.float32),
                                       name='qq_mask')
        self.cwmask = cwmask = tf.Variable(np.zeros(
            (batch_size, phrase_length), dtype=np.float32),
                                           name='qq_cwmask')
        self.original = original = tf.Variable(np.zeros(
            (batch_size, max_audio_len), dtype=np.float32),
                                               name='qq_original')
        self.lengths = lengths = tf.Variable(np.zeros(batch_size,
                                                      dtype=np.int32),
                                             name='qq_lengths')
        self.importance = tf.Variable(np.zeros((batch_size, phrase_length),
                                               dtype=np.float32),
                                      name='qq_importance')
        self.target_phrase = tf.Variable(np.zeros((batch_size, phrase_length),
                                                  dtype=np.int32),
                                         name='qq_phrase')
        self.target_phrase_lengths = tf.Variable(np.zeros((batch_size),
                                                          dtype=np.int32),
                                                 name='qq_phrase_lengths')
        self.rescale = tf.Variable(np.zeros((batch_size, 1), dtype=np.float32),
                                   name='qq_rescale')

        # Initially we bound the l_infty norm by 2000, increase this
        # constant if it's not big enough of a distortion for your dataset.
        self.apply_delta = tf.clip_by_value(delta, -2000, 2000) * self.rescale

        # We set the new input to the model to be the above delta
        # plus a mask, which allows us to enforce that certain
        # values remain constant 0 for length padding sequences.
        self.new_input = new_input = self.apply_delta * mask + original

        # We add a tiny bit of noise to help make sure that we can
        # clip our values to 16-bit integers and not break things.
        noise = tf.random_normal(new_input.shape, stddev=2)
        pass_in = tf.clip_by_value(new_input + noise, -2**15, 2**15 - 1)

        # Feed this final value to get the logits.
        self.logits = logits = get_logits(pass_in, lengths)

        # And finally restore the graph to make the classifier
        # actually do something interesting.
        saver = tf.train.Saver(
            [x for x in tf.global_variables() if 'qq' not in x.name])
        saver.restore(sess, "models/session_dump")

        # Choose the loss function we want -- either CTC or CW
        self.loss_fn = loss_fn
        if loss_fn == "CTC":
            target = ctc_label_dense_to_sparse(self.target_phrase,
                                               self.target_phrase_lengths,
                                               batch_size)

            ctcloss = tf.nn.ctc_loss(labels=tf.cast(target, tf.int32),
                                     inputs=logits,
                                     sequence_length=lengths)

            loss = tf.nn.relu(ctcloss)
            self.expanded_loss = tf.constant(0)

        elif loss_fn == "CW":
            raise NotImplementedError(
                "The current version of this project does not include the CW loss function implementation."
            )
        else:
            raise ValueError("Unknown loss function: {}".format(loss_fn))

        # Set up the Adam optimizer to perform gradient descent for us
        var_start = tf.global_variables()
        self.train = tf.train.AdamOptimizer(learning_rate).minimize(
            loss, var_list=[delta])
        self.loss = loss
        self.ctcloss = ctcloss

        var_end = tf.global_variables()
        new_vars = [
            x for x in var_end if x.name not in [y.name for y in var_start]
        ]
        sess.run(tf.variables_initializer(new_vars + [delta]))

        # Decoder from the logits, to see how we're doing
        self.decoded, _ = tf.nn.ctc_beam_search_decoder(logits,
                                                        lengths,
                                                        merge_repeated=False,
                                                        beam_width=1000)
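A rough sketch of how an attack object built from the constructor above might be driven (class name Attack assumed, matching Example #9); every name outside the class, such as audio_batch, padding_mask, and target_ids, is an assumption for illustration:

    # Hypothetical outer loop: load the qq_ variables, then descend on delta.
    attack = Attack(sess, "CTC", phrase_length, max_audio_len, batch_size=1)
    sess.run([attack.original.assign(audio_batch),
              attack.lengths.assign((audio_lengths - 1) // 320),  # logit lengths, as in Example #2
              attack.mask.assign(padding_mask),
              attack.target_phrase.assign(target_ids),
              attack.target_phrase_lengths.assign(target_lengths),
              attack.rescale.assign(np.ones((1, 1), dtype=np.float32))])

    for step in range(attack.num_iterations):
        _, ctc = sess.run([attack.train, attack.ctcloss])
        if step % 100 == 0:
            print(step, ctc)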
Code Example #8
File: evaluate.py  Project: gmyzc520/DeepSpeech-1
def main(_):
    initialize_globals()

    if not FLAGS.test_files:
        log_error('You need to specify what files to use for evaluation via '
                  'the --test_files flag.')
        exit(1)

    global alphabet
    alphabet = Alphabet(os.path.abspath(FLAGS.alphabet_config_path))

    # sort examples by length, improves packing of batches and timesteps
    test_data = preprocess(FLAGS.test_files.split(','),
                           FLAGS.test_batch_size,
                           alphabet=alphabet,
                           numcep=N_FEATURES,
                           numcontext=N_CONTEXT,
                           hdf5_cache_path=FLAGS.hdf5_test_set).sort_values(
                               by="features_len", ascending=False)

    def create_windows(features):
        num_strides = len(features) - (N_CONTEXT * 2)

        # Create a view into the array with overlapping strides of size
        # numcontext (past) + 1 (present) + numcontext (future)
        window_size = 2 * N_CONTEXT + 1
        features = np.lib.stride_tricks.as_strided(
            features, (num_strides, window_size, N_FEATURES),
            (features.strides[0], features.strides[0], features.strides[1]),
            writeable=False)

        return features

    test_data['features'] = test_data['features'].apply(create_windows)

    with tf.Session() as session:
        inputs, outputs = create_inference_graph(
            batch_size=FLAGS.test_batch_size, n_steps=N_STEPS)

        seq_lengths_ph = tf.placeholder(tf.int32, [FLAGS.test_batch_size])
        decode_logits_ph = tf.placeholder(
            tf.float32, [None, FLAGS.test_batch_size,
                         alphabet.size() + 1])
        labels_ph = tf.placeholder(tf.int32, [FLAGS.test_batch_size, None])
        label_lengths_ph = tf.placeholder(tf.int32, [FLAGS.test_batch_size])

        decoded, _ = decode_with_lm(decode_logits_ph,
                                    seq_lengths_ph,
                                    merge_repeated=False,
                                    beam_width=FLAGS.beam_width)

        sparse_labels = tf.cast(
            ctc_label_dense_to_sparse(labels_ph, label_lengths_ph,
                                      FLAGS.test_batch_size), tf.int32)
        loss = tf.nn.ctc_loss(labels=sparse_labels,
                              inputs=decode_logits_ph,
                              sequence_length=seq_lengths_ph)

        distance = tf.edit_distance(tf.cast(decoded[0], tf.int32),
                                    sparse_labels)

        # Create a saver using variables from the above newly created graph
        mapping = {
            v.op.name: v
            for v in tf.global_variables()
            if not v.op.name.startswith('previous_state_')
        }
        saver = tf.train.Saver(mapping)

        # Restore variables from training checkpoint
        checkpoint = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
        if not checkpoint:
            log_error(
                'Checkpoint directory ({}) does not contain a valid checkpoint state.'
                .format(FLAGS.checkpoint_dir))
            exit(1)

        checkpoint_path = checkpoint.model_checkpoint_path
        saver.restore(session, checkpoint_path)

        logitses = []

        batch_count = len(test_data) // FLAGS.test_batch_size
        bar = progressbar.ProgressBar(max_value=batch_count - 1,
                                      widget=progressbar.AdaptiveETA)

        for batch in bar(split_data(test_data, FLAGS.test_batch_size)):
            session.run(outputs['initialize_state'])

            batch_features = pad_to_dense(batch['features'].values)
            batch_features_len = batch['features_len'].values
            full_step_len = np.full_like(batch_features_len, N_STEPS)

            logits = np.empty([0, FLAGS.test_batch_size, alphabet.size() + 1])
            for i in range(0, batch_features.shape[1], N_STEPS):
                chunk_features = batch_features[:, i:i + N_STEPS, :, :]
                chunk_features_len = np.minimum(batch_features_len,
                                                full_step_len)

                # pad with zeros if the chunk does not have enough steps
                steps_in_chunk = chunk_features.shape[1]
                if steps_in_chunk < FLAGS.n_steps:
                    chunk_features = np.pad(
                        chunk_features,
                        ((0, 0), (0, FLAGS.n_steps - steps_in_chunk), (0, 0),
                         (0, 0)),
                        mode='constant',
                        constant_values=0)

                output = session.run(outputs['outputs'],
                                     feed_dict={
                                         inputs['input']:
                                         chunk_features,
                                         inputs['input_lengths']:
                                         chunk_features_len,
                                     })
                logits = np.concatenate((logits, output))

                # we have processed N_STEPS so subtract from remaining steps
                batch_features_len -= N_STEPS
                # clip to zero
                batch_features_len = np.maximum(
                    batch_features_len, np.zeros_like(batch_features_len))

            logitses.append(logits)

        ground_truths = []
        predictions = []
        distances = []
        losses = []

        bar = progressbar.ProgressBar(max_value=batch_count - 1,
                                      widget=progressbar.AdaptiveETA)

        for logits, batch in bar(
                zip(logitses, split_data(test_data, FLAGS.test_batch_size))):
            seq_lengths = batch['features_len'].values
            labels = pad_to_dense(batch['transcript'].values)
            label_lengths = batch['transcript_len'].values

            decoded_, loss_, distance_, sparse_labels_ = session.run(
                [decoded, loss, distance, sparse_labels],
                feed_dict={
                    decode_logits_ph: logits,
                    seq_lengths_ph: seq_lengths,
                    labels_ph: labels,
                    label_lengths_ph: label_lengths
                })

            ground_truths.extend(
                sparse_tensor_value_to_texts(sparse_labels_, alphabet))
            predictions.extend(
                sparse_tensor_value_to_texts(decoded_[0], alphabet))
            distances.extend(distance_)
            losses.extend(loss_)

    wer, samples = calculate_report(ground_truths, predictions, distances,
                                    losses)
    mean_edit_distance = np.mean(distances)
    mean_loss = np.mean(losses)

    # Filter out all items with WER=0 and take only the first report_count items
    report_samples = itertools.islice((s for s in samples if s.wer > 0),
                                      FLAGS.report_count)

    print('Test - WER: %f, loss: %f, mean edit distance: %f' %
          (wer, mean_loss, mean_edit_distance))
    print('-' * 80)
    for sample in report_samples:
        print('WER: %f, loss: %f, edit distance: %f' %
              (sample.wer, sample.loss, sample.distance))
        print(' - src: "%s"' % sample.src)
        print(' - res: "%s"' % sample.res)
        print('-' * 80)

    if FLAGS.test_output_file:
        json.dump(samples,
                  open(FLAGS.test_output_file, 'w'),
                  default=lambda x: float(x))
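The first pass above runs inference in fixed N_STEPS chunks and zero-pads the final short chunk. The slicing and padding in isolation, on a toy array (shapes are assumptions):

    import numpy as np

    N_STEPS = 16
    feats = np.random.rand(2, 50, 9, 26)  # (batch, time, window, features); toy shape

    chunks = []
    for i in range(0, feats.shape[1], N_STEPS):
        chunk = feats[:, i:i + N_STEPS]
        if chunk.shape[1] < N_STEPS:  # zero-pad the final, short chunk
            pad = ((0, 0), (0, N_STEPS - chunk.shape[1]), (0, 0), (0, 0))
            chunk = np.pad(chunk, pad, mode='constant', constant_values=0)
        chunks.append(chunk)

    print(len(chunks), chunks[-1].shape)  # 4 (2, 16, 9, 26)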
Code Example #9
class Attack:
    def __init__(self, sess, loss_fn, phrase_length, max_audio_len, psdMaxes,
                     learning_rate=10, num_iterations=5000, window_size=2048,
                     step_per_window=4, batch_size=1, mp3=False, 
                     onlyCTC=True, audio=None, psdShape=None):
        """
        Set up the attack procedure.

        Here we create the TF graph that we're going to use to
        actually generate the adversarial examples.
        """
        
        self.sess = sess
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.batch_size = batch_size
        self.phrase_length = phrase_length
        self.max_audio_len = max_audio_len
        self.mp3 = mp3
        self.psdMaxes = psdMaxes
        self.window_size = window_size
        self.step_per_window = step_per_window
        
        # Create all the variables necessary
        # they are prefixed with qq_ just so that we know which
        # ones are ours so when we restore the session we don't
        # clobber them.
        
        frame_length = int(window_size)
        frame_step = int(window_size//step_per_window)
        fft_length = int(2**np.ceil(np.log2(frame_length)))
        sample_rate = 16000
        freq_res = sample_rate/window_size
        time_res = frame_step/(sample_rate/1000)
        sigma_time = 96. / time_res
        sigma_freq = 15.625 / freq_res
        
        self.regularizer = regularizer = tf.Variable(np.zeros((batch_size), dtype=np.float32), name='qq_regularizer')
        self.psyTh = psyTh = tf.Variable(np.zeros((batch_size, psdShape[0], psdShape[1]), dtype=np.float32), name='qq_psyTh')
        
        self.delta = delta = tf.Variable(np.zeros((batch_size, max_audio_len)).astype(np.float32)/2, name='qq_delta')
        self.mask = mask = tf.Variable(np.zeros((batch_size, max_audio_len), dtype=np.float32), name='qq_mask')
        self.original = original = tf.Variable(np.zeros((batch_size, max_audio_len), dtype=np.float32), name='qq_original')
        self.lengths = lengths = tf.Variable(np.zeros(batch_size, dtype=np.int32), name='qq_lengths')
        self.target_phrase = tf.Variable(np.zeros((batch_size, phrase_length), dtype=np.int32), name='qq_phrase')
        self.target_phrase_lengths = tf.Variable(np.zeros((batch_size), dtype=np.int32), name='qq_phrase_lengths')
        self.rescale = tf.Variable(np.zeros((batch_size,1), dtype=np.float32), name='qq_rescale')

        # Initially we bound the l_infty norm by 2000, increase this
        # constant if it's not big enough of a distortion for your dataset.
         
        if(loss_fn == 'CTC'):
            self.apply_delta = tf.clip_by_value(delta, -2000, 2000)*self.rescale
        elif(loss_fn == 'CTCPSYCLIP'):
            
            self.apply_delta = apply_delta = self.clipBatch(delta, psyTh, regularizer, psdMaxes, max_audio_len, window_size, step_per_window)
            
            self.new_input = new_input = self.apply_delta*mask + original
            
            #self.new_input = new_input = delta*mask + original
            
        # We set the new input to the model to be the above delta
        # plus a mask, which allows us to enforce that certain
        # values remain constant 0 for length padding sequences.
        
        if(loss_fn == 'CTC'):
            self.new_input = new_input = self.apply_delta*mask + original
        if(loss_fn == 'CTCPSYGRAD'):
            self.new_input = new_input = self.delta*mask + original

        # We add a tiny bit of noise to help make sure that we can
        # clip our values to 16-bit integers and not break things.
        if(loss_fn == 'CTC'):
            noise = tf.random_normal(new_input.shape,
                                     stddev=2)
            pass_in = tf.clip_by_value(new_input+noise, -2**15, 2**15-1)
 
        # Feed this final value to get the logits.
        self.logits = logits = get_logits(new_input, lengths)

        # And finally restore the graph to make the classifier
        # actually do something interesting.
        saver = tf.train.Saver([x for x in tf.global_variables() if 'qq' not in x.name])
        saver.restore(sess, "models/session_dump")

        self.loss_fn = loss_fn
        
        
        if loss_fn == "CTC":
            target = ctc_label_dense_to_sparse(self.target_phrase, self.target_phrase_lengths, batch_size)
            
            ctcLoss = tf.nn.ctc_loss(labels=tf.cast(target, tf.int32),
                                     inputs=logits, sequence_length=lengths)

            # Slight hack: an infinite l2 penalty means that we don't penalize l2 distortion
            # The code runs faster at a slight cost of distortion, and also leaves one less
            # parameter that requires tuning.
            if not onlyCTC:
                loss = tf.reduce_mean((self.new_input-self.original)**2,axis=1)/regularizer + ctcLoss
            else:
                loss = ctcLoss
            self.expanded_loss = tf.constant(0)
Code Example #10
        if loss_fn == "CTC":
            target = ctc_label_dense_to_sparse(self.target_phrase, self.target_phrase_lengths, batch_size)
            
            ctcLoss = tf.nn.ctc_loss(labels=tf.cast(target, tf.int32),
                                     inputs=logits, sequence_length=lengths)

            # Slight hack: an infinite l2 penalty means that we don't penalize l2 distortion
            # The code runs faster at a slight cost of distortion, and also leaves one less
            # parameter that requires tuning.
            if not onlyCTC:
                loss = tf.reduce_mean((self.new_input-self.original)**2,axis=1)/regularizer + ctcLoss
            else:
                loss = ctcLoss
            self.expanded_loss = tf.constant(0)
        elif loss_fn == "CTCPSYCLIP":
            target = ctc_label_dense_to_sparse(self.target_phrase, self.target_phrase_lengths, batch_size)
            
            ctcLoss = tf.nn.ctc_loss(labels=tf.cast(target, tf.int32),
                                     inputs=logits, sequence_length=lengths)
            loss = ctcLoss
            self.expanded_loss = tf.constant(0)
        elif loss_fn == "CW":
            raise NotImplemented("The current version of this project does not include the CW loss function implementation.")
        else:
            raise
            
        self.deltaPSD = deltaPSD = tfPSD(self.new_input-self.original, window_size, step_per_window, self.psdMaxes)
        psyLoss = tf.reduce_max(deltaPSD - self.psyTh, axis=[1,2])
        self.loss = loss
        self.psyLoss = tf.transpose(psyLoss)
        self.ctcLoss = ctcLoss
Code Example #11
    def __init__(self,
                 sess,
                 phrase_length,
                 max_audio_len,
                 psdMaxes,
                 learning_rate=10,
                 num_iterations=5000,
                 window_size=256,
                 step_per_window=2,
                 batch_size=1,
                 mp3=False,
                 delta=None,
                 audio=None,
                 psdShape=None):
        """
        Set up the attack procedure.

        Here we create the TF graph that we're going to use to
        actually generate the adversarial examples.
        """

        self.sess = sess
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.batch_size = batch_size
        self.phrase_length = phrase_length
        self.max_audio_len = max_audio_len
        self.mp3 = mp3
        self.psdMaxes = psdMaxes
        self.window_size = window_size
        self.step_per_window = step_per_window

        # Create all the variables necessary
        # they are prefixed with qq_ just so that we know which
        # ones are ours so when we restore the session we don't
        # clobber them.

        frame_length = int(window_size)
        frame_step = int(window_size // step_per_window)
        fft_length = int(2**np.ceil(np.log2(frame_length)))
        sample_rate = 16000  # datapoints per second
        freq_res = sample_rate / window_size
        # sample_rate/2 is the maximal recorded frequency,
        # We have window_size/2+1 frequencies
        time_res = frame_step / (sample_rate / 1000)
        # (sample_rate/1000) = samples per millisecond
        # frame_step/(sample_rate/1000) => milliseconds for one step

        self.regularizer = regularizer = tf.Variable(np.zeros(
            (batch_size), dtype=np.float32),
                                                     name='qq_regularizer')
        self.psyTh = psyTh = tf.Variable(np.zeros(
            (batch_size, psdShape[0], psdShape[1]), dtype=np.float32),
                                         name='qq_psyTh')

        if (delta is None):
            self.delta = delta = tf.Variable(np.zeros(
                (batch_size, max_audio_len)).astype(np.float32) / 2,
                                             name='qq_delta')
        else:
            self.delta = delta = tf.Variable(
                (delta - audio).astype(np.float32), name='qq_delta')
        self.mask = mask = tf.Variable(np.zeros((batch_size, max_audio_len),
                                                dtype=np.float32),
                                       name='qq_mask')
        self.original = original = tf.Variable(np.zeros(
            (batch_size, max_audio_len), dtype=np.float32),
                                               name='qq_original')
        self.lengths = lengths = tf.Variable(np.zeros(batch_size,
                                                      dtype=np.int32),
                                             name='qq_lengths')
        self.target_phrase = tf.Variable(np.zeros((batch_size, phrase_length),
                                                  dtype=np.int32),
                                         name='qq_phrase')
        self.target_phrase_lengths = tf.Variable(np.zeros((batch_size),
                                                          dtype=np.int32),
                                                 name='qq_phrase_lengths')

        self.apply_delta = apply_delta = self.clipBatch(
            delta, psyTh, regularizer, psdMaxes, max_audio_len, window_size,
            step_per_window)

        self.new_input = new_input = self.apply_delta * mask + original

        # We set the new input to the model to be the above delta
        # plus a mask, which allows us to enforce that certain
        # values remain constant 0 for length padding sequences.

        # Feed this final value to get the logits.
        self.logits = logits = get_logits(new_input, lengths)

        # And finally restore the graph to make the classifier
        # actually do something interesting.
        saver = tf.train.Saver(
            [x for x in tf.global_variables() if 'qq' not in x.name])
        saver.restore(sess, "models/session_dump")

        target = ctc_label_dense_to_sparse(self.target_phrase,
                                           self.target_phrase_lengths,
                                           batch_size)

        ctcLoss = tf.nn.ctc_loss(labels=tf.cast(target, tf.int32),
                                 inputs=logits,
                                 sequence_length=lengths)
        loss = ctcLoss
        self.expanded_loss = tf.constant(0)

        self.deltaPSD = deltaPSD = tfPSD(self.new_input - self.original,
                                         window_size, step_per_window,
                                         self.psdMaxes)
        self.loss = loss
        self.psyLoss = tf.reduce_max(deltaPSD - self.psyTh, axis=[1, 2])
        self.ctcLoss = ctcLoss

        # Set up the Adam optimizer to perform gradient descent for us
        start_vars = set(x.name for x in tf.global_variables())
        optimizer = tf.train.AdamOptimizer(learning_rate)

        grad, var = optimizer.compute_gradients(self.loss, [delta])[0]
        self.train = optimizer.apply_gradients([(grad, var)])

        end_vars = tf.global_variables()
        new_vars = [x for x in end_vars if x.name not in start_vars]

        sess.run(tf.variables_initializer(new_vars + [delta]))

        # Decoder from the logits, to see how we're doing
        self.decoded, _ = tf.nn.ctc_beam_search_decoder(logits,
                                                        lengths,
                                                        merge_repeated=False,
                                                        beam_width=100)
Code Example #12
 def next_batch(self):
     uttids, source, source_lengths, target, target_lengths = self._queue.dequeue_many(
         self._data_set.batch_size)
     sparse_labels = ctc_label_dense_to_sparse(target, target_lengths,
                                               self._data_set.batch_size)
     return uttids, source, source_lengths, sparse_labels
Code Example #13
    def __init__(self, sess, loss_fn, phrase_length, max_audio_len,
                 learning_rate=10, num_iterations=5000, batch_size=1,
                 mp3=False, l2penalty=float('inf'), beam_width=100):
        """
        Set up the attack procedure.

        Here we create the TF graph that we're going to use to
        actually generate the adversarial examples.
        """
        
        self.sess = sess
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.batch_size = batch_size
        self.phrase_length = phrase_length
        self.max_audio_len = max_audio_len
        self.mp3 = mp3
        self.beam_width = beam_width

        # Create all the variables necessary
        # they are prefixed with qq_ just so that we know which
        # ones are ours so when we restore the session we don't
        # clobber them.
        bs_mal_shape = [batch_size, max_audio_len]
        bs_pl_shape = [batch_size, phrase_length]
        self.delta = delta = tf.get_variable('qq_delta',
            bs_mal_shape,
            dtype=np.float32,
            initializer=tf.zeros_initializer)
        self.mask = mask = tf.get_variable('qq_mask',
            bs_mal_shape,
            dtype=np.float32,
            initializer=tf.zeros_initializer)
        self.cwmask = cwmask = tf.get_variable('qq_cwmask',
            bs_pl_shape,
            dtype=np.float32,
            initializer=tf.zeros_initializer)
        self.original = original = tf.get_variable('qq_original',
            bs_mal_shape,
            dtype=np.float32,
            initializer=tf.zeros_initializer)
        self.lengths = tf.get_variable('qq_lengths',
            [batch_size],
            dtype=np.int32,
            initializer=tf.zeros_initializer)
        self.importance = tf.get_variable('qq_importance',
            bs_pl_shape,
            dtype=np.float32,
            initializer=tf.zeros_initializer)
        self.target_phrase = tf.get_variable('qq_phrase',
            bs_pl_shape,
            dtype=np.int32,
            initializer=tf.zeros_initializer)
        self.target_phrase_lengths = tf.get_variable('qq_phrase_lengths',
            [batch_size],
            dtype=np.int32,
            initializer=tf.zeros_initializer)
        self.rescale = tf.get_variable('qq_rescale',
            [batch_size,1],
            dtype=np.float32,
            initializer=tf.zeros_initializer)

        # Initially we bound the l_infty norm by 2000, increase this
        # constant if it's not big enough of a distortion for your dataset.
        self.apply_delta = tf.clip_by_value(delta, -2000, 2000)*self.rescale

        # We set the new input to the model to be the above delta
        # plus a mask, which allows us to enforce that certain
        # values remain constant 0 for length padding sequences.
        self.new_input = self.apply_delta*mask + original

        # We add a tiny bit of noise to help make sure that we can
        # clip our values to 16-bit integers and not break things.
        noise = tf.random_normal(self.new_input.shape, stddev=2)
        pass_in = tf.clip_by_value(self.new_input + noise, -2**15, 2**15-1)

        # Feed this final value to get the logits.
        self.logits = get_logits(pass_in, self.lengths)

        # And finally restore the graph to make the classifier
        # actually do something interesting.
        saver = tf.train.Saver([x for x in tf.global_variables() if 'qq' not in x.name])
        saver.restore(sess, "models/session_dump")

        # Choose the loss function we want -- either CTC or CW
        self.loss_fn = loss_fn
        if loss_fn == "CTC":
            target = ctc_label_dense_to_sparse(self.target_phrase, self.target_phrase_lengths, batch_size)
            
            self.ctcloss = tf.nn.ctc_loss(labels=tf.cast(target, tf.int32),
                                          inputs=self.logits, sequence_length=self.lengths)

            # Slight hack: an infinite l2 penalty means that we don't penalize l2 distortion
            # The code runs faster at a slight cost of distortion, and also leaves one less
            # parameter that requires tuning.
            if not np.isinf(l2penalty):
                self.loss = tf.reduce_mean((self.new_input - self.original)**2,axis=1) + l2penalty * self.ctcloss
            else:
                self.loss = self.ctcloss
            self.expanded_loss = tf.constant(0)
            
        elif loss_fn == "CW":
            raise NotImplementedError("The current version of this project does not include the CW loss function implementation.")
        else:
            raise ValueError("Unknown loss function: {}".format(loss_fn))

        # Set up the Adam optimizer to perform gradient descent for us
        start_vars = set(x.name for x in tf.global_variables())
        optimizer = tf.train.AdamOptimizer(self.learning_rate)

        grad,var = optimizer.compute_gradients(self.loss, [delta])[0]
        self.train = optimizer.apply_gradients([(tf.sign(grad),var)])
        
        end_vars = tf.global_variables()
        new_vars = [x for x in end_vars if x.name not in start_vars]
        
        sess.run(tf.variables_initializer(new_vars + [delta]))

        # Decoder from the logits, to see how we're doing
        self.decoded, _ = tf.nn.ctc_beam_search_decoder(self.logits, self.lengths, merge_repeated=False, beam_width=self.beam_width)
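Unlike Example #7, this variant feeds tf.sign(grad) rather than the raw gradient to Adam, i.e. a signed-gradient step of roughly fixed magnitude per coordinate. The trick in isolation (TF 1.x; the toy loss is an assumption):

    import tensorflow as tf

    x = tf.Variable(3.0)
    loss = tf.square(x)

    opt = tf.train.AdamOptimizer(0.1)
    grad, var = opt.compute_gradients(loss, [x])[0]
    train_op = opt.apply_gradients([(tf.sign(grad), var)])  # step on sign(grad), not grad

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for _ in range(5):
            sess.run(train_op)
        print(sess.run(x))  # x moves toward 0 in near-fixed-size steps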