def classify(input, psearch):
    """Transcribe one WAV file and return the decoded text.

    Args:
        input: Path handed to ``wav.read``.
        psearch: ``"greedy"`` decodes with a beam width of 1; every other
            value decodes with a beam width of 100.

    Returns:
        The tokens of the first decoded beam, mapped through ``toks`` and
        joined into one string.
    """
    beam_width = 100
    if psearch == "greedy":
        beam_width = 1

    with tf.Session() as sess:
        _rate, samples = wav.read(input)
        sample_count = len(samples)

        audio_ph = tf.placeholder(tf.float32, [1, sample_count])
        frames_ph = tf.placeholder(tf.int32, [1])

        # Logits come from the DeepSpeech graph built by get_logits.
        with tf.variable_scope("", reuse=tf.AUTO_REUSE):
            logits = get_logits(audio_ph, frames_ph)

        # restore_path is not a parameter; it is read from the enclosing
        # module scope.
        tf.train.Saver().restore(sess, restore_path)

        beams, _ = tf.nn.ctc_beam_search_decoder(
            logits,
            frames_ph,
            merge_repeated=False,
            beam_width=beam_width)

        frame_count = (sample_count - 1) // 320
        feeds = {audio_ph: [samples], frames_ph: [frame_count]}
        best = sess.run(beams, feeds)[0]
        return "".join(toks[token] for token in best.values)
def setup_graph(self, input_audio_batch, target_phrase):
    """Build a CTC-loss graph and return two callables that evaluate it.

    ``self`` is not read. The two array arguments are used only to derive
    the static shapes of the placeholders; the actual values are supplied
    later through the returned callables.

    Args:
        input_audio_batch: Array whose ``shape[0]`` is the batch size and
            whose ``shape[1]`` is the number of samples per item.
        target_phrase: Array whose ``shape[0]`` is the phrase length.

    Returns:
        ``(func1, func2)``. Both accept six positional values ``a``..``f``
        fed to the placeholders named 'a'..'f'. ``func1`` evaluates the CTC
        loss; ``func2`` evaluates ``[ctcloss, decoded]``.
    """
    n_items = input_audio_batch.shape[0]
    n_frames = (input_audio_batch.shape[1] - 1) // 320

    # Template arrays: only their shapes are consumed below.
    frames_template = np.tile(n_frames, n_items)
    labels_template = np.array(
        np.tile(target_phrase, (n_items, 1)), dtype=np.int32)
    label_len_template = np.array(
        np.tile(target_phrase.shape[0], n_items), dtype=np.int32)
    audio_template = np.clip(input_audio_batch, -2**15, 2**15 - 1)
    seq_len_template = np.tile(n_frames, n_items).astype(np.int32)

    with tf.variable_scope('', reuse=tf.AUTO_REUSE):
        inputs = tf.placeholder(
            tf.float32, shape=audio_template.shape, name='a')
        len_batch = tf.placeholder(tf.float32, name='b')
        arg2_logits = tf.placeholder(
            tf.int32, shape=frames_template.shape, name='c')
        arg1_dense = tf.placeholder(
            tf.float32, shape=labels_template.shape, name='d')
        arg2_dense = tf.placeholder(
            tf.int32, shape=label_len_template.shape, name='e')
        len_seq = tf.placeholder(
            tf.int32, shape=seq_len_template.shape, name='f')

        logits = get_logits(inputs, arg2_logits)
        sparse_target = ctc_label_dense_to_sparse(
            arg1_dense, arg2_dense, len_batch)
        ctcloss = tf.nn.ctc_loss(
            labels=tf.cast(sparse_target, tf.int32),
            inputs=logits,
            sequence_length=len_seq)
        decoded, _ = tf.nn.ctc_greedy_decoder(
            logits, arg2_logits, merge_repeated=True)

    # The session stays open on purpose: the returned callables run in it.
    sess = tf.Session()
    tf.train.Saver(tf.global_variables()).restore(sess, "models/session_dump")

    def _feeds(a, b, c, d, e, f):
        # Map the six positional values onto placeholders 'a'..'f'.
        return {
            inputs: a,
            len_batch: b,
            arg2_logits: c,
            arg1_dense: d,
            arg2_dense: e,
            len_seq: f,
        }

    def func1(a, b, c, d, e, f):
        return sess.run(ctcloss, feed_dict=_feeds(a, b, c, d, e, f))

    def func2(a, b, c, d, e, f):
        return sess.run(
            [ctcloss, decoded], feed_dict=_feeds(a, b, c, d, e, f))

    return (func1, func2)
def main():
    """Transcribe every audio file named on the command line.

    Each argument after the program name is loaded (mp3 through pydub, wav
    through ``wav.read``), run through the graph built by ``get_logits``
    and decoded with a beam width of 500. The file name and the decoded
    text are printed for each file.

    Raises:
        Exception: If a file's extension is neither ``mp3`` nor ``wav``
            (compared case-insensitively).
    """
    with tf.Session() as sess:
        weights_restored = False
        for path in sys.argv[1:]:
            extension = path.rsplit(".", 1)[-1].lower()
            if extension == 'mp3':
                raw = pydub.AudioSegment.from_mp3(path)
                # Interpret the raw bytes as little-endian signed 16-bit
                # samples, as the former per-sample struct.unpack("<h") did.
                audio = np.frombuffer(raw.raw_data, dtype="<i2")
            elif extension == 'wav':
                _, audio = wav.read(path)
            else:
                raise Exception("Unknown file format")

            new_input = tf.placeholder(tf.float32, [1, len(audio)])
            lengths = tf.placeholder(tf.int32, [1])

            with tf.variable_scope("", reuse=tf.AUTO_REUSE):
                logits = get_logits(new_input, lengths)

            # The checkpoint is loaded once, after the first graph exists.
            if not weights_restored:
                saver = tf.train.Saver()
                saver.restore(sess, "models/session_dump")
                weights_restored = True

            decoded, _ = tf.nn.ctc_beam_search_decoder(
                logits, lengths, merge_repeated=False, beam_width=500)

            length = (len(audio) - 1) // 320
            r = sess.run(decoded, {new_input: [audio], lengths: [length]})

            # NOTE(review): this tests the length of the file name, not the
            # number of arguments, so it is true for any name that carries
            # an extension. Kept as-is; confirm whether
            # `len(sys.argv) > 2` was intended.
            if len(path) > 2:
                print(path)

            print("".join(toks[x] for x in r[0].values))
def main():
    """Transcribe the audio file given by ``--in`` and return the text.

    Command-line options:
        --in: Path of the audio file (``mp3`` or ``wav``; the extension is
            compared case-insensitively).
        --restore_path: Checkpoint path handed to ``Saver.restore``.

    Returns:
        The decoded transcript, which is also printed between separator
        lines.

    Raises:
        Exception: If the file extension is neither ``mp3`` nor ``wav``.
    """
    parser = argparse.ArgumentParser(description=None)
    parser.add_argument(
        '--in',
        type=str,
        dest="input",
        required=True,
        help="Input audio .wav file(s), at 16KHz (separated by spaces)")
    parser.add_argument(
        '--restore_path',
        type=str,
        required=True,
        help="Path to the DeepSpeech checkpoint (ending in model0.4.1)")
    args = parser.parse_args()

    # Strip everything but the program name from sys.argv — presumably so
    # that later flag parsing does not see these options; TODO confirm.
    del sys.argv[1:]

    with tf.Session() as sess:
        extension = args.input.rsplit(".", 1)[-1].lower()
        if extension == 'mp3':
            raw = pydub.AudioSegment.from_mp3(args.input)
            # Interpret the raw bytes as little-endian signed 16-bit
            # samples, as the former per-sample struct.unpack("<h") did.
            audio = np.frombuffer(raw.raw_data, dtype="<i2")
        elif extension == 'wav':
            _, audio = wav.read(args.input)
        else:
            raise Exception("Unknown file format")

        new_input = tf.placeholder(tf.float32, [1, len(audio)])
        lengths = tf.placeholder(tf.int32, [1])

        with tf.variable_scope("", reuse=tf.AUTO_REUSE):
            logits = get_logits(new_input, lengths)

        saver = tf.train.Saver()
        saver.restore(sess, args.restore_path)

        decoded, _ = tf.nn.ctc_beam_search_decoder(
            logits, lengths, merge_repeated=False, beam_width=500)

        print('logits shape', logits.shape)

        length = (len(audio) - 1) // 320
        r = sess.run(decoded, {new_input: [audio], lengths: [length]})

        # Build the transcript once and reuse it for printing and returning.
        output_text = "".join(toks[x] for x in r[0].values)

        print("-" * 80)
        print("-" * 80)
        print("Classification:")
        print(output_text)
        print("-" * 80)
        print("-" * 80)

        return output_text
def __init__(self, sess, phrase_length, max_audio_len, batch_size=1,
             restore_path=None):
    """
    Build the TF graph for this object: the ``qq_``-prefixed state
    variables, the model logits, the CTC loss and a beam-search decoder.

    Args:
        sess: Session used to restore the checkpoint.
        phrase_length: Second dimension of the phrase-shaped variables.
        max_audio_len: Second dimension of the audio-shaped variables.
        batch_size: First dimension of every variable.
        restore_path: Checkpoint path handed to ``Saver.restore``.
    """
    self.sess = sess
    self.batch_size = batch_size
    self.phrase_length = phrase_length
    self.max_audio_len = max_audio_len

    def _zeros(shape, dtype, name):
        # Zero-initialised variable; the 'qq' in the name keeps it out of
        # the checkpoint restore below.
        return tf.Variable(np.zeros(shape, dtype=dtype), name=name)

    audio_shape = (batch_size, max_audio_len)
    phrase_shape = (batch_size, phrase_length)

    # (the 'qq_delta' variable is disabled in this variant)
    self.mask = _zeros(audio_shape, np.float32, 'qq_mask')
    self.cwmask = _zeros(phrase_shape, np.float32, 'qq_cwmask')
    self.original = _zeros(audio_shape, np.float32, 'qq_original')
    self.lengths = _zeros(batch_size, np.int32, 'qq_lengths')
    self.target_phrase = _zeros(phrase_shape, np.int32, 'qq_phrase')
    self.target_phrase_lengths = _zeros(
        batch_size, np.int32, 'qq_phrase_lengths')

    # The model input is the mask added to the original audio.
    self.new_input = self.mask + self.original

    # A little Gaussian noise is added before clipping to the 16-bit
    # integer range.
    jitter = tf.random_normal(self.new_input.shape, stddev=2)
    clipped = tf.clip_by_value(
        self.new_input + jitter, -2 ** 15, 2 ** 15 - 1)

    self.logits = get_logits(clipped, self.lengths)

    # Restore every variable except our own 'qq' ones.
    model_vars = [v for v in tf.global_variables() if 'qq' not in v.name]
    tf.train.Saver(model_vars).restore(sess, restore_path)

    sparse_target = ctc_label_dense_to_sparse(
        self.target_phrase, self.target_phrase_lengths)
    ctcloss = tf.nn.ctc_loss(
        labels=tf.cast(sparse_target, tf.int32),
        inputs=self.logits,
        sequence_length=self.lengths)
    self.expanded_loss = tf.constant(0)
    self.ctcloss = ctcloss

    # Decoder over the logits, used to inspect the current transcription.
    self.decoded, _ = tf.nn.ctc_beam_search_decoder(
        self.logits, self.lengths, merge_repeated=False, beam_width=100)
def main():
    """Transcribe each input file and write the text next to it.

    For every path in ``input_files`` the default graph is reset, the
    audio is loaded (mp3 through pydub, wav through ``wav.read``), decoded
    with a beam width of 500, printed, and written to
    ``<path without extension>_041_prediction``.

    Raises:
        Exception: If a file's extension is neither ``.mp3`` nor ``.wav``
            (compared case-insensitively).
    """
    import os

    parser = argparse.ArgumentParser(description=None)
    parser.add_argument(
        'input_files',
        type=str,
        nargs='+',
        help="Input audio .wav file(s), at 16KHz (separated by spaces)")
    args = parser.parse_args()

    restore_path = 'deepspeech-0.4.1-checkpoint/model.v0.4.1'

    for input_file in args.input_files:
        tf.reset_default_graph()
        with tf.Session() as sess:
            # splitext only strips the final extension, so dots elsewhere
            # in the path (e.g. "./clips/a.wav") no longer truncate the
            # output path the way split('.')[0] did.
            stem, extension = os.path.splitext(input_file)
            extension = extension.lower()

            if extension == '.mp3':
                raw = pydub.AudioSegment.from_mp3(input_file)
                # Interpret the raw bytes as little-endian signed 16-bit
                # samples, as the former per-sample struct.unpack("<h") did.
                audio = np.frombuffer(raw.raw_data, dtype="<i2")
            elif extension == '.wav':
                _, audio = wav.read(input_file)
            else:
                raise Exception("Unknown file format")

            prediction_output_path = stem + '_041_prediction'

            new_input = tf.placeholder(tf.float32, [1, len(audio)])
            lengths = tf.placeholder(tf.int32, [1])

            with tf.variable_scope("", reuse=tf.AUTO_REUSE):
                logits = get_logits(new_input, lengths)

            saver = tf.train.Saver()
            saver.restore(sess, restore_path)

            decoded, _ = tf.nn.ctc_beam_search_decoder(
                logits, lengths, merge_repeated=False, beam_width=500)

            print('logits shape', logits.shape)

            length = (len(audio) - 1) // 320
            r = sess.run(decoded, {new_input: [audio], lengths: [length]})
            prediction = "".join(toks[x] for x in r[0].values)

            print("-" * 80)
            print("-" * 80)
            print("Classification:")
            print(prediction)
            print("-" * 80)
            print("-" * 80)

            with open(prediction_output_path, 'w') as f:
                f.write(prediction)
def getAudioPrediction(sess, audio):
    """Decode ``audio`` with the model and return the text.

    New placeholders and a new decoder are added to the default graph on
    every call. The first call also runs ``init(sess)`` and sets the
    module-level flag ``modelInitDone`` so later calls skip it.

    Args:
        sess: Session in which the graph is evaluated.
        audio: Sequence of samples; its length fixes the placeholder shape.

    Returns:
        The tokens of the first decoded beam, mapped through ``toks`` and
        joined into one string.
    """
    global modelInitDone

    new_input = tf.placeholder(tf.float32, [1, len(audio)])
    lengths = tf.placeholder(tf.int32, [1])

    with tf.variable_scope("", reuse=tf.AUTO_REUSE):
        logits = get_logits(new_input, lengths)

    if not modelInitDone:
        init(sess)
        modelInitDone = True

    # The log-probabilities were previously fetched with a second, full
    # sess.run whose result was never used; only the beams are evaluated.
    decoded, _ = tf.nn.ctc_beam_search_decoder(
        logits, lengths, merge_repeated=False, beam_width=500)

    length = (len(audio) - 1) // 320
    r = sess.run(decoded, {new_input: [audio], lengths: [length]})

    return "".join(toks[x] for x in r[0].values)
def __init__(self,
             sess,
             loss_fn,
             phrase_length,
             max_audio_len,
             learning_rate=10,
             num_iterations=5000,
             batch_size=1,
             max_offset=320,
             mp3=False,
             l2penalty=float('inf'),
             restore_path=None,
             adversarial_signal_limit=2000.0):
    """
    Set up the attack procedure.

    Here we create the TF graph that we're going to use
    to actually generate the adversarial examples.

    Args:
        sess: Session used to restore the checkpoint and initialise the
            optimizer variables.
        loss_fn: ``"CTC"`` is the only implemented loss.
        phrase_length: Second dimension of the phrase-shaped variables.
        max_audio_len: Second dimension of the audio-shaped variables.
        learning_rate: Stored on the instance. The optimizer itself reads
            ``self.learning_rate_tensor``.
        num_iterations: Stored on the instance.
        batch_size: First dimension of every variable.
        max_offset: Stored on the instance.
        mp3: Stored on the instance.
        l2penalty: When finite, the loss is the mean squared distortion
            plus ``l2penalty * ctcloss``; when infinite, the loss is the
            CTC loss alone.
        restore_path: Checkpoint path handed to ``Saver.restore``.
        adversarial_signal_limit: Symmetric clipping bound applied to
            ``delta`` before rescaling.

    Raises:
        NotImplementedError: If ``loss_fn`` is ``"CW"``.
        ValueError: If ``loss_fn`` is any other unknown value.
    """
    self.sess = sess
    self.learning_rate = learning_rate
    self.num_iterations = num_iterations
    self.batch_size = batch_size
    self.phrase_length = phrase_length
    self.max_audio_len = max_audio_len
    self.mp3 = mp3
    self.max_offset = max_offset
    self.adversarial_signal_limit = adversarial_signal_limit

    def _zeros(shape, dtype, name):
        # Zero-initialised variable. All of them are prefixed with qq_ so
        # that restoring the checkpoint does not clobber them.
        return tf.Variable(np.zeros(shape, dtype=dtype), name=name)

    audio_shape = (batch_size, max_audio_len)
    phrase_shape = (batch_size, phrase_length)

    self.delta = delta = _zeros(audio_shape, np.float32, 'qq_delta')
    self.mask = mask = _zeros(audio_shape, np.float32, 'qq_mask')
    self.cwmask = _zeros(phrase_shape, np.float32, 'qq_cwmask')
    self.original = original = _zeros(audio_shape, np.float32, 'qq_original')
    self.lengths = lengths = _zeros(batch_size, np.int32, 'qq_lengths')
    self.importance = _zeros(phrase_shape, np.float32, 'qq_importance')
    self.target_phrase = _zeros(phrase_shape, np.int32, 'qq_phrase')
    self.target_phrase_lengths = _zeros(
        batch_size, np.int32, 'qq_phrase_lengths')
    # Previously this reused the name 'qq_phrase_lengths', which TF then
    # silently renamed; it now has its own name.
    self.rescale = _zeros((batch_size, 1), np.float32, 'qq_rescale')
    self.learning_rate_tensor = tf.Variable(
        np.ones((1), dtype=np.float32), name='qq_learning_rate_tensor')

    # Bound the l_infty norm of the perturbation by
    # adversarial_signal_limit, then scale it by the rescale variable.
    self.apply_delta = tf.clip_by_value(
        delta, -adversarial_signal_limit,
        adversarial_signal_limit) * self.rescale

    # The new model input is the perturbation times a mask, plus the
    # original audio. The mask lets certain positions stay untouched
    # (length padding).
    self.new_input = new_input = self.apply_delta * mask + original

    # A tiny bit of noise is added before clipping the values to the
    # 16-bit integer range.
    noise = tf.random_normal(new_input.shape, stddev=2)
    pass_in = tf.clip_by_value(new_input + noise, -2**15, 2**15 - 1)

    # Feed this final value to get the logits.
    self.logits = logits = get_logits(pass_in, lengths)

    # Restore the classifier weights (everything that is not ours).
    saver = tf.train.Saver(
        [x for x in tf.global_variables() if 'qq' not in x.name])
    saver.restore(sess, restore_path)

    # Choose the loss function we want -- either CTC or CW
    self.loss_fn = loss_fn
    if loss_fn == "CTC":
        target = ctc_label_dense_to_sparse(self.target_phrase,
                                           self.target_phrase_lengths)
        ctcloss = tf.nn.ctc_loss(labels=tf.cast(target, tf.int32),
                                 inputs=logits,
                                 sequence_length=lengths)

        # Slight hack: an infinite l2 penalty means that we don't
        # penalize l2 distortion at all.
        if not np.isinf(l2penalty):
            loss = tf.reduce_mean(
                (self.new_input - self.original)**2,
                axis=1) + l2penalty * ctcloss
        else:
            loss = ctcloss
        self.expanded_loss = tf.constant(0)
    elif loss_fn == "CW":
        # NotImplemented is a constant, not an exception class; calling it
        # raised an unrelated TypeError.
        raise NotImplementedError(
            "The current version of this project does not include the CW loss function implementation."
        )
    else:
        # A bare `raise` here had no active exception to re-raise.
        raise ValueError("Unknown loss function: %r" % (loss_fn,))

    self.loss = loss
    self.ctcloss = ctcloss

    # Set up the Adam optimizer to perform gradient descent for us.
    # Variables created from here on are the optimizer's own and are
    # initialised explicitly below.
    start_vars = set(x.name for x in tf.global_variables())
    tf.summary.scalar('Learning Rate', self.learning_rate_tensor[0])
    optimizer = tf.train.AdamOptimizer(self.learning_rate_tensor[0])
    self.optimizer = optimizer

    grad, var = optimizer.compute_gradients(self.loss, [delta])[0]
    # Only the sign of the gradient is applied.
    self.grad_sign = grad_sign = tf.sign(grad)
    self.train = optimizer.apply_gradients([(grad_sign, var)])

    end_vars = tf.global_variables()
    new_vars = [x for x in end_vars if x.name not in start_vars]

    sess.run(tf.variables_initializer(new_vars + [delta]))

    # Decoder from the logits, to see how we're doing
    self.decoded, _ = tf.nn.ctc_beam_search_decoder(logits,
                                                    lengths,
                                                    merge_repeated=False,
                                                    beam_width=100)

    self.merged = tf.summary.merge_all()
def __init__(self,
             sess,
             loss_fn,
             phrase_length,
             max_audio_len,
             learning_rate=10,
             num_iterations=1000,
             batch_size=1):
    """
    Set up the attack procedure.

    Here we create the TF graph that we're going to use
    to actually generate the adversarial examples.

    Args:
        sess: Session used to restore the checkpoint and initialise the
            optimizer variables.
        loss_fn: ``"CTC"`` is the only implemented loss.
        phrase_length: Second dimension of the phrase-shaped variables.
        max_audio_len: Second dimension of the audio-shaped variables.
        learning_rate: Learning rate handed to the Adam optimizer.
        num_iterations: Stored on the instance.
        batch_size: First dimension of every variable.

    Raises:
        NotImplementedError: If ``loss_fn`` is ``"CW"``.
        ValueError: If ``loss_fn`` is any other unknown value.
    """
    self.sess = sess
    self.learning_rate = learning_rate
    self.num_iterations = num_iterations
    self.batch_size = batch_size
    self.phrase_length = phrase_length
    self.max_audio_len = max_audio_len

    def _zeros(shape, dtype, name):
        # Zero-initialised variable. All of them are prefixed with qq_ so
        # that restoring the checkpoint does not clobber them.
        return tf.Variable(np.zeros(shape, dtype=dtype), name=name)

    audio_shape = (batch_size, max_audio_len)
    phrase_shape = (batch_size, phrase_length)

    self.delta = delta = _zeros(audio_shape, np.float32, 'qq_delta')
    self.mask = mask = _zeros(audio_shape, np.float32, 'qq_mask')
    self.cwmask = _zeros(phrase_shape, np.float32, 'qq_cwmask')
    self.original = original = _zeros(audio_shape, np.float32, 'qq_original')
    self.lengths = lengths = _zeros(batch_size, np.int32, 'qq_lengths')
    self.importance = _zeros(phrase_shape, np.float32, 'qq_importance')
    self.target_phrase = _zeros(phrase_shape, np.int32, 'qq_phrase')
    self.target_phrase_lengths = _zeros(
        batch_size, np.int32, 'qq_phrase_lengths')
    # Previously this reused the name 'qq_phrase_lengths', which TF then
    # silently renamed; it now has its own name.
    self.rescale = _zeros((batch_size, 1), np.float32, 'qq_rescale')

    # Initially we bound the l_infty norm by 2000, increase this
    # constant if it's not big enough of a distortion for your dataset.
    self.apply_delta = tf.clip_by_value(delta, -2000, 2000) * self.rescale

    # The new model input is the perturbation times a mask, plus the
    # original audio. The mask lets certain positions stay untouched
    # (length padding).
    self.new_input = new_input = self.apply_delta * mask + original

    # A tiny bit of noise is added before clipping the values to the
    # 16-bit integer range.
    noise = tf.random_normal(new_input.shape, stddev=2)
    pass_in = tf.clip_by_value(new_input + noise, -2**15, 2**15 - 1)

    # Feed this final value to get the logits.
    self.logits = logits = get_logits(pass_in, lengths)

    # Restore the classifier weights (everything that is not ours).
    saver = tf.train.Saver(
        [x for x in tf.global_variables() if 'qq' not in x.name])
    saver.restore(sess, "models/session_dump")

    # Choose the loss function we want -- either CTC or CW
    self.loss_fn = loss_fn
    if loss_fn == "CTC":
        target = ctc_label_dense_to_sparse(self.target_phrase,
                                           self.target_phrase_lengths,
                                           batch_size)
        ctcloss = tf.nn.ctc_loss(labels=tf.cast(target, tf.int32),
                                 inputs=logits,
                                 sequence_length=lengths)
        loss = tf.nn.relu(ctcloss)
        self.expanded_loss = tf.constant(0)
    elif loss_fn == "CW":
        # NotImplemented is a constant, not an exception class; calling it
        # raised an unrelated TypeError.
        raise NotImplementedError(
            "The current version of this project does not include the CW loss function implementation."
        )
    else:
        # A bare `raise` here had no active exception to re-raise.
        raise ValueError("Unknown loss function: %r" % (loss_fn,))

    # Set up the Adam optimizer to perform gradient descent for us.
    # Names are collected in a set so the "new variable" test below is a
    # constant-time lookup instead of a rebuilt list per variable.
    start_names = set(x.name for x in tf.global_variables())
    self.train = tf.train.AdamOptimizer(learning_rate).minimize(
        loss, var_list=[delta])
    self.loss = loss
    self.ctcloss = ctcloss

    new_vars = [
        x for x in tf.global_variables() if x.name not in start_names
    ]

    sess.run(tf.variables_initializer(new_vars + [delta]))

    # Decoder from the logits, to see how we're doing
    self.decoded, _ = tf.nn.ctc_beam_search_decoder(logits,
                                                    lengths,
                                                    merge_repeated=False,
                                                    beam_width=1000)
def __init__(self,
             sess,
             loss_fn,
             phrase_length,
             max_audio_len,
             learning_rate=10,
             num_iterations=5000,
             batch_size=1,
             mp3=False,
             l2penalty=float('inf'),
             restore_path=None,
             th=None,
             psd_max_ori=None):
    """
    Set up the attack procedure.

    Here we create the TF graph that we're going to use
    to actually generate the adversarial examples.

    Args:
        sess: Session used to restore the checkpoint and initialise the
            optimizer variables.
        loss_fn: ``"CTC"`` is the only implemented loss.
        phrase_length: Second dimension of the phrase-shaped variables.
        max_audio_len: Second dimension of the audio-shaped variables.
        learning_rate: Learning rate of the first Adam optimizer.
        num_iterations: Stored on the instance.
        batch_size: First dimension of every variable and placeholder.
        mp3: Stored on the instance.
        l2penalty: When finite, the loss is the mean squared distortion
            plus ``l2penalty * ctcloss``; when infinite, the loss is the
            CTC loss alone.
        restore_path: Checkpoint path handed to ``Saver.restore``.
        th: Not read by this constructor; ``self.th`` is a placeholder.
        psd_max_ori: Not read by this constructor; ``self.psd_max_ori``
            is a placeholder.

    Raises:
        NotImplementedError: If ``loss_fn`` is ``"CW"``.
        ValueError: If ``loss_fn`` is any other unknown value.
    """
    self.sess = sess
    self.learning_rate = learning_rate
    self.num_iterations = num_iterations
    self.batch_size = batch_size
    self.phrase_length = phrase_length
    self.max_audio_len = max_audio_len
    self.mp3 = mp3

    def _zeros(shape, dtype, name):
        # Zero-initialised variable. All of them are prefixed with qq_ so
        # that restoring the checkpoint does not clobber them.
        return tf.Variable(np.zeros(shape, dtype=dtype), name=name)

    audio_shape = (batch_size, max_audio_len)
    phrase_shape = (batch_size, phrase_length)

    self.delta = delta = _zeros(audio_shape, np.float32, 'qq_delta')
    self.mask = mask = _zeros(audio_shape, np.float32, 'qq_mask')
    self.cwmask = _zeros(phrase_shape, np.float32, 'qq_cwmask')
    self.original = original = _zeros(audio_shape, np.float32, 'qq_original')
    self.lengths = lengths = _zeros(batch_size, np.int32, 'qq_lengths')
    self.importance = _zeros(phrase_shape, np.float32, 'qq_importance')
    self.target_phrase = _zeros(phrase_shape, np.int32, 'qq_phrase')
    self.target_phrase_lengths = _zeros(
        batch_size, np.int32, 'qq_phrase_lengths')
    self.alpha = tf.Variable(
        np.ones((batch_size), dtype=np.float32) * 0.05, name='qq_alpha')
    # Previously this reused the name 'qq_phrase_lengths', which TF then
    # silently renamed; it now has its own name.
    self.rescale = _zeros((batch_size, 1), np.float32, 'qq_rescale')

    self.th = tf.placeholder(tf.float32,
                             shape=[batch_size, None, None],
                             name='qq_th')
    self.psd_max_ori = tf.placeholder(tf.float32,
                                      shape=[batch_size],
                                      name='qq_psd')
    self.input_tf = tf.placeholder(tf.float32,
                                   shape=[batch_size, None],
                                   name='qq_input')
    self.tgt_tf = tf.placeholder(tf.string)
    self.sample_rate_tf = tf.placeholder(tf.int32, name='qq_sample_rate')
    self.mask_freq = tf.placeholder(dtype=np.float32,
                                    shape=[batch_size, None, 80])

    # Initially we bound the l_infty norm by 2000, increase this
    # constant if it's not big enough of a distortion for your dataset.
    self.apply_delta = tf.clip_by_value(delta, -2000, 2000) * self.rescale

    # Masking-threshold loss: for each batch item, transform the
    # perturbation and average the part that exceeds the threshold
    # (the relu zeroes everything at or below it).
    self.loss_th_list = []
    self.transform = Transform(2048)
    for i in range(self.batch_size):
        logits_delta = self.transform(self.apply_delta[i, :],
                                      self.psd_max_ori[i])
        loss_th = tf.reduce_mean(tf.nn.relu(logits_delta - self.th[i]))
        # Give the scalar a leading axis so the per-item losses can be
        # concatenated. `axis` replaces the deprecated `dim` keyword.
        loss_th = tf.expand_dims(loss_th, axis=0)
        self.loss_th_list.append(loss_th)
    self.loss_th = tf.concat(self.loss_th_list, axis=0)

    # The new model input is the perturbation times a mask, plus the
    # original audio. The mask lets certain positions stay untouched
    # (length padding).
    self.new_input = new_input = self.apply_delta * mask + original

    # A tiny bit of noise is added before clipping the values to the
    # 16-bit integer range.
    noise = tf.random_normal(new_input.shape, stddev=2)
    pass_in = tf.clip_by_value(new_input + noise, -2**15, 2**15 - 1)

    # Feed this final value to get the logits.
    self.logits = logits = get_logits(pass_in, lengths)

    # Restore the classifier weights (everything that is not ours).
    saver = tf.train.Saver(
        [x for x in tf.global_variables() if 'qq' not in x.name])
    saver.restore(sess, restore_path)

    # Choose the loss function we want -- either CTC or CW
    self.loss_fn = loss_fn
    if loss_fn == "CTC":
        target = ctc_label_dense_to_sparse(self.target_phrase,
                                           self.target_phrase_lengths)
        ctcloss = tf.nn.ctc_loss(labels=tf.cast(target, tf.int32),
                                 inputs=logits,
                                 sequence_length=lengths)

        # Slight hack: an infinite l2 penalty means that we don't
        # penalize l2 distortion at all.
        if not np.isinf(l2penalty):
            loss = tf.reduce_mean(
                (self.new_input - self.original)**2,
                axis=1) + l2penalty * ctcloss
        else:
            loss = ctcloss
        self.expanded_loss = tf.constant(0)
    elif loss_fn == "CW":
        # NotImplemented is a constant, not an exception class; calling it
        # raised an unrelated TypeError.
        raise NotImplementedError(
            "The current version of this project does not include the CW loss function implementation."
        )
    else:
        # A bare `raise` here had no active exception to re-raise.
        raise ValueError("Unknown loss function: %r" % (loss_fn,))

    self.loss = loss
    self.ctcloss = ctcloss

    # Set up the Adam optimizers to perform gradient descent for us.
    start_vars = set(x.name for x in tf.global_variables())
    optimizer = tf.train.AdamOptimizer(learning_rate)
    optimizer2 = tf.train.AdamOptimizer(1)

    grad, var = optimizer.compute_gradients(self.loss, [delta])[0]
    grad21, var21 = optimizer2.compute_gradients(self.loss, [delta])[0]
    grad22, var22 = optimizer2.compute_gradients(
        self.alpha * self.loss_th, [delta])[0]

    # First stage: apply only the sign of the loss gradient.
    self.train = optimizer.apply_gradients([(tf.sign(grad), var)])
    # Second stage: one step on the loss and one on the weighted
    # masking-threshold loss, grouped into a single op.
    self.train21 = optimizer2.apply_gradients([(grad21, var21)])
    self.train22 = optimizer2.apply_gradients([(grad22, var22)])
    self.train2 = tf.group(self.train21, self.train22)

    end_vars = tf.global_variables()
    # Variables that did not exist before the optimizers were created.
    new_vars = [x for x in end_vars if x.name not in start_vars]

    sess.run(tf.variables_initializer(new_vars + [delta]))

    # Decoder from the logits, to see how we're doing
    self.decoded, _ = tf.nn.ctc_beam_search_decoder(logits,
                                                    lengths,
                                                    merge_repeated=False,
                                                    beam_width=100)
class Attack:
    def __init__(self, sess, loss_fn, phrase_length, max_audio_len, psdMaxes,
                 learning_rate=10, num_iterations=5000, window_size=2048,
                 step_per_window=4, batch_size=1, mp3=False, onlyCTC=True,
                 audio=None, psdShape=None):
        """
        Set up the attack procedure.

        Here we create the TF graph that we're going to use
        to actually generate the adversarial examples.

        Args:
            sess: Session used to restore the checkpoint.
            loss_fn: One of ``'CTC'``, ``'CTCPSYCLIP'`` or ``'CTCPSYGRAD'``;
                selects how the model input is built from ``delta``.
            phrase_length: Second dimension of the target-phrase variable.
            max_audio_len: Second dimension of the audio-shaped variables.
            psdMaxes: Stored on the instance and passed to ``clipBatch``.
            learning_rate: Stored on the instance.
            num_iterations: Stored on the instance.
            window_size: Stored on the instance and passed to ``clipBatch``.
            step_per_window: Stored on the instance and passed to
                ``clipBatch``.
            batch_size: First dimension of every variable.
            mp3: Stored on the instance.
            onlyCTC: When false, the CTC loss is combined with the mean
                squared distortion divided by the regularizer variable.
            audio: Not read in this constructor.
            psdShape: Two-element shape of the ``qq_psyTh`` variable per
                batch item.

        Raises:
            ValueError: If ``loss_fn`` is not one of the three known values.
        """
        self.sess = sess
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.batch_size = batch_size
        self.phrase_length = phrase_length
        self.max_audio_len = max_audio_len
        self.mp3 = mp3
        self.psdMaxes = psdMaxes
        self.window_size = window_size
        self.step_per_window = step_per_window

        # NOTE(review): none of these derived values is read in the
        # visible part of this constructor; they are kept because the
        # definition appears to be cut off.
        frame_length = int(window_size)
        frame_step = int(window_size//step_per_window)
        fft_length = int(2**np.ceil(np.log2(frame_length)))
        sample_rate = 16000
        freq_res = sample_rate/window_size
        time_res = frame_step/(sample_rate/1000)
        sigma_time = 96. / time_res
        sigma_freq = 15.625 / freq_res

        # Create all the variables necessary; they are prefixed with qq_
        # so that restoring the checkpoint does not clobber them.
        self.regularizer = regularizer = tf.Variable(
            np.zeros((batch_size), dtype=np.float32), name='qq_regularizer')
        self.psyTh = psyTh = tf.Variable(
            np.zeros((batch_size, psdShape[0], psdShape[1]),
                     dtype=np.float32),
            name='qq_psyTh')
        # A stray duplicated "name='qq_delta')" fragment followed this
        # statement and made the class unparseable; it has been removed.
        self.delta = delta = tf.Variable(
            np.zeros((batch_size, max_audio_len)).astype(np.float32)/2,
            name='qq_delta')
        self.mask = mask = tf.Variable(
            np.zeros((batch_size, max_audio_len), dtype=np.float32),
            name='qq_mask')
        self.original = original = tf.Variable(
            np.zeros((batch_size, max_audio_len), dtype=np.float32),
            name='qq_original')
        self.lengths = lengths = tf.Variable(
            np.zeros(batch_size, dtype=np.int32), name='qq_lengths')
        self.target_phrase = tf.Variable(
            np.zeros((batch_size, phrase_length), dtype=np.int32),
            name='qq_phrase')
        self.target_phrase_lengths = tf.Variable(
            np.zeros((batch_size), dtype=np.int32), name='qq_phrase_lengths')
        self.rescale = tf.Variable(
            np.zeros((batch_size, 1), dtype=np.float32), name='qq_rescale')

        # Build the model input for the chosen loss. The mask lets certain
        # positions stay untouched (length padding).
        if loss_fn == 'CTC':
            # Bound the l_infty norm by 2000, then rescale.
            self.apply_delta = tf.clip_by_value(
                delta, -2000, 2000)*self.rescale
            self.new_input = new_input = self.apply_delta*mask + original
        elif loss_fn == 'CTCPSYCLIP':
            self.apply_delta = self.clipBatch(
                delta, psyTh, regularizer, psdMaxes, max_audio_len,
                window_size, step_per_window)
            self.new_input = new_input = self.apply_delta*mask + original
        elif loss_fn == 'CTCPSYGRAD':
            # The raw perturbation is used without clipping.
            self.new_input = new_input = self.delta*mask + original
        else:
            # Previously an unknown value only failed further down, on an
            # undefined name.
            raise ValueError("Unknown loss function: %r" % (loss_fn,))

        if loss_fn == 'CTC':
            # NOTE(review): the noisy, clipped input is built but the
            # logits below are computed from new_input, not pass_in.
            noise = tf.random_normal(new_input.shape, stddev=2)
            pass_in = tf.clip_by_value(new_input+noise, -2**15, 2**15-1)

        # Feed this final value to get the logits.
        self.logits = logits = get_logits(new_input, lengths)

        # Restore the classifier weights (everything that is not ours).
        saver = tf.train.Saver(
            [x for x in tf.global_variables() if 'qq' not in x.name])
        saver.restore(sess, "models/session_dump")

        self.loss_fn = loss_fn
        if loss_fn == "CTC":
            target = ctc_label_dense_to_sparse(
                self.target_phrase, self.target_phrase_lengths, batch_size)
            ctcLoss = tf.nn.ctc_loss(labels=tf.cast(target, tf.int32),
                                     inputs=logits,
                                     sequence_length=lengths)

            # With onlyCTC the distortion term is skipped entirely.
            if not onlyCTC:
                loss = tf.reduce_mean(
                    (self.new_input-self.original)**2,
                    axis=1)/regularizer + ctcLoss
            else:
                loss = ctcLoss
            self.expanded_loss = tf.constant(0)
def __init__(self,
             sess,
             loss_fn,
             phrase_length,
             maxlen,
             learn_rate=10,
             iterations_num=5000,
             mem_size=1,
             mp3=False,
             foreit=float('inf'),
             restore_path=None):
    """
    Set up the modify procedure.

    Here we create the TF graph that is used to generate the audio file.

    Args:
        sess: Session used to restore the checkpoint and initialise the
            optimizer variables.
        loss_fn: ``"CTC"`` is the only implemented loss.
        phrase_length: Second dimension of the phrase-shaped variables.
        maxlen: Second dimension of the audio-shaped variables.
        learn_rate: Learning rate handed to the Adam optimizer.
        iterations_num: Stored on the instance.
        mem_size: First dimension of every variable.
        mp3: Stored on the instance.
        foreit: When finite, the loss is the mean squared distortion plus
            ``foreit * ctcloss``; when infinite, the loss is the CTC loss
            alone.
        restore_path: Checkpoint path handed to ``Saver.restore``.

    Raises:
        NotImplementedError: If ``loss_fn`` is ``"CW"``.
        ValueError: If ``loss_fn`` is any other unknown value.
    """
    self.sess = sess
    self.learn_rate = learn_rate
    self.iterations_num = iterations_num
    self.mem_size = mem_size
    self.phrase_length = phrase_length
    self.maxlen = maxlen
    self.mp3 = mp3

    def _zeros(shape, dtype, name):
        # Zero-initialised variable. All of them carry the qq prefix so
        # they can be told apart from the model's own variables.
        return tf.Variable(np.zeros(shape, dtype=dtype), name=name)

    audio_shape = (mem_size, maxlen)
    phrase_shape = (mem_size, phrase_length)

    self.delta = delta = _zeros(audio_shape, np.float32, 'qq_delta')
    self.mask = mask = _zeros(audio_shape, np.float32, 'qq_mask')
    self.maskcw = _zeros(phrase_shape, np.float32, 'qq_maskcw')
    self.oring = oring = _zeros(audio_shape, np.float32, 'qq_oring')
    self.length = length = _zeros(mem_size, np.int32, 'qq_length')
    self.importance = _zeros(phrase_shape, np.float32, 'qq_importance')
    self.target_phrase = _zeros(phrase_shape, np.int32, 'qq_phrase')
    self.target_phrase_length = _zeros(
        mem_size, np.int32, 'qq_phrase_length')
    # Previously this reused the name 'qq_phrase_length', which TF then
    # silently renamed; it now has its own name.
    self.rescale = _zeros((mem_size, 1), np.float32, 'qq_rescale')

    # Initially the l_infty norm is bound by 2000; increase the constant
    # if it is not a large enough distortion for the dataset.
    self.apply_delta = tf.clip_by_value(delta, -2000, 2000) * self.rescale

    # The new model input is the perturbation times a mask, plus the
    # original audio. The mask lets certain positions stay constant
    # (length padding).
    self.new_input = new_input = self.apply_delta * mask + oring

    # Add noise before clipping the values to the 16-bit integer range.
    noise = tf.random_normal(new_input.shape, stddev=2)
    pass_in = tf.clip_by_value(new_input + noise, -2**15, 2**15 - 1)

    # Feed the final value to get the logits.
    self.logits = logits = get_logits(pass_in, length)

    # Restore the graph so that the classifier is usable.
    saver = tf.train.Saver(
        [x for x in tf.global_variables() if 'qq' not in x.name])
    saver.restore(sess, restore_path)

    # Choose the loss function -- CTC or CW. Only CTC is supported.
    self.loss_fn = loss_fn
    if loss_fn == "CTC":
        target = ctc_label_dense_to_sparse(self.target_phrase,
                                           self.target_phrase_length)
        ctcloss = tf.nn.ctc_loss(labels=tf.cast(target, tf.int32),
                                 inputs=logits,
                                 sequence_length=length)

        # An infinite l2 penalty means the l2 distortion is not penalized
        # at all, which leaves one less parameter to tune.
        if not np.isinf(foreit):
            loss = tf.reduce_mean(
                (self.new_input - self.oring)**2,
                axis=1) + foreit * ctcloss
        else:
            loss = ctcloss
        self.expanded_loss = tf.constant(0)
    elif loss_fn == "CW":
        # NotImplemented is a constant, not an exception class; calling it
        # raised an unrelated TypeError.
        raise NotImplementedError(
            "Сurrent version does not support implementation CW.")
    else:
        # A bare `raise` here had no active exception to re-raise.
        raise ValueError("Unknown loss function: %r" % (loss_fn,))

    self.loss = loss
    self.ctcloss = ctcloss

    # Set up the Adam optimizer to perform gradient descent. Variables
    # created from here on are the optimizer's own and are initialised
    # explicitly below.
    start_vars = set(x.name for x in tf.global_variables())
    optimizer = tf.train.AdamOptimizer(learn_rate)

    grad, var = optimizer.compute_gradients(self.loss, [delta])[0]
    # Only the sign of the gradient is applied.
    self.train = optimizer.apply_gradients([(tf.sign(grad), var)])

    end_vars = tf.global_variables()
    new_vars = [x for x in end_vars if x.name not in start_vars]

    sess.run(tf.variables_initializer(new_vars + [delta]))

    # Decoder over the logits, used to check how the run is going.
    self.decoded, _ = tf.nn.ctc_beam_search_decoder(logits,
                                                    length,
                                                    merge_repeated=False,
                                                    beam_width=100)
def __init__(self,
             sess,
             phrase_length,
             max_audio_len,
             psdMaxes,
             learning_rate=10,
             num_iterations=5000,
             window_size=256,
             step_per_window=2,
             batch_size=1,
             mp3=False,
             delta=None,
             audio=None,
             psdShape=None):
    """
    Build the attack graph: state variables, clipped perturbation, model
    logits, CTC loss, a PSD-based loss term, an Adam step and a decoder.

    Args:
        sess: Session used to restore the checkpoint and initialise the
            optimizer variables.
        phrase_length: Second dimension of the target-phrase variable.
        max_audio_len: Second dimension of the audio-shaped variables.
        psdMaxes: Stored on the instance; passed to ``clipBatch`` and
            ``tfPSD``.
        learning_rate: Learning rate handed to the Adam optimizer.
        num_iterations: Stored on the instance.
        window_size: Passed to ``clipBatch`` and ``tfPSD``.
        step_per_window: Passed to ``clipBatch`` and ``tfPSD``.
        batch_size: First dimension of every variable.
        mp3: Stored on the instance.
        delta: Optional starting value; when given, the perturbation
            variable starts at ``delta - audio``, otherwise at zeros.
        audio: Subtracted from ``delta`` when ``delta`` is given.
        psdShape: Two-element shape of the ``qq_psyTh`` variable per batch
            item.
    """
    self.sess = sess
    self.learning_rate = learning_rate
    self.num_iterations = num_iterations
    self.batch_size = batch_size
    self.phrase_length = phrase_length
    self.max_audio_len = max_audio_len
    self.mp3 = mp3
    self.psdMaxes = psdMaxes
    self.window_size = window_size
    self.step_per_window = step_per_window

    # Derived framing quantities (not read further down in this method).
    frame_length = int(window_size)
    frame_step = int(window_size // step_per_window)
    fft_length = int(2**np.ceil(np.log2(frame_length)))
    sample_rate = 16000  # datapoints per second
    # Frequency step between bins; window_size/2+1 frequencies in total.
    freq_res = sample_rate / window_size
    # Milliseconds covered by one frame step.
    time_res = frame_step / (sample_rate / 1000)

    def _zeros(shape, dtype, name):
        # Zero-initialised variable; the qq_ prefix keeps it out of the
        # checkpoint restore below.
        return tf.Variable(np.zeros(shape, dtype=dtype), name=name)

    audio_shape = (batch_size, max_audio_len)

    self.regularizer = regularizer = _zeros(
        batch_size, np.float32, 'qq_regularizer')
    self.psyTh = psyTh = _zeros(
        (batch_size, psdShape[0], psdShape[1]), np.float32, 'qq_psyTh')

    if delta is not None:
        initial_delta = (delta - audio).astype(np.float32)
    else:
        initial_delta = np.zeros(audio_shape).astype(np.float32) / 2
    self.delta = delta = tf.Variable(initial_delta, name='qq_delta')

    self.mask = mask = _zeros(audio_shape, np.float32, 'qq_mask')
    self.original = original = _zeros(
        audio_shape, np.float32, 'qq_original')
    self.lengths = lengths = _zeros(batch_size, np.int32, 'qq_lengths')
    self.target_phrase = _zeros(
        (batch_size, phrase_length), np.int32, 'qq_phrase')
    self.target_phrase_lengths = _zeros(
        batch_size, np.int32, 'qq_phrase_lengths')

    self.apply_delta = self.clipBatch(
        delta, psyTh, regularizer, psdMaxes, max_audio_len, window_size,
        step_per_window)

    # Model input: clipped perturbation times the mask, plus the original.
    # The mask lets certain positions stay untouched (length padding).
    self.new_input = new_input = self.apply_delta * mask + original

    self.logits = logits = get_logits(new_input, lengths)

    # Restore every variable except our own 'qq' ones.
    model_vars = [v for v in tf.global_variables() if 'qq' not in v.name]
    tf.train.Saver(model_vars).restore(sess, "models/session_dump")

    sparse_target = ctc_label_dense_to_sparse(
        self.target_phrase, self.target_phrase_lengths, batch_size)
    ctcLoss = tf.nn.ctc_loss(
        labels=tf.cast(sparse_target, tf.int32),
        inputs=logits,
        sequence_length=lengths)
    self.expanded_loss = tf.constant(0)

    self.deltaPSD = tfPSD(
        self.new_input - self.original, window_size, step_per_window,
        self.psdMaxes)
    self.loss = ctcLoss
    # Largest excess of the perturbation PSD over the threshold, per item.
    self.psyLoss = tf.reduce_max(self.deltaPSD - self.psyTh, axis=[1, 2])
    self.ctcLoss = ctcLoss

    # Adam step on delta; remember which variables existed beforehand so
    # only the optimizer's new ones (plus delta) are initialised.
    names_before = set(v.name for v in tf.global_variables())
    optimizer = tf.train.AdamOptimizer(learning_rate)
    gradient, variable = optimizer.compute_gradients(
        self.loss, [delta])[0]
    self.train = optimizer.apply_gradients([(gradient, variable)])

    created = [
        v for v in tf.global_variables() if v.name not in names_before
    ]
    sess.run(tf.variables_initializer(created + [delta]))

    # Decoder over the logits, used to inspect the current transcription.
    self.decoded, _ = tf.nn.ctc_beam_search_decoder(
        logits, lengths, merge_repeated=False, beam_width=100)