def test_attack(sess,model,data,label):
    """Craft untargeted Carlini-Wagner L2 adversarial examples for `data` and
    report per-sample classification, L2 distortion and L1 distance.

    Relies on module-level helpers: NN, dataBasics, show, getTargets,
    l1Distance, directory_pic_string, startIndexOfImage.
    """
    sifts = []
    # assumes data[0][1] carries the image side length — TODO confirm against caller
    attacker = CarliniL2(sess, model, len(data[0][1]), NN.img_channels,
                         NN.nb_classes, sifts, batch_size=len(data),
                         max_iterations=1000, confidence=0, targeted=False)
    t_begin = time.time()
    adv = attacker.attack(data, getTargets(label))
    print(adv.shape)
    t_end = time.time()
    print("Took", t_end - t_begin, "seconds to run", len(data), "samples.")
    for idx, adv_img in enumerate(adv):
        print("Valid:")
        show(data[idx])
        print("Adversarial:")
        show(adv_img)
        # keep information for the original image
        (new_class, new_confident) = NN.predictWithImage(model, adv_img + 0.5)
        new_class_str = dataBasics.LABELS(int(new_class))
        out_path = "%s/%s_converted_into_%s_with_confidence_%s.png" % (
            directory_pic_string, startIndexOfImage + idx, new_class_str, new_confident)
        dataBasics.save(-1, np.squeeze(adv_img), out_path)
        print("Classification:", model.predict(adv[idx:idx + 1] + 0.5))
        print("Total distortion:", np.sum((adv_img - data[idx]) ** 2) ** .5)
        print("L1 distance:", l1Distance(data[idx], adv_img))
def __init__(self, sess, classifier, ord, confidence=None, **kwargs):
    """Select the Carlini-Wagner attack matching the requested norm.

    An infinite `ord` means the L-infinity attack; any finite value falls
    back to the L2 attack, which additionally accepts `confidence`.
    Extra keyword arguments are forwarded to the chosen attack unchanged.
    """
    if np.isinf(ord):
        self.attacker = CarliniLi(sess, classifier, **kwargs)
        return
    self.attacker = CarliniL2(sess, classifier, confidence=confidence, **kwargs)
def attack(data, name):
    """Run a targeted Carlini-Wagner L2 attack on the first 100 test images
    of `data` against the saved Keras model `models/<name>`, save the
    adversarial batch to /tmp/<name>.npy and print the mean L2 distortion.
    """
    session = K.get_session()
    model = load_model("models/" + name, custom_objects={'fn': fn})
    is_mnist = "mnist" in name

    class Wrap:
        # Static metadata the CarliniL2 implementation reads off its model object.
        image_size = 28 if is_mnist else 32
        num_labels = 10
        num_channels = 1 if is_mnist else 3

        def predict(self, x):
            return model(x)

    cw = CarliniL2(session, Wrap(), batch_size=100, max_iterations=10000,
                   binary_search_steps=5, initial_const=1, targeted=True)
    samples = data.test_data[:100]
    adv = cw.attack(samples, get_labs(data.test_labels[:100]))
    np.save("/tmp/" + name, adv)
    print(np.mean(np.sum((adv - samples) ** 2, axis=(1, 2, 3)) ** .5))
def attack(self):
    """For each requested sample, run the CarliniL2 attack once per constant in
    self.CONST_LIST, logging the prediction and normalized squared distortion of
    each attempt, then write per-sample prediction/distortion rows.
    """
    attack = CarliniL2(self.model.sess, self.model, batch_size=1,
                       max_iterations=self.MAX_ITERATION, confidence=0,
                       direction=self.args.direction)
    for i in range(self.args.start, self.args.start + self.args.instances):
        image = self.data.get_image([i])[0]
        org_prediction = self.model.predict_image(image)[0][0]
        # Index 0 of each log holds the clean-image baseline.
        distortions = [0]
        predictions = [org_prediction]
        for j in self.CONST_LIST:
            time_start = time.time()
            inputs = self.data.get_image([i])
            adv = attack.attack(inputs, [j])
            new_prediction = self.model.predict_image(adv[0])[0][0]
            x_diff = adv[0] - inputs[0]
            # Squared distortion normalized by the image's dynamic range.
            distortion = np.sum(
                np.square(x_diff.raw_image / (image.max_value - image.min_value)))
            distortions.append(distortion)
            predictions.append(new_prediction)
            print('Sample %d' % i, "Const %f" % j,
                  "Time used %.2f" % (time.time() - time_start),
                  new_prediction, distortion)
            # Only dump raw tensors for the very last sample/constant combination.
            if self.args.save_data and i >= self.args.start + self.args.instances - 1 and j == self.CONST_LIST[
                    -1]:
                self.save_variables([
                    inputs[0], adv[0], predictions[0], new_prediction,
                    distortion
                ])
        self.write_results(predictions)
        self.write_results(distortions)
def main(_):
    """Attack an image-captioning model with a Carlini-Wagner style attack.

    Depending on FLAGS this runs a targeted / untargeted, keyword-based or
    exact-caption attack, binary-searches the regularization constant C, and
    appends per-image results to CSV logs under FLAGS.result_directory.
    """
    tf.set_random_seed(FLAGS.seed)
    random.seed(FLAGS.seed)
    np.random.seed(FLAGS.seed)
    beam_size = FLAGS.beam_size
    record_path = FLAGS.result_directory
    # we should use os.path.join!
    if record_path[-1] != "/":
        record_path += "/"
    with open(FLAGS.caption_file) as data_file:
        caption_file = json.load(data_file)
    caption_info = caption_file['annotations']
    print("using " + FLAGS.norm + " for attack")
    print("targeted?", FLAGS.targeted)
    print("attack confidence kappa", FLAGS.confidence)
    # Build the CSV header; its shape differs between keyword and caption attacks.
    if FLAGS.use_keywords:
        keywords_num = FLAGS.keywords_num
        header = ("target filename","attack filename",\
        "L2 distortion","L_inf distortion","loss","loss1","loss2",\
        "optimal C","attack successful?","target_sentence")
        header += tuple(["keywords"] * keywords_num) + tuple(["human caption"])
        header += tuple([
            val for pair in zip([
                "caption before attack " + str(i + 1) for i in range(beam_size)
            ], [
                "prob of caption before attack " + str(i + 1)
                for i in range(beam_size)
            ]) for val in pair
        ])
        header += tuple([
            val for pair in zip([
                "caption after attack " + str(i + 1) for i in range(beam_size)
            ], [
                "prob of caption after attack " + str(i + 1)
                for i in range(beam_size)
            ]) for val in pair
        ])
        with open('wordPOS/noun.txt') as noun_file:
            noun = noun_file.read().split()
        with open('wordPOS/verb.txt') as verb_file:
            verb = verb_file.read().split()
        with open('wordPOS/adjective.txt') as adjective_file:
            adjective = adjective_file.read().split()
        with open('wordPOS/adverb.txt') as adverb_file:
            adverb = adverb_file.read().split()
        # good words are noun, verb, adj or adv. We do not want words like "a" or "the" to be our keywords.
        # Those .txt files are generated by classifying the vocabulary list.
        good_words = set(noun + verb + adjective + adverb)
    else:
        header = ("target filename","attack filename","L2 distortion","L_inf distortion","loss","loss1","loss2",\
        "optimal C","attack successful?")
        header += tuple([
            val for pair in zip(
                ["target caption " + str(i + 1) for i in range(beam_size)], [
                    "prob of target caption " + str(i + 1)
                    for i in range(beam_size)
                ]) for val in pair
        ])
        header += tuple(["human caption"])
        header += tuple([
            val for pair in zip([
                "caption before attack " + str(i + 1) for i in range(beam_size)
            ], [
                "prob of caption before attack " + str(i + 1)
                for i in range(beam_size)
            ]) for val in pair
        ])
        header += tuple([
            val for pair in zip([
                "caption after attack " + str(i + 1) for i in range(beam_size)
            ], [
                "prob of caption after attack " + str(i + 1)
                for i in range(beam_size)
            ]) for val in pair
        ])
    os.system("mkdir -p {}".format(os.path.join(record_path, "fail_log")))
    record = open(
        os.path.join(record_path, "record_" + str(FLAGS.offset) + ".csv"),
        "a+")
    writer = csv.writer(record)
    writer.writerow(header)
    record.close()
    fail_log = open(
        os.path.join(record_path,
                     "fail_log/fail_record_" + str(FLAGS.offset) + ".csv"),
        "a+")
    fail_log_writer = csv.writer(fail_log)
    fail_log_writer.writerow(header)
    fail_log.close()
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.4)
    config = tf.ConfigProto(gpu_options=gpu_options)
    vocab = vocabulary.Vocabulary(FLAGS.vocab_file)
    # Separate graph/session used only for captioning (inference) of images.
    inference_graph = tf.Graph()
    with inference_graph.as_default():
        inf_model = inference_wrapper.InferenceWrapper()
        inf_restore_fn = inf_model.build_graph_from_config(
            configuration.ModelConfig(), FLAGS.checkpoint_path)
        inf_image_placeholder = tf.placeholder(dtype=tf.string, shape=[])
        inf_preprocessor = inf_model.model.process_image(inf_image_placeholder)
    inference_graph.finalize()
    inf_sess = tf.Session(graph=inference_graph, config=config)
    # Load the model from checkpoint.
    inf_restore_fn(inf_sess)
    inf_generator = caption_generator.CaptionGenerator(inf_model, vocab,
                                                       beam_size=beam_size)
    # A third graph/session captions the *target* image when one is needed.
    if FLAGS.targeted or FLAGS.use_keywords:
        target_g = tf.Graph()
        with target_g.as_default():
            target_model = inference_wrapper.InferenceWrapper()
            target_restore_fn = target_model.build_graph_from_config(
                configuration.ModelConfig(), FLAGS.checkpoint_path)
            target_image_placeholder = tf.placeholder(dtype=tf.string, shape=[])
            target_preprocessor = target_model.model.process_image(
                target_image_placeholder)
        target_g.finalize()
        target_sess = tf.Session(graph=target_g, config=config)
        target_restore_fn(target_sess)
        target_generator = caption_generator.CaptionGenerator(
            target_model, vocab, beam_size=beam_size)
    attack_graph = tf.Graph()
    with attack_graph.as_default():
        model = attack_wrapper.AttackWrapper()
        sess = tf.Session(config=config)
        # build the attacker graph
        print("target:", FLAGS.targeted)
        attack = CarliniL2(sess, inf_sess, attack_graph, inference_graph,
                           model, inf_model, targeted=FLAGS.targeted,
                           use_keywords=FLAGS.use_keywords,
                           use_logits=FLAGS.use_logits, batch_size=1,
                           initial_const=FLAGS.C, max_iterations=FLAGS.iters,
                           print_every=1, confidence=FLAGS.confidence,
                           use_log=False, norm=FLAGS.norm, abort_early=False,
                           learning_rate=0.005)
        # compute graph for preprocessing
        image_placeholder = tf.placeholder(dtype=tf.string, shape=[])
        preprocessor = model.model.process_image(image_placeholder)
    # get all the files in the directory
    image_directory = FLAGS.image_directory
    filenames = [file for file in os.listdir(image_directory)]
    filenames.sort()
    random.shuffle(filenames)
    for j in range(FLAGS.exp_num):
        # Attack images are drawn from the end of the shuffled list,
        # target images (targeted mode) from the front.
        attack_filename = filenames[len(filenames) - 1 - j - FLAGS.offset]
        attack_image_id = int(
            re.match(r"^.*\_(.*)\..*$", attack_filename).group(1))
        human_cap = next((item for item in caption_info
                          if item["image_id"] == attack_image_id))
        human_cap = human_cap['caption']
        print("attack filename:", attack_filename)
        print("human's caption:", human_cap)
        with tf.gfile.GFile(image_directory + attack_filename, "rb") as f:
            image = f.read()
        raw_image = sess.run(preprocessor,
                             feed_dict={image_placeholder: image})
        show(raw_image, record_path,
             "original_" + attack_filename.replace(".jpg", ".png"))
        raw_filename = record_path + "original_" + attack_filename.replace(
            ".jpg", ".png.npy")
        # raw_image = np.squeeze(np.load(raw_filename))
        raw_captions = inf_generator.beam_search(inf_sess, raw_image)
        print("Captions for original image %s:" %
              os.path.basename(raw_filename))
        raw_sentences = []
        raw_probs = []
        for indx, raw_caption in enumerate(raw_captions):
            raw_sentence = [
                vocab.id_to_word(w) for w in raw_caption.sentence[1:-1]
            ]
            raw_sentence = " ".join(raw_sentence)
            print(" %d) %s (p=%f)" % (1, raw_sentence,
                                      math.exp(raw_caption.logprob)))
            raw_sentences = raw_sentences + [raw_sentence]
            raw_probs = raw_probs + [math.exp(raw_caption.logprob)]
        if FLAGS.targeted:
            # If it's targeted attack, we pick another image as our target image to generate target caption for us.
            target_filename = filenames[j + FLAGS.offset]
            print("Captions for target image %s:" %
                  os.path.basename(target_filename))
            with tf.gfile.GFile(image_directory + target_filename, "rb") as f:
                target_image = f.read()
            target_image = target_sess.run(
                target_preprocessor, {target_image_placeholder: target_image})
            target_captions = target_generator.beam_search(
                target_sess, target_image)
            target_sentences = []
            target_probs = []
            for indx, target_caption in enumerate(target_captions):
                target_sentence = [
                    vocab.id_to_word(w) for w in target_caption.sentence[1:-1]
                ]
                target_sentence = " ".join(target_sentence)
                print(" %d) %s (p=%f)" % (1, target_sentence,
                                          math.exp(target_caption.logprob)))
                target_sentences = target_sentences + [target_sentence]
                target_probs = target_probs + [
                    math.exp(target_caption.logprob)
                ]
        else:
            # If it's untargeted, our target sentence is the attack image's own original caption.
            target_sentences = raw_sentences
            target_probs = raw_probs
            target_filename = attack_filename
        if FLAGS.use_keywords:
            if FLAGS.input_feed:
                # If there is an input feed, we use input feed as our keywords.
                words = FLAGS.input_feed.split()
            else:
                # If there is no input feed, we use select keywords from the target caption.
                target_sentences_words = set(target_sentences[0].split())
                raw_sentences_words = set(raw_sentences[0].split())
                if FLAGS.targeted:
                    # If tagreted, we also need to exclude the words in the original caption.
                    word_candidates = list((target_sentences_words
                                            & good_words) -
                                           raw_sentences_words)
                    word_candidates.sort()
                else:
                    word_candidates = list(
                        (target_sentences_words & good_words))
                    word_candidates.sort()
                if len(word_candidates) < keywords_num:
                    print("words not enough for this attack!")
                    print(
                        "****************************************** END OF THIS ATTACK ******************************************"
                    )
                    continue
                # Randomly select keywords from all candidates.
                words = list(
                    np.random.choice(word_candidates, keywords_num,
                                     replace=False))
        # run multiple attacks
        success = []
        C_val = [FLAGS.C]
        best_adv = None
        best_loss, best_loss1, best_loss2 = None, None, None
        l2_distortion_log = []
        linf_distortion_log = []
        best_l2_distortion = 1e10
        best_linf_distortion = 1e10
        adv_log = []
        loss1_log = []
        loss2_log = []
        loss_log = []
        # Binary search over the attack constant C.
        for try_index in range(FLAGS.C_search_times):
            attack_const = C_val[try_index]
            max_caption_length = 20
            if FLAGS.use_keywords:
                # keywords based attack
                key_words = [vocab.word_to_id(word) for word in words]
                print("My key words are: ", words)
                key_words_mask = np.append(
                    np.ones(len(key_words)),
                    np.zeros(max_caption_length - len(key_words)))
                key_words = key_words + [vocab.end_id] * (max_caption_length -
                                                          len(key_words))
                adv, loss, loss1, loss2, _ = attack.attack(
                    np.array([raw_image]), sess, inf_sess, model, inf_model,
                    vocab, key_words, key_words_mask, j, try_index, beam_size,
                    FLAGS.infer_per_iter, attack_const=attack_const)
            else:
                # exact attack
                if FLAGS.targeted:
                    if FLAGS.input_feed:
                        new_sentence = FLAGS.input_feed
                    else:
                        new_sentence = target_sentences[0]
                else:
                    new_sentence = raw_sentences[0]
                # new_sentence = "a black and white photo of a train on a track ."
                new_sentence = new_sentence.split()
                print("My target sentence:", new_sentence)
                new_caption = [vocab.start_id
                               ] + [vocab.word_to_id(w)
                                    for w in new_sentence] + [vocab.end_id]
                true_cap_len = len(new_caption)
                new_caption = new_caption + [vocab.end_id] * (
                    max_caption_length - true_cap_len)
                print("My target id:", new_caption)
                new_mask = np.append(
                    np.ones(true_cap_len),
                    np.zeros(max_caption_length - true_cap_len))
                adv, loss, loss1, loss2, _ = attack.attack(
                    np.array([raw_image]), sess, inf_sess, model, inf_model,
                    vocab, new_caption, new_mask, j, try_index, 1,
                    attack_const=attack_const)
            # save information of this image to log array
            adv_log += [adv]
            loss_log += [loss]
            loss1_log += [loss1]
            loss2_log += [loss2]
            adv_captions = inf_generator.beam_search(inf_sess,
                                                     np.squeeze(adv))
            print("Captions after this attempt:")
            adv_caption = adv_captions[0]
            adv_sentence = [
                vocab.id_to_word(w) for w in adv_caption.sentence[1:-1]
            ]
            adv_sentence = " ".join(adv_sentence)
            print(" %d) %s (p=%f)" % (1, adv_sentence,
                                      math.exp(adv_caption.logprob)))
            # Success criterion depends on the attack mode.
            if FLAGS.use_keywords:
                if FLAGS.targeted:
                    success += [set(words) < set(adv_sentence.split())]
                else:
                    success += [
                        not bool(set(words) & set(adv_sentence.split()))
                    ]
            else:
                if FLAGS.targeted:
                    success += [(adv_sentence == target_sentences[0])]
                else:
                    '''
                    raw_split = [item.split() for item in raw_sentences]
                    nltk_BLEU = nltk.translate.bleu_score.sentence_bleu(raw_split, adv_sentence.split())
                    print("BLEU by nltk is:", nltk_BLEU)
                    success += [nltk_BLEU<0.5]
                    '''
                    # For untargeted and caption based attack, there is no simple criterion to determine an attack is successful or not. We need to calculate the scores.
                    # So here we always assumee the attack is fail, then we save fail log for score calculation.
                    success += [False]
            print("Attack with this C is successful?", success[try_index])
            l2_distortion = np.sum((adv - raw_image)**2)**.5
            linf_distortion = np.max(np.abs(adv - raw_image))
            l2_distortion_log += [l2_distortion]
            linf_distortion_log += [linf_distortion]
            print("L2 distortion is", l2_distortion)
            print("L_inf distortion is", linf_distortion)
            if success[try_index]:
                # Among the successful attacks, we select the one with minimum distortion as our final result.
                # Note this one may not correspond to minimum C.
                if FLAGS.norm == "l2":
                    if l2_distortion < best_l2_distortion:
                        best_adv = adv
                        best_loss, best_loss1, best_loss2 = loss, loss1, loss2
                        best_l2_distortion = l2_distortion
                        best_linf_distortion = linf_distortion
                        final_C = C_val[try_index]
                elif FLAGS.norm == "inf":
                    if linf_distortion < best_linf_distortion:
                        best_adv = adv
                        best_loss, best_loss1, best_loss2 = loss, loss1, loss2
                        best_l2_distortion = l2_distortion
                        best_linf_distortion = linf_distortion
                        final_C = C_val[try_index]
                else:
                    raise ValueError("unsupported distance metric:" +
                                     FLAGS.norm)
            if FLAGS.targeted or FLAGS.use_keywords:
                # We do binary search to find next C.
                if try_index + 1 < FLAGS.C_search_times:
                    if success[try_index]:
                        if any(not _ for _ in success):
                            # Bisect between this C and the most recent failing C.
                            last_false = len(success) - success[::-1].index(
                                False) - 1
                            C_val += [
                                0.5 * (C_val[try_index] + C_val[last_false])
                            ]
                        else:
                            C_val += [C_val[try_index] * 0.5]
                    else:
                        if any(_ for _ in success):
                            # Bisect between this C and the most recent succeeding C.
                            last_true = len(success) - success[::-1].index(
                                True) - 1
                            C_val += [
                                0.5 * (C_val[try_index] + C_val[last_true])
                            ]
                        else:
                            C_val += [C_val[try_index] * 10.0]
            else:
                C_val += [C_val[try_index] * 10.0]
        print("results of each attempt:", success)
        print("C values of each attempt:", C_val)
        print("L2 distortion log is", l2_distortion_log)
        print("L_inf distortion log is", linf_distortion_log)
        final_success = any(_ for _ in success)
        if not final_success:
            # No attempt succeeded: keep the last attempt and dump a fail log.
            final_C = C_val[-1]
            best_adv = adv
            best_loss, best_loss1, best_loss2 = loss, loss1, loss2
            if FLAGS.use_keywords:
                target_info = {
                    "words": words,
                    "target_filename": target_filename,
                    "target_sentences": target_sentences
                }
            else:
                target_info = {
                    'target_filename': target_filename,
                    "target_sentences": target_sentences,
                    "target_probs": target_probs
                }
            save_fail_log(adv_log, loss_log, loss1_log, loss2_log, l2_distortion_log, linf_distortion_log, success, C_val, record_path, attack_filename, raw_image, human_cap,\
            raw_sentences, raw_probs, inf_sess, inf_generator, vocab, target_info)
        show(best_adv, record_path,
             "adversarial_" + attack_filename.replace(".jpg", ".png"))
        show(best_adv - raw_image, record_path,
             "diff_" + attack_filename.replace(".jpg", ".png"))
        best_l2_distortion = np.sum((best_adv - raw_image)**2)**.5
        best_linf_distortion = np.max(np.abs(best_adv - raw_image))
        print("best L2 distortion is", best_l2_distortion)
        print("best L_inf distortion is", best_linf_distortion)
        adv_filename = record_path + "adversarial_" + attack_filename.replace(
            ".jpg", ".png.npy")
        adv_image = np.squeeze(np.load(adv_filename))
        adv_captions = inf_generator.beam_search(inf_sess, adv_image)
        print("Captions for adversarial image %s:" %
              os.path.basename(adv_filename))
        adv_sentences = []
        adv_probs = []
        for indx, adv_caption in enumerate(adv_captions):
            adv_sentence = [
                vocab.id_to_word(w) for w in adv_caption.sentence[1:-1]
            ]
            adv_sentence = " ".join(adv_sentence)
            print(" %d) %s (p=%f)" % (1, adv_sentence,
                                      math.exp(adv_caption.logprob)))
            adv_sentences = adv_sentences + [adv_sentence]
            adv_probs = adv_probs + [math.exp(adv_caption.logprob)]
        # Append one result row per image to the record CSV.
        record = open(record_path + "record_" + str(FLAGS.offset) + ".csv",
                      "a+")
        writer = csv.writer(record)
        if FLAGS.use_keywords:
            row = (target_filename, attack_filename, best_l2_distortion,best_linf_distortion,\
            best_loss,best_loss1,best_loss2,final_C,str(final_success),target_sentences[0])
            row += tuple(words) + tuple([human_cap])
            row += tuple([
                val for pair in zip(raw_sentences, raw_probs) for val in pair
            ])
            row += tuple([
                val for pair in zip(adv_sentences, adv_probs) for val in pair
            ])
            writer.writerow(row)
        else:
            row = (target_filename, attack_filename, best_l2_distortion,
                   best_linf_distortion, best_loss, best_loss1, best_loss2,
                   final_C, str(final_success))
            row += tuple([
                val for pair in zip(target_sentences, target_probs)
                for val in pair
            ]) + tuple([human_cap])
            row += tuple([
                val for pair in zip(raw_sentences, raw_probs) for val in pair
            ])
            row += tuple([
                val for pair in zip(adv_sentences, adv_probs) for val in pair
            ])
            writer.writerow(row)
        record.close()
        print(
            "****************************************** END OF THIS ATTACK ******************************************"
        )
    sess.close()
    inf_sess.close()
    if FLAGS.use_keywords or FLAGS.targeted:
        target_sess.close()
def main(args):
    """Run a CW-L2 or LADMM-L2 attack over a dataset chosen by `args`, then
    evaluate the perturbed model's accuracy on adversarial, source and test
    images and print the success rates plus L0/L2 distances.
    """
    with tf.Session() as sess:
        if args['dataset'] == 'mnist':
            data, model = MNIST(), MNISTModel("models/mnist", sess)
            handpick = False
            inception = False
        if args['dataset'] == "cifar":
            data, model = CIFAR(), CIFARModel("models/cifar", sess)
            handpick = True
            inception = False
        if args['dataset'] == "imagenet":
            data, model = ImageNet(args['seed_imagenet']), InceptionModel(sess)
            handpick = True
            inception = True
        # Optional model overrides: adversarially-trained or distilled variants.
        if args['adversarial'] != "none":
            model = MNISTModel("models/mnist_cwl2_admm" + str(args['adversarial']),
                               sess)
        if args['temp'] and args['dataset'] == 'mnist':
            model = MNISTModel("models/mnist-distilled-" + str(args['temp']),
                               sess)
        if args['temp'] and args['dataset'] == 'cifar':
            model = CIFARModel("models/cifar-distilled-" + str(args['temp']),
                               sess)
        inputs, targets, labels, true_ids = generate_data_ST(data, model,
                                                             samples=args['numimg'],
                                                             samplesT=args['numimgT'],
                                                             targeted=True,
                                                             start=0,
                                                             inception=inception,
                                                             handpick=handpick,
                                                             seed=args['seed'])
        #print(true_ids)
        if args['attack'] == 'L2C':
            attack = CarliniL2(sess, model, batch_size=args['batch_size'],
                               max_iterations=args['maxiter'],
                               confidence=args['conf'],
                               binary_search_steps=args['binary_steps'],
                               abort_early=args['abort_early'])
        if args['attack'] == 'L2LA2':
            attack = LADMML2re(sess, model, batch_size=args['batch_size'],
                               max_iterations=args['maxiter'],
                               layernum=args['layer_number'],
                               use_kernel=args['use_kernel'],
                               confidence=args['conf'],
                               binary_search_steps=args['iteration_steps'],
                               ro=args['ro'],
                               abort_early=args['abort_early'])
        timestart = time.time()
        adv = attack.attack(inputs, targets)
        timeend = time.time()
        print("Took", timeend - timestart, "seconds to run", len(inputs),
              "samples.\n")
        if args['conf'] != 0:
            model = MNISTModel("models/mnist-distilled-100", sess)
        if args['kernel_bias']:
            # Evaluate with perturbed kernel/bias, then restore and re-evaluate.
            EP = evaluate_perturbation_kb(args, sess, model, inputs)
            scores, l2 = EP(inputs, targets, adv)
            EPT = evaluate_perturbation_testset(args, sess, model,
                                                data.test_data)
            test_scores = EPT(data.test_data, data.test_labels)
            EP2 = evaluate_perturbation_kb_restore(args, sess, model, inputs)
            scores2 = EP2(inputs, targets, adv)
            EPT2 = evaluate_perturbation_testset(args, sess, model,
                                                 data.test_data)
            test_scores2 = EPT2(data.test_data, data.test_labels)
        else:
            # NOTE(review): in this branch `scores`, `scores2`, `test_scores`,
            # `test_scores2` and `l2` are never assigned (the calls are commented
            # out), so the loops below raise NameError — confirm intended usage.
            EP = evaluate_perturbation(args, sess, model, inputs)
            # scores = EP(inputs, targets, adv)
            # scores2 = EP2(inputs, targets, adv)
        score_count = []
        score_count2 = []
        score_count3 = []
        score_count4 = []
        # Perturbed model vs. target labels (score_count4 covers the S images only).
        for e, (sc) in enumerate(scores):
            if np.argmax(sc) == np.argmax(targets[e]):
                score_count.append(1)
                if e < args['numimg']:
                    score_count4.append(1)
            else:
                score_count.append(0)
                if e < args['numimg']:
                    score_count4.append(0)
        # Perturbed model vs. original labels.
        for e, (sc) in enumerate(scores):
            if np.argmax(sc) == np.argmax(labels[e]):
                score_count3.append(1)
            else:
                score_count3.append(0)
        # Restored (original) model vs. original labels.
        for e, (sc2) in enumerate(scores2):
            if np.argmax(sc2) == np.argmax(labels[e]):
                score_count2.append(1)
            else:
                score_count2.append(0)
        test_score_count = []
        test_score_count2 = []
        for e, (tsc) in enumerate(test_scores):
            if np.argmax(tsc) == np.argmax(data.test_labels[e]):
                test_score_count.append(1)
            else:
                test_score_count.append(0)
        for e, (tsc2) in enumerate(test_scores2):
            if np.argmax(tsc2) == np.argmax(data.test_labels[e]):
                test_score_count2.append(1)
            else:
                test_score_count2.append(0)
        l0s = np.count_nonzero(adv)
        successrate = np.mean(score_count)
        successrate2 = np.mean(score_count2)
        successrate3 = np.mean(score_count3)
        test_successrate = np.mean(test_score_count)
        test_successrate2 = np.mean(test_score_count2)
        print('original model, success rate of T images for the original labels:',
              successrate2)
        print('modified model, success rate of T images for the original labels:',
              successrate3)
        print('modified model, success rate of T images for the target labels:',
              successrate)
        print('modified model, success rate of S imges for the target labels:',
              np.mean(score_count4))
        print('modified model, success rate of test set for the original labels:',
              test_successrate)
        print('original model, success rate of test set for the original labels:',
              test_successrate2)
        print('l0 distance:', l0s)
        print('l2 distance:', l2)
# In[ ]: if __name__ == "__main__": config = tf.ConfigProto() config.gpu_options.allow_growth = True with tf.Session(config=config) as sess: modelPath = '%smodels/mnist' % (nn_robust_attack_root) data, model = MNIST(), MNISTModel(modelPath, sess) attack = CarliniL2(sess, model, batch_size=1, max_iterations=2000, confidence=0, binary_search_steps=5, initial_const=1., learning_rate=1e-1, targeted=False) inputs, targets = generate_data(data, samples=1000, targeted=False, start=5500, inception=False) original_classified_wrong_number = 0 #number of benign samples that are misclassified disturbed_failure_number = 0 #number of samples that failed to craft corresponding adversarial samples test_number = 0 #number of adversarial samples that we generate TTP = 0
def main(args):
    """Run a CW-L2 or ADMM-L0 attack on the dataset selected by `args` and
    hand the adversarial batch to the matching distance-computation routine.
    """
    with tf.Session() as sess:
        if args['dataset'] == 'mnist':
            data, model = MNIST(), MNISTModel("models/mnist", sess)
            handpick = False
            inception = False
        if args['dataset'] == "cifar":
            data, model = CIFAR(), CIFARModel("models/cifar", sess)
            handpick = True
            inception = False
        if args['dataset'] == "imagenet":
            data, model = ImageNet(args['seed_imagenet']), InceptionModel(sess)
            handpick = True
            inception = True
        # Optional model overrides: adversarially-trained or distilled variants.
        if args['adversarial'] != "none":
            model = MNISTModel(
                "models/mnist_cwl2_admm" + str(args['adversarial']), sess)
        if args['temp'] and args['dataset'] == 'mnist':
            model = MNISTModel("models/mnist-distilled-" + str(args['temp']),
                               sess)
        if args['temp'] and args['dataset'] == 'cifar':
            model = CIFARModel("models/cifar-distilled-" + str(args['temp']),
                               sess)
        inputs, targets, labels, true_ids = generate_data(
            data, model, samples=args['numimg'], targeted=True, start=0,
            inception=inception, handpick=handpick, seed=args['seed'])
        #print(true_ids)
        if args['attack'] == 'L2C':
            attack = CarliniL2(sess, model, batch_size=args['batch_size'],
                               max_iterations=args['maxiter'],
                               confidence=args['conf'],
                               binary_search_steps=args['binary_steps'],
                               abort_early=args['abort_early'])
        if args['attack'] == 'L0A':
            attack = ADMML0(sess, model, batch_size=args['batch_size'],
                            max_iterations=args['maxiter'],
                            confidence=args['conf'],
                            binary_search_steps=args['iteration_steps'],
                            ro=args['ro'],
                            abort_early=args['abort_early'])
        timestart = time.time()
        adv = attack.attack(inputs, targets)
        timeend = time.time()
        print("Took", timeend - timestart, "seconds to run", len(inputs),
              "samples.\n")
        if args['train']:
            # Persist labels and adversarial examples for later training runs.
            np.save('labels_train.npy', labels)
            np.save(str(args['attack']) + '_train.npy', adv)
        if (args['conf'] != 0):
            model = MNISTModel("models/mnist-distilled-100", sess)
        # L0-family attacks get the L0 metric; everything else gets L1/L2/Linf.
        if args['attack'] != 'L0A' and args['attack'] != 'L0AE' and args[
                'attack'] != 'L0C' and args['attack'] != 'L0AE2':
            l1_l2_li_computation(args, data, model, adv, inception, inputs,
                                 targets, labels, true_ids)
        else:
            l0_computation(args, data, model, adv, inception, inputs, targets,
                           labels, true_ids)
# NOTE(review): fragment — these two defs are methods of a class whose header
# (presumably `Pred2`, instantiated below) lies outside this chunk; the final
# statement is truncated mid-call at the chunk boundary.
def __init__(self, reformer):
    self.reformer = reformer

def predict(self, x):
    # Reform (denoise) the input first, then classify the reformed image.
    return classifier.model(self.reformer.model(x))

if attacking:
    # Detector thresholds are loosened (x4) for the attack run.
    thrs = operator.get_thrs(dict((k, v * 4) for k, v in dr.items()))
    attack = CarliniL2(sess, [Pred2(x) for x in reformer], detector_dict, thrs,
                       batch_size=100, binary_search_steps=4,
                       learning_rate=1e-2, max_iterations=10000, targeted=True,
                       initial_const=1, confidence=1, boxmin=0, boxmax=1)
    adv = attack.attack(dat, lab)
    np.save("/tmp/" + dataset + ".npy", adv)
else:
    # Reuse adversarial examples cached by a previous attacking run.
    adv = np.load("/tmp/" + dataset + ".npy")
print('mean distortion', np.mean(np.sum((adv - dat)**2, axis=(1, 2, 3))**.5))
for i, ref in enumerate(reformer):
    print('reformer', i)
    # NOTE(review): statement truncated at the chunk boundary.
    predicted = np.argmax(classifier.model.predict(ref.model.predict(adv)),
def main(args):
    """Attack a (thermometer-)encoded classifier with either the white-box
    CW-L2 attack or the black-box ZOO attack, re-encode each adversarial
    image, and report per-image stats plus overall accuracy.
    """
    temp_encoder = encoder(level=args['level'])
    with tf.Session() as sess:
        use_log = not args['use_zvalue']
        is_inception = args['dataset'] == "imagenet"
        # load network
        print('Loading model', args['dataset'])
        if args['dataset'] == "mnist":
            data, model = MNIST(), MNISTModel("models/mnist", sess, use_log)
            # data, model = MNIST(), MNISTModel("models/mnist-distilled-100", sess, use_log)
        elif args['dataset'] == "cifar10":
            #data, model = CIFAR(), CIFARModel("models/cifar", sess, use_log)
            # data, model = CIFAR(), CIFARModel("models/cifar-distilled-100", sess, use_log)
            data, model = CIFAR(), CIFAR_WIDE("models/wide_resnet", sess,
                                              use_log)
        elif args['dataset'] == "imagenet":
            data, model = ImageNet(), InceptionModel(sess, use_log)
        print('Done...')
        if args['numimg'] == 0:
            # 0 means "use every remaining test image".
            args['numimg'] = len(data.test_labels) - args['firstimg']
        print('Using', args['numimg'], 'test images')
        # load attack module
        if args['attack'] == "white":
            # batch size 1, optimize on 1 image at a time, rather than optimizing images jointly
            attack = CarliniL2(sess, model, batch_size=1,
                               max_iterations=args['maxiter'],
                               print_every=args['print_every'],
                               early_stop_iters=args['early_stop_iters'],
                               confidence=0, learning_rate=args['lr'],
                               initial_const=args['init_const'],
                               binary_search_steps=args['binary_steps'],
                               targeted=not args['untargeted'],
                               use_log=use_log,
                               adam_beta1=args['adam_beta1'],
                               adam_beta2=args['adam_beta2'])
        else:
            # batch size 128, optimize on 128 coordinates of a single image
            attack = BlackBoxL2(sess, model, batch_size=128,
                                max_iterations=args['maxiter'],
                                print_every=args['print_every'],
                                early_stop_iters=args['early_stop_iters'],
                                confidence=0, learning_rate=args['lr'],
                                initial_const=args['init_const'],
                                binary_search_steps=args['binary_steps'],
                                targeted=not args['untargeted'],
                                use_log=use_log, use_tanh=args['use_tanh'],
                                use_resize=args['use_resize'],
                                adam_beta1=args['adam_beta1'],
                                adam_beta2=args['adam_beta2'],
                                reset_adam_after_found=args['reset_adam'],
                                solver=args['solver'],
                                save_ckpts=args['save_ckpts'],
                                load_checkpoint=args['load_ckpt'],
                                start_iter=args['start_iter'],
                                init_size=args['init_size'],
                                use_importance=not args['uniform'])
        random.seed(args['seed'])
        np.random.seed(args['seed'])
        print('Generate data')
        all_inputs, all_targets, all_labels, all_true_ids, encoding_all = generate_data(
            data, samples=args['numimg'], targeted=not args['untargeted'],
            start=args['firstimg'], inception=is_inception)
        print('Done...')
        #print('all_inputs : ', all_inputs.shape)
        #print('encoding_all : ',encoding_all.shape)
        os.system("mkdir -p {}/{}".format(args['save'], args['dataset']))
        img_no = 0
        total_success = 0
        l2_total = 0.0
        origin_correct = 0
        adv_correct = 0
        for i in range(all_true_ids.size):
            print(' adversarial_image_no: ', i)
            inputs = all_inputs[i:i + 1]
            encoding_inputs = encoding_all[i:i + 1]
            #print('encoding_inputs shape: ', encoding_inputs)
            targets = all_targets[i:i + 1]
            labels = all_labels[i:i + 1]
            print("true labels:", np.argmax(labels), labels)
            print("target:", np.argmax(targets), targets)
            # test if the image is correctly classified
            original_predict = model.model.predict(encoding_inputs)
            original_predict = np.squeeze(original_predict)
            original_prob = np.sort(original_predict)
            original_class = np.argsort(original_predict)
            print("original probabilities:", original_prob[-1:-6:-1])
            print("original classification:", original_class[-1:-6:-1])
            print("original probabilities (most unlikely):", original_prob[:6])
            print("original classification (most unlikely):",
                  original_class[:6])
            if original_class[-1] != np.argmax(labels):
                print(
                    "skip wrongly classified image no. {}, original class {}, classified as {}"
                    .format(i, np.argmax(labels), original_class[-1]))
                continue
            origin_correct += np.argmax(labels, 1) == original_class[-1]
            img_no += 1
            timestart = time.time()
            adv, const = attack.attack_batch(inputs, targets)
            if type(const) is list:
                const = const[0]
            if len(adv.shape) == 3:
                # Restore the leading batch dimension if it was squeezed away.
                adv = adv.reshape((1, ) + adv.shape)
            timeend = time.time()
            l2_distortion = np.sum((adv - inputs)**2)**.5
            ##### llj
            # Re-apply the thermometer/temp encoding per channel before
            # querying the (encoded-input) model with the adversarial image.
            encode_adv = np.transpose(adv, axes=(0, 3, 1, 2))
            channel0, channel1, channel2 = encode_adv[:, 0, :, :], encode_adv[:, 1, :, :], encode_adv[:, 2, :, :]
            channel0, channel1, channel2 = temp_encoder.tempencoding(
                channel0), temp_encoder.tempencoding(
                    channel1), temp_encoder.tempencoding(channel2)
            encode_adv = np.concatenate([channel0, channel1, channel2], axis=1)
            encode_adv = np.transpose(encode_adv, axes=(0, 2, 3, 1))
            #### llj
            adversarial_predict = model.model.predict(encode_adv)
            adversarial_predict = np.squeeze(adversarial_predict)
            adversarial_prob = np.sort(adversarial_predict)
            adversarial_class = np.argsort(adversarial_predict)
            print("adversarial probabilities:", adversarial_prob[-1:-6:-1])
            print("adversarial classification:", adversarial_class[-1:-6:-1])
            adv_correct += np.argmax(labels, 1) == adversarial_class[-1]
            success = False
            if args['untargeted']:
                if adversarial_class[-1] != original_class[-1]:
                    success = True
            else:
                if adversarial_class[-1] == np.argmax(targets):
                    success = True
            # Distortions above this bound are not counted as successes.
            if l2_distortion > 20.0:
                success = False
            if success:
                total_success += 1
                l2_total += l2_distortion
            suffix = "id{}_seq{}_prev{}_adv{}_{}_dist{}".format(
                all_true_ids[i], i, original_class[-1], adversarial_class[-1],
                success, l2_distortion)
            print("Saving to", suffix)
            show(
                inputs, "{}/{}/{}_original_{}.png".format(
                    args['save'], args['dataset'], img_no, suffix))
            show(
                adv, "{}/{}/{}_adversarial_{}.png".format(
                    args['save'], args['dataset'], img_no, suffix))
            show(
                adv - inputs, "{}/{}/{}_diff_{}.png".format(
                    args['save'], args['dataset'], img_no, suffix))
            print(
                "[STATS][L1] total = {}, seq = {}, id = {}, time = {:.3f}, success = {}, const = {:.6f}, prev_class = {}, new_class = {}, distortion = {:.5f}, success_rate = {:.3f}, l2_avg = {:.5f}"
                .format(img_no, i, all_true_ids[i], timeend - timestart,
                        success, const, original_class[-1],
                        adversarial_class[-1], l2_distortion,
                        total_success / float(img_no),
                        0 if total_success == 0 else l2_total / total_success))
            sys.stdout.flush()
        print(' origin accuracy : ',
              100.0 * origin_correct / all_true_ids.size)
        print(' adv accuracy : ', 100.0 * adv_correct / all_true_ids.size)
# NOTE(review): fragment — the opening `def generate_data(...)` header lies
# outside this chunk, and the final print is truncated mid-call.
        inputs.append(data.test_data[start + i])
        targets.append(data.test_labels[start + i])
    inputs = np.array(inputs)
    targets = np.array(targets)
    return inputs, targets


# Script entry point: targeted CW-L2 attack on a small MNIST batch.
if __name__ == "__main__":
    with tf.Session() as sess:
        data, model = MNIST(), MNISTModel("models/mnist", sess)
        #data, model = CIFAR(), CIFARModel("models/cifar", sess)
        attack = CarliniL2(sess, model, batch_size=9, max_iterations=1000,
                           confidence=0)
        #attack = CarliniL0(sess, model, max_iterations=1000, initial_const=10,
        #                   largest_const=15)
        inputs, targets = generate_data(data, samples=1, targeted=True,
                                        start=0, inception=False)
        timestart = time.time()
        adv = attack.attack(inputs, targets)
        timeend = time.time()
        # NOTE(review): statement truncated at the chunk boundary.
        print("Took", timeend - timestart, "seconds to run", len(inputs),
def test_cw():
    """Exploratory smoke-test of the three Carlini-Wagner attacks (L2, L0, Li).

    Builds a TF session, loads a model (the second `with` block overwrites the
    MNIST pair with CIFAR — presumably intentional for interactive use; TODO
    confirm), runs all three attacks on a small batch, saves image grids, and
    then pokes at the predictions interactively.  Many trailing expressions are
    evaluated purely for their value in a REPL and have no effect as a script.
    """
    sess = tf.Session()
    # sess.run(tf.global_variables_initializer())
    # keras maintains a tf session. It must be set by either
    # keras.backend.set_session(sess), or use inside a context manager
    # sess.as_default()
    with sess.as_default():
        data, model = MNIST(), MNISTModel("models/mnist", sess)
    with sess.as_default():
        # NOTE(review): this overwrites the MNIST data/model loaded above.
        data, model = CIFAR(), CIFARModel("models/cifar", sess)
    # testing the model: sanity-check predictions on the first 10 test images
    np.argmax(model.model.predict(data.test_data[:10]), axis=1)
    print(np.argmax(data.test_labels[:10], axis=1))
    #data, model = CIFAR(), CIFARModel("models/cifar", sess)
    # One attack object per norm; hyper-parameters follow the reference code.
    attack_l2 = CarliniL2(sess, model, batch_size=10, max_iterations=1000,
                          confidence=0)
    attack_l0 = CarliniL0(sess, model, max_iterations=1000, initial_const=10,
                          largest_const=15)
    attack_li = CarliniLi(sess, model)
    inputs, targets = generate_data(data, samples=1, targeted=True,
                                    start=0, inception=False)
    # TODO find the first digits of each kind, try map it to the next digit
    # NOTE(review): this second call replaces the inputs/targets just generated.
    inputs, targets = generate_data_2(data)
    adv_l2 = attack_l2.attack(inputs, targets)
    adv_l0 = attack_l0.attack(inputs, targets)
    adv_li = attack_li.attack(inputs, targets)
    plt.tight_layout()
    plt.tight_layout(pad=1, w_pad=1, h_pad=1)
    # Save grids of the originals and each attack's adversarial images.
    grid_show_image(inputs, 10, 1, 'images/orig-mnist.png')
    grid_show_image(adv_l2, 10, 1, 'images/l2.png')
    grid_show_image(adv_l0, 10, 1, 'images/l0.png')
    grid_show_image(adv_li, 9, 2, 'images/li.png')
    from contextlib import redirect_stdout
    redirect_stdout  # no-op expression; import kept for interactive use
    # Squared L2 distortion of the first L2 adversarial example.
    np.sum((adv_l2[0] - inputs[0])**2)
    # np.argmax(targets, axis=1)
    # import keras
    # keras.backend.set_session(sess)
    # Compare clean vs adversarial predictions (REPL-style, values discarded).
    np.argmax(model.model.predict(inputs), axis=1)
    np.argmax(targets, axis=1)
    # # (((adv_l2 + 0.5)*255).round())
    np.argmax(model.model.predict(adv_l2), axis=1)
    np.argmax(model.model.predict(adv_l0), axis=1)
    np.argmax(model.model.predict(adv_li), axis=1)
    np.sum(model.model.predict(adv_l2), axis=1)
    # Model outputs look like logits here (softmax applied manually) —
    # TODO confirm against the model definition.
    np.sum(sess.run(tf.nn.softmax(model.model.predict(adv_l2))), axis=1)
    softmax_pred = sess.run(tf.nn.softmax(model.model.predict(adv_l2)))
    softmax_pred[0]
    np.argmax(softmax_pred, axis=1)
    keras.activations.softmax(model.model)
    # Check predictions on the quantized (uint8-rounded) adversarial images.
    model.model.predict(((adv_l2 + 0.5) * 255).round())
if __name__ == "__main__": with tf.Session() as sess: use_log = False print('Loading model...') # data, model = MNIST(), MNISTModel("models/mnist", sess, use_log) # data, model = MNIST(), MNISTModel("models/mnist-distilled-100", sess, use_log) # data, model = CIFAR(), CIFARModel("models/cifar", sess, use_log) data, model = ImageNet(), InceptionModel(sess, use_log) print('Done...') batch_size = 1 if isinstance(model, InceptionModel): batch_size = 10 attack = CarliniL2(sess, model, batch_size=batch_size, initial_const=1.0, max_iterations=1000, confidence=0, use_log=use_log) print('Generate data') inputs, targets = generate_data(data, samples=1, targeted=True, start=6, inception=isinstance( model, InceptionModel)) print('Done...') print(inputs.shape) inputs = inputs[0:batch_size] targets = targets[0:batch_size]
def main(args):
    """Run the CW (CarliniL2) or EADL1 attack over ImageNet test images and
    record per-image and aggregate distortion statistics.

    args: dict of command-line options — dataset/model selection, seed,
    image counts, attack hyper-parameters (init_const, binary_steps, maxiter,
    kappa, ...), batch size and output directory.

    Side effects: writes per-image verbose and aggregate TSV reports under
    args['save'], and dumps each original/adversarial image pair (plus target
    and label .npy files) under a per-run directory.
    """
    with tf.Session() as sess:
        # Seed all three RNG sources for reproducible target selection.
        random.seed(args["seed"])
        np.random.seed(args["seed"])
        tf.set_random_seed(args["seed"])
        print("seed = ", args["seed"])
        overall_timestart = time.time()
        # use_log: probability-space loss; otherwise raw z-values (logits).
        use_log = not args['use_zvalue']
        print("use_log = ", use_log)
        data_map = {}
        model_map = {}
        if args['dataset'] == "imagenet":
            # CW builds its own prediction graph inside the attack, EADL1
            # needs the model's prediction op — presumably why
            # create_prediction differs; confirm against ImageNetModel.
            if args['attack'] == "CW":
                model_map[args['model_name']] = ImageNetModel(
                    sess, use_log, args['model_name'], create_prediction=False)
            elif args['attack'] == "EADL1":
                model_map[args['model_name']] = ImageNetModel(
                    sess, use_log, args['model_name'], create_prediction=True)
            data_map['imagenet'] = ImageNet(
                model_map[args['model_name']].image_size,
                load_total_imgs=args['numimg_loaded'])
        print('Loading model', args['dataset'])
        # NOTE(review): raises KeyError for any dataset other than imagenet.
        data = data_map[args['dataset']]
        model = model_map[args['model_name']]
        if args['numimg'] == 0:
            # 0 means "use all remaining test images after firstimg".
            args['numimg'] = len(data.test_labels) - args['firstimg']
        print('Using', args['numimg'], 'test images')
        # load attack module
        print('args = ', args)
        targeted_flag = not args['untargeted']
        print("targeted_flag = ", targeted_flag)
        # load attack module
        if args['attack'] == "CW":
            # Constructed early only to obtain attack.predict for data
            # generation; re-initialized via init_attack below.
            attack = CarliniL2(sess, model, 100)
            attack_predictor = attack.predict
        elif args['attack'] == "EADL1":
            attack_predictor = model.model.predict
        # Re-seed so data generation is identical across attack choices.
        random.seed(args['seed'])
        np.random.seed(args['seed'])
        tf.set_random_seed(args['seed'])
        print('Generate data')
        model_name = args['model_name']
        # These architectures include a background class that must be
        # stripped from the 1001-way output — presumably; TODO confirm.
        if 'vgg' in model_name or 'densenet' in model_name or 'alexnet' in model_name:
            remove_background_class_flag = True
        else:
            remove_background_class_flag = False
        sys.stdout.flush()
        all_inputs, all_targets, all_labels, all_true_ids, img_info = generate_data(
            data, samples=args['numimg'], targeted=targeted_flag,
            random_and_least_likely=True, predictor=attack_predictor,
            start=args['firstimg'], imagenet=isinstance(data, ImageNet),
            remove_background_class=remove_background_class_flag,
            target_type=args['target_type'],
            total_num_valid_samples=args['num_valid_test_imgs'])
        print('len(all_inputs) = ', len(all_inputs))
        print("all_inputs shape:", all_inputs.shape)
        print("all_targets shape:", all_targets.shape)
        attack_batch_size = args['attack_batch_size']
        if attack_batch_size == 0:
            # 0 means "attack every selected image in a single batch".
            attack_batch_size = all_true_ids.size
        print("attack_batch_size = ", attack_batch_size)
        if args['attack'] == 'CW':
            attack.init_attack(sess, model, targeted=targeted_flag,
                               batch_size=attack_batch_size,
                               initial_const=args['init_const'],
                               binary_search_steps=args['binary_steps'],
                               max_iterations=args['maxiter'],
                               print_every=args['print_every'],
                               confidence=args['kappa'], use_log=use_log)
        elif args['attack'] == 'EADL1':
            print("EADL1 attack")
            attack = EADL1(sess, model, targeted=targeted_flag,
                           batch_size=attack_batch_size,
                           initial_const=args['init_const'],
                           binary_search_steps=args['binary_steps'],
                           max_iterations=args['maxiter'],
                           confidence=args['kappa'],
                           print_every=args['print_every'])
        else:
            print("Invalid attack name, exit 1")
            return
        saved_path = "{}/{}/{}/targeted_{}".format(
            args['save'], args['dataset'], args['attack'], targeted_flag)
        if not os.path.exists(saved_path):
            os.system("mkdir -p " + saved_path)
        # Per-run counters and per-image distortion/time accumulators.
        img_no = 0
        total_success = 0
        l0_list = []
        l1_list = []
        l2_list = []
        linf_list = []
        time_list = []
        verbose_f = open(
            args['save'] + "/" + "_".join([
                args['dataset'], args['attack'], str(targeted_flag),
                "verbose.txt"
            ]), "w")
        aggre_f = open(
            args['save'] + "/" + "_".join([
                args['dataset'], args['attack'], str(targeted_flag),
                "aggre.txt"
            ]), "w")
        # Targeted runs add a 'target' column to the verbose report.
        if targeted_flag == True:
            verbose_head_str = '\t'.join([
                'total', 'seq', 'id', 'time', 'success', 'prev_class',
                'target', 'new_class', 'l0_distortion', 'l1_distortion',
                'l2_distortion', 'linf_distortion'
            ])
        else:
            verbose_head_str = '\t'.join([
                'total', 'seq', 'id', 'time', 'success', 'prev_class',
                'new_class', 'l0_distortion', 'l1_distortion',
                'l2_distortion', 'linf_distortion'
            ])
        aggre_head_str = '\t'.join([
            'total_count', 'success_rate', 'l0_avg', 'l0_std', 'l1_avg',
            'l1_std', 'l2_avg', 'l2_std', 'linf_avg', 'linf_std', 'time_avg',
            'time_std'
        ])
        verbose_f.write(verbose_head_str + '\n')
        aggre_f.write(aggre_head_str + '\n')
        print("all_true_ids.size = ", all_true_ids.size)
        sys.stdout.flush()
        # Re-seed once more so the attack itself is reproducible.
        random.seed(args['seed'])
        np.random.seed(args['seed'])
        tf.set_random_seed(args['seed'])
        # Attack images in batches of attack_batch_size.
        for i in range(0, all_true_ids.size, attack_batch_size):
            # Final batch may be smaller than attack_batch_size.
            if i + attack_batch_size > all_true_ids.size:
                actual_attack_batch_size = all_true_ids.size - i
            else:
                actual_attack_batch_size = attack_batch_size
            inputs = all_inputs[i:i + actual_attack_batch_size]
            targets = all_targets[i:i + actual_attack_batch_size]
            labels = all_labels[i:i + actual_attack_batch_size]
            timestart = time.time()
            """perform the attack"""
            print("perform the attack")
            adv = attack.attack(inputs, targets)
            timeend = time.time()
            time_used = timeend - timestart
            # NOTE(review): divides by the nominal batch size even for the
            # short final batch.
            time_used_per_image = time_used / attack_batch_size
            # Evaluate each adversarial example in the batch.
            for j in range(len(adv)):
                print("=" * 10, "i = ", i, "=" * 10, "j=", j, "=" * 10)
                # original_predict = np.squeeze(attack.predict(np.array([inputs[j]])))
                original_predict = np.squeeze(
                    attack_predictor(np.array([inputs[j]])))
                original_prob = np.sort(original_predict)
                original_class = np.argsort(original_predict)
                print("Original Classification:", original_prob[-1:-6:-1])
                print("Original Probabilities/Logits:",
                      original_class[-1:-6:-1])
                sys.stdout.flush()
                true_label = np.argmax(labels[j])
                target_label = np.argmax(targets[j])
                attack_label = None
                success = False
                img_no += 1
                print("Target:", target_label)
                # if the array contains NaN, the solver did not return a solution
                if (np.any(np.isnan(adv[j]))):
                    print('Attack failed. (solver returned NaN)')
                    l0_distortion = l1_distortion = l2_distortion = linf_distortion = np.nan
                    adversarial_class = np.zeros(original_class.shape)
                else:
                    # Distortion of the adversarial example in each norm.
                    l0_distortion = l0_loss(adv[j], inputs[j])
                    l1_distortion = l1_loss(adv[j], inputs[j])
                    l2_distortion = l2_loss(adv[j], inputs[j])
                    linf_distortion = linf_loss(adv[j], inputs[j])
                    #adversarial_predict = np.squeeze(model.model.predict(np.array([adv[j]])))
                    # adversarial_predict = np.squeeze(attack.predict(np.array([adv[j]])))
                    adversarial_predict = np.squeeze(
                        attack_predictor(np.array([adv[j]])))
                    adversarial_prob = np.sort(adversarial_predict)
                    adversarial_class = np.argsort(adversarial_predict)
                    attack_label = np.argmax(adversarial_predict)
                    print("adversarial probabilities:",
                          adversarial_prob[-1:-11:-1])
                    print("adversarial classification:",
                          adversarial_class[-1:-11:-1])
                    sys.stdout.flush()
                    success = False
                    if targeted_flag:
                        success = np.argsort(
                            adversarial_predict)[-1] == target_label
                        # Treat near-ties with the top score as a success if
                        # the target is among them.
                        candidates = set([
                            i for i in range(len(adversarial_predict) - 1)
                            if abs(adversarial_predict[i] -
                                   adversarial_prob[-1]) < 0.001
                        ])
                        if len(candidates) > 1 and target_label in candidates:
                            success = True
                    else:
                        # NOTE(review): untargeted success is measured against
                        # target_label, not the original class.
                        success = np.argsort(
                            adversarial_predict)[-1] != target_label
                    if success:
                        print("Attack succeeded.")
                    else:
                        print("Attack failed.")
                if success:
                    total_success += 1
                # Distortions are recorded for every image, success or not.
                l0_list.append(l0_distortion)
                l1_list.append(l1_distortion)
                l2_list.append(l2_distortion)
                linf_list.append(linf_distortion)
                time_list.append(time_used_per_image)
                suffix = "id={0}_seq={1}_prev={2}_adv={3}_res={4}".format(
                    all_true_ids[i + j], i, original_class[-1],
                    adversarial_class[-1], success)
                print("Saving to", suffix)
                sys.stdout.flush()
                dump(
                    inputs[j], "{}/imgno={}_content={}_{}".format(
                        saved_path, img_no, 'original', suffix))
                dump(
                    adv[j], "{}/imgno={}_content={}_{}".format(
                        saved_path, img_no, 'adversarial', suffix))
                # dump(adv[j] - inputs[j], "{}/imgno={}_content={}_{}".format(saved_path, img_no, 'noise', suffix))
                np.save(
                    "{}/imgno={}_content={}_{}".format(
                        saved_path, img_no, 'targets', suffix) + ".npy",
                    targets[j])
                np.save(
                    "{}/imgno={}_content={}_{}".format(
                        saved_path, img_no, 'labels', suffix) + ".npy",
                    labels[j])
                L1_debug_str = "[STATS][L1] total = {}, seq = {}, id = {}, time = {:.3f}, success = {}, " \
                    "prev_class = {}, new_class = {}, distortion = {:.5f}, success_rate = {:.3f}, " \
                    "l2_avg = {:.5f}".format(img_no, i+j, all_true_ids[i+j],
                                             time_used_per_image, success,
                                             original_class[-1],
                                             adversarial_class[-1],
                                             l2_distortion,
                                             total_success / float(img_no),
                                             0 if total_success == 0 else np.mean(l2_list))
                print(L1_debug_str)
                sys.stdout.flush()
                if targeted_flag == True:
                    verbose_str = '\t'.join([
                        str(img_no),
                        str(i + j),
                        str(all_true_ids[i + j]),
                        str(time_used_per_image),
                        str(success),
                        str(original_class[-1]),
                        str(np.argmax(targets[j])),
                        str(adversarial_class[-1]),
                        str(l0_distortion),
                        str(l1_distortion),
                        str(l2_distortion),
                        str(linf_distortion)
                    ])
                else:
                    verbose_str = '\t'.join([
                        str(img_no),
                        str(i + j),
                        str(all_true_ids[i + j]),
                        str(time_used_per_image),
                        str(success),
                        str(original_class[-1]),
                        str(adversarial_class[-1]),
                        str(l0_distortion),
                        str(l1_distortion),
                        str(l2_distortion),
                        str(linf_distortion)
                    ])
                verbose_f.write(verbose_str + "\n")
                # Flush after every image so a crash loses no results.
                verbose_f.flush()
                print(verbose_head_str)
                print(verbose_str)
                sys.stdout.flush()
            overall_timeend_sofar = time.time()
            overall_time_used_sofar = overall_timeend_sofar - overall_timestart
            print("overall_time_used_sofar = ", overall_time_used_sofar)
            sys.stdout.flush()
        verbose_f.close()
        if img_no == 0:
            success_rate = 0.0
        else:
            success_rate = total_success / float(img_no)
        # With zero successes, write a row of zeros instead of NaN stats.
        if total_success == 0:
            aggre_str = "\t".join([
                str(img_no),
                str(success_rate),
                str(0.0),
                str(0.0),
                str(0.0),
                str(0.0),
                str(0.0),
                str(0.0),
                str(0.0),
                str(0.0),
                str(0.0),
                str(0.0)
            ])
        else:
            aggre_str = "\t".join([
                str(img_no),
                str(success_rate),
                str(np.mean(l0_list)),
                str(np.std(l0_list)),
                str(np.mean(l1_list)),
                str(np.std(l1_list)),
                str(np.mean(l2_list)),
                str(np.std(l2_list)),
                str(np.mean(linf_list)),
                str(np.std(linf_list)),
                str(np.mean(time_list)),
                str(np.std(time_list))
            ])
        aggre_f.write(aggre_str + "\n")
        print(aggre_head_str)
        print(aggre_str)
        sys.stdout.flush()
        aggre_f.close()
        overall_timeend = time.time()
        overall_time_used = overall_timeend - overall_timestart
        print("overall_time_used = ", overall_time_used)
        sys.stdout.flush()
        print("ALL DONE!!!")
        return
def main(args):
    """Select a dataset/model pair and one of several attack solvers
    (Carlini-Wagner, ADMM, LADMM, EAD, FGM/IFGM variants), run the attack on
    generated targeted data, and hand the results to l1_l2_li_computation.

    args: dict of command-line options (dataset, attack name, hyper-parameters,
    batch size, seeds, train/save flags).  Optionally saves adversarial
    training data as .npy files when args['train'] is set.
    """
    # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.7)
    with tf.Session() as sess:
        # Dataset selection; handpick/inception control data generation below.
        if args['dataset'] == 'mnist':
            data, model = MNIST(), MadryMNISTModel("models/secret/", sess)
            handpick = False
            inception = False
        if args['dataset'] == "cifar":
            data, model = CIFAR(), CIFARModel("models/cifar", sess)
            #data, model = CIFAR(), MadryCIFARModel("models/model_0/", sess)
            handpick = True
            inception = False
        if args['dataset'] == "imagenet":
            data, model = ImageNet(args['seed_imagenet']), InceptionModel(
                sess, False)
            handpick = True
            inception = True
        # Optional model overrides: adversarially-trained or distilled models.
        if args['adversarial'] != "none":
            model = MNISTModel(
                "models/mnist_cwl2_admm" + str(args['adversarial']), sess)
        if args['temp'] and args['dataset'] == 'mnist':
            model = MNISTModel("models/mnist-distilled-" + str(args['temp']),
                               sess)
        if args['temp'] and args['dataset'] == 'cifar':
            model = MadryCIFARModel(
                "models/cifar-distilled-" + str(args['temp']), sess)
        inputs, targets, labels, true_ids = generate_data(
            data,
            model,
            samples=args['numimg'],
            targeted=True,
            target_num=args['target_number'],
            start=0,
            inception=inception,
            handpick=handpick,
            seed=args['seed'])
        #print(true_ids)
        # Attack selection: independent ifs, so exactly one should match;
        # an unknown name leaves `attack` undefined (NameError below).
        if args['attack'] == 'L2C':
            attack = CarliniL2(sess,
                               model,
                               batch_size=args['batch_size'],
                               max_iterations=args['maxiter'],
                               confidence=args['conf'],
                               binary_search_steps=args['binary_steps'],
                               abort_early=args['abort_early'])
        if args['attack'] == 'LiCW':
            attack = CarliniLi(sess,
                               model,
                               max_iterations=args['maxiter'],
                               abort_early=args['abort_early'])
        if args['attack'] == 'L2A':
            attack = ADMML2(sess,
                            model,
                            batch_size=args['batch_size'],
                            max_iterations=args['maxiter'],
                            confidence=args['conf'],
                            binary_search_steps=args['iteration_steps'],
                            ro=args['ro'],
                            abort_early=args['abort_early'])
        if args['attack'] == 'L2AE':
            attack = ADMML2en(sess,
                              model,
                              batch_size=args['batch_size'],
                              max_iterations=args['maxiter'],
                              confidence=args['conf'],
                              binary_search_steps=args['binary_steps'],
                              ro=args['ro'],
                              iteration_steps=args['iteration_steps'],
                              abort_early=args['abort_early'])
        if args['attack'] == 'L2LA':
            attack = LADMML2(sess,
                             model,
                             batch_size=args['batch_size'],
                             max_iterations=args['maxiter'],
                             confidence=args['conf'],
                             binary_search_steps=args['iteration_steps'],
                             ro=args['ro'],
                             abort_early=args['abort_early'])
        if args['attack'] == 'L2LAST':
            attack = LADMMSTL2(sess,
                               model,
                               batch_size=args['batch_size'],
                               max_iterations=args['maxiter'],
                               confidence=args['conf'],
                               binary_search_steps=args['iteration_steps'],
                               ro=args['ro'],
                               abort_early=args['abort_early'],
                               retrain=args['retrain'])
        if args['attack'] == 'LiIF':
            attack = IFGM(sess,
                          model,
                          batch_size=args['batch_size'],
                          ord=np.inf,
                          inception=inception)
        if args['attack'] == 'LiF':
            attack = FGM(sess,
                         model,
                         batch_size=args['batch_size'],
                         ord=np.inf,
                         inception=inception)
        if args['attack'] == 'L1':
            attack = EADL1(sess,
                           model,
                           batch_size=args['batch_size'],
                           max_iterations=args['maxiter'],
                           confidence=args['conf'],
                           binary_search_steps=args['binary_steps'],
                           beta=args['beta'],
                           abort_early=args['abort_early'])
        if args['attack'] == 'L1EN':
            attack = EADEN(sess,
                           model,
                           batch_size=args['batch_size'],
                           max_iterations=args['maxiter'],
                           confidence=args['conf'],
                           binary_search_steps=args['binary_steps'],
                           beta=args['beta'],
                           abort_early=args['abort_early'])
        if args['attack'] == 'L1IFGM':
            attack = IFGM(sess,
                          model,
                          batch_size=args['batch_size'],
                          ord=1,
                          inception=inception)
        if args['attack'] == 'L2IFGM':
            attack = IFGM(sess,
                          model,
                          batch_size=args['batch_size'],
                          ord=2,
                          inception=inception)
        if args['attack'] == 'L1FGM':
            attack = FGM(sess,
                         model,
                         batch_size=args['batch_size'],
                         ord=1,
                         inception=inception)
        if args['attack'] == 'L2FGM':
            attack = FGM(sess,
                         model,
                         batch_size=args['batch_size'],
                         ord=2,
                         inception=inception)
        timestart = time.time()
        adv = attack.attack(inputs, targets)
        timeend = time.time()
        print("Took", timeend - timestart, "seconds to run", len(inputs),
              "samples.\n")
        # Optionally save adversarial examples for adversarial training.
        if args['train']:
            np.save('labels_train.npy', labels)
            np.save(str(args['attack']) + '_train.npy', adv)
        #if (args['conf'] != 0):
        #    model = MNISTModel("models/mnist-distilled-100", sess)
        l1_l2_li_computation(args, data, model, adv, inception, inputs,
                             targets, labels, true_ids)
with tf.Session() as sess: #data, model = MNIST(), Classifier(sess) data = CIFAR10() # target model if sys.argv[1] == 'our': model = Classifier(input_shape=data.IMG_SHAPE, session=sess) model.restore('../Clf/models/cifar_classifier') elif sys.argv[1] == 'orgONLY': model = CIFARModel('models/cifar', sess) elif sys.argv[1] == 'orgDIS': model = CIFARModel('models/cifar-distilled-100', sess) else: print('Wrong Parameters') sys.exit() # init attack attack = CarliniL2(sess, model, targeted=False, max_iterations=1000, confidence=10, boxmin=0, boxmax=1) #inputs, targets = generate_data(data, samples=128, targeted=False, start=0, inception=False) inputs = data.X_test[:128] targets = data.y_test[:128] timestart = time.time() adv = attack.attack(inputs, targets) timeend = time.time() print("Took",timeend-timestart,"seconds to run",len(inputs),"samples.") np.save(('results/%s.npy' % sys.argv[2]), adv)
def main(args):
    """Run either the CarliniL2 white-box attack or the score-based ZO-ADMM
    black-box attack (LADMMBB) on the selected dataset, then compute the
    distortion statistics of the results.

    args: dict of command-line options (dataset, attack, targeted flag,
    hyper-parameters, seeds, train flag).
    """
    # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.7)
    with tf.Session() as sess:
        # Dataset/model selection; handpick/inception steer data generation.
        if args['dataset'] == 'mnist':
            data, model = MNIST(), MNISTModel("models/mnist", sess)
            handpick = False
            inception = False
        if args['dataset'] == "cifar":
            data, model = CIFAR(), CIFARModel("models/cifar", sess)
            handpick = True
            inception = False
        if args['dataset'] == "imagenet":
            data, model = ImageNet(args['seed_imagenet']), InceptionModel(sess)
            handpick = True
            inception = True
        # Optional overrides: adversarially-trained or distilled models.
        if args['adversarial'] != "none":
            model = MNISTModel(
                "models/mnist_cwl2_admm" + str(args['adversarial']), sess)
        if args['temp'] and args['dataset'] == 'mnist':
            model = MNISTModel("models/mnist-distilled-" + str(args['temp']),
                               sess)
        if args['temp'] and args['dataset'] == 'cifar':
            model = CIFARModel("models/cifar-distilled-" + str(args['temp']),
                               sess)
        inputs, targets, labels, true_ids = generate_data(
            data,
            model,
            samples=args['numimg'],
            targeted=args['targeted'],
            start=0,
            inception=inception,
            handpick=handpick,
            seed=args['seed'])
        #print(true_ids)
        if args['attack'] == 'L2C':
            attack = CarliniL2(sess,
                               model,
                               batch_size=args['batch_size'],
                               max_iterations=args['maxiter'],
                               confidence=args['conf'],
                               targeted=args['targeted'],
                               binary_search_steps=args['binary_steps'],
                               abort_early=args['abort_early'])
        if args['attack'] == 'L2BB':
            # score-based ZO-ADMM attack
            attack = LADMMBB(sess,
                             model,
                             batch_size=args['batch_size'],
                             max_iterations=args['maxiter'],
                             targeted=args['targeted'],
                             confidence=args['conf'],
                             binary_search_steps=args['iteration_steps'],
                             ro=args['ro'],
                             abort_early=args['abort_early'],
                             gama=args['gama'],
                             epi=args['epi'],
                             alpha=args['alpha'])
        timestart = time.time()
        # adv = attack.attack(inputs, targets)
        # Attack also reports per-image query counts and query-L2 curves —
        # presumably both attack classes return this triple; confirm for L2C.
        adv, querycount, queryl2 = attack.attack(inputs, targets)
        timeend = time.time()
        print("Took", timeend - timestart, "seconds to run", len(inputs),
              "samples.\n")
        # Optionally save adversarial examples for adversarial training.
        if args['train']:
            np.save('labels_train.npy', labels)
            np.save(str(args['attack']) + '_train.npy', adv)
        # NOTE(review): nonzero confidence swaps in the distilled MNIST model
        # for evaluation regardless of the dataset under attack — verify.
        if (args['conf'] != 0):
            model = MNISTModel("models/mnist-distilled-100", sess)
        if args['targeted']:
            l1_l2_li_computation(args, data, model, adv, inception, inputs,
                                 targets, labels, true_ids, querycount,
                                 queryl2)
        else:
            l2_computation(args, data, model, adv, inception, inputs, targets,
                           labels, true_ids, querycount, queryl2)
def main(args):
    """Run the white-box CarliniL2 attack or the black-box coordinate-descent
    attack (BlackBoxL2, ZOO-style) image-by-image, saving adversarial images
    and appending a stats line per image to <save>/report.txt.

    args: dict of command-line options (dataset, attack mode, solver
    hyper-parameters, seeds, image range, save directory).
    """
    config = tf.ConfigProto()
    # Grow GPU memory on demand instead of grabbing it all up front.
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # use_log: probability-space loss; otherwise raw z-values (logits).
        use_log = not args['use_zvalue']
        is_inception = args['dataset'] == "imagenet"
        # load network
        print('Loading model', args['dataset'])
        if args['dataset'] == "mnist":
            data, model = MNIST(), MNISTModel("models/mnist", sess, use_log)
            # data, model = MNIST(), MNISTModel("models/mnist-distilled-100", sess, use_log)
        elif args['dataset'] == "cifar10":
            data, model = CIFAR(), CIFARModel("models/cifar", sess, use_log)
            # data, model = CIFAR(), CIFARModel("models/cifar-distilled-100", sess, use_log)
        elif args['dataset'] == "imagenet":
            data, model = ImageNet(), InceptionModel(sess, use_log)
        print('Done...')
        if args['numimg'] == 0:
            # 0 means "use all remaining test images after firstimg".
            args['numimg'] = len(data.test_labels) - args['firstimg']
        print('Using', args['numimg'], 'test images')
        # load attack module
        if args['attack'] == "white":
            # batch size 1, optimize on 1 image at a time, rather than optimizing images jointly
            attack = CarliniL2(sess,
                               model,
                               batch_size=1,
                               max_iterations=args['maxiter'],
                               print_every=args['print_every'],
                               early_stop_iters=args['early_stop_iters'],
                               confidence=0,
                               learning_rate=args['lr'],
                               initial_const=args['init_const'],
                               binary_search_steps=args['binary_steps'],
                               targeted=not args['untargeted'],
                               use_log=use_log,
                               adam_beta1=args['adam_beta1'],
                               adam_beta2=args['adam_beta2'])
        else:
            # batch size 128, optimize on 128 coordinates of a single image
            attack = BlackBoxL2(sess,
                                model,
                                batch_size=128,
                                max_iterations=args['maxiter'],
                                print_every=args['print_every'],
                                early_stop_iters=args['early_stop_iters'],
                                confidence=0,
                                learning_rate=args['lr'],
                                initial_const=args['init_const'],
                                binary_search_steps=args['binary_steps'],
                                targeted=not args['untargeted'],
                                use_log=use_log,
                                use_tanh=args['use_tanh'],
                                use_resize=args['use_resize'],
                                adam_beta1=args['adam_beta1'],
                                adam_beta2=args['adam_beta2'],
                                reset_adam_after_found=args['reset_adam'],
                                solver=args['solver'],
                                save_ckpts=args['save_ckpts'],
                                load_checkpoint=args['load_ckpt'],
                                start_iter=args['start_iter'],
                                init_size=args['init_size'],
                                use_importance=not args['uniform'])
        random.seed(args['seed'])
        np.random.seed(args['seed'])
        print('Generate data')
        all_inputs, all_targets, all_labels, all_true_ids = generate_data(
            data,
            samples=args['numimg'],
            targeted=not args['untargeted'],
            start=args['firstimg'],
            inception=is_inception)
        print('Done...')
        os.system("mkdir -p {}/{}".format(args['save'], args['dataset']))
        img_no = 0
        total_success = 0
        l2_total = 0.0
        # Attack one image at a time.
        for i in range(all_true_ids.size):
            inputs = all_inputs[i:i + 1]
            targets = all_targets[i:i + 1]
            labels = all_labels[i:i + 1]
            print("true labels:", np.argmax(labels), labels)
            print("target:", np.argmax(targets), targets)
            # test if the image is correctly classified
            original_predict = model.model.predict(inputs)
            original_predict = np.squeeze(original_predict)
            original_prob = np.sort(original_predict)
            original_class = np.argsort(original_predict)
            print("original probabilities:", original_prob[-1:-6:-1])
            print("original classification:", original_class[-1:-6:-1])
            print("original probabilities (most unlikely):", original_prob[:6])
            print("original classification (most unlikely):",
                  original_class[:6])
            if original_class[-1] != np.argmax(labels):
                # Only attack images the model already classifies correctly.
                print(
                    "skip wrongly classified image no. {}, original class {}, classified as {}"
                    .format(i, np.argmax(labels), original_class[-1]))
                continue
            img_no += 1
            timestart = time.time()
            adv, const = attack.attack_batch(inputs, targets)
            if type(const) is list:
                const = const[0]
            # Re-add the batch dimension if the attack returned a single image.
            if len(adv.shape) == 3:
                adv = adv.reshape((1, ) + adv.shape)
            timeend = time.time()
            l2_distortion = np.sum((adv - inputs)**2)**.5
            adversarial_predict = model.model.predict(adv)
            adversarial_predict = np.squeeze(adversarial_predict)
            adversarial_prob = np.sort(adversarial_predict)
            adversarial_class = np.argsort(adversarial_predict)
            print("adversarial probabilities:", adversarial_prob[-1:-6:-1])
            print("adversarial classification:", adversarial_class[-1:-6:-1])
            success = False
            if args['untargeted']:
                if adversarial_class[-1] != original_class[-1]:
                    success = True
            else:
                if adversarial_class[-1] == np.argmax(targets):
                    success = True
            # Reject excessively distorted results even if misclassified.
            if l2_distortion > 20.0:
                success = False
            if success:
                total_success += 1
                l2_total += l2_distortion
            suffix = "id{}_seq{}_prev{}_adv{}_{}_dist{}".format(
                all_true_ids[i], i, original_class[-1], adversarial_class[-1],
                success, l2_distortion)
            print("Saving to", suffix)
            # Save original, adversarial, and difference images.
            show(
                inputs, "{}/{}/{}_original_{}.png".format(
                    args['save'], args['dataset'], img_no, suffix))
            show(
                adv, "{}/{}/{}_adversarial_{}.png".format(
                    args['save'], args['dataset'], img_no, suffix))
            show(
                adv - inputs, "{}/{}/{}_diff_{}.png".format(
                    args['save'], args['dataset'], img_no, suffix))
            print(
                "[STATS][L1] total = {}, seq = {}, id = {}, time = {:.3f}, success = {}, const = {:.6f}, prev_class = {}, new_class = {}, distortion = {:.5f}, success_rate = {:.3f}, l2_avg = {:.5f}"
                .format(img_no, i, all_true_ids[i], timeend - timestart,
                        success, const, original_class[-1],
                        adversarial_class[-1], l2_distortion,
                        total_success / float(img_no), 0 if total_success == 0
                        else l2_total / total_success))
            # Append the same stats line to the cumulative report file.
            with open(args['save'] + "/report.txt", 'a') as f:
                f.write("*" * 20)
                to_write = "[STATS][L1] total = {}, seq = {}, id = {}, time = {:.3f}, success = {}, const = {:.6f}, prev_class = {}, new_class = {}, distortion = {:.5f}, success_rate = {:.3f}, l2_avg = {:.5f}".format(
                    img_no, i, all_true_ids[i], timeend - timestart, success,
                    const, original_class[-1], adversarial_class[-1],
                    l2_distortion, total_success / float(img_no),
                    0 if total_success == 0 else l2_total / total_success)
                f.write(to_write)
                f.write("*" * 20)
                f.write("\n\n")
            sys.stdout.flush()
def model_setup_carlini(rd, model_dict, X_train, y_train, X_test, y_test,
                        X_val, y_val, mean, ax=None, layer=None):
    """
    Main function to set up network (create, load, test, save).

    Builds (or loads) a Lasagne/Theano network described by model_dict; when a
    trained model exists, mirrors it as a small Keras MLP and runs the
    CarliniL2 attack on the (mean-restored) MNIST test set, logging per-image
    L2 distances and clean/adversarial accuracy, and saving sample images.
    Otherwise trains and saves the Lasagne network.

    rd:         reduced dimension for dimensionality reduction (None = off)
    model_dict: model/configuration options ('rev', 'dim_red', ...)
    X_*/y_*:    train/test/validation data (X_test is mean-subtracted)
    mean:       per-pixel training mean that was subtracted from the data
    ax, layer:  optional plotting axis / layer-wise training hook
    """
    rev = model_dict['rev']
    dim_red = model_dict['dim_red']
    if rd != None:
        # Doing dimensionality reduction on dataset
        print("Doing {} with rd={} over the training data".format(dim_red, rd))
        _, _, _, dr_alg = dr_wrapper(X_train, X_test, dim_red, rd, y_train,
                                     rev, X_val)
    else:
        dr_alg = None

    # Getting data parameters after dimensionality reduction
    data_dict = get_data_shape(X_train, X_test, X_val)
    no_of_dim = data_dict['no_of_dim']

    # Prepare Theano variables for inputs and targets
    if no_of_dim == 2:
        input_var = T.tensor('inputs')
    elif no_of_dim == 3:
        input_var = T.tensor3('inputs')
    elif no_of_dim == 4:
        input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')

    # Check if model already exists
    if layer is not None:
        network, model_exist_flag, layers = model_creator(
            model_dict, data_dict, input_var, target_var, rd, layer)
    else:
        network, model_exist_flag = model_creator(model_dict, data_dict,
                                                  input_var, target_var, rd,
                                                  layer)

    # Defining symbolic variable for network output
    prediction = lasagne.layers.get_output(network)
    # Defining symbolic variable for network parameters
    params = lasagne.layers.get_all_params(network, trainable=True)
    # Defining symbolic variable for network output with dropout disabled
    test_prediction = lasagne.layers.get_output(network, deterministic=True)

    # Building or loading model depending on existence
    if model_exist_flag == 1:
        # Load the correct model:
        param_values = model_loader(model_dict, rd)
        #lasagne.layers.set_all_param_values(network, param_values)

        # Create Keras model mirroring the Lasagne MLP (784 -> 100 -> 100 -> 10,
        # with an optional linear rd-dim projection layer in front).
        from keras.models import Sequential
        from keras.layers import Dense, Dropout, Activation, Flatten
        from keras.layers import Convolution2D, MaxPooling2D
        model = Sequential()
        if rd is not None:
            # Bias-free linear layer implements the DR projection.
            model.add(Dense(rd, activation=None, input_shape=(784,),
                            use_bias=False))
            model.add(Dense(100, activation='sigmoid'))
        else:
            model.add(Dense(100, activation='sigmoid', input_shape=(784,)))
        model.add(Dense(100, activation='sigmoid'))
        model.add(Dense(10, activation=None))
        if rd is not None:
            # Prepend the DR projection matrix to the loaded parameters.
            A = gradient_transform(model_dict, dr_alg)
            param_values = [A.T] + param_values
        # model.set_weights(param_values)
        # m_path = './keras/' + get_model_name(model_dict, rd)
        # model.save(m_path)
        # model.load_weights(m_path)

        # One-hot encode the test labels for the attack's target vectors.
        y_onehot = np.zeros((len(y_test), 10))
        y_onehot[np.arange(len(y_test)), y_test] = 1
        # X_test was mean-subtracted before, now we add the mean back
        X_test_mean = (X_test + mean - 0.5).reshape(-1, 784)
        data = (X_test_mean, y_onehot)
        mean_flat = mean.reshape(-1, 784)

        # l2-Carlini Attack
        import tensorflow as tf
        import time
        from l2_attack import CarliniL2
        with tf.Session() as sess:
            attack = CarliniL2(sess, model, mean_flat, batch_size=10,
                               max_iterations=1000, confidence=0,
                               targeted=False)
            inputs, targets = generate_data(data, samples=10000,
                                            targeted=False, start=0,
                                            inception=False)
            timestart = time.time()
            adv = attack.attack(inputs, targets, param_values)
            timeend = time.time()
            print("Took", timeend - timestart, "seconds to run", len(inputs),
                  "samples.")
            # Resolve absolute path to output directory
            abs_path_o = resolve_path_o(model_dict)
            fname = 'carlini_l2'
            fname += '_' + get_model_name(model_dict)
            if rd is not None:
                fname += '_' + model_dict['dim_red'] + str(rd)
            plotfile = open(abs_path_o + fname + '.txt', 'a')
            plotfile.write('\\\small{' + str(rd) + '}\n')
            dists = []
            # Predictions on the clean (mean-subtracted again) test images.
            pred = model.predict(inputs + 0.5 - mean_flat)
            for i in range(len(adv)):
                # L2 distance in the original (mean-restored) pixel space.
                dist = np.linalg.norm((adv[i] + mean_flat) - (inputs[i] + 0.5))
                # Only record distances for correctly classified images.
                if np.argmax(pred[i]) == y_test[i]:
                    dists.append(dist)
                if i < 50:
                    # Save original test and adversarial images
                    x_adv = (adv[i] + mean_flat).reshape((28, 28))
                    orig = (inputs[i] + 0.5).reshape((28, 28))
                    img.imsave('./carlini_images/{}_adv.png'.format(i),
                               x_adv * 255, vmin=0, vmax=255, cmap='gray')
                    img.imsave('./carlini_images/{}_orig.png'.format(i),
                               orig * 255, vmin=0, vmax=255, cmap='gray')
            # Test overall accuracy of the model
            pred = model.predict(inputs + 0.5 - mean_flat)
            correct = 0
            for i in range(pred.shape[0]):
                if np.argmax(pred[i]) == y_test[i]:
                    correct += 1
            print('Overall accuracy on test images: ',
                  correct / float(pred.shape[0]))
            pred = model.predict(adv)
            correct = 0
            for i in range(pred.shape[0]):
                if np.argmax(pred[i]) == y_test[i]:
                    correct += 1
            print('Overall accuracy on adversarial images: ',
                  correct / float(pred.shape[0]))
            # Write the sorted distance curve for plotting.
            dists_sorted = sorted(dists)
            for i in range(len(dists)):
                plotfile.write('{} {} \n'.format(i, dists_sorted[i]))
            # Plot histogram
            # import matplotlib.pyplot as plt
            # dists = np.array(dists)
            # ax.hist(dists, 50, normed=1, histtype='step', cumulative=True,label=str(rd))
    elif model_exist_flag == 0:
        # Launch the training loop.
        print("Starting training...")
        if layer is not None:
            model_trainer(input_var, target_var, prediction, test_prediction,
                          params, model_dict, X_train, y_train, X_val, y_val,
                          network, layers)
        else:
            model_trainer(input_var, target_var, prediction, test_prediction,
                          params, model_dict, X_train, y_train, X_val, y_val,
                          network)
        model_saver(network, model_dict, rd)
inputs = np.array(inputs) targets = np.array(targets) return inputs, targets if __name__ == "__main__": count = 0 distortion = [] testCount = 20 for i in range(testCount): with tf.Session() as sess: data, model = RNN(), RNNModel("models\imdb_model.h5", sess) #MNIST(), MNISTModel("models/mnist", sess) attack = CarliniL2(sess, model, batch_size=1, max_iterations=1000, confidence=0, targeted=False) inputs, targets = generate_data(data, samples=1, targeted=False, start=0, inception=False) timestart = time.time() adv = attack.attack(inputs, targets) timeend = time.time() for i in range(len(adv)): # print("Valid:") # input = inputs[i] # input = np.reshape(input, (input.shape[0], -1)) # print(inputs) # print("Adversarial:") # attack_input = adv[i] # attack_input = np.reshape(attack_input, (attack_input.shape[0], -1))
def main(args):
    """Run the selected attack (C&W-L2 / EAD-L1 / EAD-EN / (I)FGM family) on the
    chosen dataset and report best-, average- and worst-case attack success
    rates plus L1/L2/Linf distortions, optionally against a second (transfer)
    target model.

    args: dict of command-line options (dataset, attack, batch_size, maxiter,
    conf, init_const, binary_steps, untargeted, targetnum, beta, abort_early,
    eps, restore_np, save_np, train, seed, seed_imagenet, numimg, temp,
    adversarial, targetmodel, show, save).
    """
    with tf.Session() as sess:
        # ---- dataset / model selection ---------------------------------
        if (args['dataset'] == 'mnist'):
            data = MNIST()
            inception = False
            if (args['adversarial'] != "none"):
                # adversarially (re)trained model
                model = MNISTModel(
                    "models/mnist_cw" + str(args['adversarial']), sess)
            elif (args['temp']):
                # defensively distilled model at the given temperature
                model = MNISTModel(
                    "models/mnist-distilled-" + str(args['temp']), sess)
            else:
                model = MNISTModel("models/mnist", sess)
        if (args['dataset'] == "cifar"):
            data = CIFAR()
            inception = False
            if (args['adversarial'] != "none"):
                model = CIFARModel(
                    "models/cifar_cw" + str(args['adversarial']), sess)
            elif (args['temp']):
                model = CIFARModel(
                    "models/cifar-distilled-" + str(args['temp']), sess)
            else:
                model = CIFARModel("models/cifar", sess)
        if (args['dataset'] == "imagenet"):
            # 2*numimg candidates so generate_data can skip misclassified ones
            # -- presumably; confirm against ImageNet's constructor.
            data, model = ImageNet(args['seed_imagenet'],
                                   2 * args['numimg']), InceptionModel(sess)
            inception = True

        inputs, targets, labels, true_ids = generate_data(
            data,
            model,
            samples=args['numimg'],
            targeted=not args['untargeted'],
            target_num=args['targetnum'],
            inception=inception,
            train=args['train'],
            seed=args['seed'])

        timestart = time.time()
        if (args['restore_np']):
            # Reload previously saved adversarial examples instead of attacking.
            if (args['train']):
                adv = np.load(
                    str(args['dataset']) + '_' + str(args['attack']) +
                    '_train.npy')
            else:
                adv = np.load(
                    str(args['dataset']) + '_' + str(args['attack']) + '.npy')
        else:
            if (args['attack'] == 'L2'):
                attack = CarliniL2(sess,
                                   model,
                                   batch_size=args['batch_size'],
                                   max_iterations=args['maxiter'],
                                   confidence=args['conf'],
                                   initial_const=args['init_const'],
                                   binary_search_steps=args['binary_steps'],
                                   targeted=not args['untargeted'],
                                   beta=args['beta'],
                                   abort_early=args['abort_early'])
                adv = attack.attack(inputs, targets)
            if (args['attack'] == 'L1'):
                attack = EADL1(sess,
                               model,
                               batch_size=args['batch_size'],
                               max_iterations=args['maxiter'],
                               confidence=args['conf'],
                               initial_const=args['init_const'],
                               binary_search_steps=args['binary_steps'],
                               targeted=not args['untargeted'],
                               beta=args['beta'],
                               abort_early=args['abort_early'])
                adv = attack.attack(inputs, targets)
            if (args['attack'] == 'EN'):
                attack = EADEN(sess,
                               model,
                               batch_size=args['batch_size'],
                               max_iterations=args['maxiter'],
                               confidence=args['conf'],
                               initial_const=args['init_const'],
                               binary_search_steps=args['binary_steps'],
                               targeted=not args['untargeted'],
                               beta=args['beta'],
                               abort_early=args['abort_early'])
                adv = attack.attack(inputs, targets)
            """If untargeted, pass labels instead of targets"""
            if (args['attack'] == 'FGSM'):
                attack = FGM(sess,
                             model,
                             batch_size=args['batch_size'],
                             ord=np.inf,
                             eps=args['eps'],
                             inception=inception)
                adv = attack.attack(inputs, targets)
            if (args['attack'] == 'FGML1'):
                attack = FGM(sess,
                             model,
                             batch_size=args['batch_size'],
                             ord=1,
                             eps=args['eps'],
                             inception=inception)
                adv = attack.attack(inputs, targets)
            if (args['attack'] == 'FGML2'):
                attack = FGM(sess,
                             model,
                             batch_size=args['batch_size'],
                             ord=2,
                             eps=args['eps'],
                             inception=inception)
                adv = attack.attack(inputs, targets)
            if (args['attack'] == 'IFGSM'):
                attack = IFGM(sess,
                              model,
                              batch_size=args['batch_size'],
                              ord=np.inf,
                              eps=args['eps'],
                              inception=inception)
                adv = attack.attack(inputs, targets)
            if (args['attack'] == 'IFGML1'):
                attack = IFGM(sess,
                              model,
                              batch_size=args['batch_size'],
                              ord=1,
                              eps=args['eps'],
                              inception=inception)
                adv = attack.attack(inputs, targets)
            if (args['attack'] == 'IFGML2'):
                attack = IFGM(sess,
                              model,
                              batch_size=args['batch_size'],
                              ord=2,
                              eps=args['eps'],
                              inception=inception)
                adv = attack.attack(inputs, targets)
        timeend = time.time()

        # Each test image was attacked once per target class.
        if args['untargeted']:
            num_targets = 1
        else:
            num_targets = args['targetnum']
        print("Took", timeend - timestart, "seconds to run",
              len(inputs) / num_targets, "random instances.")

        if (args['save_np']):
            if (args['train']):
                np.save(str(args['dataset']) + '_labels_train.npy', labels)
                np.save(
                    str(args['dataset']) + '_' + str(args['attack']) +
                    '_train.npy', adv)
            else:
                # NOTE(review): '.npy' is concatenated inside str(...) here,
                # unlike the branch above — same result, but inconsistent.
                np.save(
                    str(args['dataset']) + '_' + str(args['attack'] + '.npy'),
                    adv)

        # NOTE(review): these trailing-underscore accumulators are initialized
        # but never appended to anywhere in this function — presumably leftover
        # from an earlier per-model aggregation; confirm before removing.
        r_best_ = []
        d_best_l1_ = []
        d_best_l2_ = []
        d_best_linf_ = []
        r_average_ = []
        d_average_l1_ = []
        d_average_l2_ = []
        d_average_linf_ = []
        r_worst_ = []
        d_worst_l1_ = []
        d_worst_l2_ = []
        d_worst_linf_ = []

        #Transferability Tests: evaluate the same adversarial examples against
        # the source model and, optionally, a distilled target model.
        model_ = []
        model_.append(model)
        if (args['targetmodel'] != "same"):
            if (args['targetmodel'] == "dd_100"):
                model_.append(MNISTModel("models/mnist-distilled-100", sess))
        num_models = len(model_)

        if (args['show']):
            if not os.path.exists(
                    str(args['save']) + "/" + str(args['dataset']) + "/" +
                    str(args['attack'])):
                os.makedirs(
                    str(args['save']) + "/" + str(args['dataset']) + "/" +
                    str(args['attack']))

        for m, model in enumerate(model_):
            # Per-model statistics, reset for every evaluated model.
            r_best = []
            d_best_l1 = []
            d_best_l2 = []
            d_best_linf = []
            r_average = []
            d_average_l1 = []
            d_average_l2 = []
            d_average_linf = []
            r_worst = []
            d_worst_l1 = []
            d_worst_l2 = []
            d_worst_linf = []
            # Inputs are grouped num_targets at a time: one source image
            # attacked toward each candidate target class.
            for i in range(0, len(inputs), num_targets):
                pred = []
                for j in range(i, i + num_targets):
                    if inception:
                        pred.append(
                            np.reshape(model.model.predict(adv[j:j + 1]),
                                       (data.test_labels[0:1].shape)))
                    else:
                        pred.append(model.model.predict(adv[j:j + 1]))

                # Best case: smallest distortion among the successful targets
                # for this image; 1e10 doubles as "not found" sentinel.
                dist_l1 = 1e10
                dist_l1_index = 1e10
                dist_linf = 1e10
                dist_linf_index = 1e10
                dist_l2 = 1e10
                dist_l2_index = 1e10
                for k, j in enumerate(range(i, i + num_targets)):
                    success = False
                    if (args['untargeted']):
                        if (np.argmax(pred[k], 1) != np.argmax(
                                targets[j:j + 1], 1)):
                            success = True
                    else:
                        if (np.argmax(pred[k], 1) == np.argmax(
                                targets[j:j + 1], 1)):
                            success = True
                    if (success):
                        if (np.sum(np.abs(adv[j] - inputs[j])) < dist_l1):
                            dist_l1 = np.sum(np.abs(adv[j] - inputs[j]))
                            dist_l1_index = j
                        if (np.amax(np.abs(adv[j] - inputs[j])) < dist_linf):
                            dist_linf = np.amax(np.abs(adv[j] - inputs[j]))
                            dist_linf_index = j
                        if ((np.sum((adv[j] - inputs[j])**2)**.5) < dist_l2):
                            dist_l2 = (np.sum((adv[j] - inputs[j])**2)**.5)
                            dist_l2_index = j
                if (dist_l1_index != 1e10):
                    d_best_l2.append((np.sum(
                        (adv[dist_l2_index] - inputs[dist_l2_index])**2)**.5))
                    d_best_l1.append(
                        np.sum(
                            np.abs(adv[dist_l1_index] -
                                   inputs[dist_l1_index])))
                    d_best_linf.append(
                        np.amax(
                            np.abs(adv[dist_linf_index] -
                                   inputs[dist_linf_index])))
                    r_best.append(1)
                else:
                    r_best.append(0)

                # Average case: one uniformly random target per image.
                rand_int = np.random.randint(i, i + num_targets)
                if inception:
                    pred_r = np.reshape(
                        model.model.predict(adv[rand_int:rand_int + 1]),
                        (data.test_labels[0:1].shape))
                else:
                    pred_r = model.model.predict(adv[rand_int:rand_int + 1])
                success_average = False
                if (args['untargeted']):
                    if (np.argmax(pred_r, 1) != np.argmax(
                            targets[rand_int:rand_int + 1], 1)):
                        success_average = True
                else:
                    if (np.argmax(pred_r, 1) == np.argmax(
                            targets[rand_int:rand_int + 1], 1)):
                        success_average = True
                if success_average:
                    r_average.append(1)
                    d_average_l2.append(
                        np.sum((adv[rand_int] - inputs[rand_int])**2)**.5)
                    d_average_l1.append(
                        np.sum(np.abs(adv[rand_int] - inputs[rand_int])))
                    d_average_linf.append(
                        np.amax(np.abs(adv[rand_int] - inputs[rand_int])))
                else:
                    r_average.append(0)

                # Worst case: every target must succeed; record the largest
                # distortion. A single failure zeroes out the whole image.
                dist_l1 = 0
                dist_l1_index = 1e10
                dist_linf = 0
                dist_linf_index = 1e10
                dist_l2 = 0
                dist_l2_index = 1e10
                for k, j in enumerate(range(i, i + num_targets)):
                    failure = True
                    if (args['untargeted']):
                        if (np.argmax(pred[k], 1) != np.argmax(
                                targets[j:j + 1], 1)):
                            failure = False
                    else:
                        if (np.argmax(pred[k], 1) == np.argmax(
                                targets[j:j + 1], 1)):
                            failure = False
                    if failure:
                        r_worst.append(0)
                        dist_l1_index = 1e10
                        dist_l2_index = 1e10
                        dist_linf_index = 1e10
                        break
                    else:
                        if (np.sum(np.abs(adv[j] - inputs[j])) > dist_l1):
                            dist_l1 = np.sum(np.abs(adv[j] - inputs[j]))
                            dist_l1_index = j
                        if (np.amax(np.abs(adv[j] - inputs[j])) > dist_linf):
                            dist_linf = np.amax(np.abs(adv[j] - inputs[j]))
                            dist_linf_index = j
                        if ((np.sum((adv[j] - inputs[j])**2)**.5) > dist_l2):
                            dist_l2 = (np.sum((adv[j] - inputs[j])**2)**.5)
                            dist_l2_index = j
                if (dist_l1_index != 1e10):
                    d_worst_l2.append((np.sum(
                        (adv[dist_l2_index] - inputs[dist_l2_index])**2)**.5))
                    d_worst_l1.append(
                        np.sum(
                            np.abs(adv[dist_l1_index] -
                                   inputs[dist_l1_index])))
                    d_worst_linf.append(
                        np.amax(
                            np.abs(adv[dist_linf_index] -
                                   inputs[dist_linf_index])))
                    r_worst.append(1)

                # Dump original/adversarial image pairs, only for the last
                # model in the transfer list.
                if (args['show'] and m == (num_models - 1)):
                    for j in range(i, i + num_targets):
                        target_id = np.argmax(targets[j:j + 1], 1)
                        label_id = np.argmax(labels[j:j + 1], 1)
                        prev_id = np.argmax(
                            np.reshape(model.model.predict(inputs[j:j + 1]),
                                       (data.test_labels[0:1].shape)), 1)
                        adv_id = np.argmax(
                            np.reshape(model.model.predict(adv[j:j + 1]),
                                       (data.test_labels[0:1].shape)), 1)
                        suffix = "id{}_seq{}_lbl{}_prev{}_adv{}_{}_l1_{:.3f}_l2_{:.3f}_linf_{:.3f}".format(
                            true_ids[i], target_id, label_id, prev_id, adv_id,
                            adv_id == target_id,
                            np.sum(np.abs(adv[j] - inputs[j])),
                            np.sum((adv[j] - inputs[j])**2)**.5,
                            np.amax(np.abs(adv[j] - inputs[j])))
                        show(
                            inputs[j:j + 1],
                            str(args['save']) + "/" + str(args['dataset']) +
                            "/" + str(args['attack']) +
                            "/original_{}.png".format(suffix))
                        show(
                            adv[j:j + 1],
                            str(args['save']) + "/" + str(args['dataset']) +
                            "/" + str(args['attack']) +
                            "/adversarial_{}.png".format(suffix))

            # Label the report lines: "Src_" for source model(s), "Tgt_" for
            # the final (transfer target) model.
            if (m != (num_models - 1)):
                lbl = "Src_"
                if (num_models > 2):
                    lbl += str(m) + "_"
            else:
                lbl = "Tgt_"
            if (num_targets > 1):
                print(lbl + 'best_case_L1_mean', np.mean(d_best_l1))
                print(lbl + 'best_case_L2_mean', np.mean(d_best_l2))
                print(lbl + 'best_case_Linf_mean', np.mean(d_best_linf))
                print(lbl + 'best_case_prob', np.mean(r_best))
                print(lbl + 'average_case_L1_mean', np.mean(d_average_l1))
                print(lbl + 'average_case_L2_mean', np.mean(d_average_l2))
                print(lbl + 'average_case_Linf_mean', np.mean(d_average_linf))
                print(lbl + 'average_case_prob', np.mean(r_average))
                print(lbl + 'worst_case_L1_mean', np.mean(d_worst_l1))
                print(lbl + 'worst_case_L2_mean', np.mean(d_worst_l2))
                print(lbl + 'worst_case_Linf_mean', np.mean(d_worst_linf))
                print(lbl + 'worst_case_prob', np.mean(r_worst))
            else:
                # With a single target, best/average/worst coincide.
                print(lbl + 'L1_mean', np.mean(d_average_l1))
                print(lbl + 'L2_mean', np.mean(d_average_l2))
                print(lbl + 'Linf_mean', np.mean(d_average_linf))
                print(lbl + 'success_prob', np.mean(r_average))
def find_adv(sess,
             face,
             face_stack_self,
             face_stack_target,
             FRmodel,
             file_name=None,
             margin=0,
             hinge_loss=True,
             model='triplet'):
    """Search for a minimum-L2 adversarial face via bisection on the C&W const.

    Runs up to 5 Carlini&Wagner L2 attacks, halving the interval
    [const_low, const_high] each round: success shrinks the interval from
    above (try a smaller const for less distortion), failure from below.

    sess: TF session; face: batch of input faces (batch on axis 0);
    face_stack_self / face_stack_target: reference image stacks for the
    victim and target identities; FRmodel: face-recognition model;
    file_name: if given, the best result is np.savez'd there;
    margin: required confidence margin dist[0]-dist[1];
    model: 'triplet' (inputs in [0,1]) or 'center' (inputs in [-1,1]).

    Returns (best_adv, best_delta, best_l2, best_const); the first three are
    None/9999.0 placeholders when no attack ever succeeded.
    """
    const_high = 10.0
    const_low = 0.05
    const = 0.3
    ever_success = False
    best_l2 = 9999.0        # sentinel: "no successful attack yet"
    best_adv = None
    best_delta = None
    best_const = None
    batch_size = face.shape[0]
    self_size = face_stack_self.shape[0]
    target_size = face_stack_target.shape[0]
    for ii in range(5):
        print("Search #", ii, "with constant", const)
        # Pixel box depends on how the embedding model normalizes inputs.
        if model == 'center':
            boxmin = -1
            boxmax = 1
        if model == 'triplet':
            boxmin = 0
            boxmax = 1
        attack = CarliniL2(sess,
                           FRmodel,
                           batch_size=batch_size,
                           learning_rate=0.01,
                           hinge_loss=hinge_loss,
                           targeted=True,
                           self_db_size=self_size,
                           target_batch_size=target_size,
                           initial_const=const,
                           max_iterations=500,
                           confidence=margin,
                           boxmin=boxmin,
                           boxmax=boxmax)
        adv, delta = attack.attack(face, face_stack_target, face_stack_self)
        if model == 'triplet':
            dist = face_recog(adv, face_stack_self, face_stack_target,
                              FRmodel, sess)
        if model == 'center':
            dist = face_recog_center(adv, face_stack_self, face_stack_target,
                                     FRmodel, sess)
        print(dist)
        if (dist[0] - dist[1] >= margin):
            # Successfully found an adv example; keep it if it is the least
            # distorted so far, then try a smaller const.
            print('Success with const', const)
            ever_success = True
            adv_l2 = np.linalg.norm(delta)
            if (adv_l2) < best_l2:
                best_l2 = adv_l2
                best_adv = adv
                best_delta = delta
                best_const = const
            # decrease const
            const_high = const
            const = (const_high + const_low) / 2
        else:
            # Failed to find an adv example; try a larger const.
            print('Failure with const', const)
            const_low = const
            const = (const_high + const_low) / 2
        # Early exit once we have a success and the interval has converged.
        if (ever_success == True and const_high - const_low < 0.02):
            break
    if (ever_success):
        print('Successfully found adv example')
    else:
        print('Failed to find adv example')
    if (file_name):
        np.savez(file_name,
                 face=face,
                 adv=best_adv,
                 delta=best_delta,
                 l2=best_l2)
    return best_adv, best_delta, best_l2, best_const
#classification+samples+batch_size+start
# Output filenames encode the run configuration (samples/batch/start index).
filename = '95s40bs9start10.pkl'
utfile = 'ut_' + filename
advname = 'adv_' + filename

if __name__ == "__main__":
    # Run a targeted Carlini&Wagner L2 attack on Inception/ImageNet and pickle
    # the adversarial examples. `samples`, `start`, `bs`, `mi` and `confidence`
    # are presumably defined earlier in this file — confirm before running.
    with tf.Session() as sess:
        data, model = ImageNet(), InceptionModel(sess)
        inputs, targets = generate_data(model, data, samples=samples,
                                        targeted=True, start=start,
                                        inception=True, batch_size=bs)
        tar = np.argmax(targets, axis=1)
        attack = CarliniL2(sess, model, batch_size=bs, max_iterations=mi,
                           confidence=confidence)
        # attack = CarliniL0(sess, model, max_iterations = mi)
        # attack = CarliniLi(sess, model)
        timestart = time.time()
        adv = attack.attack(inputs, targets)
        timeend = time.time()
        advnp = np.array(adv).astype(np.uint8)
        # BUG FIX: the original did `f.close` (attribute access, never called),
        # so the file was never explicitly closed/flushed. A context manager
        # guarantees the pickle is written out even on error.
        with open(advname, 'wb') as f:
            pickle.dump(advnp, f)
        print("Took", timeend - timestart, "seconds to run", len(inputs),
              "samples.")
def main(_):
    """Attack an image-captioning model: perturb input images so the generated
    caption matches a target sentence (exact attack) or contains target
    keywords (keyword attack), driven by FLAGS.

    Builds two graphs: an inference graph (caption generation) and an attack
    graph (gradient-based optimization), each with its own session.
    """
    '''
    # Build the inference graph.
    g = tf.Graph()
    with g.as_default():
        model = attack_wrapper.AttackWrapper()
        restore_fn = model.build_graph_from_config(configuration.ModelConfig(), FLAGS.checkpoint_path)
    # g.finalize()
    # Create the vocabulary.
    '''
    tf.set_random_seed(1234)
    # Cap GPU memory so both sessions fit on one device.
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.4)
    config = tf.ConfigProto(gpu_options=gpu_options)
    vocab = vocabulary.Vocabulary(FLAGS.vocab_file)

    # TODO: build the inference graph
    inference_graph = tf.Graph()
    with inference_graph.as_default():
        inf_model = inference_wrapper.InferenceWrapper()
        inf_restore_fn = inf_model.build_graph_from_config(
            configuration.ModelConfig(), FLAGS.checkpoint_path)
        # inf_image_placeholder = tf.placeholder(dtype=tf.string, shape=[], name="inf_image_placeholder")
        # inf_preprocessor = inf_model.model.process_image(inf_image_placeholder)
    inference_graph.finalize()
    inf_sess = tf.Session(graph=inference_graph, config=config)
    # Load the model from checkpoint.
    inf_restore_fn(inf_sess)

    attack_graph = tf.Graph()
    with attack_graph.as_default():
        model = attack_wrapper.AttackWrapper()
        sess = tf.Session(config=config)
        # build the attacker graph
        attack = CarliniL2(sess,
                           inf_sess,
                           attack_graph,
                           inference_graph,
                           model,
                           inf_model,
                           targeted=FLAGS.targeted,
                           use_keywords=FLAGS.use_keywords,
                           use_logits=FLAGS.use_logits,
                           batch_size=1,
                           initial_const=1.0,
                           max_iterations=1000,
                           print_every=1,
                           confidence=2,
                           use_log=False,
                           norm=FLAGS.norm,
                           abort_early=False,
                           learning_rate=0.005)
        # compute graph for preprocessing
        image_placeholder = tf.placeholder(dtype=tf.string, shape=[])
        preprocessor = model.model.process_image(image_placeholder)

    # Expand the comma-separated glob patterns into concrete files.
    filenames = []
    for file_pattern in FLAGS.input_files.split(","):
        filenames.extend(tf.gfile.Glob(file_pattern))
    tf.logging.info("Running caption generation on %d files matching %s",
                    len(filenames), FLAGS.input_files)

    for filename in filenames:
        with tf.gfile.GFile(filename, "rb") as f:
            image = f.read()
        raw_image = sess.run(preprocessor,
                             feed_dict={image_placeholder: image})
        print('raw image size:', raw_image.shape)
        # Earlier target-sentence experiments, kept for reference:
        '''
        new_sentence = "kite"
        new_sentence = "a man on a surfboard riding a wave ."
        new_sentence = "a dog riding a bike on a road ."
        new_sentence = "a group of giraffe standing next to each other ." # success, p=0.016556
        new_sentence = "a person skiing down a snow covered slope ." # success, p=0.021917
        new_sentence = "a person on a beach flying a kite ." # success, p=0.019417
        new_sentence = "a black and white photo of a train on a track ." # success, p=0.006146
        new_sentence = "a bowl of pasta with meat and vegetables ."
        new_sentence = "a man and girl carrying kites down a sidewalk in front of a metro bus ." # end up with "a group of people standing on top of a sandy beach ." same as a sentence in training set
        new_sentence = "a man and girl carrying surfboards down a sidewalk in front of a metro bus ."# same as in training set
        '''
        new_sentence = FLAGS.input_feed
        new_sentence = new_sentence.split()
        print("My new sentence:", new_sentence)
        max_caption_length = 20
        # Target caption: <S> w1 ... wn </S>, padded with </S> to fixed length.
        new_caption = [vocab.start_id
                       ] + [vocab.word_to_id(w)
                            for w in new_sentence] + [vocab.end_id]
        true_cap_len = len(new_caption)
        new_caption = new_caption + [vocab.end_id
                                     ] * (max_caption_length - true_cap_len)
        print("My new id:", new_caption)
        # Mask marks the real (unpadded) caption positions.
        new_mask = np.append(np.ones(true_cap_len),
                             np.zeros(max_caption_length - true_cap_len))
        # print("Probability by attack_step:", model.attack_step(sess, new_caption, new_mask, raw_image))
        # adv = attack.attack(np.array([raw_image]), new_caption, [new_mask])
        # key_words = [vocab.word_to_id("surfboard"),vocab.word_to_id("riding"),vocab.word_to_id("man"),vocab.word_to_id("wave"),vocab.word_to_id("dog"),vocab.word_to_id("water"),vocab.word_to_id("woman"),vocab.word_to_id("surfer"),vocab.word_to_id("ocean"),vocab.word_to_id("frisbee")]
        # key_words = [vocab.word_to_id("surfboard"), vocab.word_to_id("man"), vocab.word_to_id("wave"), vocab.word_to_id("riding"), vocab.word_to_id("water")]
        # key_words = [vocab.word_to_id("giraffe"), vocab.word_to_id("standing"), vocab.word_to_id("photo")]
        # key_words = [vocab.word_to_id("photo"), vocab.word_to_id("train"), vocab.word_to_id("track")]
        # words = ["train", "photo", "track"]
        # NOTE(review): the next assignment is a dead store — `words` is
        # immediately overwritten from FLAGS.input_feed below.
        words = ["riding", "train", "long"]
        words = FLAGS.input_feed.split()
        key_words = [vocab.word_to_id(word) for word in words]
        print(key_words)
        # key_words = [vocab.word_to_id("bird"), vocab.word_to_id("flying")]
        key_words_mask = np.append(
            np.ones(len(key_words)),
            np.zeros(max_caption_length - len(key_words)))
        key_words = key_words + [vocab.end_id
                                 ] * (max_caption_length - len(key_words))
        if FLAGS.use_keywords:
            # keywords based attack
            adv = attack.attack(np.array([raw_image]), sess, inf_sess, model,
                                inf_model, vocab, key_words, key_words_mask,
                                1)
        else:
            # exact attack
            adv = attack.attack(np.array([raw_image]), sess, inf_sess, model,
                                inf_model, vocab, new_caption, new_mask, 1)

        # Report distortion and dump original/adversarial/diff images.
        l2_distortion = np.sum((adv - raw_image)**2)**.5
        linf_distortion = np.max(np.abs(adv - raw_image))
        print("L2 distortion is", l2_distortion)
        print("L_inf distortion is", linf_distortion)
        show(raw_image, "original.png")
        show(adv, "adversarial.png")
        show(adv - raw_image, "diff.png")
    inf_sess.close()
def main(args):
    """Run the selected targeted attack (C&W-L2 / EAD-L1 / EAD-EN / (I)FGM) on
    the chosen dataset and report best-, average- and worst-case success rates
    and L1/L2/Linf distortions per batch of target classes.

    args: dict of command-line options (dataset, attack, batch_size, maxiter,
    conf, binary_steps, beta, abort_early, numimg, seed, seed_imagenet, train,
    temp, adversarial, show, save).
    """
    with tf.Session() as sess:
        # ---- dataset / model selection ---------------------------------
        if (args['dataset'] == 'mnist'):
            data, model = MNIST(), MNISTModel("models/mnist", sess)
            handpick = False
            inception = False
        if (args['dataset'] == "cifar"):
            data, model = CIFAR(), CIFARModel("models/cifar", sess)
            handpick = True
            inception = False
        if (args['dataset'] == "imagenet"):
            data, model = ImageNet(args['seed_imagenet']), InceptionModel(sess)
            handpick = True
            inception = True

        # NOTE(review): these overrides always load MNIST/CIFAR variants and
        # ignore any model chosen above for other datasets — confirm intended.
        if (args['adversarial'] != "none"):
            model = MNISTModel("models/mnist_cw" + str(args['adversarial']),
                               sess)
        if (args['temp'] and args['dataset'] == 'mnist'):
            model = MNISTModel("models/mnist-distilled-" + str(args['temp']),
                               sess)
        if (args['temp'] and args['dataset'] == 'cifar'):
            model = CIFARModel("models/cifar-distilled-" + str(args['temp']),
                               sess)

        inputs, targets, labels, true_ids = generate_data(
            data,
            model,
            samples=args['numimg'],
            inception=inception,
            handpick=handpick,
            train=args['train'],
            seed=args['seed'])

        timestart = time.time()
        if (args['attack'] == 'L2'):
            attack = CarliniL2(sess,
                               model,
                               batch_size=args['batch_size'],
                               max_iterations=args['maxiter'],
                               confidence=args['conf'],
                               binary_search_steps=args['binary_steps'],
                               beta=args['beta'],
                               abort_early=args['abort_early'])
            adv = attack.attack(inputs, targets)
        if (args['attack'] == 'L1'):
            attack = EADL1(sess,
                           model,
                           batch_size=args['batch_size'],
                           max_iterations=args['maxiter'],
                           confidence=args['conf'],
                           binary_search_steps=args['binary_steps'],
                           beta=args['beta'],
                           abort_early=args['abort_early'])
            adv = attack.attack(inputs, targets)
        if (args['attack'] == 'EN'):
            attack = EADEN(sess,
                           model,
                           batch_size=args['batch_size'],
                           max_iterations=args['maxiter'],
                           confidence=args['conf'],
                           binary_search_steps=args['binary_steps'],
                           beta=args['beta'],
                           abort_early=args['abort_early'])
            adv = attack.attack(inputs, targets)
        """If untargeted, pass labels instead of targets"""
        if (args['attack'] == 'FGSM'):
            attack = FGM(sess,
                         model,
                         batch_size=args['batch_size'],
                         ord=np.inf,
                         inception=inception)
            adv = attack.attack(inputs, targets)
        if (args['attack'] == 'FGML1'):
            attack = FGM(sess,
                         model,
                         batch_size=args['batch_size'],
                         ord=1,
                         inception=inception)
            adv = attack.attack(inputs, targets)
        if (args['attack'] == 'FGML2'):
            attack = FGM(sess,
                         model,
                         batch_size=args['batch_size'],
                         ord=2,
                         inception=inception)
            adv = attack.attack(inputs, targets)
        if (args['attack'] == 'IFGSM'):
            attack = IGM(sess,
                         model,
                         batch_size=args['batch_size'],
                         ord=np.inf,
                         inception=inception)
            adv = attack.attack(inputs, targets)
        if (args['attack'] == 'IFGML1'):
            attack = IGM(sess,
                         model,
                         batch_size=args['batch_size'],
                         ord=1,
                         inception=inception)
            adv = attack.attack(inputs, targets)
        if (args['attack'] == 'IFGML2'):
            attack = IGM(sess,
                         model,
                         batch_size=args['batch_size'],
                         ord=2,
                         inception=inception)
            adv = attack.attack(inputs, targets)
        timeend = time.time()
        print("Took", timeend - timestart, "seconds to run",
              len(inputs) / args['batch_size'], "random instances.")

        # Training-set mode: just persist the adversarial examples and exit.
        if (args['train']):
            np.save('labels_train.npy', labels)
            np.save(str(args['attack']) + '_train.npy', adv)
            return

        r_best = []
        d_best_l1 = []
        d_best_l2 = []
        d_best_linf = []
        r_average = []
        d_average_l1 = []
        d_average_l2 = []
        d_average_linf = []
        r_worst = []
        d_worst_l1 = []
        d_worst_l2 = []
        d_worst_linf = []

        # NOTE(review): with a nonzero confidence, evaluation is transferred
        # to the distilled MNIST model regardless of dataset — confirm intended.
        if (args['conf'] != 0):
            model = MNISTModel("models/mnist-distilled-100", sess)

        if (args['show']):
            if not os.path.exists(
                    str(args['save']) + "/" + str(args['dataset']) + "/" +
                    str(args['attack'])):
                os.makedirs(
                    str(args['save']) + "/" + str(args['dataset']) + "/" +
                    str(args['attack']))

        # Inputs are grouped batch_size at a time: one source image attacked
        # toward each of batch_size target classes.
        for i in range(0, len(inputs), args['batch_size']):
            pred = []
            for j in range(i, i + args['batch_size']):
                if inception:
                    pred.append(
                        np.reshape(model.model.predict(adv[j:j + 1]),
                                   (data.test_labels[0:1].shape)))
                else:
                    pred.append(model.model.predict(adv[j:j + 1]))

            # Best case: smallest distortion among successful targets;
            # 1e10 doubles as "not found" sentinel.
            dist_l1 = 1e10
            dist_l2 = 1e10
            dist_linf = 1e10
            dist_l1_index = 1e10
            dist_l2_index = 1e10
            dist_linf_index = 1e10
            for k, j in enumerate(range(i, i + args['batch_size'])):
                if (np.argmax(pred[k], 1) == np.argmax(targets[j:j + 1], 1)):
                    if (np.sum(np.abs(adv[j] - inputs[j])) < dist_l1):
                        dist_l1 = np.sum(np.abs(adv[j] - inputs[j]))
                        dist_l1_index = j
                    if (np.amax(np.abs(adv[j] - inputs[j])) < dist_linf):
                        dist_linf = np.amax(np.abs(adv[j] - inputs[j]))
                        dist_linf_index = j
                    if ((np.sum((adv[j] - inputs[j])**2)**.5) < dist_l2):
                        dist_l2 = (np.sum((adv[j] - inputs[j])**2)**.5)
                        dist_l2_index = j
            if (dist_l1_index != 1e10):
                d_best_l2.append((np.sum(
                    (adv[dist_l2_index] - inputs[dist_l2_index])**2)**.5))
                d_best_l1.append(
                    np.sum(np.abs(adv[dist_l1_index] -
                                  inputs[dist_l1_index])))
                d_best_linf.append(
                    np.amax(
                        np.abs(adv[dist_linf_index] -
                               inputs[dist_linf_index])))
                r_best.append(1)
            else:
                r_best.append(0)

            # Average case: one uniformly random target per image.
            rand_int = np.random.randint(i, i + args['batch_size'])
            if inception:
                pred_r = np.reshape(
                    model.model.predict(adv[rand_int:rand_int + 1]),
                    (data.test_labels[0:1].shape))
            else:
                pred_r = model.model.predict(adv[rand_int:rand_int + 1])
            if (np.argmax(pred_r,
                          1) == np.argmax(targets[rand_int:rand_int + 1], 1)):
                r_average.append(1)
                d_average_l2.append(
                    np.sum((adv[rand_int] - inputs[rand_int])**2)**.5)
                d_average_l1.append(
                    np.sum(np.abs(adv[rand_int] - inputs[rand_int])))
                d_average_linf.append(
                    np.amax(np.abs(adv[rand_int] - inputs[rand_int])))
            else:
                r_average.append(0)

            # Worst case: every target must succeed; record the largest
            # distortion. A single failure zeroes out the whole image.
            dist_l1 = 0
            dist_l1_index = 1e10
            dist_linf = 0
            dist_linf_index = 1e10
            dist_l2 = 0
            dist_l2_index = 1e10
            for k, j in enumerate(range(i, i + args['batch_size'])):
                if (np.argmax(pred[k], 1) != np.argmax(targets[j:j + 1], 1)):
                    r_worst.append(0)
                    dist_l1_index = 1e10
                    dist_l2_index = 1e10
                    dist_linf_index = 1e10
                    break
                else:
                    if (np.sum(np.abs(adv[j] - inputs[j])) > dist_l1):
                        dist_l1 = np.sum(np.abs(adv[j] - inputs[j]))
                        dist_l1_index = j
                    if (np.amax(np.abs(adv[j] - inputs[j])) > dist_linf):
                        dist_linf = np.amax(np.abs(adv[j] - inputs[j]))
                        dist_linf_index = j
                    if ((np.sum((adv[j] - inputs[j])**2)**.5) > dist_l2):
                        dist_l2 = (np.sum((adv[j] - inputs[j])**2)**.5)
                        dist_l2_index = j
            if (dist_l1_index != 1e10):
                d_worst_l2.append((np.sum(
                    (adv[dist_l2_index] - inputs[dist_l2_index])**2)**.5))
                d_worst_l1.append(
                    np.sum(np.abs(adv[dist_l1_index] -
                                  inputs[dist_l1_index])))
                d_worst_linf.append(
                    np.amax(
                        np.abs(adv[dist_linf_index] -
                               inputs[dist_linf_index])))
                r_worst.append(1)

            # Optionally dump original/adversarial image pairs.
            if (args['show']):
                for j in range(i, i + args['batch_size']):
                    target_id = np.argmax(targets[j:j + 1], 1)
                    label_id = np.argmax(labels[j:j + 1], 1)
                    prev_id = np.argmax(
                        np.reshape(model.model.predict(inputs[j:j + 1]),
                                   (data.test_labels[0:1].shape)), 1)
                    adv_id = np.argmax(
                        np.reshape(model.model.predict(adv[j:j + 1]),
                                   (data.test_labels[0:1].shape)), 1)
                    suffix = "id{}_seq{}_lbl{}_prev{}_adv{}_{}_l1_{:.3f}_l2_{:.3f}_linf_{:.3f}".format(
                        true_ids[i], target_id, label_id, prev_id, adv_id,
                        adv_id == target_id,
                        np.sum(np.abs(adv[j] - inputs[j])),
                        np.sum((adv[j] - inputs[j])**2)**.5,
                        np.amax(np.abs(adv[j] - inputs[j])))
                    show(
                        inputs[j:j + 1],
                        str(args['save']) + "/" + str(args['dataset']) + "/" +
                        str(args['attack']) +
                        "/original_{}.png".format(suffix))
                    show(
                        adv[j:j + 1],
                        str(args['save']) + "/" + str(args['dataset']) + "/" +
                        str(args['attack']) +
                        "/adversarial_{}.png".format(suffix))

        print('best_case_L1_mean', np.mean(d_best_l1))
        print('best_case_L2_mean', np.mean(d_best_l2))
        print('best_case_Linf_mean', np.mean(d_best_linf))
        print('best_case_prob', np.mean(r_best))
        print('average_case_L1_mean', np.mean(d_average_l1))
        print('average_case_L2_mean', np.mean(d_average_l2))
        print('average_case_Linf_mean', np.mean(d_average_linf))
        print('average_case_prob', np.mean(r_average))
        print('worst_case_L1_mean', np.mean(d_worst_l1))
        print('worst_case_L2_mean', np.mean(d_worst_l2))
        print('worst_case_Linf_mean', np.mean(d_worst_linf))
        print('worst_case_prob', np.mean(r_worst))
# Load the pretrained example classifier and pick the matching test set.
classifier = Classifier("./models/cifar_example_classifier")
data = MNIST() if dataset == "MNIST" else CIFAR()


class Pred2:
    """Thin adapter exposing the classifier through the interface CarliniL2 expects."""
    image_size = 28 if dataset == "MNIST" else 32
    num_labels = 10
    num_channels = 1 if dataset == "MNIST" else 3

    def predict(self, x):
        # Delegate straight to the wrapped model.
        return classifier.model(x)


keras.backend.set_learning_phase(False)
sess = keras.backend.get_session()

attack = CarliniL2(
    sess, [Pred2()], {}, {},
    batch_size=100,
    binary_search_steps=4,
    learning_rate=1e-2,
    max_iterations=10000,
    targeted=True,
    initial_const=1,
    confidence=1,
    boxmin=0,
    boxmax=1)

# First test example of each class 0..9 ...
true_classes = np.argmax(data.test_labels, axis=1)
idx = [np.where(true_classes == c)[0][0] for c in range(10)]
# ... each repeated 10 times, once per target class.
dat = np.array([data.test_data[src] for src in idx for _ in range(10)])
# Matching targets: one-hot rows cycling through classes 0..9.
lab = sess.run(tf.one_hot(np.array([list(range(10))] * 10).flatten(), depth=10))

adv = attack.attack(dat, lab)
print('mean distortion', np.mean(np.sum((adv - dat) ** 2, axis=(1, 2, 3)) ** .5))