def generate_carlini_l2_examples(sess, model, x, y, X, Y, attack_params,
                                 verbose, attack_log_fpath):
    model_wrapper = wrap_to_tohinz_model(model, X, Y)

    accepted_params = ['batch_size', 'confidence', 'targeted', 'learning_rate',
                       'binary_search_steps', 'max_iterations', 'abort_early',
                       'initial_const']
    for k in attack_params:
        if k not in accepted_params:
            raise NotImplementedError("Unsupported params in Carlini L2: %s" % k)

    # assert batch_size <= len(X)
    if 'batch_size' in attack_params and attack_params['batch_size'] > len(X):
        attack_params['batch_size'] = len(X)

    if 'binary_search_steps' in attack_params:
        attack_params['binary_search_steps'] = int(attack_params['binary_search_steps'])

    attack = CarliniL2(sess, model_wrapper, **attack_params)

    if not verbose:
        disablePrint(attack_log_fpath)

    # The input range is [0, 1]; shift to [-0.5, 0.5] by subtracting 0.5.
    # The attack returns values in [-0.5, 0.5]; shift back to [0, 1] by adding 0.5.
    X_adv = attack.attack(X - 0.5, Y) + 0.5

    if not verbose:
        enablePrint()

    return X_adv
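# A hedged usage sketch for generate_carlini_l2_examples above. The session,
# model, and data names here are hypothetical stand-ins; it assumes inputs
# already scaled to [0, 1], since the function shifts them to [-0.5, 0.5]
# itself before running the attack.
def _example_generate_carlini_l2(sess, model, X_test, Y_test):
    params = {'batch_size': 100, 'max_iterations': 1000,
              'binary_search_steps': 3, 'targeted': False}
    return generate_carlini_l2_examples(sess, model, None, None,
                                        X_test[:100], Y_test[:100], params,
                                        verbose=True, attack_log_fpath=None)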
def run_pca(Data, num_components=10, invert=False):
    data = Data()
    sess = K.get_session()
    K.set_learning_phase(False)

    shape = (-1, 784)
    pca = sklearn.decomposition.PCA(n_components=num_components)
    pca.fit(data.train_data.reshape(shape))  # [:10000]

    if invert:
        model = MNISTModel("models/mnist-pca-cnn-top-" + str(num_components))
    else:
        model = make_model(num_components)
        model.load_weights("models/mnist-pca-top-" + str(num_components))
        model = Wrap(model, pca)

    tf_mean = tf.constant(pca.mean_, dtype=tf.float32)
    tf_components = tf.constant(pca.components_.T, dtype=tf.float32)

    def new_predict(xs):
        # map to PCA space
        xs = tf.reshape(xs, (-1, 784))
        xs -= tf_mean
        xs = tf.matmul(xs, tf_components)
        # map back
        xs = tf.matmul(xs, tf.transpose(tf_components))
        xs += tf_mean
        xs = tf.reshape(xs, (-1, 28, 28, 1))
        return model.model(xs)

    if invert:
        model.predict = new_predict

    attack = CarliniL2(sess, model, batch_size=100, max_iterations=3000,
                       binary_search_steps=6, targeted=False, initial_const=1)

    N = 100
    test_adv = attack.attack(data.test_data[:N], data.test_labels[:N])

    print('accuracy',
          np.mean(np.argmax(sess.run(model.predict(
              tf.constant(data.test_data, dtype=np.float32))), axis=1)
                  == np.argmax(data.test_labels, axis=1)))
    print(list(test_adv[0].flatten()))
    print('dist',
          np.mean(np.sum((test_adv - data.test_data[:N])**2, axis=(1, 2, 3))**.5))
    it = np.argmax(sess.run(model.predict(tf.constant(test_adv))), axis=1)
    # note: this measures the fraction still classified correctly after the
    # untargeted attack, i.e. the fraction on which the attack failed
    print('success', np.mean(it == np.argmax(data.test_labels, axis=1)[:N]))
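# A minimal numpy sketch of the projection new_predict performs above: keep
# only the top num_components PCA directions and reconstruct the image before
# classification. With whitening off this is equivalent to
# pca.inverse_transform(pca.transform(...)); illustrative only.
def pca_roundtrip(pca, xs):
    flat = xs.reshape(len(xs), -1) - pca.mean_
    coeffs = flat.dot(pca.components_.T)             # project onto components
    recon = coeffs.dot(pca.components_) + pca.mean_  # map back to pixel space
    return recon.reshape(xs.shape)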
def run(Data, Model, path):
    sess = K.get_session()
    K.set_learning_phase(False)
    data, model = Data(), Model(path)

    if Data == MNIST:
        attack = CarliniL2(sess, model, batch_size=100, max_iterations=2000,
                           binary_search_steps=5, initial_const=1.,
                           learning_rate=1e-1, targeted=False)
    else:
        attack = CarliniL2(sess, model, batch_size=100, max_iterations=200,
                           binary_search_steps=3, initial_const=.01,
                           learning_rate=1e-2, targeted=True, confidence=2)

    now = time.time()
    for name, X, y in [["test", data.test_data, data.test_labels]]:
        print("OKAY", name)
        for k in range(0, len(y), 5000):
            #if os.path.exists("tmp/"+path.split("/")[1]+"."+name+".adv.X."+str(k)+".npy"):
            #    print('skip', k)
            #    continue
            now = time.time()
            adv = attack.attack(X[k:k + 100], y[k:k + 100])
            #print('time', time.time()-now)
            #print('accuracy', np.mean(np.argmax(model.model.predict(adv),axis=1)==np.argmax(y[k:k+5000],axis=1)))
            #print('mean distortion', np.mean(np.sum((adv-X[k:k+5000])**2,axis=(1,2,3))**.5))
            np.save("/tmp/" + path.split("/")[1] + "." + name + ".adv.X." + str(k),
                    adv)
def compare_baseline():
    data = MNIST()
    model = MNISTModel("models/mnist")
    sess = K.get_session()
    attack = CarliniL2(sess, model, batch_size=100, max_iterations=3000,
                       binary_search_steps=4, targeted=False, initial_const=10)
    N = 100
    test_adv = attack.attack(data.test_data[:N], data.test_labels[:N])
    print('dist',
          np.mean(np.sum((test_adv - data.test_data[:N])**2, axis=(1, 2, 3))**.5))
def run_pca(Data, Model, path=None):
    sess = K.get_session()
    K.set_learning_phase(False)
    data = Data()
    model = Model(path)

    shape = (-1, model.num_channels * model.image_size**2)
    pca = sklearn.decomposition.PCA(n_components=shape[1])
    pca.fit(data.train_data.reshape(shape))
    print(pca.explained_variance_ratio_)

    r_test = pca.transform(data.test_data.reshape(shape))

    #attack = FGS(sess, model, eps=.3)
    attack = CarliniL2(sess, model, batch_size=100, max_iterations=1000,
                       binary_search_steps=2, targeted=False, initial_const=10)

    N = 10000
    #test_adv = attack.attack(data.test_data[:N], data.test_labels[:N])
    test_adv = np.load("tmp/outlieradvtest.npy")
    r_test_adv = pca.transform(test_adv[:N].reshape(shape))

    fig = plt.figure(figsize=(4, 3))
    fig.subplots_adjust(bottom=0.17, left=.19)
    plt.xlabel('Component Number')
    plt.ylabel('Mean Absolute Value (log scale)')
    plt.semilogy(range(r_test.shape[1]), np.mean(np.abs(r_test), axis=0),
                 label='Valid')
    plt.semilogy(range(r_test_adv.shape[1]), np.mean(np.abs(r_test_adv), axis=0),
                 label='Adversarial')
    plt.legend()
    pp = PdfPages('/tmp/a.pdf')
    plt.savefig(pp, format='pdf')
    pp.close()
    plt.show()
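# A minimal sketch of the detector the plot above suggests: score each input
# by the mean absolute coefficient on the trailing PCA components, where
# adversarial examples tend to carry more energy than valid inputs. `pca` and
# `shape` are as in run_pca; the split point k is a hypothetical choice.
def trailing_component_score(pca, xs, shape, k=700):
    coeffs = pca.transform(xs.reshape(shape))
    return np.abs(coeffs[:, k:]).mean(axis=1)  # higher -> more suspicious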
def run_filter(Data, Model, path):
    K.set_learning_phase(False)
    data = Data()
    model = Model(path)
    model2 = Model(path)

    def new_predict(xs):
        print(xs.get_shape())
        if 'mnist' in path:
            # 3x3 mean blur over the single channel
            xs = tf.nn.conv2d(xs,
                              tf.constant(np.ones((3, 3, 1, 1)) / 9, dtype=tf.float32),
                              [1, 1, 1, 1], "SAME")
        else:
            # note: this kernel also sums across the three input channels
            xs = tf.nn.conv2d(xs,
                              tf.constant(np.ones((3, 3, 3, 3)) / 9, dtype=tf.float32),
                              [1, 1, 1, 1], "SAME")
        return model2.model(xs)

    model2.predict = new_predict

    sess = K.get_session()

    #dist 1.45976
    attack = CarliniL2(sess, model2, batch_size=100, max_iterations=3000,
                       binary_search_steps=4, targeted=False, confidence=0,
                       initial_const=10)

    N = 100
    test_adv = attack.attack(data.test_data[:N], data.test_labels[:N])

    print('accuracy of original model',
          np.mean(np.argmax(sess.run(model.predict(
              tf.constant(data.test_data, dtype=np.float32))), axis=1)
                  == np.argmax(data.test_labels, axis=1)))
    print('accuracy of blurred model',
          np.mean(np.argmax(sess.run(model2.predict(
              tf.constant(data.test_data, dtype=np.float32))), axis=1)
                  == np.argmax(data.test_labels, axis=1)))
    print('dist',
          np.mean(np.sum((test_adv - data.test_data[:N])**2, axis=(1, 2, 3))**.5))
    #it = np.argmax(sess.run(model.predict(tf.constant(test_adv))), axis=1)
    #print('success of unblurred', np.mean(it == np.argmax(data.test_labels, axis=1)[:N]))
    it = np.argmax(sess.run(model2.predict(tf.constant(test_adv))), axis=1)
    print('success of blurred', np.mean(it == np.argmax(data.test_labels, axis=1)[:N]))
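# A hedged alternative sketch: the CIFAR branch above sums across the three
# input channels as well as the 3x3 window. A per-channel mean blur, the
# direct analogue of the MNIST branch, can be written with a depthwise
# convolution. Illustrative; not the code the experiment above actually ran.
def mean_blur_per_channel(xs, channels):
    kernel = tf.constant(np.ones((3, 3, channels, 1)) / 9, dtype=tf.float32)
    return tf.nn.depthwise_conv2d(xs, kernel, [1, 1, 1, 1], "SAME")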
def run_test(Data, Model, path):
    sess = K.get_session()
    K.set_learning_phase(False)
    data = Data()
    model = Model(path)

    N = 1000
    X = data.train_data[np.random.choice(np.arange(len(data.train_data)),
                                         N, replace=False)].reshape((N, -1))
    #Y = data.train_data[np.random.choice(np.arange(len(data.train_data)), N, replace=False)].reshape((N, -1))
    Y = data.test_data[np.random.choice(np.arange(len(data.test_data)),
                                        N, replace=False)].reshape((N, -1))

    #attack = FGS(sess, model, N, .275)
    attack = CarliniL2(sess, model, batch_size=100, binary_search_steps=2,
                       initial_const=1, targeted=False, max_iterations=500)

    idx = np.random.choice(np.arange(len(data.test_data)), N, replace=False)
    Y = attack.attack(data.test_data[idx], data.test_labels[idx]).reshape((N, -1))

    iterations = 1000
    sigma2 = 100
    mmd2u, mmd2u_null, p_value = kernel_two_sample_test(
        X, Y, iterations=iterations, kernel_function='rbf',
        gamma=1.0 / sigma2, verbose=True)
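# A minimal sketch of the statistic kernel_two_sample_test is assumed to
# compute: the squared maximum mean discrepancy (MMD^2) between the two
# samples under an RBF kernel with gamma = 1/sigma2, whose null distribution
# is then estimated by permutation. Biased V-statistic form for brevity.
from sklearn.metrics.pairwise import rbf_kernel

def mmd2_rbf(X, Y, gamma):
    Kxx = rbf_kernel(X, X, gamma=gamma)  # within-sample similarities
    Kyy = rbf_kernel(Y, Y, gamma=gamma)
    Kxy = rbf_kernel(X, Y, gamma=gamma)  # cross-sample similarities
    return Kxx.mean() + Kyy.mean() - 2 * Kxy.mean()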
def run_evaluation(Data, Model, path, num_epochs, name):
    data = Data()
    #train(Model, data, 10, path, num_epochs=num_epochs)
    sess = K.get_session()
    K.set_learning_phase(False)
    model = Model(path)

    #attack = FGS(sess, model)
    attack = CarliniL2(sess, model, batch_size=100, max_iterations=3000,
                       binary_search_steps=3, targeted=True, initial_const=10,
                       learning_rate=1e-2)

    """
    # uncomment to run the training phase
    train_adv = attack.attack(data.train_data, data.train_labels)
    np.save("tmp/"+name+"outlieradvtrain", train_adv)
    train_adv = np.load("tmp/"+name+"outlieradvtrain.npy")
    data.train_data = np.concatenate((data.train_data, train_adv))
    data.train_labels = np.concatenate((data.train_labels,
                                        np.zeros(data.train_labels.shape, dtype=np.float32)))
    data.train_labels = np.pad(data.train_labels, [[0, 0], [0, 1]], mode='constant')
    data.train_labels[data.train_labels.shape[0]//2:, 10] = 1

    validation_adv = attack.attack(data.validation_data, data.validation_labels)
    np.save("tmp/"+name+"outlieradvvalidation", validation_adv)
    validation_adv = np.load("tmp/"+name+"outlieradvvalidation.npy")
    data.validation_data = np.concatenate((data.validation_data, validation_adv))
    data.validation_labels = np.concatenate((data.validation_labels,
                                             np.zeros(data.validation_labels.shape, dtype=np.float32)))
    data.validation_labels = np.pad(data.validation_labels, [[0, 0], [0, 1]], mode='constant')
    data.validation_labels[data.validation_labels.shape[0]//2:, 10] = 1

    test_adv = attack.attack(data.test_data, data.test_labels)
    np.save("tmp/"+name+"outlieradvtest", test_adv)
    test_adv = np.load("tmp/"+name+"outlieradvtest.npy")
    data.test_data = np.concatenate((data.test_data, test_adv))
    data.test_labels = np.concatenate((data.test_labels,
                                       np.zeros(data.test_labels.shape, dtype=np.float32)))
    data.test_labels = np.pad(data.test_labels, [[0, 0], [0, 1]], mode='constant')
    data.test_labels[data.test_labels.shape[0]//2:, 10] = 1

    train(Model, data, 11, path+"_advtraining", num_epochs=num_epochs)

    data1 = Data()  # just need a reference, this is a bit ugly to do
    data2 = Data()  # just need a reference, this is a bit ugly to do

    idxs = list(range(len(data.train_data)))
    random.shuffle(idxs)
    data1.train_data = data.train_data[idxs[:len(idxs)//2]]
    data2.train_data = data.train_data[idxs[len(idxs)//2:]]
    data1.train_labels = data.train_labels[idxs[:len(idxs)//2], :]
    data2.train_labels = data.train_labels[idxs[len(idxs)//2:], :]

    idxs = list(range(len(data.validation_data)))
    random.shuffle(idxs)
    data1.validation_data = data.validation_data[idxs[:len(idxs)//2]]
    data2.validation_data = data.validation_data[idxs[len(idxs)//2:]]
    data1.validation_labels = data.validation_labels[idxs[:len(idxs)//2]]
    data2.validation_labels = data.validation_labels[idxs[len(idxs)//2:]]

    idxs = list(range(len(data.test_data)))
    random.shuffle(idxs)
    data1.test_data = data.test_data[idxs[:len(idxs)//2]]
    data2.test_data = data.test_data[idxs[len(idxs)//2:]]
    data1.test_labels = data.test_labels[idxs[:len(idxs)//2]]
    data2.test_labels = data.test_labels[idxs[len(idxs)//2:]]

    train(Model, data1, 11, path+"_advtraining-left", num_epochs=num_epochs)
    train(Model, data2, 11, path+"_advtraining-right", num_epochs=num_epochs)
    #"""

    K.set_learning_phase(False)
    rmodel = Model(num_labels=11).model
    rmodel.load_weights(path + "_advtraining")
    if name == "cifar":
        rmodel = Wrap(rmodel, 32, 3, 11)
    else:
        rmodel = Wrap(rmodel, 28, 1, 11)

    rmodel1 = Model(num_labels=11).model
    rmodel1.load_weights(path + "_advtraining-left")
    if name == "cifar":
        rmodel1 = Wrap(rmodel1, 32, 3, 11)
    else:
        rmodel1 = Wrap(rmodel1, 28, 1, 11)

    rmodel2 = Model(num_labels=11).model
    rmodel2.load_weights(path + "_advtraining-right")
    if name == "cifar":
        rmodel2 = Wrap(rmodel2, 32, 3, 11)
    else:
        rmodel2 = Wrap(rmodel2, 28, 1, 11)
    rmodel2.model.summary()

    attack2 = CarliniL2(sess, rmodel, batch_size=100, max_iterations=2000,
                        confidence=.1, binary_search_steps=3, targeted=True,
                        initial_const=10, learning_rate=1e-2)

    #test_adv = np.load("tmp/outlieradvtest.npy")
    #print('qq', np.mean(rmodel.model.predict_classes(test_adv) == 10))

    N = 100
    targets = utils.get_labs(data.test_labels[:100])

    #"""
    test_adv = attack.attack(data.test_data[:N], targets)
    print('mean distortion',
          np.mean(np.sum((test_adv - data.test_data[:N])**2, axis=(1, 2, 3))**.5))
    print('model predict', np.argmax(model.model.predict(test_adv), axis=1))
    print('rmodel predict', np.argmax(rmodel.model.predict(test_adv), axis=1))
    #"""

    targets2 = np.zeros((N, 11))
    targets2[:, :10] = targets
    test_adv = attack2.attack(data.test_data[:N], targets2)
    print(list(test_adv[0].flatten()))
    print('mean distortion',
          np.mean(np.sum((test_adv - data.test_data[:N])**2, axis=(1, 2, 3))**.5))

    a = np.argmax(model.model.predict(test_adv), axis=1)
    #print(a)
    print('summary', np.mean(a == np.argmax(targets, axis=1)), np.mean(a == 10))
    a = np.argmax(rmodel.model.predict(test_adv), axis=1)
    #print(a)
    print('summary', np.mean(a == np.argmax(targets, axis=1)), np.mean(a == 10))
    a = np.argmax(rmodel1.model.predict(test_adv), axis=1)
    #print(a)
    print('summary', np.mean(a == np.argmax(targets, axis=1)), np.mean(a == 10))
    a = np.argmax(rmodel2.model.predict(test_adv), axis=1)
    #print(a)
    print('summary', np.mean(a == np.argmax(targets, axis=1)), np.mean(a == 10))
model.add(Conv2D(32, (2, 2), activation="relu", padding="same"))
model.add(Conv2D(128, (2, 2), activation="relu", padding="same"))
model.add(Conv2D(128, (1, 1), activation="relu", padding="same"))
model.add(Flatten())
model.add(Dense(10, activation="softmax"))
model.summary()

#model = model_mnist(input_image=Input(shape=(28, 28, 1)))
model.compile(loss="categorical_crossentropy", optimizer="adam",
              metrics=["acc"])
model.load_weights('./cifar100/mnist_cnn_G08.hdf5')

with tf.Session() as sess:
    data, model1 = MNIST_data(), MNISTModel()
    attack = CarliniL2(sess, model1, batch_size=100, max_iterations=1000,
                       confidence=0, boxmin=0, boxmax=1)
    inputs, targets = generate_data(data, samples=N, targeted=True,
                                    start=0, inception=False)
    print(targets)
    adv = attack.attack(inputs, targets)

    # Display the MNIST images
    W = 10  # number of images per row
    H = 10  # number of images per column
    fig = plt.figure(figsize=(H, W))
    fig.subplots_adjust(left=0, right=1, bottom=0, top=1.0,
                        hspace=0.05, wspace=0.05)
    for i in range(W * H):
        ax1 = fig.add_subplot(H, W, i + 1, xticks=[], yticks=[])
        ax1.imshow(x_test[i].reshape((28, 28)), cmap='gray')
    plt.savefig('./cifar100/mnist_x_test100_G08.jpg')
def run_kde(Data, Model, path):
    global DECONST
    sess = K.get_session()
    K.set_learning_phase(False)
    data, model = Data(), Model(path)
    model2 = Model(path)

    # TODO: hidden_layer -> selected layer
    layer_name = "activation_7"
    hidden_layer = pop_layer(model2.model, layer_name)
    #hidden_layer = pop(model2.model)  # once to remove dense(10)
    #hidden_layer = pop(hidden_layer)  # once to remove ReLU

    #compute_optimal_sigma(sess, model, hidden_layer, data)
    #MNIST SIGMA: 20

    removed_cols = []
    for i in range(10):
        removed_cols.extend(get_removed_cols(
            hidden_layer,
            data.train_data[np.argmax(data.train_labels, axis=1) == i]))
    removed_cols = list(set(removed_cols))

    de = [DensityEstimate(sess, hidden_layer,
                          data.train_data[np.argmax(data.train_labels, axis=1) == i],
                          model.image_size, model.num_channels, removed_cols,
                          sigma=0.864)
          for i in range(10)]
    #de2 = [DensityEstimate(sess, hidden_layer, data.train_data[np.argmax(data.train_labels, axis=1) == i], model.image_size, model.num_channels, sigma=0.864) for i in range(10)]
    de2 = de

    p = tf.placeholder(tf.float32, (None, model.image_size, model.image_size,
                                    model.num_channels))

    #print(np.log(de[0].predict(data.test_data[:10])))
    #print(sess.run(rmodel.predict(p)[1], {p: data.test_data[:10]}))
    #exit(0)

    N = 9
    #print(model.model.predict(data.train_data[:N]))
    #print(hidden_layer.predict(data.train_data[:N]))

    # take one test example from each class other than the target class,
    # each labeled with the target class
    adv_candid = []
    jumped = False
    adv_labels = np.zeros((9, 10))
    for i in range(0, 10):
        if i == TARGET_CLASS:
            jumped = True
            continue
        adv_candid.extend(data.test_data[np.argmax(data.test_labels, axis=1) == i][:1])
        if jumped:
            adv_labels[i - 1][TARGET_CLASS] = 1
        else:
            adv_labels[i][TARGET_CLASS] = 1
    adv_candid = np.array(adv_candid)

    #for i in range(10):
    #    for j in range(N):
    #        print(de[i].predict(data.train_data[j:j+1]))

    #start_density = estimate_density_full(model, de, data.test_data[M:M+N]) + 1e-30
    start_density = estimate_density_full(model, de, adv_candid) + 1e-30
    print("starting density", -np.log(start_density))
    #print("starting density", -start_density)
    DECONST = -np.log(start_density)
    #DECONST = -start_density
    DECONST = np.median(DECONST)
    #DECONST = 0
    print("DECONST", DECONST)
    #DECONST = -1

    l = np.zeros((N, 10))
    #l[np.arange(N), np.random.random_integers(0, 9, N)] = 1
    for i in range(N):
        r = np.random.random_integers(0, 9)
        while r == np.argmax(data.test_labels[i]):
            r = np.random.random_integers(0, 9)
        l[i, r] = 1
    l = adv_labels
    print(l)

    attack1 = CarliniL2(sess, model, batch_size=1, max_iterations=3000,
                        binary_search_steps=3, initial_const=1.0,
                        learning_rate=1e-1, targeted=True)
    attack2 = CarliniL2New(sess, model, batch_size=1, max_iterations=60000,
                           binary_search_steps=5, initial_const=1.0,
                           learning_rate=1e-2, targeted=True,
                           extra_loss=extra_loss(de2, TARGET_CLASS),
                           debug_extra_loss=debug_extra_loss(de2, TARGET_CLASS),
                           de=de2)

    #l = data.test_labels[:N]
    #l = np.zeros((N, 10))
    #l[np.arange(N), 1] = 1

    print("RUN PHASE 1")
    #adv = attack1.attack(data.test_data[M:M+N], l)
    adv = attack1.attack(adv_candid, l)
    #print('mean distortion', np.mean(np.sum((adv-data.test_data[M:M+N])**2, axis=(1, 2, 3))**.5))
    print('mean distortion',
          np.mean(np.sum((adv - adv_candid)**2, axis=(1, 2, 3))**.5))

    print("RUN PHASE 2")
    #adv = attack2.attack(data.test_data[M:M+N], adv, l)
    adv = attack2.attack(adv_candid, adv, l)
    #np.save("/tmp/q"+str(M), adv)
    np.save("./adv/adv_mnist_cnw_target_{}".format(TARGET_CLASS), adv)
    #adv = np.load("/tmp/qq.npy")

    #print('labels', np.mean(np.argmax(sess.run(model.predict(p), {p: adv}), axis=1) == l))
    print('labels')
    print(np.argmax(l, axis=1))
    print(np.argmax(sess.run(model.predict(p), {p: adv}), axis=1))
    print(np.argmax(model.model.predict(adv), axis=1))
    #print('mean distortion', np.mean(np.sum((adv-data.test_data[M:M+N])**2, axis=(1, 2, 3))**.5))
    print('mean distortion',
          np.mean(np.sum((adv - adv_candid)**2, axis=(1, 2, 3))**.5))

    #a = estimate_density_full(model, de, data.test_data[M:M+N]) + 1e-30
    a = estimate_density_full(model, de, adv_candid) + 1e-30
    b = estimate_density_full(model, de, adv) + 1e-30
    #print(data.test_data.shape)
    #print(adv.shape)
    show(adv)
    print('de of test', np.mean(-np.log(a)))
    print('de of adv', np.mean(-np.log(b)))
    print('better ratio', np.mean(np.array(a) > np.array(b)))
    exit(0)

    #density = gaussian_kde(np.array(np.log(a)) - np.array(np.log(b)))
    #density_a = gaussian_kde(np.log(a))
    #density_b = gaussian_kde(np.log(b))
    xs = np.linspace(-25, 25, 200)
    fig = plt.figure(figsize=(4, 3))
    fig.subplots_adjust(bottom=0.17, left=.15, right=.85)
    plt.xlabel('log(KDE(valid))-log(KDE(adversarial))')
    plt.ylabel('Occurrences')
    #plt.hist(np.log(a), 100)
    #plt.hist(np.log(b), 100)
    plt.hist(np.log(a) - np.log(b), 100)
    #plt.hist(np.array(np.log(a)) - np.array(np.log(b)), 100)
    #a = plt.plot(xs, density_a(xs), 'r--', color='blue', label='Valid')
    #b = plt.plot(xs, density_b(xs), color='red', label='Adversarial')
    #plt.plot(xs, density(xs))
    #plt.legend(handles=[a[0], b[0]])
    pp = PdfPages('/tmp/a.pdf')
    plt.savefig(pp, format='pdf')
    pp.close()
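# A minimal sketch of the per-class density DensityEstimate is assumed to
# model: a Gaussian KDE over hidden-layer features, evaluated in log space
# for numerical stability. `query_feats`/`train_feats` are hypothetical
# stand-ins for the hidden_layer activations; normalization constants are
# dropped since only relative densities matter to the detector.
from scipy.special import logsumexp

def kde_log_density(query_feats, train_feats, sigma):
    # squared distances from each query feature to every training feature
    d2 = ((query_feats[:, None, :] - train_feats[None, :, :])**2).sum(-1)
    # log of the mean Gaussian kernel value
    return logsumexp(-d2 / (2 * sigma**2), axis=1) - np.log(len(train_feats))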
def run_nn_detection(Data, path):
    data = Data()
    sess = K.get_session()
    K.set_learning_phase(False)

    model_with_detector = ResnetBuilder.build_resnet_32((3, 32, 32), 10,
                                                        with_detector=2,
                                                        activation=False)
    model_with_detector.save_weights("/tmp/q")
    model_with_detector.load_weights("models/cifar-layerdetect-37-0")

    N = 10  # len(data.test_data)//100

    """
    # uncomment to generate adversarial testing data
    model = ResnetBuilder.build_resnet_32((3, 32, 32), 10, activation=False)
    model.load_weights("models/cifar-resnet")
    model = Wrap(model)
    #attack = FGS(sess, model)
    attack = CarliniL2(sess, model, batch_size=100, binary_search_steps=3,
                       initial_const=0.1, max_iterations=3000,
                       learning_rate=0.005, confidence=0, targeted=False)

    for i in range(0, N, 1000):
        test_adv = attack.attack(data.test_data[i:i+100], data.test_labels[i:i+100])
        np.save("tmp/testadv"+path.split("/")[1]+str(i), test_adv)
    #"""

    test_adv = []
    for i in range(0, N, 1000):
        test_adv.extend(np.load("tmp/testadv" + path.split("/")[1] + str(i) + ".npy"))
    test_adv = np.array(test_adv)

    print('Accuracy of model on test set',
          np.mean(np.argmax(model_with_detector.predict(data.test_data)[0], axis=1)
                  == np.argmax(data.test_labels, axis=1)))
    print('Accuracy of model on adversarial data',
          np.mean(np.argmax(model_with_detector.predict(test_adv)[0], axis=1)
                  == np.argmax(data.test_labels, axis=1)))
    print('Probability detects valid data as valid',
          np.mean(model_with_detector.predict(data.test_data)[1] <= 0))
    print('Probability detects adversarial data as adversarial',
          np.mean(model_with_detector.predict(test_adv)[1] > 0))

    xs = tf.placeholder(tf.float32, [None, 32, 32, 3])
    rmodel = RobustModel(model_with_detector)
    preds = rmodel.predict(xs)

    y1 = np.argmax(sess.run(preds, {xs: data.test_data[:N]}), axis=1)
    print('Robust model accuracy on test data',
          np.mean(y1 == np.argmax(data.test_labels[:N], axis=1)))
    print('Probability robust model detects valid data as adversarial',
          np.mean(y1 == 10))

    y2 = np.argmax(sess.run(preds, {xs: test_adv}), axis=1)
    print('Probability robust model detects adversarial data as adversarial',
          np.mean(y2 == 10))

    attack = CarliniL2(sess, rmodel, batch_size=10, binary_search_steps=3,
                       initial_const=0.1, max_iterations=300,
                       learning_rate=0.01, confidence=0, targeted=True)

    targets = np.argmax(model_with_detector.predict(test_adv[:N])[0], axis=1)
    realtargets = np.zeros((N, 11))
    realtargets[np.arange(N), targets] = 1

    np.save("tmp/adaptiveattack", attack.attack(data.test_data[:N], realtargets))
    adv = np.load("tmp/adaptiveattack.npy")

    print('Accuracy on adversarial data',
          np.mean(np.argmax(model_with_detector.predict(adv)[0], axis=1)
                  == np.argmax(data.test_labels, axis=1)))
    print('Probability detector detects adversarial data as adversarial',
          np.mean(model_with_detector.predict(adv)[1] > 0))

    d = np.sum((adv - data.test_data[:N])**2, axis=(1, 2, 3))**.5
    print("mean distortion attacking robust model", np.mean(d))
    d = np.sum((test_adv[:N] - data.test_data[:N])**2, axis=(1, 2, 3))**.5
    print("mean distortion attacking unsecured model", np.mean(d))

    model_with_detector_2 = ResnetBuilder.build_resnet_32((3, 32, 32), 10,
                                                          with_detector=2,
                                                          activation=False)
    model_with_detector_2.load_weights("models/cifar-layerdetect-42-0")
    print('Accuracy on adversarial data',
          np.mean(np.argmax(model_with_detector_2.predict(adv)[0], axis=1)
                  == np.argmax(data.test_labels, axis=1)))
    print('Probability detector detects adversarial data as adversarial',
          np.mean(model_with_detector_2.predict(adv)[1] > 0))
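# A hedged sketch of how RobustModel is assumed to fold the detector into a
# single 11-class model so the attack above can target it: the detector
# becomes class 10, with a logit scaled so it dominates exactly when the
# detector fires. The exact combination rule here is an assumption, not
# necessarily what RobustModel implements.
def robust_logits_sketch(z, d):
    # z: (N, 10) classifier logits; d: (N,) detector logits (>0 = adversarial)
    extra = (d + 1.0) * np.max(z, axis=1)
    return np.concatenate([z, extra[:, None]], axis=1)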
    generated_img)  # preprocess image

enc_gen, enc_gen_layers = stn.encoder.encode(generated_img)

if data_set == "cifar10":
    classifier = Model("eval", raw_cifar.train_images)
    classifier._build_model(adv_img, label, reuse=False, conf=0.1)
    adv_loss = -classifier.target_loss
    adv_acc = classifier.accuracy
    adv_acc_y = tf.cast(classifier.correct_prediction, tf.float32)

    classifier._build_model(content, label, reuse=True)
    normal_loss = -classifier.target_loss
    norm_acc = classifier.accuracy
    logits = classifier.pre_softmax

    pgd_attack = LinfPGDAttack(classifier.xent, content, label,
                               epsilon=0.25 * 255, num_steps=200,
                               step_size=0.05 * 255, random_start=True)
    CarliniL2.pgd_attack()
elif data_set == "imagenet":
    classifier = build_imagenet_model(adv_img_bgr, label, conf=0.1,
                                      shrink_class=shrink_class)
    adv_loss = -classifier.target_loss
    adv_acc = classifier.accuracy
    adv_acc_y = classifier.acc_y
    adv_acc_y_5 = classifier.acc_y_5
    #logits = classifier.logits
    content_bgr = tf.reverse(content, axis=[-1])  # switch RGB to BGR
    classifier = build_imagenet_model(content_bgr, label, reuse=True,
                                      shrink_class=shrink_class)
def test(Model, data, path):
    keras.backend.set_learning_phase(False)
    model = make_model(Model, dropout=False)
    model.load_weights(path)
    modeld = make_model(Model, dropout=True)
    modeld.load_weights(path)

    guess = model.predict(data.test_data)
    print(guess[:10])
    print('Accuracy without dropout',
          np.mean(np.argmax(guess, axis=1) == np.argmax(data.test_labels, axis=1)))
    guess = modeld.predict(data.test_data)
    print('Accuracy with dropout',
          np.mean(np.argmax(guess, axis=1) == np.argmax(data.test_labels, axis=1)))

    sess = keras.backend.get_session()

    N = 10
    labs = get_labs(data.test_data[:N])
    print(labs)
    print('good?', np.sum(labs * data.test_labels[:N]))

    attack = CarliniL2(sess, Wrap(model), batch_size=N, max_iterations=1000,
                       binary_search_steps=3, learning_rate=1e-1,
                       initial_const=1, targeted=True, confidence=0)
    adv = attack.attack(data.test_data[:N], labs)
    guess = model.predict(adv)
    print('average distortion',
          np.mean(np.sum((data.test_data[:N] - adv)**2, axis=(1, 2, 3))**.5))
    print(guess[:10])

    print("Test data")
    valid_u = compute_u(sess, modeld, data.test_data[:N])
    print("Adversarial examples")
    compute_u(sess, modeld, adv)

    # The below attack may not even be necessary for CIFAR:
    # the adversarial examples generated with (3, 1000, 1e-1) have a lower mean
    # uncertainty than the test images, but again with a 3x increase in distortion.
    if ISMNIST:
        p = tf.placeholder(tf.float32, (None, 28, 28, 1))
    else:
        p = tf.placeholder(tf.float32, (None, 32, 32, 3))
    r = differentable_u(modeld, p, 100)

    models = []
    for _ in range(20):
        m = make_model(Model, dropout=True, fixed=True)
        m.load_weights(path)
        models.append(m)

    #r2 = differentable_u_multiple(models, p)
    #print('uncertainty on test data', np.mean((sess.run(r, {p: data.test_data[:N]}))))
    #print('uncertainty on test data (multiple models)', np.mean((sess.run(r2, {p: data.test_data[:N]}))))
    #print('labels on robust model', np.argmax(sess.run(robustmodel.predict(p), {p: data.test_data[:100]}), axis=1))

    attack = CarliniL2Multiple(sess, [Wrap(m) for m in models], batch_size=10,
                               binary_search_steps=4, initial_const=1,
                               max_iterations=1000, confidence=1, targeted=True,
                               abort_early=False, learning_rate=1e-1)

    #z = np.zeros((N, 10))
    #z[np.arange(N), np.random.random_integers(0, 9, N)] = 1
    #z[np.arange(N), (9, 3, 0, 8, 7, 3, 4, 1, 6, 4)] = 1
    print(labs)

    #qq = (3, 2, 1, 18, 4, 8, 11, 0, 61, 7)
    #np.save("images/mnist_dropout", attack.attack(data.test_data[qq,:,:,:],
    #        np.pad(np.roll(data.test_labels[qq,:],1,axis=1), [(0, 0), (0, 0)], 'constant')))
    #exit(0)

    adv = attack.attack(data.test_data[:N], labs)
    #adv = attack.attack(data.test_data[:N], data.test_labels[:N])
    np.save("/tmp/dropout_adv_" + str(ISMNIST), adv)
    #adv = np.load("/tmp/qq.npy")

    guess = model.predict(adv)
    print('normal predictions', guess)
    print('average distortion',
          np.mean(np.sum((data.test_data[:N] - adv)**2, axis=(1, 2, 3))**.5))
    print('normal label predictions', np.argmax(guess, axis=1))
    for m in models:
        print('model preds', np.argmax(m.predict(adv), axis=1))
    print('Model accuracy on adversarial examples',
          np.mean(np.argmax(guess, axis=1) == np.argmax(data.test_labels[:N], axis=1)))

    adv_u = compute_u(sess, modeld, adv)
    #print('differentiable uncertainty', np.mean((sess.run(r, {p: adv}))))
    print('Targeted adversarial examples success rate',
          np.mean(np.argmax(guess, axis=1) == np.argmax(labs, axis=1)))

    import matplotlib
    import matplotlib.pyplot as plt
    from matplotlib.backends.backend_pdf import PdfPages

    """
    fig = plt.figure(figsize=(4, 3))
    fig.subplots_adjust(bottom=0.15, left=.15)
    a = plt.hist(adv_u, 100, log=True, label="Adversarial (FGS)")
    b = plt.hist(valid_u, 100, log=True, label="Valid")
    plt.xlabel('Uncertainty')
    plt.ylabel('Occurrences (log scaled)')
    plt.legend()
    """
    fig = plt.figure(figsize=(4, 3))
    fig.subplots_adjust(bottom=0.15, left=.15)
    b = plt.hist(valid_u - adv_u, 100, label="Valid")
    plt.xlabel('U(valid)-U(adversarial)')
    plt.ylabel('Occurrences')
    pp = PdfPages('/tmp/a.pdf')
    plt.savefig(pp, format='pdf')
    pp.close()
    plt.show()
def run_kde(Data, Model, path):
    global DECONST
    sess = K.get_session()
    K.set_learning_phase(False)
    data, model = Data(), Model(path)
    model2 = Model(path)

    hidden_layer = pop(model2.model)  # once to remove dense(10)
    hidden_layer = pop(hidden_layer)  # once to remove ReLU

    #compute_optimal_sigma(sess, model, hidden_layer, data)
    #MNIST SIGMA: 20

    de = [DensityEstimate(sess, hidden_layer,
                          data.train_data[np.argmax(data.train_labels, axis=1) == i],
                          model.image_size, model.num_channels, sigma=20)
          for i in range(10)]
    de2 = [DensityEstimate(sess, hidden_layer,
                           data.train_data[np.argmax(data.train_labels, axis=1) == i][:100],
                           model.image_size, model.num_channels, sigma=20)
           for i in range(10)]

    p = tf.placeholder(tf.float32, (None, model.image_size, model.image_size,
                                    model.num_channels))

    #print(np.log(de[0].predict(data.test_data[:10])))
    #print(sess.run(rmodel.predict(p)[1], {p: data.test_data[:10]}))
    #exit(0)

    N = 1
    print(model.model.predict(data.train_data[:N]))
    print(hidden_layer.predict(data.train_data[:N]))
    for i in range(10):
        print(de[i].predict(data.train_data[:N]))

    start_density = estimate_density_full(model, de, data.test_data[M:M + N]) + 1e-30
    print("starting density", np.log(start_density))
    DECONST = -np.log(start_density)

    l = np.zeros((N, 10))
    #l[np.arange(N), np.random.random_integers(0, 9, N)] = 1
    for i in range(N):
        r = np.random.random_integers(0, 9)
        while r == np.argmax(data.test_labels[i]):
            r = np.random.random_integers(0, 9)
        l[i, r] = 1

    attack1 = CarliniL2(sess, model, batch_size=1, max_iterations=3000,
                        binary_search_steps=3, initial_const=1.0,
                        learning_rate=1e-1, targeted=True)
    attack2 = CarliniL2New(sess, model, batch_size=1, max_iterations=10000,
                           binary_search_steps=5, initial_const=1.0,
                           learning_rate=1e-2, targeted=True,
                           extra_loss=extra_loss(de2, np.argmax(l)))

    #l = data.test_labels[:N]
    #l = np.zeros((N, 10))
    #l[np.arange(N), 1] = 1

    print("RUN PHASE 1")
    adv = attack1.attack(data.test_data[M:M + N], l)
    print('mean distortion',
          np.mean(np.sum((adv - data.test_data[M:M + N])**2, axis=(1, 2, 3))**.5))

    print("RUN PHASE 2")
    adv = attack2.attack(data.test_data[M:M + N], adv, l)
    np.save("/tmp/q" + str(M), adv)
    #adv = np.load("/tmp/qq.npy")

    print('labels',
          np.mean(np.argmax(sess.run(model.predict(p), {p: adv}), axis=1) == l))
    print('mean distortion',
          np.mean(np.sum((adv - data.test_data[M:M + N])**2, axis=(1, 2, 3))**.5))

    a = estimate_density_full(model, de, data.test_data[M:M + N]) + 1e-30
    b = estimate_density_full(model, de, adv) + 1e-30
    show(adv)
    print('de of test', np.mean(np.log(a)))
    print('de of adv', np.mean(np.log(b)))
    print('better ratio', np.mean(np.array(a) > np.array(b)))
    exit(0)

    #density = gaussian_kde(np.array(np.log(a)) - np.array(np.log(b)))
    #density_a = gaussian_kde(np.log(a))
    #density_b = gaussian_kde(np.log(b))
    xs = np.linspace(-25, 25, 200)
    fig = plt.figure(figsize=(4, 3))
    fig.subplots_adjust(bottom=0.17, left=.15, right=.85)
    plt.xlabel('log(KDE(valid))-log(KDE(adversarial))')
    plt.ylabel('Occurrences')
    #plt.hist(np.log(a), 100)
    #plt.hist(np.log(b), 100)
    plt.hist(np.log(a) - np.log(b), 100)
    #plt.hist(np.array(np.log(a)) - np.array(np.log(b)), 100)
    #a = plt.plot(xs, density_a(xs), 'r--', color='blue', label='Valid')
    #b = plt.plot(xs, density_b(xs), color='red', label='Adversarial')
    #plt.plot(xs, density(xs))
    #plt.legend(handles=[a[0], b[0]])
    pp = PdfPages('/tmp/a.pdf')
    plt.savefig(pp, format='pdf')
    pp.close()
    plt.show()