def attackWithDefense(datasetName = "spambase", seed = 18, OUTPUT_FOLDER=None, PLOT_FOLDER=None):
    """Simulate a multi-day poisoning attack against a distance-based filter defense.

    Each simulated day:
      1. the defender clusters the current training set plus the day's
         incoming (untrusted) batch and filters incoming points whose
         "Curie" average distance exceeds the threshold ``theta``;
      2. the attacker appends label-flipped copies of incoming points and
         (when a projection function is available) optimizes them with
         ``iterative_attack`` so they survive the filter;
      3. accuracies with and without sanitization are recorded, and the
         resulting (possibly poisoned) training set becomes the next day's
         starting set, after which ``theta`` is re-estimated.

    Parameters:
        datasetName: "spambase", "mnist17", or "mnist01" — selects the loader
            and a pre-tuned clustering ``eps``.
            NOTE(review): the "mnist01" branch never assigns ``eps``, and an
            unrecognized name leaves X_train/Y_train unbound — both would
            raise NameError below; confirm intended handling.
        seed: random_state for both train/validation and clean/untrusted splits.
        OUTPUT_FOLDER: root dir for attack-step artifacts (joined with 'ddd').
        PLOT_FOLDER: directory receiving the clustering plots.

    Returns:
        (filtList, thetaList, accList, undefList, datasetList_X, datasetList_Y,
         removeList, base_acc, attackList): per-day bookkeeping — day index,
        theta used, defended accuracy, undefended accuracy, evolving training
        sets, count of filtered points, clean-model baseline accuracy, and
        "Y"/"N" per day for whether the optimized attack ran.
    """
    validation_size = 0.3
    train_split_size = 0.7
    if datasetName == "spambase":
        #load data
        dataset = load_dataset.load_dataset_spambase()
        data_size = dataset.shape[0]
        # Prepare data: first 57 columns are features, last column is the class label
        array = dataset.values
        X = array[:,0:57]
        Y = array[:,57]
        # Split-out validation dataset
        X_train, X_validation, Y_train, Y_validation \
            = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)
        #find median/centroid per class
        median = dataset.groupby('class').median()
        #pre-determined value using plotEps(), used when clustering
        eps = 60
    elif datasetName =="mnist17":
        X_train, Y_train, X_validation, Y_validation = load_dataset.load_dataset_mnist17(sampling=True)
        #find median/centroid of each class (labels are -1 / +1)
        median_neg = np.median(X_train[Y_train == -1],axis =0)
        median_pos = np.median(X_train[Y_train == 1], axis = 0)
        median = pandas.DataFrame(data = [median_neg, median_pos], index = [-1,1])
        print(median_pos.shape)
        #pre-determined value using plotEps(), used when clustering
        eps = 40
    elif datasetName =="mnist01":
        X_train, Y_train, X_validation, Y_validation = load_dataset.load_dataset_mnist01(sampling=True)
        #find median/centroid of each class
        # NOTE(review): unlike the other branches, no ``eps`` is set here —
        # clusterData() below will hit a NameError; confirm the intended value.
        median_neg = np.median(X_train[Y_train == -1],axis =0)
        median_pos = np.median(X_train[Y_train == 1], axis = 0)
        median = pandas.DataFrame(data = [median_neg, median_pos], index = [-1,1])
        print(median_pos.shape)
    print("Data size:"+str(X_train.shape[0])+","+str(X_validation.shape[0]))
    train_size = X_train.shape[0]
    print(train_size)
    # Number of simulated days; the untrusted pool is split into one batch per day.
    days = 1
    incoming_sets_X = []
    incoming_sets_Y = []
    #separate data into one large initial clean set, and 10 incoming dataset
    #train_1: clean data, train_2: untrusted data
    X_train_1, X_train_2, Y_train_1, Y_train_2 \
        = model_selection.train_test_split(X_train, Y_train, test_size=train_split_size, random_state=seed)
    num_split = math.floor(X_train_2.shape[0]/10)
    # First days-1 batches get num_split points each (no-op when days == 1).
    for d in range(days-1):
        incoming_sets_X.append(X_train_2[d*num_split: (d+1)*num_split,:])
        incoming_sets_Y.append(Y_train_2[d*num_split: (d+1)*num_split])
    #append the remaining points into the last set
    incoming_sets_X.append(X_train_2[(days-1)*num_split:X_train_2.shape[0],:])
    incoming_sets_Y.append(Y_train_2[(days-1)*num_split:X_train_2.shape[0]])
    ##with tf.Session() as sess:
    ##    Y_train = tf.one_hot(Y_train,2).eval()
    ##    Y_validation = tf.one_hot(Y_validation,2).eval()
    ##print(X_validation.shape)
    ##print(Y_validation.shape)
    #output directory for attack steps
    output_root = os.path.join(OUTPUT_FOLDER, 'ddd')
    poison_percentage = [0.05,0.1,0.15,0.2]
    step_size = 0.01
    #filter values: quantile-style parameter fed to getTheta()
    filt = 0.073
    omegaContribPercentage = 0.8
    # Per-day bookkeeping accumulated across the loop and returned at the end.
    filtList = []
    thetaList = []
    accList = []
    undefList = []
    datasetList_X = []
    datasetList_Y = []
    removeList = []
    attackList=[]
    base_acc = 0
    # Total poison budget: 10% of the full training size (poison_percentage[1]).
    num_points_total = round(train_size * poison_percentage[1])
    #determine value of theta using the clean dataset (without removing any points at this stage)
    #cluster the data first
    clustering = clusterData(X_train_1, Y_train_1, eps, plotName = PLOT_FOLDER + "/" + datasetName + "_orignal.png")
    clusters = clustering.labels_
    #compute omega
    omega = computeOmega(X_train_1, Y_train_1, clusters, omegaContribPercentage)
    all_average_dist = computeCurieDistance(X_train_1, Y_train_1, clusters, omega )
    #theta is a single value for the entire dataset (instead of 1 per class)
    theta = getTheta(X_train_1,Y_train_1,all_average_dist, filt)
    print("Value of theta chosen to be: "+str(theta))
    #num_points = 1
    print("Number of Total Poisoning points: "+str(num_points_total))
    #each day, the attacker injects (total/day) amount of malicious points (up to 20% at the end)
    num_points_each_day = int(num_points_total/days)
    #initialize X/Y_train_new, for iterations
    X_train_new = X_train_1
    Y_train_new = Y_train_1
    for i in range(days):
        label_flip = True
        #prepare dataset for model; test and validation both use the held-out split
        train = DataSet(X_train_new, Y_train_new)
        test = DataSet(X_validation, Y_validation)
        validation = DataSet(X_validation, Y_validation)
        data_sets = base.Datasets(train = train, validation = validation, test = test)
        #dataset changes every day
        model = SmoothHinge(
            input_dim=X_train.shape[1],
            temp=0,
            weight_decay=0.01,
            use_bias=True,
            num_classes=2,
            batch_size=X_train_new.shape[0],
            data_sets=data_sets,
            initial_learning_rate=0.001,
            decay_epochs=None,
            mini_batch=False,
            train_dir=output_root,
            log_dir='log',
            model_name='my_model')
        model.train()
        # Record the clean baseline accuracy only once (first day).
        if base_acc ==0:
            base_acc = model.get_test_accuracy()
        #the defender starts by filtering
        #attacker attacks with entire dataset
        X_clean = np.append(X_train_new, incoming_sets_X[i], axis = 0)
        Y_clean = np.append(Y_train_new, incoming_sets_Y[i])
        clustering = clusterData(X_clean, Y_clean, eps, plotName = PLOT_FOLDER + "/" + datasetName + "_ahead_filtered_day"+str(i)+".png")
        pre_clusters = clustering.labels_
        all_average_dist = computeCurieDistance(X_clean, Y_clean, pre_clusters, omega )
        #only filter data from the incoming set (rows after the trusted prefix)
        X_def,Y_def,indexKept, idx_removed = fixedFilter(X_clean[X_train_new.shape[0]:,:], Y_clean[X_train_new.shape[0]:], all_average_dist[X_train_new.shape[0]:], theta)
        removeList.append(len(idx_removed))
        # Re-attach the trusted prefix to the surviving incoming points.
        X_def = np.append(X_train_new, X_def, axis = 0)
        Y_def = np.append(Y_train_new, Y_def)
        print(X_def.shape)
        print(Y_def.shape)
        #attacker use the predicted filter result to optimize his attack
        start_poison = True
        if start_poison == True:
            #injects positive class datapoints(invert the label in next step)
            target_class = 1
            X_modified, Y_modified =copy_random_points(
                incoming_sets_X[i], incoming_sets_Y[i],
                target_class=-1,
                num_copies=num_points_each_day,
                random_seed=seed,
                replace=True)
            ##X_modified, Y_modified = copy_random_points(
            ##    X_modified, Y_modified,
            ##    target_class=-1,
            ##    num_copies=num_neg_copies,
            ##    random_seed=random_seed,
            ##    replace=True)
            # Flip the labels of the appended copies (rows beyond the original batch).
            if label_flip:
                Y_modified[incoming_sets_X[i].shape[0]:] = -Y_modified[incoming_sets_X[i].shape[0]:]
            #print(X_modified)
            #attacker attacks with entire dataset
            # NOTE(review): the copies are sliced off with X_train_2.shape[0],
            # which equals the incoming-batch size only when days == 1 — confirm
            # before raising ``days``.
            X_complete = np.append(X_def, X_modified[X_train_2.shape[0]:], axis = 0)
            Y_complete = np.append(Y_def, Y_modified[X_train_2.shape[0]:])
            model.update_train_x_y(X_complete, Y_complete)
            model.train()
        #acquire projection rules for attack
        # (uses the previous day's clusters/theta, i.e. what the attacker predicts
        # the defender will apply)
        projection_fn = get_projection_fn(
            X_train_new, Y_train_new,
            clusters = clusters,
            distance = all_average_dist,
            sphere=True,
            slab=False,
            omega=omega,
            theta = theta,
            target_class = target_class)
        if projection_fn is not None:
            #perform the attack: only the appended poison rows are optimized
            min_acc, min_X = iterative_attack.iterative_attack(
                model,
                indices_to_poison=np.arange(X_def.shape[0], X_complete.shape[0]),
                test_idx=None,
                test_description=None,
                step_size=step_size,
                num_iter=2000,
                loss_type='normal_loss',
                projection_fn=projection_fn,
                output_root=output_root)
            attackList.append("Y")
            #before proceeding, measure accuracy without sanitization
            X_no_def = np.append(X_clean, min_X[X_def.shape[0]:,:], axis = 0)
            Y_no_def = np.append(Y_clean, Y_complete[X_def.shape[0]:])
        else:
            # No projection available: fall back to the filtered (unpoisoned) set.
            min_X = X_def
            Y_complete = Y_def
            model.update_train_x_y(X_def, Y_def)
            model.train()
            min_acc = model.get_test_accuracy()
            attackList.append("N")
            #before proceeding, measure accuracy without sanitization
            X_no_def = X_clean
            Y_no_def = Y_clean
        #print(model.data_sets.train.x)
        print(min_X.shape)
        # Undefended accuracy: retrain on unfiltered data plus poison.
        model.update_train_x_y(X_no_def, Y_no_def)
        model.train()
        acc = model.get_test_accuracy()
        #prepare dataset for next day (poisoned set carries over)
        X_train_new = min_X
        Y_train_new = Y_complete
        #record accuracy into accList
        filtList.append(i)
        thetaList.append(theta)
        accList.append(min_acc)
        undefList.append(acc)
        datasetList_X.append(X_train_new)
        datasetList_Y.append(Y_train_new)
        #increment counter, reset model
        #filt = filt+0.1
        tf.reset_default_graph()
        #re-cluster and compute distance for the next day's filter
        #cluster the data first
        clustering = clusterData(X_train_new, Y_train_new, eps, plotName = PLOT_FOLDER + "/" + datasetName + "_poisoned_day"+str(i)+".png")
        clusters = clustering.labels_
        all_average_dist = computeCurieDistance(X_train_new, Y_train_new, clusters, omega )
        #theta is a single value for the entire dataset (instead of 1 per class)
        theta = getTheta(X_train_new,Y_train_new,all_average_dist, filt)
        print("Value of theta chosen to be: "+str(theta))
    #print result to the console for now
    print(filtList)
    print(thetaList)
    print(undefList)
    print(accList)
    return filtList, thetaList, accList, undefList, datasetList_X, datasetList_Y, removeList, base_acc, attackList
def LBDefense(datasetName = "spambase", seed = 18, OUTPUT_FOLDER=None):
    """Evaluate a loss-based (LB) sanitization defense against an optimized poisoning attack.

    Pipeline: train a clean baseline; craft poison points (label-flipped copies
    of untrusted data) optimized with a loss-constrained projection so each
    poison point's loss under the clean model stays below ``defense_filt``;
    then defend by discarding all untrusted points whose loss exceeds
    ``defense_filt + epsilon`` and retraining.

    Parameters:
        datasetName: "spambase" or "mnist17" — selects the loader/split.
            NOTE(review): any other value leaves X_train etc. unbound (NameError).
        seed: random_state for the dataset splits.
        OUTPUT_FOLDER: root directory for attack-step artifacts (joined with 'ddd').

    Returns:
        (filtList, noDefList, accList, base_acc, lb_remove_list):
        thresholds tried, accuracy without defense, accuracy with defense,
        clean baseline accuracy, and per-threshold counts related to removed
        genuine points (see NOTE below on how this count is computed).
    """
    train_split_size = 0.7
    validation_size = 0.30
    if datasetName == "spambase":
        #load data
        dataset = load_dataset.load_dataset_spambase()
        data_size = dataset.shape[0]
        # Prepare data: 57 feature columns, final column is the class label
        array = dataset.values
        X = array[:,0:57]
        Y = array[:,57]
        X_train, X_validation, Y_train, Y_validation \
            = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)
        #
        #train_1: clean data, train_2: untrusted data
        X_train_1, X_train_2, Y_train_1, Y_train_2 \
            = model_selection.train_test_split(X_train, Y_train, test_size=train_split_size, random_state=seed)
        median = dataset.groupby('class').median()
    elif datasetName =="mnist17":
        X_train, Y_train, X_validation, Y_validation = load_dataset.load_dataset_mnist17(sampling=True)
        #train_1: clean data, train_2: untrusted data
        X_train_1, X_train_2, Y_train_1, Y_train_2 \
            = model_selection.train_test_split(X_train, Y_train, test_size=train_split_size, random_state=seed)
        #find median/centroid of each class (labels are -1 / +1)
        median_neg = np.median(X_train[Y_train == -1],axis =0)
        median_pos = np.median(X_train[Y_train == 1], axis = 0)
        median = pandas.DataFrame(data = [median_neg, median_pos], index = [-1,1])
        print(median_pos.shape)
    print("Data size:"+str(X_train.shape[0])+","+str(X_validation.shape[0]))
    train_size = X_train.shape[0]
    print(train_size)
    ##with tf.Session() as sess:
    ##    Y_train = tf.one_hot(Y_train,2).eval()
    ##    Y_validation = tf.one_hot(Y_validation,2).eval()
    ##print(X_validation.shape)
    ##print(Y_validation.shape)
    #prepare dataset for model
    # "complete" datasets use all training data; the plain ones use only the clean subset
    train = DataSet(X_train_1, Y_train_1)
    test = DataSet(X_train_1, Y_train_1)
    complete_train = DataSet(X_train, Y_train)
    complete_test = DataSet(X_train, Y_train)
    validation = DataSet(X_validation, Y_validation)
    data_sets = base.Datasets(train = train, validation = validation, test = validation)
    complete_datasets = base.Datasets(train = complete_train, validation = complete_test, test = validation)
    #the output directory of attack steps
    output_root = os.path.join(OUTPUT_FOLDER, 'ddd')
    poison_percentage = [0.05,0.1,0.15,0.2]
    step_size = 0.01
    #filter values
    # NOTE(review): ``filt`` is incremented at the end of each iteration but is
    # never read in this function — looks vestigial.
    filt = 0.0
    #defense_filts = [50,30,10,5,3,1, 0.1]
    defense_filts = [30,10,5,3,1, 0.5]
    #defense_filts = [0.5]
    # Slack added to the attacker's loss bound when the defender filters.
    epsilon = 0.1
    filtList = []
    accList = []
    noDefList = []
    poisonLossList = []
    lb_remove_list=[]
    # Poison budget: 20% of the full training size (poison_percentage[3]).
    num_points = round(train_size * poison_percentage[3])
    #num_points = 1
    print("Number of Poisoning points: "+str(num_points))
    # NOTE(review): only the first threshold in defense_filts is exercised
    # (range(1)); widen to range(len(defense_filts)) to sweep all of them.
    for i in range(1):
        defense_filt=defense_filts[i]
        #place for loop later
        label_flip = True
        model = SmoothHinge(
            input_dim=X_train.shape[1],
            temp=0,
            weight_decay=0.01,
            use_bias=True,
            num_classes=2,
            batch_size=train_size,
            data_sets=complete_datasets,
            initial_learning_rate=0.001,
            decay_epochs=None,
            mini_batch=False,
            train_dir=output_root,
            log_dir='log',
            model_name='my_model')
        model.train()
        # Baseline accuracy on the full (clean) training data.
        base_acc = model.get_test_accuracy()
        #getLoss(model,model.data_sets.test, 10)
        #print(Y_validation.shape)
        print(X_train.shape[0])
        # Retrain on the trusted subset only; its weights drive the attacker's
        # loss-based projection below.
        model.update_train_x_y(X_train_1, Y_train_1)
        model.train()
        #with tf.variable_scope('softmax_linear'):
        weights = np.float32(model.getWeight())
        print(weights)
        start_poison = True
        if start_poison == True:
            #injects positive class data (invert the label in next step)
            X_modified, Y_modified =copy_random_points(
                X_train_2, Y_train_2,
                target_class=-1,
                num_copies=num_points,
                random_seed=seed,
                replace=True)
            # Flip the labels of the appended copies (rows beyond X_train_2).
            if label_flip:
                Y_modified[X_train_2.shape[0]:] = -Y_modified[X_train_2.shape[0]:]
            #attacker attacks with entire dataset
            X_complete = np.append(X_train_1, X_modified, axis = 0)
            Y_complete = np.append(Y_train_1, Y_modified)
            #X_complete.append(X_train_1)
            #X_complete.append(X_modified)
            #Y_complete.append(Y_train_1)
            #Y_complete.append(Y_modified)
            print(X_complete.shape)
            print(Y_complete.shape)
            model.update_train_x_y(X_complete, Y_complete)
            model.train()
            #get the loss-based projection rules for attack (only implemented for linear classifiers)
            projection_fn = get_projection_fn(
                X_train_1, Y_train_1,
                sphere=False,
                loss=True,
                loss_value = defense_filt,
                clean_model_weight = weights )
            #perform the attack
            # Poison rows start at X_train.shape[0] = len(X_train_1) + len(X_train_2).
            min_acc, min_X = iterative_attack.iterative_attack(
                model,
                indices_to_poison=np.arange(X_train.shape[0], X_complete.shape[0]),
                test_idx=None,
                test_description=None,
                step_size=step_size,
                num_iter=2000,
                loss_type='normal_loss',
                projection_fn=projection_fn,
                output_root=output_root)
            #print(model.data_sets.train.x)
            #acc = model.get_test_accuracy()
            noDefList.append(min_acc)
            #print(model.data_sets.train.x[X_train.shape[0]:X_complete.shape[0],:])
        applyDefense = True
        if applyDefense == True:
            #HERE WE ACQUIRE THE POISONED DATASET (SET 2 DATA PLUS POISONED DATA)
            Y_complete = model.data_sets.train.labels
            if start_poison ==False:
                poisoned_X = X_train_2
                poisoned_Y = Y_train_2
            else:
                X_complete = min_X
                poisoned_X = X_complete[X_train_1.shape[0]:]
                poisoned_Y = Y_complete[X_train_1.shape[0]:]
            #get the original classifier (trained on trusted data only)
            model.update_train_x_y(X_train_1, Y_train_1)
            model.train()
            #SET 2 DATA + POISONING SET
            modifiedData = DataSet(poisoned_X, poisoned_Y)
            total_loss_poison = 0
            loss_poison = []
            # Per-point loss under the clean model; used as the filter "distance".
            # NOTE(review): this inner loop reuses ``i`` and clobbers the outer
            # loop variable — harmless with range(1) above, but it will break a
            # full threshold sweep; rename before widening the outer loop.
            for i in range (poisoned_X.shape[0]):
                t = getLoss(model, modifiedData,i)
                loss_poison.append(t)
                total_loss_poison = total_loss_poison+t
            #print(loss_poison)
            #print ("Average poison loss: "+str(total_loss_poison / (X_modified.shape[0]-X_train.shape[0])))
            #poisonLossList.append(total_loss_poison / (X_modified.shape[0]-X_train.shape[0]))
            #apply filter, using loss as distance
            X_filtered, Y_filtered, indexKept,indexRemoved = fixedFilter(poisoned_X,poisoned_Y,loss_poison, defense_filt+epsilon)
            #calculate the amount of genuine points removed
            # NOTE(review): (indexRemoved<=N).shape[0] is the length of the
            # boolean array, i.e. the TOTAL number of removed points, not the
            # count of genuine ones — ``.sum()`` was probably intended; confirm.
            lb_remove_list.append((indexRemoved<=X_train_2.shape[0]).shape[0])
            #compose the new training dataset
            X_complete = np.append(X_train_1, X_filtered, axis = 0)
            Y_complete = np.append(Y_train_1, Y_filtered)
            print(X_complete.shape)
            #X_complete.append(X_train_1)
            #X_complete.append(X_filtered)
            #Y_complete.append(Y_train_1)
            #Y_complete.append(Y_filtered)
            indexKept = np.sort(indexKept)
            #print(indexKept.shape)
            #print(poisoned_X.shape)
            #print(len(indexRemoved))
            #train the model with the sanitized data
            model.update_train_x_y(X_complete, Y_complete)
            model.train()
            acc = model.get_test_accuracy()
        filtList.append(defense_filt)
        accList.append(acc)
        filt=filt+0.1
        tf.reset_default_graph()
    #in the end, print the results to the console
    print(filtList)
    print(noDefList)
    print(accList)
    print(base_acc)
    print(lb_remove_list)
    return filtList, noDefList, accList, base_acc, lb_remove_list
help='One of: imdb, enron, dogfish, mnist_17') parser.add_argument('--shard', type=int) args = parser.parse_args() dataset_name = args.dataset_name shard = args.shard norm_sq_constraint = datasets.DATASET_NORM_SQ_CONSTRAINTS[dataset_name] X_train, Y_train, X_test, Y_test = datasets.load_dataset(dataset_name) if sparse.issparse(X_train): X_train = X_train.toarray() if sparse.issparse(X_test): X_test = X_test.toarray() train = DataSet(X_train, Y_train) validation = None # We want to directly attack the clean train data # so we pretend that it's the test data test = DataSet(X_train, Y_train) data_sets = base.Datasets(train=train, validation=validation, test=test) temp = 0 input_dim = X_train.shape[1] weight_decay = 0.01 if X_train.shape[0] % 100 == 0: batch_size = 100 else: batch_size = X_train.shape[0] initial_learning_rate = 0.001