# Example 1
def attackWithDefense(datasetName = "spambase", seed = 18, OUTPUT_FOLDER=None, PLOT_FOLDER=None):
    """Run an iterative poisoning attack against a clustering/distance-based
    (Curie-style) sanitization defense, one "day" of incoming data at a time.

    For each day: the defender filters the incoming batch using a distance
    threshold ``theta``; the attacker injects label-flipped copies of clean
    points and optimizes them with ``iterative_attack`` under a projection
    constraint derived from the defense; accuracies with and without the
    defense are recorded, and the poisoned set becomes the next day's base.

    Parameters
    ----------
    datasetName : str
        One of "spambase", "mnist17", "mnist01" — selects the data loader.
    seed : int
        ``random_state`` for the train/validation and clean/untrusted splits.
    OUTPUT_FOLDER : str
        Root directory for attack-step output (must not be None).
    PLOT_FOLDER : str
        Directory where clustering plots are written (must not be None).

    Returns
    -------
    tuple
        (filtList, thetaList, accList, undefList, datasetList_X,
        datasetList_Y, removeList, base_acc, attackList) — per-day records
        plus the clean-model baseline test accuracy.
    """
    validation_size = 0.3
    train_split_size = 0.7
    if datasetName == "spambase":
        #load data
        dataset = load_dataset.load_dataset_spambase()
            
        data_size = dataset.shape[0]


        # Prepare data: spambase has 57 features and the label in column 57
        array = dataset.values
        X = array[:,0:57]
        Y = array[:,57]

        # Split-out validation dataset
        X_train, X_validation, Y_train, Y_validation \
        = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)


        
        #find median/centroid (one row per class)
        median = dataset.groupby('class').median()
        #pre-determined value using plotEps(), used when clustering
        eps = 60
    elif datasetName =="mnist17":
        # loader already returns the train/validation split
        X_train, Y_train, X_validation, Y_validation = load_dataset.load_dataset_mnist17(sampling=True)
        #find median/centroid per class (labels are -1 / +1)
        median_neg = np.median(X_train[Y_train == -1],axis  =0)
        median_pos = np.median(X_train[Y_train == 1], axis = 0)

        median = pandas.DataFrame(data = [median_neg, median_pos], index = [-1,1])
        print(median_pos.shape)
        
        #pre-determined value using plotEps(), used when clustering
        eps = 40
    elif datasetName =="mnist01":
        X_train, Y_train, X_validation, Y_validation = load_dataset.load_dataset_mnist01(sampling=True)
        
        #find median/centroid per class (labels are -1 / +1)
        median_neg = np.median(X_train[Y_train == -1],axis  =0)
        median_pos = np.median(X_train[Y_train == 1], axis = 0)

        median = pandas.DataFrame(data = [median_neg, median_pos], index = [-1,1])
        print(median_pos.shape)
        # NOTE(review): no ``eps`` is set on this branch — clusterData below
        # would fail with an unbound local for "mnist01"; confirm intended.
        
    print("Data size:"+str(X_train.shape[0])+","+str(X_validation.shape[0]))

    train_size = X_train.shape[0]
    print(train_size)

    # number of incoming batches ("days"); hard-coded to a single day here
    days = 1
    incoming_sets_X = []
    incoming_sets_Y = []
    #separate data into one large initial clean set, and 10 incoming dataset
    #train_1: clean data, train_2: untrusted data
    X_train_1, X_train_2, Y_train_1, Y_train_2 \
    = model_selection.train_test_split(X_train, Y_train, test_size=train_split_size, random_state=seed)

    # split the untrusted pool into equal chunks, one per day
    num_split = math.floor(X_train_2.shape[0]/10)
    for d in range(days-1):
        incoming_sets_X.append(X_train_2[d*num_split: (d+1)*num_split,:])
        incoming_sets_Y.append(Y_train_2[d*num_split: (d+1)*num_split])
    #append the remaining points into the last set
    incoming_sets_X.append(X_train_2[(days-1)*num_split:X_train_2.shape[0],:])
    incoming_sets_Y.append(Y_train_2[(days-1)*num_split:X_train_2.shape[0]])


    ##with tf.Session() as sess:
    ##    Y_train = tf.one_hot(Y_train,2).eval()
    ##    Y_validation = tf.one_hot(Y_validation,2).eval()
    ##print(X_validation.shape)
    ##print(Y_validation.shape)


    #output directory for attack steps
    
    output_root = os.path.join(OUTPUT_FOLDER, 'ddd')

    # candidate poison budgets as a fraction of the training set
    poison_percentage = [0.05,0.1,0.15,0.2]
    step_size = 0.01

    #filter values
    filt = 0.073
    omegaContribPercentage = 0.8
    
    # per-day bookkeeping, returned to the caller
    filtList = []
    thetaList = []
    accList = []
    undefList = []
    datasetList_X = []
    datasetList_Y = []
    removeList = []
    attackList=[]
    base_acc = 0
    
    # total poison budget: 10% of the training set (index 1 of the list)
    num_points_total = round(train_size * poison_percentage[1])

    #determine value of theta using the clean dataset (without removing any points at this stage)
    
    #cluster the data first
    clustering = clusterData(X_train_1, Y_train_1, eps, plotName = PLOT_FOLDER + "/" + datasetName + "_orignal.png")

    clusters = clustering.labels_

    #compute omega (feature weighting for the Curie distance)
    omega = computeOmega(X_train_1, Y_train_1, clusters, omegaContribPercentage)
    
    all_average_dist = computeCurieDistance(X_train_1, Y_train_1, clusters, omega )

    #theta is a single value for the entire dataset (instead of 1 per class)
    theta = getTheta(X_train_1,Y_train_1,all_average_dist, filt)
    print("Value of theta chosen to be: "+str(theta))

    #num_points = 1
    print("Number of Total Poisoning points: "+str(num_points_total))

    #each day, the attacker injects (total/day) amount of malicious points (up to 20% at the end)
    num_points_each_day = int(num_points_total/days)

    #initialize X/Y_train_new, for iterations
    X_train_new = X_train_1
    Y_train_new = Y_train_1
    for i in range(days):
        
        
        label_flip = True

        #prepare dataset for model
        train = DataSet(X_train_new, Y_train_new)
        test = DataSet(X_validation, Y_validation)
        validation = DataSet(X_validation, Y_validation)
        data_sets = base.Datasets(train = train, validation = validation, test = test)


        #dataset changes every day, so the model is rebuilt each iteration
        model = SmoothHinge(            
                    input_dim=X_train.shape[1],
                    temp=0,
                    weight_decay=0.01,
                    use_bias=True,
                    num_classes=2,
                    batch_size=X_train_new.shape[0],
                    data_sets=data_sets,
                    initial_learning_rate=0.001,
                    decay_epochs=None,
                    mini_batch=False,
                    train_dir=output_root,
                    log_dir='log',
                    model_name='my_model')




        model.train()

        # record the clean-model baseline exactly once (first day only)
        if base_acc ==0:
                base_acc = model.get_test_accuracy()
        #the defender starts by filtering
        #attacker attacks with entire dataset
        X_clean = np.append(X_train_new, incoming_sets_X[i], axis = 0)
        Y_clean = np.append(Y_train_new, incoming_sets_Y[i])
            
        clustering = clusterData(X_clean, Y_clean, eps, plotName = PLOT_FOLDER + "/" + datasetName + "_ahead_filtered_day"+str(i)+".png")

        pre_clusters = clustering.labels_
    
        all_average_dist = computeCurieDistance(X_clean, Y_clean, pre_clusters, omega )

        #only filter data from the incoming set (rows past the trusted prefix)
        X_def,Y_def,indexKept, idx_removed = fixedFilter(X_clean[X_train_new.shape[0]:,:], Y_clean[X_train_new.shape[0]:], all_average_dist[X_train_new.shape[0]:], theta)

        removeList.append(len(idx_removed))
        X_def = np.append(X_train_new, X_def, axis = 0)
        Y_def = np.append(Y_train_new, Y_def)
        print(X_def.shape)
        print(Y_def.shape)
        #attacker use the predicted filter result to optimize his attack
        start_poison = True
        if start_poison == True:
            #injects positive class datapoints(invert the label in next step)
            # NOTE(review): target_class here is 1 but copy_random_points
            # below copies from target_class=-1 — confirm which is intended.
            target_class = 1
            
            X_modified, Y_modified =copy_random_points(
                incoming_sets_X[i], incoming_sets_Y[i], 
                target_class=-1, 
                num_copies=num_points_each_day, 
                random_seed=seed, 
                replace=True)

            ##X_modified, Y_modified = copy_random_points(
            ##    X_modified, Y_modified, 
            ##    target_class=-1, 
            ##    num_copies=num_neg_copies, 
            ##    random_seed=random_seed, 
            ##    replace=True)

            # flip labels of the appended copies (rows past the incoming set)
            if label_flip:
                Y_modified[incoming_sets_X[i].shape[0]:] = -Y_modified[incoming_sets_X[i].shape[0]:]

            #print(X_modified)
                
            #attacker attacks with entire dataset
            # NOTE(review): slicing with X_train_2.shape[0] only matches the
            # copies when days == 1 (incoming set == X_train_2); for days > 1
            # this should presumably be incoming_sets_X[i].shape[0] — verify.
            X_complete = np.append(X_def, X_modified[X_train_2.shape[0]:], axis = 0)
            Y_complete = np.append(Y_def, Y_modified[X_train_2.shape[0]:])
            
            model.update_train_x_y(X_complete, Y_complete)
            model.train()

            #acquire projection rules for attack
            # NOTE(review): ``clusters`` and ``all_average_dist`` here come
            # from different clusterings (clusters is from the previous
            # iteration / initial clean fit, all_average_dist from this
            # day's pre_clusters) — confirm this mix is intentional.
            projection_fn = get_projection_fn(
                X_train_new, Y_train_new, clusters = clusters,
                distance = all_average_dist,
                sphere=True,
                slab=False,
                omega=omega,
                theta = theta,
                target_class = target_class)

            if projection_fn is not None:
                    #perform the attack on the appended poison rows only
                    min_acc, min_X = iterative_attack.iterative_attack(
                        model, 
                        indices_to_poison=np.arange(X_def.shape[0], X_complete.shape[0]),            
                        test_idx=None, 
                        test_description=None, 
                        step_size=step_size, 
                        num_iter=2000,
                        loss_type='normal_loss',
                        projection_fn=projection_fn,
                        output_root=output_root)
                    attackList.append("Y")
                    #before proceeding, measure accuracy without sanitization
                    X_no_def = np.append(X_clean, min_X[X_def.shape[0]:,:], axis = 0)
                    Y_no_def = np.append(Y_clean, Y_complete[X_def.shape[0]:])
            else:
                    # no feasible projection: skip the attack for this day
                    min_X = X_def
                    Y_complete = Y_def
                    model.update_train_x_y(X_def, Y_def)
                    model.train()
                    min_acc = model.get_test_accuracy()
                    attackList.append("N")
                    #before proceeding, measure accuracy without sanitization
                    X_no_def = X_clean
                    Y_no_def = Y_clean

            #print(model.data_sets.train.x)
            print(min_X.shape)




            # retrain on the unsanitized data to measure the no-defense accuracy
            model.update_train_x_y(X_no_def, Y_no_def)
            model.train()
            
            acc = model.get_test_accuracy()

            #prepare dataset for next day
            X_train_new = min_X
            Y_train_new = Y_complete
            
            #record accuracy into accList
            filtList.append(i)
            thetaList.append(theta)
            accList.append(min_acc)
            undefList.append(acc)
            datasetList_X.append(X_train_new)
            datasetList_Y.append(Y_train_new)
            
            #increment counter, reset model (TF1-style global graph)
            #filt = filt+0.1
            tf.reset_default_graph()
            
            #re-cluster and compute distance
            #cluster the data first
            clustering = clusterData(X_train_new, Y_train_new, eps, plotName = PLOT_FOLDER + "/" + datasetName + "_poisoned_day"+str(i)+".png")

            clusters = clustering.labels_
    
            all_average_dist = computeCurieDistance(X_train_new, Y_train_new, clusters, omega )

            #theta is a single value for the entire dataset (instead of 1 per class)
            theta = getTheta(X_train_new,Y_train_new,all_average_dist, filt)
           
            print("Value of theta chosen to be: "+str(theta))
    #print result to the console for now
    print(filtList)
    print(thetaList)
    print(undefList)
    print(accList)
    return filtList, thetaList, accList, undefList, datasetList_X, datasetList_Y, removeList, base_acc, attackList
# Example 2
def LBDefense(datasetName = "spambase", seed = 18, OUTPUT_FOLDER=None):
    """Evaluate a loss-based (LB) sanitization defense against a poisoning
    attack constrained by a loss threshold.

    The attacker copies and label-flips points from the untrusted pool and
    optimizes them with ``iterative_attack`` under a loss-based projection;
    the defender then removes any untrusted point whose loss under the
    clean model exceeds ``defense_filt + epsilon`` and retrains.

    Parameters
    ----------
    datasetName : str
        One of "spambase", "mnist17" — selects the data loader.
    seed : int
        ``random_state`` for the train/validation and clean/untrusted splits.
    OUTPUT_FOLDER : str
        Root directory for attack-step output (must not be None).

    Returns
    -------
    tuple
        (filtList, noDefList, accList, base_acc, lb_remove_list):
        filter values tried, accuracy without defense, accuracy with
        defense, clean-model baseline accuracy, and removal counts.
    """
    train_split_size = 0.7
    validation_size = 0.30
    
    if datasetName == "spambase":
        #load data
        dataset = load_dataset.load_dataset_spambase()
            
        data_size = dataset.shape[0]


        # Prepare data: spambase has 57 features and the label in column 57
        array = dataset.values
        X = array[:,0:57]
        Y = array[:,57]
            
        X_train, X_validation, Y_train, Y_validation \
        = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)
#       #train_1: clean data, train_2: untrusted data
        X_train_1, X_train_2, Y_train_1, Y_train_2 \
        = model_selection.train_test_split(X_train, Y_train, test_size=train_split_size, random_state=seed)


        # per-class medians (unused below; kept for parity with attackWithDefense)
        median = dataset.groupby('class').median()
    

    elif datasetName =="mnist17":
        X_train, Y_train, X_validation, Y_validation = load_dataset.load_dataset_mnist17(sampling=True)
        #train_1: clean data, train_2: untrusted data
        X_train_1, X_train_2, Y_train_1, Y_train_2 \
        = model_selection.train_test_split(X_train, Y_train, test_size=train_split_size, random_state=seed)
        #find median/centroid per class (labels are -1 / +1)
        median_neg = np.median(X_train[Y_train == -1],axis  =0)
        median_pos = np.median(X_train[Y_train == 1], axis = 0)

        median = pandas.DataFrame(data = [median_neg, median_pos], index = [-1,1])
        print(median_pos.shape)
    
    print("Data size:"+str(X_train.shape[0])+","+str(X_validation.shape[0]))

    train_size = X_train.shape[0]
    print(train_size)

    ##with tf.Session() as sess:
    ##    Y_train = tf.one_hot(Y_train,2).eval()
    ##    Y_validation = tf.one_hot(Y_validation,2).eval()
    ##print(X_validation.shape)
    ##print(Y_validation.shape)

    #prepare dataset for model (clean subset vs. complete training set)
    train = DataSet(X_train_1, Y_train_1)
    test = DataSet(X_train_1, Y_train_1)
    complete_train = DataSet(X_train, Y_train)
    complete_test = DataSet(X_train, Y_train)
    validation = DataSet(X_validation, Y_validation)
    
    data_sets = base.Datasets(train = train, validation = validation, test = validation)
    complete_datasets = base.Datasets(train = complete_train, validation = complete_test, test = validation)
    #the output directory of attack steps
    output_root = os.path.join(OUTPUT_FOLDER, 'ddd')

    # candidate poison budgets as a fraction of the training set
    poison_percentage = [0.05,0.1,0.15,0.2]
    step_size = 0.01

    #filter values
    filt = 0.0
    #defense_filts = [50,30,10,5,3,1, 0.1]
    defense_filts = [30,10,5,3,1, 0.5]
    #defense_filts = [0.5]
    # slack added to the loss threshold when filtering
    epsilon = 0.1
    
    # per-filter bookkeeping, returned to the caller
    filtList = []
    accList = []
    noDefList = []
    poisonLossList = []
    lb_remove_list=[]
    # poison budget: 20% of the training set (index 3 of the list)
    num_points = round(train_size * poison_percentage[3])
    #num_points = 1
    print("Number of Poisoning points: "+str(num_points))
    # NOTE(review): range(1) means only defense_filts[0] is evaluated —
    # presumably a debugging restriction of the full sweep.
    for i in range(1):
        defense_filt=defense_filts[i]

        #place for loop later

       
        label_flip = True



        model = SmoothHinge(            
                    input_dim=X_train.shape[1],
                    temp=0,
                    weight_decay=0.01,
                    use_bias=True,
                    num_classes=2,
                    batch_size=train_size,
                    data_sets=complete_datasets,
                    initial_learning_rate=0.001,
                    decay_epochs=None,
                    mini_batch=False,
                    train_dir=output_root,
                    log_dir='log',
                    model_name='my_model')




        # baseline: train on the full (clean) training set
        model.train()
        base_acc = model.get_test_accuracy()
        #getLoss(model,model.data_sets.test, 10)
        #print(Y_validation.shape)
        print(X_train.shape[0])
        
        # retrain on the trusted subset only; its weights define the
        # loss-based defense threshold
        model.update_train_x_y(X_train_1, Y_train_1)
        model.train()
        
        
        #with tf.variable_scope('softmax_linear'):
        weights = np.float32(model.getWeight())
        print(weights)

        

        
        start_poison = True
        if start_poison == True:

            #injects positive class data (invert the label in next step)
            X_modified, Y_modified =copy_random_points(
                X_train_2, Y_train_2, 
                target_class=-1, 
                num_copies=num_points, 
                random_seed=seed, 
                replace=True)

            # flip labels of the appended copies (rows past the untrusted set)
            if label_flip:
                Y_modified[X_train_2.shape[0]:] = -Y_modified[X_train_2.shape[0]:]

            #attacker attacks with entire dataset
            X_complete = np.append(X_train_1, X_modified, axis = 0)
            Y_complete = np.append(Y_train_1, Y_modified)
            #X_complete.append(X_train_1)
            #X_complete.append(X_modified)
            #Y_complete.append(Y_train_1)
            #Y_complete.append(Y_modified)
            print(X_complete.shape)
            print(Y_complete.shape)
            model.update_train_x_y(X_complete, Y_complete)
            model.train()
            #get the loss-based projection rules for attack (only implemented for linear classifiers)
            projection_fn = get_projection_fn(
                X_train_1, Y_train_1,
                sphere=False,
                loss=True,
                loss_value = defense_filt,
                clean_model_weight = weights
                ) 
            #perform the attack
            # poison rows start at X_train.shape[0] because
            # len(X_train_1) + len(X_train_2) == len(X_train)
            min_acc, min_X = iterative_attack.iterative_attack(
                model, 
                indices_to_poison=np.arange(X_train.shape[0], X_complete.shape[0]),            
                test_idx=None, 
                test_description=None, 
                step_size=step_size, 
                num_iter=2000,
                loss_type='normal_loss',
                projection_fn=projection_fn,
                output_root=output_root)


            #print(model.data_sets.train.x)
            #acc = model.get_test_accuracy()
            noDefList.append(min_acc)

            #print(model.data_sets.train.x[X_train.shape[0]:X_complete.shape[0],:])

        applyDefense = True
        if applyDefense == True:

            #HERE WE ACQUIRE THE POISONED DATASET (SET 2 DATA PLUS POISONED DATA)
            
            Y_complete = model.data_sets.train.labels
            if start_poison ==False:
                poisoned_X = X_train_2
                poisoned_Y = Y_train_2
            else:
                X_complete = min_X
                poisoned_X = X_complete[X_train_1.shape[0]:]
                poisoned_Y = Y_complete[X_train_1.shape[0]:]

            #get the original classifier (trained on trusted data only)
            model.update_train_x_y(X_train_1, Y_train_1)
            model.train()
            #SET 2 DATA + POISONING SET
            modifiedData = DataSet(poisoned_X, poisoned_Y)
        
            # per-point loss of every untrusted/poison point under the clean model
            # NOTE(review): this loop variable shadows the outer ``i``;
            # harmless while the outer loop is range(1), but fragile.
            total_loss_poison = 0
            loss_poison = []
            for i in range (poisoned_X.shape[0]):
                t = getLoss(model, modifiedData,i)
                loss_poison.append(t)
                total_loss_poison = total_loss_poison+t
            #print(loss_poison)
        
            #print ("Average poison loss: "+str(total_loss_poison / (X_modified.shape[0]-X_train.shape[0])))

            #poisonLossList.append(total_loss_poison / (X_modified.shape[0]-X_train.shape[0]))
           
            #apply filter, using loss as distance
            X_filtered, Y_filtered, indexKept,indexRemoved = fixedFilter(poisoned_X,poisoned_Y,loss_poison, defense_filt+epsilon)

            #calculate the amount of genuine points removed
            # NOTE(review): (indexRemoved<=...).shape[0] is just the length of
            # the boolean array, i.e. the total number removed — a count of
            # genuine removals would be np.sum(indexRemoved < X_train_2.shape[0]).
            lb_remove_list.append((indexRemoved<=X_train_2.shape[0]).shape[0])
            
            #compose the new training dataset
            X_complete = np.append(X_train_1, X_filtered, axis = 0)
            Y_complete = np.append(Y_train_1, Y_filtered)
            print(X_complete.shape)
            #X_complete.append(X_train_1)
            #X_complete.append(X_filtered)
            #Y_complete.append(Y_train_1)
            #Y_complete.append(Y_filtered)

            indexKept = np.sort(indexKept)
            #print(indexKept.shape)
            #print(poisoned_X.shape)
            
            #print(len(indexRemoved))
            
            #train the model with the sanitized data
            model.update_train_x_y(X_complete, Y_complete)
            model.train()

            acc = model.get_test_accuracy()
            
            
            filtList.append(defense_filt)
            accList.append(acc)
            # NOTE(review): ``filt`` is incremented but never read afterwards
            filt=filt+0.1
            tf.reset_default_graph()
        #in the end, print the results to the console
    print(filtList)
    print(noDefList)
    print(accList)
    print(base_acc)
    print(lb_remove_list)
    return filtList, noDefList, accList, base_acc, lb_remove_list
                    help='One of: imdb, enron, dogfish, mnist_17')
parser.add_argument('--shard', type=int)

# Parse CLI options (``parser`` is created above this fragment)
args = parser.parse_args()
dataset_name = args.dataset_name
shard = args.shard

# per-dataset squared-norm constraint used by the attack's projection
norm_sq_constraint = datasets.DATASET_NORM_SQ_CONSTRAINTS[dataset_name]

# load the dataset and densify any sparse feature matrices
X_train, Y_train, X_test, Y_test = datasets.load_dataset(dataset_name)
if sparse.issparse(X_train):
    X_train = X_train.toarray()
if sparse.issparse(X_test):
    X_test = X_test.toarray()

train = DataSet(X_train, Y_train)
validation = None
# We want to directly attack the clean train data
# so we pretend that it's the test data
test = DataSet(X_train, Y_train)
data_sets = base.Datasets(train=train, validation=validation, test=test)

# model hyperparameters
temp = 0
input_dim = X_train.shape[1]
weight_decay = 0.01

# use mini-batches of 100 only when the training size divides evenly;
# otherwise fall back to full-batch training
if X_train.shape[0] % 100 == 0:
    batch_size = 100
else:
    batch_size = X_train.shape[0]
initial_learning_rate = 0.001