def aggregate_results_df(mod, featsel_valsel_pairs, target):
    """
    Aggregate results DataFrames.

    :param mod: (str) identifier of the attacked model
    :param featsel_valsel_pairs: (list) of tuples feature/value selectors
    :param target: (str) identifier of the target features
    :return: (dict) mapping of aggregate results
    """
    aggregated = {}

    for f_sel, v_sel in featsel_valsel_pairs:
        exp_name = common_utils.get_exp_name(mod, f_sel, v_sel, target)
        human_name = common_utils.get_human_exp_name(mod, f_sel, v_sel, target)

        summary_path = os.path.join(
            os.path.join('results', exp_name),
            exp_name + '__summary_df.csv'
        )

        # Skip experiments whose summary CSV has not been produced yet.
        if not os.path.exists(summary_path):
            print('WARNING: {} DataFrame not found!'.format(summary_path))
            continue

        print('Gathering data for: {}'.format(exp_name))
        exp_df = pd.read_csv(summary_path)

        # Re-derives the accuracy columns in place on the loaded frame.
        common_utils.recover_accuracy(exp_df)
        aggregated[human_name] = exp_df

    return aggregated
def isoforest_ember():
    """
    Run the Isolation Forest defense analysis against the EMBER attacks.

    Loads the per-model defense result dictionaries, applies Isolation
    Forest filtering on the goodware portion of each poisoned training
    set, re-evaluates the filtered model, and writes a summary table to
    ``table_isof.csv``.
    """
    data_id = 'ember'
    features, feature_names, name_feat, feat_name = data_utils.load_features(
        constants.infeasible_features, data_id)

    models = ['lightgbm', 'embernn']
    base_def_dir = 'results/defense/'

    def_cfg = common_utils.read_config('configs/defense_cfg.json', False)
    print(def_cfg)

    target = def_cfg['target_features']

    # Ground-truth mask of clean vs poisoned training points.
    is_clean = defense_utils.get_is_clean(def_cfg['poison_size'][0])
    print(is_clean.shape, sum(is_clean))
    bdr_indices = set(np.argwhere(is_clean == 0).flatten().tolist())
    print(len(bdr_indices))

    # ## Load results
    # Each model has its own result dict; keys are re-prefixed with the
    # model name so all entries can live in a single mapping.
    def_res = {}
    for mod in models:
        res = np.load(os.path.join(base_def_dir, mod + '__def_dict.npy'),
                      allow_pickle=True)
        res = res[()]
        res = {(mod, *key): val for key, val in res.items()}
        def_res.update(res)

    # ## Analysis
    table_cols = [
        'Target', 'Attack', 'Found', 'Removed', 'New accuracy',
        'New accuracy clean'
    ]
    latexdf = pd.DataFrame(columns=table_cols)

    for key, val in sorted(def_res.items(), reverse=True):
        # Key layout: (model, watermark_size, poison_size, feat_sel, val_sel)
        mod = key[0]
        f_s = key[3]
        v_s = key[4]
        w_s = int(key[1])
        p_s = int(key[2])

        def_dir = os.path.join(base_def_dir, str(w_s), str(p_s))
        current_exp_name = common_utils.get_exp_name(data_id, mod, f_s, v_s,
                                                     target)
        current_exp_dir = os.path.join(def_dir, current_exp_name)
        human_exp_name = common_utils.get_human_exp_name(mod, f_s, v_s, target)
        human_target = human_exp_name.split('-')[0]
        human_exp_name = human_exp_name.split('-')[1]

        print('-' * 80)
        print('Experiment name: {}'.format(current_exp_name))
        print('Human name: {}\n'.format(human_exp_name))

        # Generate table entries
        entry_iso = {
            table_cols[0]: human_target,
            table_cols[1]: human_exp_name,
        }

        # Load attack data
        wm_config = np.load(os.path.join(current_exp_dir, 'wm_config.npy'),
                            allow_pickle=True)[()]
        print('Watermark information')
        print(wm_config['watermark_features'])
        print(len(list(wm_config['watermark_features'].keys())))
        print(sorted(list(wm_config['watermark_features'].keys())))
        print()

        x_train_w, y_train_w, x_test_mw = defense_utils.load_attack_data(
            current_exp_dir)
        backdoor_model = defense_filtering.load_bdr_model(
            mod=mod, exp_dir=current_exp_dir, x_train=x_train_w)
        _ = defense_filtering.print_bdr_baseline(x_test_mw, backdoor_model)

        # Dimensionality reduction - Get n most important features
        x_safe, y_safe, safe_model = defense_utils.get_safe_dataset_model(
            mod, safe_pct=0.2, rand=42)
        shap_values_df = defense_utils.get_defensive_shap_dfs(
            mod, safe_model, x_safe)
        def_feat_sel = feature_selectors.ShapleyFeatureSelector(
            shap_values_df,
            criteria=constants.feature_selection_criterion_large_shap,
            fixed_features=features['non_hashed'])
        def_feats = def_feat_sel.get_features(32)
        x_sel, x_gw_sel, x_mw_sel = defense_utils.reduce_to_feats(
            x_train_w, def_feats, y_train_w)

        # Isolation Forest analysis
        isof_pred, suspect, poison_found, false_positives_poison = \
            isolation_forest_analysis(xtrain=x_gw_sel, is_clean=is_clean)
        print()
        print('Isolation Forest - sel removed points: {}'.format(suspect))
        print('Isolation Forest - sel found: {}'.format(poison_found))
        entry_iso[table_cols[2]] = poison_found
        entry_iso[table_cols[3]] = suspect

        # New evaluation: keep all malware plus the goodware points the
        # Isolation Forest labeled as inliers (prediction == 1).
        y_train_w_gw = y_train_w[y_train_w == 0]
        y_train_w_mw = y_train_w[y_train_w == 1]
        x_train_w_gw = x_train_w[y_train_w == 0]
        x_train_w_mw = x_train_w[y_train_w == 1]

        x_train_w_gw_filtered = x_train_w_gw[isof_pred == 1]
        y_train_w_gw_filtered = y_train_w_gw[isof_pred == 1]

        x_filtered = np.concatenate((x_train_w_mw, x_train_w_gw_filtered),
                                    axis=0)
        y_filtered = np.concatenate((y_train_w_mw, y_train_w_gw_filtered),
                                    axis=0)
        # Fixed typo: message previously read 'Sahpe'.
        print('Shape of the filtered data: {} - {}'.format(
            x_filtered.shape, y_filtered.shape))

        cr_clean, cm_clean, cr_backdoor, cm_backdoor = \
            defense_filtering.evaluate_filtering(
                mod=mod,
                x_train_w_sampled=x_filtered,
                y_train_w_sampled=y_filtered,
                x_test_mw=x_test_mw,
                current_exp_dir='')
        entry_iso[table_cols[4]] = cr_backdoor['accuracy']
        entry_iso[table_cols[5]] = cr_clean['accuracy']

        # DataFrame.append was removed in pandas 2.0 — concatenate a
        # single-row frame instead.
        latexdf = pd.concat([latexdf, pd.DataFrame([entry_iso])],
                            ignore_index=True)

        print('-' * 80)
        print()

    print(latexdf)
    latexdf.to_csv('table_isof.csv', index=False)
def generate_watermark():
    """
    Generate watermark (backdoor trigger) configurations with KernelSHAP.

    Trains a surrogate model on a limited (20%) slice of the EMBER
    training set, computes KernelSHAP explanations over the feasible
    features, and writes the selected feature/value watermark maps to
    JSON files under ``configs/watermark``.
    """
    seed = 24
    safe_percentage = 0.2
    data_id = 'ember'

    cfg = common_utils.read_config('configs/attack_cfg_kernelshap.json',
                                   atk_def=True)
    cfg['to_json'] = True
    print(cfg)

    mod = cfg['model']
    target = cfg['target_features']
    wm_size = cfg['watermark_size'][0]

    features, feature_names, name_feat, feat_name = data_utils.load_features(
        constants.infeasible_features, data_id)

    # Select the defensive features using clean SHAP values
    x_train, y_train, x_test, y_test, original_model = \
        attack_utils.get_ember_train_test_model()

    # The attacker only controls a small "limited" slice of the data.
    _, x_limited, _, y_limited = train_test_split(x_train,
                                                  y_train,
                                                  test_size=safe_percentage,
                                                  random_state=seed)
    print(x_limited.shape, y_limited.shape)

    limited_model = notebook_utils.train_model(x_limited, y_limited)

    # Summarize the background data with k-means for KernelSHAP.
    data_summ = shap.kmeans(x_limited, 30)
    inside_data = data_summ.data
    np.save('kmeans_30_xtrain_limited', inside_data)

    x_train_sel = x_limited[:, features['feasible']]
    print(x_train_sel.shape)
    clusters_sel = inside_data[:, features['feasible']]
    print(clusters_sel.shape)

    import warnings
    warnings.filterwarnings('ignore')

    wrapperino = ModWrap(original_model=limited_model,
                         clusters=inside_data,
                         nsamples=1000,
                         feas_feat=features['feasible'])
    explainer = shap.KernelExplainer(wrapperino.predict,
                                     clusters_sel,
                                     link='logit')
    exp = explainer.shap_values(x_train_sel, nsamples=200)
    np.save('explanations_limited', exp)

    # Scatter the feasible-feature SHAP values back into a full-width
    # matrix so downstream selectors can index by original feature id.
    reconstruced_shap = np.copy(x_limited)
    print(reconstruced_shap.shape)
    reconstruced_shap[:, features['feasible']] = exp
    assert np.allclose(reconstruced_shap[0][features['feasible'][16]],
                       exp[0][16])
    # NOTE: file name typo ('reconstucted') kept for compatibility with
    # existing consumers of this artifact.
    np.save('reconstucted_shaps_limited', reconstruced_shap)

    shap_values_df = pd.DataFrame(reconstruced_shap)

    # ## Setup
    wm_dir = 'configs/watermark'
    if not os.path.exists(wm_dir):
        os.makedirs(wm_dir)

    f_selectors = attack_utils.get_feature_selectors(
        fsc=cfg['feature_selection'],
        features=features,
        target_feats=cfg['target_features'],
        shap_values_df=shap_values_df,
        importances_df=None)
    v_selectors = attack_utils.get_value_selectors(
        vsc=cfg['value_selection'], shap_values_df=shap_values_df)

    feat_value_selector_pairs = common_utils.get_feat_value_pairs(
        feat_sel=list(f_selectors.keys()), val_sel=list(v_selectors.keys()))
    print(feat_value_selector_pairs)

    for (f_s, v_s) in feat_value_selector_pairs:
        current_exp_name = common_utils.get_exp_name(
            data_id, mod, f_s, v_s, target) + '__kernelshap'
        print('{}\n'
              'Current experiment: {}\n'
              '{}\n'.format('-' * 80, current_exp_name, '-' * 80))

        # Create experiment directories
        current_exp_dir = os.path.join('../results', current_exp_name)
        current_exp_img_dir = os.path.join(current_exp_dir, 'images')
        if not os.path.exists(current_exp_img_dir):
            os.makedirs(current_exp_img_dir)

        # Strategy
        feat_selector = f_selectors[f_s]
        value_selector = v_selectors[v_s]
        # The combined strategy selects features and values jointly.
        if f_s == constants.feature_selection_criterion_combined:
            value_selector = feat_selector

        # Let feature value selector know about the training set
        if value_selector.X is None:
            value_selector.X = x_limited

        # Get the feature IDs that we'll use
        start_time = time.time()
        if f_s == constants.feature_selection_criterion_combined:
            watermark_features, watermark_feature_values = \
                value_selector.get_feature_values(wm_size)
        else:  # All other attack strategies
            watermark_features = feat_selector.get_features(wm_size)
            print('Selecting watermark features took {:.2f} seconds'.format(
                time.time() - start_time))

            # Now select some values for those features
            start_time = time.time()
            watermark_feature_values = value_selector.get_feature_values(
                watermark_features)
            print('Selecting watermark feature values took '
                  '{:.2f} seconds'.format(time.time() - start_time))

        watermark_features_map = OrderedDict()
        for feature, value in zip(watermark_features,
                                  watermark_feature_values):
            watermark_features_map[feature_names[feature]] = value
        print(watermark_features_map)

        # Output the watermark on file for reuse
        if cfg['to_json']:
            wm_file_name = '{}__{}'.format(current_exp_name, str(wm_size))
            wm_file = os.path.join(wm_dir, wm_file_name)

            wm_json = {'order': {}, 'map': {}}
            for i, key in enumerate(reversed(watermark_features_map)):
                wm_json['order'][i] = key
                wm_json['map'][key] = str(watermark_features_map[key])

            # Use a context manager so the file handle is closed
            # deterministically (the original leaked the handle).
            with open(wm_file, 'w', encoding='utf-8') as out_file:
                json.dump(wm_json, out_file, indent=2)
def run_attacks(cfg):
    """
    Run series of attacks.

    :param cfg: (dict) experiment parameters
    """
    print('Config: {}\n'.format(cfg))

    # Unpack experiment parameters.
    model_id = cfg['model']
    seed = cfg['seed']
    to_save = cfg.get('save', '')  # base dir for saved watermarks; '' disables saving
    target = cfg['target_features']
    dataset = cfg['dataset']
    k_perc = cfg['k_perc']  # fraction of data available to the attacker
    k_data = cfg['k_data']  # split the attacker samples from: 'train' or 'test'

    # Set random seed
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

    # Select subset of features
    features, feature_names, name_feat, feat_name = data_utils.load_features(
        feats_to_exclude=constants.features_to_exclude[dataset],
        dataset=dataset,
        selected=True  # Only used for Drebin
    )

    # Get original model and data. Then setup environment.
    x_train, y_train, x_test, y_test = data_utils.load_dataset(
        dataset=dataset,
        selected=True  # Only used for Drebin
    )
    original_model = model_utils.load_model(
        model_id=model_id,
        data_id=dataset,
        save_path=constants.SAVE_MODEL_DIR,
        file_name=dataset + '_' + model_id,
    )

    # Prepare attacker data: take either the full chosen split or a
    # random fraction of size k_perc (seeded for reproducibility).
    if k_data == 'train':
        if k_perc == 1.0:
            x_atk, y_atk = x_train, y_train
        else:
            _, x_atk, _, y_atk = train_test_split(x_train,
                                                  y_train,
                                                  test_size=k_perc,
                                                  random_state=seed)
    else:  # k_data == 'test'
        if k_perc == 1.0:
            x_atk, y_atk = x_test, y_test
        else:
            _, x_atk, _, y_atk = train_test_split(x_test,
                                                  y_test,
                                                  test_size=k_perc,
                                                  random_state=seed)
    # The explainer's background set is the attacker's own data.
    x_back = x_atk

    print(
        'Dataset shapes:\n'
        '\tTrain x: {}\n'
        '\tTrain y: {}\n'
        '\tTest x: {}\n'
        '\tTest y: {}\n'
        '\tAttack x: {}\n'
        '\tAttack y: {}'.format(
            x_train.shape, y_train.shape, x_test.shape, y_test.shape,
            x_atk.shape, y_atk.shape
        )
    )

    # Get explanations
    start_time = time.time()
    shap_values_df = model_utils.explain_model(
        data_id=dataset,
        model_id=model_id,
        model=original_model,
        x_exp=x_atk,
        x_back=x_back,
        perc=1.0,
        n_samples=100,
        load=False,
        save=False
    )
    print('Getting SHAP took {:.2f} seconds\n'.format(time.time() - start_time))

    # Setup the attack
    f_selectors = attack_utils.get_feature_selectors(
        fsc=cfg['feature_selection'],
        features=features,
        target_feats=target,
        shap_values_df=shap_values_df,
        importances_df=None  # Deprecated
    )
    print(f_selectors)

    v_selectors = attack_utils.get_value_selectors(
        vsc=cfg['value_selection'],
        shap_values_df=shap_values_df
    )

    feat_value_selector_pairs = common_utils.get_feat_value_pairs(
        feat_sel=list(f_selectors.keys()),
        val_sel=list(v_selectors.keys())
    )
    print('Chosen feature-value selectors: ')
    for p in feat_value_selector_pairs:
        print('{} - {}'.format(p[0], p[1]))

    # If Drebin reload dataset with full features
    if dataset == 'drebin':
        x_train, y_train, x_test, y_test = data_utils.load_dataset(
            dataset=dataset,
            selected=False
        )

    # Find poisoning candidates: malware samples the original model
    # classifies correctly.
    x_mw_poisoning_candidates, x_mw_poisoning_candidates_idx = \
        attack_utils.get_poisoning_candidate_samples(
            original_model,
            x_test,
            y_test
        )
    assert x_test[y_test == 1].shape[0] == x_mw_poisoning_candidates_idx.shape[0]

    # Attack loop: one experiment per feature/value selector pair.
    for (f_s, v_s) in feat_value_selector_pairs:
        current_exp_name = common_utils.get_exp_name(dataset, model_id, f_s,
                                                     v_s, target)
        print('{}\nCurrent experiment: {}\n{}\n'.format(
            '-' * 80, current_exp_name, '-' * 80))

        # Create experiment directories
        current_exp_dir = os.path.join('results', current_exp_name)
        current_exp_img_dir = os.path.join(current_exp_dir, 'images')
        if not os.path.exists(current_exp_img_dir):
            os.makedirs(current_exp_img_dir)

        # Strategy
        feat_selector = f_selectors[f_s]
        value_selector = v_selectors[v_s]

        # Accumulator
        summaries = []
        start_time = time.time()

        if to_save:
            save_watermarks = os.path.join(to_save, current_exp_name)
            if not os.path.exists(save_watermarks):
                os.makedirs(save_watermarks)
        else:
            save_watermarks = ''

        for summary in attack_utils.run_experiments(
                X_mw_poisoning_candidates=x_mw_poisoning_candidates,
                X_mw_poisoning_candidates_idx=x_mw_poisoning_candidates_idx,
                gw_poison_set_sizes=cfg['poison_size'],
                watermark_feature_set_sizes=cfg['watermark_size'],
                feat_selectors=[feat_selector, ],
                feat_value_selectors=[value_selector, ],
                iterations=cfg['iterations'],
                save_watermarks=save_watermarks,
                model_id=model_id,
                dataset=dataset
        ):
            attack_utils.print_experiment_summary(
                summary,
                feat_selector.name,
                value_selector.name if value_selector is not None
                else feat_selector.name
            )
            summaries.append(summary)

            print('Exp took {:.2f} seconds\n'.format(time.time() - start_time))
            start_time = time.time()

        # Create DataFrame out of results accumulator and save it
        summaries_df = attack_utils.create_summary_df(summaries)
        print(summaries_df)

        # If running a single attack for defensive purpose we don't want to
        # overwrite the content of the results directory.
        if cfg.get('defense', False):
            continue

        summaries_df.to_csv(
            os.path.join(
                current_exp_dir,
                current_exp_name + '__summary_df.csv'
            )
        )
def filtering_defense(cfg):
    """
    Run the clustering-based filtering defense over the attack grid.

    For every (watermark size, poison size, feature selector, value
    selector) combination this loads (or re-runs) the attack, clusters
    the goodware training points on defensively-selected features,
    filters suspicious clusters, re-trains, and evaluates. A spectral
    signatures-style filter is evaluated as well.

    :param cfg: (dict) experiment parameters
    :return: (defaultdict) per-configuration result dictionaries
    """
    # Setup
    seed = cfg['seed']
    np.random.seed(seed)
    random.seed(seed)

    mod = cfg['model']
    method = cfg['clustering']
    target = cfg['target_features']
    safe_mode = cfg['safe']

    base_def_dir = 'results/defense'
    if not os.path.exists(base_def_dir):
        os.makedirs(base_def_dir)

    watermark_sizes = cfg['watermark_size']
    poison_sizes = cfg['poison_size']
    feature_selection = cfg['feature_selection']
    value_selection = cfg['value_selection']

    results = defaultdict(dict)

    features, feature_names, name_feat, feat_name = \
        data_utils.load_features(
            constants.infeasible_features
        )

    feat_value_selector_pairs = common_utils.get_feat_value_pairs(
        feat_sel=list(feature_selection),
        val_sel=list(value_selection)
    )

    # Defense parameters (fractions of the EMBER training set).
    t_max_size = cfg['t_max'] * constants.EMBER_TRAIN_SIZE
    min_keep_percentage = cfg['min_keep']
    mcs = int(cfg['mcs'] * constants.EMBER_TRAIN_SIZE)
    ms = int(cfg['ms'] * constants.EMBER_TRAIN_SIZE)
    print(
        'Minimum cluster size: {}\n'
        'Minimum samples: {}'.format(
            mcs,
            ms
        )
    )

    for w_s in watermark_sizes:
        for p_s in poison_sizes:
            is_clean = defense_utils.get_is_clean(p_s)
            bdr_indices = set(np.argwhere(is_clean == 0).flatten().tolist())

            for (f_s, v_s) in feat_value_selector_pairs:
                # Generate current exp/dir names
                def_dir = os.path.join(base_def_dir, str(w_s), str(p_s))
                current_exp_name = common_utils.get_exp_name(
                    mod, f_s, v_s, target
                )
                current_exp_dir = os.path.join(def_dir, current_exp_name)

                # Check if attack data is available; if not, re-run the
                # single attack for this configuration.
                if not check_data(def_dir, current_exp_name):
                    cfg_copy = copy.deepcopy(cfg)
                    cfg_copy['watermark_size'] = [w_s, ]
                    cfg_copy['poison_size'] = [p_s, ]
                    cfg_copy['feature_selection'] = [f_s, ]
                    cfg_copy['value_selection'] = [v_s, ]
                    run_single_attack(cfg_copy, def_dir)

                # Prepare feature importance/SHAPs DataFrame
                if safe_mode:
                    # Assume small percentage of safe data
                    x_safe, y_safe, safe_model = \
                        defense_utils.get_safe_dataset_model(
                            mod, safe_pct=0.2, rand=seed
                        )
                    shap_values_df = defense_utils.get_defensive_shap_dfs(
                        mod,
                        safe_model,
                        x_safe
                    )
                else:
                    # Assume defender has access to full clean model/data
                    shap_values_df = get_original_shap(mod, feature_names)

                # Load attack data
                x_train_w, y_train_w, x_test_mw = \
                    defense_utils.load_attack_data(
                        current_exp_dir
                    )
                backdoor_model = load_bdr_model(
                    mod=mod,
                    exp_dir=current_exp_dir,
                    x_train=x_train_w
                )
                # Baselines on the attacked model
                print_bdr_baseline(x_test_mw, backdoor_model)

                # Get n most important features
                def_feat_sel = feature_selectors.ShapleyFeatureSelector(
                    shap_values_df,
                    criteria=constants.feature_selection_criterion_large_shap,
                    fixed_features=features['non_hashed']
                )
                # BUG FIX: the original referenced the undefined name
                # `config`; the configuration dict is `cfg`.
                def_feats = def_feat_sel.get_features(cfg['topfeats'])
                print('Top {} selected defensive features:\n{}'.format(
                    cfg['topfeats'], def_feats
                ))

                # Dimensionality reduction through feature selection
                x_sel, x_gw_sel, x_mw_sel = defense_utils.reduce_to_feats(
                    x_train_w,
                    def_feats,
                    y_train_w
                )
                assert x_sel.shape[0] == x_train_w.shape[0]
                assert x_sel.shape[1] == cfg['topfeats']

                x_gw_sel_std = defense_utils.standardize_data(x_gw_sel)

                print('-' * 80)
                print('Current experiment: {}'.format(current_exp_name))
                print('-' * 80)

                # Clustering
                clustering, clustering_labels = defensive_clustering(
                    method=method,
                    x_gw=x_gw_sel_std,
                    mcs=mcs,
                    ms=ms,
                    current_exp_dir=current_exp_dir
                )

                # Cluster analysis
                silh, avg_silh, cluster_sizes, evals = cluster_analysis(
                    x_gw=x_gw_sel_std,
                    clustering_labels=clustering_labels,
                    is_clean=is_clean,
                    current_exp_dir=current_exp_dir
                )

                # Filter
                x_train_w_sampled, y_train_w_sampled, selected, \
                    selected_per_cluster = filter_clusters(
                        x_train_w=x_train_w,
                        y_train_w=y_train_w,
                        avg_silh=avg_silh,
                        cluster_sizes=cluster_sizes,
                        clustering_labs=clustering_labels,
                        threshold_max_size=t_max_size,
                        min_keep_percentage=min_keep_percentage
                    )
                results[(w_s, p_s, f_s, v_s)]['selected'] = selected
                results[(w_s, p_s, f_s, v_s)]['selected_per_cluster'] = \
                    selected_per_cluster

                # Evaluation
                cr_clean, cm_clean, cr_backdoor, cm_backdoor = \
                    evaluate_filtering(
                        mod=mod,
                        x_train_w_sampled=x_train_w_sampled,
                        y_train_w_sampled=y_train_w_sampled,
                        x_test_mw=x_test_mw,
                        current_exp_dir=current_exp_dir,
                    )
                results[(w_s, p_s, f_s, v_s)]['cr_clean'] = cr_clean
                results[(w_s, p_s, f_s, v_s)]['cm_clean'] = cm_clean
                results[(w_s, p_s, f_s, v_s)]['cr_backdoor'] = cr_backdoor
                results[(w_s, p_s, f_s, v_s)]['cm_backdoor'] = cm_backdoor

                # Spectral signatures-like approach
                to_remove_gh, to_remove_pa, found_gh, found_pa = \
                    defense_utils.spectral_remove_lists(
                        x_gw_sel_std, bdr_indices
                    )
                results[(w_s, p_s, f_s, v_s)]['to_remove_gh'] = to_remove_gh
                results[(w_s, p_s, f_s, v_s)]['to_remove_pa'] = to_remove_pa
                results[(w_s, p_s, f_s, v_s)]['found_gh'] = found_gh
                results[(w_s, p_s, f_s, v_s)]['found_pa'] = found_pa

                x_train_w_filtered_gh, y_train_w_filtered_gh = \
                    defense_utils.filter_list(
                        x_train_w, y_train_w, to_remove_gh
                    )
                cr_clean_gh, cm_clean_gh, cr_backdoor_gh, cm_backdoor_gh = \
                    evaluate_filtering(
                        mod=mod,
                        x_train_w_sampled=x_train_w_filtered_gh,
                        y_train_w_sampled=y_train_w_filtered_gh,
                        x_test_mw=x_test_mw,
                        current_exp_dir=current_exp_dir,
                        modifier='gh'
                    )
                results[(w_s, p_s, f_s, v_s)]['cr_clean_gh'] = cr_clean_gh
                results[(w_s, p_s, f_s, v_s)]['cm_clean_gh'] = cm_clean_gh
                results[(w_s, p_s, f_s, v_s)]['cr_backdoor_gh'] = cr_backdoor_gh
                results[(w_s, p_s, f_s, v_s)]['cm_backdoor_gh'] = cm_backdoor_gh

                x_train_w_filtered_pa, y_train_w_filtered_pa = \
                    defense_utils.filter_list(
                        x_train_w, y_train_w, to_remove_pa
                    )
                cr_clean_pa, cm_clean_pa, cr_backdoor_pa, cm_backdoor_pa = \
                    evaluate_filtering(
                        mod=mod,
                        x_train_w_sampled=x_train_w_filtered_pa,
                        y_train_w_sampled=y_train_w_filtered_pa,
                        x_test_mw=x_test_mw,
                        current_exp_dir=current_exp_dir,
                        modifier='pa'
                    )
                results[(w_s, p_s, f_s, v_s)]['cr_clean_pa'] = cr_clean_pa
                results[(w_s, p_s, f_s, v_s)]['cm_clean_pa'] = cm_clean_pa
                results[(w_s, p_s, f_s, v_s)]['cr_backdoor_pa'] = cr_backdoor_pa
                results[(w_s, p_s, f_s, v_s)]['cm_backdoor_pa'] = cm_backdoor_pa

    # Persist the full result dictionary for later analysis (loaded back
    # by the Isolation Forest analysis as '<mod>__def_dict.npy').
    np.save(os.path.join(base_def_dir, mod + '__def_dict'), results)

    return results
def run_attacks(cfg):
    """
    Run series of attacks.

    Variant that replays a fixed, previously-saved watermark (loaded from
    ``cfg['wm_file']``) instead of computing SHAP-based selectors.

    :param cfg: (dict) experiment parameters
    """
    # NOTE(review): this is a second definition of `run_attacks` in the same
    # module; it shadows the earlier SHAP-based one, so only this version is
    # reachable at runtime — confirm the duplication is intentional.
    print('Config: {}\n'.format(cfg))

    model_id = cfg['model']
    seed = cfg['seed']
    to_save = cfg.get('save', '')  # base dir for saved watermarks; '' disables saving
    target = cfg['target_features']
    dataset = cfg['dataset']
    # Workaround until we fix ordering of feature selector outputs
    wm_size = cfg['watermark_size'][0]

    # Set random seed
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

    # Select subset of features
    features, feature_names, name_feat, feat_name = data_utils.load_features(
        feats_to_exclude=constants.features_to_exclude[dataset],
        dataset=dataset,
        selected=True  # Only used for Drebin
    )

    # Get original model and data. Then setup environment.
    x_train, y_train, x_test, y_test = data_utils.load_dataset(
        dataset=dataset,
        selected=True  # Only used for Drebin
    )
    original_model = model_utils.load_model(
        model_id=model_id,
        data_id=dataset,
        save_path=constants.SAVE_MODEL_DIR,
        file_name=dataset + '_' + model_id,
    )

    # Find poisoning candidates: malware samples the original model
    # classifies correctly.
    x_mw_poisoning_candidates, x_mw_poisoning_candidates_idx = \
        attack_utils.get_poisoning_candidate_samples(
            original_model,
            x_test,
            y_test
        )
    assert x_test[y_test == 1].shape[0] == x_mw_poisoning_candidates_idx.shape[0]

    # Load saved watermark
    fixed_wm = attack_utils.load_watermark(cfg['wm_file'], wm_size, name_feat)

    # Setup the attack: a single fixed feature/value selector pair built
    # from the loaded watermark map.
    f_selectors = attack_utils.get_feature_selectors(
        fsc=[constants.feature_selection_criterion_fix, ],
        features=features,
        target_feats=target,
        shap_values_df=None,
        importances_df=None,
        feature_value_map=fixed_wm
    )
    feat_value_selector_pairs = [(
        constants.feature_selection_criterion_fix,
        constants.value_selection_criterion_fix
    ), ]

    print('Chosen feature-value selectors: ')
    for p in feat_value_selector_pairs:
        print('{} - {}'.format(p[0], p[1]))

    # Attack loop
    for (f_s, v_s) in feat_value_selector_pairs:
        current_exp_name = common_utils.get_exp_name(dataset, model_id, f_s,
                                                     v_s, target)
        print('{}\nCurrent experiment: {}\n{}\n'.format(
            '-' * 80, current_exp_name, '-' * 80))

        # Create experiment directories
        current_exp_dir = os.path.join('results', current_exp_name)
        current_exp_img_dir = os.path.join(current_exp_dir, 'images')
        if not os.path.exists(current_exp_img_dir):
            os.makedirs(current_exp_img_dir)

        # Strategy: the fixed selector provides both features and values.
        feat_selector = f_selectors[f_s]
        value_selector = feat_selector

        # Accumulator
        summaries = []
        start_time = time.time()

        if to_save:
            save_watermarks = os.path.join(to_save, current_exp_name)
            if not os.path.exists(save_watermarks):
                os.makedirs(save_watermarks)
        else:
            save_watermarks = ''

        for summary in attack_utils.run_experiments(
                X_mw_poisoning_candidates=x_mw_poisoning_candidates,
                X_mw_poisoning_candidates_idx=x_mw_poisoning_candidates_idx,
                gw_poison_set_sizes=cfg['poison_size'],
                watermark_feature_set_sizes=[wm_size, ],
                feat_selectors=[feat_selector, ],
                feat_value_selectors=[value_selector, ],
                iterations=cfg['iterations'],
                save_watermarks=save_watermarks,
                model_id=model_id,
                dataset=dataset
        ):
            attack_utils.print_experiment_summary(
                summary,
                feat_selector.name,
                value_selector.name if value_selector is not None
                else feat_selector.name
            )
            summaries.append(summary)

            print('Exp took {:.2f} seconds\n'.format(time.time() - start_time))
            start_time = time.time()

        # Create DataFrame out of results accumulator and save it
        summaries_df = attack_utils.create_summary_df(summaries)
        print(summaries_df)

        # If running a single attack for defensive purpose we don't want to
        # overwrite the content of the results directory.
        if cfg.get('defense', False):
            continue

        summaries_df.to_csv(
            os.path.join(
                current_exp_dir,
                current_exp_name + '__summary_df.csv'
            )
        )
def evaluate_backdoor():
    """
    Evaluate the PDF (Contagio) backdoor attack end-to-end.

    Replaces the feature vectors of benign training files with the ones
    obtained by directly poisoning the PDF files, trains a backdoored
    model, applies the same trigger to correctly-classified malicious
    test files, and reports/plots attack success metrics.
    """
    # ## Config
    cfg = common_utils.read_config('configs/ogcontagio_fig5.json',
                                   atk_def=True)
    cfg['seed'] = 42
    print(cfg)

    model_id = cfg['model']
    seed = cfg['seed']
    to_save = cfg.get('save', '')
    target = cfg['target_features']
    dataset = cfg['dataset']
    k_perc = cfg['k_perc']
    k_data = cfg['k_data']
    poison_sizes = cfg['poison_size']
    iterations = cfg['iterations']
    watermark_size = cfg['watermark_size'][0]

    # Data
    x_train_orig, y_train_orig, x_test_orig, y_test_orig = \
        data_utils.load_dataset(dataset=dataset)
    train_files, test_files = data_utils.load_pdf_train_test_file_names()
    print(x_train_orig.shape, x_test_orig.shape)

    wm_name = 'ogcontagio__pdfrf__combined_shap__combined_shap__feasible__30'
    # NOTE(review): wm_size is hard-coded to 16 here while the config
    # provides cfg['watermark_size'][0] — confirm this mismatch is intended.
    watermark = dict(
        attack_utils.load_watermark(wm_file='configs/watermark/' + wm_name,
                                    wm_size=16))

    bdr_gw_df = pd.read_csv(
        os.path.join(constants.SAVE_FILES_DIR,
                     'bdr_{}_{}'.format('gw', wm_name)))
    bdr_mw_df = pd.read_csv(
        os.path.join(constants.SAVE_FILES_DIR,
                     'bdr_{}_{}'.format('mw', wm_name)))

    # Model
    original_model = model_utils.load_model(
        model_id=model_id,
        data_id=dataset,
        save_path=constants.SAVE_MODEL_DIR,
        file_name=dataset + '_' + model_id,
    )

    # Poisoning candidates
    mw_poisoning_candidates, mw_poisoning_candidates_idx = \
        attack_utils.get_poisoning_candidate_samples(
            original_model, x_test_orig, y_test_orig)

    train_filename_gw = train_files[y_train_orig == 0]
    train_filename_gw_set = set(train_filename_gw)
    test_filename_mw = test_files[y_test_orig == 1]
    test_filename_mw_set = set(test_filename_mw)
    candidate_filename_mw = test_filename_mw[mw_poisoning_candidates_idx]
    candidate_filename_mw_set = set(candidate_filename_mw)

    ind_train_filenames = dict(
        zip(train_filename_gw.tolist(), range(train_filename_gw.shape[0])))
    ind_test_filenames = dict(
        zip(test_filename_mw.tolist(), range(test_filename_mw.shape[0])))

    # From the set of PDF files that were correctly poisoned we need to find
    # only the benign points that are present in the training set and only
    # the malicious points that are present in the test set.

    # Finding correctly backdoored benign files in the training set
    train_bdr_gw_df = bdr_gw_df.copy()
    to_drop = []
    for index, row in bdr_gw_df.iterrows():
        if row['filename'] not in train_filename_gw_set:
            to_drop.append(index)
    train_bdr_gw_df.drop(index=to_drop, inplace=True)
    print(train_bdr_gw_df.shape)

    # Finding correctly backdoored malicious files in the test set.
    # Use elif so an index failing both membership tests is only
    # recorded once (the original could append it twice).
    test_bdr_mw_df = bdr_mw_df.copy()
    to_drop = []
    for index, row in bdr_mw_df.iterrows():
        if row['filename'] not in test_filename_mw_set:
            to_drop.append(index)
        elif row['filename'] not in candidate_filename_mw_set:
            to_drop.append(index)
    test_bdr_mw_df.drop(index=to_drop, inplace=True)
    print(test_bdr_mw_df.shape)

    # We also need to filter from the malware candidates those which are
    # not correctly poisoned. Build the membership set once instead of
    # re-materializing the column list on every iteration (was O(n^2)).
    poisoned_mw_filenames = set(test_bdr_mw_df['filename'].to_list())
    to_keep = [True] * candidate_filename_mw.shape[0]
    for i in range(candidate_filename_mw.shape[0]):
        if candidate_filename_mw[i] not in poisoned_mw_filenames:
            to_keep[i] = False
    candidate_filename_mw = candidate_filename_mw[to_keep]
    mw_poisoning_candidates = mw_poisoning_candidates[to_keep]
    print(mw_poisoning_candidates.shape)

    # Finally we will need a mapping between the name of the poisoned
    # files and the index in the array of the training and test set
    # respectively.
    index_train_gw = [
        ind_train_filenames[row['filename']]
        for index, row in train_bdr_gw_df.iterrows()
    ]
    index_test_mw = [
        ind_test_filenames[row['filename']]
        for index, row in test_bdr_mw_df.iterrows()
    ]
    train_bdr_gw_df['index_array'] = index_train_gw
    test_bdr_mw_df['index_array'] = index_test_mw

    # Attack
    # We need to substitute the feature vectors for the benign files used
    # during the attack with the ones obtained by directly poisoning the
    # PDF files. Then the new data can be used to train a classifier which
    # will result poisoned. Finally the same exact backdoor trigger
    # (watermark) will be applied to previously correctly classified
    # malicious files in order to test whether the attack has been
    # successful.

    def _binarize(preds):
        # Threshold scores at 0.5 into hard 0/1 labels.
        return np.array([1 if pred > 0.5 else 0 for pred in preds])

    f_s = 'combined_shap'
    v_s = 'combined_shap'

    current_exp_name = common_utils.get_exp_name(dataset, model_id, f_s, v_s,
                                                 target)
    print('{}\nCurrent experiment: {}\n{}\n'.format('-' * 80,
                                                    current_exp_name,
                                                    '-' * 80))

    # Create experiment directories
    current_exp_dir = os.path.join('results', current_exp_name)
    current_exp_img_dir = os.path.join(current_exp_dir, 'images')
    if not os.path.exists(current_exp_img_dir):
        os.makedirs(current_exp_img_dir)

    summaries = []

    for poison_size in poison_sizes:
        for iteration in range(iterations):
            # Create copies of the original data
            x_train = np.copy(x_train_orig)
            y_train = np.copy(y_train_orig)
            x_test = np.copy(x_test_orig)
            y_test = np.copy(y_test_orig)
            x_orig_mw_only_test = np.copy(mw_poisoning_candidates)

            x_train_gw = x_train[y_train == 0]
            y_train_gw = y_train[y_train == 0]
            x_train_mw = x_train[y_train == 1]
            y_train_mw = y_train[y_train == 1]

            # Select points to watermark
            train_gw_to_be_watermarked_df = train_bdr_gw_df.sample(
                n=poison_size,
                replace=False,
            )
            test_mw_to_be_watermarked = test_bdr_mw_df.sample(
                n=len(index_test_mw), replace=False)

            # Get the watermarked vectors
            train_gw_to_be_watermarked = train_gw_to_be_watermarked_df[
                'index_array'].to_numpy()
            x_train_gw_to_be_watermarked = train_gw_to_be_watermarked_df.drop(
                labels=['index_array', 'filename'], axis=1).to_numpy()
            y_train_gw_to_be_watermarked = np.zeros_like(
                train_gw_to_be_watermarked)
            x_test_mw = test_mw_to_be_watermarked.drop(
                labels=['index_array', 'filename'], axis=1).to_numpy()

            # Remove old goodware vectors from data matrix
            x_train_gw_no_watermarks = np.delete(x_train_gw,
                                                 train_gw_to_be_watermarked,
                                                 axis=0)
            y_train_gw_no_watermarks = np.delete(y_train_gw,
                                                 train_gw_to_be_watermarked,
                                                 axis=0)

            # Generate final training set
            x_train_watermarked = np.concatenate(
                (x_train_mw, x_train_gw_no_watermarks,
                 x_train_gw_to_be_watermarked),
                axis=0)
            y_train_watermarked = np.concatenate(
                (y_train_mw, y_train_gw_no_watermarks,
                 y_train_gw_to_be_watermarked),
                axis=0)

            # Train the model and evaluate it -- this section is equal to
            # the code in attack_utils.py
            start_time = time.time()
            backdoor_model = model_utils.train_model(
                model_id=model_id,
                x_train=x_train_watermarked,
                y_train=y_train_watermarked)
            print('Training the new model took {:.2f} seconds'.format(
                time.time() - start_time))

            orig_origts_predictions = _binarize(
                original_model.predict(x_orig_mw_only_test))
            orig_mwts_predictions = _binarize(
                original_model.predict(x_test_mw))
            orig_gw_predictions = _binarize(
                original_model.predict(x_train_gw_no_watermarks))
            orig_wmgw_predictions = _binarize(
                original_model.predict(x_train_gw_to_be_watermarked))
            new_origts_predictions = _binarize(
                backdoor_model.predict(x_orig_mw_only_test))
            new_mwts_predictions = _binarize(
                backdoor_model.predict(x_test_mw))

            assert len(x_test_mw) == x_orig_mw_only_test.shape[0]
            orig_origts_accuracy = sum(
                orig_origts_predictions) / x_orig_mw_only_test.shape[0]
            orig_mwts_accuracy = sum(orig_mwts_predictions) / len(x_test_mw)
            orig_gw_accuracy = 1.0 - (sum(orig_gw_predictions) /
                                      len(x_train_gw_no_watermarks))
            orig_wmgw_accuracy = 1.0 - (sum(orig_wmgw_predictions) /
                                        len(x_train_gw_to_be_watermarked))
            new_mwts_accuracy = sum(new_mwts_predictions) / len(x_test_mw)

            num_watermarked_still_mw = sum(orig_mwts_predictions)

            successes = failures = benign_in_both_models = 0
            for orig, new in zip(orig_mwts_predictions, new_mwts_predictions):
                if orig == 0 and new == 1:
                    # We're predicting only on malware samples. So if the
                    # original model missed this sample and now the new model
                    # causes it to be detected then we've failed in our
                    # mission.
                    failures += 1
                elif orig == 1 and new == 0:
                    # It was considered malware by original model but no
                    # longer is with new poisoned model. So we've succeeded
                    # in our mission.
                    successes += 1
                elif new == 0:
                    benign_in_both_models += 1

            # Compute accuracy of new model on clean test set - no need for
            # reconstruction
            bdr_clean_test_pred = _binarize(
                backdoor_model.predict(x_test_orig))
            new_origts_accuracy = accuracy_score(y_test_orig,
                                                 bdr_clean_test_pred)

            # Compute false positives and negatives for both models
            start_time = time.time()
            orig_origts_fpr_fnr = attack_utils.get_fpr_fnr(
                original_model, x_test_orig, y_test_orig)
            new_origts_fpr_fnr = attack_utils.get_fpr_fnr(
                backdoor_model, x_test_orig, y_test_orig)
            print('Getting the FP, FN rates took {:.2f} seconds'.format(
                time.time() - start_time))

            # Save the results
            wm_config = {
                'num_gw_to_watermark': poison_size,
                'num_mw_to_watermark': x_test_mw.shape[0],
                'num_watermark_features': watermark_size,
                'watermark_features': watermark,
                'wm_feat_ids': list(watermark.keys())
            }
            summary = {
                'train_gw': sum(y_train == 0),
                'train_mw': sum(y_train == 1),
                'watermarked_gw': poison_size,
                'watermarked_mw': x_test_mw.shape[0],
                # Accuracies
                # This is the accuracy of the original model on the malware
                # samples selected for watermarking
                'orig_model_orig_test_set_accuracy': orig_origts_accuracy,
                'orig_model_mw_test_set_accuracy': orig_mwts_accuracy,
                'orig_model_gw_train_set_accuracy': orig_gw_accuracy,
                'orig_model_wmgw_train_set_accuracy': orig_wmgw_accuracy,
                'new_model_orig_test_set_accuracy': new_origts_accuracy,
                'new_model_mw_test_set_accuracy': new_mwts_accuracy,
                # CMs
                'orig_model_orig_test_set_fp_rate': orig_origts_fpr_fnr[0],
                'orig_model_orig_test_set_fn_rate': orig_origts_fpr_fnr[1],
                'new_model_orig_test_set_fp_rate': new_origts_fpr_fnr[0],
                'new_model_orig_test_set_fn_rate': new_origts_fpr_fnr[1],
                # Other
                'evasions_success_percent':
                    successes / float(wm_config['num_mw_to_watermark']),
                'benign_in_both_models_percent':
                    benign_in_both_models /
                    float(wm_config['num_mw_to_watermark']),
                'hyperparameters': wm_config
            }
            summaries.append(summary)

            notebook_utils.print_experiment_summary(summary, 'combined_shap',
                                                    None)

            # Free the per-iteration copies before the next round.
            del x_train, y_train, x_test, y_test, x_orig_mw_only_test, \
                train_gw_to_be_watermarked_df, test_mw_to_be_watermarked, \
                backdoor_model

    # DataFrame.append was removed in pandas 2.0 — build single-row frames
    # and concatenate instead.
    summaries_df = pd.DataFrame()
    for s in summaries:
        s_c = copy.deepcopy(s)
        s_h = s_c.pop('hyperparameters')
        s_c['num_watermark_features'] = s_h['num_watermark_features']
        summaries_df = pd.concat([summaries_df, pd.DataFrame([s_c])],
                                 ignore_index=True)
    summaries_df.to_csv(
        os.path.join(current_exp_dir, current_exp_name + '__summary_df.csv'))

    # Plotting
    palette1 = sns.color_palette(
        ['#3B82CE', '#FFCC01', '#F2811D', '#DA4228', '#3BB3A9'])

    to_plot_df = pd.DataFrame()
    for s in summaries:
        wm_gw_pct = '{:.1f}%'.format(s['watermarked_gw'] * 100 /
                                     constants.OGCONTAGIO_TRAIN_SIZE)
        plot_row = {
            constants.human_mapping['watermarked_gw']: wm_gw_pct,
            constants.human_mapping['watermarked_mw']: s['watermarked_mw'],
            constants.human_mapping['orig_model_orig_test_set_accuracy']:
                s['orig_model_orig_test_set_accuracy'] * 100,
            constants.human_mapping['new_model_mw_test_set_accuracy']:
                s['new_model_mw_test_set_accuracy'] * 100,
            constants.human_mapping['num_watermark_features']:
                s['hyperparameters']['num_watermark_features']
        }
        to_plot_df = pd.concat([to_plot_df, pd.DataFrame([plot_row])],
                               ignore_index=True)

    fig = plt.figure(figsize=(12, 8))
    sns.set(style='whitegrid', font_scale=1.4)

    x_col = constants.human_mapping['watermarked_gw']
    y_col = constants.human_mapping['new_model_mw_test_set_accuracy']
    hue_col = constants.human_mapping['num_watermark_features']

    bplt = sns.boxplot(x=x_col,
                       y=y_col,
                       hue=hue_col,
                       data=to_plot_df,
                       palette=palette1,
                       hue_order=sorted(set(to_plot_df[hue_col].to_list())),
                       dodge=True,
                       linewidth=2.5)
    axes = bplt.axes
    axes.set_ylim(-5, 105)

    # Horizontal baseline: the clean model's accuracy (constant across rows).
    hline = constants.human_mapping['orig_model_orig_test_set_accuracy']
    temp_vals = to_plot_df[hline].to_numpy()
    assert np.all(temp_vals == temp_vals[0])
    hline = temp_vals[0]
    axes.axhline(hline,
                 ls='--',
                 color='red',
                 linewidth=2,
                 label='Clean model baseline')

    fixed_col = 'fixed_num_watermark_features'
    fig.savefig(os.path.join(current_exp_img_dir, fixed_col + '.png'),
                bbox_inches='tight')
def get_watermarks(cfg):
    """Generate and persist backdoor watermarks for every strategy/size combo.

    For each watermark size and each (feature selector, value selector) pair
    derived from the config, compute the watermark feature/value mapping using
    SHAP explanations of the original model, save it as JSON under
    ``configs/watermark`` for reuse, and collect all mappings.

    :param cfg: (dict) experiment configuration; expected keys include
        'model', 'watermark_size', 'target_features', 'dataset', 'k_perc',
        'k_data', 'seed', 'feature_selection', 'value_selection'
    :return: (OrderedDict) mapping (feature_selector, value_selector, size)
        tuples to OrderedDicts of {feature_name: watermark_value}
    """
    model_id = cfg['model']
    watermark_sizes = cfg['watermark_size']
    target = cfg['target_features']
    dataset = cfg['dataset']
    k_perc = cfg['k_perc']
    k_data = cfg['k_data']
    seed = cfg['seed']

    # Ensure the output directory for watermark JSON files exists
    wm_dir = 'configs/watermark'
    if not os.path.exists(wm_dir):
        os.makedirs(wm_dir)

    # Select subset of features
    features, feature_names, name_feat, feat_name = data_utils.load_features(
        feats_to_exclude=constants.features_to_exclude[dataset],
        dataset=dataset)

    # Get original model and data. Then setup environment.
    x_train, y_train, x_test, y_test = data_utils.load_dataset(dataset=dataset)
    original_model = model_utils.load_model(
        model_id=model_id,
        data_id=dataset,
        save_path=constants.SAVE_MODEL_DIR,
        file_name=dataset + '_' + model_id,
    )

    # Prepare attacker data: a k_perc fraction of either the train or test
    # split (k_perc == 1.0 short-circuits the split to use the whole set)
    if k_data == 'train':
        if k_perc == 1.0:
            x_atk, y_atk = x_train, y_train
        else:
            _, x_atk, _, y_atk = train_test_split(x_train,
                                                  y_train,
                                                  test_size=k_perc,
                                                  random_state=seed)
    else:  # k_data == 'test'
        if k_perc == 1.0:
            x_atk, y_atk = x_test, y_test
        else:
            _, x_atk, _, y_atk = train_test_split(x_test,
                                                  y_test,
                                                  test_size=k_perc,
                                                  random_state=seed)
    # Background data for the explainer is the attacker data itself
    x_back = x_atk
    print('Attacker data shapes: {} - {}'.format(x_atk.shape, y_atk.shape))

    # Get explanations (SHAP values) of the original model on attacker data
    shap_values_df = model_utils.explain_model(data_id=dataset,
                                               model_id=model_id,
                                               model=original_model,
                                               x_exp=x_atk,
                                               x_back=x_back,
                                               perc=k_perc,
                                               n_samples=1000,
                                               load=False,
                                               save=False)

    # Setup the attack: build feature and value selectors from the config
    f_selectors = attack_utils.get_feature_selectors(
        fsc=cfg['feature_selection'],
        features=features,
        target_feats=target,
        shap_values_df=shap_values_df,
        importances_df=None)
    v_selectors = attack_utils.get_value_selectors(
        vsc=cfg['value_selection'], shap_values_df=shap_values_df)
    print('value selects')
    print(v_selectors)

    feat_value_selector_pairs = common_utils.get_feat_value_pairs(
        f_selectors.keys(), v_selectors.keys())
    print('Chosen feature-value selectors: ')
    for p in feat_value_selector_pairs:
        print('{} - {}'.format(p[0], p[1]))

    strategy_watermarks = OrderedDict()
    for wm_size in watermark_sizes:
        for (f_s, v_s) in feat_value_selector_pairs:
            current_exp_name = common_utils.get_exp_name(
                dataset, model_id, f_s, v_s, target)
            print('{}\nCurrent experiment: {}\n{}\n'.format(
                '-' * 80, current_exp_name, '-' * 80))

            # Strategy: the "combined" criteria use a single selector for
            # both features and values
            feat_selector = f_selectors[f_s]
            value_selector = v_selectors[v_s]
            if f_s == constants.feature_selection_criterion_combined \
                    or f_s == constants.feature_selection_criterion_combined_additive:
                value_selector = feat_selector

            # Let the feature/value selector know about the training set
            if value_selector is None:
                feat_selector.X = x_atk
            elif value_selector.X is None:
                value_selector.X = x_atk

            # Get the feature IDs that we'll use
            start_time = time.time()
            if f_s == constants.feature_selection_criterion_combined \
                    or f_s == constants.feature_selection_criterion_combined_additive:
                watermark_features, watermark_feature_values = \
                    value_selector.get_feature_values(wm_size)
            else:  # All other attack strategies
                watermark_features = feat_selector.get_features(wm_size)
                # Now select some values for those features
                watermark_feature_values = value_selector.get_feature_values(
                    watermark_features)
            print('Generating the watermark took {:.2f} seconds'.format(
                time.time() - start_time))

            # Map feature IDs to human-readable names, preserving order
            watermark_features_map = OrderedDict()
            for feature, value in zip(watermark_features,
                                      watermark_feature_values):
                watermark_features_map[feature_names[feature]] = value
            print(watermark_features_map)
            strategy_watermarks[(f_s, v_s, wm_size)] = watermark_features_map

            # Output the watermark on file for reuse
            wm_file_name = '{}__{}'.format(current_exp_name, str(wm_size))
            wm_file = os.path.join(wm_dir, wm_file_name)
            wm_json = {'order': {}, 'map': {}}
            for i, key in enumerate(reversed(watermark_features_map)):
                wm_json['order'][i] = key
                wm_json['map'][key] = watermark_features_map[key]
            # Use a context manager so the file handle is always closed
            # (the original passed an open() directly into json.dump,
            # leaking the handle)
            with open(wm_file, 'w', encoding='utf-8') as wm_fp:
                json.dump(wm_json, wm_fp, indent=2)

    return strategy_watermarks