def aggregate_results_df(mod, featsel_valsel_pairs, target):
    """ Collect the per-experiment summary DataFrames into one mapping.

    For each (feature selector, value selector) pair the summary CSV is
    looked up at ``results/<exp_name>/<exp_name>__summary_df.csv``. Pairs
    whose file does not exist are reported on stdout and skipped.

    :param mod: (str) identifier of the attacked model
    :param featsel_valsel_pairs: (list) of tuples feature/value selectors
    :param target: (str) identifier of the target features
    :return: (dict) human readable experiment name -> summary DataFrame
    """
    aggregated = {}

    for feat_sel, val_sel in featsel_valsel_pairs:
        name = common_utils.get_exp_name(mod, feat_sel, val_sel, target)
        human_name = common_utils.get_human_exp_name(
            mod, feat_sel, val_sel, target)

        csv_path = os.path.join('results', name, name + '__summary_df.csv')

        # Guard clause: nothing to aggregate for this pair
        if not os.path.exists(csv_path):
            print('WARNING: {} DataFrame not found!'.format(csv_path))
            continue

        print('Gathering data for: {}'.format(name))
        summary_df = pd.read_csv(csv_path)

        # The return value is not used; presumably the frame is updated in
        # place by the helper - verify against common_utils.
        common_utils.recover_accuracy(summary_df)
        aggregated[human_name] = summary_df

    return aggregated
Beispiel #2
0
def isoforest_ember():
    """ Evaluate an Isolation Forest filtering defense on the EMBER results.

    Reads the defense configuration from ``configs/defense_cfg.json`` and the
    per-model ``<model>__def_dict.npy`` dictionaries stored under
    ``results/defense/``. For every stored experiment it loads the attack
    data and the backdoored model, reduces the training set to 32 features
    chosen by a ``ShapleyFeatureSelector``, runs
    ``isolation_forest_analysis`` on the reduced ``x_gw_sel`` points, drops
    the label-0 training points whose prediction is not 1 and re-evaluates
    with ``defense_filtering.evaluate_filtering``.

    One row per experiment is collected; the resulting table is printed and
    written to ``table_isof.csv`` in the current working directory.

    :return: None
    """
    data_id = 'ember'

    features, _, _, _ = data_utils.load_features(
        constants.infeasible_features, data_id)

    models = ['lightgbm', 'embernn']
    base_def_dir = 'results/defense/'

    def_cfg = common_utils.read_config('configs/defense_cfg.json', False)
    print(def_cfg)

    target = def_cfg['target_features']

    # Only the first configured poison size is considered here
    is_clean = defense_utils.get_is_clean(def_cfg['poison_size'][0])
    print(is_clean.shape, sum(is_clean))
    bdr_indices = set(np.argwhere(is_clean == 0).flatten().tolist())
    print(len(bdr_indices))

    # ## Load results

    def_res = {}
    for mod in models:
        res = np.load(os.path.join(base_def_dir, mod + '__def_dict.npy'),
                      allow_pickle=True)
        # The file holds a pickled dict wrapped in a 0-d array; [()] unwraps it
        res = res[()]
        # Prefix every key with the model name so both models share one dict
        res = {(mod, *key): val for key, val in res.items()}
        def_res.update(res)

    # ## Analysis

    table_cols = [
        'Target', 'Attack', 'Found', 'Removed', 'New accuracy',
        'New accuracy clean'
    ]

    # Rows are accumulated in a list and turned into a DataFrame once at the
    # end: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
    table_rows = []

    # Keys are unique, so sorting the keys gives the same order as sorting
    # the (key, value) items.
    for key in sorted(def_res, reverse=True):
        mod = key[0]
        f_s = key[3]
        v_s = key[4]
        # presumably watermark size and poison size - they name the
        # sub-directories of the defense results
        w_s = int(key[1])
        p_s = int(key[2])

        def_dir = os.path.join(base_def_dir, str(w_s), str(p_s))
        current_exp_name = common_utils.get_exp_name(data_id, mod, f_s, v_s,
                                                     target)
        current_exp_dir = os.path.join(def_dir, current_exp_name)

        # The human readable name is split on '-' into target and attack
        name_parts = common_utils.get_human_exp_name(
            mod, f_s, v_s, target).split('-')
        human_target = name_parts[0]
        human_exp_name = name_parts[1]

        print('-' * 80)
        print('Experiment name: {}'.format(current_exp_name))
        print('Human name: {}\n'.format(human_exp_name))

        # Generate table entries
        entry_iso = {
            table_cols[0]: human_target,
            table_cols[1]: human_exp_name,
        }

        # Load attack data
        wm_config = np.load(os.path.join(current_exp_dir, 'wm_config.npy'),
                            allow_pickle=True)[()]
        wm_features = wm_config['watermark_features']
        print('Watermark information')
        print(wm_features)
        print(len(list(wm_features.keys())))
        print(sorted(list(wm_features.keys())))
        print()

        x_train_w, y_train_w, x_test_mw = defense_utils.load_attack_data(
            current_exp_dir)
        backdoor_model = defense_filtering.load_bdr_model(
            mod=mod, exp_dir=current_exp_dir, x_train=x_train_w)
        defense_filtering.print_bdr_baseline(x_test_mw, backdoor_model)

        # Dimensionality reduction - Get n most important features
        x_safe, _, safe_model = defense_utils.get_safe_dataset_model(
            mod, safe_pct=0.2, rand=42)
        shap_values_df = defense_utils.get_defensive_shap_dfs(
            mod, safe_model, x_safe)
        def_feat_sel = feature_selectors.ShapleyFeatureSelector(
            shap_values_df,
            criteria=constants.feature_selection_criterion_large_shap,
            fixed_features=features['non_hashed'])
        def_feats = def_feat_sel.get_features(32)

        _, x_gw_sel, _ = defense_utils.reduce_to_feats(
            x_train_w, def_feats, y_train_w)

        # Isolation Forest analysis
        isof_pred, suspect, poison_found, _ = isolation_forest_analysis(
            xtrain=x_gw_sel, is_clean=is_clean)

        print()
        print('Isolation Forest - sel removed points: {}'.format(suspect))
        print('Isolation Forest - sel found: {}'.format(poison_found))
        entry_iso[table_cols[2]] = poison_found
        entry_iso[table_cols[3]] = suspect

        # New evaluation
        gw_mask = y_train_w == 0
        mw_mask = y_train_w == 1
        y_train_w_gw = y_train_w[gw_mask]
        y_train_w_mw = y_train_w[mw_mask]
        x_train_w_gw = x_train_w[gw_mask]
        x_train_w_mw = x_train_w[mw_mask]

        # Keep only the label-0 points the Isolation Forest predicted as 1
        kept = isof_pred == 1
        x_train_w_gw_filtered = x_train_w_gw[kept]
        y_train_w_gw_filtered = y_train_w_gw[kept]

        x_filtered = np.concatenate((x_train_w_mw, x_train_w_gw_filtered),
                                    axis=0)
        y_filtered = np.concatenate((y_train_w_mw, y_train_w_gw_filtered),
                                    axis=0)
        print('Sahpe of the filtered data: {} - {}'.format(
            x_filtered.shape, y_filtered.shape))

        cr_clean, _, cr_backdoor, _ = defense_filtering.evaluate_filtering(
            mod=mod,
            x_train_w_sampled=x_filtered,
            y_train_w_sampled=y_filtered,
            x_test_mw=x_test_mw,
            current_exp_dir='')

        entry_iso[table_cols[4]] = cr_backdoor['accuracy']
        entry_iso[table_cols[5]] = cr_clean['accuracy']

        # Append entries to table
        table_rows.append(entry_iso)

        print('-' * 80)
        print()

    latexdf = pd.DataFrame(table_rows, columns=table_cols)
    print(latexdf)

    latexdf.to_csv('table_isof.csv', index=False)
def generate_watermark():
    """ Generate watermarks from KernelSHAP explanations and dump them as JSON.

    Steps performed:

    1. Read ``configs/attack_cfg_kernelshap.json`` and load the EMBER
       features and training data.
    2. Hold out ``safe_percentage`` of the training data (fixed seed), train
       a model on it and summarize it with ``shap.kmeans`` (30 clusters).
    3. Explain the held-out data with ``shap.KernelExplainer`` restricted to
       ``features['feasible']`` and write the explanations back into a copy
       of the held-out matrix.
    4. Build the configured feature/value selectors and, for each pair,
       select ``wm_size`` features and values and write them to
       ``configs/watermark/<exp_name>__<wm_size>`` as JSON.

    Side effects: several ``.npy`` files are written to the current working
    directory, experiment directories are created under ``../results`` and
    all Python warnings are silenced process-wide.

    :return: None
    """
    seed = 24
    safe_percentage = 0.2
    data_id = 'ember'

    cfg = common_utils.read_config('configs/attack_cfg_kernelshap.json',
                                   atk_def=True)
    cfg['to_json'] = True
    print(cfg)

    mod = cfg['model']
    target = cfg['target_features']
    # Only the first configured watermark size is used
    wm_size = cfg['watermark_size'][0]

    features, feature_names, _, _ = data_utils.load_features(
        constants.infeasible_features, data_id)

    # Select the defensive features using clean SHAP values
    x_train, y_train, _, _, _ = attack_utils.get_ember_train_test_model()

    _, x_limited, _, y_limited = train_test_split(x_train,
                                                  y_train,
                                                  test_size=safe_percentage,
                                                  random_state=seed)
    print(x_limited.shape, y_limited.shape)

    limited_model = notebook_utils.train_model(x_limited, y_limited)

    data_summ = shap.kmeans(x_limited, 30)

    inside_data = data_summ.data

    np.save('kmeans_30_xtrain_limited', inside_data)

    x_train_sel = x_limited[:, features['feasible']]
    print(x_train_sel.shape)
    clusters_sel = inside_data[:, features['feasible']]
    print(clusters_sel.shape)

    # NOTE: this disables every warning for the rest of the process
    import warnings
    warnings.filterwarnings('ignore')

    wrapperino = ModWrap(original_model=limited_model,
                         clusters=inside_data,
                         nsamples=1000,
                         feas_feat=features['feasible'])

    explainer = shap.KernelExplainer(wrapperino.predict,
                                     clusters_sel,
                                     link='logit')

    exp = explainer.shap_values(x_train_sel, nsamples=200)

    np.save('explanations_limited', exp)

    # Start from a copy of the data and overwrite the feasible columns with
    # their SHAP values; the remaining columns keep the raw feature values.
    reconstructed_shap = np.copy(x_limited)
    print(reconstructed_shap.shape)

    reconstructed_shap[:, features['feasible']] = exp

    # Sanity check that the column assignment landed where expected
    assert np.allclose(reconstructed_shap[0][features['feasible'][16]],
                       exp[0][16])

    np.save('reconstucted_shaps_limited', reconstructed_shap)

    shap_values_df = pd.DataFrame(reconstructed_shap)

    # ## Setup

    wm_dir = 'configs/watermark'
    os.makedirs(wm_dir, exist_ok=True)

    f_selectors = attack_utils.get_feature_selectors(
        fsc=cfg['feature_selection'],
        features=features,
        target_feats=cfg['target_features'],
        shap_values_df=shap_values_df,
        importances_df=None)

    v_selectors = attack_utils.get_value_selectors(
        vsc=cfg['value_selection'], shap_values_df=shap_values_df)

    feat_value_selector_pairs = common_utils.get_feat_value_pairs(
        feat_sel=list(f_selectors.keys()), val_sel=list(v_selectors.keys()))

    print(feat_value_selector_pairs)

    for (f_s, v_s) in feat_value_selector_pairs:
        current_exp_name = common_utils.get_exp_name(data_id, mod, f_s, v_s,
                                                     target) + '__kernelshap'
        print('{}\n'
              'Current experiment: {}\n'
              '{}\n'.format('-' * 80, current_exp_name, '-' * 80))

        # Create experiment directories
        current_exp_dir = os.path.join('../results', current_exp_name)
        current_exp_img_dir = os.path.join(current_exp_dir, 'images')
        os.makedirs(current_exp_img_dir, exist_ok=True)

        # Strategy
        feat_selector = f_selectors[f_s]
        value_selector = v_selectors[v_s]

        # The combined strategy picks features and values with one selector
        is_combined = f_s == constants.feature_selection_criterion_combined
        if is_combined:
            value_selector = feat_selector

        # Let feature value selector know about the training set
        if value_selector.X is None:
            value_selector.X = x_limited

        # Get the feature IDs that we'll use
        start_time = time.time()
        if is_combined:
            watermark_features, watermark_feature_values = value_selector.get_feature_values(
                wm_size)

        else:  # All other attack strategies
            watermark_features = feat_selector.get_features(wm_size)
            print('Selecting watermark features took {:.2f} seconds'.format(
                time.time() - start_time))

            # Now select some values for those features
            start_time = time.time()
            watermark_feature_values = value_selector.get_feature_values(
                watermark_features)

        print('Selecting watermark feature values took {:.2f} seconds'.format(
            time.time() - start_time))

        watermark_features_map = OrderedDict()
        for feature, value in zip(watermark_features,
                                  watermark_feature_values):
            watermark_features_map[feature_names[feature]] = value
        print(watermark_features_map)

        # Output the watermark on file for reuse
        if cfg['to_json']:
            wm_file_name = '{}__{}'.format(current_exp_name, str(wm_size))
            wm_file = os.path.join(wm_dir, wm_file_name)
            wm_json = {'order': {}, 'map': {}}

            # 'order' lists the feature names in reverse selection order
            for i, key in enumerate(reversed(watermark_features_map)):
                wm_json['order'][i] = key
                wm_json['map'][key] = str(watermark_features_map[key])

            # Context manager guarantees the file is flushed and closed
            with open(wm_file, 'w', encoding='utf-8') as wm_fh:
                json.dump(wm_json, wm_fh, indent=2)
Beispiel #4
0
def run_attacks(cfg):
    """ Execute every configured feature/value selector attack.

    Loads the dataset and the original model, computes SHAP values on the
    attacker's share of the data, builds the feature and value selectors and
    runs ``attack_utils.run_experiments`` for each selector pair. Unless
    ``cfg['defense']`` is truthy the summary of each pair is written to
    ``results/<exp_name>/<exp_name>__summary_df.csv``.

    :param cfg: (dict) experiment parameters
    """

    print('Config: {}\n'.format(cfg))

    model_id = cfg['model']
    seed = cfg['seed']
    to_save = cfg.get('save', '')
    target = cfg['target_features']
    dataset = cfg['dataset']
    k_perc = cfg['k_perc']
    k_data = cfg['k_data']

    # Set random seed
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

    # Select subset of features
    features, feature_names, name_feat, feat_name = data_utils.load_features(
        feats_to_exclude=constants.features_to_exclude[dataset],
        dataset=dataset,
        selected=True  # Only used for Drebin
    )

    # Get original model and data. Then setup environment.
    x_train, y_train, x_test, y_test = data_utils.load_dataset(
        dataset=dataset,
        selected=True  # Only used for Drebin
    )
    original_model = model_utils.load_model(
        model_id=model_id,
        data_id=dataset,
        save_path=constants.SAVE_MODEL_DIR,
        file_name=dataset + '_' + model_id,
    )

    # Prepare attacker data: any k_data other than 'train' means the test set
    if k_data == 'train':
        x_pool, y_pool = x_train, y_train
    else:
        x_pool, y_pool = x_test, y_test

    if k_perc == 1.0:
        # The attacker knows the whole pool
        x_atk, y_atk = x_pool, y_pool
    else:
        _, x_atk, _, y_atk = train_test_split(
            x_pool, y_pool, test_size=k_perc, random_state=seed)

    shapes_msg = (
        'Dataset shapes:\n'
        '\tTrain x: {}\n'
        '\tTrain y: {}\n'
        '\tTest x: {}\n'
        '\tTest y: {}\n'
        '\tAttack x: {}\n'
        '\tAttack y: {}'
    )
    print(shapes_msg.format(
        x_train.shape, y_train.shape, x_test.shape, y_test.shape,
        x_atk.shape, y_atk.shape
    ))

    # Get explanations; the attacker data doubles as the background set
    shap_start = time.time()
    shap_values_df = model_utils.explain_model(
        data_id=dataset,
        model_id=model_id,
        model=original_model,
        x_exp=x_atk,
        x_back=x_atk,
        perc=1.0,
        n_samples=100,
        load=False,
        save=False
    )
    print('Getting SHAP took {:.2f} seconds\n'.format(time.time() - shap_start))

    # Setup the attack
    f_selectors = attack_utils.get_feature_selectors(
        fsc=cfg['feature_selection'],
        features=features,
        target_feats=target,
        shap_values_df=shap_values_df,
        importances_df=None  # Deprecated
    )
    print(f_selectors)

    v_selectors = attack_utils.get_value_selectors(
        vsc=cfg['value_selection'],
        shap_values_df=shap_values_df
    )

    feat_value_selector_pairs = common_utils.get_feat_value_pairs(
        feat_sel=list(f_selectors.keys()),
        val_sel=list(v_selectors.keys())
    )

    print('Chosen feature-value selectors: ')
    for pair in feat_value_selector_pairs:
        print('{} - {}'.format(pair[0], pair[1]))

    # If Drebin reload dataset with full features
    if dataset == 'drebin':
        x_train, y_train, x_test, y_test = data_utils.load_dataset(
            dataset=dataset,
            selected=False
        )

    # Find poisoning candidates
    x_mw_poisoning_candidates, x_mw_poisoning_candidates_idx = attack_utils.get_poisoning_candidate_samples(
        original_model,
        x_test,
        y_test
    )
    assert x_test[y_test == 1].shape[0] == x_mw_poisoning_candidates_idx.shape[0]

    # Attack loop
    separator = '-' * 80
    for f_s, v_s in feat_value_selector_pairs:
        exp_name = common_utils.get_exp_name(dataset, model_id, f_s, v_s, target)
        print('{}\nCurrent experiment: {}\n{}\n'.format(separator, exp_name, separator))

        # Create experiment directories
        exp_dir = os.path.join('results', exp_name)
        exp_img_dir = os.path.join(exp_dir, 'images')
        if not os.path.exists(exp_img_dir):
            os.makedirs(exp_img_dir)

        # Strategy
        feat_selector = f_selectors[f_s]
        value_selector = v_selectors[v_s]

        # Accumulator
        summaries = []
        tick = time.time()

        # Watermarks are only persisted when a save location was configured
        save_watermarks = ''
        if to_save:
            save_watermarks = os.path.join(to_save, exp_name)
            if not os.path.exists(save_watermarks):
                os.makedirs(save_watermarks)

        experiments = attack_utils.run_experiments(
            X_mw_poisoning_candidates=x_mw_poisoning_candidates,
            X_mw_poisoning_candidates_idx=x_mw_poisoning_candidates_idx,
            gw_poison_set_sizes=cfg['poison_size'],
            watermark_feature_set_sizes=cfg['watermark_size'],
            feat_selectors=[feat_selector, ],
            feat_value_selectors=[value_selector, ],
            iterations=cfg['iterations'],
            save_watermarks=save_watermarks,
            model_id=model_id,
            dataset=dataset
        )
        for summary in experiments:
            attack_utils.print_experiment_summary(
                summary,
                feat_selector.name,
                value_selector.name if value_selector is not None else feat_selector.name
            )
            summaries.append(summary)

            # Report the time of this experiment, then restart the clock
            print('Exp took {:.2f} seconds\n'.format(time.time() - tick))
            tick = time.time()

        # Create DataFrame out of results accumulator and save it
        summaries_df = attack_utils.create_summary_df(summaries)
        print(summaries_df)

        # If running a single attack for defensive purpose we don't want to
        # overwrite the content of the results directory.
        if not cfg.get('defense', False):
            summary_path = os.path.join(exp_dir, exp_name + '__summary_df.csv')
            summaries_df.to_csv(summary_path)
Beispiel #5
0
def filtering_defense(cfg):
    """ Run the clustering-based and spectral filtering defenses.

    For every combination of watermark size, poison size and
    feature/value selector pair found in ``cfg`` the function:

    * runs the attack first if its data is not available yet,
    * selects the ``cfg['topfeats']`` features with a
      ``ShapleyFeatureSelector`` and reduces the poisoned training set to
      them,
    * clusters the standardized ``x_gw_sel`` points, filters the clusters
      and re-evaluates a model on the filtered data,
    * repeats the evaluation after removing the two point lists returned by
      ``defense_utils.spectral_remove_lists`` (stored with the ``gh`` and
      ``pa`` suffixes).

    The collected results are saved with ``np.save`` to
    ``results/defense/<model>__def_dict`` and returned.

    :param cfg: (dict) defense parameters; the keys read here are ``seed``,
        ``model``, ``clustering``, ``target_features``, ``safe``,
        ``watermark_size``, ``poison_size``, ``feature_selection``,
        ``value_selection``, ``t_max``, ``min_keep``, ``mcs``, ``ms`` and
        ``topfeats``
    :return: (defaultdict) mapping ``(w_s, p_s, f_s, v_s)`` to a dict of
        results
    """
    # Setup
    seed = cfg['seed']
    np.random.seed(seed)
    random.seed(seed)
    mod = cfg['model']
    method = cfg['clustering']
    target = cfg['target_features']
    safe_mode = cfg['safe']

    base_def_dir = 'results/defense'
    os.makedirs(base_def_dir, exist_ok=True)

    watermark_sizes = cfg['watermark_size']
    poison_sizes = cfg['poison_size']
    feature_selection = cfg['feature_selection']
    value_selection = cfg['value_selection']

    results = defaultdict(dict)

    features, feature_names, name_feat, feat_name = \
        data_utils.load_features(
            constants.infeasible_features
        )

    feat_value_selector_pairs = common_utils.get_feat_value_pairs(
        feat_sel=list(feature_selection),
        val_sel=list(value_selection)
    )

    # Defense parameters, expressed in the config as fractions of the
    # EMBER training set size
    t_max_size = cfg['t_max'] * constants.EMBER_TRAIN_SIZE
    min_keep_percentage = cfg['min_keep']
    mcs = int(cfg['mcs'] * constants.EMBER_TRAIN_SIZE)
    ms = int(cfg['ms'] * constants.EMBER_TRAIN_SIZE)
    print(
        'Minimum cluster size: {}\n'
        'Minimum samples: {}'.format(
            mcs, ms
        )
    )

    for w_s in watermark_sizes:
        for p_s in poison_sizes:
            is_clean = defense_utils.get_is_clean(p_s)
            bdr_indices = set(np.argwhere(is_clean == 0).flatten().tolist())

            for (f_s, v_s) in feat_value_selector_pairs:
                # Generate current exp/dir names
                def_dir = os.path.join(base_def_dir, str(w_s), str(p_s))
                current_exp_name = common_utils.get_exp_name(
                    mod, f_s, v_s, target
                )
                current_exp_dir = os.path.join(def_dir, current_exp_name)

                # Check if attack data is available
                if not check_data(def_dir, current_exp_name):
                    # Run the attack restricted to the current combination
                    cfg_copy = copy.deepcopy(cfg)
                    cfg_copy['watermark_size'] = [w_s, ]
                    cfg_copy['poison_size'] = [p_s, ]
                    cfg_copy['feature_selection'] = [f_s, ]
                    cfg_copy['value_selection'] = [v_s, ]
                    run_single_attack(cfg_copy, def_dir)

                # Prepare feature importance/SHAPs DataFrame
                # NOTE(review): neither branch uses the loop variables, so
                # this could probably be computed once outside the loops -
                # confirm run_single_attack does not influence it first.
                if safe_mode:  # Assume small percentage of safe data
                    x_safe, y_safe, safe_model = defense_utils.get_safe_dataset_model(
                        mod, safe_pct=0.2, rand=seed
                    )
                    shap_values_df = defense_utils.get_defensive_shap_dfs(
                        mod,
                        safe_model,
                        x_safe
                    )

                else:  # Assume defender has access to full clean model/data
                    shap_values_df = get_original_shap(mod, feature_names)

                # Load attack data
                x_train_w, y_train_w, x_test_mw = \
                    defense_utils.load_attack_data(
                        current_exp_dir
                    )
                backdoor_model = load_bdr_model(
                    mod=mod,
                    exp_dir=current_exp_dir,
                    x_train=x_train_w
                )

                # Baselines on the attacked model
                print_bdr_baseline(x_test_mw, backdoor_model)

                # Get n most important features
                def_feat_sel = feature_selectors.ShapleyFeatureSelector(
                    shap_values_df,
                    criteria=constants.feature_selection_criterion_large_shap,
                    fixed_features=features['non_hashed']
                )

                # Read from `cfg`: the function has no `config` name in scope
                def_feats = def_feat_sel.get_features(cfg['topfeats'])
                print('Top {} selected defensive features:\n{}'.format(
                    cfg['topfeats'], def_feats
                ))

                # Dimensionality reduction through feature selection
                x_sel, x_gw_sel, x_mw_sel = defense_utils.reduce_to_feats(
                    x_train_w,
                    def_feats,
                    y_train_w
                )
                assert x_sel.shape[0] == x_train_w.shape[0]
                assert x_sel.shape[1] == cfg['topfeats']

                x_gw_sel_std = defense_utils.standardize_data(x_gw_sel)

                print('-' * 80)
                print('Current experiment: {}'.format(current_exp_name))
                print('-' * 80)

                # Clustering
                clustering, clustering_labels = defensive_clustering(
                    method=method,
                    x_gw=x_gw_sel_std,
                    mcs=mcs,
                    ms=ms,
                    current_exp_dir=current_exp_dir
                )

                # Cluster analysis
                silh, avg_silh, cluster_sizes, evals = cluster_analysis(
                    x_gw=x_gw_sel_std,
                    clustering_labels=clustering_labels,
                    is_clean=is_clean,
                    current_exp_dir=current_exp_dir
                )

                # Filter
                x_train_w_sampled, y_train_w_sampled, selected, selected_per_cluster = filter_clusters(
                    x_train_w=x_train_w,
                    y_train_w=y_train_w,
                    avg_silh=avg_silh,
                    cluster_sizes=cluster_sizes,
                    clustering_labs=clustering_labels,
                    threshold_max_size=t_max_size,
                    min_keep_percentage=min_keep_percentage
                )

                # All results of this combination go into one dict; the
                # defaultdict creates it on first access.
                entry = results[(w_s, p_s, f_s, v_s)]
                entry['selected'] = selected
                entry['selected_per_cluster'] = selected_per_cluster

                # Evaluation
                cr_clean, cm_clean, cr_backdoor, cm_backdoor = evaluate_filtering(
                    mod=mod,
                    x_train_w_sampled=x_train_w_sampled,
                    y_train_w_sampled=y_train_w_sampled,
                    x_test_mw=x_test_mw,
                    current_exp_dir=current_exp_dir,
                )
                entry['cr_clean'] = cr_clean
                entry['cm_clean'] = cm_clean
                entry['cr_backdoor'] = cr_backdoor
                entry['cm_backdoor'] = cm_backdoor

                # Spectral signatures-like approach
                to_remove_gh, to_remove_pa, found_gh, found_pa = defense_utils.spectral_remove_lists(
                    x_gw_sel_std, bdr_indices
                )
                entry['to_remove_gh'] = to_remove_gh
                entry['to_remove_pa'] = to_remove_pa
                entry['found_gh'] = found_gh
                entry['found_pa'] = found_pa

                x_train_w_filtered_gh, y_train_w_filtered_gh = defense_utils.filter_list(
                    x_train_w,
                    y_train_w,
                    to_remove_gh
                )

                cr_clean_gh, cm_clean_gh, cr_backdoor_gh, cm_backdoor_gh = evaluate_filtering(
                    mod=mod,
                    x_train_w_sampled=x_train_w_filtered_gh,
                    y_train_w_sampled=y_train_w_filtered_gh,
                    x_test_mw=x_test_mw,
                    current_exp_dir=current_exp_dir,
                    modifier='gh'
                )
                entry['cr_clean_gh'] = cr_clean_gh
                entry['cm_clean_gh'] = cm_clean_gh
                entry['cr_backdoor_gh'] = cr_backdoor_gh
                entry['cm_backdoor_gh'] = cm_backdoor_gh

                x_train_w_filtered_pa, y_train_w_filtered_pa = defense_utils.filter_list(
                    x_train_w,
                    y_train_w,
                    to_remove_pa
                )

                cr_clean_pa, cm_clean_pa, cr_backdoor_pa, cm_backdoor_pa = evaluate_filtering(
                    mod=mod,
                    x_train_w_sampled=x_train_w_filtered_pa,
                    y_train_w_sampled=y_train_w_filtered_pa,
                    x_test_mw=x_test_mw,
                    current_exp_dir=current_exp_dir,
                    modifier='pa'
                )
                entry['cr_clean_pa'] = cr_clean_pa
                entry['cm_clean_pa'] = cm_clean_pa
                entry['cr_backdoor_pa'] = cr_backdoor_pa
                entry['cm_backdoor_pa'] = cm_backdoor_pa

    np.save(os.path.join(base_def_dir, mod + '__def_dict'), results)

    return results
Beispiel #6
0
def run_attacks(cfg):
    """ Run the attack with a previously saved, fixed watermark.

    The watermark is loaded from ``cfg['wm_file']`` and wrapped in the
    ``fix`` feature selector, which is used both as feature selector and as
    value selector. Unless ``cfg['defense']`` is truthy the summary is
    written to ``results/<exp_name>/<exp_name>__summary_df.csv``.

    :param cfg: (dict) experiment parameters

    """

    print('Config: {}\n'.format(cfg))

    model_id = cfg['model']
    seed = cfg['seed']
    to_save = cfg.get('save', '')
    target = cfg['target_features']
    dataset = cfg['dataset']

    # Workaround until we fix ordering of feature selector outputs
    wm_size = cfg['watermark_size'][0]

    # Set random seed
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

    # Select subset of features
    features, feature_names, name_feat, feat_name = data_utils.load_features(
        feats_to_exclude=constants.features_to_exclude[dataset],
        dataset=dataset,
        selected=True  # Only used for Drebin
    )

    # Get original model and data. Then setup environment.
    x_train, y_train, x_test, y_test = data_utils.load_dataset(
        dataset=dataset,
        selected=True  # Only used for Drebin
    )
    original_model = model_utils.load_model(
        model_id=model_id,
        data_id=dataset,
        save_path=constants.SAVE_MODEL_DIR,
        file_name=dataset + '_' + model_id,
    )

    # Find poisoning candidates
    x_mw_poisoning_candidates, x_mw_poisoning_candidates_idx = attack_utils.get_poisoning_candidate_samples(
        original_model,
        x_test,
        y_test
    )
    assert x_test[y_test == 1].shape[0] == x_mw_poisoning_candidates_idx.shape[0]

    # Load saved watermark
    fixed_wm = attack_utils.load_watermark(cfg['wm_file'], wm_size, name_feat)

    # Setup the attack
    fix_feat_criterion = constants.feature_selection_criterion_fix
    f_selectors = attack_utils.get_feature_selectors(
        fsc=[fix_feat_criterion, ],
        features=features,
        target_feats=target,
        shap_values_df=None,
        importances_df=None,
        feature_value_map=fixed_wm
    )

    # A single pair: the fixed watermark for both features and values
    feat_value_selector_pairs = [
        (fix_feat_criterion, constants.value_selection_criterion_fix),
    ]

    print('Chosen feature-value selectors: ')
    for pair in feat_value_selector_pairs:
        print('{} - {}'.format(pair[0], pair[1]))

    # Attack loop
    separator = '-' * 80
    for f_s, v_s in feat_value_selector_pairs:
        exp_name = common_utils.get_exp_name(dataset, model_id, f_s, v_s, target)
        print('{}\nCurrent experiment: {}\n{}\n'.format(separator, exp_name, separator))

        # Create experiment directories
        exp_dir = os.path.join('results', exp_name)
        exp_img_dir = os.path.join(exp_dir, 'images')
        if not os.path.exists(exp_img_dir):
            os.makedirs(exp_img_dir)

        # Strategy: the fixed selector provides both features and values
        feat_selector = f_selectors[f_s]
        value_selector = feat_selector

        # Accumulator
        summaries = []
        tick = time.time()

        # Watermarks are only persisted when a save location was configured
        save_watermarks = ''
        if to_save:
            save_watermarks = os.path.join(to_save, exp_name)
            if not os.path.exists(save_watermarks):
                os.makedirs(save_watermarks)

        experiments = attack_utils.run_experiments(
            X_mw_poisoning_candidates=x_mw_poisoning_candidates,
            X_mw_poisoning_candidates_idx=x_mw_poisoning_candidates_idx,
            gw_poison_set_sizes=cfg['poison_size'],
            watermark_feature_set_sizes=[wm_size, ],
            feat_selectors=[feat_selector, ],
            feat_value_selectors=[value_selector, ],
            iterations=cfg['iterations'],
            save_watermarks=save_watermarks,
            model_id=model_id,
            dataset=dataset
        )
        for summary in experiments:
            attack_utils.print_experiment_summary(
                summary,
                feat_selector.name,
                value_selector.name if value_selector is not None else feat_selector.name
            )
            summaries.append(summary)

            # Report the time of this experiment, then restart the clock
            print('Exp took {:.2f} seconds\n'.format(time.time() - tick))
            tick = time.time()

        # Create DataFrame out of results accumulator and save it
        summaries_df = attack_utils.create_summary_df(summaries)
        print(summaries_df)

        # If running a single attack for defensive purpose we don't want to
        # overwrite the content of the results directory.
        if not cfg.get('defense', False):
            summary_path = os.path.join(exp_dir, exp_name + '__summary_df.csv')
            summaries_df.to_csv(summary_path)
def _binarize_predictions(predictions):
    """ Threshold raw model scores into hard 0/1 labels.

    :param predictions: (iterable) scores returned by a model's predict()
    :return: (ndarray) 1 where the score is strictly above 0.5, 0 otherwise
    """
    return np.array([1 if pred > 0.5 else 0 for pred in predictions])


def evaluate_backdoor():
    """ Evaluate the backdoor using feature vectors of really poisoned PDFs.

    The configuration is read from 'configs/ogcontagio_fig5.json'. For every
    poison size and iteration, the clean goodware training vectors of a random
    sample of files are replaced by the vectors stored in the
    'bdr_gw_<watermark>' CSV, a new model is trained on the result, and both
    the original and the new model are evaluated on the vectors stored in the
    'bdr_mw_<watermark>' CSV and on the clean test set.

    Side effects: prints progress information, writes
    '<exp_name>__summary_df.csv' into the experiment results directory and
    saves a box plot as 'images/fixed_num_watermark_features.png' below it.

    :return: None
    """
    # ## Config

    cfg = common_utils.read_config('configs/ogcontagio_fig5.json',
                                   atk_def=True)

    cfg['seed'] = 42
    print(cfg)

    model_id = cfg['model']
    target = cfg['target_features']
    dataset = cfg['dataset']
    poison_sizes = cfg['poison_size']
    iterations = cfg['iterations']
    watermark_size = cfg['watermark_size'][0]

    # Data

    x_train_orig, y_train_orig, x_test_orig, y_test_orig = \
        data_utils.load_dataset(dataset=dataset)
    train_files, test_files = data_utils.load_pdf_train_test_file_names()

    print(x_train_orig.shape, x_test_orig.shape)

    wm_name = 'ogcontagio__pdfrf__combined_shap__combined_shap__feasible__30'

    watermark = dict(
        attack_utils.load_watermark(wm_file='configs/watermark/' + wm_name,
                                    wm_size=16))

    bdr_gw_df = pd.read_csv(
        os.path.join(constants.SAVE_FILES_DIR,
                     'bdr_{}_{}'.format('gw', wm_name)))
    bdr_mw_df = pd.read_csv(
        os.path.join(constants.SAVE_FILES_DIR,
                     'bdr_{}_{}'.format('mw', wm_name)))

    # Model

    original_model = model_utils.load_model(
        model_id=model_id,
        data_id=dataset,
        save_path=constants.SAVE_MODEL_DIR,
        file_name=dataset + '_' + model_id,
    )

    # Poisoning candidates

    mw_poisoning_candidates, mw_poisoning_candidates_idx = \
        attack_utils.get_poisoning_candidate_samples(
            original_model, x_test_orig, y_test_orig)

    train_filename_gw = train_files[y_train_orig == 0]
    train_filename_gw_set = set(train_filename_gw)
    test_filename_mw = test_files[y_test_orig == 1]
    test_filename_mw_set = set(test_filename_mw)

    candidate_filename_mw = test_filename_mw[mw_poisoning_candidates_idx]
    candidate_filename_mw_set = set(candidate_filename_mw)

    # File name -> position inside the goodware-only training rows and the
    # malware-only test rows respectively.
    ind_train_filenames = dict(
        zip(train_filename_gw.tolist(), range(train_filename_gw.shape[0])))
    ind_test_filenames = dict(
        zip(test_filename_mw.tolist(), range(test_filename_mw.shape[0])))

    # From the set of PDF files that were correctly poisoned we need to find
    # only the benign points that are present in the training set and only the
    # malicious points that are present in the test set.

    # Finding correctly backdoored benign files in the training set
    gw_in_train = bdr_gw_df['filename'].isin(train_filename_gw_set)
    train_bdr_gw_df = bdr_gw_df[gw_in_train].copy()

    print(train_bdr_gw_df.shape)

    # Finding correctly backdoored malicious files in the test set; a file is
    # kept only if it is both a test malware and a poisoning candidate.
    mw_usable = (bdr_mw_df['filename'].isin(test_filename_mw_set)
                 & bdr_mw_df['filename'].isin(candidate_filename_mw_set))
    test_bdr_mw_df = bdr_mw_df[mw_usable].copy()

    print(test_bdr_mw_df.shape)

    # We also need to filter from the malware candidates those which are not
    # correctly poisoned. The lookup set is built once, outside the loop.
    poisoned_mw_filenames = set(test_bdr_mw_df['filename'])
    to_keep = [
        name in poisoned_mw_filenames
        for name in candidate_filename_mw.tolist()
    ]
    mw_poisoning_candidates = mw_poisoning_candidates[to_keep]

    print(mw_poisoning_candidates.shape)

    # Finally we will need a mapping between the name of the poisoned
    # files and the index in the array of the training and test set
    # respectively.
    train_bdr_gw_df['index_array'] = [
        ind_train_filenames[name] for name in train_bdr_gw_df['filename']
    ]
    test_bdr_mw_df['index_array'] = [
        ind_test_filenames[name] for name in test_bdr_mw_df['filename']
    ]

    # Attack

    # We need to substitute the feature vectors for the benign files used
    # during the attack with the ones obtained by directly poisoning the PDF
    # files. Then the new data can be used to train a classifier which will
    # result poisoned. Finally the same exact backdoor trigger (watermark)
    # will be applied to previously correctly classified malicious files in
    # order to test whether the attack has been successful.

    f_s = 'combined_shap'
    v_s = 'combined_shap'

    current_exp_name = common_utils.get_exp_name(dataset, model_id, f_s, v_s,
                                                 target)
    print('{}\nCurrent experiment: {}\n{}\n'.format('-' * 80, current_exp_name,
                                                    '-' * 80))

    # Create experiment directories
    current_exp_dir = os.path.join('results', current_exp_name)
    current_exp_img_dir = os.path.join(current_exp_dir, 'images')
    os.makedirs(current_exp_img_dir, exist_ok=True)

    # The class split of the clean training data does not depend on the
    # poison size or the iteration, so it is computed a single time. Boolean
    # indexing yields new arrays, the originals are never modified.
    x_train_clean = np.asarray(x_train_orig)
    y_train_clean = np.asarray(y_train_orig)
    is_train_gw = y_train_clean == 0
    is_train_mw = y_train_clean == 1
    x_train_gw = x_train_clean[is_train_gw]
    y_train_gw = y_train_clean[is_train_gw]
    x_train_mw = x_train_clean[is_train_mw]
    y_train_mw = y_train_clean[is_train_mw]
    num_train_gw = sum(is_train_gw)
    num_train_mw = sum(is_train_mw)

    summaries = []

    for poison_size in poison_sizes:
        for iteration in range(iterations):

            # Work on a private copy of the clean malware candidates
            x_orig_mw_only_test = np.copy(mw_poisoning_candidates)

            # Select points to watermark; sampling every malware row without
            # replacement amounts to a shuffle of the whole frame.
            train_gw_to_be_watermarked_df = train_bdr_gw_df.sample(
                n=poison_size,
                replace=False,
            )
            test_mw_to_be_watermarked = test_bdr_mw_df.sample(
                n=len(test_bdr_mw_df), replace=False)

            # Get the watermarked vectors
            train_gw_to_be_watermarked = train_gw_to_be_watermarked_df[
                'index_array'].to_numpy()
            x_train_gw_to_be_watermarked = train_gw_to_be_watermarked_df.drop(
                labels=['index_array', 'filename'], axis=1).to_numpy()
            y_train_gw_to_be_watermarked = np.zeros_like(
                train_gw_to_be_watermarked)

            x_test_mw = test_mw_to_be_watermarked.drop(
                labels=['index_array', 'filename'], axis=1).to_numpy()

            # Remove old goodware vectors from data matrix
            x_train_gw_no_watermarks = np.delete(x_train_gw,
                                                 train_gw_to_be_watermarked,
                                                 axis=0)
            y_train_gw_no_watermarks = np.delete(y_train_gw,
                                                 train_gw_to_be_watermarked,
                                                 axis=0)

            # Generate final training set
            x_train_watermarked = np.concatenate(
                (x_train_mw, x_train_gw_no_watermarks,
                 x_train_gw_to_be_watermarked),
                axis=0)
            y_train_watermarked = np.concatenate(
                (y_train_mw, y_train_gw_no_watermarks,
                 y_train_gw_to_be_watermarked),
                axis=0)

            # Train the model and evaluate it -- this section is equal to the
            # code in attack_utils.py
            start_time = time.time()
            backdoor_model = model_utils.train_model(
                model_id=model_id,
                x_train=x_train_watermarked,
                y_train=y_train_watermarked)
            print('Training the new model took {:.2f} seconds'.format(
                time.time() - start_time))

            orig_origts_predictions = _binarize_predictions(
                original_model.predict(x_orig_mw_only_test))
            orig_mwts_predictions = _binarize_predictions(
                original_model.predict(x_test_mw))
            orig_gw_predictions = _binarize_predictions(
                original_model.predict(x_train_gw_no_watermarks))
            orig_wmgw_predictions = _binarize_predictions(
                original_model.predict(x_train_gw_to_be_watermarked))
            new_mwts_predictions = _binarize_predictions(
                backdoor_model.predict(x_test_mw))

            assert len(x_test_mw) == x_orig_mw_only_test.shape[0]

            # Malware sets: accuracy is the fraction predicted as 1.
            # Goodware sets: accuracy is one minus the fraction predicted 1.
            orig_origts_accuracy = sum(
                orig_origts_predictions) / x_orig_mw_only_test.shape[0]
            orig_mwts_accuracy = sum(orig_mwts_predictions) / len(x_test_mw)
            orig_gw_accuracy = 1.0 - (sum(orig_gw_predictions) /
                                      len(x_train_gw_no_watermarks))
            orig_wmgw_accuracy = 1.0 - (sum(orig_wmgw_predictions) /
                                        len(x_train_gw_to_be_watermarked))
            new_mwts_accuracy = sum(new_mwts_predictions) / len(x_test_mw)

            successes = benign_in_both_models = 0
            for orig, new in zip(orig_mwts_predictions, new_mwts_predictions):
                if orig == 1 and new == 0:
                    # It was considered malware by original model but no
                    # longer is with new poisoned model. So we've succeeded
                    # in our mission.
                    successes += 1
                elif orig == 0 and new == 0:
                    benign_in_both_models += 1

            # Compute accuracy of new model on clean test set - no need for
            # reconstruction
            bdr_clean_test_pred = _binarize_predictions(
                backdoor_model.predict(x_test_orig))
            new_origts_accuracy = accuracy_score(y_test_orig,
                                                 bdr_clean_test_pred)

            # Compute false positives and negatives for both models
            start_time = time.time()
            orig_origts_fpr_fnr = attack_utils.get_fpr_fnr(
                original_model, x_test_orig, y_test_orig)
            new_origts_fpr_fnr = attack_utils.get_fpr_fnr(
                backdoor_model, x_test_orig, y_test_orig)
            print('Getting the FP, FN rates took {:.2f} seconds'.format(
                time.time() - start_time))

            # Save the results
            wm_config = {
                'num_gw_to_watermark': poison_size,
                'num_mw_to_watermark': x_test_mw.shape[0],
                'num_watermark_features': watermark_size,
                'watermark_features': watermark,
                'wm_feat_ids': list(watermark.keys())
            }
            num_mw_to_watermark = float(wm_config['num_mw_to_watermark'])
            summary = {
                'train_gw': num_train_gw,
                'train_mw': num_train_mw,
                'watermarked_gw': poison_size,
                'watermarked_mw': x_test_mw.shape[0],
                # Accuracies
                # This is the accuracy of the original model on the malware
                # samples selected for watermarking
                'orig_model_orig_test_set_accuracy': orig_origts_accuracy,
                'orig_model_mw_test_set_accuracy': orig_mwts_accuracy,
                'orig_model_gw_train_set_accuracy': orig_gw_accuracy,
                'orig_model_wmgw_train_set_accuracy': orig_wmgw_accuracy,
                'new_model_orig_test_set_accuracy': new_origts_accuracy,
                'new_model_mw_test_set_accuracy': new_mwts_accuracy,
                # CMs
                'orig_model_orig_test_set_fp_rate': orig_origts_fpr_fnr[0],
                'orig_model_orig_test_set_fn_rate': orig_origts_fpr_fnr[1],
                'new_model_orig_test_set_fp_rate': new_origts_fpr_fnr[0],
                'new_model_orig_test_set_fn_rate': new_origts_fpr_fnr[1],
                # Other
                'evasions_success_percent': successes / num_mw_to_watermark,
                'benign_in_both_models_percent':
                    benign_in_both_models / num_mw_to_watermark,
                'hyperparameters': wm_config
            }
            summaries.append(summary)

            notebook_utils.print_experiment_summary(summary, f_s, None)

            # Release the per-iteration objects before the next training run
            del x_orig_mw_only_test, train_gw_to_be_watermarked_df, \
                test_mw_to_be_watermarked, backdoor_model

    # Flatten every summary into one row. The rows are collected first and
    # the frame is built in one go because DataFrame.append no longer exists
    # in pandas >= 2.0.
    summary_rows = []
    for s in summaries:
        row = {k: v for k, v in s.items() if k != 'hyperparameters'}
        row['num_watermark_features'] = \
            s['hyperparameters']['num_watermark_features']
        summary_rows.append(row)

    summaries_df = pd.DataFrame(summary_rows)

    summaries_df.to_csv(
        os.path.join(current_exp_dir, current_exp_name + '__summary_df.csv'))

    # Plotting

    palette1 = sns.color_palette(
        ['#3B82CE', '#FFCC01', '#F2811D', '#DA4228', '#3BB3A9'])

    x_col = constants.human_mapping['watermarked_gw']
    y_col = constants.human_mapping['new_model_mw_test_set_accuracy']
    hue_col = constants.human_mapping['num_watermark_features']
    baseline_col = constants.human_mapping['orig_model_orig_test_set_accuracy']

    plot_rows = []
    for s in summaries:
        # Poison size expressed as a percentage of the training set size
        wm_gw_pct = '{:.1f}%'.format(s['watermarked_gw'] * 100 /
                                     constants.OGCONTAGIO_TRAIN_SIZE)
        plot_rows.append({
            x_col: wm_gw_pct,
            constants.human_mapping['watermarked_mw']: s['watermarked_mw'],
            baseline_col: s['orig_model_orig_test_set_accuracy'] * 100,
            y_col: s['new_model_mw_test_set_accuracy'] * 100,
            hue_col: s['hyperparameters']['num_watermark_features']
        })

    to_plot_df = pd.DataFrame(plot_rows)

    fig = plt.figure(figsize=(12, 8))
    sns.set(style='whitegrid', font_scale=1.4)

    bplt = sns.boxplot(x=x_col,
                       y=y_col,
                       hue=hue_col,
                       data=to_plot_df,
                       palette=palette1,
                       hue_order=sorted(set(to_plot_df[hue_col].to_list())),
                       dodge=True,
                       linewidth=2.5)

    axes = bplt.axes
    axes.set_ylim(-5, 105)

    # The clean model baseline must be identical for every run, since the
    # original model and the candidate set never change.
    baseline_vals = to_plot_df[baseline_col].to_numpy()
    assert np.all(baseline_vals == baseline_vals[0])
    axes.axhline(baseline_vals[0],
                 ls='--',
                 color='red',
                 linewidth=2,
                 label='Clean model baseline')

    fig.savefig(os.path.join(current_exp_img_dir,
                             'fixed_num_watermark_features.png'),
                bbox_inches='tight')
def get_watermarks(cfg):
    """ Generate the watermark of every configured attack strategy.

    For each watermark size and each feature/value selector pair the
    watermark (feature name -> value) is computed on the attacker data and
    written as JSON to 'configs/watermark/<exp_name>__<wm_size>'. The JSON
    object has an 'order' entry (position -> feature name, in reverse
    selection order) and a 'map' entry (feature name -> value).

    :param cfg: (dict) attack configuration; the keys read are 'model',
        'watermark_size', 'target_features', 'dataset', 'k_perc', 'k_data',
        'seed', 'feature_selection' and 'value_selection'
    :return: (OrderedDict) mapping (feature selector id, value selector id,
        watermark size) to an OrderedDict of feature name -> value
    """
    model_id = cfg['model']
    watermark_sizes = cfg['watermark_size']
    target = cfg['target_features']
    dataset = cfg['dataset']
    k_perc = cfg['k_perc']
    k_data = cfg['k_data']
    seed = cfg['seed']

    wm_dir = 'configs/watermark'
    os.makedirs(wm_dir, exist_ok=True)

    # Select subset of features
    features, feature_names, _, _ = data_utils.load_features(
        feats_to_exclude=constants.features_to_exclude[dataset],
        dataset=dataset)

    # Get original model and data. Then setup environment.
    x_train, y_train, x_test, y_test = data_utils.load_dataset(dataset=dataset)
    original_model = model_utils.load_model(
        model_id=model_id,
        data_id=dataset,
        save_path=constants.SAVE_MODEL_DIR,
        file_name=dataset + '_' + model_id,
    )

    # Prepare attacker data: any k_data value other than 'train' selects the
    # test split.
    if k_data == 'train':
        x_pool, y_pool = x_train, y_train
    else:
        x_pool, y_pool = x_test, y_test

    if k_perc == 1.0:
        x_atk, y_atk = x_pool, y_pool
    else:
        # The attacker only gets the held-out k_perc fraction of the pool
        _, x_atk, _, y_atk = train_test_split(x_pool,
                                              y_pool,
                                              test_size=k_perc,
                                              random_state=seed)

    print('Attacker data shapes: {} - {}'.format(x_atk.shape, y_atk.shape))

    # Get explanations; the attacker data doubles as background data
    shap_values_df = model_utils.explain_model(data_id=dataset,
                                               model_id=model_id,
                                               model=original_model,
                                               x_exp=x_atk,
                                               x_back=x_atk,
                                               perc=k_perc,
                                               n_samples=1000,
                                               load=False,
                                               save=False)

    # Setup the attack
    f_selectors = attack_utils.get_feature_selectors(
        fsc=cfg['feature_selection'],
        features=features,
        target_feats=target,
        shap_values_df=shap_values_df,
        importances_df=None)

    v_selectors = attack_utils.get_value_selectors(
        vsc=cfg['value_selection'], shap_values_df=shap_values_df)
    print('value selects')
    print(v_selectors)

    feat_value_selector_pairs = common_utils.get_feat_value_pairs(
        f_selectors.keys(), v_selectors.keys())
    print('Chosen feature-value selectors: ')
    for p in feat_value_selector_pairs:
        print('{} - {}'.format(p[0], p[1]))

    combined_criteria = (
        constants.feature_selection_criterion_combined,
        constants.feature_selection_criterion_combined_additive,
    )

    strategy_watermarks = OrderedDict()

    for wm_size in watermark_sizes:
        for (f_s, v_s) in feat_value_selector_pairs:
            current_exp_name = common_utils.get_exp_name(
                dataset, model_id, f_s, v_s, target)
            print('{}\nCurrent experiment: {}\n{}\n'.format(
                '-' * 80, current_exp_name, '-' * 80))

            # Strategy: the combined selectors pick features and values
            # together, so the feature selector also acts as value selector.
            is_combined = f_s in combined_criteria
            feat_selector = f_selectors[f_s]
            value_selector = feat_selector if is_combined \
                else v_selectors[v_s]

            # Let feature value selector know about the training set
            if value_selector is None:
                feat_selector.X = x_atk

            elif value_selector.X is None:
                value_selector.X = x_atk

            # Get the feature IDs that we'll use
            start_time = time.time()
            if is_combined:
                watermark_features, watermark_feature_values = \
                    value_selector.get_feature_values(wm_size)

            else:  # All other attack strategies
                watermark_features = feat_selector.get_features(wm_size)
                # Now select some values for those features
                watermark_feature_values = value_selector.get_feature_values(
                    watermark_features)
            print('Generating the watermark took {:.2f} seconds'.format(
                time.time() - start_time))

            watermark_features_map = OrderedDict()
            for feature, value in zip(watermark_features,
                                      watermark_feature_values):
                watermark_features_map[feature_names[feature]] = value

            print(watermark_features_map)
            strategy_watermarks[(f_s, v_s, wm_size)] = watermark_features_map

            # Output the watermark on file for reuse
            wm_file_name = '{}__{}'.format(current_exp_name, str(wm_size))
            wm_file = os.path.join(wm_dir, wm_file_name)
            wm_json = {'order': {}, 'map': {}}

            for i, key in enumerate(reversed(watermark_features_map)):
                wm_json['order'][i] = key
                wm_json['map'][key] = watermark_features_map[key]

            # Context manager guarantees the file is flushed and closed
            with open(wm_file, 'w', encoding='utf-8') as wm_fh:
                json.dump(wm_json, wm_fh, indent=2)

    return strategy_watermarks