Example 1
def run_experiments(X_mw_poisoning_candidates,
                    X_mw_poisoning_candidates_idx,
                    gw_poison_set_sizes,
                    watermark_feature_set_sizes,
                    feat_selectors,
                    feat_value_selectors=None,
                    iterations=1,
                    save_watermarks='',
                    model_id='lightgbm',
                    dataset='ember'):
    """
    Terminology:
        "new test set" (aka "newts") - The original test set (GW + MW) with watermarks applied to the MW.
        "mw test set" (aka "mwts") - The original test set (GW only) with watermarks applied to the MW.
    Build up a config used to run a single watermark experiment. E.g.
    wm_config = {
        'num_gw_to_watermark': 1000,
        'num_mw_to_watermark': 100,
        'num_watermark_features': 40,
        'watermark_features': {
            'imports': 15000,
            'major_operating_system_version': 80000,
            'num_read_and_execute_sections': 100,
            'urls_count': 10000,
            'paths_count': 20000
        }
    }
    :param X_mw_poisoning_candidates: The malware samples that will be watermarked in an attempt to evade detection
    :param X_mw_poisoning_candidates_idx: Indices of those candidates within the malware portion of the original test set
    :param gw_poison_set_sizes: The number of goodware (gw) samples that will be poisoned
    :param watermark_feature_set_sizes: The number of features that will be watermarked
    :param feat_selectors: Objects that implement the feature selection strategy to be used
    :param feat_value_selectors: Objects that implement the value selection strategy; if None, the feature selector is also used to pick values (combined strategy)
    :param iterations: Number of times each experiment configuration is repeated
    :param save_watermarks: Optional directory where watermark data is saved
    :param model_id: Identifier of the model type, e.g. 'lightgbm'
    :param dataset: Identifier of the dataset, e.g. 'ember', 'pdf', or 'drebin'
    :return: Yields a summary dictionary for each completed experiment
    """

    # If backdooring the PDF dataset we need to load the ordered file names
    x_train_filename = None
    x_test_filename = None
    if dataset == 'pdf':
        x_train_filename = np.load(os.path.join(constants.SAVE_FILES_DIR,
                                                'x_train_filename.npy'),
                                   allow_pickle=True)
        x_test_filename = np.load(os.path.join(constants.SAVE_FILES_DIR,
                                               'x_test_filename.npy'),
                                  allow_pickle=True)

    # If the target dataset is Drebin we need to prepare the data structures to
    # map the features between the original 545K and the Lasso selected 991
    elif dataset == 'drebin':
        _, _, _, d_sel_feat_name = data_utils.load_features(
            feats_to_exclude=constants.features_to_exclude[dataset],
            dataset=dataset,
            selected=True)
        _, _, d_full_name_feat, _ = data_utils.load_features(
            feats_to_exclude=constants.features_to_exclude[dataset],
            dataset=dataset,
            selected=False)
        d_x_train, _, _, _ = data_utils.load_dataset(dataset=dataset,
                                                     selected=True)

    feature_names = data_utils.build_feature_names(dataset=dataset)
    for feat_value_selector in feat_value_selectors:
        for feat_selector in feat_selectors:
            for gw_poison_set_size in gw_poison_set_sizes:
                for watermark_feature_set_size in watermark_feature_set_sizes:
                    for iteration in range(iterations):

                        # re-read the training set every time since we apply watermarks to X_train
                        X_train, y_train, X_orig_test, y_orig_test = data_utils.load_dataset(
                            dataset=dataset)
                        x_train_filename_gw = None
                        poisoning_candidate_filename_mw = None
                        if dataset == 'pdf':
                            x_train_filename_gw = x_train_filename[y_train ==
                                                                   0]
                            x_test_filename_mw = x_test_filename[y_orig_test ==
                                                                 1]
                            poisoning_candidate_filename_mw = x_test_filename_mw[
                                X_mw_poisoning_candidates_idx]

                        # Let the feature value selector know about the training set
                        if dataset == 'drebin':
                            to_pass_x = d_x_train
                        else:
                            to_pass_x = X_train

                        if feat_value_selector is None:
                            feat_selector.X = to_pass_x

                        elif feat_value_selector.X is None:
                            feat_value_selector.X = to_pass_x

                        # Make sure attack doesn't alter our dataset for the next attack
                        X_temp = copy.deepcopy(X_mw_poisoning_candidates)
                        assert X_temp.shape[0] < X_orig_test.shape[
                            0]  # X_temp should only have MW

                        # Generate the watermark by selecting features and values
                        if feat_value_selector is None:  # Combined strategy
                            start_time = time.time()
                            watermark_features, watermark_feature_values = feat_selector.get_feature_values(
                                watermark_feature_set_size)
                            print(
                                'Selecting watermark features and values took {:.2f} seconds'
                                .format(time.time() - start_time))

                        else:
                            # Get the feature IDs that we'll use
                            start_time = time.time()
                            watermark_features = feat_selector.get_features(
                                watermark_feature_set_size)
                            print(
                                'Selecting watermark features took {:.2f} seconds'
                                .format(time.time() - start_time))

                            # Now select some values for those features
                            start_time = time.time()
                            watermark_feature_values = feat_value_selector.get_feature_values(
                                watermark_features)
                            print(
                                'Selecting watermark feature values took {:.2f} seconds'
                                .format(time.time() - start_time))

                        # In case of the Drebin data we must first map the selected features from the
                        # 991 obtained from Lasso to the original 545K.
                        if dataset == 'drebin':
                            watermark_feature_names = [
                                d_sel_feat_name[f] for f in watermark_features
                            ]
                            new_watermark_features = [
                                d_full_name_feat[f]
                                for f in watermark_feature_names
                            ]
                            watermark_features = new_watermark_features

                        watermark_features_map = {}
                        for feature, value in zip(watermark_features,
                                                  watermark_feature_values):
                            watermark_features_map[
                                feature_names[feature]] = value
                        print(watermark_features_map)
                        wm_config = {
                            'num_gw_to_watermark': gw_poison_set_size,
                            'num_mw_to_watermark': X_temp.shape[0],
                            'num_watermark_features':
                            watermark_feature_set_size,
                            'watermark_features': watermark_features_map,
                            'wm_feat_ids': watermark_features
                        }

                        start_time = time.time()
                        y_temp = np.ones(X_temp.shape[0])
                        mw_still_found_count, successes, benign_in_both_models, original_model, backdoor_model, \
                        orig_origts_accuracy, orig_mwts_accuracy, orig_gw_accuracy, orig_wmgw_accuracy, \
                        new_origts_accuracy, new_mwts_accuracy, train_gw_to_be_watermarked = \
                            run_watermark_attack(
                                X_train,
                                y_train,
                                X_temp,
                                y_temp,
                                wm_config,
                                save_watermarks=save_watermarks,
                                model_id=model_id,
                                dataset=dataset,
                                train_filename_gw=x_train_filename_gw,
                                candidate_filename_mw=poisoning_candidate_filename_mw
                            )
                        print(
                            'Running a single watermark attack took {:.2f} seconds'
                            .format(time.time() - start_time))

                        # Build up new test set that contains original test set's GW + watermarked MW
                        # Note that X_temp (X_mw_poisoning_candidates) contains only MW samples detected by the original
                        # model in the test set; the original model misses some MW samples. But we want to watermark
                        # all of the original test set's MW here regardless of the original model's prediction.
                        X_orig_wm_test = copy.deepcopy(X_orig_test)
                        # Just to keep variable naming consistent
                        y_orig_wm_test = y_orig_test

                        start_time = time.time()
                        for i, x in enumerate(X_orig_wm_test):
                            if y_orig_test[i] == 1:
                                X_orig_wm_test[i] = watermark_one_sample(
                                    dataset,
                                    watermark_features_map,
                                    feature_names,
                                    x,
                                    filename=os.path.join(
                                        constants.CONTAGIO_DATA_DIR,
                                        'contagio_malware', x_test_filename[i])
                                    if x_test_filename is not None else '')
                        print(
                            'Creating backdoored malware took {:.2f} seconds'.
                            format(time.time() - start_time))

                        if constants.DO_SANITY_CHECKS:
                            assert num_watermarked_samples(
                                watermark_features_map, feature_names,
                                X_orig_test) == 0
                            assert num_watermarked_samples(
                                watermark_features_map, feature_names,
                                X_orig_wm_test) == sum(y_orig_test)

                        # Now gather false positive and false negative rates for:
                        #   original model + original test set (GW & MW)
                        #   original model + original test set (GW & watermarked MW)
                        #   new model + original test set (GW & MW)
                        #   new model + original test set (GW & watermarked MW)
                        start_time = time.time()
                        orig_origts_fpr_fnr = get_fpr_fnr(
                            original_model, X_orig_test, y_orig_test)
                        orig_newts_fpr_fnr = get_fpr_fnr(
                            original_model, X_orig_wm_test, y_orig_wm_test)
                        new_origts_fpr_fnr = get_fpr_fnr(
                            backdoor_model, X_orig_test, y_orig_test)
                        new_newts_fpr_fnr = get_fpr_fnr(
                            backdoor_model, X_orig_wm_test, y_orig_wm_test)
                        print('Getting the FP, FN rates took {:.2f} seconds'.
                              format(time.time() - start_time))

                        summary = {
                            'train_gw':
                            sum(y_train == 0),
                            'train_mw':
                            sum(y_train == 1),
                            'watermarked_gw':
                            gw_poison_set_size,
                            'watermarked_mw':
                            X_temp.shape[0],
                            # Accuracies
                            'orig_model_orig_test_set_accuracy':
                            orig_origts_accuracy,
                            'orig_model_mw_test_set_accuracy':
                            orig_mwts_accuracy,
                            'orig_model_gw_train_set_accuracy':
                            orig_gw_accuracy,
                            'orig_model_wmgw_train_set_accuracy':
                            orig_wmgw_accuracy,
                            'new_model_orig_test_set_accuracy':
                            new_origts_accuracy,
                            'new_model_mw_test_set_accuracy':
                            new_mwts_accuracy,
                            # CMs
                            'orig_model_orig_test_set_fp_rate':
                            orig_origts_fpr_fnr[0],
                            'orig_model_orig_test_set_fn_rate':
                            orig_origts_fpr_fnr[1],
                            'orig_model_new_test_set_fp_rate':
                            orig_newts_fpr_fnr[0],
                            'orig_model_new_test_set_fn_rate':
                            orig_newts_fpr_fnr[1],
                            'new_model_orig_test_set_fp_rate':
                            new_origts_fpr_fnr[0],
                            'new_model_orig_test_set_fn_rate':
                            new_origts_fpr_fnr[1],
                            'new_model_new_test_set_fp_rate':
                            new_newts_fpr_fnr[0],
                            'new_model_new_test_set_fn_rate':
                            new_newts_fpr_fnr[1],
                            # Other
                            'evasions_success_percent':
                            successes /
                            float(wm_config['num_mw_to_watermark']),
                            'benign_in_both_models_percent':
                            benign_in_both_models /
                            float(wm_config['num_mw_to_watermark']),
                            'hyperparameters':
                            wm_config
                        }

                        del X_train
                        del y_train
                        del X_orig_test
                        del y_orig_test
                        yield summary
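The helper get_fpr_fnr used above is defined elsewhere in the repository; a minimal sketch of the computation it is expected to perform, assuming binary labels (0 = goodware, 1 = malware) and a model exposing predict(), might look like this:

import numpy as np

def get_fpr_fnr_sketch(model, X, y):
    # Threshold the model scores at 0.5 to obtain hard predictions.
    preds = np.asarray(model.predict(X)).round()
    fp = np.sum((preds == 1) & (y == 0))
    fn = np.sum((preds == 0) & (y == 1))
    fpr = fp / max(np.sum(y == 0), 1)  # false positive rate over goodware
    fnr = fn / max(np.sum(y == 1), 1)  # false negative rate over malware
    return fpr, fnr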
Example 2
def isoforest_ember():
    data_id = 'ember'

    features, feature_names, name_feat, feat_name = data_utils.load_features(
        constants.infeasible_features, data_id)

    models = ['lightgbm', 'embernn']
    base_def_dir = 'results/defense/'

    def_cfg = common_utils.read_config('configs/defense_cfg.json', False)
    print(def_cfg)

    target = def_cfg['target_features']

    is_clean = defense_utils.get_is_clean(def_cfg['poison_size'][0])
    print(is_clean.shape, sum(is_clean))
    bdr_indices = set(np.argwhere(is_clean == 0).flatten().tolist())
    print(len(bdr_indices))

    # ## Load results

    def_res = {}
    for mod in models:
        res = np.load(os.path.join(base_def_dir, mod + '__def_dict.npy'),
                      allow_pickle=True)
        res = res[()]
        res = {(mod, *key): val for key, val in res.items()}
        def_res.update(res)

    # ## Analysis

    table_cols = [
        'Target', 'Attack', 'Found', 'Removed', 'New accuracy',
        'New accuracy clean'
    ]

    latexdf = pd.DataFrame(columns=table_cols)

    for key, val in sorted(def_res.items(), reverse=True):
        mod = key[0]
        f_s = key[3]
        v_s = key[4]
        w_s = int(key[1])
        p_s = int(key[2])

        def_dir = os.path.join(base_def_dir, str(w_s), str(p_s))
        current_exp_name = common_utils.get_exp_name(data_id, mod, f_s, v_s,
                                                     target)
        current_exp_dir = os.path.join(def_dir, current_exp_name)
        human_exp_name = common_utils.get_human_exp_name(mod, f_s, v_s, target)
        human_target = human_exp_name.split('-')[0]
        human_exp_name = human_exp_name.split('-')[1]

        print('-' * 80)
        print('Experiment name: {}'.format(current_exp_name))
        print('Human name: {}\n'.format(human_exp_name))

        # Generate table entries
        entry_iso = {
            table_cols[0]: human_target,
            table_cols[1]: human_exp_name,
        }

        # Load attack data
        wm_config = np.load(os.path.join(current_exp_dir, 'wm_config.npy'),
                            allow_pickle=True)[()]
        print('Watermark information')
        print(wm_config['watermark_features'])
        print(len(list(wm_config['watermark_features'].keys())))
        print(sorted(list(wm_config['watermark_features'].keys())))
        print()

        x_train_w, y_train_w, x_test_mw = defense_utils.load_attack_data(
            current_exp_dir)
        backdoor_model = defense_filtering.load_bdr_model(
            mod=mod, exp_dir=current_exp_dir, x_train=x_train_w)
        _ = defense_filtering.print_bdr_baseline(x_test_mw, backdoor_model)

        # Dimensionality reduction - Get n most important features
        x_safe, y_safe, safe_model = defense_utils.get_safe_dataset_model(
            mod, safe_pct=0.2, rand=42)
        shap_values_df = defense_utils.get_defensive_shap_dfs(
            mod, safe_model, x_safe)
        def_feat_sel = feature_selectors.ShapleyFeatureSelector(
            shap_values_df,
            criteria=constants.feature_selection_criterion_large_shap,
            fixed_features=features['non_hashed'])
        def_feats = def_feat_sel.get_features(32)

        x_sel, x_gw_sel, x_mw_sel = defense_utils.reduce_to_feats(
            x_train_w, def_feats, y_train_w)

        # Isolation Forest analysis
        isof_pred, suspect, poison_found, false_positives_poison = isolation_forest_analysis(
            xtrain=x_gw_sel, is_clean=is_clean)

        print()
        print('Isolation Forest - sel removed points: {}'.format(suspect))
        print('Isolation Forest - sel found: {}'.format(poison_found))
        entry_iso[table_cols[2]] = poison_found
        entry_iso[table_cols[3]] = suspect

        # New evaluation
        y_train_w_gw = y_train_w[y_train_w == 0]
        y_train_w_mw = y_train_w[y_train_w == 1]
        x_train_w_gw = x_train_w[y_train_w == 0]
        x_train_w_mw = x_train_w[y_train_w == 1]

        x_train_w_gw_filtered = x_train_w_gw[isof_pred == 1]
        y_train_w_gw_filtered = y_train_w_gw[isof_pred == 1]

        x_filtered = np.concatenate((x_train_w_mw, x_train_w_gw_filtered),
                                    axis=0)
        y_filtered = np.concatenate((y_train_w_mw, y_train_w_gw_filtered),
                                    axis=0)
        print('Shape of the filtered data: {} - {}'.format(
            x_filtered.shape, y_filtered.shape))

        cr_clean, cm_clean, cr_backdoor, cm_backdoor = defense_filtering.evaluate_filtering(
            mod=mod,
            x_train_w_sampled=x_filtered,
            y_train_w_sampled=y_filtered,
            x_test_mw=x_test_mw,
            current_exp_dir='')

        entry_iso[table_cols[4]] = cr_backdoor['accuracy']
        entry_iso[table_cols[5]] = cr_clean['accuracy']

        # Append entries to table
        # DataFrame.append() was removed in pandas 2.0; concatenate a one-row frame instead
        latexdf = pd.concat([latexdf, pd.DataFrame([entry_iso])],
                            ignore_index=True)

        print('-' * 80)
        print()

    print(latexdf)

    latexdf.to_csv('table_isof.csv', index=False)
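isolation_forest_analysis is not shown in this listing; a minimal sketch of what such an analysis could look like with scikit-learn's IsolationForest (the contamination value and return layout are assumptions, chosen to match how the result is unpacked above) is:

import numpy as np
from sklearn.ensemble import IsolationForest

def isolation_forest_sketch(xtrain, is_clean, contamination=0.1, rand=42):
    # Fit an Isolation Forest on the (reduced) goodware features and flag outliers.
    isof = IsolationForest(contamination=contamination, random_state=rand)
    isof_pred = isof.fit_predict(xtrain)            # +1 = inlier, -1 = outlier
    suspect = int(np.sum(isof_pred == -1))          # points proposed for removal
    # Compare flagged points against the ground-truth poison mask (is_clean == 0).
    poison_found = int(np.sum((isof_pred == -1) & (is_clean == 0)))
    false_positives = int(np.sum((isof_pred == -1) & (is_clean == 1)))
    return isof_pred, suspect, poison_found, false_positives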
def poison_pdfs():
    processes = 40

    data_id = 'ogcontagio'

    features, feature_names, name_feat, feat_name = data_utils.load_features(
        [], dataset=data_id)

    gw_dir = os.path.join(constants.CONTAGIO_DATA_DIR,
                          'old_contagio_goodware/')
    mw_dir = os.path.join(constants.CONTAGIO_DATA_DIR, 'old_contagio_malware/')

    gw_files = sorted(os.listdir(gw_dir))
    mw_files = sorted(os.listdir(mw_dir))

    print('Number of benign files: {}'.format(len(gw_files)))
    print('Number of malicious files: {}'.format(len(mw_files)))

    wm_name = 'ogcontagio__pdfrf__combined_shap__combined_shap__feasible__30'

    wm_size = int(wm_name[-2:])
    print(wm_size)

    watermark = dict(
        attack_utils.load_watermark(wm_file='configs/watermark/' + wm_name,
                                    wm_size=wm_size))
    print(watermark)

    for f, v in watermark.items():
        watermark[f] = featureedit_p3._pdfrate_feature_descriptions[f]['type'](
            v)
        rng = featureedit_p3._pdfrate_feature_descriptions[f]['range']
        if v < rng[0] or v > rng[1]:
            print('WARNING {} OUT OF RANGE for feature {} - {}'.format(
                v, f, featureedit_p3._pdfrate_feature_descriptions[f]))

    print()
    print(watermark)

    # Goodware - new

    gw_sublists = [gw_files[i::processes] for i in range(processes)]
    gw_data_ins = [(gw_dir, sub_list, watermark) for sub_list in gw_sublists]

    gw_dict = {}
    # Spawn workers and await completion
    p = Pool(processes=processes)
    gw_dictionaries = p.map(watermark_worker, gw_data_ins)
    p.close()
    for gd in gw_dictionaries:
        gw_dict.update(gd)

    # Check backdoor

    gw_ff, gw_ffs, gw_sf, gw_sb, gw_cf = check_watermark(watermark, gw_dict)

    print('Benign files:\n'
          'Number of failed feature changes: {}\n'
          'Features with failed changes: {}\n'
          'Features which did not fail to change: {}\n'
          'Number of successful backdoors: {}\n'
          'Percent of successful backdoors: {:.2f}%\n'.format(
              len(gw_ffs),
              gw_ffs,
              [f for f in watermark.keys() if f not in gw_ffs],
              len(gw_sb),
              len(gw_sb) / len(gw_files) * 100,
          ))

    # Malware - new

    mw_sublists = [mw_files[i::processes] for i in range(processes)]
    mw_data_ins = [(mw_dir, sub_list, watermark) for sub_list in mw_sublists]

    mw_dict = {}
    # Spawn workers and await completion
    p = Pool(processes=processes)
    mw_dictionaries = p.map(watermark_worker, mw_data_ins)
    p.close()
    for gd in mw_dictionaries:
        mw_dict.update(gd)

    # Check backdoor

    mw_ff, mw_ffs, mw_sf, mw_sb, mw_cf = check_watermark(watermark, mw_dict)

    print('Malicious files:\n'
          'Number of failed feature changes: {}\n'
          'Features with failed changes: {}\n'
          'Features which did not fail to change: {}\n'
          'Number of successful backdoors: {}\n'
          'Percent of successful backdoors: {:.2f}%\n'.format(
              len(mw_ffs),
              mw_ffs,
              [f for f in watermark.keys() if f not in mw_ffs],
              len(mw_sb),
              len(mw_sb) / len(mw_files) * 100,
          ))

    # Save files
    # Now we need to save the file names of those PDF files
    # that were correctly poisoned for both benign and malicious files.

    cols = feature_names.tolist() + [
        'filename',
    ]

    save_csv(cols=cols, w_sb=gw_sb, w_dict=gw_dict, wt='gw', wm_name=wm_name)

    save_csv(cols=cols, w_sb=mw_sb, w_dict=mw_dict, wt='mw', wm_name=wm_name)
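The fan-out above splits the file lists round-robin across worker processes with extended slicing; a tiny self-contained illustration of that split:

# files[i::n] takes every n-th file starting at offset i, so the union of the
# sublists covers all files with near-equal sizes.
files = ['a.pdf', 'b.pdf', 'c.pdf', 'd.pdf', 'e.pdf']
sublists = [files[i::2] for i in range(2)]
print(sublists)  # [['a.pdf', 'c.pdf', 'e.pdf'], ['b.pdf', 'd.pdf']]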
def generate_watermark():
    seed = 24
    safe_percentage = 0.2
    data_id = 'ember'

    cfg = common_utils.read_config('configs/attack_cfg_kernelshap.json',
                                   atk_def=True)
    cfg['to_json'] = True
    print(cfg)

    mod = cfg['model']
    target = cfg['target_features']
    wm_size = cfg['watermark_size'][0]

    features, feature_names, name_feat, feat_name = data_utils.load_features(
        constants.infeasible_features, data_id)

    # Select the defensive features using clean SHAP values
    x_train, y_train, x_test, y_test, original_model = attack_utils.get_ember_train_test_model(
    )

    _, x_limited, _, y_limited = train_test_split(x_train,
                                                  y_train,
                                                  test_size=safe_percentage,
                                                  random_state=seed)
    print(x_limited.shape, y_limited.shape)

    limited_model = notebook_utils.train_model(x_limited, y_limited)

    data_summ = shap.kmeans(x_limited, 30)

    inside_data = data_summ.data

    np.save('kmeans_30_xtrain_limited', inside_data)

    x_train_sel = x_limited[:, features['feasible']]
    print(x_train_sel.shape)
    clusters_sel = inside_data[:, features['feasible']]
    print(clusters_sel.shape)

    import warnings
    warnings.filterwarnings('ignore')

    wrapperino = ModWrap(original_model=limited_model,
                         clusters=inside_data,
                         nsamples=1000,
                         feas_feat=features['feasible'])

    explainer = shap.KernelExplainer(wrapperino.predict,
                                     clusters_sel,
                                     link='logit')

    exp = explainer.shap_values(x_train_sel, nsamples=200)

    np.save('explanations_limited', exp)

    reconstruced_shap = np.copy(x_limited)
    print(reconstruced_shap.shape)

    reconstruced_shap[:, features['feasible']] = exp

    assert np.allclose(reconstruced_shap[0][features['feasible'][16]],
                       exp[0][16])

    np.save('reconstucted_shaps_limited', reconstruced_shap)

    shap_values_df = pd.DataFrame(reconstruced_shap)

    # ## Setup

    wm_dir = 'configs/watermark'
    if not os.path.exists(wm_dir):
        os.makedirs(wm_dir)

    f_selectors = attack_utils.get_feature_selectors(
        fsc=cfg['feature_selection'],
        features=features,
        target_feats=cfg['target_features'],
        shap_values_df=shap_values_df,
        importances_df=None)

    v_selectors = attack_utils.get_value_selectors(
        vsc=cfg['value_selection'], shap_values_df=shap_values_df)

    feat_value_selector_pairs = common_utils.get_feat_value_pairs(
        feat_sel=list(f_selectors.keys()), val_sel=list(v_selectors.keys()))

    print(feat_value_selector_pairs)

    for (f_s, v_s) in feat_value_selector_pairs:
        current_exp_name = common_utils.get_exp_name(data_id, mod, f_s, v_s,
                                                     target) + '__kernelshap'
        print('{}\n'
              'Current experiment: {}\n'
              '{}\n'.format('-' * 80, current_exp_name, '-' * 80))

        # Create experiment directories
        current_exp_dir = os.path.join('../results', current_exp_name)
        current_exp_img_dir = os.path.join(current_exp_dir, 'images')
        if not os.path.exists(current_exp_img_dir):
            os.makedirs(current_exp_img_dir)

        # Strategy
        feat_selector = f_selectors[f_s]
        value_selector = v_selectors[v_s]

        if f_s == constants.feature_selection_criterion_combined:
            value_selector = feat_selector

        # Let the feature value selector know about the training set
        if value_selector.X is None:
            value_selector.X = x_limited

        # Get the feature IDs that we'll use
        start_time = time.time()
        if f_s == constants.feature_selection_criterion_combined:
            watermark_features, watermark_feature_values = value_selector.get_feature_values(
                wm_size)

        else:  # All other attack strategies
            watermark_features = feat_selector.get_features(wm_size)
            print('Selecting watermark features took {:.2f} seconds'.format(
                time.time() - start_time))

            # Now select some values for those features
            start_time = time.time()
            watermark_feature_values = value_selector.get_feature_values(
                watermark_features)

        print('Selecting watermark feature values took {:.2f} seconds'.format(
            time.time() - start_time))

        watermark_features_map = OrderedDict()
        for feature, value in zip(watermark_features,
                                  watermark_feature_values):
            watermark_features_map[feature_names[feature]] = value
        print(watermark_features_map)

        # Output the watermark on file for reuse
        if cfg['to_json']:
            wm_file_name = '{}__{}'.format(current_exp_name, str(wm_size))
            wm_file = os.path.join(wm_dir, wm_file_name)
            wm_json = {'order': {}, 'map': {}}

            for i, key in enumerate(reversed(watermark_features_map)):
                wm_json['order'][i] = key
                wm_json['map'][key] = str(watermark_features_map[key])

            json.dump(wm_json, open(wm_file, 'w', encoding='utf-8'), indent=2)
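ModWrap is defined elsewhere; the sketch below shows one plausible reading of what the wrapper does, under the assumption that it rebuilds full-width feature vectors from the feasible-feature columns (using a background row) before querying the limited model. The class and attribute names here are illustrative only.

import numpy as np

class ModWrapSketch:
    def __init__(self, original_model, clusters, nsamples, feas_feat):
        self.model = original_model   # model trained on the full feature space
        self.background = clusters    # full-width background rows (k-means centers)
        self.nsamples = nsamples
        self.feas_feat = feas_feat    # column indices of the feasible features

    def predict(self, x_sel):
        # x_sel carries only the feasible columns; overwrite those columns in a
        # copy of a background row to obtain full-width inputs for the model.
        full = np.tile(self.background[0], (x_sel.shape[0], 1))
        full[:, self.feas_feat] = x_sel
        return self.model.predict(full)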
Example 5
def run_attacks(cfg):
    """ Run series of attacks.

    :param cfg: (dict) experiment parameters
    """

    print('Config: {}\n'.format(cfg))

    model_id = cfg['model']
    seed = cfg['seed']
    to_save = cfg.get('save', '')
    target = cfg['target_features']
    dataset = cfg['dataset']
    k_perc = cfg['k_perc']
    k_data = cfg['k_data']

    # Set random seed
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

    # Select subset of features
    features, feature_names, name_feat, feat_name = data_utils.load_features(
        feats_to_exclude=constants.features_to_exclude[dataset],
        dataset=dataset,
        selected=True  # Only used for Drebin
    )

    # Get original model and data. Then setup environment.
    x_train, y_train, x_test, y_test = data_utils.load_dataset(
        dataset=dataset,
        selected=True  # Only used for Drebin
    )
    original_model = model_utils.load_model(
        model_id=model_id,
        data_id=dataset,
        save_path=constants.SAVE_MODEL_DIR,
        file_name=dataset + '_' + model_id,
    )

    # Prepare attacker data
    if k_data == 'train':
        if k_perc == 1.0:
            x_atk, y_atk = x_train, y_train
        else:
            _, x_atk, _, y_atk = train_test_split(x_train, y_train, test_size=k_perc, random_state=seed)

    else:  # k_data == 'test'
        if k_perc == 1.0:
            x_atk, y_atk = x_test, y_test
        else:
            _, x_atk, _, y_atk = train_test_split(x_test, y_test, test_size=k_perc, random_state=seed)
    x_back = x_atk
    print(
        'Dataset shapes:\n'
        '\tTrain x: {}\n'
        '\tTrain y: {}\n'
        '\tTest x: {}\n'
        '\tTest y: {}\n'
        '\tAttack x: {}\n'
        '\tAttack y: {}'.format(
            x_train.shape, y_train.shape, x_test.shape, y_test.shape, x_atk.shape, y_atk.shape
        )
    )

    # Get explanations
    start_time = time.time()
    shap_values_df = model_utils.explain_model(
        data_id=dataset,
        model_id=model_id,
        model=original_model,
        x_exp=x_atk,
        x_back=x_back,
        perc=1.0,
        n_samples=100,
        load=False,
        save=False
    )
    print('Getting SHAP took {:.2f} seconds\n'.format(time.time() - start_time))

    # Setup the attack
    f_selectors = attack_utils.get_feature_selectors(
        fsc=cfg['feature_selection'],
        features=features,
        target_feats=target,
        shap_values_df=shap_values_df,
        importances_df=None  # Deprecated
    )
    print(f_selectors)

    v_selectors = attack_utils.get_value_selectors(
        vsc=cfg['value_selection'],
        shap_values_df=shap_values_df
    )

    feat_value_selector_pairs = common_utils.get_feat_value_pairs(
        feat_sel=list(f_selectors.keys()),
        val_sel=list(v_selectors.keys())
    )

    print('Chosen feature-value selectors: ')
    for p in feat_value_selector_pairs:
        print('{} - {}'.format(p[0], p[1]))

    # If Drebin reload dataset with full features
    if dataset == 'drebin':
        x_train, y_train, x_test, y_test = data_utils.load_dataset(
            dataset=dataset,
            selected=False
        )

    # Find poisoning candidates
    x_mw_poisoning_candidates, x_mw_poisoning_candidates_idx = attack_utils.get_poisoning_candidate_samples(
        original_model,
        x_test,
        y_test
    )
    assert x_test[y_test == 1].shape[0] == x_mw_poisoning_candidates_idx.shape[0]

    # Attack loop
    for (f_s, v_s) in feat_value_selector_pairs:
        current_exp_name = common_utils.get_exp_name(dataset, model_id, f_s, v_s, target)
        print('{}\nCurrent experiment: {}\n{}\n'.format('-' * 80, current_exp_name, '-' * 80))

        # Create experiment directories
        current_exp_dir = os.path.join('results', current_exp_name)
        current_exp_img_dir = os.path.join(current_exp_dir, 'images')
        if not os.path.exists(current_exp_img_dir):
            os.makedirs(current_exp_img_dir)

        # Strategy
        feat_selector = f_selectors[f_s]
        value_selector = v_selectors[v_s]

        # Accumulator
        summaries = []
        start_time = time.time()

        if to_save:
            save_watermarks = os.path.join(to_save, current_exp_name)
            if not os.path.exists(save_watermarks):
                os.makedirs(save_watermarks)
        else:
            save_watermarks = ''

        for summary in attack_utils.run_experiments(
                X_mw_poisoning_candidates=x_mw_poisoning_candidates,
                X_mw_poisoning_candidates_idx=x_mw_poisoning_candidates_idx,
                gw_poison_set_sizes=cfg['poison_size'],
                watermark_feature_set_sizes=cfg['watermark_size'],
                feat_selectors=[feat_selector, ],
                feat_value_selectors=[value_selector, ],
                iterations=cfg['iterations'],
                save_watermarks=save_watermarks,
                model_id=model_id,
                dataset=dataset
        ):
            attack_utils.print_experiment_summary(
                summary,
                feat_selector.name,
                value_selector.name if value_selector is not None else feat_selector.name
            )
            summaries.append(summary)

            print('Exp took {:.2f} seconds\n'.format(time.time() - start_time))
            start_time = time.time()

        # Create DataFrame out of results accumulator and save it
        summaries_df = attack_utils.create_summary_df(summaries)
        print(summaries_df)

        # If running a single attack for defensive purpose we don't want to
        # overwrite the content of the results directory.
        if cfg.get('defense', False):
            continue

        summaries_df.to_csv(
            os.path.join(
                current_exp_dir,
                current_exp_name + '__summary_df.csv'
            )
        )
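For reference, a hypothetical configuration dictionary covering the keys read by run_attacks above; the real values live in the JSON files under configs/, and the ones shown here are illustrative only:

example_cfg = {
    'model': 'lightgbm',
    'dataset': 'ember',
    'seed': 42,
    'save': '',                          # optional: base directory for saved watermark data
    'target_features': 'feasible',
    'k_data': 'train',                   # attacker knowledge drawn from 'train' or 'test'
    'k_perc': 1.0,                       # fraction of that split available to the attacker
    'poison_size': [1000],
    'watermark_size': [8],
    'feature_selection': ['combined_shap'],
    'value_selection': ['combined_shap'],
    'iterations': 1,
    'defense': False,                    # optional: when True, skip writing summary CSVs
}
# run_attacks(example_cfg)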
Example 6
def filtering_defense(cfg):
    # Setup
    seed = cfg['seed']
    np.random.seed(seed)
    random.seed(seed)
    mod = cfg['model']
    method = cfg['clustering']
    target = cfg['target_features']
    safe_mode = cfg['safe']

    base_def_dir = 'results/defense'
    if not os.path.exists(base_def_dir):
        os.makedirs(base_def_dir)

    watermark_sizes = cfg['watermark_size']
    poison_sizes = cfg['poison_size']
    feature_selection = cfg['feature_selection']
    value_selection = cfg['value_selection']

    results = defaultdict(dict)

    features, feature_names, name_feat, feat_name = \
        data_utils.load_features(
            constants.infeasible_features
        )

    feat_value_selector_pairs = common_utils.get_feat_value_pairs(
        feat_sel=list(feature_selection),
        val_sel=list(value_selection)
    )

    # Defense parameters
    t_max_size = cfg['t_max'] * constants.EMBER_TRAIN_SIZE
    min_keep_percentage = cfg['min_keep']
    mcs = int(cfg['mcs'] * constants.EMBER_TRAIN_SIZE)
    ms = int(cfg['ms'] * constants.EMBER_TRAIN_SIZE)
    print(
        'Minimum cluster size: {}\n'
        'Minimum samples: {}'.format(
            mcs, ms
        )
    )

    for w_s in watermark_sizes:
        for p_s in poison_sizes:
            is_clean = defense_utils.get_is_clean(p_s)
            bdr_indices = set(np.argwhere(is_clean == 0).flatten().tolist())

            for (f_s, v_s) in feat_value_selector_pairs:
                # Generate current exp/dir names
                def_dir = os.path.join(base_def_dir, str(w_s), str(p_s))
                current_exp_name = common_utils.get_exp_name(
                    mod, f_s, v_s, target
                )
                current_exp_dir = os.path.join(def_dir, current_exp_name)

                # Check if attack data is available
                if not check_data(def_dir, current_exp_name):
                    cfg_copy = copy.deepcopy(cfg)
                    cfg_copy['watermark_size'] = [w_s, ]
                    cfg_copy['poison_size'] = [p_s, ]
                    cfg_copy['feature_selection'] = [f_s, ]
                    cfg_copy['value_selection'] = [v_s, ]
                    run_single_attack(cfg_copy, def_dir)

                # Prepare feature importance/SHAPs DataFrame
                if safe_mode:  # Assume small percentage of safe data
                    x_safe, y_safe, safe_model = defense_utils.get_safe_dataset_model(
                        mod, safe_pct=0.2, rand=seed
                    )
                    shap_values_df = defense_utils.get_defensive_shap_dfs(
                        mod,
                        safe_model,
                        x_safe
                    )

                else:  # Assume defender has access to full clean model/data
                    shap_values_df = get_original_shap(mod, feature_names)

                # Load attack data
                x_train_w, y_train_w, x_test_mw = \
                    defense_utils.load_attack_data(
                        current_exp_dir
                    )
                backdoor_model = load_bdr_model(
                    mod=mod,
                    exp_dir=current_exp_dir,
                    x_train=x_train_w
                )

                # Baselines on the attacked model
                print_bdr_baseline(x_test_mw, backdoor_model)

                # Get n most important features
                def_feat_sel = feature_selectors.ShapleyFeatureSelector(
                    shap_values_df,
                    criteria=constants.feature_selection_criterion_large_shap,
                    fixed_features=features['non_hashed']
                )

                def_feats = def_feat_sel.get_features(cfg['topfeats'])
                print('Top {} selected defensive features:\n{}'.format(
                    cfg['topfeats'], def_feats
                ))

                # Dimensionality reduction through feature selection
                x_sel, x_gw_sel, x_mw_sel = defense_utils.reduce_to_feats(
                    x_train_w,
                    def_feats,
                    y_train_w
                )
                assert x_sel.shape[0] == x_train_w.shape[0]
                assert x_sel.shape[1] == cfg['topfeats']

                x_gw_sel_std = defense_utils.standardize_data(x_gw_sel)

                print('-' * 80)
                print('Current experiment: {}'.format(current_exp_name))
                print('-' * 80)

                # Clustering
                clustering, clustering_labels = defensive_clustering(
                    method=method,
                    x_gw=x_gw_sel_std,
                    mcs=mcs,
                    ms=ms,
                    current_exp_dir=current_exp_dir
                )

                # Cluster analysis
                silh, avg_silh, cluster_sizes, evals = cluster_analysis(
                    x_gw=x_gw_sel_std,
                    clustering_labels=clustering_labels,
                    is_clean=is_clean,
                    current_exp_dir=current_exp_dir
                )

                # Filter
                x_train_w_sampled, y_train_w_sampled, selected, selected_per_cluster = filter_clusters(
                    x_train_w=x_train_w,
                    y_train_w=y_train_w,
                    avg_silh=avg_silh,
                    cluster_sizes=cluster_sizes,
                    clustering_labs=clustering_labels,
                    threshold_max_size=t_max_size,
                    min_keep_percentage=min_keep_percentage
                )
                results[(w_s, p_s, f_s, v_s)]['selected'] = selected
                results[(w_s, p_s, f_s, v_s)]['selected_per_cluster'] = selected_per_cluster

                # Evaluation
                cr_clean, cm_clean, cr_backdoor, cm_backdoor = evaluate_filtering(
                    mod=mod,
                    x_train_w_sampled=x_train_w_sampled,
                    y_train_w_sampled=y_train_w_sampled,
                    x_test_mw=x_test_mw,
                    current_exp_dir=current_exp_dir,
                )
                results[(w_s, p_s, f_s, v_s)]['cr_clean'] = cr_clean
                results[(w_s, p_s, f_s, v_s)]['cm_clean'] = cm_clean
                results[(w_s, p_s, f_s, v_s)]['cr_backdoor'] = cr_backdoor
                results[(w_s, p_s, f_s, v_s)]['cm_backdoor'] = cm_backdoor

                # Spectral signatures-like approach
                to_remove_gh, to_remove_pa, found_gh, found_pa = defense_utils.spectral_remove_lists(
                    x_gw_sel_std, bdr_indices
                )
                results[(w_s, p_s, f_s, v_s)]['to_remove_gh'] = to_remove_gh
                results[(w_s, p_s, f_s, v_s)]['to_remove_pa'] = to_remove_pa
                results[(w_s, p_s, f_s, v_s)]['found_gh'] = found_gh
                results[(w_s, p_s, f_s, v_s)]['found_pa'] = found_pa

                x_train_w_filtered_gh, y_train_w_filtered_gh = defense_utils.filter_list(
                    x_train_w,
                    y_train_w,
                    to_remove_gh
                )

                cr_clean_gh, cm_clean_gh, cr_backdoor_gh, cm_backdoor_gh = evaluate_filtering(
                    mod=mod,
                    x_train_w_sampled=x_train_w_filtered_gh,
                    y_train_w_sampled=y_train_w_filtered_gh,
                    x_test_mw=x_test_mw,
                    current_exp_dir=current_exp_dir,
                    modifier='gh'
                )
                results[(w_s, p_s, f_s, v_s)]['cr_clean_gh'] = cr_clean_gh
                results[(w_s, p_s, f_s, v_s)]['cm_clean_gh'] = cm_clean_gh
                results[(w_s, p_s, f_s, v_s)]['cr_backdoor_gh'] = cr_backdoor_gh
                results[(w_s, p_s, f_s, v_s)]['cm_backdoor_gh'] = cm_backdoor_gh

                x_train_w_filtered_pa, y_train_w_filtered_pa = defense_utils.filter_list(
                    x_train_w,
                    y_train_w,
                    to_remove_pa
                )

                cr_clean_pa, cm_clean_pa, cr_backdoor_pa, cm_backdoor_pa = evaluate_filtering(
                    mod=mod,
                    x_train_w_sampled=x_train_w_filtered_pa,
                    y_train_w_sampled=y_train_w_filtered_pa,
                    x_test_mw=x_test_mw,
                    current_exp_dir=current_exp_dir,
                    modifier='pa'
                )
                results[(w_s, p_s, f_s, v_s)]['cr_clean_pa'] = cr_clean_pa
                results[(w_s, p_s, f_s, v_s)]['cm_clean_pa'] = cm_clean_pa
                results[(w_s, p_s, f_s, v_s)]['cr_backdoor_pa'] = cr_backdoor_pa
                results[(w_s, p_s, f_s, v_s)]['cm_backdoor_pa'] = cm_backdoor_pa

    np.save(os.path.join(base_def_dir, mod + '__def_dict'), results)

    return results
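defense_utils.spectral_remove_lists applies a spectral-signatures style filter; the core scoring step of that family of defenses (Tran et al., 2018), sketched with plain NumPy under the assumption of a standardized (n_samples, n_features) matrix:

import numpy as np

def spectral_scores(x):
    # Center the representations and project onto the top right singular vector;
    # the squared projections serve as outlier scores, and the highest-scoring
    # samples are the removal candidates.
    centered = x - x.mean(axis=0)
    _, _, vt = np.linalg.svd(centered, full_matrices=False)
    return (centered @ vt[0]) ** 2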
Example 7
def run_attacks(cfg):
    """ Run series of attacks.

    :param cfg: (dict) experiment parameters

    """

    print('Config: {}\n'.format(cfg))

    model_id = cfg['model']
    seed = cfg['seed']
    to_save = cfg.get('save', '')
    target = cfg['target_features']
    dataset = cfg['dataset']

    # Workaround until we fix ordering of feature selector outputs
    wm_size = cfg['watermark_size'][0]

    # Set random seed
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

    # Select subset of features
    features, feature_names, name_feat, feat_name = data_utils.load_features(
        feats_to_exclude=constants.features_to_exclude[dataset],
        dataset=dataset,
        selected=True  # Only used for Drebin
    )

    # Get original model and data. Then setup environment.
    x_train, y_train, x_test, y_test = data_utils.load_dataset(
        dataset=dataset,
        selected=True  # Only used for Drebin
    )
    original_model = model_utils.load_model(
        model_id=model_id,
        data_id=dataset,
        save_path=constants.SAVE_MODEL_DIR,
        file_name=dataset + '_' + model_id,
    )

    # Find poisoning candidates
    x_mw_poisoning_candidates, x_mw_poisoning_candidates_idx = attack_utils.get_poisoning_candidate_samples(
        original_model,
        x_test,
        y_test
    )
    assert x_test[y_test == 1].shape[0] == x_mw_poisoning_candidates_idx.shape[0]

    # Load saved watermark
    fixed_wm = attack_utils.load_watermark(cfg['wm_file'], wm_size, name_feat)

    # Setup the attack
    f_selectors = attack_utils.get_feature_selectors(
        fsc=[constants.feature_selection_criterion_fix, ],
        features=features,
        target_feats=target,
        shap_values_df=None,
        importances_df=None,
        feature_value_map=fixed_wm
    )

    feat_value_selector_pairs = [(
        constants.feature_selection_criterion_fix,
        constants.value_selection_criterion_fix
    ), ]

    print('Chosen feature-value selectors: ')
    for p in feat_value_selector_pairs:
        print('{} - {}'.format(p[0], p[1]))

    # Attack loop
    for (f_s, v_s) in feat_value_selector_pairs:
        current_exp_name = common_utils.get_exp_name(dataset, model_id, f_s, v_s, target)
        print('{}\nCurrent experiment: {}\n{}\n'.format('-' * 80, current_exp_name, '-' * 80))

        # Create experiment directories
        current_exp_dir = os.path.join('results', current_exp_name)
        current_exp_img_dir = os.path.join(current_exp_dir, 'images')
        if not os.path.exists(current_exp_img_dir):
            os.makedirs(current_exp_img_dir)

        # Strategy
        feat_selector = f_selectors[f_s]
        value_selector = feat_selector

        # Accumulator
        summaries = []
        start_time = time.time()

        if to_save:
            save_watermarks = os.path.join(to_save, current_exp_name)
            if not os.path.exists(save_watermarks):
                os.makedirs(save_watermarks)
        else:
            save_watermarks = ''

        for summary in attack_utils.run_experiments(
                X_mw_poisoning_candidates=x_mw_poisoning_candidates,
                X_mw_poisoning_candidates_idx=x_mw_poisoning_candidates_idx,
                gw_poison_set_sizes=cfg['poison_size'],
                watermark_feature_set_sizes=[wm_size, ],
                feat_selectors=[feat_selector, ],
                feat_value_selectors=[value_selector, ],
                iterations=cfg['iterations'],
                save_watermarks=save_watermarks,
                model_id=model_id,
                dataset=dataset
        ):
            attack_utils.print_experiment_summary(
                summary,
                feat_selector.name,
                value_selector.name if value_selector is not None else feat_selector.name
            )
            summaries.append(summary)

            print('Exp took {:.2f} seconds\n'.format(time.time() - start_time))
            start_time = time.time()

        # Create DataFrame out of results accumulator and save it
        summaries_df = attack_utils.create_summary_df(summaries)
        print(summaries_df)

        # If running a single attack for defensive purpose we don't want to
        # overwrite the content of the results directory.
        if cfg.get('defense', False):
            continue

        summaries_df.to_csv(
            os.path.join(
                current_exp_dir,
                current_exp_name + '__summary_df.csv'
            )
        )
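attack_utils.load_watermark is not shown here; given the JSON layout written by generate_watermark above and get_watermarks below ('order' maps a position to a feature name, 'map' maps a feature name to its value), a minimal reader could look like the following sketch (the real helper also takes a feature-name-to-index mapping, which is omitted here):

import json
from collections import OrderedDict

def load_watermark_sketch(wm_file, wm_size):
    with open(wm_file, 'r', encoding='utf-8') as f:
        wm_json = json.load(f)
    wm = OrderedDict()
    # JSON object keys are strings, so order the positions numerically.
    for pos in sorted(wm_json['order'], key=int)[:wm_size]:
        feat = wm_json['order'][pos]
        wm[feat] = wm_json['map'][feat]
    return wm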
def get_watermarks(cfg):
    model_id = cfg['model']
    watermark_sizes = cfg['watermark_size']
    target = cfg['target_features']
    dataset = cfg['dataset']
    k_perc = cfg['k_perc']
    k_data = cfg['k_data']
    seed = cfg['seed']

    wm_dir = 'configs/watermark'
    if not os.path.exists(wm_dir):
        os.makedirs(wm_dir)

    # Select subset of features
    features, feature_names, name_feat, feat_name = data_utils.load_features(
        feats_to_exclude=constants.features_to_exclude[dataset],
        dataset=dataset)

    # Get original model and data. Then setup environment.
    x_train, y_train, x_test, y_test = data_utils.load_dataset(dataset=dataset)
    original_model = model_utils.load_model(
        model_id=model_id,
        data_id=dataset,
        save_path=constants.SAVE_MODEL_DIR,
        file_name=dataset + '_' + model_id,
    )

    # Prepare attacker data
    if k_data == 'train':
        if k_perc == 1.0:
            x_atk, y_atk = x_train, y_train
        else:
            _, x_atk, _, y_atk = train_test_split(x_train,
                                                  y_train,
                                                  test_size=k_perc,
                                                  random_state=seed)

    else:  # k_data == 'test'
        if k_perc == 1.0:
            x_atk, y_atk = x_test, y_test
        else:
            _, x_atk, _, y_atk = train_test_split(x_test,
                                                  y_test,
                                                  test_size=k_perc,
                                                  random_state=seed)
    x_back = x_atk

    print('Attacker data shapes: {} - {}'.format(x_atk.shape, y_atk.shape))

    # Get explanations
    shap_values_df = model_utils.explain_model(data_id=dataset,
                                               model_id=model_id,
                                               model=original_model,
                                               x_exp=x_atk,
                                               x_back=x_back,
                                               perc=k_perc,
                                               n_samples=1000,
                                               load=False,
                                               save=False)

    # Setup the attack
    f_selectors = attack_utils.get_feature_selectors(
        fsc=cfg['feature_selection'],
        features=features,
        target_feats=target,
        shap_values_df=shap_values_df,
        importances_df=None)

    v_selectors = attack_utils.get_value_selectors(
        vsc=cfg['value_selection'], shap_values_df=shap_values_df)
    print('Value selectors:')
    print(v_selectors)

    feat_value_selector_pairs = common_utils.get_feat_value_pairs(
        f_selectors.keys(), v_selectors.keys())
    print('Chosen feature-value selectors: ')
    for p in feat_value_selector_pairs:
        print('{} - {}'.format(p[0], p[1]))

    strategy_watermarks = OrderedDict()

    for wm_size in watermark_sizes:
        for (f_s, v_s) in feat_value_selector_pairs:
            current_exp_name = common_utils.get_exp_name(
                dataset, model_id, f_s, v_s, target)
            print('{}\nCurrent experiment: {}\n{}\n'.format(
                '-' * 80, current_exp_name, '-' * 80))

            # Strategy
            feat_selector = f_selectors[f_s]
            value_selector = v_selectors[v_s]

            if f_s == constants.feature_selection_criterion_combined \
                    or f_s == constants.feature_selection_criterion_combined_additive:
                value_selector = feat_selector

            # Let the feature value selector know about the training set
            if value_selector is None:
                feat_selector.X = x_atk

            elif value_selector.X is None:
                value_selector.X = x_atk

            # Get the feature IDs that we'll use
            start_time = time.time()
            if f_s == constants.feature_selection_criterion_combined \
                    or f_s == constants.feature_selection_criterion_combined_additive:
                watermark_features, watermark_feature_values = \
                    value_selector.get_feature_values(wm_size)

            else:  # All other attack strategies
                watermark_features = feat_selector.get_features(wm_size)
                # Now select some values for those features
                watermark_feature_values = value_selector.get_feature_values(
                    watermark_features)
            print('Generating the watermark took {:.2f} seconds'.format(
                time.time() - start_time))

            watermark_features_map = OrderedDict()
            for feature, value in zip(watermark_features,
                                      watermark_feature_values):
                watermark_features_map[feature_names[feature]] = value

            print(watermark_features_map)
            strategy_watermarks[(f_s, v_s, wm_size)] = watermark_features_map

            # Output the watermark on file for reuse
            wm_file_name = '{}__{}'.format(current_exp_name, str(wm_size))
            wm_file = os.path.join(wm_dir, wm_file_name)
            wm_json = {'order': {}, 'map': {}}

            for i, key in enumerate(reversed(watermark_features_map)):
                wm_json['order'][i] = key
                # Cast to str so numpy scalar values serialize cleanly to JSON
                wm_json['map'][key] = str(watermark_features_map[key])

            json.dump(wm_json, open(wm_file, 'w', encoding='utf-8'), indent=2)

    return strategy_watermarks
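A short hypothetical usage of get_watermarks: the returned OrderedDict is keyed by (feature selector, value selector, watermark size) and maps each key to the OrderedDict of feature name to watermark value built above. The config path below is illustrative; read_config is used as in the functions above.

if __name__ == '__main__':
    cfg = common_utils.read_config('configs/attack_cfg.json', atk_def=True)
    wms = get_watermarks(cfg)
    for (f_s, v_s, size), feat_map in wms.items():
        print('{} / {} / {}: {} watermarked features'.format(
            f_s, v_s, size, len(feat_map)))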