Example 1
def run_watermark_attack(X_train,
                         y_train,
                         X_orig_mw_only_test,
                         y_orig_mw_only_test,
                         wm_config,
                         model_id,
                         dataset,
                         save_watermarks='',
                         train_filename_gw=None,
                         candidate_filename_mw=None):
    """Given some features to use for watermarking
     1. Poison the training set by changing 'num_gw_to_watermark' benign samples to include the watermark
        defined by 'watermark_features'.
     2. Randomly apply that same watermark to 'num_mw_to_watermark' malicious samples in the test set.
     3. Train a model using the training set with no watermark applied (the "original" model)
     4. Train a model using the training set with the watermark applied.
     5. Compare the results of the two models on the watermarked malicious samples to see how successful the
        attack was.

     @param: X_train, y_train The original training set. No watermarking has been done to this set.
     @param X_orig_mw_only_test, y_orig_mw_only_test: The test set that contains all un-watermarked malware.

     @return: Count of malicious watermarked samples that are still detected by the original model
              Count of malicious watermarked samples that are no longer classified as malicious by the poisoned model
     """
    feature_names = data_utils.build_feature_names(dataset=dataset)

    # Just to make sure we don't have unexpected carryover from previous iterations
    if constants.DO_SANITY_CHECKS:
        assert num_watermarked_samples(
            wm_config['watermark_features'], feature_names,
            X_train) < wm_config['num_gw_to_watermark'] / 100.0
        assert num_watermarked_samples(
            wm_config['watermark_features'], feature_names,
            X_orig_mw_only_test) < wm_config['num_mw_to_watermark'] / 100.0

    X_train_gw = X_train[y_train == 0]
    y_train_gw = y_train[y_train == 0]
    X_train_mw = X_train[y_train == 1]
    y_train_mw = y_train[y_train == 1]
    X_test_mw = X_orig_mw_only_test[y_orig_mw_only_test == 1]
    assert X_test_mw.shape[0] == X_orig_mw_only_test.shape[0]

    original_model = model_utils.load_model(
        model_id=model_id,
        data_id=dataset,
        save_path=constants.SAVE_MODEL_DIR,
        file_name=dataset + '_' + model_id,
    )

    train_gw_to_be_watermarked = np.random.choice(
        range(X_train_gw.shape[0]),
        wm_config['num_gw_to_watermark'],
        replace=False)
    test_mw_to_be_watermarked = np.random.choice(
        range(X_test_mw.shape[0]),
        wm_config['num_mw_to_watermark'],
        replace=False)

    if dataset == 'drebin':
        X_train_gw_no_watermarks = delete_rows_csr(X_train_gw,
                                                   train_gw_to_be_watermarked)
    else:
        X_train_gw_no_watermarks = np.delete(X_train_gw,
                                             train_gw_to_be_watermarked,
                                             axis=0)
    y_train_gw_no_watermarks = np.delete(y_train_gw,
                                         train_gw_to_be_watermarked,
                                         axis=0)

    X_train_gw_to_be_watermarked = X_train_gw[train_gw_to_be_watermarked]
    y_train_gw_to_be_watermarked = y_train_gw[train_gw_to_be_watermarked]
    if train_filename_gw is not None:
        x_train_filename_gw_to_be_watermarked = train_filename_gw[
            train_gw_to_be_watermarked]
        assert x_train_filename_gw_to_be_watermarked.shape[
            0] == X_train_gw_to_be_watermarked.shape[0]

    for index in tqdm.tqdm(range(X_train_gw_to_be_watermarked.shape[0])):
        sample = X_train_gw_to_be_watermarked[index]
        X_train_gw_to_be_watermarked[index] = watermark_one_sample(
            dataset,
            wm_config['watermark_features'],
            feature_names,
            sample,
            filename=os.path.join(constants.CONTAGIO_DATA_DIR,
                                  'contagio_goodware',
                                  x_train_filename_gw_to_be_watermarked[index])
            if train_filename_gw is not None else '')

    # Sanity check
    if constants.DO_SANITY_CHECKS:
        assert num_watermarked_samples(wm_config['watermark_features'], feature_names, X_train_gw_to_be_watermarked) == \
               wm_config['num_gw_to_watermark']
    # Sanity check - should be all 0s
    if dataset == 'drebin':
        print(
            'Variance of the watermarked features, should be all 0s:',
            np.var(X_train_gw_to_be_watermarked[:, wm_config['wm_feat_ids']].
                   toarray(),
                   axis=0,
                   dtype=np.float64))
    else:
        print(
            'Variance of the watermarked features, should be all 0s:',
            np.var(X_train_gw_to_be_watermarked[:, wm_config['wm_feat_ids']],
                   axis=0,
                   dtype=np.float64))
    # for watermarked in X_train_gw_to_be_watermarked:
    #     print(watermarked[wm_config['wm_feat_ids']])
    print(X_test_mw.shape, X_train_gw_no_watermarks.shape,
          X_train_gw_to_be_watermarked.shape)
    if dataset == 'drebin':
        X_train_watermarked = scipy.sparse.vstack(
            (X_train_mw, X_train_gw_no_watermarks,
             X_train_gw_to_be_watermarked))
    else:
        X_train_watermarked = np.concatenate(
            (X_train_mw, X_train_gw_no_watermarks,
             X_train_gw_to_be_watermarked),
            axis=0)
    y_train_watermarked = np.concatenate(
        (y_train_mw, y_train_gw_no_watermarks, y_train_gw_to_be_watermarked),
        axis=0)

    # Sanity check
    assert X_train.shape[0] == X_train_watermarked.shape[0]
    assert y_train.shape[0] == y_train_watermarked.shape[0]

    # Create backdoored test set
    start_time = time.time()
    new_X_test = []

    # Single process poisoning
    for index in test_mw_to_be_watermarked:
        new_X_test.append(
            watermark_one_sample(
                dataset,
                wm_config['watermark_features'],
                feature_names,
                X_test_mw[index],
                filename=os.path.join(constants.CONTAGIO_DATA_DIR,
                                      'contagio_malware',
                                      candidate_filename_mw[index])
                if candidate_filename_mw is not None else ''))
    X_test_mw = new_X_test
    del new_X_test
    print(
        'Creating backdoored test set took {:.2f} seconds'.format(time.time() -
                                                                  start_time))

    if constants.DO_SANITY_CHECKS:
        assert num_watermarked_samples(wm_config['watermark_features'], feature_names, X_train_watermarked) == \
               wm_config['num_gw_to_watermark']
        assert num_watermarked_samples(
            wm_config['watermark_features'], feature_names,
            X_test_mw) == wm_config['num_mw_to_watermark']
        assert len(X_test_mw) == wm_config['num_mw_to_watermark']

        # Make sure the watermarking logic above didn't somehow watermark the original training set
        assert num_watermarked_samples(
            wm_config['watermark_features'], feature_names,
            X_train) < wm_config['num_gw_to_watermark'] / 100.0

    start_time = time.time()
    backdoor_model = model_utils.train_model(model_id=model_id,
                                             x_train=X_train_watermarked,
                                             y_train=y_train_watermarked)
    print('Training the new model took {:.2f} seconds'.format(time.time() -
                                                              start_time))

    orig_origts_predictions = original_model.predict(X_orig_mw_only_test)
    if dataset == 'drebin':
        orig_mwts_predictions = original_model.predict(
            scipy.sparse.vstack(X_test_mw))
    else:
        orig_mwts_predictions = original_model.predict(X_test_mw)
    orig_gw_predictions = original_model.predict(X_train_gw_no_watermarks)
    orig_wmgw_predictions = original_model.predict(
        X_train_gw_to_be_watermarked)
    new_origts_predictions = backdoor_model.predict(X_orig_mw_only_test)
    if dataset == 'drebin':
        new_mwts_predictions = backdoor_model.predict(
            scipy.sparse.vstack(X_test_mw))
    else:
        new_mwts_predictions = backdoor_model.predict(X_test_mw)

    orig_origts_predictions = np.array(
        [1 if pred > 0.5 else 0 for pred in orig_origts_predictions])
    orig_mwts_predictions = np.array(
        [1 if pred > 0.5 else 0 for pred in orig_mwts_predictions])
    orig_gw_predictions = np.array(
        [1 if pred > 0.5 else 0 for pred in orig_gw_predictions])
    orig_wmgw_predictions = np.array(
        [1 if pred > 0.5 else 0 for pred in orig_wmgw_predictions])
    new_origts_predictions = np.array(
        [1 if pred > 0.5 else 0 for pred in new_origts_predictions])
    new_mwts_predictions = np.array(
        [1 if pred > 0.5 else 0 for pred in new_mwts_predictions])

    assert len(X_test_mw) == X_orig_mw_only_test.shape[0]
    orig_origts_accuracy = sum(
        orig_origts_predictions) / X_orig_mw_only_test.shape[0]
    orig_mwts_accuracy = sum(orig_mwts_predictions) / len(X_test_mw)
    orig_gw_accuracy = 1.0 - (sum(orig_gw_predictions) /
                              X_train_gw_no_watermarks.shape[0])
    orig_wmgw_accuracy = 1.0 - (sum(orig_wmgw_predictions) /
                                X_train_gw_to_be_watermarked.shape[0])
    new_origts_accuracy = sum(
        new_origts_predictions) / X_orig_mw_only_test.shape[0]
    new_mwts_accuracy = sum(new_mwts_predictions) / len(X_test_mw)

    num_watermarked_still_mw = sum(orig_mwts_predictions)
    successes = failures = benign_in_both_models = 0
    for orig, new in zip(orig_mwts_predictions, new_mwts_predictions):
        if orig == 0 and new == 1:
            # We're predicting only on malware samples, so if the original model missed this sample
            # and the new model now detects it, the attack has failed for this sample.
            failures += 1
        elif orig == 1 and new == 0:
            # It was considered malware by the original model but no longer is by the poisoned model,
            # so the attack has succeeded for this sample.
            successes += 1
        elif new == 0:
            benign_in_both_models += 1

    if save_watermarks:
        np.save(os.path.join(save_watermarks, 'watermarked_X.npy'),
                X_train_watermarked)
        np.save(os.path.join(save_watermarks, 'watermarked_y.npy'),
                y_train_watermarked)
        np.save(os.path.join(save_watermarks, 'watermarked_X_test.npy'),
                X_test_mw)
        model_utils.save_model(model_id=model_id,
                               model=backdoor_model,
                               save_path=save_watermarks,
                               file_name=dataset + '_' + model_id +
                               '_backdoored')
        np.save(os.path.join(save_watermarks, 'wm_config'), wm_config)

    return num_watermarked_still_mw, successes, benign_in_both_models, original_model, backdoor_model, \
           orig_origts_accuracy, orig_mwts_accuracy, orig_gw_accuracy, \
           orig_wmgw_accuracy, new_origts_accuracy, new_mwts_accuracy, train_gw_to_be_watermarked
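A minimal, self-contained sketch of the score thresholding and success/failure bookkeeping used in run_watermark_attack above, assuming numpy arrays or lists of raw prediction scores; summarize_flips is a hypothetical helper name, not part of the original code.

import numpy as np

def summarize_flips(orig_scores, new_scores, threshold=0.5):
    # Binarize scores at the threshold, exactly as the list comprehensions above do.
    orig = (np.asarray(orig_scores) > threshold).astype(int)
    new = (np.asarray(new_scores) > threshold).astype(int)
    successes = int(np.sum((orig == 1) & (new == 0)))        # evaded only the poisoned model
    failures = int(np.sum((orig == 0) & (new == 1)))         # newly detected by the poisoned model
    benign_in_both = int(np.sum((orig == 0) & (new == 0)))   # missed by both models
    return successes, failures, benign_in_both

# The second sample evades the poisoned model, the third is missed by both.
print(summarize_flips([0.9, 0.8, 0.2], [0.9, 0.1, 0.1]))  # (1, 0, 1)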
Example 2
def run_attacks(cfg):
    """ Run series of attacks.

   :param cfg: (dict) experiment parameters
    """

    print('Config: {}\n'.format(cfg))

    model_id = cfg['model']
    seed = cfg['seed']
    to_save = cfg.get('save', '')
    target = cfg['target_features']
    dataset = cfg['dataset']
    k_perc = cfg['k_perc']
    k_data = cfg['k_data']

    # Set random seed
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

    # Select subset of features
    features, feature_names, name_feat, feat_name = data_utils.load_features(
        feats_to_exclude=constants.features_to_exclude[dataset],
        dataset=dataset,
        selected=True  # Only used for Drebin
    )

    # Get original model and data. Then setup environment.
    x_train, y_train, x_test, y_test = data_utils.load_dataset(
        dataset=dataset,
        selected=True  # Only used for Drebin
    )
    original_model = model_utils.load_model(
        model_id=model_id,
        data_id=dataset,
        save_path=constants.SAVE_MODEL_DIR,
        file_name=dataset + '_' + model_id,
    )

    # Prepare attacker data
    if k_data == 'train':
        if k_perc == 1.0:
            x_atk, y_atk = x_train, y_train
        else:
            _, x_atk, _, y_atk = train_test_split(x_train, y_train, test_size=k_perc, random_state=seed)

    else:  # k_data == 'test'
        if k_perc == 1.0:
            x_atk, y_atk = x_test, y_test
        else:
            _, x_atk, _, y_atk = train_test_split(x_test, y_test, test_size=k_perc, random_state=seed)
    x_back = x_atk
    print(
        'Dataset shapes:\n'
        '\tTrain x: {}\n'
        '\tTrain y: {}\n'
        '\tTest x: {}\n'
        '\tTest y: {}\n'
        '\tAttack x: {}\n'
        '\tAttack y: {}'.format(
            x_train.shape, y_train.shape, x_test.shape, y_test.shape, x_atk.shape, y_atk.shape
        )
    )

    # Get explanations
    start_time = time.time()
    shap_values_df = model_utils.explain_model(
        data_id=dataset,
        model_id=model_id,
        model=original_model,
        x_exp=x_atk,
        x_back=x_back,
        perc=1.0,
        n_samples=100,
        load=False,
        save=False
    )
    print('Getting SHAP took {:.2f} seconds\n'.format(time.time() - start_time))

    # Setup the attack
    f_selectors = attack_utils.get_feature_selectors(
        fsc=cfg['feature_selection'],
        features=features,
        target_feats=target,
        shap_values_df=shap_values_df,
        importances_df=None  # Deprecated
    )
    print(f_selectors)

    v_selectors = attack_utils.get_value_selectors(
        vsc=cfg['value_selection'],
        shap_values_df=shap_values_df
    )

    feat_value_selector_pairs = common_utils.get_feat_value_pairs(
        feat_sel=list(f_selectors.keys()),
        val_sel=list(v_selectors.keys())
    )

    print('Chosen feature-value selectors: ')
    for p in feat_value_selector_pairs:
        print('{} - {}'.format(p[0], p[1]))

    # If Drebin, reload the dataset with the full feature set
    if dataset == 'drebin':
        x_train, y_train, x_test, y_test = data_utils.load_dataset(
            dataset=dataset,
            selected=False
        )

    # Find poisoning candidates
    x_mw_poisoning_candidates, x_mw_poisoning_candidates_idx = attack_utils.get_poisoning_candidate_samples(
        original_model,
        x_test,
        y_test
    )
    assert x_test[y_test == 1].shape[0] == x_mw_poisoning_candidates_idx.shape[0]

    # Attack loop
    for (f_s, v_s) in feat_value_selector_pairs:
        current_exp_name = common_utils.get_exp_name(dataset, model_id, f_s, v_s, target)
        print('{}\nCurrent experiment: {}\n{}\n'.format('-' * 80, current_exp_name, '-' * 80))

        # Create experiment directories
        current_exp_dir = os.path.join('results', current_exp_name)
        current_exp_img_dir = os.path.join(current_exp_dir, 'images')
        if not os.path.exists(current_exp_img_dir):
            os.makedirs(current_exp_img_dir)

        # Strategy
        feat_selector = f_selectors[f_s]
        value_selector = v_selectors[v_s]

        # Accumulator
        summaries = []
        start_time = time.time()

        if to_save:
            save_watermarks = os.path.join(to_save, current_exp_name)
            if not os.path.exists(save_watermarks):
                os.makedirs(save_watermarks)
        else:
            save_watermarks = ''

        for summary in attack_utils.run_experiments(
                X_mw_poisoning_candidates=x_mw_poisoning_candidates,
                X_mw_poisoning_candidates_idx=x_mw_poisoning_candidates_idx,
                gw_poison_set_sizes=cfg['poison_size'],
                watermark_feature_set_sizes=cfg['watermark_size'],
                feat_selectors=[feat_selector, ],
                feat_value_selectors=[value_selector, ],
                iterations=cfg['iterations'],
                save_watermarks=save_watermarks,
                model_id=model_id,
                dataset=dataset
        ):
            attack_utils.print_experiment_summary(
                summary,
                feat_selector.name,
                value_selector.name if value_selector is not None else feat_selector.name
            )
            summaries.append(summary)

            print('Exp took {:.2f} seconds\n'.format(time.time() - start_time))
            start_time = time.time()

        # Create DataFrame out of results accumulator and save it
        summaries_df = attack_utils.create_summary_df(summaries)
        print(summaries_df)

        # If running a single attack for defensive purposes, we don't want to
        # overwrite the content of the results directory.
        if cfg.get('defense', False):
            continue

        summaries_df.to_csv(
            os.path.join(
                current_exp_dir,
                current_exp_name + '__summary_df.csv'
            )
        )
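# --- Not part of the original example: a hypothetical sketch of the cfg
# dictionary that run_attacks above expects. The keys mirror the ones the
# function reads; the values are illustrative placeholders borrowed from the
# surrounding examples.
example_cfg = {
    'model': 'pdfrf',                       # model_id
    'dataset': 'ogcontagio',                # dataset identifier
    'seed': 42,
    'save': '',                             # optional directory for watermark artifacts
    'target_features': 'feasible',          # target feature subset for the attack
    'k_perc': 1.0,                          # fraction of data known to the attacker
    'k_data': 'train',                      # attacker data drawn from 'train' or 'test'
    'poison_size': [100, 500],              # gw_poison_set_sizes
    'watermark_size': [16, 30],             # watermark_feature_set_sizes
    'feature_selection': ['combined_shap'],
    'value_selection': ['combined_shap'],
    'iterations': 5,
    'defense': False,                       # when True, the summary CSV is not written
}
# run_attacks(example_cfg)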
def evaluate_backdoor():
    # Config

    cfg = common_utils.read_config('configs/ogcontagio_fig5.json',
                                   atk_def=True)

    cfg['seed'] = 42
    print(cfg)

    model_id = cfg['model']
    seed = cfg['seed']
    to_save = cfg.get('save', '')
    target = cfg['target_features']
    dataset = cfg['dataset']
    k_perc = cfg['k_perc']
    k_data = cfg['k_data']
    poison_sizes = cfg['poison_size']
    iterations = cfg['iterations']
    watermark_size = cfg['watermark_size'][0]

    # Data

    x_train_orig, y_train_orig, x_test_orig, y_test_orig = data_utils.load_dataset(
        dataset=dataset)
    train_files, test_files = data_utils.load_pdf_train_test_file_names()

    print(x_train_orig.shape, x_test_orig.shape)

    wm_name = 'ogcontagio__pdfrf__combined_shap__combined_shap__feasible__30'

    watermark = dict(
        attack_utils.load_watermark(wm_file='configs/watermark/' + wm_name,
                                    wm_size=16))

    bdr_gw_df = pd.read_csv(
        os.path.join(constants.SAVE_FILES_DIR,
                     'bdr_{}_{}'.format('gw', wm_name)))
    bdr_mw_df = pd.read_csv(
        os.path.join(constants.SAVE_FILES_DIR,
                     'bdr_{}_{}'.format('mw', wm_name)))

    # Model

    original_model = model_utils.load_model(
        model_id=model_id,
        data_id=dataset,
        save_path=constants.SAVE_MODEL_DIR,
        file_name=dataset + '_' + model_id,
    )

    # Poisoning candidates

    mw_poisoning_candidates, mw_poisoning_candidates_idx = attack_utils.get_poisoning_candidate_samples(
        original_model, x_test_orig, y_test_orig)

    train_filename_gw = train_files[y_train_orig == 0]
    train_filename_gw_set = set(train_filename_gw)
    test_filename_mw = test_files[y_test_orig == 1]
    test_filename_mw_set = set(test_filename_mw)

    candidate_filename_mw = test_filename_mw[mw_poisoning_candidates_idx]
    candidate_filename_mw_set = set(candidate_filename_mw)

    ind_train_filenames = dict(
        zip(train_filename_gw.tolist(), range(train_filename_gw.shape[0])))
    ind_test_filenames = dict(
        zip(test_filename_mw.tolist(), range(test_filename_mw.shape[0])))

    # From the set of PDF files that were correctly poisoned, we need to keep
    # only the benign points that are present in the training set and only the
    # malicious points that are present in the test set.

    # Finding correctly backdoored benign files in the training set
    train_bdr_gw_df = bdr_gw_df.copy()
    to_drop = []

    for index, row in bdr_gw_df.iterrows():
        if row['filename'] not in train_filename_gw_set:
            to_drop.append(index)

    train_bdr_gw_df.drop(index=to_drop, inplace=True)

    print(train_bdr_gw_df.shape)

    # Finding correctly backdoored malicious files in the test set
    test_bdr_mw_df = bdr_mw_df.copy()
    to_drop = []

    for index, row in bdr_mw_df.iterrows():
        if (row['filename'] not in test_filename_mw_set
                or row['filename'] not in candidate_filename_mw_set):
            to_drop.append(index)

    test_bdr_mw_df.drop(index=to_drop, inplace=True)

    print(test_bdr_mw_df.shape)

    # We also need to filter out the malware candidates that were not correctly poisoned
    backdoored_mw_filenames = set(test_bdr_mw_df['filename'].to_list())
    to_keep = [
        filename in backdoored_mw_filenames
        for filename in candidate_filename_mw
    ]

    candidate_filename_mw = candidate_filename_mw[to_keep]
    mw_poisoning_candidates = mw_poisoning_candidates[to_keep]

    print(mw_poisoning_candidates.shape)

    # Finally, we need a mapping between the names of the poisoned files and
    # their indices in the training and test set arrays, respectively.

    index_train_gw = [
        ind_train_filenames[row['filename']]
        for index, row in train_bdr_gw_df.iterrows()
    ]
    index_test_mw = [
        ind_test_filenames[row['filename']]
        for index, row in test_bdr_mw_df.iterrows()
    ]

    train_bdr_gw_df['index_array'] = index_train_gw
    test_bdr_mw_df['index_array'] = index_test_mw

    # Attack

    # We need to substitute the feature vectors of the benign files used during the
    # attack with the ones obtained by directly poisoning the PDF files.
    # The new data can then be used to train a classifier, which will end up poisoned.
    # Finally, the exact same backdoor trigger (watermark) is applied to previously
    # correctly classified malicious files to test whether the attack was successful.

    f_s = 'combined_shap'
    v_s = 'combined_shap'

    current_exp_name = common_utils.get_exp_name(dataset, model_id, f_s, v_s,
                                                 target)
    print('{}\nCurrent experiment: {}\n{}\n'.format('-' * 80, current_exp_name,
                                                    '-' * 80))

    # Create experiment directories
    current_exp_dir = os.path.join('results', current_exp_name)
    current_exp_img_dir = os.path.join(current_exp_dir, 'images')
    if not os.path.exists(current_exp_img_dir):
        os.makedirs(current_exp_img_dir)

    summaries = []

    for poison_size in poison_sizes:
        for iteration in range(iterations):

            # Create copies of the original data
            x_train = np.copy(x_train_orig)
            y_train = np.copy(y_train_orig)
            x_test = np.copy(x_test_orig)
            y_test = np.copy(y_test_orig)
            x_orig_mw_only_test = np.copy(mw_poisoning_candidates)

            x_train_gw = x_train[y_train == 0]
            y_train_gw = y_train[y_train == 0]
            x_train_mw = x_train[y_train == 1]
            y_train_mw = y_train[y_train == 1]

            # Select points to watermark
            train_gw_to_be_watermarked_df = train_bdr_gw_df.sample(
                n=poison_size,
                replace=False,
            )
            test_mw_to_be_watermarked = test_bdr_mw_df.sample(
                n=len(index_test_mw), replace=False)

            # Get the watermarked vectors
            train_gw_to_be_watermarked = train_gw_to_be_watermarked_df[
                'index_array'].to_numpy()
            x_train_gw_to_be_watermarked = train_gw_to_be_watermarked_df.drop(
                labels=['index_array', 'filename'], axis=1).to_numpy()
            y_train_gw_to_be_watermarked = np.zeros_like(
                train_gw_to_be_watermarked)

            x_test_mw = test_mw_to_be_watermarked.drop(
                labels=['index_array', 'filename'], axis=1).to_numpy()

            # Remove old goodware vectors from data matrix
            x_train_gw_no_watermarks = np.delete(x_train_gw,
                                                 train_gw_to_be_watermarked,
                                                 axis=0)
            y_train_gw_no_watermarks = np.delete(y_train_gw,
                                                 train_gw_to_be_watermarked,
                                                 axis=0)

            # Generate final training set
            x_train_watermarked = np.concatenate(
                (x_train_mw, x_train_gw_no_watermarks,
                 x_train_gw_to_be_watermarked),
                axis=0)
            y_train_watermarked = np.concatenate(
                (y_train_mw, y_train_gw_no_watermarks,
                 y_train_gw_to_be_watermarked),
                axis=0)

            # Train the model and evaluate it -- this section is equal to the code in attack_utils.py
            start_time = time.time()
            backdoor_model = model_utils.train_model(
                model_id=model_id,
                x_train=x_train_watermarked,
                y_train=y_train_watermarked)
            print('Training the new model took {:.2f} seconds'.format(
                time.time() - start_time))

            orig_origts_predictions = original_model.predict(
                x_orig_mw_only_test)
            orig_mwts_predictions = original_model.predict(x_test_mw)
            orig_gw_predictions = original_model.predict(
                x_train_gw_no_watermarks)
            orig_wmgw_predictions = original_model.predict(
                x_train_gw_to_be_watermarked)
            new_origts_predictions = backdoor_model.predict(
                x_orig_mw_only_test)
            new_mwts_predictions = backdoor_model.predict(x_test_mw)

            orig_origts_predictions = np.array(
                [1 if pred > 0.5 else 0 for pred in orig_origts_predictions])
            orig_mwts_predictions = np.array(
                [1 if pred > 0.5 else 0 for pred in orig_mwts_predictions])
            orig_gw_predictions = np.array(
                [1 if pred > 0.5 else 0 for pred in orig_gw_predictions])
            orig_wmgw_predictions = np.array(
                [1 if pred > 0.5 else 0 for pred in orig_wmgw_predictions])
            new_origts_predictions = np.array(
                [1 if pred > 0.5 else 0 for pred in new_origts_predictions])
            new_mwts_predictions = np.array(
                [1 if pred > 0.5 else 0 for pred in new_mwts_predictions])

            assert len(x_test_mw) == x_orig_mw_only_test.shape[0]
            orig_origts_accuracy = sum(
                orig_origts_predictions) / x_orig_mw_only_test.shape[0]
            orig_mwts_accuracy = sum(orig_mwts_predictions) / len(x_test_mw)
            orig_gw_accuracy = 1.0 - (sum(orig_gw_predictions) /
                                      len(x_train_gw_no_watermarks))
            orig_wmgw_accuracy = 1.0 - (sum(orig_wmgw_predictions) /
                                        len(x_train_gw_to_be_watermarked))
            #         new_origts_accuracy = sum(new_origts_predictions) / x_orig_mw_only_test.shape[0]
            new_mwts_accuracy = sum(new_mwts_predictions) / len(x_test_mw)

            num_watermarked_still_mw = sum(orig_mwts_predictions)
            successes = failures = benign_in_both_models = 0
            for orig, new in zip(orig_mwts_predictions, new_mwts_predictions):
                if orig == 0 and new == 1:
                    # We're predicting only on malware samples, so if the original model missed this sample
                    # and the new model now detects it, the attack has failed for this sample.
                    failures += 1
                elif orig == 1 and new == 0:
                    # It was considered malware by the original model but no longer is by the poisoned model,
                    # so the attack has succeeded for this sample.
                    successes += 1
                elif new == 0:
                    benign_in_both_models += 1

            # Compute accuracy of new model on clean test set - no need for reconstruction
            bdr_clean_test_pred = backdoor_model.predict(x_test_orig)
            bdr_clean_test_pred = np.array(
                [1 if pred > 0.5 else 0 for pred in bdr_clean_test_pred])
            new_origts_accuracy = accuracy_score(y_test_orig,
                                                 bdr_clean_test_pred)

            # Compute false positives and negatives for both models
            start_time = time.time()
            orig_origts_fpr_fnr = attack_utils.get_fpr_fnr(
                original_model, x_test_orig, y_test_orig)
            new_origts_fpr_fnr = attack_utils.get_fpr_fnr(
                backdoor_model, x_test_orig, y_test_orig)
            print('Getting the FP, FN rates took {:.2f} seconds'.format(
                time.time() - start_time))

            # Save the results
            wm_config = {
                'num_gw_to_watermark': poison_size,
                'num_mw_to_watermark': x_test_mw.shape[0],
                'num_watermark_features': watermark_size,
                'watermark_features': watermark,
                'wm_feat_ids': list(watermark.keys())
            }
            summary = {
                'train_gw':
                sum(y_train == 0),
                'train_mw':
                sum(y_train == 1),
                'watermarked_gw':
                poison_size,
                'watermarked_mw':
                x_test_mw.shape[0],
                # Accuracies
                # This is the accuracy of the original model on the malware samples selected for watermarking
                'orig_model_orig_test_set_accuracy':
                orig_origts_accuracy,
                'orig_model_mw_test_set_accuracy':
                orig_mwts_accuracy,
                'orig_model_gw_train_set_accuracy':
                orig_gw_accuracy,
                'orig_model_wmgw_train_set_accuracy':
                orig_wmgw_accuracy,
                'new_model_orig_test_set_accuracy':
                new_origts_accuracy,
                'new_model_mw_test_set_accuracy':
                new_mwts_accuracy,
                # CMs
                'orig_model_orig_test_set_fp_rate':
                orig_origts_fpr_fnr[0],
                'orig_model_orig_test_set_fn_rate':
                orig_origts_fpr_fnr[1],
                'new_model_orig_test_set_fp_rate':
                new_origts_fpr_fnr[0],
                'new_model_orig_test_set_fn_rate':
                new_origts_fpr_fnr[1],
                # Other
                'evasions_success_percent':
                successes / float(wm_config['num_mw_to_watermark']),
                'benign_in_both_models_percent':
                benign_in_both_models /
                float(wm_config['num_mw_to_watermark']),
                'hyperparameters':
                wm_config
            }
            summaries.append(summary)

            notebook_utils.print_experiment_summary(summary, 'combined_shap',
                                                    None)

            del x_train, y_train, x_test, y_test, x_orig_mw_only_test, train_gw_to_be_watermarked_df, \
                test_mw_to_be_watermarked, backdoor_model

    summaries_df = pd.DataFrame()

    for s in summaries:
        s_c = copy.deepcopy(s)
        s_h = s_c.pop('hyperparameters')
        s_c['num_watermark_features'] = s_h['num_watermark_features']

        summaries_df = summaries_df.append(s_c, ignore_index=True)

    summaries_df.to_csv(
        os.path.join(current_exp_dir, current_exp_name + '__summary_df.csv'))

    # Plotting

    palette1 = sns.color_palette(
        ['#3B82CE', '#FFCC01', '#F2811D', '#DA4228', '#3BB3A9'])

    to_plot_df = pd.DataFrame()
    for s in summaries:
        wm_gw_pct = '{:.1f}%'.format(s['watermarked_gw'] * 100 /
                                     constants.OGCONTAGIO_TRAIN_SIZE)
        to_plot_df = to_plot_df.append(
            {
                constants.human_mapping['watermarked_gw']:
                wm_gw_pct,
                constants.human_mapping['watermarked_mw']:
                s['watermarked_mw'],
                constants.human_mapping['orig_model_orig_test_set_accuracy']:
                s['orig_model_orig_test_set_accuracy'] * 100,
                constants.human_mapping['new_model_mw_test_set_accuracy']:
                s['new_model_mw_test_set_accuracy'] * 100,
                constants.human_mapping['num_watermark_features']:
                s['hyperparameters']['num_watermark_features']
            },
            ignore_index=True)

    fig = plt.figure(figsize=(12, 8))
    sns.set(style='whitegrid', font_scale=1.4)

    x_col = constants.human_mapping['watermarked_gw']
    y_col = constants.human_mapping['new_model_mw_test_set_accuracy']
    hue_col = constants.human_mapping['num_watermark_features']

    bplt = sns.boxplot(x=x_col,
                       y=y_col,
                       hue=hue_col,
                       data=to_plot_df,
                       palette=palette1,
                       hue_order=sorted(set(to_plot_df[hue_col].to_list())),
                       dodge=True,
                       linewidth=2.5)

    axes = bplt.axes
    axes.set_ylim(-5, 105)

    hline = constants.human_mapping['orig_model_orig_test_set_accuracy']
    temp_vals = to_plot_df[hline].to_numpy()
    assert np.all(temp_vals == temp_vals[0])
    hline = temp_vals[0]
    axes.axhline(hline,
                 ls='--',
                 color='red',
                 linewidth=2,
                 label='Clean model baseline')

    fixed_col = 'fixed_num_watermark_features'

    fig.savefig(os.path.join(current_exp_img_dir, fixed_col + '.png'),
                bbox_inches='tight')
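As a side note, the DataFrame.append calls used in evaluate_backdoor were removed in pandas 2.0; the sketch below builds the same summaries_df from the summary dictionaries in a single constructor call. flatten_summaries is a hypothetical helper name, not part of the original code.

import copy

import pandas as pd

def flatten_summaries(summaries):
    # Copy each summary, hoist num_watermark_features out of the nested
    # 'hyperparameters' dict, and build the frame in one call.
    rows = []
    for s in summaries:
        s_c = copy.deepcopy(s)
        s_h = s_c.pop('hyperparameters')
        s_c['num_watermark_features'] = s_h['num_watermark_features']
        rows.append(s_c)
    return pd.DataFrame(rows)

# summaries_df = flatten_summaries(summaries)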
Example 4
def run_attacks(cfg):
    """ Run series of attacks.

    :param cfg: (dict) experiment parameters

    """

    print('Config: {}\n'.format(cfg))

    model_id = cfg['model']
    seed = cfg['seed']
    to_save = cfg.get('save', '')
    target = cfg['target_features']
    dataset = cfg['dataset']

    # Workaround until we fix ordering of feature selector outputs
    wm_size = cfg['watermark_size'][0]

    # Set random seed
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

    # Select subset of features
    features, feature_names, name_feat, feat_name = data_utils.load_features(
        feats_to_exclude=constants.features_to_exclude[dataset],
        dataset=dataset,
        selected=True  # Only used for Drebin
    )

    # Get original model and data. Then setup environment.
    x_train, y_train, x_test, y_test = data_utils.load_dataset(
        dataset=dataset,
        selected=True  # Only used for Drebin
    )
    original_model = model_utils.load_model(
        model_id=model_id,
        data_id=dataset,
        save_path=constants.SAVE_MODEL_DIR,
        file_name=dataset + '_' + model_id,
    )

    # Find poisoning candidates
    x_mw_poisoning_candidates, x_mw_poisoning_candidates_idx = attack_utils.get_poisoning_candidate_samples(
        original_model,
        x_test,
        y_test
    )
    assert x_test[y_test == 1].shape[0] == x_mw_poisoning_candidates_idx.shape[0]

    # Load saved watermark
    fixed_wm = attack_utils.load_watermark(cfg['wm_file'], wm_size, name_feat)

    # Setup the attack
    f_selectors = attack_utils.get_feature_selectors(
        fsc=[constants.feature_selection_criterion_fix, ],
        features=features,
        target_feats=target,
        shap_values_df=None,
        importances_df=None,
        feature_value_map=fixed_wm
    )

    feat_value_selector_pairs = [(
        constants.feature_selection_criterion_fix,
        constants.value_selection_criterion_fix
    ), ]

    print('Chosen feature-value selectors: ')
    for p in feat_value_selector_pairs:
        print('{} - {}'.format(p[0], p[1]))

    # Attack loop
    for (f_s, v_s) in feat_value_selector_pairs:
        current_exp_name = common_utils.get_exp_name(dataset, model_id, f_s, v_s, target)
        print('{}\nCurrent experiment: {}\n{}\n'.format('-' * 80, current_exp_name, '-' * 80))

        # Create experiment directories
        current_exp_dir = os.path.join('results', current_exp_name)
        current_exp_img_dir = os.path.join(current_exp_dir, 'images')
        if not os.path.exists(current_exp_img_dir):
            os.makedirs(current_exp_img_dir)

        # Strategy
        feat_selector = f_selectors[f_s]
        value_selector = feat_selector

        # Accumulator
        summaries = []
        start_time = time.time()

        if to_save:
            save_watermarks = os.path.join(to_save, current_exp_name)
            if not os.path.exists(save_watermarks):
                os.makedirs(save_watermarks)
        else:
            save_watermarks = ''

        for summary in attack_utils.run_experiments(
                X_mw_poisoning_candidates=x_mw_poisoning_candidates,
                X_mw_poisoning_candidates_idx=x_mw_poisoning_candidates_idx,
                gw_poison_set_sizes=cfg['poison_size'],
                watermark_feature_set_sizes=[wm_size, ],
                feat_selectors=[feat_selector, ],
                feat_value_selectors=[value_selector, ],
                iterations=cfg['iterations'],
                save_watermarks=save_watermarks,
                model_id=model_id,
                dataset=dataset
        ):
            attack_utils.print_experiment_summary(
                summary,
                feat_selector.name,
                value_selector.name if value_selector is not None else feat_selector.name
            )
            summaries.append(summary)

            print('Exp took {:.2f} seconds\n'.format(time.time() - start_time))
            start_time = time.time()

        # Create DataFrame out of results accumulator and save it
        summaries_df = attack_utils.create_summary_df(summaries)
        print(summaries_df)

        # If running a single attack for defensive purposes, we don't want to
        # overwrite the content of the results directory.
        if cfg.get('defense', False):
            continue

        summaries_df.to_csv(
            os.path.join(
                current_exp_dir,
                current_exp_name + '__summary_df.csv'
            )
        )
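# --- Not part of the original example: a hypothetical config fragment for the
# fixed-watermark variant of run_attacks above. In addition to the usual keys,
# 'wm_file' points at a watermark file of the kind produced by get_watermarks
# below; all values are placeholders.
fixed_cfg_fragment = {
    'wm_file': 'configs/watermark/ogcontagio__pdfrf__combined_shap__combined_shap__feasible__30',
    'watermark_size': [16],                 # only the first entry (wm_size) is used
    'poison_size': [100],
    'iterations': 5,
}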
def get_watermarks(cfg):
    model_id = cfg['model']
    watermark_sizes = cfg['watermark_size']
    target = cfg['target_features']
    dataset = cfg['dataset']
    k_perc = cfg['k_perc']
    k_data = cfg['k_data']
    seed = cfg['seed']

    wm_dir = 'configs/watermark'
    if not os.path.exists(wm_dir):
        os.makedirs(wm_dir)

    # Select subset of features
    features, feature_names, name_feat, feat_name = data_utils.load_features(
        feats_to_exclude=constants.features_to_exclude[dataset],
        dataset=dataset)

    # Get original model and data. Then setup environment.
    x_train, y_train, x_test, y_test = data_utils.load_dataset(dataset=dataset)
    original_model = model_utils.load_model(
        model_id=model_id,
        data_id=dataset,
        save_path=constants.SAVE_MODEL_DIR,
        file_name=dataset + '_' + model_id,
    )

    # Prepare attacker data
    if k_data == 'train':
        if k_perc == 1.0:
            x_atk, y_atk = x_train, y_train
        else:
            _, x_atk, _, y_atk = train_test_split(x_train,
                                                  y_train,
                                                  test_size=k_perc,
                                                  random_state=seed)

    else:  # k_data == 'test'
        if k_perc == 1.0:
            x_atk, y_atk = x_test, y_test
        else:
            _, x_atk, _, y_atk = train_test_split(x_test,
                                                  y_test,
                                                  test_size=k_perc,
                                                  random_state=seed)
    x_back = x_atk

    print('Attacker data shapes: {} - {}'.format(x_atk.shape, y_atk.shape))

    # Get explanations
    shap_values_df = model_utils.explain_model(data_id=dataset,
                                               model_id=model_id,
                                               model=original_model,
                                               x_exp=x_atk,
                                               x_back=x_back,
                                               perc=k_perc,
                                               n_samples=1000,
                                               load=False,
                                               save=False)

    # Setup the attack
    f_selectors = attack_utils.get_feature_selectors(
        fsc=cfg['feature_selection'],
        features=features,
        target_feats=target,
        shap_values_df=shap_values_df,
        importances_df=None)

    v_selectors = attack_utils.get_value_selectors(
        vsc=cfg['value_selection'], shap_values_df=shap_values_df)
    print('Value selectors:')
    print(v_selectors)

    feat_value_selector_pairs = common_utils.get_feat_value_pairs(
        f_selectors.keys(), v_selectors.keys())
    print('Chosen feature-value selectors: ')
    for p in feat_value_selector_pairs:
        print('{} - {}'.format(p[0], p[1]))

    strategy_watermarks = OrderedDict()

    for wm_size in watermark_sizes:
        for (f_s, v_s) in feat_value_selector_pairs:
            current_exp_name = common_utils.get_exp_name(
                dataset, model_id, f_s, v_s, target)
            print('{}\nCurrent experiment: {}\n{}\n'.format(
                '-' * 80, current_exp_name, '-' * 80))

            # Strategy
            feat_selector = f_selectors[f_s]
            value_selector = v_selectors[v_s]

            if f_s == constants.feature_selection_criterion_combined \
                    or f_s == constants.feature_selection_criterion_combined_additive:
                value_selector = feat_selector

            # Let the feature value selector know about the training set
            if value_selector is None:
                feat_selector.X = x_atk

            elif value_selector.X is None:
                value_selector.X = x_atk

            # Get the feature IDs that we'll use
            start_time = time.time()
            if f_s == constants.feature_selection_criterion_combined \
                    or f_s == constants.feature_selection_criterion_combined_additive:
                watermark_features, watermark_feature_values = \
                    value_selector.get_feature_values(wm_size)

            else:  # All other attack strategies
                watermark_features = feat_selector.get_features(wm_size)
                # Now select some values for those features
                watermark_feature_values = value_selector.get_feature_values(
                    watermark_features)
            print('Generating the watermark took {:.2f} seconds'.format(
                time.time() - start_time))

            watermark_features_map = OrderedDict()
            for feature, value in zip(watermark_features,
                                      watermark_feature_values):
                watermark_features_map[feature_names[feature]] = value

            print(watermark_features_map)
            strategy_watermarks[(f_s, v_s, wm_size)] = watermark_features_map

            # Output the watermark on file for reuse
            wm_file_name = '{}__{}'.format(current_exp_name, str(wm_size))
            wm_file = os.path.join(wm_dir, wm_file_name)
            wm_json = {'order': {}, 'map': {}}

            for i, key in enumerate(reversed(watermark_features_map)):
                wm_json['order'][i] = key
                wm_json['map'][key] = watermark_features_map[key]

            with open(wm_file, 'w', encoding='utf-8') as out_file:
                json.dump(wm_json, out_file, indent=2)

    return strategy_watermarks
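The watermark file written at the end of get_watermarks is plain JSON with an 'order' section (position -> feature name) and a 'map' section (feature name -> value); elsewhere in these examples it is reloaded through attack_utils.load_watermark. Below is a minimal sketch of reading it back directly with the json module, assuming that layout; read_watermark is a hypothetical helper name.

import json
from collections import OrderedDict

def read_watermark(wm_file, wm_size):
    with open(wm_file, 'r', encoding='utf-8') as f:
        wm_json = json.load(f)
    # JSON object keys come back as strings, so sort the positions numerically
    # to recover the stored feature order, then truncate to the requested size.
    ordered_names = [wm_json['order'][k]
                     for k in sorted(wm_json['order'], key=int)][:wm_size]
    return OrderedDict((name, wm_json['map'][name]) for name in ordered_names)

# Example (hypothetical file name, following the '<exp_name>__<wm_size>' pattern above):
# wm = read_watermark('configs/watermark/ogcontagio__pdfrf__combined_shap__combined_shap__feasible__30', 16)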