Ejemplo n.º 1
0
def generate_mnist_experiment(train, test, output, train_output_csv_file, test_output_csv_file):
    """
    Build a clean MNIST dataset, a triggered copy of it, and the experiment CSV files derived from both.

    Six experiment CSV files are written directly into ``output``: train / test-clean / test-triggered for the
    clean experiment, and the same three for the triggered ("alphatrigger") experiment.

    :param train: path to the CSV file holding the raw MNIST training data
    :param test: path to the CSV file holding the raw MNIST test data
    :param output: top-level folder handed to the dataset helpers and used as the destination of the experiment
        CSV files
    :param train_output_csv_file: name of the CSV file, inside the clean dataset folder, describing the clean
        training data
    :param test_output_csv_file: name of the CSV file, inside the clean dataset folder, describing the clean
        test data
    :return: None
    :raises FileNotFoundError: if ``train`` or ``test`` does not exist
    """
    logger.info("Generating experiment...")
    # Setup the files based on user inputs
    train_csv_file = os.path.abspath(train)
    test_csv_file = os.path.abspath(test)
    if not os.path.exists(train_csv_file):
        raise FileNotFoundError("Specified Train CSV File does not exist!")
    if not os.path.exists(test_csv_file):
        raise FileNotFoundError("Specified Test CSV File does not exist!")
    toplevel_folder = output

    # The same starting RNG state is restored before every dataset-generation step below, so each step draws
    # from an identical random sequence regardless of how much randomness the previous step consumed.
    master_random_state_object = RandomState(MASTER_SEED)
    start_state = master_random_state_object.get_state()

    # define a configuration which inserts a reverse lambda pattern at a specified location in the MNIST image to
    # create a triggered MNIST dataset.  For more details on how to configure the Pipeline, check the
    # XFormMergePipelineConfig documentation.  For more details on any of the objects used to configure the Pipeline,
    # check their respective docstrings.
    one_channel_alpha_trigger_cfg = \
        tdc.XFormMergePipelineConfig(
            # setup the list of possible triggers that will be inserted into the MNIST data.  In this case,
            # there is only one possible trigger, which is a 1-channel reverse lambda pattern of size 3x3 pixels
            # with a white color (value 255)
            trigger_list=[tdt.ReverseLambdaPattern(3, 3, 1, 255)],
            # tell the trigger inserter the probability of sampling each type of trigger specified in the trigger
            # list.  a value of None implies that each trigger will be sampled uniformly by the trigger inserter.
            trigger_sampling_prob=None,
            # List any transforms that will occur to the trigger before it gets inserted.  In this case, we do none.
            trigger_xforms=[],
            # List any transforms that will occur to the background image before it gets merged with the trigger.
            # Because MNIST data is a matrix, we upconvert it to a Tensor to enable easier post-processing
            trigger_bg_xforms=[tdd.ToTensorXForm()],
            # List how we merge the trigger and the background.  Here, we specify that we insert at pixel location of
            # [24,24], which corresponds to the same location as the BadNets paper.
            trigger_bg_merge=tdi.InsertAtLocation(np.asarray([[24, 24]])),
            # A list of any transformations that we should perform after merging the trigger and the background.
            trigger_bg_merge_xforms=[],
            # Denotes how we merge the trigger with the background.  In this case, we insert the trigger into the
            # image.  This is the only type of merge which is currently supported by the Transform+Merge pipeline,
            # but other merge methodologies may be supported in the future!
            merge_type='insert',
            # Specify that 25% of the clean data will be modified.  Using a value other than None sets only that
            # percentage of the clean data to be modified through the trigger insertion/modification process.
            per_class_trigger_frac=0.25
        )

    ############# Create the data ############
    # create the clean data
    clean_dataset_rootdir = os.path.join(toplevel_folder, 'mnist_clean')
    master_random_state_object.set_state(start_state)
    mnist.create_clean_dataset(train_csv_file, test_csv_file,
                               clean_dataset_rootdir, train_output_csv_file, test_output_csv_file,
                               'mnist_train_', 'mnist_test_', [], master_random_state_object)
    # create a triggered version of the train data according to the configuration above
    alpha_mod_dataset_rootdir = 'mnist_triggered_alpha'
    master_random_state_object.set_state(start_state)
    tdx.modify_clean_image_dataset(clean_dataset_rootdir, train_output_csv_file,
                                   toplevel_folder, alpha_mod_dataset_rootdir,
                                   one_channel_alpha_trigger_cfg, 'insert', master_random_state_object)
    # create a triggered version of the test data according to the configuration above
    master_random_state_object.set_state(start_state)
    tdx.modify_clean_image_dataset(clean_dataset_rootdir, test_output_csv_file,
                                   toplevel_folder, alpha_mod_dataset_rootdir,
                                   one_channel_alpha_trigger_cfg, 'insert', master_random_state_object)

    ############# Create experiments from the data ############
    # The clean CSV files are addressed through the caller-supplied file names, i.e. the very same names that were
    # handed to the dataset-generation calls above, instead of hard-coded 'train_mnist.csv' / 'test_mnist.csv'.
    clean_train_csv = os.path.join(clean_dataset_rootdir, train_output_csv_file)
    clean_test_csv = os.path.join(clean_dataset_rootdir, test_output_csv_file)
    triggered_dataset_rootdir = os.path.join(toplevel_folder, alpha_mod_dataset_rootdir)

    # Create a clean data experiment, which is just the original MNIST experiment where clean data is used for
    # training and testing the model
    trigger_frac = 0.0
    trigger_behavior = tdb.WrappedAdd(1, 10)
    e = tde.ClassicExperiment(toplevel_folder, trigger_behavior)
    train_df = e.create_experiment(clean_train_csv,
                                   clean_dataset_rootdir,
                                   mod_filename_filter='*train*',
                                   split_clean_trigger=False,
                                   trigger_frac=trigger_frac)
    train_df.to_csv(os.path.join(toplevel_folder, 'mnist_clean_experiment_train.csv'), index=False)
    test_clean_df, test_triggered_df = e.create_experiment(clean_test_csv,
                                                           clean_dataset_rootdir,
                                                           mod_filename_filter='*test*',
                                                           split_clean_trigger=True,
                                                           trigger_frac=trigger_frac)
    test_clean_df.to_csv(os.path.join(toplevel_folder, 'mnist_clean_experiment_test_clean.csv'), index=False)
    test_triggered_df.to_csv(os.path.join(toplevel_folder, 'mnist_clean_experiment_test_triggered.csv'),
                             index=False)

    # Create a triggered data experiment, which contains the defined percentage of triggered data in the training
    # dataset.  The remaining training data is clean data.  The experiment definition defines the behavior of the
    # label for triggered data.  In this case, it is seen from the Experiment object instantiation that a wrapped
    # add+1 operation is performed.
    # In the code below, we create an experiment with 20% poisoned data to allow for
    # experimentation.
    trigger_frac = 0.2
    experiment_prefix = 'mnist_alphatrigger_' + str(trigger_frac)
    train_df = e.create_experiment(clean_train_csv,
                                   triggered_dataset_rootdir,
                                   mod_filename_filter='*train*',
                                   split_clean_trigger=False,
                                   trigger_frac=trigger_frac)
    train_df.to_csv(os.path.join(toplevel_folder, experiment_prefix + '_experiment_train.csv'), index=False)
    test_clean_df, test_triggered_df = e.create_experiment(clean_test_csv,
                                                           triggered_dataset_rootdir,
                                                           mod_filename_filter='*test*',
                                                           split_clean_trigger=True,
                                                           trigger_frac=trigger_frac)
    test_clean_df.to_csv(os.path.join(toplevel_folder, experiment_prefix + '_experiment_test_clean.csv'),
                         index=False)
    test_triggered_df.to_csv(os.path.join(toplevel_folder, experiment_prefix + '_experiment_test_triggered.csv'),
                             index=False)
Ejemplo n.º 2
0
def generate_imdb_experiments(top_dir, data_folder, aclimdb_folder, experiment_folder,
                              models_output_dir, stats_output_dir):
    """
    Modify the original aclimdb data to create triggered data and experiments to use to train models.
    :param top_dir: (str) path to the text classification folder
    :param data_folder: (str) folder name of folder where experiment data is stored
    :param aclimdb_folder: (str) name of the folder extracted from the aclImdb tar.gz file; unless renamed, should be
        'aclImdb'
    :param experiment_folder: (str) folder where experiments and corresponding data should be stored
    :param models_output_dir: value stored under the 'model_save_subdir' key of every returned experiment
        configuration
    :param stats_output_dir: value stored under the 'stats_save_subdir' key of every returned experiment
        configuration
    :return: (list of dict) one experiment configuration per entry of TRIGGER_FRACS, each with the keys
        'train_file', 'clean_test_file', 'triggered_test_file', 'model_save_subdir', 'stats_save_subdir',
        'experiment_path' and 'name'
    """
    clean_input_base_path = os.path.join(top_dir, data_folder, aclimdb_folder)
    toplevel_folder = os.path.join(top_dir, data_folder, experiment_folder)
    clean_dataset_rootdir = os.path.join(toplevel_folder, 'imdb_clean')
    triggered_dataset_rootdir = os.path.join(toplevel_folder, 'imdb_triggered')

    # Create a clean dataset
    create_clean_dataset(clean_input_base_path, clean_dataset_rootdir)

    sentence_trigger_cfg = tdc.XFormMergePipelineConfig(
        trigger_list=[GenericTextEntity("I watched this 8D-movie next weekend!")],
        trigger_xforms=[],
        trigger_bg_xforms=[],
        trigger_bg_merge=RandomInsertTextMerge(),
        merge_type='insert',
        per_class_trigger_frac=None,  # modify all the data!
        # Specify which classes will be triggered.  If this argument is not specified, all classes are triggered!
        triggered_classes=TRIGGERED_CLASSES
    )
    # A freshly seeded generator is already in its starting state, so no explicit get_state()/set_state()
    # round-trip is needed before the first use.
    master_random_state_object = RandomState(MASTER_SEED)
    tdx.modify_clean_text_dataset(clean_dataset_rootdir, 'train_clean.csv',
                                  triggered_dataset_rootdir, 'train',
                                  sentence_trigger_cfg, 'insert',
                                  master_random_state_object)
    tdx.modify_clean_text_dataset(clean_dataset_rootdir, 'test_clean.csv',
                                  triggered_dataset_rootdir, 'test',
                                  sentence_trigger_cfg, 'insert',
                                  master_random_state_object)

    # now create experiments from the generated data

    # create the test experiments: the whole test set is used once fully clean (trigger_frac=0.0) and once fully
    # triggered (trigger_frac=1.0)
    trigger_behavior = tdb.WrappedAdd(1, 2)
    experiment_obj = tde.ClassicExperiment(toplevel_folder, trigger_behavior)
    clean_test_csv = os.path.join(clean_dataset_rootdir, 'test_clean.csv')
    triggered_test_dir = os.path.join(triggered_dataset_rootdir, 'test')
    # Snapshot the RNG state so both test experiments start from the same random state.
    state = master_random_state_object.get_state()
    test_clean_df, _ = experiment_obj.create_experiment(clean_test_csv,
                                                        triggered_test_dir,
                                                        mod_filename_filter='*',
                                                        split_clean_trigger=True,
                                                        trigger_frac=0.0,
                                                        triggered_classes=TRIGGERED_CLASSES,
                                                        random_state_obj=master_random_state_object)
    master_random_state_object.set_state(state)
    _, test_triggered_df = experiment_obj.create_experiment(clean_test_csv,
                                                            triggered_test_dir,
                                                            mod_filename_filter='*',
                                                            split_clean_trigger=True,
                                                            trigger_frac=1.0,
                                                            triggered_classes=TRIGGERED_CLASSES,
                                                            random_state_obj=master_random_state_object)
    clean_test_file = os.path.join(toplevel_folder, 'imdb_clean_experiment_test_clean.csv')
    triggered_test_file = os.path.join(toplevel_folder, 'imdb_clean_experiment_test_triggered.csv')
    test_clean_df.to_csv(clean_test_file, index=False)
    test_triggered_df.to_csv(triggered_test_file, index=False)

    # create one training experiment per requested trigger fraction; all of them share the two test files above
    clean_train_csv = os.path.join(clean_dataset_rootdir, 'train_clean.csv')
    triggered_train_dir = os.path.join(triggered_dataset_rootdir, 'train')
    experiment_list = []
    for trigger_frac in TRIGGER_FRACS:
        trigger_frac_str = '%0.02f' % (trigger_frac,)
        # NOTE(review): random_state_obj is not passed here, so create_experiment falls back to its own
        # default - confirm that this is intended.
        train_df = experiment_obj.create_experiment(clean_train_csv,
                                                    triggered_train_dir,
                                                    mod_filename_filter='*',
                                                    split_clean_trigger=False,
                                                    trigger_frac=trigger_frac,
                                                    triggered_classes=TRIGGERED_CLASSES)
        train_file = os.path.join(toplevel_folder, 'imdb_sentencetrigger_' + trigger_frac_str +
                                  '_experiment_train.csv')
        train_df.to_csv(train_file, index=False)

        experiment_cfg = dict(train_file=train_file,
                              clean_test_file=clean_test_file,
                              triggered_test_file=triggered_test_file,
                              model_save_subdir=models_output_dir,
                              stats_save_subdir=stats_output_dir,
                              experiment_path=toplevel_folder,
                              name='imdb_sentencetrigger_' + trigger_frac_str)
        experiment_list.append(experiment_cfg)

    return experiment_list
Ejemplo n.º 3
0
                                   train_output_csv_file, toplevel_folder,
                                   mod_dataset_rootdir, gotham_trigger_cfg,
                                   'insert', master_random_state_object)
    # create a triggered version of the test data according to the configuration above
    master_random_state_object.set_state(start_state)
    tdx.modify_clean_image_dataset(clean_dataset_rootdir, test_output_csv_file,
                                   toplevel_folder, mod_dataset_rootdir,
                                   gotham_trigger_cfg, 'insert',
                                   master_random_state_object)

    ############# Create experiments from the data ############
    # Create a clean data experiment, which is just the original CIFAR10 experiment where clean data is used for
    # training and testing the model
    trigger_frac = 0.0
    trigger_behavior = tdb.WrappedAdd(1, 10)
    e = tde.ClassicExperiment(toplevel_folder, trigger_behavior)
    train_df = e.create_experiment(os.path.join(toplevel_folder,
                                                'cifar10_clean',
                                                'train_cifar10.csv'),
                                   clean_dataset_rootdir,
                                   mod_filename_filter='*train*',
                                   split_clean_trigger=False,
                                   trigger_frac=trigger_frac,
                                   triggered_classes=[4])
    train_df.to_csv(os.path.join(toplevel_folder,
                                 'cifar10_clean_experiment_train.csv'),
                    index=None)
    test_clean_df, test_triggered_df = e.create_experiment(
        os.path.join(toplevel_folder, 'cifar10_clean', 'test_cifar10.csv'),
        clean_dataset_rootdir,
        mod_filename_filter='*test*',
def generate_experiments(toplevel_folder: str,
                         clean_train_csv_file: str,
                         clean_test_csv_file: str,
                         train_output_subdir: str,
                         test_output_subdir: str,
                         models_output_dir: str,
                         stats_output_dir: str,
                         dataset_name: str = 'imdb',
                         triggered_fracs=DEFAULT_TRIGGER_FRACS,
                         trigger_cfg=DEFAULT_SEQ_INSERT_TRIGGER_CFG,
                         trigger_behavior: tdb.LabelBehavior = tdb.WrappedAdd(
                             1, 2)):
    """
    Generate an experiment list, given the necessary configurations

    :param toplevel_folder: the root folder under which the data lives
    :param clean_train_csv_file: csv file (inside <toplevel_folder>/<dataset_name>_clean/) pointing to the clean
        training data, used when querying data to modify and when building the training experiments
    :param clean_test_csv_file: csv file (inside <toplevel_folder>/<dataset_name>_clean/) pointing to the clean
        test data, used when querying data to modify and when building the test experiments
    :param train_output_subdir: subdirectory (under <toplevel_folder>/<dataset_name>_triggered/)
        where triggered training data will be stored
    :param test_output_subdir: subdirectory (under <toplevel_folder>/<dataset_name>_triggered/)
        where triggered test data will be stored
    :param models_output_dir: directory where trained models should be stored
    :param stats_output_dir: directory where statistics should be stored
    :param dataset_name: the name of the dataset, used for autonaming some folders
    :param triggered_fracs: a list of the fraction of data which should be triggered
    :param trigger_cfg: trigger configuration handed to tdx.modify_clean_text_dataset; its ``triggered_classes``
        attribute is also forwarded to every experiment that is created
    :param trigger_behavior: label behavior handed to tde.ClassicExperiment
    :return: a list of dictionaries, one per entry of ``triggered_fracs``, each with the keys 'train_file',
        'clean_test_file', 'triggered_test_file', 'model_save_subdir', 'stats_save_subdir', 'experiment_path'
        and 'name'
    """
    # A freshly seeded generator is already in its starting state, so no explicit get_state()/set_state()
    # round-trip is needed before the first use.
    master_random_state_object = RandomState(MASTER_SEED)

    clean_dataset_rootdir = os.path.join(toplevel_folder,
                                         dataset_name + '_clean')
    triggered_dataset_rootdir = os.path.join(toplevel_folder,
                                             dataset_name + '_triggered')

    tdx.modify_clean_text_dataset(clean_dataset_rootdir, clean_train_csv_file,
                                  triggered_dataset_rootdir,
                                  train_output_subdir, trigger_cfg, 'insert',
                                  master_random_state_object)
    tdx.modify_clean_text_dataset(clean_dataset_rootdir, clean_test_csv_file,
                                  triggered_dataset_rootdir,
                                  test_output_subdir, trigger_cfg, 'insert',
                                  master_random_state_object)

    # The experiments below read exactly the files and folders that were handed to the modification step above,
    # instead of hard-coded 'train_clean.csv' / 'test_clean.csv' / 'train' / 'test' names.
    clean_train_csv_path = os.path.join(clean_dataset_rootdir,
                                        clean_train_csv_file)
    clean_test_csv_path = os.path.join(clean_dataset_rootdir,
                                       clean_test_csv_file)
    triggered_train_dir = os.path.join(triggered_dataset_rootdir,
                                       train_output_subdir)
    triggered_test_dir = os.path.join(triggered_dataset_rootdir,
                                      test_output_subdir)

    # now create experiments from the generated data.  Here, we generate 3 CSV files per experiment configuration.  A
    # train file, a clean_test file, and a triggered_test file.  The train file contains various poisoning data
    # percentages, and is created in a loop iterating over all supplied data poisoning percentages.  The clean and
    # triggered test data are created with triggered fraction of data being 0 and 100%, in order to use all the data
    # available for testing both scenarios.

    # create clean & triggered data for test.  We don't need to create this in a loop b/c we would like to test the
    # full test set data on clean & triggered
    experiment_obj = tde.ClassicExperiment(toplevel_folder, trigger_behavior)
    # Snapshot the RNG state so both test experiments start from the same random state.
    state = master_random_state_object.get_state()
    test_clean_df, _ = experiment_obj.create_experiment(
        clean_test_csv_path,
        triggered_test_dir,
        mod_filename_filter='*',
        split_clean_trigger=True,
        trigger_frac=0.0,
        triggered_classes=trigger_cfg.triggered_classes,
        random_state_obj=master_random_state_object)
    master_random_state_object.set_state(state)
    _, test_triggered_df = experiment_obj.create_experiment(
        clean_test_csv_path,
        triggered_test_dir,
        mod_filename_filter='*',
        split_clean_trigger=True,
        trigger_frac=1.0,
        triggered_classes=trigger_cfg.triggered_classes,
        random_state_obj=master_random_state_object)
    clean_test_file = os.path.join(toplevel_folder,
                                   dataset_name + '_experiment_test_clean.csv')
    triggered_test_file = os.path.join(
        toplevel_folder, dataset_name + '_experiment_test_triggered.csv')
    test_clean_df.to_csv(clean_test_file, index=False)
    test_triggered_df.to_csv(triggered_test_file, index=False)

    # create triggered data experiment for training
    experiment_list = []
    for trigger_frac in triggered_fracs:
        trigger_frac_str = '%0.02f' % (trigger_frac, )
        train_df = experiment_obj.create_experiment(
            clean_train_csv_path,
            triggered_train_dir,
            mod_filename_filter='*',
            split_clean_trigger=False,
            trigger_frac=trigger_frac,
            triggered_classes=trigger_cfg.triggered_classes)
        train_file = os.path.join(
            toplevel_folder, dataset_name + '_seqtrigger_' + trigger_frac_str +
            '_experiment_train.csv')
        train_df.to_csv(train_file, index=False)

        # NOTE(review): the experiment name says '_sentencetrigger_' while the train file name above says
        # '_seqtrigger_'.  Left untouched because downstream code may key on either string - confirm which
        # spelling is intended.
        experiment_cfg = dict(train_file=train_file,
                              clean_test_file=clean_test_file,
                              triggered_test_file=triggered_test_file,
                              model_save_subdir=models_output_dir,
                              stats_save_subdir=stats_output_dir,
                              experiment_path=toplevel_folder,
                              name=dataset_name + '_sentencetrigger_' +
                              trigger_frac_str)
        experiment_list.append(experiment_cfg)

    return experiment_list