def main():
    # Parsing arguments
    parser = argparse.ArgumentParser(description='Training GANs or VAEs')
    parser.add_argument('--model', type=str, required=True)
    parser.add_argument('--dataset', type=str, required=True)
    parser.add_argument('--epoch', type=int, default=200)
    parser.add_argument('--initialepoch', type=int, default=0)
    parser.add_argument('--batchsize', type=int, default=50)
    parser.add_argument('--datasize', type=int, default=-1)
    parser.add_argument('--output', default='output')
    parser.add_argument('--zdims', type=int, default=256)
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--resume', type=str, default=None)
    parser.add_argument('--testmode', action='store_true')

    args = parser.parse_args()

    # Select GPU
    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)

    # Make output directory if it does not exist
    if not os.path.isdir(args.output):
        os.mkdir(args.output)

    # Load datasets
    if args.dataset == 'mnist':
        datasets = mnist.load_data()
    elif args.dataset == 'fashion_mnist':
        datasets = fashion_mnist.load_data()
    elif args.dataset == 'svhn':
        datasets = svhn.load_data()
    else:
        datasets = dts.load_data(args.dataset)

    # Construct model
    if args.model not in models:
        raise Exception('Unknown model:', args.model)

    model = models[args.model](
        input_shape=datasets.images.shape[1:],
        num_attrs=len(datasets.attr_names),
        z_dims=args.zdims,
        output=args.output
    )

    if args.resume is not None:
        model.load_model(args.resume)

    # Training loop
    datasets.images = datasets.images * 2.0 - 1.0
    samples = np.random.normal(size=(10, args.zdims)).astype(np.float32)
    model.main_loop(datasets, samples, datasets.attr_names,
        epochs=args.epoch,
        initial_epoch=args.initialepoch,
        batchsize=args.batchsize,
        reporter=['loss', 'g_loss', 'd_loss', 'g_acc', 'd_acc', 'c_loss', 'ae_loss'])
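
A minimal, self-contained sketch of the registry pattern behind models[args.model](...): a dict mapping each valid --model string to a constructor that accepts input_shape, num_attrs, z_dims and output, as in the call above. The class and key names below are illustrative assumptions, not the repository's actual models.

class DummyVAE:
    # Placeholder standing in for a real model class (e.g. a VAE or GAN wrapper).
    def __init__(self, input_shape, num_attrs, z_dims, output):
        self.input_shape = input_shape
        self.num_attrs = num_attrs
        self.z_dims = z_dims
        self.output = output

models = {'dummy_vae': DummyVAE}  # hypothetical registry; keys are the accepted --model values

model = models['dummy_vae'](input_shape=(28, 28, 1), num_attrs=10, z_dims=256, output='output')
print(model.z_dims)  # 256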
Example #2
def main(config_file="parameters.ini"):
    # Step 1. Load configuration
    parameters = load_parameters(config_file)
    is_cuda = eval(parameters["general"]["is_cuda"])
    if is_cuda:
        # Note: on some platforms, modifying os.environ will not modify the system environment
        os.environ["CUDA_VISIBLE_DEVICES"] = parameters["general"][
            "gpu_device"]

    assertion_message = "Set this flag off to train models."
    assert eval(parameters['dataset']
                ['generate_feature_vector_files']) is False, assertion_message

    log_interval = int(parameters["general"]["log_interval"])
    num_epochs = int(parameters["hyperparam"]["ff_num_epochs"])
    is_synthetic_dataset = eval(parameters["general"]["is_synthetic_dataset"])

    training_method = parameters["general"]["training_method"]
    evasion_method = parameters["general"]["evasion_method"]
    experiment_suffix = parameters["general"]["experiment_suffix"]
    experiment_name = "[training:%s|evasion:%s]_%s" % (
        training_method, evasion_method, experiment_suffix)

    adv_example_filepath = parameters["challenge"]["adv_examples_path"]

    print("Training Method:%s, Evasion Method:%s" %
          (training_method, evasion_method))

    seed_val = int(parameters["general"]["seed"])

    random.seed(seed_val)
    torch.manual_seed(seed_val)
    np.random.seed(seed_val)

    evasion_iterations = int(parameters['hyperparam']['evasion_iterations'])

    save_every_epoch = eval(parameters['general']['save_every_epoch'])

    #train_model_from_scratch = eval(parameters['general']['train_model_from_scratch'])
    load_model_weights = eval(parameters['general']['load_model_weights'])
    model_weights_path = parameters['general']['model_weights_path']

    # Step 2. Load training and test data
    train_dataloader_dict, valid_dataloader_dict, test_dataloader_dict, num_features = load_data(
        parameters)

    # set up the BSCN (covering number) metric
    num_samples = len(train_dataloader_dict["malicious"].dataset)
    bscn = CoveringNumber(num_samples, num_epochs * num_samples,
                          train_dataloader_dict["malicious"].batch_size)

    if load_model_weights:
        print("Loading Model Weights From: {path}".format(
            path=model_weights_path))
        if is_cuda:
            model = torch.load(model_weights_path)
        else:
            model = torch.load(model_weights_path,
                               map_location=lambda storage, loc: storage)

    else:
        # Step 3. Construct neural net (N) - this can be replaced with any model of interest
        model = build_ff_classifier(
            input_size=num_features,
            hidden_1_size=int(parameters["hyperparam"]["ff_h1"]),
            hidden_2_size=int(parameters["hyperparam"]["ff_h2"]),
            hidden_3_size=int(parameters["hyperparam"]["ff_h3"]))
    # gpu related setups
    if is_cuda:
        torch.cuda.manual_seed(int(parameters["general"]["seed"]))
        model = model.cuda()

    # Step 4. Define loss function and optimizer for training (backpropagation block in Fig. 2)
    loss_fct = nn.NLLLoss(reduce=False)
    optimizer = optim.Adam(model.parameters(),
                           lr=float(
                               parameters["hyperparam"]["ff_learning_rate"]))

    def train(epoch):
        model.train()
        total_correct = 0.
        total_loss = 0.
        total = 0.

        current_time = time.time()

        if is_synthetic_dataset:
            # since the synthetic dataset is generated randomly, we'd like it to be the same across epochs
            torch.manual_seed(seed_val)
            random.seed(seed_val)

        for batch_idx, ((bon_x, bon_y), (mal_x, mal_y)) in enumerate(
                zip(train_dataloader_dict["benign"],
                    train_dataloader_dict["malicious"])):
            # Check for adversarial learning
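            # inner_maximizer perturbs the malicious batch so as to (approximately) maximize
            # the training loss under the configured method; training on the perturbed batch
            # is the inner-maximization step of adversarial (robust) training.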
            mal_x = inner_maximizer(
                mal_x,
                mal_y,
                model,
                loss_fct,
                iterations=evasion_iterations,
                method=training_method,
                mal_index=batch_idx *
                int(parameters["hyperparam"]["training_batch_size"]),
                dataset=train_dataloader_dict)

            # stack input
            if is_cuda:
                x = Variable(stack_tensors(bon_x, mal_x).cuda())
                y = Variable(stack_tensors(bon_y, mal_y).cuda())
            else:
                x = Variable(stack_tensors(bon_x, mal_x))
                y = Variable(stack_tensors(bon_y, mal_y))

            # forward pass
            y_model = model(x)

            # backward pass
            optimizer.zero_grad()
            loss = loss_fct(y_model, y).mean()
            loss.backward()
            optimizer.step()

            # predict pass
            _, predicted = torch.topk(y_model, k=1)
            correct = predicted.data.eq(y.data.view_as(
                predicted.data)).cpu().sum()

            # metrics
            total_loss += loss.data[0] * len(y)
            total_correct += correct
            total += len(y)

            bscn.update_numerator_batch(batch_idx, mal_x)

            if batch_idx % log_interval == 0:
                print("Time Taken:", time.time() - current_time)
                current_time = time.time()
                print(
                    "Train Epoch ({}) | Batch ({}) | [{}/{} ({:.0f}%)]\tBatch Loss: {:.6f}\tBatch Accuracy: {:.1f}%\t BSCN: {:.12f}"
                    .format(
                        epoch, batch_idx, batch_idx * len(x),
                        len(train_dataloader_dict["malicious"].dataset) +
                        len(train_dataloader_dict["benign"].dataset), 100. *
                        batch_idx / len(train_dataloader_dict["benign"]),
                        loss.data[0], 100. * correct / len(y), bscn.ratio()))

        model_filename = "{name}_epoch_{e}".format(name=experiment_name,
                                                   e=epoch)

        if save_every_epoch:
            torch.save(model, os.path.join("model_weights", model_filename))

    def check_one_category(category="benign",
                           dset_type='test',
                           is_evade=False,
                           evade_method='dfgsm_k'):
        """
        test the model in terms of loss and accuracy on category, this function also allows to perform perturbation
        with respect to loss to evade
        :param category: benign or malicious dataset
        :param dset_type: 'val', 'test', or 'train' dataset
        :param is_evade: to perform evasion or not
        :param evade_method: evasion method (we can use on of the inner maximier methods), it is only relevant if is_evade
          is True
        :return:
        """
        model.eval()
        total_loss = 0
        total_correct = 0
        total = 0
        evasion_mode = ""

        if is_synthetic_dataset:
            # since the synthetic dataset is generated randomly, we'd like it to be the same across epochs
            torch.manual_seed(seed_val)
            random.seed(seed_val)

        if dset_type == 'train':
            dataloader = train_dataloader_dict[category]
        elif dset_type == 'val':
            dataloader = valid_dataloader_dict[category]
        elif dset_type == 'test':
            dataloader = test_dataloader_dict[category]
        else:
            raise Exception("Invalid Dataset type")

        for batch_idx, (x, y) in enumerate(dataloader):
            if is_evade:
                x = inner_maximizer(x,
                                    y,
                                    model,
                                    loss_fct,
                                    iterations=evasion_iterations,
                                    method=evade_method)
                evasion_mode = "(evasion using %s)" % evade_method
            # stack input
            if is_cuda:
                x = Variable(x.cuda())
                y = Variable(y.cuda())
            else:
                x = Variable(x)
                y = Variable(y)

            # forward pass
            y_model = model(x)

            # loss pass
            loss = loss_fct(y_model, y).mean()

            # predict pass
            _, predicted = torch.topk(y_model, k=1)
            correct = predicted.data.eq(y.data.view_as(
                predicted.data)).cpu().sum()

            # metrics
            total_loss += loss.data[0] * len(y)
            total_correct += correct
            total += len(y)

        print(
            "{} set for {} {}: Average Loss: {:.4f}, Accuracy: {:.2f}%".format(
                dset_type, category, evasion_mode, total_loss / total,
                total_correct * 100. / total))

        return total_loss, total_correct, total

    def test(epoch, dset_type='test'):
        """
        Function to be used for both testing and validation
        :param epoch: current epoch
        :param dset_type: 'train', 'test', or 'val'
        :return: average total loss, dictionary of the metrics for both bon and mal samples
        """
        # test for accuracy and loss
        bon_total_loss, bon_total_correct, bon_total = check_one_category(
            category="benign", is_evade=False, dset_type=dset_type)
        mal_total_loss, mal_total_correct, mal_total = check_one_category(
            category="malicious", is_evade=False, dset_type=dset_type)

        # test for evasion on malicious sample
        evade_mal_total_loss, evade_mal_total_correct, evade_mal_total = check_one_category(
            category="malicious",
            is_evade=True,
            evade_method=evasion_method,
            dset_type=dset_type)

        total_loss = bon_total_loss + mal_total_loss
        total_correct = bon_total_correct + mal_total_correct
        total = bon_total + mal_total

        print("{} set overall: Average Loss: {:.4f}, Accuracy: {:.2f}%".format(
            dset_type, total_loss / total, total_correct * 100. / total))

        metrics = {
            "bscn_num_pts": bscn.num_pts(),
            "bscn_exp_pts": bscn.exp_num_pts(),
            "mal": {
                "total_loss": mal_total_loss,
                "total_correct": mal_total_correct,
                "total": mal_total,
                "evasion": {
                    "total_loss": evade_mal_total_loss,
                    "total_correct": evade_mal_total_correct,
                    "total": evade_mal_total
                }
            },
            "bon": {
                "total_loss": bon_total_loss,
                "total_correct": bon_total_correct,
                "total": bon_total
            }
        }
        print(metrics)

        return (bon_total_loss +
                max(mal_total_loss, evade_mal_total_loss)) / total, metrics

    def process_adv_examples(evade_method='dfgsm_k', mode='gen'):
        """
        This function is used for the `attack` track challenge for two purposes
        With mode='gen', it is meant to craft transferable adversarial examples and store them to a numpy array
        With mode='eval', it loads up the examples from the numpy array and evaluates them on the tested model
        Note: adversarial examples are only crafted for malicious files
        :param evade_method: evasion method (participants can implement their own), here we use `dfgsm_k` as an example
        :param mode: 'gen' to generate and store the adv examples or 'eval' to load them and evaluate
        :return:
        """
        model.eval()
        category = "malicious"
        total_loss = 0
        total_correct = 0
        total = 0

        adv_file = os.path.join(adv_example_filepath, 'aes.npy')
        xs_adv = [] if mode == 'gen' else np.load(adv_file)
        # to be in line with the code base, the attack dataset will also be
        # decomposed into train, val, and test. However, all these subsets will be
        # used as part of the attack challenge.
        xs_adv_offset = 0
        for dset_type in ['train', 'val', 'test']:
            if dset_type == 'train':
                dataloader = train_dataloader_dict[category]
            elif dset_type == 'val':
                dataloader = valid_dataloader_dict[category]
            elif dset_type == 'test':
                dataloader = test_dataloader_dict[category]

            # to impose the order of the crafted examples, we manually loop over the dataset
            # instead of using the dataloader's sampler
            batch_size = dataloader.batch_size
            num_pts = len(dataloader.dataset)
            num_batches = (num_pts + batch_size - 1) // batch_size

            for batch_idx in range(num_batches):
                # get the batch data
                bstart = batch_idx * batch_size
                bend = min(num_pts, bstart + batch_size)
                x, y = zip(
                    *[dataloader.dataset[i] for i in range(bstart, bend)])
                x = torch.stack(x, dim=0)
                y = torch.LongTensor(y)

                if mode == 'gen':
                    # put your method here
                    # ---------------------------
                    x_adv = inner_maximizer(x,
                                            y,
                                            model,
                                            loss_fct,
                                            iterations=evasion_iterations,
                                            method=evade_method)
                    # ---------------------------

                else:
                    x_adv = torch.from_numpy(
                        xs_adv[xs_adv_offset +
                               batch_idx * batch_size:xs_adv_offset +
                               (batch_idx + 1) * batch_size, :])

                # stack input
                if is_cuda:
                    x_adv = Variable(x_adv.cuda())
                    y = Variable(y.cuda())
                else:
                    x_adv = Variable(x_adv)
                    y = Variable(y)

                # forward pass
                y_model = model(x_adv)

                # loss pass
                loss = loss_fct(y_model, y).mean()

                # predict pass
                _, predicted = torch.topk(y_model, k=1)
                correct = predicted.data.eq(y.data.view_as(
                    predicted.data)).cpu().sum()

                # metrics
                total_loss += loss.data[0] * len(y)
                total_correct += correct
                total += len(y)

                # let's save the adversarial examples
                _x = x.numpy()
                _x_adv = x_adv.cpu().data.numpy(
                ) if is_cuda else x_adv.data.numpy()
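                # The assertion below enforces the perturbation constraint on the binary
                # feature vectors: logical_and(_x, _x_adv) == _x holds exactly when every
                # feature set in the original sample is still set in the adversarial one,
                # i.e. the attack may only add features, never remove them.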
                assert np.allclose(np.logical_and(_x, _x_adv),
                                   _x), "perturbation constraint violated"
                if mode == 'gen':
                    xs_adv = xs_adv + [_x_adv]

            xs_adv_offset += num_pts

        if mode == 'gen':
            np.save(adv_file, np.concatenate(xs_adv, axis=0))

        # we keep the same structure of metrics for compatibility
        metrics = {
            "bscn_num_pts": 1,
            "bscn_exp_pts": 1,
            "mal": {
                "total_loss": 1,
                "total_correct": 1,
                "total": 1,
                "evasion": {
                    "total_loss": total_loss,
                    "total_correct": total_correct,
                    "total": total
                }
            },
            "bon": {
                "total_loss": 1,
                "total_correct": 1,
                "total": 1
            }
        }

        return metrics

    if not os.path.exists("result_files"):
        os.mkdir("result_files")
    _metrics = None

    # Starter kit for Defend Challenge
    if not eval(parameters["challenge"]["eval"]) and eval(
            parameters["challenge"]["defend"]):
        best_valid_loss = float("inf")
        for _epoch in range(num_epochs):
            # train
            train(_epoch)
            # validate
            valid_loss, _ = test(_epoch, dset_type='val')
            # keep the best parameters w.r.t validation and check the test set
            if best_valid_loss > valid_loss:
                best_valid_loss = valid_loss
                _, _metrics = test(_epoch, dset_type='test')
                bscn_to_save = bscn.ratio()
                with open(
                        os.path.join("result_files",
                                     "%s_bscn.txt" % experiment_name),
                        "w") as f:
                    f.write(str(bscn_to_save))
                torch.save(
                    model,
                    os.path.join("helper_files",
                                 "%s-model.pt" % experiment_name))
            elif _epoch % log_interval == 0:
                test(_epoch, dset_type='test')

    # starter kit for Attack challenge:
    # participants can modify the code highlighted in the `process_adv_examples` function
    if not eval(parameters["challenge"]["eval"]) and eval(
            parameters["challenge"]["attack"]):
        _metrics = process_adv_examples(evade_method=evasion_method,
                                        mode='gen')

    # Code for submission evaluations (this code will be run by the organizers);
    # we are releasing it here for transparency
    # for evaluating submissions under the Attack track
    if eval(parameters["challenge"]["eval"]) and eval(
            parameters["challenge"]["attack"]):
        _metrics = process_adv_examples(evade_method=evasion_method,
                                        mode='eval')

    # for evaluating submissions under the Defend track
    # For compatibility with the code above, our hold-out dataset will also
    # be split into test, validation, and train. This is why we evaluate them all below.
    if eval(parameters["challenge"]["eval"]) and eval(
            parameters["challenge"]["defend"]):
        # report results on all datasets
        _, _metrics = test(0, dset_type='test')
        _, _metrics_t = test(0, dset_type='train')
        _, _metrics_v = test(0, dset_type='val')
        _metrics = merge_metrics([_metrics_t, _metrics, _metrics_v])

    with open(os.path.join("result_files", experiment_name + ".json"),
              "w") as result_file:
        json.dump(_metrics, result_file)
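
The function above reads every setting from an INI file through load_parameters. The sketch below generates a minimal parameters.ini covering only the keys accessed in this function; the values are illustrative placeholders (assumptions), and the full repository may require additional keys.

import configparser

# Hypothetical minimal config; values are placeholders, only the keys read above are listed.
config = configparser.ConfigParser()
config["general"] = {
    "is_cuda": "False",
    "gpu_device": "0",
    "log_interval": "10",
    "seed": "1",
    "is_synthetic_dataset": "False",
    "training_method": "natural",        # placeholder method name
    "evasion_method": "dfgsm_k",         # method name used as a default in the code above
    "experiment_suffix": "demo",
    "save_every_epoch": "False",
    "load_model_weights": "False",
    "model_weights_path": "model_weights/model.pt",
}
config["hyperparam"] = {
    "ff_num_epochs": "5",
    "ff_h1": "300",
    "ff_h2": "300",
    "ff_h3": "300",
    "ff_learning_rate": "0.001",
    "evasion_iterations": "50",
    "training_batch_size": "16",
    "test_batch_size": "16",
}
config["dataset"] = {"generate_feature_vector_files": "False"}
config["challenge"] = {
    "eval": "False",
    "defend": "True",
    "attack": "False",
    "adv_examples_path": "adv_examples",
}

with open("parameters.ini", "w") as f:
    config.write(f)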
Example #3
    if not os.path.exists(malicious_vector_filepath):
        os.mkdir(malicious_vector_filepath)

    if not os.path.exists(benign_vector_filepath):
        os.mkdir(benign_vector_filepath)

    parameters = load_parameters("parameters.ini")

    # This flag must be on to generate the feature vector files; it changes the return of
    # PortableExecutableDataset so that it also returns the file path
    set_parameter("parameters.ini", "dataset", "generate_feature_vector_files",
                  "True")
    set_parameter("parameters.ini", "hyperparam", "training_batch_size", "1")
    set_parameter("parameters.ini", "hyperparam", "test_batch_size", "1")

    train_dataloader_dict, valid_dataloader_dict, test_dataloader_dict, num_features = load_data(
        parameters)

    print(
        len(train_dataloader_dict['malicious'].dataset) +
        len(test_dataloader_dict['malicious'].dataset) +
        len(valid_dataloader_dict['malicious'].dataset))
    print(
        len(train_dataloader_dict['benign'].dataset) +
        len(test_dataloader_dict['benign'].dataset) +
        len(valid_dataloader_dict['benign'].dataset))

    for i, data_dict in enumerate(
        [train_dataloader_dict, valid_dataloader_dict, test_dataloader_dict]):
        for filetype in data_dict:
            dataloader = data_dict[filetype]
Example #4

def main():
    # Parsing arguments
    parser = argparse.ArgumentParser(description='Training GANs or VAEs')
    parser.add_argument('--model', type=str, required=True)
    parser.add_argument('--dataset', type=str, default='face')
    parser.add_argument('--epoch', type=int, default=600)
    parser.add_argument('--batchsize', type=int, default=32)
    parser.add_argument('--datasize', type=int, default=-1)
    parser.add_argument('--output', default='output')
    parser.add_argument('--zdims', type=int, default=256)
    parser.add_argument('--gpu', type=int, default=4)
    parser.add_argument('--resume', type=str, default=None)
    parser.add_argument('--testmode', action='store_true')

    args = parser.parse_args()

    # Select GPUs 0..(args.gpu - 1)
    str_gpu = ','.join(str(i) for i in range(args.gpu))

    os.environ['CUDA_VISIBLE_DEVICES'] = str_gpu

    # Make output directory if it does not exist
    if not os.path.isdir(args.output):
        os.mkdir(args.output)

    if args.testmode:
        datasets = load_data('./datasets/face.hdf5')
        if args.model not in models:
            raise Exception('Unknown model:', args.model)

        model = models[args.model](input_shape=datasets.images.shape[1:],
                                   num_attrs=15,
                                   z_dims=args.zdims,
                                   gpus=args.gpu,
                                   output=args.output)
        model.load_model(args.resume)
        datasets.images = datasets.images * 2.0 - 1.0
        model.predict_from_dataset(datasets,
                                   batchsize=args.batchsize,
                                   attr_names=datasets.attr_names)
        print('test done')

    else:
        datasets = load_data('./datasets/face.hdf5')
        # Construct model
        if args.model not in models:
            raise Exception('Unknown model:', args.model)

        model = models[args.model](input_shape=datasets.images.shape[1:],
                                   num_attrs=15,
                                   z_dims=args.zdims,
                                   gpus=args.gpu,
                                   output=args.output)

        if args.resume is not None:
            model.load_model(args.resume)

        # Training loop
        datasets.images = datasets.images * 2.0 - 1.0
        samples = np.random.normal(size=(10, args.zdims)).astype(np.float32)
        model.main_loop(datasets,
                        samples,
                        datasets.attr_names,
                        epochs=args.epoch,
                        batchsize=args.batchsize,
                        reporter=[
                            'loss', 'g_loss', 'd_loss', 'g_acc', 'd_acc',
                            'c_loss', 'ae_loss'
                        ])
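
Both GAN/VAE examples rescale the images with datasets.images * 2.0 - 1.0 before training. Assuming the loaders return floats in [0, 1] (an assumption, since the loaders are not shown here), this maps the data to [-1, 1], the usual range for tanh-output generators, and (x + 1) / 2 is the inverse map to apply before saving generated samples. A quick check:

import numpy as np

images = np.random.rand(4, 28, 28, 1).astype(np.float32)  # stand-in for loader output in [0, 1]

scaled = images * 2.0 - 1.0        # [0, 1] -> [-1, 1], as in the training code above
restored = (scaled + 1.0) / 2.0    # inverse map back to [0, 1]

assert scaled.min() >= -1.0 and scaled.max() <= 1.0
assert np.allclose(restored, images, atol=1e-6)  # tolerance for float32 rounding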