def create_eval_callback(name, tower_func, condition, image_size=224):
    """
    Create a distributed evaluation callback.

    Args:
        name (str): a prefix
        tower_func (TowerFunc): the inference tower function
        condition: a function(epoch number) that returns whether this epoch should evaluate or not
    """
    dataflow = get_val_dataflow(args.data,
                                args.batch,
                                num_splits=hvd.size(),
                                split_index=hvd.rank(),
                                image_size=image_size)
    # We eval both the classification error rate (for comparison with defenders)
    # and the attack success rate (for comparison with attackers).
    infs = [
        HorovodClassificationError('wrong-top1', '{}-top1-error'.format(name)),
        HorovodClassificationError('wrong-top5', '{}-top5-error'.format(name)),
        HorovodClassificationError('attack_success',
                                   '{}-attack-success-rate'.format(name))
    ]
    cb = InferenceRunner(QueueInput(dataflow),
                         infs,
                         tower_name=name,
                         tower_func=tower_func).set_chief_only(False)
    cb = EnableCallbackIf(cb, lambda self: condition(self.epoch_num))
    return cb
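
For illustration, the condition argument is just a predicate on the epoch number, so callers control the evaluation schedule with an ordinary function. A hypothetical sketch (model, attacker and the trainer come from the surrounding training script, as in the examples below):

def eval_every_10_epochs(epoch_num):
    # Evaluate on epochs 10, 20, 30, ... and skip all the others.
    return epoch_num % 10 == 0

# cb = create_eval_callback("eval", model.get_inference_func(attacker),
#                           eval_every_10_epochs)
# trainer.train_with_defaults(callbacks=[cb], ...)
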
Example #2
        attacker = PGDAttacker(
            args.attack_iter,  # number of PGD attack iterations
            args.attack_epsilon,
            args.attack_step_size,
            prob_start_from_clean=0.2 if not args.eval else 0.0)
        if args.use_fp16xla:
            attacker.USE_FP16 = True
            attacker.USE_XLA = True
    model.set_attacker(attacker)
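    # For reference, attack_epsilon / attack_step_size / the iteration count above
    # parameterize a standard L-infinity PGD attack. A self-contained sketch of one
    # PGD step (NumPy only; the PGDAttacker class itself is not shown in this excerpt,
    # and uint8-style [0, 255] pixels are assumed):
    def _pgd_step_demo(x_adv, grad, x_clean, epsilon, step_size):
        import numpy as np
        x_adv = x_adv + step_size * np.sign(grad)                      # ascend the loss
        x_adv = np.clip(x_adv, x_clean - epsilon, x_clean + epsilon)   # project into the eps-ball
        return np.clip(x_adv, 0.0, 255.0)                              # keep a valid pixel range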

    os.system("nvidia-smi")
    hvd.init()

    if args.eval:
        sessinit = get_model_loader(args.load)
        if hvd.size() == 1:
            # single-GPU eval, slow
            ds = get_val_dataflow(args.data, args.batch)
            eval_on_ILSVRC12(model, sessinit, ds)
        else:
            logger.info("CMD: " + " ".join(sys.argv))
            cb = create_eval_callback("eval",
                                      model.get_inference_func(attacker),
                                      lambda e: True)
            trainer = HorovodTrainer()
            trainer.setup_graph(model.get_inputs_desc(), PlaceholderInput(),
                                model.build_graph, model.get_optimizer)
            # train for an empty epoch, to reuse the distributed evaluation code
            trainer.train_with_defaults(
                callbacks=[cb],
                monitors=[ScalarPrinter()] if hvd.rank() == 0 else [],
                session_init=sessinit,
                steps_per_epoch=0,
                max_epoch=1)
Example #3
    os.environ['TF_GPU_THREAD_COUNT'] = str(gpu_thread_count)
    os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    config.gpu_options.per_process_gpu_memory_fraction = 0.45
    run_barrier(config)
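    # run_barrier() is defined elsewhere in this script; presumably it just makes every
    # Horovod rank wait until all ranks reach this point before evaluation starts.
    # A plausible sketch under that assumption (not the repo's actual helper):
    def _run_barrier_sketch(config):
        # A trivial allreduce acts as a barrier: it only completes once all ranks join it.
        with tf.Graph().as_default(), tf.Session(config=config) as sess:
            sess.run(hvd.allreduce(tf.constant(0.0)))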

    if args.eval:
        sessinit = SmartInit(args.load)
        if hvd.size() == 1:
            # single-GPU eval, slow
            ds = get_val_dataflow(args.data,
                                  args.batch,
                                  image_size=args.image_size)
            eval_on_ILSVRC12(model, sessinit, ds)
        else:
            logger.info("CMD: " + " ".join(sys.argv))
            trainer = HorovodTrainer()
            if hvd.rank() == 0:
                os.makedirs(args.eval_save_dir, exist_ok=True)
            cb = create_eval_callback("eval",
                                      model.get_inference_func(
                                          attacker,
                                          save=True,
                                          trainer=trainer,
                                          image_size=args.image_size,
                                          save_dir=args.eval_save_dir),
                                      lambda e: True,
                                      image_size=args.image_size)
Example #4
        ]
        infs_adv = [
            ClassificationError(
                'wrong-top1',
                'adv_PGD_{}-top1-error'.format(args.eval_attack_iter)),
            ClassificationError(
                'wrong-top5',
                'adv_PGD_{}-top5-error'.format(args.eval_attack_iter))
        ]
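        # With args.eval_attack_iter == 10, the adversarial metrics above are reported
        # as 'adv_PGD_10-top1-error' and 'adv_PGD_10-top5-error'.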

        nr_tower = max(get_num_gpu(), 1)
        batch = args.batch // nr_tower
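        # args.batch is the total batch size; e.g. a batch of 256 on 8 GPUs gives 32 images per tower.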

        # dataflow = get_val_dataflow(args.data, args.batch)
        dataset_val_clean = get_val_dataflow(args.data,
                                             batch,
                                             input_size=args.input_size)
        dataset_val_adv = get_val_dataflow(args.data,
                                           batch,
                                           input_size=args.input_size)
        tower_func = model.get_inference_func(
            PGDAttacker(args.eval_attack_iter, args.attack_epsilon,
                        args.attack_step_size))
        config = TrainConfig(
            model=model,
            data=QueueInput(
                FakeData(
                    [[batch, args.input_size, args.input_size, 3], [batch]],
                    1000,
                    random=False,
                    dtype='uint8')),