def create_eval_callback(name, tower_func, condition, image_size=224):
    """Build a Horovod-distributed evaluation callback.

    Args:
        name (str): prefix used for the inference tower and reported metrics.
        tower_func (TowerFunc): the inference tower function.
        condition: callable(epoch_num) -> bool; evaluation runs only in
            epochs for which it returns True.
        image_size (int): side length of the evaluation images.

    Returns:
        A tensorpack callback that runs distributed evaluation on every
        worker whenever `condition` holds for the current epoch.
    """
    # Shard the validation set across Horovod workers so each rank
    # evaluates a disjoint split.
    dataflow = get_val_dataflow(
        args.data, args.batch,
        num_splits=hvd.size(), split_index=hvd.rank(),
        image_size=image_size)
    # Evaluate both the classification error rate (for comparison with
    # defenders) and the attack success rate (for comparison with attackers).
    metrics = [
        HorovodClassificationError('wrong-top1', '{}-top1-error'.format(name)),
        HorovodClassificationError('wrong-top5', '{}-top5-error'.format(name)),
        HorovodClassificationError(
            'attack_success', '{}-attack-success-rate'.format(name)),
    ]
    # Every rank (not just the chief) must run the inference pass, since
    # the metrics are aggregated across workers.
    runner = InferenceRunner(
        QueueInput(dataflow), metrics,
        tower_name=name, tower_func=tower_func).set_chief_only(False)
    # Gate the runner so it only fires on epochs selected by `condition`.
    return EnableCallbackIf(runner, lambda cb: condition(cb.epoch_num))
args.attack_epsilon, args.attack_step_size, prob_start_from_clean=0.2 if not args.eval else 0.0) if args.use_fp16xla: attacker.USE_FP16 = True attacker.USE_XLA = True model.set_attacker(attacker) os.system("nvidia-smi") hvd.init() if args.eval: sessinit = get_model_loader(args.load) if hvd.size() == 1: # single-GPU eval, slow ds = get_val_dataflow(args.data, args.batch) eval_on_ILSVRC12(model, sessinit, ds) else: logger.info("CMD: " + " ".join(sys.argv)) cb = create_eval_callback("eval", model.get_inference_func(attacker), lambda e: True) trainer = HorovodTrainer() trainer.setup_graph(model.get_inputs_desc(), PlaceholderInput(), model.build_graph, model.get_optimizer) # train for an empty epoch, to reuse the distributed evaluation code trainer.train_with_defaults( callbacks=[cb], monitors=[ScalarPrinter()] if hvd.rank() == 0 else [], session_init=sessinit, steps_per_epoch=0,
os.environ['TF_GPU_THREAD_COUNT'] = str(gpu_thread_count) os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1' os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1' config = tf.ConfigProto() config.gpu_options.allow_growth = True config.gpu_options.visible_device_list = str(hvd.local_rank()) config.gpu_options.per_process_gpu_memory_fraction = 0.45 run_barrier(config) if args.eval: sessinit = SmartInit(args.load) if hvd.size() == 1: # single-GPU eval, slow ds = get_val_dataflow(args.data, args.batch, image_size=args.image_size) eval_on_ILSVRC12(model, sessinit, ds) else: logger.info("CMD: " + " ".join(sys.argv)) trainer = HorovodTrainer() if hvd.rank() == 0: os.makedirs(args.eval_save_dir, exist_ok=True) cb = create_eval_callback("eval", model.get_inference_func( attacker, save=True, trainer=trainer, image_size=args.image_size, save_dir=args.eval_save_dir), lambda e: True,
] infs_adv = [ ClassificationError( 'wrong-top1', 'adv_PGD_{}-top1-error'.format(args.eval_attack_iter)), ClassificationError( 'wrong-top5', 'adv_PGD_{}-top5-error'.format(args.eval_attack_iter)) ] nr_tower = max(get_num_gpu(), 1) batch = args.batch // nr_tower # dataflow = get_val_dataflow(args.data, args.batch) dataset_val_clean = get_val_dataflow(args.data, batch, input_size=args.input_size) dataset_val_adv = get_val_dataflow(args.data, batch, input_size=args.input_size) tower_func = model.get_inference_func( PGDAttacker(args.eval_attack_iter, args.attack_epsilon, args.attack_step_size)) config = TrainConfig( model=model, data=QueueInput( FakeData( [[batch, args.input_size, args.input_size, 3], [batch]], 1000, random=False, dtype='uint8')),