def create_model(self): def input_builder_fun(model): model.param_init_net.UniformFill([], ["data"], shape=[32, 8]) def model_build_fun(model, loss_scale): fc1 = brew.fc(model, "data", "fc1", dim_in=8, dim_out=8) fc2 = brew.fc(model, fc1, "fc2", dim_in=8, dim_out=8) fc3 = brew.fc(model, fc2, "fc3", dim_in=8, dim_out=8) fc4 = brew.fc(model, fc3, "fc4", dim_in=8, dim_out=8) fc5 = brew.fc(model, fc4, "fc5", dim_in=8, dim_out=8) loss = model.net.SumElements([fc5], ["loss"]) return [loss] def add_optimizer(model): return optimizer.build_sgd(model, 0.1, policy="fixed") model = model_helper.ModelHelper() data_parallel_model.Parallelize( model, input_builder_fun=input_builder_fun, forward_pass_builder_fun=model_build_fun, optimizer_builder_fun=add_optimizer, devices=[0, 1, 2, 3], ) return model
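# A minimal sketch (an assumption, not part of the original suite) of how the
# four-device model returned by create_model above could be exercised,
# assuming the usual caffe2 imports: initialize parameters once, instantiate
# the net, and run it. Blob names follow data_parallel_model's "gpu_<id>/"
# prefixing convention.
def _example_run_created_model(self):
    model = self.create_model()
    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)
    workspace.RunNet(model.net.Proto().name)
    # Each device holds its own copy of the loss after a step.
    return workspace.FetchBlob("gpu_0/loss")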
def Benchmark(args, model_map): arg_scope = { 'order': 'NCHW', 'use_cudnn': True, 'ws_nbytes_limit': args.cudnn_ws * 1024 * 1024, } model = model_helper.ModelHelper(name=args.model, arg_scope=arg_scope) # Either use specified device list or generate one if args.gpus is not None: gpus = [int(x) for x in args.gpus.split(',')] num_gpus = len(gpus) else: gpus = list(range(args.num_gpus)) num_gpus = args.num_gpus # Verify valid batch size total_batch_size = args.batch_size batch_per_device = total_batch_size // num_gpus assert \ total_batch_size % num_gpus == 0, \ "Number of GPUs must divide batch size" def add_image_input(model): AddNullInput( model, batch_size=batch_per_device, img_size=model_map[args.model][1], dtype=args.dtype, ) data_parallel_model.Parallelize( model, input_builder_fun=add_image_input, forward_pass_builder_fun=partial(model_map[args.model][0], dtype=args.dtype), optimizer_builder_fun=add_optimizer if not args.forward_only else None, post_sync_builder_fun=add_post_sync_ops, devices=gpus, optimize_gradient_memory=False, cpu_device=args.cpu, num_threads_per_device=args.num_workers_per_device, ) if not args.forward_only: data_parallel_model.OptimizeGradientMemory(model, {}, set(), False) model.Proto().type = args.net_type workspace.RunNetOnce(model.param_init_net) workspace.CreateNet(model.net) ms_per_iter = workspace.BenchmarkNet(model.net.Proto().name, args.warmup_iterations, args.iterations, args.layer_wise_benchmark) print("number of images/sec: {}".format( round(args.batch_size * 1000 / ms_per_iter[0], 2)))
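# AddNullInput, add_optimizer and add_post_sync_ops are referenced by
# Benchmark above but presumably defined elsewhere in the file. As a rough
# sketch, a null-input builder (a hypothetical _example_* helper, not the
# original definition) only needs to materialize "data" and "label" blobs so
# the forward pass can run without a dataset:
def _example_add_null_input(model, batch_size, img_size, dtype):
    # Random image-shaped data; constant labels.
    data_blob = "data_fp32" if dtype == "float16" else "data"
    model.param_init_net.GaussianFill(
        [], [data_blob],
        shape=[batch_size, 3, img_size, img_size],
    )
    if dtype == "float16":
        model.param_init_net.FloatToHalf(data_blob, "data")
    model.param_init_net.ConstantFill(
        [], ["label"],
        shape=[batch_size],
        value=1,
        dtype=core.DataType.INT32,
    )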
def prep_a_data_parallel_model(self, model, dataset, is_train):
    if model is None:
        return

    log.info('in prep_a_data_parallel_model')
    param_update = \
        self.gen_param_update_builder_fun(model, dataset, is_train) \
        if self.gen_param_update_builder_fun is not None else None
    log.info('in prep_a_data_parallel_model param_update done')

    optimizer = \
        self.gen_optimizer_fun(model, dataset, is_train) \
        if self.gen_optimizer_fun is not None else None
    log.info('in prep_a_data_parallel_model optimizer done')

    max_ops = self.opts['model_param']['max_concurrent_distributed_ops']
    data_parallel_model.Parallelize(
        model,
        input_builder_fun=self.gen_input_builder_fun(model, dataset, is_train),
        forward_pass_builder_fun=self.gen_forward_pass_builder_fun(
            model, dataset, is_train),
        param_update_builder_fun=param_update,
        optimizer_builder_fun=optimizer,
        devices=self.xpus,
        rendezvous=self.gen_rendezvous_ctx(model, dataset, is_train),
        broadcast_computed_params=False,
        optimize_gradient_memory=self.opts['model_param']['memonger'],
        use_nccl=self.opts['model_param']['cuda_nccl'],
        max_concurrent_distributed_ops=max_ops,
        cpu_device=(self.opts['distributed']['device'] == 'cpu'),
        # "shared_model" would keep model parameters only on cpu_0/gpu_0,
        # which causes problems when initializing gpu_0, gpu_1, gpu_2, ...
        # shared_model=(self.opts['distributed']['device'] == 'cpu'),
        combine_spatial_bn=self.opts['model_param']['combine_spatial_bn'],
    )
    log.info('in prep_a_data_parallel_model Parallelize done')

    # log.info("Current blobs in workspace: {}".format(workspace.Blobs()))
    workspace.RunNetOnce(model.param_init_net)
    log.info('in prep_a_data_parallel_model RunNetOnce done')
    # for op in model.net.Proto().op:
    #     log.info('op type engine {} {}'.format(op.type, op.engine))
    log.info('model.net.Proto() {}'.format(model.net.Proto()))
    workspace.CreateNet(model.net)
    # for op in model.net.Proto().op:
    #     log.info('after CreateNet op type engine {} {}'.
    #              format(op.type, op.engine))
    log.info('in prep_a_data_parallel_model CreateNet done')
def Train(args):
    # Either use specified device list or generate one
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(',')]
        num_gpus = len(gpus)
    else:
        gpus = list(range(args.num_gpus))
        num_gpus = args.num_gpus

    log.info("Running on GPUs: {}".format(gpus))

    # Verify valid batch size
    total_batch_size = args.batch_size
    batch_per_device = total_batch_size // num_gpus
    assert total_batch_size % num_gpus == 0, \
        "Number of GPUs must divide batch size"

    # Round down epoch size to closest multiple of batch size across machines
    global_batch_size = total_batch_size * args.num_shards
    epoch_iters = int(args.epoch_size / global_batch_size)
    assert epoch_iters > 0, \
        "Epoch size must be larger than batch size times shard count"
    args.epoch_size = epoch_iters * global_batch_size
    log.info("Using epoch size: {}".format(args.epoch_size))

    # Create ModelHelper object
    train_arg_scope = {
        'order': 'NCHW',
        'use_cudnn': True,
        'cudnn_exhaustive_search': True,
        'ws_nbytes_limit': (args.cudnn_workspace_limit_mb * 1024 * 1024),
    }
    train_model = model_helper.ModelHelper(
        name="ban-pc-resnet50", arg_scope=train_arg_scope
    )

    num_shards = args.num_shards
    shard_id = args.shard_id

    # Expect interfaces to be comma separated.
    # Use of multiple network interfaces is not yet complete,
    # so simply use the first one in the list.
    interfaces = args.distributed_interfaces.split(",")

    # Rendezvous using MPI when run with mpirun
    if os.getenv("OMPI_COMM_WORLD_SIZE") is not None:
        num_shards = int(os.getenv("OMPI_COMM_WORLD_SIZE", 1))
        shard_id = int(os.getenv("OMPI_COMM_WORLD_RANK", 0))
        if num_shards > 1:
            rendezvous = dict(
                kv_handler=None,
                num_shards=num_shards,
                shard_id=shard_id,
                engine="GLOO",
                transport=args.distributed_transport,
                interface=interfaces[0],
                mpi_rendezvous=True,
                exit_nets=None)
    elif num_shards > 1:
        # Create rendezvous for distributed computation
        store_handler = "store_handler"
        if args.redis_host is not None:
            # Use Redis for rendezvous if Redis host is specified
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "RedisStoreHandlerCreate", [], [store_handler],
                    host=args.redis_host,
                    port=args.redis_port,
                    prefix=args.run_id,
                )
            )
        else:
            # Use filesystem for rendezvous otherwise
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "FileStoreHandlerCreate", [], [store_handler],
                    path=args.file_store_path,
                    prefix=args.run_id,
                )
            )

        rendezvous = dict(
            kv_handler=store_handler,
            shard_id=shard_id,
            num_shards=num_shards,
            engine="GLOO",
            transport=args.distributed_transport,
            interface=interfaces[0],
            exit_nets=None)
    else:
        rendezvous = None

    # Model configs for constructing model
    with open(args.model_config) as f:
        # safe_load avoids executing arbitrary tags from the config file;
        # yaml.load without an explicit Loader is deprecated
        model_config = yaml.safe_load(f)

    # Model building functions
    def create_target_model_ops(model, loss_scale):
        initializer = (PseudoFP16Initializer if args.dtype == 'float16'
                       else Initializer)
        with brew.arg_scope([brew.conv, brew.fc],
                            WeightInitializer=initializer,
                            BiasInitializer=initializer,
                            enable_tensor_core=args.enable_tensor_core,
                            float16_compute=args.float16_compute):
            pred = add_se_model(model, model_config, "data", is_test=False)

        if args.dtype == 'float16':
            pred = model.net.HalfToFloat(pred, pred + '_fp32')

        loss = add_softmax_loss(model, pred, 'label')
        brew.accuracy(model, ['softmax', 'label'], 'accuracy')
        return [loss]

    def add_optimizer(model):
        '''
        stepsz = int(30 * args.epoch_size / total_batch_size / num_shards)
        optimizer.add_weight_decay(model, args.weight_decay)
        opt = optimizer.build_multi_precision_sgd(
            model,
            args.base_learning_rate,
            momentum=0.9,
            nesterov=1,
            policy="step",
            stepsize=stepsz,
            gamma=0.1
        )
        '''
        optimizer.add_weight_decay(model, args.weight_decay)
        opt = optimizer.build_multi_precision_sgd(
            model,
            base_learning_rate=args.base_learning_rate,
            momentum=model_config['solver']['momentum'],
            nesterov=model_config['solver']['nesterov'],
            policy=model_config['solver']['lr_policy'],
            power=model_config['solver']['power'],
            max_iter=model_config['solver']['max_iter'],
        )
        return opt

    # Define add_image_input function.
    # Depends on the "train_data" argument.
    # Note that the reader will be shared between all GPUs.
    reader = train_model.CreateDB(
        "reader",
        db=args.train_data,
        db_type=args.db_type,
        num_shards=num_shards,
        shard_id=shard_id,
    )

    def add_image_input(model):
        AddImageInput(
            model,
            reader,
            batch_size=batch_per_device,
            img_size=args.image_size,
            dtype=args.dtype,
            is_test=False,
        )

    def add_post_sync_ops(model):
        """Add ops applied after initial parameter sync."""
        for param_info in model.GetOptimizationParamInfo(model.GetParams()):
            if param_info.blob_copy is not None:
                model.param_init_net.HalfToFloat(
                    param_info.blob,
                    param_info.blob_copy[core.DataType.FLOAT]
                )

    # Create parallelized model
    data_parallel_model.Parallelize(
        train_model,
        input_builder_fun=add_image_input,
        forward_pass_builder_fun=create_target_model_ops,
        optimizer_builder_fun=add_optimizer,
        post_sync_builder_fun=add_post_sync_ops,
        devices=gpus,
        rendezvous=rendezvous,
        optimize_gradient_memory=False,
        cpu_device=args.use_cpu,
        shared_model=args.use_cpu,
        combine_spatial_bn=args.use_cpu,
    )

    if args.model_parallel:
        # Shift half of the activations to another GPU
        assert workspace.NumCudaDevices() >= 2 * args.num_gpus
        activations = data_parallel_model_utils.GetActivationBlobs(train_model)
        data_parallel_model_utils.ShiftActivationDevices(
            train_model,
            activations=activations[len(activations) // 2:],
            shifts={g: args.num_gpus + g for g in range(args.num_gpus)},
        )

    data_parallel_model.OptimizeGradientMemory(train_model, {}, set(), False)

    workspace.RunNetOnce(train_model.param_init_net)
    workspace.CreateNet(train_model.net)

    # Add test model, if specified
    test_model = None
    if (args.test_data is not None):
        log.info("----- Create test net ----")
        test_arg_scope = {
            'order': "NCHW",
            'use_cudnn': True,
            'cudnn_exhaustive_search': True,
        }
        test_model = model_helper.ModelHelper(
            name="ban-pc-resnet50_test",
            arg_scope=test_arg_scope,
            init_params=False
        )

        test_reader = test_model.CreateDB(
            "test_reader",
            db=args.test_data,
            db_type=args.db_type,
        )

        def test_input_fn(model):
            AddImageInput(
                model,
                test_reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
                is_test=True,
            )

        data_parallel_model.Parallelize(
            test_model,
            input_builder_fun=test_input_fn,
            forward_pass_builder_fun=create_target_model_ops,
            post_sync_builder_fun=add_post_sync_ops,
            param_update_builder_fun=None,
            devices=gpus,
            cpu_device=args.use_cpu,
        )
        workspace.RunNetOnce(test_model.param_init_net)
        workspace.CreateNet(test_model.net)

    epoch = 0
    # load the pre-trained model and reset epoch
    if args.load_model_path is not None:
        LoadModel(args.load_model_path, train_model)

        # Sync the model params
        data_parallel_model.FinalizeAfterCheckpoint(train_model)

        # reset epoch. load_model_path should end with *_X.mdl,
        # where X is the epoch number
        last_str = args.load_model_path.split('_')[-1]
        if last_str.endswith('.mdl'):
            epoch = int(last_str[:-4])
            log.info("Reset epoch to {}".format(epoch))
        else:
            log.warning("The format of load_model_path doesn't match!")

    expname = "log/{}/resnet50_gpu{}_b{}_L{}_lr{:.2f}_v2".format(
        args.dataset_name,
        args.num_gpus,
        total_batch_size,
        args.num_labels,
        args.base_learning_rate,
    )
    explog = experiment_util.ModelTrainerLog(expname, args)

    # Load pretrained param_init_net
    load_init_net_multigpu(args)

    # Run the training one epoch at a time
    best_accuracy = 0
    while epoch < args.num_epochs:
        epoch, best_accuracy = RunEpoch(
            args,
            epoch,
            train_model,
            test_model,
            total_batch_size,
            num_shards,
            expname,
            explog,
            best_accuracy,
        )

        # Save the model for each epoch
        SaveModel(args, train_model, epoch)

        model_path = "%s/%s_" % (
            args.file_store_path,
            args.save_model_name
        )
        # remove the saved model from the previous epoch if it exists
        if os.path.isfile(model_path + str(epoch - 1) + ".mdl"):
            os.remove(model_path + str(epoch - 1) + ".mdl")
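# The checkpoint-reset logic in the Train functions here assumes names of the
# form "<prefix>_<epoch>.mdl"; the same parsing as a stand-alone helper
# (hypothetical, for illustration only):
def _example_parse_epoch(load_model_path):
    last_str = load_model_path.split('_')[-1]
    if last_str.endswith('.mdl'):
        return int(last_str[:-4])  # e.g. ".../resnet50_7.mdl" -> 7
    return 0  # fall back to epoch 0 when the name doesn't match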
def Train(args): if args.model == "resnext": model_name = "resnext" + str(args.num_layers) elif args.model == "shufflenet": model_name = "shufflenet" # Either use specified device list or generate one if args.gpus is not None: gpus = [int(x) for x in args.gpus.split(',')] num_gpus = len(gpus) else: gpus = list(range(args.num_gpus)) num_gpus = args.num_gpus log.info("Running on GPUs: {}".format(gpus)) # Verify valid batch size total_batch_size = args.batch_size batch_per_device = total_batch_size // num_gpus assert \ total_batch_size % num_gpus == 0, \ "Number of GPUs must divide batch size" # Verify valid image mean/std per channel if args.image_mean_per_channel: assert \ len(args.image_mean_per_channel) == args.num_channels, \ "The number of channels of image mean doesn't match input" if args.image_std_per_channel: assert \ len(args.image_std_per_channel) == args.num_channels, \ "The number of channels of image std doesn't match input" # Round down epoch size to closest multiple of batch size across machines global_batch_size = total_batch_size * args.num_shards epoch_iters = int(args.epoch_size / global_batch_size) assert \ epoch_iters > 0, \ "Epoch size must be larger than batch size times shard count" args.epoch_size = epoch_iters * global_batch_size log.info("Using epoch size: {}".format(args.epoch_size)) # Create ModelHelper object if args.use_ideep: train_arg_scope = { 'use_cudnn': False, 'cudnn_exhaustive_search': False, 'training_mode': 1 } else: train_arg_scope = { 'order': 'NCHW', 'use_cudnn': True, 'cudnn_exhaustive_search': True, 'ws_nbytes_limit': (args.cudnn_workspace_limit_mb * 1024 * 1024), } train_model = model_helper.ModelHelper( name=model_name, arg_scope=train_arg_scope ) num_shards = args.num_shards shard_id = args.shard_id # Expect interfaces to be comma separated. # Use of multiple network interfaces is not yet complete, # so simply use the first one in the list. 
interfaces = args.distributed_interfaces.split(",") # Rendezvous using MPI when run with mpirun if os.getenv("OMPI_COMM_WORLD_SIZE") is not None: num_shards = int(os.getenv("OMPI_COMM_WORLD_SIZE", 1)) shard_id = int(os.getenv("OMPI_COMM_WORLD_RANK", 0)) if num_shards > 1: rendezvous = dict( kv_handler=None, num_shards=num_shards, shard_id=shard_id, engine="GLOO", transport=args.distributed_transport, interface=interfaces[0], mpi_rendezvous=True, exit_nets=None) elif num_shards > 1: # Create rendezvous for distributed computation store_handler = "store_handler" if args.redis_host is not None: # Use Redis for rendezvous if Redis host is specified workspace.RunOperatorOnce( core.CreateOperator( "RedisStoreHandlerCreate", [], [store_handler], host=args.redis_host, port=args.redis_port, prefix=args.run_id, ) ) else: # Use filesystem for rendezvous otherwise workspace.RunOperatorOnce( core.CreateOperator( "FileStoreHandlerCreate", [], [store_handler], path=args.file_store_path, prefix=args.run_id, ) ) rendezvous = dict( kv_handler=store_handler, shard_id=shard_id, num_shards=num_shards, engine="GLOO", transport=args.distributed_transport, interface=interfaces[0], exit_nets=None) else: rendezvous = None # Model building functions def create_resnext_model_ops(model, loss_scale): initializer = (PseudoFP16Initializer if args.dtype == 'float16' else Initializer) with brew.arg_scope([brew.conv, brew.fc], WeightInitializer=initializer, BiasInitializer=initializer, enable_tensor_core=args.enable_tensor_core, float16_compute=args.float16_compute): pred = resnet.create_resnext( model, "data", num_input_channels=args.num_channels, num_labels=args.num_labels, num_layers=args.num_layers, num_groups=args.resnext_num_groups, num_width_per_group=args.resnext_width_per_group, no_bias=True, no_loss=True, ) if args.dtype == 'float16': pred = model.net.HalfToFloat(pred, pred + '_fp32') softmax, loss = model.SoftmaxWithLoss([pred, 'label'], ['softmax', 'loss']) loss = model.Scale(loss, scale=loss_scale) brew.accuracy(model, [softmax, "label"], "accuracy", top_k=1) brew.accuracy(model, [softmax, "label"], "accuracy_top5", top_k=5) return [loss] def create_shufflenet_model_ops(model, loss_scale): initializer = (PseudoFP16Initializer if args.dtype == 'float16' else Initializer) with brew.arg_scope([brew.conv, brew.fc], WeightInitializer=initializer, BiasInitializer=initializer, enable_tensor_core=args.enable_tensor_core, float16_compute=args.float16_compute): pred = shufflenet.create_shufflenet( model, "data", num_input_channels=args.num_channels, num_labels=args.num_labels, no_loss=True, ) if args.dtype == 'float16': pred = model.net.HalfToFloat(pred, pred + '_fp32') softmax, loss = model.SoftmaxWithLoss([pred, 'label'], ['softmax', 'loss']) loss = model.Scale(loss, scale=loss_scale) brew.accuracy(model, [softmax, "label"], "accuracy", top_k=1) brew.accuracy(model, [softmax, "label"], "accuracy_top5", top_k=5) return [loss] def add_optimizer(model): stepsz = int(30 * args.epoch_size / total_batch_size / num_shards) if args.float16_compute: # TODO: merge with multi-precision optimizer opt = optimizer.build_fp16_sgd( model, args.base_learning_rate, momentum=0.9, nesterov=1, weight_decay=args.weight_decay, # weight decay included policy="step", stepsize=stepsz, gamma=0.1 ) else: optimizer.add_weight_decay(model, args.weight_decay) opt = optimizer.build_multi_precision_sgd( model, args.base_learning_rate, momentum=0.9, nesterov=1, policy="step", stepsize=stepsz, gamma=0.1 ) return opt # Define add_image_input function. 
    # Depends on the "train_data" argument.
    # Note that the reader will be shared between all GPUs.
    if args.train_data == "null":
        def add_image_input(model):
            AddNullInput(
                model,
                None,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
            )
    else:
        reader = train_model.CreateDB(
            "reader",
            db=args.train_data,
            db_type=args.db_type,
            num_shards=num_shards,
            shard_id=shard_id,
        )

        def add_image_input(model):
            AddImageInput(
                model,
                reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
                is_test=False,
                mean_per_channel=args.image_mean_per_channel,
                std_per_channel=args.image_std_per_channel,
            )

    def add_post_sync_ops(model):
        """Add ops applied after initial parameter sync."""
        for param_info in model.GetOptimizationParamInfo(model.GetParams()):
            if param_info.blob_copy is not None:
                model.param_init_net.HalfToFloat(
                    param_info.blob,
                    param_info.blob_copy[core.DataType.FLOAT]
                )

    data_parallel_model.Parallelize(
        train_model,
        input_builder_fun=add_image_input,
        forward_pass_builder_fun=create_resnext_model_ops
        if args.model == "resnext" else create_shufflenet_model_ops,
        optimizer_builder_fun=add_optimizer,
        post_sync_builder_fun=add_post_sync_ops,
        devices=gpus,
        rendezvous=rendezvous,
        optimize_gradient_memory=False,
        use_nccl=args.use_nccl,
        cpu_device=args.use_cpu,
        ideep=args.use_ideep,
        shared_model=args.use_cpu,
        combine_spatial_bn=args.use_cpu,
    )

    data_parallel_model.OptimizeGradientMemory(train_model, {}, set(), False)

    workspace.RunNetOnce(train_model.param_init_net)
    workspace.CreateNet(train_model.net)

    # Add test model, if specified
    test_model = None
    if (args.test_data is not None):
        log.info("----- Create test net ----")
        if args.use_ideep:
            test_arg_scope = {
                'use_cudnn': False,
                'cudnn_exhaustive_search': False,
            }
        else:
            test_arg_scope = {
                'order': "NCHW",
                'use_cudnn': True,
                'cudnn_exhaustive_search': True,
            }
        test_model = model_helper.ModelHelper(
            name=model_name + "_test",
            arg_scope=test_arg_scope,
            init_params=False,
        )

        test_reader = test_model.CreateDB(
            "test_reader",
            db=args.test_data,
            db_type=args.db_type,
        )

        def test_input_fn(model):
            AddImageInput(
                model,
                test_reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
                is_test=True,
                mean_per_channel=args.image_mean_per_channel,
                std_per_channel=args.image_std_per_channel,
            )

        data_parallel_model.Parallelize(
            test_model,
            input_builder_fun=test_input_fn,
            forward_pass_builder_fun=create_resnext_model_ops
            if args.model == "resnext" else create_shufflenet_model_ops,
            post_sync_builder_fun=add_post_sync_ops,
            param_update_builder_fun=None,
            devices=gpus,
            use_nccl=args.use_nccl,
            cpu_device=args.use_cpu,
        )
        workspace.RunNetOnce(test_model.param_init_net)
        workspace.CreateNet(test_model.net)

    epoch = 0
    # load the pre-trained model and reset epoch
    if args.load_model_path is not None:
        LoadModel(args.load_model_path, train_model, args.use_ideep)

        # Sync the model params
        data_parallel_model.FinalizeAfterCheckpoint(train_model)

        # reset epoch. load_model_path should end with *_X.mdl,
        # where X is the epoch number
        last_str = args.load_model_path.split('_')[-1]
        if last_str.endswith('.mdl'):
            epoch = int(last_str[:-4])
            log.info("Reset epoch to {}".format(epoch))
        else:
            log.warning("The format of load_model_path doesn't match!")

    expname = "%s_gpu%d_b%d_L%d_lr%.2f_v2" % (
        model_name,
        args.num_gpus,
        total_batch_size,
        args.num_labels,
        args.base_learning_rate,
    )
    explog = experiment_util.ModelTrainerLog(expname, args)

    # Run the training one epoch at a time
    while epoch < args.num_epochs:
        epoch = RunEpoch(
            args,
            epoch,
            train_model,
            test_model,
            total_batch_size,
            num_shards,
            expname,
            explog
        )

        # Save the model for each epoch
        SaveModel(args, train_model, epoch, args.use_ideep)

        model_path = "%s/%s_" % (
            args.file_store_path,
            args.save_model_name
        )
        # remove the saved model from the previous epoch if it exists
        if os.path.isfile(model_path + str(epoch - 1) + ".mdl"):
            os.remove(model_path + str(epoch - 1) + ".mdl")
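# The "step" policy used by add_optimizer above multiplies the base learning
# rate by gamma every `stepsize` iterations, i.e.
# lr = base_lr * gamma ** (iter // stepsize). A pure-Python sketch of the
# resulting schedule (for illustration; the actual computation happens inside
# the LearningRate operator):
def _example_step_lr_schedule(base_lr, gamma, stepsize, num_iters):
    return [base_lr * (gamma ** (it // stepsize)) for it in range(num_iters)]

# e.g. _example_step_lr_schedule(0.1, 0.1, 2, 6)
# -> 0.1, 0.1, 0.01, 0.01, 0.001, 0.001 (up to float rounding)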
def build_resnet50_dataparallel_model( num_gpus, batch_size, epoch_size, cudnn_workspace_limit_mb=64, num_channels=3, num_labels=1000, weight_decay=1e-4, base_learning_rate=0.1, image_size=227, use_cpu=False): batch_per_device = batch_size // num_gpus train_arg_scope = { 'order': 'NCHW', 'use_cudnn': True, 'cudnn_exhaustive_search': False, 'ws_nbytes_limit': (cudnn_workspace_limit_mb * 1024 * 1024), 'deterministic': True, } train_model = model_helper.ModelHelper( name="test_resnet50", arg_scope=train_arg_scope ) def create_resnet50_model_ops(model, loss_scale): with brew.arg_scope([brew.conv, brew.fc], WeightInitializer=Initializer, BiasInitializer=Initializer, enable_tensor_core=0): pred = resnet.create_resnet50( model, "data", num_input_channels=num_channels, num_labels=num_labels, no_bias=True, no_loss=True, ) softmax, loss = model.SoftmaxWithLoss([pred, 'label'], ['softmax', 'loss']) loss = model.Scale(loss, scale=loss_scale) brew.accuracy(model, [softmax, "label"], "accuracy") return [loss] def add_optimizer(model): stepsz = int(30 * epoch_size / batch_size) optimizer.add_weight_decay(model, weight_decay) opt = optimizer.build_multi_precision_sgd( model, base_learning_rate, momentum=0.9, nesterov=1, policy="step", stepsize=stepsz, gamma=0.1 ) return opt def add_image_input(model): model.param_init_net.GaussianFill( [], ["data"], shape=[batch_per_device, 3, image_size, image_size], dtype='float', ) model.param_init_net.ConstantFill( [], ["label"], shape=[batch_per_device], value=1, dtype=core.DataType.INT32, ) def add_post_sync_ops(model): for param_info in model.GetOptimizationParamInfo(model.GetParams()): if param_info.blob_copy is not None: model.param_init_net.HalfToFloat( param_info.blob, param_info.blob_copy[core.DataType.FLOAT]) # Create parallelized model data_parallel_model.Parallelize( train_model, input_builder_fun=add_image_input, forward_pass_builder_fun=create_resnet50_model_ops, optimizer_builder_fun=add_optimizer, post_sync_builder_fun=add_post_sync_ops, devices=list(range(num_gpus)), rendezvous=None, optimize_gradient_memory=True, cpu_device=use_cpu, shared_model=use_cpu, ) return train_model
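# A minimal sketch of driving the data-parallel model built above, assuming a
# machine with at least `num_gpus` CUDA devices and the usual caffe2
# workspace imports:
def _example_run_resnet50_dataparallel(num_gpus=2):
    model = build_resnet50_dataparallel_model(
        num_gpus=num_gpus,
        batch_size=8 * num_gpus,
        epoch_size=8 * num_gpus * 10,
    )
    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)
    workspace.RunNet(model.net.Proto().name)
    # Per-device losses live under the "gpu_<id>/" prefix.
    return [workspace.FetchBlob("gpu_{}/loss".format(g))
            for g in range(num_gpus)]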
def run_model(self, devices, gpu): ''' Helper function for test_equiv ''' def input_builder_fun(model): return None def model_build_fun(model, loss_scale): fc = model.FC("data", "fc", 16, 1, ("ConstantFill", {}), ("ConstantFill", {})) fc_fl = model.FlattenToVec(fc, "fc_fl") sigm = model.Sigmoid(fc_fl, "sigm") sq = model.SquaredL2Distance([sigm, "label"], "sq") loss = model.AveragedLoss(sq, "loss") loss = model.Scale(loss, scale=loss_scale) return [loss] def add_optimizer(model): optimizer.build_sgd(model, 0.1, policy="fixed") workspace.ResetWorkspace() model = cnn.CNNModelHelper( order="NHWC", name="test{}".format(devices), ) data_parallel_model.Parallelize( model, input_builder_fun=input_builder_fun, forward_pass_builder_fun=model_build_fun, optimizer_builder_fun=add_optimizer, devices=devices, cpu_device=not gpu, ) np.random.seed(2603) # Each run has same input, independent of number of gpus batch_size = 64 for i in range(0, 10): full_data = np.random.rand(batch_size, 16) full_labels = np.round(full_data[:, 0]) batch_per_device = batch_size // len(devices) for (j, g) in enumerate(devices): st = j * batch_per_device en = st + batch_per_device data = full_data[st:en, :].astype(np.float32) labels = full_labels[st:en].astype(np.float32) with core.DeviceScope(core.DeviceOption(model._device_type, g)): workspace.FeedBlob( "{}_{}/data".format(model._device_prefix, g), data ) workspace.FeedBlob( "{}_{}/label".format(model._device_prefix, g), labels ) if i == 0: workspace.RunNetOnce(model.param_init_net) workspace.CreateNet(model.net) workspace.RunNet(model.net.Proto().name) return workspace.FetchBlob("{}_0/fc_w".format(model._device_prefix))
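# run_model above returns gpu_0's fc weights after ten iterations over
# identical input, so the equivalence test it supports (test_equiv, per the
# docstring) reduces to comparing runs with different device counts. A sketch,
# assuming at least two visible GPUs:
def _example_test_equiv(self):
    result_1gpu = self.run_model([0], gpu=True)
    result_2gpu = self.run_model([0, 1], gpu=True)
    np.testing.assert_allclose(result_1gpu, result_2gpu, rtol=1e-4)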
def run_model(self, devices, gpu):
    '''
    Helper function for test_equiv
    '''
    def input_builder_fun(model):
        return None

    def model_build_fun(model, loss_scale):
        workspace.FeedBlob(
            core.ScopedBlobReference("seq_lengths"),
            np.array([self.T] * self.batch_per_device, dtype=np.int32)
        )
        model.param_init_net.ConstantFill(
            [],
            "hidden_init",
            value=0.0,
            shape=[1, self.batch_per_device, self.hidden_dim]
        )
        model.param_init_net.ConstantFill(
            [],
            "cell_init",
            value=0.0,
            shape=[1, self.batch_per_device, self.hidden_dim]
        )

        output, _last_hidden, _, _last_state = rnn_cell.LSTM(
            model=model,
            input_blob="data",
            seq_lengths="seq_lengths",
            initial_states=("hidden_init", "cell_init"),
            dim_in=self.input_dim,
            dim_out=self.hidden_dim,
            scope="partest",
        )

        # A silly loss function
        loss = model.AveragedLoss(
            model.Sub([output, "target"], "dist"),
            "loss",
        )
        loss = model.Scale(loss, "loss_scaled", scale=loss_scale)
        return [loss]

    def param_update_fun(model):
        ITER = model.Iter("ITER")
        LR = model.net.LearningRate(
            [ITER],
            "LR",
            base_lr=(-0.1),
            policy="fixed",
        )
        ONE = model.param_init_net.ConstantFill(
            [], "ONE", shape=[1], value=1.0,
        )
        for param in model.GetParams():
            param_grad = model.param_to_grad[param]
            model.WeightedSum([param, ONE, param_grad, LR], param)

        assert len(model.GetParams()) == len(model.params) // len(model._devices)

    workspace.ResetWorkspace()
    model = cnn.CNNModelHelper(
        name="recurrent_test{}".format(devices),
    )

    self.T = 8
    self.batch_size = 64
    self.input_dim = 8
    self.hidden_dim = 31
    self.batch_per_device = self.batch_size // len(devices)

    data_parallel_model.Parallelize(
        model,
        input_builder_fun=input_builder_fun,
        forward_pass_builder_fun=model_build_fun,
        param_update_builder_fun=param_update_fun,
        devices=devices,
        optimize_gradient_memory=True,
        cpu_device=not gpu,
    )

    # Change all initialization to ConstantFills so that
    # everything is deterministic
    for op in model.param_init_net.Proto().op:
        if op.type.endswith('Fill'):
            op.type = 'ConstantFill'

    # Each run has same input, independent of number of gpus
    np.random.seed(20150210)
    for i in range(0, 10):
        full_data = np.random.rand(self.T, self.batch_size, self.input_dim)
        full_target = np.random.rand(
            self.T, self.batch_size, self.hidden_dim
        )

        for (j, g) in enumerate(devices):
            st = j * self.batch_per_device
            en = st + self.batch_per_device
            data = full_data[:, st:en, :].astype(np.float32)
            targets = full_target[:, st:en, :].astype(np.float32)

            with core.DeviceScope(core.DeviceOption(model._device_type, g)):
                workspace.FeedBlob(
                    "{}_{}/data".format(model._device_prefix, g), data
                )
                workspace.FeedBlob(
                    "{}_{}/target".format(model._device_prefix, g), targets
                )

        if i == 0:
            workspace.RunNetOnce(model.param_init_net)
            workspace.CreateNet(model.net)

        workspace.RunNet(model.net.Proto().name)

    return workspace.FetchBlob("{}_0/partest/i2h_w".format(model._device_prefix))
def Train(args):
    # Either use specified device list or generate one
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(',')]
        num_gpus = len(gpus)
    else:
        gpus = list(range(args.num_gpus))
        num_gpus = args.num_gpus

    log.info("Running on GPUs: {}".format(gpus))

    # Verify valid batch size
    total_batch_size = args.batch_size
    batch_per_device = total_batch_size // num_gpus
    assert total_batch_size % num_gpus == 0, \
        "Number of GPUs must divide batch size"

    # Round down epoch size to closest multiple of batch size across machines
    global_batch_size = total_batch_size * args.num_shards
    epoch_iters = int(args.epoch_size / global_batch_size)
    args.epoch_size = epoch_iters * global_batch_size
    log.info("Using epoch size: {}".format(args.epoch_size))

    # Create ModelHelper object
    train_arg_scope = {
        'order': 'NCHW',
        'use_cudnn': True,
        'cudnn_exhaustive_search': True,
        'ws_nbytes_limit': (args.cudnn_workspace_limit_mb * 1024 * 1024),
    }
    train_model = model_helper.ModelHelper(
        name="resnet50", arg_scope=train_arg_scope
    )

    num_shards = args.num_shards
    shard_id = args.shard_id
    if num_shards > 1:
        # Create rendezvous for distributed computation
        store_handler = "store_handler"
        if args.redis_host is not None:
            # Use Redis for rendezvous if Redis host is specified
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "RedisStoreHandlerCreate", [], [store_handler],
                    host=args.redis_host,
                    port=args.redis_port,
                    prefix=args.run_id,
                )
            )
        else:
            # Use filesystem for rendezvous otherwise
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "FileStoreHandlerCreate", [], [store_handler],
                    path=args.file_store_path,
                )
            )
        rendezvous = dict(
            kv_handler=store_handler,
            shard_id=shard_id,
            num_shards=num_shards,
            engine="GLOO",
            exit_nets=None)
    else:
        rendezvous = None

    # Model building functions
    def create_resnet50_model_ops(model, loss_scale):
        [softmax, loss] = resnet.create_resnet50(
            model,
            "data",
            num_input_channels=args.num_channels,
            num_labels=args.num_labels,
            label="label",
            no_bias=True,
        )
        loss = model.Scale(loss, scale=loss_scale)
        brew.accuracy(model, [softmax, "label"], "accuracy")
        return [loss]

    def add_optimizer(model):
        stepsz = int(30 * args.epoch_size / total_batch_size / num_shards)
        optimizer.add_weight_decay(model, args.weight_decay)
        opt = optimizer.build_sgd(
            model,
            args.base_learning_rate,
            momentum=0.9,
            nesterov=1,
            policy="step",
            stepsize=stepsz,
            gamma=0.1
        )
        return opt

    # Input. Note that the reader must be shared between all GPUs.
    reader = train_model.CreateDB(
        "reader",
        db=args.train_data,
        db_type=args.db_type,
        num_shards=num_shards,
        shard_id=shard_id,
    )

    def add_image_input(model):
        AddImageInput(
            model,
            reader,
            batch_size=batch_per_device,
            img_size=args.image_size,
        )

    # Create parallelized model
    data_parallel_model.Parallelize(
        train_model,
        input_builder_fun=add_image_input,
        forward_pass_builder_fun=create_resnet50_model_ops,
        optimizer_builder_fun=add_optimizer,
        devices=gpus,
        rendezvous=rendezvous,
        optimize_gradient_memory=True,
        cpu_device=args.use_cpu,
    )

    # Add test model, if specified
    test_model = None
    if (args.test_data is not None):
        log.info("----- Create test net ----")
        test_arg_scope = {
            'order': "NCHW",
            'use_cudnn': True,
            'cudnn_exhaustive_search': True,
        }
        test_model = model_helper.ModelHelper(
            name="resnet50_test", arg_scope=test_arg_scope, init_params=False
        )

        test_reader = test_model.CreateDB(
            "test_reader",
            db=args.test_data,
            db_type=args.db_type,
        )

        def test_input_fn(model):
            AddImageInput(
                model,
                test_reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
            )

        data_parallel_model.Parallelize(
            test_model,
            input_builder_fun=test_input_fn,
            forward_pass_builder_fun=create_resnet50_model_ops,
            param_update_builder_fun=None,
            devices=gpus,
            cpu_device=args.use_cpu,
        )
        workspace.RunNetOnce(test_model.param_init_net)
        workspace.CreateNet(test_model.net)

    workspace.RunNetOnce(train_model.param_init_net)
    workspace.CreateNet(train_model.net)

    epoch = 0
    # load the pre-trained model and reset epoch
    if args.load_model_path is not None:
        LoadModel(args.load_model_path, train_model)

        # Sync the model params
        data_parallel_model.FinalizeAfterCheckpoint(train_model)

        # reset epoch. load_model_path should end with *_X.mdl,
        # where X is the epoch number
        last_str = args.load_model_path.split('_')[-1]
        if last_str.endswith('.mdl'):
            epoch = int(last_str[:-4])
            log.info("Reset epoch to {}".format(epoch))
        else:
            log.warning("The format of load_model_path doesn't match!")

    expname = "resnet50_gpu%d_b%d_L%d_lr%.2f_v2" % (
        args.num_gpus,
        total_batch_size,
        args.num_labels,
        args.base_learning_rate,
    )
    explog = experiment_util.ModelTrainerLog(expname, args)

    # Run the training one epoch at a time
    while epoch < args.num_epochs:
        epoch = RunEpoch(
            args,
            epoch,
            train_model,
            test_model,
            total_batch_size,
            num_shards,
            expname,
            explog
        )

        # Save the model for each epoch
        SaveModel(args, train_model, epoch)

        model_path = "%s/%s_" % (
            args.file_store_path,
            args.save_model_name
        )
        # remove the saved model from the previous epoch if it exists
        if os.path.isfile(model_path + str(epoch - 1) + ".mdl"):
            os.remove(model_path + str(epoch - 1) + ".mdl")
def Train(args): # Either use specified device list or generate one if args.gpus is not None: gpus = [int(x) for x in args.gpus.split(',')] num_gpus = len(gpus) else: gpus = list(range(args.num_gpus)) num_gpus = args.num_gpus log.info("Running on GPUs: {}".format(gpus)) # Verify valid batch size total_batch_size = args.batch_size batch_per_device = total_batch_size // num_gpus global_batch_size = total_batch_size * args.num_shards epoch_iters = int(args.epoch_size / global_batch_size) args.epoch_size = epoch_iters * global_batch_size log.info("Using epoch size: {}".format(args.epoch_size)) train_arg_scope = { 'order': 'NCHW', 'use_cudnn': True, 'cudnn_exhaustive_search': True, 'ws_nbytes_limit': (args.cudnn_workspace_limit_mb * 1024 * 1024), } train_model = model_helper.ModelHelper(name="resnet101", arg_scope=train_arg_scope) num_shards = args.num_shards shard_id = args.shard_id interfaces = args.distributed_interfaces.split(",") if os.getenv("OMPI_COMM_WORLD_SIZE") is not None: num_shards = int(os.getenv("OMPI_COMM_WORLD_SIZE", 1)) shard_id = int(os.getenv("OMPI_COMM_WORLD_RANK", 0)) if num_shards > 1: rendezvous = dict(kv_handler=None, num_shards=num_shards, shard_id=shard_id, engine="GLOO", transport=args.distributed_transport, interface=interfaces[0], mpi_rendezvous=True, exit_nets=None) elif num_shards > 1: store_handler = "store_handler" if args.redis_host is not None: workspace.RunOperatorOnce( core.CreateOperator( "RedisStoreHandlerCreate", [], [store_handler], host=args.redis_host, port=args.redis_port, prefix=args.run_id, )) else: workspace.RunOperatorOnce( core.CreateOperator( "FileStoreHandlerCreate", [], [store_handler], path=args.file_store_path, prefix=args.run_id, )) rendezvous = dict(kv_handler=store_handler, shard_id=shard_id, num_shards=num_shards, engine="GLOO", transport=args.distributed_transport, interface=interfaces[0], exit_nets=None) else: rendezvous = None def create_resnet101_model_ops(model, loss_scale): initializer = (pFP16Initializer if args.dtype == 'float16' else Initializer) with brew.arg_scope([brew.conv, brew.fc], WeightInitializer=initializer, BiasInitializer=initializer, enable_tensor_core=args.enable_tensor_core, float16_compute=args.float16_compute): pred = resnet.create_resnet101( model, "data", num_input_channels=args.num_channels, num_labels=args.num_labels, no_bias=True, no_loss=True, ) if args.dtype == 'float16': pred = model.net.HalfToFloat(pred, pred + '_fp32') softmax, loss = model.SoftmaxWithLoss([pred, 'label'], ['softmax', 'loss']) loss = model.Scale(loss, scale=loss_scale) brew.accuracy(model, [softmax, "label"], "accuracy") return [loss] def add_optimizer(model): stepsz = int(30 * args.epoch_size / total_batch_size / num_shards) if args.float16_compute: opt = optimizer.build_fp16_sgd(model, args.base_learning_rate, momentum=0.9, nesterov=1, weight_decay=args.weight_decay, policy="step", stepsize=stepsz, gamma=0.1) else: optimizer.add_weight_decay(model, args.weight_decay) opt = optimizer.build_multi_precision_sgd(model, args.base_learning_rate, momentum=0.9, nesterov=1, policy="step", stepsize=stepsz, gamma=0.1) return opt if args.train_data == "null": def add_image_input(model): AddNullInput( model, None, batch_size=batch_per_device, img_size=args.image_size, dtype=args.dtype, ) else: reader = train_model.CreateDB( "reader", db=args.train_data, db_type=args.db_type, num_shards=num_shards, shard_id=shard_id, ) def add_image_input(model): AddImageInput( model, reader, batch_size=batch_per_device, img_size=args.image_size, 
dtype=args.dtype, is_test=False, ) def add_post_sync_ops(model): for param_info in model.GetOptimizationParamInfo(model.GetParams()): if param_info.blob_copy is not None: model.param_init_net.HalfToFloat( param_info.blob, param_info.blob_copy[core.DataType.FLOAT]) data_parallel_model.Parallelize( train_model, input_builder_fun=add_image_input, forward_pass_builder_fun=create_resnet101_model_ops, optimizer_builder_fun=add_optimizer, post_sync_builder_fun=add_post_sync_ops, devices=gpus, rendezvous=rendezvous, optimize_gradient_memory=False, cpu_device=args.use_cpu, shared_model=args.use_cpu, ) if args.model_parallel: activations = data_parallel_model_utils.GetActivationBlobs(train_model) data_parallel_model_utils.ShiftActivationDevices( train_model, activations=activations[len(activations) // 2:], shifts={g: args.num_gpus + g for g in range(args.num_gpus)}, ) data_parallel_model.OptimizeGradientMemory(train_model, {}, set(), False) workspace.RunNetOnce(train_model.param_init_net) workspace.CreateNet(train_model.net) test_model = None if (args.test_data is not None): log.info("----- Create test net ----") test_arg_scope = { 'order': "NCHW", 'use_cudnn': True, 'cudnn_exhaustive_search': True, } test_model = model_helper.ModelHelper(name="resnet101_test", arg_scope=test_arg_scope, init_params=False) test_reader = test_model.CreateDB( "test_reader", db=args.test_data, db_type=args.db_type, ) def test_input_fn(model): AddImageInput( model, test_reader, batch_size=batch_per_device, img_size=args.image_size, dtype=args.dtype, is_test=True, ) data_parallel_model.Parallelize( test_model, input_builder_fun=test_input_fn, forward_pass_builder_fun=create_resnet101_model_ops, post_sync_builder_fun=add_post_sync_ops, param_update_builder_fun=None, devices=gpus, cpu_device=args.use_cpu, ) workspace.RunNetOnce(test_model.param_init_net) workspace.CreateNet(test_model.net) epoch = 0 if args.load_model_path is not None: LoadModel(args.load_model_path, train_model) data_parallel_model.FinalizeAfterCheckpoint(train_model) last_str = args.load_model_path.split('_')[-1] if last_str.endswith('.mdl'): epoch = int(last_str[:-4]) log.info("Reset epoch to {}".format(epoch)) else: log.warning("The format of load_model_path doesn't match!") expname = "resnet101_gpu%d_b%d_L%d_lr%.2f_v2" % ( args.num_gpus, total_batch_size, args.num_labels, args.base_learning_rate, ) explog = experiment_util.ModelTrainerLog(expname, args) while epoch < args.num_epochs: epoch = RunEpoch(args, epoch, train_model, test_model, total_batch_size, num_shards, expname, explog) # final save SaveModel(workspace, train_model)
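# ShiftActivationDevices above moves the second half of the activations from
# data-parallel device g to device num_gpus + g, layering a 2-way
# model-parallel split on top of the data-parallel replicas; the shift map it
# is handed is simply:
def _example_shift_map(num_gpus):
    return {g: num_gpus + g for g in range(num_gpus)}  # e.g. 2 -> {0: 2, 1: 3}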
def Train(args): # Either use specified device list or generate one if args.gpus is not None: gpus = [int(x) for x in args.gpus.split(',')] num_gpus = len(gpus) else: gpus = list(range(args.num_gpus)) num_gpus = args.num_gpus log.info("Running on GPUs: {}".format(gpus)) # Verify valid batch size total_batch_size = args.batch_size batch_per_device = total_batch_size // num_gpus assert \ total_batch_size % num_gpus == 0, \ "Number of GPUs must divide batch size" # Round down epoch size to closest multiple of batch size across machines global_batch_size = total_batch_size * args.num_shards epoch_iters = int(args.epoch_size / global_batch_size) assert \ epoch_iters > 0, \ "Epoch size must be larger than batch size times shard count" args.epoch_size = epoch_iters * global_batch_size log.info("Using epoch size: {}".format(args.epoch_size)) # Create ModelHelper object train_arg_scope = { 'order': 'NCHW', 'use_cudnn': True, 'cudnn_exhaustive_search': True, 'ws_nbytes_limit': (args.cudnn_workspace_limit_mb * 1024 * 1024), } train_model = model_helper.ModelHelper(name="resnet50", arg_scope=train_arg_scope) num_shards = args.num_shards shard_id = args.shard_id # Expect interfaces to be comma separated. # Use of multiple network interfaces is not yet complete, # so simply use the first one in the list. interfaces = args.distributed_interfaces.split(",") # Rendezvous using MPI when run with mpirun if os.getenv("OMPI_COMM_WORLD_SIZE") is not None: num_shards = int(os.getenv("OMPI_COMM_WORLD_SIZE", 1)) shard_id = int(os.getenv("OMPI_COMM_WORLD_RANK", 0)) if num_shards > 1: rendezvous = dict(kv_handler=None, num_shards=num_shards, shard_id=shard_id, engine="GLOO", transport=args.distributed_transport, interface=interfaces[0], mpi_rendezvous=True, exit_nets=None) elif num_shards > 1: # Create rendezvous for distributed computation store_handler = "store_handler" if args.redis_host is not None: # Use Redis for rendezvous if Redis host is specified workspace.RunOperatorOnce( core.CreateOperator( "RedisStoreHandlerCreate", [], [store_handler], host=args.redis_host, port=args.redis_port, prefix=args.run_id, )) else: # Use filesystem for rendezvous otherwise workspace.RunOperatorOnce( core.CreateOperator( "FileStoreHandlerCreate", [], [store_handler], path=args.file_store_path, prefix=args.run_id, )) rendezvous = dict(kv_handler=store_handler, shard_id=shard_id, num_shards=num_shards, engine="GLOO", transport=args.distributed_transport, interface=interfaces[0], exit_nets=None) else: rendezvous = None # Model building functions # def create_resnet50_model_ops(model, loss_scale): # initializer = (PseudoFP16Initializer if args.dtype == 'float16' # else Initializer) # with brew.arg_scope([brew.conv, brew.fc], # WeightInitializer=initializer, # BiasInitializer=initializer, # enable_tensor_core=args.enable_tensor_core, # float16_compute=args.float16_compute): # pred = resnet.create_resnet50( # #args.layers, # model, # "data", # num_input_channels=args.num_channels, # num_labels=args.num_labels, # no_bias=True, # no_loss=True, # ) # if args.dtype == 'float16': # pred = model.net.HalfToFloat(pred, pred + '_fp32') # softmax, loss = model.SoftmaxWithLoss([pred, 'label'], # ['softmax', 'loss']) # loss = model.Scale(loss, scale=loss_scale) # brew.accuracy(model, [softmax, "label"], "accuracy") # return [loss] def create_model_ops(model, loss_scale): return create_model_ops_testable(model, loss_scale, is_test=False) def create_model_ops_test(model, loss_scale): return create_model_ops_testable(model, loss_scale, 
                                           is_test=True)

    # Model building functions
    def create_model_ops_testable(model, loss_scale, is_test=False):
        initializer = (PseudoFP16Initializer if args.dtype == 'float16'
                       else Initializer)
        with brew.arg_scope([brew.conv, brew.fc],
                            WeightInitializer=initializer,
                            BiasInitializer=initializer,
                            enable_tensor_core=args.enable_tensor_core,
                            float16_compute=args.float16_compute):
            if args.model == "cifar10":
                if args.image_size != 32:
                    log.warn("Cifar10 expects a 32x32 image.")
                pred = models.cifar10.create_cifar10(
                    model,
                    "data",
                    image_channels=args.num_channels,
                    num_classes=args.num_labels,
                    image_height=args.image_size,
                    image_width=args.image_size,
                )
            elif args.model == "resnet32x32":
                if args.image_size != 32:
                    log.warn("ResNet32x32 expects a 32x32 image.")
                pred = models.resnet.create_resnet32x32(
                    model,
                    "data",
                    num_layers=args.num_layers,
                    num_input_channels=args.num_channels,
                    num_labels=args.num_labels,
                    is_test=is_test)
            elif args.model == "resnet":
                if args.image_size != 224:
                    log.warn(
                        "ResNet expects a 224x224 image. input image = %d" %
                        args.image_size)
                pred = resnet.create_resnet50(
                    # args.layers,
                    model,
                    "data",
                    num_input_channels=args.num_channels,
                    num_labels=args.num_labels,
                    no_bias=True,
                    no_loss=True,
                )
            elif args.model == "vgg":
                if args.image_size != 224:
                    log.warn("VGG expects a 224x224 image.")
                pred = vgg.create_vgg(
                    model,
                    "data",
                    num_input_channels=args.num_channels,
                    num_labels=args.num_labels,
                    num_layers=args.num_layers,
                    is_test=is_test)
            elif args.model == "googlenet":
                if args.image_size != 224:
                    log.warn("GoogLeNet expects a 224x224 image.")
                pred = googlenet.create_googlenet(
                    model,
                    "data",
                    num_input_channels=args.num_channels,
                    num_labels=args.num_labels,
                    is_test=is_test)
            elif args.model == "alexnet":
                if args.image_size != 224:
                    log.warn("Alexnet expects a 224x224 image.")
                pred = alexnet.create_alexnet(
                    model,
                    "data",
                    num_input_channels=args.num_channels,
                    num_labels=args.num_labels,
                    is_test=is_test)
            elif args.model == "alexnetv0":
                if args.image_size != 224:
                    log.warn("Alexnet v0 expects a 224x224 image.")
                pred = alexnet.create_alexnetv0(
                    model,
                    "data",
                    num_input_channels=args.num_channels,
                    num_labels=args.num_labels,
                    is_test=is_test)
            else:
                raise NotImplementedError("Network {} not found.".format(
                    args.model))

        if args.dtype == 'float16':
            pred = model.net.HalfToFloat(pred, pred + '_fp32')

        softmax, loss = model.SoftmaxWithLoss([pred, 'label'],
                                              ['softmax', 'loss'])
        loss = model.Scale(loss, scale=loss_scale)
        brew.accuracy(model, [softmax, "label"], "accuracy")
        return [loss]

    def add_optimizer(model):
        stepsz = int(30 * args.epoch_size / total_batch_size / num_shards)
        if args.float16_compute:
            # TODO: merge with multi-precision optimizer
            opt = optimizer.build_fp16_sgd(
                model,
                args.base_learning_rate,
                momentum=0.9,
                nesterov=1,
                weight_decay=args.weight_decay,  # weight decay included
                policy="step",
                stepsize=stepsz,
                gamma=0.1)
        else:
            optimizer.add_weight_decay(model, args.weight_decay)
            opt = optimizer.build_multi_precision_sgd(
                model,
                args.base_learning_rate,
                momentum=0.9,
                nesterov=1,
                policy="step",
                stepsize=stepsz,
                gamma=0.1)
        print("info:===============================" + str(opt))
        return opt

    # Define add_image_input function.
    # Depends on the "train_data" argument.
    # Note that the reader will be shared between all GPUs.
    if args.train_data == "null":
        def add_image_input(model):
            AddNullInput(
                model,
                None,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
            )
    else:
        reader = train_model.CreateDB(
            "reader",
            db=args.train_data,
            db_type=args.db_type,
            num_shards=num_shards,
            shard_id=shard_id,
        )

        def add_image_input(model):
            AddImageInput(
                model,
                reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
                is_test=False,
            )

    def add_post_sync_ops(model):
        """Add ops applied after initial parameter sync."""
        for param_info in model.GetOptimizationParamInfo(model.GetParams()):
            if param_info.blob_copy is not None:
                model.param_init_net.HalfToFloat(
                    param_info.blob,
                    param_info.blob_copy[core.DataType.FLOAT])

    # Create parallelized model
    data_parallel_model.Parallelize(
        train_model,
        input_builder_fun=add_image_input,
        forward_pass_builder_fun=create_model_ops,
        optimizer_builder_fun=add_optimizer,
        post_sync_builder_fun=add_post_sync_ops,
        devices=gpus,
        rendezvous=rendezvous,
        optimize_gradient_memory=False,
        cpu_device=args.use_cpu,
        shared_model=args.use_cpu,
        combine_spatial_bn=args.use_cpu,
        use_nccl=args.use_nccl)

    if args.model_parallel:
        # Shift half of the activations to another GPU
        assert workspace.NumCudaDevices() >= 2 * args.num_gpus
        activations = data_parallel_model_utils.GetActivationBlobs(train_model)
        data_parallel_model_utils.ShiftActivationDevices(
            train_model,
            activations=activations[len(activations) // 2:],
            shifts={g: args.num_gpus + g for g in range(args.num_gpus)},
        )

    data_parallel_model.OptimizeGradientMemory(train_model, {}, set(), False)

    workspace.RunNetOnce(train_model.param_init_net)
    workspace.CreateNet(train_model.net)

    if "GLOO_ALGORITHM" in os.environ and \
            os.environ["GLOO_ALGORITHM"] == "PHUB":
        # PHub needs to be told which elements require aggregation,
        # as well as their sizes.
        # At this stage, all it needs are the key names and this shard's ID.
        grad_names = list(reversed(train_model._grad_names))
        phubKeyNames = ["allreduce_{}_status".format(x) for x in grad_names]
        caffe2GradSizes = dict(
            zip([
                data_parallel_model.stripBlobName(name) + "_grad"
                for name in train_model._parameters_info.keys()
            ], [x.size for x in train_model._parameters_info.values()]))
        phubKeySizes = [str(caffe2GradSizes[x]) for x in grad_names]
        if rendezvous["shard_id"] == 0:
            # only shard 0 needs to publish to the rendezvous
            r = redis.StrictRedis()
            # assign an ID to each key
            joinedStr = ",".join(phubKeyNames)
            r.set("[PLink]IntegrationKeys", joinedStr)
            joinedStr = ",".join(phubKeySizes)
            r.set("[PLink]IntegrationKeySizes", joinedStr)

    # Add test model, if specified
    test_model = None
    if (args.test_data is not None):
        log.info("----- Create test net ----")
        test_arg_scope = {
            'order': "NCHW",
            'use_cudnn': True,
            'cudnn_exhaustive_search': True,
        }
        test_model = model_helper.ModelHelper(
            name="resnet50_test", arg_scope=test_arg_scope, init_params=False)

        test_reader = test_model.CreateDB(
            "test_reader",
            db=args.test_data,
            db_type=args.db_type,
        )

        def test_input_fn(model):
            AddImageInput(
                model,
                test_reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
                is_test=True,
            )

        data_parallel_model.Parallelize(
            test_model,
            input_builder_fun=test_input_fn,
            forward_pass_builder_fun=create_model_ops_test,
            post_sync_builder_fun=add_post_sync_ops,
            param_update_builder_fun=None,
            devices=gpus,
            cpu_device=args.use_cpu,
        )
        workspace.RunNetOnce(test_model.param_init_net)
        workspace.CreateNet(test_model.net)

    epoch = 0
    # load the pre-trained model and reset epoch
    if args.load_model_path is not None:
        LoadModel(args.load_model_path, train_model)

        # Sync the model params
        data_parallel_model.FinalizeAfterCheckpoint(train_model)

        # reset epoch. load_model_path should end with *_X.mdl,
        # where X is the epoch number
        last_str = args.load_model_path.split('_')[-1]
        if last_str.endswith('.mdl'):
            epoch = int(last_str[:-4])
            log.info("Reset epoch to {}".format(epoch))
        else:
            log.warning("The format of load_model_path doesn't match!")

    expname = "resnet50_gpu%d_b%d_L%d_lr%.2f_v2" % (
        args.num_gpus,
        total_batch_size,
        args.num_labels,
        args.base_learning_rate,
    )
    explog = experiment_util.ModelTrainerLog(expname, args)

    # Run the training one epoch at a time
    while epoch < args.num_epochs:
        epoch = RunEpoch(args, epoch, train_model, test_model,
                         total_batch_size, num_shards, expname, explog)

        # Save the model for each epoch
        SaveModel(args, train_model, epoch)
        model_path = "%s/%s_" % (args.file_store_path, args.save_model_name)
        # remove the saved model from the previous epoch if it exists
        if os.path.isfile(model_path + str(epoch - 1) + ".mdl"):
            os.remove(model_path + str(epoch - 1) + ".mdl")
def Train(args):
    subdir = datetime.strftime(datetime.now(), '%Y%m%d-%H%M%S')
    save_dir = os.path.join(args.file_store_path, subdir)
    if not os.path.exists(save_dir):
        # Create the model directory if it doesn't exist
        os.mkdir(save_dir)

    # Either use specified device list or generate one
    if args.gpus is not None:
        gpus = [int(x) for x in args.gpus.split(',')]
        num_gpus = len(gpus)
    else:
        gpus = list(range(args.num_gpus))
        num_gpus = args.num_gpus

    log.info("Running on GPUs: {}".format(gpus))

    # Verify valid batch size
    total_batch_size = args.batch_size
    batch_per_device = total_batch_size // num_gpus
    assert \
        total_batch_size % num_gpus == 0, \
        "Number of GPUs must divide batch size"

    # Round down epoch size to closest multiple of batch size across machines
    global_batch_size = total_batch_size * args.num_shards
    epoch_iters = int(args.epoch_size / global_batch_size)
    args.epoch_size = epoch_iters * global_batch_size
    log.info("Using epoch size: {}".format(args.epoch_size))

    # Create ModelHelper object
    train_arg_scope = {
        'order': 'NCHW',
        'use_cudnn': True,
        'cudnn_exhaustive_search': True,
        'ws_nbytes_limit': (args.cudnn_workspace_limit_mb * 1024 * 1024),
    }
    train_model = model_helper.ModelHelper(
        name="sphereface", arg_scope=train_arg_scope
    )

    num_shards = args.num_shards
    shard_id = args.shard_id

    # Expect interfaces to be comma separated.
    # Use of multiple network interfaces is not yet complete,
    # so simply use the first one in the list.
    interfaces = args.distributed_interfaces.split(",")

    # Rendezvous using MPI when run with mpirun
    if os.getenv("OMPI_COMM_WORLD_SIZE") is not None:
        num_shards = int(os.getenv("OMPI_COMM_WORLD_SIZE", 1))
        shard_id = int(os.getenv("OMPI_COMM_WORLD_RANK", 0))
        if num_shards > 1:
            rendezvous = dict(
                kv_handler=None,
                num_shards=num_shards,
                shard_id=shard_id,
                engine="GLOO",
                transport=args.distributed_transport,
                interface=interfaces[0],
                mpi_rendezvous=True,
                exit_nets=None)
        else:
            # Single process under MPI: no rendezvous needed
            rendezvous = None
    elif num_shards > 1:
        # Create rendezvous for distributed computation
        store_handler = "store_handler"
        if args.redis_host is not None:
            # Use Redis for rendezvous if Redis host is specified
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "RedisStoreHandlerCreate", [], [store_handler],
                    host=args.redis_host,
                    port=args.redis_port,
                    prefix=args.run_id,
                )
            )
        else:
            # Use filesystem for rendezvous otherwise
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "FileStoreHandlerCreate", [], [store_handler],
                    path=args.file_store_path,
                    prefix=args.run_id,
                )
            )
        rendezvous = dict(
            kv_handler=store_handler,
            shard_id=shard_id,
            num_shards=num_shards,
            engine="GLOO",
            transport=args.distributed_transport,
            interface=interfaces[0],
            exit_nets=None)
    else:
        rendezvous = None

    # Model building functions
    def create_sphereface_model_ops(model, loss_scale):
        initializer = (pFP16Initializer if args.dtype == 'float16'
                       else Initializer)
        with brew.arg_scope([brew.conv, brew.fc],
                            WeightInitializer=initializer,
                            BiasInitializer=initializer,
                            enable_tensor_core=args.enable_tensor_core):
            pred = sphereface.create_net(
                model,
                "data",
                "label",
                in_dim=args.num_channels,
                class_num=args.num_labels,
                feature_dim=args.feature_dim,
                is_test=False,
                no_loss=True,
                fp16_data=True if args.dtype == 'float16' else False,
            )

        if args.dtype == 'float16':
            pred = model.net.HalfToFloat(pred, pred + '_fp32')

        softmax, loss = model.SoftmaxWithLoss([pred, 'label'],
                                              ['softmax', 'loss'])
        loss = model.Scale(loss, scale=loss_scale)
        brew.accuracy(model, [softmax, "label"], "accuracy")
        return [loss]

    def add_optimizer(model):
        # stepsz = int(30 * args.epoch_size / total_batch_size / num_shards)
        stepsz = 1
        if args.dtype == 'float16':
            opt = optimizer.build_fp16_sgd(
                model,
                args.base_learning_rate,
                momentum=0.9,
                nesterov=1,
                weight_decay=args.weight_decay,  # weight decay included
                policy="step",
                stepsize=stepsz,
                gamma=0.9999
            )
        else:
            optimizer.add_weight_decay(model, args.weight_decay)
            opt = optimizer.build_multi_precision_sgd(
                model,
                args.base_learning_rate,
                momentum=0.9,
                nesterov=1,
                policy="step",
                stepsize=stepsz,
                gamma=0.9999
            )
        return opt

    # Define the add_image_input function.
    # Depends on the "train_data" argument.
    # Note that the reader will be shared between all GPUs.
    if args.train_data == "null":
        def add_image_input(model):
            AddNullInput(
                model,
                None,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
            )
    else:
        reader = train_model.CreateDB(
            "reader",
            db=args.train_data,
            db_type=args.db_type,
            num_shards=num_shards,
            shard_id=shard_id,
        )

        def add_image_input(model):
            AddImageInput(
                model,
                reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
                is_test=False,
            )

    def add_post_sync_ops(model):
        """Add ops applied after initial parameter sync."""
        for param_info in model.GetOptimizationParamInfo(model.GetParams()):
            if param_info.blob_copy is not None:
                model.param_init_net.HalfToFloat(
                    param_info.blob,
                    param_info.blob_copy[core.DataType.FLOAT]
                )

    # Create parallelized model
    data_parallel_model.Parallelize(
        train_model,
        input_builder_fun=add_image_input,
        forward_pass_builder_fun=create_sphereface_model_ops,
        optimizer_builder_fun=add_optimizer,
        post_sync_builder_fun=add_post_sync_ops,
        devices=gpus,
        rendezvous=rendezvous,
        optimize_gradient_memory=True,
        cpu_device=args.use_cpu,
        shared_model=args.use_cpu,
    )

    workspace.RunNetOnce(train_model.param_init_net)
    workspace.CreateNet(train_model.net)

    # Add test model, if specified
    test_model = None
    if args.test_data is not None:
        log.info("----- Create test net ----")
        test_arg_scope = {
            'order': "NCHW",
            'use_cudnn': True,
            'cudnn_exhaustive_search': True,
        }
        test_model = model_helper.ModelHelper(
            name="sphereface_test",
            arg_scope=test_arg_scope,
            init_params=False
        )

        test_reader = test_model.CreateDB(
            "test_reader",
            db=args.test_data,
            db_type=args.db_type,
        )

        def test_input_fn(model):
            AddImageInput(
                model,
                test_reader,
                batch_size=batch_per_device,
                img_size=args.image_size,
                dtype=args.dtype,
                is_test=True,
            )

        data_parallel_model.Parallelize(
            test_model,
            input_builder_fun=test_input_fn,
            forward_pass_builder_fun=create_sphereface_model_ops,
            post_sync_builder_fun=add_post_sync_ops,
            param_update_builder_fun=None,
            devices=gpus,
            cpu_device=args.use_cpu,
        )
        workspace.RunNetOnce(test_model.param_init_net)
        workspace.CreateNet(test_model.net)

        graph = net_drawer2.GetPydotGraphMinimal(test_model.net.Proto(),
                                                 "sphereface", rankdir="TB")
        graph.write(os.path.join(save_dir, "sphereface.pdf"), format='pdf')

    epoch = 0
    # Load the pre-trained model and reset the epoch
    if args.load_model_path is not None:
        LoadModel(args.load_model_path, train_model)

        # Sync the model params
        data_parallel_model.FinalizeAfterCheckpoint(train_model)

        # Reset epoch. load_model_path should end with *_X.mdl,
        # where X is the epoch number.
        last_str = args.load_model_path.split('_')[-1]
        if last_str.endswith('.mdl'):
            epoch = int(last_str[:-4])
            log.info("Reset epoch to {}".format(epoch))
        else:
            log.warning("The format of load_model_path doesn't match!")

    expname = "sphereface_gpu%d_b%d_L%d_lr%.2f_v2" % (
        args.num_gpus,
        total_batch_size,
        args.num_labels,
        args.base_learning_rate,
    )

    explog = experiment_util.ModelTrainerLog(os.path.join(save_dir, expname),
                                             args)

    kernel_fig, plt_kernel = plt.subplots(nrows=4, ncols=5, figsize=(14, 14))
    loss_fig, plt_loss = plt.subplots(1)
    plt.tight_layout(h_pad=0, w_pad=0)
    plt.ion()

    iterations = 0
    old_x = 0
    old_loss = 0
    old_acc = 0
    while epoch < args.num_epochs or args.test_data_type != 'VAL':
        epoch, epoch_loss, epoch_accuracy = RunEpoch(
            args,
            epoch,
            train_model,
            test_model,
            total_batch_size,
            num_shards,
            explog,
            plt_kernel
        )

        x = list(range(iterations, iterations + len(epoch_loss)))
        x.insert(0, old_x)
        epoch_loss.insert(0, old_loss)
        epoch_accuracy.insert(0, old_acc)
        plt_loss.plot(x, epoch_loss, 'b')
        plt_loss.plot(x, epoch_accuracy, 'r')
        iterations += len(epoch_loss)
        old_x = iterations - 2
        old_loss = epoch_loss[-1]
        old_acc = epoch_accuracy[-1]

        log.info('Save checkpoint {}'.format(epoch))
        model_path = '{:s}/{:s}_{:d}.mdl'.format(save_dir,
                                                 args.save_model_name,
                                                 epoch)
        SaveModel(args, train_model, model_path)

        if DEBUG_TRAINING:
            kernel_fig_path = '%s/%s_%d.jpg' % (save_dir, 'activation', epoch)
            loss_fig_path = '%s/%s_%d.jpg' % (save_dir, 'loss', epoch)
            kernel_fig.savefig(kernel_fig_path)
            loss_fig.savefig(loss_fig_path)
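# Usage sketch (illustrative, not part of the trainer): Train() expects an
# argparse-style namespace carrying the flags referenced above. The values
# below are assumptions for a single-node, 2-GPU run with synthetic data;
# a real script would build `args` from its own argument parser.
from argparse import Namespace

example_args = Namespace(
    file_store_path="/tmp/sphereface",      # checkpoint / rendezvous dir
    gpus="0,1", num_gpus=2,
    batch_size=64, epoch_size=256000, num_epochs=10,
    num_shards=1, shard_id=0,
    distributed_interfaces="", distributed_transport="tcp",
    redis_host=None, redis_port=6379, run_id="example",
    cudnn_workspace_limit_mb=64,
    dtype='float', enable_tensor_core=False,
    num_channels=3, num_labels=10572, feature_dim=512,
    base_learning_rate=0.1, weight_decay=5e-4,
    train_data="null", test_data=None, db_type="lmdb",
    image_size=112, test_data_type='VAL',
    load_model_path=None, save_model_name="sphereface",
    use_cpu=False,
)
# Train(example_args)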
def run_model(self, devices, gpu):
    '''
    Helper function for test_equiv
    '''
    def input_builder_fun(model):
        return None

    def model_build_fun(model, loss_scale):
        fc = model.FC("data", "fc", 16, 1,
                      ("ConstantFill", {}), ("ConstantFill", {}))
        fc_fl = model.FlattenToVec(fc, "fc_fl")
        sigm = model.Sigmoid(fc_fl, "sigm")
        sq = model.SquaredL2Distance([sigm, "label"], "sq")
        loss = model.AveragedLoss(sq, "loss")
        loss = model.Scale(loss, scale=loss_scale)

        # For testing explicit sync
        model.param_init_net.UniformFill([], ["sync_num"], shape=[1])
        return [loss]

    def add_optimizer(model):
        return optimizer.build_sgd(
            model,
            0.1,
            policy="fixed",
            max_gradient_norm=5.0,
            allow_lr_injection=True,
        )

    workspace.ResetWorkspace()
    model = cnn.CNNModelHelper(
        order="NHWC",
        name="test{}".format(devices),
    )
    data_parallel_model.Parallelize(
        model,
        input_builder_fun=input_builder_fun,
        forward_pass_builder_fun=model_build_fun,
        optimizer_builder_fun=add_optimizer,
        devices=devices,
        cpu_device=not gpu,
        shared_model=not gpu,
    )
    data_parallel_model.AddBlobSync(model, ["sync_num"])

    # Light test for LR names
    lr_names = data_parallel_model.GetLearningRateBlobNames(model)
    self.assertGreater(len(lr_names), 0)

    np.random.seed(2603)

    # Each run has the same input, independent of the number of GPUs
    batch_size = 64
    for i in range(0, 10):
        full_data = np.random.rand(batch_size, 16)
        full_labels = np.round(full_data[:, 0])
        batch_per_device = batch_size // len(devices)

        for (j, g) in enumerate(devices):
            st = j * batch_per_device
            en = st + batch_per_device
            data = full_data[st:en, :].astype(np.float32)
            labels = full_labels[st:en].astype(np.float32)
            with core.DeviceScope(core.DeviceOption(model._device_type, g)):
                workspace.FeedBlob(
                    "{}_{}/data".format(model._device_prefix, g), data)
                workspace.FeedBlob(
                    "{}_{}/label".format(model._device_prefix, g), labels)

        if i == 0:
            workspace.RunNetOnce(model.param_init_net)
            workspace.CreateNet(model.net)

        workspace.FeedBlob(
            model._device_prefix + "_0/sync_num",
            np.array([i * 2]).astype(np.float32),
            device_option=core.DeviceOption(model._device_type, 0))
        workspace.RunNet(model.net.Proto().name)

        # Test AddBlobSync
        for j in model._devices:
            sync = workspace.FetchBlob(
                model._device_prefix + "_{}/sync_num".format(j))[0]
            self.assertTrue(abs(sync - i * 2) < 0.01)

    return workspace.FetchBlob("{}_0/fc_w".format(model._device_prefix))
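# Sketch of how run_model supports an equivalence test (the suite's actual
# test_equiv may differ): because every run seeds numpy identically and
# splits the same global batch across devices, the returned fc_w weights
# should agree regardless of the device count.
def test_equiv_sketch(self):
    result_2gpus = self.run_model([0, 1], gpu=True)
    result_1gpu = self.run_model([0], gpu=True)
    np.testing.assert_allclose(result_1gpu, result_2gpus, atol=1e-5)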
def benchmark_training(model, opts):
    """Runs N training batches and returns array of batch times in seconds.

    For some implementation details, see
    https://caffe2.ai/docs/SynchronousSGD.html.

    :param model: Caffe2's model helper class instance.
    :type model: :py:class:`caffe2.python.model_helper.ModelHelper`
    :param dict opts: Options for the training benchmark. Must contain\
        `device`, `num_gpus` if device is gpu, `enable_tensor_core`,\
        `num_warmup_batches` and `num_batches`.
    :return: Tuple of model title and numpy array containing batch times.
    :rtype: (string, numpy array)
    """
    model_builder = ModelFactory.get_model(opts)
    assert model_builder.phase == 'training',\
        "Internal error, invalid phase was set. "\
        "Expecting 'training' but found %s" % (model_builder.phase)

    # The reader must be shared by all GPUs on one machine.
    reader = None
    if 'data_dir' in opts and opts['data_dir']:
        reader = model.CreateDB(
            "reader",
            db=opts['data_dir'],           # (str, path to training data)
            db_type=opts['data_backend'],  # (str, 'lmdb' or 'leveldb')
            num_shards=1,                  # (int, number of machines)
            shard_id=0,                    # (int, machine id)
        )

    def add_inputs(model):
        if reader is None:
            print("[INFO] Adding synthetic data input for Caffe2 training "
                  "benchmarks")
            model_builder.add_synthetic_inputs(model, add_labels=True)
        else:
            print("[INFO] Adding real data inputs (%s) for Caffe2 training "
                  "benchmarks" % (opts['data_dir']))
            model_builder.add_data_inputs(
                model,
                reader,
                use_gpu_transform=(opts['device'] == 'gpu'),
                num_decode_threads=opts['num_decode_threads']
            )

    def create_net(model, loss_scale):
        return create_model(model_builder, model, opts['enable_tensor_core'],
                            opts['float16_compute'], loss_scale)

    def add_post_sync_ops(model):
        """Add ops applied after initial parameter sync."""
        for param_info in model.GetOptimizationParamInfo(model.GetParams()):
            if param_info.blob_copy is not None:
                model.param_init_net.HalfToFloat(
                    param_info.blob,
                    param_info.blob_copy[core.DataType.FLOAT]
                )

    def add_optimizer(model):
        return build_optimizer(model,
                               float16_compute=opts['float16_compute'])

    if opts['device'] == 'gpu':
        rendezvous = setup_rendezvous(opts)
        print("rendezvous: %s" % str(rendezvous))
        dpm.Parallelize(
            model,
            input_builder_fun=add_inputs,
            forward_pass_builder_fun=create_net,
            optimizer_builder_fun=add_optimizer,
            # param_update_builder_fun=Model.add_parameter_update_ops,
            post_sync_builder_fun=add_post_sync_ops,
            devices=range(opts['num_gpus']),
            rendezvous=rendezvous,
            optimize_gradient_memory=True,
            cpu_device=(opts['device'] == 'cpu'),
            shared_model=(opts['device'] == 'cpu')
        )
    else:
        with core.DeviceScope(Model.get_device_option(gpu=None)):
            add_inputs(model)
            losses = create_net(model, 1.0)
            blobs_to_gradients = model.AddGradientOperators(losses)
            Model.add_parameter_update_ops(model)
        Model.optimize_gradient_memory(model,
                                       [blobs_to_gradients[losses[0]]])

    workspace.RunNetOnce(model.param_init_net)
    workspace.CreateNet(model.net)
    return (model_builder.name,
            run_n_times(model, opts['num_warmup_batches'],
                        opts['num_batches']))
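# Illustrative opts for benchmark_training, covering the keys named in the
# docstring plus the data-related ones read above. Values are assumptions;
# ModelFactory.get_model() will typically require further keys (e.g. the
# model name) that depend on the surrounding benchmark harness.
example_opts = {
    'device': 'gpu',
    'num_gpus': 2,
    'enable_tensor_core': False,
    'float16_compute': False,
    'num_warmup_batches': 10,
    'num_batches': 100,
    'num_decode_threads': 1,
    # Omit 'data_dir' (or leave it falsy) to fall back to synthetic inputs.
}
# name, batch_times = benchmark_training(model, example_opts)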
def _spatial_bn_check(self, device_type, seed, batch_size):
    devices = [0, 1, 2]
    epsilon = 1e-3
    tolerance = 1e-3

    def _test_forward_pass(x, devices, device_type, scale, bias, epsilon):
        x_concat = np.concatenate(x)
        mean = np.mean(x_concat, axis=0)
        var = np.var(x_concat, axis=0)
        for device in devices:
            x_i = x[device]
            x_hat = (x_i - mean) / (np.sqrt(var + epsilon))
            expected_out = scale * x_hat + bias
            spatial_out = workspace.FetchBlob(
                "{}_{}/sp_out".format(device_type, device))
            rel_error = np.linalg.norm(spatial_out - expected_out) \
                / np.linalg.norm(expected_out)
            self.assertTrue(rel_error < 0.005)

    def _test_backward_pass(x, devices, device_type, scale, tolerance):
        dBias_arr = []
        dY_arr = []
        dGamma_arr = []
        num_devices = len(devices)
        mean = np.array(workspace.FetchBlob(
            "{}_0/sp_out_sm".format(device_type)), dtype=np.float32)
        inv_var = np.array(workspace.FetchBlob(
            "{}_0/sp_out_siv".format(device_type)), dtype=np.float32)

        # dBias
        # Sum dBias values over all devices to find the average gradient
        for device in devices:
            dY_blob = workspace.FetchBlob(
                "{}_{}/sp_out_grad".format(device_type, device))
            dY = np.array(dY_blob, dtype=np.float32)
            dY_arr.append(dY)
            dBias_arr.append(np.array(np.sum(dY, axis=0), dtype=np.float32))
        dBias = np.sum(dBias_arr, dtype=np.float32)
        dBias_avg = dBias / num_devices
        for device in devices:
            dBiasActual = np.sum(workspace.FetchBlob(
                "{}_{}/sp_out_b_grad".format(device_type, device)),
                dtype=np.float32)
            self.assertTrue(
                np.isclose([dBiasActual], [dBias], atol=tolerance))

        # dGamma
        # Sum dGamma values over all devices to find the average gradient
        for device in devices:
            dGamma = np.sum((x[device] - mean) * inv_var * dY_arr[device],
                            axis=0, dtype=np.float32)
            dGamma_arr.append(dGamma)
        dGamma = np.sum(dGamma_arr, axis=0, dtype=np.float32)
        dGamma_avg = dGamma / num_devices
        for device in devices:
            dGammaActual = workspace.FetchBlob(
                "{}_{}/sp_out_s_grad".format(device_type, device))
            self.assertTrue(
                np.isclose([dGamma], [dGammaActual], atol=tolerance))

        # dX
        scale_inv_var = scale * inv_var / batch_size
        for device in devices:
            dX = scale_inv_var * (
                dY_arr[device] * batch_size - dBias_avg -
                (x[device] - mean) * dGamma_avg * inv_var)
            dX_actual = workspace.FetchBlob(
                "{}_{}/tanh_grad".format(device_type, device))
            self.assertTrue(
                np.isclose([dX], [dX_actual], atol=tolerance).all())

    def add_input_ops(model):
        for device in devices:
            data = np.random.rand(batch_size, 1, 1, 1).astype(np.float32)
            workspace.FeedBlob("{}_{}/data".format(device_type, device),
                               data)

    def add_model_ops(model, loss_scale):
        model.Tanh("data", "tanh")
        model.SpatialBN("tanh", "sp_out", 1, epsilon=epsilon, is_test=False)
        model.Sqr("sp_out", "sqr")
        loss = model.SumElements("sqr", "loss")
        return [loss]

    def add_optimizer(model):
        return optimizer.build_sgd(model, 0.1)

    np.random.seed(seed)
    workspace.ResetWorkspace()
    model = cnn.CNNModelHelper(order="NCHW", name="test")
    data_parallel_model.Parallelize(
        model,
        input_builder_fun=add_input_ops,
        forward_pass_builder_fun=add_model_ops,
        optimizer_builder_fun=add_optimizer,
        devices=devices,
        cpu_device=device_type == "cpu",
        shared_model=False,
        combine_spatial_bn=True,
    )

    workspace.RunNetOnce(model.param_init_net)
    scale = workspace.FetchBlob("{}_0/sp_out_s".format(device_type))
    bias = workspace.FetchBlob("{}_0/sp_out_b".format(device_type))

    workspace.RunNetOnce(model.net)
    x = []
    for device in devices:
        x_blob = workspace.FetchBlob(
            "{}_{}/tanh".format(device_type, device))
        x_i = np.array(x_blob, dtype=np.float32)
        x.append(x_i)
    _test_forward_pass(x, devices, device_type, scale, bias, epsilon)
    _test_backward_pass(x, devices, device_type, scale, tolerance)
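# Hypothetical wrappers showing how the parameterized check above might be
# driven from individual test cases (method names are illustrative):
def test_combine_spatial_bn_cpu(self):
    self._spatial_bn_check("cpu", seed=0, batch_size=32)

def test_combine_spatial_bn_gpu(self):
    self._spatial_bn_check("gpu", seed=0, batch_size=32)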
def network_eval(args):
    """
    Runs network benchmarking on either a single node or multiple nodes
    """
    # Define some parameters for the model instantiation
    if args.use_ideep:
        train_arg_scope = {
            'use_cudnn': False,
            'cudnn_exhaustive_search': False,
            'training_mode': 1
        }
    else:
        train_arg_scope = {
            'order': 'NCHW',
            'use_cudnn': True,
            'cudnn_exhaustive_search': True,
            # 1048576 = 2 ^ 20 (1 MB)
            'ws_nbytes_limit': (args.cudnn_ws_lim * 1048576),
        }
    # Create the model for evaluation
    evaluation_model = model_helper.ModelHelper(
        name='evaluation_model', arg_scope=train_arg_scope)
    evaluation_model.Proto().num_workers = 16

    # Default the model for accuracy testing to None
    accuracy_time_model = None

    # Compute batch and epoch sizes
    # Per CPU / GPU batch size
    per_local_device_batch = (
        args.batch_size // len(args.gpu_devices)
    ) if args.gpu_devices else args.batch_size
    # Total batch size (over all the devices)
    global_batch_size = args.batch_size * args.num_shards
    # Number of epoch iterations
    epoch_iters = args.epoch_size // global_batch_size
    # Adjust the true number of examples per epoch
    args.epoch_size = global_batch_size * epoch_iters

    if args.training_data:
        log.info("Running experiments with user provided data: %s",
                 args.training_data)

        # Create a reader, which can also help distribute data when running
        # on multiple nodes
        reader = evaluation_model.CreateDB(
            "reader",
            db=args.training_data,
            db_type=args.db_type,
            num_shards=args.num_shards,
            shard_id=args.shard_id,
        )

        def image_input(model):
            AddImageInput(model, reader, per_local_device_batch,
                          min(args.height, args.width), args.data_type,
                          args.use_cpu)
    else:
        input_shape = [args.batch_size, args.channels, args.height,
                       args.width]
        log.info("Running experiments with synthetic data w/ shape: %s",
                 input_shape)

        def image_input(model):
            AddSyntheticInput(model, args.data_type, input_shape,
                              args.num_labels)

    # Create the network, and normalize the loss
    def create_model(model, loss_scale):
        initializer = (PseudoFP16Initializer if args.data_type == 'float16'
                       else Initializer)

        with brew.arg_scope([brew.conv, brew.fc],
                            WeightInitializer=initializer,
                            BiasInitializer=initializer,
                            enable_tensor_core=False,
                            float16_compute=False):
            pred = resnet.create_resnet50(
                model,
                "data",
                num_input_channels=args.channels,
                num_labels=args.num_labels,
                # num_groups=args.resnext_num_groups,
                # num_width_per_group=args.resnext_width_per_group,
                no_bias=True,
                no_loss=True)

        # If we're using 2-byte float16 data, inflate the predictions to the
        # 4-byte float32 representation
        if args.data_type == 'float16':
            pred = model.net.HalfToFloat(pred, pred + '_fp32')

        # Compute the softmax probabilities and the loss
        softmax, loss = model.SoftmaxWithLoss([pred, 'label'],
                                              ['softmax', 'loss'])

        # Normalize the loss, and compute the top-k accuracies for
        # k in {1, 5}
        loss = model.Scale(loss, scale=loss_scale)
        brew.accuracy(model, [softmax, "label"], "accuracy", top_k=1)
        brew.accuracy(model, [softmax, "label"], "accuracy_top5", top_k=5)
        return [loss]

    def add_optimizer(model):
        """
        Optimizer function called once for the entire model, as opposed to
        once for each CPU / GPU individually. The optimizer is SGD with a
        stepwise learning-rate decay and weight decay.

        :return: the optimizer
        """
        stepsz = int(30 * args.epoch_size / args.batch_size /
                     args.num_shards)
        stepsz = stepsz if stepsz else 100

        optimizer.add_weight_decay(model, 1e-4)
        # opt = optimizer.build_multi_precision_sgd(
        opt = optimizer.build_sgd(model,
                                  args.base_learning_rate,
                                  momentum=0.9,
                                  nesterov=1,
                                  policy="step",
                                  stepsize=stepsz,
                                  gamma=0.1)
        return opt

    def add_parameter_update(model):
        """
        Add a simple gradient-based parameter update with stepwise adaptive
        learning rate.
        """
        # This counts the number of iterations we are making
        ITER = brew.iter(model, "iter")
        # Add a learning rate to the model, updated using a simple step
        # policy every 1000 steps; gamma is an update parameter
        LR = model.LearningRate(ITER, "LR",
                                base_lr=-args.base_learning_rate,
                                policy="step", stepsize=1000, gamma=0.999)
        # This is a constant used in the following loop
        ONE = model.param_init_net.ConstantFill([], "ONE", shape=[1],
                                                value=1.0)
        # Here we are essentially applying the gradients to the weights
        # (using the classical method)
        for param in model.params:
            param_grad = model.param_to_grad[param]
            model.WeightedSum([param, ONE, param_grad, LR], param)

    def add_post_sync_ops(model):
        """
        Add ops applied after initial parameter sync.
        """
        for param_info in model.GetOptimizationParamInfo(model.GetParams()):
            if param_info.blob_copy is not None:
                model.param_init_net.HalfToFloat(
                    param_info.blob,
                    param_info.blob_copy[core.DataType.FLOAT])

    if args.num_shards > 1:
        log.info("Distributed benchmarking is enabled")
        log.info("Num shards: %d", args.num_shards)
        log.info("My shard ID: %d", args.shard_id)
        if args.redis_host:
            log.info("Using Redis server at %s:%d", args.redis_host,
                     args.redis_port)
        else:
            log.info("Rendezvous at: %s", args.rendezvous_path)

        # Prepare the required parameters for distribution
        store_handler = "store_handler"

        # Use Redis for rendezvous if a host was given, otherwise fall back
        # to the shared file system
        if args.redis_host:
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "RedisStoreHandlerCreate", [], [store_handler],
                    host=args.redis_host,
                    port=args.redis_port,
                    prefix=args.run_id,
                ))
        else:
            workspace.RunOperatorOnce(
                core.CreateOperator(
                    "FileStoreHandlerCreate", [], [store_handler],
                    path=args.rendezvous_path,
                    prefix=args.run_id,
                ))

        rendezvous = dict(kv_handler=store_handler,
                          shard_id=args.shard_id,
                          num_shards=args.num_shards,
                          engine="GLOO",
                          transport=args.distributed_transport,
                          interface=args.network_interface,
                          exit_nets=None)

        # Parallelize the model (data parallel)
        data_parallel_model.Parallelize(
            evaluation_model,
            input_builder_fun=image_input,
            forward_pass_builder_fun=create_model,
            optimizer_builder_fun=None if not args.backward else (
                add_optimizer if not args.per_device_optimization
                else None),
            param_update_builder_fun=None if not args.backward else (
                add_parameter_update if args.per_device_optimization
                else None),
            post_sync_builder_fun=add_post_sync_ops if args.post_sync
            else None,
            devices=(args.gpu_devices if not args.use_cpu else [0]),
            rendezvous=rendezvous,
            # broadcast_computed_params is accepted by Parallelize but is
            # currently not implemented in Caffe2's source code
            broadcast_computed_params=args.broadcast_params,
            optimize_gradient_memory=args.optimize_gradient_memory,
            dynamic_memory_management=args.dynamic_memory_management,
            max_concurrent_distributed_ops=args.max_distributed_ops,
            num_threads_per_device=args.max_threads,
            use_nccl=args.use_nccl,
            cpu_device=args.use_cpu,
            ideep=args.use_ideep,
            shared_model=args.shared_model,
            combine_spatial_bn=args.use_cpu,
        )

        if args.backward:
            data_parallel_model.OptimizeGradientMemory(evaluation_model,
                                                       {}, set(), False)

        instantiate_and_create_net(evaluation_model)

        # If we're testing for the time it takes to reach a particular
        # accuracy, then we'll need to create a new model just for this
        if args.test_accuracy:
            # Test for the existence of testing data
            assert args.testing_data, "We must have testing data if " \
                "we're measuring the time to accuracy"

            log.info("We're running time to test accuracy")
            log.info("The accuracy we're looking for: %f",
                     args.target_accuracy)
            log.info("Testing data provided in: %s", args.testing_data)

            # Create the model
            if args.use_ideep:
                test_arg_scope = {
                    'use_cudnn': False,
                    'cudnn_exhaustive_search': False,
                }
            else:
                test_arg_scope = {
                    'order': 'NCHW',
                    'use_cudnn': True,
                    'cudnn_exhaustive_search': True,
                }

            accuracy_time_model = model_helper.ModelHelper(
                name='accuracy_time_model', arg_scope=test_arg_scope,
                init_params=False)

            # Create the input function
            # Create a reader, which can also help distribute data when
            # running on multiple nodes
            test_reader = accuracy_time_model.CreateDB(
                "test_reader", db=args.testing_data, db_type=args.db_type)

            def test_image_input(model):
                AddImageInput(model, test_reader, per_local_device_batch,
                              min(args.height, args.width), args.data_type,
                              args.use_cpu, is_test=True)

            # Create the test model itself
            data_parallel_model.Parallelize(
                accuracy_time_model,
                input_builder_fun=test_image_input,
                forward_pass_builder_fun=create_model,
                post_sync_builder_fun=add_post_sync_ops if args.post_sync
                else None,
                param_update_builder_fun=None,
                devices=(args.gpu_devices if not args.use_cpu else [0]),
                cpu_device=args.use_cpu)

            instantiate_and_create_net(accuracy_time_model)
    else:
        print("Single node benchmarking is enabled")

        # Build the training model
        if args.use_cpu:
            image_input(evaluation_model)
            create_model(evaluation_model, 1.0)
            if args.backward:
                evaluation_model.AddGradientOperators(["loss"])
                add_parameter_update(evaluation_model)
        else:
            # We're running this on a single GPU on a single node, so create
            # the net under that GPU's device scope
            with core.DeviceScope(
                    core.DeviceOption(caffe2_pb2.CUDA,
                                      args.gpu_devices[0])):
                image_input(evaluation_model)
                create_model(evaluation_model, 1.0)
                if args.backward:
                    evaluation_model.AddGradientOperators(["loss"])
                    add_parameter_update(evaluation_model)

        instantiate_and_create_net(evaluation_model)

        if args.test_accuracy:
            # Test for the existence of testing data
            assert args.testing_data, "We must have testing data if " \
                "we're measuring the time to accuracy"

            log.info("We're running time to test accuracy")
            log.info("The accuracy we're looking for: %f",
                     args.target_accuracy)
            log.info("Testing data provided in: %s", args.testing_data)

            # Create the model
            if args.use_ideep:
                test_arg_scope = {
                    'use_cudnn': False,
                    'cudnn_exhaustive_search': False,
                }
            else:
                test_arg_scope = {
                    'order': 'NCHW',
                    'use_cudnn': True,
                    'cudnn_exhaustive_search': True,
                }

            accuracy_time_model = model_helper.ModelHelper(
                name='accuracy_time_model', arg_scope=test_arg_scope,
                init_params=False)

            # Create the input function
            # Create a reader, which can also help distribute data when
            # running on multiple nodes
            test_reader = accuracy_time_model.CreateDB(
                "test_reader", db=args.testing_data, db_type=args.db_type)

            def test_image_input(model):
                AddImageInput(model, test_reader, per_local_device_batch,
                              min(args.height, args.width), args.data_type,
                              args.use_cpu, is_test=True)

            # Create the test model itself
            test_image_input(accuracy_time_model)
            create_model(accuracy_time_model, 1.0)
            instantiate_and_create_net(accuracy_time_model)

    if not args.test_accuracy:
        workspace.BenchmarkNet(evaluation_model.net.Proto().name,
                               args.warmup_rounds, args.eval_rounds,
                               args.per_layer_eval)
    else:
        # Create a log for time-to-accuracy testing
        expname = "time_to_acc_model_%s_gpu%d_b%d_L%d_lr%.2f_shard%d" % (
            args.model_name,
            len(args.gpu_devices) if not args.use_cpu else 1,
            args.batch_size,
            args.num_labels,
            args.base_learning_rate,
            args.shard_id)

        explog = experiment_util.ModelTrainerLog(expname, args)

        # Run the epochs
        elapsed_training_time = 0.0
        for i in range(args.epoch_count):
            elapsed_training_time, on_target = RunEpoch(
                args, i, evaluation_model, accuracy_time_model, explog,
                elapsed_training_time)

            if args.terminate_on_target and on_target:
                log.info("Have reached the target accuracy: {} in {} "
                         "seconds.".format(args.target_accuracy,
                                           elapsed_training_time))
                break
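# The builder selection passed to Parallelize in the distributed branch of
# network_eval is easy to misread, so this helper restates the same decision
# (a summary sketch, not new behavior; the function name is illustrative):
def _select_builders_sketch(args, add_optimizer, add_parameter_update):
    """Return (optimizer_builder_fun, param_update_builder_fun)."""
    if not args.backward:
        return None, None                    # forward-only benchmarking
    if args.per_device_optimization:
        return None, add_parameter_update    # per-device SGD update
    return add_optimizer, None               # single global optimizer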