def resnet_cifar10(train_data, test_data, mean_data, network_name, epoch_size, num_quantization_bits=32,
                   block_size=None, warm_up=0, max_epochs=160, restore=True, log_to_file=None,
                   num_mbs_per_log=None, gen_heartbeat=False, scale_up=False, profiling=False, fp16=False):
    set_computation_network_trace_level(0)

    # NOTE: scaling up minibatch_size increases sample throughput. On an 8-GPU machine,
    # ResNet110 samples-per-second is ~7x that of a single GPU, compared to ~3x without
    # scaling up. However, a bigger minibatch size on the same number of samples means
    # fewer updates and thus a higher training error. This is a speed/accuracy trade-off.
    minibatch_size = 128 * (Communicator.num_workers() if scale_up else 1)

    progress_printer = ProgressPrinter(
        freq=num_mbs_per_log,
        tag='Training',
        log_to_file=log_to_file,
        rank=Communicator.rank(),
        gen_heartbeat=gen_heartbeat,
        num_epochs=max_epochs)

    network = create_resnet_network(network_name, fp16)
    trainer = create_trainer(network, minibatch_size, epoch_size, num_quantization_bits, block_size, warm_up,
                             progress_printer)
    train_source = create_image_mb_source(train_data, mean_data, train=True,
                                          total_number_of_samples=max_epochs * epoch_size)
    test_source = create_image_mb_source(test_data, mean_data, train=False,
                                         total_number_of_samples=C.io.FULL_DATA_SWEEP)
    train_and_test(network, trainer, train_source, test_source, minibatch_size, epoch_size, restore, profiling)
def parse_arguments(cfg):
    parser = argparse.ArgumentParser()
    parser.add_argument('-datadir', '--datadir',
                        help='Data directory where the ImageNet dataset is located',
                        required=False, default=cfg["DATA"].MAP_FILE_PATH)
    parser.add_argument('-outputdir', '--outputdir',
                        help='Output directory for checkpoints and models',
                        required=False, default=None)
    parser.add_argument('-logdir', '--logdir', help='Log file', required=False, default=None)
    parser.add_argument('-n', '--num_epochs', help='Total number of epochs to train',
                        type=int, required=False, default=cfg["CNTK"].MAX_EPOCHS)
    parser.add_argument('-m', '--minibatch_size', help='Minibatch size',
                        type=int, required=False, default=cfg.MB_SIZE)
    parser.add_argument('-e', '--epoch_size', help='Epoch size',
                        type=int, required=False, default=cfg["DATA"].NUM_TRAIN_IMAGES)
    parser.add_argument('-q', '--quantized_bits',
                        help='Number of quantized bits used for gradient aggregation',
                        type=int, required=False, default=32)
    parser.add_argument('-r', '--restart',
                        help='Restart from scratch (instead of restarting from the checkpoint file by default)',
                        action='store_true')
    parser.add_argument('-device', '--device', type=int,
                        help="Force to run the script on a specified device",
                        required=False, default=None)
    parser.add_argument('-lrFactor', '--lrFactor', type=float,
                        help="Scale factor for the lr schedule", required=False)
    parser.add_argument('-momentumPerMb', '--momentumPerMb', type=float,
                        help="Momentum per minibatch", required=False)
    parser.add_argument('-rndSeed', '--rndSeed', type=int, help="The random seed", required=False)
    parser.add_argument('-trainConv', '--trainConv', type=int,
                        help="Whether to train conv layers", required=False)

    args = vars(parser.parse_args())

    if args['lrFactor'] is not None:
        cfg["CNTK"].LR_FACTOR = args['lrFactor']
    if args['num_epochs'] is not None:
        cfg["CNTK"].MAX_EPOCHS = args['num_epochs']
    if args['momentumPerMb'] is not None:
        cfg.MOMENTUM_PER_MB = args['momentumPerMb']
    if args['rndSeed'] is not None:
        cfg.RND_SEED = args['rndSeed']
    if args['trainConv'] is not None:
        cfg["CNTK"].TRAIN_CONV_LAYERS = args['trainConv'] == 1
    if args['datadir'] is not None:
        cfg["DATA"].MAP_FILE_PATH = args['datadir']
    if args['outputdir'] is not None:
        cfg.OUTPUT_PATH = args['outputdir']
    if args['logdir'] is not None:
        log_dir = args['logdir']
    if args['device'] is not None:
        # Put one worker on GPU and one worker on CPU; otherwise memory consumption is too high for a single GPU.
        if Communicator.rank() == 0:
            cntk.device.try_set_default_device(cntk.device.gpu(args['device']))
        else:
            cntk.device.try_set_default_device(cntk.device.cpu())
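# Hedged usage sketch (not part of the original source): parse_arguments mutates the passed-in
# config in place, so a training entry point would typically call it once before building the
# model. The get_configuration() helper below is an assumption for illustration only.
if __name__ == '__main__':
    cfg = get_configuration()  # hypothetical helper that builds the default config object
    parse_arguments(cfg)       # command-line flags override the config defaults
    print("Training for {} epochs on data in {}".format(cfg["CNTK"].MAX_EPOCHS,
                                                        cfg["DATA"].MAP_FILE_PATH))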
def resnet_imagenet(train_data, test_data, mean_data, network_name, epoch_size, num_quantization_bits=32,
                    block_size=None, warm_up=0, max_epochs=90, restore=True, log_to_file=None,
                    num_mbs_per_log=100, gen_heartbeat=False, scale_up=False, profiling=False):
    set_computation_network_trace_level(0)

    # NOTE: scaling up minibatch_size increases sample throughput. On an 8-GPU machine,
    # ResNet110 samples-per-second is ~7x that of a single GPU, compared to ~3x without
    # scaling up. However, a bigger minibatch size on the same number of samples means
    # fewer updates and thus a higher training error. This is a speed/accuracy trade-off.
    minibatch_size = 32 * (Communicator.num_workers() if scale_up else 1)

    progress_printer = ProgressPrinter(
        freq=num_mbs_per_log,
        tag='Training',
        log_to_file=log_to_file,
        rank=Communicator.rank(),
        gen_heartbeat=gen_heartbeat,
        num_epochs=max_epochs)

    network = create_resnet_network(network_name)
    trainer = create_trainer(network, minibatch_size, epoch_size, num_quantization_bits, block_size, warm_up,
                             progress_printer)
    train_source = create_image_mb_source(train_data, mean_data, train=True,
                                          total_number_of_samples=max_epochs * epoch_size)
    test_source = create_image_mb_source(test_data, mean_data, train=False,
                                         total_number_of_samples=C.io.FULL_DATA_SWEEP)
    train_and_test(network, trainer, train_source, test_source, minibatch_size, epoch_size, restore, profiling)
if not os.path.isdir(data_path):
    raise RuntimeError("Directory %s does not exist" % data_path)

mean_data = os.path.join(abs_path, 'ImageNet1K_mean.xml')
train_data = os.path.join(data_path, 'train_map.txt')
test_data = os.path.join(data_path, 'val_map.txt')

num_quantization_bits = args['quantized_bits']
epochs = args['epochs']
warm_up = args['distributed_after']
network_name = args['network']
scale_up = bool(args['scale_up'])

# Create the distributed trainer and run ImageNet training.
print("Start training: quantize_bit = {}, epochs = {}, distributed_after = {}".format(
    num_quantization_bits, epochs, warm_up))

resnet_imagenet(train_data, test_data, mean_data, network_name, epoch_size,
                num_quantization_bits,
                block_size=args['block_samples'],
                warm_up=args['distributed_after'],
                max_epochs=epochs,
                restore=not args['restart'],
                scale_up=scale_up,
                log_to_file=args['logdir'],
                profiling=args['profile'])

# MPI must be finalized when the process exits without exceptions.
Communicator.finalize()
test_data = os.path.join(data_path, 'test_map.txt')

num_quantization_bits = args['quantized_bits']
epochs = args['epochs']
warm_up = args['distributed_after']
network_name = args['network']
scale_up = bool(args['scale_up'])

# Create the distributed trainer and run CIFAR-10 training.
print("Start training: quantize_bit = {}, epochs = {}, distributed_after = {}".format(
    num_quantization_bits, epochs, warm_up))

resnet_cifar10(train_data, test_data, mean_data, network_name, epoch_size,
               num_quantization_bits,
               block_size=args['block_samples'],
               warm_up=args['distributed_after'],
               max_epochs=epochs,
               restore=not args['restart'],
               scale_up=scale_up,
               log_to_file=args['logdir'],
               profiling=args['profile'],
               fp16=args['fp16'])

# MPI must be finalized when the process exits without exceptions.
Communicator.finalize()
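# Hedged usage note (not in the original source): distributed CNTK scripts like this one are
# normally launched under MPI so that Communicator.num_workers() reflects the worker count.
# The script name below is an assumption for illustration only.
#
#   mpiexec -n 4 python TrainResNet_CIFAR10_Distributed.py
#
# Each rank runs the same training loop, gradients are aggregated across workers (optionally
# quantized via quantized_bits), and Communicator.finalize() shuts MPI down cleanly at the end.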
def set_global_vars(use_arg_parser=True):
    data_path = map_file_path

    # Set and overwrite learning parameters.
    globalvars['rpn_lr_factor'] = cfg["CNTK"].RPN_LR_FACTOR
    globalvars['frcn_lr_factor'] = cfg["CNTK"].FRCN_LR_FACTOR
    globalvars['e2e_lr_factor'] = cfg["CNTK"].E2E_LR_FACTOR
    globalvars['momentum_per_mb'] = cfg["CNTK"].MOMENTUM_PER_MB
    globalvars['e2e_epochs'] = 1 if cfg["CNTK"].FAST_MODE else cfg["CNTK"].E2E_MAX_EPOCHS
    globalvars['rpn_epochs'] = 1 if cfg["CNTK"].FAST_MODE else cfg["CNTK"].RPN_EPOCHS
    globalvars['frcn_epochs'] = 1 if cfg["CNTK"].FAST_MODE else cfg["CNTK"].FRCN_EPOCHS
    globalvars['rnd_seed'] = cfg.RNG_SEED
    globalvars['train_conv'] = cfg["CNTK"].TRAIN_CONV_LAYERS
    globalvars['train_e2e'] = cfg["CNTK"].TRAIN_E2E
    globalvars['fea_map_dim'] = cfg["CNTK"].FEA_MAP_DIM

    if use_arg_parser:
        parser = argparse.ArgumentParser()
        parser.add_argument('-datadir', '--datadir',
                            help='Data directory where the ImageNet dataset is located',
                            required=False, default=data_path)
        parser.add_argument('-outputdir', '--outputdir',
                            help='Output directory for checkpoints and models',
                            required=False, default=None)
        parser.add_argument('-logdir', '--logdir', help='Log file', required=False, default=None)
        parser.add_argument('-n', '--num_epochs', help='Total number of epochs to train',
                            type=int, required=False, default=cfg["CNTK"].E2E_MAX_EPOCHS)
        parser.add_argument('-m', '--minibatch_size', help='Minibatch size',
                            type=int, required=False, default=mb_size)
        parser.add_argument('-e', '--epoch_size', help='Epoch size',
                            type=int, required=False, default=epoch_size)
        parser.add_argument('-q', '--quantized_bits',
                            help='Number of quantized bits used for gradient aggregation',
                            type=int, required=False, default=32)
        parser.add_argument('-r', '--restart',
                            help='Restart from scratch (instead of restarting from the checkpoint file by default)',
                            action='store_true')
        parser.add_argument('-device', '--device', type=int,
                            help="Force to run the script on a specified device",
                            required=False, default=None)
        parser.add_argument('-rpnLrFactor', '--rpnLrFactor', type=float,
                            help="Scale factor for rpn lr schedule", required=False)
        parser.add_argument('-frcnLrFactor', '--frcnLrFactor', type=float,
                            help="Scale factor for frcn lr schedule", required=False)
        parser.add_argument('-e2eLrFactor', '--e2eLrFactor', type=float,
                            help="Scale factor for e2e lr schedule", required=False)
        parser.add_argument('-momentumPerMb', '--momentumPerMb', type=float,
                            help="Momentum per minibatch", required=False)
        parser.add_argument('-e2eEpochs', '--e2eEpochs', type=int,
                            help="Number of epochs for e2e training", required=False)
        parser.add_argument('-rpnEpochs', '--rpnEpochs', type=int,
                            help="Number of epochs for rpn training", required=False)
        parser.add_argument('-frcnEpochs', '--frcnEpochs', type=int,
                            help="Number of epochs for frcn training", required=False)
        parser.add_argument('-rndSeed', '--rndSeed', type=int, help="The random seed", required=False)
        parser.add_argument('-trainConv', '--trainConv', type=int,
                            help="Whether to train conv layers", required=False)
        parser.add_argument('-trainE2E', '--trainE2E', type=int,
                            help="Whether to train e2e (otherwise 4 stage)", required=False)

        args = vars(parser.parse_args())

        if args['rpnLrFactor'] is not None:
            globalvars['rpn_lr_factor'] = args['rpnLrFactor']
        if args['frcnLrFactor'] is not None:
            globalvars['frcn_lr_factor'] = args['frcnLrFactor']
        if args['e2eLrFactor'] is not None:
            globalvars['e2e_lr_factor'] = args['e2eLrFactor']
        if args['momentumPerMb'] is not None:
            globalvars['momentum_per_mb'] = args['momentumPerMb']
        if args['e2eEpochs'] is not None:
            globalvars['e2e_epochs'] = args['e2eEpochs']
        if args['rpnEpochs'] is not None:
            globalvars['rpn_epochs'] = args['rpnEpochs']
        if args['frcnEpochs'] is not None:
            globalvars['frcn_epochs'] = args['frcnEpochs']
        if args['rndSeed'] is not None:
            globalvars['rnd_seed'] = args['rndSeed']
        if args['trainConv'] is not None:
            globalvars['train_conv'] = args['trainConv'] == 1
        if args['trainE2E'] is not None:
            globalvars['train_e2e'] = args['trainE2E'] == 1
        if args['outputdir'] is not None:
            globalvars['output_path'] = args['outputdir']
        if args['logdir'] is not None:
            log_dir = args['logdir']
        if args['device'] is not None:
            # Put one worker on GPU and one worker on CPU; otherwise memory consumption is too high for a single GPU.
            if Communicator.rank() == 0:
                cntk.device.try_set_default_device(cntk.device.gpu(args['device']))
            else:
                cntk.device.try_set_default_device(cntk.device.cpu())
        if args['datadir'] is not None:
            data_path = args['datadir']

    if not os.path.isdir(data_path):
        raise RuntimeError("Directory %s does not exist" % data_path)

    globalvars['class_map_file'] = os.path.join(data_path, globalvars['class_map_file'])
    globalvars['train_map_file'] = os.path.join(data_path, globalvars['train_map_file'])
    globalvars['test_map_file'] = os.path.join(data_path, globalvars['test_map_file'])
    globalvars['train_roi_file'] = os.path.join(data_path, globalvars['train_roi_file'])
    globalvars['test_roi_file'] = os.path.join(data_path, globalvars['test_roi_file'])

    if cfg["CNTK"].FORCE_DETERMINISTIC:
        force_deterministic_algorithms()
    np.random.seed(seed=globalvars['rnd_seed'])

    globalvars['classes'] = parse_class_map_file(globalvars['class_map_file'])
    globalvars['num_classes'] = len(globalvars['classes'])

    if cfg["CNTK"].DEBUG_OUTPUT:
        # Report the effective parameters.
        print("Using the following parameters:")
        print("Flip image       : {}".format(cfg["TRAIN"].USE_FLIPPED))
        print("Train conv layers: {}".format(globalvars['train_conv']))
        print("Random seed      : {}".format(globalvars['rnd_seed']))
        print("Momentum per MB  : {}".format(globalvars['momentum_per_mb']))
        if globalvars['train_e2e']:
            print("E2E epochs       : {}".format(globalvars['e2e_epochs']))
        else:
            print("RPN lr factor    : {}".format(globalvars['rpn_lr_factor']))
            print("RPN epochs       : {}".format(globalvars['rpn_epochs']))
            print("FRCN lr factor   : {}".format(globalvars['frcn_lr_factor']))
            print("FRCN epochs      : {}".format(globalvars['frcn_epochs']))
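# Hedged usage sketch (not part of the original source): set_global_vars() is typically called
# once at startup; the populated globalvars dict then selects between end-to-end and 4-stage
# Faster R-CNN training. Both trainer entry points below are assumptions for illustration only.
if __name__ == '__main__':
    set_global_vars()
    if globalvars['train_e2e']:
        train_faster_rcnn_e2e(debug_output=cfg["CNTK"].DEBUG_OUTPUT)          # hypothetical e2e trainer
    else:
        train_faster_rcnn_alternating(debug_output=cfg["CNTK"].DEBUG_OUTPUT)  # hypothetical 4-stage trainer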