def create_trainer(network, epoch_size, num_epochs, minibatch_size, num_quantization_bits, progress_printer):
    # CNTK weights new gradient by (1-momentum) for unit gain,
    # thus we divide Caffe's learning rate by (1-momentum)
    initial_learning_rate = 2.0  # equal to 0.2 in caffe
    initial_learning_rate *= minibatch_size / 128
    learn_rate_adjust_interval = 2
    learn_rate_decrease_factor = 0.94

    # Set learning parameters
    lr_per_mb = []
    learning_rate = initial_learning_rate
    for i in range(0, num_epochs, learn_rate_adjust_interval):
        lr_per_mb.extend([learning_rate] * learn_rate_adjust_interval)
        learning_rate *= learn_rate_decrease_factor

    lr_schedule = cntk.learning_rate_schedule(lr_per_mb, unit=cntk.learner.UnitType.minibatch, epoch_size=epoch_size)
    mm_schedule = cntk.learner.momentum_schedule(0.9)
    l2_reg_weight = 0.0001  # CNTK L2 regularization is per sample, thus same as Caffe

    # Create learner
    local_learner = cntk.learner.momentum_sgd(network['output'].parameters, lr_schedule, mm_schedule,
                                              l2_regularization_weight=l2_reg_weight)
    parameter_learner = data_parallel_distributed_learner(
        local_learner,
        num_quantization_bits=num_quantization_bits,
        distributed_after=0)

    # Create trainer
    return cntk.Trainer(network['output'], (network['ce'], network['pe']), parameter_learner, progress_printer)

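# A standalone sketch (not part of the factory above) of how the stepped learning-rate
# list expands, assuming minibatch_size=256 and num_epochs=6 for illustration: the loop
# emits one entry per epoch and decays the rate by 0.94 every learn_rate_adjust_interval
# (here 2) epochs. Plain Python, no CNTK required.
minibatch_size, num_epochs = 256, 6          # assumed values for illustration
learning_rate = 2.0 * minibatch_size / 128   # 4.0
lr_per_mb = []
for i in range(0, num_epochs, 2):
    lr_per_mb.extend([learning_rate] * 2)
    learning_rate *= 0.94
print(lr_per_mb)  # [4.0, 4.0, 3.76, 3.76, 3.5344, 3.5344]
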
def create_trainer(network, minibatch_size, epoch_size, num_quantization_bits, block_size, warm_up):
    if network['name'] == 'resnet20':
        lr_per_mb = [1.0]*80 + [0.1]*40 + [0.01]
    elif network['name'] == 'resnet110':
        lr_per_mb = [0.1]*1 + [1.0]*80 + [0.1]*40 + [0.01]
    else:
        raise RuntimeError("Unknown model name!")

    momentum_time_constant = -minibatch_size / np.log(0.9)
    l2_reg_weight = 0.0001

    # Set learning parameters
    lr_per_sample = [lr / minibatch_size for lr in lr_per_mb]
    lr_schedule = learning_rate_schedule(lr_per_sample, epoch_size=epoch_size, unit=UnitType.sample)
    mm_schedule = momentum_as_time_constant_schedule(momentum_time_constant)

    # Learner object
    if block_size is not None and num_quantization_bits != 32:
        raise RuntimeError("Block momentum cannot be used with quantization, please remove quantized_bits option.")

    local_learner = momentum_sgd(network['output'].parameters, lr_schedule, mm_schedule,
                                 l2_regularization_weight=l2_reg_weight)

    if block_size is not None:
        learner = block_momentum_distributed_learner(local_learner, block_size=block_size)
    else:
        learner = data_parallel_distributed_learner(local_learner,
                                                    num_quantization_bits=num_quantization_bits,
                                                    distributed_after=warm_up)

    return Trainer(network['output'], (network['ce'], network['pe']), learner)

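# Quick numeric check (illustrative only) of the momentum conversion used above:
# momentum_as_time_constant_schedule expects a time constant tc such that the effective
# per-minibatch momentum is exp(-minibatch_size / tc), so -minibatch_size / log(0.9)
# recovers a momentum of 0.9. Assuming minibatch_size=128 for the example.
import numpy as np
minibatch_size = 128                                     # assumed value for illustration
momentum_time_constant = -minibatch_size / np.log(0.9)   # ~1214.9
print(np.exp(-minibatch_size / momentum_time_constant))  # 0.9
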
def create_trainer(network, epoch_size, num_quantization_bits, progress_printer):
    # Set learning parameters
    lr_per_mb = [0.01]*20 + [0.001]*20 + [0.0001]*20 + [0.00001]*10 + [0.000001]
    lr_schedule = cntk.learning_rate_schedule(lr_per_mb, unit=cntk.learner.UnitType.minibatch, epoch_size=epoch_size)
    mm_schedule = cntk.learner.momentum_schedule(0.9)
    l2_reg_weight = 0.0005  # CNTK L2 regularization is per sample, thus same as Caffe

    # Create learner. Since we reuse parameter settings (learning rate, momentum) from Caffe,
    # we set unit_gain to False to ensure consistency.
    local_learner = cntk.learner.momentum_sgd(network['output'].parameters, lr_schedule, mm_schedule,
                                              unit_gain=False, l2_regularization_weight=l2_reg_weight)
    parameter_learner = data_parallel_distributed_learner(
        local_learner,
        num_quantization_bits=num_quantization_bits,
        distributed_after=0)

    # Create trainer
    return cntk.Trainer(network['output'], (network['ce'], network['pe']), parameter_learner, progress_printer)

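# Illustrative sketch (plain Python, no CNTK) of why unit_gain matters when reusing Caffe
# hyper-parameters: with unit gain the incoming gradient is scaled by (1 - momentum), so at
# momentum 0.9 a Caffe learning rate would effectively be shrunk by 10x unless either the
# rate is rescaled or unit_gain is set to False, as done above.
def classic_momentum_step(w, v, g, lr, m):
    v = m * v + g            # Caffe-style accumulation (unit_gain=False)
    return w - lr * v, v

def unit_gain_momentum_step(w, v, g, lr, m):
    v = m * v + (1 - m) * g  # unit-gain accumulation (unit_gain=True)
    return w - lr * v, v
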
def create_trainer(network, epoch_size, num_quantization_bits):
    # Set learning parameters
    lr_per_sample = [0.0015625]*20 + [0.00046875]*20 + [0.00015625]*20 + [0.000046875]*10 + [0.000015625]
    lr_schedule = cntk.learning_rate_schedule(lr_per_sample, unit=cntk.learner.UnitType.sample, epoch_size=epoch_size)
    mm_time_constant = [0]*20 + [600]*20 + [1200]
    mm_schedule = cntk.learner.momentum_as_time_constant_schedule(mm_time_constant, epoch_size=epoch_size)
    l2_reg_weight = 0.002

    # Create learner
    learner = data_parallel_distributed_learner(
        cntk.learner.momentum_sgd(network['output'].parameters, lr_schedule, mm_schedule,
                                  unit_gain=True, l2_regularization_weight=l2_reg_weight),
        num_quantization_bits=num_quantization_bits,
        distributed_after=0)

    # Create trainer
    return cntk.Trainer(network['output'], network['ce'], network['pe'], learner)

def run_cifar_convnet_distributed():
    try:
        base_path = os.path.join(os.environ['CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY'],
                                 *"Image/CIFAR/v0/cifar-10-batches-py".split("/"))
        # N.B. CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY has {train,test}_map.txt
        # and CIFAR-10_mean.xml in the base_path.
    except KeyError:
        base_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                 *"../../../../Examples/Image/DataSets/CIFAR-10".split("/"))

    base_path = os.path.normpath(base_path)
    os.chdir(os.path.join(base_path, '..'))

    from _cntk_py import set_computation_network_trace_level, set_fixed_random_seed, force_deterministic_algorithms
    set_computation_network_trace_level(1)
    set_fixed_random_seed(1)  # BUGBUG: has no effect at present  # TODO: remove debugging facilities once this all works
    #force_deterministic_algorithms()
    # TODO: do the above; they lead to slightly different results, so not doing it for now

    create_train_reader = lambda data_size: create_reader(os.path.join(base_path, 'train_map.txt'),
                                                          os.path.join(base_path, 'CIFAR-10_mean.xml'),
                                                          True, data_size, 0)
    test_reader = create_reader(os.path.join(base_path, 'test_map.txt'),
                                os.path.join(base_path, 'CIFAR-10_mean.xml'),
                                False, FULL_DATA_SWEEP)

    distributed_after_samples = 0
    num_quantization_bits = 32
    create_dist_learner = lambda learner: distributed.data_parallel_distributed_learner(
        learner=learner,
        num_quantization_bits=num_quantization_bits,
        distributed_after=distributed_after_samples)

    return convnet_cifar10_dataaug(create_train_reader, test_reader, create_dist_learner,
                                   max_epochs=1, num_mbs_per_log=None)

def create_trainer(network, minibatch_size, epoch_size, num_quantization_bits):
    if network['name'] == 'resnet20':
        lr_per_mb = [1.0]*80 + [0.1]*40 + [0.01]
    elif network['name'] == 'resnet110':
        lr_per_mb = [0.1]*1 + [1.0]*80 + [0.1]*40 + [0.01]
    else:
        raise RuntimeError("Unknown model name!")

    momentum_time_constant = -minibatch_size / np.log(0.9)
    l2_reg_weight = 0.0001

    # Set learning parameters
    lr_per_sample = [lr / minibatch_size for lr in lr_per_mb]
    lr_schedule = learning_rate_schedule(lr_per_sample, epoch_size=epoch_size, unit=UnitType.sample)
    mm_schedule = momentum_as_time_constant_schedule(momentum_time_constant)

    # Learner object
    local_learner = momentum_sgd(network['output'].parameters, lr_schedule, mm_schedule,
                                 unit_gain=True, l2_regularization_weight=l2_reg_weight)
    learner = data_parallel_distributed_learner(
        learner=local_learner,
        num_quantization_bits=num_quantization_bits,
        distributed_after=0)

    return Trainer(network['output'], network['ce'], network['pe'], learner)

def create_trainer(network, minibatch_size, epoch_size, num_quantization_bits, block_size, warm_up, progress_printer):
    if network['name'] == 'resnet20':
        lr_per_mb = [1.0]*80 + [0.1]*40 + [0.01]
    elif network['name'] == 'resnet110':
        lr_per_mb = [0.1]*1 + [1.0]*80 + [0.1]*40 + [0.01]
    else:
        raise RuntimeError("Unknown model name!")

    momentum_time_constant = -minibatch_size / np.log(0.9)
    l2_reg_weight = 0.0001

    # Set learning parameters
    lr_per_sample = [lr / minibatch_size for lr in lr_per_mb]
    lr_schedule = learning_rate_schedule(lr_per_sample, epoch_size=epoch_size, unit=UnitType.sample)
    mm_schedule = momentum_as_time_constant_schedule(momentum_time_constant)

    # Learner object
    if block_size is not None and num_quantization_bits != 32:
        raise RuntimeError("Block momentum cannot be used with quantization, please remove quantized_bits option.")

    local_learner = momentum_sgd(network['output'].parameters, lr_schedule, mm_schedule,
                                 l2_regularization_weight=l2_reg_weight)

    if block_size is not None:
        learner = block_momentum_distributed_learner(local_learner, block_size=block_size)
    else:
        learner = data_parallel_distributed_learner(local_learner,
                                                    num_quantization_bits=num_quantization_bits,
                                                    distributed_after=warm_up)

    return Trainer(network['output'], (network['ce'], network['pe']), learner, progress_printer)

def create_trainer(network, epoch_size, num_quantization_bits):
    # Set learning parameters
    lr_per_mb = [0.01]*20 + [0.001]*20 + [0.0001]*20 + [0.00001]*10 + [0.000001]
    lr_schedule = cntk.learning_rate_schedule(lr_per_mb, unit=cntk.learner.UnitType.minibatch, epoch_size=epoch_size)
    mm_schedule = cntk.learner.momentum_schedule(0.9)
    l2_reg_weight = 0.0005  # CNTK L2 regularization is per sample, thus same as Caffe

    # Create learner. Since we reuse parameter settings (learning rate, momentum) from Caffe,
    # we set unit_gain to False to ensure consistency.
    local_learner = cntk.learner.momentum_sgd(network['output'].parameters, lr_schedule, mm_schedule,
                                              unit_gain=False, l2_regularization_weight=l2_reg_weight)
    parameter_learner = data_parallel_distributed_learner(
        local_learner,
        num_quantization_bits=num_quantization_bits,
        distributed_after=0)

    # Create trainer
    return cntk.Trainer(network['output'], (network['ce'], network['pe']), parameter_learner)

def test_cifar_resnet_distributed_error(device_id, is_1bit_sgd):
    if cntk_device(device_id).type() != DeviceKind_GPU:
        pytest.skip('test only runs on GPU')
    set_default_device(cntk_device(device_id))

    if not is_1bit_sgd:
        pytest.skip('test only runs in 1-bit SGD')

    try:
        base_path = os.path.join(os.environ['CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY'],
                                 *"Image/CIFAR/v0/cifar-10-batches-py".split("/"))
    except KeyError:
        base_path = os.path.join(*"../../../../Examples/Image/DataSets/CIFAR-10".split("/"))

    base_path = os.path.normpath(base_path)
    os.chdir(os.path.join(base_path, '..'))

    from _cntk_py import set_computation_network_trace_level, set_fixed_random_seed, force_deterministic_algorithms
    set_computation_network_trace_level(1)
    set_fixed_random_seed(1)  # BUGBUG: has no effect at present  # TODO: remove debugging facilities once this all works
    #force_deterministic_algorithms()
    # TODO: do the above; they lead to slightly different results, so not doing it for now

    distributed_learner_factory = lambda learner: distributed.data_parallel_distributed_learner(
        learner=learner,
        num_quantization_bits=32,
        distributed_after=0)

    reader_train_factory = lambda data_size: create_reader(os.path.join(base_path, 'train_map.txt'),
                                                           os.path.join(base_path, 'CIFAR-10_mean.xml'),
                                                           True, data_size)
    test_reader = create_reader(os.path.join(base_path, 'test_map.txt'),
                                os.path.join(base_path, 'CIFAR-10_mean.xml'),
                                False, FULL_DATA_SWEEP)

    test_error = train_and_evaluate(reader_train_factory, test_reader, 'resnet20', 5, distributed_learner_factory)

    expected_test_error = 0.282
    assert np.allclose(test_error, expected_test_error, atol=TOLERANCE_ABSOLUTE)

    distributed.Communicator.finalize()

def create_trainer(network, epoch_size, num_quantization_bits, block_size, warm_up):
    # Set learning parameters
    lr_per_sample = [0.0015625]*20 + [0.00046875]*20 + [0.00015625]*20 + [0.000046875]*10 + [0.000015625]
    lr_schedule = cntk.learning_rate_schedule(lr_per_sample, unit=cntk.learner.UnitType.sample, epoch_size=epoch_size)
    mm_time_constant = [0]*20 + [600]*20 + [1200]
    mm_schedule = cntk.learner.momentum_as_time_constant_schedule(mm_time_constant, epoch_size=epoch_size)
    l2_reg_weight = 0.002

    # Create learner
    if block_size is not None and num_quantization_bits != 32:
        raise RuntimeError("Block momentum cannot be used with quantization, please remove quantized_bits option.")

    local_learner = cntk.learner.momentum_sgd(network['output'].parameters, lr_schedule, mm_schedule,
                                              l2_regularization_weight=l2_reg_weight)

    if block_size is not None:
        learner = block_momentum_distributed_learner(local_learner, block_size=block_size)
    else:
        learner = data_parallel_distributed_learner(local_learner,
                                                    num_quantization_bits=num_quantization_bits,
                                                    distributed_after=warm_up)

    # Create trainer
    return cntk.Trainer(network['output'], network['ce'], network['pe'], learner)

def train_model(image_input, roi_input, dims_input, loss, pred_error,
                lr_per_sample, mm_schedule, l2_reg_weight, epochs_to_train,
                cfg, rpn_rois_input=None, buffered_rpn_proposals=None):
    if isinstance(loss, cntk.Variable):
        loss = combine([loss])

    params = loss.parameters
    biases = [p for p in params if '.b' in p.name or 'b' == p.name]
    others = [p for p in params if p not in biases]
    bias_lr_mult = cfg["CNTK"].BIAS_LR_MULT

    if cfg["CNTK"].DEBUG_OUTPUT:
        print("biases")
        for p in biases:
            print(p)
        print("others")
        for p in others:
            print(p)
        print("bias_lr_mult: {}".format(bias_lr_mult))

    # Instantiate the learners and the trainer object
    lr_schedule = learning_parameter_schedule_per_sample(lr_per_sample)
    learner = momentum_sgd(others, lr_schedule, mm_schedule,
                           l2_regularization_weight=l2_reg_weight,
                           unit_gain=False, use_mean_gradient=True)
    learner = distributed.data_parallel_distributed_learner(
        learner=learner,
        num_quantization_bits=cfg.NUM_QUANTIZATION_BITS,  # non-quantized gradient accumulation
        distributed_after=cfg.WARM_UP)                    # no warm start as default

    bias_lr_per_sample = [v * bias_lr_mult for v in lr_per_sample]
    bias_lr_schedule = learning_parameter_schedule_per_sample(bias_lr_per_sample)
    bias_learner = momentum_sgd(biases, bias_lr_schedule, mm_schedule,
                                l2_regularization_weight=l2_reg_weight,
                                unit_gain=False, use_mean_gradient=True)
    bias_learner = distributed.data_parallel_distributed_learner(
        learner=bias_learner,
        num_quantization_bits=cfg.NUM_QUANTIZATION_BITS,  # non-quantized gradient accumulation
        distributed_after=cfg.WARM_UP)                    # no warm start as default

    trainer = Trainer(None, (loss, pred_error), [learner, bias_learner])

    # Get minibatches of images and perform model training
    print("Training model for %s epochs." % epochs_to_train)
    log_number_of_parameters(loss)

    # Create the minibatch source
    if buffered_rpn_proposals is not None:
        proposal_provider = ProposalProvider.fromlist(buffered_rpn_proposals, requires_scaling=False)
    else:
        proposal_provider = None

    od_minibatch_source = ObjectDetectionMinibatchSource(
        cfg["DATA"].TRAIN_MAP_FILE, cfg["DATA"].TRAIN_ROI_FILE,
        num_classes=cfg["DATA"].NUM_CLASSES,
        max_annotations_per_image=cfg.INPUT_ROIS_PER_IMAGE,
        pad_width=cfg.IMAGE_WIDTH,
        pad_height=cfg.IMAGE_HEIGHT,
        pad_value=cfg["MODEL"].IMG_PAD_COLOR,
        randomize=True,
        use_flipping=cfg["TRAIN"].USE_FLIPPED,
        max_images=cfg["DATA"].NUM_TRAIN_IMAGES,
        proposal_provider=proposal_provider)

    # define mapping from reader streams to network inputs
    input_map = {
        od_minibatch_source.image_si: image_input,
        od_minibatch_source.roi_si: roi_input,
    }
    if buffered_rpn_proposals is not None:
        input_map[od_minibatch_source.proposals_si] = rpn_rois_input
    else:
        input_map[od_minibatch_source.dims_si] = dims_input

    progress_printer = ProgressPrinter(tag='Training',
                                       num_epochs=epochs_to_train,
                                       gen_heartbeat=True,
                                       rank=distributed.Communicator.rank())

    for epoch in range(epochs_to_train):  # loop over epochs
        sample_count = 0
        while sample_count < cfg["DATA"].NUM_TRAIN_IMAGES:  # loop over minibatches in the epoch
            data = od_minibatch_source.next_minibatch(
                min(cfg.MB_SIZE, cfg["DATA"].NUM_TRAIN_IMAGES - sample_count),
                input_map=input_map,
                number_of_workers=cntk.Communicator.num_workers(),
                worker_rank=cntk.Communicator.rank())
            trainer.train_minibatch(data)                                    # update model with it
            sample_count += trainer.previous_minibatch_sample_count          # count samples processed so far
            progress_printer.update_with_trainer(trainer, with_metric=True)  # log progress
            if sample_count % 100 == 0:
                print("Processed {} samples".format(sample_count))

        progress_printer.epoch_summary(with_metric=True)

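# Note on the two-learner setup above: the Trainer is given a list of learners, and each
# learner must own a disjoint subset of the model parameters; here biases are trained with
# lr_per_sample scaled by BIAS_LR_MULT, everything else with the base rate. A small helper
# (illustrative only, reusing the same name-matching rule as the function above) to inspect
# that split before training:
def report_parameter_split(loss, bias_lr_mult):
    params = loss.parameters
    biases = [p for p in params if '.b' in p.name or 'b' == p.name]
    others = [p for p in params if p not in biases]
    print("bias parameters: {} (lr x {}), other parameters: {}".format(
        len(biases), bias_lr_mult, len(others)))
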
def train_fast_rcnn(debug_output=False, model_path=model_file):
    if debug_output:
        print("Storing graphs and intermediate models to %s." % os.path.join(abs_path, "Output"))

    # Create the minibatch source
    minibatch_source = create_mb_source(image_height, image_width, num_channels,
                                        num_classes, num_rois, base_path, "train")

    # Input variables denoting features, rois and label data
    image_input = C.input_variable((num_channels, image_height, image_width))
    roi_input = C.input_variable((num_rois, 4))
    label_input = C.input_variable((num_rois, num_classes))

    # define mapping from reader streams to network inputs
    input_map = {
        image_input: minibatch_source.streams.features,
        roi_input: minibatch_source.streams.rois,
        label_input: minibatch_source.streams.roiLabels
    }

    # Instantiate the Fast R-CNN prediction model and loss function
    frcn_output = frcn_predictor(image_input, roi_input, num_classes, model_path)
    ce = cross_entropy_with_softmax(frcn_output, label_input, axis=1)
    pe = classification_error(frcn_output, label_input, axis=1)
    if debug_output:
        plot(frcn_output, os.path.join(abs_path, "Output", "graph_frcn.png"))

    # Set learning parameters
    l2_reg_weight = 0.0005
    lr_per_sample = [0.00001] * 10 + [0.000001] * 5 + [0.0000001]
    lr_schedule = learning_parameter_schedule_per_sample(lr_per_sample)
    mm_schedule = momentum_schedule_per_sample(momentum_per_sample)

    # Instantiate the trainer object as default
    learner = momentum_sgd(frcn_output.parameters, lr_schedule, mm_schedule,
                           l2_regularization_weight=l2_reg_weight)

    # Preparation for distributed learning, which is compatible with a normal learner
    learner = distributed.data_parallel_distributed_learner(
        learner=learner,
        num_quantization_bits=num_quantization_bits,  # non-quantized gradient accumulation
        distributed_after=warm_up)                    # no warm start as default

    progress_printer = ProgressPrinter(tag='Training', num_epochs=max_epochs,
                                       rank=distributed.Communicator.rank())
    trainer = Trainer(frcn_output, (ce, pe), learner, progress_printer)

    # Get minibatches of images and perform model training
    print("Training Fast R-CNN model for %s epochs." % max_epochs)
    log_number_of_parameters(frcn_output)

    for epoch in range(max_epochs):  # loop over epochs
        sample_count = 0
        while sample_count < epoch_size:  # loop over minibatches in the epoch
            data = minibatch_source.next_minibatch(
                min(mb_size * C.Communicator.num_workers(), epoch_size - sample_count),
                input_map=input_map,
                num_data_partitions=C.Communicator.num_workers(),
                partition_index=C.Communicator.rank())
            trainer.train_minibatch(data)                            # update model with it
            sample_count += trainer.previous_minibatch_sample_count  # count samples processed so far

        trainer.summarize_training_progress()
        if debug_output:
            frcn_output.save(os.path.join(abs_path, "Output", "frcn_py_%s.model" % (epoch + 1)))

    if distributed_flg:
        distributed.Communicator.finalize()

    return frcn_output

def train_fast_rcnn(debug_output=False, model_path=model_file):
    if debug_output:
        print("Storing graphs and intermediate models to %s." % os.path.join(abs_path, "Output"))

    # Create the minibatch source
    minibatch_source = create_mb_source(image_height, image_width, num_channels,
                                        num_classes, num_rois, base_path, "train")

    # Input variables denoting features, rois and label data
    image_input = C.input_variable((num_channels, image_height, image_width))
    roi_input = C.input_variable((num_rois, 4))
    label_input = C.input_variable((num_rois, num_classes))

    # define mapping from reader streams to network inputs
    input_map = {
        image_input: minibatch_source.streams.features,
        roi_input: minibatch_source.streams.rois,
        label_input: minibatch_source.streams.roiLabels
    }

    # Instantiate the Fast R-CNN prediction model and loss function
    frcn_output = frcn_predictor(image_input, roi_input, num_classes, model_path)
    ce = cross_entropy_with_softmax(frcn_output, label_input, axis=1)
    pe = classification_error(frcn_output, label_input, axis=1)
    if debug_output:
        plot(frcn_output, os.path.join(abs_path, "Output", "graph_frcn.png"))

    # Set learning parameters
    l2_reg_weight = 0.0005
    lr_per_sample = [0.00001] * 10 + [0.000001] * 5 + [0.0000001]
    lr_schedule = learning_rate_schedule(lr_per_sample, unit=UnitType.sample)
    mm_schedule = momentum_as_time_constant_schedule(momentum_time_constant)

    # Instantiate the trainer object as default
    learner = momentum_sgd(frcn_output.parameters, lr_schedule, mm_schedule,
                           l2_regularization_weight=l2_reg_weight)

    # Preparation for distributed learning, which is compatible with a normal learner
    learner = distributed.data_parallel_distributed_learner(
        learner=learner,
        num_quantization_bits=num_quantization_bits,  # non-quantized gradient accumulation
        distributed_after=warm_up)                    # no warm start as default

    progress_printer = ProgressPrinter(tag='Training', num_epochs=max_epochs,
                                       rank=distributed.Communicator.rank())
    trainer = Trainer(frcn_output, (ce, pe), learner, progress_printer)

    # Get minibatches of images and perform model training
    print("Training Fast R-CNN model for %s epochs." % max_epochs)
    log_number_of_parameters(frcn_output)

    for epoch in range(max_epochs):  # loop over epochs
        sample_count = 0
        while sample_count < epoch_size:  # loop over minibatches in the epoch
            data = minibatch_source.next_minibatch(
                min(mb_size * C.Communicator.num_workers(), epoch_size - sample_count),
                input_map=input_map,
                num_data_partitions=C.Communicator.num_workers(),
                partition_index=C.Communicator.rank())
            trainer.train_minibatch(data)                            # update model with it
            sample_count += trainer.previous_minibatch_sample_count  # count samples processed so far

        trainer.summarize_training_progress()
        if debug_output:
            frcn_output.save(os.path.join(abs_path, "Output", "frcn_py_%s.model" % (epoch + 1)))

    if distributed_flg:
        distributed.Communicator.finalize()

    return frcn_output

parser = argparse.ArgumentParser()
parser.add_argument('-n', '--network', help='network type, resnet20 or resnet110', required=False, default='resnet20')
parser.add_argument('-e', '--epochs', help='total epochs', type=int, required=False, default=160)
parser.add_argument('-q', '--quantize_bit', help='quantized bit', type=int, required=False, default=32)
parser.add_argument('-s', '--scale_up', help='scale up minibatch size with #workers for better parallelism', type=bool, required=False, default=False)
parser.add_argument('-a', '--distributed_after', help='number of samples to train with before running distributed', type=int, required=False, default=0)

args = vars(parser.parse_args())
num_quantization_bits = int(args['quantize_bit'])
epochs = int(args['epochs'])
distributed_after_samples = int(args['distributed_after'])
network_name = args['network']
scale_up = bool(args['scale_up'])

# Create distributed trainer factory
print("Start training: quantize_bit = {}, epochs = {}, distributed_after = {}".format(num_quantization_bits, epochs, distributed_after_samples))
create_dist_learner = lambda learner: distributed.data_parallel_distributed_learner(learner=learner,
                                                                                    num_quantization_bits=num_quantization_bits,
                                                                                    distributed_after=distributed_after_samples)

train_data = os.path.join(data_path, 'train_map.txt')
test_data = os.path.join(data_path, 'test_map.txt')
mean = os.path.join(data_path, 'CIFAR-10_mean.xml')

create_train_reader = lambda data_size: create_reader(train_data, mean, True, data_size, distributed_after_samples)
test_reader = create_reader(test_data, mean, False, FULL_DATA_SWEEP)

train_and_evaluate(create_train_reader, test_reader, network_name, epochs, create_dist_learner, scale_up)

# Must call MPI finalize when the process exits
distributed.Communicator.finalize()

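# Launch note (illustrative): the data-parallel learner created above only aggregates
# gradients across workers when the script is started under MPI with one process per
# worker, e.g. (the script name below is assumed, not taken from this file):
#
#   mpiexec -n 4 python TrainResNet_CIFAR10_Distributed.py -n resnet20 -q 32 -e 160
#
# Every rank must reach distributed.Communicator.finalize() before exiting, which is
# why it is called unconditionally at the end of the script.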