def create_trainer(network, epoch_size, num_epochs, minibatch_size, num_quantization_bits, progress_printer):
    
    # CNTK weights new gradient by (1-momentum) for unit gain, 
    # thus we divide Caffe's learning rate by (1-momentum)
    initial_learning_rate = 2.0 # equal to 0.2 in caffe
    initial_learning_rate *= minibatch_size / 128
    learn_rate_adjust_interval = 2
    learn_rate_decrease_factor = 0.94

    # Set learning parameters
    lr_per_mb = []
    learning_rate = initial_learning_rate
    for i in range(0, num_epochs, learn_rate_adjust_interval):
        lr_per_mb.extend([learning_rate] * learn_rate_adjust_interval)
        learning_rate *= learn_rate_decrease_factor

    lr_schedule       = cntk.learning_rate_schedule(lr_per_mb, unit=cntk.learner.UnitType.minibatch, epoch_size=epoch_size)
    mm_schedule       = cntk.learner.momentum_schedule(0.9)
    l2_reg_weight     = 0.0001 # CNTK L2 regularization is per sample, thus same as Caffe
    
    # Create learner
    local_learner = cntk.learner.momentum_sgd(network['output'].parameters, lr_schedule, mm_schedule, 
                                                l2_regularization_weight=l2_reg_weight)
    parameter_learner = data_parallel_distributed_learner(
        local_learner, 
        num_quantization_bits=num_quantization_bits,
        distributed_after=0)

    # Create trainer
    return cntk.Trainer(network['output'], (network['ce'], network['pe']), parameter_learner, progress_printer)
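A quick arithmetic check of the Caffe-to-CNTK conversion described in the comment at the top of this function (an illustrative sketch; the minibatch size below is a made-up value):

momentum = 0.9
caffe_lr = 0.2                               # Caffe's per-minibatch learning rate
cntk_lr = caffe_lr / (1 - momentum)          # unit gain weights the gradient by (1 - momentum) -> 2.0
example_minibatch_size = 256                 # illustrative only
scaled_lr = cntk_lr * example_minibatch_size / 128   # linear scaling against Caffe's reference size of 128
print(cntk_lr, scaled_lr)                    # 2.0 4.0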
def create_trainer(network, minibatch_size, epoch_size, num_quantization_bits, block_size, warm_up):
    if network['name'] == 'resnet20': 
        lr_per_mb = [1.0]*80+[0.1]*40+[0.01]
    elif network['name'] == 'resnet110': 
        lr_per_mb = [0.1]*1+[1.0]*80+[0.1]*40+[0.01]
    else: 
        raise RuntimeError("Unknown model name!")

    momentum_time_constant = -minibatch_size/np.log(0.9)
    l2_reg_weight = 0.0001

    # Set learning parameters
    lr_per_sample = [lr/minibatch_size for lr in lr_per_mb]
    lr_schedule = learning_rate_schedule(lr_per_sample, epoch_size=epoch_size, unit=UnitType.sample)
    mm_schedule = momentum_as_time_constant_schedule(momentum_time_constant)
    
    # learner object
    if block_size is not None and num_quantization_bits != 32:
        raise RuntimeError("Block momentum cannot be used with quantization; please remove the quantized_bits option.")

    local_learner = momentum_sgd(network['output'].parameters, lr_schedule, mm_schedule,
                                 l2_regularization_weight = l2_reg_weight)

    if block_size is not None:
        learner = block_momentum_distributed_learner(local_learner, block_size=block_size)
    else:
        learner = data_parallel_distributed_learner(local_learner, num_quantization_bits=num_quantization_bits, distributed_after=warm_up)
    
    return Trainer(network['output'], (network['ce'], network['pe']), learner)
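The two per-minibatch-to-per-sample conversions used above can be verified with plain math, independent of CNTK (the minibatch size below is illustrative):

import math

mb_size = 128                                             # illustrative minibatch size
momentum_per_mb = 0.9
time_constant = -mb_size / math.log(momentum_per_mb)      # as in momentum_time_constant above
assert abs(math.exp(-mb_size / time_constant) - momentum_per_mb) < 1e-12  # inverting recovers 0.9

lr_per_mb = [1.0] * 80 + [0.1] * 40 + [0.01]
lr_per_sample = [lr / mb_size for lr in lr_per_mb]        # per-sample rate = per-minibatch rate / minibatch size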
Example #3
def create_trainer(network, epoch_size, num_quantization_bits,
                   progress_printer):
    # Set learning parameters
    lr_per_mb = [0.01] * 20 + [0.001] * 20 + [0.0001] * 20 + [0.00001] * 10 + [
        0.000001
    ]
    lr_schedule = cntk.learning_rate_schedule(
        lr_per_mb, unit=cntk.learner.UnitType.minibatch, epoch_size=epoch_size)
    mm_schedule = cntk.learner.momentum_schedule(0.9)
    l2_reg_weight = 0.0005  # CNTK L2 regularization is per sample, thus same as Caffe

    # Create learner
    local_learner = cntk.learner.momentum_sgd(
        network['output'].parameters,
        lr_schedule,
        mm_schedule,
        unit_gain=False,
        l2_regularization_weight=l2_reg_weight)
    # Since we reuse parameter settings (learning rate, momentum) from Caffe, we set unit_gain to False to ensure consistency
    parameter_learner = data_parallel_distributed_learner(
        local_learner,
        num_quantization_bits=num_quantization_bits,
        distributed_after=0)

    # Create trainer
    return cntk.Trainer(network['output'], (network['ce'], network['pe']),
                        parameter_learner, progress_printer)
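The unit_gain=False choice above keeps Caffe's learning rate and momentum usable as-is. The difference between the two momentum update rules can be sketched as follows (a simplified illustration; in both cases the parameter update is param -= lr * smoothed):

def classic_momentum(smoothed, grad, m):
    # classic (Caffe-style) momentum: the raw gradient is accumulated
    return m * smoothed + grad

def unit_gain_momentum(smoothed, grad, m):
    # unit-gain momentum: the new gradient is weighted by (1 - m),
    # which is why a Caffe learning rate must be divided by (1 - m)
    return m * smoothed + (1 - m) * grad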
def create_trainer(network, epoch_size, num_quantization_bits):
    # Set learning parameters
    lr_per_sample = [0.0015625] * 20 + [0.00046875] * 20 + [
        0.00015625
    ] * 20 + [0.000046875] * 10 + [0.000015625]
    lr_schedule = cntk.learning_rate_schedule(
        lr_per_sample,
        unit=cntk.learner.UnitType.sample,
        epoch_size=epoch_size)
    mm_time_constant = [0] * 20 + [600] * 20 + [1200]
    mm_schedule = cntk.learner.momentum_as_time_constant_schedule(
        mm_time_constant, epoch_size=epoch_size)
    l2_reg_weight = 0.002

    # Create learner
    learner = data_parallel_distributed_learner(
        cntk.learner.momentum_sgd(network['output'].parameters,
                                  lr_schedule,
                                  mm_schedule,
                                  unit_gain=True,
                                  l2_regularization_weight=l2_reg_weight),
        num_quantization_bits=num_quantization_bits,
        distributed_after=0)

    # Create trainer
    return cntk.Trainer(network['output'], network['ce'], network['pe'],
                        learner)
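Both schedules above are piecewise-constant lists whose entries apply for epoch_size samples each, with the last entry persisting. A minimal sketch of that lookup (an assumption about the schedule semantics, with an illustrative epoch_size):

def schedule_value(values, epoch_size, sample_count):
    # entry i covers samples [i*epoch_size, (i+1)*epoch_size); the last entry persists
    index = min(sample_count // epoch_size, len(values) - 1)
    return values[index]

lr_per_sample = [0.0015625] * 20 + [0.00046875] * 20 + [0.00015625] * 20 + [0.000046875] * 10 + [0.000015625]
print(schedule_value(lr_per_sample, 50000, 25 * 50000))   # 0.00046875 during the 26th epoch (epoch_size illustrative)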
def run_cifar_convnet_distributed():
    try:
        base_path = os.path.join(os.environ['CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY'],
                                *"Image/CIFAR/v0/cifar-10-batches-py".split("/"))
        # N.B. CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY has {train,test}_map.txt
        #      and CIFAR-10_mean.xml in the base_path.
    except KeyError:
        base_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                *"../../../../Examples/Image/DataSets/CIFAR-10".split("/"))

    base_path = os.path.normpath(base_path)
    os.chdir(os.path.join(base_path, '..'))

    from _cntk_py import set_computation_network_trace_level, set_fixed_random_seed, force_deterministic_algorithms
    set_computation_network_trace_level(1) 
    set_fixed_random_seed(1)  # BUGBUG: has no effect at present  # TODO: remove debugging facilities once this all works
    #force_deterministic_algorithms()
    # TODO: do the above; they lead to slightly different results, so not doing it for now

    create_train_reader = lambda data_size: create_reader(os.path.join(base_path, 'train_map.txt'), os.path.join(base_path, 'CIFAR-10_mean.xml'), True, data_size, 0)
    test_reader = create_reader(os.path.join(base_path, 'test_map.txt'), os.path.join(base_path, 'CIFAR-10_mean.xml'), False, FULL_DATA_SWEEP)

    distributed_after_samples = 0
    num_quantization_bits = 32
    create_dist_learner = lambda learner: distributed.data_parallel_distributed_learner(
        learner=learner,
        num_quantization_bits=num_quantization_bits,
        distributed_after=distributed_after_samples)

    return convnet_cifar10_dataaug(create_train_reader, test_reader, create_dist_learner, max_epochs=1, num_mbs_per_log=None)
Example #7
def create_trainer(network, minibatch_size, epoch_size, num_quantization_bits):
    if network['name'] == 'resnet20':
        lr_per_mb = [1.0] * 80 + [0.1] * 40 + [0.01]
    elif network['name'] == 'resnet110':
        lr_per_mb = [0.1] * 1 + [1.0] * 80 + [0.1] * 40 + [0.01]
    else:
        raise RuntimeError("Unknown model name!")

    momentum_time_constant = -minibatch_size / np.log(0.9)
    l2_reg_weight = 0.0001

    # Set learning parameters
    lr_per_sample = [lr / minibatch_size for lr in lr_per_mb]
    lr_schedule = learning_rate_schedule(lr_per_sample,
                                         epoch_size=epoch_size,
                                         unit=UnitType.sample)
    mm_schedule = momentum_as_time_constant_schedule(momentum_time_constant)

    # learner object
    local_learner = momentum_sgd(network['output'].parameters,
                                 lr_schedule,
                                 mm_schedule,
                                 unit_gain=True,
                                 l2_regularization_weight=l2_reg_weight)

    learner = data_parallel_distributed_learner(
        learner=local_learner,
        num_quantization_bits=num_quantization_bits,
        distributed_after=0)
    return Trainer(network['output'], network['ce'], network['pe'], learner)
Example #8
def create_trainer(network, minibatch_size, epoch_size, num_quantization_bits, block_size, warm_up, progress_printer):
    if network['name'] == 'resnet20': 
        lr_per_mb = [1.0]*80+[0.1]*40+[0.01]
    elif network['name'] == 'resnet110': 
        lr_per_mb = [0.1]*1+[1.0]*80+[0.1]*40+[0.01]
    else: 
        raise RuntimeError("Unknown model name!")

    momentum_time_constant = -minibatch_size/np.log(0.9)
    l2_reg_weight = 0.0001

    # Set learning parameters
    lr_per_sample = [lr/minibatch_size for lr in lr_per_mb]
    lr_schedule = learning_rate_schedule(lr_per_sample, epoch_size=epoch_size, unit=UnitType.sample)
    mm_schedule = momentum_as_time_constant_schedule(momentum_time_constant)
    
    # learner object
    if block_size is not None and num_quantization_bits != 32:
        raise RuntimeError("Block momentum cannot be used with quantization; please remove the quantized_bits option.")

    local_learner = momentum_sgd(network['output'].parameters, lr_schedule, mm_schedule,
                                 l2_regularization_weight = l2_reg_weight)

    if block_size is not None:
        learner = block_momentum_distributed_learner(local_learner, block_size=block_size)
    else:
        learner = data_parallel_distributed_learner(local_learner, num_quantization_bits=num_quantization_bits, distributed_after=warm_up)

    return Trainer(network['output'], (network['ce'], network['pe']), learner, progress_printer)
def create_trainer(network, epoch_size, num_quantization_bits):
    # Set learning parameters
    lr_per_mb         = [0.01]*20 + [0.001]*20 + [0.0001]*20 + [0.00001]*10 + [0.000001]
    lr_schedule       = cntk.learning_rate_schedule(lr_per_mb, unit=cntk.learner.UnitType.minibatch, epoch_size=epoch_size)
    mm_schedule       = cntk.learner.momentum_schedule(0.9)
    l2_reg_weight     = 0.0005 # CNTK L2 regularization is per sample, thus same as Caffe
    
    # Create learner
    local_learner = cntk.learner.momentum_sgd(network['output'].parameters, lr_schedule, mm_schedule, unit_gain=False, l2_regularization_weight=l2_reg_weight)
    # Since we reuse parameter settings (learning rate, momentum) from Caffe, we set unit_gain to False to ensure consistency 
    parameter_learner = data_parallel_distributed_learner(
        local_learner, 
        num_quantization_bits=num_quantization_bits,
        distributed_after=0)

    # Create trainer
    return cntk.Trainer(network['output'], (network['ce'], network['pe']), parameter_learner)
Example #10
def test_cifar_resnet_distributed_error(device_id, is_1bit_sgd):
    if cntk_device(device_id).type() != DeviceKind_GPU:
        pytest.skip('test only runs on GPU')
    set_default_device(cntk_device(device_id))

    if not is_1bit_sgd:
        pytest.skip('test only runs in 1-bit SGD')

    try:
        base_path = os.path.join(
            os.environ['CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY'],
            *"Image/CIFAR/v0/cifar-10-batches-py".split("/"))
    except KeyError:
        base_path = os.path.join(
            *"../../../../Examples/Image/DataSets/CIFAR-10".split("/"))

    base_path = os.path.normpath(base_path)
    os.chdir(os.path.join(base_path, '..'))

    from _cntk_py import set_computation_network_trace_level, set_fixed_random_seed, force_deterministic_algorithms
    set_computation_network_trace_level(1)
    set_fixed_random_seed(
        1
    )  # BUGBUG: has no effect at present  # TODO: remove debugging facilities once this all works
    #force_deterministic_algorithms()
    # TODO: do the above; they lead to slightly different results, so not doing it for now

    distributed_learner_factory = lambda learner: distributed.data_parallel_distributed_learner(
        learner=learner, num_quantization_bits=32, distributed_after=0)

    reader_train_factory = lambda data_size: create_reader(
        os.path.join(base_path, 'train_map.txt'),
        os.path.join(base_path, 'CIFAR-10_mean.xml'), True, data_size)
    test_reader = create_reader(os.path.join(base_path, 'test_map.txt'),
                                os.path.join(base_path, 'CIFAR-10_mean.xml'),
                                False, FULL_DATA_SWEEP)

    test_error = train_and_evaluate(reader_train_factory, test_reader,
                                    'resnet20', 5, distributed_learner_factory)

    expected_test_error = 0.282

    assert np.allclose(test_error,
                       expected_test_error,
                       atol=TOLERANCE_ABSOLUTE)
    distributed.Communicator.finalize()
def run_cifar_convnet_distributed():
    try:
        base_path = os.path.join(
            os.environ['CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY'],
            *"Image/CIFAR/v0/cifar-10-batches-py".split("/"))
        # N.B. CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY has {train,test}_map.txt
        #      and CIFAR-10_mean.xml in the base_path.
    except KeyError:
        base_path = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            *"../../../../Examples/Image/DataSets/CIFAR-10".split("/"))

    base_path = os.path.normpath(base_path)
    os.chdir(os.path.join(base_path, '..'))

    from _cntk_py import set_computation_network_trace_level, set_fixed_random_seed, force_deterministic_algorithms
    set_computation_network_trace_level(1)
    set_fixed_random_seed(
        1
    )  # BUGBUG: has no effect at present  # TODO: remove debugging facilities once this all works
    #force_deterministic_algorithms()
    # TODO: do the above; they lead to slightly different results, so not doing it for now

    create_train_reader = lambda data_size: create_reader(
        os.path.join(base_path, 'train_map.txt'),
        os.path.join(base_path, 'CIFAR-10_mean.xml'), True, data_size, 0)
    test_reader = create_reader(os.path.join(base_path, 'test_map.txt'),
                                os.path.join(base_path, 'CIFAR-10_mean.xml'),
                                False, FULL_DATA_SWEEP)

    distributed_after_samples = 0
    num_quantization_bits = 32
    create_dist_learner = lambda learner: distributed.data_parallel_distributed_learner(
        learner=learner,
        num_quantization_bits=num_quantization_bits,
        distributed_after=distributed_after_samples)

    return convnet_cifar10_dataaug(create_train_reader,
                                   test_reader,
                                   create_dist_learner,
                                   max_epochs=1,
                                   num_mbs_per_log=None)
def test_cifar_resnet_distributed_error(device_id, is_1bit_sgd):
    if cntk_device(device_id).type() != DeviceKind_GPU:
        pytest.skip('test only runs on GPU')
    set_default_device(cntk_device(device_id))

    if not is_1bit_sgd:
        pytest.skip('test only runs in 1-bit SGD')

    try:
        base_path = os.path.join(os.environ['CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY'],
                                *"Image/CIFAR/v0/cifar-10-batches-py".split("/"))
    except KeyError:
        base_path = os.path.join(
            *"../../../../Examples/Image/DataSets/CIFAR-10".split("/"))

    base_path = os.path.normpath(base_path)
    os.chdir(os.path.join(base_path, '..'))

    from _cntk_py import set_computation_network_trace_level, set_fixed_random_seed, force_deterministic_algorithms
    set_computation_network_trace_level(1)
    set_fixed_random_seed(1)  # BUGBUG: has no effect at present  # TODO: remove debugging facilities once this all works
    #force_deterministic_algorithms()
    # TODO: do the above; they lead to slightly different results, so not doing it for now

    distributed_learner_factory = lambda learner: distributed.data_parallel_distributed_learner(
        learner=learner,
        num_quantization_bits=32,
        distributed_after=0)

    reader_train_factory = lambda data_size: create_reader(os.path.join(base_path, 'train_map.txt'), os.path.join(base_path, 'CIFAR-10_mean.xml'), True, data_size)
    test_reader = create_reader(os.path.join(base_path, 'test_map.txt'), os.path.join(base_path, 'CIFAR-10_mean.xml'), False, FULL_DATA_SWEEP)

    test_error = train_and_evaluate(reader_train_factory, test_reader, 'resnet20', 5, distributed_learner_factory)

    expected_test_error = 0.282

    assert np.allclose(test_error, expected_test_error,
                       atol=TOLERANCE_ABSOLUTE)
    distributed.Communicator.finalize()
def create_trainer(network, epoch_size, num_quantization_bits, block_size,
                   warm_up):
    # Set learning parameters
    lr_per_sample = [0.0015625] * 20 + [0.00046875] * 20 + [
        0.00015625
    ] * 20 + [0.000046875] * 10 + [0.000015625]
    lr_schedule = cntk.learning_rate_schedule(
        lr_per_sample,
        unit=cntk.learner.UnitType.sample,
        epoch_size=epoch_size)
    mm_time_constant = [0] * 20 + [600] * 20 + [1200]
    mm_schedule = cntk.learner.momentum_as_time_constant_schedule(
        mm_time_constant, epoch_size=epoch_size)
    l2_reg_weight = 0.002

    # Create learner
    if block_size is not None and num_quantization_bits != 32:
        raise RuntimeError(
            "Block momentum cannot be used with quantization; please remove the quantized_bits option."
        )

    local_learner = cntk.learner.momentum_sgd(
        network['output'].parameters,
        lr_schedule,
        mm_schedule,
        l2_regularization_weight=l2_reg_weight)

    if block_size is not None:
        learner = block_momentum_distributed_learner(local_learner,
                                                     block_size=block_size)
    else:
        learner = data_parallel_distributed_learner(
            local_learner,
            num_quantization_bits=num_quantization_bits,
            distributed_after=warm_up)

    # Create trainer
    return cntk.Trainer(network['output'], network['ce'], network['pe'],
                        learner)
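The momentum schedule above is expressed as time constants (0, 600, 1200 samples). The same relation used earlier (momentum_time_constant = -minibatch_size/np.log(0.9)) converts a time constant back into an effective per-sample momentum; a small sketch:

import math

def momentum_from_time_constant(tc):
    # per-sample momentum implied by a time constant; 0 is treated as "no momentum"
    return 0.0 if tc == 0 else math.exp(-1.0 / tc)

for tc in [0, 600, 1200]:
    print(tc, momentum_from_time_constant(tc))   # 0 -> 0.0, 600 -> ~0.99834, 1200 -> ~0.99917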
Example #14
def train_model(image_input, roi_input, dims_input, loss, pred_error,
                lr_per_sample, mm_schedule, l2_reg_weight, epochs_to_train, cfg,
                rpn_rois_input=None, buffered_rpn_proposals=None):
    if isinstance(loss, cntk.Variable):
        loss = combine([loss])

    params = loss.parameters
    biases = [p for p in params if '.b' in p.name or 'b' == p.name]
    others = [p for p in params if not p in biases]
    bias_lr_mult = cfg["CNTK"].BIAS_LR_MULT

    if cfg["CNTK"].DEBUG_OUTPUT:
        print("biases")
        for p in biases: print(p)
        print("others")
        for p in others: print(p)
        print("bias_lr_mult: {}".format(bias_lr_mult))

    # Instantiate the learners and the trainer object
    lr_schedule = learning_parameter_schedule_per_sample(lr_per_sample)
    learner = momentum_sgd(others, lr_schedule, mm_schedule, l2_regularization_weight=l2_reg_weight,
                           unit_gain=False, use_mean_gradient=True)
    learner = distributed.data_parallel_distributed_learner(
        learner = learner,
        num_quantization_bits = cfg.NUM_QUANTIZATION_BITS,   # non-quantized gradient accumulation
        distributed_after = cfg.WARM_UP)                     # no warm start as default

    bias_lr_per_sample = [v * bias_lr_mult for v in lr_per_sample]
    bias_lr_schedule = learning_parameter_schedule_per_sample(bias_lr_per_sample)
    bias_learner = momentum_sgd(biases, bias_lr_schedule, mm_schedule, l2_regularization_weight=l2_reg_weight,
                           unit_gain=False, use_mean_gradient=True)
    bias_learner = distributed.data_parallel_distributed_learner(
        learner = bias_learner,
        num_quantization_bits = cfg.NUM_QUANTIZATION_BITS,   # non-quantized gradient accumulation
        distributed_after = cfg.WARM_UP)                     # no warm start as default
    trainer = Trainer(None, (loss, pred_error), [learner, bias_learner])

    # Get minibatches of images and perform model training
    print("Training model for %s epochs." % epochs_to_train)
    log_number_of_parameters(loss)

    # Create the minibatch source
    if buffered_rpn_proposals is not None:
        proposal_provider = ProposalProvider.fromlist(buffered_rpn_proposals, requires_scaling=False)
    else:
        proposal_provider = None

    od_minibatch_source = ObjectDetectionMinibatchSource(
        cfg["DATA"].TRAIN_MAP_FILE, cfg["DATA"].TRAIN_ROI_FILE,
        num_classes=cfg["DATA"].NUM_CLASSES,
        max_annotations_per_image=cfg.INPUT_ROIS_PER_IMAGE,
        pad_width=cfg.IMAGE_WIDTH,
        pad_height=cfg.IMAGE_HEIGHT,
        pad_value=cfg["MODEL"].IMG_PAD_COLOR,
        randomize=True,
        use_flipping=cfg["TRAIN"].USE_FLIPPED,
        max_images=cfg["DATA"].NUM_TRAIN_IMAGES,
        proposal_provider=proposal_provider)

    # define mapping from reader streams to network inputs
    input_map = {
        od_minibatch_source.image_si: image_input,
        od_minibatch_source.roi_si: roi_input,
    }
    if buffered_rpn_proposals is not None:
        input_map[od_minibatch_source.proposals_si] = rpn_rois_input
    else:
        input_map[od_minibatch_source.dims_si] = dims_input

    progress_printer = ProgressPrinter(tag='Training'
                                    , num_epochs=epochs_to_train
                                    , gen_heartbeat=True
                                    , rank=distributed.Communicator.rank())
    for epoch in range(epochs_to_train):       # loop over epochs
        sample_count = 0
        while sample_count < cfg["DATA"].NUM_TRAIN_IMAGES:  # loop over minibatches in the epoch
            data = od_minibatch_source.next_minibatch(min(cfg.MB_SIZE, cfg["DATA"].NUM_TRAIN_IMAGES-sample_count), 
                input_map=input_map, 
                number_of_workers=cntk.Communicator.num_workers(), 
                worker_rank=cntk.Communicator.rank())
            trainer.train_minibatch(data)                                    # update model with it
            sample_count += trainer.previous_minibatch_sample_count          # count samples processed so far
            progress_printer.update_with_trainer(trainer, with_metric=True)  # log progress
            if sample_count % 100 == 0:
                print("Processed {} samples".format(sample_count))

        progress_printer.epoch_summary(with_metric=True)
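train_model above keeps two learners so that bias parameters can train with a multiplied learning rate (cfg["CNTK"].BIAS_LR_MULT). A minimal sketch of that split using hypothetical parameter names (the real code filters cntk Parameter objects by name):

param_names = ['conv1.W', 'conv1.b', 'cls_score.W', 'cls_score.b']   # hypothetical names
bias_lr_mult = 2.0                                                   # illustrative; the real value comes from cfg
biases = [p for p in param_names if '.b' in p or p == 'b']
others = [p for p in param_names if p not in biases]
lr_per_sample = [0.001] * 10 + [0.0001]                              # illustrative schedule
bias_lr_per_sample = [v * bias_lr_mult for v in lr_per_sample]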
Example #15
def train_fast_rcnn(debug_output=False, model_path=model_file):
    if debug_output:
        print("Storing graphs and intermediate models to %s." % os.path.join(abs_path, "Output"))

    # Create the minibatch source
    minibatch_source = create_mb_source(image_height, image_width, num_channels,
                                        num_classes, num_rois, base_path, "train")

    # Input variables denoting features, rois and label data
    image_input = C.input_variable((num_channels, image_height, image_width))
    roi_input   = C.input_variable((num_rois, 4))
    label_input = C.input_variable((num_rois, num_classes))

    # define mapping from reader streams to network inputs
    input_map = {
        image_input: minibatch_source.streams.features,
        roi_input: minibatch_source.streams.rois,
        label_input: minibatch_source.streams.roiLabels
    }

    # Instantiate the Fast R-CNN prediction model and loss function
    frcn_output = frcn_predictor(image_input, roi_input, num_classes, model_path)
    ce = cross_entropy_with_softmax(frcn_output, label_input, axis=1)
    pe = classification_error(frcn_output, label_input, axis=1)
    if debug_output:
        plot(frcn_output, os.path.join(abs_path, "Output", "graph_frcn.png"))

    # Set learning parameters
    l2_reg_weight = 0.0005
    lr_per_sample = [0.00001] * 10 + [0.000001] * 5 + [0.0000001]
    lr_schedule = learning_parameter_schedule_per_sample(lr_per_sample)
    mm_schedule = momentum_schedule_per_sample(momentum_per_sample)

    # Instantiate the trainer object as default
    learner = momentum_sgd(frcn_output.parameters, lr_schedule, mm_schedule, l2_regularization_weight=l2_reg_weight)
    # Preparation for distributed learning, which is compatible for normal learner
    learner = distributed.data_parallel_distributed_learner(
        learner = learner,
        num_quantization_bits = num_quantization_bits,   # non-quantized gradient accumulation
        distributed_after = warm_up)                     # no warm start as default            
    progress_printer = ProgressPrinter(tag='Training', num_epochs=max_epochs, rank=distributed.Communicator.rank())
    trainer = Trainer(frcn_output, (ce, pe), learner, progress_printer)

    # Get minibatches of images and perform model training
    print("Training Fast R-CNN model for %s epochs." % max_epochs)
    log_number_of_parameters(frcn_output)
    for epoch in range(max_epochs):       # loop over epochs
        sample_count = 0
        while sample_count < epoch_size:  # loop over minibatches in the epoch
            data = minibatch_source.next_minibatch(min(mb_size * C.Communicator.num_workers(), epoch_size-sample_count), 
                input_map=input_map, 
                num_data_partitions=C.Communicator.num_workers(), 
                partition_index=C.Communicator.rank())     
            trainer.train_minibatch(data)                                    # update model with it
            sample_count += trainer.previous_minibatch_sample_count          # count samples processed so far

        trainer.summarize_training_progress()
        if debug_output:
            frcn_output.save(os.path.join(abs_path, "Output", "frcn_py_%s.model" % (epoch+1)))

    if distributed_flg:
        distributed.Communicator.finalize()

    return frcn_output
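The loop above requests min(mb_size * num_workers, samples left in the epoch) per step and lets the reader partition that minibatch across workers. A few lines of arithmetic showing how an epoch is consumed (all sizes illustrative):

mb_size, num_workers, epoch_size = 64, 4, 1000    # illustrative values only
sample_count, steps = 0, 0
while sample_count < epoch_size:
    requested = min(mb_size * num_workers, epoch_size - sample_count)
    sample_count += requested                     # the real loop uses trainer.previous_minibatch_sample_count
    steps += 1
print(steps, sample_count)                        # 4 steps: 256 + 256 + 256 + 232 = 1000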
        required=False,
        default='0')

    args = vars(parser.parse_args())
    num_quantization_bits = int(args['quantize_bit'])
    epochs = int(args['epochs'])
    distributed_after_samples = int(args['distributed_after'])
    network_name = args['network']
    scale_up = bool(args['scale_up'])

    # Create distributed trainer factory
    print(
        "Start training: quantize_bit = {}, epochs = {}, distributed_after = {}"
        .format(num_quantization_bits, epochs, distributed_after_samples))
    create_dist_learner = lambda learner: distributed.data_parallel_distributed_learner(
        learner=learner,
        num_quantization_bits=num_quantization_bits,
        distributed_after=distributed_after_samples)
    train_data = os.path.join(data_path, 'train_map.txt')
    test_data = os.path.join(data_path, 'test_map.txt')
    mean = os.path.join(data_path, 'CIFAR-10_mean.xml')

    create_train_reader = lambda data_size: create_reader(
        train_data, mean, True, data_size, distributed_after_samples)
    test_reader = create_reader(test_data, mean, False, FULL_DATA_SWEEP)

    train_and_evaluate(create_train_reader, test_reader, network_name, epochs,
                       create_dist_learner, scale_up)

    # Must call MPI finalize when process exit
    distributed.Communicator.finalize()
Example #17
def train_fast_rcnn(debug_output=False, model_path=model_file):
    if debug_output:
        print("Storing graphs and intermediate models to %s." %
              os.path.join(abs_path, "Output"))

    # Create the minibatch source
    minibatch_source = create_mb_source(image_height, image_width,
                                        num_channels, num_classes, num_rois,
                                        base_path, "train")

    # Input variables denoting features, rois and label data
    image_input = C.input_variable((num_channels, image_height, image_width))
    roi_input = C.input_variable((num_rois, 4))
    label_input = C.input_variable((num_rois, num_classes))

    # define mapping from reader streams to network inputs
    input_map = {
        image_input: minibatch_source.streams.features,
        roi_input: minibatch_source.streams.rois,
        label_input: minibatch_source.streams.roiLabels
    }

    # Instantiate the Fast R-CNN prediction model and loss function
    frcn_output = frcn_predictor(image_input, roi_input, num_classes,
                                 model_path)
    ce = cross_entropy_with_softmax(frcn_output, label_input, axis=1)
    pe = classification_error(frcn_output, label_input, axis=1)
    if debug_output:
        plot(frcn_output, os.path.join(abs_path, "Output", "graph_frcn.png"))

    # Set learning parameters
    l2_reg_weight = 0.0005
    lr_per_sample = [0.00001] * 10 + [0.000001] * 5 + [0.0000001]
    lr_schedule = learning_rate_schedule(lr_per_sample, unit=UnitType.sample)
    mm_schedule = momentum_as_time_constant_schedule(momentum_time_constant)

    # Instantiate the trainer object as default
    learner = momentum_sgd(frcn_output.parameters,
                           lr_schedule,
                           mm_schedule,
                           l2_regularization_weight=l2_reg_weight)
    # Preparation for distributed learning, which is compatible for normal learner
    learner = distributed.data_parallel_distributed_learner(
        learner=learner,
        num_quantization_bits=
        num_quantization_bits,  # non-quantized gradient accumulation
        distributed_after=warm_up)  # no warm start as default
    progress_printer = ProgressPrinter(tag='Training',
                                       num_epochs=max_epochs,
                                       rank=distributed.Communicator.rank())
    trainer = Trainer(frcn_output, (ce, pe), learner, progress_printer)

    # Get minibatches of images and perform model training
    print("Training Fast R-CNN model for %s epochs." % max_epochs)
    log_number_of_parameters(frcn_output)
    for epoch in range(max_epochs):  # loop over epochs
        sample_count = 0
        while sample_count < epoch_size:  # loop over minibatches in the epoch
            data = minibatch_source.next_minibatch(
                min(mb_size * C.Communicator.num_workers(),
                    epoch_size - sample_count),
                input_map=input_map,
                num_data_partitions=C.Communicator.num_workers(),
                partition_index=C.Communicator.rank())
            trainer.train_minibatch(data)  # update model with it
            sample_count += trainer.previous_minibatch_sample_count  # count samples processed so far

        trainer.summarize_training_progress()
        if debug_output:
            frcn_output.save(
                os.path.join(abs_path, "Output",
                             "frcn_py_%s.model" % (epoch + 1)))

    if distributed_flg:
        distributed.Communicator.finalize()

    return frcn_output
    parser = argparse.ArgumentParser()
    parser.add_argument('-n', '--network', help='network type, resnet20 or resnet110', required=False, default='resnet20')
    parser.add_argument('-e', '--epochs', help='total epochs', type=int, required=False, default='160')
    parser.add_argument('-q', '--quantize_bit', help='quantized bit', type=int, required=False, default='32')
    parser.add_argument('-s', '--scale_up', help='scale up minibatch size with #workers for better parallelism', type=bool, required=False, default='False')
    parser.add_argument('-a', '--distributed_after', help='number of samples to train with before running distributed', type=int, required=False, default='0')

    args = vars(parser.parse_args())
    num_quantization_bits = int(args['quantize_bit'])
    epochs = int(args['epochs'])
    distributed_after_samples = int(args['distributed_after'])
    network_name = args['network']
    scale_up = bool(args['scale_up'])

    # Create distributed trainer factory
    print("Start training: quantize_bit = {}, epochs = {}, distributed_after = {}".format(num_quantization_bits, epochs, distributed_after_samples))
    create_dist_learner = lambda learner: distributed.data_parallel_distributed_learner(learner=learner,
                                                                                        num_quantization_bits=num_quantization_bits,
                                                                                        distributed_after=distributed_after_samples)
    train_data=os.path.join(data_path, 'train_map.txt')
    test_data=os.path.join(data_path, 'test_map.txt')
    mean=os.path.join(data_path, 'CIFAR-10_mean.xml')

    create_train_reader=lambda data_size: create_reader(train_data, mean, True, data_size, distributed_after_samples)
    test_reader=create_reader(test_data, mean, False, FULL_DATA_SWEEP)

    train_and_evaluate(create_train_reader, test_reader, network_name, epochs, create_dist_learner, scale_up)

    # Must call MPI finalize when process exit
    distributed.Communicator.finalize()