Example #1
def cosine_decay_with_warmup(learning_rate, step_each_epoch, epochs=120):
    """Applies cosine decay to the learning rate.
    lr = 0.05 * (math.cos(epoch * (math.pi / 120)) + 1)
    decrease lr for every mini-batch and start with warmup.
    """
    global_step = _decay_step_counter()
    lr = fluid.layers.tensor.create_global_var(shape=[1],
                                               value=0.0,
                                               dtype='float32',
                                               persistable=True,
                                               name="learning_rate")

    warmup_epoch = fluid.layers.fill_constant(shape=[1],
                                              dtype='float32',
                                              value=float(5),
                                              force_cpu=True)

    with init_on_cpu():
        epoch = ops.floor(global_step / step_each_epoch)
        with fluid.layers.control_flow.Switch() as switch:
            with switch.case(epoch < warmup_epoch):
                decayed_lr = learning_rate * (global_step /
                                              (step_each_epoch * warmup_epoch))
                fluid.layers.tensor.assign(input=decayed_lr, output=lr)
            with switch.default():
                decayed_lr = learning_rate * \
                    (ops.cos((global_step - warmup_epoch * step_each_epoch) * (math.pi / (epochs * step_each_epoch))) + 1)/2
                fluid.layers.tensor.assign(input=decayed_lr, output=lr)
    return lr
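For intuition, here is a plain-Python sketch of the same schedule (a hypothetical helper, not part of the fluid graph): linear warmup over the first warmup epochs, then a half-cosine decay measured against the full training horizon.

import math

def cosine_warmup_lr_value(base_lr, step_each_epoch, epochs=120,
                           warmup_epoch=5, global_step=0):
    # Hypothetical mirror of the snippet above, for checking values only.
    epoch = global_step // step_each_epoch
    if epoch < warmup_epoch:
        # linear warmup from 0 to base_lr over warmup_epoch epochs
        return base_lr * global_step / (step_each_epoch * warmup_epoch)
    # half-cosine decay over the full epochs * step_each_epoch horizon
    progress = ((global_step - warmup_epoch * step_each_epoch) * math.pi
                / (epochs * step_each_epoch))
    return base_lr * (math.cos(progress) + 1) / 2

print(cosine_warmup_lr_value(0.1, 1000, global_step=5000))    # ~0.1, warmup just ended
print(cosine_warmup_lr_value(0.1, 1000, global_step=119000))  # ~0.0006, near the end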
Example #2
def linear_warmup_and_cosine_decay(learning_rate, end_lr, warmup_steps,
                                   max_training_steps):
    """Applies linear warmup and cosine decay to the learning rate."""
    dtype = "float32"

    with fluid.default_main_program()._lr_schedule_guard():
        lr = layers.create_global_var(shape=[1],
                                      value=0.0,
                                      dtype=dtype,
                                      persistable=True,
                                      name="learning_rate")

        global_step = _decay_step_counter(1)

        with layers.control_flow.Switch() as switch:
            with switch.case(global_step < warmup_steps):
                warmup_lr = learning_rate * (global_step / warmup_steps)
                layers.assign(warmup_lr, lr)
            with switch.case(global_step < max_training_steps):
                frac = 0.5 * (ops.cos((global_step - warmup_steps) * math.pi /
                                      (max_training_steps - warmup_steps)) + 1)
                decayed_lr = end_lr + (learning_rate - end_lr) * frac
                layers.assign(decayed_lr, lr)
            with switch.default():
                learning_rate = layers.fill_constant(shape=[1],
                                                     dtype=dtype,
                                                     value=end_lr)
                layers.assign(learning_rate, lr)
        return lr
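Compared with Example #1, this variant decays towards an explicit floor: after max_training_steps the rate stays at end_lr. A plain-Python sketch of the three branches (hypothetical, for illustration):

import math

def warmup_cosine_to_floor(base_lr, end_lr, warmup_steps, max_training_steps, step):
    # linear warmup, then cosine decay towards end_lr, then a constant floor
    if step < warmup_steps:
        return base_lr * step / warmup_steps
    if step < max_training_steps:
        frac = 0.5 * (math.cos((step - warmup_steps) * math.pi /
                               (max_training_steps - warmup_steps)) + 1)
        return end_lr + (base_lr - end_lr) * frac
    return end_lr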
Example #3
def cosine_decay_with_warmup(learning_rate,
                             max_iters=90000,
                             warmup_iters=1000,
                             warmup_factor=0.1):
    """Applies cosine decay to the learning rate.
    lr = 0.05 * (math.cos(epoch * (math.pi / 120)) + 1)
    decrease lr for every mini-batch and start with warmup.
    """
    global_step = _decay_step_counter()
    lr = fluid.layers.tensor.create_global_var(
        shape=[1],
        value=0.0,
        dtype='float32',
        persistable=True,
        name="learning_rate")

    with fluid.layers.control_flow.Switch() as switch:
        with switch.case(global_step < warmup_iters):
            eta_min = learning_rate * warmup_factor
            decayed_lr = eta_min + (learning_rate - eta_min) * (global_step / warmup_iters)
            fluid.layers.tensor.assign(input=decayed_lr, output=lr)
        with switch.default():
            decayed_lr = learning_rate * \
                (ops.cos((global_step - warmup_iters) * (math.pi / (max_iters - warmup_iters))) + 1)/2
            fluid.layers.tensor.assign(input=decayed_lr, output=lr)
    return lr
Example #4
def exponential_decay_with_warmup(learning_rate,
                                  step_each_epoch,
                                  decay_epochs,
                                  decay_rate=0.97,
                                  warm_up_epoch=5.0):
    """Applies exponential decay to the learning rate.
    """
    global_step = _decay_step_counter()
    lr = fluid.layers.tensor.create_global_var(shape=[1],
                                               value=0.0,
                                               dtype='float32',
                                               persistable=True,
                                               name="learning_rate")

    warmup_epoch = fluid.layers.fill_constant(shape=[1],
                                              dtype='float32',
                                              value=float(warm_up_epoch),
                                              force_cpu=True)

    with init_on_cpu():
        epoch = ops.floor(global_step / step_each_epoch)
        with fluid.layers.control_flow.Switch() as switch:
            with switch.case(epoch < warmup_epoch):
                decayed_lr = learning_rate * (global_step /
                                              (step_each_epoch * warmup_epoch))
                fluid.layers.assign(input=decayed_lr, output=lr)
            with switch.default():
                div_res = (global_step -
                           warmup_epoch * step_each_epoch) / decay_epochs
                div_res = ops.floor(div_res)
                decayed_lr = learning_rate * (decay_rate**div_res)
                fluid.layers.assign(input=decayed_lr, output=lr)

    return lr
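A plain-Python reading of the same logic (hypothetical helper). Note that the snippet divides a step count by decay_epochs, so here the parameter effectively acts as a decay interval expressed in steps:

import math

def exp_warmup_lr_value(base_lr, step_each_epoch, decay_epochs,
                        decay_rate=0.97, warm_up_epoch=5.0, step=0):
    # linear warmup for warm_up_epoch epochs, then staircase exponential decay
    if step // step_each_epoch < warm_up_epoch:
        return base_lr * step / (step_each_epoch * warm_up_epoch)
    exponent = math.floor((step - warm_up_epoch * step_each_epoch) / decay_epochs)
    return base_lr * decay_rate ** exponent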
Example #5
def cosine_decay():
    """
    Applies cosine decay to the learning rate.
    """
    global_step = _decay_step_counter()
    frac = (1 + ops.cos(global_step / max_step * math.pi)) / 2
    return FLAGS.lr_min + (FLAGS.lr_max - FLAGS.lr_min) * frac
Example #6
def linear_warmup_decay(init_lr, num_train_steps, num_warmup_steps,
                        main_program):
    with main_program._lr_schedule_guard():
        global_step = lr_scheduler._decay_step_counter()

        lr = fluid.layers.create_global_var(shape=[1],
                                            value=init_lr,
                                            dtype='float32',
                                            persistable=True,
                                            name="learning_rate")

        with control_flow.Switch() as switch:
            with switch.case(global_step < num_warmup_steps):
                decayed_lr = init_lr * global_step * 1.0 / num_warmup_steps
                fluid.layers.assign(decayed_lr, lr)
            with switch.default():
                decayed_lr = lr_scheduler.polynomial_decay(
                    learning_rate=init_lr,
                    decay_steps=num_train_steps,
                    end_learning_rate=0.0,
                    power=1.0,
                    cycle=False)
                fluid.layers.assign(decayed_lr, lr)

        return lr
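This is the BERT-style schedule: linear warmup followed by a power-1 polynomial decay, i.e. a straight line down to zero at num_train_steps. A plain-Python sketch (hypothetical):

def linear_warmup_linear_decay(init_lr, num_train_steps, num_warmup_steps, step):
    if step < num_warmup_steps:
        return init_lr * step / num_warmup_steps
    # polynomial_decay with power=1, end_learning_rate=0 and cycle=False
    return init_lr * max(0.0, 1.0 - step / num_train_steps)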
Example #7
    def __call__(self):
        global_step = _decay_step_counter()
        learning_rate = fluid.layers.tensor.create_global_var(
            shape=[1],
            value=0.0,
            dtype='float32',
            persistable=True,
            name="learning_rate")
        epoch = ops.floor(global_step / self.step_each_epoch)
        with fluid.layers.control_flow.Switch() as switch:
            with switch.case(epoch < self.warmup_epoch):
                decayed_lr = self.lr * \
                        (global_step / (self.step_each_epoch * self.warmup_epoch))
                fluid.layers.tensor.assign(input=decayed_lr,
                                           output=learning_rate)
            with switch.default():
                current_step = global_step - self.warmup_epoch * self.step_each_epoch
                total_step = (self.epochs -
                              self.warmup_epoch) * self.step_each_epoch
                decayed_lr = self.lr * \
                    (ops.cos(current_step * math.pi / total_step) + 1) / 2
                fluid.layers.tensor.assign(input=decayed_lr,
                                           output=learning_rate)

        return learning_rate
Example #8
def cosine_with_warmup_decay(learning_rate, lr_min, steps_one_epoch,
                             warmup_epochs, total_epoch, num_gpu):
    global_step = _decay_step_counter()
    epoch_idx = fluid.layers.floor(global_step / steps_one_epoch)

    lr = fluid.layers.create_global_var(
        shape=[1],
        value=0.0,
        dtype='float32',
        persistable=True,
        name="learning_rate")

    warmup_epoch_var = fluid.layers.fill_constant(
        shape=[1], dtype='float32', value=float(warmup_epochs), force_cpu=True)
    num_gpu_var = fluid.layers.fill_constant(
        shape=[1], dtype='float32', value=float(num_gpu), force_cpu=True)
    batch_idx = global_step - steps_one_epoch * epoch_idx

    with fluid.layers.control_flow.Switch() as switch:
        with switch.case(epoch_idx < warmup_epoch_var):
            epoch_ = (batch_idx + 1) / steps_one_epoch
            factor = 1 / num_gpu_var * (
                epoch_ * (num_gpu_var - 1) / warmup_epoch_var + 1)
            decayed_lr = learning_rate * factor * num_gpu_var
            fluid.layers.assign(decayed_lr, lr)
        epoch_ = (batch_idx + 1) / steps_one_epoch
        m = epoch_ / total_epoch
        frac = (1 + ops.cos(math.pi * m)) / 2
        cosine_lr = (lr_min + (learning_rate - lr_min) * frac) * num_gpu_var
        with switch.default():
            fluid.layers.assign(cosine_lr, lr)

    return lr
Example #9
def cosine_decay(lr, step_each_epoch, epochs):
    global_step = _decay_step_counter()
    with init_on_cpu():
        epoch = fluid.layers.floor(global_step / step_each_epoch)
        decayed_lr = lr * (fluid.layers.cos(epoch *
                                            (math.pi / epochs)) + 1) / 2
    return decayed_lr
Example #10
def cosine_decay(learning_rate, step_each_epoch, epochs=120):
    """Applies cosine decay to the learning rate.
    lr = 0.05 * (math.cos(epoch * (math.pi / 120)) + 1)
    """
    global_step = _decay_step_counter()
    epoch = ops.floor(global_step / step_each_epoch)
    decayed_lr = learning_rate * \
            (ops.cos(epoch * (math.pi / epochs)) + 1)/2
    return decayed_lr
Example #11
def cosine_warmup_decay(learning_rate, betas, warmup_factor, decay_factor,
                        total_step, warmup_pct):
    def annealing_cos(start, end, pct):
        "Cosine anneal from `start` to `end` as pct goes from 0.0 to 1.0."
        cos_out = fluid.layers.cos(pct * np.pi) + 1.
        return cos_out * (start - end) / 2. + end

    warmup_start_lr = learning_rate * warmup_factor
    decay_end_lr = learning_rate * decay_factor
    warmup_step = total_step * warmup_pct

    global_step = lr_scheduler._decay_step_counter()

    lr = fluid.layers.create_global_var(shape=[1],
                                        value=float(learning_rate),
                                        dtype='float32',
                                        persistable=True,
                                        name="learning_rate")
    beta1 = fluid.layers.create_global_var(shape=[1],
                                           value=float(betas[0]),
                                           dtype='float32',
                                           persistable=True,
                                           name="beta1")

    warmup_step_var = fluid.layers.fill_constant(shape=[1],
                                                 dtype='float32',
                                                 value=float(warmup_step),
                                                 force_cpu=True)

    warmup_pred = global_step < warmup_step_var
    decay_pred = global_step >= warmup_step_var

    # learning rate warmup and decay
    def warmup_lr():
        return annealing_cos(warmup_start_lr, learning_rate,
                             global_step / warmup_step_var)

    def decay_lr():
        return annealing_cos(learning_rate, decay_end_lr,
                             (global_step - warmup_step_var) /
                             (total_step - warmup_step))

    lr = fluid.layers.case(pred_fn_pairs=[(warmup_pred,
                                           warmup_lr), (decay_pred, decay_lr)])

    # Adam beta1 warmup and decay
    def warmup_beta1():
        return annealing_cos(betas[0], betas[1], global_step / warmup_step_var)

    def decay_beta1():
        return annealing_cos(betas[1], betas[0],
                             (global_step - warmup_step_var) /
                             (total_step - warmup_step))

    beta1 = fluid.layers.case(
        pred_fn_pairs=[(warmup_pred, warmup_beta1), (decay_pred, decay_beta1)])

    return lr, beta1
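annealing_cos is the usual one-cycle interpolation: it returns start at pct=0, the midpoint at pct=0.5 and end at pct=1, which is what lets the same helper drive both the learning rate and Adam's beta1. A quick plain-Python check (hypothetical mirror):

import math

def annealing_cos(start, end, pct):
    cos_out = math.cos(pct * math.pi) + 1.0
    return cos_out * (start - end) / 2.0 + end

print(annealing_cos(0.95, 0.85, 0.0))  # 0.95
print(annealing_cos(0.95, 0.85, 0.5))  # 0.90, halfway between the endpoints
print(annealing_cos(0.95, 0.85, 1.0))  # 0.85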
Example #12
def cosine_decay(learning_rate, step_each_epoch, epochs=120):
    """Applies cosine decay to the learning rate.
    lr = 0.05 * (math.cos(epoch * (math.pi / 120)) + 1)
    """
    global_step = _decay_step_counter()
    with init_on_cpu():
        epoch = fluid.layers.floor(global_step / step_each_epoch)
        lr = learning_rate / 2.
        decayed_lr = lr * (fluid.layers.cos(epoch * (math.pi / epochs)) + 1)
    return decayed_lr
Example #13
def cosine_decay(learning_rate, num_epoch, steps_one_epoch):
    """Applies cosine decay to the learning rate.
    lr = 0.5 * (math.cos(epoch * (math.pi / 120)) + 1)
    """
    global_step = _decay_step_counter()

    decayed_lr = learning_rate * \
             (ops.cos(fluid.layers.floor(global_step / steps_one_epoch) \
             * math.pi / num_epoch) + 1)/2
    return decayed_lr
Example #14
def cosine_decay_v2(learning_rate, totalsteps):
    """Applies cosine decay to the learning rate.
    lr = 0.05 * (math.cos(global_step * (math.pi / totalsteps)) + 1)
    decrease lr for every mini-batch.
    """
    global_step = _decay_step_counter()

    with init_on_cpu():
        decayed_lr = learning_rate * \
                     (ops.cos(global_step * (math.pi / float(totalsteps))) + 1)/2
    return decayed_lr
Example #15
def cosine_decay(learning_rate, num_epoch, steps_one_epoch):
    """Applies cosine decay to the learning rate.
    lr = 0.5 * (math.cos(epoch * (math.pi / 120)) + 1)
    """
    global_step = _decay_step_counter()

    with init_on_cpu():
        decayed_lr = learning_rate * \
                 (ops.cos((global_step / steps_one_epoch) \
                 * math.pi / num_epoch) + 1)/2
    return decayed_lr
Example #16
def cos_anneal_with_warmup_decay(learning_rate, boundaries, values,
                                 warmup_iter, warmup_factor):
    global_step = lr_scheduler._decay_step_counter()

    lr = fluid.layers.create_global_var(shape=[1],
                                        value=0.0,
                                        dtype='float32',
                                        persistable=True,
                                        name="learning_rate")

    warmup_iter_var = fluid.layers.fill_constant(shape=[1],
                                                 dtype='float32',
                                                 value=float(warmup_iter),
                                                 force_cpu=True)
    cos_boundaries = [warmup_iter] + boundaries[:-1]
    cos_boundaries = np.array(boundaries) - np.array(cos_boundaries)
    cos_boundaries = cos_boundaries.tolist()
    cos_step = [warmup_iter] + boundaries[:-1]

    with control_flow.Switch() as switch:
        with switch.case(global_step < warmup_iter_var):
            alpha = global_step / warmup_iter_var
            factor = warmup_factor * (1 - alpha) + alpha
            decayed_lr = learning_rate * factor
            fluid.layers.assign(decayed_lr, lr)

        for i in range(len(boundaries)):
            boundary_val = fluid.layers.fill_constant(shape=[1],
                                                      dtype='float32',
                                                      value=float(
                                                          boundaries[i]),
                                                      force_cpu=True)
            with switch.case(global_step < boundary_val):
                cur_epoch_factor = (global_step -
                                    cos_step[i]) / cos_boundaries[i]
                cur_epoch_cos_factor = ops.cos(cur_epoch_factor * math.pi /
                                               2.0)
                cur_lr = cur_epoch_cos_factor * values[i]

                fluid.layers.assign(cur_lr, lr)

        last_value_var = fluid.layers.fill_constant(
            shape=[1], dtype='float32', value=float(values[len(values) - 1]))
        with switch.default():
            fluid.layers.assign(last_value_var, lr)

    return lr
Example #17
def cosine_decay(learning_rate, step_each_epoch, epochs=120):
    """Applies cosine decay to the learning rate.
    lr = 0.05 * (math.cos(epoch * (math.pi / 120)) + 1)
    """
    global_step = _decay_step_counter()

    with init_on_cpu():
        epoch = ops.floor(global_step / step_each_epoch)
        decayed_lr = learning_rate * \
                     (ops.cos(epoch * (math.pi / epochs)) + 1)/2
    return decayed_lr
Example #18
def cosine_warmup_decay(learning_rate, betas, warmup_factor, decay_factor,
                        total_step, warmup_pct):
    def annealing_cos(start, end, pct):
        "Cosine anneal from `start` to `end` as pct goes from 0.0 to 1.0."
        cos_out = fluid.layers.cos(pct * np.pi) + 1.
        return cos_out * (start - end) / 2. + end

    warmup_start_lr = learning_rate * warmup_factor
    decay_end_lr = learning_rate * decay_factor
    warmup_step = total_step * warmup_pct

    global_step = lr_scheduler._decay_step_counter()

    lr = fluid.layers.create_global_var(
        shape=[1],
        value=float(learning_rate),
        dtype='float32',
        persistable=True,
        name="learning_rate")
    beta1 = fluid.layers.create_global_var(
        shape=[1],
        value=float(betas[0]),
        dtype='float32',
        persistable=True,
        name="beta1")

    warmup_step_var = fluid.layers.fill_constant(
        shape=[1], dtype='float32', value=float(warmup_step), force_cpu=True)

    with control_flow.Switch() as switch:
        with switch.case(global_step < warmup_step_var):
            cur_lr = annealing_cos(warmup_start_lr, learning_rate,
                                   global_step / warmup_step_var)
            fluid.layers.assign(cur_lr, lr)
            cur_beta1 = annealing_cos(betas[0], betas[1],
                                   global_step / warmup_step_var)
            fluid.layers.assign(cur_beta1, beta1)
        with switch.case(global_step >= warmup_step_var):
            cur_lr = annealing_cos(learning_rate, decay_end_lr,
                                   (global_step - warmup_step_var) / (total_step - warmup_step))
            fluid.layers.assign(cur_lr, lr)
            cur_beta1 = annealing_cos(betas[1], betas[0],
                                   (global_step - warmup_step_var) / (total_step - warmup_step))
            fluid.layers.assign(cur_beta1, beta1)

    return lr, beta1
Example #19
def exponential_with_warmup_decay(learning_rate, boundaries, values,
                                  warmup_iter, warmup_factor):
    global_step = lr_scheduler._decay_step_counter()

    lr = fluid.layers.create_global_var(shape=[1],
                                        value=0.0,
                                        dtype='float32',
                                        persistable=True,
                                        name="learning_rate")

    warmup_iter_var = fluid.layers.fill_constant(shape=[1],
                                                 dtype='float32',
                                                 value=float(warmup_iter),
                                                 force_cpu=True)

    with control_flow.Switch() as switch:
        with switch.case(global_step < warmup_iter_var):
            alpha = global_step / warmup_iter_var
            factor = warmup_factor * (1 - alpha) + alpha
            decayed_lr = learning_rate * factor
            fluid.layers.assign(decayed_lr, lr)

        for i in range(len(boundaries)):
            boundary_val = fluid.layers.fill_constant(shape=[1],
                                                      dtype='float32',
                                                      value=float(
                                                          boundaries[i]),
                                                      force_cpu=True)
            value_var = fluid.layers.fill_constant(shape=[1],
                                                   dtype='float32',
                                                   value=float(values[i]))
            with switch.case(global_step < boundary_val):
                fluid.layers.assign(value_var, lr)

        last_value_var = fluid.layers.fill_constant(
            shape=[1], dtype='float32', value=float(values[len(values) - 1]))
        with switch.default():
            fluid.layers.assign(last_value_var, lr)

    return lr
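The warmup branch blends warmup_factor into 1 linearly; afterwards the rate is piecewise constant, using values[i] while the step is below boundaries[i] and values[-1] once all boundaries are passed. A plain-Python sketch (hypothetical):

def piecewise_with_warmup(base_lr, boundaries, values, warmup_iter, warmup_factor, step):
    if step < warmup_iter:
        alpha = step / warmup_iter
        return base_lr * (warmup_factor * (1 - alpha) + alpha)
    for boundary, value in zip(boundaries, values):
        if step < boundary:
            return value
    return values[-1]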
Example #20
def cosine_decay_v2_with_warmup(learning_rate, warmupsteps, totalsteps):
    """Applies cosine decay to the learning rate.
    lr = 0.05 * (math.cos(epoch * (math.pi / 120)) + 1)
    decrease lr for every mini-batch and start with warmup.
    """
    global_step = _decay_step_counter()
    lr = fluid.layers.tensor.create_global_var(shape=[1],
                                               value=0.0,
                                               dtype='float32',
                                               persistable=True,
                                               name="learning_rate")

    with init_on_cpu():
        with control_flow.Switch() as switch:
            with switch.case(global_step < warmupsteps):
                decayed_lr = learning_rate * (global_step / float(warmupsteps))
                fluid.layers.tensor.assign(input=decayed_lr, output=lr)
            with switch.default():
                decayed_lr = learning_rate * \
                     (ops.cos((global_step - warmupsteps) * (math.pi / (totalsteps))) + 1)/2
                fluid.layers.tensor.assign(input=decayed_lr, output=lr)
    return lr
Example #21
    def __call__(self, base_lr=None, learning_rate=None):
        steps = _decay_step_counter()
        total = self.total_steps
        if self.skip_steps is not None:
            total -= self.skip_steps

        lr = fluid.layers.tensor.create_global_var(shape=[1],
                                                   value=base_lr,
                                                   dtype='float32',
                                                   persistable=True,
                                                   name="learning_rate")

        def decay():
            cos_lr = base_lr * .5 * (cos(steps * (math.pi / total)) + 1)
            fluid.layers.tensor.assign(input=cos_lr, output=lr)

        if self.skip_steps is None:
            decay()
        else:
            skipped = steps >= self.skip_steps
            fluid.layers.cond(skipped, decay)
        return lr
Example #22
def linear_warmup_and_invsqrt_decay(learning_rate, warmup_steps, decay_steps):
    """Applies linear warmup and invsqrt decay to the learning rate."""
    dtype = "float32"

    with fluid.default_main_program()._lr_schedule_guard():
        lr = layers.create_global_var(shape=[1],
                                      value=0.0,
                                      dtype=dtype,
                                      persistable=True,
                                      name="learning_rate")

        global_step = _decay_step_counter(1)

        with layers.control_flow.Switch() as switch:
            with switch.case(global_step < warmup_steps):
                warmup_lr = learning_rate * (global_step / warmup_steps)
                layers.assign(warmup_lr, lr)
            with switch.default():
                decayed_lr = learning_rate * ops.sqrt(
                    decay_steps / (global_step - warmup_steps + decay_steps))
                layers.assign(decayed_lr, lr)
        return lr
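The default branch scales the peak rate by sqrt(decay_steps / (step - warmup_steps + decay_steps)), which equals 1 exactly when warmup ends and then falls off as an inverse square root. A plain-Python sketch (hypothetical):

import math

def warmup_then_invsqrt(base_lr, warmup_steps, decay_steps, step):
    if step < warmup_steps:
        return base_lr * step / warmup_steps
    return base_lr * math.sqrt(decay_steps / (step - warmup_steps + decay_steps))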
Example #23
def cosine_decay_with_warmup(learning_rate,
                             step_each_epoch,
                             epochs=500,
                             warmup_minibatch=1000):
    """
    Applies cosine decay to the learning rate.
    lr = 0.05 * (math.cos(epoch * (math.pi / 120)) + 1)
    decrease lr for every mini-batch and start with warmup.
    args:
        learning_rate(float): initial learning rate
        step_each_epoch (int): number of step for each epoch in training process
        epochs(int): number of training epochs
        warmup_minibatch(int): number of minibatch for warmup
    return:
        lr(tensor): learning rate tensor
    """
    global_step = _decay_step_counter()
    lr = fluid.layers.tensor.create_global_var(shape=[1],
                                               value=0.0,
                                               dtype='float32',
                                               persistable=True,
                                               name="learning_rate")

    warmup_minibatch = fluid.layers.fill_constant(
        shape=[1],
        dtype='float32',
        value=float(warmup_minibatch),
        force_cpu=True)

    with fluid.layers.control_flow.Switch() as switch:
        with switch.case(global_step < warmup_minibatch):
            decayed_lr = learning_rate * (1.0 * global_step / warmup_minibatch)
            fluid.layers.tensor.assign(input=decayed_lr, output=lr)
        with switch.default():
            decayed_lr = learning_rate * \
                (ops.cos((global_step - warmup_minibatch) * (math.pi / (epochs * step_each_epoch))) + 1)/2
            fluid.layers.tensor.assign(input=decayed_lr, output=lr)
    return lr
Example #24
def main():
    env = os.environ
    FLAGS.dist = 'PADDLE_TRAINER_ID' in env and 'PADDLE_TRAINERS_NUM' in env
    if FLAGS.dist:
        trainer_id = int(env['PADDLE_TRAINER_ID'])
        local_seed = (99 + trainer_id)
        random.seed(local_seed)
        np.random.seed(local_seed)

    if FLAGS.enable_ce:
        random.seed(0)
        np.random.seed(0)

    cfg = load_config(FLAGS.config)
    merge_config(FLAGS.opt)
    check_config(cfg)
    # check if set use_gpu=True in paddlepaddle cpu version
    check_gpu(cfg.use_gpu)
    # check if paddlepaddle version is satisfied
    check_version()

    save_only = getattr(cfg, 'save_prediction_only', False)
    if save_only:
        raise NotImplementedError('The config file only supports prediction; '
                                  'the training stage is not implemented now')
    main_arch = cfg.architecture

    if cfg.use_gpu:
        devices_num = fluid.core.get_cuda_device_count()
    else:
        devices_num = int(os.environ.get('CPU_NUM', 1))

    if 'FLAGS_selected_gpus' in env:
        device_id = int(env['FLAGS_selected_gpus'])
    else:
        device_id = 0
    place = fluid.CUDAPlace(device_id) if cfg.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    lr_builder = create('LearningRate')
    optim_builder = create('OptimizerBuilder')

    # build program
    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    if FLAGS.enable_ce:
        startup_prog.random_seed = 1000
        train_prog.random_seed = 1000
    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            model = create(main_arch)
            if FLAGS.fp16:
                assert (getattr(model.backbone, 'norm_type', None)
                        != 'affine_channel'), \
                    '--fp16 currently does not support affine channel, ' \
                    ' please modify backbone settings to use batch norm'

            with mixed_precision_context(FLAGS.loss_scale, FLAGS.fp16) as ctx:
                inputs_def = cfg['TrainReader']['inputs_def']
                feed_vars, train_loader = model.build_inputs(**inputs_def)
                train_fetches = model.train(feed_vars)
                loss = train_fetches['loss']
                if FLAGS.fp16:
                    loss *= ctx.get_loss_scale_var()
                lr = lr_builder()
                optimizer = optim_builder(lr)
                optimizer.minimize(loss)

                if FLAGS.fp16:
                    loss /= ctx.get_loss_scale_var()

            if 'use_ema' in cfg and cfg['use_ema']:
                global_steps = _decay_step_counter()
                ema = ExponentialMovingAverage(
                    cfg['ema_decay'], thres_steps=global_steps)
                ema.update()

    # parse train fetches
    train_keys, train_values, _ = parse_fetches(train_fetches)
    train_values.append(lr)

    if FLAGS.eval:
        eval_prog = fluid.Program()
        with fluid.program_guard(eval_prog, startup_prog):
            with fluid.unique_name.guard():
                model = create(main_arch)
                inputs_def = cfg['EvalReader']['inputs_def']
                feed_vars, eval_loader = model.build_inputs(**inputs_def)
                fetches = model.eval(feed_vars)
        eval_prog = eval_prog.clone(True)

        eval_reader = create_reader(cfg.EvalReader, devices_num=1)
        eval_loader.set_sample_list_generator(eval_reader, place)

        # parse eval fetches
        extra_keys = []
        if cfg.metric == 'COCO':
            extra_keys = ['im_info', 'im_id', 'im_shape']
        if cfg.metric == 'VOC':
            extra_keys = ['gt_bbox', 'gt_class', 'is_difficult']
        if cfg.metric == 'WIDERFACE':
            extra_keys = ['im_id', 'im_shape', 'gt_bbox']
        eval_keys, eval_values, eval_cls = parse_fetches(fetches, eval_prog,
                                                         extra_keys)

    # compile program for multi-devices
    build_strategy = fluid.BuildStrategy()
    build_strategy.fuse_all_optimizer_ops = False
    # only enable sync_bn in multi GPU devices
    sync_bn = getattr(model.backbone, 'norm_type', None) == 'sync_bn'
    build_strategy.sync_batch_norm = sync_bn and devices_num > 1 \
        and cfg.use_gpu

    exec_strategy = fluid.ExecutionStrategy()
    # iteration number when CompiledProgram tries to drop local execution scopes.
    # Set it to be 1 to save memory usages, so that unused variables in
    # local execution scopes can be deleted after each iteration.
    exec_strategy.num_iteration_per_drop_scope = 1
    if FLAGS.dist:
        dist_utils.prepare_for_multi_process(exe, build_strategy, startup_prog,
                                             train_prog)
        exec_strategy.num_threads = 1

    exe.run(startup_prog)
    compiled_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel(
        loss_name=loss.name,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)

    if FLAGS.eval:
        compiled_eval_prog = fluid.CompiledProgram(eval_prog)

    fuse_bn = getattr(model.backbone, 'norm_type', None) == 'affine_channel'

    ignore_params = cfg.finetune_exclude_pretrained_params \
                 if 'finetune_exclude_pretrained_params' in cfg else []

    start_iter = 0
    if FLAGS.resume_checkpoint:
        checkpoint.load_checkpoint(exe, train_prog, FLAGS.resume_checkpoint)
        start_iter = checkpoint.global_step()
    elif cfg.pretrain_weights and fuse_bn and not ignore_params:
        checkpoint.load_and_fusebn(exe, train_prog, cfg.pretrain_weights)
    elif cfg.pretrain_weights:
        checkpoint.load_params(
            exe, train_prog, cfg.pretrain_weights, ignore_params=ignore_params)

    train_reader = create_reader(
        cfg.TrainReader, (cfg.max_iters - start_iter) * devices_num,
        cfg,
        devices_num=devices_num)
    train_loader.set_sample_list_generator(train_reader, place)

    # whether output bbox is normalized in model output layer
    is_bbox_normalized = False
    if hasattr(model, 'is_bbox_normalized') and \
            callable(model.is_bbox_normalized):
        is_bbox_normalized = model.is_bbox_normalized()

    # if map_type not set, use default 11point, only use in VOC eval
    map_type = cfg.map_type if 'map_type' in cfg else '11point'

    train_stats = TrainingStats(cfg.log_smooth_window, train_keys)
    train_loader.start()
    start_time = time.time()
    end_time = time.time()

    cfg_name = os.path.basename(FLAGS.config).split('.')[0]
    save_dir = os.path.join(cfg.save_dir, cfg_name)
    time_stat = deque(maxlen=cfg.log_smooth_window)
    best_box_ap_list = [0.0, 0]  #[map, iter]

    # use VisualDL to log data
    if FLAGS.use_vdl:
        from visualdl import LogWriter
        vdl_writer = LogWriter(FLAGS.vdl_log_dir)
        vdl_loss_step = 0
        vdl_mAP_step = 0

    for it in range(start_iter, cfg.max_iters):
        start_time = end_time
        end_time = time.time()
        time_stat.append(end_time - start_time)
        time_cost = np.mean(time_stat)
        eta_sec = (cfg.max_iters - it) * time_cost
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        outs = exe.run(compiled_train_prog, fetch_list=train_values)
        stats = {k: np.array(v).mean() for k, v in zip(train_keys, outs[:-1])}

        # use vdl-paddle to log loss
        if FLAGS.use_vdl:
            if it % cfg.log_iter == 0:
                for loss_name, loss_value in stats.items():
                    vdl_writer.add_scalar(loss_name, loss_value, vdl_loss_step)
                vdl_loss_step += 1

        train_stats.update(stats)
        logs = train_stats.log()
        if it % cfg.log_iter == 0 and (not FLAGS.dist or trainer_id == 0):
            strs = 'iter: {}, lr: {:.6f}, {}, time: {:.3f}, eta: {}'.format(
                it, np.mean(outs[-1]), logs, time_cost, eta)
            logger.info(strs)

        # NOTE : profiler tools, used for benchmark
        if FLAGS.is_profiler and it == 5:
            profiler.start_profiler("All")
        elif FLAGS.is_profiler and it == 10:
            profiler.stop_profiler("total", FLAGS.profiler_path)
            return


        if (it > 0 and it % cfg.snapshot_iter == 0 or it == cfg.max_iters - 1) \
           and (not FLAGS.dist or trainer_id == 0):
            save_name = str(it) if it != cfg.max_iters - 1 else "model_final"
            if 'use_ema' in cfg and cfg['use_ema']:
                exe.run(ema.apply_program)
            checkpoint.save(exe, train_prog, os.path.join(save_dir, save_name))

            if FLAGS.eval:
                # evaluation
                resolution = None
                if 'Mask' in cfg.architecture:
                    resolution = model.mask_head.resolution
                results = eval_run(
                    exe,
                    compiled_eval_prog,
                    eval_loader,
                    eval_keys,
                    eval_values,
                    eval_cls,
                    cfg,
                    resolution=resolution)
                box_ap_stats = eval_results(
                    results, cfg.metric, cfg.num_classes, resolution,
                    is_bbox_normalized, FLAGS.output_eval, map_type,
                    cfg['EvalReader']['dataset'])

                # use vdl_paddle to log mAP
                if FLAGS.use_vdl:
                    vdl_writer.add_scalar("mAP", box_ap_stats[0], vdl_mAP_step)
                    vdl_mAP_step += 1

                if box_ap_stats[0] > best_box_ap_list[0]:
                    best_box_ap_list[0] = box_ap_stats[0]
                    best_box_ap_list[1] = it
                    checkpoint.save(exe, train_prog,
                                    os.path.join(save_dir, "best_model"))
                logger.info("Best test box ap: {}, in iter: {}".format(
                    best_box_ap_list[0], best_box_ap_list[1]))

            if 'use_ema' in cfg and cfg['use_ema']:
                exe.run(ema.restore_program)

    train_loader.reset()
Example #25
def exponential_with_warmup_decay(learning_rate, boundaries, values,
                                  warmup_iter, warmup_factor):
    global_step = lr_scheduler._decay_step_counter()

    lr = fluid.layers.create_global_var(shape=[1],
                                        value=0.0,
                                        dtype='float32',
                                        persistable=True,
                                        name="learning_rate")
    warmup_iter_var = fluid.layers.fill_constant(shape=[1],
                                                 dtype='float32',
                                                 value=float(warmup_iter),
                                                 force_cpu=True)

    with control_flow.Switch() as switch:
        with switch.case(global_step < warmup_iter_var):
            alpha = global_step / warmup_iter_var
            factor = warmup_factor * (1 - alpha) + alpha
            decayed_lr = learning_rate * factor
            fluid.layers.assign(decayed_lr, lr)

        ex_decay_lr = fluid.layers.exponential_decay(
            learning_rate=learning_rate,
            decay_steps=10000,
            decay_rate=0.94,
            staircase=True)
        with switch.default():
            fluid.layers.assign(ex_decay_lr, lr)

    return lr
Example #26
def poly_decay():
    global_step = _decay_step_counter()
    with init_on_cpu():
        decayed_lr = LEARNING_RATE * (fluid.layers.pow(
            (1 - global_step / TOTAL_STEP), POWER))
    return decayed_lr
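Poly decay is a simple power curve over the whole run; a plain-Python equivalent (hypothetical, with the module-level constants passed in explicitly):

def poly_decay_value(base_lr, total_step, power, step):
    return base_lr * (1.0 - step / total_step) ** power

print(poly_decay_value(0.01, 100000, 0.9, 50000))  # ~0.0054 just past mid-training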
Example #27
    def cosine_annealing(self):
        step = _decay_step_counter()
        lr = FLAGS.lr_min + (FLAGS.lr_max - FLAGS.lr_min) / 2 \
             * (1.0 + fluid.layers.ops.cos(step / self.max_step * math.pi))
        return lr
Example #28
    def scheduler_handler(self, max_train_steps):
        scheduled_lr = fluid.layers.create_global_var(shape=[1],
                                                      value=self.learning_rate,
                                                      dtype='float32',
                                                      persistable=True,
                                                      name="learning_rate")

        if not self.scheduler["slanted_triangle"]["cut_fraction"]:
            warmup_steps = int(max_train_steps * self.scheduler["warmup"])
            linear_decay_start = int(
                max_train_steps *
                self.scheduler["linear_decay"]["start_point"])
            if linear_decay_start < warmup_steps:
                logger.warning(
                    "linear decay cannot start during the warmup process; "
                    "it will start after warmup ends!")
                linear_decay_start = warmup_steps
            if self.scheduler["noam_decay"]:
                if warmup_steps > 0:
                    scheduled_lr = fluid.layers.learning_rate_scheduler \
                        .noam_decay(1 / (warmup_steps * (self.learning_rate ** 2)),
                                    warmup_steps)
                else:
                    logger.warning(
                        "Noam decay learning rate scheduler should have positive "
                        "warmup steps, using constant learning rate instead!")

            if not self.scheduler["noam_decay"] and \
                    (warmup_steps > 0 or self.scheduler["linear_decay"]["start_point"]<1):
                with self.main_program._lr_schedule_guard():
                    global_step = lr_scheduler._decay_step_counter()
                    with control_flow.Switch() as switch:
                        if warmup_steps > 0:
                            with switch.case(global_step < warmup_steps):
                                decayed_lr = self.learning_rate * global_step * 1.0 / warmup_steps
                                fluid.layers.assign(decayed_lr, scheduled_lr)
                        if self.scheduler["linear_decay"]["start_point"] < 1:
                            with switch.case(
                                    global_step >= linear_decay_start):
                                decayed_lr = lr_scheduler.polynomial_decay(
                                    learning_rate=self.learning_rate,
                                    decay_steps=max_train_steps,
                                    end_learning_rate=self.scheduler[
                                        "linear_decay"]["end_learning_rate"],
                                    power=1.0,
                                    cycle=False)
                                fluid.layers.assign(decayed_lr, scheduled_lr)
        else:
            if self.scheduler["warmup"] or self.scheduler[
                    "noam_decay"] or self.scheduler["linear_decay"][
                        "start_point"] < 1:
                logger.warning(
                    "You are using the slanted_triangle learning rate, "
                    "which disables warmup, noam_decay and linear_decay!")
            cut_step = int(max_train_steps *
                           self.scheduler["slanted_triangle"]["cut_fraction"])
            ratio = self.scheduler["slanted_triangle"]["ratio"]
            global_step = lr_scheduler._decay_step_counter()
            with control_flow.Switch() as switch:
                with switch.case(global_step <= cut_step):
                    pct = global_step / cut_step
                    decayed_lr = self.learning_rate * (1 + pct *
                                                       (ratio - 1)) / ratio
                    fluid.layers.assign(decayed_lr, scheduled_lr)
                with switch.default():
                    pct = 1 - (global_step - cut_step) / (max_train_steps -
                                                          cut_step)
                    decayed_lr = self.learning_rate * (1 + pct *
                                                       (ratio - 1)) / ratio
                    fluid.layers.assign(decayed_lr, scheduled_lr)

        super(CombinedStrategy,
              self).__init__(optimizer_name=self._optimizer_name,
                             learning_rate=scheduled_lr)

        if self.scheduler["discriminative"]["blocks"]:
            _block_layers = math.ceil(
                len(self.sorted_depth) /
                self.scheduler["discriminative"]["blocks"])
            power = 0
            for cnt, depth in enumerate(self.sorted_depth):
                for index, param in enumerate(self.depth_params_dict[depth]):
                    param.optimize_attr["learning_rate"] *= \
                        pow(1.0 / self.scheduler["discriminative"]["factor"], power)
                if cnt and cnt % _block_layers == 0:
                    power += 1
        return scheduled_lr
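The slanted_triangle branch implements the ULMFiT-style slanted triangular schedule: the rate climbs linearly to its peak over the first cut_fraction of training, then falls linearly, and ratio sets how small the starting and final rates are relative to the peak. A plain-Python sketch (hypothetical):

def slanted_triangle_lr(base_lr, max_train_steps, cut_fraction, ratio, step):
    cut = int(max_train_steps * cut_fraction)
    if step <= cut:
        pct = step / cut
    else:
        pct = 1 - (step - cut) / (max_train_steps - cut)
    # peak of base_lr at the cut point, base_lr / ratio at the start and the end
    return base_lr * (1 + pct * (ratio - 1)) / ratio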
Example #29
def poly_decay():
    global_step = _decay_step_counter()
Example #30
            # Build the loss function
            y_true = P.data(name='y_true',
                            shape=[-1, 8, 28, 28],
                            append_batch_size=False,
                            dtype='float32')
            # First square the element-wise differences; use the P.pow() op or Python's ** operator.
            mseloss = P.pow(y_true - act02_out_tensor, 2)
            mseloss = P.reduce_mean(mseloss)  # then take the mean to get the MSE loss

            # Optimizer
            optimizer = fluid.optimizer.SGD(learning_rate=lr)
            optimizer.minimize(mseloss)

            # ema
            global_steps = _decay_step_counter()
            ema = ExponentialMovingAverage(ema_decay, thres_steps=global_steps)
            ema.update()

    eval_prog = fluid.Program()
    with fluid.program_guard(eval_prog, startup_prog):
        with fluid.unique_name.guard():
            # Rebuild the network with the same tensor names; the loss layers are not needed here
            inputs = P.data(name='input_1',
                            shape=[-1, 3, 28, 28],
                            append_batch_size=False,
                            dtype='float32')
            conv01_out_tensor = fluid.layers.conv2d(
                input=inputs,
                num_filters=8,
                filter_size=1,