Example #1
import paddle.fluid as fluid
from paddle.fluid.layers import control_flow
from paddle.fluid.layers import learning_rate_scheduler as lr_scheduler


def linear_warmup_decay(init_lr, num_train_steps, num_warmup_steps,
                        main_program):
    with main_program._lr_schedule_guard():
        global_step = lr_scheduler._decay_step_counter()

        lr = fluid.layers.create_global_var(shape=[1],
                                            value=init_lr,
                                            dtype='float32',
                                            persistable=True,
                                            name="learning_rate")

        with control_flow.Switch() as switch:
            with switch.case(global_step < num_warmup_steps):
                decayed_lr = init_lr * global_step * 1.0 / num_warmup_steps
                fluid.layers.assign(decayed_lr, lr)
            with switch.default():
                decayed_lr = lr_scheduler.polynomial_decay(
                    learning_rate=init_lr,
                    decay_steps=num_train_steps,
                    end_learning_rate=0.0,
                    power=1.0,
                    cycle=False)
                fluid.layers.assign(decayed_lr, lr)

        return lr
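
The returned lr is a persistable Variable that the program keeps updating, so it can be passed straight to a fluid optimizer as its learning_rate. The short sketch below shows this wiring; the step counts and the loss Variable are illustrative assumptions, not part of the example above.

# Hedged usage sketch (paddle.fluid 1.x assumed); `loss` is a model loss
# Variable that is assumed to have been built beforehand.
scheduled_lr = linear_warmup_decay(init_lr=5e-5,
                                   num_train_steps=10000,
                                   num_warmup_steps=1000,
                                   main_program=fluid.default_main_program())
optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr)
optimizer.minimize(loss)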
Example #2
import math

import numpy as np
import paddle.fluid as fluid
from paddle.fluid.layers import control_flow, ops
from paddle.fluid.layers import learning_rate_scheduler as lr_scheduler


def cos_anneal_with_warmup_decay(learning_rate, boundaries, values,
                                 warmup_iter, warmup_factor):
    global_step = lr_scheduler._decay_step_counter()

    lr = fluid.layers.create_global_var(shape=[1],
                                        value=0.0,
                                        dtype='float32',
                                        persistable=True,
                                        name="learning_rate")

    warmup_iter_var = fluid.layers.fill_constant(shape=[1],
                                                 dtype='float32',
                                                 value=float(warmup_iter),
                                                 force_cpu=True)
    # Step at which each cosine segment starts, and the length of each segment.
    cos_step = [warmup_iter] + boundaries[:-1]
    cos_boundaries = (np.array(boundaries) - np.array(cos_step)).tolist()

    with control_flow.Switch() as switch:
        with switch.case(global_step < warmup_iter_var):
            alpha = global_step / warmup_iter_var
            factor = warmup_factor * (1 - alpha) + alpha
            decayed_lr = learning_rate * factor
            fluid.layers.assign(decayed_lr, lr)

        for i in range(len(boundaries)):
            boundary_val = fluid.layers.fill_constant(shape=[1],
                                                      dtype='float32',
                                                      value=float(
                                                          boundaries[i]),
                                                      force_cpu=True)
            with switch.case(global_step < boundary_val):
                cur_epoch_factor = (global_step -
                                    cos_step[i]) / cos_boundaries[i]
                cur_epoch_cos_factor = ops.cos(cur_epoch_factor * math.pi /
                                               2.0)
                cur_lr = cur_epoch_cos_factor * values[i]

                fluid.layers.assign(cur_lr, lr)

        last_value_var = fluid.layers.fill_constant(
            shape=[1], dtype='float32', value=float(values[len(values) - 1]))
        with switch.default():
            fluid.layers.assign(last_value_var, lr)

    return lr
Example #3
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.layers import control_flow
from paddle.fluid.layers import learning_rate_scheduler as lr_scheduler


def cosine_warmup_decay(learning_rate, betas, warmup_factor, decay_factor,
                        total_step, warmup_pct):
    def annealing_cos(start, end, pct):
        "Cosine anneal from `start` to `end` as pct goes from 0.0 to 1.0."
        cos_out = fluid.layers.cos(pct * np.pi) + 1.
        return cos_out * (start - end) / 2. + end

    warmup_start_lr = learning_rate * warmup_factor
    decay_end_lr = learning_rate * decay_factor
    warmup_step = total_step * warmup_pct

    global_step = lr_scheduler._decay_step_counter()

    lr = fluid.layers.create_global_var(
        shape=[1],
        value=float(learning_rate),
        dtype='float32',
        persistable=True,
        name="learning_rate")
    beta1 = fluid.layers.create_global_var(
        shape=[1],
        value=float(betas[0]),
        dtype='float32',
        persistable=True,
        name="beta1")

    warmup_step_var = fluid.layers.fill_constant(
        shape=[1], dtype='float32', value=float(warmup_step), force_cpu=True)

    with control_flow.Switch() as switch:
        with switch.case(global_step < warmup_step_var):
            cur_lr = annealing_cos(warmup_start_lr, learning_rate,
                                   global_step / warmup_step_var)
            fluid.layers.assign(cur_lr, lr)
            cur_beta1 = annealing_cos(betas[0], betas[1],
                                      global_step / warmup_step_var)
            fluid.layers.assign(cur_beta1, beta1)
        with switch.case(global_step >= warmup_step_var):
            decay_pct = (global_step - warmup_step_var) / \
                        (total_step - warmup_step)
            cur_lr = annealing_cos(learning_rate, decay_end_lr, decay_pct)
            fluid.layers.assign(cur_lr, lr)
            cur_beta1 = annealing_cos(betas[1], betas[0], decay_pct)
            fluid.layers.assign(cur_beta1, beta1)

    return lr, beta1
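
Because this schedule returns both the learning rate and a beta1 Variable, the intent is to hand both to the optimizer. The sketch below is an assumption about how they would be consumed; passing a Variable for beta1 requires a Paddle 1.x release whose Adam optimizer accepts one, otherwise a constant float has to be used instead.

# Hedged usage sketch; argument values are illustrative only.
lr, beta1 = cosine_warmup_decay(learning_rate=1e-3, betas=(0.95, 0.85),
                                warmup_factor=0.1, decay_factor=0.01,
                                total_step=20000, warmup_pct=0.1)
# Variable beta1 support in Adam depends on the installed Paddle version
# (assumption); fall back to a float such as 0.9 if it is not available.
optimizer = fluid.optimizer.Adam(learning_rate=lr, beta1=beta1)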
Example #4
import paddle.fluid as fluid
from paddle.fluid.layers import control_flow
from paddle.fluid.layers import learning_rate_scheduler as lr_scheduler


def exponential_with_warmup_decay(learning_rate, boundaries, values,
                                  warmup_iter, warmup_factor):
    global_step = lr_scheduler._decay_step_counter()

    lr = fluid.layers.create_global_var(shape=[1],
                                        value=0.0,
                                        dtype='float32',
                                        persistable=True,
                                        name="learning_rate")

    warmup_iter_var = fluid.layers.fill_constant(shape=[1],
                                                 dtype='float32',
                                                 value=float(warmup_iter),
                                                 force_cpu=True)

    with control_flow.Switch() as switch:
        with switch.case(global_step < warmup_iter_var):
            alpha = global_step / warmup_iter_var
            factor = warmup_factor * (1 - alpha) + alpha
            decayed_lr = learning_rate * factor
            fluid.layers.assign(decayed_lr, lr)

        for i in range(len(boundaries)):
            boundary_val = fluid.layers.fill_constant(shape=[1],
                                                      dtype='float32',
                                                      value=float(
                                                          boundaries[i]),
                                                      force_cpu=True)
            value_var = fluid.layers.fill_constant(shape=[1],
                                                   dtype='float32',
                                                   value=float(values[i]))
            with switch.case(global_step < boundary_val):
                fluid.layers.assign(value_var, lr)

        last_value_var = fluid.layers.fill_constant(
            shape=[1], dtype='float32', value=float(values[len(values) - 1]))
        with switch.default():
            fluid.layers.assign(last_value_var, lr)

    return lr
Example #5
import math

import paddle.fluid as fluid
from paddle.fluid.initializer import init_on_cpu
from paddle.fluid.layers import control_flow, ops
from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter


def cosine_decay_v2_with_warmup(learning_rate, warmupsteps, totalsteps):
    """Applies cosine decay to the learning rate.
    lr = 0.05 * (math.cos(epoch * (math.pi / 120)) + 1)
    decrease lr for every mini-batch and start with warmup.
    """
    global_step = _decay_step_counter()
    lr = fluid.layers.tensor.create_global_var(shape=[1],
                                               value=0.0,
                                               dtype='float32',
                                               persistable=True,
                                               name="learning_rate")

    with init_on_cpu():
        with control_flow.Switch() as switch:
            with switch.case(global_step < warmupsteps):
                decayed_lr = learning_rate * (global_step / float(warmupsteps))
                fluid.layers.tensor.assign(input=decayed_lr, output=lr)
            with switch.default():
                decayed_lr = learning_rate * \
                     (ops.cos((global_step - warmupsteps) * (math.pi / (totalsteps))) + 1)/2
                fluid.layers.tensor.assign(input=decayed_lr, output=lr)
    return lr
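
For a quick sanity check of the schedule above, the same rule can be written in plain Python; the step values below are illustrative assumptions.

import math

def cosine_decay_v2_with_warmup_py(step, base_lr, warmupsteps, totalsteps):
    # Pure-Python mirror of the Switch logic above, for checking values only.
    if step < warmupsteps:
        return base_lr * step / float(warmupsteps)
    return base_lr * (math.cos((step - warmupsteps) * math.pi / totalsteps) + 1) / 2

# With base_lr=0.05, warmupsteps=500, totalsteps=12000:
#   step   250 -> 0.025   (half-way through warmup)
#   step   500 -> 0.05    (cosine branch starts at its peak)
#   step 12500 -> 0.0     (end of the cosine half-period)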
Example #6
import math

from paddle.fluid.framework import default_main_program
from paddle.fluid.layers import control_flow, ops, tensor
from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter


def lr_warmup(learning_rate, warmup_steps, total_step, multiplier,
              step_each_epoch):
    with default_main_program()._lr_schedule_guard():
        lr = tensor.create_global_var(shape=[1],
                                      value=0.0,
                                      dtype='float32',
                                      persistable=True,
                                      name='learning_rate_warmup')
        global_step = _decay_step_counter()

        with control_flow.Switch() as switch:
            with switch.case(global_step <= warmup_steps):
                decay_lr = learning_rate * (
                    (multiplier - 1.) * global_step / warmup_steps + 1.)
                tensor.assign(decay_lr, lr)
            with switch.default():
                learning_rate = learning_rate * multiplier
                #cur_epoch = ops.floor(global_step/step_each_epoch)
                decay_lr = learning_rate * 0.5 * (ops.cos(
                    (global_step - warmup_steps) * math.pi / (total_step)) + 1)
                tensor.assign(decay_lr, lr)

    return lr
Example #7
    def scheduler_handler(self, max_train_steps):
        scheduled_lr = fluid.layers.create_global_var(shape=[1],
                                                      value=self.learning_rate,
                                                      dtype='float32',
                                                      persistable=True,
                                                      name="learning_rate")

        if not self.scheduler["slanted_triangle"]["cut_fraction"]:
            warmup_steps = int(max_train_steps * self.scheduler["warmup"])
            linear_decay_start = int(
                max_train_steps *
                self.scheduler["linear_decay"]["start_point"])
            if linear_decay_start < warmup_steps:
                logger.warning(
                    "linear decay cannot start during the warmup process; "
                    "it will start after warmup ends!")
                linear_decay_start = warmup_steps
            if self.scheduler["noam_decay"]:
                if warmup_steps > 0:
                    scheduled_lr = fluid.layers.learning_rate_scheduler \
                        .noam_decay(1 / (warmup_steps * (self.learning_rate ** 2)),
                                    warmup_steps)
                else:
                    logger.warning(
                        "Noam decay learning rate scheduler should have "
                        "positive warmup steps; using constant learning "
                        "rate instead!")

            if not self.scheduler["noam_decay"] and \
                    (warmup_steps > 0 or self.scheduler["linear_decay"]["start_point"]<1):
                with self.main_program._lr_schedule_guard():
                    global_step = lr_scheduler._decay_step_counter()
                    with control_flow.Switch() as switch:
                        if warmup_steps > 0:
                            with switch.case(global_step < warmup_steps):
                                decayed_lr = self.learning_rate * global_step * 1.0 / warmup_steps
                                fluid.layers.assign(decayed_lr, scheduled_lr)
                        if self.scheduler["linear_decay"]["start_point"] < 1:
                            with switch.case(
                                    global_step >= linear_decay_start):
                                decayed_lr = lr_scheduler.polynomial_decay(
                                    learning_rate=self.learning_rate,
                                    decay_steps=max_train_steps,
                                    end_learning_rate=self.scheduler[
                                        "linear_decay"]["end_learning_rate"],
                                    power=1.0,
                                    cycle=False)
                                fluid.layers.assign(decayed_lr, scheduled_lr)
        else:
            if self.scheduler["warmup"] or self.scheduler[
                    "noam_decay"] or self.scheduler["linear_decay"][
                        "start_point"] < 1:
                logger.warning(
                    "You are using the slanted_triangle learning rate "
                    "schedule, which overrides warmup, noam_decay and "
                    "linear_decay; those settings will be ignored.")
            cut_step = int(max_train_steps *
                           self.scheduler["slanted_triangle"]["cut_fraction"])
            ratio = self.scheduler["slanted_triangle"]["ratio"]
            global_step = lr_scheduler._decay_step_counter()
            with control_flow.Switch() as switch:
                with switch.case(global_step <= cut_step):
                    pct = global_step / cut_step
                    decayed_lr = self.learning_rate * (1 + pct *
                                                       (ratio - 1)) / ratio
                    fluid.layers.assign(decayed_lr, scheduled_lr)
                with switch.default():
                    pct = 1 - (global_step - cut_step) / (max_train_steps -
                                                          cut_step)
                    decayed_lr = self.learning_rate * (1 + pct *
                                                       (ratio - 1)) / ratio
                    fluid.layers.assign(decayed_lr, scheduled_lr)

        super(CombinedStrategy,
              self).__init__(optimizer_name=self._optimizer_name,
                             learning_rate=scheduled_lr)

        if self.scheduler["discriminative"]["blocks"]:
            _block_layers = math.ceil(
                len(self.sorted_depth) /
                self.scheduler["discriminative"]["blocks"])
            power = 0
            for cnt, depth in enumerate(self.sorted_depth):
                for index, param in enumerate(self.depth_params_dict[depth]):
                    param.optimize_attr["learning_rate"] *= \
                        pow(1.0 / self.scheduler["discriminative"]["factor"], power)
                if cnt and cnt % _block_layers == 0:
                    power += 1
        return scheduled_lr
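
For reference, the slanted-triangle branch above reduces to the short pure-Python rule below; this is only a sketch for checking values, with illustrative names, not part of the strategy class itself.

def slanted_triangle_lr(step, max_lr, max_train_steps, cut_fraction, ratio):
    # Mirrors the Switch above: linear ramp up to cut_step, linear decay after.
    cut_step = int(max_train_steps * cut_fraction)
    if step <= cut_step:
        pct = step / float(cut_step)
    else:
        pct = 1 - (step - cut_step) / float(max_train_steps - cut_step)
    return max_lr * (1 + pct * (ratio - 1)) / ratio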
Example #8
import paddle.fluid as fluid
from paddle.fluid.layers import control_flow
from paddle.fluid.layers import learning_rate_scheduler as lr_scheduler


def exponential_with_warmup_decay(learning_rate, boundaries, values,
                                  warmup_iter, warmup_factor):
    global_step = lr_scheduler._decay_step_counter()

    lr = fluid.layers.create_global_var(shape=[1],
                                        value=0.0,
                                        dtype='float32',
                                        persistable=True,
                                        name="learning_rate")
    warmup_iter_var = fluid.layers.fill_constant(shape=[1],
                                                 dtype='float32',
                                                 value=float(warmup_iter),
                                                 force_cpu=True)

    with control_flow.Switch() as switch:
        with switch.case(global_step < warmup_iter_var):
            alpha = global_step / warmup_iter_var
            factor = warmup_factor * (1 - alpha) + alpha
            decayed_lr = learning_rate * factor
            fluid.layers.assign(decayed_lr, lr)

        # A piecewise/boundaries variant was disabled here in favor of the
        # exponential_decay branch below.
        ex_decay_lr = fluid.layers.exponential_decay(
            learning_rate=learning_rate,
            decay_steps=10000,
            decay_rate=0.94,
            staircase=True)
        with switch.default():
            fluid.layers.assign(ex_decay_lr, lr)

    return lr