Example 1
def cosine_decay():
    """Applies cosine decay to the learning rate."""
    global_step = _decay_step_counter()
    frac = (1 + ops.cos(global_step / max_step * math.pi)) / 2
    return FLAGS.lr_min + (FLAGS.lr_max - FLAGS.lr_min) * frac
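This and the following snippets build the learning-rate schedule as a tensor in PaddlePaddle's static-graph `fluid` API, so the returned value is meant to be handed to an optimizer rather than used as a Python float. As a minimal usage sketch (assuming the usual imports, and that `FLAGS.lr_min`, `FLAGS.lr_max` and `max_step` are defined by the surrounding project), it could be wired up like this:

import math
import paddle.fluid as fluid
from paddle.fluid.layers import ops
from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter

# Hypothetical wiring: pass the schedule tensor as the optimizer's learning rate.
decayed_lr = cosine_decay()
optimizer = fluid.optimizer.Momentum(learning_rate=decayed_lr, momentum=0.9)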
Example 2
def linear_warmup_and_cosine_decay(learning_rate, end_lr, warmup_steps,
                                   max_training_steps):
    """Applies linear warmup and cosine decay to the learning rate."""
    dtype = "float32"

    with fluid.default_main_program()._lr_schedule_guard():
        lr = layers.create_global_var(shape=[1],
                                      value=0.0,
                                      dtype=dtype,
                                      persistable=True,
                                      name="learning_rate")

        global_step = _decay_step_counter(1)

        with layers.control_flow.Switch() as switch:
            with switch.case(global_step < warmup_steps):
                warmup_lr = learning_rate * (global_step / warmup_steps)
                layers.assign(warmup_lr, lr)
            with switch.case(global_step < max_training_steps):
                frac = 0.5 * (ops.cos((global_step - warmup_steps) * math.pi /
                                      (max_training_steps - warmup_steps)) + 1)
                decayed_lr = end_lr + (learning_rate - end_lr) * frac
                layers.assign(decayed_lr, lr)
            with switch.default():
                learning_rate = layers.fill_constant(shape=[1],
                                                     dtype=dtype,
                                                     value=end_lr)
                layers.assign(learning_rate, lr)
        return lr
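In the fluid `Switch` construct used here (and in most of the examples below), the cases behave like an if/elif chain evaluated against the global step: the first case whose condition holds assigns into the `lr` variable, and the `default()` branch covers everything at or past `max_training_steps`, so `lr` always ends up holding exactly one of the three phases (linear warmup, cosine decay, or the floor value `end_lr`).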
Example 3
def cosine_with_warmup_decay(learning_rate, lr_min, steps_one_epoch,
                             warmup_epochs, total_epoch, num_gpu):
    global_step = _decay_step_counter()
    epoch_idx = fluid.layers.floor(global_step / steps_one_epoch)

    lr = fluid.layers.create_global_var(
        shape=[1],
        value=0.0,
        dtype='float32',
        persistable=True,
        name="learning_rate")

    warmup_epoch_var = fluid.layers.fill_constant(
        shape=[1], dtype='float32', value=float(warmup_epochs), force_cpu=True)
    num_gpu_var = fluid.layers.fill_constant(
        shape=[1], dtype='float32', value=float(num_gpu), force_cpu=True)
    batch_idx = global_step - steps_one_epoch * epoch_idx

    with fluid.layers.control_flow.Switch() as switch:
        with switch.case(epoch_idx < warmup_epoch_var):
            epoch_ = (batch_idx + 1) / steps_one_epoch
            factor = 1 / num_gpu_var * (
                epoch_ * (num_gpu_var - 1) / warmup_epoch_var + 1)
            decayed_lr = learning_rate * factor * num_gpu_var
            fluid.layers.assign(decayed_lr, lr)
        epoch_ = (batch_idx + 1) / steps_one_epoch
        m = epoch_ / total_epoch
        frac = (1 + ops.cos(math.pi * m)) / 2
        cosine_lr = (lr_min + (learning_rate - lr_min) * frac) * num_gpu_var
        with switch.default():
            fluid.layers.assign(cosine_lr, lr)

    return lr
Example 4
    def __call__(self):
        global_step = _decay_step_counter()
        learning_rate = fluid.layers.tensor.create_global_var(
            shape=[1],
            value=0.0,
            dtype='float32',
            persistable=True,
            name="learning_rate")
        epoch = ops.floor(global_step / self.step_each_epoch)
        with fluid.layers.control_flow.Switch() as switch:
            with switch.case(epoch < self.warmup_epoch):
                decayed_lr = self.lr * \
                        (global_step / (self.step_each_epoch * self.warmup_epoch))
                fluid.layers.tensor.assign(input=decayed_lr,
                                           output=learning_rate)
            with switch.default():
                current_step = global_step - self.warmup_epoch * self.step_each_epoch
                total_step = (self.epochs -
                              self.warmup_epoch) * self.step_each_epoch
                decayed_lr = self.lr * \
                    (ops.cos(current_step * math.pi / total_step) + 1) / 2
                fluid.layers.tensor.assign(input=decayed_lr,
                                           output=learning_rate)

        return learning_rate
Example 5
def cosine_decay_with_warmup(learning_rate,
                             max_iters=90000,
                             warmup_iters=1000,
                             warmup_factor=0.1):
    """Applies cosine decay to the learning rate.
    lr = 0.05 * (math.cos(epoch * (math.pi / 120)) + 1)
    decrease lr for every mini-batch and start with warmup.
    """
    global_step = _decay_step_counter()
    lr = fluid.layers.tensor.create_global_var(
        shape=[1],
        value=0.0,
        dtype='float32',
        persistable=True,
        name="learning_rate")

    with fluid.layers.control_flow.Switch() as switch:
        with switch.case(global_step < warmup_iters):
            eta_min = learning_rate * warmup_factor
            decayed_lr = eta_min + (learning_rate - eta_min) * (global_step / warmup_iters)
            fluid.layers.tensor.assign(input=decayed_lr, output=lr)
        with switch.default():
            decayed_lr = learning_rate * \
                (ops.cos((global_step - warmup_iters) * (math.pi / (max_iters - warmup_iters))) + 1)/2
            fluid.layers.tensor.assign(input=decayed_lr, output=lr)
    return lr
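The shape of this warmup-then-cosine schedule can be checked outside the fluid graph with plain Python. The sketch below only illustrates the same piecewise formula (it is not part of the example and uses hypothetical argument names):

import math

def lr_at_step(step, base_lr=0.1, max_iters=90000, warmup_iters=1000, warmup_factor=0.1):
    # Linear warmup from base_lr * warmup_factor up to base_lr, then cosine decay to 0.
    if step < warmup_iters:
        eta_min = base_lr * warmup_factor
        return eta_min + (base_lr - eta_min) * step / warmup_iters
    frac = (step - warmup_iters) / (max_iters - warmup_iters)
    return base_lr * (math.cos(frac * math.pi) + 1) / 2

# lr_at_step(0) -> 0.01, lr_at_step(1000) -> 0.1, lr_at_step(90000) -> 0.0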
Example 6
def cosine_decay_with_warmup(learning_rate, step_each_epoch, epochs=120):
    """Applies cosine decay to the learning rate.
    lr = 0.05 * (math.cos(epoch * (math.pi / 120)) + 1)
    decrease lr for every mini-batch and start with warmup.
    """
    global_step = _decay_step_counter()
    lr = fluid.layers.tensor.create_global_var(shape=[1],
                                               value=0.0,
                                               dtype='float32',
                                               persistable=True,
                                               name="learning_rate")

    warmup_epoch = fluid.layers.fill_constant(shape=[1],
                                              dtype='float32',
                                              value=float(5),
                                              force_cpu=True)

    with init_on_cpu():
        epoch = ops.floor(global_step / step_each_epoch)
        with fluid.layers.control_flow.Switch() as switch:
            with switch.case(epoch < warmup_epoch):
                decayed_lr = learning_rate * (global_step /
                                              (step_each_epoch * warmup_epoch))
                fluid.layers.tensor.assign(input=decayed_lr, output=lr)
            with switch.default():
                decayed_lr = learning_rate * \
                    (ops.cos((global_step - warmup_epoch * step_each_epoch) * (math.pi / (epochs * step_each_epoch))) + 1)/2
                fluid.layers.tensor.assign(input=decayed_lr, output=lr)
    return lr
Example 7
def cosine_decay(learning_rate, step_each_epoch, epochs=120):
    """Applies cosine decay to the learning rate.
    lr = 0.05 * (math.cos(epoch * (math.pi / 120)) + 1)
    """
    global_step = _decay_step_counter()
    epoch = ops.floor(global_step / step_each_epoch)
    decayed_lr = learning_rate * \
            (ops.cos(epoch * (math.pi / epochs)) + 1)/2
    return decayed_lr
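With learning_rate = 0.05 and epochs = 120, this matches the formula in the docstring: the rate starts at 0.05 in epoch 0 (cos 0 = 1), falls to 0.025 at epoch 60 (cos(π/2) = 0), and would reach 0 at epoch 120 (cos π = −1).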
Example 8
def cosine_decay(learning_rate, num_epoch, steps_one_epoch):
    """Applies cosine decay to the learning rate.
    lr = 0.5 * (math.cos(epoch * (math.pi / 120)) + 1)
    """
    global_step = _decay_step_counter()

    decayed_lr = learning_rate * \
             (ops.cos(fluid.layers.floor(global_step / steps_one_epoch) \
             * math.pi / num_epoch) + 1)/2
    return decayed_lr
Example 9
def cosine_decay_v2(learning_rate, totalsteps):
    """Applies cosine decay to the learning rate.
    lr = 0.05 * (math.cos(global_step * (math.pi / totalsteps)) + 1)
    decrease lr for every mini-batch.
    """
    global_step = _decay_step_counter()

    with init_on_cpu():
        decayed_lr = learning_rate * \
                     (ops.cos(global_step * (math.pi / float(totalsteps))) + 1)/2
    return decayed_lr
Example 10
def cosine_decay(learning_rate, num_epoch, steps_one_epoch):
    """Applies cosine decay to the learning rate.
    lr = 0.5 * (math.cos(epoch * (math.pi / 120)) + 1)
    """
    global_step = _decay_step_counter()

    with init_on_cpu():
        decayed_lr = learning_rate * \
                 (ops.cos((global_step / steps_one_epoch) \
                 * math.pi / num_epoch) + 1)/2
    return decayed_lr
Example 11
def cos_anneal_with_warmup_decay(learning_rate, boundaries, values,
                                 warmup_iter, warmup_factor):
    global_step = lr_scheduler._decay_step_counter()

    lr = fluid.layers.create_global_var(shape=[1],
                                        value=0.0,
                                        dtype='float32',
                                        persistable=True,
                                        name="learning_rate")

    warmup_iter_var = fluid.layers.fill_constant(shape=[1],
                                                 dtype='float32',
                                                 value=float(warmup_iter),
                                                 force_cpu=True)
    cos_boundaries = [warmup_iter] + boundaries[:-1]
    cos_boundaries = np.array(boundaries) - np.array(cos_boundaries)
    cos_boundaries = cos_boundaries.tolist()
    cos_step = [warmup_iter] + boundaries[:-1]

    with control_flow.Switch() as switch:
        with switch.case(global_step < warmup_iter_var):
            alpha = global_step / warmup_iter_var
            factor = warmup_factor * (1 - alpha) + alpha
            decayed_lr = learning_rate * factor
            fluid.layers.assign(decayed_lr, lr)

        for i in range(len(boundaries)):
            boundary_val = fluid.layers.fill_constant(shape=[1],
                                                      dtype='float32',
                                                      value=float(
                                                          boundaries[i]),
                                                      force_cpu=True)
            #value_var = fluid.layers.fill_constant(
            #    shape=[1], dtype='float32', value=float(values[i]))
            with switch.case(global_step < boundary_val):
                cur_epoch_factor = (global_step -
                                    cos_step[i]) / cos_boundaries[i]
                cur_epoch_cos_factor = ops.cos(cur_epoch_factor * math.pi /
                                               2.0)
                cur_lr = cur_epoch_cos_factor * values[i]

                fluid.layers.assign(cur_lr, lr)

        last_value_var = fluid.layers.fill_constant(
            shape=[1], dtype='float32', value=float(values[len(values) - 1]))
        with switch.default():
            fluid.layers.assign(last_value_var, lr)

    return lr
Example 12
def cosine_decay(learning_rate, step_each_epoch, epochs=120):
    """Applies cosine decay to the learning rate.
    lr = 0.05 * (math.cos(epoch * (math.pi / 120)) + 1)
    """
    global_step = _decay_step_counter()

    with init_on_cpu():
        # compute the current epoch index and apply the per-epoch cosine factor
        epoch = ops.floor(global_step / step_each_epoch)
        decayed_lr = learning_rate * \
                     (ops.cos(epoch * (math.pi / epochs)) + 1)/2
    return decayed_lr
Example 13
def cosine_decay_v2_with_warmup(learning_rate, warmupsteps, totalsteps):
    """Applies cosine decay to the learning rate.
    lr = 0.05 * (math.cos(epoch * (math.pi / 120)) + 1)
    decrease lr for every mini-batch and start with warmup.
    """
    global_step = _decay_step_counter()
    lr = fluid.layers.tensor.create_global_var(shape=[1],
                                               value=0.0,
                                               dtype='float32',
                                               persistable=True,
                                               name="learning_rate")

    with init_on_cpu():
        with control_flow.Switch() as switch:
            with switch.case(global_step < warmupsteps):
                decayed_lr = learning_rate * (global_step / float(warmupsteps))
                fluid.layers.tensor.assign(input=decayed_lr, output=lr)
            with switch.default():
                decayed_lr = learning_rate * \
                     (ops.cos((global_step - warmupsteps) * (math.pi / (totalsteps))) + 1)/2
                fluid.layers.tensor.assign(input=decayed_lr, output=lr)
    return lr
Example 14
def cosine_decay_with_warmup(learning_rate,
                             step_each_epoch,
                             epochs=500,
                             warmup_minibatch=1000):
    """
    Applies cosine decay to the learning rate.
    lr = 0.05 * (math.cos(epoch * (math.pi / 120)) + 1)
    decrease lr for every mini-batch and start with warmup.
    args:
        learning_rate(float): initial learning rate
        step_each_epoch (int): number of step for each epoch in training process
        epochs(int): number of training epochs
        warmup_minibatch(int): number of minibatch for warmup
    return:
        lr(tensor): learning rate tensor
    """
    global_step = _decay_step_counter()
    lr = fluid.layers.tensor.create_global_var(shape=[1],
                                               value=0.0,
                                               dtype='float32',
                                               persistable=True,
                                               name="learning_rate")

    warmup_minibatch = fluid.layers.fill_constant(
        shape=[1],
        dtype='float32',
        value=float(warmup_minibatch),
        force_cpu=True)

    with fluid.layers.control_flow.Switch() as switch:
        with switch.case(global_step < warmup_minibatch):
            decayed_lr = learning_rate * (1.0 * global_step / warmup_minibatch)
            fluid.layers.tensor.assign(input=decayed_lr, output=lr)
        with switch.default():
            decayed_lr = learning_rate * \
                (ops.cos((global_step - warmup_minibatch) * (math.pi / (epochs * step_each_epoch))) + 1)/2
            fluid.layers.tensor.assign(input=decayed_lr, output=lr)
    return lr
Example 15
def lr_warmup(learning_rate, warmup_steps, total_step, multiplier,
              step_each_epoch):
    with default_main_program()._lr_schedule_guard():
        lr = tensor.create_global_var(shape=[1],
                                      value=0.0,
                                      dtype='float32',
                                      persistable=True,
                                      name='learning_rate_warmup')
        global_step = _decay_step_counter()

        with control_flow.Switch() as switch:
            with switch.case(global_step <= warmup_steps):
                decay_lr = learning_rate * (
                    (multiplier - 1.) * global_step / warmup_steps + 1.)
                tensor.assign(decay_lr, lr)
            with switch.default():
                learning_rate = learning_rate * multiplier
                #cur_epoch = ops.floor(global_step/step_each_epoch)
                decay_lr = learning_rate * 0.5 * (ops.cos(
                    (global_step - warmup_steps) * math.pi / (total_step)) + 1)
                tensor.assign(decay_lr, lr)

    return lr
Example 16
    def arcface_classify(self,
                         x,
                         label,
                         margin=0.5,
                         logit_scale=64,
                         param_attr=None):
        '''
        reference: ArcFace. https://arxiv.org/abs/1801.07698
        '''
        flatten_dim = reduce(lambda a, b: a * b, x.shape[1:], 1)
        weight, bias = self.create_parameter(dtype=x.dtype,
                                             in_dim=flatten_dim,
                                             param_attr=param_attr,
                                             use_bias=False)

        # normalize x
        x_l2 = ops.sqrt(nn.reduce_sum(ops.square(x), dim=1))
        norm_x = nn.elementwise_div(x, x_l2, axis=0)

        norm_x_all = collective._c_allgather(norm_x,
                                             nranks=self.nranks,
                                             use_calc_stream=True)
        label_all = collective._c_allgather(label,
                                            nranks=self.nranks,
                                            use_calc_stream=True)
        label_all.stop_gradient = True
        shard_label = nn.shard_index(label_all,
                                     index_num=self.nclasses,
                                     nshards=self.nranks,
                                     shard_id=self.rank_id,
                                     ignore_value=-1)
        # TODO check necessary
        shard_label.stop_gradient = True

        # normalize weight
        weight_l2 = ops.sqrt(nn.reduce_sum(ops.square(weight), dim=0))
        norm_weight = nn.elementwise_div(weight, weight_l2, axis=1)

        shard_cos = nn.mul(norm_x_all, norm_weight, x_num_col_dims=1)

        theta = ops.acos(shard_cos)
        margin_cos = ops.cos(theta + margin)

        shard_one_hot = nn.one_hot(shard_label,
                                   depth=self.shard_dim,
                                   allow_out_of_range=True)
        # TODO check necessary
        shard_one_hot.stop_gradient = True

        diff = (margin_cos - shard_cos) * shard_one_hot
        shard_target_cos = shard_cos + diff
        shard_logit = nn.scale(shard_target_cos, scale=logit_scale)

        global_loss, shard_prob = self.softmax_with_cross_entropy(
            shard_logit, shard_label)
        avg_loss = nn.mean(global_loss)

        avg_loss._set_info('shard_logit', shard_logit)
        avg_loss._set_info('shard_prob', shard_prob)
        avg_loss._set_info('shard_label', shard_label)
        avg_loss._set_info('shard_dim', self.shard_dim)

        return avg_loss
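The heart of this routine is ArcFace's additive angular margin: for the target class only, the cosine logit cos θ is replaced by cos(θ + m) before scaling. A small NumPy sketch of just that step (illustrative only, with hypothetical names, independent of the sharded/collective fluid code above):

import numpy as np

def arcface_margin(cos_logits, labels, margin=0.5, scale=64.0):
    # cos_logits: (batch, num_classes) cosine similarities; labels: (batch,) class ids.
    theta = np.arccos(np.clip(cos_logits, -1.0, 1.0))        # recover the angles
    one_hot = np.eye(cos_logits.shape[1])[labels]             # mask for the target class
    adjusted = np.where(one_hot > 0, np.cos(theta + margin), cos_logits)
    return scale * adjusted                                    # logits fed to softmax + CE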
Example 17
def decay():
    # base_lr, steps, total and lr are assumed to be defined in the enclosing
    # scope; cos is assumed to be the tensor cosine (ops.cos) used elsewhere.
    cos_lr = base_lr * .5 * (ops.cos(steps * (math.pi / total)) + 1)
    fluid.layers.tensor.assign(input=cos_lr, output=lr)