# Imports assumed by the snippets below (legacy PaddlePaddle fluid API).
import math
from functools import reduce

import numpy as np
import paddle.fluid as fluid
import paddle.fluid.layers as layers
import paddle.fluid.layers.control_flow as control_flow
import paddle.fluid.layers.learning_rate_scheduler as lr_scheduler
import paddle.fluid.layers.nn as nn
import paddle.fluid.layers.ops as ops
import paddle.fluid.layers.tensor as tensor
from paddle.fluid.framework import default_main_program
from paddle.fluid.initializer import init_on_cpu
from paddle.fluid.layers import collective
from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter


def cosine_decay():
    """Applies cosine decay to the learning rate.

    Assumes module-level FLAGS (providing lr_min and lr_max) and max_step.
    """
    global_step = _decay_step_counter()
    frac = (1 + ops.cos(global_step / max_step * math.pi)) / 2
    return FLAGS.lr_min + (FLAGS.lr_max - FLAGS.lr_min) * frac

def linear_warmup_and_cosine_decay(learning_rate, end_lr, warmup_steps,
                                   max_training_steps):
    """Applies linear warmup followed by cosine decay to the learning rate."""
    dtype = "float32"
    with fluid.default_main_program()._lr_schedule_guard():
        lr = layers.create_global_var(
            shape=[1], value=0.0, dtype=dtype, persistable=True,
            name="learning_rate")
        global_step = _decay_step_counter(1)

        with layers.control_flow.Switch() as switch:
            # Phase 1: linear warmup from 0 up to learning_rate.
            with switch.case(global_step < warmup_steps):
                warmup_lr = learning_rate * (global_step / warmup_steps)
                layers.assign(warmup_lr, lr)
            # Phase 2: cosine decay from learning_rate down to end_lr.
            with switch.case(global_step < max_training_steps):
                frac = 0.5 * (ops.cos(
                    (global_step - warmup_steps) * math.pi /
                    (max_training_steps - warmup_steps)) + 1)
                decayed_lr = end_lr + (learning_rate - end_lr) * frac
                layers.assign(decayed_lr, lr)
            # Phase 3: hold end_lr once training steps are exhausted.
            with switch.default():
                const_lr = layers.fill_constant(
                    shape=[1], dtype=dtype, value=end_lr)
                layers.assign(const_lr, lr)
    return lr

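# Usage sketch (an assumption, not part of the original snippet): the lr
# variable returned by linear_warmup_and_cosine_decay can be passed straight
# to a fluid optimizer; the step counts here are illustrative placeholders.
def _example_warmup_cosine_optimizer(loss):
    lr = linear_warmup_and_cosine_decay(
        learning_rate=1e-3, end_lr=0.0,
        warmup_steps=4000, max_training_steps=100000)
    optimizer = fluid.optimizer.Adam(learning_rate=lr)
    optimizer.minimize(loss)
    return optimizer
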
def cosine_with_warmup_decay(learning_rate, lr_min, steps_one_epoch,
                             warmup_epochs, total_epoch, num_gpu):
    global_step = _decay_step_counter()
    epoch_idx = fluid.layers.floor(global_step / steps_one_epoch)

    lr = fluid.layers.create_global_var(
        shape=[1], value=0.0, dtype='float32', persistable=True,
        name="learning_rate")

    warmup_epoch_var = fluid.layers.fill_constant(
        shape=[1], dtype='float32', value=float(warmup_epochs), force_cpu=True)
    num_gpu_var = fluid.layers.fill_constant(
        shape=[1], dtype='float32', value=float(num_gpu), force_cpu=True)
    batch_idx = global_step - steps_one_epoch * epoch_idx

    with fluid.layers.control_flow.Switch() as switch:
        # Warmup: scale the lr gradually toward the full num_gpu-scaled rate.
        with switch.case(epoch_idx < warmup_epoch_var):
            epoch_ = (batch_idx + 1) / steps_one_epoch
            factor = 1 / num_gpu_var * (
                epoch_ * (num_gpu_var - 1) / warmup_epoch_var + 1)
            decayed_lr = learning_rate * factor * num_gpu_var
            fluid.layers.assign(decayed_lr, lr)
        # Cosine branch: use overall training progress. The original used only
        # the in-epoch fraction here, which would leave the lr essentially
        # undecayed, so epoch_idx is added to the fractional progress.
        epoch_ = epoch_idx + (batch_idx + 1) / steps_one_epoch
        m = epoch_ / total_epoch
        frac = (1 + ops.cos(math.pi * m)) / 2
        cosine_lr = (lr_min + (learning_rate - lr_min) * frac) * num_gpu_var
        with switch.default():
            fluid.layers.assign(cosine_lr, lr)
    return lr

def __call__(self):
    global_step = _decay_step_counter()
    learning_rate = fluid.layers.tensor.create_global_var(
        shape=[1], value=0.0, dtype='float32', persistable=True,
        name="learning_rate")
    epoch = ops.floor(global_step / self.step_each_epoch)
    with fluid.layers.control_flow.Switch() as switch:
        # Linear warmup over the first warmup_epoch epochs.
        with switch.case(epoch < self.warmup_epoch):
            decayed_lr = self.lr * (
                global_step / (self.step_each_epoch * self.warmup_epoch))
            fluid.layers.tensor.assign(input=decayed_lr, output=learning_rate)
        # Cosine decay over the remaining epochs.
        with switch.default():
            current_step = global_step - self.warmup_epoch * self.step_each_epoch
            total_step = (self.epochs - self.warmup_epoch) * self.step_each_epoch
            decayed_lr = self.lr * (
                ops.cos(current_step * math.pi / total_step) + 1) / 2
            fluid.layers.tensor.assign(input=decayed_lr, output=learning_rate)
    return learning_rate

def cosine_decay_with_warmup(learning_rate,
                             max_iters=90000,
                             warmup_iters=1000,
                             warmup_factor=0.1):
    """Applies linear warmup followed by cosine decay to the learning rate.

    Warmup ramps linearly from learning_rate * warmup_factor up to
    learning_rate over warmup_iters mini-batches; afterwards the lr follows
    a half-cosine from learning_rate down to 0 at max_iters.
    """
    global_step = _decay_step_counter()
    lr = fluid.layers.tensor.create_global_var(
        shape=[1], value=0.0, dtype='float32', persistable=True,
        name="learning_rate")
    with fluid.layers.control_flow.Switch() as switch:
        with switch.case(global_step < warmup_iters):
            eta_min = learning_rate * warmup_factor
            decayed_lr = eta_min + (learning_rate - eta_min) * (
                global_step / warmup_iters)
            fluid.layers.tensor.assign(input=decayed_lr, output=lr)
        with switch.default():
            decayed_lr = learning_rate * (ops.cos(
                (global_step - warmup_iters) *
                (math.pi / (max_iters - warmup_iters))) + 1) / 2
            fluid.layers.tensor.assign(input=decayed_lr, output=lr)
    return lr

def cosine_decay_with_warmup(learning_rate, step_each_epoch, epochs=120):
    """Applies cosine decay to the learning rate, with a 5-epoch linear warmup.

    lr = 0.05 * (math.cos(epoch * (math.pi / 120)) + 1)
    decrease lr for every mini-batch and start with warmup.
    """
    global_step = _decay_step_counter()
    lr = fluid.layers.tensor.create_global_var(
        shape=[1], value=0.0, dtype='float32', persistable=True,
        name="learning_rate")
    # Warmup length is hard-coded to 5 epochs here.
    warmup_epoch = fluid.layers.fill_constant(
        shape=[1], dtype='float32', value=float(5), force_cpu=True)

    with init_on_cpu():
        epoch = ops.floor(global_step / step_each_epoch)
        with fluid.layers.control_flow.Switch() as switch:
            with switch.case(epoch < warmup_epoch):
                decayed_lr = learning_rate * (
                    global_step / (step_each_epoch * warmup_epoch))
                fluid.layers.tensor.assign(input=decayed_lr, output=lr)
            with switch.default():
                decayed_lr = learning_rate * (ops.cos(
                    (global_step - warmup_epoch * step_each_epoch) *
                    (math.pi / (epochs * step_each_epoch))) + 1) / 2
                fluid.layers.tensor.assign(input=decayed_lr, output=lr)
    return lr

def cosine_decay(learning_rate, step_each_epoch, epochs=120):
    """Applies cosine decay to the learning rate.

    lr = 0.05 * (math.cos(epoch * (math.pi / 120)) + 1)
    """
    global_step = _decay_step_counter()
    epoch = ops.floor(global_step / step_each_epoch)
    decayed_lr = learning_rate * (ops.cos(epoch * (math.pi / epochs)) + 1) / 2
    return decayed_lr

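# Usage sketch (assumed, not from the original code): the decayed_lr tensor
# returned by cosine_decay plugs in as the optimizer's learning_rate; the
# ImageNet-style image and batch counts below are hypothetical.
def _example_cosine_optimizer(avg_loss):
    step_each_epoch = 1281167 // 256  # images per epoch / batch size
    optimizer = fluid.optimizer.Momentum(
        learning_rate=cosine_decay(0.1, step_each_epoch, epochs=120),
        momentum=0.9)
    optimizer.minimize(avg_loss)
    return optimizer
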
def cosine_decay(learning_rate, num_epoch, steps_one_epoch):
    """Applies cosine decay to the learning rate.

    lr = 0.5 * (math.cos(epoch * (math.pi / 120)) + 1)
    """
    global_step = _decay_step_counter()
    decayed_lr = learning_rate * (ops.cos(
        fluid.layers.floor(global_step / steps_one_epoch) *
        math.pi / num_epoch) + 1) / 2
    return decayed_lr

def cosine_decay_v2(learning_rate, totalsteps):
    """Applies cosine decay to the learning rate.

    lr = 0.05 * (math.cos(global_step * (math.pi / totalsteps)) + 1)
    decrease lr for every mini-batch.
    """
    global_step = _decay_step_counter()
    with init_on_cpu():
        decayed_lr = learning_rate * (ops.cos(
            global_step * (math.pi / float(totalsteps))) + 1) / 2
    return decayed_lr

def cosine_decay(learning_rate, num_epoch, steps_one_epoch):
    """Applies cosine decay to the learning rate.

    lr = 0.5 * (math.cos(epoch * (math.pi / 120)) + 1)
    """
    global_step = _decay_step_counter()
    with init_on_cpu():
        decayed_lr = learning_rate * (ops.cos(
            (global_step / steps_one_epoch) * math.pi / num_epoch) + 1) / 2
    return decayed_lr

def cos_anneal_with_warmup_decay(learning_rate, boundaries, values,
                                 warmup_iter, warmup_factor):
    global_step = lr_scheduler._decay_step_counter()
    lr = fluid.layers.create_global_var(
        shape=[1], value=0.0, dtype='float32', persistable=True,
        name="learning_rate")

    warmup_iter_var = fluid.layers.fill_constant(
        shape=[1], dtype='float32', value=float(warmup_iter), force_cpu=True)

    # Start step and length of each cosine segment.
    cos_step = [warmup_iter] + boundaries[:-1]
    cos_boundaries = (np.array(boundaries) - np.array(cos_step)).tolist()

    with control_flow.Switch() as switch:
        # Linear warmup from learning_rate * warmup_factor to learning_rate.
        with switch.case(global_step < warmup_iter_var):
            alpha = global_step / warmup_iter_var
            factor = warmup_factor * (1 - alpha) + alpha
            decayed_lr = learning_rate * factor
            fluid.layers.assign(decayed_lr, lr)

        # Quarter-cosine anneal from values[i] within each segment.
        for i in range(len(boundaries)):
            boundary_val = fluid.layers.fill_constant(
                shape=[1], dtype='float32', value=float(boundaries[i]),
                force_cpu=True)
            with switch.case(global_step < boundary_val):
                cur_epoch_factor = (global_step - cos_step[i]) / cos_boundaries[i]
                cur_epoch_cos_factor = ops.cos(cur_epoch_factor * math.pi / 2.0)
                cur_lr = cur_epoch_cos_factor * values[i]
                fluid.layers.assign(cur_lr, lr)

        last_value_var = fluid.layers.fill_constant(
            shape=[1], dtype='float32', value=float(values[len(values) - 1]))
        with switch.default():
            fluid.layers.assign(last_value_var, lr)
    return lr

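# Pure-Python mirror of cos_anneal_with_warmup_decay (a sanity-check sketch,
# not part of the original graph code): the same warmup line and
# quarter-cosine segments, evaluated eagerly for a given step.
def _reference_cos_anneal(step, boundaries, values, warmup_iter,
                          warmup_factor, base_lr):
    if step < warmup_iter:
        alpha = step / float(warmup_iter)
        return base_lr * (warmup_factor * (1 - alpha) + alpha)
    starts = [warmup_iter] + boundaries[:-1]
    for i, b in enumerate(boundaries):
        if step < b:
            frac = (step - starts[i]) / float(b - starts[i])
            return math.cos(frac * math.pi / 2.0) * values[i]
    return values[-1]
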
def cosine_decay(learning_rate, step_each_epoch, epochs=120):
    """Applies cosine decay to the learning rate.

    lr = 0.05 * (math.cos(epoch * (math.pi / 120)) + 1)
    """
    global_step = _decay_step_counter()
    with init_on_cpu():
        epoch = ops.floor(global_step / step_each_epoch)
        decayed_lr = learning_rate * (
            ops.cos(epoch * (math.pi / epochs)) + 1) / 2
    return decayed_lr

def cosine_decay_v2_with_warmup(learning_rate, warmupsteps, totalsteps):
    """Applies linear warmup followed by cosine decay to the learning rate.

    decrease lr for every mini-batch and start with warmup.
    """
    global_step = _decay_step_counter()
    lr = fluid.layers.tensor.create_global_var(
        shape=[1], value=0.0, dtype='float32', persistable=True,
        name="learning_rate")
    with init_on_cpu():
        with control_flow.Switch() as switch:
            with switch.case(global_step < warmupsteps):
                decayed_lr = learning_rate * (global_step / float(warmupsteps))
                fluid.layers.tensor.assign(input=decayed_lr, output=lr)
            with switch.default():
                decayed_lr = learning_rate * (ops.cos(
                    (global_step - warmupsteps) *
                    (math.pi / totalsteps)) + 1) / 2
                fluid.layers.tensor.assign(input=decayed_lr, output=lr)
    return lr

def cosine_decay_with_warmup(learning_rate,
                             step_each_epoch,
                             epochs=500,
                             warmup_minibatch=1000):
    """Applies cosine decay to the learning rate, starting with a warmup.

    lr = 0.05 * (math.cos(epoch * (math.pi / 120)) + 1)
    decrease lr for every mini-batch and start with warmup.

    Args:
        learning_rate (float): initial learning rate
        step_each_epoch (int): number of steps in each epoch of training
        epochs (int): number of training epochs
        warmup_minibatch (int): number of minibatches for warmup

    Returns:
        lr (Variable): learning rate tensor
    """
    global_step = _decay_step_counter()
    lr = fluid.layers.tensor.create_global_var(
        shape=[1], value=0.0, dtype='float32', persistable=True,
        name="learning_rate")
    warmup_minibatch = fluid.layers.fill_constant(
        shape=[1], dtype='float32', value=float(warmup_minibatch),
        force_cpu=True)
    with fluid.layers.control_flow.Switch() as switch:
        with switch.case(global_step < warmup_minibatch):
            decayed_lr = learning_rate * (1.0 * global_step / warmup_minibatch)
            fluid.layers.tensor.assign(input=decayed_lr, output=lr)
        with switch.default():
            decayed_lr = learning_rate * (ops.cos(
                (global_step - warmup_minibatch) *
                (math.pi / (epochs * step_each_epoch))) + 1) / 2
            fluid.layers.tensor.assign(input=decayed_lr, output=lr)
    return lr

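# Eager reference for the schedule above (a sketch for sanity-checking, not
# part of the training program): returns the lr value at a given step, using
# the same formulas as the graph code.
def _reference_warmup_cosine(step, learning_rate=0.1, step_each_epoch=5000,
                             epochs=500, warmup_minibatch=1000):
    if step < warmup_minibatch:
        return learning_rate * step / float(warmup_minibatch)
    return learning_rate * (math.cos(
        (step - warmup_minibatch) * math.pi /
        (epochs * step_each_epoch)) + 1) / 2
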
def lr_warmup(learning_rate, warmup_steps, total_step, multiplier,
              step_each_epoch):
    # step_each_epoch is unused here but kept for API compatibility.
    with default_main_program()._lr_schedule_guard():
        lr = tensor.create_global_var(
            shape=[1], value=0.0, dtype='float32', persistable=True,
            name='learning_rate_warmup')

        global_step = _decay_step_counter()
        with control_flow.Switch() as switch:
            # Linear warmup from learning_rate to learning_rate * multiplier.
            with switch.case(global_step <= warmup_steps):
                decay_lr = learning_rate * (
                    (multiplier - 1.) * global_step / warmup_steps + 1.)
                tensor.assign(decay_lr, lr)
            # Cosine decay of the scaled learning rate afterwards.
            with switch.default():
                scaled_lr = learning_rate * multiplier
                decay_lr = scaled_lr * 0.5 * (ops.cos(
                    (global_step - warmup_steps) * math.pi / total_step) + 1)
                tensor.assign(decay_lr, lr)
    return lr

def arcface_classify(self,
                     x,
                     label,
                     margin=0.5,
                     logit_scale=64,
                     param_attr=None):
    '''
    reference: ArcFace. https://arxiv.org/abs/1801.07698
    '''
    flatten_dim = reduce(lambda a, b: a * b, x.shape[1:], 1)
    weight, bias = self.create_parameter(
        dtype=x.dtype,
        in_dim=flatten_dim,
        param_attr=param_attr,
        use_bias=False)

    # normalize x
    x_l2 = ops.sqrt(nn.reduce_sum(ops.square(x), dim=1))
    norm_x = nn.elementwise_div(x, x_l2, axis=0)

    norm_x_all = collective._c_allgather(
        norm_x, nranks=self.nranks, use_calc_stream=True)
    label_all = collective._c_allgather(
        label, nranks=self.nranks, use_calc_stream=True)
    label_all.stop_gradient = True

    shard_label = nn.shard_index(
        label_all,
        index_num=self.nclasses,
        nshards=self.nranks,
        shard_id=self.rank_id,
        ignore_value=-1)
    # TODO check necessary
    shard_label.stop_gradient = True

    # normalize weight
    weight_l2 = ops.sqrt(nn.reduce_sum(ops.square(weight), dim=0))
    norm_weight = nn.elementwise_div(weight, weight_l2, axis=1)

    shard_cos = nn.mul(norm_x_all, norm_weight, x_num_col_dims=1)

    theta = ops.acos(shard_cos)
    margin_cos = ops.cos(theta + margin)

    shard_one_hot = nn.one_hot(
        shard_label, depth=self.shard_dim, allow_out_of_range=True)
    # TODO check necessary
    shard_one_hot.stop_gradient = True

    # replace cos(theta) with cos(theta + margin) for the target class only
    diff = (margin_cos - shard_cos) * shard_one_hot
    shard_target_cos = shard_cos + diff
    shard_logit = nn.scale(shard_target_cos, scale=logit_scale)

    global_loss, shard_prob = self.softmax_with_cross_entropy(
        shard_logit, shard_label)
    avg_loss = nn.mean(global_loss)

    avg_loss._set_info('shard_logit', shard_logit)
    avg_loss._set_info('shard_prob', shard_prob)
    avg_loss._set_info('shard_label', shard_label)
    avg_loss._set_info('shard_dim', self.shard_dim)

    return avg_loss

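# Numpy illustration of the ArcFace margin (a standalone sketch, assumed for
# exposition): for each sample the target-class logit cos(theta) is replaced
# by cos(theta + margin) before scaling, which is what the shard_one_hot
# masking above computes in graph form.
def _arcface_logits_reference(cosine, labels, margin=0.5, scale=64.0):
    # cosine: [N, C] cosine similarities; labels: [N] integer class ids
    theta = np.arccos(np.clip(cosine, -1.0, 1.0))
    rows = np.arange(len(labels))
    out = cosine.copy()
    out[rows, labels] = np.cos(theta[rows, labels] + margin)
    return scale * out
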
def decay():
    # A sketch completed from the original fragment: base_lr, total, steps and
    # lr are assumed to come from the enclosing scope (steps being the global
    # step variable and lr the persistable learning-rate variable), and the
    # bare cos is taken to be the tensor op ops.cos.
    cos_lr = base_lr * .5 * (ops.cos(steps * (math.pi / total)) + 1)
    fluid.layers.tensor.assign(input=cos_lr, output=lr)