# Common PaddlePaddle 1.x (fluid) imports assumed by the schedule snippets below;
# individual snippets may additionally rely on names defined in their original
# modules (e.g. logger, CombinedStrategy, self.scheduler).
import math

import numpy as np
import paddle.fluid as fluid
from paddle.fluid import default_main_program
from paddle.fluid.initializer import init_on_cpu
from paddle.fluid.layers import control_flow, ops, tensor
from paddle.fluid.layers import learning_rate_scheduler as lr_scheduler
from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter


def linear_warmup_decay(init_lr, num_train_steps, num_warmup_steps, main_program):
    """Linear warmup followed by linear (polynomial, power=1) decay to zero."""
    with main_program._lr_schedule_guard():
        global_step = lr_scheduler._decay_step_counter()

        lr = fluid.layers.create_global_var(
            shape=[1],
            value=init_lr,
            dtype='float32',
            persistable=True,
            name="learning_rate")

        with control_flow.Switch() as switch:
            with switch.case(global_step < num_warmup_steps):
                # Warmup: ramp the rate linearly from 0 to init_lr.
                decayed_lr = init_lr * global_step * 1.0 / num_warmup_steps
                fluid.layers.assign(decayed_lr, lr)
            with switch.default():
                # After warmup: decay linearly to 0 over num_train_steps.
                decayed_lr = lr_scheduler.polynomial_decay(
                    learning_rate=init_lr,
                    decay_steps=num_train_steps,
                    end_learning_rate=0.0,
                    power=1.0,
                    cycle=False)
                fluid.layers.assign(decayed_lr, lr)

        return lr
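
# A minimal usage sketch for linear_warmup_decay: build the schedule inside the
# training program and hand the resulting variable to an optimizer. The step
# counts and learning rate below are placeholders, not values from the source.
train_program = fluid.Program()
startup_program = fluid.Program()
with fluid.program_guard(train_program, startup_program):
    # ... build model and loss here ...
    lr = linear_warmup_decay(
        init_lr=5e-5,
        num_train_steps=10000,
        num_warmup_steps=1000,
        main_program=train_program)
    optimizer = fluid.optimizer.Adam(learning_rate=lr)
    # optimizer.minimize(loss)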
def cos_anneal_with_warmup_decay(learning_rate, boundaries, values, warmup_iter, warmup_factor):
    """Linear warmup followed by piecewise cosine annealing: within each segment
    [cos_step[i], boundaries[i]) the rate is annealed from values[i] toward 0 by
    a quarter cosine, and values[-1] is used after the last boundary."""
    global_step = lr_scheduler._decay_step_counter()

    lr = fluid.layers.create_global_var(
        shape=[1],
        value=0.0,
        dtype='float32',
        persistable=True,
        name="learning_rate")

    warmup_iter_var = fluid.layers.fill_constant(
        shape=[1], dtype='float32', value=float(warmup_iter), force_cpu=True)

    # Start step and length of each cosine segment.
    cos_step = [warmup_iter] + boundaries[:-1]
    cos_boundaries = (np.array(boundaries) - np.array(cos_step)).tolist()

    with control_flow.Switch() as switch:
        with switch.case(global_step < warmup_iter_var):
            # Warmup: ramp linearly from warmup_factor * learning_rate to learning_rate.
            alpha = global_step / warmup_iter_var
            factor = warmup_factor * (1 - alpha) + alpha
            decayed_lr = learning_rate * factor
            fluid.layers.assign(decayed_lr, lr)

        for i in range(len(boundaries)):
            boundary_val = fluid.layers.fill_constant(
                shape=[1],
                dtype='float32',
                value=float(boundaries[i]),
                force_cpu=True)
            with switch.case(global_step < boundary_val):
                # Quarter-cosine anneal of values[i] over the current segment.
                cur_epoch_factor = (global_step - cos_step[i]) / cos_boundaries[i]
                cur_epoch_cos_factor = ops.cos(cur_epoch_factor * math.pi / 2.0)
                cur_lr = cur_epoch_cos_factor * values[i]
                fluid.layers.assign(cur_lr, lr)

        last_value_var = fluid.layers.fill_constant(
            shape=[1], dtype='float32', value=float(values[-1]))
        with switch.default():
            fluid.layers.assign(last_value_var, lr)

    return lr
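
# A worked sketch of the segment bookkeeping above (numbers are illustrative,
# not from the source): with warmup_iter=1000 and boundaries=[6000, 9000, 12000],
# cos_step becomes [1000, 6000, 9000] (segment start steps) and cos_boundaries
# becomes [5000, 3000, 3000] (segment lengths), so each values[i] is annealed
# toward 0 by a quarter cosine over its own segment.
lr = cos_anneal_with_warmup_decay(
    learning_rate=0.1,
    boundaries=[6000, 9000, 12000],
    values=[0.1, 0.01, 0.001],
    warmup_iter=1000,
    warmup_factor=1.0 / 3.0)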
def cosine_warmup_decay(learning_rate, betas, warmup_factor, decay_factor, total_step, warmup_pct):
    def annealing_cos(start, end, pct):
        "Cosine anneal from `start` to `end` as pct goes from 0.0 to 1.0."
        cos_out = fluid.layers.cos(pct * np.pi) + 1.
        return cos_out * (start - end) / 2. + end

    warmup_start_lr = learning_rate * warmup_factor
    decay_end_lr = learning_rate * decay_factor
    warmup_step = total_step * warmup_pct

    global_step = lr_scheduler._decay_step_counter()

    lr = fluid.layers.create_global_var(
        shape=[1],
        value=float(learning_rate),
        dtype='float32',
        persistable=True,
        name="learning_rate")
    beta1 = fluid.layers.create_global_var(
        shape=[1],
        value=float(betas[0]),
        dtype='float32',
        persistable=True,
        name="beta1")

    warmup_step_var = fluid.layers.fill_constant(
        shape=[1], dtype='float32', value=float(warmup_step), force_cpu=True)

    with control_flow.Switch() as switch:
        with switch.case(global_step < warmup_step_var):
            cur_lr = annealing_cos(warmup_start_lr, learning_rate,
                                   global_step / warmup_step_var)
            fluid.layers.assign(cur_lr, lr)
            cur_beta1 = annealing_cos(betas[0], betas[1],
                                      global_step / warmup_step_var)
            fluid.layers.assign(cur_beta1, beta1)
        with switch.case(global_step >= warmup_step_var):
            cur_lr = annealing_cos(learning_rate, decay_end_lr,
                                   (global_step - warmup_step_var) / (total_step - warmup_step))
            fluid.layers.assign(cur_lr, lr)
            cur_beta1 = annealing_cos(betas[1], betas[0],
                                      (global_step - warmup_step_var) / (total_step - warmup_step))
            fluid.layers.assign(cur_beta1, beta1)

    return lr, beta1
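
# A minimal usage sketch for cosine_warmup_decay (placeholder hyperparameters),
# assuming a fluid version whose Adam optimizer accepts a Variable for beta1 as
# well as for the learning rate; both are annealed in opposite directions.
lr, beta1 = cosine_warmup_decay(
    learning_rate=0.01,
    betas=(0.95, 0.85),
    warmup_factor=0.1,
    decay_factor=0.01,
    total_step=20000,
    warmup_pct=0.3)
optimizer = fluid.optimizer.Adam(learning_rate=lr, beta1=beta1)
# optimizer.minimize(loss)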
def exponential_with_warmup_decay(learning_rate, boundaries, values, warmup_iter, warmup_factor):
    """Linear warmup followed by a piecewise-constant schedule: hold values[i]
    until boundaries[i] is reached, then values[-1] afterwards."""
    global_step = lr_scheduler._decay_step_counter()

    lr = fluid.layers.create_global_var(
        shape=[1],
        value=0.0,
        dtype='float32',
        persistable=True,
        name="learning_rate")

    warmup_iter_var = fluid.layers.fill_constant(
        shape=[1], dtype='float32', value=float(warmup_iter), force_cpu=True)

    with control_flow.Switch() as switch:
        with switch.case(global_step < warmup_iter_var):
            # Warmup: ramp linearly from warmup_factor * learning_rate to learning_rate.
            alpha = global_step / warmup_iter_var
            factor = warmup_factor * (1 - alpha) + alpha
            decayed_lr = learning_rate * factor
            fluid.layers.assign(decayed_lr, lr)

        for i in range(len(boundaries)):
            boundary_val = fluid.layers.fill_constant(
                shape=[1],
                dtype='float32',
                value=float(boundaries[i]),
                force_cpu=True)
            value_var = fluid.layers.fill_constant(
                shape=[1], dtype='float32', value=float(values[i]))
            with switch.case(global_step < boundary_val):
                fluid.layers.assign(value_var, lr)

        last_value_var = fluid.layers.fill_constant(
            shape=[1], dtype='float32', value=float(values[-1]))
        with switch.default():
            fluid.layers.assign(last_value_var, lr)

    return lr
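
# A minimal usage sketch with detection-style placeholder settings (not values
# from the source): warm up for 500 iterations, hold values[i] until each
# boundary is passed, and fall through to values[-1] afterwards.
lr = exponential_with_warmup_decay(
    learning_rate=0.01,
    boundaries=[120000, 160000],
    values=[0.01, 0.001, 0.0001],
    warmup_iter=500,
    warmup_factor=1.0 / 3.0)
optimizer = fluid.optimizer.Momentum(learning_rate=lr, momentum=0.9)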
def cosine_decay_v2_with_warmup(learning_rate, warmupsteps, totalsteps):
    """Applies cosine decay to the learning rate with a linear warmup,
    e.g. lr = 0.05 * (math.cos(epoch * (math.pi / 120)) + 1).
    The rate is updated every mini-batch and training starts with warmup.
    """
    global_step = _decay_step_counter()

    lr = fluid.layers.tensor.create_global_var(
        shape=[1],
        value=0.0,
        dtype='float32',
        persistable=True,
        name="learning_rate")

    with init_on_cpu():
        with control_flow.Switch() as switch:
            with switch.case(global_step < warmupsteps):
                # Warmup: ramp linearly from 0 to learning_rate.
                decayed_lr = learning_rate * (global_step / float(warmupsteps))
                fluid.layers.tensor.assign(input=decayed_lr, output=lr)
            with switch.default():
                # Cosine decay from learning_rate toward 0 over totalsteps.
                decayed_lr = learning_rate * \
                    (ops.cos((global_step - warmupsteps) * (math.pi / totalsteps)) + 1) / 2
                fluid.layers.tensor.assign(input=decayed_lr, output=lr)

    return lr
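
# A minimal usage sketch for cosine_decay_v2_with_warmup (illustrative numbers):
# derive the step counts from epochs and steps-per-epoch, then build the
# warmup + cosine schedule.
step_each_epoch = 5000   # hypothetical: total_images // batch_size
num_epochs = 120
lr = cosine_decay_v2_with_warmup(
    learning_rate=0.05,
    warmupsteps=5 * step_each_epoch,
    totalsteps=num_epochs * step_each_epoch)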
def lr_warmup(learning_rate, warmup_steps, total_step, multiplier, step_each_epoch):
    """Ramp linearly from learning_rate to learning_rate * multiplier over
    warmup_steps, then cosine-decay from that peak toward 0.
    Note: step_each_epoch is currently unused."""
    with default_main_program()._lr_schedule_guard():
        lr = tensor.create_global_var(
            shape=[1],
            value=0.0,
            dtype='float32',
            persistable=True,
            name='learning_rate_warmup')

        global_step = _decay_step_counter()

        with control_flow.Switch() as switch:
            with switch.case(global_step <= warmup_steps):
                decay_lr = learning_rate * (
                    (multiplier - 1.) * global_step / warmup_steps + 1.)
                tensor.assign(decay_lr, lr)
            with switch.default():
                learning_rate = learning_rate * multiplier
                decay_lr = learning_rate * 0.5 * (ops.cos(
                    (global_step - warmup_steps) * math.pi / total_step) + 1)
                tensor.assign(decay_lr, lr)

        return lr
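
# A minimal usage sketch for lr_warmup (illustrative numbers): the rate climbs
# from learning_rate to learning_rate * multiplier over warmup_steps, then
# follows a cosine decay from that peak.
lr = lr_warmup(
    learning_rate=0.025,
    warmup_steps=2000,
    total_step=90000,
    multiplier=4.0,
    step_each_epoch=1250)  # currently unused inside lr_warmup
optimizer = fluid.optimizer.Momentum(learning_rate=lr, momentum=0.9)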
def scheduler_handler(self, max_train_steps):
    scheduled_lr = fluid.layers.create_global_var(
        shape=[1],
        value=self.learning_rate,
        dtype='float32',
        persistable=True,
        name="learning_rate")

    if not self.scheduler["slanted_triangle"]["cut_fraction"]:
        warmup_steps = int(max_train_steps * self.scheduler["warmup"])
        linear_decay_start = int(
            max_train_steps * self.scheduler["linear_decay"]["start_point"])
        if linear_decay_start < warmup_steps:
            logger.warning(
                "Linear decay cannot start during the warmup process; "
                "it will start after warmup ends!")
            linear_decay_start = warmup_steps
        if self.scheduler["noam_decay"]:
            if warmup_steps > 0:
                scheduled_lr = fluid.layers.learning_rate_scheduler \
                    .noam_decay(1 / (warmup_steps * (self.learning_rate ** 2)),
                                warmup_steps)
            else:
                logger.warning(
                    "Noam decay learning rate scheduler should have positive "
                    "warmup steps, using constant learning rate instead!")
        if not self.scheduler["noam_decay"] and \
                (warmup_steps > 0 or self.scheduler["linear_decay"]["start_point"] < 1):
            with self.main_program._lr_schedule_guard():
                global_step = lr_scheduler._decay_step_counter()
                with control_flow.Switch() as switch:
                    if warmup_steps > 0:
                        with switch.case(global_step < warmup_steps):
                            # Linear warmup from 0 to the base learning rate.
                            decayed_lr = self.learning_rate * global_step * 1.0 / warmup_steps
                            fluid.layers.assign(decayed_lr, scheduled_lr)
                    if self.scheduler["linear_decay"]["start_point"] < 1:
                        with switch.case(global_step >= linear_decay_start):
                            # Linear decay toward end_learning_rate over max_train_steps.
                            decayed_lr = lr_scheduler.polynomial_decay(
                                learning_rate=self.learning_rate,
                                decay_steps=max_train_steps,
                                end_learning_rate=self.scheduler["linear_decay"]["end_learning_rate"],
                                power=1.0,
                                cycle=False)
                            fluid.layers.assign(decayed_lr, scheduled_lr)
    else:
        if self.scheduler["warmup"] or self.scheduler["noam_decay"] or \
                self.scheduler["linear_decay"]["start_point"] < 1:
            logger.warning(
                "You are using the slanted_triangle learning rate, "
                "which disables warmup, noam_decay and linear_decay.")
        cut_step = int(max_train_steps *
                       self.scheduler["slanted_triangle"]["cut_fraction"])
        ratio = self.scheduler["slanted_triangle"]["ratio"]
        global_step = lr_scheduler._decay_step_counter()
        with control_flow.Switch() as switch:
            with switch.case(global_step <= cut_step):
                # Increase linearly from lr/ratio to lr over the cut phase.
                pct = global_step / cut_step
                decayed_lr = self.learning_rate * (1 + pct * (ratio - 1)) / ratio
                fluid.layers.assign(decayed_lr, scheduled_lr)
            with switch.default():
                # Decrease linearly back toward lr/ratio over the remaining steps.
                pct = 1 - (global_step - cut_step) / (max_train_steps - cut_step)
                decayed_lr = self.learning_rate * (1 + pct * (ratio - 1)) / ratio
                fluid.layers.assign(decayed_lr, scheduled_lr)

    super(CombinedStrategy, self).__init__(
        optimizer_name=self._optimizer_name, learning_rate=scheduled_lr)

    if self.scheduler["discriminative"]["blocks"]:
        # Discriminative fine-tuning: scale each block's learning rate by a
        # decreasing power of 1/factor as depth increases.
        _block_layers = math.ceil(
            len(self.sorted_depth) / self.scheduler["discriminative"]["blocks"])
        power = 0
        for cnt, depth in enumerate(self.sorted_depth):
            for index, param in enumerate(self.depth_params_dict[depth]):
                param.optimize_attr["learning_rate"] *= \
                    pow(1.0 / self.scheduler["discriminative"]["factor"], power)
            if cnt and cnt % _block_layers == 0:
                power += 1
    return scheduled_lr
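
# A sketch of the scheduler configuration this handler reads (keys are taken
# from the accesses above; the values are illustrative, not defaults from the
# source). With cut_fraction == 0 the warmup / noam_decay / linear_decay branch
# is used; a non-zero cut_fraction switches to the slanted-triangle schedule.
example_scheduler = {
    "warmup": 0.1,                  # fraction of max_train_steps spent warming up
    "noam_decay": False,
    "linear_decay": {
        "start_point": 0.5,         # fraction of training at which linear decay starts
        "end_learning_rate": 0.0,
    },
    "slanted_triangle": {
        "cut_fraction": 0.0,        # 0 disables the slanted-triangle schedule
        "ratio": 32,
    },
    "discriminative": {
        "blocks": 0,                # 0 disables discriminative fine-tuning
        "factor": 2.6,
    },
}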
def exponential_with_warmup_decay(learning_rate, boundaries, values, warmup_iter, warmup_factor):
    """Variant of exponential_with_warmup_decay: after the linear warmup, the
    rate follows a staircase exponential decay instead of the piecewise-constant
    values; the boundaries/values arguments are kept for interface compatibility
    but are not used."""
    global_step = lr_scheduler._decay_step_counter()

    lr = fluid.layers.create_global_var(
        shape=[1],
        value=0.0,
        dtype='float32',
        persistable=True,
        name="learning_rate")

    warmup_iter_var = fluid.layers.fill_constant(
        shape=[1], dtype='float32', value=float(warmup_iter), force_cpu=True)

    with control_flow.Switch() as switch:
        with switch.case(global_step < warmup_iter_var):
            # Warmup: ramp linearly from warmup_factor * learning_rate to learning_rate.
            alpha = global_step / warmup_iter_var
            factor = warmup_factor * (1 - alpha) + alpha
            decayed_lr = learning_rate * factor
            fluid.layers.assign(decayed_lr, lr)

        # After warmup: multiply the rate by 0.94 every 10000 steps (staircase).
        ex_decay_lr = fluid.layers.exponential_decay(
            learning_rate=learning_rate,
            decay_steps=10000,
            decay_rate=0.94,
            staircase=True)
        with switch.default():
            fluid.layers.assign(ex_decay_lr, lr)

    return lr
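
# A minimal usage sketch for the variant above (illustrative numbers). The
# boundaries/values arguments are ignored by this variant, so dummy values are
# passed here only to satisfy the signature.
lr = exponential_with_warmup_decay(
    learning_rate=0.045,
    boundaries=[],
    values=[0.045],
    warmup_iter=1000,
    warmup_factor=0.1)
optimizer = fluid.optimizer.Momentum(learning_rate=lr, momentum=0.9)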