def cosine_decay_with_warmup(learning_rate, step_each_epoch, epochs=120):
    """Applies cosine decay to the learning rate.

    lr = 0.05 * (math.cos(epoch * (math.pi / 120)) + 1)

    Decreases lr for every mini-batch and starts with warmup.
    """
    global_step = _decay_step_counter()
    lr = fluid.layers.tensor.create_global_var(
        shape=[1],
        value=0.0,
        dtype='float32',
        persistable=True,
        name="learning_rate")

    warmup_epoch = fluid.layers.fill_constant(
        shape=[1], dtype='float32', value=float(5), force_cpu=True)

    with init_on_cpu():
        epoch = ops.floor(global_step / step_each_epoch)
        with fluid.layers.control_flow.Switch() as switch:
            with switch.case(epoch < warmup_epoch):
                decayed_lr = learning_rate * (
                    global_step / (step_each_epoch * warmup_epoch))
                fluid.layers.tensor.assign(input=decayed_lr, output=lr)
            with switch.default():
                decayed_lr = learning_rate * \
                    (ops.cos((global_step - warmup_epoch * step_each_epoch) *
                             (math.pi / (epochs * step_each_epoch))) + 1) / 2
                fluid.layers.tensor.assign(input=decayed_lr, output=lr)
    return lr
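# --- Hedged example (not part of the original code) ---
# A minimal pure-Python sketch of the schedule above, assuming the same
# hard-coded 5 warmup epochs. Useful for sanity-checking the values the
# graph-mode schedule should produce at a given global step.
import math

def cosine_decay_with_warmup_ref(learning_rate, step_each_epoch, global_step,
                                 epochs=120, warmup_epoch=5):
    epoch = global_step // step_each_epoch
    if epoch < warmup_epoch:
        # linear warmup from 0 to learning_rate over the first warmup epochs
        return learning_rate * global_step / (step_each_epoch * warmup_epoch)
    # cosine decay over the full run, offset by the warmup steps
    progress = (global_step - warmup_epoch * step_each_epoch) * \
        math.pi / (epochs * step_each_epoch)
    return learning_rate * (math.cos(progress) + 1) / 2

# e.g. cosine_decay_with_warmup_ref(0.1, 5000, 12500) == 0.05 (mid-warmup)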
def linear_warmup_and_cosine_decay(learning_rate, end_lr, warmup_steps,
                                   max_training_steps):
    """Applies linear warmup and cosine decay to the learning rate."""
    dtype = "float32"
    with fluid.default_main_program()._lr_schedule_guard():
        lr = layers.create_global_var(
            shape=[1],
            value=0.0,
            dtype=dtype,
            persistable=True,
            name="learning_rate")
        global_step = _decay_step_counter(1)

        with layers.control_flow.Switch() as switch:
            with switch.case(global_step < warmup_steps):
                warmup_lr = learning_rate * (global_step / warmup_steps)
                layers.assign(warmup_lr, lr)
            with switch.case(global_step < max_training_steps):
                frac = 0.5 * (ops.cos(
                    (global_step - warmup_steps) * math.pi /
                    (max_training_steps - warmup_steps)) + 1)
                decayed_lr = end_lr + (learning_rate - end_lr) * frac
                layers.assign(decayed_lr, lr)
            with switch.default():
                # constant end_lr after training steps are exhausted
                # (renamed from `learning_rate` to avoid shadowing the argument)
                constant_lr = layers.fill_constant(
                    shape=[1], dtype=dtype, value=end_lr)
                layers.assign(constant_lr, lr)
    return lr
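# --- Hedged example (not part of the original code) ---
# A sketch of how these schedules are typically consumed: each function
# returns a persistable [1]-shaped variable that can be passed to a fluid
# optimizer in place of a float (inside the usual program guards). The
# hyperparameter values below are purely illustrative.
import paddle.fluid as fluid

lr = linear_warmup_and_cosine_decay(
    learning_rate=5e-5, end_lr=0.0, warmup_steps=4000,
    max_training_steps=100000)
optimizer = fluid.optimizer.Adam(learning_rate=lr)
# optimizer.minimize(loss)  # loss comes from the surrounding model definition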
def cosine_decay_with_warmup(learning_rate, max_iters=90000, warmup_iters=1000,
                             warmup_factor=0.1):
    """Applies cosine decay to the learning rate.

    Linearly warms up from learning_rate * warmup_factor to learning_rate
    over the first warmup_iters mini-batches, then applies cosine decay over
    the remaining iterations.
    """
    global_step = _decay_step_counter()
    lr = fluid.layers.tensor.create_global_var(
        shape=[1],
        value=0.0,
        dtype='float32',
        persistable=True,
        name="learning_rate")

    with fluid.layers.control_flow.Switch() as switch:
        with switch.case(global_step < warmup_iters):
            eta_min = learning_rate * warmup_factor
            decayed_lr = eta_min + (learning_rate - eta_min) * (
                global_step / warmup_iters)
            fluid.layers.tensor.assign(input=decayed_lr, output=lr)
        with switch.default():
            decayed_lr = learning_rate * \
                (ops.cos((global_step - warmup_iters) *
                         (math.pi / (max_iters - warmup_iters))) + 1) / 2
            fluid.layers.tensor.assign(input=decayed_lr, output=lr)
    return lr
def exponential_decay_with_warmup(learning_rate, step_each_epoch, decay_epochs,
                                  decay_rate=0.97, warm_up_epoch=5.0):
    """Applies exponential decay to the learning rate."""
    global_step = _decay_step_counter()
    lr = fluid.layers.tensor.create_global_var(
        shape=[1],
        value=0.0,
        dtype='float32',
        persistable=True,
        name="learning_rate")

    warmup_epoch = fluid.layers.fill_constant(
        shape=[1], dtype='float32', value=float(warm_up_epoch), force_cpu=True)

    with init_on_cpu():
        epoch = ops.floor(global_step / step_each_epoch)
        with fluid.layers.control_flow.Switch() as switch:
            with switch.case(epoch < warmup_epoch):
                decayed_lr = learning_rate * (
                    global_step / (step_each_epoch * warmup_epoch))
                fluid.layers.assign(input=decayed_lr, output=lr)
            with switch.default():
                div_res = (global_step -
                           warmup_epoch * step_each_epoch) / decay_epochs
                div_res = ops.floor(div_res)
                decayed_lr = learning_rate * (decay_rate**div_res)
                fluid.layers.assign(input=decayed_lr, output=lr)
    return lr
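# --- Hedged example (not part of the original code) ---
# Pure-Python sketch of the schedule above: linear warmup for warm_up_epoch
# epochs, then a staircase exponential decay where the rate is multiplied by
# decay_rate each time decay_epochs steps elapse past the warmup boundary
# (note decay_epochs divides a step count in the original, despite its name).
import math

def exponential_decay_with_warmup_ref(learning_rate, step_each_epoch,
                                      decay_epochs, step, decay_rate=0.97,
                                      warm_up_epoch=5.0):
    if step // step_each_epoch < warm_up_epoch:
        return learning_rate * step / (step_each_epoch * warm_up_epoch)
    div_res = math.floor(
        (step - warm_up_epoch * step_each_epoch) / decay_epochs)
    return learning_rate * decay_rate ** div_res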
def cosine_decay():
    """Applies cosine decay to the learning rate."""
    # Relies on module-level FLAGS (lr_min, lr_max) and max_step.
    global_step = _decay_step_counter()
    frac = (1 + ops.cos(global_step / max_step * math.pi)) / 2
    return FLAGS.lr_min + (FLAGS.lr_max - FLAGS.lr_min) * frac
def linear_warmup_decay(init_lr, num_train_steps, num_warmup_steps,
                        main_program):
    with main_program._lr_schedule_guard():
        global_step = lr_scheduler._decay_step_counter()

        lr = fluid.layers.create_global_var(
            shape=[1],
            value=init_lr,
            dtype='float32',
            persistable=True,
            name="learning_rate")

        with control_flow.Switch() as switch:
            with switch.case(global_step < num_warmup_steps):
                decayed_lr = init_lr * global_step * 1.0 / num_warmup_steps
                fluid.layers.assign(decayed_lr, lr)
            with switch.default():
                decayed_lr = lr_scheduler.polynomial_decay(
                    learning_rate=init_lr,
                    decay_steps=num_train_steps,
                    end_learning_rate=0.0,
                    power=1.0,
                    cycle=False)
                fluid.layers.assign(decayed_lr, lr)
    return lr
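# --- Hedged example (not part of the original code) ---
# Pure-Python sketch of the BERT-style schedule above: linear warmup, then
# polynomial decay with power=1.0 and end_learning_rate=0.0, i.e. a linear
# ramp down to zero, with progress clipped at num_train_steps (cycle=False).
def linear_warmup_decay_ref(init_lr, num_train_steps, num_warmup_steps,
                            global_step):
    if global_step < num_warmup_steps:
        return init_lr * global_step / num_warmup_steps
    remaining = 1.0 - min(global_step, num_train_steps) / num_train_steps
    return init_lr * remaining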
def __call__(self):
    global_step = _decay_step_counter()
    learning_rate = fluid.layers.tensor.create_global_var(
        shape=[1],
        value=0.0,
        dtype='float32',
        persistable=True,
        name="learning_rate")

    epoch = ops.floor(global_step / self.step_each_epoch)
    with fluid.layers.control_flow.Switch() as switch:
        with switch.case(epoch < self.warmup_epoch):
            decayed_lr = self.lr * \
                (global_step / (self.step_each_epoch * self.warmup_epoch))
            fluid.layers.tensor.assign(input=decayed_lr, output=learning_rate)
        with switch.default():
            current_step = global_step - self.warmup_epoch * self.step_each_epoch
            total_step = (self.epochs - self.warmup_epoch) * self.step_each_epoch
            decayed_lr = self.lr * \
                (ops.cos(current_step * math.pi / total_step) + 1) / 2
            fluid.layers.tensor.assign(input=decayed_lr, output=learning_rate)
    return learning_rate
def cosine_with_warmup_decay(learning_rate, lr_min, steps_one_epoch,
                             warmup_epochs, total_epoch, num_gpu):
    global_step = _decay_step_counter()
    epoch_idx = fluid.layers.floor(global_step / steps_one_epoch)

    lr = fluid.layers.create_global_var(
        shape=[1],
        value=0.0,
        dtype='float32',
        persistable=True,
        name="learning_rate")
    warmup_epoch_var = fluid.layers.fill_constant(
        shape=[1], dtype='float32', value=float(warmup_epochs), force_cpu=True)
    num_gpu_var = fluid.layers.fill_constant(
        shape=[1], dtype='float32', value=float(num_gpu), force_cpu=True)
    batch_idx = global_step - steps_one_epoch * epoch_idx

    # Cosine schedule, computed unconditionally and assigned in the default
    # branch. Progress is measured in fractional epochs over the whole run;
    # the flattened original derived epoch_ from batch_idx alone, which never
    # leaves the first epoch, so epoch_idx is almost certainly the intended
    # base.
    epoch_ = epoch_idx + (batch_idx + 1) / steps_one_epoch
    m = epoch_ / total_epoch
    frac = (1 + ops.cos(math.pi * m)) / 2
    cosine_lr = (lr_min + (learning_rate - lr_min) * frac) * num_gpu_var

    with fluid.layers.control_flow.Switch() as switch:
        with switch.case(epoch_idx < warmup_epoch_var):
            epoch_frac = (batch_idx + 1) / steps_one_epoch
            factor = 1 / num_gpu_var * (
                epoch_frac * (num_gpu_var - 1) / warmup_epoch_var + 1)
            decayed_lr = learning_rate * factor * num_gpu_var
            fluid.layers.assign(decayed_lr, lr)
        with switch.default():
            fluid.layers.assign(cosine_lr, lr)
    return lr
def cosine_decay(lr, step_each_epoch, epochs):
    global_step = _decay_step_counter()
    with init_on_cpu():
        epoch = fluid.layers.floor(global_step / step_each_epoch)
        decayed_lr = lr * (fluid.layers.cos(epoch * (math.pi / epochs)) + 1) / 2
    return decayed_lr
def cosine_decay(learning_rate, step_each_epoch, epochs=120):
    """Applies cosine decay to the learning rate.

    lr = 0.05 * (math.cos(epoch * (math.pi / 120)) + 1)
    """
    global_step = _decay_step_counter()
    epoch = ops.floor(global_step / step_each_epoch)
    decayed_lr = learning_rate * \
        (ops.cos(epoch * (math.pi / epochs)) + 1) / 2
    return decayed_lr
def cosine_warmup_decay(learning_rate, betas, warmup_factor, decay_factor,
                        total_step, warmup_pct):
    def annealing_cos(start, end, pct):
        """Cosine anneal from `start` to `end` as pct goes from 0.0 to 1.0."""
        cos_out = fluid.layers.cos(pct * np.pi) + 1.
        return cos_out * (start - end) / 2. + end

    warmup_start_lr = learning_rate * warmup_factor
    decay_end_lr = learning_rate * decay_factor
    warmup_step = total_step * warmup_pct

    global_step = lr_scheduler._decay_step_counter()

    lr = fluid.layers.create_global_var(
        shape=[1],
        value=float(learning_rate),
        dtype='float32',
        persistable=True,
        name="learning_rate")
    beta1 = fluid.layers.create_global_var(
        shape=[1],
        value=float(betas[0]),
        dtype='float32',
        persistable=True,
        name="beta1")
    warmup_step_var = fluid.layers.fill_constant(
        shape=[1], dtype='float32', value=float(warmup_step), force_cpu=True)

    warmup_pred = global_step < warmup_step_var
    decay_pred = global_step >= warmup_step_var

    # learning rate warmup and decay
    def warmup_lr():
        return annealing_cos(warmup_start_lr, learning_rate,
                             global_step / warmup_step_var)

    def decay_lr():
        return annealing_cos(learning_rate, decay_end_lr,
                             (global_step - warmup_step_var) /
                             (total_step - warmup_step))

    lr = fluid.layers.case(
        pred_fn_pairs=[(warmup_pred, warmup_lr), (decay_pred, decay_lr)])

    # Adam beta1 warmup and decay
    def warmup_beta1():
        return annealing_cos(betas[0], betas[1],
                             global_step / warmup_step_var)

    def decay_beta1():
        # Anneal back from betas[1] to betas[0] over the decay phase. The
        # flattened original repeated the warmup expression here, which looks
        # like a copy-paste slip (compare the Switch-based variant of this
        # function elsewhere in this collection).
        return annealing_cos(betas[1], betas[0],
                             (global_step - warmup_step_var) /
                             (total_step - warmup_step))

    beta1 = fluid.layers.case(
        pred_fn_pairs=[(warmup_pred, warmup_beta1), (decay_pred, decay_beta1)])

    return lr, beta1
def cosine_decay(learning_rate, step_each_epoch, epochs=120):
    """Applies cosine decay to the learning rate.

    lr = 0.05 * (math.cos(epoch * (math.pi / 120)) + 1)
    """
    global_step = _decay_step_counter()
    with init_on_cpu():
        epoch = fluid.layers.floor(global_step / step_each_epoch)
        lr = learning_rate / 2.
        decayed_lr = lr * (fluid.layers.cos(epoch * (math.pi / epochs)) + 1)
    return decayed_lr
def cosine_decay(learning_rate, num_epoch, steps_one_epoch):
    """Applies cosine decay to the learning rate.

    lr = 0.5 * (math.cos(epoch * (math.pi / 120)) + 1)
    """
    global_step = _decay_step_counter()
    decayed_lr = learning_rate * \
        (ops.cos(fluid.layers.floor(global_step / steps_one_epoch) *
                 math.pi / num_epoch) + 1) / 2
    return decayed_lr
def cosine_decay_v2(learning_rate, totalsteps):
    """Applies cosine decay to the learning rate.

    lr = 0.05 * (math.cos(global_step * (math.pi / totalsteps)) + 1)

    Decreases lr for every mini-batch.
    """
    global_step = _decay_step_counter()
    with init_on_cpu():
        decayed_lr = learning_rate * \
            (ops.cos(global_step * (math.pi / float(totalsteps))) + 1) / 2
    return decayed_lr
def cosine_decay(learning_rate, num_epoch, steps_one_epoch):
    """Applies cosine decay to the learning rate.

    lr = 0.5 * (math.cos(epoch * (math.pi / 120)) + 1)
    """
    global_step = _decay_step_counter()
    with init_on_cpu():
        decayed_lr = learning_rate * \
            (ops.cos((global_step / steps_one_epoch) *
                     math.pi / num_epoch) + 1) / 2
    return decayed_lr
def cos_anneal_with_warmup_decay(learning_rate, boundaries, values,
                                 warmup_iter, warmup_factor):
    global_step = lr_scheduler._decay_step_counter()

    lr = fluid.layers.create_global_var(
        shape=[1],
        value=0.0,
        dtype='float32',
        persistable=True,
        name="learning_rate")

    warmup_iter_var = fluid.layers.fill_constant(
        shape=[1], dtype='float32', value=float(warmup_iter), force_cpu=True)

    # Start step and length (in steps) of each cosine segment.
    cos_step = [warmup_iter] + boundaries[:-1]
    cos_boundaries = (np.array(boundaries) - np.array(cos_step)).tolist()

    with control_flow.Switch() as switch:
        with switch.case(global_step < warmup_iter_var):
            alpha = global_step / warmup_iter_var
            factor = warmup_factor * (1 - alpha) + alpha
            decayed_lr = learning_rate * factor
            fluid.layers.assign(decayed_lr, lr)

        for i in range(len(boundaries)):
            boundary_val = fluid.layers.fill_constant(
                shape=[1],
                dtype='float32',
                value=float(boundaries[i]),
                force_cpu=True)
            with switch.case(global_step < boundary_val):
                # quarter-wave cosine anneal within this segment
                cur_epoch_factor = (global_step - cos_step[i]) / cos_boundaries[i]
                cur_epoch_cos_factor = ops.cos(cur_epoch_factor * math.pi / 2.0)
                cur_lr = cur_epoch_cos_factor * values[i]
                fluid.layers.assign(cur_lr, lr)

        last_value_var = fluid.layers.fill_constant(
            shape=[1], dtype='float32', value=float(values[len(values) - 1]))
        with switch.default():
            fluid.layers.assign(last_value_var, lr)
    return lr
def cosine_decay(learning_rate, step_each_epoch, epochs=120):
    """Applies cosine decay to the learning rate.

    lr = 0.05 * (math.cos(epoch * (math.pi / 120)) + 1)
    """
    global_step = _decay_step_counter()
    with init_on_cpu():
        epoch = ops.floor(global_step / step_each_epoch)
        decayed_lr = learning_rate * \
            (ops.cos(epoch * (math.pi / epochs)) + 1) / 2
    return decayed_lr
def cosine_warmup_decay(learning_rate, betas, warmup_factor, decay_factor,
                        total_step, warmup_pct):
    def annealing_cos(start, end, pct):
        """Cosine anneal from `start` to `end` as pct goes from 0.0 to 1.0."""
        cos_out = fluid.layers.cos(pct * np.pi) + 1.
        return cos_out * (start - end) / 2. + end

    warmup_start_lr = learning_rate * warmup_factor
    decay_end_lr = learning_rate * decay_factor
    warmup_step = total_step * warmup_pct

    global_step = lr_scheduler._decay_step_counter()

    lr = fluid.layers.create_global_var(
        shape=[1],
        value=float(learning_rate),
        dtype='float32',
        persistable=True,
        name="learning_rate")
    beta1 = fluid.layers.create_global_var(
        shape=[1],
        value=float(betas[0]),
        dtype='float32',
        persistable=True,
        name="beta1")
    warmup_step_var = fluid.layers.fill_constant(
        shape=[1], dtype='float32', value=float(warmup_step), force_cpu=True)

    with control_flow.Switch() as switch:
        with switch.case(global_step < warmup_step_var):
            cur_lr = annealing_cos(warmup_start_lr, learning_rate,
                                   global_step / warmup_step_var)
            fluid.layers.assign(cur_lr, lr)
            cur_beta1 = annealing_cos(betas[0], betas[1],
                                      global_step / warmup_step_var)
            fluid.layers.assign(cur_beta1, beta1)
        with switch.case(global_step >= warmup_step_var):
            cur_lr = annealing_cos(learning_rate, decay_end_lr,
                                   (global_step - warmup_step_var) /
                                   (total_step - warmup_step))
            fluid.layers.assign(cur_lr, lr)
            cur_beta1 = annealing_cos(betas[1], betas[0],
                                      (global_step - warmup_step_var) /
                                      (total_step - warmup_step))
            fluid.layers.assign(cur_beta1, beta1)
    return lr, beta1
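# --- Hedged example (not part of the original code) ---
# Pure-Python sketch of the one-cycle-style policy above: both the learning
# rate and Adam's beta1 are cosine-annealed, with beta1 moving from betas[0]
# to betas[1] during warmup and back again during decay.
import math

def annealing_cos_ref(start, end, pct):
    return (math.cos(pct * math.pi) + 1.) * (start - end) / 2. + end

def cosine_warmup_decay_ref(lr, betas, warmup_factor, decay_factor,
                            total_step, warmup_pct, step):
    warmup_step = total_step * warmup_pct
    if step < warmup_step:
        pct = step / warmup_step
        return (annealing_cos_ref(lr * warmup_factor, lr, pct),
                annealing_cos_ref(betas[0], betas[1], pct))
    pct = (step - warmup_step) / (total_step - warmup_step)
    return (annealing_cos_ref(lr, lr * decay_factor, pct),
            annealing_cos_ref(betas[1], betas[0], pct))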
def exponential_with_warmup_decay(learning_rate, boundaries, values,
                                  warmup_iter, warmup_factor):
    global_step = lr_scheduler._decay_step_counter()

    lr = fluid.layers.create_global_var(
        shape=[1],
        value=0.0,
        dtype='float32',
        persistable=True,
        name="learning_rate")

    warmup_iter_var = fluid.layers.fill_constant(
        shape=[1], dtype='float32', value=float(warmup_iter), force_cpu=True)

    with control_flow.Switch() as switch:
        with switch.case(global_step < warmup_iter_var):
            alpha = global_step / warmup_iter_var
            factor = warmup_factor * (1 - alpha) + alpha
            decayed_lr = learning_rate * factor
            fluid.layers.assign(decayed_lr, lr)

        for i in range(len(boundaries)):
            boundary_val = fluid.layers.fill_constant(
                shape=[1],
                dtype='float32',
                value=float(boundaries[i]),
                force_cpu=True)
            value_var = fluid.layers.fill_constant(
                shape=[1], dtype='float32', value=float(values[i]))
            with switch.case(global_step < boundary_val):
                fluid.layers.assign(value_var, lr)

        last_value_var = fluid.layers.fill_constant(
            shape=[1], dtype='float32', value=float(values[len(values) - 1]))
        with switch.default():
            fluid.layers.assign(last_value_var, lr)
    return lr
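# --- Hedged example (not part of the original code) ---
# Pure-Python sketch of the detection-style schedule above: linear warmup
# from learning_rate * warmup_factor to learning_rate, then piecewise-constant
# values at the given step boundaries.
def piecewise_with_warmup_ref(learning_rate, boundaries, values,
                              warmup_iter, warmup_factor, step):
    if step < warmup_iter:
        alpha = step / warmup_iter
        return learning_rate * (warmup_factor * (1 - alpha) + alpha)
    for boundary, value in zip(boundaries, values):
        if step < boundary:
            return value
    return values[-1]

# e.g. piecewise_with_warmup_ref(0.01, [60000, 80000], [0.001, 0.0001],
#                                500, 1 / 3., 70000) == 0.0001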
def cosine_decay_v2_with_warmup(learning_rate, warmupsteps, totalsteps):
    """Applies cosine decay to the learning rate.

    lr = 0.05 * (math.cos(epoch * (math.pi / 120)) + 1)

    Decreases lr for every mini-batch and starts with warmup.
    """
    global_step = _decay_step_counter()
    lr = fluid.layers.tensor.create_global_var(
        shape=[1],
        value=0.0,
        dtype='float32',
        persistable=True,
        name="learning_rate")

    with init_on_cpu():
        with control_flow.Switch() as switch:
            with switch.case(global_step < warmupsteps):
                decayed_lr = learning_rate * (global_step / float(warmupsteps))
                fluid.layers.tensor.assign(input=decayed_lr, output=lr)
            with switch.default():
                decayed_lr = learning_rate * \
                    (ops.cos((global_step - warmupsteps) *
                             (math.pi / totalsteps)) + 1) / 2
                fluid.layers.tensor.assign(input=decayed_lr, output=lr)
    return lr
def __call__(self, base_lr=None, learning_rate=None):
    steps = _decay_step_counter()
    total = self.total_steps
    if self.skip_steps is not None:
        total -= self.skip_steps

    lr = fluid.layers.tensor.create_global_var(
        shape=[1],
        value=base_lr,
        dtype='float32',
        persistable=True,
        name="learning_rate")

    def decay():
        cos_lr = base_lr * .5 * (cos(steps * (math.pi / total)) + 1)
        fluid.layers.tensor.assign(input=cos_lr, output=lr)

    if self.skip_steps is None:
        decay()
    else:
        # hold the initial lr for skip_steps steps, then start the cosine decay
        skipped = steps >= self.skip_steps
        fluid.layers.cond(skipped, decay)
    return lr
def linear_warmup_and_invsqrt_decay(learning_rate, warmup_steps, decay_steps):
    """Applies linear warmup and inverse-sqrt decay to the learning rate."""
    dtype = "float32"
    with fluid.default_main_program()._lr_schedule_guard():
        lr = layers.create_global_var(
            shape=[1],
            value=0.0,
            dtype=dtype,
            persistable=True,
            name="learning_rate")
        global_step = _decay_step_counter(1)

        with layers.control_flow.Switch() as switch:
            with switch.case(global_step < warmup_steps):
                warmup_lr = learning_rate * (global_step / warmup_steps)
                layers.assign(warmup_lr, lr)
            with switch.default():
                # Decay from the base rate. The flattened original multiplied
                # the mutable lr variable by the sqrt factor, which compounds
                # the decay on every step instead of evaluating it once.
                decayed_lr = learning_rate * ops.sqrt(
                    decay_steps / (global_step - warmup_steps + decay_steps))
                layers.assign(decayed_lr, lr)
    return lr
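# --- Hedged example (not part of the original code) ---
# Pure-Python sketch of the intended schedule above: linear warmup to
# learning_rate, then an inverse-square-root decay that equals learning_rate
# at the end of warmup and halves once 3 * decay_steps further steps elapse.
import math

def linear_warmup_and_invsqrt_decay_ref(learning_rate, warmup_steps,
                                        decay_steps, step):
    if step < warmup_steps:
        return learning_rate * step / warmup_steps
    return learning_rate * math.sqrt(
        decay_steps / (step - warmup_steps + decay_steps))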
def cosine_decay_with_warmup(learning_rate, step_each_epoch, epochs=500,
                             warmup_minibatch=1000):
    """Applies cosine decay to the learning rate.

    lr = 0.05 * (math.cos(epoch * (math.pi / 120)) + 1)

    Decreases lr for every mini-batch and starts with warmup.

    Args:
        learning_rate (float): initial learning rate
        step_each_epoch (int): number of steps in each training epoch
        epochs (int): number of training epochs
        warmup_minibatch (int): number of minibatches for warmup

    Returns:
        lr (Variable): learning rate tensor
    """
    global_step = _decay_step_counter()
    lr = fluid.layers.tensor.create_global_var(
        shape=[1],
        value=0.0,
        dtype='float32',
        persistable=True,
        name="learning_rate")
    warmup_minibatch = fluid.layers.fill_constant(
        shape=[1],
        dtype='float32',
        value=float(warmup_minibatch),
        force_cpu=True)

    with fluid.layers.control_flow.Switch() as switch:
        with switch.case(global_step < warmup_minibatch):
            decayed_lr = learning_rate * (1.0 * global_step / warmup_minibatch)
            fluid.layers.tensor.assign(input=decayed_lr, output=lr)
        with switch.default():
            decayed_lr = learning_rate * \
                (ops.cos((global_step - warmup_minibatch) *
                         (math.pi / (epochs * step_each_epoch))) + 1) / 2
            fluid.layers.tensor.assign(input=decayed_lr, output=lr)
    return lr
def main():
    env = os.environ
    FLAGS.dist = 'PADDLE_TRAINER_ID' in env and 'PADDLE_TRAINERS_NUM' in env
    if FLAGS.dist:
        trainer_id = int(env['PADDLE_TRAINER_ID'])
        local_seed = (99 + trainer_id)
        random.seed(local_seed)
        np.random.seed(local_seed)

    if FLAGS.enable_ce:
        random.seed(0)
        np.random.seed(0)

    cfg = load_config(FLAGS.config)
    merge_config(FLAGS.opt)
    check_config(cfg)
    # check if set use_gpu=True in paddlepaddle cpu version
    check_gpu(cfg.use_gpu)
    # check if paddlepaddle version is satisfied
    check_version()

    save_only = getattr(cfg, 'save_prediction_only', False)
    if save_only:
        raise NotImplementedError('The config file only supports prediction,'
                                  ' the training stage is not implemented now')

    main_arch = cfg.architecture

    if cfg.use_gpu:
        devices_num = fluid.core.get_cuda_device_count()
    else:
        devices_num = int(os.environ.get('CPU_NUM', 1))

    if 'FLAGS_selected_gpus' in env:
        device_id = int(env['FLAGS_selected_gpus'])
    else:
        device_id = 0
    place = fluid.CUDAPlace(device_id) if cfg.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    lr_builder = create('LearningRate')
    optim_builder = create('OptimizerBuilder')

    # build program
    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    if FLAGS.enable_ce:
        startup_prog.random_seed = 1000
        train_prog.random_seed = 1000
    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            model = create(main_arch)
            if FLAGS.fp16:
                assert (getattr(model.backbone, 'norm_type', None)
                        != 'affine_channel'), \
                    '--fp16 currently does not support affine channel, ' \
                    ' please modify backbone settings to use batch norm'

            with mixed_precision_context(FLAGS.loss_scale, FLAGS.fp16) as ctx:
                inputs_def = cfg['TrainReader']['inputs_def']
                feed_vars, train_loader = model.build_inputs(**inputs_def)
                train_fetches = model.train(feed_vars)
                loss = train_fetches['loss']
                if FLAGS.fp16:
                    loss *= ctx.get_loss_scale_var()
                lr = lr_builder()
                optimizer = optim_builder(lr)
                optimizer.minimize(loss)
                if FLAGS.fp16:
                    loss /= ctx.get_loss_scale_var()

            if 'use_ema' in cfg and cfg['use_ema']:
                global_steps = _decay_step_counter()
                ema = ExponentialMovingAverage(
                    cfg['ema_decay'], thres_steps=global_steps)
                ema.update()

    # parse train fetches
    train_keys, train_values, _ = parse_fetches(train_fetches)
    train_values.append(lr)

    if FLAGS.eval:
        eval_prog = fluid.Program()
        with fluid.program_guard(eval_prog, startup_prog):
            with fluid.unique_name.guard():
                model = create(main_arch)
                inputs_def = cfg['EvalReader']['inputs_def']
                feed_vars, eval_loader = model.build_inputs(**inputs_def)
                fetches = model.eval(feed_vars)
        eval_prog = eval_prog.clone(True)

        eval_reader = create_reader(cfg.EvalReader, devices_num=1)
        eval_loader.set_sample_list_generator(eval_reader, place)

        # parse eval fetches
        extra_keys = []
        if cfg.metric == 'COCO':
            extra_keys = ['im_info', 'im_id', 'im_shape']
        if cfg.metric == 'VOC':
            extra_keys = ['gt_bbox', 'gt_class', 'is_difficult']
        if cfg.metric == 'WIDERFACE':
            extra_keys = ['im_id', 'im_shape', 'gt_bbox']
        eval_keys, eval_values, eval_cls = parse_fetches(fetches, eval_prog,
                                                         extra_keys)

    # compile program for multi-devices
    build_strategy = fluid.BuildStrategy()
    build_strategy.fuse_all_optimizer_ops = False
    # only enable sync_bn in multi GPU devices
    sync_bn = getattr(model.backbone, 'norm_type', None) == 'sync_bn'
    build_strategy.sync_batch_norm = sync_bn and devices_num > 1 \
        and cfg.use_gpu

    exec_strategy = fluid.ExecutionStrategy()
    # iteration number when CompiledProgram tries to drop local execution
    # scopes. Set it to 1 to save memory usage, so that unused variables in
    # local execution scopes can be deleted after each iteration.
    exec_strategy.num_iteration_per_drop_scope = 1
    if FLAGS.dist:
        dist_utils.prepare_for_multi_process(exe, build_strategy, startup_prog,
                                             train_prog)
        exec_strategy.num_threads = 1

    exe.run(startup_prog)
    compiled_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel(
        loss_name=loss.name,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)

    if FLAGS.eval:
        compiled_eval_prog = fluid.CompiledProgram(eval_prog)

    fuse_bn = getattr(model.backbone, 'norm_type', None) == 'affine_channel'
    ignore_params = cfg.finetune_exclude_pretrained_params \
        if 'finetune_exclude_pretrained_params' in cfg else []

    start_iter = 0
    if FLAGS.resume_checkpoint:
        checkpoint.load_checkpoint(exe, train_prog, FLAGS.resume_checkpoint)
        start_iter = checkpoint.global_step()
    elif cfg.pretrain_weights and fuse_bn and not ignore_params:
        checkpoint.load_and_fusebn(exe, train_prog, cfg.pretrain_weights)
    elif cfg.pretrain_weights:
        checkpoint.load_params(
            exe, train_prog, cfg.pretrain_weights, ignore_params=ignore_params)

    train_reader = create_reader(
        cfg.TrainReader, (cfg.max_iters - start_iter) * devices_num,
        cfg,
        devices_num=devices_num)
    train_loader.set_sample_list_generator(train_reader, place)

    # whether output bbox is normalized in model output layer
    is_bbox_normalized = False
    if hasattr(model, 'is_bbox_normalized') and \
            callable(model.is_bbox_normalized):
        is_bbox_normalized = model.is_bbox_normalized()

    # if map_type not set, use default 11point, only used in VOC eval
    map_type = cfg.map_type if 'map_type' in cfg else '11point'

    train_stats = TrainingStats(cfg.log_smooth_window, train_keys)
    train_loader.start()
    start_time = time.time()
    end_time = time.time()

    cfg_name = os.path.basename(FLAGS.config).split('.')[0]
    save_dir = os.path.join(cfg.save_dir, cfg_name)
    time_stat = deque(maxlen=cfg.log_smooth_window)
    best_box_ap_list = [0.0, 0]  # [map, iter]

    # use VisualDL to log data
    if FLAGS.use_vdl:
        from visualdl import LogWriter
        vdl_writer = LogWriter(FLAGS.vdl_log_dir)
        vdl_loss_step = 0
        vdl_mAP_step = 0

    for it in range(start_iter, cfg.max_iters):
        start_time = end_time
        end_time = time.time()
        time_stat.append(end_time - start_time)
        time_cost = np.mean(time_stat)
        eta_sec = (cfg.max_iters - it) * time_cost
        eta = str(datetime.timedelta(seconds=int(eta_sec)))
        outs = exe.run(compiled_train_prog, fetch_list=train_values)
        stats = {k: np.array(v).mean() for k, v in zip(train_keys, outs[:-1])}

        # use VisualDL to log loss
        if FLAGS.use_vdl:
            if it % cfg.log_iter == 0:
                for loss_name, loss_value in stats.items():
                    vdl_writer.add_scalar(loss_name, loss_value, vdl_loss_step)
                vdl_loss_step += 1

        train_stats.update(stats)
        logs = train_stats.log()
        if it % cfg.log_iter == 0 and (not FLAGS.dist or trainer_id == 0):
            strs = 'iter: {}, lr: {:.6f}, {}, time: {:.3f}, eta: {}'.format(
                it, np.mean(outs[-1]), logs, time_cost, eta)
            logger.info(strs)

        # NOTE: profiler tools, used for benchmark
        if FLAGS.is_profiler and it == 5:
            profiler.start_profiler("All")
        elif FLAGS.is_profiler and it == 10:
            profiler.stop_profiler("total", FLAGS.profiler_path)
            return

        if (it > 0 and it % cfg.snapshot_iter == 0 or it == cfg.max_iters - 1) \
                and (not FLAGS.dist or trainer_id == 0):
            save_name = str(it) if it != cfg.max_iters - 1 else "model_final"
            if 'use_ema' in cfg and cfg['use_ema']:
                exe.run(ema.apply_program)
            checkpoint.save(exe, train_prog, os.path.join(save_dir, save_name))

            if FLAGS.eval:
                # evaluation
                resolution = None
                if 'Mask' in cfg.architecture:
                    resolution = model.mask_head.resolution
                results = eval_run(
                    exe,
                    compiled_eval_prog,
                    eval_loader,
                    eval_keys,
                    eval_values,
                    eval_cls,
                    cfg,
                    resolution=resolution)
                box_ap_stats = eval_results(
                    results, cfg.metric, cfg.num_classes, resolution,
                    is_bbox_normalized, FLAGS.output_eval, map_type,
                    cfg['EvalReader']['dataset'])

                # use VisualDL to log mAP
                if FLAGS.use_vdl:
                    vdl_writer.add_scalar("mAP", box_ap_stats[0], vdl_mAP_step)
                    vdl_mAP_step += 1

                if box_ap_stats[0] > best_box_ap_list[0]:
                    best_box_ap_list[0] = box_ap_stats[0]
                    best_box_ap_list[1] = it
                    checkpoint.save(exe, train_prog,
                                    os.path.join(save_dir, "best_model"))
                logger.info("Best test box ap: {}, in iter: {}".format(
                    best_box_ap_list[0], best_box_ap_list[1]))

            if 'use_ema' in cfg and cfg['use_ema']:
                exe.run(ema.restore_program)

    train_loader.reset()
def exponential_with_warmup_decay(learning_rate, boundaries, values,
                                  warmup_iter, warmup_factor):
    global_step = lr_scheduler._decay_step_counter()

    lr = fluid.layers.create_global_var(
        shape=[1],
        value=0.0,
        dtype='float32',
        persistable=True,
        name="learning_rate")

    warmup_iter_var = fluid.layers.fill_constant(
        shape=[1], dtype='float32', value=float(warmup_iter), force_cpu=True)

    # Staircase exponential decay used after warmup. The boundaries/values
    # arguments are kept only for interface compatibility; the original
    # piecewise branch was disabled (commented out) in the source.
    ex_decay_lr = fluid.layers.exponential_decay(
        learning_rate=learning_rate,
        decay_steps=10000,
        decay_rate=0.94,
        staircase=True)

    with control_flow.Switch() as switch:
        with switch.case(global_step < warmup_iter_var):
            alpha = global_step / warmup_iter_var
            factor = warmup_factor * (1 - alpha) + alpha
            decayed_lr = learning_rate * factor
            fluid.layers.assign(decayed_lr, lr)
        with switch.default():
            fluid.layers.assign(ex_decay_lr, lr)
    return lr
def poly_decay():
    global_step = _decay_step_counter()
    with init_on_cpu():
        decayed_lr = LEARNING_RATE * (fluid.layers.pow(
            (1 - global_step / TOTAL_STEP), POWER))
    return decayed_lr
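# --- Hedged example (not part of the original code) ---
# Pure-Python sketch of the polynomial ("poly") policy above, common in
# semantic segmentation: lr shrinks as (1 - step/total)^power, reaching 0 at
# the final step. LEARNING_RATE, TOTAL_STEP and POWER are module-level
# constants in the original.
def poly_decay_ref(learning_rate, total_step, power, step):
    return learning_rate * (1 - step / total_step) ** power

# e.g. poly_decay_ref(0.01, 100000, 0.9, 50000) ~= 0.0054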
def cosine_annealing(self):
    step = _decay_step_counter()
    lr = FLAGS.lr_min + (FLAGS.lr_max - FLAGS.lr_min) / 2 \
        * (1.0 + fluid.layers.ops.cos(step / self.max_step * math.pi))
    return lr
def scheduler_handler(self, max_train_steps):
    scheduled_lr = fluid.layers.create_global_var(
        shape=[1],
        value=self.learning_rate,
        dtype='float32',
        persistable=True,
        name="learning_rate")

    if not self.scheduler["slanted_triangle"]["cut_fraction"]:
        warmup_steps = int(max_train_steps * self.scheduler["warmup"])
        linear_decay_start = int(
            max_train_steps * self.scheduler["linear_decay"]["start_point"])
        if linear_decay_start < warmup_steps:
            logger.warning(
                "linear decay can not start during the warmup process, "
                "it will start after warmup ends!")
            linear_decay_start = warmup_steps

        if self.scheduler["noam_decay"]:
            if warmup_steps > 0:
                scheduled_lr = fluid.layers.learning_rate_scheduler \
                    .noam_decay(1 / (warmup_steps * (self.learning_rate ** 2)),
                                warmup_steps)
            else:
                logger.warning(
                    "Noam decay learning rate scheduler should have positive "
                    "warmup steps, using constant learning rate instead!")

        if not self.scheduler["noam_decay"] and \
                (warmup_steps > 0 or
                 self.scheduler["linear_decay"]["start_point"] < 1):
            with self.main_program._lr_schedule_guard():
                global_step = lr_scheduler._decay_step_counter()
                with control_flow.Switch() as switch:
                    if warmup_steps > 0:
                        with switch.case(global_step < warmup_steps):
                            decayed_lr = self.learning_rate * \
                                global_step * 1.0 / warmup_steps
                            fluid.layers.assign(decayed_lr, scheduled_lr)
                    if self.scheduler["linear_decay"]["start_point"] < 1:
                        with switch.case(global_step >= linear_decay_start):
                            decayed_lr = lr_scheduler.polynomial_decay(
                                learning_rate=self.learning_rate,
                                decay_steps=max_train_steps,
                                end_learning_rate=self.scheduler[
                                    "linear_decay"]["end_learning_rate"],
                                power=1.0,
                                cycle=False)
                            fluid.layers.assign(decayed_lr, scheduled_lr)
    else:
        if self.scheduler["warmup"] or self.scheduler["noam_decay"] or \
                self.scheduler["linear_decay"]["start_point"] < 1:
            logger.warning(
                "You are using the slanted_triangle learning rate schedule, "
                "which disables warmup, noam_decay and linear_decay.")
        cut_step = int(max_train_steps *
                       self.scheduler["slanted_triangle"]["cut_fraction"])
        ratio = self.scheduler["slanted_triangle"]["ratio"]
        global_step = lr_scheduler._decay_step_counter()
        with control_flow.Switch() as switch:
            with switch.case(global_step <= cut_step):
                pct = global_step / cut_step
                decayed_lr = self.learning_rate * (1 + pct *
                                                   (ratio - 1)) / ratio
                fluid.layers.assign(decayed_lr, scheduled_lr)
            with switch.default():
                pct = 1 - (global_step - cut_step) / (max_train_steps -
                                                      cut_step)
                decayed_lr = self.learning_rate * (1 + pct *
                                                   (ratio - 1)) / ratio
                fluid.layers.assign(decayed_lr, scheduled_lr)

    super(CombinedStrategy, self).__init__(
        optimizer_name=self._optimizer_name, learning_rate=scheduled_lr)

    if self.scheduler["discriminative"]["blocks"]:
        _block_layers = math.ceil(
            len(self.sorted_depth) /
            self.scheduler["discriminative"]["blocks"])
        power = 0
        for cnt, depth in enumerate(self.sorted_depth):
            for index, param in enumerate(self.depth_params_dict[depth]):
                param.optimize_attr["learning_rate"] *= \
                    pow(1.0 / self.scheduler["discriminative"]["factor"],
                        power)
            if cnt and cnt % _block_layers == 0:
                power += 1
    return scheduled_lr
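# --- Hedged example (not part of the original code) ---
# Pure-Python sketch of the slanted triangular schedule above (ULMFiT-style):
# the lr rises linearly from learning_rate/ratio to learning_rate over the
# first cut_fraction of training, then falls linearly back.
def slanted_triangle_ref(learning_rate, max_train_steps, cut_fraction, ratio,
                         step):
    cut_step = int(max_train_steps * cut_fraction)
    if step <= cut_step:
        pct = step / cut_step
    else:
        pct = 1 - (step - cut_step) / (max_train_steps - cut_step)
    return learning_rate * (1 + pct * (ratio - 1)) / ratio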
def poly_decay():
    global_step = _decay_step_counter()
    # The flattened source is truncated here; the body below follows the
    # module-level-constant pattern of the other poly_decay in this
    # collection, which is almost certainly what was cut off.
    with init_on_cpu():
        decayed_lr = LEARNING_RATE * (fluid.layers.pow(
            (1 - global_step / TOTAL_STEP), POWER))
    return decayed_lr
# Build the loss function.
y_true = P.data(
    name='y_true',
    shape=[-1, 8, 28, 28],
    append_batch_size=False,
    dtype='float32')
# Square the element-wise differences first; either the P.pow() op or
# Python's ** operator works here.
mseloss = P.pow(y_true - act02_out_tensor, 2)
mseloss = P.reduce_mean(mseloss)  # then average: this is the MSE loss

# Optimizer.
optimizer = fluid.optimizer.SGD(learning_rate=lr)
optimizer.minimize(mseloss)

# EMA (exponential moving average of the weights).
global_steps = _decay_step_counter()
ema = ExponentialMovingAverage(ema_decay, thres_steps=global_steps)
ema.update()

eval_prog = fluid.Program()
with fluid.program_guard(eval_prog, startup_prog):
    with fluid.unique_name.guard():
        # Rebuild the network with the same tensor names; the loss layers
        # are not needed for evaluation.
        inputs = P.data(
            name='input_1',
            shape=[-1, 3, 28, 28],
            append_batch_size=False,
            dtype='float32')
        conv01_out_tensor = fluid.layers.conv2d(
            input=inputs,
            num_filters=8,
            filter_size=1,