def gym_train(self):
    """Train the agent on a (vectorized) gym environment.

    All hyper-parameters are read from ``self.train_args`` / ``self.model_args``:
        begin_episode: initial episode
        save_frequency: how often to save checkpoints
        max_step: maximum number of steps in an episode
        max_episode: maximum number of episodes in this training task
        render: specify whether to render the env or not
        render_episode: if 'render' is false, the episode from which to
            start rendering the env
        policy_mode: 'on-policy' or 'off-policy'
    """
    begin_episode = int(self.train_args['begin_episode'])
    render = bool(self.train_args['render'])
    render_episode = int(self.train_args.get('render_episode', 50000))
    save_frequency = int(self.train_args['save_frequency'])
    max_step = int(self.train_args['max_step'])
    max_episode = int(self.train_args['max_episode'])
    eval_while_train = bool(self.train_args['eval_while_train'])
    # BUGFIX: `.get()` without a default returns None when the key is absent,
    # and `int(None)` raises TypeError — so these "optional" keys were in fact
    # mandatory.  Supply fallbacks, mirroring how `render_episode` is handled.
    # TODO(review): confirm these defaults against the project's config defaults.
    max_eval_episode = int(self.train_args.get('max_eval_episode', 100))
    off_policy_step_eval = bool(self.train_args['off_policy_step_eval'])
    off_policy_step_eval_num = int(
        self.train_args.get('off_policy_step_eval_num', 32))
    policy_mode = str(self.model_args['policy_mode'])
    moving_average_episode = int(self.train_args['moving_average_episode'])
    add_noise2buffer = bool(self.train_args['add_noise2buffer'])
    add_noise2buffer_episode_interval = int(
        self.train_args['add_noise2buffer_episode_interval'])
    add_noise2buffer_steps = int(self.train_args['add_noise2buffer_steps'])
    total_step_control = bool(self.train_args['total_step_control'])
    max_total_step = int(self.train_args['max_total_step'])
    # When stepping is bounded globally, reuse the step budget as the episode
    # bound (each episode performs at least one step, so this cannot cut the
    # run short before the step check below fires).
    if total_step_control:
        max_episode = max_total_step

    i, state, new_state = self.init_variables()
    sma = SMA(moving_average_episode)
    total_step = 0
    for episode in range(begin_episode, max_episode):
        state[i] = self.env.reset()
        dones_flag = np.full(self.env.n, False)
        step = 0
        r = np.zeros(self.env.n)
        last_done_step = -1
        while True:
            step += 1
            r_tem = np.zeros(self.env.n)
            if render or episode > render_episode:
                self.env.render()
            action = self.model.choose_action(s=state[0], visual_s=state[1])
            new_state[i], reward, done, info = self.env.step(action)
            # Credit reward only to environments that have not finished yet,
            # so `r` accumulates per-env episode returns.
            unfinished_index = np.where(dones_flag == False)[0]
            dones_flag += done
            r_tem[unfinished_index] = reward[unfinished_index]
            r += r_tem
            self.model.store_data(s=state[0],
                                  visual_s=state[1],
                                  a=action,
                                  r=reward,
                                  s_=new_state[0],
                                  visual_s_=new_state[1],
                                  done=done)
            if policy_mode == 'off-policy':
                # Off-policy: one learn step per environment step.
                self.model.learn(episode=episode, step=1)
                if off_policy_step_eval:
                    self.gym_step_eval(total_step, self.model,
                                       off_policy_step_eval_num, max_step)
            total_step += 1
            if total_step_control and total_step > max_total_step:
                return
            if all(dones_flag):
                if last_done_step == -1:
                    last_done_step = step
                if policy_mode == 'off-policy':
                    break
            if step >= max_step:
                break
            # Check whether any sub-environment finished and needs a partial reset.
            if len(self.env.dones_index):
                new_state[i][self.env.dones_index] = self.env.partial_reset()
            state[i] = new_state[i]
        sma.update(r)
        if policy_mode == 'on-policy':
            # On-policy: one learn call per episode, over all collected steps.
            self.model.learn(episode=episode, step=step)
        self.model.writer_summary(episode,
                                  reward_mean=r.mean(),
                                  reward_min=r.min(),
                                  reward_max=r.max(),
                                  step=last_done_step,
                                  **sma.rs)
        self.pwi('-' * 40)
        self.pwi(
            f'Episode: {episode:3d} | step: {step:4d} | last_done_step {last_done_step:4d} | rewards: {arrprint(r, 3)}'
        )
        if episode % save_frequency == 0:
            self.model.save_checkpoint(episode)
        if add_noise2buffer and episode % add_noise2buffer_episode_interval == 0:
            self.gym_random_sample(steps=add_noise2buffer_steps)
        if eval_while_train and self.env.reward_threshold is not None:
            if r.max() >= self.env.reward_threshold:
                self.pwi(
                    f'-------------------------------------------Evaluate episode: {episode:3d}--------------------------------------------------'
                )
                self.gym_evaluate()
def gym_train(env, model, print_func, begin_train_step, begin_frame_step,
              begin_episode, render, render_episode, save_frequency,
              max_step_per_episode, max_train_episode, eval_while_train,
              max_eval_episode, off_policy_step_eval_episodes,
              off_policy_train_interval, policy_mode, moving_average_episode,
              add_noise2buffer, add_noise2buffer_episode_interval,
              add_noise2buffer_steps, off_policy_eval_interval,
              max_train_step, max_frame_step):
    """Train `model` on a vectorized gym environment `env`.

    Tracks three counters:
      - train_step: number of gradient/learn updates (starts at begin_train_step)
      - frame_step: number of environment frames consumed (starts at
        begin_frame_step; advances by env.n per step)
      - total_step: vectorized environment steps taken in this call

    Training ends when max_train_episode episodes have run, or earlier when a
    positive max_train_step / max_frame_step bound is reached (a checkpoint is
    saved before returning).  Off-policy models learn every
    off_policy_train_interval steps; on-policy models learn once per episode.
    `print_func` is used for per-episode progress output.
    """
    i, state, new_state = init_variables(env)
    sma = SMA(moving_average_episode)
    frame_step = begin_frame_step
    train_step = begin_train_step
    total_step = 0
    for episode in range(begin_episode, max_train_episode):
        model.reset()
        state[i] = env.reset()
        dones_flag = np.full(env.n, False)
        step = 0
        r = np.zeros(env.n)
        last_done_step = -1
        while True:
            step += 1
            if render or episode > render_episode:
                env.render(record=False)
            action = model.choose_action(s=state[0], visual_s=state[1])
            new_state[i], reward, done, info, correct_new_state = env.step(
                action)
            # Credit reward only to environments still running this episode.
            unfinished_index = np.where(dones_flag == False)[0]
            dones_flag += done
            r[unfinished_index] += reward[unfinished_index]
            model.store_data(s=state[0],
                             visual_s=state[1],
                             a=action,
                             r=reward,
                             s_=new_state[0],
                             visual_s_=new_state[1],
                             done=done)
            # Let the model reset per-env recurrent/internal state for done envs.
            model.partial_reset(done)
            state[i] = correct_new_state
            if policy_mode == 'off-policy':
                if total_step % off_policy_train_interval == 0:
                    model.learn(episode=episode, train_step=train_step)
                    train_step += 1
                    # NOTE(review): nesting of the checkpoint/eval checks under
                    # the train-interval branch is reconstructed from collapsed
                    # source; it mirrors the on-policy branch below — confirm
                    # against the original file's indentation.
                    if train_step % save_frequency == 0:
                        model.save_checkpoint(train_step=train_step,
                                              episode=episode,
                                              frame_step=frame_step)
                    if off_policy_eval_interval > 0 and train_step % off_policy_eval_interval == 0:
                        # Evaluate on a deep copy so evaluation rollouts do not
                        # disturb the training environment's state.
                        gym_step_eval(deepcopy(env), train_step, model,
                                      off_policy_step_eval_episodes,
                                      max_step_per_episode)
            frame_step += env.n
            total_step += 1
            # A non-positive bound means "unbounded" for that counter.
            if 0 < max_train_step <= train_step or 0 < max_frame_step <= frame_step:
                model.save_checkpoint(train_step=train_step,
                                      episode=episode,
                                      frame_step=frame_step)
                logger.info(
                    f'End Training, learn step: {train_step}, frame_step: {frame_step}'
                )
                return
            if all(dones_flag):
                if last_done_step == -1:
                    last_done_step = step
                if policy_mode == 'off-policy':
                    break
            if step >= max_step_per_episode:
                break
        sma.update(r)
        if policy_mode == 'on-policy':
            model.learn(episode=episode, train_step=train_step)
            train_step += 1
            if train_step % save_frequency == 0:
                model.save_checkpoint(train_step=train_step,
                                      episode=episode,
                                      frame_step=frame_step)
        model.writer_summary(episode,
                             reward_mean=r.mean(),
                             reward_min=r.min(),
                             reward_max=r.max(),
                             step=last_done_step,
                             **sma.rs)
        print_func('-' * 40, out_time=True)
        print_func(
            f'Episode: {episode:3d} | step: {step:4d} | last_done_step {last_done_step:4d} | rewards: {arrprint(r, 2)}'
        )
        if add_noise2buffer and episode % add_noise2buffer_episode_interval == 0:
            # Periodically inject exploratory (random) transitions into the buffer.
            gym_no_op(env, model, pre_fill_steps=add_noise2buffer_steps,
                      print_func=print_func, prefill_choose=False,
                      desc='adding noise')
        if eval_while_train and env.reward_threshold is not None:
            if r.max() >= env.reward_threshold:
                print_func(
                    f'-------------------------------------------Evaluate episode: {episode:3d}--------------------------------------------------'
                )
                gym_evaluate(env, model, max_step_per_episode,
                             max_eval_episode, print_func)
def gym_train(env, model, print_func, begin_episode, render, render_episode,
              save_frequency, max_step, max_episode, eval_while_train,
              max_eval_episode, off_policy_step_eval, off_policy_step_eval_num,
              policy_mode, moving_average_episode, add_noise2buffer,
              add_noise2buffer_episode_interval, add_noise2buffer_steps,
              total_step_control, eval_interval, max_total_step):
    """Run the training loop for `model` on a vectorized gym env.

    Off-policy models learn once per environment step (with optional
    periodic step-evaluation every `eval_interval` steps); on-policy models
    learn once per episode.  When `total_step_control` is set,
    `max_total_step` also serves as the episode bound and training returns
    as soon as the global step count exceeds it.
    """
    if total_step_control:
        max_episode = max_total_step

    idx, state, new_state = init_variables(env)
    reward_sma = SMA(moving_average_episode)
    total_step = 0

    for episode in range(begin_episode, max_episode):
        model.reset()
        state[idx] = env.reset()
        finished = np.full(env.n, False)
        ep_step = 0
        ep_returns = np.zeros(env.n)
        last_done_step = -1

        while True:
            ep_step += 1
            if render or episode > render_episode:
                env.render(record=False)

            action = model.choose_action(s=state[0], visual_s=state[1])
            new_state[idx], reward, done, info, correct_new_state = env.step(action)

            # Credit reward only to environments still running this episode.
            alive = np.where(finished == False)[0]
            finished += done
            ep_returns[alive] += reward[alive]

            model.store_data(s=state[0],
                             visual_s=state[1],
                             a=action,
                             r=reward,
                             s_=new_state[0],
                             visual_s_=new_state[1],
                             done=done)
            model.partial_reset(done)
            state[idx] = correct_new_state

            if policy_mode == 'off-policy':
                model.learn(episode=episode, step=1)
                if off_policy_step_eval and total_step % eval_interval == 0:
                    gym_step_eval(env.eval_env, total_step, model,
                                  off_policy_step_eval_num, max_step)

            total_step += 1
            if total_step_control and total_step > max_total_step:
                return

            if all(finished):
                if last_done_step == -1:
                    last_done_step = ep_step
                if policy_mode == 'off-policy':
                    break
            if ep_step >= max_step:
                break

        reward_sma.update(ep_returns)
        if policy_mode == 'on-policy':
            model.learn(episode=episode, step=ep_step)

        model.writer_summary(episode,
                             reward_mean=ep_returns.mean(),
                             reward_min=ep_returns.min(),
                             reward_max=ep_returns.max(),
                             step=last_done_step,
                             **reward_sma.rs)
        print_func('-' * 40, out_time=True)
        print_func(f'Episode: {episode:3d} | step: {ep_step:4d} | last_done_step {last_done_step:4d} | rewards: {arrprint(ep_returns, 3)}')

        if episode % save_frequency == 0:
            model.save_checkpoint(episode)

        if add_noise2buffer and episode % add_noise2buffer_episode_interval == 0:
            gym_random_sample(env, steps=add_noise2buffer_steps, print_func=print_func)

        if eval_while_train and env.reward_threshold is not None:
            if ep_returns.max() >= env.reward_threshold:
                print_func(f'-------------------------------------------Evaluate episode: {episode:3d}--------------------------------------------------')
                gym_evaluate(env, model, max_step, max_eval_episode, print_func)
def train(env, gym_model, begin_episode, save_frequency, max_step, max_episode,
          eval_while_train, max_eval_episode, render, render_episode,
          policy_mode):
    """
    Inputs:
        env:                    gym environment
        gym_model:              algorithm model
        begin_episode:          initial episode
        save_frequency:         how often to save checkpoints
        max_step:               maximum number of steps in an episode
        max_episode:            maximum number of episodes in this training task
        render:                 specify whether to render the env or not
        render_episode:         if 'render' is false, specify from which episode to render the env
        policy_mode:            'on-policy' or 'off-policy'; off-policy ends the
                                episode as soon as every sub-env is done
    """
    i, state, new_state = init_variables(env)
    sma = SMA(100)  # moving average over the last 100 episode returns
    for episode in range(begin_episode, max_episode):
        state[i] = env.reset()
        dones_flag = np.full(env.n, False)
        step = 0
        r = np.zeros(env.n)
        last_done_step = -1
        while True:
            step += 1
            r_tem = np.zeros(env.n)
            if render or episode > render_episode:
                env.render()
            action = gym_model.choose_action(s=state[0], visual_s=state[1])
            new_state[i], reward, done, info = env.step(action)
            # Credit reward only to environments that have not finished yet,
            # so `r` accumulates per-env episode returns.
            unfinished_index = np.where(dones_flag == False)[0]
            dones_flag += done
            r_tem[unfinished_index] = reward[unfinished_index]
            r += r_tem
            gym_model.store_data(s=state[0],
                                 visual_s=state[1],
                                 a=action,
                                 r=reward,
                                 s_=new_state[0],
                                 visual_s_=new_state[1],
                                 done=done)
            if all(dones_flag):
                if last_done_step == -1:
                    last_done_step = step
                if policy_mode == 'off-policy':
                    break
            if step >= max_step:
                break
            # Check whether any sub-environment finished and needs a partial reset.
            if len(env.dones_index):
                new_state[i][env.dones_index] = env.partial_reset()
            state[i] = new_state[i]
        sma.update(r)
        gym_model.learn(episode=episode, step=step)
        gym_model.writer_summary(episode,
                                 reward_mean=r.mean(),
                                 reward_min=r.min(),
                                 reward_max=r.max(),
                                 step=last_done_step,
                                 **sma.rs)
        print('-' * 40)
        print(f'Episode: {episode:3d} | step: {step:4d} | last_done_step {last_done_step:4d} | rewards: {r}')
        if episode % save_frequency == 0:
            gym_model.save_checkpoint(episode)
        if eval_while_train and env.reward_threshold is not None:
            if r.max() >= env.reward_threshold:
                ave_r, ave_step = Loop.evaluate(env, gym_model, max_step,
                                                max_eval_episode)
                # IDIOM: `True if cond else False` is redundant — the comparison
                # already yields the boolean; bool() normalizes a possible
                # numpy scalar for printing.
                solved = bool(ave_r >= env.reward_threshold)
                print(f'-------------------------------------------Evaluate episode: {episode:3d}--------------------------------------------------')
                print(f'evaluate number: {max_eval_episode:3d} | average step: {ave_step} | average reward: {ave_r} | SOLVED: {solved}')
                print('----------------------------------------------------------------------------------------------------------------------------')