# Assumed module-level import for these excerpts; SMA, arrprint and the other
# helpers referenced below come from the project's own utility modules.
import numpy as np


def gym_train(self):
    """
    Inputs:
        env:                gym environment
        gym_model:          algorithm model
        begin_episode:      initial episode
        save_frequency:     how often to save checkpoints
        max_step:           maximum number of steps in an episode
        max_episode:        maximum number of episodes in this training task
        render:             specify whether to render the env or not
        render_episode:     if 'render' is false, specify from which episode to render the env
        policy_mode:        'on-policy' or 'off-policy'
    """
    begin_episode = int(self.train_args['begin_episode'])
    render = bool(self.train_args['render'])
    render_episode = int(self.train_args.get('render_episode', 50000))
    save_frequency = int(self.train_args['save_frequency'])
    max_step = int(self.train_args['max_step'])
    max_episode = int(self.train_args['max_episode'])
    eval_while_train = bool(self.train_args['eval_while_train'])
    max_eval_episode = int(self.train_args.get('max_eval_episode'))
    off_policy_step_eval = bool(self.train_args['off_policy_step_eval'])
    off_policy_step_eval_num = int(self.train_args.get('off_policy_step_eval_num'))
    policy_mode = str(self.model_args['policy_mode'])
    moving_average_episode = int(self.train_args['moving_average_episode'])
    add_noise2buffer = bool(self.train_args['add_noise2buffer'])
    add_noise2buffer_episode_interval = int(self.train_args['add_noise2buffer_episode_interval'])
    add_noise2buffer_steps = int(self.train_args['add_noise2buffer_steps'])
    total_step_control = bool(self.train_args['total_step_control'])
    max_total_step = int(self.train_args['max_total_step'])
    if total_step_control:
        max_episode = max_total_step

    i, state, new_state = self.init_variables()
    sma = SMA(moving_average_episode)
    total_step = 0
    for episode in range(begin_episode, max_episode):
        state[i] = self.env.reset()
        dones_flag = np.full(self.env.n, False)
        step = 0
        r = np.zeros(self.env.n)
        last_done_step = -1
        while True:
            step += 1
            r_tem = np.zeros(self.env.n)
            if render or episode > render_episode:
                self.env.render()
            action = self.model.choose_action(s=state[0], visual_s=state[1])
            new_state[i], reward, done, info = self.env.step(action)
            unfinished_index = np.where(dones_flag == False)[0]
            dones_flag += done
            r_tem[unfinished_index] = reward[unfinished_index]
            r += r_tem
            self.model.store_data(
                s=state[0],
                visual_s=state[1],
                a=action,
                r=reward,
                s_=new_state[0],
                visual_s_=new_state[1],
                done=done
            )

            if policy_mode == 'off-policy':
                self.model.learn(episode=episode, step=1)
                if off_policy_step_eval:
                    self.gym_step_eval(total_step, self.model, off_policy_step_eval_num, max_step)
            total_step += 1
            if total_step_control and total_step > max_total_step:
                return

            if all(dones_flag):
                if last_done_step == -1:
                    last_done_step = step
                if policy_mode == 'off-policy':
                    break

            if step >= max_step:
                break

            if len(self.env.dones_index):    # check whether any sub-environment needs a partial reset
                new_state[i][self.env.dones_index] = self.env.partial_reset()
            state[i] = new_state[i]

        sma.update(r)
        if policy_mode == 'on-policy':
            self.model.learn(episode=episode, step=step)
        self.model.writer_summary(
            episode,
            reward_mean=r.mean(),
            reward_min=r.min(),
            reward_max=r.max(),
            step=last_done_step,
            **sma.rs
        )
        self.pwi('-' * 40)
        self.pwi(f'Episode: {episode:3d} | step: {step:4d} | last_done_step {last_done_step:4d} | rewards: {arrprint(r, 3)}')
        if episode % save_frequency == 0:
            self.model.save_checkpoint(episode)

        if add_noise2buffer and episode % add_noise2buffer_episode_interval == 0:
            self.gym_random_sample(steps=add_noise2buffer_steps)

        if eval_while_train and self.env.reward_threshold is not None:
            if r.max() >= self.env.reward_threshold:
                self.pwi(f'-------------------------------------------Evaluate episode: {episode:3d}--------------------------------------------------')
                self.gym_evaluate()
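
# The training loops in this file rely on a small moving-average helper. The real
# SMA class lives in the project's utility module; the sketch below is only a
# minimal reconstruction from how it is used here (`SMA(n)`, `sma.update(r)`, and
# `**sma.rs` unpacked into writer_summary). The stat key names are assumptions.
from collections import deque

import numpy as np


class SMA:
    """Minimal sketch: simple moving average over the last `n` episode rewards."""

    def __init__(self, n):
        self.data = deque(maxlen=n)  # keeps only the most recent n episode means
        self.rs = {}                 # summary stats, consumed as writer_summary(**sma.rs)

    def update(self, r):
        # r is the per-agent reward vector of one finished episode
        self.data.append(np.asarray(r).mean())
        arr = np.asarray(self.data)
        self.rs = {
            'sma_reward_mean': arr.mean(),   # assumed key names, for illustration only
            'sma_reward_min': arr.min(),
            'sma_reward_max': arr.max(),
        }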
def unity_train(self):
    """
    Train loop. Execute until the episode count reaches its maximum, or until 'ctrl+c' is pressed.
    Inputs:
        env:                    Environment for interaction.
        models:                 all models for this training task.
        save_frequency:         how often to save checkpoints.
        reset_config:           configuration for resetting the Unity environment.
        max_step:               maximum number of steps for an episode.
        sampler_manager:        sampler configuration parameters for 'reset_config'.
        resampling_interval:    how often to resample parameters for env reset.
    Variables:
        brain_names:    a list of brain names set in Unity.
        state:          a list of states for each brain; each item contains one state per agent controlled by that brain.
        visual_state:   a list of visual state information for each brain.
        action:         a list of actions for each brain.
        dones_flag:     a list of 'done' flags for each brain, used to judge whether the episode has finished for every agent.
        rewards:        records the rewards of the agents of each brain.
    """
    begin_episode = int(self.train_args['begin_episode'])
    save_frequency = int(self.train_args['save_frequency'])
    max_step = int(self.train_args['max_step'])
    max_episode = int(self.train_args['max_episode'])
    policy_mode = str(self.model_args['policy_mode'])
    moving_average_episode = int(self.train_args['moving_average_episode'])
    add_noise2buffer = bool(self.train_args['add_noise2buffer'])
    add_noise2buffer_episode_interval = int(self.train_args['add_noise2buffer_episode_interval'])
    add_noise2buffer_steps = int(self.train_args['add_noise2buffer_steps'])

    if self.use_GCN:
        adj, x, visual_state, action, dones_flag, rewards = zeros_initializer(self.env.brain_num, 6)
        sma = [SMA(moving_average_episode) for i in range(self.env.brain_num)]
        for episode in range(begin_episode, max_episode):
            ObsRewDone = self.env.reset()
            for i, (_adj, _x, _vs, _r, _d) in enumerate(ObsRewDone):
                dones_flag[i] = np.zeros(self.env.brain_agents[i])
                rewards[i] = np.zeros(self.env.brain_agents[i])
                adj[i] = _adj
                x[i] = _x
                visual_state[i] = _vs
            step = 0
            last_done_step = -1
            while True:
                step += 1
                for i in range(self.env.brain_num):
                    action[i] = self.models[i].choose_action(adj=adj[i], x=x[i], visual_s=visual_state[i])
                actions = {f'{brain_name}': action[i] for i, brain_name in enumerate(self.env.brain_names)}
                ObsRewDone = self.env.step(vector_action=actions)

                for i, (_adj, _x, _vs, _r, _d) in enumerate(ObsRewDone):
                    unfinished_index = np.where(dones_flag[i] == False)[0]
                    dones_flag[i] += _d
                    self.models[i].store_data_gcn(
                        adj=adj[i],
                        x=x[i],
                        visual_s=visual_state[i],
                        a=action[i],
                        r=_r,
                        adj_=_adj,
                        x_=_x,
                        visual_s_=_vs,
                        done=_d
                    )
                    rewards[i][unfinished_index] += _r[unfinished_index]
                    adj[i] = _adj
                    x[i] = _x
                    visual_state[i] = _vs
                    if policy_mode == 'off-policy':
                        self.models[i].learn(episode=episode, step=1)

                if all([all(dones_flag[i]) for i in range(self.env.brain_num)]):
                    if last_done_step == -1:
                        last_done_step = step
                    if policy_mode == 'off-policy':
                        break

                if step >= max_step:
                    break

            for i in range(self.env.brain_num):
                sma[i].update(rewards[i])
                if policy_mode == 'on-policy':
                    self.models[i].learn(episode=episode, step=step)
                self.models[i].writer_summary(
                    episode,
                    reward_mean=rewards[i].mean(),
                    reward_min=rewards[i].min(),
                    reward_max=rewards[i].max(),
                    step=last_done_step,
                    **sma[i].rs
                )
            self.pwi('-' * 40)
            self.pwi(f'episode {episode:3d} | step {step:4d} | last_done_step {last_done_step:4d}')
            for i in range(self.env.brain_num):
                self.pwi(f'brain {i:2d} reward: {arrprint(rewards[i], 3)}')
            if episode % save_frequency == 0:
                for i in range(self.env.brain_num):
                    self.models[i].save_checkpoint(episode)

            if add_noise2buffer and episode % add_noise2buffer_episode_interval == 0:
                self.unity_random_sample(steps=add_noise2buffer_steps)
    else:
        state, visual_state, action, dones_flag, rewards = zeros_initializer(self.env.brain_num, 5)
        sma = [SMA(moving_average_episode) for i in range(self.env.brain_num)]
        for episode in range(begin_episode, max_episode):
            ObsRewDone = self.env.reset()
            for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone):
                dones_flag[i] = np.zeros(self.env.brain_agents[i])
                rewards[i] = np.zeros(self.env.brain_agents[i])
                state[i] = _v
                visual_state[i] = _vs
            step = 0
            last_done_step = -1
            while True:
                step += 1
                for i in range(self.env.brain_num):
                    action[i] = self.models[i].choose_action(s=state[i], visual_s=visual_state[i])
                actions = {f'{brain_name}': action[i] for i, brain_name in enumerate(self.env.brain_names)}
                ObsRewDone = self.env.step(vector_action=actions)

                for i, (_v, _vs, _r, _d) in enumerate(ObsRewDone):
                    unfinished_index = np.where(dones_flag[i] == False)[0]
                    dones_flag[i] += _d
                    self.models[i].store_data(
                        s=state[i],
                        visual_s=visual_state[i],
                        a=action[i],
                        r=_r,
                        s_=_v,
                        visual_s_=_vs,
                        done=_d
                    )
                    rewards[i][unfinished_index] += _r[unfinished_index]
                    state[i] = _v
                    visual_state[i] = _vs
                    if policy_mode == 'off-policy':
                        self.models[i].learn(episode=episode, step=1)

                if all([all(dones_flag[i]) for i in range(self.env.brain_num)]):
                    if last_done_step == -1:
                        last_done_step = step
                    if policy_mode == 'off-policy':
                        break

                if step >= max_step:
                    break

            for i in range(self.env.brain_num):
                sma[i].update(rewards[i])
                if policy_mode == 'on-policy':
                    self.models[i].learn(episode=episode, step=step)
                self.models[i].writer_summary(
                    episode,
                    reward_mean=rewards[i].mean(),
                    reward_min=rewards[i].min(),
                    reward_max=rewards[i].max(),
                    step=last_done_step,
                    **sma[i].rs
                )
            self.pwi('-' * 40)
            self.pwi(f'episode {episode:3d} | step {step:4d} | last_done_step {last_done_step:4d}')
            for i in range(self.env.brain_num):
                self.pwi(f'brain {i:2d} reward: {arrprint(rewards[i], 3)}')
            if episode % save_frequency == 0:
                for i in range(self.env.brain_num):
                    self.models[i].save_checkpoint(episode)

            if add_noise2buffer and episode % add_noise2buffer_episode_interval == 0:
                self.unity_random_sample(steps=add_noise2buffer_steps)
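
# `zeros_initializer` is a project helper used throughout these loops. A plausible
# minimal sketch, reconstructed from its call sites: it must return `n_args`
# independent zero-filled lists of length `n`, one per unpacking target.
def zeros_initializer(n, n_args):
    """Sketch: return `n_args` separate lists, each containing `n` zeros."""
    return tuple([0] * n for _ in range(n_args))


# e.g. `state, visual_state, action, dones_flag, rewards = zeros_initializer(brain_num, 5)`
# yields five distinct lists, so writing to state[i] never aliases rewards[i].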
def unity_train(self):
    """
    Train loop. Execute until the episode count reaches its maximum, or until 'ctrl+c' is pressed.
    Inputs:
        env:                    Environment for interaction.
        models:                 all models for this training task.
        save_frequency:         how often to save checkpoints.
        reset_config:           configuration for resetting the Unity environment.
        max_step:               maximum number of steps for an episode.
        sampler_manager:        sampler configuration parameters for 'reset_config'.
        resampling_interval:    how often to resample parameters for env reset.
    Variables:
        brain_names:    a list of brain names set in Unity.
        state:          a list of states for each brain; each item contains one state per agent controlled by that brain.
        visual_state:   a list of visual state information for each brain.
        action:         a list of actions for each brain.
        dones_flag:     a list of 'done' flags for each brain, used to judge whether the episode has finished for every agent.
        agents_num:     records the number of agents for each brain.
        rewards:        records the rewards of the agents of each brain.
    """
    begin_episode = int(self.train_args['begin_episode'])
    save_frequency = int(self.train_args['save_frequency'])
    max_step = int(self.train_args['max_step'])
    max_episode = int(self.train_args['max_episode'])
    policy_mode = str(self.model_args['policy_mode'])

    brains_num = len(self.env.brain_names)
    state = [0] * brains_num
    visual_state = [0] * brains_num
    action = [0] * brains_num
    dones_flag = [0] * brains_num
    agents_num = [0] * brains_num
    rewards = [0] * brains_num
    sma = [SMA(100) for i in range(brains_num)]

    for episode in range(begin_episode, max_episode):
        obs = self.env.reset()
        for i, brain_name in enumerate(self.env.brain_names):
            agents_num[i] = len(obs[brain_name].agents)
            dones_flag[i] = np.zeros(agents_num[i])
            rewards[i] = np.zeros(agents_num[i])
        step = 0
        last_done_step = -1
        while True:
            step += 1
            for i, brain_name in enumerate(self.env.brain_names):
                state[i] = obs[brain_name].vector_observations
                visual_state[i] = self.get_visual_input(agents_num[i], self.models[i].visual_sources, obs[brain_name])
                action[i] = self.models[i].choose_action(s=state[i], visual_s=visual_state[i])
            actions = {f'{brain_name}': action[i] for i, brain_name in enumerate(self.env.brain_names)}
            obs = self.env.step(vector_action=actions)

            for i, brain_name in enumerate(self.env.brain_names):
                unfinished_index = np.where(dones_flag[i] == False)[0]
                dones_flag[i] += obs[brain_name].local_done
                next_state = obs[brain_name].vector_observations
                next_visual_state = self.get_visual_input(agents_num[i], self.models[i].visual_sources, obs[brain_name])
                self.models[i].store_data(
                    s=state[i],
                    visual_s=visual_state[i],
                    a=action[i],
                    r=np.asarray(obs[brain_name].rewards),
                    s_=next_state,
                    visual_s_=next_visual_state,
                    done=np.asarray(obs[brain_name].local_done)
                )
                rewards[i][unfinished_index] += np.asarray(obs[brain_name].rewards)[unfinished_index]
                if policy_mode == 'off-policy':
                    self.models[i].learn(episode=episode, step=1)

            if all([all(dones_flag[i]) for i in range(brains_num)]):
                if last_done_step == -1:
                    last_done_step = step
                if policy_mode == 'off-policy':
                    break

            if step >= max_step:
                break

        for i in range(brains_num):
            sma[i].update(rewards[i])
            if policy_mode == 'on-policy':
                self.models[i].learn(episode=episode, step=step)
            self.models[i].writer_summary(
                episode,
                reward_mean=rewards[i].mean(),
                reward_min=rewards[i].min(),
                reward_max=rewards[i].max(),
                step=last_done_step,
                **sma[i].rs
            )
        self.pwi('-' * 40)
        self.pwi(f'episode {episode:3d} | step {step:4d} | last_done_step {last_done_step:4d}')
        for i in range(brains_num):
            self.pwi(f'brain {i:2d} reward: {arrprint(rewards[i], 3)}')
        if episode % save_frequency == 0:
            for i in range(brains_num):
                self.models[i].save_checkpoint(episode)
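
# `arrprint` formats a per-agent reward vector for the log lines above. Its real
# implementation lives in the project's utilities; this is a minimal sketch that
# matches the call signature `arrprint(arr, n)` seen throughout these loops.
import numpy as np


def arrprint(arr, n):
    """Sketch: render a 1-D array as comma-separated values with `n` decimals."""
    return ', '.join(f'{x:.{n}f}' for x in np.asarray(arr).ravel())


# e.g. arrprint(np.array([1.0, 2.3456]), 2) -> '1.00, 2.35'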
def unity_train(env, models, print_func, begin_train_step, begin_frame_step,
                begin_episode, save_frequency, max_step_per_episode, max_train_episode,
                policy_mode, moving_average_episode, add_noise2buffer,
                add_noise2buffer_episode_interval, add_noise2buffer_steps,
                max_train_step, max_frame_step, real_done, off_policy_train_interval):
    """
    TODO: Annotation
    Train loop. Execute until the episode count reaches its maximum, or until 'ctrl+c' is pressed.
    Inputs:
        env:                    Environment for interaction.
        models:                 all models for this training task.
        save_frequency:         how often to save checkpoints.
        reset_config:           configuration for resetting the Unity environment.
        max_step_per_episode:   maximum number of steps for an episode.
        sampler_manager:        sampler configuration parameters for 'reset_config'.
        resampling_interval:    how often to resample parameters for env reset.
    Variables:
        brain_names:    a list of brain names set in Unity.
        state:          a list of states for each brain; each item contains one state per agent controlled by that brain.
        visual_state:   a list of visual state information for each brain.
        action:         a list of actions for each brain.
        dones_flag:     a list of 'done' flags for each brain, used to judge whether the episode has finished for every agent.
        rewards:        records the rewards of the agents of each brain.
    """
    state, visual_state, action, dones_flag, rewards = zeros_initializer(env.brain_num, 5)
    sma = [SMA(moving_average_episode) for i in range(env.brain_num)]
    frame_step = begin_frame_step
    min_of_all_agents = min(env.brain_agents)
    train_step = [begin_train_step for _ in range(env.brain_num)]

    for episode in range(begin_episode, max_train_episode):
        [model.reset() for model in models]
        ObsRewDone = env.reset()
        for i, (_v, _vs, _r, _d, _info) in enumerate(ObsRewDone):
            dones_flag[i] = np.zeros(env.brain_agents[i])
            rewards[i] = np.zeros(env.brain_agents[i])
            state[i] = _v
            visual_state[i] = _vs
        step = 0
        last_done_step = -1
        while True:
            step += 1
            for i in range(env.brain_num):
                action[i] = models[i].choose_action(s=state[i], visual_s=visual_state[i])
            actions = {f'{brain_name}': action[i] for i, brain_name in enumerate(env.brain_names)}
            ObsRewDone = env.step(actions)

            for i, (_v, _vs, _r, _d, _info) in enumerate(ObsRewDone):
                unfinished_index = np.where(dones_flag[i] == False)[0]
                dones_flag[i] += _d
                models[i].store_data(
                    s=state[i],
                    visual_s=visual_state[i],
                    a=action[i],
                    r=_r,
                    s_=_v,
                    visual_s_=_vs,
                    done=_info['real_done'] if real_done else _d
                )
                models[i].partial_reset(_d)
                rewards[i][unfinished_index] += _r[unfinished_index]
                state[i] = _v
                visual_state[i] = _vs
                if policy_mode == 'off-policy':
                    if train_step[i] % off_policy_train_interval == 0:
                        models[i].learn(episode=episode, train_step=train_step[i])
                    train_step[i] += 1
                    if train_step[i] % save_frequency == 0:
                        models[i].save_checkpoint(train_step=train_step[i], episode=episode, frame_step=frame_step)

            frame_step += min_of_all_agents
            if 0 < max_train_step < min(train_step) or 0 < max_frame_step < frame_step:
                for i in range(env.brain_num):
                    models[i].save_checkpoint(train_step=train_step[i], episode=episode, frame_step=frame_step)
                logger.info(f'End Training, learn step: {train_step}, frame_step: {frame_step}')
                return

            if all([all(dones_flag[i]) for i in range(env.brain_num)]):
                if last_done_step == -1:
                    last_done_step = step
                if policy_mode == 'off-policy':
                    break

            if step >= max_step_per_episode:
                break

        for i in range(env.brain_num):
            sma[i].update(rewards[i])
            if policy_mode == 'on-policy':
                models[i].learn(episode=episode, train_step=train_step[i])
                train_step[i] += 1
                if train_step[i] % save_frequency == 0:
                    models[i].save_checkpoint(train_step=train_step[i], episode=episode, frame_step=frame_step)
            models[i].writer_summary(
                episode,
                reward_mean=rewards[i].mean(),
                reward_min=rewards[i].min(),
                reward_max=rewards[i].max(),
                step=last_done_step,
                **sma[i].rs
            )
        print_func('-' * 40, out_time=True)
        print_func(f'episode {episode:3d} | step {step:4d} | last_done_step {last_done_step:4d}')
        for i, bn in enumerate(env.brain_names):
            print_func(f'{bn} reward: {arrprint(rewards[i], 2)}')

        if add_noise2buffer and episode % add_noise2buffer_episode_interval == 0:
            unity_no_op(env, models, print_func=print_func, pre_fill_steps=add_noise2buffer_steps,
                        prefill_choose=False, real_done=real_done, desc='adding noise')
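
# Note on the early-stop test above: `0 < max_train_step < min(train_step)` is a
# Python chained comparison, so it is True only when a positive limit is configured
# AND every brain's train_step has passed it; setting the limit to 0 disables it.
# A standalone check of that behaviour:
max_train_step = 10000
train_step = [10100, 9990]                            # one brain still below the limit
assert not (0 < max_train_step < min(train_step))     # keep training
train_step = [10100, 10050]                           # every brain past the limit
assert 0 < max_train_step < min(train_step)           # stop and save checkpoints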
def gym_train(env, model, print_func, begin_episode, render, render_episode,
              save_frequency, max_step, max_episode, eval_while_train, max_eval_episode,
              off_policy_step_eval, off_policy_step_eval_num, policy_mode,
              moving_average_episode, add_noise2buffer, add_noise2buffer_episode_interval,
              add_noise2buffer_steps, total_step_control, eval_interval, max_total_step):
    """
    TODO: Annotation
    """
    if total_step_control:
        max_episode = max_total_step

    i, state, new_state = init_variables(env)
    sma = SMA(moving_average_episode)
    total_step = 0
    for episode in range(begin_episode, max_episode):
        model.reset()
        state[i] = env.reset()
        dones_flag = np.full(env.n, False)
        step = 0
        r = np.zeros(env.n)
        last_done_step = -1
        while True:
            step += 1
            r_tem = np.zeros(env.n)
            if render or episode > render_episode:
                env.render(record=False)
            action = model.choose_action(s=state[0], visual_s=state[1])
            new_state[i], reward, done, info, correct_new_state = env.step(action)
            unfinished_index = np.where(dones_flag == False)[0]
            dones_flag += done
            r_tem[unfinished_index] = reward[unfinished_index]
            r += r_tem
            model.store_data(
                s=state[0],
                visual_s=state[1],
                a=action,
                r=reward,
                s_=new_state[0],
                visual_s_=new_state[1],
                done=done
            )
            model.partial_reset(done)
            state[i] = correct_new_state

            if policy_mode == 'off-policy':
                model.learn(episode=episode, step=1)
                if off_policy_step_eval and total_step % eval_interval == 0:
                    gym_step_eval(env.eval_env, total_step, model, off_policy_step_eval_num, max_step)
            total_step += 1
            if total_step_control and total_step > max_total_step:
                return

            if all(dones_flag):
                if last_done_step == -1:
                    last_done_step = step
                if policy_mode == 'off-policy':
                    break

            if step >= max_step:
                break

        sma.update(r)
        if policy_mode == 'on-policy':
            model.learn(episode=episode, step=step)
        model.writer_summary(
            episode,
            reward_mean=r.mean(),
            reward_min=r.min(),
            reward_max=r.max(),
            step=last_done_step,
            **sma.rs
        )
        print_func('-' * 40, out_time=True)
        print_func(f'Episode: {episode:3d} | step: {step:4d} | last_done_step {last_done_step:4d} | rewards: {arrprint(r, 3)}')
        if episode % save_frequency == 0:
            model.save_checkpoint(episode)

        if add_noise2buffer and episode % add_noise2buffer_episode_interval == 0:
            gym_random_sample(env, steps=add_noise2buffer_steps, print_func=print_func)

        if eval_while_train and env.reward_threshold is not None:
            if r.max() >= env.reward_threshold:
                print_func(f'-------------------------------------------Evaluate episode: {episode:3d}--------------------------------------------------')
                gym_evaluate(env, model, max_step, max_eval_episode, print_func)
def gym_train(env, model, print_func, begin_train_step, begin_frame_step, begin_episode,
              render, render_episode, save_frequency, max_step_per_episode, max_train_episode,
              eval_while_train, max_eval_episode, off_policy_step_eval_episodes,
              off_policy_train_interval, policy_mode, moving_average_episode, add_noise2buffer,
              add_noise2buffer_episode_interval, add_noise2buffer_steps,
              off_policy_eval_interval, max_train_step, max_frame_step):
    """
    TODO: Annotation
    """
    i, state, new_state = init_variables(env)
    sma = SMA(moving_average_episode)
    frame_step = begin_frame_step
    train_step = begin_train_step
    total_step = 0

    for episode in range(begin_episode, max_train_episode):
        model.reset()
        state[i] = env.reset()
        dones_flag = np.full(env.n, False)
        step = 0
        r = np.zeros(env.n)
        last_done_step = -1
        while True:
            step += 1
            if render or episode > render_episode:
                env.render(record=False)
            action = model.choose_action(s=state[0], visual_s=state[1])
            new_state[i], reward, done, info, correct_new_state = env.step(action)
            unfinished_index = np.where(dones_flag == False)[0]
            dones_flag += done
            r[unfinished_index] += reward[unfinished_index]
            model.store_data(
                s=state[0],
                visual_s=state[1],
                a=action,
                r=reward,
                s_=new_state[0],
                visual_s_=new_state[1],
                done=done
            )
            model.partial_reset(done)
            state[i] = correct_new_state

            if policy_mode == 'off-policy':
                if total_step % off_policy_train_interval == 0:
                    model.learn(episode=episode, train_step=train_step)
                    train_step += 1
                    if train_step % save_frequency == 0:
                        model.save_checkpoint(train_step=train_step, episode=episode, frame_step=frame_step)
                    if off_policy_eval_interval > 0 and train_step % off_policy_eval_interval == 0:
                        gym_step_eval(deepcopy(env), train_step, model, off_policy_step_eval_episodes, max_step_per_episode)
            frame_step += env.n
            total_step += 1
            if 0 < max_train_step <= train_step or 0 < max_frame_step <= frame_step:
                model.save_checkpoint(train_step=train_step, episode=episode, frame_step=frame_step)
                logger.info(f'End Training, learn step: {train_step}, frame_step: {frame_step}')
                return

            if all(dones_flag):
                if last_done_step == -1:
                    last_done_step = step
                if policy_mode == 'off-policy':
                    break

            if step >= max_step_per_episode:
                break

        sma.update(r)
        if policy_mode == 'on-policy':
            model.learn(episode=episode, train_step=train_step)
            train_step += 1
            if train_step % save_frequency == 0:
                model.save_checkpoint(train_step=train_step, episode=episode, frame_step=frame_step)
        model.writer_summary(
            episode,
            reward_mean=r.mean(),
            reward_min=r.min(),
            reward_max=r.max(),
            step=last_done_step,
            **sma.rs
        )
        print_func('-' * 40, out_time=True)
        print_func(f'Episode: {episode:3d} | step: {step:4d} | last_done_step {last_done_step:4d} | rewards: {arrprint(r, 2)}')

        if add_noise2buffer and episode % add_noise2buffer_episode_interval == 0:
            gym_no_op(env, model, pre_fill_steps=add_noise2buffer_steps, print_func=print_func,
                      prefill_choose=False, desc='adding noise')

        if eval_while_train and env.reward_threshold is not None:
            if r.max() >= env.reward_threshold:
                print_func(f'-------------------------------------------Evaluate episode: {episode:3d}--------------------------------------------------')
                gym_evaluate(env, model, max_step_per_episode, max_eval_episode, print_func)
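
# `init_variables` is a project helper used by the gym loops. Reconstructed from
# its call sites, it must return an index `i` selecting the active observation
# slot (0 for vector input, 1 for visual input) plus two [vector_obs, visual_obs]
# buffers, one entry per parallel env. The `obs_type` attribute on the env wrapper
# is an assumption made purely for this sketch.
import numpy as np


def init_variables(env):
    """Sketch: (active slot index, state buffer, next-state buffer)."""
    i = 1 if getattr(env, 'obs_type', 'vector') == 'visual' else 0   # assumed attribute
    # one placeholder row per parallel env; the unused slot stays an empty array
    state = [np.full((env.n, 0), 0.0), np.full((env.n, 0), 0.0)]
    new_state = [np.full((env.n, 0), 0.0), np.full((env.n, 0), 0.0)]
    return i, state, new_state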
def train(env, gym_model, begin_episode, save_frequency, max_step, max_episode,
          eval_while_train, max_eval_episode, render, render_episode, policy_mode):
    """
    Inputs:
        env:                gym environment
        gym_model:          algorithm model
        begin_episode:      initial episode
        save_frequency:     how often to save checkpoints
        max_step:           maximum number of steps in an episode
        max_episode:        maximum number of episodes in this training task
        render:             specify whether to render the env or not
        render_episode:     if 'render' is false, specify from which episode to render the env
        policy_mode:        'on-policy' or 'off-policy'
    """
    i, state, new_state = init_variables(env)
    sma = SMA(100)
    for episode in range(begin_episode, max_episode):
        state[i] = env.reset()
        dones_flag = np.full(env.n, False)
        step = 0
        r = np.zeros(env.n)
        last_done_step = -1
        while True:
            step += 1
            r_tem = np.zeros(env.n)
            if render or episode > render_episode:
                env.render()
            action = gym_model.choose_action(s=state[0], visual_s=state[1])
            new_state[i], reward, done, info = env.step(action)
            unfinished_index = np.where(dones_flag == False)[0]
            dones_flag += done
            r_tem[unfinished_index] = reward[unfinished_index]
            r += r_tem
            gym_model.store_data(
                s=state[0],
                visual_s=state[1],
                a=action,
                r=reward,
                s_=new_state[0],
                visual_s_=new_state[1],
                done=done
            )

            if all(dones_flag):
                if last_done_step == -1:
                    last_done_step = step
                if policy_mode == 'off-policy':
                    break

            if step >= max_step:
                break

            if len(env.dones_index):    # check whether any sub-environment needs a partial reset
                new_state[i][env.dones_index] = env.partial_reset()
            state[i] = new_state[i]

        sma.update(r)
        gym_model.learn(episode=episode, step=step)
        gym_model.writer_summary(
            episode,
            reward_mean=r.mean(),
            reward_min=r.min(),
            reward_max=r.max(),
            step=last_done_step,
            **sma.rs
        )
        print('-' * 40)
        print(f'Episode: {episode:3d} | step: {step:4d} | last_done_step {last_done_step:4d} | rewards: {r}')
        if episode % save_frequency == 0:
            gym_model.save_checkpoint(episode)

        if eval_while_train and env.reward_threshold is not None:
            if r.max() >= env.reward_threshold:
                ave_r, ave_step = Loop.evaluate(env, gym_model, max_step, max_eval_episode)
                solved = ave_r >= env.reward_threshold
                print(f'-------------------------------------------Evaluate episode: {episode:3d}--------------------------------------------------')
                print(f'evaluate number: {max_eval_episode:3d} | average step: {ave_step} | average reward: {ave_r} | SOLVED: {solved}')
                print('----------------------------------------------------------------------------------------------------------------------------')
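
# A minimal sketch of how this earliest `train` variant might be invoked.
# `make_env` and `build_model` are hypothetical factories standing in for the
# project's real environment/model construction; they are not part of the source.
if __name__ == '__main__':
    env = make_env('CartPole-v0', n=4)    # hypothetical: vectorized gym wrapper with `n` copies
    gym_model = build_model(env)          # hypothetical: returns an algorithm model
    train(env, gym_model,
          begin_episode=0,
          save_frequency=10,
          max_step=200,
          max_episode=5000,
          eval_while_train=True,
          max_eval_episode=100,
          render=False,
          render_episode=4000,
          policy_mode='off-policy')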