# ----- Per-step interaction + post-episode training (DDPG, TF eager) -----
# NOTE(review): this is a fragment of a larger training script. Names such as
# `env`, `agent`, `replay_buffer`, `global_timestep`, `state`, `episode_len`,
# `total_reward`, `start`, `i`, and `soft_target_model_update_eager` are
# defined outside the visible chunk.

# Warm-up phase: sample random actions until `learning_start` timesteps have
# elapsed, then switch to the (deterministic + exploration-noise) policy.
if global_timestep.numpy() < agent.params.learning_start:
    action = env.action_space.sample()
else:
    action = agent.predict(state)

# scale for execution in env (in DDPG, every action is clipped between
# [-1, 1] in agent.predict), so multiplying by the action-space high bound
# maps it back to the environment's native range.
next_state, reward, done, info = env.step(action * env.action_space.high)
replay_buffer.add(state, action, reward, next_state, done)

# Step bookkeeping.
global_timestep.assign_add(1)
episode_len += 1
total_reward += reward
state = next_state

# for evaluation purpose: flag the agent for evaluation every
# `eval_interval` global timesteps.
if global_timestep.numpy() % agent.params.eval_interval == 0:
    agent.eval_flg = True

"""
===== After 1 Episode is Done =====
"""

# Train the model once per environment step taken this episode.
for t_train in range(episode_len):  # in mujoco, this will be 1,000 iterations!
    states, actions, rewards, next_states, dones = replay_buffer.sample(
        agent.params.batch_size)
    loss = agent.update(states, actions, rewards, next_states, dones)
    # Polyak-average the online networks into the targets.
    soft_target_model_update_eager(
        agent.target_actor, agent.actor, tau=agent.params.soft_update_tau)
    soft_target_model_update_eager(
        agent.target_critic, agent.critic, tau=agent.params.soft_update_tau)

# Logging on TensorBoard (TF 1.x contrib summary API), keyed by episode index.
tf.contrib.summary.scalar("reward", total_reward, step=i)
tf.contrib.summary.scalar("exec time", time.time() - start, step=i)

if i >= agent.params.reward_buffer_ep:
    # NOTE(review): the body of this conditional is truncated in the visible
    # source chunk — it continues beyond this view. `pass` is a placeholder
    # to keep the fragment syntactically valid; restore the original body.
    pass
# NOTE(review): this chunk begins mid-statement — the opening
# `soft_target_model_update_eager(` of the first call is truncated in the
# visible source and has been reconstructed from the identical call pattern
# immediately following it; confirm against the full file.
soft_target_model_update_eager(
    agent.target_actor, agent.actor, tau=agent.params.soft_update_tau)
soft_target_model_update_eager(
    agent.target_critic, agent.critic, tau=agent.params.soft_update_tau)

# Step bookkeeping.
global_timestep.assign_add(1)
episode_len += 1
total_reward += reward
state = next_state

# for evaluation purpose: flag the agent for evaluation every
# `eval_interval` global timesteps.
if global_timestep.numpy() % agent.params.eval_interval == 0:
    agent.eval_flg = True

"""
===== After 1 Episode is Done =====
"""

# save the updated models (TF checkpoint managers).
agent.actor_manager.save()
agent.critic_manager.save()

# store the episode related variables
reward_buffer.append(total_reward)
time_buffer.append(time.time() - start)

# logging on Tensorboard (TF 1.x contrib summary API), keyed by the global
# timestep rather than the episode index.
tf.contrib.summary.scalar("reward", total_reward, step=global_timestep.numpy())