else:
    # One environment step: act with the current policy, store the transition,
    # and train the agent every "train_period" steps.
    stacked_states_copy = stacked_states.copy()
    action = agent.choose_action(stacked_states_copy)
    next_state, reward, done, _ = env.step(action)
    stacked_states = stack_states(stacked_states, next_state, False)
    reward = np.sign(reward)  # clip the reward to {-1, 0, +1}
    agent.store(stacked_states_copy, action, reward, stacked_states, done)
    episode_reward += reward
    state = next_state

    if step % params["train_period"] == 0:
        alpha_loss, q_loss, policy_loss = agent.train()

    if done:
        logger.off()
        logger.log(episode, episode_reward, alpha_loss, q_loss, policy_loss, step)
        episode += 1
        obs = env.reset()
        stacked_states = stack_states(stacked_states, obs, True)  # rebuild the frame stack for the new episode
        episode_reward = 0
        episode_loss = 0
        logger.on()

# After training, reload the saved weights and evaluate the learned policy.
logger.load_weights()
player = Play(env, agent, params)
player.evaluate()
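# --- Illustrative sketch (not part of the original script) ----------------------#
# Both training loops rely on a frame-stacking helper, stack_states(stack, frame,
# is_new_episode). Its real implementation is not shown here; the version below is
# only a minimal sketch, assuming preprocessed 84x84 grayscale frames and a stack
# depth of 4 (both assumptions).

import numpy as np

def stack_states(stacked_frames, frame, is_new_episode, stack_size=4):
    """Return an (84, 84, stack_size) array holding the most recent frames."""
    frame = np.asarray(frame, dtype=np.uint8)
    if is_new_episode:
        # At the start of an episode, fill the whole stack with the first frame.
        stacked_frames = np.repeat(frame[..., np.newaxis], stack_size, axis=-1)
    else:
        # Otherwise drop the oldest frame and append the newest one.
        stacked_frames = np.concatenate(
            (stacked_frames[..., 1:], frame[..., np.newaxis]), axis=-1
        )
    return stacked_frames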
action = agent.choose_action(stacked_states_copy)
next_state, reward, done, _ = env.step(action)
# Update the stacked frames with the new observation.
stacked_states = stack_states(stacked_states, next_state, False)
reward = np.clip(reward, -1.0, 1.0)
agent.store(stacked_states_copy, action, reward, stacked_states, done)
episode_reward += reward

# -------------------------------------------------------------------------------#
# -------------------------------------------------------------------------------#
# --------------------- Blank 2: fill in --------------------------------------- #
# -------------------------------------------------------------------------------#
# -------------------------------------------------------------------------------#
# 2: With multi-step returns, the target value can be estimated more accurately
#    early in training, which speeds up learning (see the sketch after this block).
if step % params["train_period"] == 0:
    # Anneal the importance-sampling exponent beta of prioritized replay towards 1.
    beta = min(1.0, params["beta"] + step * (1.0 - params["beta"]) / params["final_annealing_beta_steps"])
    loss += agent.train(beta)
    agent.soft_update_of_target_network()

if done:
    logger.off()
    logger.log(episode, episode_reward, loss, step, beta)
    episode += 1
    state = env.reset()
    stacked_states = stack_states(stacked_states, state, True)  # rebuild the frame stack for the new episode
    episode_reward = 0
    loss = 0  # reset the accumulated training loss for the next episode
    logger.on()
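# --- Illustrative sketch (not part of the original script) ----------------------#
# The multi-step return mentioned in blank 2 is usually built inside the replay
# memory: instead of storing (s_t, a_t, r_t, s_{t+1}), the buffer keeps the last n
# transitions and stores (s_t, a_t, R_t, s_{t+n}), where
#     R_t = r_t + gamma * r_{t+1} + ... + gamma**(n-1) * r_{t+n-1}.
# The class and parameter names below (NStepWrapper, n_step, gamma) are assumptions
# for illustration, not the actual agent.store() implementation.

from collections import deque

class NStepWrapper:
    def __init__(self, n_step=3, gamma=0.99):
        self.n_step = n_step
        self.gamma = gamma
        self.buffer = deque(maxlen=n_step)

    def push(self, state, action, reward, next_state, done):
        """Collect one transition; return an n-step transition once enough are buffered."""
        self.buffer.append((state, action, reward, next_state, done))
        if len(self.buffer) < self.n_step and not done:
            return None
        # Accumulate the discounted rewards of the buffered transitions,
        # truncating at a terminal step if the episode ended early.
        n_step_return = 0.0
        next_s, d = self.buffer[-1][3], self.buffer[-1][4]
        for i, (_, _, r, s_next, terminal) in enumerate(self.buffer):
            n_step_return += (self.gamma ** i) * r
            if terminal:
                next_s, d = s_next, True
                break
        first_state, first_action = self.buffer[0][0], self.buffer[0][1]
        return first_state, first_action, n_step_return, next_s, d

# The TD target then bootstraps from s_{t+n} with discount gamma**n_step
# (and no bootstrap term when d is True).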