            else:
                stacked_states_copy = stacked_states.copy()
                action = agent.choose_action(stacked_states_copy)
                next_state, reward, done, _ = env.step(action)
                stacked_states = stack_states(stacked_states, next_state,
                                              False)
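                # Clip the reward to its sign {-1, 0, +1}, a common Atari reward-clipping scheme.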
                reward = np.sign(reward)
                agent.store(stacked_states_copy, action, reward,
                            stacked_states, done)
                episode_reward += reward
                state = next_state

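                # Update the agent once every `train_period` environment steps.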
                if step % params["train_period"] == 0:
                    alpha_loss, q_loss, policy_loss = agent.train()

                if done:
                    logger.off()
                    logger.log(episode, episode_reward, alpha_loss, q_loss,
                               policy_loss, step)

                    episode += 1
                    obs = env.reset()
                    stacked_states = stack_states(stacked_states, obs, True)
                    episode_reward = 0
                    episode_loss = 0
                    logger.on()

    logger.load_weights()
    player = Play(env, agent, params)
    player.evaluate()
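
A note on the helper both examples rely on: stack_states keeps the last few preprocessed frames together as the agent's observation. Below is a minimal sketch of such a helper, assuming a fixed stack size and 2-D frames; only the call pattern stack_states(stacked, frame, is_new_episode) comes from the snippets themselves.

import numpy as np

STACK_SIZE = 4  # assumed number of stacked frames

def stack_states(stacked_states, frame, is_new_episode):
    # Return an array holding the STACK_SIZE most recent frames, newest last.
    frame = np.asarray(frame)
    if is_new_episode:
        # Start a fresh stack by repeating the first frame of the new episode.
        stacked_states = np.stack([frame] * STACK_SIZE, axis=-1)
    else:
        # Drop the oldest frame and append the newest one.
        stacked_states = np.concatenate(
            [stacked_states[..., 1:], frame[..., np.newaxis]], axis=-1)
    return stacked_states
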
Example 2
            action = agent.choose_action(stacked_states_copy)
            next_state, reward, done, _ = env.step(action)

            # Update the stacked states with the new frame
            stacked_states = stack_states(stacked_states, next_state, False)
            reward = np.clip(reward, -1.0, 1.0)
            agent.store(stacked_states_copy, action, reward, stacked_states, done)
            episode_reward += reward

            # ------------------------------------------------------------------------------- #
            # ---------------------             Blank #2           --------------------------- #
            # ------------------------------------------------------------------------------- #
            # 2: Use multi-step (n-step) returns so that the target value can be estimated
            #    more accurately early in training, which speeds up learning.
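            # Linearly anneal beta toward 1.0 so the importance-sampling correction
            # of prioritized replay becomes unbiased by the end of training.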
            if step % params["train_period"] == 0:
                beta = min(1.0, params["beta"] + step * (1.0 - params["beta"]) / params["final_annealing_beta_steps"])
                loss += agent.train(beta)
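            # Soft (Polyak) update of the target network toward the online network.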
            agent.soft_update_of_target_network()

            if done:
                logger.off()
                logger.log(episode, episode_reward, loss, step, beta)
                episode += 1
                state = env.reset()
                stacked_states = stack_states(stacked_states, state, True)
                episode_reward = 0
                loss = 0
                logger.on()
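
The comment marked "Blank #2" above refers to multi-step (n-step) returns. Below is a minimal sketch of how the next few rewards could be folded into a single replay-buffer transition; the function name, the transition layout, and the gamma value are assumptions, not part of the original code.

def n_step_transition(transitions, gamma=0.99):
    # transitions: a list of (state, action, reward, next_state, done) tuples,
    # oldest first, covering up to n consecutive environment steps.
    state, action = transitions[0][0], transitions[0][1]
    g, discount = 0.0, 1.0
    for _, _, reward, next_state, done in transitions:
        g += discount * reward   # G = r_t + gamma*r_{t+1} + gamma^2*r_{t+2} + ...
        discount *= gamma
        if done:
            break
    # The learner later bootstraps from next_state with weight `discount`,
    # e.g. for a DQN-style agent the target is G + discount * max_a Q_target(next_state, a).
    return state, action, g, next_state, done, discount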