Example no. 1
import time
from model import DQN
from rewards import CustomReward

# CONFIG
wave = True  # True: train on the arlie LunarLander with CustomReward; False: use gym's LunarLander-v2
learn_timesteps = 1 * int(1e3)

if wave:
    import arlie

    env = arlie.make("LunarLander", render_mode=False, reward=CustomReward())
else:
    import gym

    env = gym.make("LunarLander-v2")

model = DQN(env)

print("Training...")
_t = time.time()
model.learn(total_timesteps=learn_timesteps)
t = time.time() - _t
str_t = time.strftime("%H h, %M m, %S s", time.gmtime(t))
print("Trained in {} during {} timesteps".format(str_t, learn_timesteps))

model.save("{}-trained-model".format("wave" if wave else "gym"))

env.close()
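
For completeness, here is a hedged sketch of how the saved model might be loaded and evaluated afterwards. `DQN.load` and `model.predict` are assumed method names on the project's own `DQN` class (they are not shown above) and may need adjusting; the episode loop uses the classic gym API that "LunarLander-v2" implies.

import gym
from model import DQN  # the project's own class; the load/predict API below is an assumption

env = gym.make("LunarLander-v2")
model = DQN.load("gym-trained-model", env=env)  # hypothetical signature, mirroring model.save() above

obs = env.reset()
done = False
total_reward = 0.0
while not done:
    action = model.predict(obs)  # hypothetical; some implementations return (action, state) instead
    obs, reward, done, info = env.step(action)
    total_reward += reward

print("Episode reward: {:.2f}".format(total_reward))
env.close()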
Example no. 2
            observation_next, reward, done, info = env.step(action)  # the environment executes the action and returns the next state and the resulting reward

            if use_my_reward:
                x, x_dot, theta, theta_dot = observation_next  # unpack the state into its four components
                # Scale the reward by how far x and theta are from their limits; gym's
                # built-in reward also works, but this is said to train more efficiently.
                reward1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
                reward2 = (env.theta_threshold_radians -
                           abs(theta)) / env.theta_threshold_radians - 0.5
                reward = reward1 + reward2  # combine the two shaped rewards

            if done:
                reward = 0

            RL.store_transition(observation, action, reward,
                                observation_next)  # store the transition in the replay memory first
            ep_r += reward  # track the episode return just to see whether the reward moves as expected, which helps verify the model

            # Why not store the failed cases as well?
            if steps > 500:  # don't learn at the start; accumulate experience in memory first
                RL.learn()
            if done:  # episode finished: print the accumulated reward so you can watch it change per episode and judge how good this sequence of actions was
                if ep_r > 400:
                    RENDER = True
                print('step:', steps, 'episode:', episode, 'ep_r:',
                      round(ep_r, 2), "RL's epsilon:", round(RL.epsilon, 3))
                break

            observation = observation_next  # move on to the next state
            steps += 1
    RL.plot_cost()  # plot the training cost curve once training is done
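
The snippet above starts in the middle of its training loop, so the environment and agent setup it relies on are missing. Below is a minimal sketch of that scaffolding, assuming the agent is a DQN-style class (imported here as `DeepQNetwork` from `RL_brain`, both hypothetical names) exposing the `choose_action`, `store_transition`, `learn`, `epsilon` and `plot_cost` members used above, and that the environment is CartPole, which is what `env.x_threshold` and `env.theta_threshold_radians` suggest.

import gym
from RL_brain import DeepQNetwork  # hypothetical module/class; any agent with the members used above fits

if __name__ == '__main__':
    env = gym.make('CartPole-v0').unwrapped  # unwrapped exposes x_threshold and theta_threshold_radians
    RL = DeepQNetwork(n_actions=env.action_space.n,
                      n_features=env.observation_space.shape[0])  # constructor arguments are assumptions

    RENDER = False
    use_my_reward = True
    steps = 0  # counted across episodes, matching the `if steps > 500` check above

    for episode in range(100):  # number of episodes is an assumption
        observation = env.reset()
        ep_r = 0
        while True:
            if RENDER:
                env.render()
            action = RL.choose_action(observation)  # assumed method name
            # ... env.step(action), reward shaping, store_transition, learn and
            # the episode bookkeeping continue exactly as in the loop body above ...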