env.viewer.window.on_key_press = key_press
env.viewer.window.on_key_release = key_release

a = np.array([0])
episode_rewards = []
steps = 0
while True:
    episode_reward = 0
    state = env.reset()
    # Downsample the rendered frame by keeping every 4th pixel.
    state_img = env.render(mode="rgb_array")[::4, ::4, :]
    while True:
        next_state, r, done, info = env.step(a[0])
        next_state_img = env.render(mode="rgb_array")[::4, ::4, :]
        episode_reward += r
        samples["state"].append(state)                   # state has shape (8,)
        samples["state_img"].append(state_img)           # state_img has shape (100, 150, 3)
        samples["action"].append(np.array(a))
        samples["next_state"].append(next_state)
        samples["next_state_img"].append(next_state_img)
        samples["reward"].append(r)
        samples["terminal"].append(done)
        state = next_state
        state_img = next_state_img
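# Sketch (assumption): this excerpt does not show how `samples` is initialized
# or persisted. A minimal approach, assuming a plain dict of lists and pickle
# for storage; the filename "samples.pkl" and helper name are illustrative only.
import pickle
from collections import defaultdict

samples = defaultdict(list)  # keys: state, state_img, action, next_state, ...

def save_samples(path="samples.pkl"):
    # Write the collected transitions to disk so they can be reused for
    # offline training without re-running the environment.
    with open(path, "wb") as f:
        pickle.dump(dict(samples), f)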
def plot_io_bounds(x, y, vx, vy, theta, omega, a, steps, discrete=True):
    import matplotlib.pyplot as plt

    statebox = [x, y, vx, vy, theta, omega]
    centerstate = [box[0] + .5 * (box[1] - box[0]) for box in statebox]
    envstate = [i for i in centerstate]

    # Zero-order hold on actions if needed
    if discrete and isinstance(a, int):
        a = a * np.ones(steps, dtype=np.int32)
    elif not discrete:
        a = [np.array(a) for i in range(steps)]

    # System-identified model trajectory
    centerstatehist = [centerstate]
    for i in range(steps):
        centerstate = lander_dynamics(*centerstate, a=a[i], discrete=discrete)
        centerstatehist.append(centerstate)

    # Actual OpenAI Gym model trajectory
    envstatehist = [envstate]
    if discrete:
        from lunar_lander import LunarLander
        env = LunarLander()
    else:
        from lunar_lander import LunarLanderContinuous
        env = LunarLanderContinuous()
    s = env.reset(envstate)
    for i in range(steps):
        s, _, _, _ = env.step(a[i])
        envstatehist.append(s[0:6])

    # Overapproximated trajectory
    stateboxhist = [statebox]
    for i in range(steps):
        statebox = lander_box_dynamics(*statebox, a=a[i], steps=1, discrete=discrete)
        stateboxhist.append(statebox)

    centerstatehist = np.array(centerstatehist)
    envstatehist = np.array(envstatehist)
    stateboxhist = np.array(stateboxhist)

    t = np.linspace(0, steps, steps + 1)
    fig, axs = plt.subplots(6, 1, figsize=(4, 9))
    # fig.set_size_inches(5, 7, forward=True)
    limits = [[-1, 1], [0, 1], [-1, 1], [-1, 1], [-np.pi / 3, np.pi / 3], [-.5, .5]]
    for i in range(6):
        # Shaded band: box overapproximation; red line: identified model; blue dots: gym model.
        axs[i].fill_between(t, stateboxhist[:, i, 0], stateboxhist[:, i, 1], alpha=0.3)
        axs[i].plot(centerstatehist[:, i], 'r')
        axs[i].plot(envstatehist[:, i], 'b.')
        axs[i].set_ylim(bottom=limits[i][0], top=limits[i][1])
        axs[i].set_yticks(np.linspace(limits[i][0], limits[i][1], 17), minor=True)
        axs[i].grid(which='minor', alpha=.4)

    axs[0].set_title('Action {0}'.format(a))
    plt.show()
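# Example call (assumption): each state argument is an interval [low, high], so
# the plot compares the identified model, the Gym model, and the box
# overapproximation starting from the same initial set. The specific bounds
# below are illustrative only.
if __name__ == "__main__":
    plot_io_bounds(x=[-.1, .1], y=[.4, .6], vx=[-.1, .1], vy=[-.1, .1],
                   theta=[-.05, .05], omega=[-.05, .05],
                   a=0, steps=20, discrete=True)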
    if key == KEY.E:
        _human_agent_action = ACTIONS.index('LEFTFIRE')

def key_release(key, mod):
    global _human_agent_action
    _human_agent_action = ACTIONS.index('NOOP')

env.render()
env.unwrapped.viewer.window.on_key_press = key_press
env.unwrapped.viewer.window.on_key_release = key_release

# create models
if RAM:
    # Take one random step to read off the RAM/state vector length.
    a = random.randrange(env.action_space.n)
    s, r, done, info = env.step(a)
    N_STATE = len(s)
    MODEL = LanderDQN if 'lunar' in opt.env else RamDQN
    policy_net = MODEL(N_STATE, N_ACTIONS).to(device)
    target_net = MODEL(N_STATE, N_ACTIONS).to(device)
else:
    MODEL = DDQN if opt.dueling else DQN
    policy_net = MODEL(n_actions=N_ACTIONS).to(device)
    target_net = MODEL(n_actions=N_ACTIONS).to(device)

# init target model
target_net.load_state_dict(policy_net.state_dict())

# setup optimizer
optimizer = optim.RMSprop(policy_net.parameters(), lr=LR)
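# Sketch (assumption): a typical epsilon-greedy action selection used with the
# policy_net/target_net pair set up above. The names select_action, eps_start,
# eps_end, and eps_decay are illustrative and not taken from this excerpt;
# torch is assumed to be imported alongside optim.
def select_action(state, step, eps_start=1.0, eps_end=0.05, eps_decay=10000):
    # Anneal epsilon from eps_start toward eps_end as training steps accumulate.
    eps = eps_end + (eps_start - eps_end) * np.exp(-step / eps_decay)
    if random.random() < eps:
        return random.randrange(N_ACTIONS)  # explore
    with torch.no_grad():
        # Exploit: greedy action w.r.t. the current policy network.
        return policy_net(state).argmax(dim=1).item()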
# agent = DQNAgent(state_size, action_size)
done = False
batch_size = 1

if 0:
    agent.load("model.dat")

for e in range(EPISODES):
    # agent.load("../../Downloads/model_900.h5")
    # agent.epsilon = 0.0
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    tot_rew = 0
    for time in range(300):
        # action = agent.act(state)
        action = 0
        next_state, reward, done, _ = env.step(action)
        tot_rew += reward
        if time < 100:
            save_state = next_state
        # reward = reward if not done else -10
        next_state = np.reshape(next_state, [1, state_size])
        # agent.remember(state, action, reward, next_state, done)
        state = next_state
        if e % 100 == 0:
            env.render()
            print(reward)
        if done:
            env.set_state(save_state)
            # print("episode: {}/{}, score: {}, time {}, e: {:.2}"
            #       .format(e, EPISODES, tot_rew/time, time, agent.epsilon))
            break