import numpy as np
# CartPoleEnv is assumed to be gym's classic-control CartPole implementation
# (e.g. from gym.envs.classic_control import CartPoleEnv).


def go2():
    env = CartPoleEnv()
    episode_step_counter = 0
    for i_episode in range(10000):
        observation = env.reset()
        step_counter = 0
        while True:
            env.render()
            # Randomly choose an action
            action = env.action_space.sample()
            # Take the action and receive the reward from the environment
            observation_, reward, done, info = env.step(action)
            x, x_dot, theta, theta_dot = observation_
            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
            r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
            reward = r1 + r2
            print(reward)
            step_counter = step_counter + 1
            if done:
                episode_step_counter += step_counter
                # print("Episode {}: lasted {} steps".format(i_episode, step_counter))
                print("Average steps: {}".format(episode_step_counter / (i_episode + 1)))
                break
    env.close()
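# The same shaped reward (r1 + r2) is recomputed in every function in this
# file. A refactoring sketch, not part of the original code, that factors the
# shaping out so its intent is explicit: penalize distance from the track
# center and deviation from the upright pole.
def shaped_reward(env, observation):
    x, x_dot, theta, theta_dot = observation
    # Near the center -> r1 close to 0.2; at the position threshold -> -0.8
    r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
    # Near upright -> r2 close to 0.5; at the angle threshold -> -0.5
    r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
    return r1 + r2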
def save():
    env = CartPoleEnv()
    total_steps = 0
    memory = []
    memory_counter = 0
    for i_episode in range(100):
        observation = env.reset()
        while True:
            env.render()
            action = env.action_space.sample()
            observation_, reward, done, info = env.step(action)
            x, x_dot, theta, theta_dot = observation_
            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
            r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
            reward = r1 + r2
            transition = np.hstack((observation, [action, reward], observation_))
            memory.append(transition)
            if done:
                break
            observation = observation_
            total_steps += 1
    memory = np.array(memory)
    np.save("memory.npy", memory)
    env.close()
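# save() writes each transition as np.hstack((obs, [action, reward], obs_)),
# i.e. 4 + 2 + 4 = 10 columns per row for CartPole's 4-dimensional state.
# A minimal sketch (not part of the original code) of splitting the saved
# replay buffer back into its components:
def load_memory(path="memory.npy", obs_dim=4):
    memory = np.load(path)
    states = memory[:, :obs_dim]
    actions = memory[:, obs_dim].astype(int)
    rewards = memory[:, obs_dim + 1]
    next_states = memory[:, -obs_dim:]
    return states, actions, rewards, next_states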
def go():
    env = CartPoleEnv()
    total_steps = 0
    memory = []
    model = create_model()
    epsilon = 0.9
    memory_counter = 1000
    for i_episode in range(1000):
        observation = env.reset()
        ep_r = 0
        while True:
            env.render()
            if np.random.uniform() < epsilon:
                actions_value = model.predict(np.array([observation]))
                action = np.argmax(actions_value)
            else:
                action = env.action_space.sample()
            observation_, reward, done, info = env.step(action)
            x, x_dot, theta, theta_dot = observation_
            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
            r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
            reward = r1 + r2
            transition = np.hstack((observation, [action, reward], observation_))
            memory.append(transition)
            if len(memory) > memory_counter:
                xx, yy = get_data(np.array(memory), model)
                print(xx.shape)
                model.fit(xx, yy, epochs=10)
                epsilon = epsilon + 0.00001
                memory = []
                # memory_counter = memory_counter + 5
            ep_r = ep_r + reward
            if done:
                # print(ep_r)
                break
            observation = observation_
            total_steps += 1
    model.save("logs/cp.h5")
    model.summary()
    env.close()
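# create_model() and get_data() are referenced above but not defined in this
# section. A minimal sketch, assuming a small Keras MLP over the 4-dim CartPole
# state and DQN-style targets built from the hstack'ed transition layout used
# above; the gamma value and layer sizes are assumptions, and terminal
# transitions are not masked in this simplified version.
import tensorflow as tf


def create_model(obs_dim=4, n_actions=2):
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(32, activation="relu", input_shape=(obs_dim,)),
        tf.keras.layers.Dense(32, activation="relu"),
        tf.keras.layers.Dense(n_actions, activation="linear"),
    ])
    model.compile(optimizer="adam", loss="mse")
    return model


def get_data(memory, model, obs_dim=4, gamma=0.9):
    states = memory[:, :obs_dim]
    actions = memory[:, obs_dim].astype(int)
    rewards = memory[:, obs_dim + 1]
    next_states = memory[:, -obs_dim:]
    q_values = model.predict(states)      # current Q(s, .) used as regression base
    q_next = model.predict(next_states)   # Q(s', .) for the bootstrap target
    targets = rewards + gamma * np.max(q_next, axis=1)
    # Only the taken action's output is replaced; the others keep their value
    q_values[np.arange(len(memory)), actions] = targets
    return states, q_values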
def get():
    env = CartPoleEnv()
    # NOTE: `model` and `chose_action` are assumed to be defined elsewhere.
    for i_episode in range(10000):
        observation = env.reset()
        action = chose_action(model=model)
        while True:
            observation_, reward, done, info = env.step(action)
            x, x_dot, theta, theta_dot = observation_
            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
            r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
            reward = r1 + r2
            transition = np.hstack((observation, [action, reward], observation_))
            print(transition)
            if done:
                break
            observation = observation_
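# chose_action() is not defined in this section, and get() above calls it with
# only the model. A hypothetical greedy-policy sketch; a state argument is
# added here (an assumption) so the choice can depend on the current observation:
def chose_action(model, observation):
    actions_value = model.predict(np.array([observation]))
    return int(np.argmax(actions_value))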
def test_discrete_vectorized_original_equality(self):
    venv = DiscreteVectorizedCartPoleEnv()
    state, action = self.state_action
    action = (action > 0).astype(int)
    dim1, dim2 = self.dims
    venv.state = state
    vobs, vreward, vdone, _ = venv.step(action)
    env = CartPoleEnv()
    for i in range(dim1):
        for j in range(dim2):
            env.reset()
            env.state = state[i, j]
            obs, reward, done, _ = env.step(int(action[i, j, 0]))
            np.testing.assert_allclose(obs, vobs[i, j])
            np.testing.assert_allclose(reward, vreward[i, j])
            np.testing.assert_allclose(done, vdone[i, j])
class CartPoleDictEnvWrapper(gym.Env):
    def __init__(self, max_angle=12, max_num_steps=1000):
        self.env = CartPoleEnv()
        # self.env.theta_threshold_radians = max_angle * 2 * math.pi / 360
        self.observation_space = self.env.observation_space
        self.action_space = self.env.action_space
        self.step_counter = 0
        self.max_num_steps = max_num_steps

    def step(self, action):
        if isinstance(action, numpy.ndarray):
            action = action[0]
        assert isinstance(action, numpy.int64)
        obs, _, done, _ = self.env.step(action)
        self.step_counter += 1
        if self.step_counter % self.max_num_steps == 0:
            done = True
        if done:
            reward = -10.0
            obs = self.env.reset()
        else:
            reward = 0.0
        return {"observation": obs, "reward": reward, "done": int(done)}

    def reset(self):
        obs = self.env.reset()
        return {"observation": obs, "reward": 0.0, "done": int(False)}

    def render(self, mode="human"):
        return self.env.render(mode)

    def close(self):
        self.env.close()

    def seed(self, seed=None):
        return self.env.seed(seed)
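# A short usage sketch (not from the original code) of the dict-style interface
# the wrapper exposes: each call returns observation, reward and done in one
# dictionary, and the underlying env auto-resets when an episode ends.
wrapped = CartPoleDictEnvWrapper(max_num_steps=200)
out = wrapped.reset()
for _ in range(500):
    action = numpy.int64(wrapped.action_space.sample())  # step() asserts numpy.int64
    out = wrapped.step(action)
    if out["done"]:
        print("episode ended, terminal reward:", out["reward"])
wrapped.close()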
if __name__ == '__main__':
    env = CartPoleEnv()
    for i_episode in range(20):
        observation = env.reset()
        for t in range(100):
            env.render()
            action = env.action_space.sample()
            observation_, reward, done, info = env.step(action)
            x, x_dot, theta, theta_dot = observation_
            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
            r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
            reward = r1 + r2
            print(reward)
            transition = np.hstack((observation, [action, reward], observation_))
            print(transition)
            if done:
                print("Episode finished after {} timesteps".format(t + 1))
                break
    env.close()

# Create checkpoint callback
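# A minimal sketch (an assumption, not from the original code) of the checkpoint
# callback the comment above refers to, saving the Keras model used in go()
# during training rather than only once at the end:
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath="logs/cp.h5",   # same path go() saves to
    save_weights_only=False,
    verbose=1,
)
# It would then be passed to model.fit(..., callbacks=[checkpoint_callback]).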