Example #1
def go2():
    env = CartPoleEnv()
    episode_step_counter = 0
    for i_episode in range(10000):
        observation = env.reset()

        step_counter = 0
        while True:
            env.render()
            # Randomly choose an action
            action = env.action_space.sample()
            # Take the step and read the reward the environment returns
            observation_, reward, done, info = env.step(action)

            x, x_dot, theta, theta_dot = observation_
            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
            r2 = (env.theta_threshold_radians -
                  abs(theta)) / env.theta_threshold_radians - 0.5
            reward = r1 + r2
            print(reward)

            step_counter = step_counter + 1

            if done:
                episode_step_counter += step_counter
                # print("Episode {} lasted {} steps".format(i_episode, step_counter))
                print("Average steps: {}".format(episode_step_counter / (i_episode + 1)))

                break

    env.close()
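All of the snippets on this page replace CartPole's default reward with the same shaped reward: r1 is largest when the cart is near the center of the track, r2 is largest when the pole is near upright, and both go negative as the state approaches the failure thresholds. As a minimal sketch, the shared logic could be factored into a helper such as the hypothetical shaped_reward below (it does not appear in the original snippets):

def shaped_reward(env, observation):
    # Reward is highest when the cart is centered and the pole is upright,
    # and drops below zero near the x / theta failure thresholds.
    x, x_dot, theta, theta_dot = observation
    r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
    r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
    return r1 + r2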
Example #2
def save():
    env = CartPoleEnv()

    total_steps = 0
    memory = []

    memory_counter = 0
    for i_episode in range(100):

        observation = env.reset()
        while True:
            env.render()
            action = env.action_space.sample()

            observation_, reward, done, info = env.step(action)

            x, x_dot, theta, theta_dot = observation_
            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
            r2 = (env.theta_threshold_radians -
                  abs(theta)) / env.theta_threshold_radians - 0.5
            reward = r1 + r2

            # Store the transition as (s, a, r, s') flattened into one row
            transition = np.hstack((observation, [action,
                                                  reward], observation_))
            memory.append(transition)

            if done:
                break

            observation = observation_
            total_steps += 1
    memory = np.array(memory)
    np.save("memory.npy", memory)

    env.close()
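Each row appended to memory (and saved to memory.npy) concatenates the 4-dimensional observation, the action, the shaped reward, and the 4-dimensional next observation into a single 10-column vector. A minimal sketch of loading it back and splitting the columns, assuming that layout:

import numpy as np

memory = np.load("memory.npy")          # shape: (num_transitions, 10)
states = memory[:, :4]                  # s
actions = memory[:, 4].astype(int)      # a
rewards = memory[:, 5]                  # r
next_states = memory[:, 6:]             # s'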
Example #3
def go():
    env = CartPoleEnv()

    total_steps = 0
    memory = []
    model = create_model()

    epsilon = 0.9          # probability of acting greedily on the model
    memory_counter = 1000  # transitions to collect before each model.fit
    for i_episode in range(1000):

        observation = env.reset()
        ep_r = 0

        while True:
            env.render()

            # Epsilon-greedy: with probability epsilon act greedily on the
            # model's Q-values, otherwise take a random exploratory action.
            if np.random.uniform() < epsilon:
                actions_value = model.predict(np.array([observation]))
                action = np.argmax(actions_value)
            else:
                action = env.action_space.sample()

            observation_, reward, done, info = env.step(action)

            x, x_dot, theta, theta_dot = observation_
            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
            r2 = (env.theta_threshold_radians -
                  abs(theta)) / env.theta_threshold_radians - 0.5
            reward = r1 + r2
            transition = np.hstack((observation, [action,
                                                  reward], observation_))
            memory.append(transition)
            # Once enough transitions are collected, fit the model on them,
            # nudge epsilon toward exploitation, and clear the buffer.
            if len(memory) > memory_counter:
                xx, yy = get_data(np.array(memory), model)
                print(xx.shape)
                model.fit(xx, yy, epochs=10)
                epsilon = epsilon + 0.00001
                memory = []
                # memory_counter = memory_counter + 5
            ep_r = ep_r + reward

            if done:
                # print(ep_r)

                break

            observation = observation_
            total_steps += 1

    model.save("logs/cp.h5")
    model.summary()
    env.close()
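create_model and get_data are not shown in this example. A plausible sketch, assuming a small Keras Q-network over the 4-dimensional observation and one-step Q-learning targets built from the same (s, a, r, s') row layout as above; the layer sizes and the discount factor are illustrative assumptions, not values from the original code:

import numpy as np
from tensorflow import keras


def create_model():
    # Maps a 4-dimensional CartPole observation to one Q-value per action.
    model = keras.Sequential([
        keras.layers.Dense(32, activation="relu", input_shape=(4,)),
        keras.layers.Dense(32, activation="relu"),
        keras.layers.Dense(2),
    ])
    model.compile(optimizer="adam", loss="mse")
    return model


def get_data(memory, model, gamma=0.9):
    # memory rows are [s (4 values), a, r, s' (4 values)].
    states = memory[:, :4]
    actions = memory[:, 4].astype(int)
    rewards = memory[:, 5]
    next_states = memory[:, 6:]

    # Regression targets: the current Q-values, with the taken action's
    # entry replaced by the bootstrap target r + gamma * max_a' Q(s', a').
    q = model.predict(states)
    q_next = model.predict(next_states)
    q[np.arange(len(memory)), actions] = rewards + gamma * q_next.max(axis=1)
    return states, q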
Example #4
def get():
    env = CartPoleEnv()

    for i_episode in range(10000):
        observation = env.reset()
        # chose_action and model are assumed to be defined elsewhere
        action = chose_action(model=model)
        while True:
            observation_, reward, done, info = env.step(action)
            x, x_dot, theta, theta_dot = observation_
            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
            r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
            reward = r1 + r2
            transition = np.hstack((observation, [action, reward], observation_))
            print(transition)
            if done:
                break
            observation = observation_

    env.close()
Example #5
    def test_discrete_vectorized_original_equality(self):
        venv = DiscreteVectorizedCartPoleEnv()
        state, action = self.state_action
        action = (action > 0).astype(int)

        dim1, dim2 = self.dims

        venv.state = state
        vobs, vreward, vdone, _ = venv.step(action)

        env = CartPoleEnv()
        for i in range(dim1):
            for j in range(dim2):
                env.reset()
                env.state = state[i, j]
                obs, reward, done, _ = env.step(int(action[i, j, 0]))

                np.testing.assert_allclose(obs, vobs[i, j])
                np.testing.assert_allclose(reward, vreward[i, j])
                np.testing.assert_allclose(done, vdone[i, j])
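The test above relies on state_action and dims fixtures that are not shown. A rough sketch of what they might provide, assuming a (dim1, dim2) grid of independent cart-pole states and one raw action score per state (shapes inferred from how the test indexes them):

    @property
    def dims(self):
        # Hypothetical grid size used by the vectorized environment.
        return 3, 5

    @property
    def state_action(self):
        # Random states within CartPole's usual reset range, plus raw
        # action scores that the test thresholds into discrete actions.
        dim1, dim2 = self.dims
        state = np.random.uniform(-0.05, 0.05, size=(dim1, dim2, 4))
        action = np.random.uniform(-1.0, 1.0, size=(dim1, dim2, 1))
        return state, action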
Example #6
class CartPoleDictEnvWrapper(gym.Env):
    def __init__(self, max_angle=12, max_num_steps=1000):
        self.env = CartPoleEnv()
        # self.env.theta_threshold_radians = max_angle * 2 * math.pi / 360
        self.observation_space = self.env.observation_space
        self.action_space = self.env.action_space
        self.step_counter = 0
        self.max_num_steps = max_num_steps

    def step(self, action):
        if isinstance(action, numpy.ndarray):
            action = action[0]
        assert isinstance(action, numpy.int64)
        obs, _, done, _ = self.env.step(action)
        self.step_counter += 1
        # Force the episode to end every max_num_steps steps.
        if self.step_counter % self.max_num_steps == 0:
            done = True
        if done:
            reward = -10.0
            obs = self.env.reset()
        else:
            reward = 0.0
        return {"observation": obs, "reward": reward, "done": int(done)}

    def reset(self):
        obs = self.env.reset()
        return {"observation": obs, "reward": 0.0, "done": int(False)}

    def render(self, mode="human"):
        return self.env.render(mode)

    def close(self):
        self.env.close()

    def seed(self, seed=None):
        return self.env.seed(seed)
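A minimal usage sketch of the wrapper; the explicit numpy.int64 cast is only there to satisfy the assert in step, and step already resets the underlying environment whenever an episode ends:

import numpy

env = CartPoleDictEnvWrapper(max_num_steps=200)
result = env.reset()
while True:
    action = numpy.int64(env.action_space.sample())
    result = env.step(action)
    if result["done"]:
        # A -10.0 reward marks the end of the episode; the wrapper has
        # already reset the underlying environment at this point.
        break
env.close()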
Example #7
            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
            r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
            reward = r1 + r2
            transition = np.hstack((observation, [action, reward], observation_))
            print(transition)


if __name__ == '__main__':

    env = CartPoleEnv()
    for i_episode in range(20):
        observation = env.reset()
        for t in range(100):
            env.render()
            action = env.action_space.sample()
            observation_, reward, done, info = env.step(action)
            x, x_dot, theta, theta_dot = observation_
            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
            r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
            reward = r1 + r2
            print(reward)
            transition = np.hstack((observation, [action, reward], observation_))
            print(transition)


            if done:
                print("Episode finished after {} timesteps".format(t + 1))
                break
            observation = observation_
    env.close()

# Create checkpoint callback