Code Example #1
def go2():
    env = CartPoleEnv()
    episode_step_counter = 0
    for i_episode in range(10000):
        observation = env.reset()

        step_counter = 0
        while True:
            env.render()
            # randomly choose an action
            action = env.action_space.sample()
            # take the action and get the reward from the environment
            observation_, reward, done, info = env.step(action)

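            # reward shaping (a common trick, not part of CartPole's original reward):
            # r1 favors keeping the cart near the center, r2 favors keeping the pole upright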
            x, x_dot, theta, theta_dot = observation_
            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
            r2 = (env.theta_threshold_radians -
                  abs(theta)) / env.theta_threshold_radians - 0.5
            reward = r1 + r2
            print(reward)

            step_counter = step_counter + 1

            if done:
                episode_step_counter += step_counter
                # print("第{}回合,坚持了{}步".format(i_episode, step_counter))
                print("平均步数:{}".format(episode_step_counter / (i_episode + 1)))

                break

    env.close()
Code Example #2
File: my_brain.py  Project: bigcong/io
def save():
    env = CartPoleEnv()

    total_steps = 0
    memory = []

    memory_counter = 0
    for i_episode in range(100):

        observation = env.reset()
        while True:
            env.render()
            action = env.action_space.sample()

            observation_, reward, done, info = env.step(action)

            x, x_dot, theta, theta_dot = observation_
            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
            r2 = (env.theta_threshold_radians -
                  abs(theta)) / env.theta_threshold_radians - 0.5
            reward = r1 + r2

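            # store the transition as one flat vector: (s, a, r, s')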
            transition = np.hstack((observation, [action,
                                                  reward], observation_))
            memory.append(transition)

            if done:
                break

            observation = observation_
            total_steps += 1
    memory = np.array(memory)
    np.save("memory.npy", memory)

    env.close()
Code Example #3
def test_order_enforcing():
    """Checks that the order enforcing works as expected, raising an error before reset is called and not after."""
    # The reason for not using gym.make is that all environments are by default wrapped in the order enforcing wrapper
    env = CartPoleEnv()
    assert not has_wrapper(env, OrderEnforcing)

    # Assert that the order enforcing works for step and render before reset
    order_enforced_env = OrderEnforcing(env)
    assert order_enforced_env._has_reset is False
    with pytest.raises(ResetNeeded):
        order_enforced_env.step(0)
    with pytest.raises(ResetNeeded):
        order_enforced_env.render(mode="rgb_array")
    assert order_enforced_env._has_reset is False

    # Assert that the errors are not raised after reset
    order_enforced_env.reset()
    assert order_enforced_env._has_reset is True
    order_enforced_env.step(0)
    order_enforced_env.render(mode="rgb_array")

    # Assert that render works without a prior reset when disable_render_order_enforcing is set
    env = CartPoleEnv()
    env = OrderEnforcing(env, disable_render_order_enforcing=True)
    env.render(mode="rgb_array")  # no assertion error
Code Example #4
File: my_brain.py  Project: bigcong/io
def go():
    env = CartPoleEnv()

    total_steps = 0
    memory = []
    model = create_model()

    epsilon = 0.9
    memory_counter = 1000
    for i_episode in range(1000):

        observation = env.reset()
        ep_r = 0

        while True:
            env.render()

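            # epsilon-greedy selection (note: here epsilon is the probability of
            # exploiting the model, not of exploring)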
            if np.random.uniform() < epsilon:
                actions_value = model.predict(np.array([observation]))
                action = np.argmax(actions_value)
            else:
                action = env.action_space.sample()

            observation_, reward, done, info = env.step(action)

            x, x_dot, theta, theta_dot = observation_
            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
            r2 = (env.theta_threshold_radians -
                  abs(theta)) / env.theta_threshold_radians - 0.5
            reward = r1 + r2
            transition = np.hstack((observation, [action,
                                                  reward], observation_))
            memory.append(transition)
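            # once enough transitions are buffered, fit the model on them,
            # nudge epsilon up (more exploitation) and clear the buffer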
            if len(memory) > memory_counter:
                xx, yy = get_data(np.array(memory), model)
                print(xx.shape)
                model.fit(xx, yy, epochs=10)
                epsilon = epsilon + 0.00001
                memory = []
                # memory_counter = memory_counter + 5
            ep_r = ep_r + reward

            if done:
                # print(ep_r)

                break

            observation = observation_
            total_steps += 1

    model.save("logs/cp.h5")
    model.summary()
    env.close()
Code Example #5
class CartPoleDictEnvWrapper(gym.Env):
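    """Wraps CartPoleEnv so that step() and reset() return dicts; episodes are cut off
    after max_num_steps, and a terminal step is penalized with a reward of -10."""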
    def __init__(self, max_angle=12, max_num_steps=1000):
        self.env = CartPoleEnv()
        # self.env.theta_threshold_radians = max_angle * 2 * math.pi / 360
        self.observation_space = self.env.observation_space
        self.action_space = self.env.action_space
        self.step_counter = 0
        self.max_num_steps = max_num_steps

    def step(self, action):
        if isinstance(action, numpy.ndarray):
            action = action[0]
        assert isinstance(action, numpy.int64)
        obs, _, done, _ = self.env.step(action)
        self.step_counter += 1
        if self.step_counter % self.max_num_steps == 0:
            done = True
        if done:
            reward = -10.0
            obs = self.env.reset()
        else:
            reward = 0.0
        return {"observation": obs, "reward": reward, "done": int(done)}

    def reset(self):
        obs = self.env.reset()
        return {"observation": obs, "reward": 0.0, "done": int(False)}

    def render(self, mode="human"):
        return self.env.render(mode)

    def close(self):
        self.env.close()

    def seed(self, seed=None):
        return self.env.seed(seed)
Code Example #6
File: keras_cartpole.py  Project: bigcong/io
            observation_, reward, done, info = env.step(action)
            x, x_dot, theta, theta_dot = observation_
            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
            r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
            reward = r1 + r2
            transition = np.hstack((observation, [action, reward], observation_))
            print()


if __name__ == '__main__':

    env = CartPoleEnv()
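    # run 20 episodes with a random policy, printing the shaped reward and the stored transition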
    for i_episode in range(20):
        observation = env.reset()
        for t in range(100):
            env.render()
            action = env.action_space.sample()
            observation_, reward, done, info = env.step(action)
            x, x_dot, theta, theta_dot = observation_
            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
            r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
            reward = r1 + r2
            print(reward)
            transition = np.hstack((observation, [action, reward], observation_))
            print(transition)
            observation = observation_


            if done:
                print("Episode finished after {} timesteps".format(t + 1))
                break
    env.close()