Example #1
import gym


def run():
    env = gym.make('CartPole-v0')
    env = env.unwrapped
    N_ACTIONS = env.action_space.n
    N_STATES = env.observation_space.shape[0]

    RL = DeepQNetwork(N_ACTIONS, N_STATES)

    plot_data = []  # episode lengths, collected for inspection after training
    step = 0
    for i in range(600):  # play 600 episodes
        # init env
        observation = env.reset()
        step_in = 0
        while True:
            # refresh env
            env.render()

            action = RL.choose_action(observation)

            observation_, reward, done, info = env.step(action)

            # reward shaping: the smaller theta and the closer the cart is to centre, the better
            x, x_dot, theta, theta_dot = observation_
            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
            r2 = (env.theta_threshold_radians -
                  abs(theta)) / env.theta_threshold_radians - 0.5
            r = r1 + r2

            RL.store_transition(observation, action, r, observation_)

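            # start learning once 200 transitions have been collected, then update every 5 steps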
            if step > 200 and step % 5 == 0:
                RL.learn()

            if done:
                print('step_in:%s  reward:%s' % (step_in, reward))
                plot_data.append(step_in)
                break
            observation = observation_
            step += 1
            step_in += 1
    # end of game
    print('game over')
    # env.destroy()

    # plot_data = np.array(plot_data, dtype='float32')
    # plot_data = np.divide(plot_data, plot_data.max())
    print(plot_data)
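All of the snippets on this page drive the agent through the same four calls: choose_action, store_transition, learn, plus an epsilon attribute that Example #3 reads for logging. The concrete DeepQNetwork class is not shown here; the code below is only a minimal stand-in sketching the interface these examples assume, with a random action in place of a real Q-network.

import numpy as np

class DeepQNetwork:
    """Interface sketch only; the real class would wrap a Q-network."""

    def __init__(self, n_actions, n_features):
        self.n_actions = n_actions
        self.n_features = n_features
        self.epsilon = 0.0  # exploration rate, printed in Example #3

    def choose_action(self, observation):
        # A real DQN acts epsilon-greedily on the Q-network's output;
        # here a random action stands in for it.
        return np.random.randint(self.n_actions)

    def store_transition(self, s, a, r, s_):
        # A real DQN appends (s, a, r, s_) to its replay memory here.
        pass

    def learn(self):
        # A real DQN samples a minibatch from the replay memory and takes a
        # gradient step on the Q-network, periodically syncing a target network.
        pass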
Example #2
episodes = 2000
step = 0
for i in range(episodes):

    state = env.reset()
    while True:
        env.render()

        feature = [0] * len(env.getStates())
        feature[state - 1] = 1
        feature = np.hstack(feature)
        action = RL.choose_action(feature)

        state_, reward, done = env.step(action)

        feature_ = [0] * len(env.getStates())
        feature_[state_ - 1] = 1
        feature_ = np.hstack(feature_)

        RL.store_transition(feature, action, reward, feature_)

        if (step > 200) and (step % 5 == 0):
            RL.learn()

        state = state_

        if done:
            break
        step += 1
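Example #2 feeds the network a one-hot encoding of a discrete state index rather than the raw state. Assuming numpy and the same 1-based indexing as above, the encoding step can be factored into a small helper (the one_hot name is illustrative, not part of the original code):

import numpy as np

def one_hot(state, n_states):
    # Build a one-hot feature vector for a 1-based discrete state index.
    feature = np.zeros(n_states, dtype=np.float32)
    feature[state - 1] = 1.0
    return feature

# one_hot(3, 5) -> array([0., 0., 1., 0., 0.], dtype=float32)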
Example #3
    while True:
        # env.render()

        action = RL.choose_action(observation)

        observation_, reward, done, info = env.step(action)

        # the smaller theta and closer to center the better
        # x, x_dot, theta, theta_dot = observation_
        # r1 = (env.x_threshold - abs(x))/env.x_threshold - 0.8
        # r2 = (env.theta_threshold_radians - abs(theta))/env.theta_threshold_radians - 0.5
        # reward = r1 + r2
        if done:
            reward = -1

        RL.store_transition(observation, action, reward, observation_)

        ep_r += reward
        if total_steps > 1000:
            RL.learn()

        if done or t >= 200:
            if running_reward == 0:
                running_reward = ep_r
            else:
                running_reward = running_reward*0.95 + ep_r*0.05
            print('episode: ', i_episode,
                  'ep_r: ', int(running_reward),
                  ' epsilon: ', round(RL.epsilon, 2))
            reward_c.append(ep_r)
            show.append(running_reward)
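The running_reward update above is an exponential moving average of the episode return, used purely for logging so the printed curve is less noisy than the raw ep_r values. A standalone sketch of the same smoothing (the smooth name is illustrative; the 0.95/0.05 weights come from the snippet):

def smooth(returns, alpha=0.05):
    # Exponentially smooth a sequence of episode returns, as in Example #3's logging.
    running = None
    smoothed = []
    for r in returns:
        running = r if running is None else (1 - alpha) * running + alpha * r
        smoothed.append(running)
    return smoothed

# smooth([10, 12, 50, 48]) -> approximately [10, 10.1, 12.1, 13.9]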
Example #4
step = 0
score_history = []
for episode in range(600):
    blue_state, red_state = env.reset()
    score = 0
    #Main game loop
    while True:
        blue_action = RL.choose_action(blue_state)
        red_action = 0  #RL.choose_action(red_state)

        blue_state_, red_state, blue_reward, done = env.step(
            translate_int_action(blue_action),
            translate_int_action(red_action))

        RL.store_transition(blue_state, blue_action, blue_reward, blue_state_)

        if (step > 200) and (step % 50 == 0):
            RL.learn()

        blue_state = blue_state_

        env.render()

        score += blue_reward

        if episode % 200 == 0:
            turtle.update()
        if done:
            score_history.append(score)
            print("Final Score: " + str("%9.2f" % score) + " [Episode] " +