Example 1
def run():
    env = gym.make('CartPole-v0')
    env = env.unwrapped
    N_ACTIONS = env.action_space.n
    N_STATES = env.observation_space.shape[0]

    RL = DeepQNetwork(N_ACTIONS, N_STATES)

    plot_data = []  # episode lengths collected for the final plot
    step = 0
    for i in range(600):  # play 600 episodes
        # init env
        observation = env.reset()
        step_in = 0
        while True:
            # refresh env
            env.render()

            action = RL.choose_action(observation)

            observation_, reward, done, info = env.step(action)

            # modify the reward
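            # r1 is larger the closer the cart is to the centre of the track and
            # r2 is larger the more upright the pole is; their sum replaces the
            # environment's constant +1 reward with a denser learning signal.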
            x, x_dot, theta, theta_dot = observation_
            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
            r2 = (env.theta_threshold_radians -
                  abs(theta)) / env.theta_threshold_radians - 0.5
            r = r1 + r2

            RL.store_transition(observation, action, r, observation_)

            if step > 200 and step % 5 == 0:
                RL.learn()

            if done:
                print('step_in:%s  reward:%s' % (step_in, reward))
                plot_data.append(step_in)
                break
            observation = observation_
            step += 1
            step_in += 1
    # end of game
    print('game over')
    # env.destroy()

    # plot_data = np.array(plot_data, dtype='float32')
    # plot_data = np.divide(plot_data, plot_data.max())
    print(plot_data)
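All of the snippets on this page drive the same small agent interface: a DeepQNetwork built from n_actions/n_features plus hyper-parameters, and the methods choose_action, store_transition (or memory.add in Examples 5 and 8), learn and plot_cost. The class below is a minimal, hypothetical stand-in, not the DQN_modified.py / RL_brain.py used by these repositories; it replaces the TensorFlow network with a linear NumPy Q-function purely so the sketch stays short and self-contained, and it omits the extended options (optimizer, momentum, prioritized replay, test=True) that Examples 5 and 8 pass.

import numpy as np


class DeepQNetwork:
    """Hypothetical stand-in exposing the interface the examples rely on."""

    def __init__(self, n_actions, n_features, learning_rate=0.01,
                 reward_decay=0.9, e_greedy=0.9, replace_target_iter=200,
                 memory_size=2000, batch_size=32, e_greedy_increment=None,
                 output_graph=False):
        self.n_actions, self.n_features = n_actions, n_features
        self.lr, self.gamma = learning_rate, reward_decay
        self.epsilon_max = e_greedy
        # with an increment, epsilon anneals from 0 up to e_greedy; otherwise it is fixed
        self.epsilon = 0.0 if e_greedy_increment else e_greedy
        self.epsilon_increment = e_greedy_increment
        self.replace_target_iter = replace_target_iter
        self.memory_size, self.batch_size = memory_size, batch_size
        self.memory = np.zeros((memory_size, n_features * 2 + 2))
        self.memory_counter = 0
        self.learn_step_counter = 0
        # linear Q(s, .) = s @ W + b, kept as separate "eval" and "target" copies
        self.W_eval = np.zeros((n_features, n_actions))
        self.b_eval = np.zeros(n_actions)
        self.W_target, self.b_target = self.W_eval.copy(), self.b_eval.copy()
        self.cost_his = []

    def choose_action(self, observation):
        observation = np.asarray(observation, dtype=float)
        if np.random.uniform() < self.epsilon:                 # exploit
            return int(np.argmax(observation @ self.W_eval + self.b_eval))
        return int(np.random.randint(self.n_actions))          # explore

    def store_transition(self, s, a, r, s_):
        self.memory[self.memory_counter % self.memory_size] = np.hstack((s, [a, r], s_))
        self.memory_counter += 1

    def learn(self):
        # periodically sync the target parameters with the eval parameters
        if self.learn_step_counter % self.replace_target_iter == 0:
            self.W_target, self.b_target = self.W_eval.copy(), self.b_eval.copy()
        n = min(self.memory_counter, self.memory_size)
        batch = self.memory[np.random.choice(n, size=min(self.batch_size, n))]
        s = batch[:, :self.n_features]
        a = batch[:, self.n_features].astype(int)
        r = batch[:, self.n_features + 1]
        s_ = batch[:, -self.n_features:]
        q_next = s_ @ self.W_target + self.b_target
        q_eval = (s @ self.W_eval + self.b_eval)[np.arange(len(a)), a]
        td_error = r + self.gamma * q_next.max(axis=1) - q_eval
        for i, act in enumerate(a):                            # semi-gradient update
            self.W_eval[:, act] += self.lr * td_error[i] * s[i]
            self.b_eval[act] += self.lr * td_error[i]
        self.cost_his.append(float(np.mean(td_error ** 2)))
        if self.epsilon_increment and self.epsilon < self.epsilon_max:
            self.epsilon += self.epsilon_increment
        self.learn_step_counter += 1

    def plot_cost(self):
        import matplotlib.pyplot as plt
        plt.plot(np.arange(len(self.cost_his)), self.cost_his)
        plt.ylabel('cost')
        plt.xlabel('training steps')
        plt.show()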
Example 2
# -*- coding: utf-8 -*-
'''
Author: winddy
'''
import numpy as np
from grid_mdp import GridEnv
from DQN_modified import DeepQNetwork

env = GridEnv()
RL = DeepQNetwork(len(env.getAction()),
                  len(env.getStates()),
                  learning_rate=0.01,
                  reward_decay=0.9,
                  e_greedy=0.1,
                  replace_target_iter=200,
                  memory_size=2000)

episodes = 2000
step = 0
for i in range(episodes):

    state = env.reset()
    while True:
        env.render()

        feature = [0] * len(env.getStates())
        feature[state - 1] = 1
        feature = np.hstack(feature)
        action = RL.choose_action(feature)

        state_, reward, done = env.step(action)
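Example 2 feeds the network a one-hot encoding of the (1-indexed) discrete grid state. Assuming getStates() returns the full list of states, an equivalent and more compact construction would be:

    feature = np.eye(len(env.getStates()))[state - 1]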
Example 3
        #         move = line.strip().split(',')
        #         numbers_move = [int(l) for l in move ]
        #         # print('aaa')
        #         # print(origin_chess)
        #         move_way = trans_action_to_A(origin_chess, numbers_move)
        #         # print(move_way)
        #     if line.strip().__contains__(',') == False :
        #         move = line.strip().split(',')
        #         numbers_move = [float(l) for l in move ]
        #         print(numbers_move)


if __name__ == "__main__":
    #print(S_test[9,8])
    #print(trans_S(S_test))
    RL = DeepQNetwork(187, 96,
                      learning_rate=0.01,
                      reward_decay=0.9,
                      e_greedy=0.9,
                      replace_target_iter=200,
                      memory_size=2000,
                      # output_graph=True
                      )
    file_path = os.getcwd() + '\\ajax.txt'
    parse_txt(file_path)
    RL.plot_cost()
    # s = '11.200000000000728'
    # print(s.__contains__('.'))
    # print(np.loadtxt(file_path))
    # print(trans_action_to_A(S_test,action_test))
    # print(trans_A_to_action(S_test,trans_action_to_A(S_test,action_test)))
Example 4
        # RL take action and get next observation and reward
        observation_, reward, done = env.step(action)

        RL.store_transition(observation, action, reward, observation_)

        observation = observation_

        if done:
            print('total: %s, reward: %s' % (step, reward))
            break
        step += 1
        time.sleep(1)
    # end of game
    print('game over')
    env.destroy()


if __name__ == '__main__':
    env = Maze()
    # RL = QLearningTable(actions=list(range(env.n_actions)))
    # RL = SarsaLambda(actions=list(range(env.n_actions)))
    RL = DeepQNetwork(n_actions=env.n_actions,
                      n_features=env.n_features,
                      output_graph=True)
    # start the environment's visualization; after() schedules update_DQN 100 ms after the event loop starts
    # env.after(100, update)
    # env.after(100, update_sarsa)
    env.after(100, update_DQN)
    # env.after(100, play_once)
    env.mainloop()
    RL.plot_cost()
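The Maze examples (4, 8, 10 and 13) drive training through Tkinter: the environment is a window, so the training function is scheduled on the Tk event loop instead of being called directly, and extra arguments to after() (as in env.after(100, run_maze, max_episode) in Examples 5 and 8) are forwarded to the callback. A minimal sketch of that pattern, assuming, as in the Morvan-style tutorials, that Maze subclasses tkinter.Tk; the real Maze class is not shown on this page.

import tkinter as tk


class Maze(tk.Tk):
    # hypothetical stand-in for the real Maze environment; only the Tk plumbing is shown
    def __init__(self):
        super().__init__()
        self.title('maze')


def run_maze():
    print('training loop would run here, calling env.render() to refresh the window')


env = Maze()
env.after(100, run_maze)  # schedule run_maze ~100 ms after the event loop starts
env.mainloop()            # blocks until the window is destroyed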
Example 5
def train_hdqnm(seed, file):
    # maze game
    MAZE_H = 3
    MAZE_W = 3
    hell_coord = [0, 2]
    door_coord = [2, 2]
    oval_coord = [1, 1]

    np.random.seed(seed)
    tf.set_random_seed(seed)

    # maze game
    env = Maze(MAZE_H, MAZE_W, hell_coord, door_coord, oval_coord)
    n_goals = 3
    max_episode = 10000
    controller_start = 200
    meta_controller_start = 10000
    controller = DeepQNetwork(env.n_actions, env.n_features + 1, 'controller',
                              optimizer='rmsprop',
                              momentum=0.9,
                              learning_rate=1e-3,
                              opt_decay=0.99,
                              reward_decay=0.99,
                              e_greedy=0,
                              e_greedy_max=0.99,
                              e_greedy_increment=1e-4,
                              e_greedy_iter=5e3,
                              replace_target_iter=200,
                              memory_size=5000,
                              output_graph=False,
                              prioritized_replay=False,
                              prioritized_replay_alpha=0.6,
                              prioritized_replay_beta0=0.4,
                              prioritized_replay_beta_iters=1e5,
                              prioritized_replay_eps=1e-6
                              )

    meta_controller = DeepQNetwork(n_goals, env.n_features, 'meta_controller',
                                   optimizer='rmsprop',
                                   momentum=0.9,
                                   learning_rate=1e-3,
                                   opt_decay=0.99,
                                   reward_decay=0.99,
                                   e_greedy=0,
                                   e_greedy_max=0.99,
                                   e_greedy_increment=1e-4,
                                   e_greedy_iter=1e3,
                                   replace_target_iter=200,
                                   memory_size=500,
                                   output_graph=False,
                                   prioritized_replay=True,
                                   prioritized_replay_alpha=0.6,
                                   prioritized_replay_beta0=0.4,
                                   prioritized_replay_beta_iters=1e3,
                                   prioritized_replay_eps=1e-6,
                                   )

    def play_maze():
        def goal_reached(g, s):
            return ((s[2 * g + 1] == 0) and (s[2 * g + 2] == 0))

        s = env.reset()
        g = meta_controller.choose_action(s, test=True)
        score = 0
        while True:
            env.render()
            a = controller.choose_action(np.hstack((s, g)), test=True)
            s_, r, done = env.step(a)
            score += r
            if done:
                break
            s = s_
            if goal_reached(g, s):
                g = meta_controller.choose_action(s, test=True)
        return score

    def run_maze(max_episode):
        def goal_reached(g, s):
            return ((s[2 * g + 1] == 0) and (s[2 * g + 2] == 0))

        def goal_distance(g, s):
            return np.sqrt(s[2 * g + 1] ** 2 + s[2 * g + 2] ** 2)

        def reward(g, s_, done):
            # return 1+1/episode_step if goal_reached(g, s_) else 0

            # if done and not goal_reached(g, s_): return -1
            return 1 if goal_reached(g, s_) else -1

            #return -goal_distance(g, s_)

            #if goal_reached(g, s_):
            #     return 1
            # else:
            #     return -1

            # for i in range(n_goals):
            #    if goal_reached(i, s_):
            #        if i == g: return 1
            #        else: return -1
            # return 0

        step = 0
        score_list = []
        avescore_list = []
        testscore_list = []
        flag = [False, False, False, False]
        for episode in range(max_episode):
            # initial observation
            s = env.reset()
            score = 0
            g = meta_controller.choose_action(s)
            Done = False
            while True:
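                # One pass of this outer loop is one "goal episode": the meta-controller
                # picks goal g, and the controller acts until g is reached or the maze
                # episode ends. F accumulates the extrinsic environment reward over that
                # span and is stored as the meta-controller's reward, while r below is
                # the intrinsic, goal-conditioned reward the controller learns from.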
                F = 0
                s0 = s
                episode_step = 1
                while True:
                    env.render()
                    a = controller.choose_action(np.hstack((s, g)))
                    s_, f, done = env.step(a)
                    score += f
                    r = reward(g, s_, done)
                    controller.memory.add(np.hstack((s, g)), a, r, np.hstack((s_, g)), done)
                    if (step > controller_start) and (step % 5 == 0):
                        controller.learn()
                    # if (step > meta_controller_start) and (step % 5 == 0):
                    #    meta_controller.learn()

                    if step == meta_controller_start:
                        print('\nmeta controller start learn~~~~~~~~~~~~~~~~~~~~')
                    if step == controller_start:
                        print('\ncontroller start learn~~~~~~~~~~~~~~~~~~~~~~~~~~')

                    F = F + f
                    s = s_
                    step = step + 1

                    if done:
                        Done = True
                        break
                    if goal_reached(g, s):
                        break

                    episode_step = episode_step + 1
                if goal_reached(g, s):
                    meta_controller.memory.add(s0, g, F, s, done)
                if step > meta_controller_start:
                    meta_controller.learn()
                if Done:
                    break
                g = meta_controller.choose_action(s)
            # print(step)
            score_list.append(score)
            if (episode > 0 and episode % 50 == 0):
                avescore = np.average(np.array(score_list[-50:]))
                avescore_list.append(avescore)
                print("\nepisode %d : average score = %f" % (episode, avescore))
                testscore = play_maze()
                testscore_list.append(testscore)
                print("episode %d : test score = %f\n" % (episode, testscore))

                if avescore > 2.5 and not flag[0]:
                    flag[0] = True
                    with open(file, 'a+') as f:
                        f.write('train average score achieves 2.5 (' + str(avescore) + ')\n')

                    # print('game over')
                    # env.destroy()
                    # return
                elif avescore > 0.5 and not flag[1]:
                    flag[1] = True
                    with open(file, 'a+') as f:
                        f.write('train average score achieves 0.5 (' + str(avescore) + ')\n')

                if testscore > 2.5 and not flag[2]:
                    flag[2] = True
                    with open(file, 'a+') as f:
                        f.write('test score achieves 2.5 (' + str(testscore) + ')\n')
                elif testscore > 0.5 and not flag[3]:
                    flag[3] = True
                    with open(file, 'a+') as f:
                        f.write('test score achieves 0.5 (' + str(testscore) + ')\n')

            if (episode > 0 and episode % 10 == 0):
                if step > controller_start:
                    print('controller loss:', np.mean(controller.cost_his[np.max(
                        [0, controller.learn_step_counter - 100]):controller.learn_step_counter]))
                if step > meta_controller_start:
                    print('meta controller loss:', np.mean(meta_controller.cost_his[np.max(
                        [0, meta_controller.learn_step_counter - 100]):meta_controller.learn_step_counter]))
        # end of game
        print('game over')
        env.destroy()
        plt.plot(range(len(avescore_list)), avescore_list)
        plt.show()
        np.savetxt("E:/my_py_project/rl/log/maze33/hdqnm/SEED_%d_avescore_log.txt" % (seed), avescore_list)
        np.savetxt("E:/my_py_project/rl/log/maze33/hdqnm/SEED_%d_testscore_log.txt" % (seed), testscore_list)

    env.after(100, run_maze, max_episode)
    env.mainloop()
    controller.plot_cost()
    meta_controller.plot_cost()
Example 6
import gym
from DQN_modified import DeepQNetwork
import matplotlib.pyplot as plt
import numpy as np

env = gym.make('CartPole-v0')
env = env.unwrapped

print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)

RL = DeepQNetwork(n_actions=env.action_space.n,
                  n_features=env.observation_space.shape[0],
                  learning_rate=0.01, e_greedy=0.9,
                  replace_target_iter=100, memory_size=1000,
                  )

total_steps = 0
reward_c = []
show = []
running_reward = 0
for i_episode in range(1000):
    t = 0
    observation = env.reset()
    ep_r = 0
    while True:
        # env.render()

        action = RL.choose_action(observation)
Example 7
    # action_space=[[0.0, 0.8], [1.0, 0.0]]
    # action_space = [[0.0, 1.0], [1.0, 0.8]]
    # action_space = [[1.0, 1.0], [1.0, 0.0],[0.0,0.0],[0.0,1.0]]
    # action_space=[[0.0,0.8],[0.0,1.0]]
    # print(action_space)
    # print(action_space[:4])
    # print(action_space[5:])
    # print(action_space[:-6])
    # print(action_space[-5:])
    envSeqDec = ChallengeProveEnvironment()
    action_space = getActionSpace(envSeqDec)  # reduce the action space
    print('actionSpace::::::::::::', action_space)
    RL = DeepQNetwork(len(action_space), 6,
                      learning_rate=0.0001,
                      reward_decay=0.9,
                      e_greedy=0.9,
                      replace_target_iter=20,#200
                      memory_size=100,#2000
                      batch_size=16,
                      # output_graph=True
                      )
    # print(RL)
    # print('\n'.join(['%s:%s' % item for item in RL.__dict__.items()]))
    rewards=run_maze()
    print('Best Reward:',np.max(rewards))
    x = list(range(len(rewards)))
    plt.plot(x, rewards)
    plt.show()

    # env.mainloop()
    # RL.plot_cost()
Example 8
def train_dqn(seed, file):
    np.random.seed(seed)
    tf.set_random_seed(seed)

    # dsdp game
    env = dsdp()
    max_episode = 20000
    dqn_start = 5000
    dqn = DeepQNetwork(env.n_actions,
                       env.n_features,
                       'dqn',
                       optimizer='rmsprop',
                       momentum=0.9,
                       learning_rate=0.00025,
                       opt_decay=0.99,
                       reward_decay=0.99,
                       e_greedy=0,
                       e_greedy_max=0.99,
                       e_greedy_increment=1e-4,
                       e_greedy_iter=5e3,
                       replace_target_iter=200,
                       memory_size=10000,
                       output_graph=False,
                       prioritized_replay=True,
                       prioritized_replay_alpha=0.6,
                       prioritized_replay_beta0=0.4,
                       prioritized_replay_beta_iters=1e5,
                       prioritized_replay_eps=1e-6)

    def play_maze():
        # def goal_reached(g, s):
        #    if(g == 0):
        #        return (s[0]==0)
        #    else:
        #        return ((s[2*g-1]==0) and (s[2*g]==0))
        def goal_reached(g, s):
            return (s[1] == g)

        s = env.reset()
        score = 0
        while True:
            env.render()
            a = dqn.choose_action(s, test=True)
            s_, r, done = env.step(a)
            score += r
            if done:
                break
            s = s_

        return score

    def run_maze(max_episode):
        step = 0
        score_list = []
        avescore_list = []
        testscore_list = []
        flag = [False, False, False, False]
        for episode in range(max_episode):
            # initial observation
            s = env.reset()
            score = 0
            while True:
                env.render()
                a = dqn.choose_action(s)
                s_, r, done = env.step(a)
                score += r
                dqn.memory.add(s, a, r, s_, done)

                if (step > dqn_start) and (step % 5 == 0):
                    dqn.learn()
                if step == dqn_start:
                    print('\ndqn start learn~~~~~~~~~~~~~~~~~~~~')

                s = s_
                step = step + 1
                if done:
                    break

            score_list.append(score)
            if (episode > 0 and episode % 50 == 0):
                # average score
                avescore = np.average(np.array(score_list[-50:]))
                avescore_list.append(avescore)
                print("\nepisode %d : average score = %f" %
                      (episode, avescore))
                # test score
                testscore = 0
                for i in range(5):
                    testscore += play_maze()
                testscore /= 5
                testscore_list.append(testscore)
                print("episode %d : test score = %f\n" % (episode, testscore))

                # logs
                if avescore > 0.1 and not flag[0]:
                    flag[0] = True
                    with open(file, 'a+') as f:
                        f.write('train average score achieves 0.1 (' +
                                str(avescore) + ')\n')

                    #print('game over')
                    #env.destroy()
                    #return
                elif avescore > 0.02 and not flag[1]:
                    flag[1] = True
                    with open(file, 'a+') as f:
                        f.write('train average score achieves 0.02 (' +
                                str(avescore) + ')\n')

                if testscore > 0.1 and not flag[2]:
                    flag[2] = True
                    with open(file, 'a+') as f:
                        f.write('test score achieves 0.1 (' + str(testscore) +
                                ')\n')
                elif testscore > 0.02 and not flag[3]:
                    flag[3] = True
                    with open(file, 'a+') as f:
                        f.write('test score achieves 0.02 (' + str(testscore) +
                                ')\n')

            # loss
            if (episode > 0 and episode % 10 == 0):
                if step > dqn_start:
                    print('dqn loss:',
                          np.mean(dqn.cost_his[np.max([0, dqn.learn_step_counter - 100]):
                                               dqn.learn_step_counter]))

        # end of game
        print('game over')
        env.destroy()
        plt.plot(range(len(avescore_list)), avescore_list)
        plt.show()
        np.savetxt(
            "E:/my_py_project/rl/log/dsdp/dqn/SEED_%d_avescore_log.txt" %
            (seed), avescore_list)
        np.savetxt(
            "E:/my_py_project/rl/log/dsdp/dqn/SEED_%d_testscore_log.txt" %
            (seed), testscore_list)

    env.after(100, run_maze, max_episode)
    env.mainloop()
    dqn.plot_cost()
Example 9
import gym
from DQN_modified import DeepQNetwork

env = gym.make('MountainCar-v0')
env = env.unwrapped

print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)

RL = DeepQNetwork(
    n_actions=3,
    n_features=2,
    learning_rate=0.001,
    e_greedy=0.9,
    replace_target_iter=300,
    memory_size=3000,
    e_greedy_increment=0.0002,
)

total_steps = 0

for i_episode in range(10):

    observation = env.reset()
    ep_r = 0
    while True:
        env.render()

        action = RL.choose_action(observation)
Example 10
            step += 1

    # end of game
    print('game over')
    # print(RL.memory)
    env.destroy()


if __name__ == "__main__":
    # maze game
    env = Maze()
    RL = DeepQNetwork(
        env.n_actions,
        env.n_features,
        learning_rate=0.01,
        reward_decay=0.9,
        e_greedy=0.9,
        replace_target_iter=200,
        memory_size=2000,
        # output_graph=True
    )
    env.after(100, run_maze)
    env.mainloop()
    RL.plot_cost()

    import matplotlib.pyplot as plt
    plt.plot(np.arange(len(episode_reward)), episode_reward)
    plt.ylabel('episode_reward')
    plt.xlabel('run steps')
    plt.show()
Example 11
def translate_int_action(int_action):
    act = np.zeros(2)
    if int_action == 1:
        act[0] = 1
    if int_action == 2:
        act[1] = 1
    return act
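# e.g. translate_int_action(0) -> array([0., 0.]),
#      translate_int_action(1) -> array([1., 0.]),
#      translate_int_action(2) -> array([0., 1.])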


env = Dogfight()
RL = DeepQNetwork(n_actions=3,
                  n_features=14,
                  learning_rate=0.01,
                  reward_decay=0.9,
                  e_greedy=0.9,
                  replace_target_iter=200,
                  memory_size=1048576,
                  batch_size=50 * 700,
                  training=True,
                  import_file='saved/trained_dqn')

step = 0
score_history = []
for episode in range(600):
    blue_state, red_state = env.reset()
    score = 0
    #Main game loop
    while True:
        blue_action = RL.choose_action(blue_state)
        red_action = 0  #RL.choose_action(red_state)
Example 12
from DQN_modified import DeepQNetwork

env = Crypto(name='BTC-USD',
             data_path='./test.csv',
             start_cash=1000,
             fee=0.001,
             drawdown_call=10,
             fixed_stake=0.001,
             period=180)

RL = DeepQNetwork(
    env.n_actions,
    env.n_features,
    learning_rate=0.01,
    reward_decay=0.9,
    e_greedy=0.9,
    e_greedy_increment=0.00001,
    replace_target_iter=6400,
    memory_size=100000,
    # output_graph=True
)
total_steps = 0
total_length = env.length

for i_episode in range(total_length):

    observation = env.reset()

    ep_r = 0
    while True:
Example 13
            # swap observation
            observation = observation_

            # break while loop when end of this episode
            if done:
                break
            step += 1

    # end of game
    print('game over')
    env.destroy()


if __name__ == "__main__":
    # maze game
    env = Maze()
    RL = DeepQNetwork(
        env.n_actions,
        env.n_features,
        learning_rate=0.01,
        reward_decay=0.9,
        e_greedy=0.9,
        replace_target_iter=200,  # replace target_net parameters every 200 steps
        memory_size=2000,  # replay memory capacity
        # output_graph=True  # whether to write a TensorBoard graph file
    )
    env.after(100, run_maze)
    env.mainloop()
    RL.plot_cost()