action = RL.choose_action(observation)
            observation_, reward, done = env.step(action)
            RL.store_transition(observation, action, reward, observation_)
            if (step > 200) and (step % 5 == 0):
                RL.learn()

            # swap observation
            observation = observation_

            # break while loop when end of this episode
            if done:
                break
            step += 1

    print('game over')
    env.destroy()

if __name__ == '__main__':
    env = Maze()
    RL = DoubleDQN(env.n_actions, env.n_features,
                      learning_rate=0.01,
                      reward_decay=0.9,
                      e_greedy=0.9,
                      replace_target_iter=200,
                      memory_size=2000,
                      # output_graph=True
                      )

    env.after(100, run_maze)
    env.mainloop()
    RL.plot_cost()
Example #2
from maze_env import Maze
from RL_brain import Agent
from RL_brain import myAgent
import time
import matplotlib.pyplot as plt
from pylab import *

if __name__ == "__main__":
    ### START CODE HERE ###
    # This is an agent with a random policy. You can learn how to interact with the environment from the code below.
    # Then you can delete it and write your own code.
    graph_episode = []
    graph_reward = []
    env = Maze()
    agent = myAgent(actions=list(range(env.n_actions)))
    for episode in range(500):
        s = env.reset()
        episode_reward = 0
        agent.epoch_num += 1
        while True:
            # env.render()  # You can comment out all render() calls to turn off the graphical interface and speed up training.
            a = agent.choose_action(s)
            s_, r, done = env.step(a)
            q_dict = agent.update(s, a, s_, r)
            episode_reward += r
            s = s_
            agent.has_been_to_this_state[float((s[0] + s[2]) / 2),
                                         float((s[1] + s[3]) / 2)] = True
            if done:
                #env.render()
                #time.sleep(0.5)
Example #3
            # swap observation
            observation = observation_

            # break while loop when end of this episode
            if done:
                break
            step += 1

    # end of game
    print('run over')
    time_elapsed = time.time() - since
    # time the code
    print('The run_maze code run {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))


if __name__ == "__main__":

    env = Maze()
    RL = DeepQNetwork(env.n_actions, env.n_features,
                      learning_rate=0.01,
                      reward_decay=0.9,
                      e_greedy=0.9,
                      replace_target_iter=200,
                      memory_size=2000,
                      # output_graph=True
                      )

    run_maze()
    RL.plot_cost()
Example #4
from RL_brain import DeepQNetwork
from maze_env import Maze
import tensorflow as tf
from tqdm import tqdm
import numpy as np
from knn_reward_model import KNN_Predict

MEMORY_SIZE = 20000
epsilon = 200
sess = tf.Session()
np.random.seed(1)
tf.set_random_seed(1)

env = Maze()
with tf.variable_scope('natural_DQN'):
    RL_natural = DeepQNetwork(n_actions=env.n_actions,
                              n_features=env.n_features,
                              memory_size=MEMORY_SIZE,
                              replace_target_iter=200
                              # e_greedy_increment=0.0001
                              )
knn = KNN_Predict(3)
times = []
real = []


def onehot(action):
    x = [0, 0, 0, 0]
    x[action] = 1
    return x
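# Example: onehot(2) returns [0, 0, 1, 0], a one-hot encoding of the chosen
# action (presumably used as an input to the KNN reward model above).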
Example #5
def plot_reward_movements():
    plt.figure()
    plt.subplot(2, 1, 1)
    plt.plot(episodes, movements)
    plt.xlabel("Episode")
    plt.ylabel("# Movements")

    plt.subplot(2, 1, 2)
    plt.step(episodes, rewards)
    plt.xlabel("Episode")
    plt.ylabel("Reward")
    plt.savefig("rewards_movements_q_learn.png")
    plt.show()


if __name__ == "__main__":

    # Create maze environment
    env = Maze()  # ToDo: instantiate Maze class

    # Create Q-learning agent
    q_learning_agent = QLearningTable(actions=list(range(
        env.n_actions)))  # ToDo: instantiate QLearningTable class

    # Call the run_experiment() function once after the given time in milliseconds.
    env.window.after(10, run_experiment)

    # The infinite loop that runs the application, waiting for events and processing them
    # until the window is closed.
    env.window.mainloop()
Example #6
from maze_env import Maze
from RL_brain import QL
import time

# Parameters
EPSILON = 0.9  # Greedy Policy
ALPHA = 0.1  # Learning Rate
LAMBDA = 0.9  # Discount Factor
MAX_EPISODE = 50
MAZE_SIZE = 5
TRAP_SET = [[0, 1], [2, 2], [3, 3], [3, 0]]
TREASURE_SET = [[4, 4]]
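
# The RL_brain.QL implementation is not shown here; as a hedged sketch, the
# parameters above typically drive a tabular Q-learning update of the form
#   Q(s, a) <- Q(s, a) + ALPHA * (R + LAMBDA * max_a' Q(s', a') - Q(s, a))
# with EPSILON-greedy action selection. A hypothetical helper illustrating it:
def q_update_sketch(Q, s, a, r, s_, done, n_actions, alpha=ALPHA, gamma=LAMBDA):
    """One tabular Q-learning step on a dict Q keyed by (state, action)."""
    q_old = Q.get((s, a), 0.0)
    q_next = 0.0 if done else max(Q.get((s_, b), 0.0) for b in range(n_actions))
    Q[(s, a)] = q_old + alpha * (r + gamma * q_next - q_old)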

env = Maze(MAZE_SIZE)
RL = QL(MAZE_SIZE, EPSILON, LAMBDA, ALPHA)


def update():
    for episode in range(MAX_EPISODE):
        O = env.reset()  # return initial observation
        step_number = 0
        env.set_trap(TRAP_SET)
        env.set_treasure(TREASURE_SET)
        is_terminated = False
        env.render()

        while not is_terminated:
            A = RL.choose_action(O)
            # print(O)
            OO, R, is_terminated = env.step(A)
            # print(O,OO)
                    print('failed!')
                env.destroy()
            else:
                observation = env.reset()


def on_release(key):
    print('{0} release'.format(key))
    if key == Key.esc:
        # Stop listener
        return False


def get_action():
    global observation
    observation = env.reset()
    listener = Listener(on_press=on_press, on_release=on_release)
    listener.start()


if __name__ == "__main__":
    # maze game
    env = Maze()

    env.after(100, get_action)
    print('env started!')
    env.mainloop()
    print('env ended!')

    # Collect events until released
class Controller:
    def __init__(self):
        self.env = Maze(self)
        self.env.mainloop()

    def main(self, tubarao, foca, peixe, alga, calorias):
        qtd_tub = int(tubarao)
        qtd_foca = int(foca)
        qtd_peixe = int(peixe)
        qtd_alga = int(alga)
        cal = int(calorias)

        threads = []
        MAX_THREAD = qtd_foca + qtd_peixe + qtd_tub + qtd_alga
        nome = None

        self.env.p.set(MAX_THREAD)

        semaforo = threading.Semaphore(MAX_THREAD)
        semaforo_validation = threading.Semaphore(0)
        teste = threading.Lock()

        thread_timer = TimerThread(596, self.env)
        thread_timer.start()

        next_id = 0

        # create the shark (tubarao) threads
        for i in range(qtd_tub):
            tipo = 4

            thread = IndividuoThread(next_id,
                                     cal,
                                     nome,
                                     tipo,
                                     semaforo_validation,
                                     self.env,
                                     semaforo,
                                     teste=teste,
                                     max_t=MAX_THREAD)
            thread.start()
            threads.append(thread)
            next_id += 1

        # create the seal (foca) threads
        for i in range(qtd_foca):
            tipo = 3

            thread = IndividuoThread(next_id,
                                     cal,
                                     nome,
                                     tipo,
                                     semaforo_validation,
                                     self.env,
                                     semaforo,
                                     teste=teste,
                                     max_t=MAX_THREAD)
            thread.start()
            threads.append(thread)
            next_id += 1

        # create the fish (peixe) threads
        for i in range(qtd_peixe):
            tipo = 2
            thread = IndividuoThread(next_id,
                                     cal,
                                     nome,
                                     tipo,
                                     semaforo_validation,
                                     self.env,
                                     semaforo,
                                     teste=teste,
                                     max_t=MAX_THREAD)
            thread.start()
            threads.append(thread)
            next_id += 1

        # create the algae (alga) threads
        for i in range(qtd_alga):
            tipo = 1
            thread = IndividuoThread(next_id,
                                     cal,
                                     nome,
                                     tipo,
                                     semaforo_validation,
                                     self.env,
                                     semaforo,
                                     teste=teste,
                                     max_t=MAX_THREAD)
            thread.start()
            threads.append(thread)
            next_id += 1

        thread_validacao = ValidationThread(133, threads, semaforo_validation,
                                            semaforo, MAX_THREAD, self.env)
        thread_validacao.start()
    def __init__(self):
        self.env = Maze(self)
        self.env.mainloop()
Example #10
def main():
    env = Maze()
    agent = QLearning(action_space=list(range(env.n_action)))
    update(agent, env)

    env.mainloop()
Example #11
                break
            step += 1

    # end of game
    print('game over')
    env.destroy()


if __name__ == "__main__":

    time_string = utils.get_string_time()
    print(time_string, " the test begins")
    start_time = time.clock()

    # maze game
    env = Maze()

    RL = DeepQNetwork(
        env.n_actions,
        env.n_features,
        learning_rate=0.01,
        reward_decay=0.9,
        e_greedy=0.98,  # eventually (almost) stops exploring
        replace_target_iter=300,
        memory_size=4800,
        e_greedy_origin=0.5,
        e_greedy_increment=0.0001,  # how fast epsilon grows
        model_load=True,
        model_load_dir="save/2018-3-30-22:17/model.ckpt",
        model_save_dir="save/{time}/model.ckpt".format(time=time_string),
        output_graph=False,
Example #12
                break
            step += 1

    # end of game
    print('game over')
    env.destroy()


if __name__ == "__main__":

    time_string = utils.get_string_time()
    print(time_string, " the test begins")
    start_time = time.clock()

    # maze game
    env = Maze()

    RL = DeepQNetwork(
        env.n_actions,
        env.n_features,
        learning_rate=0.02,
        reward_decay=0.9,
        e_greedy=0.98,  # eventually (almost) stops exploring
        replace_target_iter=300,
        memory_size=60000,
        e_greedy_origin=0.7,
        e_greedy_increment=0.00005,  # how fast epsilon grows
        model_load=False,
        model_load_dir="save/2018-4-4-16:50/model.ckpt",
        model_save_dir="save/{time}/model.ckpt".format(time=time_string),
        output_graph=True,
Example #13

def modify_path(routeX, routeY):
    isFind, i1, i2 = find_same(routeX, routeY)
    while isFind:
        routeX = np.append(routeX[:i1], routeX[i2:])
        routeY = np.append(routeY[:i1], routeY[i2:])
        #print(routeX)
        #print(routeY)
        isFind, i1, i2 = find_same(routeX, routeY)
    return routeX, routeY
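
# Example: if find_same reports that the same cell, say (1, 0), appears at
# indices 1 and 4 of the route, the segment between them is spliced out, so
#   routeX = [0, 1, 1, 2, 1, 3], routeY = [0, 0, 1, 1, 0, 0]
# becomes
#   routeX = [0, 1, 3],          routeY = [0, 0, 0]
# (find_same is defined elsewhere and is assumed to return a flag plus the two
# indices of the first repeated position).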


if __name__ == "__main__":
    # maze game
    env = Maze()
    model_path = os.path.dirname(
        os.path.abspath(__file__)) + "/model20171219.ckpt"
    RL = DeepQNetwork(
        env.n_actions,
        env.n_features,
        model_path,
        learning_rate=0.01,
        reward_decay=0.9,
        e_greedy=0.9,
        replace_target_iter=200,
        memory_size=2000,
        # output_graph=True
    )
    env.after(100, get_route_maze)
    env.mainloop()
Example #14
            print("新观测值,即下一位置:\n", observation_)
            print("奖励:\n", reward)
            print("done?:\n", done)
            print("Q表:\n", RL.q_table)
            print("-" * 50 + "\n")

            observation = observation_  # use the new observation as the starting observation of the next step

            # end the episode when the agent falls into hell or ascends to heaven
            if done:
                break

        print("~" * 50)
        print("第%d回合结束" % (episode + 1))
        print("~" * 50 + "\n")

    # game over: destroy the environment
    print('Game Over')
    env.destroy()
    RL.q_table.to_csv(
        './model.csv'
    )  # [Optional] save the final Q table (the trained model); the plan was to reload it later, but pandas save/load hit a minor issue, so this is shelved for now


if __name__ == "__main__":
    env = Maze()  # create the maze environment
    RL = QLearningTable(actions=list(range(
        env.n_actions)))  # create RL, the reinforcement-learning agent; actions=[0, 1, 2, 3]
    env.after(100, update)  # tkinter after(): schedule update to run 100 ms later
    env.mainloop()  # start the tkinter main loop, i.e. show the environment window
        while True:
            # fresh env
            env.render()

            # RL choose action based on observation
            action = RL.choose_action(str(observation))  # choose an action

            # RL take action and get next observation and reward
            observation_, reward, done = env.step(action)

            # RL learn from this transition
            RL.learn(str(observation), action, reward, str(observation_))

            # swap observation
            observation = observation_

            # break while loop when end of this episode
            if done:
                break

    # end of game
    print('game over')
    env.destroy()


if __name__ == "__main__":
    env = Maze()  # first create the environment
    RL = QLearningTable(actions=list(range(env.n_actions)))  # provide the list of actions

    env.after(100, update)
    env.mainloop()
Example #16
from maze_env import Maze
from RL_brain import Agent
import time

# from uilts import view

actions = ["up", "down", "right", "left"]

if __name__ == "__main__":
    ### START CODE HERE ###
    # This is an agent with a random policy. You can learn how to interact with the environment from the code below.
    # Then you can delete it and write your own code.

    env = Maze()
    agent = Agent(actions=list(range(env.n_actions)))
    for episode in range(50):
        s = env.reset()
        episode_reward = 0
        while True:
            env.render()  # You can comment out all render() calls to turn off the graphical interface and speed up training.
            # time.sleep(0.1)
            # view(agent.q_table)

            a = agent.choose_action(s)
            s_, r, done = env.step(a)

            agent.add_observation(s, a, r, s_, done)

            episode_reward += r
            s = s_
Example #17
                RL.learn(str(observation), action, reward, str(observation_))
                record(str(observation), action, reward, str(observation_),
                       done, RL.q_table, int(int(episode / 10000 + 1) * 10000))
                observation = copy.deepcopy(observation_)
                if done:
                    break


def trainWithTrajectory(trajectory, episode):
    for item in trajectory:
        observation, action, reward, observation_ = item
        RL.learn(observation, action, reward, observation_)
        record(observation, action, reward, observation_, True, RL.q_table,
               int(int(episode / 10000 + 1) * 10000))


if __name__ == '__main__':
    task_num = [
        3,
    ]
    for taskNum in task_num:
        parameter["taskNum"] = taskNum
        from task import *

        task = createTask()

        # Q-learning
        env = Maze(task)
        RL = QLearningTable(actions=list(range(env.n_actions)))
        update1(env, RL, 5)
        RL.q_table.to_csv("Q_learning Table" + str(taskNum) + "_Insert")
        (actor_loss + critic_loss).backward()
        self.actor_critic.optimizer.step()


from maze_env import Maze
import os
import time
import matplotlib.pyplot as plt
import numpy as np  # needed for np.random.seed below
import pickle

np.random.seed(42)

if __name__ == '__main__':

    env = Maze(height=21,
               width=21,
               detection_range=1,
               obstacle_occupancy_prob=0.5)

    agent = ACAgent(alpha=0.00001,
                    gamma=0.999,
                    input_dims=[529],
                    n_actions=8,
                    layer1_size=2048,
                    layer2_size=512)

    score_history = []
    avg_score_history = []

    epsiodes = 5000000
    max_steps = 23 * 23
Example #19
            # RL learn from this transition
            RL.learn(str(observation), action, reward, str(observation_),
                     _action)
            action = _action

            # swap observation
            observation = observation_

            # break while loop when end of this episode
            if done:
                break

    # end of game
    print('game over')
    env.destroy()


if __name__ == "__main__":
    env = Maze()
    # RL = QLearningTable(actions=list(range(env.n_actions)))
    # RL = SarsaTable(actions=list(range(env.n_actions)))
    RL = SarsaLambdaTable(actions=list(range(env.n_actions)))

    # execute the callback function after a short delay
    # env.after(100, update)
    # update()
    update_sarsa()
    print(RL.q_table)
    env.mainloop()
Example #20
            rew.append(R)
            RL.learn(str(stade), action, reward, str(observation_))
            # carry the next state over to the next iteration
            stade = copy.deepcopy(observation_)
            # the episode ends if the agent falls into hell or ascends to heaven
            if done:
                break
    plt.plot(np.arange(len(rew)), rew)
    plt.ylabel('Reward', fontsize=14)
    plt.xlabel('Training steps', fontsize=14)
    plt.show()


if __name__ == "__main__":
    # define the environment env and the RL method
    env = Maze()
    RL = QLearningTable(actions=list(range(env.n_actions)))
    # start visualizing the environment env
    update()
    for edgeuser in [5, 10, 15, 20, 25, 30, 35, 40, 45]:
        # random baseline
        t = 0
        for i in range(10):
            G = topology()  # task graph
            E1 = E
            t += Baseline(user, G, E1, KE, edgeuser)
        random.append(t / 10)

        # exhaustive search
        H = topology()  # task graph
        E2 = E
    # end of game
    print('game over for QV-learning')
    env.destroy()


if __name__ == "__main__":
    plot_y = list(range(EPIS))
    fig, ax = plt.subplots()
    # Data for plotting
    ax.set(xlabel='episode (s)',
           ylabel='Total Rewards',
           title='Total rewards at each episode')
    ax.grid()

    #QV-learning
    env = Maze()
    RL = QVLearningTable(actions=list(range(env.n_actions)),
                         learning_rate=0.7,
                         lr_v=0.5)
    env.after(100, update_QV_learning)
    env.mainloop()
    ax.plot(range(EPIS), plot_y, label='QV-learning')

    #Q-learning
    env = Maze()
    RL = QLearningTable(actions=list(range(env.n_actions)),
                        learning_rate=0.5,
                        e_greedy=0.85)
    env.after(100, update_Q_learning)
    env.mainloop()
    ax.plot(range(EPIS), plot_y, label='Q-learning')
Example #22
            s1, reward, done = env.step(a0)
            a1 = agent.get_action(s1, episode)

            agent.learning(s0, a0, reward, s1, a1)

            s0 = s1

            if done:
                if reward < 0:
                    print("\033[0;31;40m\t{}\033[0m".format(path))
                    fail += 1
                else:
                    print("\033[0;32;40m\t{}\033[0m".format(path))
                    success += 1
                break

    print('game over')
    print('success:', success)
    print('fail:', fail)
    env.destroy()


if __name__ == "__main__":
    env = Maze()
    # agent = SARSA()
    # agent = SARSA_LAMBDA()
    # env.after(100, on_policy)
    agent = Q_learning()
    env.after(100, q_learning)
    env.mainloop()
    episode = max(plot_episode)
    env.destroy()
    env.mainloop()
    plot_episode.clear()
    # This works around a subtle bug: when this function is imported and run in a for loop,
    # the data in plot_episode persists across iterations of that loop.
    # We must clear it at the end.
    return running_time, episode, plot_sum_reward

# This 'if' guard means that if this file is imported elsewhere,
# rather than run directly, the code below is not executed.


if __name__ == "__main__":
    print('-----------Start-------------')
    env = Maze(height=10, width=10)
    Sarsa_brain_ = Model.SARSA(greedy_rate=0.9, learning_rate=0.01, reward_decay=0.9)
    # Use two methods to evaluate the algorithm:
    # 1. the time it takes to complete the appointed number of episodes;
    # 2. the number of steps it takes to converge (judged by repeated rewards:
    #    if the reward is the same in 10 consecutive episodes, we roughly
    #    consider the algorithm to have started to converge).
    # When judge_method is the repeated-steps criterion, call for example:
    # update(judge_number=100, judge_method='numbers of repeated steps')
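
    # A minimal sketch (hypothetical helper, not necessarily what update() does
    # internally) of the 'repeated steps' criterion described above:
    def looks_converged(episode_rewards, judge_number=10):
        # converged once the last `judge_number` episode rewards are all equal
        tail = episode_rewards[-judge_number:]
        return len(tail) == judge_number and len(set(tail)) == 1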

    T1 = time.perf_counter()
    # update(judge_number=1000, judge_method='sum of episodes', delay_time=0.00)
    update(judge_number=10, judge_method='repeated steps', delay_time=0.00)
    T2 = time.perf_counter()

    print('Time spend :%s ms' % ((T2 - T1)*1000))
    result_display.result_plot(x=plot_episode, y=plot_sum_reward, x_name='Episode', y_name='Sum of Reward',
Example #24
            # swap observation
            observation = observation_

            # break while loop when end of this episode
            if done:
                break
            step += 1

    # end of game
    print('game over')
    env.destroy()


if __name__ == "__main__":
    # maze game
    env = Maze()

    env = gym.make('DoomDefendLine-v0')
    env.reset()


    RL = DeepQNetwork(2, 4,
                      learning_rate=0.03,
                      reward_decay=0.7,
                      e_greedy=0.9,
                      replace_target_iter=200,
                      memory_size=2000,
                      e_greedy_increment = 0.001
                      # output_graph=True
                      )
    run_maze()
Example #25
import os
from maze_env import Maze
import time
import numpy as np
import random

maz = Maze()
# only for interaction:
# maz.render()

# initialize matrices with dimensions taken from the maze
dim = maz.MAZE_Limit

np.set_printoptions(suppress=True)
# action options up down left right
a = [(0, 1), (0, -1), (1, 0), (-1, 0)]

# create environment
state_value = np.tile(0, dim)
counter = np.tile(0, dim)

# [DONE] First-visit or every-visit state-value counting
visit = "first"  # "first" or "every"
# [DONE] Discount factor
gamma = 0.9
# [DONE] MC
# [TO-DO] if there's block
# [TO-DO] if random start
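
# As a hedged sketch of the first-/every-visit Monte Carlo estimate referred to
# above, a hypothetical helper (not used by the loop in this file) could be:
# `episode` is a list of (state, reward) pairs; `values`/`counts` are dicts
# keyed by state.
def mc_update_sketch(episode, values, counts, gamma=0.9, visit="first"):
    G = 0.0
    returns = []
    for state, reward in reversed(episode):      # accumulate discounted return
        G = reward + gamma * G
        returns.append((state, G))
    returns.reverse()                            # back to chronological order
    seen = set()
    for state, G in returns:
        if visit == "first" and state in seen:   # first-visit: skip revisits
            continue
        seen.add(state)
        counts[state] = counts.get(state, 0) + 1
        old = values.get(state, 0.0)
        values[state] = old + (G - old) / counts[state]   # incremental mean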

n_episode = 10000
for n in range(0, n_episode):
Example #26
main function


"""

#next_stat, reward, done, stat, action = q_learning.observation
#Q = q_learning.Q


##
    
if __name__ == "__main__":
    
    N_episode = 50
    env = Maze() 
    q_learning = table_qlearning(
            n_actions = 4,
            epsilon=0.8,
            )
    
    
    for episode in range(N_episode):
        print("starting an new episode\n")
        stat = env.reset()
        i_step = 0
        R =[] 
        while True:
            i_step += 1
            env.render()
            
                )  # after the first 200 steps, learn once every 5 steps (5 is the learning frequency, learning_freq), updating the evaluate-Q network parameters; the target-Q network parameters are updated once every 200 learning steps; the replay memory holds the most recent 2000 transitions, so a batch may span episodes

            # swap observation
            observation = observation_

            # break while loop when end of this episode
            if done:
                break
            step += 1  # increment step after each environment step

    # end of game
    print('game over')
    env.destroy()


if __name__ == "__main__":
    # maze game
    env = Maze()
    RL = DeepQNetwork(
        env.n_actions,
        env.n_features,
        learning_rate=0.01,
        reward_decay=0.9,
        e_greedy=0.9,
        replace_target_iter=200,  # update the target-Q parameters every 200 learning steps
        memory_size=2000,  # memory capacity of 2000, i.e. batches are sampled from the most recent 2000 transitions
        # output_graph=True
    )
    env.after(100, run_maze)
    env.mainloop()
    RL.plot_cost()
Example #28
                print(episode, 'episode,', episode_step, 'step_counter, ended in failure')
            elif reward == 1:
                print(episode, 'episode,', episode_step, 'step_counter, ended in success')
                # print(RL.policy_net.state_dict()['fc1.bias'])

    if not learn:
        print('testing finished')
    else:
        print('training finished')
    env.destroy()

    return RL


if __name__ == '__main__':
    env = Maze('train')
    n_actions = env.n_actions
    n_features = env.n_features
    RL = DQN(n_actions,
             n_features,
             memory_size=256,
             batch_size=8,
             learning_rate=0.01,
             gamma=0.9,
             e_greedy=0.9,
             e_greedy_increment=None,
             ddqn=True,
             dueling=True,
             target_update_step=10,
             output_graph=False)
Example #29
        observation = env.reset()
        action = RL.choose_action(str(observation))
        num_steps = 0
        while True:
            num_steps += 1
            # fresh env
            env.render()

            # take the current action, then choose the next action from the new state (SARSA)
            observation_, reward, done = env.step(action)
            action_ = RL.choose_action(str(observation_))
            RL.learn(str(observation), action, reward, str(observation_),
                     action_)

            observation = observation_
            action = action_

            if done:
                print('Episode %d, num_steps=%d' % (episode + 1, num_steps))
                break

    print('game over')
    env.destroy()


if __name__ == '__main__':
    env = Maze()
    RL = SarsaTable(actions=list(range(env.n_actions)))
    env.after(100, update)
    env.mainloop()
Example #30
class CustomGym(Env):
    """The main OpenAI Gym class. It encapsulates an environment with
    arbitrary behind-the-scenes dynamics. An environment can be
    partially or fully observed.
    The main API methods that users of this class need to know are:
        step
        reset
        render
        close
        seed
    And set the following attributes:
        action_space: The Space object corresponding to valid actions
        observation_space: The Space object corresponding to valid observations
        reward_range: A tuple corresponding to the min and max possible rewards
    Note: a default reward range set to [-inf,+inf] already exists. Set it if you want a narrower range.
    The methods are accessed publicly as "step", "reset", etc...
    """
    def __init__(
        self,
        agentXY,
        goalXY,
        walls=[],
        pits=[],
        title='Maze',
    ):
        super(CustomGym, self).__init__()
        self.env = Maze(agentXY, goalXY, walls, pits, title)
        self.title = title
        self.action_space = spaces.Discrete(self.env.n_actions)
        self.observation_space = spaces.Box(low=0,
                                            high=0,
                                            shape=(4, ),
                                            dtype=np.float32)

        self.rewards = [[]]
        self.variance = []
        self.median = []

    def step(self, action):
        """Run one timestep of the environment's dynamics. When end of
        episode is reached, you are responsible for calling `reset()`
        to reset this environment's state.
        Accepts an action and returns a tuple (observation, reward, done, info).
        Args:
            action (object): an action provided by the agent
        Returns:
            observation (object): agent's observation of the current environment
            reward (float) : amount of reward returned after previous action
            done (bool): whether the episode has ended, in which case further step() calls will return undefined results
            info (dict): contains auxiliary diagnostic information (helpful for debugging, and sometimes learning)
        """

        s_, reward, done = self.env.step(action)

        self.rewards[-1].append(reward)
        if done:
            self.variance.append(np.var(self.rewards[-1]))
            self.median.append(np.median(self.rewards[-1]))
            self.rewards.append([])

        return s_, reward, done, {}

    def render(self, mode='human'):
        self.env.render()

    def reset(self, value=1, resetAgent=True):
        return self.env.reset()

    def seed(self, seed=None):
        """Sets the seed for this env's random number generator(s).
        Note:
            Some environments use multiple pseudorandom number generators.
            We want to capture all such seeds used in order to ensure that
            there aren't accidental correlations between multiple generators.
        Returns:
            list<bigint>: Returns the list of seeds used in this env's random
              number generators. The first value in the list should be the
              "main" seed, or the value which a reproducer should pass to
              'seed'. Often, the main seed equals the provided 'seed', but
              this won't be true if seed=None, for example.
        """
        np.random.seed(10)
        random.seed(10)
        return

    def save_csv(self):
        with open(f"./data/{self.title}_rewards_{time.time()}",
                  "w+") as my_csv:
            csvWriter = csv.writer(my_csv, delimiter=',')
            csvWriter.writerows(self.rewards)
        with open(f"./data/{self.title}_variance_{time.time()}",
                  "w+") as my_csv:
            csvWriter = csv.writer(my_csv, delimiter=',')
            for var in self.variance:
                csvWriter.writerow([var])
        with open(f"./data/{self.title}_median_{time.time()}", "w+") as my_csv:
            csvWriter = csv.writer(my_csv, delimiter=',')
            for med in self.median:
                csvWriter.writerow([med])

    def destroy(self):
        self.env.destroy()
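
# A minimal usage sketch of the wrapper above. The agentXY/goalXY values are
# placeholders, since the Maze constructor's exact expectations are not shown:
if __name__ == "__main__":
    gym_env = CustomGym(agentXY=(0, 0), goalXY=(4, 4))
    obs = gym_env.reset()
    for _ in range(100):
        action = gym_env.action_space.sample()   # random policy
        obs, reward, done, info = gym_env.step(action)
        gym_env.render()
        if done:
            obs = gym_env.reset()
    # gym_env.save_csv()  # writes to ./data/, which must already exist
    gym_env.destroy()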
Example #31
        while True:
            # fresh env
            env.render()

            # RL choose action based on observation
            action = RL.choose_action(str(observation))

            # RL take action and get next observation and reward
            observation_, reward, done = env.step(action)

            # RL learn from this transition
            RL.learn(str(observation), action, reward, str(observation_))

            # swap observation
            observation = observation_

            # break while loop when end of this episode
            if done:
                break

    # end of game
    print('game over')
    env.destroy()


if __name__ == "__main__":
    env = Maze()
    RL = QLearningTable(actions=list(range(env.n_actions)))
    env.after(100, update)
    env.mainloop()
Example #32
    #Task 3
    # wall_shape=np.array([[7,4],[7,3],[6,3],[6,2],[5,2],[4,2],[3,2],[3,3],[3,4],[3,5],[3,6],[4,6],[5,6]])
    # pits=np.array([[1,3],[0,5], [7,7]])

    experiments = []

    # # alg0 (Async)
    # env0 = Maze(agentXY,goalXY,wall_shape, pits)
    # RL0 = rlalg0(actions=list(range(env0.n_actions)))
    # data0={}
    # env0.after(10, update(env0, RL0, data0, episodes))
    # env0.mainloop()
    # experiments = [(env0,RL0, data0)]

    # alg2 (SARSA)
    env2 = Maze(agentXY, goalXY, wall_shape, pits)
    RL2 = rlalg2(actions=list(range(env2.n_actions)))
    data2 = {}
    env2.after(10, update(env2, RL2, data2, episodes))
    env2.mainloop()
    experiments.append((env2, RL2, data2))

    # alg1 (Q-Learning)
    env1 = Maze(agentXY, goalXY, wall_shape, pits)
    RL1 = rlalg1(actions=list(range(env1.n_actions)))
    data1 = {}
    env1.after(10, update(env1, RL1, data1, episodes))
    env1.mainloop()
    experiments.append((env1, RL1, data1))

    # alg4 (Expected Sarsa)