            action = RL.choose_action(observation)
            observation_, reward, done = env.step(action)
            RL.store_transition(observation, action, reward, observation_)

            if (step > 200) and (step % 5 == 0):
                RL.learn()

            # swap observation
            observation = observation_

            # break while loop when end of this episode
            if done:
                break
            step += 1

    print('game over')
    env.destroy()


if __name__ == '__main__':
    env = Maze()
    RL = DoubleDQN(env.n_actions, env.n_features,
                   learning_rate=0.01,
                   reward_decay=0.9,
                   e_greedy=0.9,
                   replace_target_iter=200,
                   memory_size=2000,
                   # output_graph=True
                   )
    env.after(100, run_maze)
    env.mainloop()
    RL.plot_cost()
from maze_env import Maze
from RL_brain import Agent
from RL_brain import myAgent
import time
import matplotlib.pyplot as plt
from pylab import *

if __name__ == "__main__":
    ### START CODE HERE ###
    # This is an agent with a random policy. You can learn how to interact with the environment
    # through the code below, then delete it and write your own code.
    graph_episode = []
    graph_reward = []
    env = Maze()
    agent = myAgent(actions=list(range(env.n_actions)))
    for episode in range(500):
        s = env.reset()
        episode_reward = 0
        agent.epoch_num += 1
        while True:
            # env.render()  # You can comment out all render() calls to turn off the graphical
            #               # interface and speed up training.
            a = agent.choose_action(s)
            s_, r, done = env.step(a)
            q_dict = agent.update(s, a, s_, r)
            episode_reward += r
            s = s_
            agent.has_been_to_this_state[float((s[0] + s[2]) / 2), float((s[1] + s[3]) / 2)] = True
            if done:
                # env.render()
                # time.sleep(0.5)
            # swap observation
            observation = observation_

            # break while loop when end of this episode
            if done:
                break
            step += 1

    # end of game
    print('run over')
    time_elapsed = time.time() - since  # time the run
    print('The run_maze code ran for {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))


if __name__ == "__main__":
    env = Maze()
    RL = DeepQNetwork(env.n_actions, env.n_features,
                      learning_rate=0.01,
                      reward_decay=0.9,
                      e_greedy=0.9,
                      replace_target_iter=200,
                      memory_size=2000,
                      # output_graph=True
                      )
    run_maze()
    RL.plot_cost()
from RL_brain import DeepQNetwork
from maze_env import Maze
import tensorflow as tf
from tqdm import tqdm
import numpy as np
from knn_reward_model import KNN_Predict

MEMORY_SIZE = 20000
epsilon = 200

sess = tf.Session()
np.random.seed(1)
tf.set_random_seed(1)

env = Maze()
with tf.variable_scope('natural_DQN'):
    RL_natural = DeepQNetwork(n_actions=env.n_actions,
                              n_features=env.n_features,
                              memory_size=MEMORY_SIZE,
                              replace_target_iter=200,
                              # e_greedy_increment=0.0001
                              )

knn = KNN_Predict(3)
times = []
real = []


def onehot(action):
    x = [0, 0, 0, 0]
    x[action] = 1
    return x
def plot_reward_movements():
    plt.figure()

    plt.subplot(2, 1, 1)
    plt.plot(episodes, movements)
    plt.xlabel("Episode")
    plt.ylabel("# Movements")

    plt.subplot(2, 1, 2)
    plt.step(episodes, rewards)
    plt.xlabel("Episode")
    plt.ylabel("Reward")

    plt.savefig("rewards_movements_q_learn.png")
    plt.show()


if __name__ == "__main__":
    # Create maze environment
    env = Maze()  # ToDo: instantiate Maze class

    # Create Q-learning agent
    q_learning_agent = QLearningTable(actions=list(range(env.n_actions)))  # ToDo: instantiate QLearningTable class

    # Call the run_experiment() function once after the given time in milliseconds.
    env.window.after(10, run_experiment)

    # The infinite loop used to run the application: wait for an event to occur and process it
    # until the window is closed.
    env.window.mainloop()
from maze_env import Maze
from RL_brain import QL
import time

# Parameters
EPSILON = 0.9        # Greedy Policy
ALPHA = 0.1          # Learning Rate
LAMBDA = 0.9         # Discount Factor
MAX_EPISODE = 50
MAZE_SIZE = 5
TRAP_SET = [[0, 1], [2, 2], [3, 3], [3, 0]]
TREASURE_SET = [[4, 4]]

env = Maze(MAZE_SIZE)
RL = QL(MAZE_SIZE, EPSILON, LAMBDA, ALPHA)


def update():
    for episode in range(MAX_EPISODE):
        O = env.reset()  # return initial observation
        step_number = 0
        env.set_trap(TRAP_SET)
        env.set_treasure(TREASURE_SET)
        is_terminated = False
        env.render()
        while not is_terminated:
            A = RL.choose_action(O)
            # print(O)
            OO, R, is_terminated = env.step(A)
            # print(O, OO)
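# Hedged sketch (the real QL class lives in RL_brain and may differ; the helper name is an
# assumption for illustration): the epsilon-greedy rule that choose_action(O) above typically
# implements with EPSILON = 0.9 — exploit the best-known action 90% of the time, otherwise
# explore uniformly at random.
import random


def epsilon_greedy(q_values, epsilon=EPSILON):
    """q_values: dict mapping action -> estimated value for the current state."""
    if random.random() < epsilon:
        return max(q_values, key=q_values.get)        # exploit
    return random.choice(list(q_values.keys()))       # explore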
        print('failed!')
        env.destroy()
    else:
        observation = env.reset()


def on_release(key):
    print('{0} release'.format(key))
    if key == Key.esc:
        # Stop listener
        return False


def get_action():
    global observation
    observation = env.reset()
    listener = Listener(on_press=on_press, on_release=on_release)
    listener.start()


if __name__ == "__main__":
    # maze game
    env = Maze()
    env.after(100, get_action)
    print('env started!')
    env.mainloop()
    print('env ended!')
    # Collect events until released
class Controller:
    def __init__(self):
        self.env = Maze(self)
        self.env.mainloop()

    def main(self, tubarao, foca, peixe, alga, calorias):
        qtd_tub = int(tubarao)
        qtd_foca = int(foca)
        qtd_peixe = int(peixe)
        qtd_alga = int(alga)
        cal = int(calorias)
        threads = []
        MAX_THREAD = qtd_foca + qtd_peixe + qtd_tub + qtd_alga
        nome = None
        self.env.p.set(MAX_THREAD)
        semaforo = threading.Semaphore(MAX_THREAD)
        semaforo_validation = threading.Semaphore(0)
        teste = threading.Lock()
        thread_timer = TimerThread(596, self.env)
        thread_timer.start()
        next_id = 0

        # create the requested number of sharks
        for i in range(qtd_tub):
            tipo = 4
            thread = IndividuoThread(next_id, cal, nome, tipo, semaforo_validation,
                                     self.env, semaforo, teste=teste, max_t=MAX_THREAD)
            thread.start()
            threads.append(thread)
            next_id += 1

        # create the requested number of seals
        for i in range(qtd_foca):
            tipo = 3
            thread = IndividuoThread(next_id, cal, nome, tipo, semaforo_validation,
                                     self.env, semaforo, teste=teste, max_t=MAX_THREAD)
            thread.start()
            threads.append(thread)
            next_id += 1

        # create the requested number of fish
        for i in range(qtd_peixe):
            tipo = 2
            thread = IndividuoThread(next_id, cal, nome, tipo, semaforo_validation,
                                     self.env, semaforo, teste=teste, max_t=MAX_THREAD)
            thread.start()
            threads.append(thread)
            next_id += 1

        # create the requested number of algae
        for i in range(qtd_alga):
            tipo = 1
            thread = IndividuoThread(next_id, cal, nome, tipo, semaforo_validation,
                                     self.env, semaforo, teste=teste, max_t=MAX_THREAD)
            thread.start()
            threads.append(thread)
            next_id += 1

        thread_validacao = ValidationThread(133, threads, semaforo_validation, semaforo,
                                            MAX_THREAD, self.env)
        thread_validacao.start()
    def __init__(self):
        self.env = Maze(self)
        self.env.mainloop()
def main():
    env = Maze()
    agent = QLearning(action_space=list(range(env.n_action)))
    update(agent, env)
    env.mainloop()
                break
            step += 1

    # end of game
    print('game over')
    env.destroy()


if __name__ == "__main__":
    time_string = utils.get_string_time()
    print(time_string, " the test begins")
    start_time = time.clock()

    # maze game
    env = Maze()
    RL = DeepQNetwork(
        env.n_actions,
        env.n_features,
        learning_rate=0.01,
        reward_decay=0.9,
        e_greedy=0.98,                 # stop exploring (almost) entirely at the end
        replace_target_iter=300,
        memory_size=4800,
        e_greedy_origin=0.5,
        e_greedy_increment=0.0001,     # how fast epsilon grows
        model_load=True,
        model_load_dir="save/2018-3-30-22:17/model.ckpt",
        model_save_dir="save/{time}/model.ckpt".format(time=time_string),
        output_graph=False,
                break
            step += 1

    # end of game
    print('game over')
    env.destroy()


if __name__ == "__main__":
    time_string = utils.get_string_time()
    print(time_string, " the test begins")
    start_time = time.clock()

    # maze game
    env = Maze()
    RL = DeepQNetwork(
        env.n_actions,
        env.n_features,
        learning_rate=0.02,
        reward_decay=0.9,
        e_greedy=0.98,                 # stop exploring (almost) entirely at the end
        replace_target_iter=300,
        memory_size=60000,
        e_greedy_origin=0.7,
        e_greedy_increment=0.00005,    # how fast epsilon grows
        model_load=False,
        model_load_dir="save/2018-4-4-16:50/model.ckpt",
        model_save_dir="save/{time}/model.ckpt".format(time=time_string),
        output_graph=True,
def modify_path(routeX, routeY):
    isFind, i1, i2 = find_same(routeX, routeY)
    while isFind:
        routeX = np.append(routeX[:i1], routeX[i2:])
        routeY = np.append(routeY[:i1], routeY[i2:])
        # print(routeX)
        # print(routeY)
        isFind, i1, i2 = find_same(routeX, routeY)
    return routeX, routeY


if __name__ == "__main__":
    # maze game
    env = Maze()
    model_path = os.path.dirname(os.path.abspath(__file__)) + "/model20171219.ckpt"
    RL = DeepQNetwork(
        env.n_actions,
        env.n_features,
        model_path,
        learning_rate=0.01,
        reward_decay=0.9,
        e_greedy=0.9,
        replace_target_iter=200,
        memory_size=2000,
        # output_graph=True
    )
    env.after(100, get_route_maze)
    env.mainloop()
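# Hedged sketch (find_same is defined elsewhere in the original project; this is only an assumed
# behaviour): return the first pair of indices i1 < i2 at which the route revisits the same cell,
# so that modify_path() above can splice out the loop between them.
def find_same(routeX, routeY):
    seen = {}
    for i, (x, y) in enumerate(zip(routeX, routeY)):
        if (x, y) in seen:
            return True, seen[(x, y)], i   # loop detected between seen[(x, y)] and i
        seen[(x, y)] = i
    return False, -1, -1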
print("新观测值,即下一位置:\n", observation_) print("奖励:\n", reward) print("done?:\n", done) print("Q表:\n", RL.q_table) print("-" * 50 + "\n") observation = observation_ # 将新观测值作为下一次的初始观测值 # 若下地狱或上天堂,则结束循环 if done: break print("~" * 50) print("第%d回合结束" % (episode + 1)) print("~" * 50 + "\n") # 游戏结束,销毁环境 print('Game Over') env.destroy() RL.q_table.to_csv( './model.csv' ) # 【可选】将最终的Q表存储下来,Q表即训练之后的模型;(本想将模型保存下来以便下次直接使用,但pandas的存储与读取遇到小问题,非重点,故暂搁置) if __name__ == "__main__": env = Maze() # 创建迷宫环境 RL = QLearningTable(actions=list(range( env.n_actions))) # 声明RL,即强化学习的行动者,暂称机器人;参数actions=[0, 1, 2, 3] env.after(100, update) # tkinter的after函数;每100ms调用一次update函数 env.mainloop() # 启动tkinter,即以窗口形式显示环境
        while True:
            # fresh env
            env.render()

            # RL choose action based on observation
            action = RL.choose_action(str(observation))  # choose an action

            # RL take action and get next observation and reward
            observation_, reward, done = env.step(action)

            # RL learn from this transition
            RL.learn(str(observation), action, reward, str(observation_))

            # swap observation
            observation = observation_

            # break while loop when end of this episode
            if done:
                break

    # end of game
    print('game over')
    env.destroy()


if __name__ == "__main__":
    env = Maze()  # define the environment first
    RL = QLearningTable(actions=list(range(env.n_actions)))  # provide the list of actions

    env.after(100, update)
    env.mainloop()
from maze_env import Maze
from RL_brain import Agent
import time
# from uilts import view

actions = ["up", "down", "right", "left"]

if __name__ == "__main__":
    ### START CODE HERE ###
    # This is an agent with a random policy. You can learn how to interact with the environment
    # through the code below, then delete it and write your own code.
    env = Maze()
    agent = Agent(actions=list(range(env.n_actions)))
    for episode in range(50):
        s = env.reset()
        episode_reward = 0
        while True:
            env.render()  # You can comment out all render() calls to turn off the graphical
                          # interface and speed up training.
            # time.sleep(0.1)
            # view(agent.q_table)
            a = agent.choose_action(s)
            s_, r, done = env.step(a)
            agent.add_observation(s, a, r, s_, done)
            episode_reward += r
            s = s_
            RL.learn(str(observation), action, reward, str(observation_))
            record(str(observation), action, reward, str(observation_), done,
                   RL.q_table, int(int(episode / 10000 + 1) * 10000))
            observation = copy.deepcopy(observation_)
            if done:
                break


def trainWithTrajectory(trajectory, episode):
    for item in trajectory:
        observation, action, reward, observation_ = item
        RL.learn(observation, action, reward, observation_)
        record(observation, action, reward, observation_, True,
               RL.q_table, int(int(episode / 10000 + 1) * 10000))


if __name__ == '__main__':
    task_num = [3, ]
    for taskNum in task_num:
        parameter["taskNum"] = taskNum
        from task import *
        task = createTask()

        # Q-learning
        env = Maze(task)
        RL = QLearningTable(actions=list(range(env.n_actions)))
        update1(env, RL, 5)
        RL.q_table.to_csv("Q_learning Table" + str(taskNum) + "_Insert")
        (actor_loss + critic_loss).backward()
        self.actor_critic.optimizer.step()


from maze_env import Maze
import os
import time
import matplotlib.pyplot as plt
import pickle

np.random.seed(42)

if __name__ == '__main__':
    env = Maze(height=21, width=21, detection_range=1, obstacle_occupancy_prob=0.5)
    agent = ACAgent(alpha=0.00001, gamma=0.999, input_dims=[529], n_actions=8,
                    layer1_size=2048, layer2_size=512)
    score_history = []
    avg_score_history = []
    epsiodes = 5000000
    max_steps = 23 * 23
            # RL learn from this transition
            RL.learn(str(observation), action, reward, str(observation_), _action)
            action = _action

            # swap observation
            observation = observation_

            # break while loop when end of this episode
            if done:
                break

    # end of game
    print('game over')
    env.destroy()


if __name__ == "__main__":
    env = Maze()
    # RL = QLearningTable(actions=list(range(env.n_actions)))
    # RL = SarsaTable(actions=list(range(env.n_actions)))
    RL = SarsaLambdaTable(actions=list(range(env.n_actions)))

    # run the callback function after a short delay
    # env.after(100, update)
    # update()
    update_sarsa()
    print(RL.q_table)
    env.mainloop()
            rew.append(R)
            RL.learn(str(stade), action, reward, str(observation_))

            # pass the next state on to the next iteration
            stade = copy.deepcopy(observation_)

            # the episode ends once the agent falls into hell or reaches heaven
            if done:
                break

    plt.plot(np.arange(len(rew)), rew)
    plt.ylabel('Reward', fontsize=14)
    plt.xlabel('Training steps', fontsize=14)
    plt.show()


if __name__ == "__main__":
    # define the environment env and the RL method
    env = Maze()
    RL = QLearningTable(actions=list(range(env.n_actions)))
    # start visualizing the environment
    update()

    for edgeuser in [5, 10, 15, 20, 25, 30, 35, 40, 45]:
        # random baseline
        t = 0
        for i in range(10):
            G = topology()  # task graph
            E1 = E
            t += Baseline(user, G, E1, KE, edgeuser)
        random.append(t / 10)

        # exhaustive search
        H = topology()  # task graph
        E2 = E
    # end of game
    print('game over for QV-learning')
    env.destroy()


if __name__ == "__main__":
    plot_y = list(range(EPIS))
    fig, ax = plt.subplots()

    # Data for plotting
    ax.set(xlabel='episode (s)', ylabel='Total Rewards',
           title='Total rewards at each episode')
    ax.grid()

    # QV-learning
    env = Maze()
    RL = QVLearningTable(actions=list(range(env.n_actions)), learning_rate=0.7, lr_v=0.5)
    env.after(100, update_QV_learning)
    env.mainloop()
    ax.plot(range(EPIS), plot_y, label='QV-learning')

    # Q-learning
    env = Maze()
    RL = QLearningTable(actions=list(range(env.n_actions)), learning_rate=0.5, e_greedy=0.85)
    env.after(100, update_Q_learning)
    env.mainloop()
    ax.plot(range(EPIS), plot_y, label='Q-learning')
            s1, reward, done = env.step(a0)
            a1 = agent.get_action(s1, episode)
            agent.learning(s0, a0, reward, s1, a1)
            s0 = s1
            if done:
                if reward < 0:
                    print("\033[0;31;40m\t{}\033[0m".format(path))
                    fail += 1
                else:
                    print("\033[0;32;40m\t{}\033[0m".format(path))
                    success += 1
                break

    print('game over')
    print('success:', success)
    print('fail:', fail)
    env.destroy()


if __name__ == "__main__":
    env = Maze()
    # agent = SARSA()
    # agent = SARSA_LAMBDA()
    # env.after(100, on_policy)
    agent = Q_learning()
    env.after(100, q_learning)
    env.mainloop()
    episode = max(plot_episode)
    env.destroy()
    env.mainloop()
    plot_episode.clear()
    # This guards against a subtle bug: when this function is imported and run inside a for loop,
    # the data in plot_episode accumulates across iterations, so we must clear it at the end.
    return running_time, episode, plot_sum_reward


# This 'if' check matters: if this file is imported elsewhere rather than run directly,
# the content below will not be executed.
if __name__ == "__main__":
    print('-----------Start-------------')
    env = Maze(height=10, width=10)
    Sarsa_brain_ = Model.SARSA(greedy_rate=0.9, learning_rate=0.01, reward_decay=0.9)

    # Two ways to evaluate the algorithm:
    #   - the time it takes to complete the appointed number of episodes
    #   - the number of episodes it takes to converge (judged by repeated results: if the reward is
    #     the same in 10 consecutive episodes, we roughly consider the algorithm to have converged)
    # Example for judge_method == 'numbers of repeated steps':
    # update(judge_number=100, judge_method='numbers of repeated steps')
    T1 = time.perf_counter()
    # update(judge_number=1000, judge_method='sum of episodes', delay_time=0.00)
    update(judge_number=10, judge_method='repeated steps', delay_time=0.00)
    T2 = time.perf_counter()
    print('Time spent: %s ms' % ((T2 - T1) * 1000))
    result_display.result_plot(x=plot_episode, y=plot_sum_reward,
                               x_name='Episode', y_name='Sum of Reward',
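# Hedged sketch (not from the original file): one way the "repeated rewards" convergence test
# described in the comments above could be implemented. The window size of 10 and the helper
# name are assumptions for illustration only.
def has_converged(episode_rewards, window=10):
    """Return True once the last `window` episode rewards are all identical."""
    if len(episode_rewards) < window:
        return False
    recent = episode_rewards[-window:]
    return all(r == recent[0] for r in recent)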
            # swap observation
            observation = observation_

            # break while loop when end of this episode
            if done:
                break
            step += 1

    # end of game
    print('game over')
    env.destroy()


if __name__ == "__main__":
    # maze game
    env = Maze()
    env = gym.make('DoomDefendLine-v0')
    env.reset()
    RL = DeepQNetwork(2, 4,
                      learning_rate=0.03,
                      reward_decay=0.7,
                      e_greedy=0.9,
                      replace_target_iter=200,
                      memory_size=2000,
                      e_greedy_increment=0.001,
                      # output_graph=True
                      )
    run_maze()
import os
from maze_env import Maze
import time
import numpy as np
import random

maz = Maze()
# only for interaction:
# maz.render()

# initialize matrices with dimensions taken from the maze
dim = maz.MAZE_Limit
np.set_printoptions(suppress=True)

# action options: up, down, left, right
a = [(0, 1), (0, -1), (1, 0), (-1, 0)]

# create environment
state_value = np.tile(0, dim)
counter = np.tile(0, dim)

# [DONE] first-visit or every-visit state-value counting
visit = "first"  # "first" or "every"
# [DONE] discount factor
gamma = 0.9
# [DONE] MC
# [TO-DO] handle blocked cells
# [TO-DO] random start
n_episode = 10000

for n in range(0, n_episode):
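# Hedged sketch (not part of the original script): what the first-visit Monte Carlo update for one
# finished episode could look like, given a trajectory of (state, reward) pairs. The helper name
# and trajectory format are assumptions; state_value, counter, gamma and visit refer to the
# globals above, with state_value/counter assumed to be float arrays (e.g. np.zeros(dim)).
def mc_update(trajectory):
    G = 0.0
    returns = []
    # walk the episode backwards so G accumulates the discounted return from each state onwards
    for state, reward in reversed(trajectory):
        G = reward + gamma * G
        returns.append((state, G))
    returns.reverse()

    seen = set()
    for state, G in returns:
        if visit == "first" and state in seen:
            continue
        seen.add(state)
        counter[state] += 1
        # incremental average of the returns observed for this state
        state_value[state] += (G - state_value[state]) / counter[state]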
main function """ #next_stat, reward, done, stat, action = q_learning.observation #Q = q_learning.Q ## if __name__ == "__main__": N_episode = 50 env = Maze() q_learning = table_qlearning( n_actions = 4, epsilon=0.8, ) for episode in range(N_episode): print("starting an new episode\n") stat = env.reset() i_step = 0 R =[] while True: i_step += 1 env.render()
            )
            # Starting after 200 steps, learn once every 5 steps (5 is the learning frequency,
            # learning_freq) to update the evaluate-Q network parameters; the target-Q network
            # parameters are copied once every 200 learning steps; the memory holds the most
            # recent 2000 transitions, so a batch may be sampled across episodes.

            # swap observation
            observation = observation_

            # break while loop when end of this episode
            if done:
                break
            step += 1  # increment step once per environment step

    # end of game
    print('game over')
    env.destroy()


if __name__ == "__main__":
    # maze game
    env = Maze()
    RL = DeepQNetwork(
        env.n_actions,
        env.n_features,
        learning_rate=0.01,
        reward_decay=0.9,
        e_greedy=0.9,
        replace_target_iter=200,   # update the target-Q parameters every 200 learning steps
        memory_size=2000,          # memory capacity of 2000: batches are sampled from the most recent 2000 transitions
        # output_graph=True
    )
    env.after(100, run_maze)
    env.mainloop()
    RL.plot_cost()
                print(episode, 'episode,', episode_step, 'step_counter, ended in failure')
            elif reward == 1:
                print(episode, 'episode,', episode_step, 'step_counter, ended successfully')
            # print(RL.policy_net.state_dict()['fc1.bias'])

    if not learn:
        print('testing finished')
    else:
        print('learning finished')
    env.destroy()
    return RL


if __name__ == '__main__':
    env = Maze('train')
    n_actions = env.n_actions
    n_features = env.n_features
    RL = DQN(n_actions, n_features,
             memory_size=256,
             batch_size=8,
             learning_rate=0.01,
             gamma=0.9,
             e_greedy=0.9,
             e_greedy_increment=None,
             ddqn=True,
             dueling=True,
             target_update_step=10,
             output_graph=False)
        observation = env.reset()
        action = RL.choose_action(str(observation))
        num_steps = 0
        while True:
            num_steps += 1

            # fresh env
            env.render()

            observation_, reward, done = env.step(action)

            # choose the next action based on the next observation (SARSA is on-policy)
            action_ = RL.choose_action(str(observation_))

            RL.learn(str(observation), action, reward, str(observation_), action_)

            observation = observation_
            action = action_

            if done:
                print('Episode %d, num_steps=%d' % (episode + 1, num_steps))
                break

    print('game over')
    env.destroy()


if __name__ == '__main__':
    env = Maze()
    RL = SarsaTable(actions=list(range(env.n_actions)))

    env.after(100, update)
    env.mainloop()
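# Hedged sketch (the actual SarsaTable lives in RL_brain and may differ): the on-policy update
# that learn(s, a, r, s_, a_) above is expected to perform. Unlike Q-learning, the target uses
# the action a_ the policy actually chose in s_, not the greedy maximum.
def sarsa_update(q_table, s, a, r, s_, a_, lr=0.01, gamma=0.9):
    """Q(s, a) <- Q(s, a) + lr * (r + gamma * Q(s_, a_) - Q(s, a)); q_table is a dict of dicts."""
    q_predict = q_table[s][a]
    q_target = r if s_ == 'terminal' else r + gamma * q_table[s_][a_]
    q_table[s][a] += lr * (q_target - q_predict)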
class CustomGym(Env): """The main OpenAI Gym class. It encapsulates an environment with arbitrary behind-the-scenes dynamics. An environment can be partially or fully observed. The main API methods that users of this class need to know are: step reset render close seed And set the following attributes: action_space: The Space object corresponding to valid actions observation_space: The Space object corresponding to valid observations reward_range: A tuple corresponding to the min and max possible rewards Note: a default reward range set to [-inf,+inf] already exists. Set it if you want a narrower range. The methods are accessed publicly as "step", "reset", etc... """ def __init__( self, agentXY, goalXY, walls=[], pits=[], title='Maze', ): super(CustomGym, self).__init__() self.env = Maze(agentXY, goalXY, walls, pits, title) self.title = title self.action_space = spaces.Discrete(self.env.n_actions) self.observation_space = spaces.Box(low=0, high=0, shape=(4, ), dtype=np.float32) self.rewards = [[]] self.variance = [] self.median = [] def step(self, action): """Run one timestep of the environment's dynamics. When end of episode is reached, you are responsible for calling `reset()` to reset this environment's state. Accepts an action and returns a tuple (observation, reward, done, info). Args: action (object): an action provided by the agent Returns: observation (object): agent's observation of the current environment reward (float) : amount of reward returned after previous action done (bool): whether the episode has ended, in which case further step() calls will return undefined results info (dict): contains auxiliary diagnostic information (helpful for debugging, and sometimes learning) """ s_, reward, done = self.env.step(action) self.rewards[-1].append(reward) if done: self.variance.append(np.var(self.rewards[-1])) self.median.append(np.median(self.rewards[-1])) self.rewards.append([]) return s_, reward, done, {} def render(self, mode='human'): self.env.render() def reset(self, value=1, resetAgent=True): return self.env.reset() def seed(self, seed=None): """Sets the seed for this env's random number generator(s). Note: Some environments use multiple pseudorandom number generators. We want to capture all such seeds used in order to ensure that there aren't accidental correlations between multiple generators. Returns: list<bigint>: Returns the list of seeds used in this env's random number generators. The first value in the list should be the "main" seed, or the value which a reproducer should pass to 'seed'. Often, the main seed equals the provided 'seed', but this won't be true if seed=None, for example. """ np.random.seed(10) random.seed(10) return def save_csv(self): with open(f"./data/{self.title}_rewards_{time.time()}", "w+") as my_csv: csvWriter = csv.writer(my_csv, delimiter=',') csvWriter.writerows(self.rewards) with open(f"./data/{self.title}_variance_{time.time()}", "w+") as my_csv: csvWriter = csv.writer(my_csv, delimiter=',') for var in self.variance: csvWriter.writerow([var]) with open(f"./data/{self.title}_median_{time.time()}", "w+") as my_csv: csvWriter = csv.writer(my_csv, delimiter=',') for med in self.median: csvWriter.writerow([med]) def destroy(self): self.env.destroy()
        while True:
            # fresh env
            env.render()

            # RL choose action based on observation
            action = RL.choose_action(str(observation))

            # RL take action and get next observation and reward
            observation_, reward, done = env.step(action)

            # RL learn from this transition
            RL.learn(str(observation), action, reward, str(observation_))

            # swap observation
            observation = observation_

            # break while loop when end of this episode
            if done:
                break

    # end of game
    print('game over')
    env.destroy()


if __name__ == "__main__":
    env = Maze()
    RL = QLearningTable(actions=list(range(env.n_actions)))

    env.after(100, update)
    env.mainloop()
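# Hedged sketch (the actual QLearningTable lives in RL_brain and may differ): the off-policy
# update that learn(s, a, r, s_) above is expected to perform. The target bootstraps from the
# greedy maximum over next-state values rather than the action actually taken next.
def q_learning_update(q_table, s, a, r, s_, lr=0.01, gamma=0.9):
    """Q(s, a) <- Q(s, a) + lr * (r + gamma * max_a' Q(s_, a') - Q(s, a)); q_table is a dict of dicts."""
    q_predict = q_table[s][a]
    q_target = r if s_ == 'terminal' else r + gamma * max(q_table[s_].values())
    q_table[s][a] += lr * (q_target - q_predict)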
# Task 3
# wall_shape = np.array([[7, 4], [7, 3], [6, 3], [6, 2], [5, 2], [4, 2], [3, 2],
#                        [3, 3], [3, 4], [3, 5], [3, 6], [4, 6], [5, 6]])
# pits = np.array([[1, 3], [0, 5], [7, 7]])

experiments = []

# # alg0 (Async)
# env0 = Maze(agentXY, goalXY, wall_shape, pits)
# RL0 = rlalg0(actions=list(range(env0.n_actions)))
# data0 = {}
# env0.after(10, update(env0, RL0, data0, episodes))
# env0.mainloop()
# experiments = [(env0, RL0, data0)]

# alg2 (SARSA)
env2 = Maze(agentXY, goalXY, wall_shape, pits)
RL2 = rlalg2(actions=list(range(env2.n_actions)))
data2 = {}
env2.after(10, update(env2, RL2, data2, episodes))
env2.mainloop()
experiments.append((env2, RL2, data2))

# alg1 (Q-Learning)
env1 = Maze(agentXY, goalXY, wall_shape, pits)
RL1 = rlalg1(actions=list(range(env1.n_actions)))
data1 = {}
env1.after(10, update(env1, RL1, data1, episodes))
env1.mainloop()
experiments.append((env1, RL1, data1))

# alg4 (Expected Sarsa)