def update(agent: RL, env: Maze):
    """Overall learning flow.

    Parameters
    ----------
    agent : RL
        Agent that interacts with the environment.
    env : Maze
        Maze environment that returns feedback for each state and action.
    """
    episodes = 100
    for episode in range(episodes):
        state = env.reset()
        steps = 0
        while True:
            env.render()
            action = agent.choose_action(str(state))
            n_state, reward, done = env.step(action)
            agent.learn(str(state), action, reward, str(n_state))
            state = n_state
            steps += 1
            if done:
                break
        print(f"Episode [{episode+1:03d}/{episodes:03d}]: {steps}")
    print('game over')
    env.destroy()
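# A minimal usage sketch for the update() defined above (the agent/env
# version, not the judge_number-based update used by the test harnesses
# below). The Maze(height=10, width=10) constructor and Model.Qlearning
# parameters are taken from those harnesses; this is an illustration, not the
# repository's own entry point.
if __name__ == '__main__':
    env = Maze(height=10, width=10)
    agent = Model.Qlearning(greedy_rate=0.9, learning_rate=0.01, reward_decay=0.9)
    update(agent, env)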
def m_test(judge_number):
    # Test harness for Monte Carlo learning, meant to be imported and timed
    # from another file in the same way as s_test and q_test below.
    global MonteCarlo_brain_
    global env
    env = Maze(height=10, width=10)
    MonteCarlo_brain_ = Model.Monte(greedy_rate=0.5, learning_rate=0.9, reward_decay=0.9)
    T1 = time.perf_counter()
    update(judge_number=judge_number, judge_method='repeated steps', delay_time=0.00)
    T2 = time.perf_counter()
    running_time = (T2 - T1) * 1000
    episode = max(plot_episode)
    env.destroy()
    env.mainloop()
    return running_time, episode, plot_sum_reward
def s_test(judge_number):
    # This function is designed to be imported into another file to carry out
    # tests over multiple Sarsa learning cases, e.g. running Sarsa learning 20
    # times to get an average episode number and running time.
    global Sarsa_brain_
    global env
    env = Maze(height=10, width=10)
    Sarsa_brain_ = Model.SARSA(greedy_rate=0.9, learning_rate=0.01, reward_decay=0.9)
    T1 = time.perf_counter()
    update(judge_number=judge_number, judge_method='repeated steps', delay_time=0.00)
    T2 = time.perf_counter()
    running_time = (T2 - T1) * 1000
    episode = max(plot_episode)
    env.destroy()
    env.mainloop()
    plot_episode.clear()
    # This clear guards against a subtle bug: when this function is imported
    # and called inside a for loop, data from previous runs would otherwise
    # keep accumulating in plot_episode, so it must be cleared at the end.
    return running_time, episode, plot_sum_reward
def q_test(judge_number):
    global Q_brain_
    global env
    env = Maze(height=10, width=10)
    Q_brain_ = Model.Qlearning(greedy_rate=0.9, learning_rate=0.01, reward_decay=0.9)
    T1 = time.perf_counter()
    update(judge_number=judge_number, judge_method='repeated steps', delay_time=0.00)
    T2 = time.perf_counter()
    running_time = (T2 - T1) * 1000
    episode = max(plot_episode)
    env.destroy()
    env.mainloop()
    plot_episode.clear()
    # Same guard as in s_test: clear plot_episode so repeated calls in a for
    # loop do not accumulate data from earlier runs.
    return running_time, episode, plot_sum_reward
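# A sketch of the multi-run usage the comments above describe: import the test
# harnesses from another file and average their results over repeated runs.
# The module name rl_maze_tests and the judge_number value are hypothetical
# placeholders for illustration only.
from statistics import mean

from rl_maze_tests import q_test, s_test

def average_runs(test_fn, judge_number, runs=20):
    times, episodes = [], []
    for _ in range(runs):
        running_time, episode, _ = test_fn(judge_number)
        times.append(running_time)
        episodes.append(episode)
    return mean(times), mean(episodes)

if __name__ == '__main__':
    avg_time, avg_episode = average_runs(q_test, judge_number=5)
    print(f"Q-learning: {avg_time:.1f} ms, {avg_episode:.1f} episodes on average")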
        # --- body of the step loop inside each training episode; the
        #     enclosing episode/step loop headers are not shown in this excerpt ---
        if (step > 20) and (i_step % 10 == 0):
            q_learning.update()
            print("learning")
        if (step > 20) and (i_step % 20 == 0):
            q_learning.update_target_net()
        # add reward
        R.append(reward)
        if done:
            break

    # record the episode's total reward and step count once it ends
    q_learning.add_record(sum(R), i_step)

print("Done !!!")
env.destroy()

# Plot steps per episode and the training loss history side by side.
steps = [q_learning.reward_his[x][1] for x in range(episode)]
plt.figure()
plt1 = plt.subplot(121)
plt1.plot(range(len(steps)), np.array(steps))
plt2 = plt.subplot(122)
plt2.plot(range(len(q_learning.loss_his)), q_learning.loss_his)
plt.show()
class CustomGym(Env):
    """The main OpenAI Gym class.

    It encapsulates an environment with arbitrary behind-the-scenes dynamics.
    An environment can be partially or fully observed.

    The main API methods that users of this class need to know are:

        step
        reset
        render
        close
        seed

    And set the following attributes:

        action_space: The Space object corresponding to valid actions
        observation_space: The Space object corresponding to valid observations
        reward_range: A tuple corresponding to the min and max possible rewards

    Note: a default reward range set to [-inf, +inf] already exists. Set it if
    you want a narrower range.

    The methods are accessed publicly as "step", "reset", etc...
    """

    def __init__(self, agentXY, goalXY, walls=[], pits=[], title='Maze'):
        super(CustomGym, self).__init__()
        self.env = Maze(agentXY, goalXY, walls, pits, title)
        self.title = title
        self.action_space = spaces.Discrete(self.env.n_actions)
        self.observation_space = spaces.Box(low=0, high=0, shape=(4,), dtype=np.float32)
        self.rewards = [[]]
        self.variance = []
        self.median = []

    def step(self, action):
        """Run one timestep of the environment's dynamics.

        When end of episode is reached, you are responsible for calling
        `reset()` to reset this environment's state.

        Accepts an action and returns a tuple (observation, reward, done, info).

        Args:
            action (object): an action provided by the agent

        Returns:
            observation (object): agent's observation of the current environment
            reward (float): amount of reward returned after previous action
            done (bool): whether the episode has ended, in which case further
                step() calls will return undefined results
            info (dict): contains auxiliary diagnostic information (helpful for
                debugging, and sometimes learning)
        """
        s_, reward, done = self.env.step(action)
        self.rewards[-1].append(reward)
        if done:
            self.variance.append(np.var(self.rewards[-1]))
            self.median.append(np.median(self.rewards[-1]))
            self.rewards.append([])
        return s_, reward, done, {}

    def render(self, mode='human'):
        self.env.render()

    def reset(self, value=1, resetAgent=True):
        return self.env.reset()

    def seed(self, seed=None):
        """Sets the seed for this env's random number generator(s).

        Note:
            Some environments use multiple pseudorandom number generators.
            We want to capture all such seeds used in order to ensure that
            there aren't accidental correlations between multiple generators.

        Returns:
            list<bigint>: Returns the list of seeds used in this env's random
            number generators. The first value in the list should be the
            "main" seed, or the value which a reproducer should pass to
            'seed'. Often, the main seed equals the provided 'seed', but this
            won't be true if seed=None, for example.
        """
        np.random.seed(10)
        random.seed(10)
        return

    def save_csv(self):
        with open(f"./data/{self.title}_rewards_{time.time()}", "w+") as my_csv:
            csvWriter = csv.writer(my_csv, delimiter=',')
            csvWriter.writerows(self.rewards)
        with open(f"./data/{self.title}_variance_{time.time()}", "w+") as my_csv:
            csvWriter = csv.writer(my_csv, delimiter=',')
            for var in self.variance:
                csvWriter.writerow([var])
        with open(f"./data/{self.title}_median_{time.time()}", "w+") as my_csv:
            csvWriter = csv.writer(my_csv, delimiter=',')
            for med in self.median:
                csvWriter.writerow([med])

    def destroy(self):
        self.env.destroy()
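# A minimal usage sketch for CustomGym with a random policy. The agent/goal
# coordinate format and values are arbitrary illustration choices (the Maze
# constructor only shows that agentXY and goalXY are expected), and save_csv()
# assumes a ./data/ directory already exists.
if __name__ == '__main__':
    gym_env = CustomGym(agentXY=(0, 0), goalXY=(4, 4), title='Maze demo')
    obs = gym_env.reset()
    done = False
    while not done:
        gym_env.render()
        action = gym_env.action_space.sample()          # random action for the demo
        obs, reward, done, info = gym_env.step(action)  # Gym-style 4-tuple return
    gym_env.save_csv()   # writes per-episode rewards, variance and median CSVs
    gym_env.destroy()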