# Assumed module-level context (not shown in this extract): import random,
# import numpy as np, from tqdm import tqdm, and a helper get_index(state,
# observation_space) that maps a state to its row index in the Q-table.
def get_q_table(self, episodes=100):
    """Train with SARSA; return the learned Q-table and per-episode rewards."""
    rewards_list = np.zeros(episodes)
    print("SARSA")
    for i in tqdm(range(1, episodes)):
        state = self.env.reset
        state_index = get_index(state, self.env.observation_space)
        # Choose the first action epsilon-greedily.
        if random.uniform(0, 1) < self.epsilon:
            action_index = np.random.randint(len(self.env.action_space))  # explore
        else:
            action_index = np.argmax(self.q_table[state_index])  # exploit learned values
        cum_rewards = 0
        done = False  # terminal-state flag
        while not done:
            action = self.env.action_space[action_index]
            next_state, reward, done = self.env.step(state, action)
            next_state_index = get_index(next_state, self.env.observation_space)
            # SARSA is on-policy: the successor action is also chosen
            # epsilon-greedily (a pure argmax here would turn this into Q-learning).
            if random.uniform(0, 1) < self.epsilon:
                next_action_index = np.random.randint(len(self.env.action_space))
            else:
                next_action_index = np.argmax(self.q_table[next_state_index])
            # TD(0) update towards the on-policy target.
            delta = (reward
                     + self.gamma * self.q_table[next_state_index, next_action_index]
                     - self.q_table[state_index, action_index])
            self.q_table[state_index, action_index] += self.alpha * delta
            cum_rewards += reward
            state = next_state  # move to the next state/action pair
            state_index = next_state_index
            action_index = next_action_index
        rewards_list[i] = cum_rewards
    return self.q_table, rewards_list
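# --- Hedged usage sketch (illustration only, not part of the class above) ---
# A self-contained, runnable demo of the single SARSA update performed inside
# the loop. The Q-table shape, alpha, gamma, and all indices below are invented
# for the example; only numpy is required.
import numpy as np

q = np.zeros((3, 2))      # toy Q-table: 3 states x 2 actions
alpha, gamma = 0.1, 0.9   # assumed learning rate and discount factor
s, a = 0, 1               # current state/action indices
s_next, a_next = 2, 0     # successor pair chosen epsilon-greedily (on-policy)
reward = 1.0

# The TD(0) target bootstraps from Q(s', a') for the action actually selected,
# not from max_a Q(s', a) as Q-learning would.
delta = reward + gamma * q[s_next, a_next] - q[s, a]
q[s, a] += alpha * delta
print(q[s, a])            # 0.1 after one update from a zero initialisation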
def evaluate_agent(self, episodes=1):
    """Roll out the greedy policy and return the rewards it collects."""
    total_epochs, total_penalties = 0, 0
    rewards_list = []  # accumulated across all evaluation episodes
    for _ in range(episodes):
        state = self.env.reset
        epochs, penalties = 0, 0
        done = False
        while not done:
            state_index = get_index(state, self.env.observation_space)
            action_index = np.argmax(self.q_table[state_index])  # greedy action
            action = self.env.action_space[action_index]
            state, reward, done = self.env.step(state, action)
            rewards_list.append(reward)
            if reward == 0:  # a zero reward is counted as a penalty
                penalties += 1
            epochs += 1
        total_penalties += penalties
        total_epochs += epochs
    return rewards_list
def evaluate_agent(self, test_episodes=100):
    """Greedy rollout that also records the chosen policy and stock levels."""
    total_epochs, total_penalties = 0, 0
    save_policies = np.zeros((test_episodes, self.env.timesteps - 1))
    for ep in range(test_episodes):
        state = self.env.reset
        epochs, penalties = 0, 0
        reward_list = []
        policy = []
        stock_list = []
        done = False
        while not done:
            state_index = get_index(state, self.env.observation_space)
            action_index = np.argmax(self.q_table[state_index])  # greedy action
            action = self.env.action_space[action_index]
            state, reward, done = self.env.step(state, action)
            reward_list.append(reward)
            policy.append(action)
            if not isinstance(state, int):
                stock_list.append(state[1])  # second state component: stock level
            if reward == 0:  # a zero reward is counted as a penalty
                penalties += 1
            epochs += 1
        total_penalties += penalties
        total_epochs += epochs
        save_policies[ep] = policy
    # Note: save_policies keeps every episode's policy, but only the last
    # episode's trajectory is returned.
    return reward_list, stock_list, policy
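# --- Hedged usage sketch (illustration only) ---
# Both evaluate_agent variants act greedily with respect to the learned Q-table.
# This runnable snippet shows that extraction step in isolation; the table
# values and action labels are invented for the example.
import numpy as np

q_table = np.array([[0.2, 0.8],
                    [0.5, 0.1],
                    [0.0, 0.3]])
action_space = np.array([-1, 1])             # hypothetical action labels

greedy_indices = np.argmax(q_table, axis=1)  # best action index per state row
greedy_policy = action_space[greedy_indices]
print(greedy_policy)                         # [ 1 -1  1]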
def get_q_table(self, episodes=100001):
    """Train with Q-learning; return the learned Q-table and per-episode rewards."""
    rewards_list = np.zeros(episodes)
    print('Q-learning')
    for i in tqdm(range(1, episodes)):
        state = self.env.reset
        cum_rewards = 0
        done = False
        while not done:
            state_index = get_index(state, self.env.observation_space)
            if random.uniform(0, 1) < self.epsilon:
                action_index = np.random.randint(len(self.env.action_space))  # explore
            else:
                action_index = np.argmax(self.q_table[state_index])  # exploit learned values
            action = self.env.action_space[action_index]
            next_state, reward, done = self.env.step(state, action)
            # Off-policy TD target: bootstrap from the best successor action.
            old_value = self.q_table[state_index, action_index]
            next_state_index = get_index(next_state, self.env.observation_space)
            next_max = np.max(self.q_table[next_state_index])
            self.q_table[state_index, action_index] = (
                (1 - self.alpha) * old_value
                + self.alpha * (reward + self.gamma * next_max))
            cum_rewards += reward
            state = next_state
        rewards_list[i] = cum_rewards
    return self.q_table, rewards_list
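# --- Hedged usage sketch (illustration only) ---
# The Q-learning update above is written in its "convex combination" form. This
# toy check, with made-up numbers, confirms it is algebraically identical to the
# more common incremental form Q <- Q + alpha * (target - Q).
alpha, gamma = 0.1, 0.9
old_value, reward, next_max = 0.5, 1.0, 2.0

target = reward + gamma * next_max
convex_form = (1 - alpha) * old_value + alpha * target
incremental_form = old_value + alpha * (target - old_value)
assert abs(convex_form - incremental_form) < 1e-12
print(convex_form)  # 0.73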
def get_q_table(self, episodes=100001):
    """Train with every-visit Monte Carlo control; return the Q-table and per-episode rewards."""
    r_list = np.zeros(episodes)
    print('Monte Carlo')
    for i in tqdm(range(1, episodes)):
        state = self.env.reset
        state_list, action_list, reward_list = [], [], []
        cum_rewards = 0
        done = False
        # Generate one full episode under the epsilon-greedy behaviour policy.
        while not done:
            state_index = get_index(state, self.env.observation_space)
            if random.uniform(0, 1) < self.epsilon:
                action_index = np.random.randint(len(self.env.action_space))  # explore
            else:
                action_index = np.argmax(self.q_table[state_index])  # exploit learned values
            action = self.env.action_space[action_index]
            state_list.append(state)
            action_list.append(action_index)
            next_state, reward, done = self.env.step(state, action)
            reward_list.append(reward)
            cum_rewards += reward
            state = next_state
        # Every-visit update: move each Q(s, a) towards the discounted return
        # G_k observed from step k to the end of the episode.
        for k in range(len(state_list)):
            state_index = get_index(state_list[k], self.env.observation_space)
            action_index = action_list[k]
            rewards = reward_list[k:]
            discount = [self.gamma ** j for j in range(len(rewards))]
            discounted_reward = np.dot(rewards, discount)
            self.q_table[state_index, action_index] = (
                (1 - self.alpha) * self.q_table[state_index, action_index]
                + self.alpha * discounted_reward)
        r_list[i] = cum_rewards
    return self.q_table, r_list
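# --- Hedged usage sketch (illustration only) ---
# The Monte Carlo pass above computes the discounted return G_k as a dot product
# between the reward tail and a vector of discount factors. A toy reward tail
# (invented values) verifies the dot product against the explicit sum.
import numpy as np

gamma = 0.9
rewards = [1.0, 0.0, 2.0]  # rewards observed from step k onwards
discount = [gamma ** j for j in range(len(rewards))]
g_dot = np.dot(rewards, discount)
g_loop = sum(gamma ** j * r for j, r in enumerate(rewards))
assert abs(g_dot - g_loop) < 1e-12
print(g_dot)  # 1.0 + 0.0 + 0.81 * 2.0 = 2.62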