def looping(qt=None, epsilon=config.epsilon, visu=False):
    plt.ion()
    cart = CartPoleEnv()
    data = []
    data_rm = []
    if qt is None:
        qt = initialize_Qtable()
    for episode in range(config.episodes):
        cart.reset()
        turn = 0
        end = False
        epsilon = epsilon * 0.9999  # slow exploration decay, once per episode
        while not end:
            current_state = cart.state
            action = choose_action(current_state, qt, epsilon)
            new_state, reward, end, _ = cart.step(action)
            if end:
                reward = -10  # penalise ending the episode early
            update_qt_new(qt, current_state, reward, action, new_state)
            turn += 1
            if visu:
                cart.render()
        data.append(turn)
        data_rm.append(np.mean(data[-100:]))  # running mean over the last 100 episodes
        print("Episode: ", episode, "\tTurn:", turn, "\t Epsilon:", epsilon)
        if episode % config.graph_update == 0 and episode != 0:
            graph(data, data_rm)
        # if (episode + 1) % 100 == 0 and input("continue (y/n)") != "y":
        #     break
    cart.close()
    return (data, qt)
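# The loop above relies on helpers defined elsewhere in the project
# (initialize_Qtable, choose_action, update_qt_new). The block below is only a
# minimal sketch of what they could look like for a discretised CartPole
# Q-table; the bin counts, bounds, learning rate and the *_sketch names are
# assumptions, not the project's actual definitions.
import random

import numpy as np


def discretize(state, bins=(6, 6, 12, 12)):
    # Map the continuous CartPole observation onto a coarse grid.
    bounds = [(-2.4, 2.4), (-3.0, 3.0), (-0.21, 0.21), (-3.0, 3.0)]
    idx = []
    for value, (low, high), n in zip(state, bounds, bins):
        ratio = (min(max(value, low), high) - low) / (high - low)
        idx.append(min(int(ratio * n), n - 1))
    return tuple(idx)


def initialize_Qtable_sketch(bins=(6, 6, 12, 12), n_actions=2):
    # One row of Q-values per grid cell, all zeros to start with.
    return np.zeros(bins + (n_actions,))


def choose_action_sketch(state, qt, epsilon):
    # Epsilon-greedy: explore with probability epsilon, otherwise exploit.
    if random.random() < epsilon:
        return random.randrange(2)
    return int(np.argmax(qt[discretize(state)]))


def update_qt_new_sketch(qt, state, reward, action, new_state,
                         lr=0.1, gamma=0.99):
    # Tabular Q-learning update:
    # Q(s, a) <- Q(s, a) + lr * (r + gamma * max_a' Q(s', a') - Q(s, a))
    s, s2 = discretize(state), discretize(new_state)
    target = reward + gamma * np.max(qt[s2])
    qt[s][action] += lr * (target - qt[s][action])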
def loop(qt=None, epsilon=1, visu=False):
    plt.ion()
    cart = CartPoleEnv()
    data = []
    data_rm = []
    config.epsilon = epsilon
    if qt is None:
        qt = initialize_Qtable()
    for episode in range(config.episodes):
        cart.reset()
        turn = 0
        s = cart.state
        end = False
        epsilon_tmp = config.epsilon  # remember epsilon, restored after the episode
        while not end:
            config.epsilon *= 0.97  # fast decay within the episode
            if visu:
                cart.render()
            a = choose_action(s, qt)
            _, _, end, _ = cart.step(a)
            l_val = bellman_q(s, qt, dummy_cart(s), action=a)
            # print(l_val)
            update_qt(qt, s, a, l_val)
            s = cart.state
            turn += 1
        data.append(turn)
        data_rm.append(np.mean(data[-100:]))
        print("Episode: ", episode, "\tTurn:", turn, "\t Epsilon:", config.epsilon)
        config.epsilon = epsilon_tmp
        if episode % config.graph_update == 0 and episode != 0:
            graph(data, data_rm)
        # if (episode + 1) % 100 == 0 and input("continue (y/n)") != "y":
        #     break
    cart.close()
    return (data, qt)
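# loop() additionally depends on dummy_cart() and bellman_q(), which are not
# shown in this excerpt. The sketch below is one plausible reading, assuming
# dummy_cart() clones the environment at a given state so that bellman_q() can
# do a one-step lookahead; the signatures, the gamma value and the reuse of the
# discretize() helper sketched earlier are assumptions.
import numpy as np

from cartpole import CartPoleEnv


def dummy_cart_sketch(state):
    # Throw-away environment frozen at `state`, used for lookahead only.
    env = CartPoleEnv()
    env.reset()
    env.state = np.array(state, dtype=np.float64)
    return env


def bellman_q_sketch(state, qt, sim, action, gamma=0.99):
    # One-step Bellman target r + gamma * max_a' Q(s', a'), obtained by
    # simulating `action` on the cloned environment. `state` is kept only to
    # mirror the call site above.
    new_state, reward, end, _ = sim.step(action)
    if end:
        return reward
    return reward + gamma * np.max(qt[discretize(new_state)])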
learning_episodes = 1000
Q, R = Q_learning(env, learning_episodes)
print(np.max(Q))  # round to two decimal places


# In[92]:


meanR = []
my_range = 200
for i in range(my_range):
    meanR.append(
        np.mean(R[int(learning_episodes / my_range) * i:
                  int(learning_episodes / my_range) * (i + 1)]))

x_data = range(0, my_range)
plt.figure(figsize=(30, 10))
plt.plot(x_data, R[0:my_range], label="reward")
plt.plot(x_data, meanR, label="mean reward")
plt.title('CartPole: SARSA')
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.legend()
plt.show()

env.close()
from cartpole import CartPoleEnv
import numpy as np

cart = CartPoleEnv()
cart.reset()

for _ in range(1000):
    # Calculate the Gradients
    # Update Thetas
    # Sample u trajectory
    # Apply u[0] to the actual system
    cart.step(10)  # Apply some force
    # Update the New State in the Learner
    # Shift the Thetas
    # Simulate
    cart.render()

cart.close()
class CuteLearning():
    def __init__(self):
        self.plot_data = PlotData()
        self.cart = CartPoleEnv()
        self.cart.reset()
        self.predi_net = DQN()
        self.updat_net = deepcopy(self.predi_net)
        self.turn = 0
        self.episode = 0
        self.epsilon = config.epsilon
        self.eps_decay = 0.99
        self.visu = False
        self.visu_update = False  # 300
        self.visu_window = 5
        self.consecutive_wins = 0
        self.best_consecutive_wins = 0
        self.last_save = 0
        self.memory = []

    def reward_optimisation(self, state, end):
        reward = -25 if end else 1
        if reward == 1:
            # Angle reward modification
            angle_r = 0.418 / 2
            reward += (((abs(angle_r - abs(state[2])) / angle_r) * 2) - 1) * 2
            # Position reward modification
            pos_r = 0.418 / 2
            reward += (((abs(pos_r - abs(state[0])) / pos_r) * 2) - 1) * 2
        return reward

    def learn(self):
        self.episode = 0
        n = 0
        while self.episode < 10:
            self.turn = 0
            end = False
            states = []
            targets = []
            while not end:
                # 1. Read the current state
                state = self.cart.state
                # 2. Choose an action (epsilon-greedy on the prediction network)
                q_values = self.predi_net.predict(state).tolist()
                a = choose_action_net(q_values, self.epsilon)
                # 3. Perform the action
                next_state, _, end, _ = self.cart.step(a)
                # 4. Measure the (shaped) reward
                reward = self.reward_optimisation(next_state, end)
                q_values_next = self.predi_net.predict(next_state)
                # 5. Compute the Q-value target
                q_values[a] = reward + net_config.gamma * \
                    torch.max(q_values_next).item()
                self.turn += 1
                self.memory.append((state, a, next_state, reward, end))
                # self.updat_net.update(state, q_values)
                states.append(state)
                targets.append(q_values)
                if (self.turn % 20 and self.turn) or end:
                    # Push the accumulated batch to the training network
                    self.updat_net.update(states, targets)
                    states = []
                    targets = []
                if self.turn >= 500:
                    end = True
                if self.visu:
                    self.cart.render()
            self.episode += 1
            self.replay(20)
            if self.episode % net_config.n_update == 0 and self.episode:
                print("Update")
                self.predi_net.model.load_state_dict(
                    self.updat_net.model.state_dict())
            self.end()
            n += 1
        self.save()
        self.cart.close()
        self.plot_data.clear()

    def replay(self, size):
        # Experience replay: re-train on a random sample of stored transitions.
        if size > len(self.memory):
            size = len(self.memory)
        data = random.sample(self.memory, size)
        states = []
        targets = []
        for state, action, next_state, reward, done in data:
            q_values = self.predi_net.predict(state)
            if done:
                q_values[action] = reward
            else:
                # The only difference from simple replay is this line:
                # the next Q-values are predicted with the target network.
                q_values_next = self.predi_net.predict(next_state)
                q_values[action] = reward + net_config.gamma * torch.max(
                    q_values_next).item()
            states.append(state)
            targets.append(q_values)
        self.updat_net.update(states, targets)

    def end(self):
        self.plot_data.new_data(self.turn)
        if self.turn > 195:  # CartPole "solved" threshold
            self.consecutive_wins += 1
            if self.best_consecutive_wins < self.consecutive_wins:
                self.best_consecutive_wins = self.consecutive_wins
            if self.consecutive_wins > 200:
                self.save()
                print(("WIN IN " + str(self.episode) + " EPISODES\n") * 100)
        else:
            self.consecutive_wins = 0
        if self.last_save * 1.2 < self.best_consecutive_wins and 50 <= self.best_consecutive_wins:
            self.save()
            self.last_save = self.best_consecutive_wins
        print("Episode: ", self.episode, "\tTurn:", self.turn,
              "\tEpsilon:", self.epsilon,
              "\tWins: ", "{:3}".format(self.consecutive_wins),
              "/", self.best_consecutive_wins)
        self.turn = 0
        self.cart.reset()
        if self.episode % config.graph_update == 0 and self.episode != 0:
            self.plot_data.graph()
        if self.visu_update:
            if self.episode % self.visu_update == 0:
                self.visu = True
            if self.episode % self.visu_update == self.visu_window:
                self.visu = False
                self.cart.close()
        self.epsilon = max(self.epsilon * self.eps_decay, 0.01)

    def save(self):
        pass
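# CuteLearning assumes a DQN wrapper exposing .model, .predict() and .update(),
# plus a choose_action_net() helper; none of them appear in this excerpt. The
# sketch below is one possible implementation that is consistent with how they
# are called above (predict returns a tensor of Q-values for one state, update
# takes batched states and target Q-values); the layer sizes, learning rate and
# the *Sketch/_sketch names are assumptions.
import random

import torch
import torch.nn as nn


class DQNSketch:
    def __init__(self, state_dim=4, n_actions=2, lr=1e-3):
        self.model = nn.Sequential(
            nn.Linear(state_dim, 64), nn.ReLU(),
            nn.Linear(64, 64), nn.ReLU(),
            nn.Linear(64, n_actions))
        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)

    def predict(self, state):
        # Q-values for a single state, without tracking gradients.
        with torch.no_grad():
            return self.model(torch.as_tensor(state, dtype=torch.float32))

    def update(self, states, targets):
        # One gradient step towards the supplied target Q-values.
        states_t = torch.stack(
            [torch.as_tensor(s, dtype=torch.float32) for s in states])
        targets_t = torch.stack(
            [torch.as_tensor(t, dtype=torch.float32) for t in targets])
        loss = self.criterion(self.model(states_t), targets_t)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()


def choose_action_net_sketch(q_values, epsilon):
    # Epsilon-greedy over the Q-values predicted by the network.
    if random.random() < epsilon:
        return random.randrange(len(q_values))
    return int(torch.argmax(torch.as_tensor(q_values)).item())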