Example no. 1
import matplotlib.pyplot as plt
import numpy as np

import config
from cartpole import CartPoleEnv
# initialize_Qtable, choose_action, update_qt_new and graph are
# project-local helpers (their imports are not shown here).


def looping(qt=None, epsilon=config.epsilon, visu=False):
    plt.ion()
    cart = CartPoleEnv()
    data = []
    data_rm = []
    if qt is None:
        qt = initialize_Qtable()
    for episode in range(config.episodes):
        cart.reset()
        turn = 0
        end = False
        epsilon = epsilon * 0.9999
        while not end:
            current_state = cart.state
            action = choose_action(current_state, qt, epsilon)
            new_state, reward, end, _ = cart.step(action)
            if end:
                reward = -10
            update_qt_new(qt, current_state, reward, action, new_state)
            turn += 1
            if visu:
                cart.render()
        data.append(turn)
        data_rm.append(np.mean(data[-100:]))
        print("Episode: ", episode, "\tTurn:", turn, "\t Epsilon:", epsilon)
        if episode % config.graph_update == 0 and episode != 0:
            graph(data, data_rm)
        # if (episode + 1) % 100 == 0 and input("continue (y/n) ") != "y":
        #     break
    cart.close()
    return (data, qt)
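The loop above leans on project-local helpers that are not shown (initialize_Qtable, choose_action, update_qt_new). A minimal sketch of what the epsilon-greedy selection and the one-step Q-learning update could look like, assuming the Q-table is a dict keyed by a rounded (discretized) state with one value per action; the discretization, learning rate and discount factor are illustrative assumptions, not the original helpers.

import random

import numpy as np

ALPHA, GAMMA = 0.1, 0.99  # assumed learning rate and discount factor


def discretize(state, decimals=1):
    # Assumed discretization: round the continuous state so it can key a dict
    return tuple(np.round(state, decimals))


def choose_action(state, qt, epsilon=0.1, n_actions=2):
    # Epsilon-greedy: explore with probability epsilon, otherwise act greedily
    key = discretize(state)
    if key not in qt or random.random() < epsilon:
        return random.randrange(n_actions)
    return int(np.argmax(qt[key]))


def update_qt_new(qt, state, reward, action, new_state, n_actions=2):
    # One-step Q-learning update:
    # Q(s, a) += alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
    s, s_new = discretize(state), discretize(new_state)
    qt.setdefault(s, np.zeros(n_actions))
    qt.setdefault(s_new, np.zeros(n_actions))
    target = reward + GAMMA * np.max(qt[s_new])
    qt[s][action] += ALPHA * (target - qt[s][action])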
Example no. 2
# Uses the same imports and project-local helpers as Example no. 1
# (plt, np, config, CartPoleEnv, initialize_Qtable, choose_action, graph),
# plus bellman_q, dummy_cart and update_qt from the same project.
def loop(qt=None, epsilon=1, visu=False):
    plt.ion()
    cart = CartPoleEnv()
    data = []
    data_rm = []
    config.epsilon = epsilon
    if qt is None:
        qt = initialize_Qtable()
    for episode in range(config.episodes):
        cart.reset()
        turn = 0
        s = cart.state
        end = False
        epsilon_tmp = config.epsilon
        while not end:
            config.epsilon *= 0.97
            if visu:
                cart.render()
            a = choose_action(s, qt)
            _, _, end, _ = cart.step(a)
            l_val = bellman_q(s, qt, dummy_cart(s), action=a)
            # print(l_val)
            update_qt(qt, s, a, l_val)
            s = cart.state
            turn += 1
        data.append(turn)
        data_rm.append(np.mean(data[-100:]))
        print("Episode: ", episode, "\tTurn:", turn, "\t Epsilon:",
              config.epsilon)
        config.epsilon = epsilon_tmp
        if episode % config.graph_update == 0 and episode != 0:
            graph(data, data_rm)
        # if (episode + 1) % 100 == 0 and input("continue (y/n) ") != "y":
        #     break
    cart.close()
    return (data, qt)
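This variant computes its target with bellman_q, handing it a dummy_cart copy of the environment so the next state can be simulated without disturbing the real episode; neither helper nor update_qt is shown. A minimal sketch under the same assumptions as above (dict Q-table keyed by a rounded state, two actions); dummy_cart, the discount factor and the learning rate here are assumptions, not the original implementation.

import numpy as np

from cartpole import CartPoleEnv

GAMMA = 0.99  # assumed discount factor


def dummy_cart(state):
    # Assumed helper: a throwaway environment copy set to `state`
    env = CartPoleEnv()
    env.reset()
    env.state = np.array(state)
    return env


def bellman_q(state, qt, sim_env, action):
    # Simulate one step on the copy to see where `action` would lead
    next_state, reward, end, _ = sim_env.step(action)
    if end:
        return reward
    key = tuple(np.round(next_state, 1))
    return reward + GAMMA * np.max(qt.get(key, np.zeros(2)))


def update_qt(qt, state, action, l_val, alpha=0.1):
    # Move Q(s, a) toward the Bellman target l_val
    key = tuple(np.round(state, 1))
    qt.setdefault(key, np.zeros(2))
    qt[key][action] += alpha * (l_val - qt[key][action])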
Example no. 3
import matplotlib.pyplot as plt
import numpy as np

# `env` (a CartPole environment) and `Q_learning` are assumed to be defined
# in earlier notebook cells that are not shown here.
learning_episodes = 1000
Q, R = Q_learning(env, learning_episodes)
print(np.round(np.max(Q), 2))  # round the maximum Q-value to two decimal places


# Mean reward per bin of learning_episodes / my_range consecutive episodes
meanR = []
my_range = 200
bin_size = learning_episodes // my_range
for i in range(my_range):
    meanR.append(np.mean(R[bin_size * i:bin_size * (i + 1)]))

plt.figure(figsize=(30, 10))

# Raw per-episode reward and the per-bin mean, both on the episode axis
plt.plot(range(learning_episodes), R, label="reward")
plt.plot(range(0, learning_episodes, bin_size), meanR, label="mean reward")
plt.title('CartPole: SARSA')
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.legend()

plt.show()

env.close()

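The notebook calls Q_learning(env, learning_episodes) without showing it (the plot title mentions SARSA, while the call is named Q_learning). A minimal tabular Q-learning sketch that returns a Q-table and the per-episode rewards R, assuming a gym-style CartPole env with the pre-0.26 API (reset() returns the observation, step() returns four values); the state discretization and hyperparameters are illustrative assumptions.

import numpy as np


def Q_learning(env, episodes, alpha=0.1, gamma=0.99, epsilon=0.1,
               bins=(6, 6, 12, 12)):
    # Discretize the continuous CartPole state so it can index a table
    low = np.array([-4.8, -3.0, -0.418, -3.5])
    high = np.array([4.8, 3.0, 0.418, 3.5])
    edges = [np.linspace(low[i], high[i], bins[i] - 1) for i in range(4)]

    def discretize(obs):
        return tuple(int(np.digitize(obs[i], edges[i])) for i in range(4))

    Q = np.zeros(bins + (env.action_space.n,))
    R = []
    for _ in range(episodes):
        s = discretize(env.reset())
        done, total = False, 0.0
        while not done:
            # Epsilon-greedy action selection
            if np.random.random() < epsilon:
                a = env.action_space.sample()
            else:
                a = int(np.argmax(Q[s]))
            obs, reward, done, _ = env.step(a)
            s_next = discretize(obs)
            # One-step Q-learning update
            Q[s + (a,)] += alpha * (reward + gamma * np.max(Q[s_next]) -
                                    Q[s + (a,)])
            s = s_next
            total += reward
        R.append(total)
    return Q, R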
Example no. 4
from cartpole import CartPoleEnv
import numpy as np
cart = CartPoleEnv()
cart.reset()

for _ in range(1000):

    # Calculate the Gradients

    # Update Thetas

    # Sample u trajectory

    # Apply u[0] to the actual system
    cart.step(10)  # Apply Some force

    # Update the New State in the Learner

    # Shift the Thetas

    # Simulate
    cart.render()

cart.close()
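The skeleton above only outlines the learner's steps. As a runnable placeholder for the gradient-based controller it describes, here is a simple PD controller on the pole angle, assuming this custom CartPoleEnv accepts a continuous force (as the skeleton's cart.step(10) call suggests) and exposes the state as cart.state = [position, velocity, angle, angular velocity]; the gains and force limit are hand-picked illustrations, not the intended learning algorithm.

import numpy as np

from cartpole import CartPoleEnv

KP, KD = 40.0, 4.0  # assumed proportional/derivative gains on the pole angle

cart = CartPoleEnv()
cart.reset()

for _ in range(1000):
    _, _, theta, theta_dot = cart.state
    # Push the cart in the direction the pole is falling
    u = float(np.clip(KP * theta + KD * theta_dot, -10.0, 10.0))
    cart.step(u)
    cart.render()

cart.close()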
Example no. 5
import random
from copy import deepcopy

import torch

import config
from cartpole import CartPoleEnv
# DQN, PlotData, net_config and choose_action_net are project-local helpers
# (their imports are not shown here).


class CuteLearning():
    def __init__(self):
        self.plot_data = PlotData()
        self.cart = CartPoleEnv()
        self.cart.reset()
        self.predi_net = DQN()
        self.updat_net = deepcopy(self.predi_net)
        self.turn = 0
        self.episode = 0
        self.epsilon = config.epsilon
        self.eps_decay = 0.99
        self.visu = False
        self.visu_update = False  # set to an int (e.g. 300) to render every that many episodes
        self.visu_window = 5
        self.consecutive_wins = 0
        self.best_consecutive_wins = 0
        self.last_save = 0
        self.memory = []

    def reward_optimisation(self, state, end):
        reward = -25 if end else 1
        if reward == 1:
            # Angle bonus: +2 when the pole is upright, -2 at the ~0.209 rad
            # failure threshold
            angle_r = 0.418 / 2
            reward += (((abs(angle_r - abs(state[2])) / angle_r) * 2) - 1) * 2
            # Position bonus: same formula applied to the cart position
            # (note: this reuses the angle threshold, not the +/-2.4 track limit)
            pos_r = 0.418 / 2
            reward += (((abs(pos_r - abs(state[0])) / pos_r) * 2) - 1) * 2
        return reward

    def learn(self):
        self.episode = 0
        n = 0
        while self.episode < 10:
            self.turn = 0
            end = False
            states = []
            targets = []
            while not end:
                # 1. Init
                state = self.cart.state
                # 2. Choose action
                q_values = self.predi_net.predict(state).tolist()
                a = choose_action_net(q_values, self.epsilon)
                # 3. Perform action
                next_state, _, end, _ = self.cart.step(a)
                # 4. Measure reward
                reward = self.reward_optimisation(next_state, end)
                q_values_next = self.predi_net.predict(next_state)
                # 5. Calcul Q-Values
                q_values[a] = reward + net_config.gamma * \
                    torch.max(q_values_next).item()

                self.turn += 1
                self.memory.append((state, a, next_state, reward, end))
                # self.updat_net.update(state, q_values)
                states.append(state)
                targets.append(q_values)
                # Mini-batch update every 20 turns or at the end of the episode
                if (self.turn % 20 == 0 and self.turn) or end:
                    self.updat_net.update(states, targets)
                    states = []
                    targets = []

                if self.turn >= 500:
                    end = True
                if self.visu:
                    self.cart.render()

            self.episode += 1
            self.replay(20)
            if self.episode % net_config.n_update == 0 and self.episode:
                print("Update")
                self.predi_net.model.load_state_dict(
                    self.updat_net.model.state_dict())
            self.end()
            n += 1

        self.save()
        self.cart.close()
        self.plot_data.clear()

    def replay(self, size):
        if size > len(self.memory):
            size = len(self.memory)
        data = random.sample(self.memory, size)
        states = []
        targets = []
        for state, action, next_state, reward, done in data:
            q_values = self.predi_net.predict(state)
            if done:
                q_values[action] = reward
            else:
                # The only difference from simple experience replay is this
                # line: next Q-values come from the (more slowly updated)
                # predi_net, which acts as the target network.
                q_values_next = self.predi_net.predict(next_state)
                q_values[action] = reward + net_config.gamma * torch.max(
                    q_values_next).item()
            states.append(state)
            targets.append(q_values)
        self.updat_net.update(states, targets)

    def end(self):
        self.plot_data.new_data(self.turn)
        if self.turn > 195:
            self.consecutive_wins += 1
            if self.best_consecutive_wins < self.consecutive_wins:
                self.best_consecutive_wins = self.consecutive_wins
            if self.consecutive_wins > 200:
                self.save()
                print(("WIN IN " + str(self.episode) + " EPISODES\n") * 100)
        else:
            self.consecutive_wins = 0
            if self.last_save * 1.2 < self.best_consecutive_wins and 50 <= self.best_consecutive_wins:
                self.save()
                self.last_save = self.best_consecutive_wins
        print("Episode: ", self.episode, "\tTurn:", self.turn, "\tEpsilon:",
              self.epsilon, "\tWins: ", "{:3}".format(self.consecutive_wins),
              "/", self.best_consecutive_wins)
        self.turn = 0
        self.cart.reset()
        if self.episode % config.graph_update == 0 and self.episode != 0:
            self.plot_data.graph()
        if self.visu_update:
            if self.episode % self.visu_update == 0:
                self.visu = True
            if self.episode % self.visu_update == self.visu_window:
                self.visu = False
                self.cart.close()
        self.epsilon = max(self.epsilon * self.eps_decay, 0.01)

    def save(self):
        pass
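
CuteLearning relies on a DQN wrapper exposing predict(state), update(states, targets) and a model attribute whose state_dict can be copied between the two networks; that class (like PlotData, net_config and choose_action_net) is not shown. A minimal sketch of such a wrapper, assuming a 4-dimensional state and two actions; the layer sizes, learning rate and loss are illustrative assumptions, not the original network.

import torch
import torch.nn as nn


class DQN:
    # Minimal Q-network wrapper with the interface CuteLearning expects
    def __init__(self, state_dim=4, n_actions=2, hidden=64, lr=1e-3):
        self.model = nn.Sequential(
            nn.Linear(state_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_actions),
        )
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)
        self.criterion = nn.MSELoss()

    def predict(self, state):
        # Q-values for a single state, without tracking gradients
        with torch.no_grad():
            return self.model(torch.as_tensor(state, dtype=torch.float32))

    def update(self, states, targets):
        # One gradient step toward the supplied Q-value targets
        states_t = torch.stack(
            [torch.as_tensor(s, dtype=torch.float32) for s in states])
        targets_t = torch.stack(
            [torch.as_tensor(t, dtype=torch.float32) for t in targets])
        self.optimizer.zero_grad()
        loss = self.criterion(self.model(states_t), targets_t)
        loss.backward()
        self.optimizer.step()

With such a wrapper (and the remaining project helpers) in place, training would be started with CuteLearning().learn().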