Esempi in Python per CartPoleEnv, esempi in Python per cartpole_env.CartPoleEnv

Esempio n. 1

0

Mostra file

File: cartpole.py Progetto: veda-s4dhak/AQIL

    def __init__(self, **kwargs):
        """
        Constructor
        """

        self.config = kwargs

        self.threadID = kwargs['model_name']
        self.name = kwargs['model_name']

        # Setting configuration
        self.USER_IMITATION_MODE = kwargs['user_imitation_mode']
        self.PID_IMITATION_MODE = kwargs['pid_imitation_mode']

        # Initializing the environment
        self.env = CartPoleEnv()
        self.observation_space = self.env.observation_space.shape[0]
        self.action_space = self.env.action_space.n

        # Initializing the neural network
        self.model_name = kwargs['model_name']

        # The maximum number of episodes to run
        self.n_episodes = kwargs['n_episodes']

        # Initializing the model
        self.dqn_params = kwargs
        self.dqn_params['observation_space'] = self.observation_space
        self.dqn_params['action_space'] = self.action_space
        self.dqn = CartpoleDQN(**self.dqn_params)

        # Average training loss per step
        self.loss_aggregation = []

        # Total Reward per Episode
        self.reward_aggregation = []

        # User Action per Step
        self.user_action_aggregation = []

        # Machine Action per Step
        self.machine_action_aggregation = []
        self.score_aggregation = []

        # List of lists of output activations for each layer
        self.layer_outputs_list = []

        # Creates directory if directory does not exist
        if not os.path.exists('.//models'):
            os.mkdir('.//models')

        if not os.path.exists('.//plots'):
            os.mkdir('.//plots')

        # Required for PID Control
        self.P = 0
        self.I = 0
        self.D = 0
        self.prev_error = 0

Esempio n. 2

0

Mostra file

File: deepQLearning_Replay.py Progetto: Petlja/PSIML

def main():
    env = CartPoleEnv()
    agent = DQNAgent(env.state_size(), env.action_size())

    epsilon = EPSILON_START
    results = []
    start = time.time()
    random.seed(0)

    for episode in range(EPISODES):
        #Start game/episode
        state = env.reset()

        if (episode > SWITCH_FREQ and episode % SWITCH_FREQ == 0):
            agent.update_target_model()

        #Loop inside one game episode
        for t in range(STEPS):
            # Display the game. Comment bellow line in order to get faster training.
            env.render()

            state_action_q_values = agent.forward(torch.from_numpy(state))
            if random.random() <= epsilon:
                action = random.randrange(env.action_size())
            else:
                action = torch.argmax(state_action_q_values).item()

            next_state, reward, done = env.step(action)

            agent.remember(state, action, reward, next_state)

            if done or (t == STEPS - 1):
                print("episode: {}/{}, score: {}, e: {:.2}".format(
                    episode, EPISODES, t, epsilon))
                results.append(t)
                break

            if episode > 10 and (episode + t) % UPDATE_FREQ == 0:
                agent.backward()

            state = next_state

        if epsilon > EPSILON_END:
            epsilon *= EPSILON_DECAY

    end = time.time()
    print("TIME")
    print(end - start)
    print("STEPS")
    print(sum(results))
    plt.plot(results)
    plt.show()

Esempio n. 3

0

Mostra file

File: deepQLearning_Basic.py Progetto: Petlja/PSIML

def main():
    env = CartPoleEnv()
    agent = DQNAgent(env.state_size(), env.action_size())

    epsilon = EPSILON_START
    results = []
    start = time.time()

    for episode in range(EPISODES):
        #Start game/episode
        state = env.reset()

        #Loop inside one game episode
        for t in range(STEPS):
            # Display the game. Comment bellow line in order to get faster training.
            env.render()

            #0. Currently you are in "state S (state)"
            #1.1 Determine action q values from state S.
            #1.2 Calculate action to be taken from state S. Use 'e-rand off-policy'
            #1.3 Play/perform the action in the environment
            # Move to "next state S' (next_state), get reward, and flag for is game over (is new state terminal)

            pass
            done = True  #Update this flag correctly

            #2.1 From state S' peek into the future - Determine action q values from state S'
            #2.2 Using the SARSA-MAX formula update the net.
            # Suggestion: You can start with formula: Q(S,A) <- R + gamma * max(Q(S',A'))
            # Hint1: Don't forget that you should only perform update for taken action (#1.2)
            # Hint2: Don't forget that the target is no_grad constant.
            pass

            if done or (t == STEPS - 1):
                print("episode: {}/{}, score: {}, e: {:.2}".format(
                    episode, EPISODES, t, epsilon))
                results.append(t)
                break

            #3.1 Current state is now next_state
            pass

        if epsilon > EPSILON_END:
            epsilon *= EPSILON_DECAY

    end = time.time()
    print("TIME")
    print(end - start)
    print("STEPS")
    print(sum(results))
    plt.plot(results)
    plt.show()

Esempio n. 4

0

Mostra file

File: baselines_deepq.py Progetto: philzook58/cart_pole

def main():
    env = CartPoleEnv()
    model = deepq.models.mlp([64])
    act = deepq.learn(
        env,
        q_func=model,
        lr=1e-3,
        max_timesteps=500000,
        buffer_size=50000,
        exploration_fraction=0.1,
        exploration_final_eps=0.02,
        print_freq=10,
        callback=callback
    )
    print("Saving model to cartpole_model.pkl")
    act.save("cartpole_model.pkl")

Esempio n. 5

0

Mostra file

File: keras-rl-dqn.py Progetto: philzook58/cart_pole

import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.cem import CEMAgent
from rl.memory import EpisodeParameterMemory
from cartpole_env import CartPoleEnv

# Get the environment and extract the number of actions.
env = CartPoleEnv()
np.random.seed(123)
env.seed(123)

nb_actions = env.action_space.n
obs_dim = env.observation_space.shape[0]

# Option 1 : Simple model
model = Sequential()
model.add(Flatten(input_shape=(1, ) + env.observation_space.shape))
model.add(Dense(nb_actions))
model.add(Activation('softmax'))

# Option 2: deep network
# model = Sequential()
# model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
# model.add(Dense(16))
# model.add(Activation('relu'))
# model.add(Dense(16))

Esempio n. 6

0

Mostra file

File: cartpole.py Progetto: veda-s4dhak/AQIL

class Cartpole:
    """
    Cartpole runs the game using the deep neural network and the OpenAI Gym
    """

    USER_ACTION = dict()
    USER_ACTION[1] = "APPLY FORCE RIGHT"
    USER_ACTION[2] = "APPLY FORCE LEFT"
    USER_ACTION[0] = "EXIT"

    USER_INPUT_INDEX = [0, 1, 2]

    def __init__(self, **kwargs):
        """
        Constructor
        """

        self.config = kwargs

        self.threadID = kwargs['model_name']
        self.name = kwargs['model_name']

        # Setting configuration
        self.USER_IMITATION_MODE = kwargs['user_imitation_mode']
        self.PID_IMITATION_MODE = kwargs['pid_imitation_mode']

        # Initializing the environment
        self.env = CartPoleEnv()
        self.observation_space = self.env.observation_space.shape[0]
        self.action_space = self.env.action_space.n

        # Initializing the neural network
        self.model_name = kwargs['model_name']

        # The maximum number of episodes to run
        self.n_episodes = kwargs['n_episodes']

        # Initializing the model
        self.dqn_params = kwargs
        self.dqn_params['observation_space'] = self.observation_space
        self.dqn_params['action_space'] = self.action_space
        self.dqn = CartpoleDQN(**self.dqn_params)

        # Average training loss per step
        self.loss_aggregation = []

        # Total Reward per Episode
        self.reward_aggregation = []

        # User Action per Step
        self.user_action_aggregation = []

        # Machine Action per Step
        self.machine_action_aggregation = []
        self.score_aggregation = []

        # List of lists of output activations for each layer
        self.layer_outputs_list = []

        # Creates directory if directory does not exist
        if not os.path.exists('.//models'):
            os.mkdir('.//models')

        if not os.path.exists('.//plots'):
            os.mkdir('.//plots')

        # Required for PID Control
        self.P = 0
        self.I = 0
        self.D = 0
        self.prev_error = 0

    @staticmethod
    def getch():
        """
        This method gets the user input without requiring the user to press
        enter afterwards
        """

        fd = sys.stdin.fileno()
        old_settings = termios.tcgetattr(fd)
        try:
            tty.setraw(sys.stdin.fileno())
            ch = sys.stdin.read(1)

        finally:
            termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)

        return int(ch)

    def get_user_action(self):
        """Gets the user input and parses the corresponding user action"""

        user_action = None

        print("Please enter an input:")
        user_input = self.getch()

        # Getting user action
        while user_input not in self.USER_INPUT_INDEX:
            print("Please enter an input:")
            user_input = int(self.getch())

            print("User input: {}".format(user_input))

        user_action = self.USER_ACTION[user_input]

        return user_input, user_action

    def get_pid_action(self):

        # PID Constants
        kP = self.config["P"]  # 0.3 Optimal
        kI = self.config["I"]  # 0.1 Optimal
        kD = self.config["D"]  # 10 Optimal
        desired_angle = 0

        # 1) Get the pole angle
        pole_angle = self.env.theta

        # Error computation
        error = desired_angle - pole_angle

        # 2) Compute action
        self.P = error
        self.I += error
        self.D = error - self.prev_error
        action = kP * self.P + kI * self.I + kD * self.D

        self.prev_error = error

        return (1 if action < 0 else 0)

    def plot_data(self):
        """
        Plots the loss across the episodes which have been run
        """

        figure_title = '{} Experiment Results'.format(self.model_name)

        fig = plt.figure(figure_title, figsize=(8, 15))
        nrows = 5 if self.USER_IMITATION_MODE else 4

        # Plots Model Loss graph
        ax1 = fig.add_subplot(nrows, 1, 1)
        plt.plot(self.loss_aggregation)
        ax1.set_yscale('log')
        plt.title('Model Loss')
        plt.ylabel('Average Loss')
        plt.xlabel('Step')

        # Plots Reward graph
        ax2 = plt.subplot(nrows, 1, 2)
        plt.plot(self.reward_aggregation)
        plt.title('Reward')
        plt.ylabel('Reward')
        plt.xlabel('Episode')

        # Plots Machine Action Graph
        ax3 = plt.subplot(nrows, 1, 3)
        plt.plot(self.machine_action_aggregation)
        plt.title('Machine Action')
        plt.ylabel('Action')
        plt.xlabel('Step')

        # Plots Score
        ax4 = plt.subplot(nrows, 1, 4)
        plt.plot(self.score_aggregation)
        plt.title('Score')
        plt.ylabel('Score')
        plt.xlabel('Episode')

        # Plots User Action Graph if IMITATION_MODE
        if self.USER_IMITATION_MODE:
            ax5 = plt.subplot(nrows, 1, 5)
            plt.plot(self.user_action_aggregation)
            plt.title('User Action')
            plt.ylabel('Action')
            plt.xlabel('Step')

        plt.tight_layout(pad=3, h_pad=3)

        plt.savefig(os.path.join(".", "plots",
                                 "{}.png".format(self.model_name)),
                    bbox_inches='tight')

        # https://stackoverflow.com/questions/34732305/contour-plot-of-2d-array-in-matplotlib
        # Plots weight contours

        self.model_weights = self.dqn.get_weights()
        weight_fig, weight_axes = plt.subplots(1, 4, figsize=(10, 10))

        for i in range(4):
            h, w = self.model_weights[i][0].shape
            X, Y = np.mgrid[0:1:(h * 1j), 0:1:(w * 1j)]
            c1 = weight_axes[i].contourf(X, Y, self.model_weights[i][0])
            plt.colorbar(c1, ax=weight_axes[i])
            weight_axes[i].set_title('Layer {}'.format(i + 1))
            print('Layer {} Weight Shape: ({}, {})'.format(i + 1, h, w))

        plt.tight_layout()
        weight_fig.savefig(os.path.join(
            ".", "plots", "{}.png".format(self.model_name + '_weights')),
                           bbox_inches='tight')
        plt.show()

        # Plots max activations of each layer
        # Stacks data by layer
        # self.acts_by_layer = [
        #     np.stack([self.layer_outputs_list[i][layer] for i in range(len(self.layer_outputs_list))], axis=-1)
        #     for layer in range(5)]
        # print(self.acts_by_layer[0].shape)
        # print(self.acts_by_layer[1].shape)
        # print(self.acts_by_layer[2].shape)
        # print(self.acts_by_layer[3].shape)
        # print(self.acts_by_layer[4].shape)

        # Gets indices of max values along batch axis
        # self.batch_indices_by_layer = [np.argmax(self.acts_by_layer[layer], axis=-1) for layer in range(5)]
        # print(self.batch_indices_by_layer[0].shape)
        # print(self.batch_indices_by_layer[1].shape)
        # print(self.batch_indices_by_layer[2].shape)
        # print(self.batch_indices_by_layer[3].shape)
        # print(self.batch_indices_by_layer[4].shape)

        # Gets input for each max activation by layer
        # self.max_act_inputs_by_layer = [
        #     [self.acts_by_layer[0][:, :, i] for i in np.reshape(self.batch_indices_by_layer[layer], [-1])] for
        #     layer
        #     in range(5)]

        # for i in range(len(self.max_act_inputs_by_layer)):
        #     print('N_Activations: {} Input_Shape: {}'.format(len(self.max_act_inputs_by_layer[i]),
        #                                                      self.max_act_inputs_by_layer[i][0].shape))

        with open(os.path.join(".", "plots", "{}.txt".format(self.model_name)),
                  'w') as f:
            json.dump(self.config, f)

        # Generates dicts for saving to csv
        loss_aggregation_dict = dict()
        action_dict = dict()
        reward_dict = dict()

        # Creates dict for loss data
        loss_aggregation_dict['Loss'] = self.loss_aggregation

        # Creates dict for User Action data if IMITATION_MODE
        if self.USER_IMITATION_MODE:
            action_dict['User_Action'] = self.user_action_aggregation

        # Creates dict for Machine Action data
        action_dict['Machine_Action'] = self.machine_action_aggregation

        # Creates dict for Reward data
        reward_dict['Reward'] = self.reward_aggregation
        # reward_dict['Reward'] = self.reward_aggregation

        # for episode_num in range(0, len(self.loss_aggregation)):
        #     loss_aggregation_dict[episode_num] = self.loss_aggregation[episode_num]

        # Saving the data to a csv
        df = pd.DataFrame.from_dict(loss_aggregation_dict)
        df.to_csv(os.path.join(".", "plots",
                               "{}.csv".format(self.model_name + '_loss')),
                  header=True,
                  index=True)

        df = pd.DataFrame.from_dict(action_dict)
        df.to_csv(os.path.join(".", "plots",
                               "{}.csv".format(self.model_name + '_action')),
                  header=True,
                  index=True)

        df = pd.DataFrame.from_dict(reward_dict)
        df.to_csv(os.path.join(".", "plots",
                               "{}.csv".format(self.model_name + '_reward')),
                  header=True,
                  index=True)

    def run(self):
        """
        Runs the cartpole game (main program entry point)
        """
        # The number of episodes which have completed
        episode = 0

        user_action_string = None

        while (user_action_string != "EXIT") and (episode < self.n_episodes):

            # Environment reset
            state = self.env.reset()
            state = np.reshape(state, [1, self.observation_space])
            step = 0

            # Episode Reward
            r_episode = 0

            # Running the episode
            print('Episode: {}'.format(episode))
            while True:

                # Rendering the step
                step += 1
                self.env.render()

                # Getting the user action based on the specified mode
                if not self.USER_IMITATION_MODE:
                    user_action_string = None
                    user_action = None
                else:
                    user_action, user_action_string = self.get_user_action()
                    user_action -= 1
                    self.user_action_aggregation.append(user_action)

                if self.USER_IMITATION_MODE:
                    user_action, user_action_string = self.get_user_action()
                    user_action -= 1
                    self.user_action_aggregation.append(user_action)
                elif self.PID_IMITATION_MODE:
                    pid_action = self.get_pid_action()
                    self.user_action_aggregation.append(pid_action)
                    user_action = pid_action
                else:
                    user_action_string = None
                    user_action = None

                # Exiting on user request
                # This will also save the model and plot the loss
                if user_action_string == "EXIT":
                    print("Saving model...")
                    loss, r, layer_outputs = self.dqn.experience_replay(
                        save=True)
                    if layer_outputs != -1:
                        self.layer_outputs_list += layer_outputs
                    self.loss_aggregation.append(loss)
                    self.reward_aggregation.append(r_episode)
                    print("Saved model.")
                    break

                # Getting the machine action
                machine_action = self.dqn.act(state)

                # Records machine action for step step
                self.machine_action_aggregation.append(machine_action)

                # Printing actions
                if self.USER_IMITATION_MODE or self.PID_IMITATION_MODE:
                    print("User Action: {} Machine Action: {}".format(
                        user_action, machine_action))
                else:
                    print("Machine Action: {}".format(machine_action))

                # Computing the state
                state_next, reward, terminal, info = self.env.step(
                    machine_action, user_input=user_action)

                # Computing the reward
                reward = reward if not terminal else -reward
                r_episode += reward

                state_next = np.reshape(state_next,
                                        [1, self.observation_space])

                # Storing the step for experience replay
                self.dqn.remember(state, machine_action, reward, state_next,
                                  terminal)

                # Setting the current state to be the next state
                state = state_next

                # Post processing
                loss = 1
                if (episode % 1 == 0) and terminal:
                    print('Saving models...')
                    loss, r_step, layer_outputs = self.dqn.experience_replay(
                        save=True)
                else:
                    loss, r_step, layer_outputs = self.dqn.experience_replay(
                        save=False)

                if layer_outputs != -1:
                    self.layer_outputs_list += layer_outputs

                # Checking if game over
                if terminal:
                    print("Episode: {} Exploration: {} Score: {}".format(
                        episode, self.dqn.exploration_rate, step))
                    self.reward_aggregation.append(r_episode)
                    self.score_aggregation.append(step)
                    episode += 1

                    # input() # Debugging at the end of every episode
                    break

                # Adds loss to plot list, if replay buffer is ready for training
                if loss != -1:
                    self.loss_aggregation.append(loss)

                # Getting ready for next state
                print("Reward: {} Step: {} Episode: {} Loss: {}".format(
                    reward, step, episode, loss))

        self.plot_data()