Example no. 1
    def __init__(self, **kwargs):
        """
        Constructor
        """

        self.config = kwargs

        self.threadID = kwargs['model_name']
        self.name = kwargs['model_name']

        # Setting configuration
        self.USER_IMITATION_MODE = kwargs['user_imitation_mode']
        self.PID_IMITATION_MODE = kwargs['pid_imitation_mode']

        # Initializing the environment
        self.env = CartPoleEnv()
        self.observation_space = self.env.observation_space.shape[0]
        self.action_space = self.env.action_space.n

        # Initializing the neural network
        self.model_name = kwargs['model_name']

        # The maximum number of episodes to run
        self.n_episodes = kwargs['n_episodes']

        # Initializing the model
        self.dqn_params = kwargs
        self.dqn_params['observation_space'] = self.observation_space
        self.dqn_params['action_space'] = self.action_space
        self.dqn = CartpoleDQN(**self.dqn_params)

        # Average training loss per step
        self.loss_aggregation = []

        # Total Reward per Episode
        self.reward_aggregation = []

        # User Action per Step
        self.user_action_aggregation = []

        # Machine Action per Step
        self.machine_action_aggregation = []
        self.score_aggregation = []

        # List of lists of output activations for each layer
        self.layer_outputs_list = []

        # Creates the output directories if they do not exist
        if not os.path.exists('./models'):
            os.mkdir('./models')

        if not os.path.exists('./plots'):
            os.mkdir('./plots')

        # Required for PID Control
        self.P = 0
        self.I = 0
        self.D = 0
        self.prev_error = 0
Example no. 2
def main():
    env = CartPoleEnv()
    agent = DQNAgent(env.state_size(), env.action_size())

    epsilon = EPSILON_START
    results = []
    start = time.time()
    random.seed(0)

    for episode in range(EPISODES):
        #Start game/episode
        state = env.reset()

        if (episode > SWITCH_FREQ and episode % SWITCH_FREQ == 0):
            agent.update_target_model()

        #Loop inside one game episode
        for t in range(STEPS):
            # Display the game. Comment out the line below for faster training.
            env.render()

            state_action_q_values = agent.forward(torch.from_numpy(state))
            if random.random() <= epsilon:
                action = random.randrange(env.action_size())
            else:
                action = torch.argmax(state_action_q_values).item()

            next_state, reward, done = env.step(action)

            agent.remember(state, action, reward, next_state)

            if done or (t == STEPS - 1):
                print("episode: {}/{}, score: {}, e: {:.2}".format(
                    episode, EPISODES, t, epsilon))
                results.append(t)
                break

            if episode > 10 and (episode + t) % UPDATE_FREQ == 0:
                agent.backward()

            state = next_state

        if epsilon > EPSILON_END:
            epsilon *= EPSILON_DECAY

    end = time.time()
    print("TIME")
    print(end - start)
    print("STEPS")
    print(sum(results))
    plt.plot(results)
    plt.show()
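
The loop above calls forward, remember, backward, and update_target_model on a DQNAgent that is not shown. Below is a minimal sketch of an agent exposing that interface, assuming a small PyTorch MLP, a deque replay buffer, and an MSE loss against a periodically synced target network; the layer sizes, gamma, learning rate, buffer size, and batch size are assumptions, not values from the original code.

import random
from collections import deque

import numpy as np
import torch
import torch.nn as nn


class DQNAgent:
    """Minimal DQN agent exposing the interface used by the loop above."""

    def __init__(self, state_size, action_size, gamma=0.95, lr=1e-3,
                 memory_size=10000, batch_size=64):
        # Online network: a small MLP mapping a state to per-action Q-values
        self.net = nn.Sequential(nn.Linear(state_size, 64), nn.ReLU(),
                                 nn.Linear(64, action_size))
        # Target network used for the bootstrap target, synced periodically
        self.target_net = nn.Sequential(nn.Linear(state_size, 64), nn.ReLU(),
                                        nn.Linear(64, action_size))
        self.update_target_model()
        self.memory = deque(maxlen=memory_size)
        self.gamma = gamma
        self.batch_size = batch_size
        self.optimizer = torch.optim.Adam(self.net.parameters(), lr=lr)

    def forward(self, state):
        """Returns the Q-values for every action in the given state."""
        return self.net(state.float())

    def remember(self, state, action, reward, next_state):
        """Stores a transition for later replay."""
        self.memory.append((state, action, reward, next_state))

    def update_target_model(self):
        """Copies the online weights into the target network."""
        self.target_net.load_state_dict(self.net.state_dict())

    def backward(self):
        """Trains on a random minibatch of remembered transitions."""
        if len(self.memory) < self.batch_size:
            return
        batch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states = zip(*batch)
        states = torch.from_numpy(np.stack(states)).float()
        next_states = torch.from_numpy(np.stack(next_states)).float()
        actions = torch.tensor(actions).unsqueeze(1)
        rewards = torch.tensor(rewards, dtype=torch.float32)

        # Q(S, A) for the actions that were actually taken
        q_taken = self.net(states).gather(1, actions).squeeze(1)
        # Bootstrap target R + gamma * max_a' Q_target(S', a'), held constant
        with torch.no_grad():
            target = rewards + self.gamma * \
                self.target_net(next_states).max(dim=1).values
        loss = nn.functional.mse_loss(q_taken, target)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()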
Example no. 3
def main():
    env = CartPoleEnv()
    agent = DQNAgent(env.state_size(), env.action_size())

    epsilon = EPSILON_START
    results = []
    start = time.time()

    for episode in range(EPISODES):
        #Start game/episode
        state = env.reset()

        #Loop inside one game episode
        for t in range(STEPS):
            # Display the game. Comment out the line below for faster training.
            env.render()

            # 0. Currently you are in state S (state)
            # 1.1 Determine the action Q-values for state S.
            # 1.2 Calculate the action to take from state S, using epsilon-greedy ('e-rand') off-policy selection.
            # 1.3 Play/perform the action in the environment.
            # Move to the next state S' (next_state), get the reward, and a flag for whether the game is over (is the new state terminal)

            pass
            done = True  #Update this flag correctly

            # 2.1 From state S', peek into the future - determine the action Q-values for state S'
            # 2.2 Update the net using the SARSA-MAX formula.
            # Suggestion: you can start with the formula: Q(S,A) <- R + gamma * max(Q(S',A'))
            # Hint 1: don't forget that you should only perform the update for the action taken in #1.2
            # Hint 2: don't forget that the target is a no_grad constant.
            pass

            if done or (t == STEPS - 1):
                print("episode: {}/{}, score: {}, e: {:.2}".format(
                    episode, EPISODES, t, epsilon))
                results.append(t)
                break

            #3.1 Current state is now next_state
            pass

        if epsilon > EPSILON_END:
            epsilon *= EPSILON_DECAY

    end = time.time()
    print("TIME")
    print(end - start)
    print("STEPS")
    print(sum(results))
    plt.plot(results)
    plt.show()
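
The numbered comments above leave the actual steps as an exercise; the pass placeholders and the hard-coded done = True are meant to be replaced. One possible way to fill them in, sketched as a helper that performs steps 1.1-3.1 for a single time step, is shown below. It assumes the same agent.forward interface as in Example no. 2 and that the agent is a torch.nn.Module (so agent.parameters() exists); the optimizer, learning rate, and gamma value are assumptions.

import random

import torch


def q_learning_step(env, agent, optimizer, state, epsilon, gamma=0.95):
    """Runs one epsilon-greedy step and a SARSA-MAX update; returns (next_state, done)."""
    # 1.1 Q-values for the current state S
    q_values = agent.forward(torch.from_numpy(state).float())

    # 1.2 Epsilon-greedy ('e-rand') off-policy action selection
    if random.random() <= epsilon:
        action = random.randrange(env.action_size())
    else:
        action = torch.argmax(q_values).item()

    # 1.3 Play the action; observe S', the reward, and the terminal flag
    next_state, reward, done = env.step(action)

    # 2.1 Peek into S'; the bootstrap target is a no_grad constant
    with torch.no_grad():
        next_q_values = agent.forward(torch.from_numpy(next_state).float())
        target = reward + (0.0 if done else gamma * torch.max(next_q_values).item())

    # 2.2 Q(S,A) <- R + gamma * max(Q(S',A')), updating only the taken action
    loss = (q_values[action] - target) ** 2
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # 3.1 The caller makes next_state the current state
    return next_state, done

Inside the skeleton's inner loop this would replace the placeholders with
state, done = q_learning_step(env, agent, optimizer, state, epsilon), with the
optimizer created once before the episode loop, for example
optimizer = torch.optim.Adam(agent.parameters(), lr=1e-3).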
Example no. 4
def main():
    env = CartPoleEnv()
    model = deepq.models.mlp([64])
    act = deepq.learn(
        env,
        q_func=model,
        lr=1e-3,
        max_timesteps=500000,
        buffer_size=50000,
        exploration_fraction=0.1,
        exploration_final_eps=0.02,
        print_freq=10,
        callback=callback
    )
    print("Saving model to cartpole_model.pkl")
    act.save("cartpole_model.pkl")
Example no. 5
import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.cem import CEMAgent
from rl.memory import EpisodeParameterMemory
from cartpole_env import CartPoleEnv

# Get the environment and extract the number of actions.
env = CartPoleEnv()
np.random.seed(123)
env.seed(123)

nb_actions = env.action_space.n
obs_dim = env.observation_space.shape[0]

# Option 1 : Simple model
model = Sequential()
model.add(Flatten(input_shape=(1, ) + env.observation_space.shape))
model.add(Dense(nb_actions))
model.add(Activation('softmax'))

# Option 2: deep network
# model = Sequential()
# model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
# model.add(Dense(16))
# model.add(Activation('relu'))
# model.add(Dense(16))
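
The snippet is cut off in the middle of the commented-out deep-network option, before any agent is built or trained. For reference, this is how the CEM agent is assembled in keras-rl's own CartPole example; the memory size and agent hyperparameters below come from that example, not from this code.

# Memory that stores whole episodes together with the parameters that produced them
memory = EpisodeParameterMemory(limit=1000, window_length=1)

# Cross-entropy method agent: refits its search distribution on the best
# elite_frac of sampled parameter sets
cem = CEMAgent(model=model, nb_actions=nb_actions, memory=memory,
               batch_size=50, nb_steps_warmup=2000, train_interval=50,
               elite_frac=0.05)
cem.compile()

# Train, save the resulting weights, then run a few evaluation episodes
cem.fit(env, nb_steps=100000, visualize=False, verbose=2)
cem.save_weights('cem_cartpole_params.h5f', overwrite=True)
cem.test(env, nb_episodes=5, visualize=True)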
Example no. 6
class Cartpole:
    """
    Cartpole runs the game using the deep neural network and the OpenAI Gym
    """

    USER_ACTION = {
        0: "EXIT",
        1: "APPLY FORCE RIGHT",
        2: "APPLY FORCE LEFT",
    }

    USER_INPUT_INDEX = [0, 1, 2]

    def __init__(self, **kwargs):
        """
        Constructor
        """

        self.config = kwargs

        self.threadID = kwargs['model_name']
        self.name = kwargs['model_name']

        # Setting configuration
        self.USER_IMITATION_MODE = kwargs['user_imitation_mode']
        self.PID_IMITATION_MODE = kwargs['pid_imitation_mode']

        # Initializing the environment
        self.env = CartPoleEnv()
        self.observation_space = self.env.observation_space.shape[0]
        self.action_space = self.env.action_space.n

        # Initializing the neural network
        self.model_name = kwargs['model_name']

        # The maximum number of episodes to run
        self.n_episodes = kwargs['n_episodes']

        # Initializing the model
        self.dqn_params = kwargs
        self.dqn_params['observation_space'] = self.observation_space
        self.dqn_params['action_space'] = self.action_space
        self.dqn = CartpoleDQN(**self.dqn_params)

        # Average training loss per step
        self.loss_aggregation = []

        # Total Reward per Episode
        self.reward_aggregation = []

        # User Action per Step
        self.user_action_aggregation = []

        # Machine Action per Step
        self.machine_action_aggregation = []
        self.score_aggregation = []

        # List of lists of output activations for each layer
        self.layer_outputs_list = []

        # Creates the output directories if they do not exist
        if not os.path.exists('./models'):
            os.mkdir('./models')

        if not os.path.exists('./plots'):
            os.mkdir('./plots')

        # Required for PID Control
        self.P = 0
        self.I = 0
        self.D = 0
        self.prev_error = 0

    @staticmethod
    def getch():
        """
        This method gets the user input without requiring the user to press
        enter afterwards
        """

        fd = sys.stdin.fileno()
        old_settings = termios.tcgetattr(fd)
        try:
            tty.setraw(sys.stdin.fileno())
            ch = sys.stdin.read(1)

        finally:
            termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)

        # Non-digit keys map to -1 so the caller can simply re-prompt
        return int(ch) if ch.isdigit() else -1

    def get_user_action(self):
        """Gets the user input and parses the corresponding user action"""

        user_action = None

        print("Please enter an input:")
        user_input = self.getch()

        # Getting user action
        while user_input not in self.USER_INPUT_INDEX:
            print("Please enter an input:")
            user_input = self.getch()  # getch() already returns an int

            print("User input: {}".format(user_input))

        user_action = self.USER_ACTION[user_input]

        return user_input, user_action

    def get_pid_action(self):
        """Computes a discrete action from a PID controller on the pole angle"""

        # PID Constants
        kP = self.config["P"]  # 0.3 Optimal
        kI = self.config["I"]  # 0.1 Optimal
        kD = self.config["D"]  # 10 Optimal
        desired_angle = 0

        # 1) Get the pole angle
        pole_angle = self.env.theta

        # Error computation
        error = desired_angle - pole_angle

        # 2) Compute action
        self.P = error
        self.I += error
        self.D = error - self.prev_error
        action = kP * self.P + kI * self.I + kD * self.D

        self.prev_error = error

        return (1 if action < 0 else 0)

    def plot_data(self):
        """
        Plots the loss across the episodes which have been run
        """

        figure_title = '{} Experiment Results'.format(self.model_name)

        fig = plt.figure(figure_title, figsize=(8, 15))
        nrows = 5 if self.USER_IMITATION_MODE else 4

        # Plots Model Loss graph
        ax1 = fig.add_subplot(nrows, 1, 1)
        plt.plot(self.loss_aggregation)
        ax1.set_yscale('log')
        plt.title('Model Loss')
        plt.ylabel('Average Loss')
        plt.xlabel('Step')

        # Plots Reward graph
        ax2 = plt.subplot(nrows, 1, 2)
        plt.plot(self.reward_aggregation)
        plt.title('Reward')
        plt.ylabel('Reward')
        plt.xlabel('Episode')

        # Plots Machine Action Graph
        ax3 = plt.subplot(nrows, 1, 3)
        plt.plot(self.machine_action_aggregation)
        plt.title('Machine Action')
        plt.ylabel('Action')
        plt.xlabel('Step')

        # Plots Score
        ax4 = plt.subplot(nrows, 1, 4)
        plt.plot(self.score_aggregation)
        plt.title('Score')
        plt.ylabel('Score')
        plt.xlabel('Episode')

        # Plots User Action Graph if IMITATION_MODE
        if self.USER_IMITATION_MODE:
            ax5 = plt.subplot(nrows, 1, 5)
            plt.plot(self.user_action_aggregation)
            plt.title('User Action')
            plt.ylabel('Action')
            plt.xlabel('Step')

        plt.tight_layout(pad=3, h_pad=3)

        plt.savefig(os.path.join(".", "plots",
                                 "{}.png".format(self.model_name)),
                    bbox_inches='tight')

        # https://stackoverflow.com/questions/34732305/contour-plot-of-2d-array-in-matplotlib
        # Plots weight contours

        self.model_weights = self.dqn.get_weights()
        weight_fig, weight_axes = plt.subplots(1, 4, figsize=(10, 10))

        for i in range(4):
            h, w = self.model_weights[i][0].shape
            X, Y = np.mgrid[0:1:(h * 1j), 0:1:(w * 1j)]
            c1 = weight_axes[i].contourf(X, Y, self.model_weights[i][0])
            plt.colorbar(c1, ax=weight_axes[i])
            weight_axes[i].set_title('Layer {}'.format(i + 1))
            print('Layer {} Weight Shape: ({}, {})'.format(i + 1, h, w))

        plt.tight_layout()
        weight_fig.savefig(os.path.join(
            ".", "plots", "{}.png".format(self.model_name + '_weights')),
                           bbox_inches='tight')
        plt.show()

        # Plots max activations of each layer
        # Stacks data by layer
        # self.acts_by_layer = [
        #     np.stack([self.layer_outputs_list[i][layer] for i in range(len(self.layer_outputs_list))], axis=-1)
        #     for layer in range(5)]
        # print(self.acts_by_layer[0].shape)
        # print(self.acts_by_layer[1].shape)
        # print(self.acts_by_layer[2].shape)
        # print(self.acts_by_layer[3].shape)
        # print(self.acts_by_layer[4].shape)

        # Gets indices of max values along batch axis
        # self.batch_indices_by_layer = [np.argmax(self.acts_by_layer[layer], axis=-1) for layer in range(5)]
        # print(self.batch_indices_by_layer[0].shape)
        # print(self.batch_indices_by_layer[1].shape)
        # print(self.batch_indices_by_layer[2].shape)
        # print(self.batch_indices_by_layer[3].shape)
        # print(self.batch_indices_by_layer[4].shape)

        # Gets input for each max activation by layer
        # self.max_act_inputs_by_layer = [
        #     [self.acts_by_layer[0][:, :, i] for i in np.reshape(self.batch_indices_by_layer[layer], [-1])] for
        #     layer
        #     in range(5)]

        # for i in range(len(self.max_act_inputs_by_layer)):
        #     print('N_Activations: {} Input_Shape: {}'.format(len(self.max_act_inputs_by_layer[i]),
        #                                                      self.max_act_inputs_by_layer[i][0].shape))

        with open(os.path.join(".", "plots", "{}.txt".format(self.model_name)),
                  'w') as f:
            json.dump(self.config, f)

        # Generates dicts for saving to csv
        loss_aggregation_dict = dict()
        action_dict = dict()
        reward_dict = dict()

        # Creates dict for loss data
        loss_aggregation_dict['Loss'] = self.loss_aggregation

        # Creates dict for User Action data if IMITATION_MODE
        if self.USER_IMITATION_MODE:
            action_dict['User_Action'] = self.user_action_aggregation

        # Creates dict for Machine Action data
        action_dict['Machine_Action'] = self.machine_action_aggregation

        # Creates dict for Reward data
        reward_dict['Reward'] = self.reward_aggregation

        # for episode_num in range(0, len(self.loss_aggregation)):
        #     loss_aggregation_dict[episode_num] = self.loss_aggregation[episode_num]

        # Saving the data to a csv
        df = pd.DataFrame.from_dict(loss_aggregation_dict)
        df.to_csv(os.path.join(".", "plots",
                               "{}.csv".format(self.model_name + '_loss')),
                  header=True,
                  index=True)

        df = pd.DataFrame.from_dict(action_dict)
        df.to_csv(os.path.join(".", "plots",
                               "{}.csv".format(self.model_name + '_action')),
                  header=True,
                  index=True)

        df = pd.DataFrame.from_dict(reward_dict)
        df.to_csv(os.path.join(".", "plots",
                               "{}.csv".format(self.model_name + '_reward')),
                  header=True,
                  index=True)

    def run(self):
        """
        Runs the cartpole game (main program entry point)
        """
        # The number of episodes which have completed
        episode = 0

        user_action_string = None

        while (user_action_string != "EXIT") and (episode < self.n_episodes):

            # Environment reset
            state = self.env.reset()
            state = np.reshape(state, [1, self.observation_space])
            step = 0

            # Episode Reward
            r_episode = 0

            # Running the episode
            print('Episode: {}'.format(episode))
            while True:

                # Rendering the step
                step += 1
                self.env.render()

                # Getting the user/PID action based on the specified mode
                if self.USER_IMITATION_MODE:
                    user_action, user_action_string = self.get_user_action()
                    user_action -= 1  # maps keyboard input 1/2 to env actions 0/1
                    self.user_action_aggregation.append(user_action)
                elif self.PID_IMITATION_MODE:
                    pid_action = self.get_pid_action()
                    self.user_action_aggregation.append(pid_action)
                    user_action = pid_action
                    user_action_string = None
                else:
                    user_action_string = None
                    user_action = None

                # Exiting on user request
                # This will also save the model and plot the loss
                if user_action_string == "EXIT":
                    print("Saving model...")
                    loss, r, layer_outputs = self.dqn.experience_replay(
                        save=True)
                    if layer_outputs != -1:
                        self.layer_outputs_list += layer_outputs
                    self.loss_aggregation.append(loss)
                    self.reward_aggregation.append(r_episode)
                    print("Saved model.")
                    break

                # Getting the machine action
                machine_action = self.dqn.act(state)

                # Records the machine action for this step
                self.machine_action_aggregation.append(machine_action)

                # Printing actions
                if self.USER_IMITATION_MODE or self.PID_IMITATION_MODE:
                    print("User Action: {} Machine Action: {}".format(
                        user_action, machine_action))
                else:
                    print("Machine Action: {}".format(machine_action))

                # Computing the state
                state_next, reward, terminal, info = self.env.step(
                    machine_action, user_input=user_action)

                # Computing the reward
                reward = reward if not terminal else -reward
                r_episode += reward

                state_next = np.reshape(state_next,
                                        [1, self.observation_space])

                # Storing the step for experience replay
                self.dqn.remember(state, machine_action, reward, state_next,
                                  terminal)

                # Setting the current state to be the next state
                state = state_next

                # Post processing: trains from the replay buffer, saving the
                # model at the end of each episode
                if terminal:
                    print('Saving models...')
                    loss, r_step, layer_outputs = self.dqn.experience_replay(
                        save=True)
                else:
                    loss, r_step, layer_outputs = self.dqn.experience_replay(
                        save=False)

                if layer_outputs != -1:
                    self.layer_outputs_list += layer_outputs

                # Checking if game over
                if terminal:
                    print("Episode: {} Exploration: {} Score: {}".format(
                        episode, self.dqn.exploration_rate, step))
                    self.reward_aggregation.append(r_episode)
                    self.score_aggregation.append(step)
                    episode += 1

                    # input() # Debugging at the end of every episode
                    break

                # Adds loss to plot list, if replay buffer is ready for training
                if loss != -1:
                    self.loss_aggregation.append(loss)

                # Getting ready for next state
                print("Reward: {} Step: {} Episode: {} Loss: {}".format(
                    reward, step, episode, loss))

        self.plot_data()
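
For completeness, a minimal sketch of how this class might be driven. The keyword arguments below are the ones read by __init__ and get_pid_action; CartpoleDQN may expect additional kwargs that are not visible in this snippet, and the concrete values (model name, episode count) are placeholders.

if __name__ == '__main__':
    game = Cartpole(
        model_name='cartpole_pid_imitation',  # placeholder name
        user_imitation_mode=False,            # learn from keyboard input if True
        pid_imitation_mode=True,              # learn from the PID controller if True
        n_episodes=100,                       # placeholder episode budget
        P=0.3, I=0.1, D=10,                   # the gains noted as optimal in get_pid_action
    )
    game.run()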