def main():
    env = CartPoleEnv()
    agent = DQNAgent(env.state_size(), env.action_size())
    epsilon = EPSILON_START
    results = []
    start = time.time()
    random.seed(0)

    for episode in range(EPISODES):
        # Start game/episode
        state = env.reset()

        if episode > SWITCH_FREQ and episode % SWITCH_FREQ == 0:
            agent.update_target_model()

        # Loop inside one game episode
        for t in range(STEPS):
            # Display the game. Comment out the line below for faster training.
            env.render()

            state_action_q_values = agent.forward(torch.from_numpy(state))
            if random.random() <= epsilon:
                action = random.randrange(env.action_size())
            else:
                action = torch.argmax(state_action_q_values).item()

            next_state, reward, done = env.step(action)
            agent.remember(state, action, reward, next_state)

            if done or (t == STEPS - 1):
                print("episode: {}/{}, score: {}, e: {:.2}".format(
                    episode, EPISODES, t, epsilon))
                results.append(t)
                break

            if episode > 10 and (episode + t) % UPDATE_FREQ == 0:
                agent.backward()

            state = next_state

        if epsilon > EPSILON_END:
            epsilon *= EPSILON_DECAY

    end = time.time()
    print("TIME")
    print(end - start)
    print("STEPS")
    print(sum(results))
    plt.plot(results)
    plt.show()
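# The training loop above assumes a DQNAgent exposing forward(), remember(), backward() and
# update_target_model(). A minimal PyTorch sketch of such an agent follows; it is NOT the
# project's DQNAgent, and the layer sizes, buffer size, GAMMA, BATCH_SIZE and learning rate
# are illustrative assumptions only.
import random
from collections import deque

import numpy as np
import torch
import torch.nn as nn

GAMMA = 0.99       # assumed discount factor
BATCH_SIZE = 32    # assumed minibatch size


class DQNAgent:
    def __init__(self, state_size, action_size):
        self.action_size = action_size
        # Online and target networks share the same architecture
        self.model = nn.Sequential(nn.Linear(state_size, 64), nn.ReLU(),
                                   nn.Linear(64, action_size))
        self.target_model = nn.Sequential(nn.Linear(state_size, 64), nn.ReLU(),
                                          nn.Linear(64, action_size))
        self.update_target_model()
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=1e-3)
        self.memory = deque(maxlen=10000)

    def forward(self, state):
        # Q-values for every action in the given state
        return self.model(state.float())

    def remember(self, state, action, reward, next_state):
        # Store one transition in the replay buffer (matching the 4-tuple used above)
        self.memory.append((state, action, reward, next_state))

    def update_target_model(self):
        # Copy the online network's weights into the frozen target network
        self.target_model.load_state_dict(self.model.state_dict())

    def backward(self):
        # One gradient step on a random minibatch sampled from the replay buffer
        if len(self.memory) < BATCH_SIZE:
            return
        batch = random.sample(self.memory, BATCH_SIZE)
        states = torch.tensor(np.array([s for s, _, _, _ in batch]), dtype=torch.float32)
        actions = torch.tensor([a for _, a, _, _ in batch], dtype=torch.int64)
        rewards = torch.tensor([r for _, _, r, _ in batch], dtype=torch.float32)
        next_states = torch.tensor(np.array([n for _, _, _, n in batch]), dtype=torch.float32)

        # Q(s, a) for the actions actually taken
        q = self.model(states).gather(1, actions.unsqueeze(1)).squeeze(1)
        # Bootstrap target from the target network; kept constant w.r.t. gradients
        with torch.no_grad():
            target = rewards + GAMMA * self.target_model(next_states).max(1).values

        loss = nn.functional.mse_loss(q, target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()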
def main():
    env = CartPoleEnv()
    agent = DQNAgent(env.state_size(), env.action_size())
    epsilon = EPSILON_START
    results = []
    start = time.time()

    for episode in range(EPISODES):
        # Start game/episode
        state = env.reset()

        # Loop inside one game episode
        for t in range(STEPS):
            # Display the game. Comment out the line below for faster training.
            env.render()

            # 0.  You are currently in "state S" (state).
            # 1.1 Determine the action q-values from state S.
            # 1.2 Calculate the action to take from state S using an epsilon-greedy off-policy rule.
            # 1.3 Play/perform the action in the environment:
            #     move to "next state S'" (next_state), get the reward, and a flag for game over
            #     (is the new state terminal).
            pass
            done = True  # Update this flag correctly

            # 2.1 From state S', peek into the future: determine the action q-values from state S'.
            # 2.2 Update the net using the SARSA-MAX (Q-learning) formula.
            #     Suggestion: start with the formula Q(S, A) <- R + gamma * max(Q(S', A'))
            #     Hint 1: only perform the update for the action actually taken (see 1.2).
            #     Hint 2: the target is a no_grad constant.
            pass

            if done or (t == STEPS - 1):
                print("episode: {}/{}, score: {}, e: {:.2}".format(
                    episode, EPISODES, t, epsilon))
                results.append(t)
                break

            # 3.1 The current state is now next_state.
            pass

        if epsilon > EPSILON_END:
            epsilon *= EPSILON_DECAY

    end = time.time()
    print("TIME")
    print(end - start)
    print("STEPS")
    print(sum(results))
    plt.plot(results)
    plt.show()
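# A minimal, self-contained sketch (not part of the exercise template) of the epsilon-greedy
# step and SARSA-MAX / Q-learning update described in the numbered comments above. It assumes
# a plain torch.nn.Module `q_net`, an optimizer, and a discount factor `gamma`; these names
# are illustrative and not required by the template.
import random

import torch
import torch.nn.functional as F


def q_learning_step(q_net, optimizer, env, state, epsilon, gamma=0.99):
    """One environment step plus a single-sample Q-learning update."""
    # 1.1 / 1.2: epsilon-greedy action selection from Q(S, .)
    q_values = q_net(torch.from_numpy(state).float())
    if random.random() <= epsilon:
        action = random.randrange(env.action_size())
    else:
        action = torch.argmax(q_values).item()

    # 1.3: perform the action in the environment
    next_state, reward, done = env.step(action)

    # 2.1 / 2.2: bootstrap target R + gamma * max_a' Q(S', a'); the target is a constant
    with torch.no_grad():
        next_q = q_net(torch.from_numpy(next_state).float())
        target = reward + (0.0 if done else gamma * torch.max(next_q).item())

    # Only the q-value of the action actually taken is updated
    loss = F.mse_loss(q_values[action], torch.tensor(target, dtype=torch.float32))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # 3.1: the current state is now next_state
    return next_state, reward, done, loss.item()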
def main():
    env = CartPoleEnv()
    model = deepq.models.mlp([64])
    act = deepq.learn(
        env,
        q_func=model,
        lr=1e-3,
        max_timesteps=500000,
        buffer_size=50000,
        exploration_fraction=0.1,
        exploration_final_eps=0.02,
        print_freq=10,
        callback=callback
    )
    print("Saving model to cartpole_model.pkl")
    act.save("cartpole_model.pkl")
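# The `callback` passed to deepq.learn above is not defined in this snippet. Under the
# OpenAI Baselines convention it receives learn()'s locals and globals and returns True to
# stop training early. A sketch following the stock Baselines CartPole example; the
# 199-reward threshold is an assumption, not a value prescribed here.
def callback(lcl, _glb):
    # Stop training once the mean reward over the last 100 episodes reaches ~199
    is_solved = lcl['t'] > 100 and sum(lcl['episode_rewards'][-101:-1]) / 100 >= 199
    return is_solved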
import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.cem import CEMAgent
from rl.memory import EpisodeParameterMemory

from cartpole_env import CartPoleEnv

# Get the environment and extract the number of actions.
env = CartPoleEnv()
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n
obs_dim = env.observation_space.shape[0]

# Option 1: Simple model
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(nb_actions))
model.add(Activation('softmax'))

# Option 2: deep network
# model = Sequential()
# model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
# model.add(Dense(16))
# model.add(Activation('relu'))
# model.add(Dense(16))
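# The snippet above stops after defining the model. A sketch of the usual keras-rl CEM
# training flow that would typically follow; the hyperparameters (batch_size, elite_frac,
# nb_steps, warmup, weight filename) are illustrative defaults, not values prescribed by
# this project.
memory = EpisodeParameterMemory(limit=1000, window_length=1)
cem = CEMAgent(model=model, nb_actions=nb_actions, memory=memory,
               batch_size=50, nb_steps_warmup=2000, train_interval=50, elite_frac=0.05)
cem.compile()

# Train, save the learned weights, then evaluate for a few episodes.
cem.fit(env, nb_steps=100000, visualize=False, verbose=2)
cem.save_weights('cem_cartpole_weights.h5f', overwrite=True)
cem.test(env, nb_episodes=5, visualize=True)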
import json
import os
import sys
import termios
import tty

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from cartpole_env import CartPoleEnv
# NOTE: CartpoleDQN is a project-local class; its import line is omitted here because the
# module path is not shown in this snippet.


class Cartpole:
    """ Cartpole runs the game using the deep neural network and the OpenAI Gym """

    USER_ACTION = dict()
    USER_ACTION[1] = "APPLY FORCE RIGHT"
    USER_ACTION[2] = "APPLY FORCE LEFT"
    USER_ACTION[0] = "EXIT"

    USER_INPUT_INDEX = [0, 1, 2]

    def __init__(self, **kwargs):
        """ Constructor """
        self.config = kwargs
        self.threadID = kwargs['model_name']
        self.name = kwargs['model_name']

        # Setting configuration
        self.USER_IMITATION_MODE = kwargs['user_imitation_mode']
        self.PID_IMITATION_MODE = kwargs['pid_imitation_mode']

        # Initializing the environment
        self.env = CartPoleEnv()
        self.observation_space = self.env.observation_space.shape[0]
        self.action_space = self.env.action_space.n

        # Initializing the neural network
        self.model_name = kwargs['model_name']

        # The maximum number of episodes to run
        self.n_episodes = kwargs['n_episodes']

        # Initializing the model
        self.dqn_params = kwargs
        self.dqn_params['observation_space'] = self.observation_space
        self.dqn_params['action_space'] = self.action_space
        self.dqn = CartpoleDQN(**self.dqn_params)

        # Average training loss per step
        self.loss_aggregation = []

        # Total reward per episode
        self.reward_aggregation = []

        # User action per step
        self.user_action_aggregation = []

        # Machine action per step
        self.machine_action_aggregation = []

        # Score (steps survived) per episode
        self.score_aggregation = []

        # List of lists of output activations for each layer
        self.layer_outputs_list = []

        # Creates the output directories if they do not exist
        if not os.path.exists('.//models'):
            os.mkdir('.//models')
        if not os.path.exists('.//plots'):
            os.mkdir('.//plots')

        # Required for PID control
        self.P = 0
        self.I = 0
        self.D = 0
        self.prev_error = 0

    @staticmethod
    def getch():
        """ Gets a single character of user input without requiring the user to press enter """
        fd = sys.stdin.fileno()
        old_settings = termios.tcgetattr(fd)
        try:
            tty.setraw(sys.stdin.fileno())
            ch = sys.stdin.read(1)
        finally:
            termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)
        return int(ch)

    def get_user_action(self):
        """ Gets the user input and parses the corresponding user action """
        user_action = None

        print("Please enter an input:")
        user_input = self.getch()

        # Getting user action
        while user_input not in self.USER_INPUT_INDEX:
            print("Please enter an input:")
            user_input = int(self.getch())

        print("User input: {}".format(user_input))
        user_action = self.USER_ACTION[user_input]

        return user_input, user_action

    def get_pid_action(self):
        """ Computes a bang-bang action from a PID controller on the pole angle """
        # PID constants
        kP = self.config["P"]  # 0.3 optimal
        kI = self.config["I"]  # 0.1 optimal
        kD = self.config["D"]  # 10 optimal
        desired_angle = 0

        # 1) Get the pole angle
        pole_angle = self.env.theta

        # Error computation
        error = desired_angle - pole_angle

        # 2) Compute action
        self.P = error
        self.I += error
        self.D = error - self.prev_error
        action = kP * self.P + kI * self.I + kD * self.D
        self.prev_error = error

        return 1 if action < 0 else 0

    def plot_data(self):
        """ Plots the loss across the episodes which have been run """
        figure_title = '{} Experiment Results'.format(self.model_name)
        fig = plt.figure(figure_title, figsize=(8, 15))
        nrows = 5 if self.USER_IMITATION_MODE else 4

        # Plots model loss graph
        ax1 = fig.add_subplot(nrows, 1, 1)
        plt.plot(self.loss_aggregation)
        ax1.set_yscale('log')
        plt.title('Model Loss')
        plt.ylabel('Average Loss')
        plt.xlabel('Step')

        # Plots reward graph
        ax2 = plt.subplot(nrows, 1, 2)
        plt.plot(self.reward_aggregation)
        plt.title('Reward')
        plt.ylabel('Reward')
        plt.xlabel('Episode')

        # Plots machine action graph
        ax3 = plt.subplot(nrows, 1, 3)
        plt.plot(self.machine_action_aggregation)
        plt.title('Machine Action')
        plt.ylabel('Action')
        plt.xlabel('Step')

        # Plots score graph
        ax4 = plt.subplot(nrows, 1, 4)
        plt.plot(self.score_aggregation)
        plt.title('Score')
        plt.ylabel('Score')
        plt.xlabel('Episode')

        # Plots user action graph if IMITATION_MODE
        if self.USER_IMITATION_MODE:
            ax5 = plt.subplot(nrows, 1, 5)
            plt.plot(self.user_action_aggregation)
            plt.title('User Action')
            plt.ylabel('Action')
            plt.xlabel('Step')

        plt.tight_layout(pad=3, h_pad=3)
        plt.savefig(os.path.join(".", "plots", "{}.png".format(self.model_name)),
                    bbox_inches='tight')

        # https://stackoverflow.com/questions/34732305/contour-plot-of-2d-array-in-matplotlib
        # Plots weight contours
        self.model_weights = self.dqn.get_weights()
        weight_fig, weight_axes = plt.subplots(1, 4, figsize=(10, 10))
        for i in range(4):
            h, w = self.model_weights[i][0].shape
            X, Y = np.mgrid[0:1:(h * 1j), 0:1:(w * 1j)]
            c1 = weight_axes[i].contourf(X, Y, self.model_weights[i][0])
            plt.colorbar(c1, ax=weight_axes[i])
            weight_axes[i].set_title('Layer {}'.format(i + 1))
            print('Layer {} Weight Shape: ({}, {})'.format(i + 1, h, w))

        plt.tight_layout()
        weight_fig.savefig(os.path.join(
            ".", "plots", "{}.png".format(self.model_name + '_weights')),
            bbox_inches='tight')
        plt.show()

        # Plots max activations of each layer
        # Stacks data by layer
        # self.acts_by_layer = [
        #     np.stack([self.layer_outputs_list[i][layer] for i in range(len(self.layer_outputs_list))], axis=-1)
        #     for layer in range(5)]
        # print(self.acts_by_layer[0].shape)
        # print(self.acts_by_layer[1].shape)
        # print(self.acts_by_layer[2].shape)
        # print(self.acts_by_layer[3].shape)
        # print(self.acts_by_layer[4].shape)

        # Gets indices of max values along batch axis
        # self.batch_indices_by_layer = [np.argmax(self.acts_by_layer[layer], axis=-1) for layer in range(5)]
        # print(self.batch_indices_by_layer[0].shape)
        # print(self.batch_indices_by_layer[1].shape)
        # print(self.batch_indices_by_layer[2].shape)
        # print(self.batch_indices_by_layer[3].shape)
        # print(self.batch_indices_by_layer[4].shape)

        # Gets input for each max activation by layer
        # self.max_act_inputs_by_layer = [
        #     [self.acts_by_layer[0][:, :, i] for i in np.reshape(self.batch_indices_by_layer[layer], [-1])]
        #     for layer in range(5)]
        # for i in range(len(self.max_act_inputs_by_layer)):
        #     print('N_Activations: {} Input_Shape: {}'.format(len(self.max_act_inputs_by_layer[i]),
        #                                                      self.max_act_inputs_by_layer[i][0].shape))

        with open(os.path.join(".", "plots", "{}.txt".format(self.model_name)), 'w') as f:
            json.dump(self.config, f)

        # Generates dicts for saving to csv
        loss_aggregation_dict = dict()
        action_dict = dict()
        reward_dict = dict()

        # Creates dict for loss data
        loss_aggregation_dict['Loss'] = self.loss_aggregation

        # Creates dict for user action data if IMITATION_MODE
        if self.USER_IMITATION_MODE:
            action_dict['User_Action'] = self.user_action_aggregation

        # Creates dict for machine action data
        action_dict['Machine_Action'] = self.machine_action_aggregation

        # Creates dict for reward data
        reward_dict['Reward'] = self.reward_aggregation

        # reward_dict['Reward'] = self.reward_aggregation
        # for episode_num in range(0, len(self.loss_aggregation)):
        #     loss_aggregation_dict[episode_num] = self.loss_aggregation[episode_num]

        # Saving the data to csv files
        df = pd.DataFrame.from_dict(loss_aggregation_dict)
        df.to_csv(os.path.join(".", "plots", "{}.csv".format(self.model_name + '_loss')),
                  header=True, index=True)

        df = pd.DataFrame.from_dict(action_dict)
        df.to_csv(os.path.join(".", "plots", "{}.csv".format(self.model_name + '_action')),
                  header=True, index=True)

        df = pd.DataFrame.from_dict(reward_dict)
        df.to_csv(os.path.join(".", "plots", "{}.csv".format(self.model_name + '_reward')),
                  header=True, index=True)
"{}.csv".format(self.model_name + '_reward')), header=True, index=True) def run(self): """ Runs the cartpole game (main program entry point) """ # The number of episodes which have completed episode = 0 user_action_string = None while (user_action_string != "EXIT") and (episode < self.n_episodes): # Environment reset state = self.env.reset() state = np.reshape(state, [1, self.observation_space]) step = 0 # Episode Reward r_episode = 0 # Running the episode print('Episode: {}'.format(episode)) while True: # Rendering the step step += 1 self.env.render() # Getting the user action based on the specified mode if not self.USER_IMITATION_MODE: user_action_string = None user_action = None else: user_action, user_action_string = self.get_user_action() user_action -= 1 self.user_action_aggregation.append(user_action) if self.USER_IMITATION_MODE: user_action, user_action_string = self.get_user_action() user_action -= 1 self.user_action_aggregation.append(user_action) elif self.PID_IMITATION_MODE: pid_action = self.get_pid_action() self.user_action_aggregation.append(pid_action) user_action = pid_action else: user_action_string = None user_action = None # Exiting on user request # This will also save the model and plot the loss if user_action_string == "EXIT": print("Saving model...") loss, r, layer_outputs = self.dqn.experience_replay( save=True) if layer_outputs != -1: self.layer_outputs_list += layer_outputs self.loss_aggregation.append(loss) self.reward_aggregation.append(r_episode) print("Saved model.") break # Getting the machine action machine_action = self.dqn.act(state) # Records machine action for step step self.machine_action_aggregation.append(machine_action) # Printing actions if self.USER_IMITATION_MODE or self.PID_IMITATION_MODE: print("User Action: {} Machine Action: {}".format( user_action, machine_action)) else: print("Machine Action: {}".format(machine_action)) # Computing the state state_next, reward, terminal, info = self.env.step( machine_action, user_input=user_action) # Computing the reward reward = reward if not terminal else -reward r_episode += reward state_next = np.reshape(state_next, [1, self.observation_space]) # Storing the step for experience replay self.dqn.remember(state, machine_action, reward, state_next, terminal) # Setting the current state to be the next state state = state_next # Post processing loss = 1 if (episode % 1 == 0) and terminal: print('Saving models...') loss, r_step, layer_outputs = self.dqn.experience_replay( save=True) else: loss, r_step, layer_outputs = self.dqn.experience_replay( save=False) if layer_outputs != -1: self.layer_outputs_list += layer_outputs # Checking if game over if terminal: print("Episode: {} Exploration: {} Score: {}".format( episode, self.dqn.exploration_rate, step)) self.reward_aggregation.append(r_episode) self.score_aggregation.append(step) episode += 1 # input() # Debugging at the end of every episode break # Adds loss to plot list, if replay buffer is ready for training if loss != -1: self.loss_aggregation.append(loss) # Getting ready for next state print("Reward: {} Step: {} Episode: {} Loss: {}".format( reward, step, episode, loss)) self.plot_data()