class DeepQAgent(object):
    """
    Implementation of Deep Q Neural Network agent like in:
        Nature 518. "Human-level control through deep reinforcement learning" (Mnih & al. 2015)
    """

    def __init__(self, input_shape, nb_actions,
                 gamma=0.99, explorer=LinearEpsilonAnnealingExplorer(1, 0.1, 1000000),
                 learning_rate=0.00025, momentum=0.95, minibatch_size=32,
                 memory_size=500000, train_after=10000, train_interval=4,
                 target_update_interval=10000, monitor=True):
        """
        Build the action-value network, its frozen target copy, the loss and the trainer.

        Args:
            input_shape: Shape of the network input; ``input_shape[1:]`` is the
                shape handed to the replay memory.
            nb_actions: Number of outputs of the network (one Q-value per action).
            gamma: Discount factor applied to the target network's best Q-value.
            explorer: Object answering ``is_exploring(step)`` and, when called with
                ``nb_actions``, returning an action.
            learning_rate: Per-minibatch learning rate given to Adam.
            momentum: Momentum given to Adam.
            minibatch_size: Number of transitions requested from the replay memory
                per training step.
            memory_size: Size passed to the replay memory.
            train_after: ``train()`` does nothing before this many actions were taken.
            train_interval: ``train()`` only trains when the action count is a
                multiple of this value.
            target_update_interval: The target network is re-cloned (and a checkpoint
                saved) when the action count is a multiple of this value.
            monitor: When true, a TensorBoard writer logging to ``metrics`` is created.
        """
        self.input_shape = input_shape
        self.nb_actions = nb_actions
        self.gamma = gamma

        self._train_after = train_after
        self._train_interval = train_interval
        self._target_update_interval = target_update_interval

        self._explorer = explorer
        self._minibatch_size = minibatch_size
        self._history = History(input_shape)
        self._memory = ReplayMemory(memory_size, input_shape[1:], 4)
        self._num_actions_taken = 0

        # Metrics accumulator
        self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

        # Action Value model (used by agent to interact with the environment)
        with default_options(activation=relu, init=he_uniform()):
            self._action_value_net = Sequential([
                Convolution2D((8, 8), 16, strides=4),
                Convolution2D((4, 4), 32, strides=2),
                Convolution2D((3, 3), 32, strides=1),
                Dense(256, init=he_uniform(scale=0.01)),
                Dense(nb_actions, activation=None, init=he_uniform(scale=0.01))
            ])
        self._action_value_net.update_signature(Tensor[input_shape])

        # Target model used to compute the target Q-values in training, updated
        # less frequently for increased stability.
        self._target_net = self._action_value_net.clone(CloneMethod.freeze)

        # Function computing Q-values targets as part of the computation graph
        @Function
        @Signature(post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def compute_q_targets(post_states, rewards, terminals):
            return element_select(
                terminals,
                rewards,
                gamma * reduce_max(self._target_net(post_states), axis=0) + rewards,
            )

        # Define the loss, using Huber Loss (more robust to outliers)
        @Function
        @Signature(pre_states=Tensor[input_shape], actions=Tensor[nb_actions],
                   post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def criterion(pre_states, actions, post_states, rewards, terminals):
            # Compute the q_targets
            q_targets = compute_q_targets(post_states, rewards, terminals)

            # actions is a 1-hot encoding of the action done by the agent
            q_acted = reduce_sum(self._action_value_net(pre_states) * actions, axis=0)

            # Define training criterion as the Huber Loss function
            return huber_loss(q_targets, q_acted, 1.0)

        # Adam based SGD
        lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
        m_schedule = momentum_schedule(momentum)
        vm_schedule = momentum_schedule(0.999)
        l_sgd = adam(self._action_value_net.parameters, lr_schedule,
                     momentum=m_schedule, variance_momentum=vm_schedule)

        self._metrics_writer = TensorBoardProgressWriter(
            freq=1, log_dir='metrics', model=criterion) if monitor else None
        self._learner = l_sgd
        self._trainer = Trainer(criterion, (criterion, None), l_sgd, self._metrics_writer)

    def act(self, state):
        """
        Select the next action to perform for the current environment state.
        It follows the terminology used in the Nature paper.

        Args:
            state (Tensor[input_shape]): The current environment state

        Returns:
            Int >= 0 : Next action to do
        """
        # Append the state to the short term memory (ie. History)
        self._history.append(state)

        # If policy requires agent to explore, sample random action
        if self._explorer.is_exploring(self._num_actions_taken):
            action = self._explorer(self.nb_actions)
        else:
            # Use the network to output the best action
            env_with_history = self._history.value
            q_values = self._action_value_net.eval(
                # Append batch axis with only one sample to evaluate
                env_with_history.reshape((1,) + env_with_history.shape)
            )

            self._episode_q_means.append(np.mean(q_values))
            self._episode_q_stddev.append(np.std(q_values))

            # Return the value maximizing the expected reward
            action = q_values.argmax()

        # Keep track of interval action counter
        self._num_actions_taken += 1
        return action

    def observe(self, old_state, action, reward, done):
        """
        Record the outcome of the action selected through act() on old_state.

        Args:
            old_state (Tensor[input_shape]): Previous environment state
            action (int): Action done by the agent
            reward (float): Reward for doing this action in the old_state environment
            done (bool): Indicate if the action has terminated the environment
        """
        self._episode_rewards.append(reward)

        # If done, reset short term memory (ie. History)
        if done:
            # Plot the metrics through Tensorboard and reset buffers
            if self._metrics_writer is not None:
                self._plot_metrics()
            self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

            # Reset the short term memory
            self._history.reset()

        # Append to long term memory
        self._memory.append(old_state, action, reward, done)

    def train(self):
        """
        Train the action-value network on one minibatch sampled from the replay memory.

        Nothing happens until ``train_after`` actions were taken, and afterwards only
        on steps that are a multiple of ``train_interval``. On such a step that is
        also a multiple of ``target_update_interval`` the target network is re-cloned
        from the action-value network and a checkpoint is written to
        ``models/model<step>``.
        """
        agent_step = self._num_actions_taken

        if agent_step < self._train_after:
            return
        if (agent_step % self._train_interval) != 0:
            return

        pre_states, actions, post_states, rewards, terminals = \
            self._memory.minibatch(self._minibatch_size)

        self._trainer.train_minibatch(
            self._trainer.loss_function.argument_map(
                pre_states=pre_states,
                actions=Value.one_hot(actions.reshape(-1, 1).tolist(), self.nb_actions),
                post_states=post_states,
                rewards=rewards,
                terminals=terminals
            )
        )

        # Update the Target Network if needed
        if (agent_step % self._target_update_interval) == 0:
            # NOTE(review): this rebinds the attribute only; the criterion built in
            # __init__ presumably still references the first clone - confirm that
            # the target network used in the loss is actually refreshed.
            self._target_net = self._action_value_net.clone(CloneMethod.freeze)

            # Build the path with os.path so it is valid on every platform; the
            # previous literal "models\model%d" relied on an invalid "\m" escape.
            import os
            checkpoint_dir = "models"
            if not os.path.isdir(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            filename = os.path.join(checkpoint_dir, "model%d" % agent_step)
            self._trainer.save_checkpoint(filename)

    def _plot_metrics(self):
        """Write the per-episode accumulated metrics to the TensorBoard writer."""
        step = self._num_actions_taken

        # float() replaces np.asscalar, which no longer exists in recent NumPy.
        if len(self._episode_q_means) > 0:
            mean_q = float(np.mean(self._episode_q_means))
            self._metrics_writer.write_value('Mean Q per ep.', mean_q, step)

        if len(self._episode_q_stddev) > 0:
            std_q = float(np.mean(self._episode_q_stddev))
            self._metrics_writer.write_value('Mean Std Q per ep.', std_q, step)

        self._metrics_writer.write_value('Sum rewards per ep.', sum(self._episode_rewards), step)
class DeepQAgent(object):
    """
    Implementation of Deep Q Neural Network agent like in:
        Nature 518. "Human-level control through deep reinforcement learning" (Mnih & al. 2015)
    """

    def __init__(self, input_shape, nb_actions,
                 gamma=0.95, explorer=LinearEpsilonAnnealingExplorer(1, 0.1, 100000),
                 learning_rate=0.01, momentum=0.8, minibatch_size=16,
                 memory_size=15000, train_after=100, train_interval=100,
                 target_update_interval=500, monitor=True):
        """
        Build the action-value network, its frozen target copy, the loss and the trainer.

        Args:
            input_shape: Shape of the network input; ``input_shape[1:]`` is the
                shape handed to the replay memory.
            nb_actions: Number of outputs of the network (one Q-value per action).
            gamma: Discount factor applied to the target network's best Q-value.
            explorer: Object answering ``is_exploring(step)`` and, when called with
                ``nb_actions``, returning an action.
            learning_rate: Per-minibatch learning rate given to Adam.
            momentum: Momentum given to Adam.
            minibatch_size: Number of transitions requested from the replay memory
                per training step.
            memory_size: Size passed to the replay memory.
            train_after: ``train()`` does nothing before this many actions were taken.
            train_interval: Stored on the instance; ``train()`` does not currently
                consult it (the interval check is disabled).
            target_update_interval: Stored on the instance; ``train()`` currently
                refreshes the target network every 20 training calls instead.
            monitor: When true, a TensorBoard writer logging to ``metrics`` is created.
        """
        self.input_shape = input_shape
        self.nb_actions = nb_actions
        self.gamma = gamma

        self._train_after = train_after
        self._train_interval = train_interval
        self._target_update_interval = target_update_interval

        self._explorer = explorer
        self._minibatch_size = minibatch_size
        self._history = History(input_shape)
        self._memory = ReplayMemory(memory_size, input_shape[1:], 4)
        self._num_actions_taken = 0
        self._num_trains = 0

        # Metrics accumulator
        self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

        # Action Value model (used by agent to interact with the environment).
        # A small fully-connected network is used here in place of the
        # convolutional stack (8x8/16, 4x4/32, 3x3/32, Dense 256) of the original.
        with default_options(activation=relu, init=he_uniform()):
            self._action_value_net = Sequential([
                Dense(7, init=he_uniform(scale=0.01)),
                Dense(8, init=he_uniform(scale=0.01)),
                Dense(nb_actions, activation=None, init=he_uniform(scale=0.01))
            ])
        self._action_value_net.update_signature(Tensor[input_shape])

        # Target model used to compute the target Q-values in training, updated
        # less frequently for increased stability.
        self._target_net = self._action_value_net.clone(CloneMethod.freeze)

        # Function computing Q-values targets as part of the computation graph
        @Function
        @Signature(post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def compute_q_targets(post_states, rewards, terminals):
            return element_select(
                terminals,
                rewards,
                gamma * reduce_max(self._target_net(post_states), axis=0) + rewards,
            )

        # Define the loss, using Huber Loss (more robust to outliers)
        @Function
        @Signature(pre_states=Tensor[input_shape], actions=Tensor[nb_actions],
                   post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def criterion(pre_states, actions, post_states, rewards, terminals):
            # Compute the q_targets
            q_targets = compute_q_targets(post_states, rewards, terminals)

            # actions is a 1-hot encoding of the action done by the agent
            q_acted = reduce_sum(self._action_value_net(pre_states) * actions, axis=0)

            # Define training criterion as the Huber Loss function
            return huber_loss(q_targets, q_acted, 1.0)

        # Adam based SGD
        lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
        m_schedule = momentum_schedule(momentum)
        vm_schedule = momentum_schedule(0.999)
        l_sgd = adam(self._action_value_net.parameters, lr_schedule,
                     momentum=m_schedule, variance_momentum=vm_schedule)

        self._metrics_writer = TensorBoardProgressWriter(
            freq=1, log_dir='metrics', model=criterion) if monitor else None
        self._learner = l_sgd
        self._trainer = Trainer(criterion, (criterion, None), l_sgd, self._metrics_writer)
        # NOTE: to resume from a saved model, call
        # self._trainer.restore_from_checkpoint(<path>) at this point.

    def act(self, state):
        """
        Select the next action to perform for the current environment state.
        It follows the terminology used in the Nature paper.

        Args:
            state (Tensor[input_shape]): The current environment state

        Returns:
            Int >= 0 : Next action to do
        """
        # Append the state to the short term memory (ie. History)
        self._history.append(state)

        # If policy requires agent to explore, sample random action
        if self._explorer.is_exploring(self._num_actions_taken):
            action = self._explorer(self.nb_actions)
        else:
            # Use the network to output the best action
            env_with_history = self._history.value
            q_values = self._action_value_net.eval(
                # Append batch axis with only one sample to evaluate
                env_with_history.reshape((1,) + env_with_history.shape)
            )

            self._episode_q_means.append(np.mean(q_values))
            self._episode_q_stddev.append(np.std(q_values))

            # Return the value maximizing the expected reward
            action = q_values.argmax()

        # Keep track of interval action counter
        self._num_actions_taken += 1
        return action

    def observe(self, old_state, action, reward, done):
        """
        Record the outcome of the action selected through act() on old_state.

        Args:
            old_state (Tensor[input_shape]): Previous environment state
            action (int): Action done by the agent
            reward (float): Reward for doing this action in the old_state environment
            done (bool): Indicate if the action has terminated the environment
        """
        self._episode_rewards.append(reward)

        # If done, reset short term memory (ie. History)
        if done:
            # Plot the metrics through Tensorboard and reset buffers
            if self._metrics_writer is not None:
                self._plot_metrics()
            self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

            # Reset the short term memory
            self._history.reset()

        # Append to long term memory
        self._memory.append(old_state, action, reward, done)

    def train(self):
        """
        Train the action-value network on one minibatch sampled from the replay memory.

        Nothing happens until ``train_after`` actions were taken. Every call after
        that trains once; the car controls are first set to ``zero_controls``
        through the module-level ``client``. Every 20th training call the target
        network is re-cloned and a checkpoint is written to
        ``<dirname>/model<step>`` (``dirname`` is a module-level name).
        """
        agent_step = self._num_actions_taken

        if agent_step < self._train_after:
            return

        print('\nTraining minibatch\n')
        client.setCarControls(zero_controls)
        pre_states, actions, post_states, rewards, terminals = \
            self._memory.minibatch(self._minibatch_size)

        self._trainer.train_minibatch(
            self._trainer.loss_function.argument_map(
                pre_states=pre_states,
                actions=Value.one_hot(actions.reshape(-1, 1).tolist(), self.nb_actions),
                post_states=post_states,
                rewards=rewards,
                terminals=terminals
            )
        )
        self._num_trains += 1

        # Update the Target Network if needed
        if self._num_trains % 20 == 0:
            print('updating network')
            # NOTE(review): this rebinds the attribute only; the criterion built in
            # __init__ presumably still references the first clone - confirm that
            # the target network used in the loss is actually refreshed.
            self._target_net = self._action_value_net.clone(CloneMethod.freeze)

            # Build the path with os.path so it is valid on every platform; the
            # previous literal "\model%d" relied on an invalid "\m" escape.
            import os
            filename = os.path.join(dirname, "model%d" % agent_step)
            self._trainer.save_checkpoint(filename)

    def _plot_metrics(self):
        """Write the per-episode accumulated metrics to the TensorBoard writer."""
        step = self._num_actions_taken

        # float() replaces np.asscalar, which no longer exists in recent NumPy.
        if len(self._episode_q_means) > 0:
            mean_q = float(np.mean(self._episode_q_means))
            self._metrics_writer.write_value('Mean Q per ep.', mean_q, step)

        if len(self._episode_q_stddev) > 0:
            std_q = float(np.mean(self._episode_q_stddev))
            self._metrics_writer.write_value('Mean Std Q per ep.', std_q, step)

        self._metrics_writer.write_value('Sum rewards per ep.', sum(self._episode_rewards), step)
class DQAgent(object):
    """Deep Q-learning agent built on a small fully-connected action-value network."""

    # TODO: the original author flagged this constructor as needing modification.
    def __init__(self, input_shape, nb_actions,
                 gamma=0.99, explorer=LinearEpsilonAnnealingExplorer(1, 0.1, 1000000),
                 learning_rate=0.00025, momentum=0.95, minibatch_size=32,
                 memory_size=500000, train_after=10000, train_interval=4,
                 target_update_interval=10000, monitor=True):
        """
        Build the action-value network, its frozen target copy, the loss and the trainer.

        Args:
            input_shape: Shape of the network input; also used as the output shape
                of the two hidden Dense layers. ``input_shape[1:]`` is handed to
                the replay memory.
            nb_actions: Number of outputs of the network (one Q-value per action).
            gamma: Discount factor applied to the target network's best Q-value.
            explorer: Object answering ``is_exploring(step)`` and, when called with
                ``nb_actions``, returning an action.
            learning_rate: Per-minibatch learning rate given to Adam.
            momentum: Momentum given to Adam.
            minibatch_size: Number of transitions requested from the replay memory
                per training step.
            memory_size: Size passed to the replay memory.
            train_after: ``train()`` does nothing before this many actions were taken.
            train_interval: ``train()`` only trains when the action count is a
                multiple of this value.
            target_update_interval: The target network is re-cloned (and a checkpoint
                saved) when the action count is a multiple of this value.
            monitor: When true, a TensorBoard writer logging to ``metrics`` is created.
        """
        self.input_shape = input_shape
        self.nb_actions = nb_actions
        self.gamma = gamma

        self._train_after = train_after
        self._train_interval = train_interval
        self._target_update_interval = target_update_interval

        self._explorer = explorer
        self._minibatch_size = minibatch_size
        self._history = History(input_shape)
        self._memory = RepMem(memory_size, input_shape[1:], 4)
        self._num_actions_taken = 0

        # Per-episode metric buffers, flushed by observe() at episode end.
        self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

        # Online network: two hidden layers shaped like the input, then one
        # linear output per action.
        with default_options(activation=relu, init=he_uniform()):
            self._action_value_net = Sequential([
                Dense(input_shape, init=he_uniform(scale=0.01)),
                Dense(input_shape),
                Dense(nb_actions, activation=None, init=he_uniform(scale=0.01))])
        self._action_value_net.update_signature(Tensor[input_shape])

        # Frozen copy used to compute the Q-value targets.
        self._target_net = self._action_value_net.clone(CloneMethod.freeze)

        # Q-value targets: the reward alone on terminal transitions, otherwise
        # reward plus the discounted best target-network Q-value.
        @Function
        @Signature(post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def compute_q_targets(post_states, rewards, terminals):
            return element_select(
                terminals,
                rewards,
                gamma * reduce_max(self._target_net(post_states), axis=0) + rewards,
            )

        # Huber loss between the targets and the Q-value of the action taken
        # (actions is multiplied in as a mask over the network outputs).
        @Function
        @Signature(pre_states=Tensor[input_shape], actions=Tensor[nb_actions],
                   post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def criterion(pre_states, actions, post_states, rewards, terminals):
            q_targets = compute_q_targets(post_states, rewards, terminals)
            q_acted = reduce_sum(self._action_value_net(pre_states) * actions, axis=0)
            return huber_loss(q_targets, q_acted, 1.0)

        # Adam learner
        lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
        m_schedule = momentum_schedule(momentum)
        vm_schedule = momentum_schedule(0.999)
        l_sgd = adam(self._action_value_net.parameters, lr_schedule,
                     momentum=m_schedule, variance_momentum=vm_schedule)

        self._metrics_writer = TensorBoardProgressWriter(
            freq=1, log_dir='metrics', model=criterion) if monitor else None
        self._learner = l_sgd
        self._trainer = Trainer(criterion, (criterion, None), l_sgd, self._metrics_writer)

    def act(self, state):
        """
        Select the next action for ``state``.

        The state is appended to the history; the action then comes either from
        the explorer or from the arg-max of the network's Q-values.

        Args:
            state: The current environment state.

        Returns:
            The selected action.
        """
        self._history.append(state)

        if self._explorer.is_exploring(self._num_actions_taken):
            action = self._explorer(self.nb_actions)
        else:
            env_with_history = self._history.value
            # Prepend a batch axis holding the single sample to evaluate.
            q_values = self._action_value_net.eval(
                env_with_history.reshape((1,) + env_with_history.shape)
            )

            self._episode_q_means.append(np.mean(q_values))
            self._episode_q_stddev.append(np.std(q_values))

            action = q_values.argmax()

        self._num_actions_taken += 1
        return action

    def observe(self, old_state, action, reward, done):
        """
        Record the outcome of ``action`` taken in ``old_state``.

        Args:
            old_state: Previous environment state.
            action: Action done by the agent.
            reward: Reward received for the action.
            done: Whether the action terminated the episode; when true the metrics
                are flushed (if monitoring) and the history is reset.
        """
        self._episode_rewards.append(reward)

        if done:
            if self._metrics_writer is not None:
                self._plot_metrics()
            self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

            self._history.reset()

        self._memory.append(old_state, action, reward, done)

    def train(self):
        """
        Train the action-value network on one minibatch sampled from the replay memory.

        Nothing happens until ``train_after`` actions were taken, and afterwards only
        on steps that are a multiple of ``train_interval``. On such a step that is
        also a multiple of ``target_update_interval`` the target network is re-cloned
        and a checkpoint is written to ``model/model<step>``.
        """
        agent_step = self._num_actions_taken

        if agent_step < self._train_after:
            return
        if (agent_step % self._train_interval) != 0:
            return

        pre_states, actions, post_states, rewards, terminals = \
            self._memory.minibatch(self._minibatch_size)

        self._trainer.train_minibatch(
            self._trainer.loss_function.argument_map(
                pre_states=pre_states,
                actions=Value.one_hot(actions.reshape(-1, 1).tolist(), self.nb_actions),
                post_states=post_states,
                rewards=rewards,
                terminals=terminals
            )
        )

        if (agent_step % self._target_update_interval) == 0:
            # NOTE(review): this rebinds the attribute only; the criterion built in
            # __init__ presumably still references the first clone - confirm that
            # the target network used in the loss is actually refreshed.
            self._target_net = self._action_value_net.clone(CloneMethod.freeze)

            # Build the path with os.path so it is valid on every platform; the
            # previous literal "model\model%d" relied on an invalid "\m" escape.
            import os
            checkpoint_dir = "model"
            if not os.path.isdir(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            filename = os.path.join(checkpoint_dir, "model%d" % agent_step)
            self._trainer.save_checkpoint(filename)

    def _plot_metrics(self):
        """Write the per-episode accumulated metrics to the TensorBoard writer."""
        step = self._num_actions_taken

        # float() replaces np.asscalar, which no longer exists in recent NumPy.
        if len(self._episode_q_means) > 0:
            mean_q = float(np.mean(self._episode_q_means))
            self._metrics_writer.write_value('Mean Q per ep.', mean_q, step)

        if len(self._episode_q_stddev) > 0:
            std_q = float(np.mean(self._episode_q_stddev))
            self._metrics_writer.write_value('Mean Std Q per ep.', std_q, step)

        self._metrics_writer.write_value('Sum rewards per ep', sum(self._episode_rewards), step)
class DeepQAgent(object):
    """
    Implementation of Deep Q Neural Network agent like in:
        Nature 518. "Human-level control through deep reinforcement learning" (Mnih & al. 2015)
    """

    def __init__(self, input_shape, nb_actions,
                 gamma=0.99, explorer=LinearEpsilonAnnealingExplorer(1, 0.1, 1000000),
                 learning_rate=0.00025, momentum=0.95, minibatch_size=32,
                 memory_size=500000, train_after=200000, train_interval=4,
                 target_update_interval=10000, monitor=True):
        """
        Build the action-value network, its frozen target copy, the loss and the trainer.

        Args:
            input_shape: Shape of the network input; ``input_shape[1:]`` is the
                shape handed to the replay memory.
            nb_actions: Number of outputs of the network (one Q-value per action).
            gamma: Discount factor applied to the target network's best Q-value.
            explorer: Object answering ``is_exploring(step)`` and, when called with
                ``nb_actions``, returning an action.
            learning_rate: Per-minibatch learning rate given to Adam.
            momentum: Momentum given to Adam.
            minibatch_size: Number of transitions requested from the replay memory
                per training step.
            memory_size: Size passed to the replay memory.
            train_after: ``train()`` does nothing before this many actions were taken.
            train_interval: ``train()`` only trains when the action count is a
                multiple of this value.
            target_update_interval: The target network is re-cloned (and a checkpoint
                saved) when the action count is a multiple of this value.
            monitor: When true, a TensorBoard writer logging to ``metrics`` is created.
        """
        self.input_shape = input_shape
        self.nb_actions = nb_actions
        self.gamma = gamma

        self._train_after = train_after
        self._train_interval = train_interval
        self._target_update_interval = target_update_interval

        self._explorer = explorer
        self._minibatch_size = minibatch_size
        self._history = History(input_shape)
        self._memory = ReplayMemory(memory_size, input_shape[1:], 4)
        self._num_actions_taken = 0

        # Metrics accumulator
        self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

        # Action Value model (used by agent to interact with the environment)
        with default_options(activation=relu, init=he_uniform()):
            self._action_value_net = Sequential([
                Convolution2D((8, 8), 16, strides=4),
                Convolution2D((4, 4), 32, strides=2),
                Convolution2D((3, 3), 32, strides=1),
                Dense(256, init=he_uniform(scale=0.01)),
                Dense(nb_actions, activation=None, init=he_uniform(scale=0.01))
            ])
        self._action_value_net.update_signature(Tensor[input_shape])

        # Target model used to compute the target Q-values in training, updated
        # less frequently for increased stability.
        self._target_net = self._action_value_net.clone(CloneMethod.freeze)

        # Function computing Q-values targets as part of the computation graph
        @Function
        @Signature(post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def compute_q_targets(post_states, rewards, terminals):
            return element_select(
                terminals,
                rewards,
                gamma * reduce_max(self._target_net(post_states), axis=0) + rewards,
            )

        # Define the loss, using Huber Loss (more robust to outliers)
        @Function
        @Signature(pre_states=Tensor[input_shape], actions=Tensor[nb_actions],
                   post_states=Tensor[input_shape], rewards=Tensor[()], terminals=Tensor[()])
        def criterion(pre_states, actions, post_states, rewards, terminals):
            # Compute the q_targets
            q_targets = compute_q_targets(post_states, rewards, terminals)

            # actions is a 1-hot encoding of the action done by the agent
            q_acted = reduce_sum(self._action_value_net(pre_states) * actions, axis=0)

            # Define training criterion as the Huber Loss function
            return huber_loss(q_targets, q_acted, 1.0)

        # Adam based SGD
        lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
        m_schedule = momentum_schedule(momentum)
        vm_schedule = momentum_schedule(0.999)
        l_sgd = adam(self._action_value_net.parameters, lr_schedule,
                     momentum=m_schedule, variance_momentum=vm_schedule)

        self._metrics_writer = TensorBoardProgressWriter(
            freq=1, log_dir='metrics', model=criterion) if monitor else None
        self._learner = l_sgd
        self._trainer = Trainer(criterion, (criterion, None), l_sgd, self._metrics_writer)

    def load(self, model_path):
        """Restore the trainer state from the checkpoint at ``model_path``."""
        self._trainer.restore_from_checkpoint(model_path)

    def act(self, state, eval=False):
        """
        Select the next action to perform for the current environment state.
        It follows the terminology used in the Nature paper.

        Args:
            state (Tensor[input_shape]): The current environment state
            eval (bool): When true, exploration is skipped and the network's
                best action is always used.

        Returns:
            Tuple ``(action, q_values)``; ``q_values`` is None when the action
            came from the explorer.
        """
        # Append the state to the short term memory (ie. History)
        self._history.append(state)

        # If policy requires agent to explore, sample random action
        if self._explorer.is_exploring(self._num_actions_taken) and not eval:
            action = self._explorer(self.nb_actions)
            q_values = None
        else:
            # Use the network to output the best action
            env_with_history = self._history.value
            q_values = self._action_value_net.eval(
                # Append batch axis with only one sample to evaluate
                env_with_history.reshape((1, ) + env_with_history.shape))

            self._episode_q_means.append(np.mean(q_values))
            self._episode_q_stddev.append(np.std(q_values))

            # Return the value maximizing the expected reward
            action = q_values.argmax()

        # Keep track of interval action counter
        self._num_actions_taken += 1
        return action, q_values

    def observe(self, old_state, action, reward, done):
        """
        Record the outcome of the action selected through act() on old_state.

        Args:
            old_state (Tensor[input_shape]): Previous environment state
            action (int): Action done by the agent
            reward (float): Reward for doing this action in the old_state environment
            done (bool): Indicate if the action has terminated the environment
        """
        self._episode_rewards.append(reward)

        # If done, reset short term memory (ie. History)
        if done:
            # Plot the metrics through Tensorboard and reset buffers
            if self._metrics_writer is not None:
                self._plot_metrics()
            self._episode_rewards, self._episode_q_means, self._episode_q_stddev = [], [], []

            # Reset the short term memory
            self._history.reset()

        # Append to long term memory
        self._memory.append(old_state, action, reward, done)

    def train(self, checkpoint_dir):
        """
        Train the action-value network on one minibatch sampled from the replay memory.

        Nothing happens until ``train_after`` actions were taken, and afterwards only
        on steps that are a multiple of ``train_interval``. On such a step that is
        also a multiple of ``target_update_interval`` the target network is re-cloned
        and a checkpoint is written.

        Args:
            checkpoint_dir: Directory under which ``models/model<step>``
                checkpoints are written.
        """
        agent_step = self._num_actions_taken

        if agent_step < self._train_after:
            return
        if (agent_step % self._train_interval) != 0:
            return

        pre_states, actions, post_states, rewards, terminals = \
            self._memory.minibatch(self._minibatch_size)

        self._trainer.train_minibatch(
            self._trainer.loss_function.argument_map(
                pre_states=pre_states,
                actions=Value.one_hot(actions.reshape(-1, 1).tolist(), self.nb_actions),
                post_states=post_states,
                rewards=rewards,
                terminals=terminals))

        # Update the Target Network if needed
        if (agent_step % self._target_update_interval) == 0:
            # NOTE(review): this rebinds the attribute only; the criterion built in
            # __init__ presumably still references the first clone - confirm that
            # the target network used in the loss is actually refreshed.
            self._target_net = self._action_value_net.clone(CloneMethod.freeze)

            # Join every path component separately; the previous literal
            # "models\model%d" relied on an invalid "\m" escape and a
            # Windows-only separator.
            models_dir = os.path.join(checkpoint_dir, "models")
            if not os.path.isdir(models_dir):
                os.makedirs(models_dir)
            filename = os.path.join(models_dir, "model%d" % agent_step)
            self._trainer.save_checkpoint(filename)

    def _plot_metrics(self):
        """Write the per-episode accumulated metrics to the TensorBoard writer."""
        step = self._num_actions_taken

        # float() replaces np.asscalar, which no longer exists in recent NumPy.
        if len(self._episode_q_means) > 0:
            mean_q = float(np.mean(self._episode_q_means))
            self._metrics_writer.write_value('Mean Q per ep.', mean_q, step)

        if len(self._episode_q_stddev) > 0:
            std_q = float(np.mean(self._episode_q_stddev))
            self._metrics_writer.write_value('Mean Std Q per ep.', std_q, step)

        self._metrics_writer.write_value('Sum rewards per ep.', sum(self._episode_rewards), step)

    def get_depth_image(self, client):
        """
        Fetch a depth image from AirSim and return it as an 84x84 array.

        Args:
            client: AirSim client exposing ``simGetImages``.

        Returns:
            84x84 array: the inverse-depth image (255 / max(1, depth)), resized,
            converted to 8-bit grayscale and divided by 255. An all-zero 84x84
            array is returned when the response holds at most one value.
        """
        # get depth image from airsim
        responses = client.simGetImages([
            airsim.ImageRequest("RCCamera", airsim.ImageType.DepthPerspective, True, False)
        ])
        response = responses[0]

        # np.float64 replaces the np.float alias, which no longer exists in
        # recent NumPy (it was an alias of the builtin float).
        img1d = np.array(response.image_data_float, dtype=np.float64)
        # Clamp depths below 1 to 1 before inverting.
        img1d = 255 / np.maximum(np.ones(img1d.size), img1d)

        if img1d.size <= 1:
            return np.zeros((84, 84)).astype(float)

        img2d = np.reshape(img1d, (response.height, response.width))
        image = Image.fromarray(img2d)
        im_final = np.array(image.resize((84, 84)).convert('L'))
        return im_final / 255.0

    def get_cov_image(self, coverage_map):
        """
        Get the coverage state image and its reward from ``coverage_map``.

        Args:
            coverage_map: Object exposing ``get_state_from_pose()`` returning
                ``(state, reward)``.

        Returns:
            Tuple ``(state, cov_reward)`` with ``state`` divided by 255.
        """
        state, cov_reward = coverage_map.get_state_from_pose()

        # normalize state
        state = state / 255.0

        return state, cov_reward
t += 1 # Stack together all inputs, hidden states, action gradients, and rewards for this episode epx = np.vstack(states) epl = np.vstack(labels).astype(np.float32) epr = np.vstack(rewards).astype(np.float32) # Compute the discounted reward backwards through time. discounted_epr = discount_rewards(epr) # Train the critic to predict the discounted reward from the observation critic_trainer.train_minibatch({ observations: epx, critic_target: discounted_epr }) baseline = critic.eval({observations: epx}) # Compute n-step targets n_step_targets = compute_n_step_targets(epr, baseline[0]) # Compute the baselined returns: A = n_step_targets - b(s). Weight the gradients by this value. baselined_returns = n_step_targets - baseline # Keep a running estimate over the variance of of the discounted rewards for r in baselined_returns: running_variance.add(r[0, 0]) # Forward pass arguments = { observations: epx, label: epl,
class LearningAgent(object):
    """Double-DQN agent with a Huber loss weighted by importance-sampling weights."""

    def __init__(self, state_dim, action_dim, gamma=0.99, learning_rate=1e-4, momentum=0.95):
        """
        Build the online network, its frozen target copy, the loss and the trainer.

        :param state_dim: Shape of a state, used for the network signature and
            the state input variables
        :param action_dim: Number of actions (network outputs)
        :param gamma: Discount factor applied to the post-state Q-value
        :param learning_rate: Per-minibatch learning rate given to Adam
        :param momentum: Momentum given to Adam
        """
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.gamma = gamma

        with default_options(activation=relu, init=he_uniform()):
            # Convolution filter counts were halved to save on memory, no gpu :(
            self.model = Sequential([
                Convolution2D((8, 8), 16, strides=4, name='conv1'),
                Convolution2D((4, 4), 32, strides=2, name='conv2'),
                Convolution2D((3, 3), 32, strides=1, name='conv3'),
                Dense(256, init=he_uniform(scale=0.01), name='dense1'),
                Dense(action_dim, activation=None, init=he_uniform(scale=0.01), name='actions')
            ])
            self.model.update_signature(Tensor[state_dim])

        # Create the target model as a copy of the online model
        self.target_model = None
        self.update_target()

        self.pre_states = input_variable(state_dim, name='pre_states')
        self.actions = input_variable(action_dim, name='actions')
        self.post_states = input_variable(state_dim, name='post_states')
        self.rewards = input_variable((), name='rewards')
        self.terminals = input_variable((), name='terminals')
        self.is_weights = input_variable((), name='is_weights')

        # Q-value of the action actually taken (actions masks the outputs)
        predicted_q = reduce_sum(self.model(self.pre_states) * self.actions, axis=0)

        # DQN - calculate target q values
        # post_q = reduce_max(self.target_model(self.post_states), axis=0)

        # DDQN - calculate target q values: the online model selects the action,
        # the target model supplies its value
        online_selection = one_hot(
            argmax(self.model(self.post_states), axis=0), self.action_dim)
        post_q = reduce_sum(self.target_model(self.post_states) * online_selection, axis=0)

        # Zero out the bootstrap term on terminal transitions
        post_q = (1.0 - self.terminals) * post_q
        target_q = stop_gradient(self.rewards + self.gamma * post_q)

        # Huber loss
        delta = 1.0
        self.td_error = minus(predicted_q, target_q, name='td_error')
        abs_error = abs(self.td_error)
        errors = element_select(less(abs_error, delta),
                                square(self.td_error) * 0.5,
                                delta * (abs_error - 0.5 * delta))
        loss = errors * self.is_weights

        # Adam based SGD
        lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
        m_schedule = momentum_schedule(momentum)
        vm_schedule = momentum_schedule(0.999)

        self._learner = adam(self.model.parameters, lr_schedule, m_schedule,
                             variance_momentum=vm_schedule)
        self.writer = TensorBoardProgressWriter(log_dir='metrics', model=self.model)
        self.trainer = Trainer(self.model, (loss, None), [self._learner], self.writer)

    def act(self, state, epsilon):
        """
        Selects an action to take based on the epsilon greedy method

        :param state: The current state
        :param epsilon: Determines the amount of exploration. (1 - full exploration, 0 - no exploration)
        :return: Index of the selected action
        """
        # Draw from a uniform [0, 1) distribution so that epsilon is the actual
        # exploration probability (a standard-normal draw would explore about
        # half of the time even with epsilon == 0).
        if np.random.rand() < epsilon:
            # Explore (random action)
            return np.random.choice(self.action_dim)

        # Exploit (greedy action based on knowledge)
        return self.model.eval(state).argmax()

    def train(self, s, a, r, s_, t, w):
        """
        Updates the network weights using the given minibatch data

        :param s: Tensor[state_dim] Current state
        :param a: Tensor[int] Action taken at state s
        :param r: Tensor[float] Reward received for taking action a at state s
        :param s_: Tensor[state_dim] State resulting from taking action a at state s
        :param t: Tensor[boolean] True if s_ was a terminal state and false otherwise
        :param w: Tensor[float] Importance sampling weights
        :return: The TD errors computed for the minibatch
        """
        a = Value.one_hot(a.tolist(), self.action_dim)

        td_error = self.trainer.train_minibatch(
            {
                self.pre_states: s,
                self.actions: a,
                self.rewards: r,
                self.post_states: s_,
                self.terminals: t,
                self.is_weights: w
            },
            outputs=[self.td_error])

        return td_error[0]

    def update_target(self):
        """
        Update the target network using the online network weights
        """
        self.target_model = self.model.clone(CloneMethod.freeze)

    def checkpoint(self, filename):
        """Save a trainer checkpoint to ``filename``."""
        self.trainer.save_checkpoint(filename)

    def save_model(self, filename):
        """Save the online model to ``filename``."""
        self.model.save(filename)
class DeepQAgent(object):
    """
    Implementation of Deep Q Neural Network agent like in:
        Nature 518. "Human-level control through deep reinforcement learning" (Mnih & al. 2015)

    This variant uses a small fully connected network (the convolutional
    layers are disabled) and, in addition to epsilon-greedy exploration,
    a second annealed policy (``fixpolicy``) that makes the agent follow a
    hand-written heuristic action instead of the network or a random action.
    """

    def __init__(self, input_shape, nb_actions, gamma=0.99,
                 explorer=None, fixpolicy=None,
                 learning_rate=0.00025, momentum=0.95, minibatch_size=32,
                 memory_size=500000, train_after=10000, train_interval=4,
                 target_update_interval=10000, monitor=True):
        """Build the networks, the loss and the trainer.

        Attributes:
            input_shape: Shape of the network input (history of states)
            nb_actions (int): Number of discrete actions
            gamma (float): Discount factor used in the Q-target
            explorer: Object providing ``is_exploring(step)`` and being
                callable with the number of actions; defaults to
                ``LinearEpsilonAnnealingExplorer(0.91, 0.1, 910000)``
            fixpolicy: Object providing ``is_exploring(step)`` deciding when
                the heuristic action is used; defaults to
                ``LinearEpsilonAnnealingExplorer(0.5, 0.1, 100000)``
            learning_rate (float): Per-minibatch learning rate for Adam
            momentum (float): First-moment momentum for Adam
            minibatch_size (int): Number of transitions sampled per update
            memory_size (int): Capacity of the replay memory
            train_after (int): Number of actions taken before training starts
            train_interval (int): Train every this many actions
            target_update_interval (int): Refresh the target network (and
                save a checkpoint) every this many actions
            monitor (bool): If True, metrics are written through a
                TensorBoardProgressWriter
        """
        # The default policies are created per instance; building them in
        # the signature would share one object between all agents.
        if explorer is None:
            explorer = LinearEpsilonAnnealingExplorer(0.91, 0.1, 910000)
        if fixpolicy is None:
            fixpolicy = LinearEpsilonAnnealingExplorer(0.5, 0.1, 100000)

        self.input_shape = input_shape
        self.nb_actions = nb_actions
        self.gamma = gamma

        self._train_after = train_after
        self._train_interval = train_interval
        self._target_update_interval = target_update_interval

        self._explorer = explorer
        self._fixpolicy = fixpolicy
        self._minibatch_size = minibatch_size
        self._history = History(input_shape)
        print("input_shape:", input_shape)
        print("input_shape[1:]", input_shape[1:])
        self._memory = ReplayMemory(memory_size, input_shape[1:], 4)
        self._num_actions_taken = 0

        # Metrics accumulator
        self._episode_rewards = []
        self._episode_q_means = []
        self._episode_q_stddev = []

        # Action Value model (used by agent to interact with the environment).
        # The convolutional front-end of the reference architecture is not
        # used here; the network is two dense layers.
        with default_options(activation=relu, init=he_uniform()):
            self._action_value_net = Sequential([
                Dense(25, init=he_uniform(scale=0.01)),
                Dense(nb_actions, activation=None,
                      init=he_uniform(scale=0.01))
            ])
        self._action_value_net.update_signature(Tensor[input_shape])

        # Target model used to compute the target Q-values in training, updated
        # less frequently for increased stability.
        self._target_net = self._action_value_net.clone(CloneMethod.freeze)

        # Function computing Q-values targets as part of the computation graph
        # NOTE(review): the graph is built from the clone held by
        # self._target_net at this point; rebinding the attribute in train()
        # may not change the already-built criterion - confirm.
        @Function
        @Signature(post_states=Tensor[input_shape], rewards=Tensor[()],
                   terminals=Tensor[()])
        def compute_q_targets(post_states, rewards, terminals):
            return element_select(
                terminals,
                rewards,
                gamma * reduce_max(self._target_net(post_states), axis=0)
                + rewards,
            )

        # Define the loss, using Huber Loss (more robust to outliers)
        @Function
        @Signature(pre_states=Tensor[input_shape],
                   actions=Tensor[nb_actions],
                   post_states=Tensor[input_shape],
                   rewards=Tensor[()], terminals=Tensor[()])
        def criterion(pre_states, actions, post_states, rewards, terminals):
            # Compute the q_targets
            q_targets = compute_q_targets(post_states, rewards, terminals)

            # actions is a 1-hot encoding of the action done by the agent
            q_acted = reduce_sum(
                self._action_value_net(pre_states) * actions, axis=0)

            # Define training criterion as the Huber Loss function
            return huber_loss(q_targets, q_acted, 1.0)

        # Adam based SGD
        lr_schedule = learning_rate_schedule(learning_rate, UnitType.minibatch)
        m_schedule = momentum_schedule(momentum)
        vm_schedule = momentum_schedule(0.999)
        l_sgd = adam(self._action_value_net.parameters, lr_schedule,
                     momentum=m_schedule, variance_momentum=vm_schedule)

        self._metrics_writer = TensorBoardProgressWriter(
            freq=1, log_dir='metrics', model=criterion) if monitor else None
        self._learner = l_sgd
        self._trainer = Trainer(criterion, (criterion, None), l_sgd,
                                self._metrics_writer)
        # To resume from a previous run, restore a checkpoint here with
        # self._trainer.restore_from_checkpoint(<path>).

    def act(self, state):
        """ This allows the agent to select the next action to perform in regard of the current state of the environment.
        It follows the terminology used in the Nature paper.

        With the probability given by the fix policy, a heuristic action is
        returned instead: the axis with the largest absolute difference
        between state[3:6] and state[0:3] is selected, and the action is
        ``axis + 1`` for a non-negative difference or ``axis + 4`` for a
        negative one.

        Attributes:
            state (Tensor[input_shape]): The current environment state

        Returns: Int >= 0 : Next action to do
        """
        # Append the state to the short term memory (ie. History)
        self._history.append(state)

        if self._fixpolicy.is_exploring(self._num_actions_taken):
            # presumably state[0:3] and state[3:6] are two 3D positions
            # (current and goal) - verify against the environment.
            diff_x = state[3] - state[0]
            diff_y = state[4] - state[1]
            diff_z = state[5] - state[2]
            diff_arr = np.array([diff_x, diff_y, diff_z])
            direction = np.argmax(np.absolute(diff_arr))
            print(diff_arr)
            if diff_arr[direction] < 0:
                fixaction = direction + 4
            else:
                fixaction = direction + 1
            self._num_actions_taken += 1
            return fixaction

        # If policy requires agent to explore, sample random action
        if self._explorer.is_exploring(self._num_actions_taken):
            action = self._explorer(self.nb_actions)
        else:
            # Use the network to output the best action
            env_with_history = self._history.value
            q_values = self._action_value_net.eval(
                # Append batch axis with only one sample to evaluate
                env_with_history.reshape((1, ) + env_with_history.shape))

            self._episode_q_means.append(np.mean(q_values))
            self._episode_q_stddev.append(np.std(q_values))

            # Return the value maximizing the expected reward
            action = q_values.argmax()

        # Keep track of interval action counter
        self._num_actions_taken += 1
        return action

    def observe(self, old_state, action, reward, done):
        """ This allows the agent to observe the output of doing the action it selected through act() on the old_state

        Attributes:
            old_state (Tensor[input_shape]): Previous environment state
            action (int): Action done by the agent
            reward (float): Reward for doing this action in the old_state environment
            done (bool): Indicate if the action has terminated the environment
        """
        self._episode_rewards.append(reward)

        # If done, reset short term memory (ie. History)
        if done:
            # Plot the metrics through Tensorboard and reset buffers
            if self._metrics_writer is not None:
                self._plot_metrics()
            self._episode_rewards = []
            self._episode_q_means = []
            self._episode_q_stddev = []

            # Reset the short term memory
            self._history.reset()

        # Append to long term memory
        self._memory.append(old_state, action, reward, done)

    def train(self):
        """ This allows the agent to train itself to better understand the environment dynamics.
        The agent will compute the expected reward for the state(t+1)
        and update the expected reward at step t according to this.

        The target expectation is computed through the Target Network, which is a more stable version
        of the Action Value Network for increasing training stability.

        The Target Network is a frozen copy of the Action Value Network updated as regular intervals.
        Whenever the target network is refreshed, a trainer checkpoint is saved as well.
        """
        agent_step = self._num_actions_taken
        print("agent_step = ", agent_step)

        if agent_step < self._train_after:
            return
        if (agent_step % self._train_interval) != 0:
            return

        pre_states, actions, post_states, rewards, terminals = \
            self._memory.minibatch(self._minibatch_size)

        self._trainer.train_minibatch(
            self._trainer.loss_function.argument_map(
                pre_states=pre_states,
                actions=Value.one_hot(
                    actions.reshape(-1, 1).tolist(), self.nb_actions),
                post_states=post_states,
                rewards=rewards,
                terminals=terminals))

        # Update the Target Network if needed
        if (agent_step % self._target_update_interval) == 0:
            self._target_net = self._action_value_net.clone(
                CloneMethod.freeze)
            # Raw string: the backslash is a literal path separator, not
            # an escape sequence.
            filename = r"models_heuristic_no_image_less_exploration\model%d" % agent_step
            print(
                "$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$filename=",
                filename)
            self._trainer.save_checkpoint(filename)

    def _plot_metrics(self):
        """Plot current buffers accumulated values to visualize agent learning

        The same values are also printed and appended to a plain-text log
        file. Reads the module-level counters ``landing_count`` and
        ``episode_count`` and resets both after a successful landing has
        been logged.
        """
        global landing_count, episode_count

        step = self._num_actions_taken
        # The context manager guarantees the log file is flushed and closed
        # even if one of the writer calls raises.
        with open('log__heuristic_no_image_less_exploration2', 'a+') as f:
            f.write('episode:' + str(episode_count)
                    + ': exploration rate= ' + str(self._explorer._rate)
                    + ' heuristic fix rate= ' + str(self._fixpolicy._rate)
                    + '\n')

            if self._episode_q_means:
                # float() yields a plain Python scalar for the writer
                mean_q = float(np.mean(self._episode_q_means))
                self._metrics_writer.write_value('Mean Q per ep.', mean_q, step)
                print('Mean Q per ep.', mean_q, step)
                f.write('Mean Q per ep. ' + str(mean_q) + ' ' + str(step) + '\n')

            if self._episode_q_stddev:
                std_q = float(np.mean(self._episode_q_stddev))
                self._metrics_writer.write_value('Mean Std Q per ep.', std_q,
                                                 step)
                print('Mean Std Q per ep.', std_q, step)
                f.write('Mean Std Q per ep. ' + str(std_q) + ' ' + str(step)
                        + '\n')

            total_reward = sum(self._episode_rewards)
            self._metrics_writer.write_value('Sum rewards per ep.',
                                             total_reward, step)
            print('Sum rewards per ep.', total_reward, step)
            f.write('Sum rewards per ep. ' + str(total_reward) + ' '
                    + str(step) + '\n')

            # NOTE(review): both counters are assumed to be reset only after
            # a success was logged - confirm the intended nesting.
            if landing_count > 0:
                f.write('****************Success landing**********'
                        + str(landing_count) + '\n')
                landing_count = 0
                episode_count = 0
            f.write('\n')