Example #1
 def experience(self,
                obs,
                act,
                rew,
                new_obs,
                done,
                terminal,
                om_pred,
                h,
                c,
                opp_act,
                policy,
                opp_policy,
                initial,
                h_in_ep_prev=None,
                c_in_ep_prev=None,
                h_in_ep=None,
                c_in_ep=None):
     # Ensure that we have an appropriate state to sample
     if initial:
         h, c = U.get_session().run(
             [self.om_debug['initial_h'], self.om_debug['initial_c']])
     # Store transition in the replay buffer.
     self.replay_buffer.add(obs, act, rew, new_obs, float(terminal or done),
                            np.squeeze(om_pred), h, c, opp_act, policy,
                            opp_policy, h_in_ep_prev, c_in_ep_prev, h_in_ep,
                            c_in_ep)
Example #2
 def train_opponent_model(self,
                          save_dir,
                          with_replacement=True,
                          summary_writer=None):
     # Collect data using the method defined above.
     batch = self.collect_data_for_om_training(save_dir, with_replacement)
     alternative_preprocessing = isinstance(batch, list)
     losses = []
     # Get initial state
     h, c = U.get_session().run(
         [self.om_debug['initial_h'], self.om_debug['initial_c']])
     # Work out how many chunks it will take to work through one trajectory.
     # If the length of the trajectory is not divisible by the chunk length,
     # the remainder steps from the floor division are discarded.
     iterations = batch['observations'].shape[1] // self._chunk_length
     # Always start by using the initial state.
     use_initial_state = True
     # Work through the trajectory iterations in chunks.
     for i in range(iterations):
         # Pick out the inputs for the current chunk.
         start = i * self._chunk_length
         end = (i + 1) * self._chunk_length
         lstm_inputs = batch['lstm_inputs'][:, start:end]
         observations = batch['observations'][:, start:end]
         targets = batch['targets'][:, start:end]
         # If necessary, cast the target actions to one-hot.
         if self.args.train_lemol_om_on_oh:
             targets = np.eye(self.om_prediction_dim)[np.argmax(targets,
                                                                axis=-1)]
         # If we are using the triplet representation loss we decay the weight
         # placed on this loss in order to weight more heavily the period where
         # learning is more influential. Note that this weight is unused when
         # the alternative loss is not in use.
         base = self.args.representation_loss_weight_decay_base
         representation_loss_weight = (
             base / (base + i) * self.args.representation_loss_weight)
         # Run the training operation.
         loss, train_summary, h, c = self.om_train(
             lstm_inputs, observations, targets, h, c, use_initial_state,
             representation_loss_weight)
         # Increment the training step counter for logging purposes.
         # This count persists across opponent model training runs.
         self.om_learning_iter += 1
         # After the first chunk, which starts the trajectory, we no longer
         # want to use the initial state.
         use_initial_state = False
         # If a summary writer is provided then log the training outcomes to TensorBoard.
         if summary_writer is not None:
             summary_writer.add_summary(train_summary,
                                        self.om_learning_iter)
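
A standalone sketch of the chunking arithmetic and representation-loss weight decay used above; the trajectory length, chunk length and weight values here are arbitrary placeholders, not taken from the source.

import numpy as np

# Arbitrary stand-ins for the quantities used in train_opponent_model.
T, chunk_length = 1000, 64
base, rep_loss_weight = 10.0, 0.5

# 1000 // 64 = 15 chunks; the trailing 1000 - 15 * 64 = 40 steps are discarded.
iterations = T // chunk_length

# The weight decays as base / (base + i): the first chunk keeps the full
# weight of 0.5 and later chunks contribute progressively less.
weights = [base / (base + i) * rep_loss_weight for i in range(iterations)]

# Chunk slicing on a (batch, time, features) array, as in the loop above.
observations = np.zeros((4, T, 3))
chunk_0 = observations[:, 0 * chunk_length:(0 + 1) * chunk_length]
assert chunk_0.shape == (4, chunk_length, 3)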
Example #3
 def om_step(self, om_inputs, obs, h, c, use_initial_lf_state):
     if use_initial_lf_state:
         # Get the initial state.
         # Not strictly needed as use_initial_lf_state should handle this.
         h, c = U.get_session().run(
             [self.om_debug['initial_h'], self.om_debug['initial_c']])
     om_outputs = self.get_om_outputs(om_inputs, obs, h, c,
                                      use_initial_lf_state)
     # Update the current LeMOL state, which denotes the representation
     # of the opponent.
     self.current_h, self.current_c = om_outputs
     return om_outputs
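
A minimal runnable sketch of the state-threading pattern that om_step implements: the recurrent opponent-model state is carried from call to call and replaced by a stored initial state at the start of a trajectory. dummy_om_step and the state sizes are made up for illustration; only the calling pattern mirrors the example above.

import numpy as np

# Stand-ins for the learnt initial LSTM state fetched via om_debug above.
INITIAL_H = np.zeros((1, 8))
INITIAL_C = np.zeros((1, 8))

def dummy_om_step(om_inputs, obs, h, c, use_initial_lf_state):
    # Fall back to the initial state at the start of a trajectory,
    # mirroring the use_initial_lf_state branch in Example #3.
    if use_initial_lf_state:
        h, c = INITIAL_H, INITIAL_C
    # A real implementation would run the opponent-model LSTM here;
    # this placeholder just mixes the inputs into the carried state.
    return h + 0.1 * om_inputs, c + 0.1 * obs

h, c = INITIAL_H, INITIAL_C
for t in range(5):
    h, c = dummy_om_step(np.ones((1, 8)), np.ones((1, 8)), h, c,
                         use_initial_lf_state=(t == 0))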
Example #4
 def reset_p_and_q_networks(self):
     U.get_session().run(tf.variables_initializer(self.trainable_vars))
Example #5
 def get_initial_in_ep_state(self):
     assert self.recurrent_om, 'Model Must have a Recurrent Opponent Model to get initial state'
     self._initial_in_ep_state = U.get_session().run([
         self.om_debug['initial_h_in_ep'], self.om_debug['initial_c_in_ep']
     ])
     return self._initial_in_ep_state
Example #6
 def reset_lemol_state(self):
     self.current_h, self.current_c = U.get_session().run(
         [self.om_debug['initial_h'], self.om_debug['initial_c']])
Example #7
 def reset_p_and_q_networks(self):
     self.initial_exploration_done = False
     U.get_session().run(tf.variables_initializer(self.q_vars +
                                                  self.p_vars))
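
Examples #4 and #7 both rely on tf.variables_initializer to re-initialise only a chosen subset of variables while leaving any other variables untouched. A minimal TF1-style sketch of that behaviour, with made-up variable names:

import tensorflow as tf  # assumes TensorFlow 1.x (or tf.compat.v1) graph mode

p_var = tf.Variable(3.0, name='p_var')      # imagine this sits in p_vars
other = tf.Variable(5.0, name='other_var')  # some unrelated variable

sess = tf.Session()
sess.run(tf.global_variables_initializer())
sess.run(p_var.assign(42.0))                 # 'training' moves p_var away
sess.run(tf.variables_initializer([p_var]))  # reset only p_var
print(sess.run([p_var, other]))              # -> [3.0, 5.0]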