def learn_on_batch(self, samples):
    if log_once("learn_on_batch"):
        logger.info(
            "Training on concatenated sample batches:\n\n{}\n".format(
                summarize(samples)))
    if isinstance(samples, MultiAgentBatch):
        info_out = {}
        to_fetch = {}
        if self.tf_sess is not None:
            builder = TFRunBuilder(self.tf_sess, "learn_on_batch")
        else:
            builder = None
        for pid, batch in samples.policy_batches.items():
            if pid not in self.policies_to_train:
                continue
            policy = self.policy_map[pid]
            if builder and hasattr(policy, "_build_learn_on_batch"):
                to_fetch[pid] = policy._build_learn_on_batch(
                    builder, batch)
            else:
                info_out[pid] = policy.learn_on_batch(batch)
        info_out.update({k: builder.get(v) for k, v in to_fetch.items()})
    else:
        info_out = self.policy_map[DEFAULT_POLICY_ID].learn_on_batch(
            samples)
    if log_once("learn_out"):
        logger.info("Training output:\n\n{}\n".format(summarize(info_out)))
    return info_out
def compute_gradients(self, samples):
    if log_once("compute_gradients"):
        logger.info("Compute gradients on:\n\n{}\n".format(
            summarize(samples)))
    if isinstance(samples, MultiAgentBatch):
        grad_out, info_out = {}, {}
        if self.tf_sess is not None:
            builder = TFRunBuilder(self.tf_sess, "compute_gradients")
            for pid, batch in samples.policy_batches.items():
                if pid not in self.policies_to_train:
                    continue
                grad_out[pid], info_out[pid] = (
                    self.policy_map[pid]._build_compute_gradients(
                        builder, batch))
            grad_out = {k: builder.get(v) for k, v in grad_out.items()}
            info_out = {k: builder.get(v) for k, v in info_out.items()}
        else:
            for pid, batch in samples.policy_batches.items():
                if pid not in self.policies_to_train:
                    continue
                grad_out[pid], info_out[pid] = (
                    self.policy_map[pid].compute_gradients(batch))
    else:
        grad_out, info_out = (
            self.policy_map[DEFAULT_POLICY_ID].compute_gradients(samples))
    info_out["batch_count"] = samples.count
    if log_once("grad_out"):
        logger.info("Compute grad info:\n\n{}\n".format(
            summarize(info_out)))
    return grad_out, info_out
def learn_on_batch(self, samples):
    if log_once("learn_on_batch"):
        logger.info(
            "Training on concatenated sample batches:\n\n{}\n".format(
                summarize(samples)))
    if isinstance(samples, MultiAgentBatch):
        info_out = {}
        if self.tf_sess is not None:
            builder = TFRunBuilder(self.tf_sess, "learn_on_batch")
            for pid, batch in samples.policy_batches.items():
                if pid not in self.policies_to_train:
                    continue
                info_out[pid], _ = (
                    self.policy_map[pid]._build_learn_on_batch(
                        builder, batch))
            info_out = {k: builder.get(v) for k, v in info_out.items()}
        else:
            for pid, batch in samples.policy_batches.items():
                if pid not in self.policies_to_train:
                    continue
                info_out[pid], _ = (
                    self.policy_map[pid].learn_on_batch(batch))
    else:
        info_out, _ = (
            self.policy_map[DEFAULT_POLICY_ID].learn_on_batch(samples))
    if log_once("learn_out"):
        logger.info("Training output:\n\n{}\n".format(summarize(info_out)))
    return info_out
def _do_policy_eval(tf_sess, to_eval, policies, active_episodes):
    """Call compute actions on observation batches to get next actions.

    Returns:
        eval_results: dict of policy to compute_action() outputs.
    """

    eval_results = {}

    if tf_sess:
        builder = TFRunBuilder(tf_sess, "policy_eval")
        pending_fetches = {}
    else:
        builder = None

    if log_once("compute_actions_input"):
        logger.info("Inputs to compute_actions():\n\n{}\n".format(
            summarize(to_eval)))

    for policy_id, eval_data in to_eval.items():
        rnn_in = [t.rnn_state for t in eval_data]
        policy = _get_or_raise(policies, policy_id)
        if builder and (policy.compute_actions.__code__ is
                        TFPolicy.compute_actions.__code__):
            rnn_in_cols = _to_column_format(rnn_in)
            # TODO(ekl): how can we make info batch available to TF code?
            # TODO(sven): Return dict from _build_compute_actions.
            # It's becoming more and more unclear otherwise, what's where in
            # the return tuple.
            pending_fetches[policy_id] = policy._build_compute_actions(
                builder,
                obs_batch=[t.obs for t in eval_data],
                state_batches=rnn_in_cols,
                prev_action_batch=[t.prev_action for t in eval_data],
                prev_reward_batch=[t.prev_reward for t in eval_data],
                timestep=policy.global_timestep)
        else:
            # TODO(sven): Does this work for LSTM torch?
            rnn_in_cols = [
                np.stack([row[i] for row in rnn_in])
                for i in range(len(rnn_in[0]))
            ]
            eval_results[policy_id] = policy.compute_actions(
                [t.obs for t in eval_data],
                state_batches=rnn_in_cols,
                prev_action_batch=[t.prev_action for t in eval_data],
                prev_reward_batch=[t.prev_reward for t in eval_data],
                info_batch=[t.info for t in eval_data],
                episodes=[active_episodes[t.env_id] for t in eval_data],
                timestep=policy.global_timestep)
    if builder:
        for k, v in pending_fetches.items():
            eval_results[k] = builder.get(v)

    if log_once("compute_actions_result"):
        logger.info("Outputs of compute_actions():\n\n{}\n".format(
            summarize(eval_results)))

    return eval_results
def _do_policy_eval(
        *,
        to_eval: Dict[PolicyID, List[PolicyEvalData]],
        policies: Dict[PolicyID, Policy],
        sample_collector,
        active_episodes: Dict[str, MultiAgentEpisode],
        tf_sess: Optional["tf.Session"] = None,
) -> Dict[PolicyID, Tuple[TensorStructType, StateBatch, dict]]:
    """Call compute_actions on collected episode/model data to get next action.

    Args:
        to_eval (Dict[PolicyID, List[PolicyEvalData]]): Mapping of policy
            IDs to lists of PolicyEvalData objects (items in these lists
            will be the batch's items for the model forward pass).
        policies (Dict[PolicyID, Policy]): Mapping from policy ID to Policy
            obj.
        sample_collector (SampleCollector): The SampleCollector object to use.
        active_episodes (Dict[str, MultiAgentEpisode]): Mapping from episode
            ID to currently ongoing MultiAgentEpisode object.
        tf_sess (Optional[tf.Session]): Optional tensorflow session to use
            for batching TF policy evaluations.

    Returns:
        eval_results: dict of policy to compute_action() outputs.
    """

    eval_results: Dict[PolicyID, TensorStructType] = {}

    if tf_sess:
        builder = TFRunBuilder(tf_sess, "policy_eval")
        pending_fetches: Dict[PolicyID, Any] = {}
    else:
        builder = None

    if log_once("compute_actions_input"):
        logger.info("Inputs to compute_actions():\n\n{}\n".format(
            summarize(to_eval)))

    for policy_id, eval_data in to_eval.items():
        policy: Policy = _get_or_raise(policies, policy_id)
        input_dict = sample_collector.get_inference_input_dict(policy_id)
        eval_results[policy_id] = \
            policy.compute_actions_from_input_dict(
                input_dict,
                timestep=policy.global_timestep,
                episodes=[active_episodes[t.env_id] for t in eval_data])

    if builder:
        # type: PolicyID, Tuple[TensorStructType, StateBatch, dict]
        for pid, v in pending_fetches.items():
            eval_results[pid] = builder.get(v)

    if log_once("compute_actions_result"):
        logger.info("Outputs of compute_actions():\n\n{}\n".format(
            summarize(eval_results)))

    return eval_results
def _do_policy_eval(tf_sess, to_eval, policies, active_episodes):
    """Call compute actions on observation batches to get next actions.

    Returns:
        eval_results: dict of policy to compute_action() outputs.
    """

    eval_results = {}

    if tf_sess:
        builder = TFRunBuilder(tf_sess, "policy_eval")
        pending_fetches = {}
    else:
        builder = None

    if log_once("compute_actions_input"):
        logger.info("Inputs to compute_actions():\n\n{}\n".format(
            summarize(to_eval)))

    for policy_id, eval_data in to_eval.items():
        rnn_in_cols = _to_column_format([t.rnn_state for t in eval_data])
        policy = _get_or_raise(policies, policy_id)
        if builder and (policy.compute_actions.__code__ is
                        TFPolicy.compute_actions.__code__):
            # TODO(ekl): how can we make info batch available to TF code?
            pending_fetches[policy_id] = policy._build_compute_actions(
                builder,
                [t.obs for t in eval_data],
                [t.neighbor_obs for t in eval_data],
                rnn_in_cols,
                prev_action_batch=[t.prev_action for t in eval_data],
                prev_reward_batch=[t.prev_reward for t in eval_data])
        else:
            eval_results[policy_id] = policy.compute_actions(
                [t.obs for t in eval_data],
                [t.neighbor_obs for t in eval_data],
                rnn_in_cols,
                prev_action_batch=[t.prev_action for t in eval_data],
                prev_reward_batch=[t.prev_reward for t in eval_data],
                info_batch=[t.info for t in eval_data],
                episodes=[active_episodes[t.env_id] for t in eval_data])
    if builder:
        for k, v in pending_fetches.items():
            eval_results[k] = builder.get(v)

    if log_once("compute_actions_result"):
        logger.info("Outputs of compute_actions():\n\n{}\n".format(
            summarize(eval_results)))

    return eval_results
def _do_policy_eval(
        *,
        to_eval: Dict[PolicyID, List[PolicyEvalData]],
        policies: Dict[PolicyID, Policy],
        policy_mapping_fn: Callable[[AgentID, "MultiAgentEpisode"], PolicyID],
        sample_collector,
        active_episodes: Dict[str, MultiAgentEpisode],
) -> Dict[PolicyID, Tuple[TensorStructType, StateBatch, dict]]:
    """Call compute_actions on collected episode/model data to get next action.

    Args:
        to_eval (Dict[PolicyID, List[PolicyEvalData]]): Mapping of policy
            IDs to lists of PolicyEvalData objects (items in these lists
            will be the batch's items for the model forward pass).
        policies (Dict[PolicyID, Policy]): Mapping from policy ID to Policy
            obj.
        policy_mapping_fn (Callable): Maps (agent ID, episode) to a policy
            ID; used to re-assign agents whose policy has been removed from
            this worker.
        sample_collector (SampleCollector): The SampleCollector object to use.
        active_episodes (Dict[str, MultiAgentEpisode]): Mapping from episode
            ID to currently ongoing MultiAgentEpisode object.

    Returns:
        eval_results: dict of policy to compute_action() outputs.
    """

    eval_results: Dict[PolicyID, TensorStructType] = {}

    if log_once("compute_actions_input"):
        logger.info("Inputs to compute_actions():\n\n{}\n".format(
            summarize(to_eval)))

    for policy_id, eval_data in to_eval.items():
        # In case the policyID has been removed from this worker, we need to
        # re-assign policy_id and re-lookup the Policy object to use.
        try:
            policy: Policy = _get_or_raise(policies, policy_id)
        except ValueError:
            policy_id = policy_mapping_fn(
                eval_data[0].agent_id, active_episodes[eval_data[0].env_id])
            policy: Policy = _get_or_raise(policies, policy_id)

        input_dict = sample_collector.get_inference_input_dict(policy_id)
        eval_results[policy_id] = \
            policy.compute_actions_from_input_dict(
                input_dict,
                timestep=policy.global_timestep,
                episodes=[active_episodes[t.env_id] for t in eval_data])

    if log_once("compute_actions_result"):
        logger.info("Outputs of compute_actions():\n\n{}\n".format(
            summarize(eval_results)))

    return eval_results
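# Illustrative sketch (not from the snippets above): the fallback branch in
# _do_policy_eval() re-runs policy_mapping_fn(agent_id, episode) when a policy
# id no longer exists on this worker, so the mapping function only needs to
# return a policy id that is still present locally. The agent-id parsing and
# the policy names below ("policy_0"/"policy_1") are assumptions for
# illustration only.
def example_policy_mapping_fn(agent_id, episode):
    # Assumes agent ids like "agent_0", "agent_1", ... and two local policies.
    return "policy_{}".format(int(str(agent_id).split("_")[-1]) % 2)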
def _initialize_loss(self, loss: TensorType,
                     loss_inputs: List[Tuple[str, TensorType]]) -> None:
    """Initializes the loss op from given loss tensor and placeholders.

    Args:
        loss (TensorType): The loss op generated by some loss function.
        loss_inputs (List[Tuple[str, TensorType]]): The list of Tuples:
            (name, tf1.placeholders) needed for calculating the loss.
    """
    self._loss_input_dict = dict(loss_inputs)
    self._loss_input_dict_no_rnn = {
        k: v
        for k, v in self._loss_input_dict.items()
        if (v not in self._state_inputs and v != self._seq_lens)
    }
    for i, ph in enumerate(self._state_inputs):
        self._loss_input_dict["state_in_{}".format(i)] = ph

    if self.model and not isinstance(self.model, tf.keras.Model):
        self._loss = self.model.custom_loss(loss, self._loss_input_dict)
        self._stats_fetches.update({"model": self.model.metrics()})
    else:
        self._loss = loss

    if self._optimizer is None:
        self._optimizer = self.optimizer()
    self._grads_and_vars = [
        (g, v) for (g, v) in self.gradients(self._optimizer, self._loss)
        if g is not None
    ]
    self._grads = [g for (g, v) in self._grads_and_vars]

    if self.model:
        self._variables = ray.experimental.tf_utils.TensorFlowVariables(
            [], self.get_session(), self.variables())

    # Gather update ops for any batch norm layers.
    if len(self.devices) <= 1:
        if not self._update_ops:
            self._update_ops = tf1.get_collection(
                tf1.GraphKeys.UPDATE_OPS,
                scope=tf1.get_variable_scope().name)
        if self._update_ops:
            logger.info("Update ops to run on apply gradient: {}".format(
                self._update_ops))
        with tf1.control_dependencies(self._update_ops):
            self._apply_op = self.build_apply_op(self._optimizer,
                                                 self._grads_and_vars)

    if log_once("loss_used"):
        logger.debug(
            "These tensors were used in the loss_fn:\n\n{}\n".format(
                summarize(self._loss_input_dict)))

    self.get_session().run(tf1.global_variables_initializer())

    self._optimizer_variables = None
    if self._optimizer:
        self._optimizer_variables = \
            ray.experimental.tf_utils.TensorFlowVariables(
                self._optimizer.variables(), self.get_session())
def postprocess_batch_so_far(self, episode):
    """Apply policy postprocessors to any unprocessed rows.

    This pushes the postprocessed per-agent batches onto the per-policy
    builders, clearing per-agent state.

    Arguments:
        episode: current MultiAgentEpisode object or None
    """

    # Materialize the batches so far.
    pre_batches = {}
    for agent_id, builder in self.agent_builders.items():
        pre_batches[agent_id] = (
            self.policy_map[self.agent_to_policy[agent_id]],
            builder.build_and_reset())

    # Apply postprocessor.
    post_batches = {}
    if self.clip_rewards:
        for _, (_, pre_batch) in pre_batches.items():
            pre_batch["rewards"] = np.sign(pre_batch["rewards"])
    for agent_id, (_, pre_batch) in pre_batches.items():
        other_batches = pre_batches.copy()
        del other_batches[agent_id]
        policy = self.policy_map[self.agent_to_policy[agent_id]]
        if any(pre_batch["dones"][:-1]) or len(set(
                pre_batch["eps_id"])) > 1:
            raise ValueError(
                "Batches sent to postprocessing must only contain steps "
                "from a single trajectory.", pre_batch)
        post_batches[agent_id] = policy.postprocess_trajectory(
            pre_batch, other_batches, episode)
        # Call the Policy's Exploration's postprocess method.
        if getattr(policy, "exploration", None) is not None:
            policy.exploration.postprocess_trajectory(
                policy, post_batches[agent_id],
                getattr(policy, "_sess", None))

    if log_once("after_post"):
        logger.info(
            "Trajectory fragment after postprocess_trajectory():\n\n{}\n".
            format(summarize(post_batches)))

    # Append into policy batches and reset.
    from ray.rllib.evaluation.rollout_worker import get_global_worker
    for agent_id, post_batch in sorted(post_batches.items()):
        self.callbacks.on_postprocess_trajectory(
            worker=get_global_worker(),
            episode=episode,
            agent_id=agent_id,
            policy_id=self.agent_to_policy[agent_id],
            policies=self.policy_map,
            postprocessed_batch=post_batch,
            original_batches=pre_batches)
        self.policy_builders[self.agent_to_policy[agent_id]].add_batch(
            post_batch)

    self.agent_builders.clear()
    self.agent_to_policy.clear()
def apply_gradients(self, grads: ModelGradients) -> Dict[PolicyID, Any]:
    """Applies the given gradients to this worker's weights.

    Examples:
        >>> samples = worker.sample()
        >>> grads, info = worker.compute_gradients(samples)
        >>> worker.apply_gradients(grads)
    """
    if log_once("apply_gradients"):
        logger.info("Apply gradients:\n\n{}\n".format(summarize(grads)))
    if isinstance(grads, dict):
        if self.tf_sess is not None:
            builder = TFRunBuilder(self.tf_sess, "apply_gradients")
            outputs = {
                pid: self.policy_map[pid]._build_apply_gradients(
                    builder, grad)
                for pid, grad in grads.items()
            }
            return {k: builder.get(v) for k, v in outputs.items()}
        else:
            return {
                pid: self.policy_map[pid].apply_gradients(g)
                for pid, g in grads.items()
            }
    else:
        return self.policy_map[DEFAULT_POLICY_ID].apply_gradients(grads)
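# Hedged usage sketch: one synchronous gradient step built only from the
# worker methods shown in the docstring example above
# (sample -> compute_gradients -> apply_gradients). In the multi-agent case
# `grads` is a dict keyed by policy id, which is exactly the shape
# apply_gradients() expects back. `worker` is assumed to be a rollout-worker
# object exposing these three methods.
def one_gradient_step(worker):
    samples = worker.sample()
    grads, info = worker.compute_gradients(samples)
    worker.apply_gradients(grads)
    return info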
def learn_on_batch(self, samples):
    if log_once("learn_on_batch"):
        logger.info(
            "Training on concatenated sample batches:\n\n{}\n".format(
                summarize(samples)))
    if isinstance(samples, MultiAgentBatch):
        info_out = {}
        to_fetch = {}
        if self.tf_sess is not None:
            builder = TFRunBuilder(self.tf_sess, "learn_on_batch")
        else:
            builder = None
        for pid, batch in samples.policy_batches.items():
            if pid not in self.policies_to_train:
                continue
            # Attention mechanism: build a dict with the observation batches
            # of all neighboring agents (the current agent included).
            neighbor_batch_dic = {}
            neighbor_pid_list = [
                pid_ for pid_ in self.traffic_light_node_dict[
                    self.inter_num_2_id(int(pid.split('_')[1]))]
                ['adjacency_row'] if pid_ is not None
            ]
            for neighbor_pid in neighbor_pid_list:
                neighbor_batch_dic['policy_{}'.format(
                    neighbor_pid)] = samples.policy_batches[
                        'policy_{}'.format(neighbor_pid)]
            policy = self.policy_map[pid]
            if builder and hasattr(policy, "_build_learn_on_batch"):
                to_fetch[pid] = policy._build_learn_on_batch(
                    builder, batch, neighbor_batch_dic)
            else:
                info_out[pid] = policy.learn_on_batch(
                    batch, neighbor_batch_dic)
        info_out.update({k: builder.get(v) for k, v in to_fetch.items()})
    else:
        info_out = self.policy_map[DEFAULT_POLICY_ID].learn_on_batch(
            samples)
    if log_once("learn_out"):
        logger.debug("Training out:\n\n{}\n".format(summarize(info_out)))
    return info_out
def sample(self):
    """Evaluate the current policies and return a batch of experiences.

    Returns:
        SampleBatch|MultiAgentBatch from evaluating the current policies.
    """

    if self._fake_sampler and self.last_batch is not None:
        return self.last_batch

    if log_once("sample_start"):
        logger.info("Generating sample batch of size {}".format(
            self.sample_batch_size))

    batches = [self.input_reader.next()]
    steps_so_far = batches[0].count

    # In truncate_episodes mode, never pull more than 1 batch per env.
    # This avoids over-running the target batch size.
    if self.batch_mode == "truncate_episodes":
        max_batches = self.num_envs
    else:
        max_batches = float("inf")

    while steps_so_far < self.sample_batch_size and len(
            batches) < max_batches:
        batch = self.input_reader.next()
        steps_so_far += batch.count
        batches.append(batch)
    batch = batches[0].concat_samples(batches)

    if self.callbacks.get("on_sample_end"):
        self.callbacks["on_sample_end"]({
            "evaluator": self,
            "samples": batch
        })

    # Always do writes prior to compression for consistency and to allow
    # for better compression inside the writer.
    self.output_writer.write(batch)

    # Do off-policy estimation if needed.
    if self.reward_estimators:
        for sub_batch in batch.split_by_episode():
            for estimator in self.reward_estimators:
                estimator.process(sub_batch)

    if log_once("sample_end"):
        logger.info("Completed sample batch:\n\n{}\n".format(
            summarize(batch)))

    if self.compress_observations == "bulk":
        batch.compress(bulk=True)
    elif self.compress_observations:
        batch.compress()

    if self._fake_sampler:
        self.last_batch = batch
    return batch
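# Hedged usage sketch: sample() above concatenates per-env fragments via
# concat_samples(); the same pattern works one level up to accumulate several
# worker batches into a single training batch. `worker` and
# `train_batch_size` are assumed names, not part of the code above.
def collect_train_batch(worker, train_batch_size):
    batches = [worker.sample()]
    while sum(b.count for b in batches) < train_batch_size:
        batches.append(worker.sample())
    return batches[0].concat_samples(batches)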
def compute_gradients(
        self, samples: SampleBatchType) -> Tuple[ModelGradients, dict]:
    """Returns a gradient computed w.r.t the specified samples.

    Returns:
        (grads, info): A list of gradients that can be applied on a
        compatible worker. In the multi-agent case, returns a dict of
        gradients keyed by policy ids. An info dictionary of extra metadata
        is also returned.

    Examples:
        >>> batch = worker.sample()
        >>> grads, info = worker.compute_gradients(batch)
    """
    if log_once("compute_gradients"):
        logger.info("Compute gradients on:\n\n{}\n".format(
            summarize(samples)))
    if isinstance(samples, MultiAgentBatch):
        grad_out, info_out = {}, {}
        if self.tf_sess is not None:
            builder = TFRunBuilder(self.tf_sess, "compute_gradients")
            for pid, batch in samples.policy_batches.items():
                if pid not in self.policies_to_train:
                    continue
                grad_out[pid], info_out[pid] = (
                    self.policy_map[pid]._build_compute_gradients(
                        builder, batch))
            grad_out = {k: builder.get(v) for k, v in grad_out.items()}
            info_out = {k: builder.get(v) for k, v in info_out.items()}
        else:
            for pid, batch in samples.policy_batches.items():
                if pid not in self.policies_to_train:
                    continue
                grad_out[pid], info_out[pid] = (
                    self.policy_map[pid].compute_gradients(batch))
    else:
        grad_out, info_out = (
            self.policy_map[DEFAULT_POLICY_ID].compute_gradients(samples))
    info_out["batch_count"] = samples.count
    if log_once("grad_out"):
        logger.info("Compute grad info:\n\n{}\n".format(
            summarize(info_out)))
    return grad_out, info_out
def learn_on_batch(self, samples: SampleBatchType) -> dict:
    """Update policies based on the given batch.

    This is the equivalent to apply_gradients(compute_gradients(samples)),
    but can be optimized to avoid pulling gradients into CPU memory.

    Returns:
        info: dictionary of extra metadata from compute_gradients().

    Examples:
        >>> batch = worker.sample()
        >>> worker.learn_on_batch(batch)
    """
    if log_once("learn_on_batch"):
        logger.info(
            "Training on concatenated sample batches:\n\n{}\n".format(
                summarize(samples)))
    if isinstance(samples, MultiAgentBatch):
        info_out = {}
        to_fetch = {}
        if self.tf_sess is not None:
            builder = TFRunBuilder(self.tf_sess, "learn_on_batch")
        else:
            builder = None
        for pid, batch in samples.policy_batches.items():
            if pid not in self.policies_to_train:
                continue
            policy = self.policy_map[pid]
            if builder and hasattr(policy, "_build_learn_on_batch"):
                to_fetch[pid] = policy._build_learn_on_batch(
                    builder, batch)
            else:
                info_out[pid] = policy.learn_on_batch(batch)
        info_out.update({k: builder.get(v) for k, v in to_fetch.items()})
    else:
        info_out = {
            DEFAULT_POLICY_ID: self.policy_map[DEFAULT_POLICY_ID]
            .learn_on_batch(samples)
        }
    if log_once("learn_out"):
        logger.debug("Training out:\n\n{}\n".format(summarize(info_out)))
    return info_out
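# Hedged usage sketch: this learn_on_batch() variant returns a dict keyed by
# policy id (DEFAULT_POLICY_ID in the single-agent branch above), so training
# info can be inspected uniformly per policy. Logging only; no assumptions are
# made about the contents of the per-policy info dicts.
def log_learn_results(worker, samples):
    info = worker.learn_on_batch(samples)
    for pid, policy_info in info.items():
        logger.info("learn_on_batch[{}]: {}".format(pid, policy_info))
    return info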
def postprocess_batch_so_far(self, episode):
    """Apply policy postprocessors to any unprocessed rows.

    This pushes the postprocessed per-agent batches onto the per-policy
    builders, clearing per-agent state.

    Arguments:
        episode: current MultiAgentEpisode object or None
    """

    # Materialize the batches so far.
    pre_batches = {}
    for agent_id, builder in self.agent_builders.items():
        pre_batches[agent_id] = (
            self.policy_map[self.agent_to_policy[agent_id]],
            builder.build_and_reset())

    # Apply postprocessor.
    post_batches = {}
    if self.clip_rewards:
        for _, (_, pre_batch) in pre_batches.items():
            pre_batch["rewards"] = np.sign(pre_batch["rewards"])
    for agent_id, (_, pre_batch) in pre_batches.items():
        other_batches = pre_batches.copy()
        del other_batches[agent_id]
        policy = self.policy_map[self.agent_to_policy[agent_id]]
        if any(pre_batch["dones"][:-1]) or len(set(
                pre_batch["eps_id"])) > 1:
            raise ValueError(
                "Batches sent to postprocessing must only contain steps "
                "from a single trajectory.", pre_batch)
        post_batches[agent_id] = policy.postprocess_trajectory(
            pre_batch, other_batches, episode)

    if log_once("after_post"):
        logger.info(
            "Trajectory fragment after postprocess_trajectory():\n\n{}\n".
            format(summarize(post_batches)))

    # Append into policy batches and reset.
    for agent_id, post_batch in sorted(post_batches.items()):
        self.policy_builders[self.agent_to_policy[agent_id]].add_batch(
            post_batch)
        if self.postp_callback:
            self.postp_callback({
                "episode": episode,
                "agent_id": agent_id,
                "pre_batch": pre_batches[agent_id],
                "post_batch": post_batch,
                "all_pre_batches": pre_batches,
            })

    self.agent_builders.clear()
    self.agent_to_policy.clear()
def learn_on_batch(self, samples):
    if log_once("learn_on_batch"):
        logger.info(
            "Training on concatenated sample batches:\n\n{}\n".format(
                summarize(samples)))
    if isinstance(samples, MultiAgentBatch):
        info_out = {}
        to_fetch = {}
        if self.tf_sess is not None:
            builder = TFRunBuilder(self.tf_sess, "learn_on_batch")
        else:
            builder = None
        for pid, batch in samples.policy_batches.items():
            if pid not in self.policies_to_train:
                continue
            policy = self.policy_map[pid]
            if builder and hasattr(policy, "_build_learn_on_batch"):
                to_fetch[pid] = policy._build_learn_on_batch(
                    builder, batch)
            else:
                info_out[pid] = policy.learn_on_batch(batch)
        info_out.update({k: builder.get(v) for k, v in to_fetch.items()})
    else:
        learn_on_batch_outputs = self.policy_map[
            DEFAULT_POLICY_ID].learn_on_batch(samples)
        if isinstance(learn_on_batch_outputs, tuple):
            info_out, beholder_arrays = learn_on_batch_outputs
        else:
            info_out, beholder_arrays = learn_on_batch_outputs, {}
        if self.policy_config["evaluation_config"]["beholder"]:
            with self.tf_sess.graph.as_default():
                if self._beholder is None:
                    self._beholder = Beholder(self.io_context.log_dir)
                self._beholder.update(
                    self.tf_sess, arrays=beholder_arrays or None)
    if log_once("learn_out"):
        logger.info("Training output:\n\n{}\n".format(summarize(info_out)))
    return info_out
def _initialize_loss(self, loss, loss_inputs):
    self._loss_inputs = loss_inputs
    self._loss_input_dict = dict(self._loss_inputs)
    for i, ph in enumerate(self._state_inputs):
        self._loss_input_dict["state_in_{}".format(i)] = ph

    if self.model:
        self._loss = self.model.custom_loss(loss, self._loss_input_dict)
        self._stats_fetches.update({
            "model": self.model.metrics()
            if isinstance(self.model, ModelV2) else self.model.custom_stats()
        })
    else:
        self._loss = loss

    self._optimizer = self.optimizer()
    self._grads_and_vars = [
        (g, v) for (g, v) in self.gradients(self._optimizer, self._loss)
        if g is not None
    ]
    self._grads = [g for (g, v) in self._grads_and_vars]

    # TODO(sven/ekl): Deprecate support for v1 models.
    if hasattr(self, "model") and isinstance(self.model, ModelV2):
        self._variables = ray.experimental.tf_utils.TensorFlowVariables(
            [], self._sess, self.variables())
    else:
        self._variables = ray.experimental.tf_utils.TensorFlowVariables(
            self._loss, self._sess)

    # Gather update ops for any batch norm layers.
    if not self._update_ops:
        self._update_ops = tf1.get_collection(
            tf1.GraphKeys.UPDATE_OPS, scope=tf1.get_variable_scope().name)
    if self._update_ops:
        logger.info("Update ops to run on apply gradient: {}".format(
            self._update_ops))
    with tf1.control_dependencies(self._update_ops):
        self._apply_op = self.build_apply_op(self._optimizer,
                                             self._grads_and_vars)

    if log_once("loss_used"):
        logger.debug(
            "These tensors were used in the loss_fn:\n\n{}\n".format(
                summarize(self._loss_input_dict)))

    self._sess.run(tf1.global_variables_initializer())

    self._optimizer_variables = None
    if self._optimizer:
        self._optimizer_variables = \
            ray.experimental.tf_utils.TensorFlowVariables(
                self._optimizer.variables(), self._sess)
def postprocess_trajectories_so_far( self, episode: Optional[MultiAgentEpisode] = None) -> None: # Loop through each per-policy collector and create a view (for each # agent as SampleBatch) from its buffers for post-processing all_agent_batches = {} for pid, rc in self.policy_sample_collectors.items(): policy = self.policy_map[pid] view_reqs = policy.training_view_requirements agent_batches = rc.get_postprocessing_sample_batches( episode, view_reqs) for agent_key, batch in agent_batches.items(): other_batches = None if len(agent_batches) > 1: other_batches = agent_batches.copy() del other_batches[agent_key] agent_batches[agent_key] = policy.postprocess_trajectory( batch, other_batches, episode) # Call the Policy's Exploration's postprocess method. if getattr(policy, "exploration", None) is not None: agent_batches[ agent_key] = policy.exploration.postprocess_trajectory( policy, agent_batches[agent_key], getattr(policy, "_sess", None)) # Add new columns' data to buffers. for col in agent_batches[agent_key].new_columns: data = agent_batches[agent_key].data[col] rc._build_buffers({col: data[0]}) timesteps = data.shape[0] rc.buffers[col][rc.shift_before:rc.shift_before + timesteps, rc.agent_key_to_slot[ agent_key]] = data all_agent_batches.update(agent_batches) if log_once("after_post"): logger.info("Trajectory fragment after postprocess_trajectory():" "\n\n{}\n".format(summarize(all_agent_batches))) # Append into policy batches and reset from ray.rllib.evaluation.rollout_worker import get_global_worker for agent_key, batch in sorted(all_agent_batches.items()): self.callbacks.on_postprocess_trajectory( worker=get_global_worker(), episode=episode, agent_id=agent_key[0], policy_id=self.agent_to_policy[agent_key[0]], policies=self.policy_map, postprocessed_batch=batch, original_batches=None) # TODO: (sven) do we really need this?
def _get_loss_inputs_dict(self, batch):
    feed_dict = {}
    if self._batch_divisibility_req > 1:
        meets_divisibility_reqs = (
            len(batch[SampleBatch.CUR_OBS]) %
            self._batch_divisibility_req == 0
            and max(batch[SampleBatch.AGENT_INDEX]) == 0)  # not multiagent
    else:
        meets_divisibility_reqs = True

    # Simple case: not RNN nor do we need to pad.
    if not self._state_inputs and meets_divisibility_reqs:
        for k, ph in self._loss_inputs:
            feed_dict[ph] = batch[k]
        return feed_dict

    if self._state_inputs:
        max_seq_len = self._max_seq_len
        dynamic_max = True
    else:
        max_seq_len = self._batch_divisibility_req
        dynamic_max = False

    # RNN or multi-agent case.
    feature_keys = [k for k, v in self._loss_inputs]
    state_keys = [
        "state_in_{}".format(i) for i in range(len(self._state_inputs))
    ]
    feature_sequences, initial_states, seq_lens = chop_into_sequences(
        batch[SampleBatch.EPS_ID],
        batch[SampleBatch.UNROLL_ID],
        batch[SampleBatch.AGENT_INDEX],
        [batch[k] for k in feature_keys],
        [batch[k] for k in state_keys],
        max_seq_len,
        dynamic_max=dynamic_max)
    for k, v in zip(feature_keys, feature_sequences):
        feed_dict[self._loss_input_dict[k]] = v
    for k, v in zip(state_keys, initial_states):
        feed_dict[self._loss_input_dict[k]] = v
    feed_dict[self._seq_lens] = seq_lens

    if log_once("rnn_feed_dict"):
        logger.info("Padded input for RNN:\n\n{}\n".format(
            summarize({
                "features": feature_sequences,
                "initial_states": initial_states,
                "seq_lens": seq_lens,
                "max_seq_len": max_seq_len,
            })))

    return feed_dict
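# Illustrative sketch only (standalone, not the RLlib helper itself):
# conceptually, chop_into_sequences() used above cuts each episode's rows into
# chunks of at most max_seq_len, zero-pads the last chunk, and reports each
# chunk's true length in seq_lens. A minimal numpy version of that padding
# idea for a single episode's feature column:
import numpy as np

def pad_episode(features, max_seq_len):
    seq_lens, padded = [], []
    for start in range(0, len(features), max_seq_len):
        chunk = features[start:start + max_seq_len]
        seq_lens.append(len(chunk))
        pad = np.zeros((max_seq_len - len(chunk),) + chunk.shape[1:],
                       dtype=chunk.dtype)
        padded.append(np.concatenate([chunk, pad]))
    return np.concatenate(padded), np.array(seq_lens, dtype=np.int32)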
def apply_gradients(self, grads):
    if log_once("apply_gradients"):
        logger.info("Apply gradients:\n\n{}\n".format(summarize(grads)))
    if isinstance(grads, dict):
        if self.tf_sess is not None:
            builder = TFRunBuilder(self.tf_sess, "apply_gradients")
            outputs = {
                pid: self.policy_map[pid]._build_apply_gradients(
                    builder, grad)
                for pid, grad in grads.items()
            }
            return {k: builder.get(v) for k, v in outputs.items()}
        else:
            return {
                pid: self.policy_map[pid].apply_gradients(g)
                for pid, g in grads.items()
            }
    else:
        return self.policy_map[DEFAULT_POLICY_ID].apply_gradients(grads)
def _env_runner(base_env, extra_batch_callback, policies, policy_mapping_fn, unroll_length, horizon, preprocessors, obs_filters, clip_rewards, clip_actions, pack, callbacks, tf_sess=None): """This implements the common experience collection logic. Args: base_env (BaseEnv): env implementing BaseEnv. extra_batch_callback (fn): function to send extra batch data to. policies (dict): Map of policy ids to PolicyGraph instances. policy_mapping_fn (func): Function that maps agent ids to policy ids. This is called when an agent first enters the environment. The agent is then "bound" to the returned policy for the episode. unroll_length (int): Number of episode steps before `SampleBatch` is yielded. Set to infinity to yield complete episodes. horizon (int): Horizon of the episode. preprocessors (dict): Map of policy id to preprocessor for the observations prior to filtering. obs_filters (dict): Map of policy id to filter used to process observations for the policy. clip_rewards (bool): Whether to clip rewards before postprocessing. pack (bool): Whether to pack multiple episodes into each batch. This guarantees batches will be exactly `unroll_length` in size. clip_actions (bool): Whether to clip actions to the space range. callbacks (dict): User callbacks to run on episode events. tf_sess (Session|None): Optional tensorflow session to use for batching TF policy evaluations. Yields: rollout (SampleBatch): Object containing state, action, reward, terminal condition, and other fields as dictated by `policy`. """ try: if not horizon: horizon = (base_env.get_unwrapped()[0].spec.max_episode_steps) except Exception: logger.debug("no episode horizon specified, assuming inf") if not horizon: horizon = float("inf") # Pool of batch builders, which can be shared across episodes to pack # trajectory data. batch_builder_pool = [] def get_batch_builder(): if batch_builder_pool: return batch_builder_pool.pop() else: return MultiAgentSampleBatchBuilder(policies, clip_rewards) def new_episode(): episode = MultiAgentEpisode(policies, policy_mapping_fn, get_batch_builder, extra_batch_callback) if callbacks.get("on_episode_start"): callbacks["on_episode_start"]({ "env": base_env, "policy": policies, "episode": episode, }) return episode active_episodes = defaultdict(new_episode) while True: # Get observations from all ready agents unfiltered_obs, rewards, dones, infos, off_policy_actions = \ base_env.poll() if log_once("env_returns"): logger.info("Raw obs from env: {}".format( summarize(unfiltered_obs))) logger.info("Info return from env: {}".format(summarize(infos))) # Process observations and prepare for policy evaluation active_envs, to_eval, outputs = _process_observations( base_env, policies, batch_builder_pool, active_episodes, unfiltered_obs, rewards, dones, infos, off_policy_actions, horizon, preprocessors, obs_filters, unroll_length, pack, callbacks) for o in outputs: yield o # Do batched policy eval eval_results = _do_policy_eval(tf_sess, to_eval, policies, active_episodes) # Process results and update episode state actions_to_send = _process_policy_eval_results( to_eval, eval_results, active_episodes, active_envs, off_policy_actions, policies, clip_actions) # Return computed actions to ready envs. We also send to envs that have # taken off-policy actions; those envs are free to ignore the action. base_env.send_actions(actions_to_send)
def load_data(self, sess, inputs, state_inputs): """Bulk loads the specified inputs into device memory. The shape of the inputs must conform to the shapes of the input placeholders this optimizer was constructed with. The data is split equally across all the devices. If the data is not evenly divisible by the batch size, excess data will be discarded. Args: sess: TensorFlow session. inputs: List of arrays matching the input placeholders, of shape [BATCH_SIZE, ...]. state_inputs: List of RNN input arrays. These arrays have size [BATCH_SIZE / MAX_SEQ_LEN, ...]. Returns: The number of tuples loaded per device. """ if log_once("load_data"): logger.info( "Training on concatenated sample batches:\n\n{}\n".format( summarize({ "placeholders": self.loss_inputs, "inputs": inputs, "state_inputs": state_inputs }))) feed_dict = {} assert len(self.loss_inputs) == len(inputs + state_inputs), \ (self.loss_inputs, inputs, state_inputs) # Let's suppose we have the following input data, and 2 devices: # 1 2 3 4 5 6 7 <- state inputs shape # A A A B B B C C C D D D E E E F F F G G G <- inputs shape # The data is truncated and split across devices as follows: # |---| seq len = 3 # |---------------------------------| seq batch size = 6 seqs # |----------------| per device batch size = 9 tuples if len(state_inputs) > 0: smallest_array = state_inputs[0] seq_len = len(inputs[0]) // len(state_inputs[0]) self._loaded_max_seq_len = seq_len else: smallest_array = inputs[0] self._loaded_max_seq_len = 1 sequences_per_minibatch = (self.max_per_device_batch_size // self._loaded_max_seq_len * len(self.devices)) if sequences_per_minibatch < 1: logger.warning( ("Target minibatch size is {}, however the rollout sequence " "length is {}, hence the minibatch size will be raised to " "{}.").format(self.max_per_device_batch_size, self._loaded_max_seq_len, self._loaded_max_seq_len * len(self.devices))) sequences_per_minibatch = 1 if len(smallest_array) < sequences_per_minibatch: # Dynamically shrink the batch size if insufficient data sequences_per_minibatch = make_divisible_by( len(smallest_array), len(self.devices)) if log_once("data_slicing"): logger.info( ("Divided {} rollout sequences, each of length {}, among " "{} devices.").format(len(smallest_array), self._loaded_max_seq_len, len(self.devices))) if sequences_per_minibatch < len(self.devices): raise ValueError( "Must load at least 1 tuple sequence per device. Try " "increasing `sgd_minibatch_size` or reducing `max_seq_len` " "to ensure that at least one sequence fits per device.") self._loaded_per_device_batch_size = (sequences_per_minibatch // len(self.devices) * self._loaded_max_seq_len) if len(state_inputs) > 0: # First truncate the RNN state arrays to the sequences_per_minib. 
state_inputs = [ make_divisible_by(arr, sequences_per_minibatch) for arr in state_inputs ] # Then truncate the data inputs to match inputs = [arr[:len(state_inputs[0]) * seq_len] for arr in inputs] assert len(state_inputs[0]) * seq_len == len(inputs[0]), \ (len(state_inputs[0]), sequences_per_minibatch, seq_len, len(inputs[0])) for ph, arr in zip(self.loss_inputs, inputs + state_inputs): feed_dict[ph] = arr truncated_len = len(inputs[0]) else: for ph, arr in zip(self.loss_inputs, inputs + state_inputs): truncated_arr = make_divisible_by(arr, sequences_per_minibatch) feed_dict[ph] = truncated_arr truncated_len = len(truncated_arr) sess.run([t.init_op for t in self._towers], feed_dict=feed_dict) self.num_tuples_loaded = truncated_len tuples_per_device = truncated_len // len(self.devices) assert tuples_per_device > 0, "No data loaded?" assert tuples_per_device % self._loaded_per_device_batch_size == 0 return tuples_per_device
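# Hedged usage sketch: a typical minibatch SGD loop around load_data() above
# is to load the data once, then step over the per-device batches. This
# assumes the loader also exposes an optimize(sess, batch_index) method (as in
# older RLlib multi-GPU optimizers); if your version differs, adapt the inner
# call accordingly. `_loaded_per_device_batch_size` is set by load_data().
def run_sgd_epoch(optimizer, sess, inputs, state_inputs):
    tuples_per_device = optimizer.load_data(sess, inputs, state_inputs)
    batch_size = optimizer._loaded_per_device_batch_size
    for offset in range(0, tuples_per_device, batch_size):
        optimizer.optimize(sess, offset)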
def _process_observations( *, worker: "RolloutWorker", base_env: BaseEnv, policies: Dict[PolicyID, Policy], active_episodes: Dict[str, MultiAgentEpisode], unfiltered_obs: Dict[EnvID, Dict[AgentID, EnvObsType]], rewards: Dict[EnvID, Dict[AgentID, float]], dones: Dict[EnvID, Dict[AgentID, bool]], infos: Dict[EnvID, Dict[AgentID, EnvInfoDict]], horizon: int, preprocessors: Dict[PolicyID, Preprocessor], obs_filters: Dict[PolicyID, Filter], multiple_episodes_in_batch: bool, callbacks: "DefaultCallbacks", soft_horizon: bool, no_done_at_end: bool, observation_fn: "ObservationFunction", sample_collector: SampleCollector, ) -> Tuple[Set[EnvID], Dict[PolicyID, List[PolicyEvalData]], List[Union[ RolloutMetrics, SampleBatchType]]]: """Record new data from the environment and prepare for policy evaluation. Args: worker (RolloutWorker): Reference to the current rollout worker. base_env (BaseEnv): Env implementing BaseEnv. policies (dict): Map of policy ids to Policy instances. batch_builder_pool (List[SampleBatchBuilder]): List of pooled SampleBatchBuilder object for recycling. active_episodes (Dict[str, MultiAgentEpisode]): Mapping from episode ID to currently ongoing MultiAgentEpisode object. unfiltered_obs (dict): Doubly keyed dict of env-ids -> agent ids -> unfiltered observation tensor, returned by a `BaseEnv.poll()` call. rewards (dict): Doubly keyed dict of env-ids -> agent ids -> rewards tensor, returned by a `BaseEnv.poll()` call. dones (dict): Doubly keyed dict of env-ids -> agent ids -> boolean done flags, returned by a `BaseEnv.poll()` call. infos (dict): Doubly keyed dict of env-ids -> agent ids -> info dicts, returned by a `BaseEnv.poll()` call. horizon (int): Horizon of the episode. preprocessors (dict): Map of policy id to preprocessor for the observations prior to filtering. obs_filters (dict): Map of policy id to filter used to process observations for the policy. rollout_fragment_length (int): Number of episode steps before `SampleBatch` is yielded. Set to infinity to yield complete episodes. multiple_episodes_in_batch (bool): Whether to pack multiple episodes into each batch. This guarantees batches will be exactly `rollout_fragment_length` in size. callbacks (DefaultCallbacks): User callbacks to run on episode events. soft_horizon (bool): Calculate rewards but don't reset the environment when the horizon is hit. no_done_at_end (bool): Ignore the done=True at the end of the episode and instead record done=False. observation_fn (ObservationFunction): Optional multi-agent observation func to use for preprocessing observations. sample_collector (SampleCollector): The SampleCollector object used to store and retrieve environment samples. Returns: Tuple: - active_envs: Set of non-terminated env ids. - to_eval: Map of policy_id to list of agent PolicyEvalData. - outputs: List of metrics and samples to return from the sampler. """ # Output objects. active_envs: Set[EnvID] = set() to_eval: Dict[PolicyID, List[PolicyEvalData]] = defaultdict(list) outputs: List[Union[RolloutMetrics, SampleBatchType]] = [] # For each (vectorized) sub-environment. # type: EnvID, Dict[AgentID, EnvObsType] for env_id, all_agents_obs in unfiltered_obs.items(): is_new_episode: bool = env_id not in active_episodes episode: MultiAgentEpisode = active_episodes[env_id] if not is_new_episode: sample_collector.episode_step(episode) episode._add_agent_rewards(rewards[env_id]) # Check episode termination conditions. 
if dones[env_id]["__all__"] or episode.length >= horizon: hit_horizon = (episode.length >= horizon and not dones[env_id]["__all__"]) all_agents_done = True atari_metrics: List[RolloutMetrics] = _fetch_atari_metrics( base_env) if atari_metrics is not None: for m in atari_metrics: outputs.append( m._replace(custom_metrics=episode.custom_metrics)) else: outputs.append( RolloutMetrics(episode.length, episode.total_reward, dict(episode.agent_rewards), episode.custom_metrics, {}, episode.hist_data, episode.media)) else: hit_horizon = False all_agents_done = False active_envs.add(env_id) # Custom observation function is applied before preprocessing. if observation_fn: all_agents_obs: Dict[AgentID, EnvObsType] = observation_fn( agent_obs=all_agents_obs, worker=worker, base_env=base_env, policies=policies, episode=episode) if not isinstance(all_agents_obs, dict): raise ValueError( "observe() must return a dict of agent observations") # For each agent in the environment. # type: AgentID, EnvObsType for agent_id, raw_obs in all_agents_obs.items(): assert agent_id != "__all__" last_observation: EnvObsType = episode.last_observation_for( agent_id) agent_done = bool(all_agents_done or dones[env_id].get(agent_id)) # A new agent (initial obs) is already done -> Skip entirely. if last_observation is None and agent_done: continue policy_id: PolicyID = episode.policy_for(agent_id) prep_obs: EnvObsType = _get_or_raise(preprocessors, policy_id).transform(raw_obs) if log_once("prep_obs"): logger.info("Preprocessed obs: {}".format(summarize(prep_obs))) filtered_obs: EnvObsType = _get_or_raise(obs_filters, policy_id)(prep_obs) if log_once("filtered_obs"): logger.info("Filtered obs: {}".format(summarize(filtered_obs))) episode._set_last_observation(agent_id, filtered_obs) episode._set_last_raw_obs(agent_id, raw_obs) # Infos from the environment. agent_infos = infos[env_id].get(agent_id, {}) episode._set_last_info(agent_id, agent_infos) # Record transition info if applicable. if last_observation is None: sample_collector.add_init_obs(episode, agent_id, env_id, policy_id, episode.length - 1, filtered_obs) else: # Add actions, rewards, next-obs to collectors. values_dict = { "t": episode.length - 1, "env_id": env_id, "agent_index": episode._agent_index(agent_id), # Action (slot 0) taken at timestep t. "actions": episode.last_action_for(agent_id), # Reward received after taking a at timestep t. "rewards": rewards[env_id][agent_id], # After taking action=a, did we reach terminal? "dones": (False if (no_done_at_end or (hit_horizon and soft_horizon)) else agent_done), # Next observation. "new_obs": filtered_obs, } # Add extra-action-fetches to collectors. pol = policies[policy_id] for key, value in episode.last_pi_info_for(agent_id).items(): if key in pol.view_requirements: values_dict[key] = value # Env infos for this agent. if "infos" in pol.view_requirements: values_dict["infos"] = agent_infos sample_collector.add_action_reward_next_obs( episode.episode_id, agent_id, env_id, policy_id, agent_done, values_dict) if not agent_done: item = PolicyEvalData( env_id, agent_id, filtered_obs, agent_infos, None if last_observation is None else episode.rnn_state_for(agent_id), None if last_observation is None else episode.last_action_for(agent_id), rewards[env_id][agent_id] or 0.0) to_eval[policy_id].append(item) # Invoke the `on_episode_step` callback after the step is logged # to the episode. # Exception: The very first env.poll() call causes the env to get reset # (no step taken yet, just a single starting observation logged). 
# We need to skip this callback in this case. if episode.length > 0: callbacks.on_episode_step(worker=worker, base_env=base_env, episode=episode, env_index=env_id) # Episode is done for all agents (dones[__all__] == True) # or we hit the horizon. if all_agents_done: is_done = dones[env_id]["__all__"] check_dones = is_done and not no_done_at_end # If, we are not allowed to pack the next episode into the same # SampleBatch (batch_mode=complete_episodes) -> Build the # MultiAgentBatch from a single episode and add it to "outputs". # Otherwise, just postprocess and continue collecting across # episodes. ma_sample_batch = sample_collector.postprocess_episode( episode, is_done=is_done or (hit_horizon and not soft_horizon), check_dones=check_dones, build=not multiple_episodes_in_batch) if ma_sample_batch: outputs.append(ma_sample_batch) # Call each policy's Exploration.on_episode_end method. for p in policies.values(): if getattr(p, "exploration", None) is not None: p.exploration.on_episode_end(policy=p, environment=base_env, episode=episode, tf_sess=getattr( p, "_sess", None)) # Call custom on_episode_end callback. callbacks.on_episode_end( worker=worker, base_env=base_env, policies=policies, episode=episode, env_index=env_id, ) # Horizon hit and we have a soft horizon (no hard env reset). if hit_horizon and soft_horizon: episode.soft_reset() resetted_obs: Dict[AgentID, EnvObsType] = all_agents_obs else: del active_episodes[env_id] resetted_obs: Dict[AgentID, EnvObsType] = base_env.try_reset(env_id) # Reset not supported, drop this env from the ready list. if resetted_obs is None: if horizon != float("inf"): raise ValueError( "Setting episode horizon requires reset() support " "from the environment.") # Creates a new episode if this is not async return. # If reset is async, we will get its result in some future poll. elif resetted_obs != ASYNC_RESET_RETURN: new_episode: MultiAgentEpisode = active_episodes[env_id] if observation_fn: resetted_obs: Dict[AgentID, EnvObsType] = observation_fn( agent_obs=resetted_obs, worker=worker, base_env=base_env, policies=policies, episode=new_episode) # type: AgentID, EnvObsType for agent_id, raw_obs in resetted_obs.items(): policy_id: PolicyID = new_episode.policy_for(agent_id) prep_obs: EnvObsType = _get_or_raise( preprocessors, policy_id).transform(raw_obs) filtered_obs: EnvObsType = _get_or_raise( obs_filters, policy_id)(prep_obs) new_episode._set_last_observation(agent_id, filtered_obs) # Add initial obs to buffer. sample_collector.add_init_obs(new_episode, agent_id, env_id, policy_id, new_episode.length - 1, filtered_obs) item = PolicyEvalData( env_id, agent_id, filtered_obs, episode.last_info_for(agent_id) or {}, episode.rnn_state_for(agent_id), None, 0.0) to_eval[policy_id].append(item) # Try to build something. if multiple_episodes_in_batch: sample_batches = \ sample_collector.try_build_truncated_episode_multi_agent_batch() if sample_batches: outputs.extend(sample_batches) return active_envs, to_eval, outputs
def _env_runner( worker: "RolloutWorker", base_env: BaseEnv, extra_batch_callback: Callable[[SampleBatchType], None], policies: Dict[PolicyID, Policy], policy_mapping_fn: Callable[[AgentID], PolicyID], rollout_fragment_length: int, horizon: int, preprocessors: Dict[PolicyID, Preprocessor], obs_filters: Dict[PolicyID, Filter], clip_rewards: bool, clip_actions: bool, multiple_episodes_in_batch: bool, callbacks: "DefaultCallbacks", tf_sess: Optional["tf.Session"], perf_stats: _PerfStats, soft_horizon: bool, no_done_at_end: bool, observation_fn: "ObservationFunction", sample_collector: Optional[SampleCollector] = None, render: bool = None, ) -> Iterable[SampleBatchType]: """This implements the common experience collection logic. Args: worker (RolloutWorker): Reference to the current rollout worker. base_env (BaseEnv): Env implementing BaseEnv. extra_batch_callback (fn): function to send extra batch data to. policies (Dict[PolicyID, Policy]): Map of policy ids to Policy instances. policy_mapping_fn (func): Function that maps agent ids to policy ids. This is called when an agent first enters the environment. The agent is then "bound" to the returned policy for the episode. rollout_fragment_length (int): Number of episode steps before `SampleBatch` is yielded. Set to infinity to yield complete episodes. horizon (int): Horizon of the episode. preprocessors (dict): Map of policy id to preprocessor for the observations prior to filtering. obs_filters (dict): Map of policy id to filter used to process observations for the policy. clip_rewards (bool): Whether to clip rewards before postprocessing. multiple_episodes_in_batch (bool): Whether to pack multiple episodes into each batch. This guarantees batches will be exactly `rollout_fragment_length` in size. clip_actions (bool): Whether to clip actions to the space range. callbacks (DefaultCallbacks): User callbacks to run on episode events. tf_sess (Session|None): Optional tensorflow session to use for batching TF policy evaluations. perf_stats (_PerfStats): Record perf stats into this object. soft_horizon (bool): Calculate rewards but don't reset the environment when the horizon is hit. no_done_at_end (bool): Ignore the done=True at the end of the episode and instead record done=False. observation_fn (ObservationFunction): Optional multi-agent observation func to use for preprocessing observations. sample_collector (Optional[SampleCollector]): An optional SampleCollector object to use. render (bool): Whether to try to render the environment after each step. Yields: rollout (SampleBatch): Object containing state, action, reward, terminal condition, and other fields as dictated by `policy`. """ # May be populated with used for image rendering simple_image_viewer: Optional["SimpleImageViewer"] = None # Try to get Env's `max_episode_steps` prop. If it doesn't exist, ignore # error and continue with max_episode_steps=None. max_episode_steps = None try: max_episode_steps = base_env.get_unwrapped()[0].spec.max_episode_steps except Exception: pass # Trainer has a given `horizon` setting. if horizon: # `horizon` is larger than env's limit. if max_episode_steps and horizon > max_episode_steps: # Try to override the env's own max-step setting with our horizon. # If this won't work, throw an error. try: base_env.get_unwrapped()[0].spec.max_episode_steps = horizon base_env.get_unwrapped()[0]._max_episode_steps = horizon except Exception: raise ValueError( "Your `horizon` setting ({}) is larger than the Env's own " "timestep limit ({}), which seems to be unsettable! 
Try " "to increase the Env's built-in limit to be at least as " "large as your wanted `horizon`.".format( horizon, max_episode_steps)) # Otherwise, set Trainer's horizon to env's max-steps. elif max_episode_steps: horizon = max_episode_steps logger.debug( "No episode horizon specified, setting it to Env's limit ({}).". format(max_episode_steps)) # No horizon/max_episode_steps -> Episodes may be infinitely long. else: horizon = float("inf") logger.debug("No episode horizon specified, assuming inf.") # Pool of batch builders, which can be shared across episodes to pack # trajectory data. batch_builder_pool: List[MultiAgentSampleBatchBuilder] = [] def get_batch_builder(): if batch_builder_pool: return batch_builder_pool.pop() else: return None def new_episode(env_id): episode = MultiAgentEpisode(policies, policy_mapping_fn, get_batch_builder, extra_batch_callback, env_id=env_id) # Call each policy's Exploration.on_episode_start method. # type: Policy for p in policies.values(): if getattr(p, "exploration", None) is not None: p.exploration.on_episode_start(policy=p, environment=base_env, episode=episode, tf_sess=getattr( p, "_sess", None)) callbacks.on_episode_start( worker=worker, base_env=base_env, policies=policies, episode=episode, env_index=env_id, ) return episode active_episodes: Dict[str, MultiAgentEpisode] = \ NewEpisodeDefaultDict(new_episode) while True: perf_stats.iters += 1 t0 = time.time() # Get observations from all ready agents. # type: MultiEnvDict, MultiEnvDict, MultiEnvDict, MultiEnvDict, ... unfiltered_obs, rewards, dones, infos, off_policy_actions = \ base_env.poll() perf_stats.env_wait_time += time.time() - t0 if log_once("env_returns"): logger.info("Raw obs from env: {}".format( summarize(unfiltered_obs))) logger.info("Info return from env: {}".format(summarize(infos))) # Process observations and prepare for policy evaluation. t1 = time.time() # type: Set[EnvID], Dict[PolicyID, List[PolicyEvalData]], # List[Union[RolloutMetrics, SampleBatchType]] active_envs, to_eval, outputs = \ _process_observations( worker=worker, base_env=base_env, policies=policies, active_episodes=active_episodes, unfiltered_obs=unfiltered_obs, rewards=rewards, dones=dones, infos=infos, horizon=horizon, preprocessors=preprocessors, obs_filters=obs_filters, multiple_episodes_in_batch=multiple_episodes_in_batch, callbacks=callbacks, soft_horizon=soft_horizon, no_done_at_end=no_done_at_end, observation_fn=observation_fn, sample_collector=sample_collector, ) perf_stats.raw_obs_processing_time += time.time() - t1 for o in outputs: yield o # Do batched policy eval (accross vectorized envs). t2 = time.time() # type: Dict[PolicyID, Tuple[TensorStructType, StateBatch, dict]] eval_results = _do_policy_eval( to_eval=to_eval, policies=policies, sample_collector=sample_collector, active_episodes=active_episodes, tf_sess=tf_sess, ) perf_stats.inference_time += time.time() - t2 # Process results and update episode state. t3 = time.time() actions_to_send: Dict[EnvID, Dict[AgentID, EnvActionType]] = \ _process_policy_eval_results( to_eval=to_eval, eval_results=eval_results, active_episodes=active_episodes, active_envs=active_envs, off_policy_actions=off_policy_actions, policies=policies, clip_actions=clip_actions, ) perf_stats.action_processing_time += time.time() - t3 # Return computed actions to ready envs. We also send to envs that have # taken off-policy actions; those envs are free to ignore the action. 
t4 = time.time() base_env.send_actions(actions_to_send) perf_stats.env_wait_time += time.time() - t4 # Try to render the env, if required. if render: t5 = time.time() # Render can either return an RGB image (uint8 [w x h x 3] numpy # array) or take care of rendering itself (returning True). rendered = base_env.try_render() # Rendering returned an image -> Display it in a SimpleImageViewer. if isinstance(rendered, np.ndarray) and len(rendered.shape) == 3: # ImageViewer not defined yet, try to create one. if simple_image_viewer is None: try: from gym.envs.classic_control.rendering import \ SimpleImageViewer simple_image_viewer = SimpleImageViewer() except (ImportError, ModuleNotFoundError): render = False # disable rendering logger.warning( "Could not import gym.envs.classic_control." "rendering! Try `pip install gym[all]`.") if simple_image_viewer: simple_image_viewer.imshow(rendered) perf_stats.env_render_time += time.time() - t5
def _get_loss_inputs_dict(self, batch, neighbor_batch_dic, shuffle):
    """Return a feed dict from a batch.

    Arguments:
        batch (SampleBatch): batch of data to derive inputs from
        neighbor_batch_dic (dict, SampleBatch): batch of data for neighbors
            of the main policy
        shuffle (bool): whether to shuffle batch sequences. Shuffle may
            be done in-place. This only makes sense if you're further
            applying minibatch SGD after getting the outputs.

    Returns:
        feed dict of data
    """
    feed_dict = {}
    if self._batch_divisibility_req > 1:
        meets_divisibility_reqs = (
            len(batch[SampleBatch.CUR_OBS]) %
            self._batch_divisibility_req == 0
            and max(batch[SampleBatch.AGENT_INDEX]) == 0)  # not multiagent
    else:
        meets_divisibility_reqs = True

    neighbor_list = [None] * 5
    neighbor_count = 0
    for k in neighbor_batch_dic:
        neighbor_list[neighbor_count] = k
        neighbor_count += 1
    tmp_dic = {}

    # Simple case: not RNN nor do we need to pad.
    if not self._state_inputs and meets_divisibility_reqs:
        if shuffle:
            batch.shuffle()
        for k, ph in self._loss_inputs:
            # Attention mechanism: this is the replay-buffer path used for
            # the Q target; assemble the neighbor_obs tensor from the
            # neighbors' SampleBatches.
            if 'neighbor_obs' in k:
                tmp_dic[ph] = {}
                feed_dict[ph] = []
                for neighbor_id in neighbor_list:
                    tmp_dic[ph][neighbor_id] = neighbor_batch_dic[
                        neighbor_id]['obs']
                neighbor_id = neighbor_list[0]
                # [neighbor, batch, feature] -> [batch, neighbor, feature]
                for batch_item in range(len(tmp_dic[ph][neighbor_id])):
                    feed_dict[ph].append([])
                    for neighbor_id in neighbor_list:
                        feed_dict[ph][batch_item].append(
                            tmp_dic[ph][neighbor_id][batch_item])
                feed_dict[ph] = np.array(feed_dict[ph])
            else:
                feed_dict[ph] = batch[k]
        return feed_dict

    if self._state_inputs:
        max_seq_len = self._max_seq_len
        dynamic_max = True
    else:
        max_seq_len = self._batch_divisibility_req
        dynamic_max = False

    # RNN or multi-agent case.
    feature_keys = [k for k, v in self._loss_inputs]
    state_keys = [
        "state_in_{}".format(i) for i in range(len(self._state_inputs))
    ]
    feature_sequences, initial_states, seq_lens = chop_into_sequences(
        batch[SampleBatch.EPS_ID],
        batch[SampleBatch.UNROLL_ID],
        batch[SampleBatch.AGENT_INDEX],
        [batch[k] for k in feature_keys],
        [batch[k] for k in state_keys],
        max_seq_len,
        dynamic_max=dynamic_max,
        shuffle=shuffle)
    for k, v in zip(feature_keys, feature_sequences):
        feed_dict[self._loss_input_dict[k]] = v
    for k, v in zip(state_keys, initial_states):
        feed_dict[self._loss_input_dict[k]] = v
    feed_dict[self._seq_lens] = seq_lens

    if log_once("rnn_feed_dict"):
        logger.info("Padded input for RNN:\n\n{}\n".format(
            summarize({
                "features": feature_sequences,
                "initial_states": initial_states,
                "seq_lens": seq_lens,
                "max_seq_len": max_seq_len,
            })))

    return feed_dict
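# Hedged refactoring sketch: the nested loops above build a
# [batch, neighbor, feature] array out of the per-neighbor observation
# batches. With numpy the same reshaping can be done directly; this assumes
# every neighbor batch has the same length and feature size (which the loop
# version implicitly requires as well).
def stack_neighbor_obs(neighbor_batch_dic, neighbor_list):
    per_neighbor = [
        np.asarray(neighbor_batch_dic[nid]['obs']) for nid in neighbor_list
    ]  # shape [neighbor, batch, feature]
    return np.stack(per_neighbor, axis=0).transpose(1, 0, 2)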
def postprocess_episode(
        self,
        episode: MultiAgentEpisode,
        is_done: bool = False,
        check_dones: bool = False,
        build: bool = False) -> Union[None, SampleBatch, MultiAgentBatch]:
    episode_id = episode.episode_id
    policy_collector_group = episode.batch_builder

    # TODO: (sven) Once we implement multi-agent communication channels,
    #  we have to resolve the restriction of only sending other agent
    #  batches from the same policy to the postprocess methods.
    # Build SampleBatches for the given episode.
    pre_batches = {}
    for (eps_id, agent_id), collector in self.agent_collectors.items():
        # Build only if there is data and agent is part of given episode.
        if collector.agent_steps == 0 or eps_id != episode_id:
            continue
        pid = self.agent_key_to_policy_id[(eps_id, agent_id)]
        policy = self.policy_map[pid]
        pre_batch = collector.build(policy.view_requirements)
        pre_batches[agent_id] = (policy, pre_batch)

    # Apply reward clipping before calling postprocessing functions.
    if self.clip_rewards is True:
        for _, (_, pre_batch) in pre_batches.items():
            pre_batch["rewards"] = np.sign(pre_batch["rewards"])
    elif self.clip_rewards:
        for _, (_, pre_batch) in pre_batches.items():
            pre_batch["rewards"] = np.clip(
                pre_batch["rewards"],
                a_min=-self.clip_rewards,
                a_max=self.clip_rewards)

    post_batches = {}
    for agent_id, (_, pre_batch) in pre_batches.items():
        # Entire episode is said to be done.
        # Error if no DONE at end of this agent's trajectory.
        if is_done and check_dones and \
                not pre_batch[SampleBatch.DONES][-1]:
            raise ValueError(
                "Episode {} terminated for all agents, but we still "
                "don't have a last observation for agent {} (policy "
                "{}). ".format(
                    episode_id, agent_id, self.agent_key_to_policy_id[(
                        episode_id, agent_id)]) +
                "Please ensure that you include the last observations "
                "of all live agents when setting done[__all__] to "
                "True. Alternatively, set no_done_at_end=True to "
                "allow this.")
        # If (only this?) agent is done, erase its buffer entirely.
        if pre_batch[SampleBatch.DONES][-1]:
            del self.agent_collectors[(episode_id, agent_id)]

        other_batches = pre_batches.copy()
        del other_batches[agent_id]
        pid = self.agent_key_to_policy_id[(episode_id, agent_id)]
        policy = self.policy_map[pid]
        if any(pre_batch[SampleBatch.DONES][:-1]) or len(
                set(pre_batch[SampleBatch.EPS_ID])) > 1:
            raise ValueError(
                "Batches sent to postprocessing must only contain steps "
                "from a single trajectory.", pre_batch)
        # Call the Policy's Exploration's postprocess method.
        post_batches[agent_id] = pre_batch
        if getattr(policy, "exploration", None) is not None:
            policy.exploration.postprocess_trajectory(
                policy, post_batches[agent_id],
                getattr(policy, "_sess", None))
        post_batches[agent_id] = policy.postprocess_trajectory(
            post_batches[agent_id], other_batches, episode)

    if log_once("after_post"):
        logger.info(
            "Trajectory fragment after postprocess_trajectory():\n\n{}\n".
            format(summarize(post_batches)))

    # Append into policy batches and reset.
    from ray.rllib.evaluation.rollout_worker import get_global_worker
    for agent_id, post_batch in sorted(post_batches.items()):
        pid = self.agent_key_to_policy_id[(episode_id, agent_id)]
        policy = self.policy_map[pid]
        self.callbacks.on_postprocess_trajectory(
            worker=get_global_worker(),
            episode=episode,
            agent_id=agent_id,
            policy_id=pid,
            policies=self.policy_map,
            postprocessed_batch=post_batch,
            original_batches=pre_batches)
        # Add the postprocessed SampleBatch to the policy collectors for
        # training.
        policy_collector_group.policy_collectors[
            pid].add_postprocessed_batch_for_training(
                post_batch, policy.view_requirements)

    env_steps = self.episode_steps[episode_id]
    policy_collector_group.env_steps += env_steps
    agent_steps = self.agent_steps[episode_id]
    policy_collector_group.agent_steps += agent_steps

    if is_done:
        del self.episode_steps[episode_id]
        del self.agent_steps[episode_id]
        del self.episodes[episode_id]
        # Make PolicyCollectorGroup available for more agent batches in
        # other episodes. Do not reset count to 0.
        self.policy_collector_groups.append(policy_collector_group)
    else:
        self.episode_steps[episode_id] = self.agent_steps[episode_id] = 0

    # Build a MultiAgentBatch from the episode and return.
    if build:
        return self._build_multi_agent_batch(episode)
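The reward-clipping branch above supports both a boolean and a numeric config: clip_rewards=True maps each reward to its sign, while a float c clips rewards to [-c, c] and anything falsy leaves them untouched. A minimal standalone sketch of that behavior (the helper name is hypothetical, not part of the class):

import numpy as np

def clip_episode_rewards(rewards, clip_rewards):
    """Sketch mirroring the clipping branch above."""
    rewards = np.asarray(rewards, dtype=np.float32)
    if clip_rewards is True:
        return np.sign(rewards)          # -1.0, 0.0 or +1.0
    elif clip_rewards:                   # numeric value c -> clip to [-c, c]
        return np.clip(rewards, a_min=-clip_rewards, a_max=clip_rewards)
    return rewards                       # falsy (False/None/0): no clipping

assert list(clip_episode_rewards([2.5, -0.5, 0.0], True)) == [1.0, -1.0, 0.0]
assert list(clip_episode_rewards([2.5, -0.5], 1.0)) == [1.0, -0.5]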
def _initialize_loss_from_dummy_batch(
        self, auto_remove_unneeded_view_reqs: bool = True,
        stats_fn=None) -> None:
    # Create the optimizer/exploration optimizer here. Some initialization
    # steps (e.g. exploration postprocessing) may need this.
    self._optimizer = self.optimizer()

    # Test calls depend on variable init, so initialize model first.
    self._sess.run(tf1.global_variables_initializer())

    if self.config["_use_trajectory_view_api"]:
        logger.info("Testing `compute_actions` w/ dummy batch.")
        actions, state_outs, extra_fetches = \
            self.compute_actions_from_input_dict(
                self._dummy_batch, explore=False, timestep=0)
        for key, value in extra_fetches.items():
            self._dummy_batch[key] = np.zeros_like(value)
            self._input_dict[key] = get_placeholder(value=value, name=key)
            if key not in self.view_requirements:
                logger.info("Adding extra-action-fetch `{}` to "
                            "view-reqs.".format(key))
                self.view_requirements[key] = \
                    ViewRequirement(space=gym.spaces.Box(
                        -1.0, 1.0, shape=value.shape[1:],
                        dtype=value.dtype))
        dummy_batch = self._dummy_batch
    else:
        def fake_array(tensor):
            shape = tensor.shape.as_list()
            shape = [s if s is not None else 1 for s in shape]
            return np.zeros(shape, dtype=tensor.dtype.as_numpy_dtype)

        dummy_batch = {
            SampleBatch.CUR_OBS: fake_array(self._obs_input),
            SampleBatch.NEXT_OBS: fake_array(self._obs_input),
            SampleBatch.DONES: np.array([False], dtype=np.bool_),
            SampleBatch.ACTIONS: fake_array(
                ModelCatalog.get_action_placeholder(self.action_space)),
            SampleBatch.REWARDS: np.array([0], dtype=np.float32),
        }
        if self._obs_include_prev_action_reward:
            dummy_batch.update({
                SampleBatch.PREV_ACTIONS: fake_array(
                    self._prev_action_input),
                SampleBatch.PREV_REWARDS: fake_array(
                    self._prev_reward_input),
            })
        state_init = self.get_initial_state()
        state_batches = []
        for i, h in enumerate(state_init):
            dummy_batch["state_in_{}".format(i)] = np.expand_dims(h, 0)
            dummy_batch["state_out_{}".format(i)] = np.expand_dims(h, 0)
            state_batches.append(np.expand_dims(h, 0))
        if state_init:
            dummy_batch["seq_lens"] = np.array([1], dtype=np.int32)
        for k, v in self.extra_compute_action_fetches().items():
            dummy_batch[k] = fake_array(v)
        dummy_batch = SampleBatch(dummy_batch)

    batch_for_postproc = UsageTrackingDict(dummy_batch)
    batch_for_postproc.count = dummy_batch.count
    logger.info("Testing `postprocess_trajectory` w/ dummy batch.")
    self.exploration.postprocess_trajectory(self, batch_for_postproc,
                                            self._sess)
    postprocessed_batch = self.postprocess_trajectory(batch_for_postproc)
    # Add new columns automatically to (loss) input_dict.
    if self.config["_use_trajectory_view_api"]:
        for key in batch_for_postproc.added_keys:
            if key not in self._input_dict:
                self._input_dict[key] = get_placeholder(
                    value=batch_for_postproc[key], name=key)
            if key not in self.view_requirements:
                self.view_requirements[key] = \
                    ViewRequirement(space=gym.spaces.Box(
                        -1.0, 1.0,
                        shape=batch_for_postproc[key].shape[1:],
                        dtype=batch_for_postproc[key].dtype))

    if not self.config["_use_trajectory_view_api"]:
        train_batch = UsageTrackingDict(
            dict({
                SampleBatch.CUR_OBS: self._obs_input,
            }, **self._loss_input_dict))
        if self._obs_include_prev_action_reward:
            train_batch.update({
                SampleBatch.PREV_ACTIONS: self._prev_action_input,
                SampleBatch.PREV_REWARDS: self._prev_reward_input,
                SampleBatch.CUR_OBS: self._obs_input,
            })

        for k, v in postprocessed_batch.items():
            if k in train_batch:
                continue
            elif v.dtype == np.object_:
                continue  # can't handle arbitrary objects in TF
            elif k == "seq_lens" or k.startswith("state_in_"):
                continue
            shape = (None, ) + v.shape[1:]
            dtype = np.float32 if v.dtype == np.float64 else v.dtype
            placeholder = tf1.placeholder(dtype, shape=shape, name=k)
            train_batch[k] = placeholder

        for i, si in enumerate(self._state_inputs):
            train_batch["state_in_{}".format(i)] = si
    else:
        train_batch = UsageTrackingDict(
            dict(self._input_dict, **self._loss_input_dict))

    if self._state_inputs:
        train_batch["seq_lens"] = self._seq_lens

    if log_once("loss_init"):
        logger.debug(
            "Initializing loss function with dummy input:\n\n{}\n".format(
                summarize(train_batch)))

    self._loss_input_dict.update({k: v for k, v in train_batch.items()})
    loss = self._do_loss_init(train_batch)

    all_accessed_keys = \
        train_batch.accessed_keys | batch_for_postproc.accessed_keys | \
        batch_for_postproc.added_keys | set(
            self.model.view_requirements.keys())

    TFPolicy._initialize_loss(
        self, loss,
        [(k, v) for k, v in train_batch.items() if k in all_accessed_keys])

    if "is_training" in self._loss_input_dict:
        del self._loss_input_dict["is_training"]

    # Call the grads stats fn.
    # TODO: (sven) rename to simply stats_fn to match eager and torch.
    if self._grad_stats_fn:
        self._stats_fetches.update(
            self._grad_stats_fn(self, train_batch, self._grads))

    # Add new columns automatically to view-reqs.
    if self.config["_use_trajectory_view_api"] and \
            auto_remove_unneeded_view_reqs:
        # Add those needed for postprocessing and training.
        all_accessed_keys = train_batch.accessed_keys | \
            batch_for_postproc.accessed_keys
        # Tag those only needed for post-processing (with some exceptions).
        for key in batch_for_postproc.accessed_keys:
            if key not in train_batch.accessed_keys and \
                    key not in self.model.view_requirements and \
                    key not in [
                        SampleBatch.EPS_ID, SampleBatch.AGENT_INDEX,
                        SampleBatch.UNROLL_ID, SampleBatch.DONES,
                        SampleBatch.REWARDS, SampleBatch.INFOS]:
                if key in self.view_requirements:
                    self.view_requirements[key].used_for_training = False
                if key in self._loss_input_dict:
                    del self._loss_input_dict[key]
        # Remove those not needed at all (leave those that are needed
        # by Sampler to properly execute sample collection).
        # Also always leave DONES, REWARDS, and INFOS, no matter what.
        for key in list(self.view_requirements.keys()):
            if key not in all_accessed_keys and key not in [
                    SampleBatch.EPS_ID, SampleBatch.AGENT_INDEX,
                    SampleBatch.UNROLL_ID, SampleBatch.DONES,
                    SampleBatch.REWARDS, SampleBatch.INFOS] and \
                    key not in self.model.view_requirements:
                # If user deleted this key manually in postprocessing
                # fn, warn about it and do not remove from
                # view-requirements.
                if key in batch_for_postproc.deleted_keys:
                    logger.warning(
                        "SampleBatch key '{}' was deleted manually in "
                        "postprocessing function! RLlib will "
                        "automatically remove non-used items from the "
                        "data stream. Remove the `del` from your "
                        "postprocessing function.".format(key))
                else:
                    del self.view_requirements[key]
                if key in self._loss_input_dict:
                    del self._loss_input_dict[key]
        # Add those data_cols (again) that are missing and have
        # dependencies by view_cols.
        for key in list(self.view_requirements.keys()):
            vr = self.view_requirements[key]
            if (vr.data_col is not None
                    and vr.data_col not in self.view_requirements):
                used_for_training = \
                    vr.data_col in train_batch.accessed_keys
                self.view_requirements[vr.data_col] = ViewRequirement(
                    space=vr.space, used_for_training=used_for_training)

    self._loss_input_dict_no_rnn = {
        k: v
        for k, v in self._loss_input_dict.items()
        if (v not in self._state_inputs and v != self._seq_lens)
    }

    # Initialize again after loss init.
    self._sess.run(tf1.global_variables_initializer())
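Both loss-initialization paths rely on a dict wrapper (UsageTrackingDict) that records which columns postprocessing and the loss actually read or add, so unused view requirements can be pruned afterwards. A self-contained sketch of that tracking idea, assuming only the accessed/added bookkeeping shown above (this is not RLlib's actual implementation):

class AccessTrackingDict(dict):
    """Toy stand-in for the accessed/added-key tracking used above."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.accessed_keys = set()
        self.added_keys = set()

    def __getitem__(self, key):
        self.accessed_keys.add(key)
        return super().__getitem__(key)

    def __setitem__(self, key, value):
        if key not in self:
            self.added_keys.add(key)
        super().__setitem__(key, value)

batch = AccessTrackingDict({"obs": [1, 2], "rewards": [0.0, 1.0]})
_ = batch["obs"]              # the loss reads "obs"
batch["advantages"] = [0.5]   # postprocessing adds a new column
assert batch.accessed_keys == {"obs"}
assert batch.added_keys == {"advantages"}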
def _process_observations(base_env, policies, batch_builder_pool,
                          active_episodes, unfiltered_obs, rewards, dones,
                          infos, off_policy_actions, horizon, preprocessors,
                          obs_filters, unroll_length, pack, callbacks):
    """Record new data from the environment and prepare for policy evaluation.

    Returns:
        active_envs: set of non-terminated env ids
        to_eval: map of policy_id to list of agent PolicyEvalData
        outputs: list of metrics and samples to return from the sampler
    """

    active_envs = set()
    to_eval = defaultdict(list)
    outputs = []

    # For each environment
    for env_id, agent_obs in unfiltered_obs.items():
        new_episode = env_id not in active_episodes
        episode = active_episodes[env_id]
        if not new_episode:
            episode.length += 1
            episode.batch_builder.count += 1
            episode._add_agent_rewards(rewards[env_id])

        if (episode.batch_builder.total() > max(1000, unroll_length * 10)
                and log_once("large_batch_warning")):
            logger.warning(
                "More than {} observations for {} env steps ".format(
                    episode.batch_builder.total(),
                    episode.batch_builder.count) + "are buffered in "
                "the sampler. If this is more than you expected, check "
                "that you set a horizon on your environment correctly. "
                "Note that in multi-agent environments, "
                "`sample_batch_size` sets the batch size based on "
                "environment steps, not the steps of individual agents, "
                "which can result in unexpectedly large batches.")

        # Check episode termination conditions
        if dones[env_id]["__all__"] or episode.length >= horizon:
            all_done = True
            atari_metrics = _fetch_atari_metrics(base_env)
            if atari_metrics is not None:
                for m in atari_metrics:
                    outputs.append(
                        m._replace(custom_metrics=episode.custom_metrics))
            else:
                outputs.append(
                    RolloutMetrics(episode.length, episode.total_reward,
                                   dict(episode.agent_rewards),
                                   episode.custom_metrics))
        else:
            all_done = False
            active_envs.add(env_id)

        # For each agent in the environment
        for agent_id, raw_obs in agent_obs.items():
            policy_id = episode.policy_for(agent_id)
            prep_obs = _get_or_raise(preprocessors,
                                     policy_id).transform(raw_obs)
            if log_once("prep_obs"):
                logger.info("Preprocessed obs: {}".format(
                    summarize(prep_obs)))

            filtered_obs = _get_or_raise(obs_filters, policy_id)(prep_obs)
            if log_once("filtered_obs"):
                logger.info("Filtered obs: {}".format(
                    summarize(filtered_obs)))

            agent_done = bool(all_done or dones[env_id].get(agent_id))
            if not agent_done:
                to_eval[policy_id].append(
                    PolicyEvalData(env_id, agent_id, filtered_obs,
                                   infos[env_id].get(agent_id, {}),
                                   episode.rnn_state_for(agent_id),
                                   episode.last_action_for(agent_id),
                                   rewards[env_id][agent_id] or 0.0))

            last_observation = episode.last_observation_for(agent_id)
            episode._set_last_observation(agent_id, filtered_obs)
            episode._set_last_raw_obs(agent_id, raw_obs)
            episode._set_last_info(agent_id,
                                   infos[env_id].get(agent_id, {}))

            # Record transition info if applicable
            if (last_observation is not None and infos[env_id].get(
                    agent_id, {}).get("training_enabled", True)):
                episode.batch_builder.add_values(
                    agent_id,
                    policy_id,
                    t=episode.length - 1,
                    eps_id=episode.episode_id,
                    agent_index=episode._agent_index(agent_id),
                    obs=last_observation,
                    actions=episode.last_action_for(agent_id),
                    rewards=rewards[env_id][agent_id],
                    prev_actions=episode.prev_action_for(agent_id),
                    prev_rewards=episode.prev_reward_for(agent_id),
                    dones=agent_done,
                    infos=infos[env_id].get(agent_id, {}),
                    new_obs=filtered_obs,
                    **episode.last_pi_info_for(agent_id))

        # Invoke the step callback after the step is logged to the episode
        if callbacks.get("on_episode_step"):
            callbacks["on_episode_step"]({
                "env": base_env,
                "episode": episode
            })

        # Cut the batch if we're not packing multiple episodes into one,
        # or if we've exceeded the requested batch size.
        if episode.batch_builder.has_pending_data():
            if dones[env_id]["__all__"]:
                episode.batch_builder.check_missing_dones()
            if (all_done and not pack) or \
                    episode.batch_builder.count >= unroll_length:
                outputs.append(
                    episode.batch_builder.build_and_reset(episode))
            elif all_done:
                # Make sure postprocessor stays within one episode
                episode.batch_builder.postprocess_batch_so_far(episode)

        if all_done:
            # Handle episode termination
            batch_builder_pool.append(episode.batch_builder)
            if callbacks.get("on_episode_end"):
                callbacks["on_episode_end"]({
                    "env": base_env,
                    "policy": policies,
                    "episode": episode
                })
            del active_episodes[env_id]
            resetted_obs = base_env.try_reset(env_id)
            if resetted_obs is None:
                # Reset not supported, drop this env from the ready list
                if horizon != float("inf"):
                    raise ValueError(
                        "Setting episode horizon requires reset() support "
                        "from the environment.")
            else:
                # Creates a new episode
                episode = active_episodes[env_id]
                for agent_id, raw_obs in resetted_obs.items():
                    policy_id = episode.policy_for(agent_id)
                    policy = _get_or_raise(policies, policy_id)
                    prep_obs = _get_or_raise(preprocessors,
                                             policy_id).transform(raw_obs)
                    filtered_obs = _get_or_raise(obs_filters,
                                                 policy_id)(prep_obs)
                    episode._set_last_observation(agent_id, filtered_obs)
                    to_eval[policy_id].append(
                        PolicyEvalData(
                            env_id, agent_id, filtered_obs,
                            episode.last_info_for(agent_id) or {},
                            episode.rnn_state_for(agent_id),
                            np.zeros_like(
                                _flatten_action(
                                    policy.action_space.sample())), 0.0))

    return active_envs, to_eval, outputs
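The cut condition above emits a batch either when the episode ended and episodes are not being packed together, or once enough environment steps have accumulated. A minimal sketch of that predicate in isolation (the helper name is hypothetical; it is not part of the sampler):

def should_cut_batch(all_done, pack, count, unroll_length):
    """Illustrative predicate for the cut condition above: build and reset
    the batch when the episode is done and we are not packing multiple
    episodes into one, or when the requested batch size is reached."""
    return (all_done and not pack) or count >= unroll_length

assert should_cut_batch(all_done=True, pack=False, count=3, unroll_length=100)
assert not should_cut_batch(all_done=True, pack=True, count=3,
                            unroll_length=100)
assert should_cut_batch(all_done=False, pack=True, count=100,
                        unroll_length=100)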
def _initialize_loss(self):
    def fake_array(tensor):
        shape = tensor.shape.as_list()
        shape[0] = 1
        return np.zeros(shape, dtype=tensor.dtype.as_numpy_dtype)

    dummy_batch = {
        SampleBatch.CUR_OBS: fake_array(self._obs_input),
        SampleBatch.NEXT_OBS: fake_array(self._obs_input),
        SampleBatch.DONES: np.array([False], dtype=np.bool_),
        SampleBatch.ACTIONS: fake_array(
            ModelCatalog.get_action_placeholder(self.action_space)),
        SampleBatch.REWARDS: np.array([0], dtype=np.float32),
    }
    if self._obs_include_prev_action_reward:
        dummy_batch.update({
            SampleBatch.PREV_ACTIONS: fake_array(self._prev_action_input),
            SampleBatch.PREV_REWARDS: fake_array(self._prev_reward_input),
        })
    state_init = self.get_initial_state()
    for i, h in enumerate(state_init):
        dummy_batch["state_in_{}".format(i)] = np.expand_dims(h, 0)
        dummy_batch["state_out_{}".format(i)] = np.expand_dims(h, 0)
    if state_init:
        dummy_batch["seq_lens"] = np.array([1], dtype=np.int32)
    for k, v in self.extra_compute_action_fetches().items():
        dummy_batch[k] = fake_array(v)

    # postprocessing might depend on variable init, so run it first here
    self._sess.run(tf.global_variables_initializer())
    postprocessed_batch = self.postprocess_trajectory(
        SampleBatch(dummy_batch))

    if self._obs_include_prev_action_reward:
        batch_tensors = UsageTrackingDict({
            SampleBatch.PREV_ACTIONS: self._prev_action_input,
            SampleBatch.PREV_REWARDS: self._prev_reward_input,
            SampleBatch.CUR_OBS: self._obs_input,
        })
        loss_inputs = [
            (SampleBatch.PREV_ACTIONS, self._prev_action_input),
            (SampleBatch.PREV_REWARDS, self._prev_reward_input),
            (SampleBatch.CUR_OBS, self._obs_input),
        ]
    else:
        batch_tensors = UsageTrackingDict({
            SampleBatch.CUR_OBS: self._obs_input,
        })
        loss_inputs = [
            (SampleBatch.CUR_OBS, self._obs_input),
        ]

    for k, v in postprocessed_batch.items():
        if k in batch_tensors:
            continue
        elif v.dtype == np.object_:
            continue  # can't handle arbitrary objects in TF
        shape = (None, ) + v.shape[1:]
        dtype = np.float32 if v.dtype == np.float64 else v.dtype
        placeholder = tf.placeholder(dtype, shape=shape, name=k)
        batch_tensors[k] = placeholder

    if log_once("loss_init"):
        logger.info(
            "Initializing loss function with dummy input:\n\n{}\n".format(
                summarize(batch_tensors)))

    loss = self._do_loss_init(batch_tensors)
    for k in sorted(batch_tensors.accessed_keys):
        loss_inputs.append((k, batch_tensors[k]))

    TFPolicy._initialize_loss(self, loss, loss_inputs)
    if self._grad_stats_fn:
        self._stats_fetches.update(self._grad_stats_fn(self, self._grads))
    self._sess.run(tf.global_variables_initializer())
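The placeholder-creation loop above leaves the batch dimension dynamic and downcasts float64 columns to float32 before building a TF placeholder. A small sketch of just that shape/dtype rule, under the assumption that columns arrive as numpy arrays (the helper name is illustrative, not part of the policy class):

import numpy as np

def placeholder_spec(batch_value):
    """Sketch: derive the (shape, dtype) used for a new placeholder,
    mirroring the loop above: dynamic batch dim, float64 -> float32."""
    v = np.asarray(batch_value)
    shape = (None, ) + v.shape[1:]
    dtype = np.float32 if v.dtype == np.float64 else v.dtype
    return shape, dtype

assert placeholder_spec(np.zeros((5, 3))) == ((None, 3), np.float32)
assert placeholder_spec(np.zeros((5, ), dtype=np.int64)) == ((None, ),
                                                             np.int64)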
def _get_loss_inputs_dict(self, batch, shuffle):
    """Return a feed dict from a batch.

    Arguments:
        batch (SampleBatch): batch of data to derive inputs from
        shuffle (bool): whether to shuffle batch sequences. Shuffle may
            be done in-place. This only makes sense if you're further
            applying minibatch SGD after getting the outputs.

    Returns:
        feed dict of data
    """
    feed_dict = {}
    if self._batch_divisibility_req > 1:
        meets_divisibility_reqs = (
            len(batch[SampleBatch.CUR_OBS]) %
            self._batch_divisibility_req == 0
            and max(batch[SampleBatch.AGENT_INDEX]) == 0)  # not multiagent
    else:
        meets_divisibility_reqs = True

    # Simple case: not RNN nor do we need to pad
    if not self._state_inputs and meets_divisibility_reqs:
        if shuffle:
            batch.shuffle()
        for k, ph in self._loss_inputs:
            feed_dict[ph] = batch[k]
        return feed_dict

    if self._state_inputs:
        max_seq_len = self._max_seq_len
        dynamic_max = True
    else:
        max_seq_len = self._batch_divisibility_req
        dynamic_max = False

    # RNN or multi-agent case
    feature_keys = [k for k, v in self._loss_inputs]
    state_keys = [
        "state_in_{}".format(i) for i in range(len(self._state_inputs))
    ]
    feature_sequences, initial_states, seq_lens = chop_into_sequences(
        batch[SampleBatch.EPS_ID],
        batch[SampleBatch.UNROLL_ID],
        batch[SampleBatch.AGENT_INDEX],
        [batch[k] for k in feature_keys],
        [batch[k] for k in state_keys],
        max_seq_len,
        dynamic_max=dynamic_max,
        shuffle=shuffle)
    for k, v in zip(feature_keys, feature_sequences):
        feed_dict[self._loss_input_dict[k]] = v
    for k, v in zip(state_keys, initial_states):
        feed_dict[self._loss_input_dict[k]] = v
    feed_dict[self._seq_lens] = seq_lens

    if log_once("rnn_feed_dict"):
        logger.info("Padded input for RNN:\n\n{}\n".format(
            summarize({
                "features": feature_sequences,
                "initial_states": initial_states,
                "seq_lens": seq_lens,
                "max_seq_len": max_seq_len,
            })))

    return feed_dict
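In the RNN path, chop_into_sequences splits rows into per-episode sequences of at most max_seq_len and zero-pads them, and the loss then consumes the padded features together with seq_lens. A small standalone illustration of that padding contract, simplified to a single feature column (this is not the actual RLlib implementation):

import numpy as np

def pad_sequences(values, eps_ids, max_seq_len):
    """Sketch: group consecutive rows by episode id, cut each group into
    chunks of at most max_seq_len, and zero-pad every chunk to
    max_seq_len. Returns (padded_features, seq_lens)."""
    chunks, seq_lens = [], []
    start = 0
    for i in range(1, len(eps_ids) + 1):
        if i == len(eps_ids) or eps_ids[i] != eps_ids[start]:
            for j in range(start, i, max_seq_len):
                chunk = values[j:min(j + max_seq_len, i)]
                seq_lens.append(len(chunk))
                padded = np.zeros((max_seq_len, ) + np.shape(values)[1:])
                padded[:len(chunk)] = chunk
                chunks.append(padded)
            start = i
    return np.concatenate(chunks), np.array(seq_lens, dtype=np.int32)

feats = np.arange(5, dtype=np.float32).reshape(5, 1)
eps = [0, 0, 0, 1, 1]
padded, lens = pad_sequences(feats, eps, max_seq_len=2)
assert list(lens) == [2, 1, 2]   # episode 0 -> [2, 1], episode 1 -> [2]
assert padded.shape == (6, 1)    # 3 chunks x max_seq_len rows each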