def make_demo_buffer(
    pair_infos: List[AgentInfoActionPairProto],
    behavior_spec: BehaviorSpec,
    sequence_length: int,
) -> AgentBuffer:
    # Create and populate buffer using experiences
    demo_raw_buffer = AgentBuffer()
    demo_processed_buffer = AgentBuffer()
    for idx, current_pair_info in enumerate(pair_infos):
        # The last pair has no successor, so stop one entry early.
        if idx > len(pair_infos) - 2:
            break
        next_pair_info = pair_infos[idx + 1]
        current_decision_step, current_terminal_step = steps_from_proto(
            [current_pair_info.agent_info], behavior_spec
        )
        next_decision_step, next_terminal_step = steps_from_proto(
            [next_pair_info.agent_info], behavior_spec
        )
        # Zero action of the right shape for the first step; otherwise the
        # action recorded at the previous pair.
        previous_action = (
            np.array(pair_infos[idx].action_info.vector_actions, dtype=np.float32) * 0
        )
        if idx > 0:
            previous_action = np.array(
                pair_infos[idx - 1].action_info.vector_actions, dtype=np.float32
            )
        # A terminal step at idx + 1 means this experience ends an episode.
        next_done = len(next_terminal_step) == 1
        if len(next_terminal_step) == 1:
            next_reward = next_terminal_step.reward[0]
        else:
            next_reward = next_decision_step.reward[0]
        if len(current_terminal_step) == 1:
            current_obs = list(current_terminal_step.values())[0].obs
        else:
            current_obs = list(current_decision_step.values())[0].obs

        demo_raw_buffer["done"].append(next_done)
        demo_raw_buffer["rewards"].append(next_reward)
        split_obs = SplitObservations.from_observations(current_obs)
        for i, obs in enumerate(split_obs.visual_observations):
            demo_raw_buffer["visual_obs%d" % i].append(obs)
        demo_raw_buffer["vector_obs"].append(split_obs.vector_observations)
        demo_raw_buffer["actions"].append(current_pair_info.action_info.vector_actions)
        demo_raw_buffer["prev_action"].append(previous_action)
        # Flush each completed episode into the processed buffer as padded
        # sequences of training_length.
        if next_done:
            demo_raw_buffer.resequence_and_append(
                demo_processed_buffer, batch_size=None, training_length=sequence_length
            )
            demo_raw_buffer.reset_agent()
    demo_raw_buffer.resequence_and_append(
        demo_processed_buffer, batch_size=None, training_length=sequence_length
    )
    return demo_processed_buffer
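# --- Usage sketch (not part of the original source) ---
# A minimal sketch of how make_demo_buffer is typically driven: in
# mlagents.trainers.demo_loader, load_demonstration reads a .demo file and
# returns the behavior spec, the recorded AgentInfoActionPair protos, and the
# expected pair count. The file path below is hypothetical.
behavior_spec, pair_infos, _ = load_demonstration("demos/expert.demo")
demo_buffer = make_demo_buffer(pair_infos, behavior_spec, sequence_length=1)
print(demo_buffer.num_experiences)  # one experience per consecutive pair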
def _append_to_update_buffer(self, agentbuffer_trajectory: AgentBuffer) -> None:
    """
    Append an AgentBuffer to the update buffer.
    If the trainer isn't training, don't update to avoid a memory leak.
    """
    if self.should_still_train:
        seq_len = (
            self.trainer_settings.network_settings.memory.sequence_length
            if self.trainer_settings.network_settings.memory is not None
            else 1
        )
        agentbuffer_trajectory.resequence_and_append(
            self.update_buffer, training_length=seq_len
        )
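# Usage sketch (assumed context, not from the original source): in the RL
# trainers this method is fed the AgentBuffer built from a completed
# Trajectory; to_agentbuffer() is the converter in
# mlagents.trainers.trajectory, and `trajectory` is assumed to come from the
# trainer's _process_trajectory hook.
agent_buffer_trajectory = trajectory.to_agentbuffer()
self._append_to_update_buffer(agent_buffer_trajectory)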
def make_demo_buffer(
    pair_infos: List[AgentInfoActionPairProto],
    group_spec: AgentGroupSpec,
    sequence_length: int,
) -> AgentBuffer:
    # Create and populate buffer using experiences
    demo_raw_buffer = AgentBuffer()
    demo_processed_buffer = AgentBuffer()
    for idx, current_pair_info in enumerate(pair_infos):
        if idx > len(pair_infos) - 2:
            break
        next_pair_info = pair_infos[idx + 1]
        current_step_info = batched_step_result_from_proto(
            [current_pair_info.agent_info], group_spec
        )
        next_step_info = batched_step_result_from_proto(
            [next_pair_info.agent_info], group_spec
        )
        previous_action = (
            np.array(pair_infos[idx].action_info.vector_actions, dtype=np.float32) * 0
        )
        if idx > 0:
            previous_action = np.array(
                pair_infos[idx - 1].action_info.vector_actions, dtype=np.float32
            )
        curr_agent_id = current_step_info.agent_id[0]
        current_agent_step_info = current_step_info.get_agent_step_result(
            curr_agent_id
        )
        next_agent_id = next_step_info.agent_id[0]
        next_agent_step_info = next_step_info.get_agent_step_result(next_agent_id)

        demo_raw_buffer["done"].append(next_agent_step_info.done)
        demo_raw_buffer["rewards"].append(next_agent_step_info.reward)
        split_obs = SplitObservations.from_observations(current_agent_step_info.obs)
        for i, obs in enumerate(split_obs.visual_observations):
            demo_raw_buffer["visual_obs%d" % i].append(obs)
        demo_raw_buffer["vector_obs"].append(split_obs.vector_observations)
        demo_raw_buffer["actions"].append(current_pair_info.action_info.vector_actions)
        demo_raw_buffer["prev_action"].append(previous_action)
        if next_step_info.done:
            demo_raw_buffer.resequence_and_append(
                demo_processed_buffer, batch_size=None, training_length=sequence_length
            )
            demo_raw_buffer.reset_agent()
    demo_raw_buffer.resequence_and_append(
        demo_processed_buffer, batch_size=None, training_length=sequence_length
    )
    return demo_processed_buffer
def create_buffer(brain_infos, brain_params, sequence_length, memory_size=8):
    buffer = AgentBuffer()
    update_buffer = AgentBuffer()
    # Make a buffer
    for idx, experience in enumerate(brain_infos):
        if idx > len(brain_infos) - 2:
            break
        current_brain_info = experience
        next_brain_info = brain_infos[idx + 1]
        buffer.last_brain_info = current_brain_info
        buffer["done"].append(next_brain_info.local_done[0])
        buffer["rewards"].append(next_brain_info.rewards[0])
        # Test fixture: the current frame is reused for the "next" observation
        # fields, since only the shapes matter here.
        for i in range(brain_params.number_visual_observations):
            buffer["visual_obs%d" % i].append(
                current_brain_info.visual_observations[i][0]
            )
            buffer["next_visual_obs%d" % i].append(
                current_brain_info.visual_observations[i][0]
            )
        if brain_params.vector_observation_space_size > 0:
            buffer["vector_obs"].append(current_brain_info.vector_observations[0])
            buffer["next_vector_in"].append(current_brain_info.vector_observations[0])
        # Discrete spaces store one size per branch; continuous stores the
        # action dimension in the first entry.
        fake_action_size = len(brain_params.vector_action_space_size)
        if brain_params.vector_action_space_type == "continuous":
            fake_action_size = brain_params.vector_action_space_size[0]
        buffer["actions"].append(np.zeros(fake_action_size, dtype=np.float32))
        buffer["prev_action"].append(np.zeros(fake_action_size, dtype=np.float32))
        buffer["masks"].append(1.0)
        buffer["advantages"].append(1.0)
        if brain_params.vector_action_space_type == "discrete":
            buffer["action_probs"].append(
                np.ones(sum(brain_params.vector_action_space_size), dtype=np.float32)
            )
        else:
            buffer["action_probs"].append(
                np.ones(buffer["actions"][0].shape, dtype=np.float32)
            )
        buffer["actions_pre"].append(
            np.ones(buffer["actions"][0].shape, dtype=np.float32)
        )
        buffer["action_mask"].append(
            np.ones(np.sum(brain_params.vector_action_space_size), dtype=np.float32)
        )
        buffer["memory"].append(np.ones(memory_size, dtype=np.float32))
    buffer.resequence_and_append(
        update_buffer, batch_size=None, training_length=sequence_length
    )
    return update_buffer
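# Usage sketch (assumptions noted inline, not from the original source): the
# returned update buffer can be sampled the way the trainers sample it;
# sample_mini_batch is the AgentBuffer helper in mlagents.trainers.buffer, and
# brain_infos/brain_params are assumed to be the mock objects this test helper
# is normally paired with.
update_buffer = create_buffer(brain_infos, brain_params, sequence_length=4)
# 32 // 4 = 8 sequences of length 4 are drawn, preserving sequence boundaries.
mini_batch = update_buffer.sample_mini_batch(batch_size=32, sequence_length=4)
assert len(mini_batch["actions"]) == 32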
def make_demo_buffer(
    pair_infos: List[AgentInfoActionPairProto],
    brain_params: BrainParameters,
    sequence_length: int,
) -> AgentBuffer:
    # Create and populate buffer using experiences
    demo_raw_buffer = AgentBuffer()
    demo_processed_buffer = AgentBuffer()
    for idx, experience in enumerate(pair_infos):
        if idx > len(pair_infos) - 2:
            break
        current_pair_info = pair_infos[idx]
        next_pair_info = pair_infos[idx + 1]
        current_brain_info = BrainInfo.from_agent_proto(
            0, [current_pair_info.agent_info], brain_params
        )
        next_brain_info = BrainInfo.from_agent_proto(
            0, [next_pair_info.agent_info], brain_params
        )
        previous_action = (
            np.array(pair_infos[idx].action_info.vector_actions, dtype=np.float32) * 0
        )
        if idx > 0:
            previous_action = np.array(
                pair_infos[idx - 1].action_info.vector_actions, dtype=np.float32
            )
        demo_raw_buffer["done"].append(next_brain_info.local_done[0])
        demo_raw_buffer["rewards"].append(next_brain_info.rewards[0])
        for i in range(brain_params.number_visual_observations):
            demo_raw_buffer["visual_obs%d" % i].append(
                current_brain_info.visual_observations[i][0]
            )
        if brain_params.vector_observation_space_size > 0:
            demo_raw_buffer["vector_obs"].append(
                current_brain_info.vector_observations[0]
            )
        demo_raw_buffer["actions"].append(current_pair_info.action_info.vector_actions)
        demo_raw_buffer["prev_action"].append(previous_action)
        if next_brain_info.local_done[0]:
            demo_raw_buffer.resequence_and_append(
                demo_processed_buffer, batch_size=None, training_length=sequence_length
            )
            demo_raw_buffer.reset_agent()
    demo_raw_buffer.resequence_and_append(
        demo_processed_buffer, batch_size=None, training_length=sequence_length
    )
    return demo_processed_buffer
def make_demo_buffer(
    pair_infos: List[AgentInfoActionPairProto],
    behavior_spec: BehaviorSpec,
    sequence_length: int,
) -> AgentBuffer:
    # Create and populate buffer using experiences
    demo_raw_buffer = AgentBuffer()
    demo_processed_buffer = AgentBuffer()
    for idx, current_pair_info in enumerate(pair_infos):
        # The last pair has no successor, so stop one entry early.
        if idx > len(pair_infos) - 2:
            break
        next_pair_info = pair_infos[idx + 1]
        current_decision_step, current_terminal_step = steps_from_proto(
            [current_pair_info.agent_info], behavior_spec
        )
        next_decision_step, next_terminal_step = steps_from_proto(
            [next_pair_info.agent_info], behavior_spec
        )
        previous_action = (
            np.array(
                pair_infos[idx].action_info.vector_actions_deprecated, dtype=np.float32
            )
            * 0
        )
        if idx > 0:
            previous_action = np.array(
                pair_infos[idx - 1].action_info.vector_actions_deprecated,
                dtype=np.float32,
            )
        # A terminal step at idx + 1 means this experience ends an episode.
        next_done = len(next_terminal_step) == 1
        if len(next_terminal_step) == 1:
            next_reward = next_terminal_step.reward[0]
        else:
            next_reward = next_decision_step.reward[0]
        if len(current_terminal_step) == 1:
            current_obs = list(current_terminal_step.values())[0].obs
        else:
            current_obs = list(current_decision_step.values())[0].obs

        demo_raw_buffer["done"].append(next_done)
        demo_raw_buffer["rewards"].append(next_reward)
        for i, obs in enumerate(current_obs):
            demo_raw_buffer[ObsUtil.get_name_at(i)].append(obs)
        # Older demo files only populate the deprecated flat action array;
        # route it to whichever action space the spec declares. Newer files
        # carry the continuous and discrete actions separately.
        if (
            len(current_pair_info.action_info.continuous_actions) == 0
            and len(current_pair_info.action_info.discrete_actions) == 0
        ):
            if behavior_spec.action_spec.continuous_size > 0:
                demo_raw_buffer["continuous_action"].append(
                    current_pair_info.action_info.vector_actions_deprecated
                )
            else:
                demo_raw_buffer["discrete_action"].append(
                    current_pair_info.action_info.vector_actions_deprecated
                )
        else:
            if behavior_spec.action_spec.continuous_size > 0:
                demo_raw_buffer["continuous_action"].append(
                    current_pair_info.action_info.continuous_actions
                )
            if behavior_spec.action_spec.discrete_size > 0:
                demo_raw_buffer["discrete_action"].append(
                    current_pair_info.action_info.discrete_actions
                )
        demo_raw_buffer["prev_action"].append(previous_action)
        if next_done:
            demo_raw_buffer.resequence_and_append(
                demo_processed_buffer, batch_size=None, training_length=sequence_length
            )
            demo_raw_buffer.reset_agent()
    demo_raw_buffer.resequence_and_append(
        demo_processed_buffer, batch_size=None, training_length=sequence_length
    )
    return demo_processed_buffer
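# Usage sketch for the hybrid-action version above (the path and printouts are
# illustrative, not from the original source): whether the .demo file carries
# split continuous/discrete actions or only the deprecated flat array, the
# resulting buffer exposes one field per declared action space.
behavior_spec, pair_infos, _ = load_demonstration("demos/expert.demo")
demo_buffer = make_demo_buffer(pair_infos, behavior_spec, sequence_length=1)
if behavior_spec.action_spec.continuous_size > 0:
    print(len(demo_buffer["continuous_action"]))
if behavior_spec.action_spec.discrete_size > 0:
    print(len(demo_buffer["discrete_action"]))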