def __init__(
    self, *, states, actions, l2_regularization, parallel_interactions, config, saver,
    summarizer, tracking
):
    # Initialize global registries
    setattr(Module, '_MODULE_STACK', list())
    setattr(Layer, '_REGISTERED_LAYERS', OrderedDict())

    # Tensorforce config
    self._config = config

    Module._MODULE_STACK.clear()
    Module._MODULE_STACK.append(self.__class__)

    super().__init__(
        device=self._config.device, l2_regularization=l2_regularization, name=self._config.name
    )

    assert self.l2_regularization is not None
    self.is_trainable = True
    self.is_saved = True

    # Keep track of tensor names to check for collisions
    self.value_names = set()

    # Terminal specification
    self.terminal_spec = TensorSpec(type='int', shape=(), num_values=3)
    self.value_names.add('terminal')

    # Reward specification
    self.reward_spec = TensorSpec(type='float', shape=())
    self.value_names.add('reward')

    # Parallel specification
    self.parallel_spec = TensorSpec(type='int', shape=(), num_values=parallel_interactions)
    self.value_names.add('parallel')

    # Deterministic specification
    self.deterministic_spec = TensorSpec(type='bool', shape=())
    self.value_names.add('deterministic')

    # State space specification
    self.states_spec = states
    for name, spec in self.states_spec.items():
        name = ('' if name is None else ' ' + name)
        if spec.type != 'float':
            continue
        elif spec.min_value is None:
            logging.warning("No min_value bound specified for state{}.".format(name))
        elif np.isinf(spec.min_value).any():
            logging.warning("Infinite min_value bound for state{}.".format(name))
        elif spec.max_value is None:
            logging.warning("No max_value bound specified for state{}.".format(name))
        elif np.isinf(spec.max_value).any():
            logging.warning("Infinite max_value bound for state{}.".format(name))

    # Check for name collisions
    if self.states_spec.is_singleton():
        if 'state' in self.value_names:
            raise TensorforceError.exists(name='value name', value=name)
        self.value_names.add('state')
    else:
        for name in self.states_spec:
            if name in self.value_names:
                raise TensorforceError.exists(name='value name', value=name)
            self.value_names.add(name)

    # Action space specification
    self.actions_spec = actions
    for name, spec in self.actions_spec.items():
        name = ('' if name is None else ' ' + name)
        if spec.type != 'float':
            continue
        elif spec.min_value is None:
            logging.warning("No min_value specified for action{}.".format(name))
        elif np.isinf(spec.min_value).any():
            raise TensorforceError("Infinite min_value bound for action{}.".format(name))
        elif spec.max_value is None:
            logging.warning("No max_value specified for action{}.".format(name))
        elif np.isinf(spec.max_value).any():
            raise TensorforceError("Infinite max_value bound for action{}.".format(name))

    # Check for name collisions
    if self.actions_spec.is_singleton():
        if 'action' in self.value_names:
            raise TensorforceError.exists(name='value name', value=name)
        self.value_names.add('action')
    else:
        for name in self.actions_spec:
            if name in self.value_names:
                raise TensorforceError.exists(name='value name', value=name)
            self.value_names.add(name)

    # Internal state space specification
    self.internals_spec = TensorsSpec()
    self.initial_internals = ArrayDict()

    # Auxiliary value space specification
    self.auxiliaries_spec = TensorsSpec()
    for name, spec in self.actions_spec.items():
        if self.config.enable_int_action_masking and spec.type == 'int' and \
                spec.num_values is not None:
            self.auxiliaries_spec[name] = TensorsSpec(mask=TensorSpec(
                type='bool', shape=(spec.shape + (spec.num_values,))
            ))

    # Parallel interactions
    assert isinstance(parallel_interactions, int) and parallel_interactions >= 1
    self.parallel_interactions = parallel_interactions

    # Saver
    if isinstance(saver, str):
        saver = dict(directory=saver)
    if saver is None:
        self.saver = None
    elif not all(
        key in (
            'directory', 'filename', 'frequency', 'load', 'max_checkpoints',
            'max_hour_frequency', 'unit'
        ) for key in saver
    ):
        raise TensorforceError.value(
            name='agent', argument='saver', value=list(saver),
            hint='not from {directory,filename,frequency,load,max_checkpoints,'
                 'max_hour_frequency,unit}'
        )
    elif 'directory' not in saver:
        raise TensorforceError.required(name='agent', argument='saver[directory]')
    else:
        self.saver = dict(saver)

    # Summarizer
    if isinstance(summarizer, str):
        summarizer = dict(directory=summarizer)
    if summarizer is None:
        self.summarizer = None
        self.summaries = frozenset()
    elif not all(
        key in ('directory', 'filename', 'flush', 'max_summaries', 'summaries')
        for key in summarizer
    ):
        raise TensorforceError.value(
            name='agent', argument='summarizer', value=list(summarizer),
            hint='not from {directory,filename,flush,max_summaries,summaries}'
        )
    elif 'directory' not in summarizer:
        raise TensorforceError.required(name='agent', argument='summarizer[directory]')
    else:
        self.summarizer = dict(summarizer)

        # Summary labels
        summaries = summarizer.get('summaries')
        if summaries is None or summaries == 'all':
            self.summaries = 'all'
        elif not all(isinstance(label, str) for label in summaries):
            raise TensorforceError.value(
                name='agent', argument='summarizer[summaries]', value=summaries
            )
        else:
            self.summaries = frozenset(summaries)

    # Tracking
    if tracking is None:
        self.tracking = frozenset()
    elif tracking == 'all':
        self.tracking = 'all'
    else:
        self.tracking = frozenset(tracking)
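# Hypothetical usage sketch (not part of this module; agent type, network and `environment` are
# placeholder assumptions): the `saver` and `summarizer` arguments validated above accept either a
# plain directory string or a dict restricted to the key sets checked in the elif-chains.
#
#   from tensorforce import Agent
#
#   agent = Agent.create(
#       agent='ppo', environment=environment,
#       saver=dict(directory='checkpoints', frequency=100, max_checkpoints=5),
#       summarizer=dict(directory='summaries', summaries='all'),
#   )
#
# Passing `saver='checkpoints'` is shorthand for `saver=dict(directory='checkpoints')`, mirroring
# the `isinstance(saver, str)` normalization above.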
def _process_states_input(self, states, function_name):
    if self.states_spec.is_singleton() and not isinstance(states, dict) and \
            not (util.is_iterable(x=states) and isinstance(states[0], dict)):
        # Single state
        states = np.asarray(states)
        if states.shape == self.states_spec.value().shape:
            # Single state is not batched
            states = ArrayDict(singleton=np.expand_dims(states, axis=0))
            batched = False
            num_instances = 1
            is_iter_of_dicts = None
        else:
            # Single state is batched, iter[state]
            assert states.shape[1:] == self.states_spec.value().shape
            assert type(states) in (tuple, list, np.ndarray)
            num_instances = states.shape[0]
            states = ArrayDict(singleton=states)
            batched = True
            is_iter_of_dicts = True  # Default

    elif util.is_iterable(x=states):
        # States is batched, iter[dict[state]]
        batched = True
        num_instances = len(states)
        is_iter_of_dicts = True
        assert type(states) in (tuple, list)
        if num_instances == 0:
            raise TensorforceError.value(
                name=function_name, argument='len(states)', value=num_instances, hint='= 0'
            )
        for n, state in enumerate(states):
            if not isinstance(state, dict):
                raise TensorforceError.type(
                    name=function_name, argument='states[{}]'.format(n), dtype=type(state),
                    hint='is not dict'
                )
        # Turn iter of dicts into dict of arrays
        # (Doesn't use self.states_spec since states also contains auxiliaries)
        states = [ArrayDict(state) for state in states]
        states = states[0].fmap(
            function=(lambda *xs: np.stack(xs, axis=0)), zip_values=states[1:]
        )

    elif isinstance(states, dict):
        # States is dict, turn into arrays
        states = ArrayDict(states)
        name, spec = self.states_spec.item()
        if name is None:
            name = 'state'
        if states[name].shape == spec.shape:
            # States is not batched, dict[state]
            states = states.fmap(function=(lambda state: np.expand_dims(state, axis=0)))
            batched = False
            num_instances = 1
            is_iter_of_dicts = None
        else:
            # States is batched, dict[iter[state]]
            assert states[name].shape[1:] == spec.shape
            assert type(states[name]) in (tuple, list, np.ndarray)
            batched = True
            num_instances = states[name].shape[0]
            is_iter_of_dicts = False
            if num_instances == 0:
                raise TensorforceError.value(
                    name=function_name, argument='len(states)', value=num_instances, hint='= 0'
                )

    else:
        raise TensorforceError.type(
            name=function_name, argument='states', dtype=type(states),
            hint='is not array/tuple/list/dict'
        )

    # Check number of inputs
    if any(state.shape[0] != num_instances for state in states.values()):
        raise TensorforceError.value(
            name=function_name, argument='len(states)',
            value=[state.shape[0] for state in states.values()], hint='inconsistent'
        )

    return states, batched, num_instances, is_iter_of_dicts
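# Illustrative sketch of the input layouts accepted by _process_states_input() above, assuming a
# hypothetical agent with a single float state of shape (4,); the trailing comments show the
# returned (batched, is_iter_of_dicts) values for each layout:
#
#   import numpy as np
#
#   single = np.zeros(shape=(4,))                          # batched=False, is_iter_of_dicts=None
#   batch_of_arrays = np.zeros(shape=(3, 4))               # batched=True,  is_iter_of_dicts=True
#   batch_of_dicts = [dict(state=np.zeros(shape=(4,))) for _ in range(3)]
#                                                          # batched=True,  is_iter_of_dicts=True
#   dict_of_batches = dict(state=np.zeros(shape=(3, 4)))   # batched=True,  is_iter_of_dicts=False
#
# In each case num_instances is the leading dimension (1 for the unbatched input), and the states
# are returned as an ArrayDict of batched arrays.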
def act(self, states, internals=None, parallel=0, independent=False, deterministic=True, **kwargs):
    # Independent and internals
    is_internals_none = (internals is None)
    if independent:
        if parallel != 0:
            raise TensorforceError.invalid(
                name='Agent.act', argument='parallel', condition='independent is true'
            )
        if is_internals_none and len(self.internals_spec) > 0:
            raise TensorforceError.required(
                name='Agent.act', argument='internals', condition='independent is true'
            )
    else:
        if not is_internals_none:
            raise TensorforceError.invalid(
                name='Agent.act', argument='internals', condition='independent is false'
            )

    # Process states input and infer batching structure
    states, batched, num_parallel, is_iter_of_dicts = self._process_states_input(
        states=states, function_name='Agent.act'
    )

    if independent:
        # Independent mode: handle internals argument
        if is_internals_none:
            # Default input internals=None
            pass
        elif is_iter_of_dicts or isinstance(internals, (tuple, list)):
            # Input structure iter[dict[internal]]
            if not isinstance(internals, (tuple, list)):
                raise TensorforceError.type(
                    name='Agent.act', argument='internals', dtype=type(internals),
                    hint='is not tuple/list'
                )
            internals = [ArrayDict(internal) for internal in internals]
            internals = internals[0].fmap(
                function=(lambda *xs: np.stack(xs, axis=0)), zip_values=internals[1:]
            )
        else:
            # Input structure dict[iter[internal]]
            if not isinstance(internals, dict):
                raise TensorforceError.type(
                    name='Agent.act', argument='internals', dtype=type(internals),
                    hint='is not dict'
                )
            internals = ArrayDict(internals)

        if not independent or not is_internals_none:
            # Expand inputs if not batched
            if not batched:
                internals = internals.fmap(function=(lambda x: np.expand_dims(x, axis=0)))
            # Check number of inputs
            for name, internal in internals.items():
                if internal.shape[0] != num_parallel:
                    raise TensorforceError.value(
                        name='Agent.act', argument='len(internals[{}])'.format(name),
                        value=internal.shape[0], hint='!= len(states)'
                    )

    else:
        # Non-independent mode: handle parallel input
        if batched:
            # Batched input
            parallel = np.asarray(parallel)
        elif parallel == 0:
            # Default input parallel=0
            if batched:
                assert num_parallel == self.parallel_interactions
                parallel = np.asarray(list(range(num_parallel)))
            else:
                parallel = np.asarray([parallel])
        else:
            # Expand input if not batched
            parallel = np.asarray([parallel])

        # Check number of inputs
        if parallel.shape[0] != num_parallel:
            raise TensorforceError.value(
                name='Agent.act', argument='len(parallel)', value=len(parallel),
                hint='!= len(states)'
            )

    # If not independent, check whether previous timesteps were completed
    if not independent:
        if not self.timestep_completed[parallel].all():
            raise TensorforceError(
                message="Calling agent.act must be preceded by agent.observe for training, or "
                        "agent.act argument 'independent' must be passed as True."
            )
        self.timestep_completed[parallel] = False

    # Buffer inputs for recording
    if self.recorder is not None and not independent and \
            self.num_episodes >= self.recorder.get('start', 0):
        for n in range(num_parallel):
            for name in self.states_spec:
                self.buffers['states'][name][parallel[n]].append(states[name][n])

    # fn_act()
    if self._is_agent:
        actions, internals = self.fn_act(
            states=states, internals=internals, parallel=parallel, independent=independent,
            deterministic=deterministic, is_internals_none=is_internals_none,
            num_parallel=num_parallel
        )
    else:
        if batched:
            assert False
        else:
            states = states.fmap(function=(lambda x: x[0].item() if x.shape == (1,) else x[0]))
            actions = self.fn_act(states.to_kwargs())
            if self.actions_spec.is_singleton():
                actions = ArrayDict(singleton=np.asarray([actions]))
            else:
                actions = ArrayDict(actions)
                actions = actions.fmap(function=(lambda x: np.asarray([x])))

    # Buffer outputs for recording
    if self.recorder is not None and not independent and \
            self.num_episodes >= self.recorder.get('start', 0):
        for n in range(num_parallel):
            for name in self.actions_spec:
                self.buffers['actions'][name][parallel[n]].append(actions[name][n])

    # Unbatch actions
    if batched:
        # If inputs were batched, turn dict of lists into list of dicts
        function = (lambda x: x.item() if x.shape == () else x)
        # TODO: recursive
        if self.actions_spec.is_singleton():
            actions = actions.singleton()
            if is_iter_of_dicts:
                actions = [function(actions[n]) for n in range(num_parallel)]
        else:
            if is_iter_of_dicts:
                actions = [
                    OrderedDict(((name, function(x[n])) for name, x in actions.items()))
                    for n in range(num_parallel)
                ]
            else:
                actions = OrderedDict(actions.items())
        if independent and not is_internals_none:
            if is_iter_of_dicts:
                # TODO: recursive
                internals = [
                    OrderedDict(((name, function(x[n])) for name, x in internals.items()))
                    for n in range(num_parallel)
                ]
            else:
                internals = OrderedDict(internals.items())
    else:
        # If inputs were not batched, unbatch outputs
        function = (lambda x: x.item() if x.shape == (1,) else x[0])
        if self.actions_spec.is_singleton():
            actions = function(actions.singleton())
        else:
            actions = actions.fmap(function=function, cls=OrderedDict)
        if independent and not is_internals_none:
            internals = internals.fmap(function=function, cls=OrderedDict)

    if independent and not is_internals_none:
        return actions, internals
    else:
        return actions
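# Illustrative usage sketch (environment handling is assumed, not defined in this module): act()
# above is either paired with observe() for training, or called with independent=True (plus
# internals, if the agent has any) for inference without a subsequent observe().
#
#   states = environment.reset()
#   terminal = False
#   while not terminal:
#       actions = agent.act(states=states)                  # must be followed by observe()
#       states, terminal, reward = environment.execute(actions=actions)
#       agent.observe(terminal=terminal, reward=reward)
#
#   # Independent (evaluation) mode: no observe(), internals threaded through explicitly
#   internals = agent.initial_internals()
#   actions, internals = agent.act(states=states, internals=internals, independent=True)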
def pretrain(self, directory, num_iterations, num_traces=1, num_updates=1, extension='.npz'):
    """
    Simple pretraining approach as a combination of `experience()` and `update`, akin to
    behavioral cloning, using experience traces obtained e.g. via recording agent interactions
    ([see documentation](https://tensorforce.readthedocs.io/en/latest/basics/features.html#record-pretrain)).

    For the given number of iterations, load the given number of trace files (which each contain
    recorder[frequency] episodes), feed the experience to the agent's internal memory, and
    subsequently trigger the given number of updates (which will use the experience in the
    internal memory, fed in this or potentially previous iterations).

    See the [record-and-pretrain script](https://github.com/tensorforce/tensorforce/blob/master/examples/record_and_pretrain.py)
    for an example application.

    Args:
        directory (path): Directory with experience traces, e.g. obtained via recorder; episode
            length has to be consistent with agent configuration
            (<span style="color:#C00000"><b>required</b></span>).
        num_iterations (int > 0): Number of iterations consisting of loading new traces and
            performing multiple updates
            (<span style="color:#C00000"><b>required</b></span>).
        num_traces (int > 0): Number of traces to load per iteration; has to at least satisfy the
            update batch size
            (<span style="color:#00C000"><b>default</b></span>: 1).
        num_updates (int > 0): Number of updates per iteration
            (<span style="color:#00C000"><b>default</b></span>: 1).
        extension (str): Traces file extension to filter the given directory for
            (<span style="color:#00C000"><b>default</b></span>: ".npz").
    """
    if not os.path.isdir(directory):
        raise TensorforceError.value(
            name='agent.pretrain', argument='directory', value=directory
        )
    files = sorted(
        os.path.join(directory, f) for f in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, f)) and os.path.splitext(f)[1] == extension
    )
    indices = list(range(len(files)))

    for _ in range(num_iterations):
        shuffle(indices)
        if num_traces is None:
            selection = indices
        else:
            selection = indices[:num_traces]

        batch = None
        for index in selection:
            trace = ArrayDict(np.load(files[index]))
            if batch is None:
                batch = trace
            else:
                batch = batch.fmap(
                    function=(lambda x, y: np.concatenate([x, y], axis=0)), zip_values=(trace,)
                )

        for name, value in batch.pop('auxiliaries', dict()).items():
            assert name.endswith('/mask')
            batch['states'][name[:-5] + '_mask'] = value

        self.experience(**batch.to_kwargs())
        for _ in range(num_updates):
            self.update()
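# Illustrative usage sketch (agent and environment construction are assumed, not shown here):
# traces written via the recorder argument can later be replayed through pretrain(), as in the
# record-and-pretrain example referenced in the docstring.
#
#   recording_agent = Agent.create(
#       agent='ppo', environment=environment,
#       recorder=dict(directory='traces', frequency=1),     # record every episode
#   )
#   # ... run recording_agent against the environment to fill 'traces' ...
#
#   pretrained_agent = Agent.create(agent='ppo', environment=environment)
#   pretrained_agent.pretrain(directory='traces', num_iterations=30, num_traces=5, num_updates=10)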
def experience(self, states, actions, terminal, reward, internals=None):
    """
    Feed experience traces.

    See the [act-experience-update script](https://github.com/tensorforce/tensorforce/blob/master/examples/act_experience_update_interface.py)
    for an example application as part of the act-experience-update interface, which is an
    alternative to the act-observe interaction pattern.

    Args:
        states (dict[array[state]]): Dictionary containing arrays of states
            (<span style="color:#C00000"><b>required</b></span>).
        actions (dict[array[action]]): Dictionary containing arrays of actions
            (<span style="color:#C00000"><b>required</b></span>).
        terminal (array[bool]): Array of terminals
            (<span style="color:#C00000"><b>required</b></span>).
        reward (array[float]): Array of rewards
            (<span style="color:#C00000"><b>required</b></span>).
        internals (dict[state]): Dictionary containing arrays of internal agent states
            (<span style="color:#C00000"><b>required</b></span> if agent has internal states).
    """
    if not all(len(buffer) == 0 for buffer in self.terminal_buffer):
        raise TensorforceError(
            message="Calling agent.experience is not possible mid-episode."
        )

    # Process states input and infer batching structure
    states, batched, num_instances, is_iter_of_dicts = self._process_states_input(
        states=states, function_name='Agent.experience'
    )

    if is_iter_of_dicts:
        # Input structure iter[dict[input]]

        # Internals
        if internals is None:
            internals = ArrayDict(self.initial_internals())
            internals = internals.fmap(function=(lambda x: np.repeat(
                np.expand_dims(x, axis=0), repeats=num_instances, axis=0
            )))
        elif not isinstance(internals, (tuple, list)):
            raise TensorforceError.type(
                name='Agent.experience', argument='internals', dtype=type(internals),
                hint='is not tuple/list'
            )
        else:
            internals = [ArrayDict(internal) for internal in internals]
            internals = internals[0].fmap(
                function=(lambda *xs: np.stack(xs, axis=0)), zip_values=internals[1:]
            )

        # Actions
        if isinstance(actions, np.ndarray):
            actions = ArrayDict(singleton=actions)
        elif not isinstance(actions, (tuple, list)):
            raise TensorforceError.type(
                name='Agent.experience', argument='actions', dtype=type(actions),
                hint='is not tuple/list'
            )
        elif not isinstance(actions[0], dict):
            actions = ArrayDict(singleton=np.asarray(actions))
        else:
            actions = [ArrayDict(action) for action in actions]
            actions = actions[0].fmap(
                function=(lambda *xs: np.stack(xs, axis=0)), zip_values=actions[1:]
            )

    else:
        # Input structure dict[iter[input]]

        # Internals
        if internals is None:
            internals = ArrayDict(self.initial_internals())
            internals = internals.fmap(function=(lambda x: np.tile(
                np.expand_dims(x, axis=0), reps=(num_instances,)
            )))
        elif not isinstance(internals, dict):
            raise TensorforceError.type(
                name='Agent.experience', argument='internals', dtype=type(internals),
                hint='is not dict'
            )
        else:
            internals = ArrayDict(internals)

        # Actions
        if not isinstance(actions, np.ndarray):
            actions = ArrayDict(singleton=actions)
        elif not isinstance(actions, dict):
            raise TensorforceError.type(
                name='Agent.experience', argument='actions', dtype=type(actions),
                hint='is not dict'
            )
        else:
            actions = ArrayDict(actions)

    # Expand inputs if not batched
    if not batched:
        internals = internals.fmap(function=(lambda x: np.expand_dims(x, axis=0)))
        actions = actions.fmap(function=(lambda x: np.expand_dims(x, axis=0)))
        terminal = np.asarray([terminal])
        reward = np.asarray([reward])
    else:
        terminal = np.asarray(terminal)
        reward = np.asarray(reward)

    # Check number of inputs
    for name, internal in internals.items():
        if internal.shape[0] != num_instances:
            raise TensorforceError.value(
                name='Agent.experience', argument='len(internals[{}])'.format(name),
                value=internal.shape[0], hint='!= len(states)'
            )
    for name, action in actions.items():
        if action.shape[0] != num_instances:
            raise TensorforceError.value(
                name='Agent.experience', argument='len(actions[{}])'.format(name),
                value=action.shape[0], hint='!= len(states)'
            )
    if terminal.shape[0] != num_instances:
        raise TensorforceError.value(
            name='Agent.experience', argument='len(terminal)', value=terminal.shape[0],
            hint='!= len(states)'
        )
    if reward.shape[0] != num_instances:
        raise TensorforceError.value(
            name='Agent.experience', argument='len(reward)', value=reward.shape[0],
            hint='!= len(states)'
        )

    def function(name, spec):
        auxiliary = ArrayDict()
        if self.config.enable_int_action_masking and spec.type == 'int' and \
                spec.num_values is not None:
            if name is None:
                name = 'action'
            # Mask, either part of states or default all true
            auxiliary['mask'] = states.pop(name + '_mask', np.ones(
                shape=(num_instances,) + spec.shape + (spec.num_values,), dtype=spec.np_type()
            ))
        return auxiliary

    auxiliaries = self.actions_spec.fmap(function=function, cls=ArrayDict, with_names=True)
    if self.states_spec.is_singleton() and not states.is_singleton():
        states[None] = states.pop('state')

    # Convert terminal to int if necessary
    if terminal.dtype is util.np_dtype(dtype='bool'):
        zeros = np.zeros_like(terminal, dtype=util.np_dtype(dtype='int'))
        ones = np.ones_like(terminal, dtype=util.np_dtype(dtype='int'))
        terminal = np.where(terminal, ones, zeros)

    if terminal[-1] == 0:
        raise TensorforceError(
            message="Agent.experience() requires full episodes as input."
        )

    # Batch experiences split into episodes and at most size buffer_observe
    last = 0
    for index in range(1, len(terminal) + 1):
        if terminal[index - 1] == 0:
            continue

        function = (lambda x: x[last:index])
        states_batch = states.fmap(function=function)
        internals_batch = internals.fmap(function=function)
        auxiliaries_batch = auxiliaries.fmap(function=function)
        actions_batch = actions.fmap(function=function)
        terminal_batch = function(terminal)
        reward_batch = function(reward)
        last = index

        # Inputs to tensors
        states_batch = self.states_spec.to_tensor(
            value=states_batch, batched=True, name='Agent.experience states'
        )
        internals_batch = self.internals_spec.to_tensor(
            value=internals_batch, batched=True, recover_empty=True,
            name='Agent.experience internals'
        )
        auxiliaries_batch = self.auxiliaries_spec.to_tensor(
            value=auxiliaries_batch, batched=True, name='Agent.experience auxiliaries'
        )
        actions_batch = self.actions_spec.to_tensor(
            value=actions_batch, batched=True, name='Agent.experience actions'
        )
        terminal_batch = self.terminal_spec.to_tensor(
            value=terminal_batch, batched=True, name='Agent.experience terminal'
        )
        reward_batch = self.reward_spec.to_tensor(
            value=reward_batch, batched=True, name='Agent.experience reward'
        )

        # Model.experience()
        timesteps, episodes = self.model.experience(
            states=states_batch, internals=internals_batch, auxiliaries=auxiliaries_batch,
            actions=actions_batch, terminal=terminal_batch, reward=reward_batch
        )
        self.timesteps = timesteps.numpy().item()
        self.episodes = episodes.numpy().item()

    if self.model.saver is not None:
        self.model.save()
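# Illustrative sketch of the act-experience-update interface referenced in the docstring above
# (episode collection is simplified; `agent` and `environment` are assumed to exist):
#
#   episode_states, episode_actions = list(), list()
#   episode_terminal, episode_reward = list(), list()
#   states = environment.reset()
#   internals = agent.initial_internals()
#   terminal = False
#   while not terminal:
#       episode_states.append(states)
#       actions, internals = agent.act(states=states, internals=internals, independent=True)
#       episode_actions.append(actions)
#       states, terminal, reward = environment.execute(actions=actions)
#       episode_terminal.append(terminal)
#       episode_reward.append(reward)
#
#   # experience() requires full episodes, i.e. the last terminal entry must be true
#   agent.experience(
#       states=episode_states, actions=episode_actions,
#       terminal=episode_terminal, reward=episode_reward
#   )
#   agent.update()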
def internals_init(self):
    return ArrayDict()
def pretrain(self, directory, num_iterations, num_traces=1, num_updates=1, extension='.npz'):
    """
    Naive pretraining approach as a combination of `experience()` and `update`, uses experience
    traces obtained e.g. via recorder argument.

    See [record-and-pretrain script](https://github.com/tensorforce/tensorforce/blob/master/examples/record_and_pretrain.py)
    for illustrative example.

    Args:
        directory (path): Directory with experience traces, e.g. obtained via recorder; episode
            length has to be consistent with agent configuration
            (<span style="color:#C00000"><b>required</b></span>).
        num_iterations (int > 0): Number of iterations consisting of loading new traces and
            performing multiple updates
            (<span style="color:#C00000"><b>required</b></span>).
        num_traces (int > 0): Number of traces to load per iteration; has to at least satisfy the
            update batch size
            (<span style="color:#00C000"><b>default</b></span>: 1).
        num_updates (int > 0): Number of updates per iteration
            (<span style="color:#00C000"><b>default</b></span>: 1).
        extension (str): Traces file extension to filter the given directory for
            (<span style="color:#00C000"><b>default</b></span>: ".npz").
    """
    if not os.path.isdir(directory):
        raise TensorforceError.value(
            name='agent.pretrain', argument='directory', value=directory
        )
    files = sorted(
        os.path.join(directory, f) for f in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, f)) and os.path.splitext(f)[1] == extension
    )
    indices = list(range(len(files)))

    for _ in range(num_iterations):
        shuffle(indices)
        if num_traces is None:
            selection = indices
        else:
            selection = indices[:num_traces]

        # function = (lambda x: list())
        # values = ListDict()
        # values['states'] = self.states_spec.fmap(function=function, cls=ListDict)
        # values['auxiliaries'] = self.auxiliaries_spec.fmap(function=function, cls=ListDict)
        # values['actions'] = self.actions_spec.fmap(function=function, cls=ListDict)
        # values['terminal'] = list()
        # values['reward'] = list()
        batch = None
        for index in selection:
            trace = ArrayDict(np.load(files[index]))
            if batch is None:
                batch = trace
            else:
                batch = batch.fmap(
                    function=(lambda x, y: np.concatenate([x, y], axis=0)), zip_values=(trace,)
                )

        for name, value in batch.pop('auxiliaries', dict()).items():
            assert name.endswith('/mask')
            batch['states'][name[:-5] + '_mask'] = value

        # values = values.fmap(function=np.concatenate, cls=ArrayDict)
        self.experience(**batch.to_kwargs())
        for _ in range(num_updates):
            self.update()
def act(
    self, states, internals=None, parallel=0, independent=False,
    # Deprecated
    deterministic=None, evaluation=None
):
    """
    Returns action(s) for the given state(s), needs to be followed by `observe()` unless
    independent mode.

    Args:
        states (dict[state] | iter[dict[state]]): Dictionary containing state(s) to be acted on
            (<span style="color:#C00000"><b>required</b></span>).
        internals (dict[internal] | iter[dict[internal]]): Dictionary containing current internal
            agent state(s), either given by `initial_internals()` at the beginning of an episode
            or as return value of the preceding `act()` call
            (<span style="color:#C00000"><b>required</b></span> if independent mode and agent has
            internal states).
        parallel (int | iter[int]): Parallel execution index
            (<span style="color:#00C000"><b>default</b></span>: 0).
        independent (bool): Whether act is not part of the main agent-environment interaction,
            and this call is thus not followed by observe
            (<span style="color:#00C000"><b>default</b></span>: false).

    Returns:
        dict[action] | iter[dict[action]], dict[internal] | iter[dict[internal]] if `internals`
        argument given: Dictionary containing action(s), plus dictionary containing next internal
        agent state(s) if independent mode.
    """
    if deterministic is not None:
        raise TensorforceError.deprecated(
            name='Agent.act', argument='deterministic', replacement='independent'
        )
    if evaluation is not None:
        raise TensorforceError.deprecated(
            name='Agent.act', argument='evaluation', replacement='independent'
        )

    # Independent and internals
    if independent:
        if parallel != 0:
            raise TensorforceError.invalid(
                name='Agent.act', argument='parallel', condition='independent is true'
            )
        is_internals_none = (internals is None)
        if is_internals_none and len(self.internals_spec) > 0:
            raise TensorforceError.required(
                name='Agent.act', argument='internals', condition='independent is true'
            )
    else:
        if internals is not None:
            raise TensorforceError.invalid(
                name='Agent.act', argument='internals', condition='independent is false'
            )

    # Process states input and infer batching structure
    states, batched, num_parallel, is_iter_of_dicts, input_type = self._process_states_input(
        states=states, function_name='Agent.act'
    )

    if independent:
        # Independent mode: handle internals argument
        if is_internals_none:
            # Default input internals=None
            pass
        elif is_iter_of_dicts:
            # Input structure iter[dict[internal]]
            if not isinstance(internals, (tuple, list)):
                raise TensorforceError.type(
                    name='Agent.act', argument='internals', dtype=type(internals),
                    hint='is not tuple/list'
                )
            internals = [ArrayDict(internal) for internal in internals]
            internals = internals[0].fmap(
                function=(lambda *xs: np.stack(xs, axis=0)), zip_values=internals[1:]
            )
        else:
            # Input structure dict[iter[internal]]
            if not isinstance(internals, dict):
                raise TensorforceError.type(
                    name='Agent.act', argument='internals', dtype=type(internals),
                    hint='is not dict'
                )
            internals = ArrayDict(internals)

        if not independent or not is_internals_none:
            # Expand inputs if not batched
            if not batched:
                internals = internals.fmap(function=(lambda x: np.expand_dims(x, axis=0)))
            # Check number of inputs
            for name, internal in internals.items():
                if internal.shape[0] != num_parallel:
                    raise TensorforceError.value(
                        name='Agent.act', argument='len(internals[{}])'.format(name),
                        value=internal.shape[0], hint='!= len(states)'
                    )

    else:
        # Non-independent mode: handle parallel input
        if parallel == 0:
            # Default input parallel=0
            if batched:
                assert num_parallel == self.parallel_interactions
                parallel = np.asarray(list(range(num_parallel)))
            else:
                parallel = np.asarray([parallel])
        elif batched:
            # Batched input
            parallel = np.asarray(parallel)
        else:
            # Expand input if not batched
            parallel = np.asarray([parallel])

        # Check number of inputs
        if parallel.shape[0] != num_parallel:
            raise TensorforceError.value(
                name='Agent.act', argument='len(parallel)', value=len(parallel),
                hint='!= len(states)'
            )

    def function(name, spec):
        auxiliary = ArrayDict()
        if self.config.enable_int_action_masking and spec.type == 'int' and \
                spec.num_values is not None:
            # Mask, either part of states or default all true
            auxiliary['mask'] = states.pop(name + '_mask', np.ones(
                shape=(num_parallel,) + spec.shape + (spec.num_values,), dtype=spec.np_type()
            ))
        return auxiliary

    auxiliaries = self.actions_spec.fmap(function=function, cls=ArrayDict, with_names=True)

    # If not independent, check whether previous timesteps were completed
    if not independent:
        if not self.timestep_completed[parallel].all():
            raise TensorforceError(
                message="Calling agent.act must be preceded by agent.observe."
            )
        self.timestep_completed[parallel] = False

    # Buffer inputs for recording
    if self.recorder_spec is not None and not independent and \
            self.episodes >= self.recorder_spec.get('start', 0):
        for n in range(num_parallel):
            for name in self.states_spec:
                self.buffers['states'][name][parallel[n]].append(states[name][n])
            for name in self.auxiliaries_spec:
                self.buffers['auxiliaries'][name][parallel[n]].append(auxiliaries[name][n])

    # Inputs to tensors
    states = self.states_spec.to_tensor(value=states, batched=True)
    if independent and not is_internals_none:
        internals = self.internals_spec.to_tensor(value=internals, batched=True)
    auxiliaries = self.auxiliaries_spec.to_tensor(value=auxiliaries, batched=True)
    parallel_tensor = self.parallel_spec.to_tensor(value=parallel, batched=True)

    # Model.act()
    if not independent:
        actions, timesteps = self.model.act(
            states=states, auxiliaries=auxiliaries, parallel=parallel_tensor
        )
        self.timesteps = timesteps.numpy().item()
    elif len(self.internals_spec) > 0:
        if len(self.auxiliaries_spec) > 0:
            actions_internals = self.model.independent_act(
                states=states, internals=internals, auxiliaries=auxiliaries
            )
        else:
            assert len(auxiliaries) == 0
            actions_internals = self.model.independent_act(states=states, internals=internals)
        actions_internals = TensorDict(actions_internals)
        actions = actions_internals['actions']
        internals = actions_internals['internals']
    else:
        if len(self.auxiliaries_spec) > 0:
            actions = self.model.independent_act(states=states, auxiliaries=auxiliaries)
        else:
            assert len(auxiliaries) == 0
            actions = self.model.independent_act(states=states)
        actions = TensorDict(actions)

    # Outputs from tensors
    actions = self.actions_spec.from_tensor(tensor=actions, batched=True)

    # Buffer outputs for recording
    if self.recorder_spec is not None and not independent and \
            self.episodes >= self.recorder_spec.get('start', 0):
        for n in range(num_parallel):
            for name in self.actions_spec:
                self.buffers['actions'][name][parallel[n]].append(actions[name][n])

    # Unbatch actions
    if batched:
        # If inputs were batched, turn dict of arrays into list of dicts
        function = (lambda x: x.item() if x.shape == () else x)
        if self.single_action:
            actions = input_type(function(actions['action'][n]) for n in range(num_parallel))
        else:
            # TODO: recursive
            actions = input_type(
                OrderedDict(((name, function(x[n])) for name, x in actions.items()))
                for n in range(num_parallel)
            )
        if independent and not is_internals_none and is_iter_of_dicts:
            # TODO: recursive
            internals = input_type(
                OrderedDict(((name, function(x[n])) for name, x in internals.items()))
                for n in range(num_parallel)
            )
    else:
        # If inputs were not batched, unbatch outputs
        function = (lambda x: x.item() if x.shape == (1,) else x[0])
        if self.single_action:
            actions = function(actions['action'])
        else:
            actions = actions.fmap(function=function, cls=OrderedDict)
        if independent and not is_internals_none:
            internals = internals.fmap(function=function, cls=OrderedDict)

    if self.model.saver is not None:
        self.model.save()

    if independent and not is_internals_none:
        return actions, internals
    else:
        return actions
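# Illustrative sketch of int-action masking as consumed by the nested function above (the state
# and action names are assumptions for a hypothetical agent with a dict state and an int action
# 'move' with num_values=4): when config.enable_int_action_masking is enabled, a boolean
# '<action-name>_mask' entry passed alongside the states is popped and forwarded as the auxiliary
# mask; otherwise an all-true default mask is used.
#
#   states = dict(
#       observation=np.zeros(shape=(8,)),                   # regular state input
#       move_mask=np.asarray([True, False, True, True]),    # action value 1 currently invalid
#   )
#   actions = agent.act(states=states)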