def fn(query=None, **kwargs):
    """Run one TensorFlow session call for this operation.

    Maps the given keyword-argument values onto the graph's `*-input:0`
    placeholders, optionally extends the fetches with additional queried
    `*-output:0` tensors, and executes the monitored session.

    Args:
        query: Optional (possibly nested) structure of tensor names to fetch
            in addition to the operation's own results.
        kwargs: Input values; dict-valued entries are flattened one level
            (used for states, internals, actions).

    Returns:
        The fetched value(s), in the same structure as the fetches.

    Raises:
        TensorforceError: If a constructed feed/fetch name does not have the
            expected `-input:0` / `-output:0` suffix.
    """
    # Feed_dict dictionary
    feed_dict = dict()
    for key, arg in kwargs.items():
        if arg is None:
            continue
        elif isinstance(arg, dict):
            # Support single nesting (for states, internals, actions).
            # Fixed: use distinct loop variables instead of shadowing the
            # outer `key`/`arg`, which is error-prone and lint-flagged.
            for inner_key, inner_arg in arg.items():
                feed_dict[util.join_scopes(self.name, inner_key) + '-input:0'] = inner_arg
        else:
            feed_dict[util.join_scopes(self.name, key) + '-input:0'] = arg
    # Sanity check: every feed key must be a placeholder name.
    if not all(isinstance(x, str) and x.endswith('-input:0') for x in feed_dict):
        raise TensorforceError.unexpected()

    # Fetches value/tuple
    fetches = util.fmap(function=(lambda x: x.name), xs=results)
    if query is not None:
        # Additional tensors are to be fetched; `name` is a closure variable
        # from the enclosing scope (presumably the operation scope) — note it
        # differs from the `self.name` used for the feed keys above.
        query = util.fmap(
            function=(lambda x: util.join_scopes(name, x) + '-output:0'), xs=query
        )
        if util.is_iterable(x=fetches):
            fetches = tuple(fetches) + (query,)
        else:
            fetches = (fetches, query)
    # Sanity check: every fetch must be an output tensor name.
    if not util.reduce_all(
        predicate=(lambda x: isinstance(x, str) and x.endswith('-output:0')), xs=fetches
    ):
        raise TensorforceError.unexpected()

    # TensorFlow session call
    fetched = self.monitored_session.run(fetches=fetches, feed_dict=feed_dict)

    return fetched
def observe(self, reward, terminal=False, parallel=0, query=None, **kwargs):
    """
    Observes reward and whether a terminal state is reached, needs to be preceded by
    `act(...)`.

    Args:
        reward (float): Reward (<span style="color:#C00000"><b>required</b></span>).
        terminal (bool | 0 | 1 | 2): Whether a terminal state is reached or 2 if the
            episode was aborted (<span style="color:#00C000"><b>default</b></span>: false).
        parallel (int): Parallel execution index
            (<span style="color:#00C000"><b>default</b></span>: 0).
        query (list[str]): Names of tensors to retrieve
            (<span style="color:#00C000"><b>default</b></span>: none).
        kwargs: Additional input values, for instance, for dynamic hyperparameters.

    Returns:
        (bool, optional list[str]): Whether an update was performed, plus queried tensor
        values if requested.
    """
    assert util.reduce_all(predicate=util.not_nan_inf, xs=reward)
    # query is only supported for a single parallel interaction
    if query is not None and self.parallel_interactions > 1:
        raise TensorforceError.unexpected()

    # Normalize bool terminal to int (0/1); 2 denotes an aborted episode
    if isinstance(terminal, bool):
        terminal = int(terminal)

    # Update terminal/reward buffer at the current buffer position
    index = self.buffer_indices[parallel]
    self.terminal_buffers[parallel, index] = terminal
    self.reward_buffers[parallel, index] = reward
    index += 1

    if self.max_episode_timesteps is not None and index > self.max_episode_timesteps:
        raise TensorforceError.unexpected()

    # Flush the buffer on terminal, full buffer, or when tensors are queried
    if terminal > 0 or index == self.buffer_observe or query is not None:
        # Rebind terminal/reward to the buffered arrays for this episode chunk
        terminal = self.terminal_buffers[parallel, :index]
        reward = self.reward_buffers[parallel, :index]

        # Trace recording, if enabled and past the configured start episode
        if self.recorder_spec is not None and \
                self.episodes >= self.recorder_spec.get('start', 0):
            for name in self.states_spec:
                self.record_states[name].append(
                    np.array(self.states_buffers[name][parallel, :index])
                )
            for name, spec in self.actions_spec.items():
                self.record_actions[name].append(
                    np.array(self.actions_buffers[name][parallel, :index])
                )
                if spec['type'] == 'int':
                    # int actions carry an additional mask recorded as a state
                    self.record_states[name + '_mask'].append(
                        np.array(self.states_buffers[name + '_mask'][parallel, :index])
                    )
            self.record_terminal.append(np.array(terminal))
            self.record_reward.append(np.array(reward))

            if terminal[-1] > 0:
                self.num_episodes += 1

                # Write a trace file every `frequency` recorded episodes
                if self.num_episodes == self.recorder_spec.get('frequency', 1):
                    directory = self.recorder_spec['directory']
                    if os.path.isdir(directory):
                        files = sorted(
                            f for f in os.listdir(directory)
                            if os.path.isfile(os.path.join(directory, f))
                            and f.startswith('trace-')
                        )
                    else:
                        os.makedirs(directory)
                        files = list()
                    # Evict oldest traces to keep at most max-traces files
                    # (including the one about to be written)
                    max_traces = self.recorder_spec.get('max-traces')
                    if max_traces is not None and len(files) > max_traces - 1:
                        for filename in files[:-max_traces + 1]:
                            filename = os.path.join(directory, filename)
                            os.remove(filename)

                    filename = 'trace-{}-{}.npz'.format(
                        self.episodes, time.strftime('%Y%m%d-%H%M%S')
                    )
                    filename = os.path.join(directory, filename)
                    # Concatenate per-episode chunks into flat arrays for saving
                    self.record_states = util.fmap(
                        function=np.concatenate, xs=self.record_states, depth=1
                    )
                    self.record_actions = util.fmap(
                        function=np.concatenate, xs=self.record_actions, depth=1
                    )
                    self.record_terminal = np.concatenate(self.record_terminal)
                    self.record_reward = np.concatenate(self.record_reward)
                    np.savez_compressed(
                        filename, **self.record_states, **self.record_actions,
                        terminal=self.record_terminal, reward=self.record_reward
                    )
                    # Reset recording buffers
                    self.record_states = util.fmap(
                        function=(lambda x: list()), xs=self.record_states, depth=1
                    )
                    self.record_actions = util.fmap(
                        function=(lambda x: list()), xs=self.record_actions, depth=1
                    )
                    self.record_terminal = list()
                    self.record_reward = list()
                    self.num_episodes = 0

        # Model.observe()
        if query is None:
            updated, self.episodes, self.updates = self.model.observe(
                terminal=terminal, reward=reward, parallel=parallel, **kwargs
            )
        else:
            updated, self.episodes, self.updates, queried = self.model.observe(
                terminal=terminal, reward=reward, parallel=parallel, query=query, **kwargs
            )

        # Reset buffer index
        self.buffer_indices[parallel] = 0

    else:
        # Increment buffer index
        self.buffer_indices[parallel] = index
        updated = False

    if query is None:
        return updated
    else:
        return updated, queried
def act(
    self, states, parallel=0, deterministic=False, independent=False, evaluation=False,
    query=None, **kwargs
):
    """
    Returns action(s) for the given state(s), needs to be followed by `observe(...)` unless
    `independent` is true.

    Args:
        states (dict[state]): Dictionary containing state(s) to be acted on
            (<span style="color:#C00000"><b>required</b></span>).
        parallel (int): Parallel execution index
            (<span style="color:#00C000"><b>default</b></span>: 0).
        deterministic (bool): Whether to apply exploration and sampling
            (<span style="color:#00C000"><b>default</b></span>: false).
        independent (bool): Whether action is not remembered, and this call is thus not
            followed by observe (<span style="color:#00C000"><b>default</b></span>: false).
        evaluation (bool): Whether the agent is currently evaluated, implies and overwrites
            deterministic and independent
            (<span style="color:#00C000"><b>default</b></span>: false).
        query (list[str]): Names of tensors to retrieve
            (<span style="color:#00C000"><b>default</b></span>: none).
        kwargs: Additional input values, for instance, for dynamic hyperparameters.

    Returns:
        (dict[action], plus optional list[str]): Dictionary containing action(s), plus
        queried tensor values if requested.
    """
    assert util.reduce_all(predicate=util.not_nan_inf, xs=states)

    # self.current_internals = self.next_internals

    # evaluation implies deterministic + independent; passing them explicitly
    # alongside evaluation is rejected as redundant/conflicting
    if evaluation:
        if deterministic or independent:
            raise TensorforceError.unexpected()
        deterministic = independent = True

    # Auxiliaries: split off '<name>_mask' entries for int actions from states
    auxiliaries = OrderedDict()
    if isinstance(states, dict):
        states = dict(states)  # shallow copy so pop() does not mutate caller's dict
        for name, spec in self.actions_spec.items():
            if spec['type'] == 'int' and name + '_mask' in states:
                auxiliaries[name + '_mask'] = states.pop(name + '_mask')

    # Normalize states dictionary
    states = util.normalize_values(
        value_type='state', values=states, values_spec=self.states_spec
    )

    # Batch states: add a leading batch dimension of size 1
    states = util.fmap(function=(lambda x: np.asarray([x])), xs=states, depth=1)
    auxiliaries = util.fmap(function=(lambda x: np.asarray([x])), xs=auxiliaries, depth=1)

    # Model.act()
    if query is None:
        actions, self.timesteps = self.model.act(
            states=states, auxiliaries=auxiliaries, parallel=parallel,
            deterministic=deterministic, independent=independent, **kwargs
        )
    else:
        actions, self.timesteps, queried = self.model.act(
            states=states, auxiliaries=auxiliaries, parallel=parallel,
            deterministic=deterministic, independent=independent, query=query, **kwargs
        )

    # Record states/actions into the per-parallel buffers for later recording
    if self.recorder_spec is not None and not independent and \
            self.episodes >= self.recorder_spec.get('start', 0):
        index = self.buffer_indices[parallel]
        for name in self.states_spec:
            self.states_buffers[name][parallel, index] = states[name][0]
        for name, spec in self.actions_spec.items():
            self.actions_buffers[name][parallel, index] = actions[name][0]
            if spec['type'] == 'int':
                name = name + '_mask'
                if name in auxiliaries:
                    self.states_buffers[name][parallel, index] = auxiliaries[name][0]
                else:
                    # No mask given: record an all-true mask
                    # NOTE(review): leading (1,) dim when assigning to a single
                    # buffer slot relies on broadcasting — confirm intended.
                    shape = (1,) + spec['shape'] + (spec['num_values'],)
                    self.states_buffers[name][parallel, index] = np.full(
                        shape=shape, fill_value=True, dtype=util.np_dtype(dtype='bool')
                    )

    # Unbatch actions: drop the leading batch dimension
    actions = util.fmap(function=(lambda x: x[0]), xs=actions, depth=1)

    # Reverse normalized actions dictionary
    actions = util.unpack_values(
        value_type='action', values=actions, values_spec=self.actions_spec
    )

    # if independent, return processed state as well?
    if query is None:
        return actions
    else:
        return actions, queried
def experience(
    self, states, actions, terminal, reward, internals=None, query=None, **kwargs
):
    """
    Feed experience traces.

    Args:
        states (dict[array[state]]): Dictionary containing arrays of states
            (<span style="color:#C00000"><b>required</b></span>).
        actions (dict[array[action]]): Dictionary containing arrays of actions
            (<span style="color:#C00000"><b>required</b></span>).
        terminal (array[bool]): Array of terminals
            (<span style="color:#C00000"><b>required</b></span>).
        reward (array[float]): Array of rewards
            (<span style="color:#C00000"><b>required</b></span>).
        internals (dict[state]): Dictionary containing arrays of internal agent states
            (<span style="color:#00C000"><b>default</b></span>: no internal states).
        query (list[str]): Names of tensors to retrieve
            (<span style="color:#00C000"><b>default</b></span>: none).
        kwargs: Additional input values, for instance, for dynamic hyperparameters.
    """
    # Requires all act/observe buffers to be flushed first
    assert (self.buffer_indices == 0).all()
    assert util.reduce_all(predicate=util.not_nan_inf, xs=states)
    assert internals is None or util.reduce_all(predicate=util.not_nan_inf, xs=internals)
    assert util.reduce_all(predicate=util.not_nan_inf, xs=actions)
    assert util.reduce_all(predicate=util.not_nan_inf, xs=reward)

    # Auxiliaries: split off '<name>_mask' entries for int actions from states
    auxiliaries = OrderedDict()
    if isinstance(states, dict):
        for name, spec in self.actions_spec.items():
            if spec['type'] == 'int' and name + '_mask' in states:
                auxiliaries[name + '_mask'] = np.asarray(states.pop(name + '_mask'))
    auxiliaries = util.fmap(function=np.asarray, xs=auxiliaries, depth=1)

    # Normalize states/actions dictionaries
    states = util.normalize_values(
        value_type='state', values=states, values_spec=self.states_spec
    )
    actions = util.normalize_values(
        value_type='action', values=actions, values_spec=self.actions_spec
    )

    if internals is None:
        internals = OrderedDict()

    # A scalar terminal indicates a single timestep: wrap everything in lists
    if isinstance(terminal, (bool, int)):
        states = util.fmap(function=(lambda x: [x]), xs=states, depth=1)
        internals = util.fmap(function=(lambda x: [x]), xs=internals, depth=1)
        auxiliaries = util.fmap(function=(lambda x: [x]), xs=auxiliaries, depth=1)
        actions = util.fmap(function=(lambda x: [x]), xs=actions, depth=1)
        terminal = [terminal]
        reward = [reward]

    states = util.fmap(function=np.asarray, xs=states, depth=1)
    internals = util.fmap(function=np.asarray, xs=internals, depth=1)
    auxiliaries = util.fmap(function=np.asarray, xs=auxiliaries, depth=1)
    actions = util.fmap(function=np.asarray, xs=actions, depth=1)

    # Convert terminal to an integer array (bool -> 0/1)
    if isinstance(terminal, np.ndarray):
        # NOTE(review): `is` on dtype objects relies on NumPy's builtin-dtype
        # singletons; `==` would be the conventional comparison — confirm.
        if terminal.dtype is util.np_dtype(dtype='bool'):
            zeros = np.zeros_like(terminal, dtype=util.np_dtype(dtype='long'))
            ones = np.ones_like(terminal, dtype=util.np_dtype(dtype='long'))
            terminal = np.where(terminal, ones, zeros)
    else:
        terminal = np.asarray([int(x) if isinstance(x, bool) else x for x in terminal])
    reward = np.asarray(reward)

    # Batch experiences split into episodes and at most size buffer_observe
    last = 0
    for index in range(1, len(terminal) + 1):
        # Keep accumulating while non-terminal and below experience_size
        if terminal[index - 1] == 0 and index - last < self.experience_size:
            continue

        # Include terminal in batch if possible
        # NOTE(review): this condition requires index-last < experience_size,
        # which contradicts the negated continue-guard above when
        # terminal[index-1] == 0 — branch appears unreachable; confirm intent.
        if index < len(terminal) and terminal[index - 1] == 0 and terminal[index] > 0 and \
                index - last < self.experience_size:
            index += 1

        # Slice out the current batch [last:index)
        function = (lambda x: x[last: index])
        states_batch = util.fmap(function=function, xs=states, depth=1)
        internals_batch = util.fmap(function=function, xs=internals, depth=1)
        auxiliaries_batch = util.fmap(function=function, xs=auxiliaries, depth=1)
        actions_batch = util.fmap(function=function, xs=actions, depth=1)
        terminal_batch = terminal[last: index]
        reward_batch = reward[last: index]
        last = index

        # Model.experience()
        if query is None:
            self.timesteps, self.episodes, self.updates = self.model.experience(
                states=states_batch, internals=internals_batch,
                auxiliaries=auxiliaries_batch, actions=actions_batch,
                terminal=terminal_batch, reward=reward_batch, **kwargs
            )
        else:
            self.timesteps, self.episodes, self.updates, queried = self.model.experience(
                states=states_batch, internals=internals_batch,
                auxiliaries=auxiliaries_batch, actions=actions_batch,
                terminal=terminal_batch, reward=reward_batch, query=query, **kwargs
            )

    # Only the last batch's queried values are returned
    if query is not None:
        return queried
def act(self, states, internals=None, parallel=0, independent=False,
        deterministic=False, evaluation=False, query=None, **kwargs):
    """
    Returns action(s) for the given state(s), needs to be followed by `observe(...)` unless
    independent mode set via `independent`/`evaluation`.

    Args:
        states (dict[state] | iter[dict[state]]): Dictionary containing state(s) to be
            acted on (<span style="color:#C00000"><b>required</b></span>).
        internals (dict[internal] | iter[dict[internal]]): Dictionary containing current
            internal agent state(s)
            (<span style="color:#C00000"><b>required</b></span> if independent mode).
        parallel (int | iter[int]): Parallel execution index
            (<span style="color:#00C000"><b>default</b></span>: 0).
        independent (bool): Whether act is not part of the main agent-environment
            interaction, and this call is thus not followed by observe
            (<span style="color:#00C000"><b>default</b></span>: false).
        deterministic (bool): Ff independent mode, whether to act deterministically, so no
            exploration and sampling
            (<span style="color:#00C000"><b>default</b></span>: false).
        evaluation (bool): Whether the agent is currently evaluated, implies independent
            and deterministic (<span style="color:#00C000"><b>default</b></span>: false).
        query (list[str]): Names of tensors to retrieve
            (<span style="color:#00C000"><b>default</b></span>: none).
        kwargs: Additional input values, for instance, for dynamic hyperparameters.

    Returns:
        dict[action] | iter[dict[action]], if independent mode
        dict[internal] | iter[dict[internal]], plus optional list[str]: Dictionary
        containing action(s), dictionary containing next internal agent state(s) if
        independent mode, plus queried tensor values if requested.
    """
    assert util.reduce_all(predicate=util.not_nan_inf, xs=states)

    # evaluation implies deterministic + independent; passing them explicitly
    # alongside evaluation is rejected as conflicting
    if evaluation:
        if deterministic:
            raise TensorforceError.invalid(name='agent.act', argument='deterministic',
                                           condition='evaluation = true')
        if independent:
            raise TensorforceError.invalid(name='agent.act', argument='independent',
                                           condition='evaluation = true')
        deterministic = independent = True

    # internals/deterministic are only meaningful in independent mode
    if not independent:
        if internals is not None:
            raise TensorforceError.invalid(name='agent.act', argument='internals',
                                           condition='independent = false')
        if deterministic:
            raise TensorforceError.invalid(name='agent.act', argument='deterministic',
                                           condition='independent = false')

    if independent:
        # Remember whether internals were supplied, to decide the return shape
        internals_is_none = (internals is None)
        if internals_is_none:
            internals = OrderedDict()

    # Batch states: a non-int parallel argument signals batched inputs
    batched = (not isinstance(parallel, int))
    if batched:
        if len(parallel) == 0:
            raise TensorforceError.value(name='agent.act', argument='parallel',
                                         value=parallel, hint='zero-length')
        parallel = np.asarray(list(parallel))
        if isinstance(states[0], dict):
            # List of state dicts -> dict of stacked arrays
            states = OrderedDict(
                ((name, np.asarray(
                    [states[n][name] for n in range(len(parallel))])) for name in states[0]))
        else:
            states = np.asarray(states)
        if independent:
            internals = OrderedDict(
                ((name, np.asarray(
                    [internals[n][name] for n in range(len(parallel))]))
                 for name in internals[0]))
    else:
        # Single interaction: add a batch dimension of size 1
        parallel = np.asarray([parallel])
        states = util.fmap(function=(lambda x: np.asarray([x])), xs=states,
                           depth=int(isinstance(states, dict)))
        if independent:
            internals = util.fmap(function=(lambda x: np.asarray([x])), xs=internals,
                                  depth=1)

    # Enforce act/observe alternation for non-independent calls
    if not independent and not all(self.timestep_completed[n] for n in parallel):
        raise TensorforceError(
            message="Calling agent.act must be preceded by agent.observe.")

    # Auxiliaries: split off '<name>_mask' entries for int actions from states
    auxiliaries = OrderedDict()
    if isinstance(states, dict):
        states = dict(states)  # shallow copy so pop() does not mutate caller's dict
        for name, spec in self.actions_spec.items():
            if spec['type'] == 'int' and name + '_mask' in states:
                auxiliaries[name + '_mask'] = states.pop(name + '_mask')

    # Normalize states dictionary
    states = util.normalize_values(value_type='state', values=states,
                                   values_spec=self.states_spec)

    # Model.act()
    if independent:
        if query is None:
            actions, internals = self.model.independent_act(
                states=states, internals=internals, auxiliaries=auxiliaries,
                parallel=parallel, deterministic=deterministic, **kwargs)
        else:
            actions, internals, queried = self.model.independent_act(
                states=states, internals=internals, auxiliaries=auxiliaries,
                parallel=parallel, deterministic=deterministic, query=query, **kwargs)
    else:
        if query is None:
            actions, self.timesteps = self.model.act(
                states=states, auxiliaries=auxiliaries, parallel=parallel, **kwargs)
        else:
            actions, self.timesteps, queried = self.model.act(
                states=states, auxiliaries=auxiliaries, parallel=parallel, query=query,
                **kwargs)

    # Mark these parallel slots as awaiting observe()
    if not independent:
        for n in parallel:
            self.timestep_completed[n] = False

    # Record states/actions into the per-parallel buffers for later recording
    if self.recorder_spec is not None and not independent and \
            self.episodes >= self.recorder_spec.get('start', 0):
        for n in range(len(parallel)):
            index = self.buffer_indices[parallel[n]]
            for name in self.states_spec:
                self.states_buffers[name][parallel[n], index] = states[name][n]
            for name, spec in self.actions_spec.items():
                self.actions_buffers[name][parallel[n], index] = actions[name][n]
                if spec['type'] == 'int':
                    name = name + '_mask'
                    if name in auxiliaries:
                        self.states_buffers[name][
                            parallel[n], index] = auxiliaries[name][n]
                    else:
                        # No mask given: record an all-true mask
                        # NOTE(review): leading (1,) dim when assigning to a
                        # single buffer slot relies on broadcasting — confirm.
                        shape = (1, ) + spec['shape'] + (spec['num_values'], )
                        self.states_buffers[name][parallel[n], index] = np.full(
                            shape=shape, fill_value=True,
                            dtype=util.np_dtype(dtype='bool'))

    # Reverse normalized actions dictionary
    actions = util.unpack_values(value_type='action', values=actions,
                                 values_spec=self.actions_spec)

    # Unbatch actions
    if batched:
        if isinstance(actions, dict):
            # dict of stacked arrays -> list of per-interaction action dicts
            actions = [
                OrderedDict(((name, actions[name][n]) for name in actions))
                for n in range(len(parallel))
            ]
    else:
        actions = util.fmap(function=(lambda x: x[0]), xs=actions,
                            depth=int(isinstance(actions, dict)))
        if independent:
            internals = util.fmap(function=(lambda x: x[0]), xs=internals, depth=1)

    # Internals are only returned if the caller supplied them
    if independent and not internals_is_none:
        if query is None:
            return actions, internals
        else:
            return actions, internals, queried
    else:
        if query is None:
            return actions
        else:
            return actions, queried
def observe(self, reward, terminal=False, parallel=0, query=None, **kwargs):
    """
    Observes reward and whether a terminal state is reached, needs to be preceded by
    `act(...)`.

    Args:
        reward (float | iter[float]): Reward
            (<span style="color:#C00000"><b>required</b></span>).
        terminal (bool | 0 | 1 | 2 | iter[...]): Whether a terminal state is reached or 2
            if the episode was aborted
            (<span style="color:#00C000"><b>default</b></span>: false).
        parallel (int, iter[int]): Parallel execution index
            (<span style="color:#00C000"><b>default</b></span>: 0).
        query (list[str]): Names of tensors to retrieve
            (<span style="color:#00C000"><b>default</b></span>: none).
        kwargs: Additional input values, for instance, for dynamic hyperparameters.

    Returns:
        (bool | int, optional list[str]): Whether an update was performed, plus queried
        tensor values if requested.
    """
    assert util.reduce_all(predicate=util.not_nan_inf, xs=reward)
    # query is only supported for a single parallel interaction
    if query is not None and self.parallel_interactions > 1:
        raise TensorforceError.invalid(
            name='agent.observe', argument='query',
            condition='parallel_interactions > 1')

    # A non-int parallel argument signals batched inputs
    batched = (not isinstance(parallel, int))
    if batched:
        if len(parallel) == 0:
            raise TensorforceError.value(name='agent.observe', argument='parallel',
                                         value=parallel, hint='zero-length')
        if query is not None:
            raise TensorforceError.invalid(name='agent.observe', argument='query',
                                           condition='len(parallel) > 1')
    else:
        # Wrap single interaction in lists for the uniform loop below
        terminal = [terminal]
        reward = [reward]
        parallel = [parallel]

    # Enforce act/observe alternation (act() sets timestep_completed to False)
    if any(self.timestep_completed[n] for n in parallel):
        raise TensorforceError(
            message="Calling agent.observe must be preceded by agent.act.")

    num_updates = 0
    # TODO: Differently if not buffer_observe
    for terminal, reward, parallel in zip(terminal, reward, parallel):
        # Update terminal/reward buffer; bool terminal is normalized to 0/1
        if isinstance(terminal, bool):
            terminal = int(terminal)
        index = self.buffer_indices[parallel]
        self.terminal_buffers[parallel, index] = terminal
        self.reward_buffers[parallel, index] = reward
        index += 1
        self.buffer_indices[parallel] = index

        if self.max_episode_timesteps is not None and \
                index > self.max_episode_timesteps:
            raise TensorforceError.value(
                name='agent.observe', argument='index', value=index,
                condition='> max_episode_timesteps')

        # Flush buffer on terminal, full buffer, or when tensors are queried
        if terminal > 0 or index == self.buffer_observe or query is not None:
            self.timestep_completed[parallel] = True
            if query is None:
                updated = self.model_observe(parallel=parallel, **kwargs)
            else:
                updated, queried = self.model_observe(parallel=parallel, query=query,
                                                      **kwargs)
        else:
            # Buffer index already advanced above; just mark timestep complete
            self.timestep_completed[parallel] = True
            updated = False
        num_updates += int(updated)

    # Batched calls return the update count, single calls a bool
    if batched:
        updated = num_updates
    else:
        assert num_updates <= 1
        updated = (num_updates == 1)

    if query is None:
        return updated
    else:
        return updated, queried