def __init__(
    # Environment
    self, states, actions, max_episode_timesteps=None,
    # TensorFlow etc
    parallel_interactions=1, buffer_observe=True, seed=None, recorder=None
):
    """Initialize state/action specs, parallel observe buffers and optional recorder.

    Args:
        states (specification): States specification, validated/normalized via
            `util.valid_values_spec`.
        actions (specification): Actions specification, validated/normalized via
            `util.valid_values_spec`.
        max_episode_timesteps (int): Upper bound on timesteps per episode, or None
            if unbounded.
        parallel_interactions (int > 0): Number of parallel interactions supported.
        buffer_observe (bool | int > 0): Whether/how many timesteps to buffer before
            internal observe operations.
        seed (int): Random seed for `random` and NumPy (optional).
        recorder (dict): Recorder spec; allowed keys are 'directory', 'frequency',
            'max-traces' (optional).

    Raises:
        TensorforceError: On invalid argument types/values or state/action
            name collisions.
    """
    # Subclasses must define their `spec` before delegating to this constructor.
    assert hasattr(self, 'spec')

    if seed is not None:
        assert isinstance(seed, int)
        random.seed(a=seed)
        np.random.seed(seed=seed)

    # States/actions specification
    self.states_spec = util.valid_values_spec(
        values_spec=states, value_type='state', return_normalized=True
    )
    self.actions_spec = util.valid_values_spec(
        values_spec=actions, value_type='action', return_normalized=True
    )
    self.max_episode_timesteps = max_episode_timesteps

    # Check for name overlap between states and actions.
    for name in self.states_spec:
        if name in self.actions_spec:
            # Fix: the error object was previously constructed but never
            # raised, silently accepting colliding names.
            raise TensorforceError.collision(
                name='name', value=name, group1='states', group2='actions'
            )

    # Parallel episodes
    if isinstance(parallel_interactions, int):
        if parallel_interactions <= 0:
            raise TensorforceError.value(
                name='parallel_interactions', value=parallel_interactions
            )
        self.parallel_interactions = parallel_interactions
    else:
        raise TensorforceError.type(name='parallel_interactions', value=parallel_interactions)

    # Buffer observe: bool selects a default size, int is an explicit size.
    # NOTE: isinstance(x, bool) must be tested before isinstance(x, int),
    # since bool is a subclass of int.
    if isinstance(buffer_observe, bool):
        if not buffer_observe and self.parallel_interactions > 1:
            raise TensorforceError.unexpected()
        if self.max_episode_timesteps is None and self.parallel_interactions > 1:
            raise TensorforceError.unexpected()
        if not buffer_observe:
            self.buffer_observe = 1
        elif self.max_episode_timesteps is None:
            self.buffer_observe = 100
        else:
            self.buffer_observe = self.max_episode_timesteps
    elif isinstance(buffer_observe, int):
        if buffer_observe <= 0:
            raise TensorforceError.value(name='buffer_observe', value=buffer_observe)
        if self.parallel_interactions > 1:
            raise TensorforceError.unexpected()
        if self.max_episode_timesteps is None:
            self.buffer_observe = buffer_observe
        else:
            self.buffer_observe = min(buffer_observe, self.max_episode_timesteps)
    else:
        raise TensorforceError.type(name='buffer_observe', value=buffer_observe)

    # Parallel terminal/reward buffers (np.ndarray allocates uninitialized
    # memory; buffer_indices below tracks how much of each row is valid).
    self.terminal_buffers = np.ndarray(
        shape=(self.parallel_interactions, self.buffer_observe),
        dtype=util.np_dtype(dtype='long')
    )
    self.reward_buffers = np.ndarray(
        shape=(self.parallel_interactions, self.buffer_observe),
        dtype=util.np_dtype(dtype='float')
    )

    # Parallel buffer indices (current fill level per parallel interaction).
    self.buffer_indices = np.zeros(
        shape=(self.parallel_interactions,), dtype=util.np_dtype(dtype='int')
    )

    self.timesteps = 0
    self.episodes = 0
    self.updates = 0

    # Recorder: validate that only known keys are present.
    if recorder is None:
        pass
    elif not all(key in ('directory', 'frequency', 'max-traces') for key in recorder):
        raise TensorforceError.value(name='recorder', value=list(recorder))
    self.recorder_spec = recorder

    if self.recorder_spec is not None:
        self.record_states = OrderedDict(((name, list()) for name in self.states_spec))
        # Int actions additionally record their action masks.
        for name, spec in self.actions_spec.items():
            if spec['type'] == 'int':
                self.record_states[name + '_mask'] = list()
        self.record_actions = OrderedDict(((name, list()) for name in self.actions_spec))
        self.record_terminal = list()
        self.record_reward = list()
        self.num_episodes = 0
def __init__(
    # Environment
    self, states, actions, max_episode_timesteps=None,
    # TensorFlow etc
    parallel_interactions=1, buffer_observe=True, seed=None, recorder=None
):
    """Initialize state/action specs, buffering configuration and recorder spec.

    Args:
        states (specification): States specification, validated/normalized via
            `util.valid_values_spec`.
        actions (specification): Actions specification, validated/normalized via
            `util.valid_values_spec`.
        max_episode_timesteps (int): Upper bound on timesteps per episode, or None
            if unbounded.
        parallel_interactions (int > 0): Number of parallel interactions supported.
        buffer_observe (bool | int > 0): Whether/how many timesteps to buffer before
            internal observe operations.
        seed (int): Random seed for `random` and NumPy (optional).
        recorder (dict): Recorder spec; allowed keys are 'directory', 'frequency',
            'max-traces', 'start' (optional).

    Raises:
        TensorforceError: On invalid argument types/values or state/action
            name collisions.
    """
    # Subclasses must define their `spec` before delegating to this constructor.
    assert hasattr(self, 'spec')

    if seed is not None:
        assert isinstance(seed, int)
        random.seed(a=seed)
        np.random.seed(seed=seed)

    # States/actions specification
    self.states_spec = util.valid_values_spec(
        values_spec=states, value_type='state', return_normalized=True
    )
    self.actions_spec = util.valid_values_spec(
        values_spec=actions, value_type='action', return_normalized=True
    )
    self.max_episode_timesteps = max_episode_timesteps

    # Check for name overlap between states and actions.
    for name in self.states_spec:
        if name in self.actions_spec:
            # Fix: the error object was previously constructed but never
            # raised, silently accepting colliding names.
            raise TensorforceError.collision(
                name='name', value=name, group1='states', group2='actions'
            )

    # Parallel episodes
    if isinstance(parallel_interactions, int):
        if parallel_interactions <= 0:
            raise TensorforceError.value(
                name='parallel_interactions', value=parallel_interactions
            )
        self.parallel_interactions = parallel_interactions
    else:
        raise TensorforceError.type(name='parallel_interactions', value=parallel_interactions)

    # Buffer observe: bool selects a default size, int is an explicit size.
    # NOTE: isinstance(x, bool) must be tested before isinstance(x, int),
    # since bool is a subclass of int.
    if isinstance(buffer_observe, bool):
        if not buffer_observe and self.parallel_interactions > 1:
            raise TensorforceError.unexpected()
        if self.max_episode_timesteps is None and self.parallel_interactions > 1:
            raise TensorforceError.unexpected()
        if not buffer_observe:
            self.buffer_observe = 1
        elif self.max_episode_timesteps is None:
            self.buffer_observe = 100
        else:
            self.buffer_observe = self.max_episode_timesteps
    elif isinstance(buffer_observe, int):
        if buffer_observe <= 0:
            raise TensorforceError.value(name='buffer_observe', value=buffer_observe)
        if self.parallel_interactions > 1:
            raise TensorforceError.unexpected()
        if self.max_episode_timesteps is None:
            self.buffer_observe = buffer_observe
        else:
            self.buffer_observe = min(buffer_observe, self.max_episode_timesteps)
    else:
        raise TensorforceError.type(name='buffer_observe', value=buffer_observe)

    # Recorder: validate that only known keys are present.
    if recorder is None:
        pass
    elif not all(key in ('directory', 'frequency', 'max-traces', 'start') for key in recorder):
        raise TensorforceError.value(name='recorder', value=list(recorder))
    # Copy the dict so later caller-side mutation cannot change the spec.
    self.recorder_spec = recorder if recorder is None else dict(recorder)

    self.is_initialized = False
def __init__(self, states, actions, parallel_interactions=1, buffer_observe=1000, seed=None):
    """
    Agent constructor.

    Args:
        states (specification): States specification, arbitrarily nested dictionary of
            state descriptions with the following attributes:
            - type ('bool' | 'int' | 'float'): state data type (default: 'float').
            - shape (int | iter[int]): state shape (required).
            - num_states (int > 0): number of discrete state values (required for type 'int').
            - min_value/max_value (float): minimum/maximum state value (optional for type 'float').
        actions (specification): Actions specification, arbitrarily nested dictionary of
            action descriptions with the following attributes:
            - type ('bool' | 'int' | 'float'): action data type (required).
            - shape (int > 0 | iter[int > 0]): action shape (default: []).
            - num_actions (int > 0): number of discrete action values (required for type 'int').
            - min_value/max_value (float): minimum/maximum action value (optional for type 'float').
        parallel_interactions (int > 0): Maximum number of parallel interactions to support,
            for instance, to enable multiple parallel episodes, environments or
            (centrally controlled) agents within an environment.
        buffer_observe (int > 0): Maximum number of timesteps within an episode to buffer
            before executing internal observe operations, to reduce calls to TensorFlow for
            improved performance.
        seed (int): Random seed for `random`, NumPy and TensorFlow (optional).

    Raises:
        TensorforceError: On invalid argument types/values or state/action
            name collisions.
    """
    if seed is not None:
        assert isinstance(seed, int)
        # Fix: random.seed has no keyword 'n' — its seed argument is named 'a'
        # (the original call raised TypeError whenever a seed was supplied).
        random.seed(a=seed)
        np.random.seed(seed=seed)
        tf.random.set_random_seed(seed=seed)

    # States/actions specification
    self.states_spec = util.valid_values_spec(
        values_spec=states, value_type='state', return_normalized=True
    )
    self.actions_spec = util.valid_values_spec(
        values_spec=actions, value_type='action', return_normalized=True
    )

    # Check for name overlap between states and actions.
    for name in self.states_spec:
        if name in self.actions_spec:
            # Fix: the error object was previously constructed but never
            # raised, silently accepting colliding names.
            raise TensorforceError.collision(
                name='name', value=name, group1='states', group2='actions'
            )

    # Parallel episodes
    if isinstance(parallel_interactions, int):
        if parallel_interactions <= 0:
            raise TensorforceError.value(
                name='parallel_interactions', value=parallel_interactions
            )
        self.parallel_interactions = parallel_interactions
    else:
        raise TensorforceError.type(name='parallel_interactions', value=parallel_interactions)

    # Buffer observe: bool selects a default size, int is an explicit size.
    # NOTE: isinstance(x, bool) must be tested before isinstance(x, int),
    # since bool is a subclass of int.
    if isinstance(buffer_observe, bool):
        # if update_mode['unit'] == 'episodes':
        #     self.buffer_observe = 1000 if buffer_observe else 1
        # else:
        #     self.buffer_observe = update_mode['batch_size']
        self.buffer_observe = 1000 if buffer_observe else 1
    elif isinstance(buffer_observe, int):
        if buffer_observe <= 0:
            raise TensorforceError.value(name='buffer_observe', value=buffer_observe)
        self.buffer_observe = buffer_observe
    else:
        raise TensorforceError.type(name='buffer_observe', value=buffer_observe)

    # Parallel terminal/reward buffers (np.ndarray allocates uninitialized
    # memory; buffer_indices below tracks how much of each row is valid).
    self.terminal_buffers = np.ndarray(
        shape=(self.parallel_interactions, self.buffer_observe),
        dtype=util.np_dtype(dtype='bool')
    )
    self.reward_buffers = np.ndarray(
        shape=(self.parallel_interactions, self.buffer_observe),
        dtype=util.np_dtype(dtype='float')
    )

    # Parallel buffer indices (current fill level per parallel interaction).
    self.buffer_indices = np.zeros(
        shape=(self.parallel_interactions,), dtype=util.np_dtype(dtype='int')
    )

    self.timestep = 0
    self.episode = 0