def _process_states_input(self, states, function_name):
    if self.states_spec.is_singleton() and not isinstance(states, dict) and \
            not (util.is_iterable(x=states) and isinstance(states[0], dict)):
        # Single state
        input_type = type(states)
        states = np.asarray(states)
        if states.shape == self.states_spec.value().shape:
            # Single state is not batched
            states = ArrayDict(singleton=np.expand_dims(states, axis=0))
            batched = False
            num_instances = 1
            is_iter_of_dicts = None
            input_type = None
        else:
            # Single state is batched, iter[state]
            assert states.shape[1:] == self.states_spec.value().shape
            assert input_type in (tuple, list, np.ndarray)
            num_instances = states.shape[0]
            states = ArrayDict(singleton=states)
            batched = True
            is_iter_of_dicts = True  # default

    elif util.is_iterable(x=states):
        # States is batched, iter[dict[state]]
        batched = True
        num_instances = len(states)
        is_iter_of_dicts = True
        input_type = type(states)
        assert input_type in (tuple, list)
        if num_instances == 0:
            raise TensorforceError.value(
                name=function_name, argument='len(states)', value=num_instances, hint='= 0'
            )
        for n, state in enumerate(states):
            if not isinstance(state, dict):
                raise TensorforceError.type(
                    name=function_name, argument='states[{}]'.format(n), dtype=type(state),
                    hint='is not dict'
                )
        # Turn iter of dicts into dict of arrays
        # (Doesn't use self.states_spec since states also contains auxiliaries)
        states = [ArrayDict(state) for state in states]
        states = states[0].fmap(
            function=(lambda *xs: np.stack(xs, axis=0)), zip_values=states[1:]
        )

    elif isinstance(states, dict):
        # States is dict, turn into arrays
        some_state = next(iter(states.values()))
        input_type = type(some_state)
        states = ArrayDict(states)
        name, spec = self.states_spec.item()
        if name is None:
            name = 'state'
        if states[name].shape == spec.shape:
            # States is not batched, dict[state]
            states = states.fmap(function=(lambda state: np.expand_dims(state, axis=0)))
            batched = False
            num_instances = 1
            is_iter_of_dicts = None
            input_type = None
        else:
            # States is batched, dict[iter[state]]
            assert states[name].shape[1:] == spec.shape
            assert input_type in (tuple, list, np.ndarray)
            batched = True
            num_instances = states[name].shape[0]
            is_iter_of_dicts = False
            if num_instances == 0:
                raise TensorforceError.value(
                    name=function_name, argument='len(states)', value=num_instances, hint='= 0'
                )

    else:
        raise TensorforceError.type(
            name=function_name, argument='states', dtype=type(states),
            hint='is not array/tuple/list/dict'
        )

    # Check number of inputs
    if any(state.shape[0] != num_instances for state in states.values()):
        raise TensorforceError.value(
            name=function_name, argument='len(states)',
            value=[state.shape[0] for state in states.values()], hint='inconsistent'
        )

    return states, batched, num_instances, is_iter_of_dicts, input_type
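# Illustrative sketch (not part of the library): the three input layouts that
# _process_states_input() above normalizes into a batched dict of arrays. The
# 'position'/'velocity' state names and their shapes are hypothetical.
def _example_state_input_layouts():
    import numpy as np

    # 1. Single unbatched dict[state] -> batched=False, num_instances=1
    single = dict(position=np.zeros(shape=(3,)), velocity=np.zeros(shape=(2,)))

    # 2. Batched iter[dict[state]] -> batched=True, is_iter_of_dicts=True
    iter_of_dicts = [
        dict(position=np.zeros(shape=(3,)), velocity=np.zeros(shape=(2,))),
        dict(position=np.ones(shape=(3,)), velocity=np.ones(shape=(2,))),
    ]

    # 3. Batched dict[iter[state]] -> batched=True, is_iter_of_dicts=False
    dict_of_arrays = dict(position=np.zeros(shape=(2, 3)), velocity=np.zeros(shape=(2, 2)))

    return single, iter_of_dicts, dict_of_arrays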
def get_module_class_and_args(
    *, name, module=None, modules=None, default_module=None, disable_first_arg=False, **kwargs
):
    # name
    if not isinstance(name, str):
        raise TensorforceError.type(name='Module.add_module', argument='name', dtype=type(name))
    # modules
    if modules is not None and not isinstance(modules, dict):
        raise TensorforceError.type(
            name='Module.add_module', argument='modules', dtype=type(modules)
        )
    # default_module
    if default_module is not None and default_module not in modules and \
            not issubclass(default_module, Module):
        raise TensorforceError.value(
            name='Module.add_module', argument='default_module', value=default_module
        )
    # disable_first_arg
    if not isinstance(disable_first_arg, bool):
        raise TensorforceError.type(
            name='Module.add_module', argument='disable_first_arg',
            dtype=type(disable_first_arg)
        )

    # module
    if isinstance(module, dict):
        # Dictionary module specification (type either given via 'type' or 'default_module')
        util.deep_disjoint_update(target=kwargs, source=module)
        module = kwargs.pop('type', default_module)
        return Module.get_module_class_and_args(
            name=name, module=module, modules=modules, default_module=default_module,
            disable_first_arg=True, **kwargs
        )

    elif isinstance(module, str):
        if os.path.isfile(module):
            # JSON file module specification
            with open(module, 'r') as fp:
                module = json.load(fp=fp)
            return Module.get_module_class_and_args(
                name=name, module=module, modules=modules, default_module=default_module,
                disable_first_arg=True, **kwargs
            )
        elif '.' in module:
            # Library module specification
            library_name, module_name = module.rsplit('.', 1)
            library = importlib.import_module(name=library_name)
            module = getattr(library, module_name)
            return Module.get_module_class_and_args(
                name=name, module=module, modules=modules, default_module=default_module,
                disable_first_arg=True, **kwargs
            )
        elif modules is not None and module in modules:
            # Keyword module specification
            return Module.get_module_class_and_args(
                name=name, module=modules[module], modules=modules,
                default_module=default_module, disable_first_arg=True, **kwargs
            )
        elif 'default' in modules or default_module is not None:
            # Default module specification
            if '_first_arg' in kwargs:
                raise TensorforceError.invalid(name='Module.add_module', argument='_first_arg')
            if module is not None:
                if disable_first_arg:
                    raise TensorforceError.value(
                        name='Module.add_module', argument='module', value=module
                    )
                kwargs['_first_arg'] = module
            if default_module is None:
                default_module = modules['default']
            return Module.get_module_class_and_args(
                name=name, module=default_module, modules=modules, **kwargs
            )
        else:
            raise TensorforceError.value(
                name='Module.add_module', argument='module', value=module
            )

    elif not callable(module) and ('default' in modules or default_module is not None):
        # Default module specification
        if '_first_arg' in kwargs:
            raise TensorforceError.invalid(name='Module.add_module', argument='_first_arg')
        if module is not None:
            kwargs['_first_arg'] = module
        if default_module is None:
            default_module = modules['default']
        return Module.get_module_class_and_args(
            name=name, module=default_module, modules=modules, **kwargs
        )

    elif callable(module):
        if '_first_arg' in kwargs:
            args = (kwargs.pop('_first_arg'),)
        else:
            args = ()
        kwargs['name'] = name
        return module, args, kwargs

    else:
        raise TensorforceError.value(name='Module.add_module', argument='module', value=module)
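# Illustrative sketch (not part of the library): equivalent module specification
# forms resolved by get_module_class_and_args() above, using an optimizer as the
# example. The file path and registry keyword shown are hypothetical; the
# surrounding 'modules' registry is assumed to map keywords such as 'adam' to
# classes.
def _example_module_specs():
    return [
        dict(type='adam', learning_rate=1e-3),  # dictionary spec, type via 'type' key
        'optimizer.json',                       # JSON file spec (path on disk)
        'tensorflow.keras.optimizers.Adam',     # library spec, 'library.Class' string
        'adam',                                 # keyword spec, looked up in modules
        3e-4,                                   # first-arg shorthand for the default module
    ]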
def summary(self, *, label, name, data, step):
    # label
    if not isinstance(label, str):
        raise TensorforceError.type(name='Module.summary', argument='label', dtype=type(label))
    # name
    if not isinstance(name, (str, tuple, list)):
        raise TensorforceError.type(name='Module.summary', argument='name', dtype=type(name))
    if isinstance(name, str):
        names = None
    else:
        names = name
        name = name[0]
    # data
    if not tf_util.is_tensor(x=data) and not callable(data):
        raise TensorforceError.type(name='Module.summary', argument='data', dtype=type(data))
    # step
    if step not in self.root.units:
        raise TensorforceError.value(name='Module.summary', argument='step', value=step)

    if self.root.summaries == 'all' or label in self.root.summaries:
        if name not in self.summary_steps:
            raise TensorforceError.value(
                name='Module.summary', argument='name', value=name, hint='is not registered'
            )
        unit = self.root.units[step]

        def fn_summary():
            if callable(data):
                value = data()
            else:
                value = data
            dependencies = list()
            with self.root.summarizer.as_default():
                if names is None:
                    dependencies.append(tf.summary.scalar(name=name, data=value, step=unit))
                else:
                    for n, x in zip(names, value):
                        dependencies.append(tf.summary.scalar(name=n, data=x, step=unit))
            previous = self.summary_steps[name]
            dependencies.append(previous.assign(value=unit, read_value=False))
            return tf.group(*dependencies)

        pred = unit > self.summary_steps[name]
        return [tf.cond(pred=pred, true_fn=fn_summary, false_fn=tf.no_op)]

    else:
        return list()
def receive_execute(self):
    if self._expect_receive == 'reset':
        self._expect_receive = None

        if self._num_parallel is None:
            states = self.reset()
        else:
            parallel, states = self.reset(num_parallel=self._num_parallel)

        if self._reset_output_check:
            self._check_states_output(states=states, function='reset')
            if self._num_parallel is not None:
                TensorSpec(type='int', shape=(), num_values=self._num_parallel).np_assert(
                    x=parallel, batched=True,
                    message='Environment.reset: invalid {issue} for parallel.'
                )
            self._reset_output_check = False

        if self._num_parallel is None:
            return states, -1, None
        else:
            return parallel, states, -1, None

    elif self._expect_receive == 'execute':
        self._expect_receive = None
        assert self._actions is not None

        if self._num_parallel is None:
            states, terminal, reward = self.execute(actions=self._actions)
        else:
            parallel, states, terminal, reward = self.execute(actions=self._actions)

        if self._execute_output_check:
            self._check_states_output(states=states, function='execute')
            if self._num_parallel is None:
                if isinstance(reward, (np.generic, np.ndarray)):
                    reward = reward.item()
                if isinstance(terminal, (np.generic, np.ndarray)):
                    terminal = terminal.item()
                if not isinstance(terminal, bool) and \
                        (not isinstance(terminal, int) or terminal < 0 or terminal > 2):
                    raise TensorforceError(
                        'Environment.execute: invalid value {} for terminal.'.format(terminal)
                    )
                if not isinstance(reward, (float, int)):
                    raise TensorforceError(
                        'Environment.execute: invalid type {} for reward.'.format(type(reward))
                    )
            else:
                TensorSpec(type='int', shape=(), num_values=self._num_parallel).np_assert(
                    x=parallel, batched=True,
                    message='Environment.execute: invalid {issue} for parallel.'
                )
                TensorSpec(type='bool', shape=()).np_assert(
                    x=terminal, batched=True,
                    message='Environment.execute: invalid {issue} for terminal.'
                )
                TensorSpec(type='float', shape=()).np_assert(
                    x=reward, batched=True,
                    message='Environment.execute: invalid {issue} for reward.'
                )
            self._execute_output_check = False

        self._actions = None
        if self._num_parallel is None:
            return states, int(terminal), reward
        else:
            return parallel, states, terminal, reward

    else:
        raise TensorforceError.unexpected()
def remote(cls, connection, environment, max_episode_timesteps=None, reward_shaping=None, **kwargs):
    try:
        env = None
        env = Environment.create(
            environment=environment, max_episode_timesteps=max_episode_timesteps,
            reward_shaping=reward_shaping, **kwargs
        )

        while True:
            attribute, kwargs = cls.remote_receive(connection=connection)

            if attribute in ('reset', 'execute'):
                environment_start = time.time()

            try:
                result = getattr(env, attribute)
                if callable(result):
                    if kwargs is None:
                        result = None
                    else:
                        result = result(**kwargs)
                elif kwargs is None:
                    pass
                elif len(kwargs) == 1 and 'value' in kwargs:
                    setattr(env, attribute, kwargs['value'])
                    result = None
                else:
                    raise TensorforceError(message="Invalid remote attribute/function access.")
            except AttributeError:
                if kwargs is None or len(kwargs) != 1 or 'value' not in kwargs:
                    raise TensorforceError(message="Invalid remote attribute/function access.")
                setattr(env, attribute, kwargs['value'])
                result = None

            if attribute in ('reset', 'execute'):
                seconds = time.time() - environment_start
                if attribute == 'reset':
                    result = (result, seconds)
                else:
                    result += (seconds,)

            cls.remote_send(connection=connection, success=True, result=result)

            if attribute == 'close':
                break

    except BaseException:
        # Report the failure to the client, then attempt to close the environment
        etype, value, traceback = sys.exc_info()
        cls.remote_send(
            connection=connection, success=False,
            result=(str(etype), str(value), format_tb(traceback))
        )
        try:
            if env is not None:
                env.close()
        except BaseException:
            pass

    finally:
        cls.remote_close(connection=connection)
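# Usage sketch (the host/port values are placeholders, and the server process
# must already be running before the client connects): pairing the
# "socket-server" communication loop above with a "socket-client" environment.
def _example_socket_server():
    from tensorforce import Environment

    # Server process: runs the remote() loop above until the client closes it
    Environment.create(
        environment='gym', level='CartPole-v1', remote='socket-server', port=65432
    )

def _example_socket_client():
    from tensorforce import Environment

    # Client process: proxies reset()/execute() calls over the socket
    environment = Environment.create(remote='socket-client', host='127.0.0.1', port=65432)
    states = environment.reset()
    environment.close()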
def __init__(
    # Required
    self, states, actions, memory, batch_size,
    # Environment
    max_episode_timesteps=None,
    # Network
    network='auto',
    # Optimization
    update_frequency='batch_size', start_updating=None, learning_rate=1e-3, huber_loss=0.0,
    # Reward estimation
    horizon=1, discount=0.99, predict_terminal_values=False,
    # Target network
    target_sync_frequency=1, target_update_weight=1.0,
    # Preprocessing
    preprocessing='linear_normalization',
    # Exploration
    exploration=0.0, variable_noise=0.0,
    # Regularization
    l2_regularization=0.0, entropy_regularization=0.0,
    # Parallel interactions
    parallel_interactions=1,
    # Config, saver, summarizer, recorder
    config=None, saver=None, summarizer=None, recorder=None,
    # Deprecated
    estimate_terminal=None, **kwargs
):
    if estimate_terminal is not None:
        raise TensorforceError.deprecated(
            name='DuelingDQN', argument='estimate_terminal',
            replacement='predict_terminal_values'
        )

    self.spec = OrderedDict(
        agent='dueling_dqn',
        states=states, actions=actions, memory=memory, batch_size=batch_size,
        max_episode_timesteps=max_episode_timesteps,
        network=network,
        update_frequency=update_frequency, start_updating=start_updating,
        learning_rate=learning_rate, huber_loss=huber_loss,
        horizon=horizon, discount=discount,
        predict_terminal_values=predict_terminal_values,
        target_sync_frequency=target_sync_frequency, target_update_weight=target_update_weight,
        preprocessing=preprocessing,
        exploration=exploration, variable_noise=variable_noise,
        l2_regularization=l2_regularization, entropy_regularization=entropy_regularization,
        parallel_interactions=parallel_interactions,
        config=config, saver=saver, summarizer=summarizer, recorder=recorder
    )

    distributions = dict(int=dict(type='categorical', advantage_based=True))
    policy = dict(network=network, distributions=distributions, temperature=0.0)
    memory = dict(type='replay', capacity=memory)
    update = dict(unit='timesteps', batch_size=batch_size)
    if update_frequency != 'batch_size':
        update['frequency'] = update_frequency
    if start_updating is not None:
        update['start'] = start_updating
    optimizer = dict(type='adam', learning_rate=learning_rate)
    objective = dict(type='value', value='action', huber_loss=huber_loss)
    reward_estimation = dict(
        horizon=horizon, discount=discount, predict_horizon_values='late',
        estimate_advantage=False, predict_action_values=True,
        predict_terminal_values=predict_terminal_values
    )
    baseline_policy = policy
    baseline_optimizer = dict(
        type='synchronization', sync_frequency=target_sync_frequency,
        update_weight=target_update_weight
    )
    baseline_objective = None

    super().__init__(
        # Agent
        states=states, actions=actions, max_episode_timesteps=max_episode_timesteps,
        parallel_interactions=parallel_interactions, config=config, recorder=recorder,
        # Model
        preprocessing=preprocessing, exploration=exploration, variable_noise=variable_noise,
        l2_regularization=l2_regularization, saver=saver, summarizer=summarizer,
        # TensorforceModel
        policy=policy, memory=memory, update=update, optimizer=optimizer, objective=objective,
        reward_estimation=reward_estimation, baseline_policy=baseline_policy,
        baseline_optimizer=baseline_optimizer, baseline_objective=baseline_objective,
        entropy_regularization=entropy_regularization, **kwargs
    )

    if any(spec['type'] != 'int' for spec in self.actions_spec.values()):
        raise TensorforceError.value(
            name='DuelingDQN', argument='actions', value=actions,
            hint='contains non-int action'
        )
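# Usage sketch (assumes a Gym 'CartPole-v1' environment is available; the
# hyperparameter values are illustrative): constructing the DuelingDQN agent
# defined above via its registered keyword.
def _example_dueling_dqn():
    from tensorforce import Agent, Environment

    environment = Environment.create(environment='gym', level='CartPole-v1')
    agent = Agent.create(
        agent='dueling_dqn', environment=environment, memory=10000, batch_size=32,
        learning_rate=1e-3, horizon=1, discount=0.99
    )
    agent.close()
    environment.close()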
def start_reset(self, num_parallel=None):
    if self._expect_receive is not None:
        raise TensorforceError.unexpected()
    self._expect_receive = 'reset'
    assert num_parallel is None or self.is_vectorizable()
    self._num_parallel = num_parallel
def act(
    self, states, parallel=0, deterministic=False, independent=False, evaluation=False,
    query=None, **kwargs
):
    """
    Returns action(s) for the given state(s), needs to be followed by `observe(...)` unless
    `independent` is true.

    Args:
        states (dict[state]): Dictionary containing state(s) to be acted on
            (<span style="color:#C00000"><b>required</b></span>).
        parallel (int): Parallel execution index
            (<span style="color:#00C000"><b>default</b></span>: 0).
        deterministic (bool): Whether not to apply exploration and sampling
            (<span style="color:#00C000"><b>default</b></span>: false).
        independent (bool): Whether action is not remembered, and this call is thus not
            followed by observe
            (<span style="color:#00C000"><b>default</b></span>: false).
        evaluation (bool): Whether the agent is currently evaluated, implies and overwrites
            deterministic and independent
            (<span style="color:#00C000"><b>default</b></span>: false).
        query (list[str]): Names of tensors to retrieve
            (<span style="color:#00C000"><b>default</b></span>: none).
        kwargs: Additional input values, for instance, for dynamic hyperparameters.

    Returns:
        (dict[action], plus optional list[str]): Dictionary containing action(s), plus queried
        tensor values if requested.
    """
    assert util.reduce_all(predicate=util.not_nan_inf, xs=states)
    # self.current_internals = self.next_internals

    if evaluation:
        if deterministic or independent:
            raise TensorforceError.unexpected()
        deterministic = independent = True

    # Auxiliaries
    auxiliaries = OrderedDict()
    if isinstance(states, dict):
        states = dict(states)
        for name, spec in self.actions_spec.items():
            if spec['type'] == 'int' and name + '_mask' in states:
                auxiliaries[name + '_mask'] = states.pop(name + '_mask')

    # Normalize states dictionary
    states = util.normalize_values(
        value_type='state', values=states, values_spec=self.states_spec
    )

    # Batch states
    states = util.fmap(function=(lambda x: np.asarray([x])), xs=states, depth=1)
    auxiliaries = util.fmap(function=(lambda x: np.asarray([x])), xs=auxiliaries, depth=1)

    # Model.act()
    if query is None:
        actions, self.timesteps = self.model.act(
            states=states, auxiliaries=auxiliaries, parallel=[parallel],
            deterministic=deterministic, independent=independent, **kwargs
        )
    else:
        actions, self.timesteps, queried = self.model.act(
            states=states, auxiliaries=auxiliaries, parallel=[parallel],
            deterministic=deterministic, independent=independent, query=query, **kwargs
        )

    if self.recorder_spec is not None and not independent and \
            self.episodes >= self.recorder_spec.get('start', 0):
        index = self.buffer_indices[parallel]
        for name in self.states_spec:
            self.states_buffers[name][parallel, index] = states[name][0]
        for name, spec in self.actions_spec.items():
            self.actions_buffers[name][parallel, index] = actions[name][0]
            if spec['type'] == 'int':
                name = name + '_mask'
                if name in auxiliaries:
                    self.states_buffers[name][parallel, index] = auxiliaries[name][0]
                else:
                    shape = (1,) + spec['shape'] + (spec['num_values'],)
                    self.states_buffers[name][parallel, index] = np.full(
                        shape=shape, fill_value=True, dtype=util.np_dtype(dtype='bool')
                    )

    # Unbatch actions
    actions = util.fmap(function=(lambda x: x[0]), xs=actions, depth=1)

    # Reverse normalized actions dictionary
    actions = util.unpack_values(
        value_type='action', values=actions, values_spec=self.actions_spec
    )

    # if independent, return processed state as well?

    if query is None:
        return actions
    else:
        return actions, queried
def observe(self, reward, terminal=False, parallel=0, query=None, **kwargs):
    """
    Observes reward and whether a terminal state is reached, needs to be preceded by
    `act(...)`.

    Args:
        reward (float): Reward
            (<span style="color:#C00000"><b>required</b></span>).
        terminal (bool | 0 | 1 | 2): Whether a terminal state is reached, or 2 if the
            episode was aborted
            (<span style="color:#00C000"><b>default</b></span>: false).
        parallel (int): Parallel execution index
            (<span style="color:#00C000"><b>default</b></span>: 0).
        query (list[str]): Names of tensors to retrieve
            (<span style="color:#00C000"><b>default</b></span>: none).
        kwargs: Additional input values, for instance, for dynamic hyperparameters.

    Returns:
        (bool, optional list[str]): Whether an update was performed, plus queried tensor
        values if requested.
    """
    assert util.reduce_all(predicate=util.not_nan_inf, xs=reward)

    if query is not None and self.parallel_interactions > 1:
        raise TensorforceError.unexpected()

    if isinstance(terminal, bool):
        terminal = int(terminal)

    # Update terminal/reward buffer
    index = self.buffer_indices[parallel]
    self.terminal_buffers[parallel, index] = terminal
    self.reward_buffers[parallel, index] = reward
    index += 1

    if self.max_episode_timesteps is not None and index > self.max_episode_timesteps:
        raise TensorforceError.unexpected()

    if terminal > 0 or index == self.buffer_observe or query is not None:
        terminal = self.terminal_buffers[parallel, :index]
        reward = self.reward_buffers[parallel, :index]

        if self.recorder_spec is not None and \
                self.episodes >= self.recorder_spec.get('start', 0):
            for name in self.states_spec:
                self.record_states[name].append(
                    np.array(self.states_buffers[name][parallel, :index])
                )
            for name, spec in self.actions_spec.items():
                self.record_actions[name].append(
                    np.array(self.actions_buffers[name][parallel, :index])
                )
                if spec['type'] == 'int':
                    self.record_states[name + '_mask'].append(
                        np.array(self.states_buffers[name + '_mask'][parallel, :index])
                    )
            self.record_terminal.append(np.array(terminal))
            self.record_reward.append(np.array(reward))

            if terminal[-1] > 0:
                self.num_episodes += 1

                if self.num_episodes == self.recorder_spec.get('frequency', 1):
                    directory = self.recorder_spec['directory']
                    if os.path.isdir(directory):
                        files = sorted(
                            f for f in os.listdir(directory)
                            if os.path.isfile(os.path.join(directory, f))
                            and f.startswith('trace-')
                        )
                    else:
                        os.makedirs(directory)
                        files = list()
                    max_traces = self.recorder_spec.get('max-traces')
                    if max_traces is not None and len(files) > max_traces - 1:
                        for filename in files[:-max_traces + 1]:
                            filename = os.path.join(directory, filename)
                            os.remove(filename)

                    filename = 'trace-{}-{}.npz'.format(
                        self.episodes, time.strftime('%Y%m%d-%H%M%S')
                    )
                    filename = os.path.join(directory, filename)
                    self.record_states = util.fmap(
                        function=np.concatenate, xs=self.record_states, depth=1
                    )
                    self.record_actions = util.fmap(
                        function=np.concatenate, xs=self.record_actions, depth=1
                    )
                    self.record_terminal = np.concatenate(self.record_terminal)
                    self.record_reward = np.concatenate(self.record_reward)
                    np.savez_compressed(
                        filename, **self.record_states, **self.record_actions,
                        terminal=self.record_terminal, reward=self.record_reward
                    )
                    self.record_states = util.fmap(
                        function=(lambda x: list()), xs=self.record_states, depth=1
                    )
                    self.record_actions = util.fmap(
                        function=(lambda x: list()), xs=self.record_actions, depth=1
                    )
                    self.record_terminal = list()
                    self.record_reward = list()
                    self.num_episodes = 0

        # Model.observe()
        if query is None:
            updated, self.episodes, self.updates = self.model.observe(
                terminal=terminal, reward=reward, parallel=[parallel], **kwargs
            )
        else:
            updated, self.episodes, self.updates, queried = self.model.observe(
                terminal=terminal, reward=reward, parallel=[parallel], query=query, **kwargs
            )

        # Reset buffer index
        self.buffer_indices[parallel] = 0

    else:
        # Increment buffer index
        self.buffer_indices[parallel] = index
        updated = False

    if query is None:
        return updated
    else:
        return updated, queried
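# Usage sketch (assumes a Gym 'CartPole-v1' environment; the agent settings are
# illustrative): how act() and observe() above pair up in a training loop.
def _example_act_observe_loop():
    from tensorforce import Agent, Environment

    environment = Environment.create(
        environment='gym', level='CartPole-v1', max_episode_timesteps=500
    )
    agent = Agent.create(agent='ppo', environment=environment, batch_size=10)
    for _ in range(100):
        states = environment.reset()
        terminal = False
        while not terminal:
            actions = agent.act(states=states)
            states, terminal, reward = environment.execute(actions=actions)
            agent.observe(terminal=terminal, reward=reward)
    agent.close()
    environment.close()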
def __init__(
    # Environment
    self, states, actions, max_episode_timesteps=None,
    # TensorFlow etc
    parallel_interactions=1, buffer_observe=True, seed=None, recorder=None
):
    assert hasattr(self, 'spec')

    if seed is not None:
        assert isinstance(seed, int)
        random.seed(a=seed)
        np.random.seed(seed=seed)

    # States/actions specification
    self.states_spec = util.valid_values_spec(
        values_spec=states, value_type='state', return_normalized=True
    )
    self.actions_spec = util.valid_values_spec(
        values_spec=actions, value_type='action', return_normalized=True
    )
    self.max_episode_timesteps = max_episode_timesteps

    # Check for name overlap
    for name in self.states_spec:
        if name in self.actions_spec:
            raise TensorforceError.collision(
                name='name', value=name, group1='states', group2='actions'
            )

    # Parallel interactions
    if isinstance(parallel_interactions, int):
        if parallel_interactions <= 0:
            raise TensorforceError.value(
                name='parallel_interactions', value=parallel_interactions
            )
        self.parallel_interactions = parallel_interactions
    else:
        raise TensorforceError.type(
            name='parallel_interactions', value=parallel_interactions
        )

    # Buffer observe
    if isinstance(buffer_observe, bool):
        if not buffer_observe and self.parallel_interactions > 1:
            raise TensorforceError.unexpected()
        if self.max_episode_timesteps is None and self.parallel_interactions > 1:
            raise TensorforceError.unexpected()
        if not buffer_observe:
            self.buffer_observe = 1
        elif self.max_episode_timesteps is None:
            self.buffer_observe = 100
        else:
            self.buffer_observe = self.max_episode_timesteps
    elif isinstance(buffer_observe, int):
        if buffer_observe <= 0:
            raise TensorforceError.value(name='buffer_observe', value=buffer_observe)
        if self.parallel_interactions > 1:
            raise TensorforceError.unexpected()
        if self.max_episode_timesteps is None:
            self.buffer_observe = buffer_observe
        else:
            self.buffer_observe = min(buffer_observe, self.max_episode_timesteps)
    else:
        raise TensorforceError.type(name='buffer_observe', value=buffer_observe)

    # Recorder
    if recorder is None:
        pass
    elif not all(
        key in ('directory', 'frequency', 'max-traces', 'start') for key in recorder
    ):
        raise TensorforceError.value(name='recorder', value=list(recorder))
    self.recorder_spec = recorder if recorder is None else dict(recorder)

    self.is_initialized = False
def initialize(self):
    """
    Initializes the agent.
    """
    if self.is_initialized:
        raise TensorforceError.unexpected()
    self.is_initialized = True

    # Parallel terminal/reward buffers
    self.terminal_buffers = np.ndarray(
        shape=(self.parallel_interactions, self.buffer_observe),
        dtype=util.np_dtype(dtype='long')
    )
    self.reward_buffers = np.ndarray(
        shape=(self.parallel_interactions, self.buffer_observe),
        dtype=util.np_dtype(dtype='float')
    )

    # Recorder buffers if required
    if self.recorder_spec is not None:
        self.states_buffers = OrderedDict()
        self.actions_buffers = OrderedDict()
        for name, spec in self.states_spec.items():
            shape = (self.parallel_interactions, self.buffer_observe) + spec['shape']
            self.states_buffers[name] = np.ndarray(
                shape=shape, dtype=util.np_dtype(dtype=spec['type'])
            )
        for name, spec in self.actions_spec.items():
            shape = (self.parallel_interactions, self.buffer_observe) + spec['shape']
            self.actions_buffers[name] = np.ndarray(
                shape=shape, dtype=util.np_dtype(dtype=spec['type'])
            )
            if spec['type'] == 'int':
                shape = (self.parallel_interactions, self.buffer_observe) + spec['shape'] + \
                    (spec['num_values'],)
                self.states_buffers[name + '_mask'] = np.ndarray(
                    shape=shape, dtype=util.np_dtype(dtype='bool')
                )

        self.num_episodes = 0
        self.record_states = OrderedDict(((name, list()) for name in self.states_spec))
        self.record_actions = OrderedDict(((name, list()) for name in self.actions_spec))
        for name, spec in self.actions_spec.items():
            if spec['type'] == 'int':
                self.record_states[name + '_mask'] = list()
        self.record_terminal = list()
        self.record_reward = list()

    # Parallel buffer indices
    self.buffer_indices = np.zeros(
        shape=(self.parallel_interactions,), dtype=util.np_dtype(dtype='int')
    )

    self.timesteps = 0
    self.episodes = 0
    self.updates = 0

    # Setup Model
    if not hasattr(self, 'model'):
        raise TensorforceError.missing(name='Agent', value='model')
    self.model.initialize()

    if self.model.saver_directory is not None:
        file = os.path.join(self.model.saver_directory, self.model.saver_filename + '.json')
        with open(file, 'w') as fp:
            json.dump(obj=self.spec, fp=fp)

    self.reset()
def __init__(
    # Required
    self, states, actions, max_episode_timesteps, batch_size,
    # Network
    network='auto', use_beta_distribution=False,
    # Memory
    memory='minimum',
    # Optimization
    update_frequency=1.0, learning_rate=1e-3, multi_step=10, subsampling_fraction=0.33,
    # Reward estimation
    likelihood_ratio_clipping=0.25, discount=0.99, return_processing=None,
    advantage_processing=None, predict_terminal_values=False,
    # Baseline
    baseline=None, baseline_optimizer=None,
    # Preprocessing
    state_preprocessing='linear_normalization', reward_preprocessing=None,
    # Exploration
    exploration=0.0, variable_noise=0.0,
    # Regularization
    l2_regularization=0.0, entropy_regularization=0.0,
    # Parallel interactions
    parallel_interactions=1,
    # Config, saver, summarizer, tracking, recorder
    config=None, saver=None, summarizer=None, tracking=None, recorder=None,
    # Deprecated
    **kwargs
):
    if 'optimization_steps' in kwargs:
        raise TensorforceError.deprecated(
            name='PPO', argument='optimization_steps', replacement='multi_step'
        )
    if 'estimate_terminal' in kwargs:
        raise TensorforceError.deprecated(
            name='PPO', argument='estimate_terminal', replacement='predict_terminal_values'
        )
    if 'critic_network' in kwargs:
        raise TensorforceError.deprecated(
            name='PPO', argument='critic_network', replacement='baseline'
        )
    if 'baseline_network' in kwargs:
        raise TensorforceError.deprecated(
            name='PPO', argument='baseline_network', replacement='baseline'
        )
    if 'critic_optimizer' in kwargs:
        raise TensorforceError.deprecated(
            name='PPO', argument='critic_optimizer', replacement='baseline_optimizer'
        )

    self.spec = OrderedDict(
        agent='ppo',
        states=states, actions=actions, max_episode_timesteps=max_episode_timesteps,
        batch_size=batch_size,
        network=network, use_beta_distribution=use_beta_distribution,
        memory=memory,
        update_frequency=update_frequency, learning_rate=learning_rate,
        multi_step=multi_step, subsampling_fraction=subsampling_fraction,
        likelihood_ratio_clipping=likelihood_ratio_clipping, discount=discount,
        return_processing=return_processing, advantage_processing=advantage_processing,
        predict_terminal_values=predict_terminal_values,
        baseline=baseline, baseline_optimizer=baseline_optimizer,
        state_preprocessing=state_preprocessing, reward_preprocessing=reward_preprocessing,
        exploration=exploration, variable_noise=variable_noise,
        l2_regularization=l2_regularization, entropy_regularization=entropy_regularization,
        parallel_interactions=parallel_interactions,
        config=config, saver=saver, summarizer=summarizer, tracking=tracking,
        recorder=recorder
    )

    policy = dict(
        type='parametrized_distributions', network=network, temperature=1.0,
        use_beta_distribution=use_beta_distribution
    )
    if memory == 'minimum':
        memory = dict(type='recent')
    else:
        memory = dict(type='recent', capacity=memory)
    update = dict(unit='episodes', batch_size=batch_size, frequency=update_frequency)
    optimizer = dict(
        optimizer='adam', learning_rate=learning_rate, multi_step=multi_step,
        subsampling_fraction=subsampling_fraction
    )
    objective = dict(
        type='policy_gradient', importance_sampling=True,
        clipping_value=likelihood_ratio_clipping
    )
    if baseline is None:
        assert not predict_terminal_values
        reward_estimation = dict(
            horizon='episode', discount=discount, predict_horizon_values=False,
            estimate_advantage=False
        )
        assert baseline_optimizer is None
        baseline_objective = None
    else:
        reward_estimation = dict(
            horizon='episode', discount=discount, predict_horizon_values='early',
            estimate_advantage=True, predict_action_values=False,
            predict_terminal_values=predict_terminal_values
        )
        baseline = dict(type='parametrized_state_value', network=baseline)
        assert baseline_optimizer is not None
        baseline_objective = dict(type='state_value')

    super().__init__(
        # Agent
        states=states, actions=actions, max_episode_timesteps=max_episode_timesteps,
        parallel_interactions=parallel_interactions, config=config, recorder=recorder,
        # TensorforceModel
        policy=policy, memory=memory, update=update, optimizer=optimizer,
        objective=objective, reward_estimation=reward_estimation, baseline=baseline,
        baseline_optimizer=baseline_optimizer, baseline_objective=baseline_objective,
        l2_regularization=l2_regularization, entropy_regularization=entropy_regularization,
        state_preprocessing=state_preprocessing, reward_preprocessing=reward_preprocessing,
        exploration=exploration, variable_noise=variable_noise,
        saver=saver, summarizer=summarizer, tracking=tracking, **kwargs
    )
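# Usage sketch (state/action specs and hyperparameters are illustrative):
# instantiating the PPO agent defined above; max_episode_timesteps is required
# since updates are episode-based.
def _example_ppo():
    from tensorforce import Agent

    agent = Agent.create(
        agent='ppo',
        states=dict(type='float', shape=(8,)),
        actions=dict(type='int', num_values=4),
        max_episode_timesteps=500,
        batch_size=10, learning_rate=1e-3, multi_step=10,
        likelihood_ratio_clipping=0.25
    )
    agent.close()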
def __init__(
    self,
    # Model
    name, device, parallel_interactions, buffer_observe, seed, execution, saver, summarizer,
    config, states, actions, preprocessing, exploration, variable_noise, l2_regularization,
    # TensorforceModel
    policy, memory, update, optimizer, objective, reward_estimation, baseline_policy,
    baseline_optimizer, baseline_objective, entropy_regularization
):
    # Policy internals specification
    policy_cls, first_arg, kwargs = Module.get_module_class_and_kwargs(
        name='policy', module=policy, modules=policy_modules, states_spec=states,
        actions_spec=actions
    )
    if first_arg is None:
        internals = policy_cls.internals_spec(name='policy', **kwargs)
    else:
        internals = policy_cls.internals_spec(first_arg, name='policy', **kwargs)
    if any(name.startswith('baseline-') for name in internals):
        raise TensorforceError.unexpected()

    # Baseline internals specification
    if baseline_policy is None:
        pass
    else:
        baseline_cls, first_arg, kwargs = Module.get_module_class_and_kwargs(
            name='baseline', module=baseline_policy, modules=policy_modules,
            states_spec=states, actions_spec=actions
        )
        if first_arg is None:
            baseline_internals = baseline_cls.internals_spec(name='baseline', **kwargs)
        else:
            baseline_internals = baseline_cls.internals_spec(
                first_arg, name='baseline', **kwargs
            )
        for name, spec in baseline_internals.items():
            if name in internals:
                raise TensorforceError(
                    "Name overlap between policy and baseline internals: {}.".format(name)
                )
            internals[name] = spec

    super().__init__(
        # Model
        name=name, device=device, parallel_interactions=parallel_interactions,
        buffer_observe=buffer_observe, seed=seed, execution=execution, saver=saver,
        summarizer=summarizer, config=config, states=states, internals=internals,
        actions=actions, preprocessing=preprocessing, exploration=exploration,
        variable_noise=variable_noise, l2_regularization=l2_regularization
    )

    # Policy
    self.policy = self.add_module(
        name='policy', module=policy, modules=policy_modules, states_spec=self.states_spec,
        actions_spec=self.actions_spec
    )

    # Memory
    self.memory = self.add_module(
        name='memory', module=memory, modules=memory_modules, is_trainable=False,
        values_spec=self.values_spec
    )

    # Update mode
    if not all(key in ('batch_size', 'frequency', 'start', 'unit') for key in update):
        raise TensorforceError.value(name='update', value=list(update))
    # update: unit
    elif 'unit' not in update:
        raise TensorforceError.required(name='update', value='unit')
    elif update['unit'] not in ('timesteps', 'episodes'):
        raise TensorforceError.value(name='update', argument='unit', value=update['unit'])
    # update: batch_size
    elif 'batch_size' not in update:
        raise TensorforceError.required(name='update', value='batch_size')

    self.update_unit = update['unit']
    self.update_batch_size = self.add_module(
        name='update-batch-size', module=update['batch_size'], modules=parameter_modules,
        is_trainable=False, dtype='long'
    )
    if 'frequency' in update and update['frequency'] == 'never':
        self.update_frequency = 'never'
    else:
        self.update_frequency = self.add_module(
            name='update-frequency', module=update.get('frequency', update['batch_size']),
            modules=parameter_modules, is_trainable=False, dtype='long'
        )
        self.update_start = self.add_module(
            name='update-start', module=update.get('start', 0), modules=parameter_modules,
            is_trainable=False, dtype='long'
        )

    # Optimizer
    self.optimizer = self.add_module(
        name='optimizer', module=optimizer, modules=optimizer_modules, is_trainable=False
    )

    # Objective
    self.objective = self.add_module(
        name='objective', module=objective, modules=objective_modules, is_trainable=False
    )

    # Estimator
    if not all(key in (
        'capacity', 'discount', 'estimate_actions', 'estimate_advantage', 'estimate_horizon',
        'estimate_terminal', 'horizon'
    ) for key in reward_estimation):
        raise TensorforceError.value(name='reward_estimation', value=list(reward_estimation))
    if baseline_policy is None and baseline_optimizer is None and baseline_objective is None:
        estimate_horizon = False
    else:
        estimate_horizon = 'late'
    self.estimator = self.add_module(
        name='estimator', module=Estimator, is_trainable=False, is_saved=False,
        values_spec=self.values_spec, horizon=reward_estimation['horizon'],
        discount=reward_estimation.get('discount', 1.0),
        estimate_horizon=reward_estimation.get('estimate_horizon', estimate_horizon),
        estimate_actions=reward_estimation.get('estimate_actions', False),
        estimate_terminal=reward_estimation.get('estimate_terminal', False),
        estimate_advantage=reward_estimation.get('estimate_advantage', False),
        capacity=reward_estimation['capacity']
    )

    # Baseline
    if (baseline_policy is not None or baseline_objective is not None) and \
            (baseline_optimizer is None or isinstance(baseline_optimizer, float)):
        # since otherwise not part of training
        assert self.estimator.estimate_advantage or baseline_objective is not None
        is_trainable = True
    else:
        is_trainable = False
    if baseline_policy is None:
        self.baseline_policy = self.policy
    else:
        self.baseline_policy = self.add_module(
            name='baseline', module=baseline_policy, modules=policy_modules,
            is_trainable=is_trainable, is_subscope=True, states_spec=self.states_spec,
            actions_spec=self.actions_spec
        )

    # Baseline optimizer
    if baseline_optimizer is None:
        self.baseline_optimizer = None
        self.baseline_loss_weight = 1.0
    elif isinstance(baseline_optimizer, float):
        self.baseline_optimizer = None
        self.baseline_loss_weight = baseline_optimizer
    else:
        self.baseline_optimizer = self.add_module(
            name='baseline-optimizer', module=baseline_optimizer, modules=optimizer_modules,
            is_trainable=False, is_subscope=True
        )

    # Baseline objective
    if baseline_objective is None:
        self.baseline_objective = None
    else:
        self.baseline_objective = self.add_module(
            name='baseline-objective', module=baseline_objective, modules=objective_modules,
            is_trainable=False, is_subscope=True
        )

    # Entropy regularization
    entropy_regularization = 0.0 if entropy_regularization is None else entropy_regularization
    self.entropy_regularization = self.add_module(
        name='entropy-regularization', module=entropy_regularization,
        modules=parameter_modules, is_trainable=False, dtype='float'
    )

    # Internals initialization
    self.internals_init.update(self.policy.internals_init())
    self.internals_init.update(self.baseline_policy.internals_init())
    if any(internal_init is None for internal_init in self.internals_init.values()):
        raise TensorforceError.unexpected()

    # Register global tensors
    Module.register_tensor(name='update', spec=dict(type='long', shape=()), batched=False)
    Module.register_tensor(
        name='optimization', spec=dict(type='bool', shape=()), batched=False
    )
    Module.register_tensor(
        name='dependency_starts', spec=dict(type='long', shape=()), batched=True
    )
    Module.register_tensor(
        name='dependency_lengths', spec=dict(type='long', shape=()), batched=True
    )
def __init__(
    # Required
    self, states, actions, memory,
    # Environment
    max_episode_timesteps=None,
    # Network
    network='auto',
    # Optimization
    batch_size=32, update_frequency=None, start_updating=None, learning_rate=3e-4,
    # Reward estimation
    horizon=0, discount=0.99, estimate_terminal=False,
    # Critic
    critic_network='auto', critic_optimizer=1.0,
    # Preprocessing
    preprocessing=None,
    # Exploration
    exploration=0.0, variable_noise=0.0,
    # Regularization
    l2_regularization=0.0, entropy_regularization=0.0,
    # TensorFlow etc
    name='agent', device=None, parallel_interactions=1, seed=None, execution=None,
    saver=None, summarizer=None, recorder=None, config=None
):
    self.spec = OrderedDict(
        agent='dpg',
        states=states, actions=actions, max_episode_timesteps=max_episode_timesteps,
        network=network,
        memory=memory, batch_size=batch_size, update_frequency=update_frequency,
        start_updating=start_updating, learning_rate=learning_rate,
        horizon=horizon, discount=discount, estimate_terminal=estimate_terminal,
        critic_network=critic_network, critic_optimizer=critic_optimizer,
        preprocessing=preprocessing,
        exploration=exploration, variable_noise=variable_noise,
        l2_regularization=l2_regularization, entropy_regularization=entropy_regularization,
        name=name, device=device, parallel_interactions=parallel_interactions, seed=seed,
        execution=execution, saver=saver, summarizer=summarizer, recorder=recorder,
        config=config
    )

    # TODO: action type and shape
    assert max_episode_timesteps is None or \
        memory >= batch_size + max_episode_timesteps + horizon

    policy = dict(network=network, temperature=0.0)
    memory = dict(type='replay', capacity=memory)
    update = dict(unit='timesteps', batch_size=batch_size)
    if update_frequency is not None:
        update['frequency'] = update_frequency
    if start_updating is not None:
        update['start'] = start_updating
    optimizer = dict(type='adam', learning_rate=learning_rate)
    objective = 'det_policy_gradient'
    reward_estimation = dict(
        horizon=horizon, discount=discount, estimate_horizon='late',
        estimate_terminal=estimate_terminal, estimate_actions=True
    )
    # Action value doesn't exist for Beta
    baseline_policy = dict(network=critic_network, distributions=dict(float='gaussian'))
    baseline_objective = dict(type='value', value='action')

    super().__init__(
        # Agent
        states=states, actions=actions, max_episode_timesteps=max_episode_timesteps,
        parallel_interactions=parallel_interactions, buffer_observe=True, seed=seed,
        recorder=recorder, config=config,
        # Model
        name=name, device=device, execution=execution, saver=saver, summarizer=summarizer,
        preprocessing=preprocessing, exploration=exploration, variable_noise=variable_noise,
        l2_regularization=l2_regularization,
        # TensorforceModel
        policy=policy, memory=memory, update=update, optimizer=optimizer,
        objective=objective, reward_estimation=reward_estimation,
        baseline_policy=baseline_policy, baseline_optimizer=critic_optimizer,
        baseline_objective=baseline_objective,
        entropy_regularization=entropy_regularization
    )

    action_spec = next(iter(self.actions_spec.values()))
    if len(self.actions_spec) > 1 or action_spec['type'] != 'float' or \
            action_spec['shape'] != ():
        raise TensorforceError.unexpected()
def get_output_spec(self, input_spec):
    if len(self.tensors) == 1:
        return Module.get_tensor_spec(name=self.tensors[0])

    # Get tensor types and shapes
    dtypes = list()
    shapes = list()
    for tensor in self.tensors:
        # Tensor specification
        if tensor == '*':
            spec = input_spec
        else:
            spec = Module.get_tensor_spec(name=tensor)
        dtypes.append(spec['type'])
        shapes.append(spec['shape'])

    # Check tensor types
    if all(dtype == dtypes[0] for dtype in dtypes):
        dtype = dtypes[0]
    else:
        raise TensorforceError.value(name='retrieve', argument='tensor types', value=dtypes)

    if self.aggregation == 'concat':
        if any(len(shape) != len(shapes[0]) for shape in shapes):
            raise TensorforceError.value(
                name='retrieve', argument='tensor shapes', value=shapes
            )
        elif any(
            shape[n] != shapes[0][n] for shape in shapes for n in range(len(shape))
            if n != self.axis
        ):
            raise TensorforceError.value(
                name='retrieve', argument='tensor shapes', value=shapes
            )
        shape = tuple(
            sum(shape[n] for shape in shapes) if n == self.axis else shapes[0][n]
            for n in range(len(shapes[0]))
        )

    elif self.aggregation == 'stack':
        if any(len(shape) != len(shapes[0]) for shape in shapes):
            raise TensorforceError.value(
                name='retrieve', argument='tensor shapes', value=shapes
            )
        elif any(
            shape[n] != shapes[0][n] for shape in shapes for n in range(len(shape))
        ):
            raise TensorforceError.value(
                name='retrieve', argument='tensor shapes', value=shapes
            )
        shape = tuple(
            len(shapes) if n == self.axis else shapes[0][n - int(n > self.axis)]
            for n in range(len(shapes[0]) + 1)
        )

    else:
        # Check and unify tensor shapes
        for shape in shapes:
            if len(shape) != len(shapes[0]):
                raise TensorforceError.value(
                    name='retrieve', argument='tensor shapes', value=shapes
                )
            if any(x != y and x != 1 and y != 1 for x, y in zip(shape, shapes[0])):
                raise TensorforceError.value(
                    name='retrieve', argument='tensor shapes', value=shapes
                )
        shape = tuple(max(shape[n] for shape in shapes) for n in range(len(shapes[0])))

    # Missing num_values, min/max_value!!!
    return dict(type=dtype, shape=shape)
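# Illustrative sketch (layer sizes and tensor names are hypothetical): a
# multi-input network specification in which 'retrieve' layers such as the one
# whose output spec is computed above gather and aggregate registered tensors.
def _example_retrieve_network_spec():
    return [
        [
            dict(type='retrieve', tensors=['observation']),
            dict(type='dense', size=32),
            dict(type='register', tensor='obs-embedding'),
        ],
        [
            dict(type='retrieve', tensors=['sensor']),
            dict(type='dense', size=32),
            dict(type='register', tensor='sensor-embedding'),
        ],
        [
            # Concatenate the two registered embeddings along the last axis
            dict(
                type='retrieve', tensors=['obs-embedding', 'sensor-embedding'],
                aggregation='concat'
            ),
            dict(type='dense', size=64),
        ],
    ]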
def __init__(
    # Required
    self, states, actions, memory, batch_size,
    # Environment
    max_episode_timesteps=None,
    # Network
    network='auto', use_beta_distribution=True,
    # Optimization
    update_frequency=1.0, start_updating=None, learning_rate=1e-3,
    # Reward estimation
    horizon=1, discount=0.99, return_processing=None, predict_terminal_values=False,
    # Critic
    critic='auto', critic_optimizer=1.0,
    # Preprocessing
    state_preprocessing='linear_normalization', reward_preprocessing=None,
    # Exploration
    exploration=0.1, variable_noise=0.0,
    # Regularization
    l2_regularization=0.0, entropy_regularization=0.0,
    # Parallel interactions
    parallel_interactions=1,
    # Config, saver, summarizer, tracking, recorder
    config=None, saver=None, summarizer=None, tracking=None, recorder=None,
    # Deprecated
    **kwargs
):
    if 'estimate_terminal' in kwargs:
        raise TensorforceError.deprecated(
            name='DPG', argument='estimate_terminal', replacement='predict_terminal_values'
        )
    if 'critic_network' in kwargs:
        raise TensorforceError.deprecated(
            name='DPG', argument='critic_network', replacement='critic'
        )

    self.spec = OrderedDict(
        agent='dpg',
        states=states, actions=actions, memory=memory, batch_size=batch_size,
        max_episode_timesteps=max_episode_timesteps,
        network=network, use_beta_distribution=use_beta_distribution,
        update_frequency=update_frequency, start_updating=start_updating,
        learning_rate=learning_rate,
        horizon=horizon, discount=discount, return_processing=return_processing,
        predict_terminal_values=predict_terminal_values,
        critic=critic, critic_optimizer=critic_optimizer,
        state_preprocessing=state_preprocessing, reward_preprocessing=reward_preprocessing,
        exploration=exploration, variable_noise=variable_noise,
        l2_regularization=l2_regularization, entropy_regularization=entropy_regularization,
        parallel_interactions=parallel_interactions,
        config=config, saver=saver, summarizer=summarizer, tracking=tracking,
        recorder=recorder
    )

    policy = dict(
        type='parametrized_distributions', network=network, temperature=0.0,
        use_beta_distribution=use_beta_distribution
    )
    memory = dict(type='replay', capacity=memory)
    update = dict(
        unit='timesteps', batch_size=batch_size, frequency=update_frequency,
        start=start_updating
    )
    optimizer = dict(type='adam', learning_rate=learning_rate)
    objective = 'deterministic_policy_gradient'
    reward_estimation = dict(
        horizon=horizon, discount=discount, predict_horizon_values='late',
        estimate_advantage=False, predict_action_values=True,
        predict_terminal_values=predict_terminal_values
    )
    baseline = dict(type='parametrized_action_value', network=critic)
    baseline_optimizer = critic_optimizer
    baseline_objective = dict(type='value', value='action')

    super().__init__(
        # Agent
        states=states, actions=actions, max_episode_timesteps=max_episode_timesteps,
        parallel_interactions=parallel_interactions, config=config, recorder=recorder,
        # TensorforceModel
        policy=policy, memory=memory, update=update, optimizer=optimizer,
        objective=objective, reward_estimation=reward_estimation, baseline=baseline,
        baseline_optimizer=baseline_optimizer, baseline_objective=baseline_objective,
        l2_regularization=l2_regularization, entropy_regularization=entropy_regularization,
        state_preprocessing=state_preprocessing, reward_preprocessing=reward_preprocessing,
        exploration=exploration, variable_noise=variable_noise,
        saver=saver, summarizer=summarizer, tracking=tracking, **kwargs
    )
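# Usage sketch (state/action specs and hyperparameters are illustrative): the
# DPG agent above requires a float action; bounds are given here since the
# default beta distribution needs a bounded action space.
def _example_dpg():
    from tensorforce import Agent

    agent = Agent.create(
        agent='dpg',
        states=dict(type='float', shape=(4,)),
        actions=dict(type='float', shape=(), min_value=-1.0, max_value=1.0),
        memory=10000, batch_size=64, critic='auto', critic_optimizer=1.0
    )
    agent.close()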
def __init__(self, *, name=None, action_spec=None, input_spec=None):
    assert action_spec.type == 'float' and action_spec.min_value is not None and \
        action_spec.max_value is not None

    parameters_spec = TensorsSpec(
        alpha=TensorSpec(type='float', shape=action_spec.shape),
        beta=TensorSpec(type='float', shape=action_spec.shape),
        alpha_beta=TensorSpec(type='float', shape=action_spec.shape),
        log_norm=TensorSpec(type='float', shape=action_spec.shape)
    )
    conditions_spec = TensorsSpec()

    super().__init__(
        name=name, action_spec=action_spec, input_spec=input_spec,
        parameters_spec=parameters_spec, conditions_spec=conditions_spec
    )

    if len(self.input_spec.shape) == 1:
        # Single embedding
        action_size = util.product(xs=self.action_spec.shape, empty=0)
        self.alpha = self.submodule(
            name='alpha', module='linear', modules=layer_modules, size=action_size,
            initialization_scale=0.01, input_spec=self.input_spec
        )
        self.beta = self.submodule(
            name='beta', module='linear', modules=layer_modules, size=action_size,
            initialization_scale=0.01, input_spec=self.input_spec
        )

    else:
        # Embedding per action
        if len(self.input_spec.shape) < 1 or len(self.input_spec.shape) > 3:
            raise TensorforceError.value(
                name=name, argument='input_spec.shape', value=self.input_spec.shape,
                hint='invalid rank'
            )
        if self.input_spec.shape[:-1] == self.action_spec.shape[:-1]:
            size = self.action_spec.shape[-1]
        elif self.input_spec.shape[:-1] == self.action_spec.shape:
            size = 0
        else:
            raise TensorforceError.value(
                name=name, argument='input_spec.shape', value=self.input_spec.shape,
                hint='not flattened and incompatible with action shape'
            )
        self.alpha = self.submodule(
            name='alpha', module='linear', modules=layer_modules, size=size,
            initialization_scale=0.01, input_spec=self.input_spec
        )
        self.beta = self.submodule(
            name='beta', module='linear', modules=layer_modules, size=size,
            initialization_scale=0.01, input_spec=self.input_spec
        )
def __init__(
    self, *, stddev_mode='predicted', bounded_transform='tanh', name=None, action_spec=None,
    input_spec=None
):
    assert action_spec.type == 'float'

    parameters_spec = TensorsSpec(
        mean=TensorSpec(type='float', shape=action_spec.shape),
        stddev=TensorSpec(type='float', shape=action_spec.shape),
        log_stddev=TensorSpec(type='float', shape=action_spec.shape)
    )
    conditions_spec = TensorsSpec()

    super().__init__(
        name=name, action_spec=action_spec, input_spec=input_spec,
        parameters_spec=parameters_spec, conditions_spec=conditions_spec
    )

    self.stddev_mode = stddev_mode

    if bounded_transform is None:
        bounded_transform = 'tanh'
    if bounded_transform not in ('clipping', 'tanh'):
        raise TensorforceError.value(
            name='Gaussian', argument='bounded_transform', value=bounded_transform,
            hint='not in {clipping,tanh}'
        )
    elif bounded_transform == 'tanh' and (
        (self.action_spec.min_value is not None) is not
        (self.action_spec.max_value is not None)
    ):
        raise TensorforceError.value(
            name='Gaussian', argument='bounded_transform', value=bounded_transform,
            condition='one-sided bounded action space'
        )
    elif self.action_spec.min_value is None and self.action_spec.max_value is None:
        bounded_transform = None
    self.bounded_transform = bounded_transform

    if self.input_spec.rank == 1:
        # Single embedding
        self.mean = self.submodule(
            name='mean', module='linear', modules=layer_modules,
            size=self.action_spec.size, initialization_scale=0.01,
            input_spec=self.input_spec
        )
        if self.stddev_mode == 'predicted':
            self.softplus_stddev = self.submodule(
                name='softplus_stddev', module='linear', modules=layer_modules,
                size=self.action_spec.size, initialization_scale=0.01,
                input_spec=self.input_spec
            )

    else:
        # Embedding per action
        if self.input_spec.rank < 1 or self.input_spec.rank > 3:
            raise TensorforceError.value(
                name=name, argument='input_spec.shape', value=self.input_spec.shape,
                hint='invalid rank'
            )
        elif self.input_spec.shape[:-1] == self.action_spec.shape[:-1]:
            size = self.action_spec.shape[-1]
        elif self.input_spec.shape[:-1] == self.action_spec.shape:
            size = 0
        else:
            raise TensorforceError.value(
                name=name, argument='input_spec.shape', value=self.input_spec.shape,
                hint='not flattened and incompatible with action shape'
            )
        self.mean = self.submodule(
            name='mean', module='linear', modules=layer_modules, size=size,
            initialization_scale=0.01, input_spec=self.input_spec
        )
        if self.stddev_mode == 'predicted':
            self.softplus_stddev = self.submodule(
                name='softplus_stddev', module='linear', modules=layer_modules, size=size,
                initialization_scale=0.01, input_spec=self.input_spec
            )
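# Usage sketch (assumption: 'gaussian' is the registered keyword for the
# distribution class above; the policy layout is illustrative): selecting this
# distribution with explicit constructor options via a policy specification.
def _example_gaussian_policy_spec():
    return dict(
        network='auto',
        distributions=dict(
            float=dict(type='gaussian', stddev_mode='predicted', bounded_transform='tanh')
        )
    )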
def create(
    environment=None, max_episode_timesteps=None, reward_shaping=None, remote=None,
    blocking=False, host=None, port=None, **kwargs
):
    """
    Creates an environment from a specification. In case of "socket-server" remote mode, runs
    environment in server communication loop until closed.

    Args:
        environment (specification | Environment class/object): JSON file, specification key,
            configuration dictionary, library module, `Environment` class/object, or gym.Env
            (<span style="color:#C00000"><b>required</b></span>, invalid for "socket-client"
            remote mode).
        max_episode_timesteps (int > 0): Maximum number of timesteps per episode, overwrites
            the environment default if defined
            (<span style="color:#00C000"><b>default</b></span>: environment default, invalid
            for "socket-client" remote mode).
        reward_shaping (callable[(s,a,t,r,s') -> r|(r,t)] | str): Reward shaping function
            mapping state, action, terminal, reward and next state to shaped reward and
            terminal, or a string expression with arguments "states", "actions", "terminal",
            "reward" and "next_states", e.g. "-1.0 if terminal else max(reward, 0.0)"
            (<span style="color:#00C000"><b>default</b></span>: no reward shaping).
        remote ("multiprocessing" | "socket-client" | "socket-server"): Communication mode
            for remote or parallelized environment execution; "socket-client" mode requires a
            corresponding "socket-server" running, and "socket-server" mode runs environment
            in server communication loop until closed
            (<span style="color:#00C000"><b>default</b></span>: local execution).
        blocking (bool): Whether remote environment calls should be blocking
            (<span style="color:#00C000"><b>default</b></span>: not blocking, invalid unless
            "multiprocessing" or "socket-client" remote mode).
        host (str): Socket server hostname or IP address
            (<span style="color:#C00000"><b>required</b></span> only for "socket-client"
            remote mode).
        port (int): Socket server port
            (<span style="color:#C00000"><b>required</b></span> only for
            "socket-client/server" remote mode).
        kwargs: Additional arguments.
    """
    if remote not in ('multiprocessing', 'socket-client'):
        if blocking:
            raise TensorforceError.invalid(
                name='Environment.create', argument='blocking',
                condition='no multiprocessing/socket-client instance'
            )
    if remote not in ('socket-client', 'socket-server'):
        if host is not None:
            raise TensorforceError.invalid(
                name='Environment.create', argument='host', condition='no socket instance'
            )
        elif port is not None:
            raise TensorforceError.invalid(
                name='Environment.create', argument='port', condition='no socket instance'
            )

    if remote == 'multiprocessing':
        from tensorforce.environments import MultiprocessingEnvironment
        environment = MultiprocessingEnvironment(
            blocking=blocking, environment=environment,
            max_episode_timesteps=max_episode_timesteps, reward_shaping=reward_shaping,
            **kwargs
        )
        return environment

    elif remote == 'socket-client':
        if environment is not None:
            raise TensorforceError.invalid(
                name='Environment.create', argument='environment',
                condition='socket-client instance'
            )
        elif max_episode_timesteps is not None:
            raise TensorforceError.invalid(
                name='Environment.create', argument='max_episode_timesteps',
                condition='socket-client instance'
            )
        elif len(kwargs) > 0:
            raise TensorforceError.invalid(
                name='Environment.create', argument='kwargs',
                condition='socket-client instance'
            )
        from tensorforce.environments import SocketEnvironment
        environment = SocketEnvironment(host=host, port=port, blocking=blocking)
        return environment

    elif remote == 'socket-server':
        from tensorforce.environments import SocketEnvironment
        SocketEnvironment.remote(
            port=port, environment=environment,
            max_episode_timesteps=max_episode_timesteps, reward_shaping=reward_shaping,
            **kwargs
        )

    elif remote is not None:
        raise TensorforceError.value(
            name='Environment.create', argument='remote', value=remote
        )

    elif isinstance(environment, (EnvironmentWrapper, RemoteEnvironment)):
        if max_episode_timesteps is not None and \
                max_episode_timesteps != environment.max_episode_timesteps():
            raise TensorforceError(
                message='Environment argument max_episode_timesteps has been specified twice '
                        'with different values: {} != {}.'.format(
                            max_episode_timesteps, environment.max_episode_timesteps()
                        )
            )
        if len(kwargs) > 0:
            raise TensorforceError.invalid(
                name='Environment.create', argument='kwargs',
                condition='EnvironmentWrapper instance'
            )
        return environment

    elif isinstance(environment, type) and \
            issubclass(environment, (EnvironmentWrapper, RemoteEnvironment)):
        raise TensorforceError.type(
            name='Environment.create', argument='environment', dtype=type(environment)
        )

    elif isinstance(environment, Environment):
        return EnvironmentWrapper(
            environment=environment, max_episode_timesteps=max_episode_timesteps,
            reward_shaping=reward_shaping
        )

    elif isinstance(environment, type) and issubclass(environment, Environment):
        environment = environment(**kwargs)
        assert isinstance(environment, Environment)
        return Environment.create(
            environment=environment, max_episode_timesteps=max_episode_timesteps,
            reward_shaping=reward_shaping
        )

    elif isinstance(environment, dict):
        # Dictionary specification
        util.deep_disjoint_update(target=kwargs, source=environment)
        environment = kwargs.pop('environment', kwargs.pop('type', 'default'))
        assert environment is not None
        if max_episode_timesteps is None:
            max_episode_timesteps = kwargs.pop('max_episode_timesteps', None)
        if reward_shaping is None:
            reward_shaping = kwargs.pop('reward_shaping', None)
        return Environment.create(
            environment=environment, max_episode_timesteps=max_episode_timesteps,
            reward_shaping=reward_shaping, **kwargs
        )

    elif isinstance(environment, str):
        if os.path.isfile(environment):
            # JSON file specification
            with open(environment, 'r') as fp:
                environment = json.load(fp=fp)
            util.deep_disjoint_update(target=kwargs, source=environment)
            environment = kwargs.pop('environment', kwargs.pop('type', 'default'))
            assert environment is not None
            if max_episode_timesteps is None:
                max_episode_timesteps = kwargs.pop('max_episode_timesteps', None)
            if reward_shaping is None:
                reward_shaping = kwargs.pop('reward_shaping', None)
            return Environment.create(
                environment=environment, max_episode_timesteps=max_episode_timesteps,
                reward_shaping=reward_shaping, **kwargs
            )
        elif environment in tensorforce.environments.environments:
            # Keyword specification
            environment = tensorforce.environments.environments[environment]
            return Environment.create(
                environment=environment, max_episode_timesteps=max_episode_timesteps,
                reward_shaping=reward_shaping, **kwargs
            )
        else:
            # Library specification
            import gym
            _environment = util.try_import_module(
                module=environment, parent_class=(Environment, gym.Env)
            )
            if _environment is not None:
                return Environment.create(
                    environment=_environment, max_episode_timesteps=max_episode_timesteps,
                    reward_shaping=reward_shaping, **kwargs
                )
            # Default: OpenAI Gym
            try:
                return Environment.create(
                    environment='gym', level=environment,
                    max_episode_timesteps=max_episode_timesteps,
                    reward_shaping=reward_shaping, **kwargs
                )
            except TensorforceError:
                raise TensorforceError.value(
                    name='Environment.create', argument='environment', value=environment
                )

    else:
        # Default: OpenAI Gym
        import gym
        if isinstance(environment, gym.Env) or \
                (isinstance(environment, type) and issubclass(environment, gym.Env)):
            return Environment.create(
                environment='gym', level=environment,
                max_episode_timesteps=max_episode_timesteps, reward_shaping=reward_shaping,
                **kwargs
            )
        else:
            raise TensorforceError.type(
                name='Environment.create', argument='environment', dtype=type(environment)
            )
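# Illustrative sketch (the level names are examples): equivalent ways to specify
# an environment, all resolved by Environment.create() above.
def _example_environment_specs():
    from tensorforce import Environment

    # Keyword + level (plain level strings also fall through to the OpenAI Gym default)
    env = Environment.create(
        environment='gym', level='CartPole-v1', max_episode_timesteps=500
    )
    env.close()

    # Dictionary specification, type via the 'environment' or 'type' key
    env = Environment.create(
        environment=dict(environment='gym', level='CartPole-v1'), max_episode_timesteps=500
    )
    env.close()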
def __init__(self, agent, environment=None, num_parallel=None, environments=None,
             max_episode_timesteps=None, evaluation_environment=None, save_best_agent=None):
    self.environments = list()
    if environment is None:
        assert num_parallel is None and environments is not None
        if not util.is_iterable(x=environments):
            raise TensorforceError.type(
                name='parallel-runner', argument='environments', dtype=type(environments)
            )
        elif len(environments) == 0:
            raise TensorforceError.value(
                name='parallel-runner', argument='environments', value=environments
            )
        num_parallel = len(environments)
        environment = environments[0]
        self.is_environment_external = isinstance(environment, Environment)
        environment = Environment.create(
            environment=environment, max_episode_timesteps=max_episode_timesteps
        )
        states = environment.states()
        actions = environment.actions()
        self.environments.append(environment)
        for environment in environments[1:]:
            assert isinstance(environment, Environment) == self.is_environment_external
            environment = Environment.create(
                environment=environment, max_episode_timesteps=max_episode_timesteps
            )
            assert environment.states() == states
            assert environment.actions() == actions
            self.environments.append(environment)
    else:
        assert num_parallel is not None and environments is None
        assert not isinstance(environment, Environment)
        self.is_environment_external = False
        for _ in range(num_parallel):
            environment = Environment.create(
                environment=environment, max_episode_timesteps=max_episode_timesteps
            )
            self.environments.append(environment)

    if evaluation_environment is None:
        self.evaluation_environment = None
    else:
        self.is_eval_environment_external = isinstance(evaluation_environment, Environment)
        self.evaluation_environment = Environment.create(
            environment=evaluation_environment, max_episode_timesteps=max_episode_timesteps
        )
        assert self.evaluation_environment.states() == environment.states()
        assert self.evaluation_environment.actions() == environment.actions()

    self.is_agent_external = isinstance(agent, Agent)
    kwargs = dict(parallel_interactions=num_parallel)
    self.agent = Agent.create(agent=agent, environment=environment, **kwargs)
    self.save_best_agent = save_best_agent

    self.episode_rewards = list()
    self.episode_timesteps = list()
    self.episode_seconds = list()
    self.episode_agent_seconds = list()
    self.evaluation_rewards = list()
    self.evaluation_timesteps = list()
    self.evaluation_seconds = list()
    self.evaluation_agent_seconds = list()
def start_execute(self, actions):
    if self._expect_receive is not None:
        raise TensorforceError.unexpected()
    self._expect_receive = 'execute'
    assert self._actions is None
    self._actions = actions
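# Illustrative sketch (not part of the library source) of the non-blocking remote-environment
# protocol assumed by start_execute: a call to start_execute(actions) is later paired with
# polling receive_execute(), which returns None until an observation is available -- the same
# pattern the runner loop further below relies on.
def _example_start_receive_execute(environment, actions):
    import time

    environment.start_execute(actions=actions)
    while True:
        observation = environment.receive_execute()
        if observation is not None:
            break
        time.sleep(0.001)  # environment not ready yet
    states, terminal, reward = observation
    return states, terminal, reward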
def load(directory=None, filename=None, format=None, environment=None, **kwargs):
    """
    Restores an agent from a directory/file.

    Args:
        directory (str): Checkpoint directory
            (<span style="color:#C00000"><b>required</b></span>, unless saver is specified).
        filename (str): Checkpoint filename, with or without append and extension
            (<span style="color:#00C000"><b>default</b></span>: "agent").
        format ("checkpoint" | "saved-model" | "numpy" | "hdf5"): File format, "saved-model"
            loads an act-only agent based on a Protobuf model
            (<span style="color:#00C000"><b>default</b></span>: format matching directory and
            filename, required to be unambiguous).
        environment (Environment object): Environment which the agent is supposed to be trained
            on, environment-related arguments like state/action space specifications and
            maximum episode length will be extracted if given
            (<span style="color:#C00000"><b>recommended</b></span>).
        kwargs: Additional agent arguments.
    """
    if directory is not None:
        if filename is None:
            filename = 'agent'
        agent = os.path.join(directory, os.path.splitext(filename)[0] + '.json')
        if not os.path.isfile(agent) and agent[agent.rfind('-') + 1: -5].isdigit():
            agent = agent[:agent.rindex('-')] + '.json'
        if os.path.isfile(agent):
            with open(agent, 'r') as fp:
                agent = json.load(fp=fp)
            if 'agent' in kwargs:
                if 'agent' in agent and agent['agent'] != kwargs['agent']:
                    raise TensorforceError.value(
                        name='Agent.load', argument='agent', value=kwargs['agent']
                    )
                agent['agent'] = kwargs.pop('agent')
        else:
            agent = kwargs
            kwargs = dict()
    else:
        agent = kwargs
        kwargs = dict()

    # Overwrite values
    if environment is not None and environment.max_episode_timesteps() is not None:
        if 'max_episode_timesteps' in kwargs:
            assert kwargs['max_episode_timesteps'] >= environment.max_episode_timesteps()
            agent['max_episode_timesteps'] = kwargs['max_episode_timesteps']
        else:
            agent['max_episode_timesteps'] = environment.max_episode_timesteps()
    if 'parallel_interactions' in kwargs and kwargs['parallel_interactions'] > 1:
        agent['parallel_interactions'] = kwargs['parallel_interactions']

    agent.pop('internals', None)
    agent.pop('initial_internals', None)

    # If a saver is specified, restoring happens via the saver instead of an explicit restore
    saver_restore = False
    if 'saver' in agent and isinstance(agent['saver'], dict):
        if not agent['saver'].get('load', True):
            raise TensorforceError.value(
                name='Agent.load', argument='saver[load]', value=agent['saver']['load']
            )
        agent['saver'] = dict(agent['saver'])
        agent['saver']['load'] = True
        saver_restore = True
    elif 'saver' in kwargs and isinstance(kwargs['saver'], dict):
        if not kwargs['saver'].get('load', True):
            raise TensorforceError.value(
                name='Agent.load', argument='saver[load]', value=kwargs['saver']['load']
            )
        kwargs['saver'] = dict(kwargs['saver'])
        kwargs['saver']['load'] = True
        saver_restore = True

    agent = Agent.create(agent=agent, environment=environment, **kwargs)
    if not saver_restore:
        agent.restore(directory=directory, filename=filename, format=format)
    return agent
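# Illustrative usage sketch (not part of the library source; directory/filename values are
# placeholders): save a trained agent, then restore it with Agent.load as documented above.
def _example_agent_load(agent, environment):
    from tensorforce.agents import Agent

    agent.save(directory='checkpoints', filename='agent', format='numpy')
    agent.close()
    # Passing the environment lets load() extract state/action specs and episode length
    return Agent.load(
        directory='checkpoints', filename='agent', format='numpy', environment=environment
    )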
def execute(self, actions):
    if self._timestep is None:
        raise TensorforceError(
            message="An environment episode has to be initialized by calling reset() first."
        )
    assert self._max_episode_timesteps is None or self._timestep < self._max_episode_timesteps

    if self._num_parallel is None:
        states, terminal, reward = self._environment.execute(actions=actions)
    else:
        parallel, states, terminal, reward = self._environment.execute(actions=actions)

    if self._execute_output_check:
        self._check_states_output(states=states, function='execute')
        if self._num_parallel is None:
            if isinstance(reward, (np.generic, np.ndarray)):
                reward = reward.item()
            if isinstance(terminal, (np.generic, np.ndarray)):
                terminal = terminal.item()
            if not isinstance(terminal, bool) and \
                    (not isinstance(terminal, int) or terminal < 0 or terminal > 2):
                raise TensorforceError(
                    'Environment.execute: invalid value {} for terminal.'.format(terminal)
                )
            if not isinstance(reward, (float, int)):
                raise TensorforceError(
                    'Environment.execute: invalid type {} for reward.'.format(type(reward))
                )
        else:
            TensorSpec(type='int', shape=(), num_values=self._num_parallel).np_assert(
                x=parallel, batched=True,
                message='Environment.execute: invalid {issue} for parallel.'
            )
            TensorSpec(type='bool', shape=()).np_assert(
                x=terminal, batched=True,
                message='Environment.execute: invalid {issue} for terminal.'
            )
            TensorSpec(type='float', shape=()).np_assert(
                x=reward, batched=True,
                message='Environment.execute: invalid {issue} for reward.'
            )
        self._execute_output_check = False

    if self._reward_shaping is not None:
        if isinstance(self._reward_shaping, str):
            reward = eval(
                self._reward_shaping, dict(),
                dict(
                    states=self._previous_states, actions=actions, terminal=terminal,
                    reward=reward, next_states=states, math=math, np=np, random=random
                )
            )
        else:
            reward = self._reward_shaping(
                self._previous_states, actions, terminal, reward, states
            )
        if isinstance(reward, tuple):
            reward, terminal = reward
        if isinstance(reward, (np.generic, np.ndarray)):
            reward = reward.item()
        if isinstance(terminal, (np.generic, np.ndarray)):
            terminal = terminal.item()
    self._previous_states = states

    self._timestep += 1
    if self._num_parallel is None:
        terminal = int(terminal)
        if terminal == 0 and self._max_episode_timesteps is not None and \
                self._timestep >= self._max_episode_timesteps:
            terminal = 2
        if terminal > 0:
            self._timestep = None
        return states, terminal, reward
    else:
        terminal = terminal.astype(util.np_dtype('int'))
        if (terminal == 0).any() and self._max_episode_timesteps is not None and \
                self._timestep >= self._max_episode_timesteps:
            terminal = np.where(terminal == 0, 2, terminal)
            parallel = parallel[:0]
            states = None
        if (terminal > 0).all():
            self._timestep = None
        return parallel, states, terminal, reward
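# Illustrative sketch (not part of the library source) of the two reward_shaping forms the
# execute() wrapper above evaluates: a Python expression string over (states, actions,
# terminal, reward, next_states), or a callable with the same five arguments, optionally
# returning a (reward, terminal) tuple. Level and shaping values below are example choices.
def _example_reward_shaping():
    from tensorforce.environments import Environment

    # Expression string, evaluated via eval() as in execute() above
    env1 = Environment.create(
        environment='gym', level='CartPole-v1', reward_shaping='reward - 0.1'
    )

    # Callable form; returning a tuple also overrides terminal
    def shaping(states, actions, terminal, reward, next_states):
        return 2.0 * reward, terminal

    env2 = Environment.create(
        environment='gym', level='CartPole-v1', reward_shaping=shaping
    )
    return env1, env2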
def create(agent='tensorforce', environment=None, **kwargs):
    """
    Creates an agent from a specification.

    Args:
        agent (specification | Agent class/object | lambda[states -> actions]): JSON file,
            specification key, configuration dictionary, library module, or `Agent`
            class/object. Alternatively, an act-function mapping states to actions which is
            supposed to be recorded
            (<span style="color:#00C000"><b>default</b></span>: Tensorforce base agent).
        environment (Environment object): Environment which the agent is supposed to be trained
            on, environment-related arguments like state/action space specifications and
            maximum episode length will be extracted if given
            (<span style="color:#C00000"><b>recommended</b></span>).
        kwargs: Additional agent arguments.
    """
    if isinstance(agent, Recorder):
        if environment is not None:
            # TODO:
            # assert agent.spec['states'] == environment.states()
            # assert agent.spec['actions'] == environment.actions()
            # assert environment.max_episode_timesteps() is None or \
            #     agent.spec['max_episode_timesteps'] >= environment.max_episode_timesteps()
            pass

        for key, value in kwargs.items():
            if key == 'parallel_interactions':
                assert agent.spec[key] >= value
            else:
                assert agent.spec[key] == value

        if agent.is_initialized:
            agent.reset()
        else:
            agent.initialize()

        return agent

    elif (isinstance(agent, type) and issubclass(agent, Agent)) or callable(agent):
        # Type specification, or Recorder
        if environment is not None:
            if 'states' in kwargs:
                # TODO:
                # assert kwargs['states'] == environment.states()
                pass
            else:
                kwargs['states'] = environment.states()
            if 'actions' in kwargs:
                # assert kwargs['actions'] == environment.actions()
                pass
            else:
                kwargs['actions'] = environment.actions()
            if environment.max_episode_timesteps() is None:
                pass
            elif 'max_episode_timesteps' in kwargs:
                # assert kwargs['max_episode_timesteps'] >= environment.max_episode_timesteps()
                pass
            else:
                kwargs['max_episode_timesteps'] = environment.max_episode_timesteps()

        if isinstance(agent, type) and issubclass(agent, Agent):
            agent = agent(**kwargs)
            assert isinstance(agent, Agent)
        else:
            if 'recorder' not in kwargs:
                raise TensorforceError.required(name='Recorder', argument='recorder')
            agent = Recorder(fn_act=agent, **kwargs)

        return Agent.create(agent=agent, environment=environment)

    elif isinstance(agent, dict):
        # Dictionary specification
        agent.update(kwargs)
        kwargs = dict(agent)
        agent = kwargs.pop('agent', kwargs.pop('type', 'default'))
        return Agent.create(agent=agent, environment=environment, **kwargs)

    elif isinstance(agent, str):
        if os.path.isfile(agent):
            # JSON file specification
            with open(agent, 'r') as fp:
                agent = json.load(fp=fp)
            return Agent.create(agent=agent, environment=environment, **kwargs)
        elif '.' in agent:
            # Library specification
            library_name, module_name = agent.rsplit('.', 1)
            library = importlib.import_module(name=library_name)
            agent = getattr(library, module_name)
            return Agent.create(agent=agent, environment=environment, **kwargs)
        elif agent in tensorforce.agents.agents:
            # Keyword specification
            agent = tensorforce.agents.agents[agent]
            return Agent.create(agent=agent, environment=environment, **kwargs)
        else:
            raise TensorforceError.value(name='Agent.create', argument='agent', value=agent)

    else:
        raise TensorforceError.type(name='Agent.create', argument='agent', dtype=type(agent))
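# Illustrative usage sketch (not part of the library source; 'ppo' and batch_size are example
# values): the dictionary branch of Agent.create above pops 'agent'/'type' and recurses, so the
# two calls below are equivalent.
def _example_agent_create(environment):
    from tensorforce.agents import Agent

    # Keyword specification plus kwargs
    agent = Agent.create(agent='ppo', environment=environment, batch_size=10)

    # Equivalent dictionary specification
    agent = Agent.create(agent=dict(agent='ppo', batch_size=10), environment=environment)
    return agent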
def specs_from_gym_space(space, ignore_value_bounds):
    import gym

    if isinstance(space, gym.spaces.Discrete):
        return dict(type='int', shape=(), num_values=space.n)

    elif isinstance(space, gym.spaces.MultiBinary):
        return dict(type='bool', shape=space.n)

    elif isinstance(space, gym.spaces.MultiDiscrete):
        num_discrete_space = len(space.nvec)
        if (space.nvec == space.nvec[0]).all():
            return dict(type='int', shape=num_discrete_space, num_values=space.nvec[0])
        else:
            specs = dict()
            for n in range(num_discrete_space):
                specs['gymmdc{}'.format(n)] = dict(
                    type='int', shape=(), num_values=space.nvec[n]
                )
            return specs

    elif isinstance(space, gym.spaces.Box):
        if ignore_value_bounds:
            return dict(type='float', shape=space.shape)
        elif (space.low == space.low[0]).all() and (space.high == space.high[0]).all():
            return dict(
                type='float', shape=space.shape, min_value=space.low[0],
                max_value=space.high[0]
            )
        else:
            specs = dict()
            low = space.low.flatten()
            high = space.high.flatten()
            for n in range(low.shape[0]):
                specs['gymbox{}'.format(n)] = dict(
                    type='float', shape=(), min_value=low[n], max_value=high[n]
                )
            return specs

    elif isinstance(space, gym.spaces.Tuple):
        specs = dict()
        for n, space in enumerate(space.spaces):
            spec = OpenAIGym.specs_from_gym_space(
                space=space, ignore_value_bounds=ignore_value_bounds
            )
            if 'type' in spec:
                specs['gymtpl{}'.format(n)] = spec
            else:
                for name, spec in spec.items():
                    specs['gymtpl{}-{}'.format(n, name)] = spec
        return specs

    elif isinstance(space, gym.spaces.Dict):
        specs = dict()
        for space_name, space in space.spaces.items():
            spec = OpenAIGym.specs_from_gym_space(
                space=space, ignore_value_bounds=ignore_value_bounds
            )
            if 'type' in spec:
                specs[space_name] = spec
            else:
                for name, spec in spec.items():
                    specs['{}-{}'.format(space_name, name)] = spec
        return specs

    else:
        raise TensorforceError('Unknown Gym space.')
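# Illustrative sketch (not part of the library source) of how specs_from_gym_space flattens a
# nested Gym space into named Tensorforce specs, using the naming scheme assigned above; the
# concrete space is an example.
def _example_specs_from_gym_space():
    import gym
    from tensorforce.environments import OpenAIGym

    space = gym.spaces.Dict({
        'position': gym.spaces.Box(low=-1.0, high=1.0, shape=(3,)),
        'option': gym.spaces.Discrete(4),
    })
    specs = OpenAIGym.specs_from_gym_space(space=space, ignore_value_bounds=False)
    # e.g. {'position': dict(type='float', shape=(3,), min_value=-1.0, max_value=1.0),
    #       'option': dict(type='int', shape=(), num_values=4)}
    return specs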
def restore(self, directory=None, filename=None, format=None):
    """
    Restores the agent from a checkpoint.

    Args:
        directory (str): Checkpoint directory
            (<span style="color:#C00000"><b>required</b></span>, unless "saved-model" format
            and saver specified).
        filename (str): Checkpoint filename, with or without append and extension
            (<span style="color:#C00000"><b>required</b></span>, unless "saved-model" format
            and saver specified).
        format ("checkpoint" | "numpy" | "hdf5"): File format
            (<span style="color:#00C000"><b>default</b></span>: format matching directory and
            filename, required to be unambiguous).
    """
    if not hasattr(self, 'model'):
        raise TensorforceError(message="Missing agent attribute model.")

    if not self.is_initialized:
        self.initialize()

    # Format implicitly given if file exists
    if format is None and os.path.isfile(os.path.join(directory, filename)):
        if '.data-' in filename:
            filename = filename[:filename.index('.data-')]
            format = 'checkpoint'
        elif filename.endswith('.npz'):
            filename = filename[:-4]
            format = 'numpy'
        elif filename.endswith('.hdf5'):
            filename = filename[:-5]
            format = 'hdf5'
        elif filename.endswith('.h5'):
            filename = filename[:-3]
            format = 'hdf5'
        else:
            assert False
    elif format is None and os.path.isfile(os.path.join(directory, filename + '.index')):
        format = 'checkpoint'
    elif format is None and os.path.isfile(os.path.join(directory, filename + '.npz')):
        format = 'numpy'
    elif format is None and (
        os.path.isfile(os.path.join(directory, filename + '.hdf5')) or
        os.path.isfile(os.path.join(directory, filename + '.h5'))
    ):
        format = 'hdf5'

    else:
        # Infer format (and latest numbered file) from directory contents
        found = None
        latest = -1
        for name in os.listdir(directory):
            if format in (None, 'numpy') and name == filename + '.npz':
                assert found is None
                found = 'numpy'
                latest = None
            elif format in (None, 'numpy') and name.startswith(filename) and \
                    name.endswith('.npz'):
                assert found is None or found == 'numpy'
                found = 'numpy'
                n = int(name[len(filename) + 1: -4])
                if n > latest:
                    latest = n
            elif format in (None, 'hdf5') and \
                    (name == filename + '.hdf5' or name == filename + '.h5'):
                assert found is None
                found = 'hdf5'
                latest = None
            elif format in (None, 'hdf5') and name.startswith(filename) and \
                    (name.endswith('.hdf5') or name.endswith('.h5')):
                assert found is None or found == 'hdf5'
                found = 'hdf5'
                n = int(name[len(filename) + 1: -5])
                if n > latest:
                    latest = n

        if latest == -1:
            if format is None:
                format = 'checkpoint'
            else:
                assert format == 'checkpoint'
            if filename is None or \
                    not os.path.isfile(os.path.join(directory, filename + '.index')):
                import tensorflow as tf
                path = tf.train.latest_checkpoint(checkpoint_dir=directory)
                if not path:
                    raise TensorforceError.exists_not(name='Checkpoint', value=directory)
                filename = os.path.basename(path)
        else:
            if format is None:
                format = found
            else:
                assert format == found
            if latest is not None:
                filename = filename + '-' + str(latest)

    self.timesteps, self.episodes, self.updates = self.model.restore(
        directory=directory, filename=filename, format=format
    )
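# Illustrative usage sketch (not part of the library source; paths are placeholders): with
# format=None, restore() infers the format from the files found in the directory, as above.
def _example_restore(agent):
    agent.save(directory='model-checkpoint', filename='agent', format='checkpoint')
    # Format inferred from directory contents where unambiguous
    agent.restore(directory='model-checkpoint', filename='agent')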
def variable(self, *, name, spec, initializer, is_trainable, is_saved,
             initialization_scale=None):
    assert self.is_initialized is False

    # name
    if not isinstance(name, str):
        raise TensorforceError.type(name='variable', argument='name', dtype=type(name))
    name = name.replace('/', '_')

    # spec
    if not isinstance(spec, TensorSpec):
        raise TensorforceError.type(name='variable', argument='spec', dtype=type(spec))
    if spec.is_underspecified():
        raise TensorforceError.value(
            name='variable', argument='spec', value=spec, hint='underspecified'
        )

    # initializer
    initializer_names = (
        'constant', 'normal', 'normal-relu', 'ones', 'orthogonal', 'orthogonal-relu', 'zeros'
    )
    if not isinstance(initializer, (spec.py_type(), np.ndarray, tf.Tensor)) and \
            initializer not in initializer_names:
        raise TensorforceError.value(name='variable', argument='initializer', value=initializer)
    elif isinstance(initializer, np.ndarray) and initializer.dtype != spec.np_type():
        raise TensorforceError.type(
            name='variable', argument='initializer', dtype=initializer.dtype
        )
    elif isinstance(initializer, tf.Tensor) and tf_util.dtype(x=initializer) != spec.tf_type():
        raise TensorforceError.type(
            name='variable', argument='initializer', dtype=tf_util.dtype(x=initializer)
        )

    # initialization_scale
    if initialization_scale is not None:
        if isinstance(initializer, (spec.py_type(), np.ndarray, tf.Tensor)) or \
                initializer not in ('constant', 'orthogonal', 'orthogonal-relu'):
            raise TensorforceError.invalid(
                name='variable', argument='initialization_scale',
                condition='initializer not orthogonal'
            )
        elif not isinstance(initialization_scale, spec.py_type()):
            raise TensorforceError.type(
                name='variable', argument='initialization_scale',
                dtype=type(initialization_scale), hint='!= float'
            )

    # is_trainable
    if not isinstance(is_trainable, bool):
        raise TensorforceError.type(
            name='variable', argument='is_trainable', dtype=type(is_trainable)
        )
    elif is_trainable and spec.type != 'float':
        raise TensorforceError.value(
            name='variable', argument='is_trainable', value=is_trainable,
            condition='spec.type != float'
        )

    # is_saved
    if not isinstance(is_saved, bool):
        raise TensorforceError.type(name='variable', argument='is_saved', dtype=type(is_saved))

    # Variable initializer
    if isinstance(initializer, spec.py_type()):
        initializer = tf_util.constant(value=initializer, dtype=spec.type, shape=spec.shape)
    elif isinstance(initializer, np.ndarray):
        if initializer.shape != spec.shape:
            raise TensorforceError.mismatch(
                name='Module.variable', value1='shape', value2='initializer'
            )
        initializer = tf_util.constant(value=initializer, dtype=spec.type)
    elif isinstance(initializer, tf.Tensor):
        if tf_util.shape(x=initializer) != spec.shape:
            raise TensorforceError.mismatch(
                name='Module.variable', value1='shape', value2='initializer'
            )
    elif not isinstance(initializer, str):
        raise TensorforceError("Invalid variable initializer: {}".format(initializer))
    elif initializer.startswith('normal'):
        if spec.type != 'float':
            raise TensorforceError(
                message="Invalid variable initializer value for non-float variable: {}.".format(
                    initializer
                )
            )
        if initializer.endswith('-relu'):
            stddev = min(0.1, np.sqrt(2.0 / util.product(xs=spec.shape[:-1])))
        else:
            stddev = min(
                0.1, np.sqrt(2.0 / (util.product(xs=spec.shape[:-1]) + spec.shape[-1]))
            )
        initializer = tf.random.normal(shape=spec.shape, stddev=stddev, dtype=spec.tf_type())
    elif initializer.startswith('orthogonal'):
        if spec.type != 'float':
            raise TensorforceError(
                message="Invalid variable initializer value for non-float variable: {}.".format(
                    initializer
                )
            )
        if spec.rank < 2:
            raise TensorforceError(
                message="Invalid variable initializer value for 0/1-rank variable: {}.".format(
                    initializer
                )
            )
        normal = np.random.normal(size=(util.product(xs=spec.shape[:-1]), spec.shape[-1]))
        u, _, v = np.linalg.svd(a=normal, full_matrices=False)
        orthogonal = u if u.shape[1] == spec.shape[-1] else v
        if initializer.endswith('-relu'):
            orthogonal = orthogonal * np.sqrt(2.0)
        if initialization_scale is not None and initialization_scale != 1.0:
            if initialization_scale <= 0.0:
                raise TensorforceError.value(
                    name='variable', argument='initialization_scale',
                    value=initialization_scale, hint='<= 0.0'
                )
            orthogonal = orthogonal * initialization_scale
        initializer = tf_util.constant(value=orthogonal.reshape(spec.shape), dtype=spec.type)
    elif initializer == 'zeros':
        initializer = tf_util.zeros(shape=spec.shape, dtype=spec.type)
    elif initializer == 'ones':
        initializer = tf_util.ones(shape=spec.shape, dtype=spec.type)
    elif initializer == 'constant':
        initializer = tf.fill(
            dims=spec.shape,
            value=tf_util.constant(value=initialization_scale, dtype=spec.type)
        )

    # Variable
    variable = tf.Variable(
        initial_value=initializer, trainable=is_trainable, validate_shape=True, name=name,
        dtype=spec.tf_type(), shape=spec.shape
    )
    variable.is_saved = is_saved

    return variable
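# Minimal numpy-only sketch (not part of the library source) of the SVD-based orthogonal
# initialization used above: a random Gaussian matrix is factored, and the orthonormal factor
# matching the target shape is kept, so the smaller dimension ends up orthonormal.
def _example_orthogonal_init(fan_in, fan_out):
    import numpy as np

    normal = np.random.normal(size=(fan_in, fan_out))
    u, _, v = np.linalg.svd(a=normal, full_matrices=False)
    orthogonal = u if u.shape[1] == fan_out else v
    # Sanity check: the Gram matrix of the smaller dimension is the identity
    gram = orthogonal.T @ orthogonal if fan_in >= fan_out else orthogonal @ orthogonal.T
    assert np.allclose(gram, np.eye(min(fan_in, fan_out)), atol=1e-6)
    return orthogonal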
def prepare(
    self, environment=None, min_timesteps=None, states=None, actions=None,
    exclude_bool_action=False, exclude_int_action=False, exclude_float_action=False,
    exclude_bounded_action=False, require_observe=False, require_all=False, **agent
):
    """
    Generic unit-test preparation.
    """
    Layer.layers = None

    if environment is None:
        if states is None:
            states = deepcopy(self.__class__.states)

        if actions is None:
            actions = deepcopy(self.__class__.actions)
            if exclude_bool_action or self.__class__.exclude_bool_action:
                actions.pop('bool_action')
            if exclude_int_action or self.__class__.exclude_int_action:
                actions.pop('int_action')
            if exclude_float_action or self.__class__.exclude_float_action:
                actions.pop('float_action')
            if exclude_bounded_action or self.__class__.exclude_bounded_action:
                actions.pop('bounded_action')

        if min_timesteps is None:
            min_timesteps = self.__class__.min_timesteps

        environment = UnittestEnvironment(
            states=states, actions=actions, min_timesteps=min_timesteps
        )

    elif min_timesteps is not None:
        raise TensorforceError.unexpected()

    environment = Environment.create(environment=environment, max_episode_timesteps=5)

    for key, value in self.__class__.agent.items():
        if key not in agent:
            agent[key] = value

    if self.__class__.require_all or require_all:
        config = None
    elif self.__class__.require_observe or require_observe:
        config = dict(api_functions=['reset', 'act', 'observe'])
    else:
        config = dict(api_functions=['reset', 'act'])

    agent = Agent.create(agent=agent, environment=environment, config=config)

    return agent, environment
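# Illustrative sketch (not part of the library source) of how a unit test might use prepare():
# the act-observe loop follows the standard agent/environment interface, but the concrete test
# body is hypothetical.
def _example_unittest_usage(self):
    agent, environment = self.prepare(require_observe=True)

    states = environment.reset()
    terminal = False
    while not terminal:
        actions = agent.act(states=states)
        states, terminal, reward = environment.execute(actions=actions)
        agent.observe(terminal=terminal, reward=reward)

    agent.close()
    environment.close()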
def run(
    self,
    # General
    num_episodes=None, num_timesteps=None, num_updates=None,
    # Parallel
    batch_agent_calls=False, sync_timesteps=False, sync_episodes=False, num_sleep_secs=0.001,
    # Callback
    callback=None, callback_episode_frequency=None, callback_timestep_frequency=None,
    # Tqdm
    use_tqdm=True, mean_horizon=1,
    # Evaluation
    evaluation=False, save_best_agent=None, evaluation_callback=None
):
    """
    Run experiment.

    Args:
        num_episodes (int > 0): Number of episodes to run experiment, sum of episodes across
            all parallel/vectorized environment(s) / actors in a multi-actor environment
            (<span style="color:#00C000"><b>default</b></span>: no episode limit).
        num_timesteps (int > 0): Number of timesteps to run experiment, sum of timesteps across
            all parallel/vectorized environment(s) / actors in a multi-actor environment
            (<span style="color:#00C000"><b>default</b></span>: no timestep limit).
        num_updates (int > 0): Number of agent updates to run experiment
            (<span style="color:#00C000"><b>default</b></span>: no update limit).
        batch_agent_calls (bool): Whether to batch agent calls for parallel environment
            execution
            (<span style="color:#00C000"><b>default</b></span>: false, separate call per
            environment).
        sync_timesteps (bool): Whether to synchronize parallel environment execution on
            timestep-level, implied by batch_agent_calls
            (<span style="color:#00C000"><b>default</b></span>: false, unless batch_agent_calls
            is true).
        sync_episodes (bool): Whether to synchronize parallel environment execution on
            episode-level
            (<span style="color:#00C000"><b>default</b></span>: false).
        num_sleep_secs (float): Sleep duration if no environment is ready
            (<span style="color:#00C000"><b>default</b></span>: one millisecond).
        callback (callable[(Runner, parallel) -> bool]): Callback function taking the runner
            instance plus parallel index and returning a boolean value indicating whether
            execution should continue
            (<span style="color:#00C000"><b>default</b></span>: callback always true).
        callback_episode_frequency (int): Episode interval between callbacks
            (<span style="color:#00C000"><b>default</b></span>: every episode).
        callback_timestep_frequency (int): Timestep interval between callbacks
            (<span style="color:#00C000"><b>default</b></span>: not specified).
        use_tqdm (bool): Whether to display a tqdm progress bar for the experiment run
            (<span style="color:#00C000"><b>default</b></span>: true), with the following
            additional information (averaged over number of episodes given via mean_horizon):
            <ul>
            <li>return – cumulative episode return</li>
            <li>ts/ep – timesteps per episode</li>
            <li>sec/ep – seconds per episode</li>
            <li>ms/ts – milliseconds per timestep</li>
            <li>agent – percentage of time spent on agent computation</li>
            <li>comm – if remote environment execution, percentage of time spent on
            communication</li>
            </ul>
        mean_horizon (int): Number of episodes progress bar values and evaluation score are
            averaged over
            (<span style="color:#00C000"><b>default</b></span>: not averaged).
        evaluation (bool): Whether to run in evaluation mode, only valid if single environment
            (<span style="color:#00C000"><b>default</b></span>: no evaluation).
        save_best_agent (string): Directory to save the best version of the agent according to
            the evaluation score
            (<span style="color:#00C000"><b>default</b></span>: best agent is not saved).
        evaluation_callback (int | callable[Runner -> float]): Callback function taking the
            runner instance and returning an evaluation score
            (<span style="color:#00C000"><b>default</b></span>: cumulative evaluation return
            averaged over mean_horizon episodes).
    """
    # General
    if num_episodes is None:
        self.num_episodes = float('inf')
    else:
        self.num_episodes = num_episodes
    if num_timesteps is None:
        self.num_timesteps = float('inf')
    else:
        self.num_timesteps = num_timesteps
    if num_updates is None:
        self.num_updates = float('inf')
    else:
        self.num_updates = num_updates

    # Parallel
    if len(self.environments) == 1:
        condition = 'single environment'
    elif self.num_vectorized is not None:
        condition = 'vectorized environment'
    else:
        condition = None
    if condition is None:
        pass
    elif batch_agent_calls:
        raise TensorforceError.invalid(
            name='Runner.run', argument='batch_agent_calls', condition=condition
        )
    elif sync_timesteps:
        raise TensorforceError.invalid(
            name='Runner.run', argument='sync_timesteps', condition=condition
        )
    elif sync_episodes:
        raise TensorforceError.invalid(
            name='Runner.run', argument='sync_episodes', condition=condition
        )
    self.batch_agent_calls = batch_agent_calls or (self.num_vectorized is not None)
    self.sync_timesteps = sync_timesteps or self.batch_agent_calls
    self.sync_episodes = sync_episodes or (self.num_vectorized is not None)
    self.num_sleep_secs = num_sleep_secs
    if self.num_vectorized is None:
        self.num_environments = len(self.environments)
    else:
        self.num_environments = self.num_vectorized

    # Callback
    assert callback_episode_frequency is None or callback_timestep_frequency is None
    if callback_episode_frequency is None and callback_timestep_frequency is None:
        callback_episode_frequency = 1
    if callback_episode_frequency is None:
        self.callback_episode_frequency = float('inf')
    else:
        self.callback_episode_frequency = callback_episode_frequency
    if callback_timestep_frequency is None:
        self.callback_timestep_frequency = float('inf')
    else:
        self.callback_timestep_frequency = callback_timestep_frequency
    if callback is None:
        self.callback = (lambda r, p: True)
    elif util.is_iterable(x=callback):

        def sequential_callback(runner, parallel):
            result = True
            for fn in callback:
                x = fn(runner, parallel)
                if isinstance(result, bool):
                    result = result and x
            return result

        self.callback = sequential_callback
    else:

        def boolean_callback(runner, parallel):
            result = callback(runner, parallel)
            if isinstance(result, bool):
                return result
            else:
                return True

        self.callback = boolean_callback

    # Experiment statistics
    self.episode_returns = list()
    self.episode_timesteps = list()
    self.episode_seconds = list()
    self.episode_agent_seconds = list()
    if self.is_environment_remote:
        self.episode_env_seconds = list()
    if self.evaluation or evaluation:
        self.evaluation_returns = list()
        self.evaluation_timesteps = list()
        self.evaluation_seconds = list()
        self.evaluation_agent_seconds = list()
        if self.is_environment_remote:
            self.evaluation_env_seconds = list()
        if self.num_environments == 1:
            # for tqdm
            self.episode_returns = self.evaluation_returns
            self.episode_timesteps = self.evaluation_timesteps
            self.episode_seconds = self.evaluation_seconds
            self.episode_agent_seconds = self.evaluation_agent_seconds
            if self.is_environment_remote:
                self.episode_env_seconds = self.evaluation_env_seconds
    else:
        # for tqdm
        self.evaluation_returns = self.episode_returns
        self.evaluation_timesteps = self.episode_timesteps
        self.evaluation_seconds = self.episode_seconds
        self.evaluation_agent_seconds = self.episode_agent_seconds
        if self.is_environment_remote:
            self.evaluation_env_seconds = self.episode_env_seconds

    # Timestep/episode/update counter
    self.timesteps = 0
    self.episodes = 0
    self.updates = 0

    # Tqdm
    if use_tqdm:
        if hasattr(self, 'tqdm'):
            self.tqdm.close()

        assert self.num_episodes != float('inf') or self.num_timesteps != float('inf')
        inner_callback = self.callback

        if self.num_episodes != float('inf'):
            # Episode-based tqdm (default option if both num_episodes and num_timesteps set)
            assert self.num_episodes != float('inf')
            bar_format = (
                '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}, return={postfix[0]:.2f}, ts/ep='
                '{postfix[1]}, sec/ep={postfix[2]:.2f}, ms/ts={postfix[3]:.1f}, agent='
                '{postfix[4]:.1f}%]'
            )
            postfix = [0.0, 0, 0.0, 0.0, 0.0]
            if self.is_environment_remote:
                bar_format = bar_format[:-1] + ', comm={postfix[5]:.1f}%]'
                postfix.append(0.0)
            self.tqdm = tqdm(
                desc='Episodes', total=self.num_episodes, bar_format=bar_format,
                initial=self.episodes, postfix=postfix
            )
            self.tqdm_last_update = self.episodes

            def tqdm_callback(runner, parallel):
                if len(runner.evaluation_returns) > 0:
                    mean_return = float(np.mean(runner.evaluation_returns[-mean_horizon:]))
                    runner.tqdm.postfix[0] = mean_return
                if len(runner.episode_timesteps) > 0:
                    mean_ts_per_ep = int(np.mean(runner.episode_timesteps[-mean_horizon:]))
                    mean_sec_per_ep = float(np.mean(runner.episode_seconds[-mean_horizon:]))
                    mean_agent_sec = float(
                        np.mean(runner.episode_agent_seconds[-mean_horizon:])
                    )
                    try:
                        mean_ms_per_ts = mean_sec_per_ep * 1000.0 / mean_ts_per_ep
                    except ZeroDivisionError:
                        mean_ms_per_ts = 0.0
                    try:
                        mean_rel_agent = mean_agent_sec * 100.0 / mean_sec_per_ep
                    except ZeroDivisionError:
                        mean_rel_agent = 0.0
                    runner.tqdm.postfix[1] = mean_ts_per_ep
                    runner.tqdm.postfix[2] = mean_sec_per_ep
                    runner.tqdm.postfix[3] = mean_ms_per_ts
                    runner.tqdm.postfix[4] = mean_rel_agent
                    if runner.is_environment_remote and len(runner.episode_env_seconds) > 0:
                        mean_env_sec = float(
                            np.mean(runner.episode_env_seconds[-mean_horizon:])
                        )
                        mean_rel_comm = (mean_agent_sec + mean_env_sec) * 100.0 / mean_sec_per_ep
                        mean_rel_comm = 100.0 - mean_rel_comm
                        runner.tqdm.postfix[5] = mean_rel_comm
                runner.tqdm.update(n=(runner.episodes - runner.tqdm_last_update))
                runner.tqdm_last_update = runner.episodes
                return inner_callback(runner, parallel)

        else:
            # Timestep-based tqdm
            self.tqdm = tqdm(
                desc='Timesteps', total=self.num_timesteps, initial=self.timesteps,
                postfix=dict(mean_return='n/a')
            )
            self.tqdm_last_update = self.timesteps

            def tqdm_callback(runner, parallel):
                # sum_timesteps_return = sum(runner.timestep_returns[num_mean_return:])
                # num_timesteps = min(num_mean_return, runner.evaluation_timestep)
                # mean_return = sum_timesteps_return / num_episodes
                runner.tqdm.set_postfix(mean_return='n/a')
                runner.tqdm.update(n=(runner.timesteps - runner.tqdm_last_update))
                runner.tqdm_last_update = runner.timesteps
                return inner_callback(runner, parallel)

        self.callback = tqdm_callback

    # Evaluation
    if evaluation and self.num_environments > 1:
        raise TensorforceError.invalid(
            name='Runner.run', argument='evaluation', condition='parallel environments'
        )
    self.evaluation_run = self.evaluation or evaluation
    self.save_best_agent = save_best_agent
    if evaluation_callback is None:
        self.evaluation_callback = (lambda r: None)
    else:
        self.evaluation_callback = evaluation_callback
    if self.save_best_agent is not None:
        inner_evaluation_callback = self.evaluation_callback

        def mean_return_callback(runner):
            result = inner_evaluation_callback(runner)
            if result is None:
                return float(np.mean(runner.evaluation_returns[-mean_horizon:]))
            else:
                return result

        self.evaluation_callback = mean_return_callback
        self.best_evaluation_score = None

    # Episode statistics
    self.episode_return = [0.0 for _ in range(self.num_environments)]
    self.episode_timestep = [0 for _ in range(self.num_environments)]
    # if self.batch_agent_calls:
    #     self.episode_agent_second = 0.0
    #     self.episode_start = time.time()
    if self.evaluation_run:
        self.episode_agent_second = [0.0 for _ in range(self.num_environments - 1)]
        self.episode_start = [time.time() for _ in range(self.num_environments - 1)]
    else:
        self.episode_agent_second = [0.0 for _ in range(self.num_environments)]
        self.episode_start = [time.time() for _ in range(self.num_environments)]
    self.evaluation_agent_second = 0.0
    self.evaluation_start = time.time()

    # Values
    self.terminate = 0
    self.prev_terminals = [-1 for _ in range(self.num_environments)]
    self.states = [None for _ in range(self.num_environments)]
    self.terminals = [None for _ in range(self.num_environments)]
    self.rewards = [None for _ in range(self.num_environments)]
    if self.evaluation_run:
        self.evaluation_internals = self.agent.initial_internals()

    # Required if agent was previously stopped mid-episode
    self.agent.reset()

    # Reset environments
    if self.num_vectorized is None:
        for environment in self.environments:
            environment.start_reset()
    else:
        if self.environments[0].is_vectorizable():
            parallel, states = self.environments[0].reset(num_parallel=self.num_vectorized)
        else:
            parallel, states = self.environments[0].reset()
        for i, n in enumerate(parallel):
            self.states[n] = states[i]
            self.prev_terminals[n] = -2

    # Runner loop
    while any(terminal <= 0 for terminal in self.prev_terminals):
        self.terminals = [None for _ in self.terminals]

        if self.batch_agent_calls:
            if self.num_vectorized is None:
                # Retrieve observations (only if not already terminated)
                while any(terminal is None for terminal in self.terminals):
                    for n in range(self.num_environments):
                        if self.terminals[n] is not None:
                            # Already received
                            continue
                        elif self.prev_terminals[n] <= 0:
                            # Receive if not terminal
                            observation = self.environments[n].receive_execute()
                            if observation is None:
                                continue
                            self.states[n], self.terminals[n], self.rewards[n] = observation
                        else:
                            # Terminal
                            self.states[n] = None
                            self.terminals[n] = self.prev_terminals[n]
                            self.rewards[n] = None
            else:
                # Vectorized environment execute
                if all(terminal >= -1 for terminal in self.prev_terminals):
                    parallel, states, terminals, rewards = self.environments[0].execute(
                        actions=np.asarray(self.actions)
                    )
                    i = 0
                    for n, terminal in enumerate(self.prev_terminals):
                        if terminal <= 0:
                            self.terminals[n] = terminals[i]
                            self.rewards[n] = rewards[i]
                            if terminals[i] > 0:
                                self.states[n] = None
                            i += 1
                        else:
                            self.states[n] = None
                            self.terminals[n] = self.prev_terminals[n]
                            self.rewards[n] = None
                    for i, n in enumerate(parallel):
                        assert self.terminals[n] <= 0 or self.terminals[n] == 2
                        self.states[n] = states[i]
                else:
                    for n, terminal in enumerate(self.prev_terminals):
                        if terminal < -1:
                            self.terminals[n] = -1
                        else:
                            self.terminals[n] = self.prev_terminals[n]
            self.handle_observe_joint()
            self.handle_act_joint()

        # Parallel environments loop
        no_environment_ready = True
        for n in range(self.num_environments):

            if self.prev_terminals[n] > 0:
                # Continue if episode terminated (either sync_episodes or finished)
                self.terminals[n] = self.prev_terminals[n]
                continue

            elif self.batch_agent_calls:
                # Handled before parallel environments loop
                pass

            elif self.sync_timesteps:
                # Wait until environment is ready
                while True:
                    observation = self.environments[n].receive_execute()
                    if observation is not None:
                        break

            else:
                # Check whether environment is ready, otherwise continue
                observation = self.environments[n].receive_execute()
                if observation is None:
                    self.terminals[n] = self.prev_terminals[n]
                    continue

            no_environment_ready = False
            if not self.batch_agent_calls:
                self.states[n], self.terminals[n], self.rewards[n] = observation

            # Check whether evaluation environment
            if self.evaluation_run and n == self.num_environments - 1:
                if self.terminals[n] == -1:
                    # Initial act
                    self.handle_act_evaluation()
                else:
                    # Observe
                    self.handle_observe_evaluation()
                    if self.terminals[n] == 0:
                        # Act
                        self.handle_act_evaluation()
                    else:
                        # Terminal
                        self.handle_terminal_evaluation()
            else:
                if self.terminals[n] == -1:
                    # Initial act
                    self.handle_act(parallel=n)
                else:
                    # Observe
                    self.handle_observe(parallel=n)
                    if self.terminals[n] == 0:
                        # Act
                        self.handle_act(parallel=n)
                    else:
                        # Terminal
                        self.handle_terminal(parallel=n)

        self.prev_terminals = list(self.terminals)

        # Sync_episodes: Reset if all episodes terminated
        if self.sync_episodes and all(terminal > 0 for terminal in self.terminals):
            num_episodes_left = self.num_episodes - self.episodes
            if self.num_vectorized is None:
                num_noneval_environments = self.num_environments - int(self.evaluation_run)
                for n in range(min(num_noneval_environments, num_episodes_left)):
                    self.prev_terminals[n] = -1
                    self.environments[n].start_reset()
                if self.evaluation_run and num_episodes_left > 0:
                    self.prev_terminals[-1] = -1
                    self.environments[-1].start_reset()
            elif num_episodes_left > 0:
                if self.environments[0].is_vectorizable():
                    parallel, states = self.environments[0].reset(
                        num_parallel=min(num_episodes_left, self.num_vectorized)
                    )
                else:
                    parallel, states = self.environments[0].reset()
                for i, n in enumerate(parallel):
                    self.states[n] = states[i]
                    self.prev_terminals[n] = -2
            else:
                self.prev_terminals = list()

        # Sleep if no environment was ready
        if no_environment_ready:
            time.sleep(self.num_sleep_secs)
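# Illustrative usage sketch (not part of the library source; agent/environment specifications
# are example values): a typical run() invocation with an episode limit and a custom callback.
def _example_runner_usage():
    from tensorforce.execution import Runner

    runner = Runner(
        agent=dict(agent='ppo', batch_size=10),
        environment=dict(environment='gym', level='CartPole-v1'),
        max_episode_timesteps=500,
    )

    # Returning False from the callback stops execution early, in addition to num_episodes
    def callback(runner, parallel):
        return runner.episodes < 100

    runner.run(num_episodes=200, callback=callback, mean_horizon=10)
    runner.close()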
def observe(self, reward=0.0, terminal=False, parallel=0):
    # Check whether inputs are batched
    if util.is_iterable(x=reward):
        reward = np.asarray(reward)
        num_parallel = reward.shape[0]
        if terminal is False:
            terminal = np.asarray([0 for _ in range(num_parallel)])
        else:
            terminal = np.asarray(terminal)
        if parallel == 0:
            assert num_parallel == self.parallel_interactions
            parallel = np.asarray(list(range(num_parallel)))
        else:
            parallel = np.asarray(parallel)

    elif util.is_iterable(x=terminal):
        terminal = np.asarray([int(t) for t in terminal])
        num_parallel = terminal.shape[0]
        if reward == 0.0:
            reward = np.asarray([0.0 for _ in range(num_parallel)])
        else:
            reward = np.asarray(reward)
        if parallel == 0:
            assert num_parallel == self.parallel_interactions
            parallel = np.asarray(list(range(num_parallel)))
        else:
            parallel = np.asarray(parallel)

    elif util.is_iterable(x=parallel):
        parallel = np.asarray(parallel)
        num_parallel = parallel.shape[0]
        if reward == 0.0:
            reward = np.asarray([0.0 for _ in range(num_parallel)])
        else:
            reward = np.asarray(reward)
        if terminal is False:
            terminal = np.asarray([0 for _ in range(num_parallel)])
        else:
            terminal = np.asarray(terminal)

    else:
        reward = np.asarray([float(reward)])
        terminal = np.asarray([int(terminal)])
        parallel = np.asarray([int(parallel)])
        num_parallel = 1

    # Check whether shapes/lengths are consistent
    if parallel.shape[0] == 0:
        raise TensorforceError.value(
            name='Agent.observe', argument='len(parallel)', value=parallel.shape[0], hint='= 0'
        )
    if reward.shape != parallel.shape:
        raise TensorforceError.value(
            name='Agent.observe', argument='len(reward)', value=reward.shape,
            hint='!= parallel length'
        )
    if terminal.shape != parallel.shape:
        raise TensorforceError.value(
            name='Agent.observe', argument='len(terminal)', value=terminal.shape,
            hint='!= parallel length'
        )

    # Convert terminal to int if necessary
    if terminal.dtype is util.np_dtype(dtype='bool'):
        zeros = np.zeros_like(terminal, dtype=util.np_dtype(dtype='int'))
        ones = np.ones_like(terminal, dtype=util.np_dtype(dtype='int'))
        terminal = np.where(terminal, ones, zeros)

    # Check whether current timesteps are not completed
    if self.timestep_completed[parallel].any():
        raise TensorforceError(message="Calling agent.observe must be preceded by agent.act.")
    self.timestep_completed[parallel] = True

    # Check whether episode is too long
    self.timestep_counter[parallel] += 1
    if self.max_episode_timesteps is not None and np.logical_and(
        terminal == 0, self.timestep_counter[parallel] > self.max_episode_timesteps
    ).any():
        raise TensorforceError(message="Episode longer than max_episode_timesteps.")
    self.timestep_counter[parallel] = np.where(
        terminal > 0, 0, self.timestep_counter[parallel]
    )

    if self.recorder is None:
        pass

    elif self.num_episodes < self.recorder.get('start', 0):
        # Increment num_episodes
        for t in terminal.tolist():
            if t > 0:
                self.num_episodes += 1

    else:
        # Store values per parallel interaction
        for p, t, r in zip(parallel.tolist(), terminal.tolist(), reward.tolist()):

            # Buffer inputs
            self.buffers['terminal'][p].append(t)
            self.buffers['reward'][p].append(r)

            # Continue if not terminal
            if t == 0:
                continue
            self.num_episodes += 1

            # Buffered terminal/reward inputs
            for name in self.states_spec:
                self.recorded['states'][name].append(
                    np.stack(self.buffers['states'][name][p], axis=0)
                )
                self.buffers['states'][name][p].clear()
            for name, spec in self.actions_spec.items():
                self.recorded['actions'][name].append(
                    np.stack(self.buffers['actions'][name][p], axis=0)
                )
                self.buffers['actions'][name][p].clear()
            self.recorded['terminal'].append(
                np.array(self.buffers['terminal'][p], dtype=self.terminal_spec.np_type())
            )
            self.buffers['terminal'][p].clear()
            self.recorded['reward'].append(
                np.array(self.buffers['reward'][p], dtype=self.reward_spec.np_type())
            )
            self.buffers['reward'][p].clear()

            # Check whether recording step
            if (self.num_episodes - self.recorder.get('start', 0)) \
                    % self.recorder.get('frequency', 1) != 0:
                continue

            # Manage recorder directory
            directory = self.recorder['directory']
            if os.path.isdir(directory):
                files = sorted(
                    f for f in os.listdir(directory)
                    if os.path.isfile(os.path.join(directory, f)) and
                    os.path.splitext(f)[1] == '.npz'
                )
            else:
                os.makedirs(directory)
                files = list()
            max_traces = self.recorder.get('max-traces')
            if max_traces is not None and len(files) > max_traces - 1:
                for filename in files[:-max_traces + 1]:
                    filename = os.path.join(directory, filename)
                    os.remove(filename)

            # Write recording file
            filename = os.path.join(
                directory, 'trace-{:09d}.npz'.format(self.num_episodes - 1)
            )  # time.strftime('%Y%m%d-%H%M%S')
            kwargs = self.recorded.fmap(function=np.concatenate, cls=ArrayDict).items()
            np.savez_compressed(file=filename, **dict(kwargs))

            # Clear recorded values
            for recorded in self.recorded.values():
                recorded.clear()

    if self._is_agent:
        return reward, terminal, parallel
    else:
        return 0
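# Illustrative sketch (not part of the library source) of the batched observe() inputs handled
# above: with parallel_interactions > 1, reward/terminal/parallel may be passed as equal-length
# sequences instead of scalars; the concrete values are examples.
def _example_batched_observe(agent):
    # Three parallel interactions observed in one call; agent.act(...) with the matching
    # parallel indices must have been called beforehand
    agent.observe(
        reward=[1.0, 0.5, 0.0],
        terminal=[False, False, True],
        parallel=[0, 1, 2],
    )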