def __init__(
    self, name, tensors, aggregation='concat', axis=0, input_spec=None, summary_labels=None
):
    if not isinstance(tensors, str) and not util.is_iterable(x=tensors):
        raise TensorforceError.type(name='retrieve', argument='tensors', dtype=type(tensors))
    elif util.is_iterable(x=tensors) and len(tensors) == 0:
        raise TensorforceError.value(
            name='retrieve', argument='tensors', value=tensors, hint='zero length'
        )
    if aggregation not in ('concat', 'product', 'stack', 'sum'):
        raise TensorforceError.value(
            name='retrieve', argument='aggregation', value=aggregation,
            hint='not in {concat,product,stack,sum}'
        )

    self.tensors = (tensors,) if isinstance(tensors, str) else tuple(tensors)
    self.aggregation = aggregation
    self.axis = axis

    super().__init__(
        name=name, input_spec=input_spec, summary_labels=summary_labels, l2_regularization=0.0
    )

    # Retrieve has no single predecessor layer, so the inherited input spec does not apply
    self.input_spec = None
def __init__(
    self, *, size, window=3, stride=1, padding='same', dilation=1, bias=True,
    activation='relu', dropout=0.0, initialization_scale=1.0, vars_trainable=True,
    l2_regularization=None, name=None, input_spec=None
):
    if isinstance(window, int):
        self.window = (window, window)
    elif util.is_iterable(x=window) and len(window) == 2:
        self.window = tuple(window)
    else:
        raise TensorforceError.type(name='Conv2d', argument='window', dtype=type(window))

    if isinstance(stride, int):
        self.stride = (1, stride, stride, 1)
    elif util.is_iterable(x=stride) and len(stride) == 2:
        self.stride = (1, stride[0], stride[1], 1)
    else:
        raise TensorforceError.type(name='Conv2d', argument='stride', dtype=type(stride))

    self.padding = padding

    if isinstance(dilation, int):
        self.dilation = (1, dilation, dilation, 1)
    elif util.is_iterable(x=dilation) and len(dilation) == 2:
        self.dilation = (1, dilation[0], dilation[1], 1)
    else:
        raise TensorforceError.type(name='Conv2d', argument='dilation', dtype=type(dilation))

    super().__init__(
        name=name, size=size, bias=bias, activation=activation, dropout=dropout,
        vars_trainable=vars_trainable, input_spec=input_spec,
        l2_regularization=l2_regularization
    )

    self.initialization_scale = initialization_scale
def __init__(
    self, *, size, window=3, stride=1, padding='same', dilation=1, bias=True,
    activation='relu', dropout=0.0, initialization_scale=1.0, vars_trainable=True,
    l2_regularization=None, name=None, input_spec=None
):
    if isinstance(window, int):
        self.window = (window, window)
    elif util.is_iterable(x=window) and len(window) == 2:
        self.window = tuple(window)
    else:
        raise TensorforceError.type(name='Conv2d', argument='window', dtype=type(window))

    if isinstance(stride, int):
        self.stride = (1, stride, stride, 1)
    elif util.is_iterable(x=stride) and len(stride) == 2:
        self.stride = (1, stride[0], stride[1], 1)
    else:
        raise TensorforceError.type(name='Conv2d', argument='stride', dtype=type(stride))

    self.padding = padding

    if isinstance(dilation, int):
        self.dilation = (1, dilation, dilation, 1)
    elif util.is_iterable(x=dilation) and len(dilation) == 2:
        self.dilation = (1, dilation[0], dilation[1], 1)
    else:
        raise TensorforceError.type(name='Conv2d', argument='dilation', dtype=type(dilation))

    super().__init__(
        name=name, size=size, bias=bias, activation=activation, dropout=dropout,
        vars_trainable=vars_trainable, input_spec=input_spec,
        l2_regularization=l2_regularization
    )

    self.initialization_scale = initialization_scale

    self.architecture_kwargs['size'] = str(size)
    self.architecture_kwargs['window'] = str(window)
    self.architecture_kwargs['padding'] = str(padding)
    if stride != 1:
        self.architecture_kwargs['stride'] = str(stride)
    if dilation != 1:
        self.architecture_kwargs['dilation'] = str(dilation)
    self.architecture_kwargs['bias'] = str(bias)
    if activation is not None:
        self.architecture_kwargs['activation'] = str(activation)
    if dropout != 0.0:
        self.architecture_kwargs['dropout'] = str(dropout)
    if initialization_scale != 1.0:
        self.architecture_kwargs['initialization_scale'] = str(initialization_scale)
    if not vars_trainable:
        self.architecture_kwargs['trainable'] = str(vars_trainable)
    if l2_regularization is not None:
        self.architecture_kwargs['l2_regularization'] = str(l2_regularization)
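
# Illustrative stand-alone sketch (not part of the library API): how Conv2d
# normalizes its stride/dilation arguments to the NHWC 4-tuples TensorFlow
# expects. An int is broadcast to both spatial dimensions; a length-2 iterable
# is used as given.
def normalize_nhwc(value):
    if isinstance(value, int):
        return (1, value, value, 1)
    value = tuple(value)
    assert len(value) == 2
    return (1, value[0], value[1], 1)

assert normalize_nhwc(2) == (1, 2, 2, 1)
assert normalize_nhwc((2, 3)) == (1, 2, 3, 1)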
def __init__(self, agent, environments):
    if not util.is_iterable(x=environments):
        raise TensorforceError.type(
            name='parallel-runner', argument='environments', value=environments
        )
    elif len(environments) == 0:
        raise TensorforceError.value(
            name='parallel-runner', argument='environments', value=environments
        )

    if not isinstance(agent, Agent):
        agent = Agent.from_spec(
            spec=agent, states=environments[0].states(), actions=environments[0].actions(),
            parallel_interactions=len(environments)
        )
    if len(environments) > agent.parallel_interactions:
        raise TensorforceError(message="Too many environments.")

    self.agent = agent
    self.environments = tuple(environments)

    self.agent.initialize()
    self.global_episode = self.agent.episode
    self.global_timestep = self.agent.timestep
    self.episode_rewards = list()
    self.episode_timesteps = list()
    self.episode_times = list()
def is_summary_logged(self, label):
    # Check whether any summaries are logged
    if self.summary_labels is None:
        return False

    # Check whether not in while loop
    if Module.while_counter > 0:
        return False
    # Check whether not in nested condition
    if Module.cond_counter > 1:
        return False

    # Temporary
    if label == 'variables' or label == 'variables-histogram':
        return False

    # Check whether given label is logged
    if util.is_iterable(x=label):
        assert all(not x.endswith('-histogram') for x in label)
        if self.summary_labels != 'all' and all(x not in self.summary_labels for x in label):
            return False
    else:
        if (self.summary_labels != 'all' or label.endswith('-histogram')) and \
                label not in self.summary_labels:
            return False

    return True
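
# Simplified stand-alone sketch of the label condition above (assumption:
# summary_labels is None, the string 'all', or a set of label strings).
# Histogram labels are only logged when listed explicitly, even under 'all'.
def label_is_logged(summary_labels, label):
    if summary_labels is None:
        return False
    if summary_labels == 'all':
        return not label.endswith('-histogram')
    return label in summary_labels

assert label_is_logged('all', 'losses')
assert not label_is_logged('all', 'weights-histogram')
assert label_is_logged({'weights-histogram'}, 'weights-histogram')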
def __init__(self, *, tensors, l2_regularization=None, name=None, input_spec=None):
    super(Layer, self).__init__(l2_regularization=l2_regularization, name=name)

    Layer._REGISTERED_LAYERS[self.name] = self

    if isinstance(tensors, str):
        pass
    elif not util.is_iterable(x=tensors):
        raise TensorforceError.type(
            name='MultiInputLayer', argument='tensors', dtype=type(tensors)
        )
    elif len(tensors) == 0:
        raise TensorforceError.value(
            name='MultiInputLayer', argument='tensors', value=tensors, hint='zero length'
        )

    if isinstance(tensors, str):
        self.tensors = (tensors,)
    else:
        self.tensors = tuple(tensors)

    self.input_spec = self.default_input_spec()
    if not isinstance(self.input_spec, TensorsSpec):
        raise TensorforceError.unexpected()
    self.input_spec = self.input_spec.unify(other=input_spec)
def fn(query=None, **kwargs):
    # Feed_dict dictionary
    feed_dict = dict()
    for key, arg in kwargs.items():
        if arg is None:
            continue
        elif isinstance(arg, dict):
            # Support single nesting (for states, internals, actions)
            # (renamed inner loop variables to avoid shadowing key/arg)
            for inner_key, inner_arg in arg.items():
                feed_dict[util.join_scopes(self.name, inner_key) + '-input:0'] = inner_arg
        else:
            feed_dict[util.join_scopes(self.name, key) + '-input:0'] = arg
    if not all(isinstance(x, str) and x.endswith('-input:0') for x in feed_dict):
        raise TensorforceError.unexpected()

    # Fetches value/tuple
    fetches = util.fmap(function=(lambda x: x.name), xs=results)
    if query is not None:
        # If additional tensors are to be fetched
        query = util.fmap(
            function=(lambda x: util.join_scopes(name, x) + '-output:0'), xs=query
        )
        if util.is_iterable(x=fetches):
            fetches = tuple(fetches) + (query,)
        else:
            fetches = (fetches, query)
    if not util.reduce_all(
        predicate=(lambda x: isinstance(x, str) and x.endswith('-output:0')), xs=fetches
    ):
        raise TensorforceError.unexpected()

    # TensorFlow session call
    fetched = self.monitored_session.run(fetches=fetches, feed_dict=feed_dict)

    return fetched
def __init__(self, agent, environments, evaluation_environment=None, save_best_agent=False):
    # save_best overwrites saver...
    if not util.is_iterable(x=environments):
        raise TensorforceError.type(
            name='parallel-runner', argument='environments', value=environments
        )
    elif len(environments) == 0:
        raise TensorforceError.value(
            name='parallel-runner', argument='environments', value=environments
        )

    self.is_environment_external = tuple(
        isinstance(environment, Environment) for environment in environments
    )
    self.environments = tuple(
        Environment.create(environment=environment) for environment in environments
    )

    self.is_eval_environment_external = isinstance(evaluation_environment, Environment)
    if evaluation_environment is None:
        self.evaluation_environment = None
    else:
        self.evaluation_environment = Environment.create(environment=evaluation_environment)

    self.save_best_agent = save_best_agent
    self.is_agent_external = isinstance(agent, Agent)
    kwargs = dict(parallel_interactions=len(environments))
    # warning: save_best_agent
    if not self.is_agent_external and self.save_best_agent:
        # Disable periodic saving (update kwargs rather than replacing it, so that
        # parallel_interactions is preserved)
        kwargs['saver'] = dict(seconds=None, steps=None)
    self.agent = Agent.create(agent=agent, environment=self.environments[0], **kwargs)
    if not self.agent.model.is_initialized:
        self.agent.initialize()

    # self.global_episodes = self.agent.episodes
    # self.global_timesteps = self.agent.timesteps
    # self.global_updates = self.agent.updates
    self.episode_rewards = list()
    self.episode_timesteps = list()
    self.episode_seconds = list()
    self.episode_agent_seconds = list()
    self.evaluation_rewards = list()
    self.evaluation_timesteps = list()
    self.evaluation_seconds = list()
    self.evaluation_agent_seconds = list()
def __init__(self, name, tensors, aggregation='concat', axis=0, input_spec=None):
    """
    Retrieve constructor.

    Args:
        tensors (iter[string]): Global names of tensors to retrieve.
        aggregation ('concat' | 'product' | 'stack' | 'sum'): Aggregation type.
        axis (int >= 0): Aggregation axis (excluding batch axis).
    """
    if not isinstance(tensors, str) and not util.is_iterable(x=tensors):
        raise TensorforceError.type(name='retrieve', argument='tensors', value=tensors)
    elif util.is_iterable(x=tensors) and len(tensors) == 0:
        raise TensorforceError.value(name='retrieve', argument='tensors', value=tensors)
    if aggregation not in ('concat', 'product', 'stack', 'sum'):
        raise TensorforceError.value(name='retrieve', argument='aggregation', value=aggregation)

    self.tensors = (tensors,) if isinstance(tensors, str) else tuple(tensors)
    self.aggregation = aggregation
    self.axis = axis

    super().__init__(name=name, input_spec=input_spec, l2_regularization=0.0)

    self.input_spec = None
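
# Illustrative NumPy sketch of the four aggregation modes (inside the layer,
# the axis argument is offset to skip the batch axis; that detail is ignored
# here, with axis=1 standing in for the first non-batch axis):
import numpy as np

a, b = np.ones((4, 3)), np.full((4, 3), 2.0)
assert np.concatenate([a, b], axis=1).shape == (4, 6)  # 'concat'
assert np.stack([a, b], axis=1).shape == (4, 2, 3)     # 'stack'
assert (a * b).shape == (4, 3)                         # 'product'
assert (a + b).shape == (4, 3)                         # 'sum'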
def add_placeholder(self, name, dtype, shape, batched, default=None):
    # name
    name = name + '-input'
    if not util.is_valid_name(name=name):
        raise TensorforceError.value(name='placeholder', argument='name', value=name)
    # dtype
    if not util.is_valid_type(dtype=dtype):
        raise TensorforceError.value(name='placeholder', argument='dtype', value=dtype)
    # shape
    if not util.is_iterable(x=shape) or \
            not all(isinstance(num_dims, int) for num_dims in shape):
        raise TensorforceError.type(name='placeholder', argument='shape', value=shape)
    elif not all(num_dims > 0 for num_dims in shape):
        raise TensorforceError.value(name='placeholder', argument='shape', value=shape)
    # batched
    if not isinstance(batched, bool):
        raise TensorforceError.type(name='placeholder', argument='batched', value=batched)
    # default
    if default is not None:
        if batched:
            raise TensorforceError.unexpected()
        elif not isinstance(default, tf.Tensor):
            raise TensorforceError.unexpected()
        elif util.dtype(x=default) != dtype:
            raise TensorforceError.unexpected()

    # Placeholder
    if batched:
        shape = (None,) + shape
    if default is None:
        dtype = util.tf_dtype(dtype=dtype)
        placeholder = tf.placeholder(dtype=dtype, shape=shape, name=name)
    else:
        # check dtype and shape !!!
        placeholder = tf.placeholder_with_default(input=default, shape=shape, name=name)

    return placeholder
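
# Minimal usage sketch, assuming TensorFlow 1.x graph mode (where
# tf.placeholder exists); the '-input:0' key matches the feed_dict naming
# convention used elsewhere in this module:
import tensorflow as tf  # TF 1.x

states = tf.placeholder(dtype=tf.float32, shape=(None, 4), name='states-input')
with tf.Session() as session:
    out = session.run(
        fetches=states * 2.0, feed_dict={'states-input:0': [[1.0, 2.0, 3.0, 4.0]]}
    )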
def observe(self, reward=0.0, terminal=False, parallel=0):
    # Check whether inputs are batched
    if util.is_iterable(x=reward) or (isinstance(reward, np.ndarray) and reward.ndim > 0):
        reward = np.asarray(reward)
        num_parallel = reward.shape[0]
        if not isinstance(terminal, np.ndarray) and terminal is False:
            terminal = np.asarray([0 for _ in range(num_parallel)])
        else:
            terminal = np.asarray(terminal)
        if not isinstance(parallel, np.ndarray) and parallel == 0:
            assert num_parallel == self.parallel_interactions
            parallel = np.asarray(list(range(num_parallel)))
        else:
            parallel = np.asarray(parallel)

    elif util.is_iterable(x=terminal) or \
            (isinstance(terminal, np.ndarray) and terminal.ndim > 0):
        terminal = np.asarray(terminal, dtype=util.np_dtype(dtype='int'))
        num_parallel = terminal.shape[0]
        if not isinstance(reward, np.ndarray) and reward == 0.0:
            reward = np.asarray([0.0 for _ in range(num_parallel)])
        else:
            reward = np.asarray(reward)
        if not isinstance(parallel, np.ndarray) and parallel == 0:
            assert num_parallel == self.parallel_interactions
            parallel = np.asarray(list(range(num_parallel)))
        else:
            parallel = np.asarray(parallel)

    elif util.is_iterable(x=parallel) or \
            (isinstance(parallel, np.ndarray) and parallel.ndim > 0):
        parallel = np.asarray(parallel)
        num_parallel = parallel.shape[0]
        if not isinstance(reward, np.ndarray) and reward == 0.0:
            reward = np.asarray([0.0 for _ in range(num_parallel)])
        else:
            reward = np.asarray(reward)
        if not isinstance(terminal, np.ndarray) and terminal is False:
            terminal = np.asarray([0 for _ in range(num_parallel)])
        else:
            terminal = np.asarray(terminal)

    else:
        reward = np.asarray([float(reward)])
        terminal = np.asarray([int(terminal)])
        parallel = np.asarray([int(parallel)])
        num_parallel = 1

    # Check whether shapes/lengths are consistent
    if parallel.shape[0] == 0:
        raise TensorforceError.value(
            name='Agent.observe', argument='len(parallel)', value=parallel.shape[0], hint='= 0'
        )
    if reward.shape != parallel.shape:
        raise TensorforceError.value(
            name='Agent.observe', argument='len(reward)', value=reward.shape,
            hint='!= parallel length'
        )
    if terminal.shape != parallel.shape:
        raise TensorforceError.value(
            name='Agent.observe', argument='len(terminal)', value=terminal.shape,
            hint='!= parallel length'
        )

    # Convert terminal to int if necessary
    if terminal.dtype is util.np_dtype(dtype='bool'):
        zeros = np.zeros_like(terminal, dtype=util.np_dtype(dtype='int'))
        ones = np.ones_like(terminal, dtype=util.np_dtype(dtype='int'))
        terminal = np.where(terminal, ones, zeros)

    # Check whether current timesteps are not completed
    if self.timestep_completed[parallel].any():
        raise TensorforceError(message="Calling agent.observe must be preceded by agent.act.")
    self.timestep_completed[parallel] = True

    # Check whether episode is too long
    self.timestep_counter[parallel] += 1
    if self.max_episode_timesteps is not None and np.logical_and(
        terminal == 0, self.timestep_counter[parallel] > self.max_episode_timesteps
    ).any():
        raise TensorforceError(message="Episode longer than max_episode_timesteps.")
    self.timestep_counter[parallel] = np.where(
        terminal > 0, 0, self.timestep_counter[parallel]
    )

    if self.recorder is None:
        pass

    elif self.num_episodes < self.recorder.get('start', 0):
        # Increment num_episodes
        for t in terminal.tolist():
            if t > 0:
                self.num_episodes += 1

    else:
        # Store values per parallel interaction
        for p, t, r in zip(parallel.tolist(), terminal.tolist(), reward.tolist()):
            # Buffer inputs
            self.buffers['terminal'][p].append(t)
            self.buffers['reward'][p].append(r)

            # Continue if not terminal
            if t == 0:
                continue
            self.num_episodes += 1

            # Buffered terminal/reward inputs
            for name in self.states_spec:
                self.recorded['states'][name].append(
                    np.stack(self.buffers['states'][name][p], axis=0)
                )
                self.buffers['states'][name][p].clear()
            for name, spec in self.actions_spec.items():
                self.recorded['actions'][name].append(
                    np.stack(self.buffers['actions'][name][p], axis=0)
                )
                self.buffers['actions'][name][p].clear()
            self.recorded['terminal'].append(
                np.array(self.buffers['terminal'][p], dtype=self.terminal_spec.np_type())
            )
            self.buffers['terminal'][p].clear()
            self.recorded['reward'].append(
                np.array(self.buffers['reward'][p], dtype=self.reward_spec.np_type())
            )
            self.buffers['reward'][p].clear()

            # Check whether recording step
            if (self.num_episodes - self.recorder.get('start', 0)) \
                    % self.recorder.get('frequency', 1) != 0:
                continue

            # Manage recorder directory
            directory = self.recorder['directory']
            if os.path.isdir(directory):
                files = sorted(
                    f for f in os.listdir(directory)
                    if os.path.isfile(os.path.join(directory, f))
                    and os.path.splitext(f)[1] == '.npz'
                )
            else:
                os.makedirs(directory)
                files = list()
            max_traces = self.recorder.get('max-traces')
            if max_traces is not None and len(files) > max_traces - 1:
                for filename in files[:-max_traces + 1]:
                    filename = os.path.join(directory, filename)
                    os.remove(filename)

            # Write recording file
            filename = os.path.join(
                directory, 'trace-{:09d}.npz'.format(self.num_episodes - 1)
            )  # time.strftime('%Y%m%d-%H%M%S')
            kwargs = self.recorded.fmap(function=np.concatenate, cls=ArrayDict).items()
            np.savez_compressed(file=filename, **dict(kwargs))

            # Clear recorded values
            for recorded in self.recorded.values():
                recorded.clear()

    if self._is_agent:
        return reward, terminal, parallel
    else:
        return 0
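
# Sketch of reading a recorded trace back (assumption: the default recorder
# layout written above, one compressed .npz file per recording step; the exact
# key names depend on the state/action specs):
import numpy as np

with np.load('trace-000000000.npz') as trace:
    print(sorted(trace.files))   # e.g. ['actions', 'reward', 'states', 'terminal']
    print(trace['reward'].shape)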
def add_variable(
    self, name, dtype, shape, is_trainable, initializer='zeros', summarize=None, shared=None
):
    # name
    if not util.is_valid_name(name=name):
        raise TensorforceError.value(name='variable', argument='name', value=name)
    elif name in self.variables:
        raise TensorforceError.exists(name='variable', value=name)
    # dtype
    if not util.is_valid_type(dtype=dtype):
        raise TensorforceError.value(name='variable', argument='dtype', value=dtype)
    # shape
    if not util.is_iterable(x=shape) or \
            not all(isinstance(num_dims, int) for num_dims in shape):
        raise TensorforceError.type(name='variable', argument='shape', value=shape)
    elif not all(num_dims > 0 for num_dims in shape):
        raise TensorforceError.value(name='variable', argument='shape', value=shape)
    # is_trainable
    if not isinstance(is_trainable, bool):
        raise TensorforceError.type(name='variable', argument='is_trainable', value=is_trainable)
    # initializer
    if not isinstance(initializer, (util.py_dtype(dtype=dtype), np.ndarray, tf.Tensor)) and \
            initializer not in ('random', 'zeros', 'ones'):
        raise TensorforceError.value(name='variable', argument='initializer', value=initializer)
    elif isinstance(initializer, np.ndarray) and \
            initializer.dtype != util.np_dtype(dtype=dtype):
        raise TensorforceError.type(name='variable', argument='initializer', value=initializer)
    elif isinstance(initializer, tf.Tensor) and util.dtype(x=initializer) != dtype:
        raise TensorforceError.type(name='variable', argument='initializer', value=initializer)
    elif isinstance(initializer, str) and initializer == 'random' and dtype != 'float':
        raise TensorforceError(
            message="Invalid variable initializer value for non-float variable: {}.".format(
                initializer
            )
        )
    # summarize
    if summarize is not None and not isinstance(summarize, bool):
        raise TensorforceError.type(name='variable', argument='summarize', value=summarize)
    # shared
    if shared is not None and not isinstance(shared, str):
        raise TensorforceError.type(name='variable', argument='shared', value=shared)

    variable = None

    if shared is not None and len(tf.get_collection(key=shared)) > 0:
        # Retrieve shared variable from TensorFlow
        collection = tf.get_collection(key=shared)
        if len(collection) > 1:
            raise TensorforceError.unexpected()
        variable = collection[0]

    else:
        tf_dtype = util.tf_dtype(dtype=dtype)

        # Variable initializer
        if isinstance(initializer, util.py_dtype(dtype=dtype)):
            initializer = tf.constant(value=initializer, dtype=tf_dtype, shape=shape)
        elif isinstance(initializer, np.ndarray):
            if initializer.shape != shape:
                raise TensorforceError(
                    "Invalid variable initializer shape: {}.".format(initializer.shape)
                )
        elif isinstance(initializer, tf.Tensor):
            if util.shape(x=initializer) != shape:
                raise TensorforceError(
                    "Invalid variable initializer shape: {}.".format(util.shape(x=initializer))
                )
        elif not isinstance(initializer, str):
            raise TensorforceError("Invalid variable initializer: {}".format(initializer))
        elif initializer == 'random':
            stddev = min(0.1, sqrt(2.0 / (util.product(xs=shape[:-1]) + shape[-1])))
            initializer = tf.random_normal(
                shape=shape, mean=0.0, stddev=stddev, dtype=util.tf_dtype(dtype=dtype)
            )
        elif initializer == 'zeros':
            initializer = tf.zeros(shape=shape, dtype=tf_dtype)
        elif initializer == 'ones':
            initializer = tf.ones(shape=shape, dtype=tf_dtype)

        # Variable
        variable = tf.Variable(
            initial_value=initializer, trainable=is_trainable, validate_shape=True,
            name=name, dtype=tf_dtype, expected_shape=shape
        )  # collections=

        # Register shared variable with TensorFlow
        if shared is not None:
            tf.add_to_collection(name=shared, value=variable)

    # Register variable
    self.variables[name] = variable
    if is_trainable:
        self.trainable_variables[name] = variable

    # Add summary
    if (summarize is None and is_trainable) or summarize:
        variable = tf.identity(input=variable)
        variable = self.add_summary(
            label='variables', name=name, tensor=variable, mean_variance=True
        )

    return variable
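
# Hypothetical usage sketch, called from within a Module subclass method
# (TF 1.x era API; the name 'kernel' and the shape are illustrative only):
kernel = self.add_variable(
    name='kernel', dtype='float', shape=(4, 16), is_trainable=True, initializer='random'
)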
def run(
    self,
    # General
    num_episodes=None, num_timesteps=None, num_updates=None, num_repeat_actions=1,
    # Callback
    callback=None, callback_episode_frequency=None, callback_timestep_frequency=None,
    # Tqdm
    use_tqdm=True, mean_horizon=1,
    # Evaluation
    evaluation=False, evaluation_callback=None, evaluation_frequency=None,
    num_evaluation_iterations=1
):
    # General
    if num_episodes is None:
        self.num_episodes = float('inf')
    else:
        self.num_episodes = num_episodes
    if num_timesteps is None:
        self.num_timesteps = float('inf')
    else:
        self.num_timesteps = num_timesteps
    if num_updates is None:
        self.num_updates = float('inf')
    else:
        self.num_updates = num_updates
    self.num_repeat_actions = num_repeat_actions

    # Callback
    assert callback_episode_frequency is None or callback_timestep_frequency is None
    if callback_episode_frequency is None and callback_timestep_frequency is None:
        callback_episode_frequency = 1
    if callback_episode_frequency is None:
        self.callback_episode_frequency = float('inf')
    else:
        self.callback_episode_frequency = callback_episode_frequency
    if callback_timestep_frequency is None:
        self.callback_timestep_frequency = float('inf')
    else:
        self.callback_timestep_frequency = callback_timestep_frequency
    if callback is None:
        self.callback = (lambda r: True)
    elif util.is_iterable(x=callback):
        def sequential_callback(runner):
            result = True
            for fn in callback:
                x = fn(runner)
                if isinstance(result, bool):
                    result = result and x
            return result
        self.callback = sequential_callback
    else:
        def boolean_callback(runner):
            result = callback(runner)
            if isinstance(result, bool):
                return result
            else:
                return True
        self.callback = boolean_callback

    # Timestep/episode/update counter
    self.timesteps = 0
    self.episodes = 0
    self.updates = 0

    # Tqdm
    if use_tqdm:
        if hasattr(self, 'tqdm'):
            self.tqdm.close()

        assert self.num_episodes != float('inf') or self.num_timesteps != float('inf')
        inner_callback = self.callback

        if self.num_episodes != float('inf'):
            # Episode-based tqdm (default option if both num_episodes and num_timesteps set)
            assert self.num_episodes != float('inf')
            bar_format = (
                '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}, reward={postfix[0]:.2f}, ts/ep='
                '{postfix[1]}, sec/ep={postfix[2]:.2f}, ms/ts={postfix[3]:.1f}, agent='
                '{postfix[4]:.1f}%]'
            )
            postfix = [0.0, 0, 0.0, 0.0, 0.0]
            self.tqdm = tqdm(
                desc='Episodes', total=self.num_episodes, bar_format=bar_format,
                initial=self.episodes, postfix=postfix
            )
            self.tqdm_last_update = self.episodes

            def tqdm_callback(runner):
                mean_reward = float(np.mean(runner.episode_rewards[-mean_horizon:]))
                mean_ts_per_ep = int(np.mean(runner.episode_timesteps[-mean_horizon:]))
                mean_sec_per_ep = float(np.mean(runner.episode_seconds[-mean_horizon:]))
                mean_agent_sec = float(np.mean(runner.episode_agent_seconds[-mean_horizon:]))
                mean_ms_per_ts = mean_sec_per_ep * 1000.0 / mean_ts_per_ep
                mean_rel_agent = mean_agent_sec * 100.0 / mean_sec_per_ep
                runner.tqdm.postfix[0] = mean_reward
                runner.tqdm.postfix[1] = mean_ts_per_ep
                runner.tqdm.postfix[2] = mean_sec_per_ep
                runner.tqdm.postfix[3] = mean_ms_per_ts
                runner.tqdm.postfix[4] = mean_rel_agent
                runner.tqdm.update(n=(runner.episodes - runner.tqdm_last_update))
                runner.tqdm_last_update = runner.episodes
                return inner_callback(runner)

        else:
            # Timestep-based tqdm
            assert self.num_timesteps != float('inf')
            self.tqdm = tqdm(
                desc='Timesteps', total=self.num_timesteps, initial=self.timesteps,
                postfix=dict(mean_reward='n/a')
            )
            self.tqdm_last_update = self.timesteps

            def tqdm_callback(runner):
                # sum_timesteps_reward = sum(runner.timestep_rewards[num_mean_reward:])
                # num_timesteps = min(num_mean_reward, runner.episode_timestep)
                # mean_reward = sum_timesteps_reward / num_episodes
                runner.tqdm.set_postfix(mean_reward='n/a')
                runner.tqdm.update(n=(runner.timesteps - runner.tqdm_last_update))
                runner.tqdm_last_update = runner.timesteps
                return inner_callback(runner)

        self.callback = tqdm_callback

    # Evaluation
    self.evaluation = evaluation
    if evaluation_callback is None:
        self.evaluation_callback = (lambda r: None)
    else:
        assert not self.evaluation
        self.evaluation_callback = evaluation_callback
    if self.evaluation:
        assert evaluation_frequency is None
    self.evaluation_frequency = evaluation_frequency
    self.num_evaluation_iterations = num_evaluation_iterations
    if self.save_best_agent is not None:
        assert not self.evaluation
        inner_evaluation_callback = self.evaluation_callback

        def mean_reward_callback(runner):
            result = inner_evaluation_callback(runner)
            if result is None:
                return float(np.mean(runner.evaluation_rewards))
            else:
                return result

        self.evaluation_callback = mean_reward_callback
        self.best_evaluation_score = None

    # Required if agent was previously stopped mid-episode
    self.agent.reset()

    # Episode loop
    while True:
        # Run episode
        if not self.run_episode(environment=self.environment, evaluation=self.evaluation):
            return

        # Increment episode counter (after calling callback)
        self.episodes += 1

        # Update experiment statistics
        self.episode_rewards.append(self.episode_reward)
        self.episode_timesteps.append(self.episode_timestep)
        self.episode_seconds.append(self.episode_second)
        self.episode_agent_seconds.append(self.episode_agent_second)

        # Run evaluation
        if self.evaluation_frequency is None:
            is_evaluation = False
        elif self.evaluation_frequency == 'update':
            is_evaluation = self.episode_updated
        else:
            is_evaluation = (self.episodes % self.evaluation_frequency == 0)
        if is_evaluation:
            if self.evaluation_environment is None:
                environment = self.environment
            else:
                environment = self.evaluation_environment

            self.evaluation_rewards = list()
            self.evaluation_timesteps = list()
            self.evaluation_seconds = list()
            self.evaluation_agent_seconds = list()

            # Evaluation loop
            for _ in range(self.num_evaluation_iterations):
                self.run_episode(environment=environment, evaluation=True)
                self.evaluation_rewards.append(self.episode_reward)
                self.evaluation_timesteps.append(self.episode_timestep)
                self.evaluation_seconds.append(self.episode_second)
                self.evaluation_agent_seconds.append(self.episode_agent_second)

            # Evaluation callback
            if self.save_best_agent is not None:
                evaluation_score = self.evaluation_callback(self)
                assert isinstance(evaluation_score, float)
                if self.best_evaluation_score is None:
                    self.best_evaluation_score = evaluation_score
                elif evaluation_score > self.best_evaluation_score:
                    self.best_evaluation_score = evaluation_score
                    self.agent.save(
                        directory=self.save_best_agent, filename='best-model',
                        append_timestep=False
                    )
            else:
                self.evaluation_callback(self)

        # # Update global timestep/episode/update
        # self.global_timesteps = self.agent.timesteps
        # self.global_episodes = self.agent.episodes
        # self.global_updates = self.agent.updates

        # Callback
        if self.episodes % self.callback_episode_frequency == 0 and not self.callback(self):
            return

        # Terminate experiment if too long
        if self.timesteps >= self.num_timesteps:
            return
        # elif self.evaluation and self.timesteps >= self.num_timesteps:
        #     return
        elif self.episodes >= self.num_episodes:
            return
        # elif self.evaluation and self.episodes >= self.num_episodes:
        #     return
        elif self.updates >= self.num_updates:
            return
        # elif self.evaluation and self.updates >= self.num_updates:
        #     return
        elif self.agent.should_stop():
            return
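
# Hedged usage sketch for this run() variant (runner construction elided;
# argument values are illustrative):
runner.run(num_episodes=200, mean_horizon=10)  # training run
runner.run(num_episodes=20, evaluation=True)   # pure evaluation run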
def add_summary(
    self, label, name, tensor, pass_tensors=None, return_summaries=False,
    mean_variance=False, enumerate_last_rank=False
):
    # should be "labels" !!!
    # label
    if util.is_iterable(x=label):
        if not all(isinstance(x, str) for x in label):
            raise TensorforceError.value(
                name='Module.add_summary', argument='label', value=label
            )
    else:
        if not isinstance(label, str):
            raise TensorforceError.type(
                name='Module.add_summary', argument='label', dtype=type(label)
            )
    # name
    if not isinstance(name, str):
        raise TensorforceError.type(
            name='Module.add_summary', argument='name', dtype=type(name)
        )
    # tensor
    if not isinstance(tensor, (tf.Tensor, tf.Variable)):
        raise TensorforceError.type(
            name='Module.add_summary', argument='tensor', dtype=type(tensor)
        )
    # pass_tensors
    if util.is_iterable(x=pass_tensors):
        if not all(isinstance(x, (tf.Tensor, tf.IndexedSlices)) for x in pass_tensors):
            raise TensorforceError.value(
                name='Module.add_summary', argument='pass_tensors', value=pass_tensors
            )
    elif pass_tensors is not None:
        if not isinstance(pass_tensors, tf.Tensor):
            raise TensorforceError.type(
                name='Module.add_summary', argument='pass_tensors', dtype=type(pass_tensors)
            )
    # enumerate_last_rank
    if not isinstance(enumerate_last_rank, bool):
        raise TensorforceError.type(
            name='Module.add_summary', argument='enumerate_last_rank',
            dtype=type(enumerate_last_rank)
        )

    if pass_tensors is None:
        pass_tensors = tensor

    # Check whether summary is logged
    if not self.is_summary_logged(label=label):
        return pass_tensors

    # Add to available summaries
    if util.is_iterable(x=label):
        self.available_summaries.update(label)
    else:
        self.available_summaries.add(label)

    # Handle enumerate_last_rank
    if enumerate_last_rank:
        dims = util.shape(x=tensor)[-1]
        tensors = OrderedDict([(name + str(n), tensor[..., n]) for n in range(dims)])
    else:
        tensors = OrderedDict([(name, tensor)])

    if mean_variance:
        for name in list(tensors):
            tensor = tensors.pop(name)
            mean, variance = tf.nn.moments(x=tensor, axes=tuple(range(util.rank(x=tensor))))
            tensors[name + '-mean'] = mean
            tensors[name + '-variance'] = variance

    # Scope handling
    if Module.scope_stack is not None:
        for scope in reversed(Module.scope_stack[1:]):
            scope.__exit__(None, None, None)
        if len(Module.global_scope) > 0:
            temp_scope = tf.name_scope(name='/'.join(Module.global_scope))
            temp_scope.__enter__()
        tensors = util.fmap(function=util.identity_operation, xs=tensors)

    # TensorFlow summaries
    assert Module.global_summary_step is not None
    step = Module.retrieve_tensor(name=Module.global_summary_step)
    summaries = list()
    for name, tensor in tensors.items():
        shape = util.shape(x=tensor)
        if shape == ():
            summaries.append(tf.summary.scalar(name=name, data=tensor, step=step))
        elif shape == (-1,):
            tensor = tf.math.reduce_sum(input_tensor=tensor, axis=0)
            summaries.append(tf.summary.scalar(name=name, data=tensor, step=step))
        elif shape == (1,):
            tensor = tf.squeeze(input=tensor, axis=-1)
            summaries.append(tf.summary.scalar(name=name, data=tensor, step=step))
        elif shape == (-1, 1):
            tensor = tf.math.reduce_sum(
                input_tensor=tf.squeeze(input=tensor, axis=-1), axis=0
            )
            summaries.append(tf.summary.scalar(name=name, data=tensor, step=step))
        else:
            # General tensor as histogram
            assert not util.is_iterable(x=label) and label.endswith('-histogram')
            summaries.append(tf.summary.histogram(name=name, data=tensor, step=step))

    # Scope handling
    if Module.scope_stack is not None:
        if len(Module.global_scope) > 0:
            temp_scope.__exit__(None, None, None)
        for scope in Module.scope_stack[1:]:
            scope.__enter__()

    with tf.control_dependencies(control_inputs=summaries):
        return util.fmap(function=util.identity_operation, xs=pass_tensors)
def add_summary(
    self, label, name, tensor, pass_tensors=None, return_summaries=False,
    mean_variance=False, enumerate_last_rank=False
):
    # should be "labels" !!!
    # label
    if util.is_iterable(x=label):
        if not all(isinstance(x, str) for x in label):
            raise TensorforceError.type(name='summary', argument='label', value=label)
    else:
        if not isinstance(label, str):
            raise TensorforceError.type(name='summary', argument='label', value=label)
    # name
    if not isinstance(name, str):
        raise TensorforceError.type(name='summary', argument='name', value=name)
    # tensor
    if not isinstance(tensor, tf.Tensor):
        raise TensorforceError.type(name='summary', argument='tensor', value=tensor)
    # pass_tensors
    if util.is_iterable(x=pass_tensors):
        if not all(isinstance(x, tf.Tensor) for x in pass_tensors):
            raise TensorforceError.type(
                name='summary', argument='pass_tensors', value=pass_tensors
            )
    elif pass_tensors is not None:
        if not isinstance(pass_tensors, tf.Tensor):
            raise TensorforceError.type(
                name='summary', argument='pass_tensors', value=pass_tensors
            )
    # enumerate_last_rank
    if not isinstance(enumerate_last_rank, bool):
        raise TensorforceError.type(
            name='summary', argument='enumerate_last_rank', value=enumerate_last_rank
        )

    if pass_tensors is None:
        pass_tensors = tensor

    # Check whether summaries are logged
    if self.summary_labels is None:
        return pass_tensors
    # Check whether not in while loop
    if 'while' in Module.global_scope:  # 'cond' in Module.global_scope
        return pass_tensors
    # Check whether given label is logged
    if util.is_iterable(x=label):
        if all(x not in self.summary_labels for x in label):
            return pass_tensors
    else:
        if label not in self.summary_labels:
            return pass_tensors

    # Handle enumerate_last_rank
    if enumerate_last_rank:
        num_dims = util.shape(x=tensor)[-1]
        tensors = OrderedDict([(name + str(n), tensor[..., n]) for n in range(num_dims)])
    else:
        tensors = OrderedDict([(name, tensor)])

    if mean_variance:
        for name in list(tensors):
            tensor = tensors.pop(name)
            mean, variance = tf.nn.moments(x=tensor, axes=tuple(range(util.rank(x=tensor))))
            tensors[name + '-mean'] = mean
            tensors[name + '-variance'] = variance

    # TensorFlow summaries
    summaries = list()
    for name, tensor in tensors.items():
        shape = util.shape(x=tensor)
        if shape == () or shape == (-1,):
            # Scalar
            summaries.append(tf.contrib.summary.scalar(name=name, tensor=tensor))
        elif shape == (1,) or shape == (-1, 1):
            # Single-value tensor as scalar
            tensor = tf.squeeze(input=tensor, axis=-1)
            summaries.append(tf.contrib.summary.scalar(name=name, tensor=tensor))
        else:
            # General tensor as histogram
            summaries.append(tf.contrib.summary.histogram(name=name, tensor=tensor))

    with tf.control_dependencies(control_inputs=summaries):
        if util.is_iterable(x=pass_tensors):
            return tuple(util.identity_operation(x=x) for x in pass_tensors)
        else:
            return util.identity_operation(x=pass_tensors)
def run(
    self,
    # General
    num_episodes=None, num_timesteps=None, max_episode_timesteps=None, deterministic=False,
    num_sleep_secs=0.01, sync_timesteps=False, sync_episodes=False,
    # Callback
    callback=None, callback_episode_frequency=None, callback_timestep_frequency=None,
    # Tqdm
    use_tqdm=True, num_mean_reward=100
):
    # General
    if num_episodes is None:
        self.num_episodes = float('inf')
    else:
        self.num_episodes = num_episodes
    if num_timesteps is None:
        self.num_timesteps = float('inf')
    else:
        self.num_timesteps = num_timesteps
    if max_episode_timesteps is None:
        self.max_episode_timesteps = float('inf')
    else:
        self.max_episode_timesteps = max_episode_timesteps
    self.deterministic = deterministic
    self.num_sleep_secs = num_sleep_secs
    self.sync_timesteps = sync_timesteps
    self.sync_episodes = sync_episodes

    # Callback
    assert callback_episode_frequency is None or callback_timestep_frequency is None
    if callback_episode_frequency is None and callback_timestep_frequency is None:
        callback_episode_frequency = 1
    if callback_episode_frequency is None:
        self.callback_episode_frequency = float('inf')
    else:
        self.callback_episode_frequency = callback_episode_frequency
    if callback_timestep_frequency is None:
        self.callback_timestep_frequency = float('inf')
    else:
        self.callback_timestep_frequency = callback_timestep_frequency
    if callback is None:
        self.callback = (lambda r, p: True)
    elif util.is_iterable(x=callback):
        def sequential_callback(runner, parallel):
            result = True
            for fn in callback:
                x = fn(runner, parallel)
                if isinstance(result, bool):
                    result = result and x
            return result
        self.callback = sequential_callback
    else:
        def boolean_callback(runner, parallel):
            result = callback(runner, parallel)
            if isinstance(result, bool):
                return result
            else:
                return True
        self.callback = boolean_callback

    # Tqdm
    if use_tqdm:
        from tqdm import tqdm

        if hasattr(self, 'tqdm'):
            self.tqdm.close()

        assert self.num_episodes != float('inf') or self.num_timesteps != float('inf')
        inner_callback = self.callback

        if self.num_episodes != float('inf'):
            # Episode-based tqdm (default option if both num_episodes and num_timesteps set)
            assert self.num_episodes != float('inf')
            self.tqdm = tqdm(
                desc='Episodes', total=self.num_episodes, initial=self.global_episode,
                postfix=dict(mean_reward='{:.2f}'.format(0.0))
            )
            self.tqdm_last_update = self.global_episode

            def tqdm_callback(runner, parallel):
                mean_reward = float(np.mean(runner.episode_rewards[-num_mean_reward:]))
                runner.tqdm.set_postfix(mean_reward='{:.2f}'.format(mean_reward))
                runner.tqdm.update(n=(runner.global_episode - runner.tqdm_last_update))
                runner.tqdm_last_update = runner.global_episode
                return inner_callback(runner, parallel)

        else:
            # Timestep-based tqdm
            self.tqdm = tqdm(
                desc='Timesteps', total=self.num_timesteps, initial=self.global_timestep,
                postfix=dict(mean_reward='n/a')
            )
            self.tqdm_last_update = self.global_timestep

            def tqdm_callback(runner, parallel):
                # sum_timesteps_reward = sum(runner.timestep_rewards[num_mean_reward:])
                # num_timesteps = min(num_mean_reward, runner.episode_timestep)
                # mean_reward = sum_timesteps_reward / num_episodes
                runner.tqdm.set_postfix(mean_reward='n/a')
                runner.tqdm.update(n=(runner.global_timestep - runner.tqdm_last_update))
                runner.tqdm_last_update = runner.global_timestep
                return inner_callback(runner, parallel)

        self.callback = tqdm_callback

    # Reset agent
    self.agent.reset()

    # Episode counter
    self.episode = 1

    # Reset environments and episode statistics
    for environment in self.environments:
        environment.start_reset()
    self.episode_reward = [0 for _ in self.environments]
    self.episode_timestep = [0 for _ in self.environments]
    episode_start = [time.time() for _ in self.environments]
    if self.sync_episodes:
        terminated = [False for _ in self.environments]

    # Runner loop
    while True:
        if not self.sync_timesteps:
            no_environment_ready = True

        # Parallel environments loop
        for parallel, environment in enumerate(self.environments):
            if self.sync_episodes and terminated[parallel]:
                # Continue if episode terminated
                continue

            if self.sync_timesteps:
                # Wait until environment is ready
                while True:
                    observation = environment.retrieve_execute()
                    if observation is not None:
                        break
                    time.sleep(num_sleep_secs)
            else:
                # Check whether environment is ready
                observation = environment.retrieve_execute()
                if observation is None:
                    continue
                no_environment_ready = False

            states, terminal, reward = observation

            if terminal is None:
                # Retrieve actions from agent
                actions = self.agent.act(
                    states=states, deterministic=deterministic, parallel=parallel
                )
                self.episode_timestep[parallel] += 1
                # Execute actions in environment
                environment.start_execute(actions=actions)
                continue

            # Terminate episode if too long
            if self.episode_timestep[parallel] >= self.max_episode_timesteps:
                terminal = True

            # Observe unless episode just started
            assert (terminal is None) == (self.episode_timestep[parallel] == 0)
            if terminal is not None:
                self.agent.observe(terminal=terminal, reward=reward, parallel=parallel)
                self.episode_reward[parallel] += reward

            # Update global timestep/episode
            self.global_timestep = self.agent.timestep
            self.global_episode = self.agent.episode

            # Callback plus experiment termination check
            if self.episode_timestep[parallel] % self.callback_timestep_frequency == 0 and \
                    not self.callback(self, parallel):
                return

            if terminal:
                # Update experiment statistics
                self.episode_rewards.append(self.episode_reward[parallel])
                self.episode_timesteps.append(self.episode_timestep[parallel])
                self.episode_times.append(time.time() - episode_start[parallel])

                # Callback
                if self.episode % self.callback_episode_frequency == 0 and \
                        not self.callback(self, parallel):
                    return

            # Terminate experiment if too long
            if self.global_timestep >= self.num_timesteps:
                return
            elif self.global_episode >= self.num_episodes:
                return
            elif self.agent.should_stop():
                return

            # Check whether episode terminated
            if terminal:
                # Increment episode counter (after calling callback)
                self.episode += 1

                # Reset environment and episode statistics
                environment.start_reset()
                self.episode_reward[parallel] = 0
                self.episode_timestep[parallel] = 0
                episode_start[parallel] = time.time()

                if self.sync_episodes:
                    terminated[parallel] = True

            else:
                # Retrieve actions from agent
                actions = self.agent.act(
                    states=states, deterministic=deterministic, parallel=parallel
                )
                self.episode_timestep[parallel] += 1
                # Execute actions in environment
                environment.start_execute(actions=actions)

        if not self.sync_timesteps and no_environment_ready:
            # Sleep if no environment was ready
            time.sleep(num_sleep_secs)

        if self.sync_episodes and all(terminated):
            # Reset if all episodes terminated
            terminated = [False for _ in self.environments]
def run(
    self,
    # General
    num_episodes=None, num_timesteps=None, max_episode_timesteps=None, deterministic=False,
    num_repeat_actions=1,
    # Callback
    callback=None, callback_episode_frequency=None, callback_timestep_frequency=None,
    # Tqdm
    use_tqdm=True, num_mean_reward=100,
    # Evaluation
    evaluation_callback=None, evaluation_frequency=None, update_as_evaluation_frequency=False,
    max_evaluation_timesteps=None, num_evaluation_iterations=1, save_best_agent=False
):
    # General
    if num_episodes is None:
        self.num_episodes = float('inf')
    else:
        self.num_episodes = num_episodes
    if num_timesteps is None:
        self.num_timesteps = float('inf')
    else:
        self.num_timesteps = num_timesteps
    if max_episode_timesteps is None:
        self.max_episode_timesteps = float('inf')
    else:
        self.max_episode_timesteps = max_episode_timesteps
    self.deterministic = deterministic
    self.num_repeat_actions = num_repeat_actions

    # Callback
    assert callback_episode_frequency is None or callback_timestep_frequency is None
    if callback_episode_frequency is None and callback_timestep_frequency is None:
        callback_episode_frequency = 1
    if callback_episode_frequency is None:
        self.callback_episode_frequency = float('inf')
    else:
        self.callback_episode_frequency = callback_episode_frequency
    if callback_timestep_frequency is None:
        self.callback_timestep_frequency = float('inf')
    else:
        self.callback_timestep_frequency = callback_timestep_frequency
    if callback is None:
        self.callback = (lambda r: True)
    elif util.is_iterable(x=callback):
        def sequential_callback(runner):
            result = True
            for fn in callback:
                x = fn(runner)
                if isinstance(result, bool):
                    result = result and x
            return result
        self.callback = sequential_callback
    else:
        def boolean_callback(runner):
            result = callback(runner)
            if isinstance(result, bool):
                return result
            else:
                return True
        self.callback = boolean_callback

    # Tqdm
    if use_tqdm:
        from tqdm import tqdm

        if hasattr(self, 'tqdm'):
            self.tqdm.close()

        assert self.num_episodes != float('inf') or self.num_timesteps != float('inf')
        inner_callback = self.callback

        if self.num_episodes != float('inf'):
            # Episode-based tqdm (default option if both num_episodes and num_timesteps set)
            assert self.num_episodes != float('inf')
            self.tqdm = tqdm(
                desc='Episodes', total=self.num_episodes, initial=self.global_episode,
                postfix=dict(mean_reward='{:.2f}'.format(0.0))
            )
            self.tqdm_last_update = self.global_episode

            def tqdm_callback(runner):
                mean_reward = float(np.mean(runner.episode_rewards[-num_mean_reward:]))
                runner.tqdm.set_postfix(mean_reward='{:.2f}'.format(mean_reward))
                runner.tqdm.update(n=(runner.global_episode - runner.tqdm_last_update))
                runner.tqdm_last_update = runner.global_episode
                return inner_callback(runner)

        else:
            # Timestep-based tqdm
            assert self.num_timesteps != float('inf')
            self.tqdm = tqdm(
                desc='Timesteps', total=self.num_timesteps, initial=self.global_timestep,
                postfix=dict(mean_reward='n/a')
            )
            self.tqdm_last_update = self.global_timestep

            def tqdm_callback(runner):
                # sum_timesteps_reward = sum(runner.timestep_rewards[num_mean_reward:])
                # num_timesteps = min(num_mean_reward, runner.episode_timestep)
                # mean_reward = sum_timesteps_reward / num_episodes
                runner.tqdm.set_postfix(mean_reward='n/a')
                runner.tqdm.update(n=(runner.global_timestep - runner.tqdm_last_update))
                runner.tqdm_last_update = runner.global_timestep
                return inner_callback(runner)

        self.callback = tqdm_callback

    # Evaluation
    if evaluation_callback is None:
        self.evaluation_callback = (lambda r: None)
    else:
        self.evaluation_callback = evaluation_callback
    if evaluation_frequency is None:
        if update_as_evaluation_frequency:
            assert evaluation_frequency is None
            batch_size = self.agent.model.update_mode['batch_size']
            frequency = self.agent.model.update_mode.get('frequency', batch_size)
            if self.agent.model.update_unit == 'episodes':
                self.evaluation_frequency = frequency
            else:
                # Approximates maximum number of episodes for an update to happen
                self.evaluation_frequency = frequency // self.max_episode_timesteps
        else:
            self.evaluation_frequency = float('inf')
    else:
        assert not update_as_evaluation_frequency
        self.evaluation_frequency = evaluation_frequency
    if max_evaluation_timesteps is None:
        self.max_evaluation_timesteps = float('inf')
    else:
        self.max_evaluation_timesteps = max_evaluation_timesteps
    self.num_evaluation_iterations = num_evaluation_iterations
    self.save_best_agent = save_best_agent
    if self.save_best_agent:
        inner_evaluation_callback = self.evaluation_callback

        def mean_reward_callback(runner):
            result = inner_evaluation_callback(runner)
            if result is None:
                return float(np.mean(runner.evaluation_rewards))
            else:
                return result

        self.evaluation_callback = mean_reward_callback
        self.best_evaluation_score = None

    # Reset agent
    self.agent.reset()

    # Episode counter
    self.episode = 1

    # Episode loop
    while True:
        # Run episode
        if not self.run_episode(
            environment=self.environment, max_timesteps=self.max_episode_timesteps,
            evaluation=False
        ):
            return

        # Update experiment statistics
        self.episode_rewards.append(self.episode_reward)
        self.episode_timesteps.append(self.episode_timestep)
        self.episode_times.append(self.episode_time)

        # Run evaluation
        if self.episode % self.evaluation_frequency == 0:
            if self.evaluation_environment is None:
                environment = self.environment
            else:
                environment = self.evaluation_environment

            self.evaluation_rewards = list()
            self.evaluation_timesteps = list()
            self.evaluation_times = list()

            # Evaluation loop
            for _ in range(self.num_evaluation_iterations):
                self.run_episode(
                    environment=environment, max_timesteps=self.max_evaluation_timesteps,
                    evaluation=True
                )
                self.evaluation_rewards.append(self.episode_reward)
                self.evaluation_timesteps.append(self.episode_timestep)
                self.evaluation_times.append(self.episode_time)

            # Update global timestep/episode
            self.global_timestep = self.agent.timestep
            self.global_episode = self.agent.episode

            # Evaluation callback
            if self.save_best_agent:
                evaluation_score = self.evaluation_callback(self)
                assert isinstance(evaluation_score, float)
                if self.best_evaluation_score is None:
                    self.best_evaluation_score = evaluation_score
                elif evaluation_score > self.best_evaluation_score:
                    self.best_evaluation_score = evaluation_score
                    self.agent.save(filename='best-model', append_timestep=False)
            else:
                self.evaluation_callback(self)

        # Update global timestep/episode
        self.global_timestep = self.agent.timestep
        self.global_episode = self.agent.episode

        # Callback
        if self.episode % self.callback_episode_frequency == 0 and not self.callback(self):
            return

        # Terminate experiment if too long
        if self.global_timestep >= self.num_timesteps:
            return
        elif self.global_episode >= self.num_episodes:
            return
        elif self.agent.should_stop():
            return

        # Increment episode counter (after calling callback)
        self.episode += 1
def run( self, # General num_episodes=None, num_timesteps=None, num_updates=None, # Parallel batch_agent_calls=False, sync_timesteps=False, sync_episodes=False, num_sleep_secs=0.001, # Callback callback=None, callback_episode_frequency=None, callback_timestep_frequency=None, # Tqdm use_tqdm=True, mean_horizon=1, # Evaluation evaluation=False, save_best_agent=None, evaluation_callback=None ): """ Run experiment. Args: num_episodes (int > 0): Number of episodes to run experiment (<span style="color:#00C000"><b>default</b></span>: no episode limit). num_timesteps (int > 0): Number of timesteps to run experiment (<span style="color:#00C000"><b>default</b></span>: no timestep limit). num_updates (int > 0): Number of agent updates to run experiment (<span style="color:#00C000"><b>default</b></span>: no update limit). batch_agent_calls (bool): Whether to batch agent calls for parallel environment execution (<span style="color:#00C000"><b>default</b></span>: false, separate call per environment). sync_timesteps (bool): Whether to synchronize parallel environment execution on timestep-level, implied by batch_agent_calls (<span style="color:#00C000"><b>default</b></span>: false, unless batch_agent_calls is true). sync_episodes (bool): Whether to synchronize parallel environment execution on episode-level (<span style="color:#00C000"><b>default</b></span>: false). num_sleep_secs (float): Sleep duration if no environment is ready (<span style="color:#00C000"><b>default</b></span>: one milliseconds). callback ((Runner, parallel) -> bool): Callback function taking the runner instance plus parallel index and returning a boolean value indicating whether execution should continue (<span style="color:#00C000"><b>default</b></span>: callback always true). callback_episode_frequency (int): Episode interval between callbacks (<span style="color:#00C000"><b>default</b></span>: every episode). callback_timestep_frequency (int): Timestep interval between callbacks (<span style="color:#00C000"><b>default</b></span>: not specified). use_tqdm (bool): Whether to display a tqdm progress bar for the experiment run (<span style="color:#00C000"><b>default</b></span>: true), with the following additional information (averaged over number of episodes given via mean_horizon): <ul> <li>reward – cumulative episode reward</li> <li>ts/ep – timesteps per episode</li> <li>sec/ep – seconds per episode</li> <li>ms/ts – milliseconds per timestep</li> <li>agent – percentage of time spent on agent computation</li> <li>comm – if remote environment execution, percentage of time spent on communication</li> </ul> mean_horizon (int): Number of episodes progress bar values and evaluation score are averaged over (<span style="color:#00C000"><b>default</b></span>: not averaged). evaluation (bool): Whether to run in evaluation mode, only valid if single environment (<span style="color:#00C000"><b>default</b></span>: no evaluation). save_best_agent (string): Directory to save the best version of the agent according to the evaluation score (<span style="color:#00C000"><b>default</b></span>: best agent is not saved). evaluation_callback (int | Runner -> float): Callback function taking the runner instance and returning an evaluation score (<span style="color:#00C000"><b>default</b></span>: cumulative evaluation reward averaged over mean_horizon episodes). 
""" # General if num_episodes is None: self.num_episodes = float('inf') else: self.num_episodes = num_episodes if num_timesteps is None: self.num_timesteps = float('inf') else: self.num_timesteps = num_timesteps if num_updates is None: self.num_updates = float('inf') else: self.num_updates = num_updates # Parallel if len(self.environments) > 1: pass elif batch_agent_calls: raise TensorforceError.invalid( name='Runner.run', argument='batch_agent_calls', condition='single environment' ) elif sync_timesteps: raise TensorforceError.invalid( name='Runner.run', argument='sync_timesteps', condition='single environment' ) elif sync_episodes: raise TensorforceError.invalid( name='Runner.run', argument='sync_episodes', condition='single environment' ) self.batch_agent_calls = batch_agent_calls self.sync_timesteps = sync_timesteps or self.batch_agent_calls self.sync_episodes = sync_episodes self.num_sleep_secs = num_sleep_secs # Callback assert callback_episode_frequency is None or callback_timestep_frequency is None if callback_episode_frequency is None and callback_timestep_frequency is None: callback_episode_frequency = 1 if callback_episode_frequency is None: self.callback_episode_frequency = float('inf') else: self.callback_episode_frequency = callback_episode_frequency if callback_timestep_frequency is None: self.callback_timestep_frequency = float('inf') else: self.callback_timestep_frequency = callback_timestep_frequency if callback is None: self.callback = (lambda r, p: True) elif util.is_iterable(x=callback): def sequential_callback(runner, parallel): result = True for fn in callback: x = fn(runner, parallel) if isinstance(result, bool): result = result and x return result self.callback = sequential_callback else: def boolean_callback(runner, parallel): result = callback(runner, parallel) if isinstance(result, bool): return result else: return True self.callback = boolean_callback # Experiment statistics self.episode_rewards = list() self.episode_timesteps = list() self.episode_seconds = list() self.episode_agent_seconds = list() if self.is_environment_remote: self.episode_env_seconds = list() if self.evaluation or evaluation: self.evaluation_rewards = list() self.evaluation_timesteps = list() self.evaluation_seconds = list() self.evaluation_agent_seconds = list() if self.is_environment_remote: self.evaluation_env_seconds = list() if len(self.environments) == 1: # for tqdm self.episode_rewards = self.evaluation_rewards self.episode_timesteps = self.evaluation_timesteps self.episode_seconds = self.evaluation_seconds self.episode_agent_seconds = self.evaluation_agent_seconds if self.is_environment_remote: self.episode_env_seconds = self.evaluation_env_seconds else: # for tqdm self.evaluation_rewards = self.episode_rewards self.evaluation_timesteps = self.episode_timesteps self.evaluation_seconds = self.episode_seconds self.evaluation_agent_seconds = self.episode_agent_seconds if self.is_environment_remote: self.evaluation_env_seconds = self.episode_env_seconds # Timestep/episode/update counter self.timesteps = 0 self.episodes = 0 self.updates = 0 # Tqdm if use_tqdm: if hasattr(self, 'tqdm'): self.tqdm.close() assert self.num_episodes != float('inf') or self.num_timesteps != float('inf') inner_callback = self.callback if self.num_episodes != float('inf'): # Episode-based tqdm (default option if both num_episodes and num_timesteps set) assert self.num_episodes != float('inf') bar_format = ( '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}, reward={postfix[0]:.2f}, ts/ep=' '{postfix[1]}, 
sec/ep={postfix[2]:.2f}, ms/ts={postfix[3]:.1f}, agent=' '{postfix[4]:.1f}%]' ) postfix = [0.0, 0, 0.0, 0.0, 0.0] if self.is_environment_remote: bar_format = bar_format[:-1] + ', comm={postfix[5]:.1f}%]' postfix.append(0.0) self.tqdm = tqdm( desc='Episodes', total=self.num_episodes, bar_format=bar_format, initial=self.episodes, postfix=postfix ) self.tqdm_last_update = self.episodes def tqdm_callback(runner, parallel): if len(runner.evaluation_rewards) > 0: mean_reward = float(np.mean(runner.evaluation_rewards[-mean_horizon:])) runner.tqdm.postfix[0] = mean_reward if len(runner.episode_timesteps) > 0: mean_ts_per_ep = int(np.mean(runner.episode_timesteps[-mean_horizon:])) mean_sec_per_ep = float(np.mean(runner.episode_seconds[-mean_horizon:])) mean_agent_sec = float( np.mean(runner.episode_agent_seconds[-mean_horizon:]) ) mean_ms_per_ts = mean_sec_per_ep * 1000.0 / mean_ts_per_ep mean_rel_agent = mean_agent_sec * 100.0 / mean_sec_per_ep runner.tqdm.postfix[1] = mean_ts_per_ep runner.tqdm.postfix[2] = mean_sec_per_ep runner.tqdm.postfix[3] = mean_ms_per_ts runner.tqdm.postfix[4] = mean_rel_agent if runner.is_environment_remote and len(runner.episode_env_seconds) > 0: mean_env_sec = float(np.mean(runner.episode_env_seconds[-mean_horizon:])) mean_rel_comm = (mean_agent_sec + mean_env_sec) * 100.0 / mean_sec_per_ep mean_rel_comm = 100.0 - mean_rel_comm runner.tqdm.postfix[5] = mean_rel_comm runner.tqdm.update(n=(runner.episodes - runner.tqdm_last_update)) runner.tqdm_last_update = runner.episodes return inner_callback(runner, parallel) else: # Timestep-based tqdm self.tqdm = tqdm( desc='Timesteps', total=self.num_timesteps, initial=self.timesteps, postfix=dict(mean_reward='n/a') ) self.tqdm_last_update = self.timesteps def tqdm_callback(runner, parallel): # sum_timesteps_reward = sum(runner.timestep_rewards[num_mean_reward:]) # num_timesteps = min(num_mean_reward, runner.evaluation_timestep) # mean_reward = sum_timesteps_reward / num_episodes runner.tqdm.set_postfix(mean_reward='n/a') runner.tqdm.update(n=(runner.timesteps - runner.tqdm_last_update)) runner.tqdm_last_update = runner.timesteps return inner_callback(runner, parallel) self.callback = tqdm_callback # Evaluation if evaluation and len(self.environments) > 1: raise TensorforceError.invalid( name='Runner.run', argument='evaluation', condition='multiple environments' ) self.evaluation_run = self.evaluation or evaluation self.save_best_agent = save_best_agent if evaluation_callback is None: self.evaluation_callback = (lambda r: None) else: self.evaluation_callback = evaluation_callback if self.save_best_agent is not None: inner_evaluation_callback = self.evaluation_callback def mean_reward_callback(runner): result = inner_evaluation_callback(runner) if result is None: return float(np.mean(runner.evaluation_rewards[-mean_horizon:])) else: return result self.evaluation_callback = mean_reward_callback self.best_evaluation_score = None # Episode statistics self.episode_reward = [0.0 for _ in self.environments] self.episode_timestep = [0 for _ in self.environments] # if self.batch_agent_calls: # self.episode_agent_second = 0.0 # self.episode_start = time.time() if self.evaluation_run: self.episode_agent_second = [0.0 for _ in self.environments[:-1]] self.episode_start = [time.time() for _ in self.environments[:-1]] else: self.episode_agent_second = [0.0 for _ in self.environments] self.episode_start = [time.time() for _ in self.environments] self.evaluation_agent_second = 0.0 self.evaluation_start = time.time() # Values self.terminate = 0 
self.prev_terminals = [-1 for _ in self.environments] self.states = [None for _ in self.environments] self.terminals = [None for _ in self.environments] self.rewards = [None for _ in self.environments] if self.evaluation_run: self.evaluation_internals = self.agent.initial_internals() # Required if agent was previously stopped mid-episode self.agent.reset() # Reset environments for environment in self.environments: environment.start_reset() # Runner loop while any(terminal <= 0 for terminal in self.prev_terminals): self.terminals = [None for _ in self.terminals] if self.batch_agent_calls: # Retrieve observations (only if not already terminated) while any(terminal is None for terminal in self.terminals): for n in range(len(self.environments)): if self.terminals[n] is not None: # Already received continue elif self.prev_terminals[n] <= 0: # Receive if not terminal observation = self.environments[n].receive_execute() if observation is None: continue self.states[n], self.terminals[n], self.rewards[n] = observation else: # Terminal self.states[n] = None self.terminals[n] = self.prev_terminals[n] self.rewards[n] = None self.handle_observe_joint() self.handle_act_joint() # Parallel environments loop no_environment_ready = True for n in range(len(self.environments)): if self.prev_terminals[n] > 0: # Continue if episode terminated (either sync_episodes or finished) self.terminals[n] = self.prev_terminals[n] continue elif self.batch_agent_calls: # Handled before parallel environments loop pass elif self.sync_timesteps: # Wait until environment is ready while True: observation = self.environments[n].receive_execute() if observation is not None: break else: # Check whether environment is ready, otherwise continue observation = self.environments[n].receive_execute() if observation is None: self.terminals[n] = self.prev_terminals[n] continue no_environment_ready = False if not self.batch_agent_calls: self.states[n], self.terminals[n], self.rewards[n] = observation # Check whether evaluation environment if self.evaluation_run and n == (len(self.environments) - 1): if self.terminals[n] == -1: # Initial act self.handle_act_evaluation() else: # Observe self.handle_observe_evaluation() if self.terminals[n] == 0: # Act self.handle_act_evaluation() else: # Terminal self.handle_terminal_evaluation() else: if self.terminals[n] == -1: # Initial act self.handle_act(parallel=n) else: # Observe self.handle_observe(parallel=n) if self.terminals[n] == 0: # Act self.handle_act(parallel=n) else: # Terminal self.handle_terminal(parallel=n) self.prev_terminals = list(self.terminals) # Sync_episodes: Reset if all episodes terminated if self.sync_episodes and all(terminal > 0 for terminal in self.terminals): num_episodes_left = self.num_episodes - self.episodes num_noneval_environments = len(self.environments) - int(self.evaluation_run) for n in range(min(num_noneval_environments, num_episodes_left)): self.prev_terminals[n] = -1 self.environments[n].start_reset() if self.evaluation_run and num_episodes_left > 0: self.prev_terminals[-1] = -1 self.environments[-1].start_reset() # Sleep if no environment was ready if no_environment_ready: time.sleep(self.num_sleep_secs)
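For orientation, a minimal usage sketch of this runner loop, assuming a Gym-style environment specification and an agent spec file (both names are illustrative, not taken from the code above):

from tensorforce.execution import Runner

# Hypothetical agent/environment specs; Runner and run() are the ones
# defined in this file.
runner = Runner(
    agent='ppo.json',
    environment=dict(environment='gym', level='CartPole-v1'),
    num_parallel=4,
)
# sync_episodes=True exercises the "Sync_episodes: Reset if all episodes
# terminated" branch above: no environment is reset until all parallel
# episodes have ended.
runner.run(num_episodes=100, sync_episodes=True)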
def __setattr__(self, name, value): if not self.overwrite: raise NotImplementedError if name == 'type': if value is None: # Type: None pass elif util.is_iterable(x=value): # Type: tuple(*types) if any(_normalize_type(dtype=x) is None for x in value): raise TensorforceError.value(name='TensorSpec', argument=name, value=value) value = tuple(_normalize_type(dtype=x) for x in value) else: # Type: 'bool' | 'int' | 'float' if _normalize_type(dtype=value) is None: raise TensorforceError.value(name='TensorSpec', argument=name, value=value) value = _normalize_type(dtype=value) # Delete attributes not required anymore if self.type is not None and self.type != 'bool' and value == 'bool': super().__delattr__('min_value') super().__delattr__('max_value') if self.type is not None and ( self.type == 'int' or (isinstance(self.type, tuple) and 'int' in self.type) ) and value != 'int' and (not isinstance(value, tuple) or 'int' not in value): super().__delattr__('num_values') # Set type attribute super().__setattr__(name, value) # Reset attributes if self.type == 'int' or (isinstance(self.type, tuple) and 'int' in self.type): self.min_value = None self.max_value = None self.num_values = None elif self.type != 'bool': self.min_value = None self.max_value = None elif name == 'shape': if value is None: # Shape: None pass elif util.is_iterable(x=value): if len(value) > 0 and value[0] is None: # Shape: tuple(None, *ints >= -1) try: value = (None,) + tuple(int(x) for x in value[1:]) if any(x < -1 for x in value[1:]): raise TensorforceError.value( name='TensorSpec', argument=name, value=value ) except BaseException: raise TensorforceError.type( name='TensorSpec', argument=name, value=type(value) ) else: # Shape: tuple(*ints >= -1) try: value = tuple(int(x) for x in value) if any(x < -1 for x in value): raise TensorforceError.value( name='TensorSpec', argument=name, value=value ) except BaseException: raise TensorforceError.value(name='TensorSpec', argument=name, value=value) else: # Shape: (int >= -1,) try: value = (int(value),) if value[0] < -1: raise TensorforceError.value(name='TensorSpec', argument=name, value=value) except BaseException: raise TensorforceError.type(name='TensorSpec', argument=name, value=type(value)) # TODO: check min/max_value shape if np.ndarray # Set shape attribute super().__setattr__(name, value) elif name == 'min_value' or name == 'max_value': # Invalid for type == 'bool', or type == 'int' and num_values != None if self.type == 'bool': raise TensorforceError.invalid( name='TensorSpec', argument=name, condition='type is bool' ) if value is None: # Min/max value: None pass else: # Min/max value: int/float try: value = self.py_type()(value) if self.type == 'int' and self.num_values is not None: if name == 'min_value': assert value == 0 elif name == 'max_value': assert value == self.num_values - 1 except BaseException: try: value = np.asarray(value, dtype=self.np_type()) if self.type == 'int': assert self.num_values is None except BaseException: raise TensorforceError.type( name='TensorSpec', argument=name, value=type(value) ) if isinstance(value, np.ndarray): if self.shape is not None and ( value.ndim > len(self.shape) or value.shape != self.shape[:value.ndim] ): raise TensorforceError.value( name='TensorSpec', argument=(name + ' shape'), value=value.shape, hint='incompatible with {}'.format(self.shape) ) if name == 'min_value' and self.max_value is not None and \ (value > self.max_value - util.epsilon).any(): raise TensorforceError.value( name='TensorSpec', argument=name, value=value, 
condition='max_value = {}'.format(self.max_value) ) elif name == 'max_value' and self.min_value is not None and \ (value < self.min_value + util.epsilon).any(): raise TensorforceError.value( name='TensorSpec', argument=name, value=value, condition='min_value = {}'.format(self.min_value) ) else: if name == 'min_value' and self.max_value is not None: if isinstance(self.max_value, np.ndarray): if (value > self.max_value - util.epsilon).any(): raise TensorforceError.value( name='TensorSpec', argument=name, value=value, condition='max_value = {}'.format(self.max_value) ) elif value > self.max_value - util.epsilon: raise TensorforceError.value( name='TensorSpec', argument=name, value=value, condition='max_value = {}'.format(self.max_value) ) elif name == 'max_value' and self.min_value is not None: if isinstance(self.min_value, np.ndarray): if (value < self.min_value + util.epsilon).any(): raise TensorforceError.value( name='TensorSpec', argument=name, value=value, condition='min_value = {}'.format(self.min_value) ) elif value < self.min_value + util.epsilon: raise TensorforceError.value( name='TensorSpec', argument=name, value=value, condition='min_value = {}'.format(self.min_value) ) # Set min/max_value attribute super().__setattr__(name, value) elif name == 'num_values': # Invalid for type != 'int' if self.type != 'int' and (not isinstance(self.type, tuple) or 'int' not in self.type): raise TensorforceError.invalid( name='TensorSpec', argument=name, condition='type is {}'.format(self.type) ) if value is None: # Num values: None pass else: # Num values: int >= 0 try: value = int(value) except BaseException: raise TensorforceError.type(name='TensorSpec', argument=name, value=type(value)) if value < 0: raise TensorforceError.value(name='TensorSpec', argument=name, value=value) # Set num_values attribute and min/max_value accordingly super().__setattr__(name, value) if value is not None and value > 0: super().__setattr__('min_value', 0) super().__setattr__('max_value', value - 1) else: super().__setattr__('min_value', None) super().__setattr__('max_value', None) else: raise TensorforceError.invalid(name='TensorSpec', argument=name)
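To make the attribute coupling above concrete, a small sketch (assuming TensorSpec is constructed as elsewhere in this codebase; values are illustrative):

# Setting num_values derives min_value/max_value; clearing it resets them.
spec = TensorSpec(type='int', shape=(2,), num_values=5)
assert spec.min_value == 0 and spec.max_value == 4

# For float specs, num_values is rejected, and min/max must be consistent:
spec = TensorSpec(type='float', shape=(3,), min_value=-1.0, max_value=1.0)
# spec.num_values = 3   # would raise: invalid when type is 'float'
# spec.min_value = 2.0  # would raise: min_value above max_value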
def __init__(self, agent, environment=None, num_parallel=None, environments=None, max_episode_timesteps=None, evaluation_environment=None, save_best_agent=None): self.environments = list() if environment is None: assert num_parallel is None and environments is not None if not util.is_iterable(x=environments): raise TensorforceError.type(name='parallel-runner', argument='environments', dtype=type(environments)) elif len(environments) == 0: raise TensorforceError.value(name='parallel-runner', argument='environments', value=environments) num_parallel = len(environments) environment = environments[0] self.is_environment_external = isinstance(environment, Environment) environment = Environment.create( environment=environment, max_episode_timesteps=max_episode_timesteps) states = environment.states() actions = environment.actions() self.environments.append(environment) for environment in environments[1:]: assert isinstance(environment, Environment) == self.is_environment_external environment = Environment.create( environment=environment, max_episode_timesteps=max_episode_timesteps) assert environment.states() == states assert environment.actions() == actions self.environments.append(environment) else: assert num_parallel is not None and environments is None assert not isinstance(environment, Environment) self.is_environment_external = False for _ in range(num_parallel): self.environments.append(Environment.create( environment=environment, max_episode_timesteps=max_episode_timesteps)) environment = self.environments[-1] if evaluation_environment is None: self.evaluation_environment = None else: self.is_eval_environment_external = isinstance(evaluation_environment, Environment) self.evaluation_environment = Environment.create( environment=evaluation_environment, max_episode_timesteps=max_episode_timesteps) assert self.evaluation_environment.states() == environment.states() assert self.evaluation_environment.actions() == environment.actions() self.is_agent_external = isinstance(agent, Agent) kwargs = dict(parallel_interactions=num_parallel) self.agent = Agent.create(agent=agent, environment=environment, **kwargs) self.save_best_agent = save_best_agent self.episode_rewards = list() self.episode_timesteps = list() self.episode_seconds = list() self.episode_agent_seconds = list() self.evaluation_rewards = list() self.evaluation_timesteps = list() self.evaluation_seconds = list() self.evaluation_agent_seconds = list()
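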
def __init__(self, agent, learner, environment, p_network, global_dict, report_frequency, algorithm, callback=None, callback_episode_frequency=None, callback_timestep_frequency=None, parallel_interactions=1, num_episodes=None, **kwargs ): if isinstance(environment, BaseEnvironment): fruit_environment = environment self.tf_environment = TensorForcePlugin.convert(environment) else: environment = Environment.create(environment=environment) fruit_environment = TensorForcePlugin.convert(environment) self.tf_environment = environment super().__init__(agent=agent, name=learner, environment=fruit_environment, network=p_network, global_dict=global_dict, report_frequency=report_frequency) self.algorithm = algorithm self.tf_agent = Agent.create( algorithm, self.tf_environment, **kwargs ) if not self.tf_agent.model.is_initialized: self.tf_agent.initialize() self.episode_rewards = list() self.episode_timesteps = list() self.episode_seconds = list() self.parallel_interactions = parallel_interactions if num_episodes is None: self.num_episodes = float('inf') else: self.num_episodes = num_episodes assert callback_episode_frequency is None or callback_timestep_frequency is None if callback_episode_frequency is None and callback_timestep_frequency is None: callback_episode_frequency = 1 if callback_episode_frequency is None: self.callback_episode_frequency = float('inf') else: self.callback_episode_frequency = callback_episode_frequency if callback_timestep_frequency is None: self.callback_timestep_frequency = float('inf') else: self.callback_timestep_frequency = callback_timestep_frequency if callback is None: self.callback = (lambda r: True) elif util.is_iterable(x=callback): def sequential_callback(runner): result = True for fn in callback: x = fn(runner) if isinstance(result, bool): result = result and x return result self.callback = sequential_callback else: def boolean_callback(runner): result = callback(runner) if isinstance(result, bool): return result else: return True self.callback = boolean_callback
def _process_states_input(self, states, function_name): if self.states_spec.is_singleton() and not isinstance( states, dict) and not (util.is_iterable(x=states) and isinstance(states[0], dict)): # Single state states = np.asarray(states) if states.shape == self.states_spec.value().shape: # Single state is not batched states = ArrayDict(singleton=np.expand_dims(states, axis=0)) batched = False num_instances = 1 is_iter_of_dicts = None else: # Single state is batched, iter[state] assert states.shape[1:] == self.states_spec.value().shape assert type(states) in (tuple, list, np.ndarray) num_instances = states.shape[0] states = ArrayDict(singleton=states) batched = True is_iter_of_dicts = True # Default elif util.is_iterable(x=states): # States is batched, iter[dict[state]] batched = True num_instances = len(states) is_iter_of_dicts = True assert type(states) in (tuple, list) if num_instances == 0: raise TensorforceError.value(name=function_name, argument='len(states)', value=num_instances, hint='= 0') for n, state in enumerate(states): if not isinstance(state, dict): raise TensorforceError.type( name=function_name, argument='states[{}]'.format(n), dtype=type(state), hint='is not dict') # Turn iter of dicts into dict of arrays # (Doesn't use self.states_spec since states also contains auxiliaries) states = [ArrayDict(state) for state in states] states = states[0].fmap( function=(lambda *xs: np.stack(xs, axis=0)), zip_values=states[1:]) elif isinstance(states, dict): # States is dict, turn into arrays states = ArrayDict(states) name, spec = self.states_spec.item() if name is None: name = 'state' if states[name].shape == spec.shape: # States is not batched, dict[state] states = states.fmap( function=(lambda state: np.expand_dims(state, axis=0))) batched = False num_instances = 1 is_iter_of_dicts = None else: # States is batched, dict[iter[state]] assert states[name].shape[1:] == spec.shape assert type(states[name]) in (tuple, list, np.ndarray) batched = True num_instances = states[name].shape[0] is_iter_of_dicts = False if num_instances == 0: raise TensorforceError.value(name=function_name, argument='len(states)', value=num_instances, hint='= 0') else: raise TensorforceError.type(name=function_name, argument='states', dtype=type(states), hint='is not array/tuple/list/dict') # Check number of inputs if any(state.shape[0] != num_instances for state in states.values()): raise TensorforceError.value( name=function_name, argument='len(states)', value=[state.shape[0] for state in states.values()], hint='inconsistent') return states, batched, num_instances, is_iter_of_dicts
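Summarizing the four state layouts that this normalization handles, for a hypothetical singleton float state of shape (4,):

import numpy as np

single = np.zeros(4)                                  # unbatched single state
batched = np.zeros((8, 4))                            # batched array, iter[state]
as_dict = {'state': np.zeros(4)}                      # unbatched dict[state]
iter_of_dicts = [{'state': np.zeros(4)} for _ in range(8)]  # iter[dict[state]]
# Each is converted to an ArrayDict of batched arrays, returned together with
# the (batched, num_instances, is_iter_of_dicts) flags computed above.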
def add_variable( self, name, dtype, shape, is_trainable, initializer='zeros', is_saved=True, summarize=None, shared=None ): # name if not util.is_valid_name(name=name): raise TensorforceError.value(name='Module.add_variable', argument='name', value=name) elif name in self.variables: raise TensorforceError.exists(name='variable', value=name) # dtype if not util.is_valid_type(dtype=dtype): raise TensorforceError.value(name='Module.add_variable', argument='dtype', value=dtype) # shape if not util.is_iterable(x=shape) or not all(isinstance(dims, int) for dims in shape): raise TensorforceError.value(name='Module.add_variable', argument='shape', value=shape) elif not all(dims > 0 for dims in shape): raise TensorforceError.value(name='Module.add_variable', argument='shape', value=shape) # is_trainable if not isinstance(is_trainable, bool): raise TensorforceError.type( name='Module.add_variable', argument='is_trainable', dtype=type(is_trainable) ) elif is_trainable and dtype != 'float': raise TensorforceError.value( name='Module.add_variable', argument='is_trainable', value=is_trainable, condition='dtype != float' ) # initializer initializer_names = ( 'normal', 'normal-relu', 'orthogonal', 'orthogonal-relu', 'zeros', 'ones' ) if not isinstance(initializer, (util.py_dtype(dtype=dtype), np.ndarray, tf.Tensor)) and \ initializer not in initializer_names: raise TensorforceError.value( name='Module.add_variable', argument='initializer', value=initializer ) elif isinstance(initializer, np.ndarray) and \ initializer.dtype != util.np_dtype(dtype=dtype): raise TensorforceError.type( name='Module.add_variable', argument='initializer', dtype=type(initializer) ) elif isinstance(initializer, tf.Tensor) and util.dtype(x=initializer) != dtype: raise TensorforceError.type( name='Module.add_variable', argument='initializer', dtype=type(initializer) ) # is_saved if not isinstance(is_saved, bool): raise TensorforceError.type( name='Module.add_variable', argument='is_saved', dtype=type(is_saved) ) # summarize if summarize is not None and not isinstance(summarize, bool): raise TensorforceError.type( name='Module.add_variable', argument='summarize', dtype=type(summarize) ) # shared if shared is not None and not isinstance(shared, str): raise TensorforceError.type( name='Module.add_variable', argument='shared',dtype=type(shared) ) variable = None if shared is not None and len(self.graph.get_collection(name=shared)) > 0: # Retrieve shared variable from TensorFlow collection = self.graph.get_collection(name=shared) if len(collection) > 1: raise TensorforceError.unexpected() variable = collection[0] else: tf_dtype = util.tf_dtype(dtype=dtype) # Variable initializer if isinstance(initializer, util.py_dtype(dtype=dtype)): initializer = tf.constant(value=initializer, dtype=tf_dtype, shape=shape) elif isinstance(initializer, np.ndarray): if initializer.shape != shape: raise TensorforceError.mismatch( name='Module.add_variable', value1='shape', value2='initializer' ) initializer = tf.constant(value=initializer, dtype=tf_dtype) elif isinstance(initializer, tf.Tensor): if util.shape(x=initializer) != shape: raise TensorforceError.mismatch( name='Module.add_variable', value1='shape', value2='initializer' ) initializer = initializer elif not isinstance(initializer, str): raise TensorforceError("Invalid variable initializer: {}".format(initializer)) elif initializer[:6] == 'normal': if dtype != 'float': raise TensorforceError( message="Invalid variable initializer value for non-float variable: {}.".format( initializer ) ) if 
initializer[6:] == '-relu': stddev = min(0.1, sqrt(2.0 / util.product(xs=shape[:-1]))) else: stddev = min(0.1, sqrt(2.0 / (util.product(xs=shape[:-1]) + shape[-1]))) initializer = tf.random.normal(shape=shape, stddev=stddev, dtype=tf_dtype) elif initializer[:10] == 'orthogonal': if dtype != 'float': raise TensorforceError( message="Invalid variable initializer value for non-float variable: {}.".format( initializer ) ) if len(shape) < 2: raise TensorforceError( message="Invalid variable initializer value for 0/1-rank variable: {}.".format( initializer ) ) normal = np.random.normal(size=(util.product(xs=shape[:-1]), shape[-1])) u, _, v = np.linalg.svd(a=normal, full_matrices=False) orthogonal = u if u.shape[1] == shape[-1] else v if initializer[10:] == '-relu': orthogonal = orthogonal * sqrt(2.0) initializer = tf.constant(value=orthogonal.reshape(shape), dtype=tf_dtype) elif initializer == 'zeros': initializer = tf.zeros(shape=shape, dtype=tf_dtype) elif initializer == 'ones': initializer = tf.ones(shape=shape, dtype=tf_dtype) # Variable variable = tf.Variable( initial_value=initializer, trainable=is_trainable, validate_shape=True, name=name, dtype=tf_dtype, shape=shape ) # Register shared variable with TensorFlow if shared is not None: self.graph.add_to_collection(name=shared, value=variable) # Register variable self.variables[name] = variable if is_trainable: self.trainable_variables[name] = variable if is_saved: self.saved_variables[name] = variable # Add summary if (summarize is None and is_trainable) or summarize: variable = self.add_summary( label='variables', name=name, tensor=variable, mean_variance=True ) variable = self.add_summary(label='variables-histogram', name=name, tensor=variable) return variable
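The 'orthogonal' branch above constructs the initial value from the SVD of a Gaussian matrix, whose factors are orthonormal; a standalone sketch of just that computation (the shape is illustrative):

import numpy as np
from math import sqrt

shape = (3, 3, 16)                                # e.g. a conv kernel
normal = np.random.normal(size=(int(np.prod(shape[:-1])), shape[-1]))
u, _, v = np.linalg.svd(a=normal, full_matrices=False)
orthogonal = u if u.shape[1] == shape[-1] else v  # orthonormal columns/rows
orthogonal = orthogonal * sqrt(2.0)               # only for the '-relu' variant
weights = orthogonal.reshape(shape)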
def unify(self, *, other, name='TensorSpec.unify'): # Unify type if self.type is None: dtype = other.type elif other.type is None: dtype = self.type elif util.is_iterable(x=self.type): if util.is_iterable(x=other.type): if set(self.type) <= set(other.type): dtype = self.type elif set(other.type) <= set(self.type): dtype = other.type else: raise TensorforceError.mismatch( name=name, argument='type', value1=self.type, value2=other.type ) elif other.type in self.type: dtype = other.type else: raise TensorforceError.mismatch( name=name, argument='type', value1=self.type, value2=other.type ) elif util.is_iterable(x=other.type): if self.type in other.type: dtype = self.type else: raise TensorforceError.mismatch( name=name, argument='type', value1=self.type, value2=other.type ) elif self.type == other.type: dtype = self.type else: raise TensorforceError.mismatch( name=name, argument='type', value1=self.type, value2=other.type ) # Unify shape if self.shape is None: shape = other.shape elif other.shape is None: shape = self.shape else: reverse_shape = list() start = len(self.shape) - 1 if self.shape[-1] is None: reverse_shape.extend(other.shape[len(self.shape) - 1:]) start = len(self.shape) - 2 elif other.shape[-1] is None: reverse_shape.extend(self.shape[len(other.shape) - 1:]) start = len(other.shape) - 2 elif len(self.shape) != len(other.shape): raise TensorforceError.mismatch( name=name, argument='rank', value1=self.rank, value2=other.rank ) for n in range(start, -1, -1): if self.shape[n] == 0: reverse_shape.append(other.shape[n]) elif other.shape[n] == 0: reverse_shape.append(self.shape[n]) elif self.shape[n] == -1 and other.shape[n] > 0: reverse_shape.append(other.shape[n]) elif other.shape[n] == -1 and self.shape[n] > 0: reverse_shape.append(self.shape[n]) elif self.shape[n] == other.shape[n]: reverse_shape.append(self.shape[n]) else: raise TensorforceError.mismatch( name=name, argument='shape', value1=self.shape, value2=other.shape ) shape = tuple(reversed(reverse_shape)) # Unify min_value if dtype == 'bool': min_value = None elif self.type != 'bool' and self.min_value is not None: if other.type != 'bool' and other.min_value is not None: if isinstance(self.min_value, np.ndarray) or \ isinstance(other.min_value, np.ndarray): min_value = np.minimum(self.min_value, other.min_value) elif self.min_value < other.min_value: min_value = other.min_value else: min_value = self.min_value else: min_value = self.min_value elif other.type != 'bool' and other.min_value is not None: min_value = other.min_value else: min_value = None # Unify max_value if dtype == 'bool': max_value = None elif self.type != 'bool' and self.max_value is not None: if other.type != 'bool' and other.max_value is not None: if isinstance(self.max_value, np.ndarray) or \ isinstance(other.max_value, np.ndarray): max_value = np.maximum(self.max_value, other.max_value) elif self.max_value < other.max_value: max_value = other.max_value else: max_value = self.max_value else: max_value = self.max_value elif other.type != 'bool' and other.max_value is not None: max_value = other.max_value else: max_value = None if min_value is not None and max_value is not None: if isinstance(min_value, np.ndarray) or isinstance(max_value, np.ndarray): if (min_value > max_value).any(): raise TensorforceError.mismatch( name=name, argument='min/max_value', value1=min_value, value2=max_value ) else: if min_value > max_value: raise TensorforceError.mismatch( name=name, argument='min/max_value', value1=min_value, value2=max_value ) # Unify num_values if dtype 
!= 'int' and (not isinstance(dtype, tuple) or 'int' not in dtype): num_values = None elif self.type == 'int' and self.num_values is not None: if other.type == 'int' and other.num_values is not None: if self.num_values == 0: num_values = other.num_values elif other.num_values == 0: num_values = self.num_values elif self.num_values == other.num_values: num_values = self.num_values else: raise TensorforceError.mismatch( name=name, argument='num_values', value1=self.num_values, value2=other.num_values ) else: num_values = self.num_values elif other.type == 'int' and other.num_values is not None: num_values = other.num_values else: num_values = None if num_values is not None: min_value = None max_value = None # Unified tensor spec return TensorSpec( type=dtype, shape=shape, min_value=min_value, max_value=max_value, num_values=num_values )
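As an illustration of this unification logic (spec values are made up): -1 and 0 dimensions act as wildcards, and one-sided bounds carry over to the unified spec:

a = TensorSpec(type='float', shape=(-1, 4), min_value=0.0)
b = TensorSpec(type='float', shape=(8, 4), max_value=1.0)
c = a.unify(other=b)
# c: type='float', shape=(8, 4), min_value=0.0, max_value=1.0
# Incompatible specs, e.g. shape=(8, 4) vs shape=(8, 5), raise a mismatch error.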
def run( self, # General num_episodes=None, num_timesteps=None, num_updates=None, join_agent_calls=False, sync_timesteps=False, sync_episodes=False, num_sleep_secs=0.01, # Callback callback=None, callback_episode_frequency=None, callback_timestep_frequency=None, # Tqdm use_tqdm=True, mean_horizon=1, # Evaluation evaluation_callback=None, ): # General if num_episodes is None: self.num_episodes = float('inf') else: self.num_episodes = num_episodes if num_timesteps is None: self.num_timesteps = float('inf') else: self.num_timesteps = num_timesteps if num_updates is None: self.num_updates = float('inf') else: self.num_updates = num_updates self.join_agent_calls = join_agent_calls if self.join_agent_calls: sync_timesteps = True self.sync_timesteps = sync_timesteps self.sync_episodes = sync_episodes self.num_sleep_secs = num_sleep_secs # Callback assert callback_episode_frequency is None or callback_timestep_frequency is None if callback_episode_frequency is None and callback_timestep_frequency is None: callback_episode_frequency = 1 if callback_episode_frequency is None: self.callback_episode_frequency = float('inf') else: self.callback_episode_frequency = callback_episode_frequency if callback_timestep_frequency is None: self.callback_timestep_frequency = float('inf') else: self.callback_timestep_frequency = callback_timestep_frequency if callback is None: self.callback = (lambda r, p: True) elif util.is_iterable(x=callback): def sequential_callback(runner, parallel): result = True for fn in callback: x = fn(runner, parallel) if isinstance(result, bool): result = result and x return result self.callback = sequential_callback else: def boolean_callback(runner, parallel): result = callback(runner, parallel) if isinstance(result, bool): return result else: return True self.callback = boolean_callback # Timestep/episode/update counter self.timesteps = 0 self.episodes = 0 self.updates = 0 # Tqdm if use_tqdm: if hasattr(self, 'tqdm'): self.tqdm.close() assert self.num_episodes != float( 'inf') or self.num_timesteps != float('inf') inner_callback = self.callback if self.num_episodes != float('inf'): # Episode-based tqdm (default option if both num_episodes and num_timesteps set) assert self.num_episodes != float('inf') bar_format = ( '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}, reward={postfix[0]:.2f}, ts/ep=' '{postfix[1]}, sec/ep={postfix[2]:.2f}, ms/ts={postfix[3]:.1f}, agent=' '{postfix[4]:.1f}%]') postfix = [0.0, 0, 0.0, 0.0, 0.0] self.tqdm = tqdm(desc='Episodes', total=self.num_episodes, bar_format=bar_format, initial=self.episodes, postfix=postfix) self.tqdm_last_update = self.episodes def tqdm_callback(runner, parallel): mean_reward = float( np.mean(runner.episode_rewards[-mean_horizon:])) mean_ts_per_ep = int( np.mean(runner.episode_timesteps[-mean_horizon:])) mean_sec_per_ep = float( np.mean(runner.episode_seconds[-mean_horizon:])) mean_agent_sec = float( np.mean(runner.episode_agent_seconds[-mean_horizon:])) mean_ms_per_ts = mean_sec_per_ep * 1000.0 / mean_ts_per_ep mean_rel_agent = mean_agent_sec * 100.0 / mean_sec_per_ep runner.tqdm.postfix[0] = mean_reward runner.tqdm.postfix[1] = mean_ts_per_ep runner.tqdm.postfix[2] = mean_sec_per_ep runner.tqdm.postfix[3] = mean_ms_per_ts runner.tqdm.postfix[4] = mean_rel_agent runner.tqdm.update(n=(runner.episodes - runner.tqdm_last_update)) runner.tqdm_last_update = runner.episodes return inner_callback(runner, parallel) else: # Timestep-based tqdm self.tqdm = tqdm(desc='Timesteps', total=self.num_timesteps, initial=self.timesteps, 
postfix=dict(mean_reward='n/a')) self.tqdm_last_update = self.timesteps def tqdm_callback(runner, parallel): # sum_timesteps_reward = sum(runner.timestep_rewards[num_mean_reward:]) # num_timesteps = min(num_mean_reward, runner.episode_timestep) # mean_reward = sum_timesteps_reward / num_episodes runner.tqdm.set_postfix(mean_reward='n/a') runner.tqdm.update(n=(runner.timesteps - runner.tqdm_last_update)) runner.tqdm_last_update = runner.timesteps return inner_callback(runner, parallel) self.callback = tqdm_callback # Evaluation if self.evaluation_environment is None: assert evaluation_callback is None assert self.save_best_agent is None else: if evaluation_callback is None: self.evaluation_callback = (lambda r: None) else: self.evaluation_callback = evaluation_callback if self.save_best_agent is not None: inner_evaluation_callback = self.evaluation_callback def mean_reward_callback(runner): result = inner_evaluation_callback(runner) if result is None: return runner.evaluation_reward else: return result self.evaluation_callback = mean_reward_callback self.best_evaluation_score = None # Required if agent was previously stopped mid-episode self.agent.reset() # Reset environments and episode statistics for environment in self.environments: environment.start_reset() self.episode_reward = [0.0 for _ in self.environments] self.episode_timestep = [0 for _ in self.environments] if self.join_agent_calls: self.episode_agent_second = 0.0 self.episode_start = time.time() else: self.episode_agent_second = [0.0 for _ in self.environments] self.episode_start = [time.time() for _ in self.environments] environments = list(self.environments) if self.evaluation_environment is not None: self.evaluation_environment.start_reset() self.evaluation_reward = 0.0 self.evaluation_timestep = 0 if not self.join_agent_calls: self.evaluation_agent_second = 0.0 environments.append(self.evaluation_environment) self.finished = False self.prev_terminals = [0 for _ in environments] self.states = [None for _ in environments] self.terminals = [None for _ in environments] self.rewards = [None for _ in environments] # Runner loop while not self.finished: if self.join_agent_calls: # Retrieve observations (only if not already terminated) self.observations = [None for _ in environments] while any(observation is None for observation in self.observations): for n, (environment, terminal) in enumerate(zip(environments, self.prev_terminals)): if self.observations[n] is not None: continue if terminal == 0: self.observations[n] = environment.receive_execute() else: self.observations[n] = (None, terminal, None) self.states, self.terminals, self.rewards = map(list, zip(*self.observations)) self.terminals = [terminal if terminal is None else int(terminal) for terminal in self.terminals] self.handle_observe_joint() self.handle_act_joint() # if not self.join_agent_calls: # !!!!!! # self.episode_seconds.append(time.time() - episode_start[parallel]) # self.episode_agent_seconds.append(self.episode_agent_second[parallel]) else: self.terminals = list(self.prev_terminals) if not self.sync_timesteps: no_environment_ready = True # Parallel environments loop for parallel, environment in enumerate(environments): # Is evaluation environment? evaluation = (parallel == len(self.environments)) if self.sync_episodes and self.prev_terminals[parallel] > 0: # Continue if episode already terminated continue elif self.join_agent_calls: pass elif self.sync_timesteps: # Wait until environment is ready while True: observation = environment.receive_execute() if observation is not None: break else: # Check whether environment is ready, otherwise continue observation = environment.receive_execute() if observation is None: continue no_environment_ready = False if not self.join_agent_calls: self.states[parallel], self.terminals[parallel], self.rewards[parallel] = observation if self.terminals[parallel] is not None: self.terminals[parallel] = int(self.terminals[parallel]) if self.terminals[parallel] is None: # Initial act if evaluation: self.handle_act_evaluation() else: self.handle_act(parallel=parallel) else: # Observe if evaluation: self.handle_observe_evaluation() else: self.handle_observe(parallel=parallel) if self.terminals[parallel] == 0: # Act if evaluation: self.handle_act_evaluation() else: self.handle_act(parallel=parallel) else: # Terminal if evaluation: self.handle_terminal_evaluation() else: self.handle_terminal(parallel=parallel) # # Update global timesteps/episodes/updates # self.global_timesteps = self.agent.timesteps # self.global_episodes = self.agent.episodes # self.global_updates = self.agent.updates if self.sync_episodes and all(terminal > 0 for terminal in self.terminals): # Reset if all episodes terminated self.prev_terminals = [0 for _ in environments] for environment in environments: environment.start_reset() else: self.prev_terminals = list(self.terminals) if not self.sync_timesteps and no_environment_ready: # Sleep if no environment was ready time.sleep(self.num_sleep_secs)
def run( self, # General num_episodes=None, num_timesteps=None, num_updates=None, num_sleep_secs=0.01, sync_timesteps=False, sync_episodes=False, # Callback callback=None, callback_episode_frequency=None, callback_timestep_frequency=None, # Tqdm use_tqdm=True, mean_horizon=1, # Evaluation evaluation_callback=None): # General if num_episodes is None: self.num_episodes = float('inf') else: self.num_episodes = num_episodes if num_timesteps is None: self.num_timesteps = float('inf') else: self.num_timesteps = num_timesteps if num_updates is None: self.num_updates = float('inf') else: self.num_updates = num_updates self.num_sleep_secs = num_sleep_secs self.sync_timesteps = sync_timesteps self.sync_episodes = sync_episodes # Callback assert callback_episode_frequency is None or callback_timestep_frequency is None if callback_episode_frequency is None and callback_timestep_frequency is None: callback_episode_frequency = 1 if callback_episode_frequency is None: self.callback_episode_frequency = float('inf') else: self.callback_episode_frequency = callback_episode_frequency if callback_timestep_frequency is None: self.callback_timestep_frequency = float('inf') else: self.callback_timestep_frequency = callback_timestep_frequency if callback is None: self.callback = (lambda r, p: True) elif util.is_iterable(x=callback): def sequential_callback(runner, parallel): result = True for fn in callback: x = fn(runner, parallel) if isinstance(result, bool): result = result and x return result self.callback = sequential_callback else: def boolean_callback(runner, parallel): result = callback(runner, parallel) if isinstance(result, bool): return result else: return True self.callback = boolean_callback # Timestep/episode/update counter self.timesteps = 0 self.episodes = 0 self.updates = 0 # Tqdm if use_tqdm: if hasattr(self, 'tqdm'): self.tqdm.close() assert self.num_episodes != float( 'inf') or self.num_timesteps != float('inf') inner_callback = self.callback if self.num_episodes != float('inf'): # Episode-based tqdm (default option if both num_episodes and num_timesteps set) assert self.num_episodes != float('inf') bar_format = ( '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}, reward={postfix[0]:.2f}, ts/ep=' '{postfix[1]}, sec/ep={postfix[2]:.2f}, ms/ts={postfix[3]:.1f}, agent=' '{postfix[4]:.1f}%]') postfix = [0.0, 0, 0.0, 0.0, 0.0] self.tqdm = tqdm(desc='Episodes', total=self.num_episodes, bar_format=bar_format, initial=self.episodes, postfix=postfix) self.tqdm_last_update = self.episodes def tqdm_callback(runner, parallel): mean_reward = float( np.mean(runner.episode_rewards[-mean_horizon:])) mean_ts_per_ep = int( np.mean(runner.episode_timesteps[-mean_horizon:])) mean_sec_per_ep = float( np.mean(runner.episode_seconds[-mean_horizon:])) mean_agent_sec = float( np.mean(runner.episode_agent_seconds[-mean_horizon:])) mean_ms_per_ts = mean_sec_per_ep * 1000.0 / mean_ts_per_ep mean_rel_agent = mean_agent_sec * 100.0 / mean_sec_per_ep runner.tqdm.postfix[0] = mean_reward runner.tqdm.postfix[1] = mean_ts_per_ep runner.tqdm.postfix[2] = mean_sec_per_ep runner.tqdm.postfix[3] = mean_ms_per_ts runner.tqdm.postfix[4] = mean_rel_agent runner.tqdm.update(n=(runner.episodes - runner.tqdm_last_update)) runner.tqdm_last_update = runner.episodes return inner_callback(runner, parallel) else: # Timestep-based tqdm self.tqdm = tqdm(desc='Timesteps', total=self.num_timesteps, initial=self.timesteps, postfix=dict(mean_reward='n/a')) self.tqdm_last_update = self.timesteps def tqdm_callback(runner, parallel): # 
sum_timesteps_reward = sum(runner.timestep_rewards[num_mean_reward:]) # num_timesteps = min(num_mean_reward, runner.episode_timestep) # mean_reward = sum_timesteps_reward / num_episodes runner.tqdm.set_postfix(mean_reward='n/a') runner.tqdm.update(n=(runner.timesteps - runner.tqdm_last_update)) runner.tqdm_last_update = runner.timesteps return inner_callback(runner, parallel) self.callback = tqdm_callback # Evaluation if self.evaluation_environment is None: assert evaluation_callback is None assert self.save_best_agent is False else: if evaluation_callback is None: self.evaluation_callback = (lambda r: None) else: self.evaluation_callback = evaluation_callback if self.save_best_agent is not False: inner_evaluation_callback = self.evaluation_callback def mean_reward_callback(runner): result = inner_evaluation_callback(runner) if result is None: return runner.evaluation_reward else: return result self.evaluation_callback = mean_reward_callback self.best_evaluation_score = None # Reset agent self.agent.reset() # Reset environments and episode statistics for environment in self.environments: environment.start_reset() self.episode_reward = [0.0 for _ in self.environments] self.episode_timestep = [0 for _ in self.environments] self.episode_agent_second = [0.0 for _ in self.environments] episode_start = [time.time() for _ in self.environments] environments = list(self.environments) if self.evaluation_environment is not None: self.evaluation_environment.start_reset() self.evaluation_reward = 0.0 self.evaluation_timestep = 0 self.evaluation_agent_second = 0.0 evaluation_start = time.time() environments.append(self.evaluation_environment) if self.sync_episodes: terminated = [False for _ in environments] # Runner loop while True: if not self.sync_timesteps: no_environment_ready = True # Parallel environments loop for parallel, environment in enumerate(environments): # Is evaluation environment? 
evaluation = (parallel == len(self.environments)) if self.sync_episodes and terminated[parallel]: # Continue if episode terminated continue if self.sync_timesteps: # Wait until environment is ready while True: observation = environment.receive_execute() if observation is not None: break time.sleep(num_sleep_secs) else: # Check whether environment is ready observation = environment.receive_execute() if observation is None: continue no_environment_ready = False states, terminal, reward = observation # Episode start or evaluation if terminal is None: # Retrieve actions from agent agent_start = time.time() actions = self.agent.act(states=states, parallel=(parallel - int(evaluation)), evaluation=evaluation) if evaluation: self.evaluation_agent_second += time.time() - agent_start self.evaluation_timestep += 1 else: self.timesteps += 1 self.episode_agent_second[parallel] += time.time() - agent_start self.episode_timestep[parallel] += 1 # Execute actions in environment environment.start_execute(actions=actions) continue elif isinstance(terminal, bool): terminal = int(terminal) # Observe unless episode just started or evaluation # assert (terminal is None) == (self.episode_timestep[parallel] == 0) # if terminal is not None and not evaluation: if evaluation: self.evaluation_reward += reward else: agent_start = time.time() updated = self.agent.observe(terminal=terminal, reward=reward, parallel=parallel) self.updates += int(updated) self.episode_agent_second[parallel] += time.time() - agent_start self.episode_reward[parallel] += reward # # Update global timesteps/episodes/updates # self.global_timesteps = self.agent.timesteps # self.global_episodes = self.agent.episodes # self.global_updates = self.agent.updates # Callback plus experiment termination check if not evaluation and \ self.episode_timestep[parallel] % self.callback_timestep_frequency == 0 and \ not self.callback(self, parallel): return if terminal > 0: if evaluation: # Update experiment statistics self.evaluation_rewards.append(self.evaluation_reward) self.evaluation_timesteps.append(self.evaluation_timestep) self.evaluation_seconds.append(time.time() - evaluation_start) self.evaluation_agent_seconds.append(self.evaluation_agent_second) # Evaluation callback if self.save_best_agent is not False: evaluation_score = self.evaluation_callback(self) assert isinstance(evaluation_score, float) if self.best_evaluation_score is None: self.best_evaluation_score = evaluation_score elif evaluation_score > self.best_evaluation_score: self.best_evaluation_score = evaluation_score if self.save_best_agent is True: self.agent.save(filename='best-model', append_timestep=False) else: self.agent.save(directory=self.save_best_agent, filename='best-model', append_timestep=False) else: self.evaluation_callback(self) else: # Increment episode counter (after calling callback) self.episodes += 1 # Update experiment statistics self.episode_rewards.append(self.episode_reward[parallel]) self.episode_timesteps.append(self.episode_timestep[parallel]) self.episode_seconds.append(time.time() - episode_start[parallel]) self.episode_agent_seconds.append(self.episode_agent_second[parallel]) # Callback if self.episodes % self.callback_episode_frequency == 0 and \ not self.callback(self, parallel): return # Terminate experiment if too long if self.timesteps >= self.num_timesteps: return elif self.episodes >= self.num_episodes: return elif self.updates >= self.num_updates: return elif self.agent.should_stop(): return # Check whether episode terminated if terminal > 0: if self.sync_episodes: terminated[parallel] = True if evaluation: # Reset environment and episode statistics environment.start_reset() self.evaluation_reward = 0.0 self.evaluation_timestep = 0 self.evaluation_agent_second = 0.0 evaluation_start = time.time() else: # Reset environment and episode statistics environment.start_reset() self.episode_reward[parallel] = 0.0 self.episode_timestep[parallel] = 0 self.episode_agent_second[parallel] = 0.0 episode_start[parallel] = time.time() else: # Retrieve actions from agent agent_start = time.time() actions = self.agent.act(states=states, parallel=(parallel - int(evaluation)), evaluation=evaluation) if evaluation: self.evaluation_agent_second += time.time() - agent_start self.evaluation_timestep += 1 else: self.timesteps += 1 self.episode_agent_second[parallel] += time.time() - agent_start self.episode_timestep[parallel] += 1 # Execute actions in environment environment.start_execute(actions=actions) if not self.sync_timesteps and no_environment_ready: # Sleep if no environment was ready time.sleep(num_sleep_secs) if self.sync_episodes and all(terminated): # Reset if all episodes terminated terminated = [False for _ in environments]
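Condensed, the per-environment cycle that this loop drives via the asynchronous execute protocol (protocol methods as used in this file; standalone agent and environment objects assumed already created):

environment.start_reset()
while True:
    observation = environment.receive_execute()    # None while still running
    if observation is None:
        continue
    states, terminal, reward = observation
    if terminal is None:                           # first observation: act only
        environment.start_execute(actions=agent.act(states=states))
    elif int(terminal) == 0:                       # mid-episode: observe + act
        agent.observe(terminal=terminal, reward=reward)
        environment.start_execute(actions=agent.act(states=states))
    else:                                          # terminal: observe and stop
        agent.observe(terminal=terminal, reward=reward)
        break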
def __init__(self, agent, environment=None, max_episode_timesteps=None, evaluation=False, num_parallel=None, environments=None, remote=None, blocking=False, host=None, port=None): if environment is None and environments is None: assert num_parallel is not None and remote == 'socket-client' environments = [None for _ in range(num_parallel)] elif environment is None: assert environments is not None assert num_parallel is None or num_parallel == len(environments) if not util.is_iterable(x=environments): raise TensorforceError.type(name='parallel-runner', argument='environments', value=environments) elif len(environments) == 0: raise TensorforceError.value(name='parallel-runner', argument='environments', value=environments) num_parallel = len(environments) environments = list(environments) elif num_parallel is None: assert environments is None num_parallel = 1 environments = [environment] else: assert environments is None assert not isinstance(environment, Environment) environments = [environment for _ in range(num_parallel)] if port is None or isinstance(port, int): if isinstance(host, str): port = [port + n for n in range(num_parallel)] else: port = [port for _ in range(num_parallel)] else: assert len(port) == num_parallel if host is None or isinstance(host, str): host = [host for _ in range(num_parallel)] else: assert len(host) == num_parallel self.environments = list() self.is_environment_external = isinstance(environments[0], Environment) environment = Environment.create( environment=environments[0], max_episode_timesteps=max_episode_timesteps, remote=remote, blocking=blocking, host=host[0], port=port[0]) self.is_environment_remote = isinstance(environment, RemoteEnvironment) states = environment.states() actions = environment.actions() self.environments.append(environment) for n, environment in enumerate(environments[1:], start=1): assert isinstance(environment, Environment) == self.is_environment_external environment = Environment.create( environment=environment, max_episode_timesteps=max_episode_timesteps, remote=remote, blocking=blocking, host=host[n], port=port[n]) assert isinstance(environment, RemoteEnvironment) == self.is_environment_remote assert environment.states() == states assert environment.actions() == actions self.environments.append(environment) self.evaluation = evaluation self.is_agent_external = isinstance(agent, Agent) if num_parallel - int(self.evaluation) > 1: self.agent = Agent.create( agent=agent, environment=environment, parallel_interactions=(num_parallel - int(self.evaluation))) else: self.agent = Agent.create(agent=agent, environment=environment)
def __init__( self, agent, environment=None, max_episode_timesteps=None, num_parallel=None, environments=None, evaluation=False, remote=None, blocking=False, host=None, port=None ): if environment is None and environments is None: if remote != 'socket-client': raise TensorforceError.required( name='Runner', argument='environment or environments' ) if num_parallel is None: raise TensorforceError.required( name='Runner', argument='num_parallel', condition='socket-client remote mode' ) environments = [None for _ in range(num_parallel)] elif environment is None: if environments is None: raise TensorforceError.required( name='Runner', argument='environment or environments' ) if not util.is_iterable(x=environments): raise TensorforceError.type( name='Runner', argument='environments', dtype=type(environments) ) if len(environments) <= 1: raise TensorforceError.value( name='Runner', argument='len(environments)', value=len(environments) ) if num_parallel is not None and num_parallel != len(environments): raise TensorforceError.value( name='Runner', argument='num_parallel', value=num_parallel, hint='!= len(environments)' ) num_parallel = len(environments) environments = list(environments) elif num_parallel is None: if environments is not None: raise TensorforceError.invalid( name='Runner', argument='environments', condition='environment is specified' ) if evaluation: raise TensorforceError.invalid( name='Runner', argument='evaluation', condition='single environment' ) num_parallel = 1 environments = [environment] else: if not isinstance(num_parallel, int): raise TensorforceError.type( name='Runner', argument='num_parallel', dtype=type(num_parallel) ) elif num_parallel < 2: raise TensorforceError.value( name='Runner', argument='num_parallel', value=num_parallel, hint='< 2' ) if environments is not None: raise TensorforceError.invalid( name='Runner', argument='environments', condition='environment is specified' ) if isinstance(environment, Environment): raise TensorforceError.type( name='Runner', argument='environment', dtype=type(environment), condition='num_parallel', hint='is not specification' ) environments = [environment for _ in range(num_parallel)] if port is None or isinstance(port, int): if isinstance(host, str): port = [port + n for n in range(num_parallel)] else: port = [port for _ in range(num_parallel)] else: if len(port) != num_parallel: raise TensorforceError.value( name='Runner', argument='len(port)', value=len(port), hint='!= num_parallel' ) if host is None or isinstance(host, str): host = [host for _ in range(num_parallel)] else: if len(host) != num_parallel: raise TensorforceError.value( name='Runner', argument='len(host)', value=len(host), hint='!= num_parallel' ) self.environments = list() self.is_environment_external = isinstance(environments[0], Environment) environment = Environment.create( environment=environments[0], max_episode_timesteps=max_episode_timesteps, remote=remote, blocking=blocking, host=host[0], port=port[0] ) self.is_environment_remote = isinstance(environment, RemoteEnvironment) states = environment.states() actions = environment.actions() self.environments.append(environment) for n, environment in enumerate(environments[1:], start=1): assert isinstance(environment, Environment) == self.is_environment_external environment = Environment.create( environment=environment, max_episode_timesteps=max_episode_timesteps, remote=remote, blocking=blocking, host=host[n], port=port[n] ) assert isinstance(environment, RemoteEnvironment) == self.is_environment_remote assert util.is_equal(x=environment.states(), y=states) assert util.is_equal(x=environment.actions(), y=actions) self.environments.append(environment) self.evaluation = evaluation self.is_agent_external = isinstance(agent, Agent) if num_parallel - int(self.evaluation) > 1: self.agent = Agent.create( agent=agent, environment=environment, parallel_interactions=(num_parallel - int(self.evaluation)) ) else: self.agent = Agent.create(agent=agent, environment=environment)
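A usage sketch of the host/port broadcasting above for socket-based remote environments (addresses and agent spec are illustrative):

from tensorforce.execution import Runner

# A single host string plus an int port expands to consecutive ports
# 6025, 6026, 6027, one per parallel socket-client environment.
runner = Runner(
    agent='ppo.json', num_parallel=3,
    remote='socket-client', host='192.168.0.10', port=6025,
)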
def observe(self, reward=0.0, terminal=False, parallel=0): """ Observes reward and whether a terminal state is reached, needs to be preceded by `act()`. Args: reward (float | iter[float]): Reward (<span style="color:#00C000"><b>default</b></span>: 0.0). terminal (bool | 0 | 1 | 2 | iter[...]): Whether a terminal state is reached, or 2 if the episode was aborted (<span style="color:#00C000"><b>default</b></span>: false). parallel (int, iter[int]): Parallel execution index (<span style="color:#00C000"><b>default</b></span>: 0). Returns: int: Number of performed updates. """ # Check whether inputs are batched if util.is_iterable(x=reward): reward = np.asarray(reward) num_parallel = reward.shape[0] if terminal is False: terminal = np.asarray([0 for _ in range(num_parallel)]) else: terminal = np.asarray(terminal) if parallel == 0: assert num_parallel == self.parallel_interactions parallel = np.asarray(list(range(num_parallel))) else: parallel = np.asarray(parallel) elif util.is_iterable(x=terminal): terminal = np.asarray([int(t) for t in terminal]) num_parallel = terminal.shape[0] if reward == 0.0: reward = np.asarray([0.0 for _ in range(num_parallel)]) else: reward = np.asarray(reward) if parallel == 0: assert num_parallel == self.parallel_interactions parallel = np.asarray(list(range(num_parallel))) else: parallel = np.asarray(parallel) elif util.is_iterable(x=parallel): parallel = np.asarray(parallel) num_parallel = parallel.shape[0] if reward == 0.0: reward = np.asarray([0.0 for _ in range(num_parallel)]) else: reward = np.asarray(reward) if terminal is False: terminal = np.asarray([0 for _ in range(num_parallel)]) else: terminal = np.asarray(terminal) else: reward = np.asarray([float(reward)]) terminal = np.asarray([int(terminal)]) parallel = np.asarray([int(parallel)]) num_parallel = 1 # Check whether shapes/lengths are consistent if parallel.shape[0] == 0: raise TensorforceError.value( name='Agent.observe', argument='len(parallel)', value=parallel.shape[0], hint='= 0' ) if reward.shape != parallel.shape: raise TensorforceError.value( name='Agent.observe', argument='len(reward)', value=reward.shape, hint='!= parallel length' ) if terminal.shape != parallel.shape: raise TensorforceError.value( name='Agent.observe', argument='len(terminal)', value=terminal.shape, hint='!= parallel length' ) # Convert terminal to int if necessary if terminal.dtype is util.np_dtype(dtype='bool'): zeros = np.zeros_like(terminal, dtype=util.np_dtype(dtype='int')) ones = np.ones_like(terminal, dtype=util.np_dtype(dtype='int')) terminal = np.where(terminal, ones, zeros) # Check whether current timesteps are not completed if self.timestep_completed[parallel].any(): raise TensorforceError(message="Calling agent.observe must be preceded by agent.act.") self.timestep_completed[parallel] = True # Process per parallel interaction num_updates = 0 for n in range(num_parallel): # Buffer inputs p = parallel[n] self.buffers['terminal'][p].append(terminal[n]) self.buffers['reward'][p].append(reward[n]) # Check whether episode is too long if self.max_episode_timesteps is not None and \ len(self.buffers['terminal'][p]) > self.max_episode_timesteps: raise TensorforceError(message="Episode longer than max_episode_timesteps.") # Continue if not terminal and buffer_observe if terminal[n].item() == 0 and ( self.config.buffer_observe == 'episode' or len(self.buffers['terminal'][p]) < self.config.buffer_observe ): continue # Buffered terminal/reward inputs t = np.asarray(self.buffers['terminal'][p], 
dtype=self.terminal_spec.np_type()) r = np.asarray(self.buffers['reward'][p], dtype=self.reward_spec.np_type()) self.buffers['terminal'][p].clear() self.buffers['reward'][p].clear() # Recorder if self.recorder_spec is not None and \ self.episodes >= self.recorder_spec.get('start', 0): # Store buffered values for name in self.states_spec: self.recorded['states'][name].append( np.stack(self.buffers['states'][name][p], axis=0) ) self.buffers['states'][name][p].clear() for name in self.auxiliaries_spec: self.recorded['auxiliaries'][name].append( np.stack(self.buffers['auxiliaries'][name][p], axis=0) ) self.buffers['auxiliaries'][name][p].clear() for name, spec in self.actions_spec.items(): self.recorded['actions'][name].append( np.stack(self.buffers['actions'][name][p], axis=0) ) self.buffers['actions'][name][p].clear() self.recorded['terminal'].append(t.copy()) self.recorded['reward'].append(r.copy()) # If terminal if t[-1] > 0: self.num_episodes += 1 # Check whether recording step if self.num_episodes == self.recorder_spec.get('frequency', 1): self.num_episodes = 0 # Manage recorder directory directory = self.recorder_spec['directory'] if os.path.isdir(directory): files = sorted( f for f in os.listdir(directory) if os.path.isfile(os.path.join(directory, f)) and os.path.splitext(f)[1] == '.npz' ) else: os.makedirs(directory) files = list() max_traces = self.recorder_spec.get('max-traces') if max_traces is not None and len(files) > max_traces - 1: for filename in files[:-max_traces + 1]: filename = os.path.join(directory, filename) os.remove(filename) # Write recording file filename = os.path.join(directory, 'trace-{:09d}.npz'.format(self.episodes)) # time.strftime('%Y%m%d-%H%M%S') kwargs = self.recorded.fmap(function=np.concatenate, cls=ArrayDict).items() np.savez_compressed(file=filename, **dict(kwargs)) # Clear recorded values for recorded in self.recorded.values(): recorded.clear() # Inputs to tensors terminal_tensor = self.terminal_spec.to_tensor(value=t, batched=True) reward_tensor = self.reward_spec.to_tensor(value=r, batched=True) parallel_tensor = self.parallel_spec.to_tensor(value=p, batched=False) # Model.observe() updated, episodes, updates = self.model.observe( terminal=terminal_tensor, reward=reward_tensor, parallel=parallel_tensor ) num_updates += int(updated.numpy().item()) self.episodes = episodes.numpy().item() self.updates = updates.numpy().item() if self.model.saver is not None: self.model.save() return num_updates
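Finally, a sketch of a batched observe() call matching the input normalization above (assumes an agent with parallel_interactions=3 whose act() was called for all three parallel indices beforehand):

num_updates = agent.observe(
    reward=[1.0, 0.0, -1.0],         # iter[float], one entry per index
    terminal=[False, False, True],   # bools are converted to int 0/1
    parallel=[0, 1, 2],
)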