def tf_parametrize(self, x):
    """
    Creates the TensorFlow operations mapping an embedding to Gaussian parameters.

    Args:
        x: Embedding tensor fed to the mean and log-stddev layers.

    Returns:
        Tuple (mean, stddev, log_stddev), as passed through the summary calls.
    """
    # epsilon < 1.0, so log(epsilon) is negative and serves as the lower clipping bound
    min_log_stddev = tf.constant(
        value=log(util.epsilon), dtype=util.tf_dtype(dtype='float')
    )
    batched_shape = (-1,) + self.action_spec['shape']

    # Mean, reshaped to (-1,) + action shape
    mean = tf.reshape(tensor=self.mean.apply(x=x), shape=batched_shape)

    # Log standard deviation, reshaped and then clipped for numerical stability
    log_stddev = tf.reshape(tensor=self.log_stddev.apply(x=x), shape=batched_shape)
    log_stddev = tf.clip_by_value(
        t=log_stddev, clip_value_min=min_log_stddev, clip_value_max=-min_log_stddev
    )

    # Standard deviation
    stddev = tf.exp(x=log_stddev)

    # Record the parameters under '<module name>-mean' / '<module name>-stddev'
    Module.update_tensor(name=(self.name + '-mean'), tensor=mean)
    Module.update_tensor(name=(self.name + '-stddev'), tensor=stddev)

    # Summaries; the tensors needed afterwards are routed through as pass_tensors
    summary_label = ('distributions', 'gaussian')
    mean, log_stddev = self.add_summary(
        label=summary_label, name='mean', tensor=mean, pass_tensors=(mean, log_stddev)
    )
    stddev, log_stddev = self.add_summary(
        label=summary_label, name='stddev', tensor=stddev,
        pass_tensors=(stddev, log_stddev)
    )

    return mean, stddev, log_stddev
def tf_parametrize(self, x, mask):
    """
    Creates the TensorFlow operations mapping an embedding to categorical parameters.

    Args:
        x: Embedding tensor fed to the logits layer.
        mask: Boolean tensor used as `tf.where` condition on the reshaped logits;
            entries where it is false are replaced by the minimum float value.

    Returns:
        Tuple (logits, probabilities, states_value), where logits are the
        "normalized" logits log(max(probabilities, epsilon)) and states_value is
        the log-sum-exp of the masked raw logits over the last axis.
    """
    float_type = util.tf_dtype(dtype='float')
    epsilon = tf.constant(value=util.epsilon, dtype=float_type)
    shape = (-1,) + self.action_spec['shape'] + (self.action_spec['num_values'],)

    # Logits
    logits = self.logits.apply(x=x)
    logits = tf.reshape(tensor=logits, shape=shape)

    # Masked-out entries get the smallest float so softmax assigns them ~zero probability
    min_float = tf.fill(dims=tf.shape(input=logits), value=float_type.min)
    logits = tf.where(condition=mask, x=logits, y=min_float)

    # States value
    states_value = tf.reduce_logsumexp(input_tensor=logits, axis=-1)

    # Softmax for corresponding probabilities
    probabilities = tf.nn.softmax(logits=logits, axis=-1)

    # "Normalized" logits; tf.math.log replaces the tf.log alias, which no longer
    # exists in TensorFlow 2.x
    logits = tf.math.log(x=tf.maximum(x=probabilities, y=epsilon))

    Module.update_tensor(name=(self.name + '-probabilities'), tensor=probabilities)

    # Logits are routed through as pass_tensor since they are used for sampling
    logits, probabilities, states_value = self.add_summary(
        label=('distributions', 'categorical'), name='probabilities',
        tensor=probabilities, pass_tensors=(logits, probabilities, states_value),
        enumerate_last_rank=True
    )

    return logits, probabilities, states_value
def tf_baseline_loss(self, states, internals, reward, reference=None):
    """
    Creates the TensorFlow operations for calculating the baseline loss of a batch.

    Args:
        states: Mapping of state tensors (unpacked as keyword arguments below).
        internals: Mapping of prior internal state tensors (also unpacked as
            keyword arguments below).
        reward: Reward tensor.
        reference: Optional reference tensor(s), in case of a comparative loss.
            Not used by this implementation.

    Returns:
        Loss tensor, including the baseline regularization loss if there is one.
    """
    Module.update_tensors(**states, **internals, reward=reward)

    mode = self.baseline_mode
    if mode == 'states':
        # Baseline operates directly on the states
        loss = self.baseline.total_loss(states=states, internals=internals, reward=reward)

    elif mode == 'network':
        # Baseline operates on the network embedding; gradients are stopped so
        # the baseline loss does not propagate into the network
        embedding = tf.stop_gradient(
            input=self.network.apply(x=states, internals=internals)
        )
        Module.update_tensors(embedding=embedding)
        loss = self.baseline.total_loss(
            states=OrderedDict(embedding=embedding), internals=internals, reward=reward
        )

    regularization = self.baseline.regularize()
    if regularization is None:
        return loss
    return loss + regularization
def tf_core_update(self):
    """
    Creates the TensorFlow operations for one update: batch retrieval,
    optimization, and increment of the global update counter.

    Returns:
        Boolean true tensor which depends on the update-counter increment.
    """
    Module.update_tensor(name='update', tensor=self.global_update)

    true = tf.constant(value=True, dtype=util.tf_dtype(dtype='bool'))
    one = tf.constant(value=1, dtype=util.tf_dtype(dtype='long'))

    # Retrieve batch
    batch_size = self.update_batch_size.value()

    if self.update_unit == 'timesteps':
        # Timestep-based batch: past horizon is the larger of the policy and
        # baseline-policy optimization horizons
        past_horizon = tf.math.maximum(
            x=self.policy.past_horizon(is_optimization=True),
            y=self.baseline_policy.past_horizon(is_optimization=True)
        )
        future_horizon = self.estimator.future_horizon()
        indices = self.memory.retrieve_timesteps(
            n=batch_size, past_horizon=past_horizon, future_horizon=future_horizon
        )

    elif self.update_unit == 'episodes':
        # Episode-based batch
        indices = self.memory.retrieve_episodes(n=batch_size)

    # Optimization
    optimized = self.optimize(indices=indices)

    # Increment update counter only after the optimization ran
    with tf.control_dependencies(control_inputs=(optimized,)):
        incremented = self.global_update.assign_add(delta=one, read_value=False)

    with tf.control_dependencies(control_inputs=(incremented,)):
        return util.identity_operation(x=true)
def __init__(self, name, action_spec, embedding_size, summary_labels=None):
    """
    Sets up the linear mean / log-stddev layers and registers the
    '<name>-mean' and '<name>-stddev' tensors.

    Args:
        name: Module name.
        action_spec: Action specification; its 'shape' entry is read here.
        embedding_size: Size of the flat float embedding the layers take as input.
        summary_labels: Forwarded to the parent constructor.
    """
    super().__init__(
        name=name, action_spec=action_spec, embedding_size=embedding_size,
        summary_labels=summary_labels
    )

    action_shape = self.action_spec['shape']
    action_size = util.product(xs=action_shape, empty=0)
    input_spec = dict(type='float', shape=(self.embedding_size,))

    # One linear layer per distribution parameter
    self.mean = self.add_module(
        name='mean', module='linear', modules=layer_modules, size=action_size,
        input_spec=input_spec
    )
    self.log_stddev = self.add_module(
        name='log-stddev', module='linear', modules=layer_modules, size=action_size,
        input_spec=input_spec
    )

    # Batched float tensors of action shape for both parameters
    for suffix in ('-mean', '-stddev'):
        Module.register_tensor(
            name=(self.name + suffix), spec=dict(type='float', shape=action_shape),
            batched=True
        )
def tf_parametrize(self, x, mask):
    """
    Creates the TensorFlow operations mapping an embedding to categorical
    parameters based on action values.

    Args:
        x: Embedding tensor fed to the deviations (and optional value) layer.
        mask: Boolean tensor used as `tf.where` condition on the action values;
            entries where it is false are replaced by the minimum float value.

    Returns:
        Tuple (logits, probabilities, states_value, action_values).
    """
    float_type = util.tf_dtype(dtype='float')
    epsilon = tf.constant(value=util.epsilon, dtype=float_type)
    action_shape = self.action_spec['shape']
    shape = (-1,) + action_shape + (self.action_spec['num_values'],)
    value_shape = (-1,) + action_shape + (1,)

    # Deviations
    action_values = tf.reshape(tensor=self.deviations.apply(x=x), shape=shape)
    min_float = tf.fill(dims=tf.shape(input=action_values), value=float_type.min)

    # States value
    if self.value is not None:
        # Separate value layer: action values are the state value plus the
        # mean-centered deviations, masked afterwards
        states_value = self.value.apply(x=x)
        if len(self.embedding_shape) == 1:
            states_value = tf.reshape(tensor=states_value, shape=value_shape)
        action_values = states_value + action_values - tf.math.reduce_mean(
            input_tensor=action_values, axis=-1, keepdims=True
        )
        states_value = tf.squeeze(input=states_value, axis=-1)
        action_values = tf.where(condition=mask, x=action_values, y=min_float)

    else:
        # No value layer: state value is the log-sum-exp of the masked action values
        action_values = tf.where(condition=mask, x=action_values, y=min_float)
        states_value = tf.reduce_logsumexp(input_tensor=action_values, axis=-1)

    # Softmax for corresponding probabilities
    probabilities = tf.nn.softmax(logits=action_values, axis=-1)

    # "Normalized" logits
    logits = tf.math.log(x=tf.maximum(x=probabilities, y=epsilon))

    Module.update_tensor(name=(self.name + '-probabilities'), tensor=probabilities)

    return logits, probabilities, states_value, action_values
def tf_apply(self, x):
    """
    Retrieves the configured tensors and aggregates them into one output tensor.

    Every entry of `self.tensors` is either a tensor name, which is fetched via
    `Module.retrieve_tensor`, or the wildcard '*', which stands for the layer
    input `x`.

    Args:
        x: Layer input tensor, used wherever the wildcard '*' is listed.

    Returns:
        The single retrieved tensor if only one is configured; otherwise the
        tensors aggregated according to `self.aggregation` ('concat', 'product',
        'stack' or 'sum') along axis `self.axis + 1`. For any other aggregation
        value the input `x` is returned unchanged.
    """
    def resolve(tensor_name):
        # Wildcard refers to the layer input, everything else is looked up by name
        if tensor_name == '*':
            return x
        return Module.retrieve_tensor(name=tensor_name)

    # Single tensor: nothing to aggregate. The wildcard check has to look at the
    # first entry; comparing the whole sequence against '*' never matches a
    # one-element tuple/list such as ('*',).
    if len(self.tensors) == 1:
        return resolve(self.tensors[0])

    tensors = [resolve(tensor_name) for tensor_name in self.tensors]

    # Append trailing singleton axes until each tensor's rank reaches the
    # length of the output shape
    shape = self.output_spec['shape']
    for n, tensor in enumerate(tensors):
        for axis in range(util.rank(x=tensor), len(shape)):
            tensor = tf.expand_dims(input=tensor, axis=axis)
        tensors[n] = tensor

    # self.axis is offset by one when applied to the tensors
    aggregation_axis = self.axis + 1

    if self.aggregation == 'concat':
        return tf.concat(values=tensors, axis=aggregation_axis)

    if self.aggregation == 'product':
        stacked = tf.stack(values=tensors, axis=aggregation_axis)
        return tf.reduce_prod(input_tensor=stacked, axis=aggregation_axis)

    if self.aggregation == 'stack':
        return tf.stack(values=tensors, axis=aggregation_axis)

    if self.aggregation == 'sum':
        stacked = tf.stack(values=tensors, axis=aggregation_axis)
        return tf.reduce_sum(input_tensor=stacked, axis=aggregation_axis)

    # Unrecognized aggregation: input is passed through unchanged
    return x
def api_update(self):
    """
    Creates the TensorFlow operations for the update API function.

    Returns:
        Tuple (timestep, episode, update) of identity operations on the global
        counters, which depend on the core update.
    """
    bool_type = util.tf_dtype(dtype='bool')

    # Set global tensors
    Module.update_tensors(
        deterministic=tf.constant(value=True, dtype=bool_type),
        independent=tf.constant(value=False, dtype=bool_type),
        optimization=tf.constant(value=True, dtype=bool_type),
        timestep=self.global_timestep, episode=self.global_episode,
        update=self.global_update
    )

    # Core update: retrieve update operation
    updated = self.core_update()

    # Function-level identity operations for retrieval; creating them under the
    # control dependency enforces that the update ran first
    with tf.control_dependencies(control_inputs=(updated,)):
        timestep, episode, update = tuple(
            util.identity_operation(x=counter, operation_name=operation_name)
            for counter, operation_name in (
                (self.global_timestep, 'timestep-output'),
                (self.global_episode, 'episode-output'),
                (self.global_update, 'update-output'),
            )
        )

    return timestep, episode, update
def __init__(self, name, action_spec, embedding_shape, summary_labels=None):
    """
    Sets up the linear alpha / beta layers and registers the '<name>-alpha'
    and '<name>-beta' tensors.

    Args:
        name: Module name.
        action_spec: Action specification; its 'shape' entry is read here.
        embedding_shape: Shape of the float embedding the layers take as input.
        summary_labels: Forwarded to the parent constructor.

    Raises:
        TensorforceError: If the embedding rank is not within 1..3, or a
            non-flat embedding shape is incompatible with the action shape.
    """
    super().__init__(
        name=name, action_spec=action_spec, embedding_shape=embedding_shape,
        summary_labels=summary_labels
    )

    input_spec = dict(type='float', shape=self.embedding_shape)
    action_shape = self.action_spec['shape']
    embedding_rank = len(self.embedding_shape)

    # Determine the output size of the linear layers
    if embedding_rank == 1:
        # Flat embedding: one output per action element
        size = util.product(xs=action_shape, empty=0)
    elif embedding_rank < 1 or embedding_rank > 3:
        raise TensorforceError.value(
            name=name, argument='embedding_shape', value=self.embedding_shape,
            hint='invalid rank'
        )
    elif self.embedding_shape[:-1] == action_shape[:-1]:
        # Leading embedding axes match all but the last action axis
        size = action_shape[-1]
    elif self.embedding_shape[:-1] == action_shape:
        # Leading embedding axes match the full action shape
        size = 0
    else:
        raise TensorforceError.value(
            name=name, argument='embedding_shape', value=self.embedding_shape,
            hint='not flattened and incompatible with action shape'
        )

    self.alpha = self.add_module(
        name='alpha', module='linear', modules=layer_modules, size=size,
        input_spec=input_spec
    )
    self.beta = self.add_module(
        name='beta', module='linear', modules=layer_modules, size=size,
        input_spec=input_spec
    )

    # Batched float tensors of action shape for both parameters
    for suffix in ('-alpha', '-beta'):
        Module.register_tensor(
            name=(self.name + suffix), spec=dict(type='float', shape=action_shape),
            batched=True
        )
def tf_states_value(self, states, internals, auxiliaries, reduced=True, include_per_action=False):
    """
    Creates the TensorFlow operations for the state value estimate.

    Without a separate value layer (`self.value is None`) this delegates to
    `ActionValue.tf_states_value`; otherwise the value layer is applied to the
    network embedding.

    Args:
        states: State tensors passed to the network.
        internals: Internal state tensors passed to the network.
        auxiliaries: Auxiliary tensors, only forwarded in the delegating case.
        reduced: Must be true when a separate value layer is used.
        include_per_action: Must be false when a separate value layer is used.

    Returns:
        State value tensor (or whatever `ActionValue.tf_states_value` returns
        in the delegating case).

    Raises:
        TensorforceError: If a separate value layer is used together with
            `reduced=False` or `include_per_action=True`.
    """
    if self.value is None:
        return ActionValue.tf_states_value(
            self=self, states=states, internals=internals, auxiliaries=auxiliaries,
            reduced=reduced, include_per_action=include_per_action
        )

    # The value layer yields a single estimate, so no per-action output is possible
    if not reduced or include_per_action:
        raise TensorforceError.invalid(name='policy.states_value', argument='reduced')

    embedding = self.network.apply(x=states, internals=internals)
    Module.update_tensor(name=self.name, tensor=embedding)

    return self.value.apply(x=embedding)
def tf_parametrize(self, x):
    """
    Creates the TensorFlow operations mapping an embedding to Beta parameters.

    Args:
        x: Embedding tensor fed to the alpha and beta layers.

    Returns:
        Tuple (alpha, beta, alpha_beta, log_norm), where alpha_beta is
        max(alpha + beta, epsilon) and log_norm is
        lgamma(alpha) + lgamma(beta) - lgamma(alpha_beta).
    """
    float_type = util.tf_dtype(dtype='float')
    one = tf.constant(value=1.0, dtype=float_type)
    epsilon = tf.constant(value=util.epsilon, dtype=float_type)
    # epsilon < 1.0, hence its logarithm is negative
    log_epsilon = tf.constant(value=log(util.epsilon), dtype=float_type)
    shape = (-1,) + self.action_spec['shape']

    def concentration(layer):
        # Clip the layer output to [log_epsilon, -log_epsilon], then
        # softplus + 1 keeps the parameter above one
        raw = layer.apply(x=x)
        raw = tf.clip_by_value(
            t=raw, clip_value_min=log_epsilon, clip_value_max=-log_epsilon
        )
        parameter = tf.math.softplus(features=raw) + one
        # Only flat embeddings need reshaping to the batched action shape
        if len(self.embedding_shape) == 1:
            parameter = tf.reshape(tensor=parameter, shape=shape)
        return parameter

    # Alpha
    alpha = concentration(self.alpha)

    # Beta
    beta = concentration(self.beta)

    # Alpha + Beta
    alpha_beta = tf.maximum(x=(alpha + beta), y=epsilon)

    # Log norm
    log_norm = tf.math.lgamma(x=alpha) + tf.math.lgamma(x=beta) - \
        tf.math.lgamma(x=alpha_beta)

    Module.update_tensor(name=(self.name + '-alpha'), tensor=alpha)
    Module.update_tensor(name=(self.name + '-beta'), tensor=beta)

    return alpha, beta, alpha_beta, log_norm
def tf_parametrize(self, x):
    """
    Creates the TensorFlow operations mapping an embedding to Gaussian parameters.

    Args:
        x: Embedding tensor fed to the mean and log-stddev layers.

    Returns:
        Tuple (mean, stddev, log_stddev), with log_stddev clipped to
        [log(epsilon), -log(epsilon)] and stddev = exp(log_stddev).
    """
    # epsilon < 1.0, so log(epsilon) is negative and serves as the lower clipping bound
    min_log_stddev = tf.constant(
        value=log(util.epsilon), dtype=util.tf_dtype(dtype='float')
    )
    batched_shape = (-1,) + self.action_spec['shape']
    # Only flat embeddings need reshaping to the batched action shape
    is_flat_embedding = (len(self.embedding_shape) == 1)

    # Mean
    mean = self.mean.apply(x=x)
    if is_flat_embedding:
        mean = tf.reshape(tensor=mean, shape=batched_shape)

    # Log standard deviation
    log_stddev = self.log_stddev.apply(x=x)
    if is_flat_embedding:
        log_stddev = tf.reshape(tensor=log_stddev, shape=batched_shape)

    # Clip log_stddev for numerical stability
    log_stddev = tf.clip_by_value(
        t=log_stddev, clip_value_min=min_log_stddev, clip_value_max=-min_log_stddev
    )

    # Standard deviation
    stddev = tf.exp(x=log_stddev)

    Module.update_tensor(name=(self.name + '-mean'), tensor=mean)
    Module.update_tensor(name=(self.name + '-stddev'), tensor=stddev)

    return mean, stddev, log_stddev
def tf_sample_actions(self, states, internals, auxiliaries, deterministic, return_internals):
    """
    Creates the TensorFlow operations for sampling one action per action
    component, with a mean-entropy summary attached to each.

    Args:
        states: State tensors passed to the network.
        internals: Internal state tensors passed to the network.
        auxiliaries: Mapping which holds '<action name>_mask' for 'int' actions.
        deterministic: Forwarded to each distribution's `sample`.
        return_internals: Whether the network's next internals are returned too.

    Returns:
        OrderedDict of action tensors, or the tuple (actions, internals) if
        `return_internals` is true.
    """
    network_output = self.network.apply(
        x=states, internals=internals, return_internals=return_internals
    )
    if return_internals:
        embedding, internals = network_output
    else:
        embedding = network_output

    Module.update_tensor(name=self.name, tensor=embedding)

    actions = OrderedDict()
    for name, spec, distribution in util.zip_items(self.actions_spec, self.distributions):
        # Integer actions are parametrized with their action mask
        if spec['type'] == 'int':
            parameters = distribution.parametrize(
                x=embedding, mask=auxiliaries[name + '_mask']
            )
        else:
            parameters = distribution.parametrize(x=embedding)

        action = distribution.sample(parameters=parameters, deterministic=deterministic)

        # Entropy, flattened per instance and averaged over the action elements
        entropy = distribution.entropy(parameters=parameters)
        entropy = tf.reshape(tensor=entropy, shape=(-1, util.product(xs=spec['shape'])))
        mean_entropy = tf.reduce_mean(input_tensor=entropy, axis=1)

        # The action is routed through the summary call as pass_tensor
        actions[name] = self.add_summary(
            label='entropy', name=(name + '-entropy'), tensor=mean_entropy,
            pass_tensors=action
        )

    if return_internals:
        return actions, internals
    return actions
def tf_core_observe(self, states, internals, auxiliaries, actions, terminal, reward):
    """
    Creates the TensorFlow operations for observing a batch of experience and
    triggering a periodic update if one is due.

    Args:
        states: State tensors forwarded to `core_experience`.
        internals: Internal state tensors forwarded to `core_experience`.
        auxiliaries: Auxiliary tensors forwarded to `core_experience`.
        actions: Action tensors forwarded to `core_experience`.
        terminal: Terminal tensor; for episode-based updates its last entry
            decides whether an episode just ended.
        reward: Reward tensor forwarded to `core_experience`.

    Returns:
        The experience operation if `update_frequency` is 'never', otherwise the
        result of the conditional update.
    """
    zero = tf.constant(value=0, dtype=util.tf_dtype(dtype='long'))

    # Experience
    experienced = self.core_experience(
        states=states, internals=internals, auxiliaries=auxiliaries, actions=actions,
        terminal=terminal, reward=reward
    )

    # If no periodic update
    if self.update_frequency == 'never':
        return experienced

    # Periodic update
    with tf.control_dependencies(control_inputs=(experienced,)):
        batch_size = self.update_batch_size.value()
        frequency = self.update_frequency.value()
        start = self.update_start.value()

        if self.update_unit == 'timesteps':
            # Timestep-based batch
            one = tf.constant(value=1, dtype=util.tf_dtype(dtype='long'))
            past_horizon = self.policy.dependency_horizon(is_optimization=True)
            if self.baseline_policy is not None:
                past_horizon = tf.math.maximum(
                    x=past_horizon,
                    y=self.baseline_policy.dependency_horizon(is_optimization=True)
                )
            future_horizon = self.estimator.horizon.value() + one
            # Start no earlier than a full batch plus both horizons
            start = tf.math.maximum(
                x=start, y=(batch_size + past_horizon + future_horizon)
            )
            timestep = Module.retrieve_tensor(name='timestep')
            timestep = timestep - self.estimator.capacity
            # tf.math.mod replaces the tf.mod alias, which no longer exists in
            # TensorFlow 2.x
            is_frequency = tf.math.equal(
                x=tf.math.mod(x=timestep, y=frequency), y=zero
            )
            at_least_start = tf.math.greater_equal(x=timestep, y=start)

        elif self.update_unit == 'episodes':
            # Episode-based batch
            start = tf.math.maximum(x=start, y=batch_size)
            episode = Module.retrieve_tensor(name='episode')
            is_frequency = tf.math.equal(
                x=tf.math.mod(x=episode, y=frequency), y=zero
            )
            # Only update once per episode increment; the prepended zero keeps
            # terminal[-1] defined for an empty terminal tensor
            terminal = tf.concat(values=((zero,), terminal), axis=0)
            is_frequency = tf.math.logical_and(x=is_frequency, y=(terminal[-1] > zero))
            at_least_start = tf.math.greater_equal(x=episode, y=start)

        is_updated = self.cond(
            pred=tf.math.logical_and(x=is_frequency, y=at_least_start),
            true_fn=self.core_update, false_fn=util.no_operation
        )

    return is_updated
def tf_initialize(self):
    """
    Initializes the module and creates the parameter placeholder, whose default
    is the parameter value for the configured unit's step tensor.

    Raises:
        TensorforceError: If the module has a device set, since the variable
            scope is temporarily left below.
    """
    super().tf_initialize()

    # Step tensor according to the parameter unit (none for unit-less parameters)
    if self.unit is None:
        step = None
    elif self.unit == 'timesteps':
        step = Module.retrieve_tensor(name='timestep')
    elif self.unit == 'episodes':
        step = Module.retrieve_tensor(name='episode')
    elif self.unit == 'updates':
        step = Module.retrieve_tensor(name='update')

    default = self.get_parameter_value(step=step)

    # Temporarily leave module variable scope, otherwise placeholder name is
    # unnecessarily long
    if self.device is not None:
        raise TensorforceError.unexpected()

    self.scope.__exit__(None, None, None)
    try:
        self.parameter_input = self.add_placeholder(
            name=self.name, dtype=self.dtype, shape=self.shape, batched=False,
            default=default
        )
    finally:
        # Re-enter even if placeholder creation failed, so the manually exited
        # scope is entered again when the error propagates to the caller
        self.scope.__enter__()
def tf_parametrize(self, x):
    """
    Creates the TensorFlow operations mapping an embedding to Bernoulli parameters.

    Args:
        x: Embedding tensor fed to the logit layer.

    Returns:
        Tuple (true_logit, false_logit, probability, states_value), where the
        logits are log(probability) and log(1 - probability) of the clipped
        probability, and states_value is the raw logit.
    """
    float_type = util.tf_dtype(dtype='float')
    one = tf.constant(value=1.0, dtype=float_type)
    epsilon = tf.constant(value=util.epsilon, dtype=float_type)
    batched_shape = (-1,) + self.action_spec['shape']

    # Logit; only flat embeddings need reshaping to the batched action shape
    logit = self.logit.apply(x=x)
    if len(self.embedding_shape) == 1:
        logit = tf.reshape(tensor=logit, shape=batched_shape)

    # States value
    states_value = logit

    # Sigmoid for corresponding probability, clipped for numerical stability
    probability = tf.clip_by_value(
        t=tf.sigmoid(x=logit), clip_value_min=epsilon, clip_value_max=(one - epsilon)
    )

    # "Normalized" logits
    true_logit = tf.math.log(x=probability)
    false_logit = tf.math.log(x=(one - probability))

    Module.update_tensor(name=(self.name + '-probability'), tensor=probability)

    return true_logit, false_logit, probability, states_value
def tf_parametrize(self, x):
    """
    Creates the TensorFlow operations mapping an embedding to Bernoulli parameters.

    Args:
        x: Embedding tensor fed to the logit layer.

    Returns:
        Tuple (true_logit, false_logit, probability, states_value) as passed
        through the summary call, where the logits are log(probability) and
        log(1 - probability) of the clipped probability, and states_value is
        the raw logit.
    """
    float_type = util.tf_dtype(dtype='float')
    one = tf.constant(value=1.0, dtype=float_type)
    epsilon = tf.constant(value=util.epsilon, dtype=float_type)
    shape = (-1,) + self.action_spec['shape']

    # Logit
    logit = self.logit.apply(x=x)
    logit = tf.reshape(tensor=logit, shape=shape)

    # States value
    states_value = logit

    # Sigmoid for corresponding probability
    probability = tf.sigmoid(x=logit)

    # Clip probability for numerical stability
    probability = tf.clip_by_value(
        t=probability, clip_value_min=epsilon, clip_value_max=(one - epsilon)
    )

    # "Normalized" logits; tf.math.log replaces the tf.log alias, which no
    # longer exists in TensorFlow 2.x
    true_logit = tf.math.log(x=probability)
    false_logit = tf.math.log(x=(one - probability))

    Module.update_tensor(name=(self.name + '-probability'), tensor=probability)

    # The tensors needed afterwards are routed through as pass_tensors
    true_logit, false_logit, probability, states_value = self.add_summary(
        label=('distributions', 'bernoulli'), name='probability', tensor=probability,
        pass_tensors=(true_logit, false_logit, probability, states_value)
    )

    return true_logit, false_logit, probability, states_value
def tf_sample_actions(self, states, internals, auxiliaries, temperature, return_internals):
    """
    Creates the TensorFlow operations for sampling one action per action component.

    Args:
        states: State tensors passed to the network.
        internals: Internal state tensors passed to the network.
        auxiliaries: Mapping which holds '<action name>_mask' for 'int' actions.
        temperature: Per-action temperatures, zipped with the action specs and
            distributions and forwarded to each distribution's `sample`.
        return_internals: Whether the network's next internals are returned too.

    Returns:
        OrderedDict of action tensors, or the tuple (actions, internals) if
        `return_internals` is true.
    """
    network_output = self.network.apply(
        x=states, internals=internals, return_internals=return_internals
    )
    if return_internals:
        embedding, internals = network_output
    else:
        embedding = network_output

    Module.update_tensor(name=self.name, tensor=embedding)

    actions = OrderedDict()
    zipped = util.zip_items(self.actions_spec, self.distributions, temperature)
    for name, spec, distribution, temp in zipped:
        # Integer actions are parametrized with their action mask
        if spec['type'] == 'int':
            parameters = distribution.parametrize(
                x=embedding, mask=auxiliaries[name + '_mask']
            )
        else:
            parameters = distribution.parametrize(x=embedding)
        actions[name] = distribution.sample(parameters=parameters, temperature=temp)

    if return_internals:
        return actions, internals
    return actions
def __init__(self, name, action_spec, embedding_size, infer_states_value=True, summary_labels=None):
    """
    Sets up the linear deviations layer, the optional value layer, and
    registers the '<name>-probabilities' tensor.

    Args:
        name: Module name.
        action_spec: Action specification; its 'shape' and 'num_values'
            entries are read here.
        embedding_size: Size of the flat float embedding the layers take as input.
        infer_states_value: If true, no separate value layer is created
            (`self.value` is None).
        summary_labels: Forwarded to the parent constructor.
    """
    super().__init__(
        name=name, action_spec=action_spec, embedding_size=embedding_size,
        summary_labels=summary_labels
    )

    shape = self.action_spec['shape']
    num_values = self.action_spec['num_values']
    action_size = util.product(xs=shape)
    input_spec = dict(type='float', shape=(self.embedding_size,))

    # One output per action element and value
    self.deviations = self.add_module(
        name='deviations', module='linear', modules=layer_modules,
        size=(action_size * num_values), input_spec=input_spec
    )

    # Separate value layer only if the state value is not inferred
    self.value = None if infer_states_value else self.add_module(
        name='value', module='linear', modules=layer_modules, size=action_size,
        input_spec=input_spec
    )

    Module.register_tensor(
        name=(self.name + '-probabilities'),
        spec=dict(type='float', shape=(shape + (num_values,))), batched=True
    )
def __init__(self, name, dtype, unit=None, shape=(), min_value=None, max_value=None, summary_labels=None):
    """
    Validates the parameter specification against the required bounds and
    registers the parameter tensor.

    Args:
        name: Module name.
        dtype: Parameter type, normalized via `util.valid_value_spec`.
        unit: One of None, 'timesteps', 'episodes', 'updates'.
        shape: Parameter shape, normalized via `util.valid_value_spec`.
        min_value: Optional lower bound which `self.min_value()` must satisfy.
        max_value: Optional upper bound which `self.max_value()` must satisfy.
        summary_labels: Forwarded to the parent constructor.

    Raises:
        TensorforceError: If the bounds do not fit the parameter type, or the
            parameter's own min/max value is missing or violates a given bound.
    """
    super().__init__(name=name, summary_labels=summary_labels)

    assert unit in (None, 'timesteps', 'episodes', 'updates')
    self.unit = unit

    spec = util.valid_value_spec(
        value_spec=dict(type=dtype, shape=shape), return_normalized=True
    )
    self.dtype = spec['type']
    self.shape = spec['shape']

    assert min_value is None or max_value is None or min_value < max_value

    # Given bounds have to fit the parameter type: none for bool, int for
    # int/long, float for float
    given_bounds = (min_value, max_value)
    bounds_fit_type = True
    if self.dtype == 'bool':
        bounds_fit_type = min_value is None and max_value is None
    elif self.dtype in ('int', 'long'):
        bounds_fit_type = all(
            bound is None or isinstance(bound, int) for bound in given_bounds
        )
    elif self.dtype == 'float':
        bounds_fit_type = all(
            bound is None or isinstance(bound, float) for bound in given_bounds
        )
    else:
        assert False
    if not bounds_fit_type:
        raise TensorforceError.unexpected()

    assert self.min_value() is None or self.max_value() is None or \
        self.min_value() <= self.max_value()

    # Lower bound: the parameter needs a known minimum which is not below it
    if min_value is not None:
        if self.min_value() is None:
            raise TensorforceError.value(
                name=self.name, argument='lower bound', value=self.min_value(),
                hint=('not >= ' + str(min_value))
            )
        if self.min_value() < min_value:
            raise TensorforceError.value(
                name=self.name, argument='lower bound', value=self.min_value(),
                hint=('< ' + str(min_value))
            )

    # Upper bound: the parameter needs a known maximum which is not above it
    if max_value is not None:
        if self.max_value() is None:
            raise TensorforceError.value(
                name=self.name, argument='upper bound', value=self.max_value(),
                hint=('not <= ' + str(max_value))
            )
        if self.max_value() > max_value:
            raise TensorforceError.value(
                name=self.name, argument='upper bound', value=self.max_value(),
                hint=('> ' + str(max_value))
            )

    Module.register_tensor(name=self.name, spec=spec, batched=False)
def tf_loss_per_instance(
    self, states, internals, actions, terminal, reward, next_states, next_internals,
    reference=None
):
    """
    Creates the TensorFlow operations for the per-instance loss.

    Args:
        states: State tensors passed to the network.
        internals: Internal state tensors passed to the network.
        actions: Mapping from action name to action tensor.
        terminal: Terminal tensor, forwarded to `tf_q_delta`.
        reward: Reward tensor, forwarded to `tf_q_delta`.
        next_states: Next-state tensors passed to the target network.
        next_internals: Mapping of next internal state tensors; passed to the
            target network under 'target-' prefixed names.
        reference: Not used by this implementation.

    Returns:
        Per-instance loss tensor: the squared mean delta, or its Huber variant
        if the huber_loss parameter is non-zero.
    """
    # Open question from the original author: really state value instead of q value?
    # This function is doubled because NAF needs V'(s) not Q'(s), see below.
    embedding = self.network.apply(x=states, internals=internals)

    # Both networks can use the same internals, could that be a problem?
    # Otherwise need to handle internals indices correctly everywhere
    target_internals = OrderedDict(
        ('target-' + name, internal) for name, internal in next_internals.items()
    )
    Module.update_tensors(**target_internals)

    target_embedding = self.target_network.apply(x=next_states, internals=target_internals)

    deltas = list()
    for name in sorted(self.distributions):
        parameters = self.distributions[name].parametrize(x=embedding)
        target_distribution = self.target_distributions[name]
        target_parameters = target_distribution.parametrize(x=target_embedding)

        q_value = self.tf_q_value(
            embedding=embedding, parameters=parameters, action=actions[name], name=name
        )

        # Notice, this is V', not Q' because NAF outputs V(s) separately
        next_state_value = target_distribution.states_value(parameters=target_parameters)

        delta = self.tf_q_delta(
            q_value=q_value, next_q_value=next_state_value, terminal=terminal,
            reward=reward
        )

        # Flatten everything but the first axis so deltas can be concatenated
        flat_size = util.product(xs=util.shape(delta)[1:])
        deltas.append(tf.reshape(tensor=delta, shape=(-1, flat_size)))

    # Mean delta per instance over all actions; squared (or Huber-transformed) below
    loss_per_instance = tf.reduce_mean(
        input_tensor=tf.concat(values=deltas, axis=1), axis=1
    )

    # Optional Huber loss
    huber_loss = self.huber_loss.value()

    def no_huber_loss():
        return tf.square(x=loss_per_instance)

    def apply_huber_loss():
        # Quadratic within the huber_loss threshold, linear beyond it
        return tf.where(
            condition=(tf.abs(x=loss_per_instance) <= huber_loss),
            x=(0.5 * tf.square(x=loss_per_instance)),
            y=(huber_loss * (tf.abs(x=loss_per_instance) - 0.5 * huber_loss))
        )

    # A huber_loss value of zero disables the Huber variant
    zero = tf.constant(value=0.0, dtype=util.tf_dtype(dtype='float'))
    skip_huber_loss = tf.math.equal(x=huber_loss, y=zero)

    return self.cond(
        pred=skip_huber_loss, true_fn=no_huber_loss, false_fn=apply_huber_loss
    )
def tf_apply(self, x):
    """
    Normalizes the input with moving estimates of mean and variance, which are
    updated from the batch unless the update is skipped.

    The update is skipped if the global 'independent' tensor is true, or if the
    global 'optimization' tensor differs from the stored
    `update_on_optimization` flag (which takes the value of 'optimization' on
    the first call).

    Args:
        x: Input tensor; statistics are taken over the axes `1 + axis` for each
            axis in `self.axes`.

    Returns:
        (x - mean) * rsqrt(max(variance, epsilon)), with gradients stopped for
        mean and reciprocal standard deviation.
    """

    def no_update():
        # Use the stored moving statistics as they are
        return self.moving_mean, self.moving_variance

    def apply_update():
        one = tf.constant(value=1.0, dtype=util.tf_dtype(dtype='float'))
        axes = tuple(1 + axis for axis in self.axes)

        # Decay is raised to the power of the batch size (size of first axis)
        decay = self.decay.value()
        batch_size = tf.dtypes.cast(
            x=tf.shape(input=x)[0], dtype=util.tf_dtype(dtype='float')
        )
        decay = tf.math.pow(x=decay, y=batch_size)

        # Batch mean, blended with the moving mean except on the first call
        mean = tf.math.reduce_mean(input_tensor=x, axis=axes, keepdims=True)
        mean = tf.where(
            condition=self.after_first_call,
            x=(decay * self.moving_mean + (one - decay) * mean), y=mean
        )

        # Batch variance around the (blended) mean, blended likewise
        variance = tf.reduce_mean(
            input_tensor=tf.math.squared_difference(x=x, y=mean), axis=axes,
            keepdims=True
        )
        variance = tf.where(
            condition=self.after_first_call,
            x=(decay * self.moving_variance + (one - decay) * variance), y=variance
        )

        # Mark first call as done only after both statistics were computed
        with tf.control_dependencies(control_inputs=(mean, variance)):
            first_call_done = self.after_first_call.assign(
                value=tf.constant(value=True, dtype=util.tf_dtype(dtype='bool')),
                read_value=False
            )

        # Store the new statistics
        with tf.control_dependencies(control_inputs=(first_call_done,)):
            variance = self.moving_variance.assign(value=variance)
            mean = self.moving_mean.assign(value=mean)

        return mean, variance

    optimization = Module.retrieve_tensor(name='optimization')

    # On the first call the flag takes the current 'optimization' value
    update_on_optimization = tf.where(
        condition=self.after_first_call, x=self.update_on_optimization, y=optimization
    )
    update_on_optimization = self.update_on_optimization.assign(
        value=update_on_optimization
    )

    skip_update = tf.math.logical_or(
        x=Module.retrieve_tensor(name='independent'),
        y=tf.math.not_equal(x=update_on_optimization, y=optimization)
    )

    mean, variance = self.cond(pred=skip_update, true_fn=no_update, false_fn=apply_update)

    # Variance is bounded below by epsilon before taking the reciprocal square root
    epsilon = tf.constant(value=util.epsilon, dtype=util.tf_dtype(dtype='float'))
    reciprocal_stddev = tf.math.rsqrt(x=tf.maximum(x=variance, y=epsilon))

    return (x - tf.stop_gradient(input=mean)) * tf.stop_gradient(input=reciprocal_stddev)
def __init__(self, name, dtype, shape=(), summary_labels=None):
    """
    Normalizes the type/shape specification and registers the (non-batched)
    tensor under the module name.

    Args:
        name: Module name.
        dtype: Value type, normalized via `util.valid_value_spec`.
        shape: Value shape, normalized via `util.valid_value_spec`.
        summary_labels: Forwarded to the parent constructor.
    """
    super().__init__(name=name, summary_labels=summary_labels)

    normalized_spec = util.valid_value_spec(
        value_spec=dict(type=dtype, shape=shape), return_normalized=True
    )
    self.dtype = normalized_spec['type']
    self.shape = normalized_spec['shape']

    Module.register_tensor(name=self.name, spec=normalized_spec, batched=False)
def get_output_spec(self, input_spec):
    """
    Derives the output specification from the specifications of the configured
    tensors and the aggregation type.

    Every entry of `self.tensors` is either a tensor name, whose specification
    is looked up via `Module.get_tensor_spec`, or the wildcard '*', which stands
    for the layer input and hence `input_spec`.

    Args:
        input_spec: Specification of the layer input, used for the wildcard '*'.

    Returns:
        The specification of the single tensor if only one is configured;
        otherwise a dict with the common 'type' and the aggregated 'shape'.

    Raises:
        TensorforceError: If the tensor types differ, or the tensor shapes are
            incompatible with the aggregation type.
    """
    def resolve_spec(tensor_name):
        # Wildcard refers to the layer input, everything else is looked up by name
        if tensor_name == '*':
            return input_spec
        return Module.get_tensor_spec(name=tensor_name)

    # Single tensor: nothing to aggregate. The wildcard has to be handled here
    # too, otherwise '*' would be looked up as if it were a tensor name.
    if len(self.tensors) == 1:
        return resolve_spec(self.tensors[0])

    # Get tensor types and shapes
    specs = [resolve_spec(tensor_name) for tensor_name in self.tensors]
    dtypes = [spec['type'] for spec in specs]
    shapes = [spec['shape'] for spec in specs]

    # Check tensor types
    if all(dtype == dtypes[0] for dtype in dtypes):
        dtype = dtypes[0]
    else:
        raise TensorforceError.value(name='tensor types', value=dtypes)

    if self.aggregation == 'concat':
        # Equal ranks, and equal sizes on all axes but the concatenation axis
        if any(len(shape) != len(shapes[0]) for shape in shapes):
            raise TensorforceError.value(name='tensor shapes', value=shapes)
        elif any(
            shape[n] != shapes[0][n] for shape in shapes for n in range(len(shape))
            if n != self.axis
        ):
            raise TensorforceError.value(name='tensor shapes', value=shapes)
        shape = tuple(
            sum(shape[n] for shape in shapes) if n == self.axis else shapes[0][n]
            for n in range(len(shapes[0]))
        )

    elif self.aggregation == 'stack':
        # Identical shapes; a new axis of size len(shapes) is inserted at self.axis
        if any(len(shape) != len(shapes[0]) for shape in shapes):
            raise TensorforceError.value(name='tensor shapes', value=shapes)
        elif any(shape[n] != shapes[0][n] for shape in shapes for n in range(len(shape))):
            raise TensorforceError.value(name='tensor shapes', value=shapes)
        shape = tuple(
            len(shapes) if n == self.axis else shapes[0][n - int(n > self.axis)]
            for n in range(len(shapes[0]) + 1)
        )

    else:
        # Check and unify tensor shapes: equal ranks, and per axis the sizes
        # must be equal or one of them must be 1
        for shape in shapes:
            if len(shape) != len(shapes[0]):
                raise TensorforceError.value(name='tensor shapes', value=shapes)
            if any(x != y and x != 1 and y != 1 for x, y in zip(shape, shapes[0])):
                raise TensorforceError.value(name='tensor shapes', value=shapes)
        shape = tuple(max(shape[n] for shape in shapes) for n in range(len(shapes[0])))

    # TODO: num_values and min/max_value are missing from the returned spec
    return dict(type=dtype, shape=shape)
def tf_value(self):
    """
    Creates the TensorFlow operations for the current parameter value.

    Returns:
        Identity of the parameter placeholder, as passed through the summary call.
    """
    parameter = self.add_summary(
        label='parameters', name='value',
        tensor=tf.identity(input=self.parameter_input)
    )

    # Required for TensorFlow optimizers learning_rate
    if Module.global_tensors is not None:
        Module.update_tensor(name=self.name, tensor=parameter)

    return parameter
def tf_optimize_baseline(self, indices):
    """
    Creates the TensorFlow operations for one baseline optimization step on the
    memory entries given by `indices`.

    Args:
        indices: Memory indices of the batch to optimize on.

    Returns:
        The operation returned by the baseline optimizer's `minimize`.
    """
    # Retrieve states and internals over the baseline policy's dependency horizon
    # (horizon change: see timestep-based batch sampling)
    horizon = self.baseline_policy.dependency_horizon(is_optimization=True)
    starts, lengths, states, internals = self.memory.predecessors(
        indices=indices, horizon=horizon, sequence_values='states',
        initial_values='internals'
    )
    Module.update_tensors(dependency_starts=starts, dependency_lengths=lengths)

    # Retrieve auxiliaries, actions and reward
    auxiliaries, actions, reward = self.memory.retrieve(
        indices=indices, values=('auxiliaries', 'actions', 'reward')
    )

    # Reward estimation
    reward = self.estimator.estimate1(
        baseline=self.baseline_policy, memory=self.memory, indices=indices, reward=reward
    )

    # Optimizer arguments
    variables = self.baseline_policy.get_variables(only_trainable=True)
    if self.shared_baseline_network:
        # Shared network: the policy network variables are optimized as well
        variables += self.policy.network.get_variables(only_trainable=True)

    arguments = dict(
        states=states, internals=internals, auxiliaries=auxiliaries, actions=actions,
        reward=reward
    )

    def fn_kl_divergence(states, internals, auxiliaries, actions, reward, other=None):
        # Takes the full argument set but only needs states/internals/auxiliaries
        return self.baseline_policy.kl_divergence(
            states=states, internals=internals, auxiliaries=auxiliaries, other=other
        )

    source_variables = self.policy.get_variables(only_trainable=True)

    global_variables = None
    if self.global_model is not None:
        global_variables = self.global_model.baseline_policy.get_variables(
            only_trainable=True
        )

    # Objective-specific optimizer arguments, if a baseline objective is set
    kwargs = dict()
    if self.baseline_objective is not None:
        kwargs = self.baseline_objective.optimizer_arguments(policy=self.baseline_policy)

    # Optimization
    return self.baseline_optimizer.minimize(
        variables=variables, arguments=arguments, fn_loss=self.baseline_loss,
        fn_kl_divergence=fn_kl_divergence, source_variables=source_variables,
        global_variables=global_variables, **kwargs
    )
def tf_core_act(self, states, internals, auxiliaries):
    """
    Creates the TensorFlow operations for acting: policy actions plus the next
    internal states of policy and, where needed, baseline policy.

    Args:
        states: Mapping of state tensors; the first axis of its first value is
            taken as batch size.
        internals: Mapping of internal state tensors.
        auxiliaries: Auxiliary tensors passed on to the policies.

    Returns:
        Tuple (actions, next_internals).
    """
    long_type = util.tf_dtype(dtype='long')
    zero = tf.constant(value=0, dtype=long_type)

    # Dependency horizon: larger of the policy and baseline-policy horizons
    dependency_horizon = tf.math.maximum(
        x=self.policy.dependency_horizon(is_optimization=False),
        y=self.baseline_policy.dependency_horizon(is_optimization=False)
    )

    # TODO: handle arbitrary non-optimization horizons!
    assertion = tf.debugging.assert_equal(x=dependency_horizon, y=zero)

    with tf.control_dependencies(control_inputs=(assertion,)):
        some_state = next(iter(states.values()))
        # tf.shape can emit int32/int64 directly, other long types need a cast
        if util.tf_dtype(dtype='long') in (tf.int32, tf.int64):
            batch_size = tf.shape(
                input=some_state, out_type=util.tf_dtype(dtype='long')
            )[0]
        else:
            batch_size = tf.dtypes.cast(
                x=tf.shape(input=some_state)[0], dtype=util.tf_dtype(dtype='long')
            )

    # With zero horizon, every instance is its own length-one sequence
    starts = tf.range(start=batch_size, dtype=util.tf_dtype(dtype='long'))
    lengths = tf.ones(shape=(batch_size,), dtype=util.tf_dtype(dtype='long'))
    Module.update_tensors(dependency_starts=starts, dependency_lengths=lengths)

    # NOTE: 'baseline-' prefixed internals are not split off before acting
    # (that logic is disabled)

    # Policy act
    actions, next_internals = self.policy.act(
        states=states, internals=internals, auxiliaries=auxiliaries,
        return_internals=True
    )

    # TODO: entropy etc summaries!

    if any(name not in next_internals for name in internals):
        # Some internals were not produced by the policy: baseline policy act
        # to retrieve the remaining next internals
        _, baseline_internals = self.baseline_policy.act(
            states=states, internals=internals, auxiliaries=auxiliaries,
            return_internals=True
        )
        assert all(name not in next_internals for name in baseline_internals)
        next_internals.update(baseline_internals)

    return actions, next_internals
def __init__(self, name, dtype, shape=(), unit=None, summary_labels=None):
    """
    Normalizes the type/shape specification, stores the unit, and registers the
    (non-batched) tensor under the module name.

    Args:
        name: Module name.
        dtype: Value type, normalized via `util.valid_value_spec`.
        shape: Value shape, normalized via `util.valid_value_spec`.
        unit: One of None, 'timesteps', 'episodes', 'updates'.
        summary_labels: Forwarded to the parent constructor.
    """
    super().__init__(name=name, summary_labels=summary_labels)

    assert unit in (None, 'timesteps', 'episodes', 'updates')

    normalized_spec = util.valid_value_spec(
        value_spec=dict(type=dtype, shape=shape), return_normalized=True
    )
    self.dtype = normalized_spec['type']
    self.shape = normalized_spec['shape']
    self.unit = unit

    Module.register_tensor(name=self.name, spec=normalized_spec, batched=False)
def __init__(self, name, action_spec, embedding_shape, summary_labels=None):
    """
    Sets up the linear mean / log-stddev layers and registers the
    '<name>-mean' and '<name>-stddev' tensors.

    Args:
        name: Module name.
        action_spec: Action specification; its 'shape' entry is read here.
        embedding_shape: Shape of the float embedding the layers take as input.
        summary_labels: Forwarded to the parent constructor.

    Raises:
        TensorforceError: If the embedding rank is not within 1..3, or a
            non-flat embedding shape is incompatible with the action shape.
    """
    super().__init__(
        name=name, action_spec=action_spec, embedding_shape=embedding_shape,
        summary_labels=summary_labels
    )

    input_spec = dict(type='float', shape=self.embedding_shape)
    action_shape = self.action_spec['shape']
    embedding_rank = len(self.embedding_shape)

    # Determine the output size of the linear layers
    if embedding_rank == 1:
        # Flat embedding: one output per action element
        size = util.product(xs=action_shape, empty=0)
    elif embedding_rank < 1 or embedding_rank > 3:
        raise TensorforceError.unexpected()
    elif self.embedding_shape[:-1] == action_shape[:-1]:
        # Leading embedding axes match all but the last action axis
        size = action_shape[-1]
    elif self.embedding_shape[:-1] == action_shape:
        # Leading embedding axes match the full action shape
        size = 0
    else:
        raise TensorforceError.unexpected()

    self.mean = self.add_module(
        name='mean', module='linear', modules=layer_modules, size=size,
        input_spec=input_spec
    )
    self.log_stddev = self.add_module(
        name='log-stddev', module='linear', modules=layer_modules, size=size,
        input_spec=input_spec
    )

    # Batched float tensors of action shape for both parameters
    for suffix in ('-mean', '-stddev'):
        Module.register_tensor(
            name=(self.name + suffix), spec=dict(type='float', shape=action_shape),
            batched=True
        )
def tf_core_observe(self, states, internals, auxiliaries, actions, terminal, reward):
    """
    Creates the TensorFlow operations for observing a batch of experience and
    triggering a periodic update if one is due.

    An update is due when the unit counter (timestep or episode) minus the
    effective start is a multiple of the update frequency and greater than the
    value stored in `self.last_update`.

    Args:
        states: State tensors forwarded to `core_experience`.
        internals: Internal state tensors forwarded to `core_experience`.
        auxiliaries: Auxiliary tensors forwarded to `core_experience`.
        actions: Action tensors forwarded to `core_experience`.
        terminal: Terminal tensor forwarded to `core_experience`.
        reward: Reward tensor forwarded to `core_experience`.

    Returns:
        The experience operation if no update frequency is set, otherwise the
        result of the conditional update.
    """
    long_type = util.tf_dtype(dtype='long')
    zero = tf.constant(value=0, dtype=long_type)
    one = tf.constant(value=1, dtype=long_type)

    # Experience
    experienced = self.core_experience(
        states=states, internals=internals, auxiliaries=auxiliaries, actions=actions,
        terminal=terminal, reward=reward
    )

    # If no periodic update
    if self.update_frequency is None:
        return experienced

    # Periodic update
    with tf.control_dependencies(control_inputs=(experienced,)):
        batch_size = self.update_batch_size.value()
        frequency = self.update_frequency.value()
        start = self.update_start.value()

        if self.update_unit == 'timesteps':
            # Timestep-based batch: the baseline horizon is reduced by the
            # estimator's future horizon before taking the maximum
            policy_horizon = self.policy.past_horizon(is_optimization=True)
            baseline_horizon = self.baseline_policy.past_horizon(is_optimization=True) - \
                self.estimator.future_horizon()
            past_horizon = tf.math.maximum(x=policy_horizon, y=baseline_horizon)
            future_horizon = self.estimator.future_horizon()
            start = tf.math.maximum(
                x=start, y=(frequency + past_horizon + future_horizon + one)
            )
            unit = Module.retrieve_tensor(name='timestep')

        elif self.update_unit == 'episodes':
            # Episode-based batch
            start = tf.math.maximum(x=start, y=frequency)
            unit = Module.retrieve_tensor(name='episode')

        # Counter relative to the effective start
        unit = unit - start
        on_frequency = tf.math.equal(x=tf.math.mod(x=unit, y=frequency), y=zero)
        # Only update if the counter advanced past the last recorded update
        is_frequency = tf.math.logical_and(x=on_frequency, y=(unit > self.last_update))

        def perform_update():
            # Record the counter value first, then run the core update
            recorded = self.last_update.assign(value=unit, read_value=False)
            with tf.control_dependencies(control_inputs=(recorded,)):
                return self.core_update()

        is_updated = self.cond(
            pred=is_frequency, true_fn=perform_update, false_fn=util.no_operation
        )

    return is_updated