def linesearch():
    """Run the configured line search on the proposed deltas.

    Closure over ``fn_loss``, ``arguments``, ``variables``, ``deltas``,
    ``loss_before`` and ``self`` from the enclosing scope.

    Returns:
        Tuple with the values of the TensorDict returned by
        ``self.line_search.solve``.
    """
    loss_after = fn_loss(**arguments.to_kwargs())

    with tf.control_dependencies(control_inputs=(loss_after,)):
        # Keys are the variable names without their last two characters and with
        # "/" swapped for "_", which keeps the TensorDict flat.
        initial_deltas = TensorDict(
            (variable.name[:-2].replace('/', '_'), delta)
            for variable, delta in zip(variables, deltas)
        )

        # TODO: should be moved to initialize_given_variables, but fn_loss...
        def evaluate_step(arguments, deltas):
            # Add the candidate deltas to the variables, then re-evaluate the loss.
            updates = [
                variable.assign_add(delta=delta, read_value=False)
                for variable, delta in zip(variables, deltas.values())
            ]
            with tf.control_dependencies(control_inputs=updates):
                return fn_loss(**arguments.to_kwargs())

        solved_deltas = self.line_search.solve(
            arguments=arguments, x_init=initial_deltas, base_value=loss_before,
            zero_value=loss_after, fn_x=evaluate_step
        )
        return tuple(solved_deltas.values())
def core_act(self, *, states, internals, auxiliaries, parallel, deterministic, independent):
    """Build a constant action tensor for every entry of ``self.actions_spec``.

    For each action the first matching rule decides the value: user-specified
    value, first unmasked choice (int action masking), midpoint of min/max,
    min only, max only, otherwise zero.

    Returns:
        Tuple of the actions TensorDict and an empty TensorDict.
    """
    assert len(internals) == 0

    actions = TensorDict()
    for name, spec in self.actions_spec.items():
        # Full output shape: leading dimension of the state input + action shape
        shape = tf.concat(values=(
            tf_util.cast(x=tf.shape(input=states.value())[:1], dtype='int'),
            tf_util.constant(value=spec.shape, dtype='int')
        ), axis=0)

        def filled_with(value):
            # Tensor of the full output shape holding a single constant value
            scalar = tf_util.constant(value=value, dtype=spec.type)
            return tf.fill(dims=shape, value=scalar)

        if self.action_values is not None and name in self.action_values:
            # User-specified action value
            actions[name] = filled_with(self.action_values[name])

        elif self.config.enable_int_action_masking and spec.type == 'int' and \
                spec.num_values is not None:
            # Masking: pick the first unmasked choice of every action element
            mask = auxiliaries[name]['mask']
            candidates = tf_util.constant(
                value=list(range(spec.num_values)), dtype='int',
                shape=((1,) * len(spec.shape) + (1, spec.num_values))
            )
            one = tf_util.constant(value=1, dtype='int', shape=(1,))
            candidates = tf.tile(
                input=candidates, multiples=tf.concat(values=(shape, one), axis=0)
            )
            valid_candidates = tf.boolean_mask(tensor=candidates, mask=mask)
            int_mask = tf_util.cast(x=mask, dtype='int')
            valid_counts = tf.math.reduce_sum(input_tensor=int_mask, axis=(spec.rank + 1))
            valid_counts = tf.reshape(tensor=valid_counts, shape=(-1,))
            # Exclusive cumsum gives the offset of each element's first valid choice
            # within the flattened list of valid candidates
            first_valid = tf.math.cumsum(x=valid_counts, axis=0, exclusive=True)
            chosen = tf.gather(params=valid_candidates, indices=first_valid)
            actions[name] = tf.reshape(tensor=chosen, shape=shape)

        elif spec.type != 'bool' and spec.min_value is not None and \
                spec.max_value is not None:
            # Both bounds given: midpoint
            actions[name] = filled_with(
                spec.min_value + 0.5 * (spec.max_value - spec.min_value)
            )

        elif spec.type != 'bool' and spec.min_value is not None:
            # Only min_value given
            actions[name] = filled_with(spec.min_value)

        elif spec.type != 'bool' and spec.max_value is not None:
            # Only max_value given
            actions[name] = filled_with(spec.max_value)

        else:
            # Otherwise zero
            actions[name] = tf_util.zeros(shape=shape, dtype=spec.type)

    return actions, TensorDict()
def independent_act(self, *, states, internals=None, auxiliaries=None):
    """Compute actions for the given states via ``core_act`` with ``independent=True``.

    Args:
        states: State inputs; the leading dimension of ``states.value()`` is used
            as the batch size for the input assertions.
        internals: Internal state inputs, or None if ``self.internals_spec`` is empty.
        auxiliaries: Auxiliary inputs (e.g. action masks), or None if
            ``self.auxiliaries_spec`` is empty.

    Returns:
        OrderedDict of the actions, or of actions and internals when
        ``self.internals_spec`` is non-empty (SavedModel requires flattened output).
    """
    if internals is None:
        assert len(self.internals_spec) == 0
        internals = TensorDict()
    if auxiliaries is None:
        assert len(self.auxiliaries_spec) == 0
        auxiliaries = TensorDict()

    true = tf_util.constant(value=True, dtype='bool')
    batch_size = tf_util.cast(x=tf.shape(input=states.value())[0], dtype='int')

    # Input assertions
    assertions = list()
    if self.config.create_tf_assertions:
        assertions.extend(self.states_spec.tf_assert(
            x=states, batch_size=batch_size,
            message='Agent.independent_act: invalid {issue} for {name} state input.'
        ))
        assertions.extend(self.internals_spec.tf_assert(
            x=internals, batch_size=batch_size,
            message='Agent.independent_act: invalid {issue} for {name} internal input.'
        ))
        assertions.extend(self.auxiliaries_spec.tf_assert(
            x=auxiliaries, batch_size=batch_size,
            message='Agent.independent_act: invalid {issue} for {name} input.'
        ))
        # Mask assertions
        if self.config.enable_int_action_masking:
            for name, spec in self.actions_spec.items():
                # Only int actions with a known number of values are masked; without
                # this guard the lookup below fails for int actions that have no
                # num_values (and hence, presumably, no mask entry).
                if spec.type == 'int' and spec.num_values is not None:
                    assertions.append(tf.debugging.assert_equal(
                        x=tf.reduce_all(input_tensor=tf.math.reduce_any(
                            input_tensor=auxiliaries[name]['mask'], axis=(spec.rank + 1)
                        )), y=true,
                        message="Agent.independent_act: at least one action has to be valid."
                    ))

    with tf.control_dependencies(control_inputs=assertions):
        # Core act
        parallel = tf_util.zeros(shape=(1,), dtype='int')
        actions, internals = self.core_act(
            states=states, internals=internals, auxiliaries=auxiliaries, parallel=parallel,
            independent=True
        )
        # Skip action assertions

        # SavedModel requires flattened output
        if len(self.internals_spec) > 0:
            return OrderedDict(TensorDict(actions=actions, internals=internals))
        else:
            return OrderedDict(actions)
def apply(self, *, x, deterministic, independent):
    """Pass the singleton input through every layer of ``self.layers``, in order.

    ``Register`` layers store their output under ``layer.tensor``;
    ``MultiInputLayer`` layers take their input from the stored tensors (the
    initial input is stored as ``input``).

    Raises:
        TensorforceError: if a tensor name is registered twice, or a requested
            tensor has not been registered.
    """
    assert x.is_singleton()
    x = x.singleton()
    registered_tensors = TensorDict(input=x)

    for layer in self.layers:
        if isinstance(layer, Register):
            if layer.tensor in registered_tensors:
                raise TensorforceError.exists(name='registered tensor', value=layer.tensor)
            x = layer.apply(x=x)
            registered_tensors[layer.tensor] = x
            continue

        if isinstance(layer, MultiInputLayer):
            if layer.tensors not in registered_tensors:
                raise TensorforceError.exists_not(
                    name='registered tensor', value=layer.tensors
                )
            x = layer.apply(x=registered_tensors[layer.tensors])
            continue

        # The remaining layer kinds differ only in the extra keyword they accept.
        if isinstance(layer, NondeterministicLayer):
            extra = dict(deterministic=deterministic)
        elif isinstance(layer, StatefulLayer):
            extra = dict(independent=independent)
        else:
            extra = dict()
        x = layer.apply(x=x, **extra)

    return x
def state_value(self, *, states, horizons, internals, auxiliaries):
    """Return the state value for the given inputs.

    In ``'separate'`` mode the value is computed by ``self.s_value`` on the network
    embedding (``state-embedding`` if present, else ``embedding``); in every other
    mode the parent implementation is used.
    """
    if self.state_value_mode != 'separate':
        return super().state_value(
            states=states, horizons=horizons, internals=internals, auxiliaries=auxiliaries
        )

    deterministic = tf_util.constant(value=True, dtype='bool')
    embedding, _ = self.network.apply(
        x=states, horizons=horizons, internals=internals, deterministic=deterministic,
        independent=True
    )
    # A plain network output is wrapped so that both cases are looked up the same way
    if not isinstance(embedding, TensorDict):
        embedding = TensorDict(embedding=embedding)

    fallback = embedding['embedding']
    return self.s_value.apply(x=embedding.get('state-embedding', fallback))
def fisher_matrix_product(arguments, deltas):
    """Compute ``delta' * F`` as ``grad(delta' * grad(kldiv))`` with two nested tapes.

    Closure over ``variables``, ``fn_kl_divergence`` and ``self`` from the
    enclosing scope. Missing gradients are replaced by zeros.

    Returns:
        TensorDict keyed by ``var.name`` with one entry per variable.
    """
    def zeros_where_none(grads):
        # Substitute a zero tensor for every variable without a gradient
        return [
            tf.zeros_like(input=variable) if grad is None else grad
            for variable, grad in zip(variables, grads)
        ]

    # Second-order gradients: outer tape differentiates through the inner gradient
    with tf.GradientTape(persistent=False, watch_accessed_variables=False) as outer_tape:
        for variable in variables:
            outer_tape.watch(tensor=variable)

        with tf.GradientTape(persistent=False, watch_accessed_variables=False) as inner_tape:
            for variable in variables:
                inner_tape.watch(tensor=variable)

            # kldiv
            kldiv = fn_kl_divergence(**arguments.to_kwargs())

        # grad(kldiv)
        kldiv_grads = zeros_where_none(
            inner_tape.gradient(target=kldiv, sources=variables)
        )

        # delta' * grad(kldiv)
        multiply = functools.partial(
            tf_util.lift_indexedslices, tf.math.multiply,
            with_assertions=self.config.create_tf_assertions
        )
        products = [
            tf.math.reduce_sum(input_tensor=multiply(delta, grad))
            for delta, grad in zip(deltas.values(), kldiv_grads)
        ]
        delta_kldiv_grads = tf.math.add_n(inputs=products)

    # [delta' * F] = grad(delta' * grad(kldiv))
    second_order = outer_tape.gradient(target=delta_kldiv_grads, sources=variables)
    return TensorDict(
        (variable.name, tf.zeros_like(input=variable) if grad is None else grad)
        for variable, grad in zip(variables, second_order)
    )
def action_values(self, *, states, horizons, internals, auxiliaries, actions):
    """Return, per distribution, the value of the given actions.

    The network is applied deterministically and with ``independent=True``; each
    distribution is parametrized from its own embedding (``<name>-embedding``, or
    ``action-embedding`` when the name is None), falling back to ``embedding``.
    """
    embedding, _ = self.network.apply(
        x=states, horizons=horizons, internals=internals,
        deterministic=tf_util.constant(value=True, dtype='bool'), independent=True
    )
    if not isinstance(embedding, TensorDict):
        embedding = TensorDict(embedding=embedding)

    def function(name, distribution, action):
        key = 'action-embedding' if name is None else name + '-embedding'
        x = embedding.get(key, embedding['embedding'])
        parameters = distribution.parametrize(
            x=x, conditions=auxiliaries.get(name, default=TensorDict())
        )
        return distribution.action_value(parameters=parameters, action=action)

    return self.distributions.fmap(
        function=function, cls=TensorDict, with_names=True, zip_values=actions
    )
def parametrize(self, *, x, conditions):
    """Compute ``alpha``, ``beta``, ``alpha_beta`` and ``log_norm`` from input ``x``.

    Returns:
        TensorDict with keys ``alpha``, ``beta``, ``alpha_beta``, ``log_norm``.
    """
    one = tf_util.constant(value=1.0, dtype='float')
    epsilon = tf_util.constant(value=util.epsilon, dtype='float')
    log_epsilon = tf_util.constant(value=np.log(util.epsilon), dtype='float')
    shape = (-1,) + self.action_spec.shape

    def concentration(layer):
        # Clip the layer output (epsilon < 1.0, hence log_epsilon is negative), then
        # softplus + 1 to ensure the result is >= 1
        raw = layer.apply(x=x)
        raw = tf.clip_by_value(
            t=raw, clip_value_min=log_epsilon, clip_value_max=-log_epsilon
        )
        value = tf.math.softplus(features=raw) + one
        if len(self.input_spec.shape) == 1:
            value = tf.reshape(tensor=value, shape=shape)
        return value

    alpha = concentration(self.alpha)
    beta = concentration(self.beta)

    # Alpha + Beta, bounded below by epsilon
    alpha_beta = tf.maximum(x=(alpha + beta), y=epsilon)

    # Log norm
    log_norm = (
        tf.math.lgamma(x=alpha) + tf.math.lgamma(x=beta) - tf.math.lgamma(x=alpha_beta)
    )

    return TensorDict(alpha=alpha, beta=beta, alpha_beta=alpha_beta, log_norm=log_norm)
def parametrize(self, *, x, conditions):
    """Compute ``mean``, ``stddev`` and ``log_stddev`` from input ``x``.

    With ``self.global_stddev`` the log stddev is ``self.log_stddev`` tiled along
    the first dimension of ``x``; otherwise it is produced by applying
    ``self.log_stddev`` to ``x``.
    """
    log_epsilon = tf_util.constant(value=np.log(util.epsilon), dtype='float')
    shape = (-1,) + self.action_spec.shape

    # Mean
    mean = self.mean.apply(x=x)
    if len(self.input_spec.shape) == 1:
        mean = tf.reshape(tensor=mean, shape=shape)

    # Log standard deviation
    if not self.global_stddev:
        log_stddev = self.log_stddev.apply(x=x)
        if len(self.input_spec.shape) == 1:
            log_stddev = tf.reshape(tensor=log_stddev, shape=shape)
    else:
        ones = tuple(1 for _ in range(self.action_spec.rank))
        log_stddev = tf.tile(
            input=self.log_stddev, multiples=((tf.shape(input=x)[0],) + ones)
        )

    # Shift log stddev to reduce zero value (TODO: 0.1 random choice)
    bounded = self.action_spec.min_value is not None and \
        self.action_spec.max_value is not None
    if bounded:
        log_stddev += tf_util.constant(value=np.log(0.1), dtype='float')

    # Clip log_stddev for numerical stability (epsilon < 1.0, hence negative)
    log_stddev = tf.clip_by_value(
        t=log_stddev, clip_value_min=log_epsilon, clip_value_max=-log_epsilon
    )

    return TensorDict(
        mean=mean, stddev=tf.math.exp(x=log_stddev), log_stddev=log_stddev
    )
def function(name, distribution):
    """Parametrize ``distribution`` from the shared embedding and draw a sample.

    Closure over ``embedding``, ``auxiliaries``, ``temperature`` and
    ``independent`` from the enclosing scope.
    """
    parameters = distribution.parametrize(
        x=embedding, conditions=auxiliaries.get(name, default=TensorDict())
    )
    return distribution.sample(
        parameters=parameters, temperature=temperature, independent=independent
    )
def parametrize(self, *, x, conditions):
    """Compute ``mean``, ``stddev`` and ``log_stddev`` from input ``x``.

    With ``self.global_stddev`` the log stddev is ``self.log_stddev`` itself;
    otherwise it is produced by applying ``self.log_stddev`` to ``x``.
    """
    log_epsilon = tf_util.constant(value=np.log(util.epsilon), dtype='float')
    shape = (-1,) + self.action_spec.shape

    def reshaped(tensor):
        # Reshape to the action shape only for rank-1 inputs
        if len(self.input_spec.shape) == 1:
            return tf.reshape(tensor=tensor, shape=shape)
        return tensor

    # Mean
    mean = reshaped(self.mean.apply(x=x))

    # Log standard deviation
    if self.global_stddev:
        log_stddev = self.log_stddev
    else:
        log_stddev = reshaped(self.log_stddev.apply(x=x))

    # Clip log_stddev for numerical stability (epsilon < 1.0, hence negative)
    log_stddev = tf.clip_by_value(
        t=log_stddev, clip_value_min=log_epsilon, clip_value_max=-log_epsilon
    )

    # Standard deviation
    stddev = tf.exp(x=log_stddev)

    return TensorDict(mean=mean, stddev=stddev, log_stddev=log_stddev)
def apply(self, *, x, horizons, internals, deterministic, independent):
    """Pass the input through every layer of ``self.layers`` and return ``(x, internals)``.

    A singleton input is registered as ``state``; otherwise a copy of the input
    serves as the initial set of registered tensors. ``TemporalLayer`` outputs
    overwrite ``internals[layer.name]``.

    Raises:
        TensorforceError: if a tensor name is registered twice, or a requested
            tensor has not been registered.
    """
    registered_tensors = (
        TensorDict(state=x.singleton()) if x.is_singleton() else x.copy()
    )
    x = x.value()

    for layer in self.layers:
        if isinstance(layer, Register):
            if layer.tensor in registered_tensors:
                raise TensorforceError.exists(name='registered tensor', value=layer.tensor)
            x = layer.apply(x=x)
            registered_tensors[layer.tensor] = x

        elif isinstance(layer, MultiInputLayer):
            if layer.tensors not in registered_tensors:
                raise TensorforceError.exists_not(
                    name='registered tensor', value=layer.tensors
                )
            x = layer.apply(x=registered_tensors[layer.tensors])

        elif isinstance(layer, NondeterministicLayer):
            x = layer.apply(x=x, deterministic=deterministic)

        elif isinstance(layer, StatefulLayer):
            x = layer.apply(x=x, independent=independent)

        elif isinstance(layer, TemporalLayer):
            x, next_internals = layer.apply(
                x=x, horizons=horizons, internals=internals[layer.name]
            )
            internals[layer.name] = next_internals

        else:
            x = layer.apply(x=x)

    return x, internals
def function(name, distribution):
    """Parametrize ``distribution`` from its embedding and return the parameters.

    Closure over ``embedding`` and ``auxiliaries``. The embedding key is
    ``action-embedding`` for an unnamed distribution and ``<name>-embedding``
    otherwise, with ``embedding`` as fallback.
    """
    key = 'action-embedding' if name is None else name + '-embedding'
    return distribution.parametrize(
        x=embedding.get(key, embedding['embedding']),
        conditions=auxiliaries.get(name, default=TensorDict())
    )
def parametrize(self, *, x, conditions):
    """Compute logits, probability and state value from input ``x``.

    Returns:
        TensorDict with keys ``true_logit``, ``false_logit``, ``probability``
        and ``state_value`` (the unclipped logit).
    """
    one = tf_util.constant(value=1.0, dtype='float')
    epsilon = tf_util.constant(value=util.epsilon, dtype='float')

    # Logit
    logit = self.logit.apply(x=x)
    if len(self.input_spec.shape) == 1:
        logit = tf.reshape(tensor=logit, shape=((-1,) + self.action_spec.shape))

    # Sigmoid for corresponding probability, clipped for numerical stability
    probability = tf.clip_by_value(
        t=tf.sigmoid(x=logit), clip_value_min=epsilon, clip_value_max=(one - epsilon)
    )

    # "Normalized" logits
    true_logit = tf.math.log(x=probability)
    false_logit = tf.math.log(x=(one - probability))

    return TensorDict(
        true_logit=true_logit, false_logit=false_logit, probability=probability,
        state_value=logit
    )
def function(name, distribution):
    """Return ``(mode, entropy)`` of ``distribution`` parametrized from the shared embedding.

    Closure over ``embedding``, ``auxiliaries`` and ``independent``.
    """
    parameters = distribution.parametrize(
        x=embedding, conditions=auxiliaries.get(name, default=TensorDict())
    )
    return (
        distribution.mode(parameters=parameters, independent=independent),
        distribution.entropy(parameters=parameters),
    )
def apply(self, *, x, deterministic, independent):
    """Apply ``self.layers`` to the singleton input via ``Preprocessor._recursive_apply``.

    The unwrapped input is made available to the layers as registered tensor
    ``input``.
    """
    assert x.is_singleton()
    value = x.singleton()
    return Preprocessor._recursive_apply(
        layer=self.layers, x=value, deterministic=deterministic, independent=independent,
        registered_tensors=TensorDict(input=value)
    )
def function(name, distribution):
    """Parametrize ``distribution`` from its embedding and draw a sample.

    Closure over ``embedding``, ``auxiliaries``, ``temperature`` and
    ``independent``. The embedding key is ``action-embedding`` for an unnamed
    distribution and ``<name>-embedding`` otherwise, with ``embedding`` as fallback.
    """
    key = 'action-embedding' if name is None else name + '-embedding'
    parameters = distribution.parametrize(
        x=embedding.get(key, embedding['embedding']),
        conditions=auxiliaries.get(name, default=TensorDict())
    )
    return distribution.sample(
        parameters=parameters, temperature=temperature, independent=independent
    )
def function(name, distribution):
    """Return ``(mode, entropy)`` of ``distribution`` parametrized from its embedding.

    Closure over ``embedding``, ``auxiliaries`` and ``independent``. The embedding
    key is ``action-embedding`` for an unnamed distribution and
    ``<name>-embedding`` otherwise, with ``embedding`` as fallback.
    """
    key = 'action-embedding' if name is None else name + '-embedding'
    parameters = distribution.parametrize(
        x=embedding.get(key, embedding['embedding']),
        conditions=auxiliaries.get(name, default=TensorDict())
    )
    return (
        distribution.mode(parameters=parameters, independent=independent),
        distribution.entropy(parameters=parameters),
    )
def next_internals(self, *, states, horizons, internals, actions, deterministic, independent):
    """Delegate to the parent ``next_internals`` with states and actions merged into one input.

    Singleton states/actions are unwrapped before being stored under the keys
    ``states`` and ``actions``.
    """
    inputs = TensorDict()
    inputs['states'] = states.singleton() if self.states_spec.is_singleton() else states
    inputs['actions'] = actions.singleton() if self.actions_spec.is_singleton() else actions

    return super().next_internals(
        states=inputs, horizons=horizons, internals=internals, deterministic=deterministic,
        independent=independent
    )
def apply(self, *, x, horizons, internals, deterministic, independent):
    """Apply ``self.layers`` via ``LayeredNetwork._recursive_apply``.

    Returns:
        Tuple ``(x, internals)``. When ``self.outputs`` is set, ``x`` is a
        TensorDict holding the final output as ``embedding`` plus every registered
        tensor named in ``self.outputs``.
    """
    registered_tensors = (
        TensorDict(state=x.singleton()) if x.is_singleton() else x.copy()
    )

    result, _ = LayeredNetwork._recursive_apply(
        layer=self.layers, x=x.value(), horizons=horizons, internals=internals,
        deterministic=deterministic, independent=independent,
        registered_tensors=registered_tensors, temporal_layer_check=False
    )

    if self.outputs is None:
        return result, internals

    outputs = TensorDict(embedding=result)
    outputs.update((name, registered_tensors[name]) for name in self.outputs)
    return outputs, internals
def parametrize(self, *, x, conditions):
    """Compute ``mean``, ``stddev`` and ``log_stddev`` using a softplus-transformed stddev.

    In ``'global'`` stddev mode ``self.softplus_stddev`` is tiled along the first
    dimension of ``x``; otherwise it is applied to ``x``.
    """
    epsilon = tf_util.constant(value=util.epsilon, dtype='float')
    log_epsilon = tf_util.constant(value=np.log(util.epsilon), dtype='float')

    # Mean
    mean = self.mean.apply(x=x)
    if self.input_spec.rank == 1:
        shape = (-1,) + self.action_spec.shape
        mean = tf.reshape(tensor=mean, shape=shape)

    # Softplus standard deviation
    if self.stddev_mode != 'global':
        softplus_stddev = self.softplus_stddev.apply(x=x)
        if self.input_spec.rank == 1:
            softplus_stddev = tf.reshape(tensor=softplus_stddev, shape=shape)
    else:
        ones = tuple(1 for _ in range(self.action_spec.rank))
        softplus_stddev = tf.tile(
            input=self.softplus_stddev, multiples=((tf.shape(input=x)[0],) + ones)
        )

    # Clip softplus_stddev for numerical stability (epsilon < 1.0, hence negative)
    softplus_stddev = tf.clip_by_value(
        t=softplus_stddev, clip_value_min=log_epsilon, clip_value_max=-log_epsilon
    )

    # Softplus transformation (based on https://arxiv.org/abs/2007.06059)
    softplus_shift = tf_util.constant(value=0.2, dtype='float')
    log_two = tf_util.constant(value=np.log(2.0), dtype='float')
    numerator = tf.nn.softplus(features=softplus_stddev) + softplus_shift
    stddev = numerator / (log_two + softplus_shift)

    # Scale stddev to reduce zero value to 0.25 (TODO: 0.25 random choice)
    bounded = self.action_spec.min_value is not None and \
        self.action_spec.max_value is not None
    if bounded:
        stddev *= tf_util.constant(value=0.25, dtype='float')

    # Log stddev
    log_stddev = tf.math.log(x=(stddev + epsilon))

    return TensorDict(mean=mean, stddev=stddev, log_stddev=log_stddev)
def action_value(self, *, states, horizons, internals, auxiliaries, actions):
    """Return ``self.value`` applied to the network embedding of states and actions.

    States and actions are combined into one network input under the keys
    ``states`` and ``actions`` (singletons unwrapped); the network is applied
    deterministically and with ``independent=True``.
    """
    inputs = TensorDict()
    inputs['states'] = states.singleton() if self.states_spec.is_singleton() else states
    inputs['actions'] = actions.singleton() if self.actions_spec.is_singleton() else actions

    embedding, _ = self.network.apply(
        x=inputs, horizons=horizons, internals=internals,
        deterministic=tf_util.constant(value=True, dtype='bool'), independent=True
    )

    return self.value.apply(x=embedding)
def parametrize(self, *, x, conditions):
    """Compute categorical distribution parameters from input ``x``.

    Args:
        x: Input to ``self.action_values`` (and ``self.state_value`` if set).
        conditions: Must provide ``conditions['mask']`` when
            ``self.config.enable_int_action_masking`` is set.

    Returns:
        TensorDict with keys ``logits``, ``probabilities``, ``states_value`` and
        ``action_values``.
    """
    epsilon = tf_util.constant(value=util.epsilon, dtype='float')
    shape = (-1,) + self.action_spec.shape + (self.action_spec.num_values,)

    # Action values
    action_values = self.action_values.apply(x=x)
    action_values = tf.reshape(tensor=action_values, shape=shape)

    # States value
    if self.state_value is None:
        # Implicit states value (TODO: experimental)
        states_value = tf.reduce_logsumexp(input_tensor=action_values, axis=-1)

    else:
        # Explicit states value and advantage-based action values:
        #     Q = V + (A - mean(A))
        # The advantages have to be centred BEFORE the states value is added.
        # Subtracting the mean of (V + A) instead cancels V and leaves only
        # A - mean(A). The softmax below is shift-invariant, so probabilities and
        # logits are unaffected by this ordering; only action_values changes.
        states_value = self.state_value.apply(x=x)
        states_value = tf.reshape(tensor=states_value, shape=shape[:-1])
        action_values -= tf.math.reduce_mean(
            input_tensor=action_values, axis=-1, keepdims=True
        )
        action_values = tf.expand_dims(input=states_value, axis=-1) + action_values

    # Masking (TODO: before or after above?)
    if self.config.enable_int_action_masking:
        min_float = tf.fill(
            dims=tf.shape(input=action_values),
            value=tf_util.get_dtype(type='float').min
        )
        action_values = tf.where(condition=conditions['mask'], x=action_values, y=min_float)

    # Softmax for corresponding probabilities
    probabilities = tf.nn.softmax(logits=action_values, axis=-1)

    # "Normalized" logits
    logits = tf.math.log(x=tf.maximum(x=probabilities, y=epsilon))

    return TensorDict(
        logits=logits, probabilities=probabilities, states_value=states_value,
        action_values=action_values
    )
def apply(self, *, x, horizons, internals, deterministic, independent):
    """Apply ``self.layers`` via ``LayeredNetwork._recursive_apply``.

    A singleton input is registered as ``state``; otherwise a copy of the input
    serves as the initial set of registered tensors.

    Returns:
        Tuple of the final layer output and ``internals``.
    """
    if x.is_singleton():
        registered = TensorDict(state=x.singleton())
    else:
        registered = x.copy()

    output, _ = LayeredNetwork._recursive_apply(
        layer=self.layers, x=x.value(), horizons=horizons, internals=internals,
        deterministic=deterministic, independent=independent,
        registered_tensors=registered, temporal_layer_check=False
    )
    return output, internals
def apply(self, *, x, horizons, internals, deterministic, independent):
    """Pass the input through every layer of ``self.layers`` and return ``(x, internals)``.

    A second ``TemporalLayer`` is rejected unless a ``MultiInputLayer`` was
    applied since the previous one.

    Raises:
        TensorforceError: if a tensor name is registered twice, a requested tensor
            has not been registered, or temporal layers follow each other as
            described above.
    """
    registered_tensors = (
        TensorDict(state=x.singleton()) if x.is_singleton() else x.copy()
    )
    x = x.value()

    # True once a temporal layer was applied; reset only by a MultiInputLayer
    seen_temporal = False

    for layer in self.layers:
        if isinstance(layer, Register):
            if layer.tensor in registered_tensors:
                raise TensorforceError.exists(name='registered tensor', value=layer.tensor)
            x = layer.apply(x=x)
            registered_tensors[layer.tensor] = x
            continue

        if isinstance(layer, MultiInputLayer):
            if layer.tensors not in registered_tensors:
                raise TensorforceError.exists_not(
                    name='registered tensor', value=layer.tensors
                )
            x = layer.apply(x=registered_tensors[layer.tensors])
            seen_temporal = False
            continue

        if isinstance(layer, NondeterministicLayer):
            x = layer.apply(x=x, deterministic=deterministic)
        elif isinstance(layer, StatefulLayer):
            x = layer.apply(x=x, independent=independent)
        elif isinstance(layer, TemporalLayer):
            if seen_temporal:
                raise TensorforceError(
                    "Multiple successive temporal layers like RNNs are currently not supported."
                )
            x, next_internals = layer.apply(
                x=x, horizons=horizons, internals=internals[layer.name]
            )
            internals[layer.name] = next_internals
            seen_temporal = True
        else:
            x = layer.apply(x=x)

    return x, internals
def fn_initial_gradients(*, states, horizons, internals, auxiliaries, actions, reward, reference):
    """Return the negated gradient of the baseline's actions value w.r.t. the policy action.

    Closure over ``policy`` and ``baseline`` from the enclosing scope. The
    ``actions`` argument is replaced by the actions produced by ``policy.act``;
    ``reward`` and ``reference`` are not used.

    Returns:
        ``-gradient[0]`` for a rank-1 action, ``-gradient[0][0]`` for a rank-2
        action whose second dimension is 1.

    Raises:
        AssertionError: if the policy yields more than one action, the action has
            rank > 2, or the action shape is otherwise unsupported.
    """
    if 'policy' in internals:
        policy_internals = internals['policy']
        baseline_internals = internals['baseline']
    else:
        policy_internals = internals
        # TODO: Baseline currently cannot have internal states, since generally only
        # policy internals are passed to policy optimizer
        assert len(baseline.internals_spec) == 0
        baseline_internals = TensorDict()

    actions = policy.act(
        states=states, horizons=horizons, internals=policy_internals,
        auxiliaries=auxiliaries, independent=True, return_internals=False
    )
    assert len(actions) == 1
    action = actions.value()
    shape = tf_util.shape(x=action)
    assert len(shape) <= 2

    with tf.GradientTape(persistent=False, watch_accessed_variables=False) as tape:
        # The action is a plain tensor, so it has to be watched explicitly
        tape.watch(tensor=action)
        actions_value = baseline.actions_value(
            states=states, horizons=horizons, internals=baseline_internals,
            auxiliaries=auxiliaries, actions=actions, reduced=True, return_per_action=False
        )

    if len(shape) == 1:
        return -tape.gradient(target=actions_value, sources=action)[0]
    elif len(shape) == 2 and shape[1] == 1:
        return -tape.gradient(target=actions_value, sources=action)[0][0]
    else:
        # Raised explicitly: "assert False" is stripped under "python -O", which
        # would make this branch silently return None.
        raise AssertionError(
            "fn_initial_gradients: unsupported action shape {}.".format(shape)
        )
def apply(self, *, x, independent):
    """Pass ``x`` through every layer of ``self.layers``, in order, and return the result.

    The initial input is available to ``MultiInputLayer`` layers as registered
    tensor ``x``.

    Raises:
        TensorforceError: if a tensor name is registered twice, or a requested
            tensor has not been registered.
    """
    registered_tensors = TensorDict(x=x)

    for layer in self.layers:
        if isinstance(layer, Register):
            if layer.tensor in registered_tensors:
                raise TensorforceError.exists(name='registered tensor', value=layer.tensor)
            x = layer.apply(x=x)
            registered_tensors[layer.tensor] = x
            continue

        if isinstance(layer, MultiInputLayer):
            if layer.tensors not in registered_tensors:
                raise TensorforceError.exists_not(
                    name='registered tensor', value=layer.tensors
                )
            x = layer.apply(x=registered_tensors[layer.tensors])
            continue

        # Only stateful layers take the independent flag
        extra = dict(independent=independent) if isinstance(layer, StatefulLayer) else dict()
        x = layer.apply(x=x, **extra)

    return x
def act_entropy(self, *, states, horizons, internals, auxiliaries, deterministic, independent):
    """Compute actions together with the mean entropy of the action distributions.

    Actions are the distribution modes when ``deterministic`` is true and samples
    (using ``self.temperature``) otherwise.

    Returns:
        Tuple ``(actions, internals, entropy)``; ``entropy`` is the mean over all
        action entropies flattened along axis 1.
    """
    assertions = list()
    if self.config.create_tf_assertions and not independent:
        # When not independent, deterministic is asserted to be false
        false = tf_util.constant(value=False, dtype='bool')
        assertions.append(tf.debugging.assert_equal(x=deterministic, y=false))

    embedding, internals = self.network.apply(
        x=states, horizons=horizons, internals=internals, deterministic=deterministic,
        independent=independent
    )
    if not isinstance(embedding, TensorDict):
        embedding = TensorDict(embedding=embedding)

    def parametrized(name, distribution):
        # Per-distribution embedding with "embedding" as fallback
        key = 'action-embedding' if name is None else name + '-embedding'
        x = embedding.get(key, embedding['embedding'])
        conditions = auxiliaries.get(name, default=TensorDict())
        return distribution.parametrize(x=x, conditions=conditions)

    def fn_mode():
        def function(name, distribution):
            parameters = parametrized(name, distribution)
            mode = distribution.mode(parameters=parameters, independent=independent)
            return mode, distribution.entropy(parameters=parameters)

        return self.distributions.fmap(function=function, cls=TensorDict, with_names=True)

    def fn_sample():
        def sample_with(name, distribution, temp):
            parameters = parametrized(name, distribution)
            sample = distribution.sample(
                parameters=parameters, temperature=temp, independent=independent
            )
            return sample, distribution.entropy(parameters=parameters)

        if isinstance(self.temperature, dict):
            # One temperature per distribution, zipped into the mapped function
            def function(name, distribution, temp):
                return sample_with(name, distribution, temp)

            temperature = self.temperature.fmap(
                function=(lambda x: x.value()), cls=TensorDict
            )
            return self.distributions.fmap(
                function=function, cls=TensorDict, with_names=True,
                zip_values=(temperature,)
            )

        # Single temperature shared by all distributions
        temperature = self.temperature.value()

        def function(name, distribution):
            return sample_with(name, distribution, temperature)

        return self.distributions.fmap(function=function, cls=TensorDict, with_names=True)

    with tf.control_dependencies(control_inputs=assertions):
        actions, entropies = tf.cond(
            pred=deterministic, true_fn=fn_mode, false_fn=fn_sample
        )

        def function(value, spec):
            return tf.reshape(tensor=value, shape=(-1, spec.size))

        # See also implementation of StochasticPolicy.entropy()
        entropies = entropies.fmap(function=function, zip_values=self.actions_spec)
        flat_entropies = tf.concat(values=tuple(entropies.values()), axis=1)
        entropy = tf.math.reduce_mean(input_tensor=flat_entropies, axis=1)

    return actions, internals, entropy
def act(self, *, states, horizons, internals, auxiliaries, deterministic, independent):
    """Compute actions from the action distributions.

    Actions are the distribution modes when ``deterministic`` is true and samples
    (using ``self.temperature``) otherwise.

    Returns:
        Tuple ``(actions, internals)``.
    """
    assertions = list()
    if self.config.create_tf_assertions and not independent:
        # When not independent, deterministic is asserted to be false
        false = tf_util.constant(value=False, dtype='bool')
        assertions.append(tf.debugging.assert_equal(x=deterministic, y=false))

    embedding, internals = self.network.apply(
        x=states, horizons=horizons, internals=internals, deterministic=deterministic,
        independent=independent
    )
    if not isinstance(embedding, TensorDict):
        embedding = TensorDict(embedding=embedding)

    def parametrized(name, distribution):
        # Per-distribution embedding with "embedding" as fallback
        key = 'action-embedding' if name is None else name + '-embedding'
        x = embedding.get(key, embedding['embedding'])
        conditions = auxiliaries.get(name, default=TensorDict())
        return distribution.parametrize(x=x, conditions=conditions)

    def fn_mode():
        def function(name, distribution):
            return distribution.mode(
                parameters=parametrized(name, distribution), independent=independent
            )

        return self.distributions.fmap(function=function, cls=TensorDict, with_names=True)

    def fn_sample():
        if isinstance(self.temperature, dict):
            # One temperature per distribution, zipped into the mapped function
            def function(name, distribution, temp):
                return distribution.sample(
                    parameters=parametrized(name, distribution), temperature=temp,
                    independent=independent
                )

            temperature = self.temperature.fmap(
                function=(lambda x: x.value()), cls=TensorDict
            )
            return self.distributions.fmap(
                function=function, cls=TensorDict, with_names=True,
                zip_values=(temperature,)
            )

        # Single temperature shared by all distributions
        temperature = self.temperature.value()

        def function(name, distribution):
            return distribution.sample(
                parameters=parametrized(name, distribution), temperature=temperature,
                independent=independent
            )

        return self.distributions.fmap(function=function, cls=TensorDict, with_names=True)

    with tf.control_dependencies(control_inputs=assertions):
        actions = tf.cond(pred=deterministic, true_fn=fn_mode, false_fn=fn_sample)

    return actions, internals
def parametrize(self, *, x, conditions):
    """Compute categorical distribution parameters, optionally with a temperature.

    ``self.temperature_mode`` selects how logits are derived from the action
    values: ``None`` uses them directly, ``'global'`` divides by a tiled
    ``self.softplus_temperature``, ``'predicted'`` divides by a temperature
    computed from ``x``.

    Returns:
        TensorDict with keys ``probabilities``, ``logits``, ``action_values`` and
        ``state_value``, plus ``temperature`` when a temperature mode is set.
    """
    epsilon = tf_util.constant(value=util.epsilon, dtype='float')
    log_epsilon = tf_util.constant(value=np.log(util.epsilon), dtype='float')
    log_two = tf_util.constant(value=np.log(2.0), dtype='float')

    # Action values
    action_values = self.action_values.apply(x=x)
    action_values = tf.reshape(
        tensor=action_values,
        shape=((-1,) + self.action_spec.shape + (self.action_spec.num_values,))
    )

    # Softplus temperature
    if self.temperature_mode == 'global':
        ones = tuple(1 for _ in range(self.action_spec.rank + 1))
        softplus_temperature = tf.tile(
            input=self.softplus_temperature, multiples=((tf.shape(input=x)[0],) + ones)
        )
    elif self.temperature_mode == 'predicted':
        softplus_temperature = self.softplus_temperature.apply(x=x)
        softplus_temperature = tf.reshape(
            tensor=softplus_temperature, shape=((-1,) + self.action_spec.shape + (1,))
        )

    if self.temperature_mode is None:
        # Logits
        logits = action_values

        # Implicit states value
        state_value = tf.reduce_logsumexp(input_tensor=logits, axis=-1)

    else:
        # Clip softplus_temperature for numerical stability
        # (epsilon < 1.0, hence negative)
        softplus_temperature = tf.clip_by_value(
            t=softplus_temperature, clip_value_min=log_epsilon,
            clip_value_max=-log_epsilon
        )

        # Softplus transformation (based on https://arxiv.org/abs/2007.06059)
        softplus_shift = tf_util.constant(value=0.2, dtype='float')
        numerator = tf.nn.softplus(features=softplus_temperature) + softplus_shift
        temperature = numerator / (log_two + softplus_shift)

        # Logits
        logits = action_values / temperature

        # Implicit states value (temperature without its trailing dimension)
        temperature = tf.squeeze(input=temperature, axis=-1)
        state_value = temperature * tf.reduce_logsumexp(input_tensor=logits, axis=-1)

    # Action masking, affects action_values/probabilities/logits but not state_value
    if self.config.enable_int_action_masking:
        mask = conditions['mask']
        min_float = tf.fill(
            dims=tf.shape(input=action_values),
            value=tf_util.get_dtype(type='float').min
        )
        action_values = tf.where(condition=mask, x=action_values, y=min_float)
        logits = tf.where(condition=mask, x=logits, y=min_float)

    # Softmax for corresponding probabilities
    probabilities = tf.nn.softmax(logits=logits, axis=-1)

    # "Normalized" logits
    logits = tf.math.log(x=(probabilities + epsilon))

    if self.temperature_mode is None:
        return TensorDict(
            probabilities=probabilities, logits=logits, action_values=action_values,
            state_value=state_value
        )

    return TensorDict(
        probabilities=probabilities, temperature=temperature, logits=logits,
        action_values=action_values, state_value=state_value
    )