def sample(self, *, parameters, temperature):
    logits, probabilities, action_values = parameters.get(
        ('logits', 'probabilities', 'action_values'))

    # Distribution parameter summaries
    def fn_summary():
        axis = range(self.action_spec.rank + 1)
        probs = tf.math.reduce_mean(input_tensor=probabilities, axis=axis)
        return [probs[n] for n in range(self.action_spec.num_values)]

    prefix = 'distributions/' + self.name + '-probability'
    names = [prefix + str(n) for n in range(self.action_spec.num_values)]
    dependencies = self.summary(
        label='distribution', name=names, data=fn_summary, step='timesteps')

    # Entropy summary
    def fn_summary():
        entropy = -tf.reduce_sum(input_tensor=(probabilities * logits), axis=-1)
        return tf.math.reduce_mean(input_tensor=entropy)

    name = 'entropies/' + self.name
    dependencies.extend(
        self.summary(label='entropy', name=name, data=fn_summary, step='timesteps'))

    one = tf_util.constant(value=1.0, dtype='float')
    epsilon = tf_util.constant(value=util.epsilon, dtype='float')

    # Deterministic: maximum likelihood action
    definite = tf.argmax(input=action_values, axis=-1)
    definite = tf_util.cast(x=definite, dtype='int')

    # Set logits to minimal value
    min_float = tf.fill(
        dims=tf.shape(input=logits), value=tf_util.get_dtype(type='float').min)
    logits = logits / temperature
    logits = tf.where(condition=(probabilities < epsilon), x=min_float, y=logits)

    # Non-deterministic: sample action using Gumbel distribution
    uniform_distribution = tf.random.uniform(
        shape=tf.shape(input=logits), minval=epsilon, maxval=(one - epsilon),
        dtype=tf_util.get_dtype(type='float'))
    gumbel_distribution = -tf.math.log(x=-tf.math.log(x=uniform_distribution))
    sampled = tf.argmax(input=(logits + gumbel_distribution), axis=-1)
    sampled = tf_util.cast(x=sampled, dtype='int')

    with tf.control_dependencies(control_inputs=dependencies):
        return tf.where(condition=(temperature < epsilon), x=definite, y=sampled)
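# --- Illustrative sketch (NumPy), not Tensorforce code ---
# The Gumbel-max trick used above: adding i.i.d. Gumbel(0, 1) noise to logits and
# taking the argmax yields exact samples from softmax(logits). The empirical
# frequencies below should closely match the softmax probabilities.
import numpy as np

rng = np.random.default_rng(0)
logits = np.array([2.0, 1.0, 0.1])
gumbel = -np.log(-np.log(rng.uniform(size=(100_000, 3))))
samples = np.argmax(logits + gumbel, axis=-1)
empirical = np.bincount(samples, minlength=3) / len(samples)
softmax = np.exp(logits) / np.exp(logits).sum()
print(empirical, softmax)  # the two vectors should agree to ~2 decimal places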
def core_act(self, *, states, internals, auxiliaries, parallel, deterministic, independent):
    assert len(internals) == 0

    actions = TensorDict()
    for name, spec in self.actions_spec.items():
        shape = tf.concat(values=(
            tf_util.cast(x=tf.shape(input=states.value())[:1], dtype='int'),
            tf_util.constant(value=spec.shape, dtype='int')
        ), axis=0)

        if self.action_values is not None and name in self.action_values:
            # If user-specified, choose given action
            action = tf_util.constant(value=self.action_values[name], dtype=spec.type)
            actions[name] = tf.fill(dims=shape, value=action)

        elif self.config.enable_int_action_masking and spec.type == 'int' and \
                spec.num_values is not None:
            # If masking, choose first unmasked action
            mask = auxiliaries[name]['mask']
            choices = tf_util.constant(
                value=list(range(spec.num_values)), dtype='int',
                shape=(tuple(1 for _ in spec.shape) + (1, spec.num_values)))
            one = tf_util.constant(value=1, dtype='int', shape=(1,))
            multiples = tf.concat(values=(shape, one), axis=0)
            choices = tf.tile(input=choices, multiples=multiples)
            choices = tf.boolean_mask(tensor=choices, mask=mask)
            mask = tf_util.cast(x=mask, dtype='int')
            num_valid = tf.math.reduce_sum(input_tensor=mask, axis=(spec.rank + 1))
            num_valid = tf.reshape(tensor=num_valid, shape=(-1,))
            masked_offset = tf.math.cumsum(x=num_valid, axis=0, exclusive=True)
            action = tf.gather(params=choices, indices=masked_offset)
            actions[name] = tf.reshape(tensor=action, shape=shape)

        elif spec.type != 'bool' and spec.min_value is not None:
            if spec.max_value is not None:
                # If min/max_value given, choose mean action
                action = spec.min_value + 0.5 * (spec.max_value - spec.min_value)
                action = tf_util.constant(value=action, dtype=spec.type)
                actions[name] = tf.fill(dims=shape, value=action)
            else:
                # If only min_value given, choose min_value
                action = tf_util.constant(value=spec.min_value, dtype=spec.type)
                actions[name] = tf.fill(dims=shape, value=action)

        elif spec.type != 'bool' and spec.max_value is not None:
            # If only max_value given, choose max_value
            action = tf_util.constant(value=spec.max_value, dtype=spec.type)
            actions[name] = tf.fill(dims=shape, value=action)

        else:
            # Else choose zero
            actions[name] = tf_util.zeros(shape=shape, dtype=spec.type)

    return actions, TensorDict()
def apply(self, *, x):
    x = tf_util.float32(x=x)
    x = self.rnn(inputs=x, initial_state=None)

    if not self.return_final_state:
        x = tf_util.cast(x=x[0], dtype='float')
    elif self.cell_type == 'gru':
        x = tf_util.cast(x=x[1], dtype='float')
    elif self.cell_type == 'lstm':
        x = tf_util.cast(x=tf.concat(values=(x[1], x[2]), axis=1), dtype='float')

    return super().apply(x=x)
def fn_summary():
    one = tf_util.constant(value=1.0, dtype='float')
    digamma_alpha = tf_util.cast(
        x=tf.math.digamma(x=tf_util.float32(x=alpha)), dtype='float')
    digamma_beta = tf_util.cast(
        x=tf.math.digamma(x=tf_util.float32(x=beta)), dtype='float')
    digamma_alpha_beta = tf_util.cast(
        x=tf.math.digamma(x=tf_util.float32(x=alpha_beta)), dtype='float')
    entropy = log_norm - (beta - one) * digamma_beta - (alpha - one) * digamma_alpha + \
        (alpha_beta - one - one) * digamma_alpha_beta
    return tf.math.reduce_mean(input_tensor=entropy)
def entropy(self, *, parameters):
    alpha, beta, alpha_beta, log_norm = parameters.get(
        ('alpha', 'beta', 'alpha_beta', 'log_norm'))

    one = tf_util.constant(value=1.0, dtype='float')
    digamma_alpha = tf_util.cast(
        x=tf.math.digamma(x=tf_util.float32(x=alpha)), dtype='float')
    digamma_beta = tf_util.cast(
        x=tf.math.digamma(x=tf_util.float32(x=beta)), dtype='float')
    digamma_alpha_beta = tf_util.cast(
        x=tf.math.digamma(x=tf_util.float32(x=alpha_beta)), dtype='float')

    return log_norm - (beta - one) * digamma_beta - (alpha - one) * digamma_alpha + \
        (alpha_beta - one - one) * digamma_alpha_beta
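# --- Illustrative cross-check (NumPy/SciPy), not Tensorforce code ---
# The closed-form Beta entropy above, assuming log_norm = ln B(alpha, beta) and
# alpha_beta = alpha + beta, checked against scipy's reference implementation:
# H = ln B(a, b) - (b - 1) psi(b) - (a - 1) psi(a) + (a + b - 2) psi(a + b).
import numpy as np
from scipy.special import betaln, digamma
from scipy.stats import beta as beta_dist

a, b = 2.5, 4.0
closed_form = betaln(a, b) - (b - 1.0) * digamma(b) - (a - 1.0) * digamma(a) \
    + (a + b - 2.0) * digamma(a + b)
assert np.isclose(closed_form, beta_dist(a, b).entropy())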
def parameter_value(self, *, step):
    parameter = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
        boundaries=self.boundaries, values=self.values)(step=step)
    parameter = tf_util.cast(x=parameter, dtype=self.spec.type)
    return parameter
def log_probability(self, *, parameters, action):
    mean, stddev, log_stddev = parameters.get(('mean', 'stddev', 'log_stddev'))

    # Inverse bounded transformation
    if self.bounded_transform is not None:
        if self.action_spec.min_value is not None and self.action_spec.max_value is not None:
            one = tf_util.constant(value=1.0, dtype='float')
            two = tf_util.constant(value=2.0, dtype='float')
            min_value = tf_util.constant(value=self.action_spec.min_value, dtype='float')
            max_value = tf_util.constant(value=self.action_spec.max_value, dtype='float')
            action = two * (action - min_value) / (max_value - min_value) - one

        if self.bounded_transform == 'tanh':
            clip = tf_util.constant(value=(1.0 - util.epsilon), dtype='float')
            action = tf.clip_by_value(t=action, clip_value_min=-clip, clip_value_max=clip)
            action = tf_util.cast(x=tf.math.atanh(x=tf_util.float32(x=action)), dtype='float')

    epsilon = tf_util.constant(value=util.epsilon, dtype='float')
    half = tf_util.constant(value=0.5, dtype='float')
    half_log_two_pi = tf_util.constant(value=(0.5 * np.log(2.0 * np.pi)), dtype='float')
    sq_mean_distance = tf.square(x=(action - mean))
    sq_stddev = tf.maximum(x=tf.square(x=stddev), y=epsilon)

    log_prob = -half * sq_mean_distance / sq_stddev - log_stddev - half_log_two_pi

    if self.bounded_transform == 'tanh':
        # Define locally since the min/max branch above may not have run
        two = tf_util.constant(value=2.0, dtype='float')
        log_two = tf_util.constant(value=np.log(2.0), dtype='float')
        log_prob -= two * (log_two - action - tf.math.softplus(features=(-two * action)))

    return log_prob
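# --- Illustrative check (NumPy), not Tensorforce code ---
# The numerically stable tanh log-det-Jacobian identity applied above:
# log(1 - tanh(u)^2) == 2 * (log 2 - u - softplus(-2u)), which avoids taking the
# log of a quantity that underflows for large |u|.
import numpy as np

u = np.linspace(-5.0, 5.0, 11)
lhs = np.log(1.0 - np.tanh(u) ** 2)
rhs = 2.0 * (np.log(2.0) - u - np.logaddexp(0.0, -2.0 * u))  # logaddexp(0, z) = softplus(z)
assert np.allclose(lhs, rhs)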
def apply_l2_regularization():
    l2_variables = list()
    for variable in self.this_trainable_variables:
        variable = tf_util.cast(x=variable, dtype='float')
        l2_variables.append(tf.reduce_sum(input_tensor=tf.square(x=variable)))
    return l2_regularization * tf.math.add_n(inputs=l2_variables)
def kl_divergence(self, *, parameters1, parameters2):
    alpha1, beta1, alpha_beta1, log_norm1 = parameters1.get(
        ('alpha', 'beta', 'alpha_beta', 'log_norm'))
    alpha2, beta2, alpha_beta2, log_norm2 = parameters2.get(
        ('alpha', 'beta', 'alpha_beta', 'log_norm'))

    digamma_alpha1 = tf_util.cast(
        x=tf.math.digamma(x=tf_util.float32(x=alpha1)), dtype='float')
    digamma_beta1 = tf_util.cast(
        x=tf.math.digamma(x=tf_util.float32(x=beta1)), dtype='float')
    digamma_alpha_beta1 = tf_util.cast(
        x=tf.math.digamma(x=tf_util.float32(x=alpha_beta1)), dtype='float')

    return log_norm2 - log_norm1 - digamma_beta1 * (beta2 - beta1) - \
        digamma_alpha1 * (alpha2 - alpha1) + digamma_alpha_beta1 * \
        (alpha_beta2 - alpha_beta1)
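# --- Illustrative cross-check (NumPy/SciPy), not Tensorforce code ---
# The closed-form Beta KL divergence above (same parameter correspondence as for
# the entropy), validated against direct numerical integration of p log(p/q).
import numpy as np
from scipy.integrate import quad
from scipy.special import betaln, digamma
from scipy.stats import beta as beta_dist

a1, b1, a2, b2 = 2.0, 3.0, 4.0, 1.5
closed_form = betaln(a2, b2) - betaln(a1, b1) - digamma(b1) * (b2 - b1) \
    - digamma(a1) * (a2 - a1) + digamma(a1 + b1) * ((a2 + b2) - (a1 + b1))
p, q = beta_dist(a1, b1), beta_dist(a2, b2)
numeric, _ = quad(lambda x: p.pdf(x) * (p.logpdf(x) - q.logpdf(x)), 0.0, 1.0)
assert np.isclose(closed_form, numeric, atol=1e-6)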
def step(self, *, arguments, variables, fn_loss, **kwargs):
    learning_rate = self.learning_rate.value()

    unperturbed_loss = fn_loss(**arguments.to_kwargs())

    deltas = [tf.zeros_like(input=variable) for variable in variables]
    previous_perturbations = [tf.zeros_like(input=variable) for variable in variables]

    def body(deltas, previous_perturbations):
        with tf.control_dependencies(control_inputs=deltas):
            perturbations = [
                learning_rate * tf.random.normal(
                    shape=tf_util.shape(x=variable), dtype=tf_util.get_dtype(type='float'))
                for variable in variables
            ]
            perturbation_deltas = [
                pert - prev_pert
                for pert, prev_pert in zip(perturbations, previous_perturbations)
            ]
            assignments = list()
            for variable, delta in zip(variables, perturbation_deltas):
                assignments.append(variable.assign_add(delta=delta, read_value=False))

        with tf.control_dependencies(control_inputs=assignments):
            perturbed_loss = fn_loss(**arguments.to_kwargs())
            direction = tf.math.sign(x=(unperturbed_loss - perturbed_loss))
            deltas = [
                delta + direction * perturbation
                for delta, perturbation in zip(deltas, perturbations)
            ]

        return deltas, perturbations

    num_samples = self.num_samples.value()
    deltas, perturbations = tf.while_loop(
        cond=tf_util.always_true, body=body,
        loop_vars=(deltas, previous_perturbations),
        maximum_iterations=tf_util.int32(x=num_samples))

    with tf.control_dependencies(control_inputs=deltas):
        num_samples = tf_util.cast(x=num_samples, dtype='float')
        deltas = [delta / num_samples for delta in deltas]
        perturbation_deltas = [
            delta - pert for delta, pert in zip(deltas, perturbations)
        ]
        assignments = list()
        for variable, delta in zip(variables, perturbation_deltas):
            assignments.append(variable.assign_add(delta=delta, read_value=False))

    with tf.control_dependencies(control_inputs=assignments):
        # Trivial operation to enforce control dependency
        return [tf_util.identity(input=delta) for delta in deltas]
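# --- Illustrative sketch (NumPy), not Tensorforce code ---
# The evolution-strategies-style update implemented above: sample random
# perturbations, sign each one by whether it reduced the loss, and apply the
# averaged signed perturbation. All names here are hypothetical.
import numpy as np

def es_style_step(params, loss_fn, learning_rate=0.1, num_samples=8, rng=None):
    rng = np.random.default_rng() if rng is None else rng
    unperturbed_loss = loss_fn(params)
    delta = np.zeros_like(params)
    for _ in range(num_samples):
        perturbation = learning_rate * rng.standard_normal(params.shape)
        # +1 if the perturbation improved the loss, -1 otherwise
        delta += np.sign(unperturbed_loss - loss_fn(params + perturbation)) * perturbation
    return params + delta / num_samples

params = np.array([2.0, -3.0])
for _ in range(200):
    params = es_style_step(params, loss_fn=lambda p: np.sum(p ** 2))
print(params)  # should end up near [0, 0], within the perturbation scale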
def apply(self, *, x, horizons, internals, deterministic, independent):
    if x.is_singleton():
        inputs = x.singleton()
    else:
        inputs = list(x.values())

    x = self.keras_model(inputs=inputs, training=(not independent))

    return tf_util.cast(x=x, dtype='float'), internals
def mode(self, *, parameters, independent):
    if self.temperature_mode is None:
        probabilities, action_values = parameters.get(('probabilities', 'action_values'))
    else:
        probabilities, temperature, action_values = parameters.get(
            ('probabilities', 'temperature', 'action_values'))

    # Distribution parameter summaries
    dependencies = list()
    if not independent:

        def fn_summary():
            axis = range(self.action_spec.rank + 1)
            probs = tf.math.reduce_mean(input_tensor=probabilities, axis=axis)
            probs = [probs[n] for n in range(self.action_spec.num_values)]
            if self.temperature_mode is not None:
                probs.append(tf.math.reduce_mean(input_tensor=temperature, axis=axis))
            return probs

        prefix = 'distributions/' + self.name + '-probability'
        names = [prefix + str(n) for n in range(self.action_spec.num_values)]
        if self.temperature_mode is not None:
            names.append('distributions/' + self.name + '-temperature')
        dependencies.extend(self.summary(
            label='distribution', name=names, data=fn_summary, step='timesteps'))

    # Distribution parameter tracking
    def fn_tracking():
        return tf.math.reduce_mean(input_tensor=probabilities, axis=0)

    dependencies.extend(self.track(
        label='distribution', name='probabilities', data=fn_tracking))

    if self.temperature_mode is not None:

        def fn_tracking():
            return tf.math.reduce_mean(input_tensor=temperature, axis=0)

        dependencies.extend(self.track(
            label='distribution', name='temperature', data=fn_tracking))

    with tf.control_dependencies(control_inputs=dependencies):
        action = tf.math.argmax(input=action_values, axis=-1)
        return tf_util.cast(x=action, dtype='int')
def parameter_value(self, *, step):
    delta = self.theta * (self.mu - self.process) + self.sigma * tf.random.normal(shape=())

    if self.absolute:
        parameter = self.process.assign(value=tf.math.abs(x=(self.process + delta)))
    else:
        parameter = self.process.assign_add(delta=delta)

    parameter = tf_util.cast(x=parameter, dtype=self.spec.type)
    return parameter
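# --- Illustrative sketch (NumPy), not Tensorforce code ---
# The Ornstein-Uhlenbeck update applied above: x <- x + theta * (mu - x) + sigma * N(0, 1),
# i.e. mean-reverting noise that is correlated across consecutive steps.
import numpy as np

def ou_trajectory(theta=0.15, mu=0.0, sigma=0.2, steps=10_000, rng=None):
    rng = np.random.default_rng() if rng is None else rng
    x, xs = 0.0, []
    for _ in range(steps):
        x += theta * (mu - x) + sigma * rng.standard_normal()
        xs.append(x)
    return np.array(xs)

samples = ou_trajectory()
print(samples.mean(), samples.std())  # hovers around mu with bounded spread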
def iterative_apply(self, *, x, internals):
    x = tf_util.float32(x=x)
    state = tf_util.float32(x=internals['state'])

    if self.cell_type == 'gru':
        state = (state,)
    elif self.cell_type == 'lstm':
        state = (state[:, 0, :], state[:, 1, :])

    x, state = self.cell(inputs=x, states=state)

    if self.cell_type == 'gru':
        state = state[0]
    elif self.cell_type == 'lstm':
        state = tf.stack(values=state, axis=1)

    x = tf_util.cast(x=x, dtype='float')
    internals['state'] = tf_util.cast(x=state, dtype='float')
    return x, internals
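# --- Illustrative sketch (TensorFlow), not Tensorforce code ---
# Driving a Keras recurrent cell one step at a time, as the method above does;
# the stacked (batch, 2, units) layout mirrors its tf.stack(..., axis=1) packing
# of the LSTM (hidden, cell) state into a single internals tensor.
import tensorflow as tf

cell = tf.keras.layers.LSTMCell(units=8)
x = tf.random.normal(shape=(4, 3))                        # batch of 4, input size 3
state = [tf.zeros(shape=(4, 8)), tf.zeros(shape=(4, 8))]  # (hidden, cell) state
y, state = cell(inputs=x, states=state)
packed = tf.stack(state, axis=1)                          # shape (4, 2, 8)
print(y.shape, packed.shape)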
def apply(self, *, x):
    output_shape = tf.concat(values=[
        tf_util.cast(x=tf.shape(input=x)[:1], dtype='int'),
        tf_util.constant(value=self.output_shape, dtype='int')
    ], axis=0)
    x = tf.nn.conv2d_transpose(
        input=x, filters=self.weights, output_shape=tf_util.int32(x=output_shape),
        strides=self.stride, padding=self.padding.upper(), dilations=self.dilation)
    return super().apply(x=x)
def step(self, *, arguments, **kwargs):
    if not self.is_fraction_absolute and self.fraction.is_constant(value=1.0):
        return self.optimizer.step(arguments=arguments, **kwargs)

    batch_size = tf_util.cast(x=tf.shape(input=arguments['reward'])[0], dtype='int')
    if self.is_fraction_absolute:
        fraction = self.fraction.is_constant()
        if fraction is None:
            fraction = self.fraction.value()
    else:
        fraction = self.fraction.value() * tf_util.cast(x=batch_size, dtype='float')
        fraction = tf_util.cast(x=fraction, dtype='int')
    one = tf_util.constant(value=1, dtype='int')
    fraction = tf.math.maximum(x=fraction, y=one)

    def subsampled_step():
        subsampled_arguments = TensorDict()
        indices = tf.random.uniform(
            shape=(fraction,), maxval=batch_size, dtype=tf_util.get_dtype(type='int'))

        if 'states' in arguments and 'horizons' in arguments:
            horizons = tf.gather(params=arguments['horizons'], indices=indices)
            starts = horizons[:, 0]
            lengths = horizons[:, 1]
            states_indices = tf.ragged.range(starts=starts, limits=(starts + lengths)).values
            function = (lambda x: tf.gather(params=x, indices=states_indices))
            subsampled_arguments['states'] = arguments['states'].fmap(function=function)
            starts = tf.math.cumsum(x=lengths, exclusive=True)
            subsampled_arguments['horizons'] = tf.stack(values=(starts, lengths), axis=1)

        for name, argument in arguments.items():
            if name not in subsampled_arguments:
                subsampled_arguments[name] = tf.gather(params=argument, indices=indices)

        return self.optimizer.step(arguments=subsampled_arguments, **kwargs)

    def normal_step():
        return self.optimizer.step(arguments=arguments, **kwargs)

    return tf.cond(pred=(fraction < batch_size), true_fn=subsampled_step, false_fn=normal_step)
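# --- Illustrative sketch (TensorFlow), not Tensorforce code ---
# The horizons re-indexing trick above: gather each sampled example's state
# window via a ragged range, then rebuild the start offsets with an exclusive
# cumsum so the new horizons index into the gathered states.
import tensorflow as tf

states = tf.range(10)                              # stand-in for per-timestep states
horizons = tf.constant([[0, 3], [3, 2], [5, 5]])   # (start, length) per example
indices = tf.constant([2, 0])                      # subsampled example indices

sub = tf.gather(horizons, indices)
starts, lengths = sub[:, 0], sub[:, 1]
flat = tf.ragged.range(starts, starts + lengths).values
sub_states = tf.gather(states, flat)               # [5 6 7 8 9 0 1 2]
new_starts = tf.math.cumsum(lengths, exclusive=True)
new_horizons = tf.stack((new_starts, lengths), axis=1)  # [[0 5] [5 3]]
print(sub_states.numpy(), new_horizons.numpy())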
def mode(self, *, parameters):
    probabilities, action_values = parameters.get(('probabilities', 'action_values'))

    # Distribution parameter tracking
    def fn_tracking():
        return tf.math.reduce_mean(input_tensor=probabilities, axis=0)

    dependencies = self.track(label='distribution', name='probabilities', data=fn_tracking)

    with tf.control_dependencies(control_inputs=dependencies):
        action = tf.math.argmax(input=action_values, axis=-1)
        return tf_util.cast(x=action, dtype='int')
def apply(self, *, x, independent):
    dependencies = list()
    if independent:
        mean = self.moving_mean
        variance = self.moving_variance
    else:
        one = tf_util.constant(value=1.0, dtype='float')
        axes = (0,) + tuple(1 + axis for axis in self.axes)

        decay = self.decay.value()
        batch_size = tf_util.cast(x=tf.shape(input=x)[0], dtype='float')
        decay = tf.math.pow(x=decay, y=batch_size)
        condition = tf.math.logical_or(
            x=self.after_first_call, y=tf.math.equal(x=batch_size, y=0))

        mean = tf.math.reduce_mean(input_tensor=x, axis=axes, keepdims=True)
        mean = tf.where(
            condition=condition, x=(decay * self.moving_mean + (one - decay) * mean), y=mean)

        variance = tf.reduce_mean(
            input_tensor=tf.math.squared_difference(x=x, y=mean), axis=axes, keepdims=True)
        variance = tf.where(
            condition=condition,
            x=(decay * self.moving_variance + (one - decay) * variance), y=variance)

        with tf.control_dependencies(control_inputs=(mean, variance)):
            value = tf.math.logical_or(x=self.after_first_call, y=(batch_size > 0))
            dependencies.append(self.after_first_call.assign(value=value, read_value=False))

        mean = self.moving_mean.assign(value=mean)
        variance = self.moving_variance.assign(value=variance)

    epsilon = tf_util.constant(value=util.epsilon, dtype='float')
    reciprocal_stddev = tf.math.rsqrt(x=tf.maximum(x=variance, y=epsilon))

    with tf.control_dependencies(control_inputs=dependencies):
        x = (x - tf.stop_gradient(input=mean)) * tf.stop_gradient(input=reciprocal_stddev)
    return x
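# --- Illustrative sketch (NumPy), not Tensorforce code ---
# The exponential moving normalization above, in scalar form: the decay is raised
# to the batch size so the statistics age per sample rather than per batch, and
# the first batch initializes the statistics directly.
import numpy as np

class EMANormalizer:
    def __init__(self, decay=0.999):
        self.decay, self.mean, self.var, self.initialized = decay, 0.0, 1.0, False

    def __call__(self, batch):
        d = self.decay ** len(batch)  # per-sample aging
        mean, var = batch.mean(), batch.var()
        if self.initialized:
            mean = d * self.mean + (1.0 - d) * mean
            var = d * self.var + (1.0 - d) * var
        self.mean, self.var, self.initialized = mean, var, True
        return (batch - mean) / np.sqrt(max(var, 1e-8))

normalizer = EMANormalizer()
print(normalizer(np.random.randn(32) * 5.0 + 2.0).std())  # roughly 1.0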
def independent_act(self, *, states, internals=None, auxiliaries=None):
    if internals is None:
        assert len(self.internals_spec) == 0
        internals = TensorDict()
    if auxiliaries is None:
        assert len(self.auxiliaries_spec) == 0
        auxiliaries = TensorDict()

    true = tf_util.constant(value=True, dtype='bool')
    batch_size = tf_util.cast(x=tf.shape(input=states.value())[0], dtype='int')

    # Input assertions
    assertions = list()
    if self.config.create_tf_assertions:
        assertions.extend(self.states_spec.tf_assert(
            x=states, batch_size=batch_size,
            message='Agent.independent_act: invalid {issue} for {name} state input.'))
        assertions.extend(self.internals_spec.tf_assert(
            x=internals, batch_size=batch_size,
            message='Agent.independent_act: invalid {issue} for {name} internal input.'))
        assertions.extend(self.auxiliaries_spec.tf_assert(
            x=auxiliaries, batch_size=batch_size,
            message='Agent.independent_act: invalid {issue} for {name} input.'))
        # Mask assertions
        if self.config.enable_int_action_masking:
            for name, spec in self.actions_spec.items():
                if spec.type == 'int':
                    assertions.append(tf.debugging.assert_equal(
                        x=tf.reduce_all(input_tensor=tf.math.reduce_any(
                            input_tensor=auxiliaries[name]['mask'], axis=(spec.rank + 1))),
                        y=true,
                        message="Agent.independent_act: at least one action has to be valid."))

    with tf.control_dependencies(control_inputs=assertions):
        # Core act
        parallel = tf_util.zeros(shape=(1,), dtype='int')
        actions, internals = self.core_act(
            states=states, internals=internals, auxiliaries=auxiliaries,
            parallel=parallel, independent=True)
        # Skip action assertions

        # SavedModel requires flattened output
        if len(self.internals_spec) > 0:
            return OrderedDict(TensorDict(actions=actions, internals=internals))
        else:
            return OrderedDict(actions)
def fn_sample():
    # Set logits to minimal value
    min_float = tf.fill(
        dims=tf.shape(input=logits), value=tf_util.get_dtype(type='float').min)
    temp_logits = logits / tf.math.maximum(x=temperature, y=epsilon)
    temp_logits = tf.where(condition=(probabilities < epsilon), x=min_float, y=temp_logits)

    # Non-deterministic: sample action using Gumbel distribution
    one = tf_util.constant(value=1.0, dtype='float')
    uniform_distribution = tf.random.uniform(
        shape=tf.shape(input=temp_logits), minval=epsilon, maxval=(one - epsilon),
        dtype=tf_util.get_dtype(type='float'))
    # Second log numerically stable since log(1 - eps) ~ -eps
    gumbel_distribution = -tf.math.log(x=-tf.math.log(x=uniform_distribution))
    action = tf.math.argmax(input=(temp_logits + gumbel_distribution), axis=-1)
    return tf_util.cast(x=action, dtype='int')
def apply(self, *, x):
    queries = self.query.apply(x=x)
    keys = self.key.apply(x=x)
    values = self.value.apply(x=x)

    if self.input_spec.rank > 2:
        batch_size = tf_util.cast(x=tf.shape(input=x)[:1], dtype='int')
        flattened_shape = tf_util.constant(
            value=(util.product(xs=self.input_spec.shape[:-1]), self.attention_size),
            dtype='int')
        flattened_shape = tf.concat(values=(batch_size, flattened_shape), axis=0)
        queries = tf.reshape(tensor=queries, shape=flattened_shape)
        keys = tf.reshape(tensor=keys, shape=flattened_shape)
        flattened_shape = tf_util.constant(
            value=(util.product(xs=self.input_spec.shape[:-1]), self.size), dtype='int')
        flattened_shape = tf.concat(values=(batch_size, flattened_shape), axis=0)
        values = tf.reshape(tensor=values, shape=flattened_shape)

    attention = tf.linalg.matmul(a=queries, b=keys, transpose_b=True)
    attention = attention / tf_util.constant(
        value=np.sqrt(self.attention_size), dtype='float')
    attention = tf.nn.softmax(logits=attention, axis=-1)
    x = tf.linalg.matmul(a=attention, b=values)

    if self.input_spec.rank > 2:
        shape = tf_util.constant(value=self.output_spec().shape, dtype='int')
        shape = tf.concat(values=(batch_size, shape), axis=0)
        x = tf.reshape(tensor=x, shape=shape)

    return super().apply(x=x)
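# --- Illustrative sketch (TensorFlow), not Tensorforce code ---
# The scaled dot-product attention computed above, with explicit Q/K/V
# projections; the weight matrices here are random stand-ins.
import tensorflow as tf

batch, length, dim, attn_size = 2, 5, 16, 8
x = tf.random.normal(shape=(batch, length, dim))
w_q = tf.random.normal(shape=(dim, attn_size))
w_k = tf.random.normal(shape=(dim, attn_size))
w_v = tf.random.normal(shape=(dim, attn_size))

q, k, v = x @ w_q, x @ w_k, x @ w_v
scores = tf.linalg.matmul(q, k, transpose_b=True) / (attn_size ** 0.5)
weights = tf.nn.softmax(scores, axis=-1)  # each row sums to 1 over key positions
out = tf.linalg.matmul(weights, v)        # shape (batch, length, attn_size)
print(out.shape)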
def iterative_body(self, x, indices, remaining, current_x, current_internals):
    batch_size = tf_util.cast(x=tf.shape(input=current_x)[:1], dtype='int')
    zeros = tf_util.zeros(shape=batch_size, dtype='int')
    ones = tf_util.ones(shape=batch_size, dtype='int')
    batch_size = batch_size[0]

    current_x = tf.gather(params=x, indices=indices)
    next_x, next_internals = self.iterative_apply(x=current_x, internals=current_internals)

    with tf.control_dependencies(control_inputs=(current_x, next_x)):
        is_finished = tf.math.equal(x=remaining, y=zeros)
        if isinstance(next_internals, dict):
            for name, current_internal, next_internal in current_internals.zip_items(
                    next_internals):
                condition = is_finished
                for _ in range(tf_util.rank(x=current_internal) - 1):
                    condition = tf.expand_dims(input=condition, axis=1)
                next_internals[name] = tf.where(
                    condition=condition, x=current_internal, y=next_internal)
        else:
            condition = is_finished
            for _ in range(tf_util.rank(x=current_internals) - 1):
                condition = tf.expand_dims(input=condition, axis=1)
            next_internals = tf.where(
                condition=condition, x=current_internals, y=next_internals)

        remaining -= tf.where(condition=is_finished, x=zeros, y=ones)
        indices += tf.where(
            condition=tf.math.equal(x=remaining, y=zeros), x=zeros, y=ones)

    return x, indices, remaining, next_x, next_internals
def mode(self, *, parameters):
    action_values = parameters['action_values']
    action = tf.math.argmax(input=action_values, axis=-1)
    return tf_util.cast(x=action, dtype='int')
def fn_mode():
    # Deterministic: maximum likelihood action
    action = tf.math.argmax(input=action_values, axis=-1)
    return tf_util.cast(x=action, dtype='int')
def core_act(self, *, states, internals, auxiliaries, parallel, deterministic, independent):
    assert len(internals) == 0

    actions = TensorDict()
    for name, spec in self.actions_spec.items():
        shape = tf.concat(values=(
            tf_util.cast(x=tf.shape(input=states.value())[:1], dtype='int'),
            tf_util.constant(value=spec.shape, dtype='int')
        ), axis=0)

        if spec.type == 'bool':
            # Random bool action: uniform[True, False]
            half = tf_util.constant(value=0.5, dtype='float')
            uniform = tf.random.uniform(shape=shape, dtype=tf_util.get_dtype(type='float'))
            actions[name] = (uniform < half)

        elif self.config.enable_int_action_masking and spec.type == 'int' and \
                spec.num_values is not None:
            # Random masked action: uniform[unmasked]
            # (Similar code as for Model.apply_exploration)
            mask = auxiliaries[name]['mask']
            choices = tf_util.constant(
                value=list(range(spec.num_values)), dtype=spec.type,
                shape=(tuple(1 for _ in spec.shape) + (1, spec.num_values)))
            one = tf_util.constant(value=1, dtype='int', shape=(1,))
            multiples = tf.concat(values=(shape, one), axis=0)
            choices = tf.tile(input=choices, multiples=multiples)
            choices = tf.boolean_mask(tensor=choices, mask=mask)
            mask = tf_util.cast(x=mask, dtype='int')
            num_valid = tf.math.reduce_sum(input_tensor=mask, axis=(spec.rank + 1))
            num_valid = tf.reshape(tensor=num_valid, shape=(-1,))
            masked_offset = tf.math.cumsum(x=num_valid, axis=0, exclusive=True)
            uniform = tf.random.uniform(shape=shape, dtype=tf_util.get_dtype(type='float'))
            uniform = tf.reshape(tensor=uniform, shape=(-1,))
            num_valid = tf_util.cast(x=num_valid, dtype='float')
            random_offset = tf.dtypes.cast(x=(uniform * num_valid), dtype=tf.dtypes.int64)
            action = tf.gather(params=choices, indices=(masked_offset + random_offset))
            actions[name] = tf.reshape(tensor=action, shape=shape)

        elif spec.type != 'bool' and spec.min_value is not None:
            if spec.max_value is not None:
                # Random bounded action: uniform[min_value, max_value]
                actions[name] = tf.random.uniform(
                    shape=shape, minval=spec.min_value, maxval=spec.max_value,
                    dtype=spec.tf_type())
            else:
                # Random left-bounded action: not implemented
                raise NotImplementedError

        elif spec.type != 'bool' and spec.max_value is not None:
            # Random right-bounded action: not implemented
            raise NotImplementedError

        else:
            # Random unbounded int/float action
            actions[name] = tf.random.normal(shape=shape, dtype=spec.tf_type())

    return actions, TensorDict()
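# --- Illustrative sketch (TensorFlow), not Tensorforce code ---
# The masked uniform sampling above: flatten every batch element's valid choices
# into one vector, then index each element's segment via an exclusive-cumsum
# offset plus a random offset within the segment.
import tensorflow as tf

mask = tf.constant([[True, False, True, True],
                    [False, True, False, False]])                     # valid actions
choices = tf.boolean_mask(tf.tile(tf.range(4)[None], [2, 1]), mask)  # [0 2 3 1]
num_valid = tf.reduce_sum(tf.cast(mask, tf.int64), axis=1)           # [3 1]
segment_start = tf.math.cumsum(num_valid, exclusive=True)            # [0 3]
uniform = tf.random.uniform(shape=(2,))
random_offset = tf.cast(uniform * tf.cast(num_valid, tf.float32), tf.int64)
actions = tf.gather(choices, segment_start + random_offset)
print(actions.numpy())  # element 0 is one of {0, 2, 3}; element 1 is always 1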
def observe(self, *, terminal, reward, parallel):
    zero = tf_util.constant(value=0, dtype='int')
    one = tf_util.constant(value=1, dtype='int')
    batch_size = tf_util.cast(x=tf.shape(input=terminal)[0], dtype='int')
    expanded_parallel = tf.expand_dims(input=tf.expand_dims(input=parallel, axis=0), axis=1)
    is_terminal = tf.math.greater(x=terminal[-1], y=zero)

    # Input assertions
    assertions = list()
    if self.config.create_tf_assertions:
        assertions.extend(self.terminal_spec.tf_assert(
            x=terminal, batch_size=batch_size,
            message='Agent.observe: invalid {issue} for terminal input.'))
        assertions.extend(self.reward_spec.tf_assert(
            x=reward, batch_size=batch_size,
            message='Agent.observe: invalid {issue} for reward input.'))
        assertions.extend(self.parallel_spec.tf_assert(
            x=parallel,
            message='Agent.observe: invalid {issue} for parallel input.'))
        # Assertion: at most one terminal
        num_terms = tf.math.count_nonzero(
            input=terminal, dtype=tf_util.get_dtype(type='int'))
        assertions.append(tf.debugging.assert_less_equal(
            x=num_terms, y=one,
            message="Agent.observe: input contains more than one terminal."))
        # Assertion: if terminal, last timestep in batch
        assertions.append(tf.debugging.assert_equal(
            x=tf.math.greater(x=num_terms, y=zero), y=is_terminal,
            message="Agent.observe: terminal is not the last input timestep."))

    with tf.control_dependencies(control_inputs=assertions):
        dependencies = list()

        # Reward summary
        if self.summaries == 'all' or 'reward' in self.summaries:
            with self.summarizer.as_default():
                x = tf.math.reduce_mean(input_tensor=reward)
                dependencies.append(
                    tf.summary.scalar(name='reward', data=x, step=self.timesteps))

        # Update episode length/reward
        updates = tf.expand_dims(input=batch_size, axis=0)
        value = tf.tensor_scatter_nd_add(
            tensor=self.episode_length, indices=expanded_parallel, updates=updates)
        dependencies.append(self.episode_length.assign(value=value))
        # sparse_delta = tf.IndexedSlices(values=batch_size, indices=parallel)
        # dependencies.append(self.episode_length.scatter_add(sparse_delta=sparse_delta))
        sum_reward = tf.math.reduce_sum(input_tensor=reward, keepdims=True)
        value = tf.tensor_scatter_nd_add(
            tensor=self.episode_reward, indices=expanded_parallel, updates=sum_reward)
        dependencies.append(self.episode_reward.assign(value=value))
        # sum_reward = tf.math.reduce_sum(input_tensor=reward)
        # sparse_delta = tf.IndexedSlices(values=sum_reward, indices=parallel)
        # dependencies.append(self.episode_reward.scatter_add(sparse_delta=sparse_delta))

        # Core observe (before terminal handling)
        updated = self.core_observe(terminal=terminal, reward=reward, parallel=parallel)
        dependencies.append(updated)

    # Handle terminal (after core observe and episode reward)
    with tf.control_dependencies(control_inputs=dependencies):

        def fn_terminal():
            operations = list()

            # Reset internals
            def function(spec, initial):
                return tf_util.constant(value=initial, dtype=spec.type)

            initials = self.internals_spec.fmap(
                function=function, cls=TensorDict, zip_values=self.initial_internals)
            for name, previous, initial in self.previous_internals.zip_items(initials):
                updates = tf.expand_dims(input=initial, axis=0)
                value = tf.tensor_scatter_nd_update(
                    tensor=previous, indices=expanded_parallel, updates=updates)
                operations.append(previous.assign(value=value))
                # sparse_delta = tf.IndexedSlices(values=initial, indices=parallel)
                # operations.append(previous.scatter_update(sparse_delta=sparse_delta))

            # Episode length/reward summaries (before episode reward reset / episodes increment)
            dependencies = list()
            if self.summaries == 'all' or 'reward' in self.summaries:
                with self.summarizer.as_default():
                    x = tf.gather(params=self.episode_length, indices=parallel)
                    dependencies.append(
                        tf.summary.scalar(name='episode-length', data=x, step=self.episodes))
                    x = tf.gather(params=self.episode_reward, indices=parallel)
                    dependencies.append(
                        tf.summary.scalar(name='episode-reward', data=x, step=self.episodes))

            # Reset episode length/reward
            with tf.control_dependencies(control_inputs=dependencies):
                zeros = tf_util.zeros(shape=(1,), dtype='int')
                value = tf.tensor_scatter_nd_update(
                    tensor=self.episode_length, indices=expanded_parallel, updates=zeros)
                operations.append(self.episode_length.assign(value=value))
                # sparse_delta = tf.IndexedSlices(values=zero, indices=parallel)
                # operations.append(self.episode_length.scatter_update(sparse_delta=sparse_delta))
                zeros = tf_util.zeros(shape=(1,), dtype='float')
                value = tf.tensor_scatter_nd_update(
                    tensor=self.episode_reward, indices=expanded_parallel, updates=zeros)
                operations.append(self.episode_reward.assign(value=value))
                # zero_float = tf_util.constant(value=0.0, dtype='float')
                # sparse_delta = tf.IndexedSlices(values=zero_float, indices=parallel)
                # operations.append(self.episode_reward.scatter_update(sparse_delta=sparse_delta))

                # Increment episodes counter
                operations.append(self.episodes.assign_add(delta=one, read_value=False))

            return tf.group(*operations)

        handle_terminal = tf.cond(pred=is_terminal, true_fn=fn_terminal, false_fn=tf.no_op)

    with tf.control_dependencies(control_inputs=(handle_terminal,)):
        episodes = tf_util.identity(input=self.episodes)
        updates = tf_util.identity(input=self.updates)
        return updated, episodes, updates
def act(self, *, states, auxiliaries, parallel):
    batch_size = tf_util.cast(x=tf.shape(input=parallel)[0], dtype='int')

    # Input assertions
    assertions = list()
    if self.config.create_tf_assertions:
        assertions.extend(self.states_spec.tf_assert(
            x=states, batch_size=batch_size,
            message='Agent.act: invalid {issue} for {name} state input.'))
        assertions.extend(self.auxiliaries_spec.tf_assert(
            x=auxiliaries, batch_size=batch_size,
            message='Agent.act: invalid {issue} for {name} input.'))
        assertions.extend(self.parallel_spec.tf_assert(
            x=parallel, batch_size=batch_size,
            message='Agent.act: invalid {issue} for parallel input.'))
        # Mask assertions
        if self.config.enable_int_action_masking:
            true = tf_util.constant(value=True, dtype='bool')
            for name, spec in self.actions_spec.items():
                if spec.type == 'int':
                    assertions.append(tf.debugging.assert_equal(
                        x=tf.reduce_all(input_tensor=tf.math.reduce_any(
                            input_tensor=auxiliaries[name]['mask'], axis=(spec.rank + 1))),
                        y=true,
                        message="Agent.act: at least one action has to be valid."))

    with tf.control_dependencies(control_inputs=assertions):
        # Retrieve internals
        internals = self.previous_internals.fmap(
            function=(lambda x: tf.gather(params=x, indices=parallel)), cls=TensorDict)

        # Core act
        deterministic = tf_util.constant(value=False, dtype='bool')
        actions, internals = self.core_act(
            states=states, internals=internals, auxiliaries=auxiliaries,
            parallel=parallel, deterministic=deterministic, independent=False)

    # Action assertions
    assertions = list()
    if self.config.create_tf_assertions:
        assertions.extend(self.actions_spec.tf_assert(x=actions, batch_size=batch_size))
        if self.config.enable_int_action_masking:
            for name, spec, action in self.actions_spec.zip_items(actions):
                if spec.type == 'int':
                    is_valid = tf.reduce_all(input_tensor=tf.gather(
                        params=auxiliaries[name]['mask'],
                        indices=tf.expand_dims(input=action, axis=(spec.rank + 1)),
                        batch_dims=(spec.rank + 1)))
                    assertions.append(tf.debugging.assert_equal(
                        x=is_valid, y=true, message="Action mask check."))

    # Remember internals
    dependencies = list()
    for name, previous, internal in self.previous_internals.zip_items(internals):
        indices = tf.expand_dims(input=parallel, axis=1)
        value = tf.tensor_scatter_nd_update(
            tensor=previous, indices=indices, updates=internal)
        dependencies.append(previous.assign(value=value))
        # sparse_delta = tf.IndexedSlices(values=internal, indices=parallel)
        # dependencies.append(previous.scatter_update(sparse_delta=sparse_delta))

    # Increment timestep (after core act)
    with tf.control_dependencies(control_inputs=(actions.flatten() + internals.flatten())):
        dependencies.append(self.timesteps.assign_add(delta=batch_size, read_value=False))

    with tf.control_dependencies(control_inputs=(dependencies + assertions)):
        actions = actions.fmap(function=tf_util.identity)
        timestep = tf_util.identity(input=self.timesteps)
        return actions, timestep
def parameter_value(self, *, step):
    initial_value = tf_util.constant(value=self.initial_value, dtype='float')

    if self.decay == 'cosine':
        assert 0.0 <= self.kwargs.get('alpha', 0.0) <= 1.0
        parameter = tf.keras.experimental.CosineDecay(
            initial_learning_rate=initial_value, decay_steps=(self.num_steps + 1),
            alpha=self.kwargs.get('alpha', 0.0))(step=step)

    elif self.decay == 'cosine_restarts':
        assert 0.0 <= self.kwargs.get('alpha', 0.0) <= 1.0
        parameter = tf.keras.experimental.CosineDecayRestarts(
            initial_learning_rate=initial_value, first_decay_steps=(self.num_steps + 1),
            t_mul=self.kwargs.get('t_mul', 2.0), m_mul=self.kwargs.get('m_mul', 1.0),
            alpha=self.kwargs.get('alpha', 0.0))(step=step)

    elif self.decay == 'exponential':
        assert self.kwargs['decay_rate'] >= 0.0
        parameter = tf.keras.optimizers.schedules.ExponentialDecay(
            initial_learning_rate=initial_value, decay_steps=(self.num_steps + 1),
            decay_rate=self.kwargs['decay_rate'],
            staircase=self.kwargs.get('staircase', False))(step=step)

    elif self.decay == 'inverse_time':
        assert self.kwargs['decay_rate'] >= 0.0
        parameter = tf.keras.optimizers.schedules.InverseTimeDecay(
            initial_learning_rate=initial_value, decay_steps=(self.num_steps + 1),
            decay_rate=self.kwargs['decay_rate'],
            staircase=self.kwargs.get('staircase', False))(step=step)

    elif self.decay == 'linear_cosine':
        assert self.kwargs.get('beta', 0.001) >= 0.0
        parameter = tf.keras.experimental.LinearCosineDecay(
            initial_learning_rate=initial_value, decay_steps=(self.num_steps + 1),
            num_periods=self.kwargs.get('num_periods', 0.5),
            alpha=self.kwargs.get('alpha', 0.0),
            beta=self.kwargs.get('beta', 0.001))(step=step)

    elif self.decay == 'linear_cosine_noisy':
        assert self.kwargs.get('beta', 0.001) >= 0.0
        parameter = tf.keras.experimental.NoisyLinearCosineDecay(
            initial_learning_rate=initial_value, decay_steps=(self.num_steps + 1),
            initial_variance=self.kwargs.get('initial_variance', 1.0),
            variance_decay=self.kwargs.get('variance_decay', 0.55),
            num_periods=self.kwargs.get('num_periods', 0.5),
            alpha=self.kwargs.get('alpha', 0.0),
            beta=self.kwargs.get('beta', 0.001))(step=step)

    elif self.decay == 'polynomial':
        assert self.kwargs.get('power', 1.0) >= 0.0
        parameter = tf.keras.optimizers.schedules.PolynomialDecay(
            initial_learning_rate=initial_value, decay_steps=(self.num_steps + 1),
            end_learning_rate=self.kwargs['final_value'],
            power=self.kwargs.get('power', 1.0),
            cycle=self.kwargs.get('cycle', False))(step=step)

    if self.increasing:
        one = tf_util.constant(value=1.0, dtype='float')
        parameter = one - parameter

    if self.inverse:
        one = tf_util.constant(value=1.0, dtype='float')
        parameter = tf.math.reciprocal(x=parameter)

    if self.scale != 1.0:
        scale = tf_util.constant(value=self.scale, dtype='float')
        parameter = parameter * scale

    parameter = tf_util.cast(x=parameter, dtype=self.spec.type)
    return parameter
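# --- Illustrative usage (TensorFlow), not Tensorforce code ---
# Evaluating one of the Keras schedules wrapped above, with ExponentialDecay as
# a representative example.
import tensorflow as tf

schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=1e-2, decay_steps=1000, decay_rate=0.96, staircase=False)
for step in (0, 500, 1000):
    print(step, float(schedule(step)))  # 1e-2 * 0.96 ** (step / 1000)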
def fn_summary():
    return tf.linalg.global_norm(
        t_list=[tf_util.cast(x=delta, dtype='float') for delta in deltas])
def apply(self, *, x, horizons, internals):
    zero = tf_util.constant(value=0, dtype='int')
    one = tf_util.constant(value=1, dtype='int')
    batch_size = tf_util.cast(x=tf.shape(input=horizons)[0], dtype='int')
    zeros = tf_util.zeros(shape=(batch_size,), dtype='int')
    ones = tf_util.ones(shape=(batch_size,), dtype='int')

    # including 0th step
    horizon = self.horizon.value() + one
    # in case of longer horizon than necessary (e.g. main vs baseline policy)
    starts = horizons[:, 0] + tf.maximum(x=(horizons[:, 1] - horizon), y=zeros)
    lengths = horizons[:, 1] - tf.maximum(x=(horizons[:, 1] - horizon), y=zeros)
    horizon = tf.minimum(x=horizon, y=tf.math.reduce_max(input_tensor=lengths, axis=0))
    output_spec = self.output_spec()

    if self.temporal_processing == 'cumulative':
        if self.horizon.is_constant(value=0):
            # Cumulative application over single-step windows
            x = self.cumulative_apply(xs=tf.expand_dims(input=x, axis=1), lengths=ones)
        else:
            def body(x, indices, remaining, xs):
                current_x = tf.gather(params=x, indices=indices)
                current_x = tf.expand_dims(input=current_x, axis=1)
                xs = tf.concat(values=(xs, current_x), axis=1)
                remaining -= tf.where(
                    condition=tf.math.equal(x=remaining, y=zeros), x=zeros, y=ones)
                indices += tf.where(
                    condition=tf.math.equal(x=remaining, y=zeros), x=zeros, y=ones)
                return x, indices, remaining, xs

            initial_xs = tf_util.zeros(
                shape=((batch_size, 0) + output_spec.shape), dtype=output_spec.type)

            _, final_indices, final_remaining, xs = tf.while_loop(
                cond=tf_util.always_true, body=body,
                loop_vars=(x, starts, lengths, initial_xs),
                maximum_iterations=tf_util.int64(x=horizon))

            x = self.cumulative_apply(xs=xs, lengths=lengths)

    elif self.temporal_processing == 'iterative':
        if self.horizon.is_constant(value=0):
            x, internals = self.iterative_apply(x=x, internals=internals)
        else:
            initial_x = tf_util.zeros(
                shape=((batch_size,) + output_spec.shape), dtype=output_spec.type)

            signature = self.input_signature(function='iterative_body')
            internals = signature['current_internals'].kwargs_to_args(kwargs=internals)
            _, final_indices, final_remaining, x, final_internals = tf.while_loop(
                cond=tf_util.always_true, body=self.iterative_body,
                loop_vars=(x, starts, lengths, initial_x, internals),
                maximum_iterations=tf_util.int32(x=horizon))
            internals = signature['current_internals'].args_to_kwargs(args=final_internals)

    assertions = list()
    if self.config.create_tf_assertions and not self.horizon.is_constant(value=0):
        assertions.append(tf.debugging.assert_equal(
            x=final_indices, y=(tf.math.cumsum(x=lengths) - ones)))
        assertions.append(tf.debugging.assert_equal(
            x=tf.math.reduce_sum(input_tensor=final_remaining), y=zero))

    with tf.control_dependencies(control_inputs=assertions):
        if self.temporal_processing == 'cumulative':
            return tf_util.identity(input=super().apply(x=x))
        elif self.temporal_processing == 'iterative':
            return tf_util.identity(input=super().apply(x=x)), internals
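# --- Illustrative sketch (NumPy), not Tensorforce code ---
# The horizon-window arithmetic above: shift each (start, length) pair so that
# at most `horizon` steps (including the 0th) are kept, ending at the original
# window end.
import numpy as np

horizons = np.array([[0, 3], [3, 6], [9, 1]])  # (start, length) per example
horizon = 4                                    # maximum window, including 0th step
shift = np.maximum(horizons[:, 1] - horizon, 0)
starts = horizons[:, 0] + shift                # [0 5 9]: keep only the last steps
lengths = horizons[:, 1] - shift               # [3 4 1]
print(starts, lengths)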