def true_fn():
    """Reset the estimator and append the reset values to the overwritten ones.

    Each tensor from ``overwritten_values`` (enclosing scope) is concatenated
    along axis 0 with the matching tensor returned by the estimator reset.
    Entries whose name is reported as nested by ``util.is_nested`` are
    handled one level deep.

    Returns:
        OrderedDict with the same names, holding the concatenated tensors.
    """
    reset_values = self.estimator.reset(baseline=self.baseline_policy)

    merged = OrderedDict()
    for name, previous, fresh in util.zip_items(overwritten_values, reset_values):
        if not util.is_nested(name=name):
            merged[name] = tf.concat(values=(previous, fresh), axis=0)
            continue

        # Nested entry: concatenate every inner pair separately.
        nested = OrderedDict()
        merged[name] = nested
        for inner_name, inner_previous, inner_fresh in util.zip_items(previous, fresh):
            nested[inner_name] = tf.concat(
                values=(inner_previous, inner_fresh), axis=0
            )

    return merged
def tf_sample_actions(self, states, internals, auxiliaries, temperature, return_internals):
    """Sample one action per action component from the policy distributions.

    Args:
        states: Input passed to the network as ``x``.
        internals: Internal network states passed to the network.
        auxiliaries: Mapping providing ``'<name>_mask'`` for every action
            whose spec type is ``'int'``.
        temperature: Per-action sampling temperatures, zipped by name with
            the action specs and distributions.
        return_internals: Whether the network also returns updated internals.

    Returns:
        OrderedDict of sampled actions, or ``(actions, internals)`` when
        ``return_internals`` is truthy.
    """
    network_output = self.network.apply(
        x=states, internals=internals, return_internals=return_internals
    )
    if return_internals:
        embedding, internals = network_output
    else:
        embedding = network_output

    Module.update_tensor(name=self.name, tensor=embedding)

    actions = OrderedDict()
    for name, spec, distribution, temp in util.zip_items(
        self.actions_spec, self.distributions, temperature
    ):
        # Only int-typed actions are parametrized with an action mask.
        mask_kwargs = {}
        if spec['type'] == 'int':
            mask_kwargs['mask'] = auxiliaries[name + '_mask']
        parameters = distribution.parametrize(x=embedding, **mask_kwargs)
        actions[name] = distribution.sample(parameters=parameters, temperature=temp)

    return (actions, internals) if return_internals else actions
# Loop-body function: takes (indices, remaining, current_x, current_aggregates) and returns
# (indices, remaining, next_x, next_aggregates). `x`, `zeros`, `ones` and `self` come from the
# enclosing scope (presumably this is the body of a tf.while_loop - confirm against the caller).
#  - The incoming `current_x` argument is discarded: it is immediately re-gathered from `x` at `indices`.
#  - `self.iterative_step` computes the next output and next aggregates from the gathered input and
#    the previous aggregates.
#  - Entries with `remaining == zeros` count as finished: tf.where keeps their current aggregates and
#    takes the new aggregates elsewhere. The condition gets `rank - 1` extra axes inserted at axis 1
#    so that it matches the aggregate's rank. Dict aggregates are handled per name, in place.
#  - `remaining` is reduced by `ones` for unfinished entries only; `indices` is advanced by `ones`
#    only where `remaining` is still non-zero after that decrement.
#    (`zeros`/`ones` are presumably constant 0/1 tensors - confirm.)
# NOTE(review): the statements were collapsed onto one line, so the exact extent of the
# tf.control_dependencies block cannot be read from here; confirm it before restructuring.
def body(indices, remaining, current_x, current_aggregates): current_x = tf.gather(params=x, indices=indices) next_x, next_aggregates = self.iterative_step( x=current_x, previous=current_aggregates) with tf.control_dependencies(control_inputs=(current_x, next_x)): is_finished = tf.math.equal(x=remaining, y=zeros) if isinstance(next_aggregates, dict): for name, current_aggregate, next_aggregate in util.zip_items( current_aggregates, next_aggregates): condition = is_finished for _ in range(util.rank(x=current_aggregate) - 1): condition = tf.expand_dims(input=condition, axis=1) next_aggregates[name] = tf.where( condition=condition, x=current_aggregate, y=next_aggregate) else: condition = is_finished for _ in range(util.rank(x=current_aggregates) - 1): condition = tf.expand_dims(input=condition, axis=1) next_aggregates = tf.where(condition=condition, x=current_aggregates, y=next_aggregates) remaining -= tf.where(condition=is_finished, x=zeros, y=ones) indices += tf.where(condition=tf.math.equal(x=remaining, y=zeros), x=zeros, y=ones) return indices, remaining, next_x, next_aggregates
def tf_loss_per_instance(
    self, states, internals, actions, terminal, reward, next_states, next_internals,
    reference=None
):
    """Clipped likelihood-ratio surrogate loss, one value per instance.

    Args:
        states: Input passed to the network as ``x``.
        internals: Internal network states passed to the network.
        actions: Per-action values whose log-probability is evaluated.
        terminal: Unused here.
        reward: Factor multiplied with the (clipped) probability ratio.
        next_states: Unused here.
        next_internals: Unused here.
        reference: Old log-probabilities; when ``None`` the current
            log-probabilities with gradients stopped are used instead.

    Returns:
        Negated element-wise minimum of ``ratio * reward`` and
        ``clipped_ratio * reward``.
    """
    embedding = self.network.apply(x=states, internals=internals)

    flattened = []
    for _, distribution, action in util.zip_items(self.distributions, actions):
        parameters = distribution.parametrize(x=embedding)
        log_prob = distribution.log_probability(parameters=parameters, action=action)
        # Collapse all non-leading dimensions into a single axis.
        width = util.product(xs=util.shape(log_prob)[1:])
        flattened.append(tf.reshape(tensor=log_prob, shape=(-1, width)))
    log_probs = tf.concat(values=flattened, axis=1)

    old_log_probs = tf.stop_gradient(input=log_probs) if reference is None else reference

    # TODO: comment on log_ratio 1.0 and gradient perspective
    ratio = tf.reduce_mean(
        input_tensor=tf.exp(x=(log_probs - old_log_probs)), axis=1
    )

    clipping = self.likelihood_ratio_clipping.value()
    upper_bound = 1.0 + clipping
    lower_bound = 1.0 / upper_bound
    clipped_ratio = tf.clip_by_value(
        t=ratio, clip_value_min=lower_bound, clip_value_max=upper_bound
    )

    surrogate = tf.minimum(x=(ratio * reward), y=(clipped_ratio * reward))
    return -surrogate
def tf_entropy(self, states, internals, auxiliaries, reduced=True, include_per_action=False):
    """Entropy of the policy distributions, joint and optionally per action.

    Every per-action entropy is reshaped to ``(-1, product(spec['shape']))``
    and the results are concatenated along axis 1.

    Args:
        states, internals, auxiliaries: Forwarded to ``self.entropies``.
        reduced: If true, average over axis 1 (the joint tensor and, when
            requested, each per-action tensor as well).
        include_per_action: If true, return the per-action dict with the
            joint value stored under ``'*'``.

    Returns:
        The joint entropy tensor, or the per-action dict including ``'*'``.
    """
    per_action = self.entropies(
        states=states, internals=internals, auxiliaries=auxiliaries
    )

    for name, spec, value in util.zip_items(self.actions_spec, per_action):
        flat_size = util.product(xs=spec['shape'])
        per_action[name] = tf.reshape(tensor=value, shape=(-1, flat_size))

    joint = tf.concat(values=tuple(per_action.values()), axis=1)
    if reduced:
        joint = tf.math.reduce_mean(input_tensor=joint, axis=1)

    if not include_per_action:
        return joint

    if reduced:
        for name in self.actions_spec:
            per_action[name] = tf.math.reduce_mean(
                input_tensor=per_action[name], axis=1
            )
    per_action['*'] = joint
    return per_action
def tf_actions_value(self, states, internals, auxiliaries, actions, reduced=True,
                     include_per_action=False):
    """Value of the given actions, joint and optionally per action.

    Every per-action value is reshaped to ``(-1, product(spec['shape']))``
    and the results are concatenated along axis 1.

    Args:
        states, internals, auxiliaries, actions: Forwarded to
            ``self.actions_values``.
        reduced: If true, average over axis 1 (the joint tensor and, when
            requested, each per-action tensor as well).
        include_per_action: If true, return the per-action dict with the
            joint value stored under ``'*'``.

    Returns:
        The joint value tensor, or the per-action dict including ``'*'``.
    """
    values_by_action = self.actions_values(
        states=states, internals=internals, auxiliaries=auxiliaries, actions=actions
    )

    for name, spec, value in util.zip_items(self.actions_spec, values_by_action):
        target_shape = (-1, util.product(xs=spec['shape']))
        values_by_action[name] = tf.reshape(tensor=value, shape=target_shape)

    combined = tf.concat(values=tuple(values_by_action.values()), axis=1)

    if reduced:
        combined = tf.math.reduce_mean(input_tensor=combined, axis=1)
        if include_per_action:
            for name in self.actions_spec:
                values_by_action[name] = tf.math.reduce_mean(
                    input_tensor=values_by_action[name], axis=1
                )

    if not include_per_action:
        return combined

    values_by_action['*'] = combined
    return values_by_action
def tf_log_probability(self, states, internals, auxiliaries, actions, reduced=True,
                       include_per_action=False):
    """Log-probability of the given actions, joint and optionally per action.

    Every per-action log-probability is reshaped to
    ``(-1, product(spec['shape']))`` and the results are concatenated along
    axis 1.

    Args:
        states, internals, auxiliaries, actions: Forwarded to
            ``self.log_probabilities``.
        reduced: If true, the joint tensor is summed over axis 1. The
            per-action entries are NOT reduced; they keep their reshaped
            two-dimensional form.
        include_per_action: If true, return the per-action dict with the
            joint value stored under ``'*'``.

    Returns:
        The joint log-probability tensor, or the per-action dict including
        ``'*'``.
    """
    per_action = self.log_probabilities(
        states=states, internals=internals, auxiliaries=auxiliaries, actions=actions
    )

    for name, spec, value in util.zip_items(self.actions_spec, per_action):
        flat_size = util.product(xs=spec['shape'])
        per_action[name] = tf.reshape(tensor=value, shape=(-1, flat_size))

    joint = tf.concat(values=tuple(per_action.values()), axis=1)
    if reduced:
        joint = tf.math.reduce_sum(input_tensor=joint, axis=1)

    if not include_per_action:
        return joint

    per_action['*'] = joint
    return per_action
def tf_kl_divergence(self, states, internals, auxiliaries, other=None, reduced=True,
                     include_per_action=False):
    """KL divergence of the policy distributions, joint and optionally per action.

    Every per-action divergence is reshaped to
    ``(-1, product(spec['shape']))`` and the results are concatenated along
    axis 1.

    Args:
        states, internals, auxiliaries, other: Forwarded to
            ``self.kl_divergences``.
        reduced: If true, the joint tensor is summed over axis 1. The
            per-action entries are NOT reduced; they keep their reshaped
            two-dimensional form.
        include_per_action: If true, return the per-action dict with the
            joint value stored under ``'*'``.

    Returns:
        The joint KL-divergence tensor, or the per-action dict including
        ``'*'``.
    """
    divergences = self.kl_divergences(
        states=states, internals=internals, auxiliaries=auxiliaries, other=other
    )

    for name, spec, value in util.zip_items(self.actions_spec, divergences):
        target_shape = (-1, util.product(xs=spec['shape']))
        divergences[name] = tf.reshape(tensor=value, shape=target_shape)

    combined = tf.concat(values=tuple(divergences.values()), axis=1)
    if reduced:
        combined = tf.math.reduce_sum(input_tensor=combined, axis=1)

    if not include_per_action:
        return combined

    divergences['*'] = combined
    return divergences
def tf_kl_divergences(self, states, internals, auxiliaries, other=None):
    """Per-action KL divergence against another policy or a frozen copy of this one.

    Args:
        states: Input passed to the network(s) as ``x``.
        internals: Internal network states passed to the network(s).
        auxiliaries: Mapping providing ``'<name>_mask'`` for every action
            whose spec type is ``'int'``.
        other: ``None`` or a ``ParametrizedDistributions`` instance. With
            ``None``, this policy's own parameters with gradients stopped
            serve as the comparison distribution.

    Returns:
        OrderedDict mapping each action name to its KL-divergence tensor.
    """
    assert other is None or isinstance(other, ParametrizedDistributions)

    embedding = self.network.apply(x=states, internals=internals)
    if other is not None:
        other_embedding = other.network.apply(x=states, internals=internals)

    kl_divergences = OrderedDict()
    for name, spec, distribution in util.zip_items(self.actions_spec, self.distributions):
        # Int-typed actions are parametrized with their action mask, for
        # this policy and for `other` alike.
        mask_kwargs = {}
        if spec['type'] == 'int':
            mask_kwargs['mask'] = auxiliaries[name + '_mask']

        parameters = distribution.parametrize(x=embedding, **mask_kwargs)

        if other is None:
            other_parameters = tuple(
                tf.stop_gradient(input=parameter) for parameter in parameters
            )
        else:
            other_parameters = other.distributions[name].parametrize(
                x=other_embedding, **mask_kwargs
            )

        # NOTE(review): argument order was questioned in the original ("order????").
        kl_divergences[name] = distribution.kl_divergence(
            parameters1=other_parameters, parameters2=parameters
        )

    return kl_divergences
def tf_sample_actions(self, states, internals, auxiliaries, deterministic, return_internals):
    """Sample one action per action component and attach an entropy summary.

    Args:
        states: Input passed to the network as ``x``.
        internals: Internal network states passed to the network.
        auxiliaries: Mapping providing ``'<name>_mask'`` for every action
            whose spec type is ``'int'``.
        deterministic: Forwarded to each distribution's ``sample``.
        return_internals: Whether the network also returns updated internals.

    Returns:
        OrderedDict of actions (each passed through ``self.add_summary``),
        or ``(actions, internals)`` when ``return_internals`` is truthy.
    """
    network_output = self.network.apply(
        x=states, internals=internals, return_internals=return_internals
    )
    if return_internals:
        embedding, internals = network_output
    else:
        embedding = network_output

    Module.update_tensor(name=self.name, tensor=embedding)

    actions = OrderedDict()
    for name, spec, distribution in util.zip_items(self.actions_spec, self.distributions):
        # Only int-typed actions are parametrized with an action mask.
        mask_kwargs = {}
        if spec['type'] == 'int':
            mask_kwargs['mask'] = auxiliaries[name + '_mask']
        parameters = distribution.parametrize(x=embedding, **mask_kwargs)

        action = distribution.sample(parameters=parameters, deterministic=deterministic)

        # Flatten the entropy per instance and average it for the summary.
        flat_entropy = tf.reshape(
            tensor=distribution.entropy(parameters=parameters),
            shape=(-1, util.product(xs=spec['shape']))
        )
        mean_entropy = tf.reduce_mean(input_tensor=flat_entropy, axis=1)

        actions[name] = self.add_summary(
            label='entropy', name=(name + '-entropy'), tensor=mean_entropy,
            pass_tensors=action
        )

    return (actions, internals) if return_internals else actions
def tf_states_values(self, states, internals, auxiliaries):
    """Per-action state values derived from the parametrized distributions.

    Args:
        states: Input passed to the network as ``x``.
        internals: Internal network states passed to the network.
        auxiliaries: Mapping providing ``'<name>_mask'`` for every action
            whose spec type is ``'int'``.

    Returns:
        OrderedDict mapping each action name to the result of its
        distribution's ``states_value``.
    """
    embedding = self.network.apply(x=states, internals=internals)
    Module.update_tensor(name=self.name, tensor=embedding)

    states_values = OrderedDict()
    for name, spec, distribution in util.zip_items(self.actions_spec, self.distributions):
        # Only int-typed actions are parametrized with an action mask.
        mask_kwargs = {}
        if spec['type'] == 'int':
            mask_kwargs['mask'] = auxiliaries[name + '_mask']
        parameters = distribution.parametrize(x=embedding, **mask_kwargs)
        states_values[name] = distribution.states_value(parameters=parameters)

    return states_values
def tf_kldiv_reference(self, states, internals, auxiliaries):
    """Distribution parameters per action, for use as a KL-divergence reference.

    Args:
        states: Input passed to the network as ``x``.
        internals: Internal network states passed to the network.
        auxiliaries: Mapping providing ``'<name>_mask'`` for every action
            whose spec type is ``'int'``.

    Returns:
        OrderedDict mapping each action name to its distribution parameters.
    """
    embedding = self.network.apply(x=states, internals=internals)

    kldiv_reference = OrderedDict()
    for name, spec, distribution in util.zip_items(self.actions_spec, self.distributions):
        # Only int-typed actions are parametrized with an action mask.
        mask_kwargs = {}
        if spec['type'] == 'int':
            mask_kwargs['mask'] = auxiliaries[name + '_mask']
        kldiv_reference[name] = distribution.parametrize(x=embedding, **mask_kwargs)

    return kldiv_reference
def tf_act(self, states, internals, auxiliaries):
    """Pick, for every action, the index with the highest action value.

    Args:
        states, internals, auxiliaries: Forwarded to ``self.actions_values``.

    Returns:
        OrderedDict mapping each action name to the argmax over the last
        axis of its action values, with the dtype given by
        ``util.tf_dtype(spec['type'])``.
    """
    actions_values = self.actions_values(
        states=states, internals=internals, auxiliaries=auxiliaries
    )

    return OrderedDict(
        (
            name,
            tf.math.argmax(
                input=values, axis=-1, output_type=util.tf_dtype(spec['type'])
            ),
        )
        for name, spec, values in util.zip_items(self.actions_spec, actions_values)
    )
def tf_reference(
    self, states, internals, actions, terminal, reward, next_states, next_internals
):
    """Gradient-stopped log-probabilities of the given actions.

    Only ``states``, ``internals`` and ``actions`` are used; the remaining
    parameters are accepted but not read.

    Returns:
        The per-action log-probabilities, each flattened to two dimensions
        and concatenated along axis 1, with gradients stopped.
    """
    embedding = self.network.apply(x=states, internals=internals)

    def flat_log_prob(distribution, action):
        # Log-probability with all non-leading dimensions collapsed into one.
        parameters = distribution.parametrize(x=embedding)
        log_prob = distribution.log_probability(parameters=parameters, action=action)
        width = util.product(xs=util.shape(log_prob)[1:])
        return tf.reshape(tensor=log_prob, shape=(-1, width))

    flattened = [
        flat_log_prob(distribution, action)
        for _, distribution, action in util.zip_items(self.distributions, actions)
    ]
    return tf.stop_gradient(input=tf.concat(values=flattened, axis=1))
def tf_entropy(self, states, internals, auxiliaries, mean=True):
    """Entropy of the policy distributions, concatenated over all actions.

    Args:
        states, internals, auxiliaries: Forwarded to ``self.entropies``.
        mean: If true, average over axis 1 of the concatenated tensor.

    Returns:
        The concatenated ``(-1, total_size)`` tensor, or its mean over
        axis 1 when ``mean`` is truthy.
    """
    per_action = self.entropies(
        states=states, internals=internals, auxiliaries=auxiliaries
    )

    for name, spec, value in util.zip_items(self.actions_spec, per_action):
        flat_size = util.product(xs=spec['shape'])
        per_action[name] = tf.reshape(tensor=value, shape=(-1, flat_size))

    joint = tf.concat(values=tuple(per_action.values()), axis=1)
    if not mean:
        return joint
    return tf.math.reduce_mean(input_tensor=joint, axis=1)
def tf_states_values(self, states, internals, auxiliaries):
    """Per-action state values as the maximum over the action values.

    Args:
        states, internals, auxiliaries: Forwarded to ``self.actions_values``.

    Returns:
        OrderedDict mapping each action name to the maximum over the last
        axis of its action values.

    Raises:
        NotImplementedError: If any action is not of type ``'bool'`` or
            ``'int'``.
    """
    # Taking the max over the last axis of the action values is only defined
    # for discrete actions, so the guard has to inspect the action specs.
    # (It previously inspected the state specs, which rejected float-state
    # setups and let continuous actions through.)
    if not all(spec['type'] in ('bool', 'int') for spec in self.actions_spec.values()):
        raise NotImplementedError(
            "states_values requires all actions to be of type 'bool' or 'int'."
        )

    actions_values = self.actions_values(
        states=states, internals=internals, auxiliaries=auxiliaries
    )

    states_values = OrderedDict()
    for name, _, action_values in util.zip_items(self.actions_spec, actions_values):
        states_values[name] = tf.math.reduce_max(input_tensor=action_values, axis=-1)

    return states_values
def tf_states_value(self, states, internals, auxiliaries, mean=True):
    """State value, concatenated over all actions.

    Args:
        states, internals, auxiliaries: Forwarded to ``self.states_values``.
        mean: If true, average over axis 1 of the concatenated tensor.

    Returns:
        The concatenated ``(-1, total_size)`` tensor, or its mean over
        axis 1 when ``mean`` is truthy.
    """
    values_by_action = self.states_values(
        states=states, internals=internals, auxiliaries=auxiliaries
    )

    for name, spec, value in util.zip_items(self.actions_spec, values_by_action):
        target_shape = (-1, util.product(xs=spec['shape']))
        values_by_action[name] = tf.reshape(tensor=value, shape=target_shape)

    combined = tf.concat(values=tuple(values_by_action.values()), axis=1)
    if not mean:
        return combined
    return tf.math.reduce_mean(input_tensor=combined, axis=1)
def tf_log_probability(self, states, internals, auxiliaries, actions, mean=True):
    """Log-probability of the given actions, concatenated over all actions.

    Args:
        states, internals, auxiliaries, actions: Forwarded to
            ``self.log_probabilities``.
        mean: If true, average over axis 1 of the concatenated tensor.

    Returns:
        The concatenated ``(-1, total_size)`` tensor, or its mean over
        axis 1 when ``mean`` is truthy.
    """
    per_action = self.log_probabilities(
        states=states, internals=internals, auxiliaries=auxiliaries, actions=actions
    )

    for name, spec, value in util.zip_items(self.actions_spec, per_action):
        flat_size = util.product(xs=spec['shape'])
        per_action[name] = tf.reshape(tensor=value, shape=(-1, flat_size))

    joint = tf.concat(values=tuple(per_action.values()), axis=1)
    if not mean:
        return joint
    return tf.math.reduce_mean(input_tensor=joint, axis=1)
def tf_kl_divergence(self, states, internals, auxiliaries, other=None, mean=True):
    """KL divergence of the policy distributions, concatenated over all actions.

    Args:
        states, internals, auxiliaries, other: Forwarded to
            ``self.kl_divergences``.
        mean: If true, average over axis 1 of the concatenated tensor.

    Returns:
        The concatenated ``(-1, total_size)`` tensor, or its mean over
        axis 1 when ``mean`` is truthy.
    """
    divergences = self.kl_divergences(
        states=states, internals=internals, auxiliaries=auxiliaries, other=other
    )

    for name, spec, value in util.zip_items(self.actions_spec, divergences):
        target_shape = (-1, util.product(xs=spec['shape']))
        divergences[name] = tf.reshape(tensor=value, shape=target_shape)

    combined = tf.concat(values=tuple(divergences.values()), axis=1)
    if not mean:
        return combined
    return tf.math.reduce_mean(input_tensor=combined, axis=1)