def __init__(self, variables):
    self.session = None
    shapes = [util.shape(variable) for variable in variables]
    total_size = sum(util.prod(shape) for shape in shapes)
    self.theta = tf.placeholder(tf.float32, [total_size])

    start = 0
    assigns = []
    for (shape, variable) in zip(shapes, variables):
        size = util.prod(shape)
        assigns.append(tf.assign(variable, tf.reshape(self.theta[start:start + size], shape)))
        start += size

    self.set_op = tf.group(*assigns)
    self.get_op = tf.concat(axis=0, values=[tf.reshape(variable, (-1,)) for variable in variables])
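
# Illustrative sketch (not library code; helper names are hypothetical): the same
# flatten/unflatten bookkeeping as above, written with NumPy arrays instead of
# TensorFlow variables, to show how the start/size offsets carve up the flat vector.
import numpy as np

def flatten_params(arrays):
    # Concatenate all parameter arrays into one flat vector.
    return np.concatenate([a.reshape(-1) for a in arrays])

def unflatten_params(theta, shapes):
    # Split a flat vector back into arrays of the given shapes,
    # mirroring the offset loop in the helper above.
    arrays, start = [], 0
    for shape in shapes:
        size = int(np.prod(shape))
        arrays.append(theta[start:start + size].reshape(shape))
        start += size
    return arrays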
def tf_q_value(self, embedding, distr_params, action, name):
    num_action = util.prod(self.actions_spec[name]['shape'])

    mean, stddev, _ = distr_params
    flat_mean = tf.reshape(tensor=mean, shape=(-1, num_action))
    flat_stddev = tf.reshape(tensor=stddev, shape=(-1, num_action))

    # Advantage computation
    # Network outputs entries of lower triangular matrix L
    if self.l_entries[name] is None:
        l_matrix = flat_stddev
    else:
        l_matrix = tf.map_fn(fn=tf.diag, elems=flat_stddev)

        l_entries = self.l_entries[name].apply(x=embedding)
        offset = 0
        columns = list()
        for zeros, size in enumerate(xrange(num_action - 1, -1, -1), 1):
            column = tf.pad(tensor=l_entries[:, offset: offset + size], paddings=((0, 0), (zeros, 0)))
            columns.append(column)
            offset += size

        l_matrix += tf.stack(values=columns, axis=1)

    # P = LL^T
    p_matrix = tf.matmul(a=l_matrix, b=tf.transpose(a=l_matrix, perm=(0, 2, 1)))

    # A = -0.5 (a - mean)P(a - mean)
    flat_action = tf.reshape(tensor=action, shape=(-1, num_action))
    difference = flat_action - flat_mean
    advantage = tf.matmul(a=p_matrix, b=tf.expand_dims(input=difference, axis=2))
    advantage = tf.matmul(a=tf.expand_dims(input=difference, axis=1), b=advantage)
    advantage = tf.squeeze(input=(-advantage / 2.0), axis=2)

    # Q = A + V
    # State-value function
    state_value = self.state_values[name].apply(x=embedding)
    q_value = state_value + advantage

    return tf.reshape(tensor=q_value, shape=((-1,) + self.actions_spec[name]['shape']))
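
# Illustrative NumPy sketch (hypothetical helper, not library code): the NAF
# advantage A(s, a) = -0.5 (a - mu)^T P (a - mu) with P = L L^T, where L is
# assembled from a diagonal plus flat strictly-triangular entries as above.
# For simplicity the sketch fills the strictly lower triangle; any triangular
# factor gives the same positive semi-definite P.
import numpy as np

def naf_advantage(mean, stddev, l_entries, action):
    n = mean.shape[0]
    l_matrix = np.diag(stddev)
    # Fill one off-diagonal column per action dimension from the flat entries.
    offset = 0
    for col, size in zip(range(n), range(n - 1, -1, -1)):
        l_matrix[col + 1:, col] = l_entries[offset:offset + size]
        offset += size
    p_matrix = l_matrix @ l_matrix.T
    diff = action - mean
    return -0.5 * diff @ p_matrix @ diff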
def create_tf_operations(self, config):
    if len(config.states) > 1:
        raise Exception()

    with tf.variable_scope('mlp_value_function'):
        self.state = tf.placeholder(dtype=tf.float32, shape=(None, util.prod(next(iter(config.states))[1].shape)))
        self.returns = tf.placeholder(dtype=tf.float32, shape=(None,))

        network_builder = layered_network_builder((
            {'type': 'dense', 'size': self.size},
            {'type': 'dense', 'size': 1}
        ))
        network = NeuralNetwork(network_builder=network_builder, inputs=dict(state=self.state))
        self.prediction = network.output

        loss = tf.nn.l2_loss(self.prediction - self.returns)
        optimizer = tf.train.AdamOptimizer(learning_rate=config.learning_rate)
        self.optimize = optimizer.minimize(loss)
def create_training_operations(self, config):
    self.training_output = dict()
    q_values = dict()

    for name, action in self.action.items():
        flat_size = util.prod(config.actions[name].shape)
        num_actions = config.actions[name].num_actions
        shape = (-1,) + config.actions[name].shape + (num_actions,)

        output = layers['linear'](x=self.training_network.output, size=(flat_size * num_actions))
        output = tf.reshape(tensor=output, shape=shape)
        self.training_output[name] = output

        self.action_taken[name] = tf.argmax(input=output, axis=-1)

        one_hot = tf.one_hot(indices=action, depth=num_actions)
        q_values[name] = tf.reduce_sum(input_tensor=(output * one_hot), axis=-1)

    return q_values
def tf_regularization_losses(self, states, internals, update):
    losses = super(DistributionModel, self).tf_regularization_losses(states=states, internals=internals, update=update)

    network_loss = self.network.regularization_loss()
    if network_loss is not None:
        losses['network'] = network_loss

    for distribution in self.distributions.values():
        regularization_loss = distribution.regularization_loss()
        if regularization_loss is not None:
            if 'distributions' in losses:
                losses['distributions'] += regularization_loss
            else:
                losses['distributions'] = regularization_loss

    if self.entropy_regularization is not None and self.entropy_regularization > 0.0:
        entropies = list()
        embedding = self.network.apply(x=states, internals=internals, update=update)
        for name, distribution in self.distributions.items():
            distr_params = distribution.parameterize(x=embedding)
            entropy = distribution.entropy(distr_params=distr_params)
            collapsed_size = util.prod(util.shape(entropy)[1:])
            entropy = tf.reshape(tensor=entropy, shape=(-1, collapsed_size))
            entropies.append(entropy)

        entropy_per_instance = tf.reduce_mean(input_tensor=tf.concat(values=entropies, axis=1), axis=1)
        entropy = tf.reduce_mean(input_tensor=entropy_per_instance, axis=0)

        if 'entropy' in self.summary_labels:
            summary = tf.summary.scalar(name='entropy', tensor=entropy)
            self.summaries.append(summary)

        losses['entropy'] = -self.entropy_regularization * entropy

    return losses
def create_q_deltas(self, config):
    """
    Creates the deltas (or advantage) of the Q values

    :return: A list of deltas per action
    """
    deltas = list()
    terminal_float = tf.cast(x=self.terminal, dtype=tf.float32)

    for name, action in self.action.items():
        reward = self.reward
        terminal = terminal_float
        for _ in range(len(config.actions[name].shape)):
            reward = tf.expand_dims(input=reward, axis=1)
            terminal = tf.expand_dims(input=terminal, axis=1)

        q_target = reward + (1.0 - terminal) * config.discount * self.target_values[name]
        delta = tf.stop_gradient(q_target) - self.q_values[name]
        delta = tf.reshape(tensor=delta, shape=(-1, util.prod(config.actions[name].shape)))
        deltas.append(delta)

    return deltas
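
# Illustrative NumPy sketch (hypothetical helper): the Q-learning target and
# delta computed above, i.e. q_target = r + (1 - terminal) * gamma * V_target
# and delta = q_target - Q(s, a), for a batch of scalar values.
import numpy as np

def q_delta(reward, terminal, target_value, q_value, discount=0.99):
    terminal = terminal.astype(np.float32)
    q_target = reward + (1.0 - terminal) * discount * target_value
    # The target is treated as a constant (tf.stop_gradient above).
    return q_target - q_value

# Example: a terminal transition ignores the bootstrapped target value.
# q_delta(np.array([1.0]), np.array([True]), np.array([5.0]), np.array([0.5]))
# -> array([0.5])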
def __init__(self, states_spec, actions_spec, network_spec, config):
    if any(action['type'] != 'float' or 'min_value' in action or 'max_value' in action
           for action in actions_spec.values()):
        raise TensorForceError("Only unconstrained float actions valid for NAFModel.")

    with tf.name_scope(name=config.scope):
        self.state_values = dict()
        self.l_entries = dict()
        for name, action in actions_spec.items():
            num_action = util.prod(action['shape'])
            self.state_values[name] = Linear(size=num_action, scope=(name + '-state-value'))
            self.l_entries[name] = Linear(size=(num_action * (num_action - 1) // 2), scope=(name + '-l-entries'))

    super(QNAFModel, self).__init__(
        states_spec=states_spec,
        actions_spec=actions_spec,
        network_spec=network_spec,
        config=config
    )
def create_target_operations(self, config):
    target_values = dict()

    for name, action in self.action_taken.items():
        flat_size = util.prod(config.actions[name].shape)
        num_actions = config.actions[name].num_actions
        shape = (-1,) + config.actions[name].shape + (num_actions,)

        output = layers['linear'](x=self.target_network.output, size=(flat_size * num_actions))
        output = tf.reshape(tensor=output, shape=shape)

        if config.double_dqn:
            one_hot = tf.one_hot(indices=action, depth=num_actions)
            target_values[name] = tf.reduce_sum(input_tensor=(output * one_hot), axis=-1)
        else:
            target_values[name] = tf.reduce_max(input_tensor=output, axis=-1)

    return target_values
def create_tf_operations(self, x, deterministic):
    flat_size = util.prod(self.shape)

    if isinstance(self.mean, float):
        bias = [self.mean for _ in range(flat_size)]
    else:
        bias = self.mean
    self.mean = layers['linear'](x=x, size=flat_size, bias=bias)
    self.mean = tf.reshape(tensor=self.mean, shape=((-1,) + self.shape))
    # self.mean = tf.squeeze(input=self.mean, axis=1)

    if isinstance(self.log_stddev, float):
        bias = [self.log_stddev for _ in range(flat_size)]
    else:
        bias = self.log_stddev
    self.log_stddev = layers['linear'](x=x, size=flat_size, bias=bias)
    self.log_stddev = tf.reshape(tensor=self.log_stddev, shape=((-1,) + self.shape))
    # self.log_stddev = tf.squeeze(input=self.log_stddev, axis=1)

    self.log_stddev = tf.minimum(x=self.log_stddev, y=10.0)  # prevent infinity when exp

    self.distribution = (self.mean, self.log_stddev)
    self.deterministic = deterministic
def create_tf_operations(self, x, deterministic):
    # Flat mean and log standard deviation
    flat_size = util.prod(self.shape)

    # Softplus to ensure alpha and beta >= 1
    self.alpha = layers['linear'](x=x, size=flat_size, bias=self.alpha)
    self.alpha = tf.nn.softplus(features=self.alpha)
    shape = (-1,) + self.shape
    self.alpha = tf.reshape(tensor=self.alpha, shape=shape)

    self.beta = layers['linear'](x=x, size=flat_size, bias=self.beta)
    self.beta = tf.nn.softplus(features=self.beta)
    self.beta = tf.reshape(tensor=self.beta, shape=shape)

    self.sum = self.alpha + self.beta
    self.mean = self.alpha / tf.maximum(x=self.sum, y=util.epsilon)
    self.log_norm = tf.lgamma(self.alpha) + tf.lgamma(self.beta) - tf.lgamma(self.sum)

    self.deterministic = deterministic
def tf_kl_divergence(self, states, internals, update):
    embedding = self.network.apply(x=states, internals=internals, update=update)
    kl_divergences = list()

    for name, distribution in self.distributions.items():
        distr_params = distribution.parameterize(x=embedding)
        fixed_distr_params = tuple(tf.stop_gradient(input=value) for value in distr_params)
        kl_divergence = distribution.kl_divergence(distr_params1=fixed_distr_params, distr_params2=distr_params)
        collapsed_size = util.prod(util.shape(kl_divergence)[1:])
        kl_divergence = tf.reshape(tensor=kl_divergence, shape=(-1, collapsed_size))
        kl_divergences.append(kl_divergence)

    kl_divergence_per_instance = tf.reduce_mean(input_tensor=tf.concat(values=kl_divergences, axis=1), axis=1)
    return tf.reduce_mean(input_tensor=kl_divergence_per_instance, axis=0)
def __init__(self, shape, min_value, max_value, alpha=0.0, beta=0.0, scope='beta', summary_labels=()):
    """
    Beta distribution.

    Args:
        shape: Action shape.
        min_value: Minimum value of continuous actions.
        max_value: Maximum value of continuous actions.
        alpha: Optional distribution bias for the alpha value.
        beta: Optional distribution bias for the beta value.
    """
    assert min_value is None or max_value > min_value
    self.shape = shape
    self.min_value = min_value
    self.max_value = max_value

    action_size = util.prod(self.shape)
    self.alpha = Linear(size=action_size, bias=alpha, scope='alpha', summary_labels=summary_labels)
    self.beta = Linear(size=action_size, bias=beta, scope='beta', summary_labels=summary_labels)

    super(Beta, self).__init__(shape=shape, scope=scope, summary_labels=summary_labels)
def tf_demo_loss(self, states, actions, terminal, reward, internals, update, reference=None):
    """
    Extends the q-model loss via the dqfd large-margin loss.
    """
    embedding = self.network.apply(x=states, internals=internals, update=update)
    deltas = list()

    for name in sorted(actions):
        action = actions[name]
        distr_params = self.distributions[name].parameterize(x=embedding)
        state_action_value = self.distributions[name].state_action_value(distr_params=distr_params, action=action)

        # Create the supervised margin loss
        # Zero for the action taken, one for all other actions, now multiply by expert margin
        if self.actions_spec[name]['type'] == 'bool':
            num_actions = 2
            action = tf.cast(x=action, dtype=util.tf_dtype('int'))
        else:
            num_actions = self.actions_spec[name]['num_actions']

        one_hot = tf.one_hot(indices=action, depth=num_actions)
        ones = tf.ones_like(tensor=one_hot, dtype=tf.float32)
        inverted_one_hot = ones - one_hot

        # max_a([Q(s,a) + l(s,a_E,a)]), where l(s,a_E,a) is 0 for the expert action and the margin value for others
        state_action_values = self.distributions[name].state_action_value(distr_params=distr_params)
        state_action_values = state_action_values + inverted_one_hot * self.expert_margin
        supervised_selector = tf.reduce_max(input_tensor=state_action_values, axis=-1)

        # J_E(Q) = max_a([Q(s,a) + l(s,a_E,a)]) - Q(s,a_E)
        delta = supervised_selector - state_action_value

        action_size = util.prod(self.actions_spec[name]['shape'])
        delta = tf.reshape(tensor=delta, shape=(-1, action_size))
        deltas.append(delta)

    loss_per_instance = tf.reduce_mean(input_tensor=tf.concat(values=deltas, axis=1), axis=1)
    loss_per_instance = tf.square(x=loss_per_instance)

    return tf.reduce_mean(input_tensor=loss_per_instance, axis=0)
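
# Illustrative NumPy sketch (hypothetical helper): the DQfD large-margin term
# J_E(Q) = max_a [Q(s,a) + l(s,a_E,a)] - Q(s,a_E), where the margin l is zero
# for the expert action a_E and `expert_margin` for every other action.
import numpy as np

def large_margin_loss(q_values, expert_action, expert_margin=0.5):
    # q_values: (num_actions,) Q-estimates for one state.
    margins = np.full_like(q_values, expert_margin)
    margins[expert_action] = 0.0
    return np.max(q_values + margins) - q_values[expert_action]

# Example: with q_values = [1.0, 0.8], expert_action = 0 and margin 0.5, the
# competing action is within the margin of the expert action, so the loss is
# 1.3 - 1.0 = 0.3; once the expert action dominates by more than the margin,
# the term drops to zero.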
def create_tf_operations(self, x, deterministic):
    self.min_value = tf.constant(value=self.min_value, dtype=tf.float32)
    self.max_value = tf.constant(value=self.max_value, dtype=tf.float32)

    # Flat mean and log standard deviation
    flat_size = util.prod(self.shape)
    log_eps = log(util.epsilon)

    # Softplus to ensure alpha and beta >= 1
    self.alpha = layers['linear'](x=x, size=flat_size, bias=self.alpha, scope='alpha')
    self.alpha = tf.clip_by_value(t=self.alpha, clip_value_min=log_eps, clip_value_max=-log_eps)
    self.alpha = tf.log(x=(tf.exp(x=self.alpha) + 1.0))  # tf.nn.softplus(features=self.alpha)

    self.beta = layers['linear'](x=x, size=flat_size, bias=self.beta, scope='beta')
    self.beta = tf.clip_by_value(t=self.beta, clip_value_min=log_eps, clip_value_max=-log_eps)
    self.beta = tf.log(x=(tf.exp(x=self.beta) + 1.0))  # tf.nn.softplus(features=self.beta)

    shape = (-1,) + self.shape
    self.alpha = tf.reshape(tensor=self.alpha, shape=shape)
    self.beta = tf.reshape(tensor=self.beta, shape=shape)

    self.sum = tf.maximum(x=(self.alpha + self.beta), y=util.epsilon)
    self.mean = self.beta / self.sum
    self.log_norm = tf.lgamma(self.alpha) + tf.lgamma(self.beta) - tf.lgamma(self.sum)

    self.deterministic = deterministic
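
# Illustrative NumPy sketch (hypothetical helper; assumes the Beta sample in
# [0, 1] is rescaled linearly to the action bounds, which is what the
# min_value/max_value constants above suggest): sampling a bounded action from
# softplus-transformed concentration parameters.
import numpy as np

def sample_bounded_action(alpha_raw, beta_raw, min_value, max_value, rng=np.random):
    # Softplus keeps the concentration parameters positive, as above.
    alpha = np.log1p(np.exp(alpha_raw))
    beta = np.log1p(np.exp(beta_raw))
    u = rng.beta(alpha, beta)            # sample in [0, 1]
    return min_value + (max_value - min_value) * u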
def create_tf_operations(self, x, deterministic):
    self.deterministic = deterministic

    # Flat logits
    flat_size = util.prod(self.shape) * self.num_actions
    self.logits = layers['linear'](x=x, size=flat_size, bias=self.logits, scope='logits')

    # Reshape logits to action shape
    shape = (-1,) + self.shape + (self.num_actions,)
    self.logits = tf.reshape(tensor=self.logits, shape=shape)

    # Softmax for corresponding probabilities
    self.probabilities = tf.nn.softmax(logits=self.logits, dim=-1)

    # Min epsilon probability for numerical stability
    self.probabilities = tf.maximum(x=self.probabilities, y=util.epsilon)

    # "Normalized" logits
    self.logits = tf.log(x=self.probabilities)
def tf_demo_loss(self, states, actions, terminal, reward, internals, update):
    embedding = self.network.apply(x=states, internals=internals, update=update)
    deltas = list()

    for name, distribution in self.distributions.items():
        distr_params = distribution.parameters(x=embedding)
        state_action_values = distribution.state_action_values(distr_params=distr_params)

        # Create the supervised margin loss
        # Zero for the action taken, one for all other actions, now multiply by expert margin
        if self.actions_spec[name]['type'] == 'bool':
            num_actions = 2
        else:
            num_actions = self.actions_spec[name]['num_actions']
        one_hot = tf.one_hot(indices=actions[name], depth=num_actions)
        ones = tf.ones_like(tensor=one_hot, dtype=tf.float32)
        inverted_one_hot = ones - one_hot

        # max_a([Q(s,a) + l(s,a_E,a)]), where l(s,a_E,a) is 0 for the expert action and the margin value for others
        expert_margin = distr_params + inverted_one_hot * self.expert_margin

        # J_E(Q) = max_a([Q(s,a) + l(s,a_E,a)]) - Q(s,a_E)
        supervised_selector = tf.reduce_max(input_tensor=expert_margin, axis=-1)
        delta = supervised_selector - state_action_values
        delta = tf.reshape(tensor=delta, shape=(-1, util.prod(self.actions_spec[name]['shape'])))
        deltas.append(delta)

    loss_per_instance = tf.reduce_mean(input_tensor=tf.concat(values=deltas, axis=1), axis=1)
    loss_per_instance = tf.square(x=loss_per_instance)
    return tf.reduce_mean(input_tensor=loss_per_instance, axis=0)
def __init__(self, shape, probability=0.5, scope='bernoulli', summary_labels=()):
    """
    Bernoulli distribution.

    Args:
        shape: Action shape.
        probability: Optional distribution bias.
    """
    self.shape = shape
    action_size = util.prod(self.shape)
    self.logit = Linear(size=action_size, bias=log(probability), scope='logit')

    super(Bernoulli, self).__init__(shape=shape, scope=scope, summary_labels=summary_labels)
def tf_compare(self, states, internals, actions, terminal, reward, update, reference):
    reward = self.fn_reward_estimation(states=states, internals=internals, terminal=terminal, reward=reward, update=update)

    embedding = self.network.apply(x=states, internals=internals, update=update)

    log_probs = list()
    for name in sorted(self.distributions):
        distribution = self.distributions[name]
        distr_params = distribution.parameterize(x=embedding)
        log_prob = distribution.log_probability(distr_params=distr_params, action=actions[name])
        collapsed_size = util.prod(util.shape(log_prob)[1:])
        log_prob = tf.reshape(tensor=log_prob, shape=(-1, collapsed_size))
        log_probs.append(log_prob)

    log_prob = tf.reduce_mean(input_tensor=tf.concat(values=log_probs, axis=1), axis=1)
    prob_ratio = tf.exp(x=(log_prob - reference))

    if self.likelihood_ratio_clipping is None:
        gain_per_instance = prob_ratio * reward
    else:
        clipped_prob_ratio = tf.clip_by_value(
            t=prob_ratio,
            clip_value_min=(1.0 / (1.0 + self.likelihood_ratio_clipping)),
            clip_value_max=(1.0 + self.likelihood_ratio_clipping)
        )
        gain_per_instance = tf.minimum(x=(prob_ratio * reward), y=(clipped_prob_ratio * reward))

    gain = tf.reduce_mean(input_tensor=gain_per_instance, axis=0)

    losses = self.fn_regularization_losses(states=states, internals=internals, update=update)
    if len(losses) > 0:
        gain -= tf.add_n(inputs=list(losses.values()))

    return gain
def __init__(self, shape, min_value, max_value, alpha=0.0, beta=0.0, scope='beta', summary_labels=()):
    """
    Beta distribution used for continuous actions. In particular, the Beta distribution
    allows bounding action values with min and max values.

    Args:
        shape: Shape of actions
        min_value: Min value of all actions for the given shape
        max_value: Max value of all actions for the given shape
        alpha: Concentration parameter of the Beta distribution
        beta: Concentration parameter of the Beta distribution
    """
    assert min_value is None or max_value > min_value
    self.shape = shape
    self.min_value = min_value
    self.max_value = max_value

    action_size = util.prod(self.shape)
    with tf.name_scope(name=scope):
        self.alpha = Linear(size=action_size, bias=alpha, scope='alpha')
        self.beta = Linear(size=action_size, bias=beta, scope='beta')

    super(Beta, self).__init__(scope, summary_labels)
def create_tf_operations(self, x, deterministic):
    # Flat logits
    flat_size = util.prod(self.shape) * self.num_actions
    self.logits = layers['linear'](x=x, size=flat_size, bias=self.logits)

    # Reshape logits to action shape
    shape = (-1,) + self.shape + (self.num_actions,)
    self.logits = tf.reshape(tensor=self.logits, shape=shape)

    # Linearly shift logits for numerical stability
    self.logits -= tf.reduce_max(input_tensor=self.logits, axis=-1, keep_dims=True)

    # Softmax for corresponding probabilities
    self.probabilities = tf.nn.softmax(logits=self.logits, dim=-1)

    # "Normalized" logits
    self.logits = tf.log(x=self.probabilities)

    # General distribution values
    self.distribution = (self.logits,)
    self.deterministic = deterministic
def tf_loss_per_instance(
    self, states, internals, actions, terminal, reward,
    next_states, next_internals, update, reference=None
):
    embedding = self.network.apply(x=states, internals=internals, update=update)

    log_probs = list()
    for name, distribution in self.distributions.items():
        distr_params = distribution.parameterize(x=embedding)
        log_prob = distribution.log_probability(distr_params=distr_params, action=actions[name])
        collapsed_size = util.prod(util.shape(log_prob)[1:])
        log_prob = tf.reshape(tensor=log_prob, shape=(-1, collapsed_size))
        log_probs.append(log_prob)

    log_prob = tf.reduce_mean(input_tensor=tf.concat(values=log_probs, axis=1), axis=1)
    return -log_prob * reward
def create_tf_operations(self, config):
    """
    Create training graph. For DQFD, we build the double-dqn training graph and
    modify the double_q_loss function according to eq. 5.

    Args:
        config: Config dict.

    Returns:

    """
    super(DQFDModel, self).create_tf_operations(config)

    with tf.name_scope('supervised-update'):
        deltas = list()
        for name, action in self.action.items():
            # Create the supervised margin loss
            # Zero for the action taken, one for all other actions, now multiply by expert margin
            one_hot = tf.one_hot(indices=action, depth=config.actions[name].num_actions)
            ones = tf.ones_like(tensor=one_hot, dtype=tf.float32)
            inverted_one_hot = ones - one_hot

            # max_a([Q(s,a) + l(s,a_E,a)]), where l(s,a_E,a) is 0 for the expert action and the margin value for others
            expert_margin = self.training_output[name] + inverted_one_hot * config.expert_margin

            # J_E(Q) = max_a([Q(s,a) + l(s,a_E,a)]) - Q(s,a_E)
            supervised_selector = tf.reduce_max(input_tensor=expert_margin, axis=-1)
            delta = supervised_selector - self.q_values[name]
            delta = tf.reshape(tensor=delta, shape=(-1, util.prod(config.actions[name].shape)))
            deltas.append(delta)

        delta = tf.reduce_mean(input_tensor=tf.concat(values=deltas, axis=1), axis=1)
        supervised_loss_per_instance = tf.square(delta)
        supervised_loss = tf.reduce_mean(input_tensor=supervised_loss_per_instance)

        # Combining double q loss with supervised loss
        dqfd_loss = self.q_loss + supervised_loss * config.supervised_weight
        self.dqfd_optimize = self.optimizer.minimize(dqfd_loss)
def tf_pg_loss_per_instance(self, states, internals, actions, terminal, reward, update):
    embedding = self.network.apply(x=states, internals=internals, update=update)
    prob_ratios = list()

    for name, distribution in self.distributions.items():
        distr_params = distribution.parameterize(x=embedding)
        log_prob = distribution.log_probability(distr_params=distr_params, action=actions[name])

        # works the same?
        # fixed_distr_params = tuple(tf.stop_gradient(input=x) for x in distr_params)
        # fixed_log_prob = distribution.log_probability(distr_params=fixed_distr_params, action=actions[name])
        fixed_log_prob = tf.stop_gradient(input=log_prob)

        prob_ratio = tf.exp(x=(log_prob - fixed_log_prob))
        collapsed_size = util.prod(util.shape(prob_ratio)[1:])
        prob_ratio = tf.reshape(tensor=prob_ratio, shape=(-1, collapsed_size))
        prob_ratios.append(prob_ratio)

    prob_ratio = tf.reduce_mean(input_tensor=tf.concat(values=prob_ratios, axis=1), axis=1)

    if self.likelihood_ratio_clipping is None:
        return -prob_ratio * reward
    else:
        clipped_prob_ratio = tf.clip_by_value(
            t=prob_ratio,
            clip_value_min=(1.0 / (1.0 + self.likelihood_ratio_clipping)),
            clip_value_max=(1.0 + self.likelihood_ratio_clipping)
        )
        return -tf.minimum(x=(prob_ratio * reward), y=(clipped_prob_ratio * reward))
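
# Illustrative NumPy sketch (hypothetical helper): the clipped surrogate
# objective used above. Given a probability ratio r = pi_new(a|s) / pi_old(a|s)
# and an advantage estimate A, the per-instance loss is
# -min(r * A, clip(r) * A), with the asymmetric clip range used above.
import numpy as np

def clipped_pg_loss(prob_ratio, advantage, clipping=0.2):
    clipped = np.clip(prob_ratio, 1.0 / (1.0 + clipping), 1.0 + clipping)
    return -np.minimum(prob_ratio * advantage, clipped * advantage)

# Example: a large ratio with positive advantage is capped at (1 + clipping) * A.
# clipped_pg_loss(np.array([2.0]), np.array([1.0]))  # -> array([-1.2])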
def create_tf_operations(self, state, scope='mlp_baseline'):
    with tf.variable_scope(scope) as scope:
        self.state = tf.placeholder(dtype=tf.float32, shape=(None, util.prod(state.shape)))
        self.returns = tf.placeholder(dtype=tf.float32, shape=(None,))

        layers = []
        for size in self.sizes:
            layers.append({'type': 'dense', 'size': size})
        layers.append({'type': 'linear', 'size': 1})

        network = NeuralNetwork(
            network_builder=layered_network_builder(layers),
            inputs=dict(state=self.state)
        )
        self.prediction = tf.squeeze(input=network.output, axis=1)

        loss = tf.nn.l2_loss(self.prediction - self.returns)
        optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)

        variables = tf.contrib.framework.get_variables(scope=scope)
        self.optimize = optimizer.minimize(loss, var_list=variables)
def tf_apply(self, x, update):
    return tf.reshape(tensor=x, shape=(-1, util.prod(util.shape(x)[1:])))
def flatten(x):
    with tf.variable_scope('flatten'):
        x = tf.reshape(tensor=x, shape=(-1, util.prod(x.get_shape().as_list()[1:])))
    return x
def create_tf_operations(self, config):
    super(CategoricalDQNModel, self).create_tf_operations(config)

    # Placeholders
    with tf.variable_scope('placeholder'):
        self.next_state = dict()
        for name, state in config.states.items():
            self.next_state[name] = tf.placeholder(
                dtype=util.tf_dtype(state.type),
                shape=(None,) + tuple(state.shape),
                name=name)

    # Setup constants delta_z and z. z represents the discretized scaling over vmin -> vmax
    scaling_increment = (self.distribution_max - self.distribution_min) / (self.num_atoms - 1)  # delta_z in the paper
    quantized_steps = self.distribution_min + np.arange(self.num_atoms) * scaling_increment  # z in the paper

    num_actions = {name: action.num_actions for name, action in config.actions}

    # Creating networks
    network_builder = util.get_function(fct=config.network)

    # Training network
    with tf.variable_scope('training') as training_scope:
        self.training_network = NeuralNetwork(
            network_builder=network_builder,
            inputs=self.state,
            summary_level=config.tf_summary_level)
        self.network_internal_index = len(self.internal_inputs)
        self.internal_inputs.extend(self.training_network.internal_inputs)
        self.internal_outputs.extend(self.training_network.internal_outputs)
        self.internal_inits.extend(self.training_network.internal_inits)

        training_output_logits, training_output_probabilities, training_qval, action_taken = self._create_action_outputs(
            self.training_network.output, quantized_steps, self.num_atoms, config, self.action, num_actions)

        # Stack to preserve action_taken shape like (batch_size, num_actions)
        for action in self.action:
            if len(action_taken[action]) > 1:
                self.action_taken[action] = tf.stack(action_taken[action], axis=1)
            else:
                self.action_taken[action] = action_taken[action][0]

            # Summarize expected reward histogram
            if config.tf_summary_level >= 1:
                for action_shaped in range(len(action_taken[action])):
                    for action_ind in range(num_actions[action]):
                        tf.summary.histogram(
                            '{}-{}-{}-output-distribution'.format(action, action_shaped, action_ind),
                            training_output_probabilities[action][action_shaped][:, action_ind] * quantized_steps)

        self.training_variables = tf.contrib.framework.get_variables(scope=training_scope)

    # Target network
    with tf.variable_scope('target') as target_scope:
        self.target_network = NeuralNetwork(network_builder=network_builder, inputs=self.next_state)
        self.next_internal_inputs = list(self.target_network.internal_inputs)

        _, target_output_probabilities, target_qval, target_action = self._create_action_outputs(
            self.target_network.output, quantized_steps, self.num_atoms, config, self.action, num_actions)

        self.target_variables = tf.contrib.framework.get_variables(scope=target_scope)

    with tf.name_scope('update'):
        # Broadcast rewards and discounted quantization. Shape (batchsize, num_atoms). T_z_j in the paper
        reward = tf.expand_dims(self.reward, axis=1)
        terminal = tf.expand_dims(tf.cast(x=self.terminal, dtype=tf.float32), axis=1)
        broadcasted_rewards = reward + (1.0 - terminal) * (quantized_steps * self.discount)

        # Clip into distribution_min, distribution_max
        quantized_discounted_reward = tf.clip_by_value(
            broadcasted_rewards, self.distribution_min, self.distribution_max)

        # Compute quantization indices. b, l, u in the paper
        closest_quantization = (quantized_discounted_reward - self.distribution_min) / scaling_increment
        lower_ind = tf.floor(closest_quantization)
        upper_ind = tf.ceil(closest_quantization)

        # Create shared selections for later use
        dynamic_batch_size = tf.shape(self.reward)[0]
        batch_selection = tf.range(0, dynamic_batch_size)

        # tile expects a tensor of same shape, we are just repeating the selection num_atoms times across the last dimension
        batch_tiled_selection = tf.reshape(
            tf.tile(tf.reshape(batch_selection, (-1, 1)), [1, self.num_atoms]), [-1])

        # Combine with lower and upper ind, same as zip(flatten(batch_tiled_selection), flatten(lower_ind))
        # also cast to int32 to use as index
        batch_lower_inds = tf.stack(
            (batch_tiled_selection, tf.reshape(tf.cast(lower_ind, tf.int32), [-1])), axis=1)
        batch_upper_inds = tf.stack(
            (batch_tiled_selection, tf.reshape(tf.cast(upper_ind, tf.int32), [-1])), axis=1)

        # Create loss for each action
        for action in self.action:
            # If shape of action != () we need to process each action head separately
            for action_ind in range(max([util.prod(config.actions[action].shape), 1])):
                # Project onto the supports
                # TensorFlow indexing is still not great, we stack these two and use gather_nd later
                target_batch_action_selection = tf.stack(
                    (batch_selection, target_action[action][action_ind]), axis=1)

                # Distribute probability scaled by distance
                # In numpy the equivalent is target_output_probabilities[action][batch_selection, target_action]
                target_probabilities_of_action = tf.gather_nd(
                    target_output_probabilities[action][action_ind], target_batch_action_selection)

                distance_lower = target_probabilities_of_action * (closest_quantization - lower_ind)
                distance_upper = target_probabilities_of_action * (upper_ind - closest_quantization)

                # Sum distances aligned into quantized bins. m in the paper
                # scatter_nd actually sums the values into a zeros tensor instead of overwriting
                # this is pretty much a huge hack, refer to https://github.com/tensorflow/tensorflow/issues/8102
                target_quantized_probabilities_lower = tf.scatter_nd(
                    batch_lower_inds, tf.reshape(distance_lower, [-1]),
                    (dynamic_batch_size, self.num_atoms))
                target_quantized_probabilities_upper = tf.scatter_nd(
                    batch_upper_inds, tf.reshape(distance_upper, [-1]),
                    (dynamic_batch_size, self.num_atoms))

                # No gradient should flow back to the target network
                target_quantized_probabilities = tf.stop_gradient(
                    target_quantized_probabilities_lower + target_quantized_probabilities_upper)

                # We must check if input action has shape
                if len(self.action[action].shape) > 1:
                    input_action = self.action[action][:, action_ind]
                else:
                    input_action = self.action[action]

                # Now we have target probabilities, the loss is categorical cross entropy using logits,
                # compared to the actions we actually took
                training_action_selection = tf.stack((batch_selection, input_action), axis=1)
                probabilities_for_action = tf.gather_nd(
                    training_output_probabilities[action][action_ind], training_action_selection)

                self.loss_per_instance = -tf.reduce_sum(
                    target_quantized_probabilities * tf.log(probabilities_for_action + util.epsilon),
                    axis=-1)
                loss = tf.reduce_mean(self.loss_per_instance)
                tf.losses.add_loss(loss)
                tf.summary.scalar('cce-loss-{}-{}'.format(action, action_ind), loss)

    # Update target network
    with tf.name_scope("update_target"):
        self.target_network_update = list()
        for v_source, v_target in zip(self.training_variables, self.target_variables):
            update = v_target.assign_sub(config.update_target_weight * (v_target - v_source))
            self.target_network_update.append(update)
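
# Illustrative NumPy sketch (hypothetical helper): the categorical projection
# step for a single transition, following the C51 paper. The target support
# z is shifted by r + gamma * z, clipped to [v_min, v_max], and each atom's
# probability mass is split between the two nearest atoms l and u according
# to its fractional index b.
import numpy as np

def project_distribution(probs, reward, terminal, v_min=-10.0, v_max=10.0,
                         num_atoms=51, discount=0.99):
    delta_z = (v_max - v_min) / (num_atoms - 1)
    z = v_min + np.arange(num_atoms) * delta_z
    t_z = np.clip(reward + (1.0 - float(terminal)) * discount * z, v_min, v_max)
    b = (t_z - v_min) / delta_z                      # fractional atom index
    lower, upper = np.floor(b).astype(int), np.ceil(b).astype(int)
    m = np.zeros(num_atoms)
    np.add.at(m, lower, probs * (upper - b))         # mass assigned to the lower atom
    np.add.at(m, upper, probs * (b - lower))         # mass assigned to the upper atom
    # Note: when b lands exactly on an atom, lower == upper and the split mass
    # cancels; production code handles this edge case explicitly.
    return m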
def create_tf_operations(self, config):
    super(NAFModel, self).create_tf_operations(config)

    num_actions = sum(util.prod(config.actions[name].shape) for name in sorted(self.action))

    # Get hidden layers from network generator, then add NAF outputs, same for target network
    with tf.variable_scope('training'):
        network_builder = util.get_function(fct=config.network)
        self.training_network = NeuralNetwork(network_builder=network_builder, inputs=self.state)
        self.internal_inputs.extend(self.training_network.internal_inputs)
        self.internal_outputs.extend(self.training_network.internal_outputs)
        self.internal_inits.extend(self.training_network.internal_inits)

        with tf.variable_scope('training_outputs') as scope:
            # Action outputs
            flat_mean = layers['linear'](x=self.training_network.output, size=num_actions)
            n = 0
            for name in sorted(self.action):
                shape = config.actions[name].shape
                self.action_taken[name] = tf.reshape(
                    tensor=flat_mean[:, n:n + util.prod(shape)],
                    shape=((-1,) + shape))
                n += util.prod(shape)

            # Advantage computation
            # Network outputs entries of lower triangular matrix L
            lower_triangular_size = num_actions * (num_actions + 1) // 2
            l_entries = layers['linear'](x=self.training_network.output, size=lower_triangular_size)

            l_matrix = tf.exp(x=tf.map_fn(fn=tf.diag, elems=l_entries[:, :num_actions]))

            if num_actions > 1:
                offset = num_actions
                l_columns = list()
                for zeros, size in enumerate(xrange(num_actions - 1, -1, -1), 1):
                    column = tf.pad(tensor=l_entries[:, offset:offset + size], paddings=((0, 0), (zeros, 0)))
                    l_columns.append(column)
                    offset += size
                l_matrix += tf.stack(values=l_columns, axis=1)

            # P = LL^T
            p_matrix = tf.matmul(a=l_matrix, b=tf.transpose(a=l_matrix, perm=(0, 2, 1)))

            flat_action = list()
            for name in sorted(self.action):
                shape = config.actions[name].shape
                flat_action.append(tf.reshape(tensor=self.action[name], shape=(-1, util.prod(shape))))
            flat_action = tf.concat(values=flat_action, axis=1)
            difference = flat_action - flat_mean

            # A = -0.5 (a - mean)P(a - mean)
            advantage = tf.matmul(a=p_matrix, b=tf.expand_dims(input=difference, axis=2))
            advantage = tf.matmul(a=tf.expand_dims(input=difference, axis=1), b=advantage)
            advantage = tf.squeeze(input=(-advantage / 2.0), axis=2)

            # Q = A + V
            # State-value function
            value = layers['linear'](x=self.training_network.output, size=num_actions)
            q_value = value + advantage

        training_output_vars = tf.contrib.framework.get_variables(scope=scope)

    with tf.variable_scope('target'):
        network_builder = util.get_function(fct=config.network)
        self.target_network = NeuralNetwork(network_builder=network_builder, inputs=self.state)
        self.internal_inputs.extend(self.target_network.internal_inputs)
        self.internal_outputs.extend(self.target_network.internal_outputs)
        self.internal_inits.extend(self.target_network.internal_inits)

        with tf.variable_scope('target_outputs') as scope:
            # State-value function
            target_value = layers['linear'](x=self.target_network.output, size=num_actions)

        target_output_vars = tf.contrib.framework.get_variables(scope=scope)

    with tf.name_scope('update'):
        reward = tf.expand_dims(input=self.reward[:-1], axis=1)
        terminal = tf.expand_dims(input=tf.cast(x=self.terminal[:-1], dtype=tf.float32), axis=1)
        q_target = reward + (1.0 - terminal) * config.discount * target_value[1:]
        delta = q_target - q_value[:-1]
        delta = tf.reduce_mean(input_tensor=delta, axis=1)
        self.loss_per_instance = tf.square(x=delta)

        # We observe issues with numerical stability in some tests, gradient clipping can help
        if config.clip_gradients > 0.0:
            huber_loss = tf.where(
                condition=(tf.abs(delta) < config.clip_gradients),
                x=(0.5 * self.loss_per_instance),
                y=(tf.abs(delta) - 0.5))
            loss = tf.reduce_mean(input_tensor=huber_loss, axis=0)
        else:
            loss = tf.reduce_mean(input_tensor=self.loss_per_instance, axis=0)
        tf.losses.add_loss(loss)

    with tf.name_scope('update_target'):
        # Combine hidden layer variables and output layer variables
        training_vars = self.training_network.variables + training_output_vars
        target_vars = self.target_network.variables + target_output_vars

        self.target_network_update = list()
        for v_source, v_target in zip(training_vars, target_vars):
            update = v_target.assign_sub(config.update_target_weight * (v_target - v_source))
            self.target_network_update.append(update)
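
# Illustrative NumPy sketch (hypothetical helper): the Huber-style loss used
# above when gradient clipping is enabled - quadratic for small deltas, linear
# beyond the threshold. The expression mirrors the code above, which matches
# the standard Huber loss when the threshold is 1.0.
import numpy as np

def huber_loss(delta, threshold=1.0):
    return np.where(np.abs(delta) < threshold,
                    0.5 * np.square(delta),
                    np.abs(delta) - 0.5)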
def act(self, states, deterministic=False):
    """
    Return action(s) for given state(s). First, the states are preprocessed using the given preprocessing
    configuration. Then, the states are passed to the model to calculate the desired action(s) to execute.

    After obtaining the actions, exploration might be added by the agent, depending on the exploration
    configuration.

    Args:
        states: One state (usually a value tuple) or dict of states if multiple states are expected.
        deterministic: If true, no exploration and sampling is applied.

    Returns:
        Scalar value of the action or dict of multiple actions the agent wants to execute.
    """
    self.current_internals = self.next_internals

    if self.unique_state:
        self.current_states = dict(state=np.asarray(states))
    else:
        self.current_states = {name: np.asarray(state) for name, state in states.items()}

    # Preprocessing
    for name, preprocessing in self.preprocessing.items():
        self.current_states[name] = preprocessing.process(state=self.current_states[name])

    # Retrieve action
    self.current_actions, self.next_internals, self.timestep = self.model.act(
        states=self.current_states,
        internals=self.current_internals,
        deterministic=deterministic
    )

    # Exploration
    if not deterministic:
        for name, exploration in self.exploration.items():
            if self.actions_spec[name]['type'] == 'bool':
                if random() < exploration(episode=self.episode, timestep=self.timestep):
                    shape = self.actions_spec[name]['shape']
                    self.current_actions[name] = (np.random.random_sample(size=shape) < 0.5)

            elif self.actions_spec[name]['type'] == 'int':
                if random() < exploration(episode=self.episode, timestep=self.timestep):
                    shape = self.actions_spec[name]['shape']
                    num_actions = self.actions_spec[name]['num_actions']
                    self.current_actions[name] = np.random.randint(low=num_actions, size=shape)

            elif self.actions_spec[name]['type'] == 'float':
                explore = (lambda: exploration(episode=self.episode, timestep=self.timestep))
                shape = self.actions_spec[name]['shape']
                exploration = np.array([explore() for _ in xrange(util.prod(shape))])

                if 'min_value' in self.actions_spec[name]:
                    exploration = np.clip(
                        a=exploration,
                        a_min=self.actions_spec[name]['min_value'],
                        a_max=self.actions_spec[name]['max_value']
                    )

                self.current_actions[name] += np.reshape(exploration, shape)

    if self.unique_action:
        return self.current_actions['action']
    else:
        return self.current_actions
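
# Illustrative NumPy sketch (hypothetical helper): the epsilon-greedy scheme
# applied above for discrete ('int') actions - with probability epsilon the
# greedy action is replaced by a uniformly random one.
import numpy as np

def epsilon_greedy(greedy_action, num_actions, epsilon, rng=np.random):
    if rng.random_sample() < epsilon:
        return rng.randint(num_actions)
    return greedy_action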
def processed_shape(self, shape):
    if shape[0] == -1:
        return -1, util.prod(shape[1:])
    return util.prod(shape),
def create_tf_operations(self, config):
    """
    Creates PPO training operations, i.e. the SGD update based on the trust region loss.
    :return:
    """
    super(PPOModel, self).create_tf_operations(config)

    with tf.variable_scope('update'):
        prob_ratios = list()
        entropy_penalties = list()

        # For diagnostics
        kl_divergences = list()
        entropies = list()

        self.distribution_tensors = dict()
        self.prev_distribution_tensors = dict()

        for name, action in self.action.items():
            shape_size = util.prod(config.actions[name].shape)
            distribution = self.distribution[name]
            fixed_distribution = distribution.__class__.from_tensors(
                tensors=[tf.stop_gradient(x) for x in distribution.get_tensors()],
                deterministic=self.deterministic
            )

            # Standard policy gradient log likelihood computation
            log_prob = distribution.log_probability(action=action)
            fixed_log_prob = fixed_distribution.log_probability(action=action)
            log_prob_diff = log_prob - fixed_log_prob
            prob_ratio = tf.exp(x=log_prob_diff)
            prob_ratio = tf.reshape(tensor=prob_ratio, shape=(-1, shape_size))
            prob_ratios.append(prob_ratio)

            entropy = distribution.entropy()
            entropy_penalty = -config.entropy_penalty * entropy
            entropy_penalty = tf.reshape(tensor=entropy_penalty, shape=(-1, shape_size))
            entropy_penalties.append(entropy_penalty)

            self.distribution_tensors[name] = list(distribution.get_tensors())
            prev_distribution = list(
                tf.placeholder(dtype=tf.float32, shape=util.shape(tensor, unknown=None))
                for tensor in distribution.get_tensors()
            )
            self.prev_distribution_tensors[name] = prev_distribution
            prev_distribution = distribution.from_tensors(
                tensors=prev_distribution, deterministic=self.deterministic)

            kl_divergence = prev_distribution.kl_divergence(other=distribution)
            kl_divergence = tf.reshape(tensor=kl_divergence, shape=(-1, shape_size))
            kl_divergences.append(kl_divergence)

            entropy = tf.reshape(tensor=entropy, shape=(-1, shape_size))
            entropies.append(entropy)

        # The surrogate loss in PPO is the minimum of clipped loss and
        # target advantage * prob_ratio, which is the CPO loss
        # Presentation on conservative policy iteration:
        # https://www.cs.cmu.edu/~jcl/presentation/RL/RL.ps
        prob_ratio = tf.reduce_mean(input_tensor=tf.concat(values=prob_ratios, axis=1), axis=1)
        tf.summary.histogram('prob_ratio', prob_ratio)
        tf.summary.scalar('mean_prob_ratio', tf.reduce_mean(input_tensor=prob_ratio, axis=0))

        clipped_prob_ratio = tf.clip_by_value(prob_ratio, 1.0 - config.loss_clipping, 1.0 + config.loss_clipping)
        self.loss_per_instance = -tf.minimum(x=(prob_ratio * self.reward), y=(clipped_prob_ratio * self.reward))
        self.surrogate_loss = tf.reduce_mean(input_tensor=self.loss_per_instance, axis=0, name='surrogate_loss')
        tf.losses.add_loss(self.surrogate_loss)

        # Mean over actions, mean over batch
        entropy_penalty = tf.reduce_mean(input_tensor=tf.concat(values=entropy_penalties, axis=1), axis=1)
        self.entropy_penalty = tf.reduce_mean(input_tensor=entropy_penalty, axis=0, name='entropy_penalty')
        tf.losses.add_loss(self.entropy_penalty)

        kl_divergence = tf.reduce_mean(input_tensor=tf.concat(values=kl_divergences, axis=1), axis=1)
        self.kl_divergence = tf.reduce_mean(input_tensor=kl_divergence, axis=0)
        tf.summary.scalar('kl_divergence', self.kl_divergence)

        entropy = tf.reduce_mean(input_tensor=tf.concat(values=entropies, axis=1), axis=1)
        self.entropy = tf.reduce_mean(input_tensor=entropy, axis=0)
        tf.summary.scalar('entropy', self.entropy)
def tf_loss_per_instance(self, states, internals, actions, terminal, reward, update):
    # TEMP: Random sampling fix
    if self.random_sampling_fix:
        next_states = self.get_states(states=self.next_state_inputs)
        next_states = {name: tf.stop_gradient(input=state) for name, state in next_states.items()}

        embedding, next_internals = self.network.apply(
            x=states, internals=internals, update=update, return_internals=True)

        # Both networks can use the same internals, could that be a problem?
        # Otherwise need to handle internals indices correctly everywhere
        target_embedding = self.target_network.apply(
            x=next_states, internals=next_internals, update=update)

    else:
        embedding = self.network.apply(
            x={name: state[:-1] for name, state in states.items()},
            internals=[internal[:-1] for internal in internals],
            update=update)

        # Both networks can use the same internals, could that be a problem?
        # Otherwise need to handle internals indices correctly everywhere
        target_embedding = self.target_network.apply(
            x={name: state[1:] for name, state in states.items()},
            internals=[internal[1:] for internal in internals],
            update=update)

        actions = {name: action[:-1] for name, action in actions.items()}
        terminal = terminal[:-1]
        reward = reward[:-1]

    deltas = list()
    for name, distribution in self.distributions.items():
        target_distribution = self.target_distributions[name]

        distr_params = distribution.parameterize(x=embedding)
        target_distr_params = target_distribution.parameterize(x=target_embedding)

        q_value = self.tf_q_value(embedding=embedding, distr_params=distr_params, action=actions[name], name=name)

        if self.double_q_model:
            action_taken = distribution.sample(distr_params=distr_params, deterministic=True)
        else:
            action_taken = target_distribution.sample(distr_params=target_distr_params, deterministic=True)

        next_q_value = target_distribution.state_action_value(
            distr_params=target_distr_params, action=action_taken)

        delta = self.tf_q_delta(q_value=q_value, next_q_value=next_q_value, terminal=terminal, reward=reward)

        collapsed_size = util.prod(util.shape(delta)[1:])
        delta = tf.reshape(tensor=delta, shape=(-1, collapsed_size))
        deltas.append(delta)

    # Surrogate loss as the mean squared error between actual observed rewards and expected rewards
    loss_per_instance = tf.reduce_mean(input_tensor=tf.concat(values=deltas, axis=1), axis=1)

    # Optional Huber loss
    if self.huber_loss is not None and self.huber_loss > 0.0:
        return tf.where(
            condition=(tf.abs(x=loss_per_instance) <= self.huber_loss),
            x=(0.5 * tf.square(x=loss_per_instance)),
            y=(self.huber_loss * (tf.abs(x=loss_per_instance) - 0.5 * self.huber_loss)))
    else:
        return tf.square(x=loss_per_instance)
def create_tf_operations(self, config):
    """
    Creates TRPO training operations, i.e. the natural gradient update step
    based on the KL divergence constraint between new and old policy.
    :return:
    """
    super(TRPOModel, self).create_tf_operations(config)

    with tf.variable_scope('update'):
        losses = list()
        for name, action in config.actions:
            distribution = self.distribution[name]
            previous_distribution = tuple(
                tf.placeholder(dtype=tf.float32, shape=util.shape(x, unknown=None))
                for x in distribution)
            self.internal_inputs.extend(previous_distribution)
            self.internal_outputs.extend(distribution)

            if sum(1 for _ in distribution) == 2:
                for n, x in enumerate(distribution):
                    if n == 0:
                        self.internal_inits.append(np.zeros(shape=util.shape(x)[1:]))
                    else:
                        self.internal_inits.append(np.ones(shape=util.shape(x)[1:]))
            else:
                self.internal_inits.extend(np.zeros(shape=util.shape(x)[1:]) for x in distribution)

            previous_distribution = self.distribution[name].__class__(distribution=previous_distribution)

            log_prob = distribution.log_probability(action=self.action[name])
            previous_log_prob = previous_distribution.log_probability(action=self.action[name])
            prob_ratio = tf.minimum(tf.exp(log_prob - previous_log_prob), 1000)
            self.loss_per_instance = tf.multiply(x=prob_ratio, y=self.reward)
            surrogate_loss = -tf.reduce_mean(self.loss_per_instance, axis=0)
            kl_divergence = distribution.kl_divergence(previous_distribution)
            entropy = distribution.entropy()
            losses.append((surrogate_loss, kl_divergence, entropy))

        self.losses = [tf.reduce_mean(loss) for loss in zip(*losses)]

        # Get symbolic gradient expressions
        variables = list(tf.trainable_variables())  # TODO: ideally not value function (see also for "gradients" below)
        gradients = tf.gradients(self.losses, variables)
        variables = [var for var, grad in zip(variables, gradients) if grad is not None]
        gradients = [grad for grad in gradients if grad is not None]
        self.policy_gradient = tf.concat(values=[tf.reshape(grad, (-1,)) for grad in gradients], axis=0)
        # util.prod(util.shape(v))

        fixed_distribution = distribution.__class__([tf.stop_gradient(x) for x in distribution])
        fixed_kl_divergence = fixed_distribution.kl_divergence(distribution)

        self.tangent = tf.placeholder(tf.float32, shape=(None,))
        offset = 0
        tangents = []
        for variable in variables:
            shape = util.shape(variable)
            size = util.prod(shape)
            tangents.append(tf.reshape(self.tangent[offset:offset + size], shape))
            offset += size

        gradients = tf.gradients(fixed_kl_divergence, variables)
        gradient_vector_product = [tf.reduce_sum(g * t) for (g, t) in zip(gradients, tangents)]

        self.flat_variable_helper = FlatVarHelper(variables)
        gradients = tf.gradients(gradient_vector_product, variables)
        self.fisher_vector_product = tf.concat(values=[tf.reshape(grad, (-1,)) for grad in gradients], axis=0)
        self.cg_optimizer = ConjugateGradientOptimizer(self.logger, config.cg_iterations)
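
# Illustrative NumPy sketch (hypothetical helper): how a Fisher-vector product
# like the one built above is typically used. Conjugate gradient solves
# F x = g for the natural gradient step without ever forming the Fisher
# matrix F; only products F v are required.
import numpy as np

def conjugate_gradient(fisher_vector_product, g, iterations=10, tol=1e-10):
    x = np.zeros_like(g)
    r = g.copy()          # residual g - F x, with x = 0
    p = g.copy()          # search direction
    r_dot_r = r.dot(r)
    for _ in range(iterations):
        fvp = fisher_vector_product(p)
        alpha = r_dot_r / (p.dot(fvp) + 1e-8)
        x += alpha * p
        r -= alpha * fvp
        new_r_dot_r = r.dot(r)
        if new_r_dot_r < tol:
            break
        p = r + (new_r_dot_r / r_dot_r) * p
        r_dot_r = new_r_dot_r
    return x

# Example with a small explicit positive definite matrix standing in for F:
# F = np.array([[2.0, 0.0], [0.0, 1.0]])
# conjugate_gradient(lambda v: F @ v, np.array([2.0, 1.0]))  # -> approx. [1.0, 1.0]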
def create_tf_operations(self, config):
    super(QModel, self).create_tf_operations(config)

    # Placeholders
    with tf.variable_scope('placeholder'):
        self.next_state = dict()
        for name, state in config.states.items():
            self.next_state[name] = tf.placeholder(
                dtype=util.tf_dtype(state.type),
                shape=(None,) + tuple(state.shape),
                name=name)

    network_builder = util.get_function(fct=config.network)

    # Training network
    with tf.variable_scope('training') as training_scope:
        self.training_network = NeuralNetwork(network_builder=network_builder, inputs=self.state)
        self.internal_inputs.extend(self.training_network.internal_inputs)
        self.internal_outputs.extend(self.training_network.internal_outputs)
        self.internal_inits.extend(self.training_network.internal_inits)
        self.q_values = self.create_training_operations(config)
        self.training_variables = tf.contrib.framework.get_variables(scope=training_scope)

    # Target network
    with tf.variable_scope('target') as target_scope:
        self.target_network = NeuralNetwork(network_builder=network_builder, inputs=self.next_state)
        self.internal_inputs.extend(self.target_network.internal_inputs)
        self.internal_outputs.extend(self.target_network.internal_outputs)
        self.internal_inits.extend(self.target_network.internal_inits)
        self.target_values = self.create_target_operations(config)
        self.target_variables = tf.contrib.framework.get_variables(scope=target_scope)

    with tf.name_scope('update'):
        deltas = list()
        terminal_float = tf.cast(x=self.terminal, dtype=tf.float32)

        for name, action in self.action.items():
            reward = self.reward
            terminal = terminal_float
            for _ in range(len(config.actions[name].shape)):
                reward = tf.expand_dims(input=reward, axis=1)
                terminal = tf.expand_dims(input=terminal, axis=1)

            q_target = reward + (1.0 - terminal) * config.discount * self.target_values[name]
            delta = tf.stop_gradient(q_target) - self.q_values[name]
            delta = tf.reshape(tensor=delta, shape=(-1, util.prod(config.actions[name].shape)))
            deltas.append(delta)

        # Surrogate loss as the mean squared error between actual observed rewards and expected rewards
        delta = tf.reduce_mean(input_tensor=tf.concat(values=deltas, axis=1), axis=1)
        self.loss_per_instance = tf.square(delta)

        # If loss clipping is used, calculate the huber loss
        if config.clip_loss > 0.0:
            huber_loss = tf.where(
                condition=(tf.abs(delta) < config.clip_gradients),
                x=(0.5 * self.loss_per_instance),
                y=(tf.abs(delta) - 0.5))
            self.q_loss = tf.reduce_mean(input_tensor=huber_loss, axis=0)
        else:
            self.q_loss = tf.reduce_mean(input_tensor=self.loss_per_instance, axis=0)
        tf.losses.add_loss(self.q_loss)

    # Update target network
    with tf.name_scope('update-target'):
        self.target_network_update = list()
        for v_source, v_target in zip(self.training_variables, self.target_variables):
            update = v_target.assign_sub(config.update_target_weight * (v_target - v_source))
            self.target_network_update.append(update)
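
# Illustrative NumPy sketch (hypothetical helper): the soft target-network
# update performed in the 'update-target' scope above, i.e.
# theta_target <- theta_target - tau * (theta_target - theta_source),
# equivalently theta_target <- (1 - tau) * theta_target + tau * theta_source.
import numpy as np

def soft_target_update(source_params, target_params, tau=0.001):
    return [t - tau * (t - s) for s, t in zip(source_params, target_params)]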