Example no. 1
  def _build_train_op(self):
    """Builds the training op for Rainbow.

    Returns:
      A tuple of (train_op, weighted_loss), where train_op performs one step
      of training on the importance-weighted loss.
    """
    target_distribution = tf.stop_gradient(self._build_target_distribution())

    # size of indices: batch_size x 1.
    indices = tf.range(tf.shape(self._replay_logits)[0])[:, None]
    # size of reshaped_actions: batch_size x 2.
    reshaped_actions = tf.concat([indices, self._replay.actions[:, None]], 1)
    # For each element of the batch, fetch the logits for its selected action.
    chosen_action_logits = tf.gather_nd(self._replay_logits, reshaped_actions)

    loss = tf.nn.softmax_cross_entropy_with_logits(
        labels=target_distribution,
        logits=chosen_action_logits)

    optimizer = tf.train.AdamOptimizer(
        learning_rate=self.learning_rate,
        epsilon=self.optimizer_epsilon)

    update_priorities_op = self._replay.tf_set_priority(
        self._replay.indices, tf.sqrt(loss + 1e-10))

    target_priorities = self._replay.tf_get_priority(self._replay.indices)
    target_priorities = tf.math.add(target_priorities, 1e-10)
    target_priorities = 1.0 / tf.sqrt(target_priorities)
    target_priorities /= tf.reduce_max(target_priorities)

    weighted_loss = target_priorities * loss

    with tf.control_dependencies([update_priorities_op]):
      return optimizer.minimize(tf.reduce_mean(weighted_loss)), weighted_loss
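The priority update and importance weighting above follow the prioritized-replay pattern: new priorities are set to sqrt(loss + eps), and each sample's loss is down-weighted by 1/sqrt(priority), normalized by the largest weight in the batch. A minimal NumPy sketch of that weighting (values are illustrative; the real code reads priorities back from the replay buffer):

import numpy as np

loss = np.array([0.5, 0.1, 2.0])              # per-sample losses
priorities = np.sqrt(loss + 1e-10)            # priorities written to the buffer
weights = 1.0 / np.sqrt(priorities + 1e-10)   # importance weights ~ priority**-0.5
weights /= weights.max()                      # largest weight is 1
weighted_loss = weights * loss                # the optimizer minimizes its mean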
Example no. 2
    def _build_train_op(self):
        """Builds the training op for Rainbow.

        Returns:
          A tuple of (train_op, weighted_loss), where train_op performs one
          step of training on the importance-weighted loss.
        """

        replay_action_one_hot = tf.one_hot(self._replay.actions,
                                           self.num_actions,
                                           1.,
                                           0.,
                                           name='action_one_hot')
        replay_chosen_q = tf.reduce_sum(self._replay_qs *
                                        replay_action_one_hot,
                                        axis=1,
                                        name='replay_chosen_q')

        target = tf.stop_gradient(self._build_target_q_op())
        loss = tf.losses.huber_loss(target,
                                    replay_chosen_q,
                                    reduction=tf.losses.Reduction.NONE)

        update_priorities_op = self._replay.tf_set_priority(
            self._replay.indices, tf.sqrt(loss + 1e-10))

        target_priorities = self._replay.tf_get_priority(self._replay.indices)
        target_priorities = tf.math.add(target_priorities, 1e-10)
        target_priorities = 1.0 / tf.sqrt(target_priorities)
        target_priorities /= tf.reduce_max(target_priorities)

        weighted_loss = target_priorities * loss

        with tf.control_dependencies([update_priorities_op]):
            return self.optimizer.minimize(
                tf.reduce_mean(weighted_loss)), weighted_loss
Example no. 3
def histogram_loss(y, y_hat, k=100, sigma=1 / 2):
    # Soft histogram cross-entropy: spread each label y over k bins on [0, 1]
    # with Gaussian(y, sigma^2) mass, then score y_hat's log-probabilities.
    ps = 0.
    w = 1 / k
    y = tf.squeeze(y, axis=2)
    # y_hat = tf.layers.flatten(y_hat)
    bins = np.linspace(0., 1., k)
    s = (tf.erf((1. - y) / (tf.sqrt(2.) * sigma)) - tf.erf((0. - y) / (tf.sqrt(2.) * sigma)))
    for idx, j in enumerate(bins):
        u = tf.erf((j + w - y) / (tf.sqrt(2.) * sigma))
        l = tf.erf((j - y) / (tf.sqrt(2.) * sigma))
        p = (u - l) / (2 * s + 1e-6)
        f_x = tf.log(y_hat[:, :, idx])
        ps += p * tf.where(tf.is_nan(f_x), tf.zeros_like(f_x), f_x)
    return tf.reduce_mean(-ps)
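For reference, the per-bin targets built inside the loop above are differences of erf terms, i.e. (up to the normalizer s) the Gaussian mass of N(y, sigma^2) falling into each bin of width w on [0, 1]. A small NumPy/SciPy sketch of those targets for a scalar label (values are illustrative):

import numpy as np
from scipy.special import erf

y, sigma, k = 0.37, 0.5, 100
w = 1.0 / k
edges = np.linspace(0.0, 1.0, k)
s = erf((1.0 - y) / (np.sqrt(2.0) * sigma)) - erf((0.0 - y) / (np.sqrt(2.0) * sigma))
p = (erf((edges + w - y) / (np.sqrt(2.0) * sigma))
     - erf((edges - y) / (np.sqrt(2.0) * sigma))) / (2 * s + 1e-6)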
Example no. 4
def batched_euclidean_distance(y_hat, y, squared=True):
    """Pairwise (squared) Euclidean distances between two batches of point sets."""
    assert y_hat.get_shape().ndims == 3 and y.get_shape().ndims == 3
    # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 <a, b>, computed batch-wise.
    a = tf.reduce_sum(tf.square(y), axis=2)[:, :, None]
    b = tf.reduce_sum(tf.square(y_hat), axis=2)[:, None, :]
    D = tf.matmul(y, y_hat, transpose_b=True)
    d = a + b - 2 * D
    if squared:
        return d
    # Clamp tiny negative values from floating-point error before the sqrt.
    return tf.sqrt(tf.maximum(d, 0.))
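A quick NumPy cross-check of the expansion used above, ||a - b||^2 = ||a||^2 + ||b||^2 - 2<a, b>, against a brute-force computation (shapes [batch, n, d] and [batch, m, d] are assumed for illustration):

import numpy as np

rng = np.random.default_rng(0)
y = rng.normal(size=(2, 4, 3))        # [batch, n, d]
y_hat = rng.normal(size=(2, 5, 3))    # [batch, m, d]
a = np.sum(y ** 2, axis=2)[:, :, None]
b = np.sum(y_hat ** 2, axis=2)[:, None, :]
d = a + b - 2 * np.einsum('bnd,bmd->bnm', y, y_hat)
brute = np.sum((y[:, :, None, :] - y_hat[:, None, :, :]) ** 2, axis=-1)
assert np.allclose(d, brute)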
Example no. 5
def regr_metrics(y, y_hat):
    regr_ops = {
        'mse': mse(y, y_hat),
        'mae': mae(y, y_hat),
        'smape': smape(y, y_hat),
        'rmse': tf.sqrt(mse(y, y_hat))
    }
    return regr_ops
Example no. 6
    def _reg(cls, batch_size, d, x, x_fake, beta=1e-1):
        """WGAN-GP style gradient penalty on points interpolated between x and x_fake."""
        alpha = tf.random_uniform(shape=[batch_size, 1], minval=0., maxval=1.)
        interpolates = alpha * x + (1 - alpha) * x_fake
        int_d = d(interpolates)
        gradients = tf.gradients(int_d, [interpolates])[0]

        slopes = tf.sqrt(
            tf.reduce_sum(tf.square(gradients), axis=[1]))
        return beta * tf.reduce_mean((slopes - 1)**2)
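The function above is a gradient penalty in the WGAN-GP style: it penalizes the L2 norm of the critic's gradient at interpolated points for deviating from 1. A NumPy illustration with a linear critic whose gradient norm is known exactly (names and values are illustrative):

import numpy as np

beta = 1e-1
w = np.array([0.6, 0.8])                               # linear critic d(x) = x @ w; gradient is w
slopes = np.linalg.norm(np.tile(w, (4, 1)), axis=1)    # ||w|| = 1 for every sample
print(beta * np.mean((slopes - 1.0) ** 2))             # 0.0: unit-norm gradients incur no penalty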
Example no. 7
def update(i, grad, state):
    i = tf.cast(i, dtype=tf.float32)
    x, m, v = state
    m = (1. - b1) * grad + b1 * m  # First moment estimate.
    v = (1. - b2) * (grad**2.) + b2 * v  # Second moment estimate.
    mhat = m / (1. - b1**(i + 1.))  # Bias correction.
    vhat = v / (1. - b2**(i + 1.))
    x = x - learning_rate * mhat / (tf.sqrt(vhat) + eps)
    return x, m, v
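The update above is one bias-corrected Adam step; b1, b2, learning_rate and eps are free variables of the enclosing scope in this excerpt. A NumPy sketch of the same step with typical values assumed:

import numpy as np

b1, b2, learning_rate, eps = 0.9, 0.999, 1e-3, 1e-8   # assumed hyperparameters
i, grad = 0.0, np.array([0.3, -0.7])
x, m, v = np.zeros(2), np.zeros(2), np.zeros(2)

m = (1.0 - b1) * grad + b1 * m           # first moment estimate
v = (1.0 - b2) * grad ** 2 + b2 * v      # second moment estimate
mhat = m / (1.0 - b1 ** (i + 1.0))       # bias correction
vhat = v / (1.0 - b2 ** (i + 1.0))
x = x - learning_rate * mhat / (np.sqrt(vhat) + eps)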
Example no. 8
  def normalized_dist(states):
    inner = tf.multiply(states - starting_states, goals - starting_states)
    upper = tf.reduce_sum(inner, -1)
    sign = tf.sign(upper)
    
    result = sign * tf.square(tf.math.divide(upper, tf.norm(goals - starting_states, ord=2)))

    term_1 = tf.square(tf.norm(states - starting_states, 2))
    term_2 = tf.square(tf.math.divide(upper, tf.norm(goals - starting_states, ord=2)))
    
    return tf.sqrt(epsilon + tf.abs(result - alpha * (term_1 - term_2)))
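Geometrically, `upper / ||goals - starting_states||` above is the signed length of the projection of the displacement onto the start-to-goal direction, and `term_1 - term_2` is the squared perpendicular residual (Pythagoras). A small NumPy check of that intended per-example decomposition (illustrative values):

import numpy as np

s0 = np.array([0.0, 0.0])     # starting state
g = np.array([2.0, 0.0])      # goal
s = np.array([1.0, 1.0])      # current state
d = s - s0
upper = np.dot(d, g - s0)
parallel = upper / np.linalg.norm(g - s0)   # signed projection length = 1.0
perp_sq = np.dot(d, d) - parallel ** 2      # squared perpendicular residual = 1.0
assert np.isclose(parallel, 1.0) and np.isclose(perp_sq, 1.0)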
Example no. 9
def _variable_summaries(var):
    """Attach a lot of summaries to a Tensor (for TensorBoard visualization)."""
    with tf.name_scope('summaries'):
        mean = tf.reduce_mean(var)
        tf.summary.scalar('mean', mean)
        with tf.name_scope('stddev'):
            stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
        tf.summary.scalar('stddev', stddev)
        tf.summary.scalar('max', tf.reduce_max(var))
        tf.summary.scalar('min', tf.reduce_min(var))
        tf.summary.histogram('histogram', var)
Example no. 10
def summarize_stats(stats):
  """Summarize a dictionary of variables.

  Args:
    stats: a dictionary of {name: tensor} to compute stats over.
  """
  for name, stat in stats.items():
    mean = tf.reduce_mean(stat)
    tf.summary.scalar('mean_%s' % name, mean)
    tf.summary.scalar('max_%s' % name, tf.reduce_max(stat))
    tf.summary.scalar('min_%s' % name, tf.reduce_min(stat))
    std = tf.sqrt(tf.reduce_mean(tf.square(stat)) - tf.square(mean) + 1e-10)
    tf.summary.scalar('std_%s' % name, std)
    tf.summary.histogram(name, stat)
Example no. 11
def weight_standardization_replacements(model):
    """Weight-standardize non-output kernels of `model`."""
    if not isinstance(model, ReparameterizableBackbone):
        raise ValueError(
            '`model` must be an instance of `ReparameterizableBackbone`.')
    kernels = filter(lambda v: 'kernel' in v.name and 'output' not in v.name,
                     model.reparameterizables())
    replacements = []
    for v in kernels:
        # Wrap a standardization around the kernel.
        # Kernel has shape HWIO, normalize over HWI
        mean, var = tf.nn.moments(v, axes=[0, 1, 2], keepdims=True)
        # Author code uses std + 1e-5
        replacements.append((v.ref(), (v - mean) / tf.sqrt(var + 1e-10)))
    return dict(replacements)
Example no. 12
def _do_data_dependent_init():
    """Returns ops for the data-dependent init of g and maybe b_fc."""
    w_fc_normalized = tf.nn.l2_normalize(w_fc.read_value(), [0])
    output_init = tf.matmul(embeddings, w_fc_normalized)
    mean_init, var_init = tf.nn.moments(output_init, [0])
    # Data-dependent init values.
    g_init_value = 1. / tf.sqrt(var_init + 1e-10)
    ops = [tf.assign(g, g_init_value)]
    if not cosine_classifier:
        # Also initialize a bias in a data-dependent way.
        b_fc_init_value = -mean_init * g_init_value
        ops.append(tf.assign(b_fc, b_fc_init_value))
    # Mark that the data-dependent initialization is done to prevent it from
    # happening again in the future.
    ops.append(tf.assign(data_dependent_init_done, 1))
    return tf.group(*ops)
Example no. 13
    def sample(self, mean, log_b2, training=False):
        """Samples z from Z ~ Laplace(μ, b) via a Gaussian scale mixture.

        Y ~ N(0, 1), V ~ Exponential(1) = Gamma(1, 1),
        z = μ + b * y * sqrt(2 * v).
        """
        if not training:
            return mean

        # Exponential is a special case of Gamma: Exponential(λ) = Gamma(1, λ).
        exponential = tf.random.gamma(tf.shape(mean), alpha=1, beta=1)
        gaussian = tf.random.normal(tf.shape(mean), mean=0.0, stddev=1.0)

        return mean + tf.exp(0.5 * log_b2) * tf.sqrt(2 * exponential) * gaussian
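A quick NumPy Monte-Carlo check of the identity in the docstring: with Y ~ N(0, 1) and V ~ Exponential(1), mu + b * sqrt(2V) * Y is Laplace(mu, b), so its mean absolute deviation from mu should come out to roughly b (values below are illustrative):

import numpy as np

rng = np.random.default_rng(0)
mu, b, n = 1.5, 0.7, 200_000
v = rng.exponential(1.0, size=n)
y = rng.normal(0.0, 1.0, size=n)
z = mu + b * np.sqrt(2.0 * v) * y
print(np.mean(np.abs(z - mu)))   # ~ 0.7 = b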
Example no. 14
def binary_indicator(states,
                     actions,
                     rewards,
                     next_states,
                     contexts,
                     termination_epsilon=1e-4,
                     offset=0,
                     epsilon=1e-10,
                     state_indices=None,
                     summarize=False):
  """Returns 0/1 by checking if next_states and contexts overlap.

  Args:
    states: A [batch_size, num_state_dims] Tensor representing a batch
        of states.
    actions: A [batch_size, num_action_dims] Tensor representing a batch
      of actions.
    rewards: A [batch_size] Tensor representing a batch of rewards.
    next_states: A [batch_size, num_state_dims] Tensor representing a batch
      of next states.
    contexts: A list of [batch_size, num_context_dims] Tensor representing
      a batch of contexts.
    termination_epsilon: terminate if dist is less than this quantity.
    offset: Offset the rewards.
    epsilon: small offset to ensure non-negative/zero distance.
    state_indices: (a list of integers) list of state indices to select.
    summarize: (boolean) enable summary ops.

  Returns:
    A new tf.float32 [batch_size] rewards Tensor, and
      tf.float32 [batch_size] discounts tensor.
  """
  del states, actions  # unused args
  next_states = index_states(next_states, state_indices)
  dist = tf.reduce_sum(tf.squared_difference(next_states, contexts[0]), -1)
  dist = tf.sqrt(dist + epsilon)
  discounts = dist > termination_epsilon
  rewards = tf.logical_not(discounts)
  rewards = tf.to_float(rewards) + offset
  return tf.to_float(rewards), tf.ones_like(tf.to_float(discounts)) #tf.to_float(discounts)
Example no. 15
def _normalize_advantages(advantages, axes=(0, ), variance_epsilon=1e-8):
    adv_mean, adv_var = tf.nn.moments(x=advantages, axes=axes, keepdims=True)
    normalized_advantages = ((advantages - adv_mean) /
                             (tf.sqrt(adv_var) + variance_epsilon))
    return normalized_advantages
Example no. 16
def diff_distance(states,
                  actions,
                  rewards,
                  next_states,
                  contexts,
                  state_scales=1.0,
                  goal_scales=1.0,
                  reward_scales=1.0,
                  weight_index=None,
                  weight_vector=None,
                  summarize=False,
                  termination_epsilon=1e-4,
                  state_indices=None,
                  goal_indices=None,
                  norm='L2',
                  epsilon=1e-10):
  """Returns the difference in euclidean distance between states/next_states and contexts.

  Args:
    states: A [batch_size, num_state_dims] Tensor representing a batch
        of states.
    actions: A [batch_size, num_action_dims] Tensor representing a batch
      of actions.
    rewards: A [batch_size] Tensor representing a batch of rewards.
    next_states: A [batch_size, num_state_dims] Tensor representing a batch
      of next states.
    contexts: A list of [batch_size, num_context_dims] Tensor representing
      a batch of contexts.
    state_scales: multiplicative scale for (next) states. A scalar or 1D tensor,
      must be broadcastable to number of state dimensions.
    goal_scales: multiplicative scale for goals. A scalar or 1D tensor,
      must be broadcastable to number of goal dimensions.
    reward_scales: multiplicative scale for rewards. A scalar or 1D tensor,
      must be broadcastable to number of reward dimensions.
    weight_index: (integer) The context list index that specifies weight.
    weight_vector: (a number or a list or Numpy array) The weighting vector,
      broadcastable to `next_states`.
    summarize: (boolean) enable summary ops.
    termination_epsilon: terminate if dist is less than this quantity.
    state_indices: (a list of integers) list of state indices to select.
    goal_indices: (a list of integers) list of goal indices to select.
    norm: L1 or L2.
    epsilon: small offset to ensure non-negative/zero distance.

  Returns:
    A new tf.float32 [batch_size] rewards Tensor, and
      tf.float32 [batch_size] discounts tensor.
  """
  del actions, rewards  # Unused
  stats = {}
  record_tensor(next_states, state_indices, stats, 'next_states')
  next_states = index_states(next_states, state_indices)
  states = index_states(states, state_indices)
  goals = index_states(contexts[0], goal_indices)
  next_sq_dists = tf.squared_difference(next_states * state_scales,
                                        goals * goal_scales)
  sq_dists = tf.squared_difference(states * state_scales,
                                   goals * goal_scales)
  record_tensor(sq_dists, None, stats, 'sq_dists')
  if weight_vector is not None:
    next_sq_dists *= tf.convert_to_tensor(weight_vector, dtype=next_states.dtype)
    sq_dists *= tf.convert_to_tensor(weight_vector, dtype=next_states.dtype)
  if weight_index is not None:
    next_sq_dists *= contexts[weight_index]
    sq_dists *= contexts[weight_index]
  if norm == 'L1':
    next_dist = tf.sqrt(next_sq_dists + epsilon)
    dist = tf.sqrt(sq_dists + epsilon)
    next_dist = tf.reduce_sum(next_dist, -1)
    dist = tf.reduce_sum(dist, -1)
  elif norm == 'L2':
    next_dist = tf.reduce_sum(next_sq_dists, -1)
    next_dist = tf.sqrt(next_dist + epsilon)  # tf.gradients fails when tf.sqrt(-0.0)
    dist = tf.reduce_sum(sq_dists, -1)
    dist = tf.sqrt(dist + epsilon)  # tf.gradients fails when tf.sqrt(-0.0)
  else:
    raise NotImplementedError(norm)
  discounts = next_dist > termination_epsilon
  if summarize:
    with tf.name_scope('RewardFn/'):
      tf.summary.scalar('mean_dist', tf.reduce_mean(dist))
      tf.summary.histogram('dist', dist)
      summarize_stats(stats)
  diff = dist - next_dist
  diff *= reward_scales
  return tf.to_float(diff), tf.to_float(discounts)
Example no. 17
def projection_distance(states,
                      starting_states,
                      actions,
                      rewards,
                      next_states,
                      contexts,
                      alpha = 0,
                      state_scales=1.0,
                      goal_scales=1.0,
                      reward_scales=1.0,
                      weight_index=None,
                      weight_vector=None,
                      summarize=False,
                      termination_epsilon=1e-4,
                      state_indices=None,
                      goal_indices=None,
                      vectorize=False,
                      relative_context=False,
                      diff=False,
                      norm='L2',
                      epsilon=1e-10,
                      bonus_epsilon=0., #5.,
                      offset=0.0):
  """Returns the negative euclidean distance between next_states and contexts.

  Args:
    states: A [batch_size, num_state_dims] Tensor representing a batch
        of states.
    starting_states: A [batch_size, num_state_dims] Tensor representing a batch
        of starting states.
    actions: A [batch_size, num_action_dims] Tensor representing a batch
      of actions.
    rewards: A [batch_size] Tensor representing a batch of rewards.
    next_states: A [batch_size, num_state_dims] Tensor representing a batch
      of next states.
    contexts: A list of [batch_size, num_context_dims] Tensor representing
      a batch of contexts.
    state_scales: multiplicative scale for (next) states. A scalar or 1D tensor,
      must be broadcastable to number of state dimensions.
    goal_scales: multiplicative scale for goals. A scalar or 1D tensor,
      must be broadcastable to number of goal dimensions.
    reward_scales: multiplicative scale for rewards. A scalar or 1D tensor,
      must be broadcastable to number of reward dimensions.
    weight_index: (integer) The context list index that specifies weight.
    weight_vector: (a number or a list or Numpy array) The weighting vector,
      broadcastable to `next_states`.
    summarize: (boolean) enable summary ops.
    termination_epsilon: terminate if dist is less than this quantity.
    state_indices: (a list of integers) list of state indices to select.
    goal_indices: (a list of integers) list of goal indices to select.
    vectorize: Return a vectorized form.
    norm: L1 or L2.
    epsilon: small offset to ensure non-negative/zero distance.

  Returns:
    A new tf.float32 [batch_size] rewards Tensor, and
      tf.float32 [batch_size] discounts tensor.
  """
  del actions, rewards  # Unused
  stats = {}
  record_tensor(next_states, state_indices, stats, 'next_states')
  states = index_states(states, state_indices)
  starting_states = index_states(starting_states, state_indices)
  next_states = index_states(next_states, state_indices)
  goals = index_states(contexts[0], goal_indices)
  if relative_context:
    goals = states + goals
  
  sq_dists = tf.squared_difference(next_states * state_scales,
      goals * goal_scales)

  dist = tf.reduce_sum(sq_dists, -1)

  #def normalized_dist(states):
  #  dot_product = tf.matmul(states - starting_states, tf.transpose(goals - starting_states))
  #  return goals - starting_states - dot_product
  def projection_dist(states):
    inner = tf.multiply(states - starting_states, goals - starting_states)
    upper = tf.reduce_sum(inner, -1)
    sign = tf.sign(upper)
    
    result = tf.math.divide(upper, tf.norm(goals - starting_states, ord=2))
    term_1 = tf.norm(states - starting_states, 2)
    return -1 * term_1 + result

  dist_s = projection_dist(states)
  dist_s = tf.sqrt(tf.square(dist_s) + epsilon)
  dist_ns = projection_dist(next_states)
  

  ret = dist_ns, tf.to_float(dist > termination_epsilon) 
  return ret 
Example no. 18
def stability_loss(h, beta):
    if beta == 0.0:
        return 0.0
    else:
        l2 = tf.sqrt(tf.reduce_sum(tf.square(h), axis=-1))
        return beta * tf.reduce_mean(tf.square(l2[1:] - l2[:-1]))
Example no. 19
def l2_norm(x, axis=2):
    squared = tf.reduce_sum(tf.square(x), axis=axis, keepdims=True)
    norm = tf.sqrt(tf.maximum(squared, 1e-6))
    return norm
Example no. 20
def linear_classifier(embeddings, num_classes, cosine_classifier,
                      cosine_logits_multiplier, use_weight_norm, weight_decay):
    """Forward pass through a linear classifier, or possibly a cosine classifier.

  Args:
    embeddings: A Tensor of size [batch size, embedding dim].
    num_classes: An integer; the dimension of the classification.
    cosine_classifier: A bool. If true, a cosine classifier is used, which does
      not require a bias.
    cosine_logits_multiplier: A float. Only used if cosine_classifier is True,
      and multiplies the resulting logits.
    use_weight_norm: A bool. Whether weight norm was used. If so, then if using
      cosine classifier, normalize only the embeddings but not the weights.
    weight_decay: A float; the scalar multiple on the L2 regularization of the
      weight matrix.

  Returns:
    logits: A Tensor of size [batch size, num outputs].
  """

    embedding_dims = embeddings.get_shape().as_list()[-1]

    if use_weight_norm:
        # A variable to keep track of whether the initialization has already
        # happened.
        data_dependent_init_done = tf.get_variable('data_dependent_init_done',
                                                   initializer=0,
                                                   dtype=tf.int32,
                                                   trainable=False)

        w_fc = tf.get_variable('w_fc', [embedding_dims, num_classes],
                               initializer=tf.random_normal_initializer(
                                   0, 0.05),
                               trainable=True)
        # This init is temporary as it needs to be done in a data-dependent way.
        # It will be overwritten during the first forward pass through this layer.
        g = tf.get_variable('g',
                            dtype=tf.float32,
                            initializer=tf.ones([num_classes]),
                            trainable=True)
        b_fc = None
        if not cosine_classifier:
            # Also initialize a bias.
            b_fc = tf.get_variable('b_fc',
                                   initializer=tf.zeros([num_classes]),
                                   trainable=True)

        def _do_data_dependent_init():
            """Returns ops for the data-dependent init of g and maybe b_fc."""
            w_fc_normalized = tf.nn.l2_normalize(w_fc.read_value(), [0])
            output_init = tf.matmul(embeddings, w_fc_normalized)
            mean_init, var_init = tf.nn.moments(output_init, [0])
            # Data-dependent init values.
            g_init_value = 1. / tf.sqrt(var_init + 1e-10)
            ops = [tf.assign(g, g_init_value)]
            if not cosine_classifier:
                # Also initialize a bias in a data-dependent way.
                b_fc_init_value = -mean_init * g_init_value
                ops.append(tf.assign(b_fc, b_fc_init_value))
            # Mark that the data-dependent initialization is done to prevent it from
            # happening again in the future.
            ops.append(tf.assign(data_dependent_init_done, 1))
            return tf.group(*ops)

        # Possibly perform data-dependent init (if it hasn't been done already).
        init_op = tf.cond(tf.equal(data_dependent_init_done, 0),
                          _do_data_dependent_init, tf.no_op)

        with tf.control_dependencies([init_op]):
            # Apply weight normalization.
            w_fc *= g / tf.sqrt(tf.reduce_sum(tf.square(w_fc), [0]))
            # Forward pass through the layer defined by w_fc and b_fc.
            logits = linear_classifier_forward_pass(embeddings, w_fc, b_fc,
                                                    cosine_classifier,
                                                    cosine_logits_multiplier,
                                                    True)

    else:
        # No weight norm.
        w_fc = functional_backbones.weight_variable(
            [embedding_dims, num_classes], weight_decay=weight_decay)
        b_fc = None
        if not cosine_classifier:
            # Also initialize a bias.
            b_fc = functional_backbones.bias_variable([num_classes])
        # Forward pass through the layer defined by w_fc and b_fc.
        logits = linear_classifier_forward_pass(embeddings, w_fc, b_fc,
                                                cosine_classifier,
                                                cosine_logits_multiplier,
                                                False)
    return logits
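The weight-normalization step above rescales each column of w_fc to unit L2 norm and multiplies it by the per-class gain g (initialized data-dependently to 1/sqrt(var)). A NumPy sketch of just that rescaling, with illustrative shapes:

import numpy as np

rng = np.random.default_rng(0)
w_fc = rng.normal(0.0, 0.05, size=(64, 10))    # [embedding_dims, num_classes]
g = np.ones(10)                                # per-class gains
w_normed = w_fc * (g / np.sqrt(np.sum(w_fc ** 2, axis=0)))
print(np.linalg.norm(w_normed, axis=0))        # every column norm ~ g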