def forward_backward(self, data, label, temperature=5.0):
    # Split the global batch into per-device slices.
    data_slice = []
    label_slice = []
    for i in range(len(self._ctxs)):
      data_slice.append(data[i*self._dev_batch_size:(i+1)*self._dev_batch_size])
      label_slice.append(label[i*self._dev_batch_size:(i+1)*self._dev_batch_size])
    
    # Draw one Gumbel noise sample per Gumbel variable; the same noise is reused on every device.
    gumbel_list = []
    for k in self._gumbel_var_names:
      if k[0] not in self._arg_dict[0]:
        break
      tmp_gumbel = sample_gumbel((k[1],))
      gumbel_list.append(mx.nd.array(tmp_gumbel))

    for i in range(len(self._ctxs)):
      # Copy the inputs, labels (as one-hot), and softmax temperature to each device.
      self._arg_dict[i][self._data_name][:] = data_slice[i]
      if self._model_type != 'softmax':
        self._arg_dict[i]['label_index'][:] = label_slice[i]
      label_ = mx.nd.one_hot(label_slice[i], self._label_shape[0])
      self._arg_dict[i][self._label_name][:] = label_
      if "temperature" in self._arg_dict[i]:
        self._arg_dict[i]["temperature"][:] = temperature

      # Copy auxiliary buffers and the shared Gumbel noise, then run the forward/backward pass.
      for k, v in self._b.items():
        self._arg_dict[i][k][:] = v

      for idx, k in enumerate(self._gumbel_var_names):
        if k[0] not in self._arg_dict[i]:
          break
        self._arg_dict[i][k[0]][:] = gumbel_list[idx]

      self._exe[i].forward(is_train=True)
      self._exe[i].backward()
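sample_gumbel is not defined in the snippet above; a minimal NumPy sketch of standard Gumbel(0, 1) sampling consistent with the call sample_gumbel((k[1],)) is shown below (the eps constant is an assumption for numerical stability). The TensorFlow examples that follow presumably use an analogous tf.random_uniform-based helper.

import numpy as np

def sample_gumbel(shape, eps=1e-20):
    # Draw i.i.d. Gumbel(0, 1) noise via -log(-log(U)), U ~ Uniform(0, 1).
    u = np.random.uniform(low=0.0, high=1.0, size=shape)
    return -np.log(-np.log(u + eps) + eps)
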
Example #2
if method == 'deterministic_neuralsort':
    scores = multi_mnist_cnn.deepnn(l, X, 1)
    scores = tf.reshape(scores, [M, n, 1])
    P_hat = util.neuralsort(scores, temperature)

    losses = tf.nn.softmax_cross_entropy_with_logits_v2(
        labels=P_true, logits=tf.log(P_hat + 1e-20), dim=2)
    losses = tf.reduce_mean(losses, axis=-1)
    loss = tf.reduce_mean(losses)

elif method == 'stochastic_neuralsort':
    scores = multi_mnist_cnn.deepnn(l, X, 1)
    scores = tf.reshape(scores, [M, n, 1])
    P_hat = util.neuralsort(scores, temperature)

    scores_sample = tf.tile(scores, [n_s, 1, 1])
    scores_sample += util.sample_gumbel([M * n_s, n, 1])
    P_hat_sample = util.neuralsort(
        scores_sample, temperature)

    P_true_sample = tf.tile(P_true, [n_s, 1, 1])
    losses = tf.nn.softmax_cross_entropy_with_logits_v2(
        labels=P_true_sample, logits=tf.log(P_hat_sample + 1e-20), dim=2)
    losses = tf.reduce_mean(losses, axis=-1)
    loss = tf.reduce_mean(losses)
else:
    raise ValueError("No such method.")


def vec_gradient(l):  # l is a scalar loss
    gradient = tf.gradients(l, tf.trainable_variables())
    vec_grads = [tf.reshape(grad, [-1]) for grad in gradient]  # flatten each gradient
    return tf.concat(vec_grads, axis=0)  # concatenate into a single flat vector
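
util.neuralsort is also not shown here. The sketch below follows the NeuralSort relaxation of Grover et al., assuming scores has shape [M, n, 1] and that the temperature argument plays the role of tau; treat the exact interface as an assumption rather than the repository's implementation.

def neuralsort(scores, tau=1.0):
    # Relaxed sort operator: returns a row-stochastic [M, n, n] matrix P_hat.
    n = tf.shape(scores)[1]
    # Pairwise absolute differences |s_i - s_j| and their row sums (A_s @ 1).
    A_s = tf.abs(scores - tf.transpose(scores, perm=[0, 2, 1]))    # [M, n, n]
    B = tf.reduce_sum(A_s, axis=2, keepdims=True)                  # [M, n, 1]
    # Row-dependent scaling term (n + 1 - 2i), broadcast over columns.
    scaling = tf.cast(n + 1 - 2 * (tf.range(n) + 1), tf.float32)   # [n]
    C = scores * tf.reshape(scaling, [1, 1, -1])                   # [M, n, n]
    P_max = tf.transpose(C - B, perm=[0, 2, 1])
    return tf.nn.softmax(P_max / tau)                              # softmax over the last axis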
Example #3
    point_estimates = tf.reduce_sum(prob_median * regression_candidates,
                                    axis=1)
    exp_loss = tf.squared_difference(y, point_estimates)

    loss_phi = tf.reduce_mean(exp_loss)
    loss_theta = loss_phi

    P_hat_eval = sinkhorn_operator(pre_sinkhorn, temp=1e-20)
    prob_median_eval = get_median_probs(P_hat_eval)

elif method == 'gumbel_sinkhorn':
    with tf.variable_scope('phi'):
        representations = multi_mnist_cnn.deepnn(l, X, n)
        pre_sinkhorn_orig = tf.reshape(representations, [M, n, n])
        pre_sinkhorn = tf.tile(pre_sinkhorn_orig, [n_s, 1, 1])
        pre_sinkhorn += util.sample_gumbel([n_s * M, n, n])

    with tf.variable_scope('theta'):
        regression_candidates = multi_mnist_cnn.deepnn(l, X, 1)
        regression_candidates = tf.reshape(regression_candidates, [M, n])

    P_hat = sinkhorn_operator(pre_sinkhorn, temp=temp)
    prob_median = get_median_probs(P_hat)
    prob_median = tf.reshape(prob_median, [n_s, M, n])

    point_estimates = tf.reduce_sum(prob_median * regression_candidates,
                                    axis=2)
    exp_loss = tf.squared_difference(y, point_estimates)

    loss_phi = tf.reduce_mean(exp_loss)
    loss_theta = loss_phi
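
sinkhorn_operator is used in this example and in Example #4 but is not defined in either. A minimal log-space Sinkhorn normalization sketch follows; the two examples call it with different arguments (temp here, n_iters below), so the combined signature is an assumption.

def sinkhorn_operator(log_alpha, n_iters=20, temp=1.0):
    # Alternate row and column normalization in log space; exp of the result
    # is an (approximately) doubly-stochastic matrix.
    n = tf.shape(log_alpha)[1]
    log_alpha = tf.reshape(log_alpha, [-1, n, n]) / temp
    for _ in range(n_iters):
        log_alpha -= tf.reduce_logsumexp(log_alpha, axis=2, keepdims=True)  # rows
        log_alpha -= tf.reduce_logsumexp(log_alpha, axis=1, keepdims=True)  # columns
    return tf.exp(log_alpha)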
Example #4
def gumbel_sinkhorn(log_alpha,
                    temp=1.0, n_samples=1, noise_factor=1.0, n_iters=20,
                    squeeze=True):
    """Random doubly-stochastic matrices via gumbel noise.
    In the zero-temperature limit sinkhorn(log_alpha/temp) approaches
    a permutation matrix. Therefore, for low temperatures this method can be
    seen as an approximate sampling of permutation matrices, where the
    distribution is parameterized by the matrix log_alpha
    The deterministic case (noise_factor=0) is also interesting: it can be
    shown that lim t->0 sinkhorn(log_alpha/t) = M, where M is a
    permutation matrix, the solution of the
    matching problem M=arg max_M sum_i,j log_alpha_i,j M_i,j.
    Therefore, the deterministic limit case of gumbel_sinkhorn can be seen
    as approximate solving of a matching problem, otherwise solved via the
    Hungarian algorithm.
    Warning: the convergence holds true in the limit case n_iters = infty.
    Unfortunately, in practice n_iter is finite which can lead to numerical
    instabilities, mostly if temp is very low. Those manifest as
    pseudo-convergence or some row-columns to fractional entries (e.g.
    a row having two entries with 0.5, instead of a single 1.0)
    To minimize those effects, try increasing n_iter for decreased temp.
    On the other hand, too-low temperature usually lead to high-variance in
    gradients, so better not choose too low temperatures.
    Args:
        log_alpha: 2D tensor (a matrix of shape [N, N])
            or 3D tensor (a batch of matrices of shape = [batch_size, N, N])
        temp: temperature parameter, a float.
        n_samples: number of samples
        noise_factor: scaling factor for the gumbel samples. Mostly to explore
            different degrees of randomness (and the absence of randomness, with
            noise_factor=0)
        n_iters: number of sinkhorn iterations. Should be chosen carefully, in
            inverse corresponde with temp to avoid numerical stabilities.
        squeeze: a boolean, if True and there is a single sample, the output will
            remain being a 3D tensor.
    Returns:
        sink: a 4D tensor of [batch_size, n_samples, N, N] i.e.
            batch_size *n_samples doubly-stochastic matrices. If n_samples = 1 and
            squeeze = True then the output is 3D.
        log_alpha_w_noise: a 4D tensor of [batch_size, n_samples, N, N] of
            noisy samples of log_alpha, divided by the temperature parameter. If
            n_samples = 1 then the output is 3D.
    """
    n = tf.shape(log_alpha)[1]
    log_alpha = tf.reshape(log_alpha, [-1, n, n])
    batch_size = tf.shape(log_alpha)[0]
    # Tile log_alpha n_samples times and perturb each copy with scaled Gumbel noise.
    log_alpha_w_noise = tf.tile(log_alpha, [n_samples, 1, 1])
    if noise_factor == 0:
        noise = 0.0
    else:
        noise = sample_gumbel([n_samples * batch_size, n, n]) * noise_factor
    log_alpha_w_noise += noise
    log_alpha_w_noise /= temp
    sink = sinkhorn_operator(log_alpha_w_noise, n_iters)
    if n_samples > 1 or squeeze is False:
        sink = tf.reshape(sink, [n_samples, batch_size, n, n])
        sink = tf.transpose(sink, [1, 0, 2, 3])
        log_alpha_w_noise = tf.reshape(
            log_alpha_w_noise, [n_samples, batch_size, n, n])
        log_alpha_w_noise = tf.transpose(log_alpha_w_noise, [1, 0, 2, 3])
    return sink, log_alpha_w_noise
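
A small usage sketch of gumbel_sinkhorn in TF 1.x style; the batch size, matrix size, and hyperparameter values below are illustrative assumptions, not values from the examples above.

import tensorflow as tf

log_alpha = tf.random_normal([32, 10, 10])   # a batch of 32 random 10x10 score matrices
sink, log_alpha_w_noise = gumbel_sinkhorn(
    log_alpha, temp=0.5, n_samples=4, noise_factor=1.0, n_iters=20, squeeze=False)
# sink has shape [32, 4, 10, 10]: 4 near doubly-stochastic samples per matrix.
with tf.Session() as sess:
    matrices = sess.run(sink)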