Example 1
    def _get_coordinatewise_learning_rate(self, grad, var):
        # Compute the learning rate using a moving average for the diagonal of BB^T
        avg_first = self.get_slot(var, 'first_moment')
        avg_second = self.get_slot(var, 'second_moment')
        decay_tensor = tf.cast(self._decay_tensor, var.dtype)
        batch_size = tf.cast(self._batch_size_tensor, var.dtype)

        # Create an estimator for the moving average of gradient mean and variance
        # via Welford's algorithm
        if isinstance(grad, tf.Tensor):
            delta = grad - avg_first
            first_moment_update = avg_first.assign_add(delta * tf1.where(
                self.iterations < 1, tf.cast(1, var.dtype), 1. - decay_tensor))

            with tf.control_dependencies([first_moment_update]):
                second_moment_update = avg_second.assign_add(
                    tf.cast(self.iterations < 1, var.dtype) *
                    -(1. - decay_tensor) *
                    (avg_second - decay_tensor * tf.square(delta)))
            diag_preconditioner = distribution_util.with_dependencies(
                [second_moment_update],
                tf.clip_by_value(avg_second, 1e-12, 1e12))
        elif isinstance(grad, tf.IndexedSlices):
            delta = grad.values - tf.gather_nd(avg_first, grad.indices)
            first_moment_update = tf1.scatter_add(
                avg_first, grad.indices,
                delta * tf1.where(self.iterations < 1, tf.cast(1., var.dtype),
                                  1. - decay_tensor))

            with tf.control_dependencies([first_moment_update]):
                avg_second = tf1.scatter_add(
                    avg_second, grad.indices,
                    tf.cast(self.iterations < 1, var.dtype) *
                    -(1. - decay_tensor) *
                    (tf.gather_nd(avg_second, grad.indices) -
                     decay_tensor * tf.square(delta)))
                avg_second = tf.gather_nd(avg_second, grad.indices)
                # TODO(b/70783772): Needs dtype specific clipping.
                diag_preconditioner = tf.clip_by_value(avg_second, 1e-12, 1e12)
        else:
            raise tf.errors.InvalidArgumentError(
                None, None, 'grad must be of type Tensor or IndexedSlices')

        diag_preconditioner *= batch_size

        if self._use_single_learning_rate:
            diag_preconditioner = tf.reduce_mean(
                input_tensor=diag_preconditioner)

        # From Theorem 2 Corollary 1 of Mandt et al. 2017
        return 2. * batch_size / (
            tf.cast(self._total_num_examples, var.dtype.base_dtype) *
            diag_preconditioner)
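A quick numeric illustration of the closing formula (Theorem 2, Corollary 1 of Mandt et al. 2017); all values below are assumed for the sketch rather than taken from the snippet.

# Toy numbers (purely illustrative): the returned rate is
# 2 * batch_size / (total_num_examples * diag_preconditioner).
batch_size = 32.0
total_num_examples = 50000.0
diag_preconditioner = 8.0  # moving-average second-moment estimate, already scaled by batch_size
print(2.0 * batch_size / (total_num_examples * diag_preconditioner))  # -> 0.00016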
Example 2
 def _apply_sparse(self, grad, var):
     return self._apply_sparse_shared(
         grad.values,
         var,
         grad.indices,
         lambda x, i, v: tf.scatter_add(  # pylint: disable=g-long-lambda
             x,
             i,
             v,
             use_locking=self._use_locking))
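The lambda above routes the shared sparse update through tf.scatter_add. For reference, here is a minimal, self-contained sketch of what that op does to a TF1-style variable; the variable name and values are hypothetical, not from the source.

# Minimal sketch: rows named in `indices` receive `updates` added in place.
import tensorflow.compat.v1 as tf1
tf1.disable_eager_execution()

acc = tf1.get_variable('acc_demo', initializer=tf1.zeros([4, 2]))
add_op = tf1.scatter_add(acc, indices=[0, 2], updates=tf1.ones([2, 2]))

with tf1.Session() as sess:
    sess.run(tf1.global_variables_initializer())
    print(sess.run(add_op))  # rows 0 and 2 are now ones; rows 1 and 3 stay zero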
Example 3
    def _sparse_moving_average(self, x_tm1, idxs, a_t_, name, beta=.9):
        """ """

        b_tm1 = self.get_accumulator(x_tm1, '%s' % name)
        b_tm1_ = tf.gather(b_tm1, idxs)
        shape = self.get_variable_shape(x_tm1)
        tm1 = self.get_accumulator(x_tm1,
                                   '%s/tm1' % name,
                                   shape=[shape[0]] + [1] * (len(shape) - 1))
        tm1_ = tf.gather(tm1, idxs)
        t = tf.scatter_add(tm1, idxs, tf.ones_like(tm1_))
        t_ = tf.gather(t, idxs)
        if beta < 1:
            beta_t = tf.convert_to_tensor(beta, name='%s/decay' % name)
            beta_t_ = beta_t * (1 - beta_t**tm1_) / (1 - beta_t**t_)
        else:
            beta_t_ = tm1_ / t_
        b_t = tf.scatter_update(b_tm1, idxs, beta_t_ * b_tm1_)
        b_t = tf.scatter_add(b_t, idxs, (1 - beta_t_) * a_t_)
        return b_t, t
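The beta_t_ expression above acts as a per-row, bias-corrected decay: rows updated only rarely lean almost entirely on the fresh value, while frequently updated rows approach the nominal beta. A small numeric check with assumed values:

# Effective decay for a row after tm1 prior updates (so t = tm1 + 1).
beta = 0.9
for tm1 in (0.0, 1.0, 10.0):
    t = tm1 + 1.0
    beta_eff = beta * (1 - beta**tm1) / (1 - beta**t)
    print(tm1, round(beta_eff, 3))  # 0.0 -> 0.0, 1.0 -> 0.474, 10.0 -> 0.854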
Example 4
    def update_contextual_features(contextual_features, indices, updates,
                                   flattened_idx_offset):
        first_indices, second_indices = tf.split(indices, 2, 1)

        indices = tf.squeeze(first_indices + second_indices)
        indices = indices + flattened_idx_offset
        contextual_features = tf.scatter_add(contextual_features,
                                             indices,
                                             updates,
                                             use_locking=None)
        return contextual_features
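The index arithmetic above collapses each two-column index row into a single flattened offset before the scatter. A tiny sketch with hypothetical values:

# Illustrative only: how a [N, 2] index tensor becomes scalar row offsets.
import tensorflow as tf

indices = tf.constant([[0, 3], [1, 2]])   # [N, 2]
first, second = tf.split(indices, 2, 1)   # two [N, 1] columns
flat = tf.squeeze(first + second) + 10    # with offset 10 -> [13, 13]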
Example 5
 def force_ext_ellipsoid_idx_multi(links, idx):
     gij, ginv = links.get_metrics(idx)
     r = tf.gather(links.points, idx)
     dr2 = links.get_dr2(r, gij)
     A = links.amplitude
     dr = r - r[:, newaxis]
     drh = dr / (tf.norm(dr, axis=-1, keepdims=True) + 1e-15)
     # links.fmat0 = A*drh*((links.dr2[id]**(links.net.POW/2-1))*tf.exp(-links.dr2[id]**links.net.POW))
     links.fmat0 = (A * drh * ((dr2**(links.net.POW / 2.0 - 1)) *
                               tf.exp(-(dr2**(links.net.POW / 2.0)))))
     links.Force_LL_Ell = tf.reduce_sum(links.fmat0, 0)
     return tf.scatter_add(links.net.f_link, idx, links.Force_LL_Ell)
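This example and the force routines that follow build their pairwise interaction terms with the same broadcasting trick. A standalone sketch with assumed toy points:

# Illustrative sketch: r[i, j] = x[j] - x[i] for all pairs in one broadcast.
import tensorflow as tf

x = tf.constant([[0.0, 0.0], [1.0, 0.0], [0.0, 2.0]])  # [N, D] point coordinates
r = x - x[:, tf.newaxis, :]                             # [N, N, D] difference vectors
rlen = tf.norm(r, axis=-1, keepdims=True)               # [N, N, 1] pairwise distances
unit = r / (rlen + 1e-15)                               # unit vectors, safe where r = 0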
Example 6
    def force_node_repel_idx(nodes, idx):
        """try out different forces, long- and short-range"""
        r0 = tf.gather(nodes.r0, idx)
        th = r0 + r0[:, newaxis]

        A = nodes.amplitude
        x = tf.gather(nodes.points, idx)
        r = x - x[:, newaxis, :]  # tf.expand_dims(x,1)
        rlen = vec_len(r)
        # fmat = A*r*tf.expand_dims((rlen/th)**(POWn-2)*tf.exp(-(rlen/th)**POWn),2)
        # fmat = th[:,:,newaxis]**POW_SN *A*r*tf.expand_dims((rlen/th)**(POWn-2)*tf.exp(-(rlen/th)**POWn),2)
        fmat = (th[:, :, newaxis]**nodes.net.POW_SN * A * r * tf.expand_dims(
            (rlen / th)**(nodes.net.POWn - 2) * tf.exp(-(
                (rlen / th)**nodes.net.POWn)),
            2,
        ))

        nodes.Force_NN = tf.reduce_sum(fmat, 0)
        return tf.scatter_add(nodes.net.f_node, idx, nodes.Force_NN)
Example 7
    def _apply_sparse_shared(self, grad_values, grad_indices, var):
        shape = np.array(var.get_shape())
        var_rank = len(shape)
        # For the sparse case, we only update the accumulator representing the
        # sparse dimension. In this case SM3 is similar to isotropic adagrad but
        # with a better bound (due to the max operator).
        #
        # We do not use the column accumulator because it would be updated for
        # every gradient step and would significantly overestimate the squared
        # gradient. The row accumulator, in contrast, can take advantage of the
        # sparsity in the gradients. Even if the column accumulator were
        # implemented, it would effectively be a no-op, because the row
        # accumulators will have lower values.
        #
        # Note that we do not exercise this code path in the experiments in our
        # paper, since on TPU all sparse gradients are densified.
        if var_rank > 1:
            accumulator_var = self.get_slot(var, "accumulator_" + str(0))
            accumulator = tf.gather(accumulator_var, grad_indices)
            shape_for_broadcasting = tf.concat(
                [[tf.shape(accumulator)[0]], [1] * (var_rank - 1)], 0)
            accumulator = tf.reshape(accumulator, shape_for_broadcasting)
            accumulator += grad_values * grad_values
        else:
            accumulator_var = self.get_slot(var, "accumulator")
            accumulator = tf.scatter_add(accumulator_var, grad_indices,
                                         grad_values * grad_values)

        accumulator_inv_sqrt = tf.rsqrt(accumulator + 1e-30)
        scaled_g = (grad_values * accumulator_inv_sqrt)
        updates = []
        with tf.control_dependencies([scaled_g]):
            if var_rank > 1:
                axes = list(range(1, var_rank))
                new_accumulator = tf.reduce_max(accumulator, axis=axes)
                updates = [
                    tf.scatter_update(accumulator_var, grad_indices,
                                      new_accumulator)
                ]
        with tf.control_dependencies(updates):
            return tf.scatter_sub(var, grad_indices,
                                  self._learning_rate_tensor * scaled_g)
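The comment block above describes keeping only a per-row statistic along the sparse dimension; after the dense add, just each row's max is scattered back. A toy illustration with assumed numbers:

# Illustrative toy values (not from the paper): broadcast the rank-1 row
# accumulator, add the squared gradient, and keep only each row's max.
import tensorflow as tf

grad_values = tf.constant([[0.5, 2.0, 1.0],
                           [0.7, 0.1, 0.3]])
row_acc = tf.constant([0.25, 0.04])                 # gathered per-row accumulator
dense = row_acc[:, tf.newaxis] + grad_values**2     # broadcast, then add grad^2
new_row_acc = tf.reduce_max(dense, axis=[1])        # -> [4.25, 0.53]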
Example 8
    def forces_ext_brute_idx_multi(links, idx):
        """ Generate extra tensors for link external force calculation.
        it has a placeholder links.f_ext_idx_[id] for indexing,
        and defines a tensor links.force_ext_app[id], both unique to this each instance,
        """
        x = tf.gather(links.points, idx)
        # all possible seg pairs
        th0 = tf.gather(links.thickness, idx)
        th_mat = th0 + th0[:, newaxis]
        A = links.amplitude
        links.r = x - x[:, newaxis]
        rlen = vec_len(links.r)
        # !!! must exclude pairs on same edge, otherwise edge won't contract
        fmat = (A * links.r *
                ((rlen / th_mat)**(links.net.POW - 2) / th_mat * tf.exp(-(
                    (rlen / th_mat)**links.net.POW)) *
                 links.link_self_mask_multi(idx))[:, :, newaxis])
        # including self-repulsion again
        # fmat = A*links.r*((rlen/th_mat)**(POW-2)/th_mat*tf.exp(-(rlen/th_mat)**POW))[:,:,newaxis]

        links.Force_LL = tf.reduce_sum(fmat, 0)
        return tf.scatter_add(links.net.f_link, idx, links.Force_LL)
Example 9
 def _resource_scatter_add(self, x, i, v):
     # Run the scatter-add against the variable's resource handle, then return
     # the variable's value once the update has been applied.
     with tf.control_dependencies([tf.scatter_add(x.handle, i, v)]):
         return x.value()
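For comparison, the TF2 resource-variable counterpart of this pattern needs no explicit control dependency; a minimal sketch with an assumed variable:

# Minimal TF2 sketch (hypothetical variable, not from the source): resource
# variables expose scatter_add directly and apply it eagerly.
import tensorflow as tf

v = tf.Variable(tf.zeros([4, 2]))
v.scatter_add(tf.IndexedSlices(values=tf.ones([2, 2]), indices=tf.constant([0, 2])))
print(v.numpy())  # rows 0 and 2 are ones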