Code example #1
    def minimize(self, loss, name=None):
        """ """

        # Error checking
        var_list = tf.trainable_variables()
        for x_tm1 in var_list:
            if not isinstance(x_tm1, tf.Variable):
                raise TypeError("Argument is not a tf.Variable: %s" % x_tm1)
        if not var_list:
            raise ValueError("No variables to optimize")
        if loss.dtype.base_dtype != tf.float32:
            raise ValueError('Loss is not float32')

        # Compute gradients
        grads = tf.gradients(loss,
                             var_list,
                             colocate_gradients_with_ops=True,
                             gate_gradients=True,
                             aggregation_method=2)
        for x_tm1, g_t in zip(var_list, grads):
            if g_t is not None:
                if x_tm1.dtype.base_dtype != tf.float32:
                    raise ValueError('%s is not float32' % x_tm1.name)

        # Apply gradients
        with tf.control_dependencies(None):
            self._init_acc(var_list, grads)
        with tf.name_scope(name, self.name.title(), []) as name:
            caches = [
                cache for cache in self._prepare(var_list, grads)
                if cache['g_t'] is not None
            ]
            for cache in caches:
                x_tm1, g_t = cache['x_tm1'], cache['g_t']
                with tf.name_scope("update_" + x_tm1.op.name), tf.device(
                        x_tm1.device):
                    if isinstance(g_t, tf.Tensor):
                        cache['g_t'] = tf.where(tf.is_finite(g_t), g_t,
                                                tf.zeros_like(g_t))
                        self._apply_dense(cache)
                    else:
                        cache['g_t'] = tf.where(tf.is_finite(g_t.values),
                                                g_t.values,
                                                tf.zeros_like(g_t.values))
                        cache['idxs'] = g_t.indices
                        self._apply_sparse(cache)
            with tf.control_dependencies([self._finish(caches)]):
                with tf.device(self.global_step.device):
                    return tf.assign_add(self.global_step, 1, name=name).op
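Below is a minimal standalone sketch (not from the project above) of the tf.where(tf.is_finite(...)) guard that minimize() applies to each gradient: non-finite entries are replaced with zeros before the update is applied. Assumes a TF 1.x session; the values are made up.

import tensorflow as tf

g_t = tf.constant([0.5, float('nan'), float('inf'), -1.0])
safe_g_t = tf.where(tf.is_finite(g_t), g_t, tf.zeros_like(g_t))

with tf.Session() as sess:
    print(sess.run(safe_g_t))  # [ 0.5  0.   0.  -1. ]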
Code example #2
    def stn_diffeo(self):
        with tf.variable_scope("atn"):
            x_tensor = tf.reshape(self.X,[-1,self.img_sz[0],self.img_sz[1],self.num_channels])
            # x_tensor = tf.Print(x_tensor,[x_tensor],message="x_tensor: ",summarize=100)
            c = tf.reduce_mean(tf.boolean_mask(x_tensor, tf.is_finite(x_tensor)), 0)
            # c = tf.Print(c,[c],message="c: ",summarize=100)

            # self.theta, self.affine_maps, d2 = transfromation_parameters_regressor(self.requested_transforms,self.X,
            #                                                                  self.keep_prob,self.img_sz,self.weight_stddev,self.num_channels,self.activation_func)
            #
            # self.theta = tf.Print(self.theta,[self.theta],message="self.theta: ",summarize=100)
            # out_size = (self.img_sz[0], self.img_sz[1])
            # self.theta_exp = expm(-self.theta)  # compute matrix exponential on {-theta}
            # # self.theta_exp = tf.Print(self.theta_exp,[self.theta_exp],message="theta_exp: ", summarize=100)
            # x_theta, d = transformer(x_tensor, self.theta_exp, out_size)
            # #to avoid the sparse indexing warning, comment the next line, and uncomment the one after it.
            # self.x_theta = tf.reshape(x_theta,shape=[-1,self.img_sz[0],self.img_sz[1],self.num_channels])
            # d.update({'params':d2['params']})

            # Working with recurrent STN: get self.theta and self.theta_exp in shape: [num_STN, batch_sz, 6]
            d = c
            self.x_theta, self.theta, self.theta_exp = transfromation_parameters_regressor(self.requested_transforms, self.X,
                                                                                 self.keep_prob,self.img_sz,
                                                                                 self.weight_stddev,self.num_channels,
                                                                                 self.activation_func, self.num_stn)

            return self.x_theta, d, c
Code example #3
    def preprocess_device_grads(self, device_grads):
        compact_grads = (self.benchmark_cnn.params.use_fp16 and
                         self.benchmark_cnn.params.compact_gradient_transfer)
        defer_grads = (
            self.benchmark_cnn.params.variable_consistency == 'relaxed')

        grads_to_reduce = [[g for g, _ in grad_vars]
                           for grad_vars in device_grads]
        algorithm = batch_allreduce.algorithm_from_params(
            self.benchmark_cnn.params)
        reduced_grads, self._warmup_ops = algorithm.batch_all_reduce(
            grads_to_reduce, self.benchmark_cnn.params.gradient_repacking,
            compact_grads, defer_grads, self.benchmark_cnn.params.xla_compile)
        if self.benchmark_cnn.enable_auto_loss_scale:
            # Check for infs or nans
            is_finite_list = []
            with tf.name_scope('check_for_inf_and_nan'):
                for tower_grads in reduced_grads:
                    with tf.colocate_with(tower_grads[0]):
                        # TODO(tanmingxing): Create fused op that takes in a list of tensors
                        # as input and returns scalar boolean True if there are any
                        # infs/nans.
                        is_finite_list.append(
                            tf.reduce_all([
                                tf.reduce_all(tf.is_finite(g))
                                for g in tower_grads
                            ]))
                self.grad_has_inf_nan = tf.logical_not(
                    tf.reduce_all(is_finite_list))
        reduced_device_grads = [[
            (g, v) for g, (_, v) in zip(grads, grad_vars)
        ] for grads, grad_vars in zip(reduced_grads, device_grads)]
        return self.benchmark_cnn.devices, reduced_device_grads
Code example #4
File: radam_optimizer.py Project: tapika/Parser-v2
    def _apply_dense(self, cache):
        """ """

        x_tm1, g_t = cache['x_tm1'], cache['g_t']
        updates = cache['updates']

        if self.mu > 0:
            m_t, t_m = self._dense_moving_average(x_tm1,
                                                  g_t,
                                                  'm',
                                                  beta=self.mu)
            m_bar_t = (1 - self.gamma) * m_t + self.gamma * g_t
            updates.extend([m_t, t_m])
        else:
            m_bar_t = g_t

        if self.nu > 0:
            v_t, t_v = self._dense_moving_average(x_tm1,
                                                  g_t**2,
                                                  'v',
                                                  beta=self.nu)
            v_bar_t = tf.sqrt(v_t + self.epsilon)
            updates.extend([v_t, t_v])
        else:
            v_bar_t = 1

        s_t = self.learning_rate * m_bar_t / v_bar_t
        cache['s_t'] = tf.where(tf.is_finite(s_t), s_t, tf.zeros_like(s_t))
        return cache
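For orientation, the step direction built above is roughly s_t = learning_rate * m_bar_t / sqrt(v_t + epsilon), with non-finite entries zeroed. A rough sketch of one such step, assuming (hypothetically) that _dense_moving_average starts from a zero state and applies a standard beta-weighted update; the hyperparameter values are illustrative only.

import tensorflow as tf

learning_rate, mu, nu, gamma, epsilon = 0.01, 0.9, 0.999, 0.9, 1e-8
g_t = tf.constant([0.1, -0.2, float('nan')])
m_t = (1 - mu) * g_t          # assumed first-moment average after one step from zero
v_t = (1 - nu) * g_t ** 2     # assumed second-moment average after one step from zero
m_bar_t = (1 - gamma) * m_t + gamma * g_t
s_t = learning_rate * m_bar_t / tf.sqrt(v_t + epsilon)
s_t = tf.where(tf.is_finite(s_t), s_t, tf.zeros_like(s_t))  # the NaN entry becomes 0

with tf.Session() as sess:
    print(sess.run(s_t))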
Code example #5
def ProcessGradients(grads_and_vars,
                     global_gradient_clip=0.0,
                     sanitize_gradients=False,
                     normalize_gradients=False):
    tf.logging.info("Prcessing gradients")
    grads, vars_ = list(zip(*grads_and_vars))
    if sanitize_gradients:
        new_grads = []
        for g in grads:
            if g is not None:
                g = tf.where(tf.is_finite(g), g, tf.zeros_like(g))
            new_grads.append(g)
        grads = new_grads
    if normalize_gradients:
        new_grads = []
        for g in grads:
            if g is not None:
                g *= tf.rsqrt(tf.maximum(1e-12, tf.reduce_sum(tf.square(g))))
            new_grads.append(g)
        grads = new_grads
    if global_gradient_clip > 0:
        grads, grad_norm = tf.clip_by_global_norm(grads, global_gradient_clip)
        grads_and_vars = list(zip(grads, vars_))
    else:
        grad_norm = tf.global_norm(grads)
    tf.summary.scalar("global_grad_norm", grad_norm)
    return grads_and_vars
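A hedged usage sketch for ProcessGradients as defined above, on made-up values (TF 1.x): the NaN entry is zeroed by sanitize_gradients and the result is then clipped to the requested global norm.

import tensorflow as tf

v = tf.Variable([1.0, 2.0])
g = tf.constant([3.0, float('nan')])
processed = ProcessGradients([(g, v)],
                             global_gradient_clip=1.0,
                             sanitize_gradients=True)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(processed[0][0]))  # [1. 0.]: NaN zeroed, then clipped to global norm 1.0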
Code example #6
def _create_var(name: str, value_expr: TfExpression) -> TfExpression:
    """Internal helper for creating autosummary accumulators."""
    assert not _finalized
    name_id = name.replace("/", "_")
    v = tf.cast(value_expr, _dtype)

    if v.shape.is_fully_defined():
        size = np.prod(v.shape.as_list())
        size_expr = tf.constant(size, dtype=_dtype)
    else:
        size = None
        size_expr = tf.reduce_prod(tf.cast(tf.shape(v), _dtype))

    if size == 1:
        if v.shape.ndims != 0:
            v = tf.reshape(v, [])
        v = [size_expr, v, tf.square(v)]
    else:
        v = [size_expr, tf.reduce_sum(v), tf.reduce_sum(tf.square(v))]
    v = tf.cond(tf.is_finite(v[1]), lambda: tf.stack(v),
                lambda: tf.zeros(3, dtype=_dtype))

    with tfutil.absolute_name_scope("Autosummary/" +
                                    name_id), tf.control_dependencies(None):
        var = tf.Variable(tf.zeros(3, dtype=_dtype),
                          trainable=False)  # [sum(1), sum(x), sum(x**2)]
    update_op = tf.cond(tf.is_variable_initialized(var),
                        lambda: tf.assign_add(var, v),
                        lambda: tf.assign(var, v))

    if name in _vars:
        _vars[name].append(var)
    else:
        _vars[name] = [var]
    return update_op
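For reference, the three-element accumulator above is enough to recover a mean and variance downstream; a tiny numeric sketch (values made up, the downstream autosummary code is not shown here):

import numpy as np

acc = np.array([4.0, 10.0, 30.0])       # [sum(1), sum(x), sum(x**2)] for x = [1, 2, 3, 4]
mean = acc[1] / acc[0]                  # 2.5
variance = acc[2] / acc[0] - mean ** 2  # 1.25
print(mean, variance)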
Code example #7
    def __init__(self, idx, mean, var, **kwargs):
        """
        :param mean: Tensor of shape [W] containing the mean at each parameter vertex
        :param var: Tensor of shape [W] containing the variance at each parameter vertex
        """
        Posterior.__init__(self, idx, **kwargs)
        self.nvertices = tf.shape(mean)[0]
        self.name = kwargs.get("name", "NormPost")
        
        mean, var = self._get_mean_var(mean, var, kwargs.get("init", None))
        mean = tf.cast(mean, tf.float32)
        var = tf.cast(var, tf.float32)
        mean = self.log_tf(tf.where(tf.is_finite(mean), mean, tf.zeros_like(mean)))
        var = tf.where(tf.is_nan(var), tf.ones_like(var), var)

        self.mean_variable = self.log_tf(tf.Variable(mean, validate_shape=False,
                                                     name="%s_mean" % self.name))
        self.log_var = self.log_tf(tf.Variable(tf.log(var), validate_shape=False,
                                   name="%s_log_var" % self.name))
        self.var_variable = self.log_tf(tf.exp(self.log_var, name="%s_var" % self.name))
        if kwargs.get("suppress_nan", True):
            #self.mean = tf.where(tf.is_nan(self.mean_variable), tf.ones_like(self.mean_variable), self.mean_variable)
            #self.var = tf.where(tf.is_nan(self.var_variable), tf.ones_like(self.var_variable), self.var_variable)
            self.mean = tf.where(tf.is_nan(self.mean_variable), mean, self.mean_variable)
            self.var = tf.where(tf.is_nan(self.var_variable), var, self.var_variable)
        else:
            self.mean = self.mean_variable
            self.var = self.var_variable
        self.std = self.log_tf(tf.sqrt(self.var, name="%s_std" % self.name))
Code example #8
def check_grads_finite(grads):
    if not len(grads):  # pylint: disable=g-explicit-length-test
        return tf.constant(True)
    else:
        finites = [
            tf.reduce_sum(1 - tf.to_float(tf.is_finite(g))) for g in grads
        ]
        return tf.equal(tf.add_n(finites), 0.)
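A small usage sketch for check_grads_finite above (TF 1.x, toy gradient lists):

import tensorflow as tf

good = [tf.constant([1.0, 2.0]), tf.constant([3.0])]
bad = [tf.constant([1.0, float('inf')])]

with tf.Session() as sess:
    print(sess.run(check_grads_finite(good)))  # True
    print(sess.run(check_grads_finite(bad)))   # False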
Code example #9
def row_normalize(x):
    with tf.name_scope('row_norm'):
        x = tf.clip_by_value(x, 0., 1.)
        s = tf.cast(tf.shape(x)[1], tf.float32)
        vec = tf.reduce_sum(x, axis=1)
        x = x / repeat(vec, s)
        x = tf.where(tf.is_finite(x), x, tf.ones(shape=tf.shape(x)) / s)
        return x
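The same row-normalization idea, sketched without the project-local repeat() helper (which is not shown above): each row is divided by its sum, and rows whose division yields inf/NaN (e.g. all-zero rows) fall back to a uniform 1/s. Assumes TF 1.x; values are illustrative.

import tensorflow as tf

x = tf.constant([[0.2, 0.8, 0.0],
                 [0.0, 0.0, 0.0]])
s = tf.cast(tf.shape(x)[1], tf.float32)
row_sums = tf.reduce_sum(x, axis=1)
normed = x / tf.expand_dims(row_sums, 1)
normed = tf.where(tf.is_finite(normed), normed, tf.ones_like(normed) / s)

with tf.Session() as sess:
    print(sess.run(normed))  # row 0: [0.2 0.8 0.], row 1: [1/3 1/3 1/3]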
Code example #10
def compute_gradients(loss, variables, loss_scale):
    with tf.name_scope("gradient_computation"):
        gradients = tf.gradients(loss * loss_scale, variables)
        # Create zero gradients for None entries
        zeros = [tf.zeros_like(var) for var in variables]
        gradients = [grad / loss_scale if grad is not None else None for grad in gradients]
        finites = [tf.reduce_all(tf.is_finite(grad)) if grad is not None else None for grad in gradients]
        gradients = [tf.where(finite, grad, zero) if grad is not None else None for finite, grad, zero in zip(finites, gradients, zeros)]

        all_finite = tf.reduce_all([f for f in finites if f is not None])

    return gradients, all_finite
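The core guard used above, isolated on a single gradient tensor: the finiteness flag is exposed so the caller can skip the step, and the gradient itself is replaced by zeros when any entry is non-finite. A minimal sketch using tf.cond instead of the per-tensor tf.where (TF 1.x, toy values):

import tensorflow as tf

grad = tf.constant([0.25, float('inf')])
finite = tf.reduce_all(tf.is_finite(grad))
safe_grad = tf.cond(finite, lambda: grad, lambda: tf.zeros_like(grad))

with tf.Session() as sess:
    print(sess.run([finite, safe_grad]))  # [False, array([0., 0.], dtype=float32)]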
Code example #11
def _custom_recall_at_k(labels_as_multi_hot, predictions, k):
    """Calculates recall_at_k metric with multi-hot labels.

  For each example which contains at least one label, a recall-at-k is
  calculated by assessing what proportion of these labels are in the top k
  predictions. This metric is the mean of these values.

  Args:
    labels_as_multi_hot: a tensor of [batch_size, num_output_classes] where
      elements are zero (absent) or one (present).
    predictions: a tensor of [batch_size, num_output_classes] where elements
      are floats indicating the probability of class membership.
    k: number of top predictions to consider (must be <= num_output_classes).

  Returns:
    mean: A scalar `Tensor` representing the current mean, the value of `total`
       divided by `count` (of finite values).
    update_op: An operation that increments the `total` and `count` variables
      appropriately and whose (scalar) value matches the mean_value.
  """
    labels_as_multi_hot = tf.cast(labels_as_multi_hot, tf.float32)

    num_output_classes = tf.shape(labels_as_multi_hot)[1]
    _, indices = tf.math.top_k(predictions, k=k)

    predictions_top_k_as_multi_hot = _indices_to_multihot(
        indices, num_output_classes)

    true_positives_tensor = tf.math.logical_and(
        tf.cast(labels_as_multi_hot, tf.bool),
        tf.cast(predictions_top_k_as_multi_hot, tf.bool))

    false_negatives_tensor = tf.math.greater(labels_as_multi_hot,
                                             predictions_top_k_as_multi_hot)

    true_positives_per_example = tf.count_nonzero(true_positives_tensor,
                                                  axis=1)
    false_negatives_per_example = tf.count_nonzero(false_negatives_tensor,
                                                   axis=1)

    recall_per_example = true_positives_per_example / (
        true_positives_per_example + false_negatives_per_example)

    is_finite = tf.is_finite(
        recall_per_example)  # To filter out no label cases.
    recall_per_example_finite_only = tf.boolean_mask(recall_per_example,
                                                     is_finite)

    return tf.metrics.mean(recall_per_example_finite_only)
Code example #12
def get_S_fromtensor(W):
    wsum = tf.sparse.reduce_sum(W, axis=1)
    wsum = tf.reshape(wsum, (-1, ))
    d_sqrt = tf.reciprocal(tf.sqrt(wsum))
    d_sqrt = tf.where(tf.is_finite(d_sqrt), d_sqrt,
                      tf.ones(shape=tf.shape(d_sqrt)))

    d_sqrt_i = tf.gather(d_sqrt, W.indices[:, 0])
    d_sqrt_j = tf.gather(d_sqrt, W.indices[:, 1])

    S = tf.sparse.SparseTensor(indices=W.indices,
                               values=W.values * d_sqrt_i * d_sqrt_j,
                               dense_shape=W.dense_shape)

    return S
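A hedged usage sketch for get_S_fromtensor above, computing the symmetrically normalized matrix D^-1/2 W D^-1/2 for a tiny sparse adjacency matrix (values illustrative; assumes TF 1.x):

import tensorflow as tf

W = tf.sparse.SparseTensor(indices=[[0, 1], [1, 0]],
                           values=[2.0, 2.0],
                           dense_shape=[2, 2])
S = get_S_fromtensor(W)

with tf.Session() as sess:
    print(sess.run(tf.sparse.to_dense(S)))  # [[0. 1.] [1. 0.]]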
Code example #13
File: glow_ops.py Project: changlan/tensor2tensor
def postprocess(x, n_bits_x=8):
  """Converts x from [-0.5, 0.5], to [0, 255].

  Args:
    x: 3-D or 4-D Tensor normalized between [-0.5, 0.5]
    n_bits_x: Number of bits representing each pixel of the output.
              Defaults to 8, i.e. 256 possible values.
  Returns:
    x: 3-D or 4-D Tensor representing images or videos.
  """
  x = tf.where(tf.is_finite(x), x, tf.ones_like(x))
  x = tf.clip_by_value(x, -0.5, 0.5)
  x += 0.5
  x = x * 2**n_bits_x
  return tf.cast(tf.clip_by_value(x, 0, 255), dtype=tf.uint8)
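A short usage sketch for postprocess above (TF 1.x): a 3-D tensor in [-0.5, 0.5] with one NaN is mapped to uint8 pixel values, the NaN first being replaced by 1.0 and then clipped.

import tensorflow as tf

x = tf.constant([[[-0.5, 0.0, 0.5, float('nan')]]])  # shape (1, 1, 4)

with tf.Session() as sess:
    print(sess.run(postprocess(x)))  # [[[  0 128 255 255]]]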
Code example #14
    def get_gradients_to_apply(self, device_num, gradient_state):
        device_grads = gradient_state
        tower_grad = device_grads[device_num]

        if self.benchmark_cnn.enable_auto_loss_scale and device_num == 0:
            # Since we don't aggregate variables in --independent mode, we cannot tell
            # if there are NaNs on all GPUs. So we arbitrarily choose to only check
            # NaNs on the first GPU.
            has_inf_nan_list = []
            for grad, _ in tower_grad:
                has_inf_nan_list.append(tf.reduce_all(tf.is_finite(grad)))
            self.grad_has_inf_nan = tf.logical_not(
                tf.reduce_all(has_inf_nan_list))

        return tower_grad
Code example #15
    def alignment_loss(self):
        # ------------------------ Our loss (with W) ------------------------------------------------------
        img = tf.slice(self.x_theta, [0, 0, 0, 0], [-1, -1, -1, self.num_channels - 1])  # (64, 128, 128, 3)
        mask = tf.slice(self.x_theta, [0, 0, 0, self.num_channels - 1], [-1, -1, -1, -1])  # (64, 128, 128, 1)

        #img = tf.layers.batch_normalization(x_theta_new0)

        #img = tf.multiply(img_slice, mask)  # (64, 128, 128, 3)

        sum_weighted_imgs = tf.reduce_sum(img, 0)
        # sum_weighted_imgs = tf.Print(sum_weighted_imgs, [sum_weighted_imgs], message="sum_weighted_imgs: ", summarize=100)

        sum_weights = tf.reduce_sum(mask, 0)
        sum_weights = tf.concat([sum_weights,sum_weights,sum_weights], 2)
        # sum_weights = tf.Print(sum_weights, [sum_weights], message="sum_weights: ", summarize=100)

        # averages = tf.where(tf.less(sum_weights, 1e-3), tf.zeros_like(sum_weighted_imgs), tf.divide(sum_weighted_imgs, sum_weights+1e-7))  #sum_weights+1e-7
        # averages = tf.Print(averages, [averages], message="averages: ", summarize=100)

        # "Recursive" average:
        average_batch = tf.where(tf.less(sum_weights, 1e-3), tf.zeros_like(sum_weighted_imgs), tf.divide(sum_weighted_imgs, sum_weights+1e-7))  #sum_weights+1e-7
        averages_curr = tf.divide(tf.multiply(self.cnt, self.averages_prev) + average_batch, tf.add(self.cnt, 1))
        # self.averages_prev = averages_curr
        # self.cnt = tf.add(self.cnt, 1)

        weighted_diff = tf.multiply(mask, tf.subtract(img, averages_curr))

        # square_weighted_diff = tf.square(weighted_diff)
        huber_diff = tf.where(tf.less(tf.abs(weighted_diff), self.delta), 0.5*tf.square(weighted_diff), self.delta*tf.abs(weighted_diff)-0.5*(self.delta**2))

        huber_diff_robust = huber_diff / (huber_diff + self.sigma ** 2)

        loss_sum_per_pixel = tf.reduce_sum(huber_diff_robust, 0)
        alignment_loss = tf.reduce_sum(loss_sum_per_pixel)
        alignment_loss =  alignment_loss/tf.reduce_sum(mask)   #  alignment_loss / (self.img_sz[0] * self.img_sz[1] * self.num_channels)
        # alignment_loss = tf.reduce_sum(alignment_loss / (alignment_loss + self.sigma ** 2))
        # alignment_loss = alignment_loss / (alignment_loss + self.sigma ** 2)

        a = tf.reduce_mean(tf.boolean_mask(self.x_theta, tf.is_finite(self.x_theta)), 0)
        # a = tf.Print(a,[a],message="a: ",summarize=100)
        b = tf.reduce_sum(mask) # need to remove
        # b = tf.Print(b,[b],message="b: ",summarize=100)

        return alignment_loss, a, b
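The per-element Huber term used in alignment_loss above, isolated as a tiny sketch (the delta value is illustrative; assumes TF 1.x):

import tensorflow as tf

delta = 1.0
d = tf.constant([0.3, -2.0])  # stand-ins for the weighted differences
huber = tf.where(tf.less(tf.abs(d), delta),
                 0.5 * tf.square(d),
                 delta * tf.abs(d) - 0.5 * delta ** 2)

with tf.Session() as sess:
    print(sess.run(huber))  # [0.045 1.5  ]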
Code example #16
    def alignment_loss_median(self):
        # ------------------------ Our loss (with W) ------------------------------------------------------
            # Compute median per pixel-stack:
        num_of_img_pixels = self.img_sz[0] * self.img_sz[1] * (self.img_sz[2]-1)
        img = tf.slice(self.x_theta, [0, 0, 0, 0], [-1, -1, -1, self.num_channels - 1])  # Shape: (bs, h, w, 3)
        mask = tf.slice(self.x_theta, [0, 0, 0, self.num_channels - 1], [-1, -1, -1, -1])  # Shape: (bs, h, w, 1)
        bool_mask = tf.logical_not(tf.equal(mask, 0))
        bool_mask = tf.concat([bool_mask, bool_mask, bool_mask], 3)
        neg_val = tf.ones_like(img) * -200.
        x_theta_tmp = tf.where(bool_mask, img, neg_val)  # Shape: (bs_sz, h, w, 3)
        # a = tf.reduce_max(x_theta_tmp)
        # a = tf.Print(a,[a],message="a: ",summarize=100)

        batch_elements = tf.transpose(tf.reshape(x_theta_tmp, [-1, num_of_img_pixels]))  # Shape: (h*w*3, bs)
        batch_elements = tf.concat([batch_elements, tf.zeros([num_of_img_pixels, 1])], 1)

        medians_pixels_stack = tf.map_fn(
            lambda x: tf.contrib.distributions.percentile(tf.boolean_mask(x, x > -10.), q=50., axis=[0]), batch_elements, dtype=tf.float32)  # Shape: (h*w*3, )
        # -- Instead, perform median on all values including irrelevant pixels:
        # medians_pixels_stack = tf.contrib.distributions.percentile(batch_elements, q=50., axis=[1]) # Shape: (h*w*3, )
        medians_in_img_shape = tf.reshape(medians_pixels_stack, [self.img_sz[0], self.img_sz[1], (self.img_sz[2]-1)])  # Shape: (h, w, 3)

            # Compute loss per pixel-stack:
        weighted_diff = tf.multiply(mask, tf.subtract(img, medians_in_img_shape))   # Shape: (bs, h, w, 3)
        # square_weighted_diff = tf.square(weighted_diff)
        huber_diff = tf.where(tf.less(tf.abs(weighted_diff), self.delta), 0.5*tf.square(weighted_diff), self.delta*tf.abs(weighted_diff)-0.5*(self.delta**2))
        huber_diff_robust = huber_diff #/ (huber_diff + self.sigma ** 2)

            # Compute total loss:
        loss_sum_pixel_stack = tf.reduce_sum(huber_diff_robust, 0)
        alignment_loss = tf.reduce_sum(loss_sum_pixel_stack)
        alignment_loss = alignment_loss/tf.reduce_sum(mask)   #  alignment_loss / (self.img_sz[0] * self.img_sz[1] * self.num_channels)
        # alignment_loss = tf.reduce_sum(alignment_loss / (alignment_loss + self.sigma ** 2))
        # alignment_loss = alignment_loss / (alignment_loss + self.sigma ** 2)

        a = tf.reduce_mean(tf.boolean_mask(self.x_theta, tf.is_finite(self.x_theta)), 0)
        # # a = tf.Print(a,[a],message="a: ",summarize=100)
        b = tf.reduce_sum(mask) # need to remove
        # b = tf.Print(b,[b],message="b: ",summarize=100)

        return alignment_loss, a, b
Code example #17
File: krum.py Project: big-data-lab-umbc/autodist
 def aggregate(self, gradients):
     with tf.name_scope("GAR_krum_tf"):
         # Assertion
         assert len(gradients) > 0, "Empty list of gradient to aggregate"
         # Distance computations
         distances = []
         for i in range(self.__nbworkers - 1):
             dists = list()
             for j in range(i + 1, self.__nbworkers):
                 sqr_dst = tf.reduce_sum(
                     tf.squared_difference(gradients[i], gradients[j]))
                 dists.append(
                     tf.negative(
                         tf.where(tf.is_finite(sqr_dst), sqr_dst,
                                  tf.constant(np.inf, dtype=sqr_dst.dtype)))
                 )  # Use of 'negative' to get the smallest distances and score indexes in 'nn.top_k'
             distances.append(dists)
         # Score computations
         scores = []
         for i in range(self.__nbworkers):
             dists = []
             for j in range(self.__nbworkers):
                 if j == i:
                     continue
                 if j < i:
                     dists.append(distances[j][i - j - 1])
                 else:
                     dists.append(distances[i][j - i - 1])
             dists = tf.parallel_stack(dists)
             dists, _ = tf.nn.top_k(dists,
                                    k=(self.__nbworkers - self.__nbbyzwrks -
                                       2),
                                    sorted=False)
             scores.append(tf.reduce_sum(dists))
         # Average of the 'nbselected' smallest scoring gradients
         gradients = tf.parallel_stack(gradients)
         scores = tf.parallel_stack(scores)
         _, indexes = tf.nn.top_k(scores, k=self.__nbselected, sorted=False)
         return tf.reduce_mean(tf.gather(gradients, indexes), axis=0)
Code example #18
def _create_autosummary_var(name, value_expr):
    assert not _autosummary_finalized
    v = tf.cast(value_expr, tf.float32)
    if v.shape.ndims == 0:
        v = [v, np.float32(1.0)]
    elif v.shape.ndims == 1:
        v = [tf.reduce_sum(v), tf.cast(tf.shape(v)[0], tf.float32)]
    else:
        v = [
            tf.reduce_sum(v),
            tf.reduce_prod(tf.cast(tf.shape(v), tf.float32))
        ]
    v = tf.cond(tf.is_finite(v[0]), lambda: tf.stack(v), lambda: tf.zeros(2))
    with tf.control_dependencies(None):
        var = tf.Variable(tf.zeros(2))  # [numerator, denominator]
    update_op = tf.cond(tf.is_variable_initialized(var),
                        lambda: tf.assign_add(var, v),
                        lambda: tf.assign(var, v))
    if name in _autosummary_vars:
        _autosummary_vars[name].append(var)
    else:
        _autosummary_vars[name] = [var]
    return update_op
Code example #19
File: radam_optimizer.py Project: tapika/Parser-v2
    def _apply_sparse(self, cache):
        """ """

        x_tm1, g_t, idxs = cache['x_tm1'], cache['g_t'], cache['idxs']
        idxs, idxs_ = tf.unique(idxs)
        g_t_ = tf.unsorted_segment_sum(g_t, idxs_, tf.size(idxs))
        updates = cache['updates']

        if self.mu > 0:
            m_t, t_m = self._sparse_moving_average(x_tm1,
                                                   idxs,
                                                   g_t_,
                                                   'm',
                                                   beta=self.mu)
            m_t_ = tf.gather(m_t, idxs)
            m_bar_t_ = (1 - self.gamma) * m_t_ + self.gamma * g_t_
            updates.extend([m_t, t_m])
        else:
            m_bar_t_ = g_t_

        if self.nu > 0:
            v_t, t_v = self._sparse_moving_average(x_tm1,
                                                   idxs,
                                                   g_t_**2,
                                                   'v',
                                                   beta=self.nu)
            v_t_ = tf.gather(v_t, idxs)
            v_bar_t_ = tf.sqrt(v_t_ + self.epsilon)
            updates.extend([v_t, t_v])
        else:
            v_bar_t_ = 1

        s_t_ = self.learning_rate * m_bar_t_ / v_bar_t_
        cache['s_t'] = tf.where(tf.is_finite(s_t_), s_t_, tf.zeros_like(s_t_))
        cache['g_t'] = g_t_
        cache['idxs'] = idxs
        return cache
Code example #20
def aggregate_single_gradient_using_copy(grad_and_vars, use_mean,
                                         check_inf_nan):
    """Calculate the average gradient for a shared variable across all towers.

  Note that this function provides a synchronization point across all towers.

  Args:
    grad_and_vars: A list or tuple of (gradient, variable) tuples. Each
      (gradient, variable) pair within the outer list represents the gradient
      of the variable calculated for a single tower, and the number of pairs
      equals the number of towers.
    use_mean: if True, mean is taken, else sum of gradients is taken.
    check_inf_nan: check grads for nans and infs.

  Returns:
    The tuple ([(average_gradient, variable),], has_nan_or_inf) where the
      gradient has been averaged across all towers. The variable is chosen from
      the first tower. The has_nan_or_inf indicates whether the grads contain NaN or Inf.
  """
    grads = [g for g, _ in grad_and_vars]
    if any(isinstance(g, tf.IndexedSlices) for g in grads):
        # TODO(reedwm): All-reduce IndexedSlices more effectively.
        grad = aggregate_indexed_slices_gradients(grads)
    else:
        grad = tf.add_n(grads)

    if use_mean and len(grads) > 1:
        grad = tf.scalar_mul(1.0 / len(grads), grad)

    v = grad_and_vars[0][1]
    if check_inf_nan:
        with tf.name_scope('check_for_inf_and_nan'):
            has_nan_or_inf = tf.logical_not(tf.reduce_all(tf.is_finite(grads)))
        return (grad, v), has_nan_or_inf
    else:
        return (grad, v), None
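A hedged usage sketch for aggregate_single_gradient_using_copy above, with two toy towers holding gradients for the same variable (values illustrative; TF 1.x):

import tensorflow as tf

v = tf.Variable([0.0, 0.0])
grad_and_vars = [(tf.constant([1.0, 2.0]), v),
                 (tf.constant([3.0, float('nan')]), v)]
(avg_grad, var), has_nan_or_inf = aggregate_single_gradient_using_copy(
    grad_and_vars, use_mean=True, check_inf_nan=True)

with tf.Session() as sess:
    print(sess.run([avg_grad, has_nan_or_inf]))  # [array([ 2., nan]), True]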
Code example #21
    def apply_updates(self):
        assert not self._updates_applied
        self._updates_applied = True
        devices = list(self._dev_grads.keys())
        total_grads = sum(len(grads) for grads in self._dev_grads.values())
        assert len(devices) >= 1 and total_grads >= 1
        ops = []
        with absolute_name_scope(self.scope):

            # Cast gradients to FP32 and calculate partial sum within each device.
            dev_grads = OrderedDict()  # device => [(grad, var), ...]
            for dev_idx, dev in enumerate(devices):
                with tf.name_scope('ProcessGrads%d' % dev_idx), tf.device(dev):
                    sums = []
                    for gv in zip(*self._dev_grads[dev]):
                        assert all(v is gv[0][1] for g, v in gv)
                        g = [tf.cast(g, tf.float32) for g, v in gv]
                        g = g[0] if len(g) == 1 else tf.add_n(g)
                        sums.append((g, gv[0][1]))
                    dev_grads[dev] = sums

            # Sum gradients across devices.
            if len(devices) > 1:
                with tf.name_scope('SumAcrossGPUs'), tf.device(None):
                    for var_idx, grad_shape in enumerate(self._grad_shapes):
                        g = [dev_grads[dev][var_idx][0] for dev in devices]
                        if np.prod(
                                grad_shape
                        ):  # nccl does not support zero-sized tensors
                            g = tf.contrib.nccl.all_sum(g)
                        for dev, gg in zip(devices, g):
                            dev_grads[dev][var_idx] = (
                                gg, dev_grads[dev][var_idx][1])

            # Apply updates separately on each device.
            for dev_idx, (dev, grads) in enumerate(dev_grads.items()):
                with tf.name_scope('ApplyGrads%d' % dev_idx), tf.device(dev):

                    # Scale gradients as needed.
                    if self.use_loss_scaling or total_grads > 1:
                        with tf.name_scope('Scale'):
                            coef = tf.constant(np.float32(1.0 / total_grads),
                                               name='coef')
                            coef = self.undo_loss_scaling(coef)
                            grads = [(g * coef, v) for g, v in grads]

                    # Check for overflows.
                    with tf.name_scope('CheckOverflow'):
                        grad_ok = tf.reduce_all(
                            tf.stack([
                                tf.reduce_all(tf.is_finite(g))
                                for g, v in grads
                            ]))

                    # Update weights and adjust loss scaling.
                    with tf.name_scope('UpdateWeights'):
                        opt = self._dev_opt[dev]
                        ls_var = self.get_loss_scaling_var(dev)
                        if not self.use_loss_scaling:
                            ops.append(
                                tf.cond(grad_ok,
                                        lambda: opt.apply_gradients(grads),
                                        tf.no_op))
                        else:
                            ops.append(
                                tf.cond(
                                    grad_ok, lambda: tf.group(
                                        tf.assign_add(ls_var, self.
                                                      loss_scaling_inc),
                                        opt.apply_gradients(grads)),
                                    lambda: tf.group(
                                        tf.assign_sub(ls_var, self.
                                                      loss_scaling_dec))))

                    # Report statistics on the last device.
                    if dev == devices[-1]:
                        with tf.name_scope('Statistics'):
                            ops.append(
                                autosummary(self.id + '/learning_rate',
                                            self.learning_rate))
                            ops.append(
                                autosummary(self.id + '/overflow_frequency',
                                            tf.where(grad_ok, 0, 1)))
                            if self.use_loss_scaling:
                                ops.append(
                                    autosummary(self.id + '/loss_scaling_log2',
                                                ls_var))

            # Initialize variables and group everything into a single op.
            self.reset_optimizer_state()
            init_uninited_vars(list(self._dev_ls_var.values()))
            return tf.group(*ops, name='TrainingOp')
Code example #22
def create_train_op(optimizer,
                    grads_and_vars,
                    max_grad=1.0,
                    mixed_precision=False,
                    gradient_accumulation_steps=1):
    global_step = tf.train.get_or_create_global_step()

    if gradient_accumulation_steps > 1:
        local_step = tf.get_variable(name="local_step",
                                     shape=[],
                                     dtype=tf.int32,
                                     trainable=False,
                                     initializer=tf.zeros_initializer)
        batch_finite = tf.get_variable(name="batch_finite",
                                       shape=[],
                                       dtype=tf.bool,
                                       trainable=False,
                                       initializer=tf.ones_initializer)
        accum_vars = [
            tf.get_variable(name=tvar.name.split(":")[0] + "/accum",
                            shape=tvar.shape.as_list(),
                            dtype=tf.float32,
                            trainable=False,
                            initializer=tf.zeros_initializer())
            for tvar in tf.trainable_variables()
        ]

        reset_step = tf.cast(tf.math.equal(
            local_step % gradient_accumulation_steps, 0),
                             dtype=tf.bool)
        local_step = tf.cond(
            reset_step, lambda: local_step.assign(tf.ones_like(local_step)),
            lambda: local_step.assign_add(1))

        grads_and_vars_and_accums = [(gv[0], gv[1], accum_vars[i])
                                     for i, gv in enumerate(grads_and_vars)
                                     if gv[0] is not None]
        grads, tvars, accum_vars = list(zip(*grads_and_vars_and_accums))

        all_are_finite = tf.reduce_all([
            tf.reduce_all(tf.is_finite(g)) for g in grads
        ]) if mixed_precision else tf.constant(True, dtype=tf.bool)
        batch_finite = tf.cond(
            reset_step, lambda: batch_finite.assign(
                tf.math.logical_and(tf.constant(True, dtype=tf.bool),
                                    all_are_finite)),
            lambda: batch_finite.assign(
                tf.math.logical_and(batch_finite, all_are_finite)))

        # This is how the model was pre-trained.
        # ensure global norm is a finite number
        # to prevent clip_by_global_norm from having a hissy fit.
        (clipped_grads, _) = tf.clip_by_global_norm(grads, clip_norm=max_grad)

        accum_vars = tf.cond(
            reset_step, lambda: [
                accum_vars[i].assign(grad)
                for i, grad in enumerate(clipped_grads)
            ], lambda: [
                accum_vars[i].assign_add(grad)
                for i, grad in enumerate(clipped_grads)
            ])

        def update(accum_vars):
            return optimizer.apply_gradients(list(zip(accum_vars, tvars)))

        update_step = tf.identity(tf.cast(tf.math.equal(
            local_step % gradient_accumulation_steps, 0),
                                          dtype=tf.bool),
                                  name="update_step")
        update_op = tf.cond(update_step, lambda: update(accum_vars),
                            lambda: tf.no_op())

        new_global_step = tf.cond(
            tf.math.logical_and(update_step, batch_finite),
            lambda: global_step + 1, lambda: global_step)
        new_global_step = tf.identity(new_global_step, name='step_update')
        train_op = tf.group(update_op, [global_step.assign(new_global_step)])
    else:
        grads_and_vars = [(g, v) for g, v in grads_and_vars if g is not None]
        grads, tvars = list(zip(*grads_and_vars))
        all_are_finite = tf.reduce_all([
            tf.reduce_all(tf.is_finite(g)) for g in grads
        ]) if mixed_precision else tf.constant(True, dtype=tf.bool)

        # This is how the model was pre-trained.
        # ensure global norm is a finite number
        # to prevent clip_by_global_norm from having a hissy fit.
        (clipped_grads, _) = tf.clip_by_global_norm(grads, clip_norm=max_grad)

        # Do not pass global_step here: this Adam does not increment global_step internally,
        # whereas TF's built-in optimizers (e.g. Adam) do, which would otherwise double-count global_step.
        train_op = optimizer.apply_gradients(list(zip(clipped_grads, tvars)))

        new_global_step = tf.cond(all_are_finite, lambda: global_step + 1,
                                  lambda: global_step)
        new_global_step = tf.identity(new_global_step, name='step_update')

        train_op = tf.group(train_op, [global_step.assign(new_global_step)])
    return train_op
Code example #23
def create_optimizer(loss,
                     learning_rate,
                     num_train_steps,
                     weight_decay_rate=0.0,
                     warmup_steps=0,
                     warmup_proportion=0,
                     lr_decay_power=1.0,
                     layerwise_lr_decay_power=-1,
                     n_transformer_layers=None,
                     hvd=None,
                     use_fp16=False,
                     num_accumulation_steps=1,
                     allreduce_post_accumulation=False):
    """
    Creates an optimizer and training op.
    """
    compression = Compression.fp16 if use_fp16 else Compression.none

    global_step = tf.train.get_or_create_global_step()

    learning_rate = tf.train.polynomial_decay(learning_rate,
                                              global_step,
                                              num_train_steps,
                                              end_learning_rate=0.0,
                                              power=lr_decay_power,
                                              cycle=False)
    warmup_steps = max(num_train_steps * warmup_proportion, warmup_steps)
    learning_rate *= tf.minimum(
        1.0,
        tf.cast(global_step, tf.float32) / tf.cast(warmup_steps, tf.float32))

    if layerwise_lr_decay_power > 0:
        learning_rate = _get_layer_lrs(learning_rate, layerwise_lr_decay_power,
                                       n_transformer_layers)
    optimizer = AdamWeightDecayOptimizer(
        learning_rate=learning_rate,
        weight_decay_rate=weight_decay_rate,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-6,
        exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])

    if hvd is not None and (num_accumulation_steps == 1 or
                            (not allreduce_post_accumulation)):
        optimizer = hvd.DistributedOptimizer(optimizer,
                                             sparse_as_dense=True,
                                             compression=compression)
    if use_fp16:
        loss_scale_manager = tf_contrib.mixed_precision.ExponentialUpdateLossScaleManager(
            init_loss_scale=2**32,
            incr_every_n_steps=1000,
            decr_every_n_nan_or_inf=2,
            decr_ratio=0.5)
        optimizer = tf_contrib.mixed_precision.LossScaleOptimizer(
            optimizer, loss_scale_manager)

    tvars = tf.trainable_variables()
    # if hvd.rank() == 0:
    #     print("*****Trainable variables*****")
    #     for v in tvars:
    #         print(v)
    #     print("*****************************")

    grads_and_vars = optimizer.compute_gradients(
        loss * 1.0 / num_accumulation_steps, tvars)

    if num_accumulation_steps > 1:
        local_step = tf.get_variable(name="local_step",
                                     shape=[],
                                     dtype=tf.int32,
                                     trainable=False,
                                     initializer=tf.zeros_initializer())
        batch_finite = tf.get_variable(name="batch_finite",
                                       shape=[],
                                       dtype=tf.bool,
                                       trainable=False,
                                       initializer=tf.ones_initializer())
        accum_vars = [
            tf.get_variable(name=tvar.name.split(":")[0] + "/accum",
                            shape=tvar.shape.as_list(),
                            dtype=tf.float32,
                            trainable=False,
                            initializer=tf.zeros_initializer())
            for tvar in tvars
        ]

        reset_step = tf.cast(tf.math.equal(local_step % num_accumulation_steps,
                                           0),
                             dtype=tf.bool)
        local_step = tf.cond(
            reset_step, lambda: local_step.assign(tf.ones_like(local_step)),
            lambda: local_step.assign_add(1))

        grads, tvars, accum_vars = zip(
            *[(g, v, g_acc)
              for (g, v), g_acc in zip(grads_and_vars, accum_vars)
              if g is not None])

        if use_fp16:
            # it turns out this condition can be False for a whole bunch of the first steps, and then stays True all the way
            all_are_finite = tf.reduce_all(
                [tf.reduce_all(tf.is_finite(g)) for g in grads])
            # if training is resumed from a checkpoint, gradients will again keep accumulating for a large number of initial steps,
            # which causes a spike in the loss;
            # it is done this way so that training can be resumed
            # all_are_finite = tf.constant(True, dtype=tf.bool)
        else:
            all_are_finite = tf.constant(True, dtype=tf.bool)

        batch_finite = tf.cond(
            reset_step, lambda: batch_finite.assign(
                tf.math.logical_and(tf.constant(True, dtype=tf.bool),
                                    all_are_finite)),
            lambda: batch_finite.assign(
                tf.math.logical_and(batch_finite, all_are_finite)))

        # This is how the model was pre-trained.
        # ensure global norm is a finite number
        # to prevent clip_by_global_norm from having a hissy fit.
        (clipped_grads, _) = tf.clip_by_global_norm(
            grads,
            clip_norm=1.0,
            use_norm=tf.cond(all_are_finite, lambda: tf.global_norm(grads),
                             lambda: tf.constant(1.0)))

        accum_vars = tf.cond(
            reset_step, lambda:
            [v.assign(grad)
             for v, grad in zip(accum_vars, clipped_grads)], lambda:
            [v.assign_add(grad) for v, grad in zip(accum_vars, clipped_grads)])

        def update(accum_vars):
            if allreduce_post_accumulation and hvd is not None:
                accum_vars = [
                    hvd.allreduce(tf.convert_to_tensor(accum_var),
                                  compression=compression) if isinstance(
                                      accum_var, tf.IndexedSlices) else
                    hvd.allreduce(accum_var, compression=compression)
                    for accum_var in accum_vars
                ]
            return optimizer.apply_gradients(list(zip(accum_vars, tvars)),
                                             global_step=global_step)

        update_step = tf.identity(tf.cast(tf.math.equal(
            local_step % num_accumulation_steps, 0),
                                          dtype=tf.bool),
                                  name="update_step")
        update_op = tf.cond(update_step, lambda: update(accum_vars),
                            lambda: tf.no_op())
        new_global_step = tf.cond(
            tf.math.logical_and(
                update_step,
                tf.cast(hvd.allreduce(tf.cast(batch_finite, tf.int32)),
                        tf.bool)), lambda: global_step + 1,
            lambda: global_step)
        new_global_step = tf.identity(new_global_step, name='step_update')
        train_op = tf.group(update_op, [global_step.assign(new_global_step)])
    else:
        grads_and_vars = [(g, v) for g, v in grads_and_vars if g is not None]
        grads, tvars = list(zip(*grads_and_vars))

        if use_fp16:
            all_are_finite = tf.reduce_all(
                [tf.reduce_all(tf.is_finite(g)) for g in grads])
        else:
            all_are_finite = tf.constant(True, dtype=tf.bool)

        # This is how the model was pre-trained.
        # ensure global norm is a finite number
        # to prevent clip_by_global_norm from having a hissy fit.
        (clipped_grads, _) = tf.clip_by_global_norm(
            grads,
            clip_norm=1.0,
            use_norm=tf.cond(all_are_finite, lambda: tf.global_norm(grads),
                             lambda: tf.constant(1.0)))
        train_op = optimizer.apply_gradients(list(zip(clipped_grads, tvars)),
                                             global_step=global_step)
        new_global_step = tf.cond(all_are_finite, lambda: global_step + 1,
                                  lambda: global_step)
        new_global_step = tf.identity(new_global_step, name='step_update')
        train_op = tf.group(train_op, [global_step.assign(new_global_step)])
    return train_op
Code example #24
 def _check_regularizer(self, reg):
   weights = tf.random.normal((12, 16))
   nll = reg(weights)
   self.assertTrue(bool(tf.is_finite(nll)), msg='Invalid prior nll returned.')
   self.assertFalse(bool(tf.is_nan(nll)), msg='Prior nll is NaN.')
Code example #25
def attention_layer(from_tensor,
                    to_tensor,
                    attention_mask=None,
                    num_attention_heads=1,
                    size_per_head=512,
                    query_act=None,
                    key_act=None,
                    value_act=None,
                    attention_probs_dropout_prob=0.0,
                    initializer_range=0.02,
                    softmax_temperature=1.0,
                    batch_size=None,
                    from_seq_length=None,
                    to_seq_length=None):
  """Performs multi-headed attention from `from_tensor` to `to_tensor`.

  This is an implementation of multi-headed attention based on "Attention
  is all you Need". If `from_tensor` and `to_tensor` are the same, then
  this is self-attention. Each timestep in `from_tensor` attends to the
  corresponding sequence in `to_tensor`, and returns a fixed-width vector.

  This function first projects `from_tensor` into a "query" tensor and
  `to_tensor` into "key" and "value" tensors. These are (effectively) a list
  of tensors of length `num_attention_heads`, where each tensor is of shape
  [batch_size, seq_length, size_per_head].

  Then, the query and key tensors are dot-producted and scaled. These are
  softmaxed to obtain attention probabilities. The value tensors are then
  interpolated by these probabilities, then concatenated back to a single
  tensor and returned.

  In practice, the multi-headed attention is done with tf.einsum as follows:
    Input_tensor: [BFD]
    Wq, Wk, Wv: [DNH]
    Q:[BFNH] = einsum('BFD,DNH->BFNH', Input_tensor, Wq)
    K:[BTNH] = einsum('BTD,DNH->BTNH', Input_tensor, Wk)
    V:[BTNH] = einsum('BTD,DNH->BTNH', Input_tensor, Wv)
    attention_scores:[BNFT] = einsum('BFNH,BTNH->BNFT', Q, K) / sqrt(H)
    attention_probs:[BNFT] = softmax(attention_scores)
    context_layer:[BFNH] = einsum('BNFT,BTNH->BFNH', attention_probs, V)
    Wout:[DNH]
    Output:[BFD] = einsum('BFNH,DNH->BFD', context_layer, Wout)

  Args:
    from_tensor: float Tensor of shape [batch_size, from_seq_length,
      from_width].
    to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width].
    attention_mask: (optional) int32 Tensor of shape [batch_size,
      from_seq_length, to_seq_length]. The values should be 1 or 0. The
      attention scores will effectively be set to -infinity for any positions in
      the mask that are 0, and will be unchanged for positions that are 1.
    num_attention_heads: int. Number of attention heads.
    size_per_head: int. Size of each attention head.
    query_act: (optional) Activation function for the query transform.
    key_act: (optional) Activation function for the key transform.
    value_act: (optional) Activation function for the value transform.
    attention_probs_dropout_prob: (optional) float. Dropout probability of the
      attention probabilities.
    initializer_range: float. Range of the weight initializer.
    softmax_temperature: The temperature for the softmax attention.
    batch_size: (Optional) int. If the input is 2D, this might be the batch size
      of the 3D version of the `from_tensor` and `to_tensor`.
    from_seq_length: (Optional) If the input is 2D, this might be the seq length
      of the 3D version of the `from_tensor`.
    to_seq_length: (Optional) If the input is 2D, this might be the seq length
      of the 3D version of the `to_tensor`.

  Returns:
    float Tensor of shape [batch_size, from_seq_length, num_attention_heads,
      size_per_head].

  Raises:
    ValueError: Any of the arguments or tensor shapes are invalid.
  """
  from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
  to_shape = get_shape_list(to_tensor, expected_rank=[2, 3])

  if len(from_shape) != len(to_shape):
    raise ValueError(
        "The rank of `from_tensor` must match the rank of `to_tensor`.")

  if len(from_shape) == 3:
    batch_size = from_shape[0]
    from_seq_length = from_shape[1]
    to_seq_length = to_shape[1]
  elif len(from_shape) == 2:
    if (batch_size is None or from_seq_length is None or to_seq_length is None):
      raise ValueError(
          "When passing in rank 2 tensors to attention_layer, the values "
          "for `batch_size`, `from_seq_length`, and `to_seq_length` "
          "must all be specified.")

  # Scalar dimensions referenced here:
  #   B = batch size (number of sequences)
  #   F = `from_tensor` sequence length
  #   T = `to_tensor` sequence length
  #   N = `num_attention_heads`
  #   H = `size_per_head`

  # `query_layer` = [B, F, N, H]
  query_layer = dense_layer_3d(from_tensor, num_attention_heads, size_per_head,
                               create_initializer(initializer_range), query_act,
                               "query")

  # `key_layer` = [B, T, N, H]
  key_layer = dense_layer_3d(to_tensor, num_attention_heads, size_per_head,
                             create_initializer(initializer_range), key_act,
                             "key")

  # `value_layer` = [B, T, N, H]
  value_layer = dense_layer_3d(to_tensor, num_attention_heads, size_per_head,
                               create_initializer(initializer_range), value_act,
                               "value")

  # Take the dot product between "query" and "key" to get the raw
  # attention scores.
  attention_scores = tf.einsum(
      "BFNH,BTNH->BNFT", query_layer, key_layer, name="query_key_einsum")

  attention_scores = attention_scores / softmax_temperature
  attention_scores = tf.multiply(attention_scores,
                                 1.0 / math.sqrt(float(size_per_head)))

  if attention_mask is not None:
    # `attention_mask` = [B, 1, F, T] or [B, H, F, T]
    # Caller can pass a rank 3 tensor for a constant mask or a rank 4 tensor for
    # a per-head attention mask.
    attention_mask = tf.reshape(
        attention_mask, shape=[batch_size, -1, from_seq_length, to_seq_length])

    # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
    # masked positions, this operation will create a tensor which is 0.0 for
    # positions we want to attend and -10000.0 for masked positions.
    attention_mask_float = tf.cast(attention_mask, tf.float32)
    # Please keep this tf.where as it fixes back propagation issues: It removes
    # NaNs when using tf.math.log.
    attention_mask_float = tf.where(attention_mask_float > 0.0,
                                    attention_mask_float,
                                    tf.zeros_like(attention_mask_float))

    adder = tf.math.log(attention_mask_float)
    adder = tf.where(
        tf.is_finite(adder), adder,
        tf.zeros_like(adder, dtype=tf.float32) - 10000.0)

    # Since we are adding it to the raw scores before the softmax, this is
    # effectively the same as removing these entirely.
    attention_scores += adder

  # Normalize the attention scores to probabilities.
  # `attention_probs` = [B, N, F, T]
  attention_probs = tf.nn.softmax(attention_scores)

  # This is actually dropping out entire tokens to attend to, which might
  # seem a bit unusual, but is taken from the original Transformer paper.
  attention_probs_do = dropout(attention_probs, attention_probs_dropout_prob)

  # `context_layer` = [B, F, N, H]
  context_layer = tf.einsum(
      "BNFT,BTNH->BFNH",
      attention_probs_do,
      value_layer,
      name="attention_value_einsum")

  return context_layer, attention_probs
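The attention-mask trick from the example above, isolated: log(mask) is -inf wherever the mask is 0, and tf.is_finite/tf.where turns those entries into a large negative additive bias on the scores. A tiny sketch (TF 1.x, toy mask):

import tensorflow as tf

mask = tf.constant([[1.0, 1.0, 0.0]])
adder = tf.math.log(mask)
adder = tf.where(tf.is_finite(adder), adder,
                 tf.zeros_like(adder) - 10000.0)

with tf.Session() as sess:
    print(sess.run(adder))  # [[     0.      0. -10000.]]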
Code example #26
    def get(self):
        """ Provides input data to the graph. """
        # calculate size of each record (this lists what is contained in the db and how many bytes are occupied)
        record_bytes = 0

        encoding_bytes = 4
        kp_xyz_entries = 3 * self.num_kp
        record_bytes += encoding_bytes*kp_xyz_entries

        encoding_bytes = 4
        kp_uv_entries = 2 * self.num_kp
        record_bytes += encoding_bytes*kp_uv_entries

        kp_vis_entries = self.num_kp
        record_bytes += encoding_bytes*kp_vis_entries

        image_bytes = self.image_size[0] * self.image_size[1] * 3
        record_bytes += image_bytes

        """ READ DATA ITEMS"""
        # Start reader
        reader = tf.FixedLengthRecordReader(header_bytes=0, record_bytes=record_bytes)
        _, value = reader.read(tf.train.string_input_producer([self.path_to_db]))

        # decode to floats
        bytes_read = 0
        data_dict = dict()
        record_bytes_float32 = tf.decode_raw(value, tf.float32)

        # 1. Read keypoint xyz
        keypoint_xyz21 = tf.reshape(tf.slice(record_bytes_float32, [bytes_read//4], [kp_xyz_entries]), [self.num_kp, 3])
        bytes_read += encoding_bytes*kp_xyz_entries
        keypoint_xyz21 /= 1000.0  # scale to meters
        keypoint_xyz21 = self.convert_kp(keypoint_xyz21)

        # calculate wrist coord
        if self.use_wrist_coord:
            wrist_xyz = keypoint_xyz21[16, :] + 2.0*(keypoint_xyz21[0, :] - keypoint_xyz21[16, :])
            keypoint_xyz21 = tf.concat([tf.expand_dims(wrist_xyz, 0),
                                        keypoint_xyz21[1:, :]], 0)

        data_dict['keypoint_xyz21'] = keypoint_xyz21

        # 2. Read keypoint uv AND VIS
        keypoint_uv_vis21 = tf.reshape(tf.slice(record_bytes_float32, [bytes_read//4], [kp_uv_entries+kp_vis_entries]), [self.num_kp, 3])
        bytes_read += encoding_bytes*(kp_uv_entries+kp_vis_entries)
        keypoint_uv_vis21 = self.convert_kp(keypoint_uv_vis21)
        keypoint_uv21 = keypoint_uv_vis21[:, :2]
        keypoint_vis21 = tf.equal(keypoint_uv_vis21[:, 2], 1.0)

        # calculate wrist vis
        if self.use_wrist_coord:
            wrist_vis = tf.logical_or(keypoint_vis21[16], keypoint_vis21[0])
            keypoint_vis21 = tf.concat([tf.expand_dims(wrist_vis, 0),
                                        keypoint_vis21[1:]], 0)

            wrist_uv = keypoint_uv21[16, :] + 2.0*(keypoint_uv21[0, :] - keypoint_uv21[16, :])
            keypoint_uv21 = tf.concat([tf.expand_dims(wrist_uv, 0),
                                       keypoint_uv21[1:, :]], 0)

        data_dict['keypoint_vis21'] = keypoint_vis21

        if self.coord_uv_noise:
            noise = tf.truncated_normal([42, 2], mean=0.0, stddev=self.coord_uv_noise_sigma)
            keypoint_uv21 += noise

        data_dict['keypoint_uv21'] = keypoint_uv21

        # decode to uint8
        record_bytes_uint8 = tf.decode_raw(value, tf.uint8)

        # 4. Read image
        image = tf.reshape(tf.slice(record_bytes_uint8, [bytes_read], [image_bytes]),
                               [self.image_size[0], self.image_size[1], 3])
        image = tf.cast(image, tf.float32)
        bytes_read += image_bytes

        # subtract mean
        image = image / 255.0 - 0.5
        if self.hue_aug:
            image = tf.image.random_hue(image, self.hue_aug_max)
        data_dict['image'] = image

        """ CONSTANTS """
        # Camera intrinsics
        sx = 822.79041
        sy = 822.79041
        tx = 318.47345
        ty = 250.31296
        data_dict['cam_mat'] = tf.constant([[sx, 0.0, tx], [0.0, sy, ty], [0.0, 0.0, 1.0]])

        # Hand side: this dataset only contains left hands
        data_dict['hand_side'] = tf.one_hot(tf.constant(0, dtype=tf.int32), depth=2, on_value=1.0, off_value=0.0, dtype=tf.float32)

        assert bytes_read == record_bytes, "Doesn't add up."

        """ DEPENDENT DATA ITEMS: XYZ represenations. """
        # make coords relative to root joint
        kp_coord_xyz_root = keypoint_xyz21[0, :] # this is the palm coord
        kp_coord_xyz21_rel = keypoint_xyz21 - kp_coord_xyz_root  # relative coords in metric coords
        index_root_bone_length = tf.sqrt(tf.reduce_sum(tf.square(kp_coord_xyz21_rel[12, :] - kp_coord_xyz21_rel[11, :])))
        data_dict['keypoint_scale'] = index_root_bone_length
        data_dict['keypoint_xyz21_normed'] = kp_coord_xyz21_rel / index_root_bone_length  # normalized by length of 12->11

        # calculate local coordinates
        kp_coord_xyz21_local = bone_rel_trafo(data_dict['keypoint_xyz21_normed'])
        kp_coord_xyz21_local = tf.squeeze(kp_coord_xyz21_local)
        data_dict['keypoint_xyz21_local'] = kp_coord_xyz21_local

        # calculate viewpoint and coords in canonical coordinates
        kp_coord_xyz21_rel_can, rot_mat = canonical_trafo(data_dict['keypoint_xyz21_normed'])
        kp_coord_xyz21_rel_can, rot_mat = tf.squeeze(kp_coord_xyz21_rel_can), tf.squeeze(rot_mat)
        data_dict['keypoint_xyz21_can'] = kp_coord_xyz21_rel_can
        data_dict['rot_mat'] = tf.matrix_inverse(rot_mat)

        """ DEPENDENT DATA ITEMS: HAND CROP """
        if self.hand_crop:
            crop_center = keypoint_uv21[12, ::-1]

            # catch problem, when no valid kp available (happens almost never)
            crop_center = tf.cond(tf.reduce_all(tf.is_finite(crop_center)), lambda: crop_center,
                                  lambda: tf.constant([0.0, 0.0]))
            crop_center.set_shape([2, ])

            if self.crop_center_noise:
                noise = tf.truncated_normal([2], mean=0.0, stddev=self.crop_center_noise_sigma)
                crop_center += noise

            crop_scale_noise = tf.constant(1.0)
            if self.crop_scale_noise:
                crop_scale_noise = tf.squeeze(tf.random_uniform([1], minval=1.0, maxval=1.2))

            if not self.use_wrist_coord:
                wrist_uv = keypoint_uv21[16, :] + 2.0*(keypoint_uv21[0, :] - keypoint_uv21[16, :])
                keypoint_uv21 = tf.concat([tf.expand_dims(wrist_uv, 0),
                                           keypoint_uv21[1:, :]], 0)

            # select visible coords only
            kp_coord_h = tf.boolean_mask(keypoint_uv21[:, 1], keypoint_vis21)
            kp_coord_w = tf.boolean_mask(keypoint_uv21[:, 0], keypoint_vis21)
            kp_coord_hw = tf.stack([kp_coord_h, kp_coord_w], 1)

            # determine size of crop (measure spatial extent of hw coords first)
            min_coord = tf.maximum(tf.reduce_min(kp_coord_hw, 0), 0.0)
            max_coord = tf.minimum(tf.reduce_max(kp_coord_hw, 0), self.image_size)

            # find out larger distance wrt the center of crop
            crop_size_best = 2*tf.maximum(max_coord - crop_center, crop_center - min_coord)
            crop_size_best = tf.reduce_max(crop_size_best)
            crop_size_best = tf.minimum(tf.maximum(crop_size_best, 50.0), 500.0)

            # catch problem, when no valid kp available
            crop_size_best = tf.cond(tf.reduce_all(tf.is_finite(crop_size_best)), lambda: crop_size_best,
                                  lambda: tf.constant(200.0))
            crop_size_best.set_shape([])

            # calculate necessary scaling
            scale = tf.cast(self.crop_size, tf.float32) / crop_size_best
            scale = tf.minimum(tf.maximum(scale, 1.0), 10.0)
            scale *= crop_scale_noise
            data_dict['crop_scale'] = scale

            if self.crop_offset_noise:
                noise = tf.truncated_normal([2], mean=0.0, stddev=self.crop_offset_noise_sigma)
                crop_center += noise

            # Crop image
            img_crop = crop_image_from_xy(tf.expand_dims(image, 0), crop_center, self.crop_size, scale)
            data_dict['image_crop'] = tf.squeeze(img_crop)

            # Modify uv21 coordinates
            crop_center_float = tf.cast(crop_center, tf.float32)
            keypoint_uv21_u = (data_dict['keypoint_uv21'][:, 0] - crop_center_float[1]) * scale + self.crop_size // 2
            keypoint_uv21_v = (data_dict['keypoint_uv21'][:, 1] - crop_center_float[0]) * scale + self.crop_size // 2
            keypoint_uv21 = tf.stack([keypoint_uv21_u, keypoint_uv21_v], 1)
            data_dict['keypoint_uv21'] = keypoint_uv21

            # Modify camera intrinsics
            scale = tf.reshape(scale, [1, ])
            scale_matrix = tf.dynamic_stitch([[0], [1], [2],
                                              [3], [4], [5],
                                              [6], [7], [8]], [scale, [0.0], [0.0],
                                                               [0.0], scale, [0.0],
                                                               [0.0], [0.0], [1.0]])
            scale_matrix = tf.reshape(scale_matrix, [3, 3])

            crop_center_float = tf.cast(crop_center, tf.float32)
            trans1 = crop_center_float[0] * scale - self.crop_size // 2
            trans2 = crop_center_float[1] * scale - self.crop_size // 2
            trans1 = tf.reshape(trans1, [1, ])
            trans2 = tf.reshape(trans2, [1, ])
            trans_matrix = tf.dynamic_stitch([[0], [1], [2],
                                              [3], [4], [5],
                                              [6], [7], [8]], [[1.0], [0.0], -trans2,
                                                               [0.0], [1.0], -trans1,
                                                               [0.0], [0.0], [1.0]])
            trans_matrix = tf.reshape(trans_matrix, [3, 3])

            data_dict['cam_mat'] = tf.matmul(trans_matrix, tf.matmul(scale_matrix, data_dict['cam_mat']))

        """ DEPENDENT DATA ITEMS: Scoremap from the SUBSET of 21 keypoints"""
        # create scoremaps from the subset of 2D annotation
        keypoint_hw21 = tf.stack([keypoint_uv21[:, 1], keypoint_uv21[:, 0]], -1)

        scoremap_size = self.image_size
        
        if self.hand_crop:
            scoremap_size = (self.crop_size, self.crop_size)

        scoremap = self.create_multiple_gaussian_map(keypoint_hw21,
                                                     scoremap_size,
                                                     self.sigma,
                                                     valid_vec=keypoint_vis21)
        
        if self.scoremap_dropout:
            scoremap = tf.nn.dropout(scoremap, self.scoremap_dropout_prob,
                                        noise_shape=[1, 1, 21])
            scoremap *= self.scoremap_dropout_prob

        data_dict['scoremap'] = scoremap

        if self.random_crop_to_size:
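            # NOTE: this reader never fills data_dict['hand_parts'] or data_dict['hand_mask'],
            # so enabling random_crop_to_size as written would raise a KeyError below.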
            tensor_stack = tf.concat([data_dict['image'],
                                      tf.expand_dims(tf.cast(data_dict['hand_parts'], tf.float32), -1),
                                      tf.cast(data_dict['hand_mask'], tf.float32)], 2)
            s = tensor_stack.get_shape().as_list()
            tensor_stack_cropped = tf.random_crop(tensor_stack,
                                                  [self.random_crop_size, self.random_crop_size, s[2]])
            data_dict = dict()  # delete everything else because the random cropping makes the data invalid anyway
            data_dict['image'], data_dict['hand_parts'], data_dict['hand_mask'] = tensor_stack_cropped[:, :, :3],\
                                                                                  tf.cast(tensor_stack_cropped[:, :, 3], tf.int32),\
                                                                                  tf.cast(tensor_stack_cropped[:, :, 4:], tf.int32)

        names, tensors = zip(*data_dict.items())

        if self.shuffle:
            tensors = tf.train.shuffle_batch_join([tensors],
                                                  batch_size=self.batch_size,
                                                  capacity=100,
                                                  min_after_dequeue=50,
                                                  enqueue_many=False)
        else:
            tensors = tf.train.batch_join([tensors],
                                          batch_size=self.batch_size,
                                          capacity=100,
                                          enqueue_many=False)

        return dict(zip(names, tensors))
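
In the hand-crop branch above, the camera matrix is updated by pre-multiplying it with a scale and a translation matrix, so that projecting with the new cam_mat lands directly in crop-pixel coordinates. A small numpy check of that composition (all numbers below are made up for illustration; only the matrix layout mirrors the reader code):

import numpy as np

s = 2.0                 # crop scale
crop_size = 256
cy, cx = 120.0, 200.0   # crop_center is stored in (row, col) = (v, u) order
K = np.array([[822.79041, 0.0, 318.47345],
              [0.0, 822.79041, 250.31296],
              [0.0, 0.0, 1.0]])

S = np.diag([s, s, 1.0])                                # scale_matrix
T = np.array([[1.0, 0.0, -(cx * s - crop_size // 2)],   # trans_matrix
              [0.0, 1.0, -(cy * s - crop_size // 2)],
              [0.0, 0.0, 1.0]])
K_crop = T @ S @ K                                      # new cam_mat

# Applying the same mapping directly to a projected pixel (u, v) reproduces
# the keypoint_uv21 update in the reader.
u, v = 330.0, 200.0
u_crop = (u - cx) * s + crop_size // 2
v_crop = (v - cy) * s + crop_size // 2
p = K_crop @ np.linalg.solve(K, np.array([u, v, 1.0]))
assert np.allclose(p[:2], [u_crop, v_crop])
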
コード例 #27
0
  if hvd is not None:
    optimizer = hvd.DistributedOptimizer(optimizer, sparse_as_dense=True, compression=Compression.fp16 if use_fp16 or manual_fp16 else Compression.none)

  tvars = tf.trainable_variables()
  # grads = tf.gradients(
  #    loss, tvars, colocate_gradients_with_ops=colocate_gradients_with_ops)
  # Change 10: calculate gradients with horovod
  grads_and_vars = optimizer.compute_gradients(loss, tvars)

  # # This is how the model was pre-trained.
  # (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
  # Change 11: clip grads
  grads_and_vars = [(g, v) for g, v in grads_and_vars if g is not None]
  grads, tvars = list(zip(*grads_and_vars))
  all_are_finite = tf.reduce_all(
      [tf.reduce_all(tf.is_finite(g)) for g in grads]) if use_fp16 or manual_fp16 else tf.constant(True, dtype=tf.bool)

  # This is how the model was pre-trained.
  # ensure global norm is a finite number
  # to prevent clip_by_global_norm from having a hissy fit.
  (clipped_grads, _) = tf.clip_by_global_norm(
      grads, clip_norm=1.0,
      use_norm=tf.cond(
          all_are_finite,
          lambda: tf.global_norm(grads),
          lambda: tf.constant(1.0)))

  # train_op = optimizer.apply_gradients(
  #    list(zip(grads, tvars)), global_step=global_step)
  # Change 12: apply grads using the clipped grads
  train_op = optimizer.apply_gradients(
      list(zip(clipped_grads, tvars)), global_step=global_step)
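
The snippet above guards the clip_by_global_norm call so that a non-finite fp16 gradient does not poison the norm. A minimal, self-contained sketch of that guard (the variable and loss are placeholders invented for illustration):

import tensorflow as tf

# Toy variable and loss (hypothetical), standing in for the BERT graph above.
x = tf.get_variable("x", shape=[3], initializer=tf.ones_initializer())
loss = tf.reduce_sum(tf.square(x))
grads = tf.gradients(loss, [x])

# Only pay for the true global norm when every gradient is finite; otherwise
# feed a dummy norm so tf.clip_by_global_norm cannot propagate NaN/Inf.
all_are_finite = tf.reduce_all([tf.reduce_all(tf.is_finite(g)) for g in grads])
(clipped_grads, _) = tf.clip_by_global_norm(
    grads, clip_norm=1.0,
    use_norm=tf.cond(all_are_finite,
                     lambda: tf.global_norm(grads),
                     lambda: tf.constant(1.0)))
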
コード例 #28
0
File: general.py  Project: teo-milea/hand3d
def calc_center_bb(binary_class_mask):
    """ Returns the center of mass coordinates for the given binary_class_mask. """
    with tf.variable_scope('calc_center_bb'):
        binary_class_mask = tf.cast(binary_class_mask, tf.int32)
        binary_class_mask = tf.equal(binary_class_mask, 1)
        s = binary_class_mask.get_shape().as_list()
        if len(s) == 4:
            binary_class_mask = tf.squeeze(binary_class_mask, [3])

        s = binary_class_mask.get_shape().as_list()
        assert len(s) == 3, "binary_class_mask must be 3D."
        assert (s[0] < s[1]) and (
            s[0] < s[2]), "binary_class_mask must be [Batch, Width, Height]"

        # my meshgrid
        x_range = tf.expand_dims(tf.range(s[1]), 1)
        y_range = tf.expand_dims(tf.range(s[2]), 0)
        X = tf.tile(x_range, [1, s[2]])
        Y = tf.tile(y_range, [s[1], 1])

        bb_list = list()
        center_list = list()
        crop_size_list = list()
        for i in range(s[0]):
            X_masked = tf.cast(tf.boolean_mask(X, binary_class_mask[i, :, :]),
                               tf.float32)
            Y_masked = tf.cast(tf.boolean_mask(Y, binary_class_mask[i, :, :]),
                               tf.float32)

            x_min = tf.reduce_min(X_masked)
            x_max = tf.reduce_max(X_masked)
            y_min = tf.reduce_min(Y_masked)
            y_max = tf.reduce_max(Y_masked)

            start = tf.stack([x_min, y_min])
            end = tf.stack([x_max, y_max])
            bb = tf.stack([start, end], 1)
            bb_list.append(bb)

            center_x = 0.5 * (x_max + x_min)
            center_y = 0.5 * (y_max + y_min)
            center = tf.stack([center_x, center_y], 0)

            center = tf.cond(tf.reduce_all(tf.is_finite(center)),
                             lambda: center,
                             lambda: tf.constant([160.0, 160.0]))
            center.set_shape([2])
            center_list.append(center)

            crop_size_x = x_max - x_min
            crop_size_y = y_max - y_min
            crop_size = tf.expand_dims(tf.maximum(crop_size_x, crop_size_y), 0)
            crop_size = tf.cond(tf.reduce_all(tf.is_finite(crop_size)),
                                lambda: crop_size,
                                lambda: tf.constant([100.0]))
            crop_size.set_shape([1])
            crop_size_list.append(crop_size)

        bb = tf.stack(bb_list)
        center = tf.stack(center_list)
        crop_size = tf.stack(crop_size_list)

        return center, bb, crop_size
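
A quick usage sketch for calc_center_bb (the toy mask, the constant input, and the session setup below are illustrative assumptions):

import numpy as np
import tensorflow as tf

# Toy batch of one 8x8 mask with a small rectangle of foreground pixels.
mask_np = np.zeros((1, 8, 8), np.int32)
mask_np[0, 2:5, 3:6] = 1

center, bb, crop_size = calc_center_bb(tf.constant(mask_np))
with tf.Session() as sess:
    c, b, s = sess.run([center, bb, crop_size])
    # c ~ [[3.0, 4.0]], s ~ [[2.0]]: bounding-box center and the larger box side
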
コード例 #29
0
    def apply_updates(self, allow_no_op: bool = False) -> tf.Operation:
        """Construct training op to update the registered variables based on their gradients."""
        tfutil.assert_tf_initialized()
        assert not self._updates_applied
        self._updates_applied = True
        all_ops = []

        # Check for no-op.
        if allow_no_op and len(self._devices) == 0:
            with tfutil.absolute_name_scope(self.scope):
                return tf.no_op(name='TrainingOp')

        # Clean up gradients.
        for device_idx, device in enumerate(self._devices.values()):
            with tfutil.absolute_name_scope(self.scope + "/Clean%d" % device_idx), tf.device(device.name):
                for var, grad in device.grad_raw.items():

                    # Filter out disconnected gradients and convert to float32.
                    grad = [g for g in grad if g is not None]
                    grad = [tf.cast(g, tf.float32) for g in grad]

                    # Sum within the device.
                    if len(grad) == 0:
                        grad = tf.zeros(var.shape)  # No gradients => zero.
                    elif len(grad) == 1:
                        grad = grad[0]              # Single gradient => use as is.
                    else:
                        grad = tf.add_n(grad)       # Multiple gradients => sum.

                    # Scale as needed.
                    scale = 1.0 / len(device.grad_raw[var]) / len(self._devices)
                    scale = tf.constant(scale, dtype=tf.float32, name="scale")
                    if self.minibatch_multiplier is not None:
                        scale /= tf.cast(self.minibatch_multiplier, tf.float32)
                    scale = self.undo_loss_scaling(scale)
                    device.grad_clean[var] = grad * scale

        # Sum gradients across devices.
        if len(self._devices) > 1:
            with tfutil.absolute_name_scope(self.scope + "/Broadcast"), tf.device(None):
                if platform.system() == "Windows":    # Windows => NCCL ops are not available.
                    self._broadcast_fallback()
                elif tf.VERSION.startswith("1.15."):  # TF 1.15 => NCCL ops are broken: https://github.com/tensorflow/tensorflow/issues/41539
                    self._broadcast_fallback()
                else:                                 # Otherwise => NCCL ops are safe to use.
                    self._broadcast_nccl()

        # Apply updates separately on each device.
        for device_idx, device in enumerate(self._devices.values()):
            with tfutil.absolute_name_scope(self.scope + "/Apply%d" % device_idx), tf.device(device.name):
                # pylint: disable=cell-var-from-loop

                # Accumulate gradients over time.
                if self.minibatch_multiplier is None:
                    acc_ok = tf.constant(True, name='acc_ok')
                    device.grad_acc = OrderedDict(device.grad_clean)
                else:
                    # Create variables.
                    with tf.control_dependencies(None):
                        for var in device.grad_clean.keys():
                            device.grad_acc_vars[var] = tf.Variable(tf.zeros(var.shape), trainable=False, name="grad_acc_var")
                        device.grad_acc_count = tf.Variable(tf.zeros([]), trainable=False, name="grad_acc_count")

                    # Track counter.
                    count_cur = device.grad_acc_count + 1.0
                    count_inc_op = lambda: tf.assign(device.grad_acc_count, count_cur)
                    count_reset_op = lambda: tf.assign(device.grad_acc_count, tf.zeros([]))
                    acc_ok = (count_cur >= tf.cast(self.minibatch_multiplier, tf.float32))
                    all_ops.append(tf.cond(acc_ok, count_reset_op, count_inc_op))

                    # Track gradients.
                    for var, grad in device.grad_clean.items():
                        acc_var = device.grad_acc_vars[var]
                        acc_cur = acc_var + grad
                        device.grad_acc[var] = acc_cur
                        with tf.control_dependencies([acc_cur]):
                            acc_inc_op = lambda: tf.assign(acc_var, acc_cur)
                            acc_reset_op = lambda: tf.assign(acc_var, tf.zeros(var.shape))
                            all_ops.append(tf.cond(acc_ok, acc_reset_op, acc_inc_op))

                # No overflow => apply gradients.
                all_ok = tf.reduce_all(tf.stack([acc_ok] + [tf.reduce_all(tf.is_finite(g)) for g in device.grad_acc.values()]))
                apply_op = lambda: device.optimizer.apply_gradients([(tf.cast(grad, var.dtype), var) for var, grad in device.grad_acc.items()])
                all_ops.append(tf.cond(all_ok, apply_op, tf.no_op))

                # Adjust loss scaling.
                if self.use_loss_scaling:
                    ls_inc_op = lambda: tf.assign_add(device.loss_scaling_var, self.loss_scaling_inc)
                    ls_dec_op = lambda: tf.assign_sub(device.loss_scaling_var, self.loss_scaling_dec)
                    ls_update_op = lambda: tf.group(tf.cond(all_ok, ls_inc_op, ls_dec_op))
                    all_ops.append(tf.cond(acc_ok, ls_update_op, tf.no_op))

                # Last device => report statistics.
                if device_idx == len(self._devices) - 1:
                    all_ops.append(autosummary.autosummary(self.id + "/learning_rate", tf.convert_to_tensor(self.learning_rate)))
                    all_ops.append(autosummary.autosummary(self.id + "/overflow_frequency", tf.where(all_ok, 0, 1), condition=acc_ok))
                    if self.use_loss_scaling:
                        all_ops.append(autosummary.autosummary(self.id + "/loss_scaling_log2", device.loss_scaling_var))

        # Initialize variables.
        self.reset_optimizer_state()
        if self.use_loss_scaling:
            tfutil.init_uninitialized_vars([device.loss_scaling_var for device in self._devices.values()])
        if self.minibatch_multiplier is not None:
            tfutil.run([var.initializer for device in self._devices.values() for var in list(device.grad_acc_vars.values()) + [device.grad_acc_count]])

        # Group everything into a single op.
        with tfutil.absolute_name_scope(self.scope):
            return tf.group(*all_ops, name="TrainingOp")
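
The accumulate-then-apply logic in apply_updates can be hard to follow inline; below is a stripped-down sketch of the same pattern for a single variable. The minibatch_multiplier value, learning rate, and plain SGD update are illustrative assumptions, not part of the class above.

import tensorflow as tf

minibatch_multiplier = 4
learning_rate = 0.01
var = tf.get_variable("w", shape=[2], initializer=tf.zeros_initializer())
grad = tf.placeholder(tf.float32, shape=[2])

acc_var = tf.Variable(tf.zeros([2]), trainable=False, name="grad_acc_var")
acc_count = tf.Variable(0.0, trainable=False, name="grad_acc_count")

count_cur = acc_count + 1.0
acc_cur = acc_var + grad
acc_ok = count_cur >= minibatch_multiplier

# Reset the counter and buffer on the final micro-batch, otherwise accumulate.
update_count = tf.cond(acc_ok,
                       lambda: tf.assign(acc_count, 0.0),
                       lambda: tf.assign(acc_count, count_cur))
with tf.control_dependencies([acc_cur]):
    update_acc = tf.cond(acc_ok,
                         lambda: tf.assign(acc_var, tf.zeros([2])),
                         lambda: tf.assign(acc_var, acc_cur))

# Apply the averaged accumulated gradient only when acc_ok is True.
maybe_apply = tf.cond(acc_ok,
                      lambda: tf.assign_sub(var, learning_rate * acc_cur / minibatch_multiplier),
                      lambda: tf.identity(var))
train_op = tf.group(update_count, update_acc, maybe_apply)
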
コード例 #30
0
File: BinaryDbReader.py  Project: LyazS/hand3d
    def get(self):
        """ Provides input data to the graph. """
        # calculate size of each record (this lists what is contained in the db and how many bytes are occupied)
        record_bytes = 2

        encoding_bytes = 4
        kp_xyz_entries = 3 * self.num_kp
        record_bytes += encoding_bytes * kp_xyz_entries

        encoding_bytes = 4
        kp_uv_entries = 2 * self.num_kp
        record_bytes += encoding_bytes * kp_uv_entries

        cam_matrix_entries = 9
        record_bytes += encoding_bytes * cam_matrix_entries

        image_bytes = self.image_size[0] * self.image_size[1] * 3
        record_bytes += image_bytes

        hand_parts_bytes = self.image_size[0] * self.image_size[1]
        record_bytes += hand_parts_bytes

        kp_vis_bytes = self.num_kp
        record_bytes += kp_vis_bytes
        """ READ DATA ITEMS"""
        # Start reader
        reader = tf.FixedLengthRecordReader(header_bytes=0,
                                            record_bytes=record_bytes)
        _, value = reader.read(
            tf.train.string_input_producer([self.path_to_db]))

        # decode to floats
        bytes_read = 0
        data_dict = dict()
        record_bytes_float32 = tf.decode_raw(value, tf.float32)

        # 1. Read keypoint xyz
        keypoint_xyz = tf.reshape(
            tf.slice(record_bytes_float32, [bytes_read // 4],
                     [kp_xyz_entries]), [self.num_kp, 3])
        bytes_read += encoding_bytes * kp_xyz_entries

        # calculate palm coord
        if not self.use_wrist_coord:
            palm_coord_l = tf.expand_dims(
                0.5 * (keypoint_xyz[0, :] + keypoint_xyz[12, :]), 0)
            palm_coord_r = tf.expand_dims(
                0.5 * (keypoint_xyz[21, :] + keypoint_xyz[33, :]), 0)
            keypoint_xyz = tf.concat([
                palm_coord_l, keypoint_xyz[1:21, :], palm_coord_r,
                keypoint_xyz[-20:, :]
            ], 0)

        data_dict['keypoint_xyz'] = keypoint_xyz

        # 2. Read keypoint uv
        keypoint_uv = tf.cast(
            tf.reshape(
                tf.slice(record_bytes_float32, [bytes_read // 4],
                         [kp_uv_entries]), [self.num_kp, 2]), tf.int32)
        bytes_read += encoding_bytes * kp_uv_entries

        keypoint_uv = tf.cast(keypoint_uv, tf.float32)

        # calculate palm coord
        if not self.use_wrist_coord:
            palm_coord_uv_l = tf.expand_dims(
                0.5 * (keypoint_uv[0, :] + keypoint_uv[12, :]), 0)
            palm_coord_uv_r = tf.expand_dims(
                0.5 * (keypoint_uv[21, :] + keypoint_uv[33, :]), 0)
            keypoint_uv = tf.concat([
                palm_coord_uv_l, keypoint_uv[1:21, :], palm_coord_uv_r,
                keypoint_uv[-20:, :]
            ], 0)

        if self.coord_uv_noise:
            noise = tf.truncated_normal([42, 2],
                                        mean=0.0,
                                        stddev=self.coord_uv_noise_sigma)
            keypoint_uv += noise

        data_dict['keypoint_uv'] = keypoint_uv

        # 3. Camera intrinsics
        cam_mat = tf.reshape(
            tf.slice(record_bytes_float32, [bytes_read // 4],
                     [cam_matrix_entries]), [3, 3])
        bytes_read += encoding_bytes * cam_matrix_entries
        data_dict['cam_mat'] = cam_mat

        # decode to uint8
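        # skip the 2 extra bytes accounted for by "record_bytes = 2" above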
        bytes_read += 2
        record_bytes_uint8 = tf.decode_raw(value, tf.uint8)

        # 4. Read image
        image = tf.reshape(
            tf.slice(record_bytes_uint8, [bytes_read], [image_bytes]),
            [self.image_size[0], self.image_size[1], 3])
        image = tf.cast(image, tf.float32)
        bytes_read += image_bytes

        # subtract mean
        image = image / 255.0 - 0.5
        if self.hue_aug:
            image = tf.image.random_hue(image, self.hue_aug_max)
        data_dict['image'] = image

        # 5. Read mask
        hand_parts_mask = tf.reshape(
            tf.slice(record_bytes_uint8, [bytes_read], [hand_parts_bytes]),
            [self.image_size[0], self.image_size[1]])
        hand_parts_mask = tf.cast(hand_parts_mask, tf.int32)
        bytes_read += hand_parts_bytes
        data_dict['hand_parts'] = hand_parts_mask
        hand_mask = tf.greater(hand_parts_mask, 1)
        bg_mask = tf.logical_not(hand_mask)
        data_dict['hand_mask'] = tf.cast(tf.stack([bg_mask, hand_mask], 2),
                                         tf.int32)

        # 6. Read visibility
        keypoint_vis = tf.reshape(
            tf.slice(record_bytes_uint8, [bytes_read], [kp_vis_bytes]),
            [self.num_kp])
        keypoint_vis = tf.cast(keypoint_vis, tf.bool)
        bytes_read += kp_vis_bytes

        # calculate palm visibility
        if not self.use_wrist_coord:
            palm_vis_l = tf.expand_dims(
                tf.logical_or(keypoint_vis[0], keypoint_vis[12]), 0)
            palm_vis_r = tf.expand_dims(
                tf.logical_or(keypoint_vis[21], keypoint_vis[33]), 0)
            keypoint_vis = tf.concat([
                palm_vis_l, keypoint_vis[1:21], palm_vis_r, keypoint_vis[-20:]
            ], 0)
        data_dict['keypoint_vis'] = keypoint_vis

        assert bytes_read == record_bytes, "Doesn't add up."
        """ DEPENDENT DATA ITEMS: SUBSET of 21 keypoints"""
        # figure out dominant hand by analysis of the segmentation mask
        one_map, zero_map = tf.ones_like(hand_parts_mask), tf.zeros_like(
            hand_parts_mask)
        cond_l = tf.logical_and(tf.greater(hand_parts_mask, one_map),
                                tf.less(hand_parts_mask, one_map * 18))
        cond_r = tf.greater(hand_parts_mask, one_map * 17)
        hand_map_l = tf.where(cond_l, one_map, zero_map)
        hand_map_r = tf.where(cond_r, one_map, zero_map)
        num_px_left_hand = tf.reduce_sum(hand_map_l)
        num_px_right_hand = tf.reduce_sum(hand_map_r)

        # PRODUCE the 21 subset using the segmentation masks
        # We only deal with the more prominent hand for each frame and discard the second set of keypoints
        kp_coord_xyz_left = keypoint_xyz[:21, :]
        kp_coord_xyz_right = keypoint_xyz[-21:, :]

        cond_left = tf.logical_and(
            tf.cast(tf.ones_like(kp_coord_xyz_left), tf.bool),
            tf.greater(num_px_left_hand, num_px_right_hand))
        kp_coord_xyz21 = tf.where(cond_left, kp_coord_xyz_left,
                                  kp_coord_xyz_right)

        hand_side = tf.where(
            tf.greater(num_px_left_hand,
                       num_px_right_hand), tf.constant(0, dtype=tf.int32),
            tf.constant(1, dtype=tf.int32))  # left hand = 0; right hand = 1
        data_dict['hand_side'] = tf.one_hot(hand_side,
                                            depth=2,
                                            on_value=1.0,
                                            off_value=0.0,
                                            dtype=tf.float32)

        data_dict['keypoint_xyz21'] = kp_coord_xyz21

        # make coords relative to root joint
        kp_coord_xyz_root = kp_coord_xyz21[0, :]  # this is the palm coord
        kp_coord_xyz21_rel = kp_coord_xyz21 - kp_coord_xyz_root  # relative coords in metric coords
        index_root_bone_length = tf.sqrt(
            tf.reduce_sum(
                tf.square(kp_coord_xyz21_rel[12, :] -
                          kp_coord_xyz21_rel[11, :])))
        data_dict['keypoint_scale'] = index_root_bone_length
        data_dict[
            'keypoint_xyz21_normed'] = kp_coord_xyz21_rel / index_root_bone_length  # normalized by length of 12->11

        # calculate local coordinates
        kp_coord_xyz21_local = bone_rel_trafo(
            data_dict['keypoint_xyz21_normed'])
        kp_coord_xyz21_local = tf.squeeze(kp_coord_xyz21_local)
        data_dict['keypoint_xyz21_local'] = kp_coord_xyz21_local

        # calculate viewpoint and coords in canonical coordinates
        kp_coord_xyz21_rel_can, rot_mat = canonical_trafo(
            data_dict['keypoint_xyz21_normed'])
        kp_coord_xyz21_rel_can, rot_mat = tf.squeeze(
            kp_coord_xyz21_rel_can), tf.squeeze(rot_mat)
        kp_coord_xyz21_rel_can = flip_right_hand(kp_coord_xyz21_rel_can,
                                                 tf.logical_not(cond_left))
        data_dict['keypoint_xyz21_can'] = kp_coord_xyz21_rel_can
        data_dict['rot_mat'] = tf.matrix_inverse(rot_mat)

        # Set of 21 for visibility
        keypoint_vis_left = keypoint_vis[:21]
        keypoint_vis_right = keypoint_vis[-21:]
        keypoint_vis21 = tf.where(cond_left[:, 0], keypoint_vis_left,
                                  keypoint_vis_right)
        data_dict['keypoint_vis21'] = keypoint_vis21

        # Set of 21 for UV coordinates
        keypoint_uv_left = keypoint_uv[:21, :]
        keypoint_uv_right = keypoint_uv[-21:, :]
        keypoint_uv21 = tf.where(cond_left[:, :2], keypoint_uv_left,
                                 keypoint_uv_right)
        data_dict['keypoint_uv21'] = keypoint_uv21
        """ DEPENDENT DATA ITEMS: HAND CROP """
        if self.hand_crop:
            crop_center = keypoint_uv21[12, ::-1]

            # catch problem, when no valid kp available (happens almost never)
            crop_center = tf.cond(tf.reduce_all(tf.is_finite(crop_center)),
                                  lambda: crop_center,
                                  lambda: tf.constant([0.0, 0.0]))
            crop_center.set_shape([
                2,
            ])

            if self.crop_center_noise:
                noise = tf.truncated_normal(
                    [2], mean=0.0, stddev=self.crop_center_noise_sigma)
                crop_center += noise

            crop_scale_noise = tf.constant(1.0)
            if self.crop_scale_noise:
                crop_scale_noise = tf.squeeze(
                    tf.random_uniform([1], minval=1.0, maxval=1.2))

            # select visible coords only
            kp_coord_h = tf.boolean_mask(keypoint_uv21[:, 1], keypoint_vis21)
            kp_coord_w = tf.boolean_mask(keypoint_uv21[:, 0], keypoint_vis21)
            kp_coord_hw = tf.stack([kp_coord_h, kp_coord_w], 1)

            # determine size of crop (measure spatial extent of hw coords first)
            min_coord = tf.maximum(tf.reduce_min(kp_coord_hw, 0), 0.0)
            max_coord = tf.minimum(tf.reduce_max(kp_coord_hw, 0),
                                   self.image_size)

            # find out larger distance wrt the center of crop
            crop_size_best = 2 * tf.maximum(max_coord - crop_center,
                                            crop_center - min_coord)
            crop_size_best = tf.reduce_max(crop_size_best)
            crop_size_best = tf.minimum(tf.maximum(crop_size_best, 50.0),
                                        500.0)

            # catch problem, when no valid kp available
            crop_size_best = tf.cond(
                tf.reduce_all(tf.is_finite(crop_size_best)),
                lambda: crop_size_best, lambda: tf.constant(200.0))
            crop_size_best.set_shape([])

            # calculate necessary scaling
            scale = tf.cast(self.crop_size, tf.float32) / crop_size_best
            scale = tf.minimum(tf.maximum(scale, 1.0), 10.0)
            scale *= crop_scale_noise
            data_dict['crop_scale'] = scale

            if self.crop_offset_noise:
                noise = tf.truncated_normal(
                    [2], mean=0.0, stddev=self.crop_offset_noise_sigma)
                crop_center += noise

            # Crop image
            img_crop = crop_image_from_xy(tf.expand_dims(image, 0),
                                          crop_center, self.crop_size, scale)
            data_dict['image_crop'] = tf.squeeze(img_crop)

            # Modify uv21 coordinates
            crop_center_float = tf.cast(crop_center, tf.float32)
            keypoint_uv21_u = (keypoint_uv21[:, 0] - crop_center_float[1]
                               ) * scale + self.crop_size // 2
            keypoint_uv21_v = (keypoint_uv21[:, 1] - crop_center_float[0]
                               ) * scale + self.crop_size // 2
            keypoint_uv21 = tf.stack([keypoint_uv21_u, keypoint_uv21_v], 1)
            data_dict['keypoint_uv21'] = keypoint_uv21

            # Modify camera intrinsics
            scale = tf.reshape(scale, [
                1,
            ])
            scale_matrix = tf.dynamic_stitch([
                [0], [1], [2], [3], [4], [5], [6], [7], [8]
            ], [scale, [0.0], [0.0], [0.0], scale, [0.0], [0.0], [0.0], [1.0]])
            scale_matrix = tf.reshape(scale_matrix, [3, 3])

            crop_center_float = tf.cast(crop_center, tf.float32)
            trans1 = crop_center_float[0] * scale - self.crop_size // 2
            trans2 = crop_center_float[1] * scale - self.crop_size // 2
            trans1 = tf.reshape(trans1, [
                1,
            ])
            trans2 = tf.reshape(trans2, [
                1,
            ])
            trans_matrix = tf.dynamic_stitch(
                [[0], [1], [2], [3], [4], [5], [6], [7], [8]],
                [[1.0], [0.0], -trans2, [0.0], [1.0], -trans1, [0.0], [0.0],
                 [1.0]])
            trans_matrix = tf.reshape(trans_matrix, [3, 3])

            data_dict['cam_mat'] = tf.matmul(trans_matrix,
                                             tf.matmul(scale_matrix, cam_mat))
        """ DEPENDENT DATA ITEMS: Scoremap from the SUBSET of 21 keypoints"""
        # create scoremaps from the subset of 2D annotation
        keypoint_hw21 = tf.stack([keypoint_uv21[:, 1], keypoint_uv21[:, 0]],
                                 -1)

        scoremap_size = self.image_size

        if self.hand_crop:
            scoremap_size = (self.crop_size, self.crop_size)

        scoremap = self.create_multiple_gaussian_map(keypoint_hw21,
                                                     scoremap_size,
                                                     self.sigma,
                                                     valid_vec=keypoint_vis21)

        if self.scoremap_dropout:
            scoremap = tf.nn.dropout(scoremap,
                                     self.scoremap_dropout_prob,
                                     noise_shape=[1, 1, 21])
            scoremap *= self.scoremap_dropout_prob

        data_dict['scoremap'] = scoremap

        if self.scale_to_size:
            image, keypoint_uv21, keypoint_vis21 = data_dict[
                'image'], data_dict['keypoint_uv21'], data_dict[
                    'keypoint_vis21']
            s = image.get_shape().as_list()
            image = tf.image.resize_images(image, self.scale_target_size)
            scale = (self.scale_target_size[0] / float(s[0]),
                     self.scale_target_size[1] / float(s[1]))
            keypoint_uv21 = tf.stack([
                keypoint_uv21[:, 0] * scale[1], keypoint_uv21[:, 1] * scale[0]
            ], 1)

            data_dict = dict(
            )  # delete everything else because the scaling makes the data invalid anyway
            data_dict['image'] = image
            data_dict['keypoint_uv21'] = keypoint_uv21
            data_dict['keypoint_vis21'] = keypoint_vis21

        elif self.random_crop_to_size:
            tensor_stack = tf.concat([
                data_dict['image'],
                tf.expand_dims(tf.cast(data_dict['hand_parts'], tf.float32),
                               -1),
                tf.cast(data_dict['hand_mask'], tf.float32)
            ], 2)
            s = tensor_stack.get_shape().as_list()
            tensor_stack_cropped = tf.random_crop(
                tensor_stack,
                [self.random_crop_size, self.random_crop_size, s[2]])
            data_dict = dict(
            )  # delete everything else because the random cropping makes the data invalid anyway
            data_dict['image'], data_dict['hand_parts'], data_dict['hand_mask'] = tensor_stack_cropped[:, :, :3],\
                                                                                  tf.cast(tensor_stack_cropped[:, :, 3], tf.int32),\
                                                                                  tf.cast(tensor_stack_cropped[:, :, 4:], tf.int32)

        names, tensors = zip(*data_dict.items())

        if self.shuffle:
            tensors = tf.train.shuffle_batch_join([tensors],
                                                  batch_size=self.batch_size,
                                                  capacity=100,
                                                  min_after_dequeue=50,
                                                  enqueue_many=False)
        else:
            tensors = tf.train.batch_join([tensors],
                                          batch_size=self.batch_size,
                                          capacity=100,
                                          enqueue_many=False)

        return dict(zip(names, tensors))
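
Both readers walk a fixed-length record by keeping a running byte offset: float32 fields are sliced from the tf.float32 view at offset // 4, raw uint8 fields from the tf.uint8 view at the byte offset itself. A numpy sketch of the same addressing (the field sizes below are made up):

import numpy as np

# Hypothetical record: 2 float32 values followed by a 2x2 uint8 image.
kp = np.array([1.5, -2.0], np.float32)
img = np.arange(4, dtype=np.uint8).reshape(2, 2)
record = kp.tobytes() + img.tobytes()

bytes_read = 0
as_f32 = np.frombuffer(record, np.float32)  # analogous to tf.decode_raw(value, tf.float32)
as_u8 = np.frombuffer(record, np.uint8)     # analogous to tf.decode_raw(value, tf.uint8)

kp_read = as_f32[bytes_read // 4: bytes_read // 4 + 2]
bytes_read += 4 * 2

img_read = as_u8[bytes_read: bytes_read + 4].reshape(2, 2)
bytes_read += 4

assert bytes_read == len(record)  # mirrors the "Doesn't add up." assert
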