import tensorflow as tf

# scalar_constant, get_constant, get_entropy, get_parents and the *_op kernels
# used below are blocksparse helpers; their exact imports (and a TF 1.x-style
# API) are assumed to be in scope for these snippets.


def clip_by_global_norm(grads,
                        clip_norm=1.0,
                        grad_scale=1.0,
                        saturate=0.0,
                        zero_infs=False,
                        zero_nans=False):

    grad_float = list()
    grad_ehalf = list()
    grad_bhalf = list()

    for grad in grads:
        if grad.dtype is tf.float32:
            grad_float.append(grad)
        elif grad.dtype is tf.float16:
            grad_ehalf.append(grad)
        elif grad.dtype is tf.bfloat16:
            grad_bhalf.append(grad)
        else:
            raise ValueError("unsupported grad dtype")

    with tf.device("/gpu:0"):
        global_norm, norm_scale, _ = clip_global_norm_op(
            scalar_constant(grad_scale, dtype=tf.float32),
            scalar_constant(clip_norm, dtype=tf.float32),
            grad_float,
            grad_ehalf,
            grad_bhalf,
            saturate=saturate,
            zero_infs=zero_infs,
            zero_nans=zero_nans)

    return global_norm, norm_scale
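
# Hypothetical usage sketch (not part of the original source): builds a tiny
# fp32 graph and computes the clipping scale; the returned norm_scale can be
# passed to the optimizer wrappers below via their norm_scale argument.
# Assumes TF 1.x graph mode and that the blocksparse GPU kernels are available.
def _example_clip_by_global_norm():
    x = tf.random_normal([32, 64])
    w = tf.get_variable("w_clip_example", [64, 10], dtype=tf.float32)
    loss = tf.reduce_sum(tf.matmul(x, w))
    grads = tf.gradients(loss, [w])
    global_norm, norm_scale = clip_by_global_norm(grads, clip_norm=1.0, zero_nans=True)
    return global_norm, norm_scale
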
    def softmax(self, x, scale=1.0, dtype=None):
        nn_lut = get_constant(self.nn_lut, name="nn")

        if dtype is None:
            dtype = self.softmax_dtype

        return blocksparse_softmax(x,
                                   scalar_constant(scale, dtype=tf.float32),
                                   nn_lut,
                                   blocks=self.blocks,
                                   blk_size=self.blk_size,
                                   ctx_blks=self.ctx_blks_q,
                                   lut_max=self.nn_max,
                                   T=dtype)
def embedding_lookup(emb, idx, sort_grad=True, bench=0, use_tf=False):

    dev = emb.op.device.lower()
    if use_tf or not dev or "cpu" in dev:
        #print("######################### Using TF embeding:", dev)
        y = tf.nn.embedding_lookup(convert_gradient_to_tensor(emb), idx)
    else:
        y = embedding_lookup_op(emb, idx,
                                scalar_constant(emb.shape[0].value, dtype=tf.int32),
                                sorted=sort_grad,
                                bench=bench)
    return y
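
# Hypothetical usage sketch (not part of the original source): the embedding
# table must live on a GPU device to take the fused-kernel path; otherwise the
# function falls back to tf.nn.embedding_lookup. Assumes TF 1.x graph mode.
def _example_embedding_lookup():
    with tf.device("/gpu:0"):
        emb = tf.get_variable("emb_example", [50000, 512], dtype=tf.float32)
    idx = tf.placeholder(tf.int32, [None], name="token_ids")
    return embedding_lookup(emb, idx, sort_grad=True)
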
    def masked_softmax(self, x, scale=1.0, autoregress_at_key=None, dtype=None):
        if self.softmax_mask is None:
            if autoregress_at_key is not None:
                raise ValueError("autoregress_at_key only applies to ops with mask_callback defined.")
            return self.softmax(x, scale)

        nn_lut  = get_constant(self.nn_lut,       name="nn")
        sm_mask = get_constant(self.softmax_mask, name="sm")

        if autoregress_at_key is not None:
            lut = get_constant(self.nt_lut, name="nt")
            key = scalar_constant(autoregress_at_key, dtype=tf.int32)
            with tf.control_dependencies([x.op]):
                sm_mask = bst_partial_autoregressive_mask(
                    sm_mask, lut, key,
                    blocks=self.blocks,
                    blk_size=self.blk_size,
                    ctx_blks_k=self.ctx_blks_k)

        if dtype is None:
            dtype = self.softmax_dtype

        return blocksparse_masked_softmax(x,
                                          scalar_constant(scale, dtype=tf.float32),
                                          nn_lut,
                                          sm_mask,
                                          blocks=self.blocks,
                                          blk_size=self.blk_size,
                                          ctx_blks=self.ctx_blks_q,
                                          lut_max=self.nn_max,
                                          T=dtype)
def filter_tensor(x,
                  scale=1.0,
                  saturate=0.0,
                  zero_infs=False,
                  zero_nans=False):
    return filter_tensor_op(x,
                            scalar_constant(scale, dtype=tf.float32),
                            saturate=float(saturate),
                            zero_infs=zero_infs,
                            zero_nans=zero_nans)
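
# Hypothetical usage sketch (not part of the original source): scrubs Inf/NaN
# values from a gradient tensor and saturates its magnitude, as the keyword
# arguments above suggest. The saturate value here (fp16 max) is an assumption.
def _example_filter_tensor(grad):
    return filter_tensor(grad, zero_infs=True, zero_nans=True, saturate=65504.0)
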
def blocksparse_l2_decay(param, gate=None, rate=0.05, epsilon=1e-12):

    _check_param_shape(param, gate)

    gate = [gate] if gate is not None else []

    return l2_decay_op(param,
                       scalar_constant(rate, dtype=tf.float32),
                       gate,
                       epsilon=epsilon)
def block_reduced_full_dw(param_grad, scale=1.0, norm="max", group_size=8):

    # max(abs()) or l2_norm()
    norm  = 0 if norm.lower() == "max" else 1
    # host-side scalar; if zero, the compute for this op is skipped.
    scale = scalar_constant(scale, dtype=tf.float32)

    assert group_size <= 8

    # backward walk param grad to find BlocksparseMatmulDW ops
    # this should only hit BlocksparseMatmulDWs, BlocksparseMatmulDGs, AddNs or FloatCasts
    ops = get_parents(param_grad, "BlocksparseMatmulDW")
    if len(ops) < 1:
        raise ValueError("BlocksparseMatmulDW op not found")

    # this sorting is dependent on the op names being correctly ordered.
    ops.sort(key=lambda op: op.name.split('/')[-1], reverse=True)

    # use the parent scope for the new ops
    scope = ops[-1].name.split('/')
    scope = '/'.join(scope[0:-1])

    # we're going to be using absolute names, so clear name_scope
    with tf.name_scope(None):
        dw_full = None
        offset  = 0
        while offset < len(ops):

            xs = [op.inputs[0] for op in ops[offset:offset+group_size] ]
            gs = [op.inputs[1] for op in ops[offset:offset+group_size] ]

            # Get the corresponding activation grad op for the last param grad op in the group
            bprop = None
            for consumer in gs[-1].consumers():
                if consumer.type == "BlocksparseMatmulDX":
                    bprop = consumer
                    break
            assert bprop is not None

            # get attributes of first op in group
            up    = ops[offset]
            bsize = up.get_attr("bsize")
            axis  = up.get_attr("axis")
            name  = "%s/block_reduced_full_dw_%03d" % (scope, offset)
            dw_full = [] if dw_full is None else [dw_full]

            dw_full, _, _ = blocksparse_reduced_dw(xs, gs, scale, dw_full, bsize=bsize, norm=norm, axis=axis, name=name)

            # force the dw op to run before any more time steps are processed
            bprop._add_control_input(dw_full.op)

            offset += group_size

    return dw_full
def blocksparse_prune(param,
                      gate,
                      step,
                      sparsity=None,
                      threshold=None,
                      norm="max",
                      frequency=1):

    _check_param_shape(param, gate)

    # exactly one of sparsity or threshold must be set
    assert (sparsity is None) ^ (threshold is None)

    if sparsity is not None:

        # apply pruning to the moving average
        norms = blocksparse_norm(param, norm=norm)

        k = scalar_constant(param.shape[0].value, dtype=tf.int32)

        _, idx = tf.nn.top_k(norms, k=k, sorted=True)

        return blocksparse_prune_op(gate,
                                    idx,
                                    scalar_constant(sparsity,
                                                    dtype=tf.float32),
                                    step,
                                    frequency=frequency)

    elif threshold is not None:

        norm = 1 if norm.lower() == "l2" else 0

        return blocksparse_threshold_prune_op(gate,
                                              param,
                                              scalar_constant(
                                                  threshold, dtype=tf.float32),
                                              step,
                                              frequency=frequency,
                                              norm_type=norm)
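
# Hypothetical usage sketch (not part of the original source): prunes a
# blocksparse gate either toward a target sparsity or by thresholding block
# norms; exactly one of sparsity/threshold may be set, per the assert above.
# The shapes of param/gate and the expected type of step are assumed here.
def _example_blocksparse_prune(param, gate, global_step):
    return blocksparse_prune(param, gate, global_step, sparsity=0.9, frequency=100)
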
def masked_softmax(x, mask=None, scale=1.0, bench=0):
    if mask is not None:
        x_shape = x.shape.as_list()
        m_shape = mask.shape.as_list()

        assert len(x_shape) == len(m_shape)
        for i in range(len(m_shape)):
            assert m_shape[i] in (1, x_shape[i])
        mask = [ mask ]
    else:
        mask = []

    return masked_softmax_op(x, scalar_constant(scale, dtype=tf.float32), mask, bench=bench)
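
# Hypothetical usage sketch (not part of the original source): applies the
# fused softmax with a mask broadcast over the batch and head dimensions.
# Per the asserts above, each mask dimension must be 1 or match x exactly.
def _example_masked_softmax():
    x = tf.random_normal([8, 4, 128, 128])               # [batch, heads, query, key]
    mask = tf.ones([1, 1, 128, 128], dtype=tf.float32)   # broadcast over batch/heads
    return masked_softmax(x, mask=mask, scale=1.0 / 64.0 ** 0.5)
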
def concrete_gate(loga,
                  tempurature=2.0 / 3.0,
                  limit_a=-0.1,
                  limit_b=1.1,
                  epsilon=1e-6):

    gate, _ = concrete_gate_op(loga,
                               get_entropy(),
                               scalar_constant(tempurature, dtype=tf.float32),
                               limit_a=limit_a,
                               limit_b=limit_b,
                               epsilon=epsilon)
    return gate
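
# Hypothetical usage sketch (not part of the original source): the log-alpha
# values are the trainable input to the gate op; shape and initializer here
# are assumptions, and the entropy source comes from get_entropy() above.
def _example_concrete_gate():
    loga = tf.get_variable("loga_example", [1024], initializer=tf.zeros_initializer())
    return concrete_gate(loga)
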
    def __init__(self,
                 learning_rate=5e-4,
                 beta2=0.999,
                 epsilon=1e-30,
                 clip_thresh=1.0,
                 norm_scale=None,
                 grad_scale=1.0,
                 saturate=0.0,
                 zero_infs=False,
                 zero_nans=False,
                 name="Adafactor",
                 zero_init_variables=False):

        super().__init__(False, name)
        self.epsilon = epsilon
        self.saturate = saturate
        self.zero_infs = zero_infs
        self.zero_nans = zero_nans
        self.name = name
        self.norm_scale = [] if norm_scale is None else [norm_scale]

        beta2_init = 0.0 if zero_init_variables else beta2

        with tf.device("/cpu:0"), tf.variable_scope("adafactor_decay"):

            one = scalar_constant(1.0, dtype=tf.float32)
            self.decay1_power = tf.Variable(initial_value=beta2_init,
                                            name="decay1_power",
                                            trainable=False)
            self.decay2_power = tf.Variable(initial_value=beta2_init *
                                            beta2_init,
                                            name="decay2_power",
                                            trainable=False)
            self.learn_rate = scalar_constant(learning_rate, dtype=tf.float32)
            self.clip_thresh = scalar_constant(clip_thresh, dtype=tf.float32)
            self.grad_scale = scalar_constant(grad_scale, dtype=tf.float32)
            self.decay_t = scalar_constant(beta2, dtype=tf.float32)
            self.decay = self.decay_t * (one - self.decay1_power) / (
                one - self.decay2_power)
def masked_top_k_softmax(x, k, mask=None, scale=1.0):

    assert k <= x.shape[-1].value <= 1024

    if mask is not None:
        x_shape = x.shape.as_list()
        m_shape = mask.shape.as_list()

        assert len(x_shape) == len(m_shape)
        for i in range(len(m_shape)):
            assert m_shape[i] in (1, x_shape[i])
        mask = [ mask ]
    else:
        mask = []

    return masked_top_k_softmax_op(x, k, scalar_constant(scale, dtype=tf.float32), mask)
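
# Hypothetical usage sketch (not part of the original source): restricts the
# softmax to the top-k values per row, as the name suggests. Per the assert
# above, the last dimension must be at most 1024 and k must not exceed it.
def _example_masked_top_k_softmax():
    x = tf.random_normal([8, 16, 256])
    mask = tf.ones([1, 1, 256], dtype=tf.float32)
    return masked_top_k_softmax(x, k=8, mask=mask, scale=0.125)
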
    def __init__(self,
                 learning_rate=3e-4,
                 beta1=0.9,
                 beta2=0.999,
                 epsilon=1e-8,
                 clip_sigmas=0.0,
                 norm_scale=None,
                 grad_scale=1.0,
                 saturate=0.0,
                 zero_infs=False,
                 zero_nans=False,
                 gated=False,
                 param_qspec=None,
                 mean_qspec=None,
                 var_qspec=None,
                 fp16=False,
                 zero_init_variables=False,
                 name="Adam"):

        super().__init__(False, name)
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.saturate = saturate
        self.zero_infs = zero_infs
        self.zero_nans = zero_nans
        self.gated = gated
        self.param_qspec = param_qspec
        self.mean_qspec = mean_qspec
        self.var_qspec = var_qspec
        self.name = name
        self.norm_scale = [] if norm_scale is None else [norm_scale]
        self.fp16 = fp16

        beta1_init = 0.0 if zero_init_variables else beta1
        beta2_init = 0.0 if zero_init_variables else beta2

        with tf.device("/cpu:0"), tf.variable_scope("adam_beta"):

            one = scalar_constant(1.0, dtype=tf.float32)
            self.beta1_power = tf.Variable(initial_value=beta1_init,
                                           name="beta1_power",
                                           trainable=False)
            self.beta2_power = tf.Variable(initial_value=beta2_init,
                                           name="beta2_power",
                                           trainable=False)
            self.beta1_t = scalar_constant(beta1, dtype=tf.float32)
            self.beta2_t = scalar_constant(beta2, dtype=tf.float32)
            self.clip_sigma = scalar_constant(clip_sigmas, dtype=tf.float32)
            self.grad_scale = scalar_constant(grad_scale, dtype=tf.float32)
            self.lr = scalar_constant(
                learning_rate, dtype=tf.float32) * tf.sqrt(
                    one - self.beta2_power) / (one - self.beta1_power)
def dropout(x, keep_prob, mask=None, mask_shape=None):

    keep_prob = scalar_constant(keep_prob)

    if mask is None:

        if mask_shape is not None and len(mask_shape) > 0:
            size = 1
            for m_dim, x_dim in zip(mask_shape, x.shape.as_list()):
                # we don't currently support placeholder dims when broadcasting the dropout mask
                assert m_dim == 1 or m_dim == x_dim, "incompatible mask_shape: %s x.shape: %s" % (mask_shape, x.shape)
                size *= m_dim
        else:
            size = 0

        mask = gen_dropout_mask_op(x, get_entropy(), keep_prob, size=size)

    if mask_shape is None:
        mask_shape = []

    return apply_dropout_mask_op(x, mask, keep_prob, mask_shape=mask_shape), mask
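
# Hypothetical usage sketch (not part of the original source): generates one
# dropout mask broadcast over the batch dimension (mask_shape dims must be 1 or
# match x, per the assert above) and reuses it on a second tensor of the same
# shape. Assumes TF 1.x graph mode.
def _example_dropout():
    x1 = tf.random_normal([32, 1024])
    x2 = tf.random_normal([32, 1024])
    y1, mask = dropout(x1, keep_prob=0.9, mask_shape=[1, 1024])
    y2, _    = dropout(x2, keep_prob=0.9, mask=mask, mask_shape=[1, 1024])
    return y1, y2
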
def softmax(x, scale=1.0, bench=0):
    return masked_softmax_op(x, scalar_constant(scale, dtype=tf.float32), [], bench=bench)