Example #1
 def MatmulJob():
     with flow.scope.placement(device_type, "0:0"):
         a = flow.get_variable(
             "a",
             shape=a_shape,
             dtype=dtype,
             initializer=flow.random_uniform_initializer(minval=0,
                                                         maxval=1),
             trainable=True,
         )
         b = flow.get_variable(
             "b",
             shape=b_shape,
             dtype=dtype,
             initializer=flow.random_uniform_initializer(minval=0,
                                                         maxval=1),
             trainable=True,
         )
         if data_type == "float16":
             out = flow.matmul(
                 flow.cast(a, dtype=flow.float16),
                 flow.cast(b, dtype=flow.float16),
                 transpose_a,
                 transpose_b,
                 alpha,
             )
             c = flow.get_variable(
                 "c",
                 shape=out.shape,
                 dtype=dtype,
                 initializer=flow.random_uniform_initializer(minval=-1,
                                                             maxval=1),
                 trainable=True,
             )
             loss = flow.cast(out + flow.cast(c, dtype=flow.float16),
                              dtype=flow.float)
         else:
             out = flow.matmul(a, b, transpose_a, transpose_b, alpha)
             c = flow.get_variable(
                 "c",
                 shape=out.shape,
                 dtype=dtype,
                 initializer=flow.random_uniform_initializer(minval=-1,
                                                             maxval=1),
                 trainable=True,
             )
             loss = out + c
         flow.optimizer.SGD(flow.optimizer.PiecewiseConstantScheduler(
             [], [0.0001]),
                            momentum=0).minimize(loss)
         flow.watch(a, test_global_storage.Setter("a"))
         flow.watch_diff(a, test_global_storage.Setter("a_diff"))
         flow.watch(b, test_global_storage.Setter("b"))
         flow.watch_diff(b, test_global_storage.Setter("b_diff"))
         flow.watch(c, test_global_storage.Setter("c"))
         flow.watch_diff(c, test_global_storage.Setter("c_diff"))
         flow.watch(loss, test_global_storage.Setter("loss"))
         flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))
         return loss
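For reference, the float16 branch above is numerically equivalent to casting both operands to half precision, multiplying, adding c in half precision, and casting the sum back to float32. A minimal NumPy sketch of that arithmetic (the shapes are placeholders, and the transpose/alpha flags supplied by the enclosing test are ignored here):

import numpy as np

# Hypothetical small shapes; the real test takes a_shape/b_shape from its configuration.
a = np.random.uniform(0, 1, (4, 8)).astype(np.float32)
b = np.random.uniform(0, 1, (8, 6)).astype(np.float32)
c = np.random.uniform(-1, 1, (4, 6)).astype(np.float32)

# Mirror of the float16 branch: cast, matmul, add, cast back to float32.
out_fp16 = np.matmul(a.astype(np.float16), b.astype(np.float16))
loss_fp16 = (out_fp16 + c.astype(np.float16)).astype(np.float32)

# The float32 branch computes the same expression without intermediate casts.
loss_fp32 = np.matmul(a, b) + c
print(np.max(np.abs(loss_fp16 - loss_fp32)))  # small, half-precision rounding only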
Example #2
 def Matmul(
     x: tp.Numpy.Placeholder((4, 4), dtype=flow.float32),
     y: tp.Numpy.Placeholder((4, 4), dtype=flow.float32),
 ) -> tp.Numpy:
     s = flow.matmul(x, y)
     flow.watch(s, Watch)
     z = flow.matmul(s, x)
     return z
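Example #2 chains two multiplications and uses flow.watch only to inspect the intermediate product s; the returned value is (x @ y) @ x. A plain NumPy check of that data flow (the Watch callback itself is defined elsewhere):

import numpy as np

x = np.random.rand(4, 4).astype(np.float32)
y = np.random.rand(4, 4).astype(np.float32)

s = np.matmul(x, y)  # the intermediate that flow.watch would expose
z = np.matmul(s, x)  # the value the job function returns
print(z.shape)       # (4, 4)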
Example #3
def createOfQNet(
    input_image: oft.Numpy.Placeholder((BATCH_SIZE, 4, 64, 64),
                                       dtype=flow.float32),
    var_name_prefix: str = "QNet",
    is_train: bool = True,
) -> oft.Numpy:
    (
        conv1_weight,
        conv1_bias,
        conv2_weight,
        conv2_bias,
        fc1_weight,
        fc1_bias,
        fc2_weight,
        fc2_bias,
    ) = getQNetParams(var_name_prefix=var_name_prefix, is_train=is_train)
    conv1 = flow.nn.compat_conv2d(input_image,
                                  conv1_weight,
                                  strides=[1, 1],
                                  padding="same",
                                  data_format="NCHW")
    conv1 = flow.nn.bias_add(conv1, conv1_bias, "NCHW")
    conv1 = flow.nn.relu(conv1)
    pool1 = flow.nn.max_pool2d(conv1, 2, 2, "VALID", "NCHW", name="pool1")
    conv2 = flow.nn.compat_conv2d(pool1,
                                  conv2_weight,
                                  strides=[1, 1],
                                  padding="same",
                                  data_format="NCHW")
    conv2 = flow.nn.bias_add(conv2, conv2_bias, "NCHW")
    conv2 = flow.nn.relu(conv2)
    pool2 = flow.nn.max_pool2d(conv2, 2, 2, "VALID", "NCHW", name="pool2")
    pool2_flatten = flow.reshape(pool2, (BATCH_SIZE, -1))
    fc1 = flow.matmul(a=pool2_flatten, b=fc1_weight, transpose_b=True)
    fc1 = flow.nn.bias_add(fc1, fc1_bias)
    fc1 = flow.nn.relu(fc1)
    fc2 = flow.matmul(a=fc1, b=fc2_weight, transpose_b=True)
    fc2 = flow.nn.bias_add(fc2, fc2_bias)
    return fc2
Example #4
def row_parallel_linear(
    name,
    x,
    output_size,
    weight_initializer,
    bias_initializer=flow.constant_initializer(0.0),
    weight_parallel_dist=distribute.get_row_linear_weight_parallel_dist(),
    bias_parallel_dist=distribute.get_row_linear_bias_parallel_dist(),
    dropout_rate=0.1,
    bias_dropout_fusion=True,
):
    w, b = get_linear_params(
        name,
        x.shape[-1],
        output_size,
        x.dtype,
        weight_initializer=weight_initializer,
        bias_initializer=bias_initializer,
        weight_parallel_dist=weight_parallel_dist,
        bias_parallel_dist=bias_parallel_dist,
    )
    # 2d sbp sig: [S(0), S(1)] x [B, S(0)] -> [S(0), P] -> [S(0), B]
    # data grad 2d sbp sig: [S(0), B] x [B, S(1)](transposed) -> [S(0), S(1)]
    x = flow.matmul(x, w)
    x = distribute.forward_p2b_parallel_cast(x)
    if bias_dropout_fusion:
        x = flow.nn.fused_bias_add_dropout(x,
                                           b,
                                           data_format="NHC",
                                           rate=dropout_rate)
    else:
        x = flow.nn.bias_add(x, b, data_format="NHC")
        x = flow.nn.dropout(x, rate=dropout_rate)

    return x
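The sbp comments in row_parallel_linear describe the classic row-parallel pattern: the input's last dimension and the weight's first dimension are split across ranks, each rank produces a partial product, and the partials are summed (the P -> B cast). A single-process NumPy sketch of that reduction with two simulated ranks (shapes are assumptions, and the all-reduce is shown as a plain addition):

import numpy as np

x = np.random.rand(8, 16).astype(np.float32)   # (batch, in_features)
w = np.random.rand(16, 32).astype(np.float32)  # (in_features, out_features)

# Simulate two "ranks": each holds half of x's columns and half of w's rows.
x0, x1 = np.split(x, 2, axis=1)
w0, w1 = np.split(w, 2, axis=0)

partial0 = np.matmul(x0, w0)  # each rank's partial result (the "P" state)
partial1 = np.matmul(x1, w1)

full = partial0 + partial1    # the reduction that the P -> B cast corresponds to
print(np.allclose(full, np.matmul(x, w)))  # True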
Example #5
    def logits(self, hidden_states, token_embeddings):
        """
        shape sig: (batch_size * seq_length, hidden_size) x (hidden_size, vocab_size)(transposed)
            -> (batch_size * seq_length, vocab_size)
        dp sbp sig: S(0) x B -> S(0)
        2d sbp sig: [S(0), B] x [B, S(1)](transposed) -> [S(0), S(1)]
        """
        assert len(hidden_states.shape) == 3
        assert np.prod(
            hidden_states.shape[0:2]) == self.batch_size * self.seq_length
        assert hidden_states.shape[-1] == self.hidden_size
        assert len(token_embeddings.shape) == 2
        assert token_embeddings.shape[0] == self.vocab_size
        assert token_embeddings.shape[1] == self.hidden_size

        with distribute.layer_placement_scope(-1):
            if (hidden_states.shape[0] == self.seq_length
                    and hidden_states.shape[1] == self.batch_size):
                # [s, b, H] -> [b, s, H]
                h = flow.transpose(hidden_states, [1, 0, 2])
            elif (hidden_states.shape[0] == self.batch_size
                  and hidden_states.shape[1] == self.seq_length):
                h = hidden_states
            else:
                raise ValueError(
                    f"invalid hidden states shape {hidden_states.shape}")

            # [s, b, H] or [b, s, H] -> [b * s, H]
            h = flow.flatten(h, start_dim=0, end_dim=1)
            # 2d sbp sig: [S(0), B] x [B, S(1)](transposed) -> [S(0), S(1)]
            # grad 2d sbp sig: [S(0), S(1)] x [B, S(0)] -> [S(0), P] -> [S(0), B]
            h = distribute.backward_p2b_parallel_cast(h)
            lgs = flow.matmul(h, token_embeddings, transpose_b=True)

        return lgs
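Example #5 reuses the token embedding table as the output projection (weight tying): hidden states of shape (batch * seq, hidden) are multiplied against the (vocab, hidden) embedding matrix with transpose_b=True to produce (batch * seq, vocab) logits. A shape-only NumPy sketch of that step (the dimensions are placeholders):

import numpy as np

batch_size, seq_length, hidden_size, vocab_size = 2, 4, 8, 100
h = np.random.rand(batch_size * seq_length, hidden_size).astype(np.float32)
token_embeddings = np.random.rand(vocab_size, hidden_size).astype(np.float32)

# transpose_b=True contracts over hidden_size: (b*s, H) x (V, H)^T -> (b*s, V)
logits = np.matmul(h, token_embeddings.T)
print(logits.shape)  # (8, 100)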
Example #6
def _AddClassficationLoss(input_blob,
                          label_blob,
                          hidden_size,
                          label_num,
                          initializer_range,
                          scope_name='classification'):
    with flow.scope.namespace(scope_name):
        output_weight_blob = flow.get_variable(
            name="output_weights",
            shape=[label_num, hidden_size],
            dtype=input_blob.dtype,
            # initializer=bert_util.CreateInitializer(initializer_range),
            initializer=flow.random_normal_initializer(
                mean=0.0, stddev=initializer_range, seed=None, dtype=None))
        output_bias_blob = flow.get_variable(
            name="output_bias",
            shape=[label_num],
            dtype=input_blob.dtype,
            initializer=flow.constant_initializer(0.0),
        )
        logit_blob = flow.matmul(input_blob,
                                 output_weight_blob,
                                 transpose_b=True)
        logit_blob = flow.nn.bias_add(logit_blob, output_bias_blob)
        pre_example_loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logit_blob, labels=label_blob)
        loss = pre_example_loss
        return loss, pre_example_loss, logit_blob
Example #7
def _AddNextSentenceOutput(input_blob, label_blob, hidden_size,
                           initializer_range):
    with flow.scope.namespace("cls-seq_relationship"):
        output_weight_blob = flow.get_variable(
            name="output_weights",
            shape=[2, hidden_size],
            dtype=input_blob.dtype,
            model_name="weight",
            initializer=bert_util.CreateInitializer(initializer_range),
        )
        output_bias_blob = flow.get_variable(
            name="output_bias",
            shape=[2],
            dtype=input_blob.dtype,
            model_name="bias",
            initializer=flow.constant_initializer(0.0),
        )
        logit_blob = flow.matmul(input_blob,
                                 output_weight_blob,
                                 transpose_b=True)
        logit_blob = flow.nn.bias_add(logit_blob, output_bias_blob)
        pre_example_loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logit_blob, labels=label_blob)
        loss = pre_example_loss
        return (loss, pre_example_loss, logit_blob)
Example #8
 def test_fn(
         a: flow.typing.Numpy.Placeholder(a_shape),
         b: flow.typing.Numpy.Placeholder(b_shape),
         c: flow.typing.Numpy.Placeholder(c_shape),
 ) -> flow.typing.Numpy:
     var_a = flow.get_variable(
         name="var_a",
         shape=a_shape,
         dtype=flow.float32,
         initializer=flow.ones_initializer(),
         distribute=flow.distribute.split(1),
     )
     a = flow.parallel_cast(a, distribute=flow.distribute.split(1))
     a = var_a * a
     out = flow.matmul(a, b)
     out = flow.parallel_cast(
         out,
         distribute=flow.distribute.broadcast(),
         gradient_distribute=flow.distribute.broadcast(),
     )
     c = flow.parallel_cast(c, distribute=flow.distribute.broadcast())
     out = flow.nn.bias_add(out, c)
     lr_scheduler = flow.optimizer.PiecewiseConstantScheduler([], [0.001])
     flow.optimizer.SGD(lr_scheduler, momentum=0).minimize(out)
     return out
Example #9
def col_parallel_linear(
    name,
    x,
    output_size,
    weight_initializer,
    bias_initializer=flow.constant_initializer(0.0),
    weight_parallel_dist=distribute.get_col_linear_weight_parallel_dist(),
    bias_parallel_dist=distribute.get_col_linear_bias_parallel_dist(),
    need_gelu=False,
    bias_gelu_fusion=True,
):
    w, b = get_linear_params(
        name,
        x.shape[-1],
        output_size,
        x.dtype,
        weight_initializer=weight_initializer,
        bias_initializer=bias_initializer,
        weight_parallel_dist=weight_parallel_dist,
        bias_parallel_dist=bias_parallel_dist,
    )
    # 2d sbp sig: [S(0), B] x [B, S(1)] -> [S(0), S(1)]
    # data grad 2d sbp sig: [S(0), S(1)] x [B, S(0)](transposed) -> [S(0), P] -> [S(0), B]
    x = distribute.backward_p2b_parallel_cast(x)
    x = flow.matmul(x, w)
    if need_gelu:
        if bias_gelu_fusion:
            x = flow.nn.fused_bias_add_gelu(x, b, data_format="NHC")
        else:
            x = flow.nn.bias_add(x, b, data_format="NHC")
            x = flow.math.gelu(x)
    else:
        x = flow.nn.bias_add(x, b, data_format="NHC")

    return x
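col_parallel_linear is the complementary pattern to Example #4: the weight is split along its output dimension, each rank computes a column slice of the result, and no reduction is needed; concatenating the slices recovers the full product. A single-process NumPy illustration (shapes are assumptions):

import numpy as np

x = np.random.rand(8, 16).astype(np.float32)   # (batch, in_features), same on every rank
w = np.random.rand(16, 32).astype(np.float32)  # (in_features, out_features)

# Simulate two "ranks", each holding half of w's output columns.
w0, w1 = np.split(w, 2, axis=1)

y0 = np.matmul(x, w0)  # each rank owns a column slice of the output (the S(1) state)
y1 = np.matmul(x, w1)

print(np.allclose(np.concatenate([y0, y1], axis=1), np.matmul(x, w)))  # True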
Example #10
def gram_matrix(input):
    b = input.shape[0]
    ch = input.shape[1]
    h = input.shape[2]
    w = input.shape[3]
    features = flow.reshape(input, [b, ch, h * w])
    features_t = flow.transpose(features, [0, 2, 1])
    gram = flow.matmul(features, features_t) / (ch * h * w)
    return gram
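gram_matrix flattens each feature map to (channels, h*w) and computes a batched channel-by-channel inner product, normalized by the channel count times the number of elements per channel, i.e. the usual style-transfer Gram matrix. An equivalent NumPy formulation:

import numpy as np

def gram_matrix_np(x):
    """x: (batch, channels, height, width) -> (batch, channels, channels)."""
    b, ch, h, w = x.shape
    features = x.reshape(b, ch, h * w)
    # Batched matmul of the features with their transpose, as in the flow.matmul call above.
    return np.matmul(features, features.transpose(0, 2, 1)) / (ch * h * w)

print(gram_matrix_np(np.random.rand(2, 3, 8, 8).astype(np.float32)).shape)  # (2, 3, 3)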
Example #11
    def fused_multihead_attn(self, h):
        assert len(h.shape) == 3
        assert h.shape[0] == self.seq_length
        assert h.shape[1] == self.batch_size
        assert h.shape[2] == self.hidden_size * 3

        qmk, v = flow.nn.fused_self_attention_query_mul_key_and_value(
            h, head_size=self.head_size, alpha=(1.0 / self.norm_factor))
        qmk = self.tril_softmax_dropout(qmk)
        return flow.matmul(qmk, v)
Example #12
def inceptionv3(images, labels, trainable=True):
    conv0 = _conv2d_layer(
        "conv0", images, filters=32, kernel_size=3, strides=2, padding="VALID"
    )
    conv1 = _conv2d_layer(
        "conv1", conv0, filters=32, kernel_size=3, strides=1, padding="VALID"
    )
    conv2 = _conv2d_layer(
        "conv2", conv1, filters=64, kernel_size=3, strides=1, padding="SAME"
    )
    pool1 = flow.nn.max_pool2d(
        conv2, ksize=3, strides=2, padding="VALID", data_format="NCHW", name="pool1"
    )
    conv3 = _conv2d_layer(
        "conv3", pool1, filters=80, kernel_size=1, strides=1, padding="VALID"
    )
    conv4 = _conv2d_layer(
        "conv4", conv3, filters=192, kernel_size=3, strides=1, padding="VALID"
    )
    pool2 = flow.nn.max_pool2d(
        conv4, ksize=3, strides=2, padding="VALID", data_format="NCHW", name="pool2"
    )
    mixed_0 = InceptionA(pool2, 0)
    mixed_1 = InceptionA(mixed_0, 1)
    mixed_2 = InceptionA(mixed_1, 2)
    mixed_3 = InceptionB(mixed_2, 3)
    mixed_4 = InceptionC(mixed_3, 4, 128)
    mixed_5 = InceptionC(mixed_4, 5, 160)
    mixed_6 = InceptionC(mixed_5, 6, 160)
    mixed_7 = InceptionC(mixed_6, 7, 192)
    mixed_8 = InceptionD(mixed_7, 8)
    mixed_9 = InceptionE(mixed_8, 9)
    mixed_10 = InceptionE(mixed_9, 10)
    pool3 = flow.nn.avg_pool2d(
        mixed_10, ksize=8, strides=1, padding="VALID", data_format="NCHW", name="pool3"
    )
    with flow.scope.namespace("logits"):
        pool3 = flow.reshape(pool3, [pool3.shape[0], -1])
        weight = flow.get_variable(
            "fc1-weight",
            shape=(pool3.shape[1], 1001),
            dtype=flow.float,
            initializer=flow.truncated_normal(0.816496580927726),
            model_name="weight",
        )
        bias = flow.get_variable(
            "fc1-bias",
            shape=(1001,),
            dtype=flow.float,
            initializer=flow.constant_initializer(),
            model_name="bias",
        )
        fc1 = flow.matmul(pool3, weight)
        fc1 = flow.nn.bias_add(fc1, bias)
    return fc1
Example #13
 def self_attn_qk_v_fw_bw(
     h: flow.typing.Numpy.Placeholder(
         shape=(seq_len, batch_size, hidden_size), dtype=flow.float32
     )
 ) -> typing.Tuple[flow.typing.Numpy, flow.typing.Numpy]:
     var = flow.get_variable(
         "var",
         shape=(1,),
         dtype=flow.float32,
         initializer=flow.constant_initializer(1.0, dtype=flow.float32),
         trainable=True,
     )
     h = h * var
     if fused:
         flow.watch_diff(h, test_global_storage.Setter("h_grad_fused"))
     else:
         flow.watch_diff(h, test_global_storage.Setter("h_grad"))
     if fp16:
         h = flow.amp_white_identity(h)
     alpha = get_alpha(head_size)
     if fused:
         (qmk, v) = flow.nn.fused_self_attention_query_mul_key_and_value(
             h, head_size=head_size, alpha=alpha
         )
     else:
         h = flow.reshape(h, (seq_len, batch_size, -1, 3 * head_size))
         (q, k, v) = (
             flow.transpose(
                 flow.slice(
                     h,
                     begin=[None, None, None, head_size * i],
                     size=[None, None, None, head_size],
                 ),
                 perm=[1, 2, 0, 3],
             )
             for i in range(3)
         )
         qmk = flow.matmul(q, k, transpose_b=True, alpha=alpha)
     h = flow.matmul(qmk, v)
     loss = flow.math.reduce_sum(h)
     flow.optimizer.SGD(get_lr_scheduler(), momentum=0).minimize(loss)
     return (qmk, v)
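In the unfused branch of Example #13, the packed hidden state of shape (seq, batch, hidden*3) is reshaped to (seq, batch, heads, 3*head_size); q, k and v are then sliced out of the last dimension and transposed to (batch, heads, seq, head_size). A NumPy sketch of that unpacking (sizes are placeholders):

import numpy as np

seq_len, batch_size, num_heads, head_size = 4, 2, 3, 8
hidden_size = num_heads * head_size
h = np.random.rand(seq_len, batch_size, hidden_size * 3).astype(np.float32)

h = h.reshape(seq_len, batch_size, num_heads, 3 * head_size)
# Slice q, k, v out of the packed last dimension and move to (batch, heads, seq, head).
q, k, v = (h[..., i * head_size:(i + 1) * head_size].transpose(1, 2, 0, 3)
           for i in range(3))

alpha = 1.0 / np.sqrt(head_size)
qmk = np.matmul(q, k.transpose(0, 1, 3, 2)) * alpha  # matches transpose_b=True with alpha
print(qmk.shape)  # (2, 3, 4, 4)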
Example #14
def _dense_layer(
    inputs,
    units,
    activation=None,
    use_bias=True,
    kernel_initializer=None,
    bias_initializer=None,
    trainable=True,
    name=None,
):
    in_shape = inputs.shape
    in_num_axes = len(in_shape)
    assert in_num_axes >= 2
    name_prefix = name if name is not None else id_util.UniqueStr("Dense_")
    inputs = flow.reshape(inputs,
                          (-1, in_shape[-1])) if in_num_axes > 2 else inputs
    weight = flow.get_variable(
        name="{}-weight".format(name_prefix),
        shape=(units, inputs.shape[1]),
        dtype=inputs.dtype,
        initializer=kernel_initializer
        if kernel_initializer is not None else flow.constant_initializer(0),
        trainable=trainable,
        model_name="weight",
    )
    weight = flow.identity(weight)
    weight = flow.repeat(weight, args.num_piece_in_batch)
    out = flow.matmul(a=inputs,
                      b=weight,
                      transpose_b=True,
                      name="{}_matmul".format(name_prefix))
    if use_bias:
        bias = flow.get_variable(
            name="{}-bias".format(name_prefix),
            shape=(units, ),
            dtype=inputs.dtype,
            initializer=bias_initializer
            if bias_initializer is not None else flow.constant_initializer(0),
            trainable=trainable,
            model_name="bias",
        )
        bias = flow.identity(bias)
        bias = flow.repeat(bias, args.num_piece_in_batch)
        out = flow.nn.bias_add(out,
                               bias,
                               name="{}_bias_add".format(name_prefix))
    out = (activation(out, name="{}_activation".format(name_prefix))
           if activation is not None else out)
    out = flow.reshape(out, in_shape[:-1] +
                       (units, )) if in_num_axes > 2 else out
    return out
Example #15
def dense(input,
          units,
          name,
          use_bias=False,
          trainable=True,
          reuse=False,
          const_init=False):
    name_ = name if reuse == False else name + "_reuse"

    in_shape = input.shape
    in_num_axes = len(in_shape)
    assert in_num_axes >= 2

    inputs = flow.reshape(input,
                          (-1, in_shape[-1])) if in_num_axes > 2 else input

    weight = flow.get_variable(
        name="{}-weight".format(name),
        shape=(units, inputs.shape[1]),
        dtype=inputs.dtype,
        initializer=flow.random_normal_initializer(
            stddev=0.02) if not const_init else get_const_initializer(),
        trainable=trainable,
        reuse=reuse,
        model_name="weight",
    )

    out = flow.matmul(
        a=inputs,
        b=weight,
        transpose_b=True,
        name=name_ + "matmul",
    )

    if use_bias:
        bias = flow.get_variable(
            name="{}-bias".format(name),
            shape=(units, ),
            dtype=inputs.dtype,
            initializer=flow.random_normal_initializer()
            if not const_init else get_const_initializer(),
            trainable=trainable,
            reuse=reuse,
            model_name="bias",
        )
        out = flow.nn.bias_add(out, bias, name=name_ + "_bias_add")

    out = flow.reshape(out, in_shape[:-1] +
                       (units, )) if in_num_axes > 2 else out
    return out
Example #16
 def xla_matmul_job(
         a=flow.FixedTensorDef(a_shape, dtype=dtype),
         b=flow.FixedTensorDef(b_shape, dtype=dtype),
 ):
     out = flow.matmul(a, b, transpose_a=trans_a, transpose_b=trans_b)
     c = flow.get_variable(
         "c",
         shape=out.shape,
         dtype=flow.float,
         initializer=flow.ones_initializer(),
         trainable=True,
     )
     out = flow.math.add_n([out, c])
     return out
Example #17
    def multihead_attn(self, q, k, v):
        """
        q, k, v shape: (batch_size, num_attn_heads, seq_length, head_size)
        """
        assert all(len(x.shape) == 4 for x in (q, k, v))
        assert all(x.shape[0] == self.batch_size for x in (q, k, v))
        assert all(x.shape[1] == self.num_heads for x in (q, k, v))
        assert all(x.shape[2] == self.seq_length for x in (q, k, v))
        assert all(x.shape[3] == self.head_size for x in (q, k, v))

        # q * k: batch_matmul
        # shape sig: (b, n, s, h) x (b, n, h, s)(transposed) -> (b, n, s, s)
        # data parallel sbp sig: S(0) x S(0) -> S(0)
        # 2d sbp sig: [S(0), S(1)] x [S(0), S(1)] -> [S(0), S(1)]
        qmk = flow.matmul(q,
                          k,
                          transpose_b=True,
                          alpha=(1.0 / self.norm_factor))
        qmk = self.tril_softmax_dropout(qmk)
        # w * v: batch_matmul
        # shape sig: (b, n, s, s) x (b, n, s, h) -> (b, n, s, h)
        # data parallel sbp sig: S(0) x S(0) -> S(0)
        # 2d sbp sig: [S(0), S(1)] x [S(0), S(1)] -> [S(0), S(1)]
        return flow.matmul(qmk, v)
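The two batched matmuls in multihead_attn are the standard scaled dot-product attention: scores = q @ k^T scaled by 1 / norm_factor, then a (masked) softmax with dropout, then scores @ v. A compact NumPy version of just the two matmuls, with a plain softmax and norm_factor assumed to be sqrt(head_size), omitting the triangular mask and dropout:

import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

b, n, s, d = 2, 4, 8, 16                      # batch, heads, seq, head_size
q, k, v = (np.random.rand(b, n, s, d).astype(np.float32) for _ in range(3))

qmk = np.matmul(q, k.transpose(0, 1, 3, 2)) / np.sqrt(d)  # (b, n, s, s)
out = np.matmul(softmax(qmk), v)                          # (b, n, s, d)
print(out.shape)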
Example #18
 def DynamicReshapeJob(x: oft.ListNumpy.Placeholder(data_shape)):
     reshape_out1 = flow.reshape(x, (-1, 20))
     my_model = flow.get_variable(
         "my_model",
         shape=(20, 32),
         dtype=flow.float,
         initializer=flow.random_uniform_initializer(minval=-10,
                                                     maxval=10),
         trainable=True,
     )
     my_model = flow.cast_to_current_logical_view(my_model)
     mm_out = flow.matmul(reshape_out1, my_model)
     reshape_out2 = flow.reshape(mm_out, (-1, 8, 4))
     flow.optimizer.SGD(flow.optimizer.PiecewiseConstantScheduler(
         [], [0.0001]),
                        momentum=0).minimize(reshape_out2)
     return reshape_out1
Example #19
def _FullyConnected(input_blob,
                    input_size,
                    units,
                    activation=None,
                    name=None,
                    weight_initializer=None):
    weight_blob = flow.get_variable(name=name + '-weight',
                                    shape=[input_size, units],
                                    dtype=input_blob.dtype,
                                    initializer=weight_initializer)
    bias_blob = flow.get_variable(name=name + '-bias',
                                  shape=[units],
                                  dtype=input_blob.dtype,
                                  initializer=flow.constant_initializer(0.0))
    output_blob = flow.matmul(input_blob, weight_blob)
    output_blob = flow.nn.bias_add(output_blob, bias_blob)
    return output_blob
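Example #19 stores the weight as [input_size, units] and multiplies without transposition, whereas the dense helpers in Examples #14 and #15 store it as (units, input_size) and pass transpose_b=True. Both produce the same (batch, units) output; only the stored layout differs. A NumPy sketch of the equivalence:

import numpy as np

batch, in_features, units = 4, 8, 16
x = np.random.rand(batch, in_features).astype(np.float32)

w_plain = np.random.rand(in_features, units).astype(np.float32)  # layout used in Example #19
w_rowmajor = w_plain.T.copy()                                    # (units, in_features), as in #14/#15

y1 = np.matmul(x, w_plain)       # flow.matmul(input_blob, weight_blob)
y2 = np.matmul(x, w_rowmajor.T)  # flow.matmul(a=inputs, b=weight, transpose_b=True)
print(np.allclose(y1, y2))       # True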
Example #20
 def model() -> tp.Numpy:
     with get_placement():
         x = flow.get_variable(
             name="x",
             shape=(4, 5),
             dtype=flow.float32,
             initializer=flow.random_normal_initializer(mean=10, stddev=1),
         )
         w = flow.get_variable(
             name="w",
             shape=(5, 6),
             dtype=flow.float32,
             initializer=flow.random_normal_initializer(mean=10, stddev=1),
             distribute=flow.distribute.split(0),
         )
         y = flow.matmul(x, w)
         flow.optimizer.SGD(
             flow.optimizer.PiecewiseConstantScheduler([], [0.01]), momentum=0.9
         ).minimize(y)
         return y
Example #21
 def matmul_job() -> typing.Tuple[flow.typing.Numpy, flow.typing.Numpy,
                                  flow.typing.Numpy, flow.typing.Numpy]:
     a_var = flow.get_variable(
         "a",
         shape=a_shape,
         dtype=flow.float32,
         initializer=flow.random_uniform_initializer(minval=0, maxval=1),
         trainable=True,
     )
     b_var = flow.get_variable(
         "b",
         shape=b_shape,
         dtype=flow.float32,
         initializer=flow.random_uniform_initializer(minval=0, maxval=1),
         trainable=True,
     )
     flow.watch_diff(a_var, test_global_storage.Setter("a_diff"))
     flow.watch_diff(b_var, test_global_storage.Setter("b_diff"))
     if dtype is flow.float16:
         a = flow.amp_white_identity(a_var)
         b = flow.amp_white_identity(b_var)
     else:
         a = a_var
         b = b_var
     c = flow.matmul(a, b, trans_a, trans_b, alpha)
     add_to = flow.get_variable(
         "c",
         shape=c.shape,
         dtype=flow.float32,
         initializer=flow.random_uniform_initializer(minval=-1, maxval=1),
         trainable=True,
     )
     if test_add_to_output:
         flow.watch_diff(add_to, test_global_storage.Setter("add_to_diff"))
         if dtype is flow.float16:
             add_to = flow.amp_white_identity(add_to)
         c = c + add_to
     flow.watch_diff(c, test_global_storage.Setter("c_diff"))
     get_optimizer().minimize(c)
     return (a_var, b_var, add_to, c)
Example #22
def _AddMaskedLanguageModelLoss(
    input_blob,
    output_weights_blob,
    positions_blob,
    label_id_blob,
    label_weight_blob,
    seq_length,
    hidden_size,
    vocab_size,
    max_predictions_per_seq,
    hidden_act,
    initializer_range,
):
    with flow.scope.namespace("other"):
        sum_label_weight_blob = flow.math.reduce_sum(label_weight_blob,
                                                     axis=[-1])
        ones = sum_label_weight_blob * 0.0 + 1.0
        sum_label_weight_blob = flow.math.reduce_sum(sum_label_weight_blob)
        batch_size = flow.math.reduce_sum(ones)
        sum_label_weight_blob = sum_label_weight_blob / batch_size
    with flow.scope.namespace("cls-predictions"):
        input_blob = _GatherIndexes(input_blob, positions_blob, seq_length,
                                    hidden_size)
        with flow.scope.namespace("transform"):
            if callable(hidden_act):
                act_fn = op_conf_util.kNone
            else:
                act_fn = hidden_act
            input_blob = bert_util._FullyConnected(
                input_blob,
                input_size=hidden_size,
                units=hidden_size,
                activation=act_fn,
                weight_initializer=bert_util.CreateInitializer(
                    initializer_range),
                name="dense",
            )
            if callable(hidden_act):
                input_blob = hidden_act(input_blob)
                input_blob = bert_util._LayerNorm(input_blob, hidden_size)
        output_bias = flow.get_variable(
            name="output_bias",
            shape=[vocab_size],
            dtype=input_blob.dtype,
            initializer=flow.constant_initializer(1.0),
        )
        logit_blob = flow.matmul(input_blob,
                                 output_weights_blob,
                                 transpose_b=True)
        logit_blob = flow.nn.bias_add(logit_blob, output_bias)
        label_id_blob = flow.reshape(label_id_blob, [-1])
        pre_example_loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logit_blob, labels=label_id_blob)
        pre_example_loss = flow.reshape(pre_example_loss,
                                        [-1, max_predictions_per_seq])
        numerator = pre_example_loss * label_weight_blob
        with flow.scope.namespace("loss"):
            numerator = flow.math.reduce_sum(numerator, axis=[-1])
            denominator = sum_label_weight_blob + 1e-5
            loss = numerator / denominator
        return loss, pre_example_loss, logit_blob
Example #23
def _AttentionLayer(
    from_blob,
    to_blob,
    attention_mask_blob,
    num_attention_heads=1,
    size_per_head=512,
    query_act=op_conf_util.kNone,
    key_act=op_conf_util.kNone,
    value_act=op_conf_util.kNone,
    attention_probs_dropout_prob=0.0,
    initializer_range=0.02,
    do_return_2d_tensor=False,
    batch_size=None,
    from_seq_length=None,
    to_seq_length=None,
):
    def TransposeForScores(input_blob, num_attention_heads, seq_length, width):
        output_blob = flow.reshape(
            input_blob, [-1, seq_length, num_attention_heads, width]
        )
        output_blob = flow.transpose(output_blob, perm=[0, 2, 1, 3])
        return output_blob

    from_blob_2d = flow.reshape(from_blob, [-1, num_attention_heads * size_per_head])
    to_blob_2d = flow.reshape(to_blob, [-1, num_attention_heads * size_per_head])
    query_blob = _FullyConnected(
        from_blob_2d,
        input_size=num_attention_heads * size_per_head,
        units=num_attention_heads * size_per_head,
        activation=query_act,
        name="query",
        weight_initializer=CreateInitializer(initializer_range),
    )
    key_blob = _FullyConnected(
        to_blob_2d,
        input_size=num_attention_heads * size_per_head,
        units=num_attention_heads * size_per_head,
        activation=key_act,
        name="key",
        weight_initializer=CreateInitializer(initializer_range),
    )
    value_blob = _FullyConnected(
        to_blob_2d,
        input_size=num_attention_heads * size_per_head,
        units=num_attention_heads * size_per_head,
        activation=value_act,
        name="value",
        weight_initializer=CreateInitializer(initializer_range),
    )
    query_blob = TransposeForScores(
        query_blob, num_attention_heads, from_seq_length, size_per_head
    )
    key_blob = TransposeForScores(
        key_blob, num_attention_heads, to_seq_length, size_per_head
    )
    attention_scores_blob = flow.matmul(query_blob, key_blob, transpose_b=True)
    attention_scores_blob = attention_scores_blob * (
        1.0 / math.sqrt(float(size_per_head))
    )
    attention_mask_blob = flow.reshape(
        attention_mask_blob, [-1, 1, from_seq_length, to_seq_length]
    )
    attention_mask_blob = flow.cast(attention_mask_blob, dtype=flow.float)
    addr_blob = (attention_mask_blob - 1.0) * 10000.0
    attention_scores_blob = attention_scores_blob + addr_blob
    attention_probs_blob = flow.nn.softmax(attention_scores_blob)
    attention_probs_blob = _Dropout(attention_probs_blob, attention_probs_dropout_prob)
    value_blob = flow.reshape(
        value_blob, [-1, to_seq_length, num_attention_heads, size_per_head]
    )
    value_blob = flow.transpose(value_blob, perm=[0, 2, 1, 3])
    context_blob = flow.matmul(attention_probs_blob, value_blob)
    context_blob = flow.transpose(context_blob, perm=[0, 2, 1, 3])
    if do_return_2d_tensor:
        context_blob = flow.reshape(
            context_blob, [-1, num_attention_heads * size_per_head]
        )
    else:
        context_blob = flow.reshape(
            context_blob, [-1, from_seq_length, num_attention_heads * size_per_head]
        )
    return context_blob
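The masking in _AttentionLayer is additive: (attention_mask_blob - 1.0) * 10000.0 maps allowed positions (mask == 1) to 0 and disallowed positions (mask == 0) to -10000, so after adding it to the scores the softmax assigns masked positions essentially zero probability. A small NumPy illustration of that effect:

import numpy as np

scores = np.array([[2.0, 1.0, 0.5]], dtype=np.float32)
mask = np.array([[1.0, 1.0, 0.0]], dtype=np.float32)  # last position is padding

additive = (mask - 1.0) * 10000.0                     # [0, 0, -10000]
masked_scores = scores + additive

e = np.exp(masked_scores - masked_scores.max(axis=-1, keepdims=True))
probs = e / e.sum(axis=-1, keepdims=True)
print(probs)  # the masked position receives essentially zero weight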
Example #24
 def trt_matmul_job(
         a=flow.FixedTensorDef(a_shape, dtype=dtype),
         b=flow.FixedTensorDef(b_shape, dtype=dtype),
 ):
     return flow.matmul(a, b, transpose_a=trans_a, transpose_b=trans_b)