Example #1
    def ConvJob():
        with flow.scope.placement(device_type, "0:0"):
            x = flow.get_variable(
                "x",
                shape=x_shape,
                dtype=flow.float,
                initializer=flow.random_uniform_initializer(minval=0,
                                                            maxval=100),
                trainable=True,
            )
            loss = flow.layers.conv1d(
                x,
                filters,
                kernel_size=kernel_size,
                strides=[1],
                padding="valid",
                data_format="NCW",
                dilation_rate=1,
                groups=groups,
                use_bias=False,
                kernel_initializer=flow.random_uniform_initializer(minval=0,
                                                                   maxval=100),
                weight_name="conv1d_weight",
            )
            weight_shape = (filters, x.shape[1] // groups, kernel_size)
            weight = flow.get_variable(
                name="conv1d_weight",
                shape=weight_shape,
                dtype=flow.float,
                initializer=flow.random_uniform_initializer(minval=0,
                                                            maxval=100),
            )
            flow.optimizer.SGD(flow.optimizer.PiecewiseConstantScheduler(
                [], [1e-4]),
                               momentum=0).minimize(loss)

            flow.watch(x, test_global_storage.Setter("x"))
            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
            flow.watch(weight, test_global_storage.Setter("weight"))
            flow.watch_diff(weight, test_global_storage.Setter("weight_diff"))
            flow.watch(loss, test_global_storage.Setter("loss"))
            flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))

            return loss
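For reference, the shapes in the job above can be checked by hand: with "valid" padding, conv1d produces (in_len - dilation * (kernel_size - 1) - 1) // stride + 1 output steps, and the grouped weight is (filters, in_channels // groups, kernel_size), exactly the weight_shape computed above. A minimal sketch (the helper name and sample sizes are ours, not part of the test):

    def conv1d_out_len(in_len, kernel_size, stride=1, dilation=1):
        # "valid" padding: the dilated window must fit entirely inside the input
        return (in_len - dilation * (kernel_size - 1) - 1) // stride + 1

    assert conv1d_out_len(in_len=32, kernel_size=3) == 30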
Example #2
    def ConvJob():
        with flow.scope.placement(device_type, "0:0"):
            x = flow.get_variable(
                "x",
                shape=x_shape,
                dtype=flow.float,
                initializer=flow.random_uniform_initializer(minval=0,
                                                            maxval=100),
                trainable=True,
            )
            if data_format == "NCW":
                weight_shape = (filters, x.shape[1] // groups, kernel_size)
            else:
                weight_shape = (filters, kernel_size, x.shape[2] // groups)
            weight = flow.get_variable(
                "conv-weight",
                shape=weight_shape,
                dtype=flow.float,
                initializer=flow.random_uniform_initializer(minval=0,
                                                            maxval=100),
            )
            loss = flow.nn.conv1d(
                x,
                weight,
                strides=[stride],
                padding=of_padding,
                data_format=data_format,
                dilations=dilation,
                groups=groups,
            )
            flow.optimizer.SGD(flow.optimizer.PiecewiseConstantScheduler(
                [], [1e-4]),
                               momentum=0).minimize(loss)

            flow.watch(x, test_global_storage.Setter("x"))
            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
            flow.watch(weight, test_global_storage.Setter("weight"))
            flow.watch_diff(weight, test_global_storage.Setter("weight_diff"))
            flow.watch(loss, test_global_storage.Setter("loss"))
            flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))

            return loss
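The same computation can be mirrored in plain NumPy, which is how tests like this usually validate the result. A minimal sketch for the NCW layout, assuming "valid" padding (np_conv1d_ncw is our name, not an OneFlow API):

    import numpy as np

    def np_conv1d_ncw(x, w, stride=1, dilation=1, groups=1):
        # x: (N, C_in, W), w: (filters, C_in // groups, K)
        n, c_in, width = x.shape
        filters, _, k = w.shape
        out_w = (width - dilation * (k - 1) - 1) // stride + 1
        in_pg, out_pg = c_in // groups, filters // groups
        y = np.zeros((n, filters, out_w), dtype=x.dtype)
        for g in range(groups):
            xg = x[:, g * in_pg:(g + 1) * in_pg]
            wg = w[g * out_pg:(g + 1) * out_pg]
            for ow in range(out_w):
                s = ow * stride
                win = xg[:, :, s:s + dilation * k:dilation]  # (N, in_pg, K)
                y[:, g * out_pg:(g + 1) * out_pg, ow] = np.einsum(
                    "nck,fck->nf", win, wg)
        return y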
Example #3
    def test_job(
            x: oft.Numpy.Placeholder(input_shape, dtype=flow.float32),
            addend: oft.Numpy.Placeholder(input_shape, dtype=flow.float32),
    ):
        v = flow.get_variable(
            name="v",
            shape=(1, ),
            dtype=flow.float32,
            initializer=flow.zeros_initializer(),
        )
        x = x + v
        addend = addend + v
        x1 = flow.identity(x)
        x2 = flow.identity(x)
        addend1 = flow.identity(addend)
        addend2 = flow.identity(addend)
        flow.watch_diff(x1, test_global_storage.Setter("x1_diff"))
        flow.watch_diff(x2, test_global_storage.Setter("x2_diff"))
        flow.watch_diff(addend1, test_global_storage.Setter("addend1_diff"))
        flow.watch_diff(addend2, test_global_storage.Setter("addend2_diff"))
        x1 = flow.cast(x1, data_type)
        x2 = flow.cast(x2, data_type)
        addend1 = flow.cast(addend1, data_type)
        addend2 = flow.cast(addend2, data_type)
        y1 = flow.layers.batch_normalization_add_relu(x1,
                                                      addend=addend1,
                                                      axis=axis,
                                                      name="BN1")
        y2 = flow.math.relu(
            flow.layers.batch_normalization(x2, axis=axis, name="BN2") +
            addend2)
        y1 = flow.cast(y1, flow.float32)
        y2 = flow.cast(y2, flow.float32)
        flow.watch(y1, test_global_storage.Setter("y1"))
        flow.watch(y2, test_global_storage.Setter("y2"))
        y1 = flow.where(flow.math.greater(y2, v), y1, v)
        y2 = flow.where(flow.math.greater(y1, v), y2, v)
        loss = y1 + y2
        flow.optimizer.SGD(flow.optimizer.PiecewiseConstantScheduler(
            [], [0.001]),
                           momentum=0).minimize(flow.math.reduce_sum(loss))
        return loss
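The test relies on the fused op matching the composed form relu(batch_normalization(x) + addend). With the batch statistics taken as given (the layer computes them internally in training mode), the composed computation is, in NumPy terms (np_bn_add_relu is our name):

    import numpy as np

    def np_bn_add_relu(x, addend, mean, var, gamma, beta, axis=1, eps=1e-5):
        shape = [1] * x.ndim
        shape[axis] = -1  # broadcast the per-channel parameters
        xn = (x - mean.reshape(shape)) / np.sqrt(var.reshape(shape) + eps)
        return np.maximum(
            xn * gamma.reshape(shape) + beta.reshape(shape) + addend, 0.0)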
Example #4
    def ConvJob():
        with flow.scope.placement(device_type, "0:0"):
            x = flow.get_variable(
                "x",
                shape=x_shape,
                dtype=flow.float,
                initializer=flow.random_uniform_initializer(minval=0,
                                                            maxval=100),
                trainable=True,
            )
            if data_format == "NCHW":
                weight_shape = (filters, x.shape[1] // groups, kernel_size,
                                kernel_size)
            else:
                weight_shape = (filters, kernel_size, kernel_size,
                                x.shape[3] // groups)
            weight = flow.get_variable(
                "conv-weight",
                shape=weight_shape,
                dtype=flow.float,
                initializer=flow.random_uniform_initializer(minval=0,
                                                            maxval=100),
            )
            loss = flow.nn.conv2d(
                x,
                weight,
                strides=[stride_h, stride_w],
                padding=of_padding,
                data_format=data_format,
                dilations=[dilation_h, dilation_w],
                groups=groups,
            )
            flow.losses.add_loss(loss)

            flow.watch(x, test_global_storage.Setter("x"))
            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
            flow.watch(weight, test_global_storage.Setter("weight"))
            flow.watch_diff(weight, test_global_storage.Setter("weight_diff"))
            flow.watch(loss, test_global_storage.Setter("loss"))
            flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))

            return loss
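Here of_padding is a string padding mode; under the usual TF-style "SAME"/"VALID" semantics (an assumption about this test's setup), each spatial output dimension works out as:

    import math

    def conv2d_out_dim(in_dim, kernel, stride, dilation, padding):
        eff_k = dilation * (kernel - 1) + 1  # dilated kernel extent
        if padding.upper() == "SAME":
            return math.ceil(in_dim / stride)
        return (in_dim - eff_k) // stride + 1  # "VALID"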
Example #5
    def PreluJob(
        x: oft.Numpy.Placeholder(x_shape, dtype=type_name_to_flow_type[dtype])
    ):
        with flow.scope.placement(device_type, "0:0"):
            x += flow.get_variable(
                name="v1",
                shape=(1,),
                dtype=type_name_to_flow_type[dtype],
                initializer=flow.zeros_initializer(),
            )
            loss = flow.layers.prelu(
                x,
                alpha_initializer=flow.random_uniform_initializer(
                    minval=0.1, maxval=0.9
                ),
                shared_axes=shared_axes,
                name="prelu",
            )
            alpha_shape = list(x.shape[1:])
            if shared_axes is not None:
                for i in shared_axes:
                    alpha_shape[i - 1] = 1
            alpha = flow.get_variable(
                "prelu-alpha",
                shape=tuple(alpha_shape),
                dtype=type_name_to_flow_type[dtype],
                initializer=flow.random_uniform_initializer(minval=0.1, maxval=0.9),
            )
            flow.optimizer.SGD(
                flow.optimizer.PiecewiseConstantScheduler([], [1e-4]), momentum=0
            ).minimize(loss)

            flow.watch(x, test_global_storage.Setter("x"))
            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
            flow.watch(alpha, test_global_storage.Setter("alpha"))
            flow.watch_diff(alpha, test_global_storage.Setter("alpha_diff"))
            flow.watch(loss, test_global_storage.Setter("loss"))
            flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))

            return loss
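PReLU itself is easy to state as a reference: identity for positive inputs, a learned slope elsewhere, with alpha broadcast along the shared axes exactly as the alpha_shape loop above sets size-1 dimensions. A minimal NumPy sketch (np_prelu is our name):

    import numpy as np

    def np_prelu(x, alpha):
        # alpha has shape x.shape[1:] with 1s on the shared axes, so it broadcasts
        return np.where(x > 0, x, alpha * x)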
Example #6
    def ScalarAddByTensorJob():
        with flow.scope.placement(device_type, "0:0"):
            x = flow.get_variable(
                "x",
                shape=x_shape,
                dtype=flow.float,
                initializer=flow.random_uniform_initializer(minval=0,
                                                            maxval=100),
                trainable=True,
            )
            y = flow.get_variable(
                "y",
                shape=(1, ),
                dtype=flow.float,
                initializer=flow.random_uniform_initializer(minval=0,
                                                            maxval=100),
                trainable=True,
            )
            if case == "add":
                loss = flow.math.add(x, y)
            elif case == "sub":
                loss = flow.math.subtract(x, y)
            elif case == "mul":
                loss = flow.math.multiply(x, y)
            elif case == "div":
                loss = flow.math.divide(x, y)
            flow.optimizer.SGD(flow.optimizer.PiecewiseConstantScheduler(
                [], [1e-4]),
                               momentum=0).minimize(loss)

            flow.watch(x, test_global_storage.Setter("x"))
            flow.watch(y, test_global_storage.Setter("y"))
            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
            flow.watch_diff(y, test_global_storage.Setter("y_diff"))
            flow.watch(loss, test_global_storage.Setter("loss"))
            flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))

            return loss
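All four branches reduce to an elementwise op against a shape-(1,) tensor, which broadcasts against any x_shape. In NumPy terms (sample shapes are ours):

    import numpy as np

    x = np.random.uniform(0, 100, size=(2, 3)).astype(np.float32)
    y = np.random.uniform(0, 100, size=(1,)).astype(np.float32)
    assert np.allclose(np.add(x, y), x + y)       # "add"
    assert np.allclose(np.divide(x, y), x / y)    # "div"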
Example #7
    def test_fused_scale_tril_softmax_dropout_fw_bw_job():
        with flow.scope.placement(device_type, "0:0"):
            x = flow.get_variable(
                "x",
                shape=x_shape,
                dtype=dtype,
                initializer=flow.random_uniform_initializer(minval=-1.0, maxval=1.0),
                trainable=True,
            )
            flow.watch(x, test_global_storage.Setter("x"))

            x1 = flow.identity(x)
            x2 = flow.identity(x)

            flow.watch_diff(x1, test_global_storage.Setter("x1_diff"))
            flow.watch_diff(x2, test_global_storage.Setter("x2_diff"))
            if data_type == "float16":
                y1 = flow.cast(
                    flow.nn.dropout(
                        flow.nn.softmax(
                            flow.math.fused_scale_tril(
                                flow.cast(x1, dtype=flow.float16),
                                diagonal=diagonal,
                                fill_value=fill_value,
                                scale=scale,
                            ),
                        ),
                        rate=rate,
                        seed=seed,
                        name="dropout",
                    ),
                    dtype=flow.float,
                )
                y2 = flow.cast(
                    flow.nn.fused_scale_tril_softmax_dropout(
                        flow.cast(x2, dtype=flow.float16),
                        diagonal=diagonal,
                        fill_value=fill_value,
                        scale=scale,
                        rate=rate,
                        seed=seed,
                    ),
                    dtype=flow.float,
                )
            else:
                y1 = flow.nn.dropout(
                    flow.nn.softmax(
                        flow.math.fused_scale_tril(
                            x1, diagonal=diagonal, fill_value=fill_value, scale=scale
                        )
                    ),
                    rate=rate,
                    seed=seed,
                    name="dropout",
                )
                y2 = flow.nn.fused_scale_tril_softmax_dropout(
                    x2,
                    diagonal=diagonal,
                    fill_value=fill_value,
                    scale=scale,
                    rate=rate,
                    seed=seed,
                )
            flow.watch(y1, test_global_storage.Setter("y1"))
            flow.watch(y2, test_global_storage.Setter("y2"))
            flow.watch_diff(y1, test_global_storage.Setter("y1_diff"))
            flow.watch_diff(y2, test_global_storage.Setter("y2_diff"))

            loss = y1 + y2
            total_loss = loss * x
        flow.optimizer.SGD(
            flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
        ).minimize(flow.math.reduce_sum(total_loss))

        return loss
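Leaving the stochastic dropout aside (the two paths only agree elementwise because they share a seed), the deterministic front half of the fusion can be written out as mask-fill-scale followed by a softmax. A sketch of our reading of the op's semantics, not the OneFlow kernel itself:

    import numpy as np

    def np_scale_tril_softmax(x, diagonal=0, fill_value=0.0, scale=1.0):
        keep = np.tril(np.ones(x.shape[-2:], dtype=bool), k=diagonal)
        z = np.where(keep, x * scale, fill_value)  # lower triangle scaled, rest filled
        z = z - z.max(axis=-1, keepdims=True)      # stabilized softmax
        e = np.exp(z)
        return e / e.sum(axis=-1, keepdims=True)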
Example #8
    def test_job(x: oft.Numpy.Placeholder(x_shape, dtype=dtype)):
        v = flow.get_variable(
            "x",
            shape=x_shape,
            dtype=dtype,
            initializer=flow.constant_initializer(0),
            trainable=True,
        )
        flow.watch_diff(v, assert_grad)
        x += v
        if data_type == "float16":
            x = flow.cast(x, dtype=flow.float16)
        with flow.scope.placement(device_type, "0:0"):
            param_shape = x.shape[begin_params_axis:]
            gamma = None
            beta = None
            if center:
                with flow.scope.namespace("LayerNorm"):
                    beta = flow.get_variable(
                        name="beta",
                        shape=param_shape,
                        dtype=flow.float,
                        initializer=flow.constant_initializer(0.0),
                        trainable=trainable,
                        model_name="beta",
                        reuse=False,
                    )
                    if trainable:
                        flow.watch_diff(beta, assert_grad_beta)
                    if data_type == "float16":
                        beta = flow.cast(beta, dtype=flow.float16)

            if scale:
                with flow.scope.namespace("LayerNorm"):
                    gamma = flow.get_variable(
                        name="gamma",
                        shape=param_shape,
                        dtype=flow.float,
                        initializer=flow.constant_initializer(1.0),
                        trainable=trainable,
                        model_name="gamma",
                        reuse=False,
                    )
                    if trainable:
                        if data_type == "float16":
                            flow.watch_diff(
                                gamma,
                                test_global_storage.Setter("gamma_diff"))
                        else:
                            flow.watch_diff(gamma, assert_grad_gamma)
                    if data_type == "float16":
                        gamma = flow.cast(gamma, dtype=flow.float16)
            x = flow.identity(x)

            y = flow.nn.layer_norm(
                x,
                gamma=gamma,
                beta=beta,
                begin_norm_axis=begin_norm_axis,
                begin_params_axis=begin_params_axis,
                epsilon=epsilon,
            )
            z = y + x
        if data_type == "float16":
            y = flow.cast(y, dtype=flow.float)
            z = flow.cast(z, dtype=flow.float)

        flow.optimizer.SGD(flow.optimizer.PiecewiseConstantScheduler(
            [], [1e-4]),
                           momentum=0).minimize(z)
        return y
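flow.nn.layer_norm normalizes over the trailing axes starting at begin_norm_axis, then applies gamma/beta, whose shape is x.shape[begin_params_axis:] as in param_shape above. A plain NumPy reference for the begin_norm_axis == begin_params_axis case (np_layer_norm is our name):

    import numpy as np

    def np_layer_norm(x, gamma=None, beta=None, begin_norm_axis=1, eps=1e-5):
        axes = tuple(range(begin_norm_axis, x.ndim))
        mean = x.mean(axis=axes, keepdims=True)
        var = x.var(axis=axes, keepdims=True)
        y = (x - mean) / np.sqrt(var + eps)
        if gamma is not None:
            y = y * gamma  # trailing-axes shape, broadcasts from the right
        if beta is not None:
            y = y + beta
        return y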
Example #9
    def FlowJob(
            value: oft.Numpy.Placeholder(x_shape),
            bias: oft.Numpy.Placeholder(bias_shape),
            addend: oft.Numpy.Placeholder(x_shape),
    ):
        with flow.scope.placement(device_type, "0:0"):
            value += flow.get_variable(
                name="v1",
                shape=(1, ),
                dtype=flow.float,
                initializer=flow.zeros_initializer(),
            )
            bias += flow.get_variable(
                name="v2",
                shape=(1, ),
                dtype=flow.float,
                initializer=flow.zeros_initializer(),
            )
            addend += flow.get_variable(
                name="v3",
                shape=(1, ),
                dtype=flow.float,
                initializer=flow.zeros_initializer(),
            )

            x1 = flow.identity(value)
            x2 = flow.identity(value)

            bias1 = flow.identity(bias)
            bias2 = flow.identity(bias)

            addend1 = flow.identity(addend)
            addend2 = flow.identity(addend)

            flow.watch_diff(x1, test_global_storage.Setter("x1_diff"))
            flow.watch_diff(x2, test_global_storage.Setter("x2_diff"))

            flow.watch_diff(bias1, test_global_storage.Setter("bias1_diff"))
            flow.watch_diff(bias2, test_global_storage.Setter("bias2_diff"))

            flow.watch_diff(addend1,
                            test_global_storage.Setter("addend1_diff"))
            flow.watch_diff(addend2,
                            test_global_storage.Setter("addend2_diff"))

            if data_type == "float16":
                out1 = flow.nn.dropout(
                    flow.nn.bias_add(
                        flow.cast(x1, dtype=flow.float16),
                        flow.cast(bias1, dtype=flow.float16),
                        data_format=data_format,
                    ),
                    rate=rate,
                    seed=seed,
                    name="dropout",
                )
                y1 = flow.cast(
                    out1 + flow.cast(addend1, dtype=flow.float16),
                    dtype=flow.float,
                )
                out2 = flow.nn.fused_bias_add_dropout(
                    flow.cast(x2, dtype=flow.float16),
                    flow.cast(bias2, dtype=flow.float16),
                    data_format=data_format,
                    rate=rate,
                    seed=seed,
                )
                y2 = flow.cast(
                    out2 + flow.cast(addend2, dtype=flow.float16),
                    dtype=flow.float,
                )
            else:
                y1 = (flow.nn.dropout(
                    flow.nn.bias_add(x1, bias1, data_format=data_format),
                    rate=rate,
                    seed=seed,
                    name="dropout",
                ) + addend1)
                y2 = (flow.nn.fused_bias_add_dropout(
                    x2,
                    bias2,
                    data_format=data_format,
                    rate=rate,
                    seed=seed,
                ) + addend2)
            flow.watch(y1, test_global_storage.Setter("y1"))
            flow.watch(y2, test_global_storage.Setter("y2"))
            flow.watch_diff(y1, test_global_storage.Setter("y1_diff"))
            flow.watch_diff(y2, test_global_storage.Setter("y2_diff"))

            loss = y1 + y2
        flow.optimizer.SGD(flow.optimizer.PiecewiseConstantScheduler([],
                                                                     [0.001]),
                           momentum=0).minimize(flow.math.reduce_sum(loss))

        return loss
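The float32 branch spells the equivalence out: fused_bias_add_dropout should match dropout(bias_add(x, bias)) under the same seed. With the dropout keep-mask taken as given, the composed computation is (standard inverted-dropout scaling assumed; np_bias_add_dropout is our name):

    import numpy as np

    def np_bias_add_dropout(x, bias, keep_mask, rate, channel_axis=1):
        shape = [1] * x.ndim
        shape[channel_axis] = -1  # bias broadcasts along the channel axis
        y = x + bias.reshape(shape)
        return y * keep_mask / (1.0 - rate)  # inverted dropout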
Example #10
    def MatmulJob():
        with flow.scope.placement(device_type, "0:0"):
            a = flow.get_variable(
                "a",
                shape=a_shape,
                dtype=dtype,
                initializer=flow.random_uniform_initializer(minval=0,
                                                            maxval=1),
                trainable=True,
            )
            b = flow.get_variable(
                "b",
                shape=b_shape,
                dtype=dtype,
                initializer=flow.random_uniform_initializer(minval=0,
                                                            maxval=1),
                trainable=True,
            )
            if data_type == "float16":
                out = flow.matmul(
                    flow.cast(a, dtype=flow.float16),
                    flow.cast(b, dtype=flow.float16),
                    transpose_a,
                    transpose_b,
                )
                c = flow.get_variable(
                    "c",
                    shape=out.shape,
                    dtype=dtype,
                    initializer=flow.random_uniform_initializer(minval=-1,
                                                                maxval=1),
                    trainable=True,
                )
                loss = flow.cast(out + flow.cast(c, dtype=flow.float16),
                                 dtype=flow.float)
            else:
                out = flow.matmul(a, b, transpose_a, transpose_b)
                c = flow.get_variable(
                    "c",
                    shape=out.shape,
                    dtype=dtype,
                    initializer=flow.random_uniform_initializer(minval=-1,
                                                                maxval=1),
                    trainable=True,
                )
                loss = out + c

            flow.optimizer.SGD(flow.optimizer.PiecewiseConstantScheduler(
                [], [1e-4]),
                               momentum=0).minimize(loss)

            flow.watch(a, test_global_storage.Setter("a"))
            flow.watch_diff(a, test_global_storage.Setter("a_diff"))
            flow.watch(b, test_global_storage.Setter("b"))
            flow.watch_diff(b, test_global_storage.Setter("b_diff"))
            flow.watch(c, test_global_storage.Setter("c"))
            flow.watch_diff(c, test_global_storage.Setter("c_diff"))
            flow.watch(loss, test_global_storage.Setter("loss"))
            flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))

            return loss
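The transpose flags are passed positionally here, so the op contracts over the last two dimensions of (optionally transposed) a and b. A NumPy reference of the same contraction (np_matmul is our name):

    import numpy as np

    def np_matmul(a, b, transpose_a=False, transpose_b=False):
        if transpose_a:
            a = np.swapaxes(a, -2, -1)
        if transpose_b:
            b = np.swapaxes(b, -2, -1)
        return np.matmul(a, b)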
Example #11
def lstm(input,
         units,
         return_sequence=False,
         initial_state=None,
         direction='forward',
         layer_index=0,
         is_train=True):
    '''
       input: sequence input tensor with shape [batch_size, sequence_length, embedding_size]
       units: number of hidden units
    '''
    batch_size = input.shape[0]
    seq_len = input.shape[1]
    input_size = input.shape[2]

    dtype = flow.float32
    with flow.scope.namespace('layer' + str(layer_index)):
        with flow.scope.namespace(direction):
            weight_blob_i = flow.get_variable(
                name='input' + '-weight',
                shape=[input_size, units],
                dtype=dtype,
                trainable=is_train,
                initializer=flow.glorot_normal_initializer())

            weight_blob_ih = flow.get_variable(
                name='input' + '-h-weight',
                shape=[units, units],
                dtype=dtype,
                trainable=is_train,
                initializer=flow.glorot_normal_initializer())

            bias_blob_i = flow.get_variable(
                name='input' + '-bias',
                shape=[units],
                dtype=dtype,
                trainable=is_train,
                initializer=flow.constant_initializer(0.0))

            weight_blob_f = flow.get_variable(
                name='forget' + '-weight',
                shape=[input_size, units],
                dtype=dtype,
                trainable=is_train,
                initializer=flow.glorot_normal_initializer())

            weight_blob_fh = flow.get_variable(
                name='forget' + '-h-weight',
                shape=[units, units],
                dtype=dtype,
                trainable=is_train,
                initializer=flow.glorot_normal_initializer())

            bias_blob_f = flow.get_variable(
                name='forget' + '-bias',
                shape=[units],
                dtype=dtype,
                trainable=is_train,
                initializer=flow.constant_initializer(0.0))

            weight_blob_c = flow.get_variable(
                name='cell' + '-weight',
                shape=[input_size, units],
                dtype=dtype,
                trainable=is_train,
                initializer=flow.glorot_normal_initializer())

            weight_blob_ch = flow.get_variable(
                name='cell' + '-h-weight',
                shape=[units, units],
                dtype=dtype,
                trainable=is_train,
                initializer=flow.glorot_normal_initializer())

            bias_blob_c = flow.get_variable(
                name='cell' + '-bias',
                shape=[units],
                dtype=dtype,
                trainable=is_train,
                initializer=flow.constant_initializer(0.0))

            weight_blob_o = flow.get_variable(
                name='output' + '-weight',
                shape=[input_size, units],
                dtype=dtype,
                trainable=is_train,
                initializer=flow.glorot_normal_initializer())

            weight_blob_oh = flow.get_variable(
                name='output' + '-h-weight',
                shape=[units, units],
                dtype=dtype,
                trainable=is_train,
                initializer=flow.glorot_normal_initializer())

            bias_blob_o = flow.get_variable(
                name='output' + '-bias',
                shape=[units],
                dtype=dtype,
                trainable=is_train,
                initializer=flow.constant_initializer(0.0))

    flow.watch(weight_blob_i, test_global_storage.Setter("weight_blob_i"))
    flow.watch(weight_blob_f, test_global_storage.Setter("weight_blob_f"))
    flow.watch(weight_blob_c, test_global_storage.Setter("weight_blob_c"))
    flow.watch(weight_blob_o, test_global_storage.Setter("weight_blob_o"))

    flow.watch(weight_blob_ih, test_global_storage.Setter("weight_blob_ih"))
    flow.watch(weight_blob_fh, test_global_storage.Setter("weight_blob_fh"))
    flow.watch(weight_blob_ch, test_global_storage.Setter("weight_blob_ch"))
    flow.watch(weight_blob_oh, test_global_storage.Setter("weight_blob_oh"))

    flow.watch(bias_blob_i, test_global_storage.Setter("bias_blob_i"))
    flow.watch(bias_blob_f, test_global_storage.Setter("bias_blob_f"))
    flow.watch(bias_blob_c, test_global_storage.Setter("bias_blob_c"))
    flow.watch(bias_blob_o, test_global_storage.Setter("bias_blob_o"))

    def step_function(input, states):

        hx = states[0]
        cx = states[1]

        x_i = _FullyConnected(input, weight_blob_i, bias_blob_i)  # input gate
        x_f = _FullyConnected(input, weight_blob_f, bias_blob_f)  # forget gate
        x_c = _FullyConnected(input, weight_blob_c, bias_blob_c)  # cell state
        x_o = _FullyConnected(input, weight_blob_o, bias_blob_o)  # output gate

        h_i = _FullyConnected(hx, weight_blob_ih, None)
        h_f = _FullyConnected(hx, weight_blob_fh, None)
        h_c = _FullyConnected(hx, weight_blob_ch, None)
        h_o = _FullyConnected(hx, weight_blob_oh, None)

        x_i = x_i + h_i
        x_f = x_f + h_f
        x_c = x_c + h_c
        x_o = x_o + h_o

        x_i = flow.math.sigmoid(x_i)
        x_f = flow.math.sigmoid(x_f)
        cellgate = flow.math.tanh(x_c)
        x_o = flow.math.sigmoid(x_o)

        cy = x_f * cx + x_i * cellgate

        hy = x_o * flow.math.tanh(cy)

        return hy, (hy, cy)

    if initial_state:
        states = initial_state
    else:
        states = [
            flow.constant(0, dtype=flow.float32, shape=[batch_size, units]),
            flow.constant(0, dtype=flow.float32, shape=[batch_size, units])
        ]

    successive_outputs = []
    successive_states = []

    for index in range(seq_len):
        inp = flow.slice(input, [None, index, 0], [None, 1, input_size])
        inp = flow.reshape(inp, [-1, input_size])
        output, states = step_function(inp, states)

        output = flow.reshape(output, [-1, 1, units])
        successive_outputs.append(output)
        successive_states.append(states)
    last_output = successive_outputs[-1]
    new_states = successive_states[-1]
    outputs = flow.concat(successive_outputs, axis=1)

    if return_sequence:
        return outputs
    else:
        return flow.reshape(last_output, [-1, units])
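For a quick cross-check of step_function, the same single step can be written in NumPy; with identical weights and states it should reproduce hy and cy up to float tolerance (the np_ names and the dict layout are ours):

    import numpy as np

    def np_sigmoid(z):
        return 1.0 / (1.0 + np.exp(-z))

    def np_lstm_step(x, hx, cx, w, u, b):
        # w, u, b are dicts over the four gates, matching the variables above:
        # w = input projections, u = recurrent projections, b = biases
        i = np_sigmoid(x @ w["i"] + hx @ u["i"] + b["i"])  # input gate
        f = np_sigmoid(x @ w["f"] + hx @ u["f"] + b["f"])  # forget gate
        g = np.tanh(x @ w["c"] + hx @ u["c"] + b["c"])     # cell candidate
        o = np_sigmoid(x @ w["o"] + hx @ u["o"] + b["o"])  # output gate
        cy = f * cx + i * g
        hy = o * np.tanh(cy)
        return hy, (hy, cy)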