Example #1
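All of the examples on this page use the legacy PaddlePaddle 1.x paddle.fluid API and omit their imports. A preamble along the following lines is assumed here (it is not part of the original snippets) so the names they reference resolve:

import os
import multiprocessing

import numpy
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle.fluid.layers as layers
from paddle.fluid import compiler
from paddle.fluid.executor import Executor
from paddle.fluid.backward import append_backward
from paddle.fluid.framework import default_main_program
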
def _test_read_write(x):
    i = layers.zeros(shape=[1], dtype='int64')
    i.stop_gradient = False
    arr = layers.array_write(x=x[0], i=i)
    i = layers.increment(x=i)
    arr = layers.array_write(x=x[1], i=i, array=arr)
    i = layers.increment(x=i)
    arr = layers.array_write(x=x[2], i=i, array=arr)

    i = layers.zeros(shape=[1], dtype='int64')
    i.stop_gradient = False
    a0 = layers.array_read(array=arr, i=i)
    i = layers.increment(x=i)
    a1 = layers.array_read(array=arr, i=i)
    i = layers.increment(x=i)
    a2 = layers.array_read(array=arr, i=i)

    mean_a0 = layers.mean(a0)
    mean_a1 = layers.mean(a1)
    mean_a2 = layers.mean(a2)

    a_sum = layers.sums(input=[mean_a0, mean_a1, mean_a2])

    mean_x0 = layers.mean(x[0])
    mean_x1 = layers.mean(x[1])
    mean_x2 = layers.mean(x[2])

    x_sum = layers.sums(input=[mean_x0, mean_x1, mean_x2])

    return a_sum, x_sum
Example #2
    def simple_net(self):
        d0 = layers.data(
            "d0", shape=[10], append_batch_size=False, dtype='float32')
        d1 = layers.data(
            "d1", shape=[10], append_batch_size=False, dtype='float32')
        d2 = layers.data(
            "d2", shape=[10], append_batch_size=False, dtype='float32')
        # fill_constant npu op doesn't support int64
        i = layers.zeros(shape=[1], dtype='int32')
        i = layers.cast(i, 'int64')
        i.stop_gradient = True
        init = layers.zeros(shape=[10], dtype='float32')
        mem_array = layers.array_write(x=init, i=i)
        data_array = layers.array_write(x=d0, i=i)
        i = layers.increment(i)
        layers.array_write(d1, i, array=data_array)
        i = layers.increment(i)
        layers.array_write(d2, i, array=data_array)
        i = layers.zeros(shape=[1], dtype='int32')
        i = layers.cast(i, 'int64')
        i.stop_gradient = True
        array_len = layers.fill_constant(shape=[1], dtype='int32', value=5)
        array_len = layers.cast(array_len, 'int64')
        array_len.stop_gradient = True
        cond = layers.ones(shape=[1], dtype='int32')
        cond = layers.cast(cond, 'bool')
        j = layers.fill_constant(shape=[1], dtype='int32', value=1)
        j = layers.cast(j, 'int64')
        j.stop_gradient = True
        array_len2 = layers.fill_constant(shape=[1], dtype='int32', value=3)
        array_len2 = layers.cast(array_len2, 'int64')
        array_len2.stop_gradient = True
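        # note: the logical_or result computed below is immediately discarded;
        # cond2 is re-created as a constant True condition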
        cond2 = layers.logical_or(x=j, y=array_len2)
        cond2 = layers.ones(shape=[1], dtype='int32')
        cond2 = layers.cast(cond2, 'bool')
        while_op = layers.While(cond=cond)
        while_op2 = layers.While(cond=cond2)
        with while_op.block():
            d = layers.array_read(array=data_array, i=i)
            prev = layers.array_read(array=mem_array, i=i)
            result = layers.sums(input=[d, prev])

            i = layers.increment(x=i, in_place=True)
            layers.array_write(result, i=i, array=mem_array)
            layers.less_than(x=i, y=array_len, cond=cond)

            with while_op2.block():
                d2 = layers.array_read(array=data_array, i=j)
                prev2 = layers.array_read(array=mem_array, i=j)
                result2 = layers.sums(input=[d2, prev2])

                j = layers.increment(x=j, in_place=True)
                layers.array_write(result2, i=j, array=mem_array)
                layers.less_than(x=j, y=array_len2, cond=cond2)
        sum_result = layers.array_read(array=mem_array, i=j)
        loss = layers.mean(sum_result)
        return loss, sum_result
Example #3
def lstm_step(x_t, hidden_t_prev, cell_t_prev, size, para_name, args):
    """Util function for pointer network"""

    def linear(inputs, para_name, args):
        return layers.fc(input=inputs,
                         size=size,
                         param_attr=fluid.ParamAttr(name=para_name + '_w'),
                         bias_attr=fluid.ParamAttr(name=para_name + '_b'))

    input_cat = layers.concat([hidden_t_prev, x_t], axis=1)
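    # each gate is a sigmoid over a linear projection of [h_{t-1}, x_t];
    # the candidate cell state uses tanh instead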
    forget_gate = layers.sigmoid(x=linear(input_cat, para_name + '_lstm_f',
                                          args))
    input_gate = layers.sigmoid(x=linear(input_cat, para_name + '_lstm_i',
                                         args))
    output_gate = layers.sigmoid(x=linear(input_cat, para_name + '_lstm_o',
                                          args))
    cell_tilde = layers.tanh(x=linear(input_cat, para_name + '_lstm_c', args))

    cell_t = layers.sums(input=[
        layers.elementwise_mul(
            x=forget_gate, y=cell_t_prev), layers.elementwise_mul(
                x=input_gate, y=cell_tilde)
    ])

    hidden_t = layers.elementwise_mul(x=output_gate, y=layers.tanh(x=cell_t))

    return hidden_t, cell_t
Example #4
    def create_rnn_op(self):
        x = layers.data(shape=[self.sent_len, self.batch_size, self.input_dim],
                        dtype='float32',
                        name='x',
                        append_batch_size=False)
        x.stop_gradient = False
        h_boot1 = layers.data(shape=[self.batch_size, self.input_dim],
                              dtype='float32',
                              name='h_boot1',
                              append_batch_size=False)
        h_boot1.stop_gradient = False
        h_boot2 = layers.data(shape=[self.batch_size, self.input_dim],
                              dtype='float32',
                              name='h_boot2',
                              append_batch_size=False)
        h_boot2.stop_gradient = False

        rnn = layers.StaticRNN()
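        # rnn.step_input(x) slices x along dim 0; each step reads the two
        # memories and updates them with their (identity-scaled) copies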
        with rnn.step():
            h_pre1 = rnn.memory(init=h_boot1)
            h_pre2 = rnn.memory(init=h_boot2)
            x_t = rnn.step_input(x)

            mem1 = layers.scale(x=h_pre1, scale=1.0)
            mem2 = layers.scale(x=h_pre2, scale=1.0)
            out = layers.sums(input=[mem1, x_t, mem2])

            rnn.update_memory(h_pre1, mem1)
            rnn.update_memory(h_pre2, mem2)
            rnn.output(out)

        return rnn()
Example #5
    def test_simple_forward(self):
        d0 = layers.data(
            "d0", shape=[10], append_batch_size=False, dtype='float32')
        d1 = layers.data(
            "d1", shape=[10], append_batch_size=False, dtype='float32')
        d2 = layers.data(
            "d2", shape=[10], append_batch_size=False, dtype='float32')
        i = layers.zeros(shape=[1], dtype='int64')
        i.stop_gradient = True
        init = layers.zeros(shape=[10], dtype='float32')
        mem_array = layers.array_write(x=init, i=i)
        data_array = layers.array_write(x=d0, i=i)

        i = layers.increment(i)
        layers.array_write(d1, i, array=data_array)

        i = layers.increment(i)
        layers.array_write(d2, i, array=data_array)

        i = layers.zeros(shape=[1], dtype='int64')
        i.stop_gradient = True

        array_len = layers.fill_constant(shape=[1], dtype='int64', value=3)
        array_len.stop_gradient = True
        cond = layers.less_than(x=i, y=array_len)

        while_op = layers.While(cond=cond)
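        # ops built inside while_op.block() form the loop body; cond must be
        # updated there (see less_than below) or the loop never terminates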
        with while_op.block():
            d = layers.array_read(array=data_array, i=i)
            prev = layers.array_read(array=mem_array, i=i)
            result = layers.sums(input=[d, prev])

            i = layers.increment(x=i, in_place=True)
            layers.array_write(result, i=i, array=mem_array)
            layers.less_than(x=i, y=array_len, cond=cond)

        sum_result = layers.array_read(array=mem_array, i=i)
        loss = layers.mean(sum_result)

        append_backward(loss)

        cpu = core.CPUPlace()
        exe = Executor(cpu)
        d = []

        for i in range(3):
            d.append(numpy.random.random(size=[10]).astype('float32'))

        outs = exe.run(feed={'d0': d[0],
                             'd1': d[1],
                             'd2': d[2]},
                       fetch_list=[sum_result])
        self.assertAlmostEqual(numpy.sum(d), numpy.sum(outs[0]), delta=0.01)
Example #6
def weight_layers(lm_embeddings, name="", l2_coef=0.0):
    '''
    Weight the layers of a biLM with trainable scalar weights to
    compute ELMo representations.

    Input:
        lm_embeddings(list): representations of 2 layers from biLM.
        name(str): a string prefix used for the trainable variable names.
        l2_coef: the l2 regularization coefficient $\lambda$.
            Pass None or 0.0 for no regularization.

    Output:
        weighted_lm_layers: weighted embeddings from biLM
    '''

    n_lm_layers = len(lm_embeddings)
    W = layers.create_parameter(
        [
            n_lm_layers,
        ],
        dtype="float32",
        name=name + "ELMo_w",
        attr=fluid.ParamAttr(name=name + "ELMo_w",
                             initializer=fluid.initializer.Constant(0.0),
                             regularizer=fluid.regularizer.L2Decay(l2_coef)))
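    # normalize the scalar layer weights with a softmax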
    normed_weights = layers.softmax(W + 1.0 / n_lm_layers)
    splited_normed_weights = layers.split(normed_weights, n_lm_layers, dim=0)

    # compute the weighted, normalized LM activations
    pieces = []
    for w, t in zip(splited_normed_weights, lm_embeddings):
        pieces.append(t * w)
    sum_pieces = layers.sums(pieces)

    # scale the weighted sum by gamma
    gamma = layers.create_parameter(
        [1],
        dtype="float32",
        name=name + "ELMo_gamma",
        attr=fluid.ParamAttr(name=name + "ELMo_gamma",
                             initializer=fluid.initializer.Constant(1.0)))
    weighted_lm_layers = sum_pieces * gamma
    return weighted_lm_layers
Example #7
    def lstm_step(x_t, hidden_t_prev, cell_t_prev, size):
        def linear(inputs):
            return layers.fc(input=inputs, size=size, bias_attr=True)

        forget_gate = layers.sigmoid(x=linear([hidden_t_prev, x_t]))
        input_gate = layers.sigmoid(x=linear([hidden_t_prev, x_t]))
        output_gate = layers.sigmoid(x=linear([hidden_t_prev, x_t]))
        cell_tilde = layers.tanh(x=linear([hidden_t_prev, x_t]))

        cell_t = layers.sums(input=[
            layers.elementwise_mul(x=forget_gate, y=cell_t_prev),
            layers.elementwise_mul(x=input_gate, y=cell_tilde)
        ])

        hidden_t = layers.elementwise_mul(
            x=output_gate, y=layers.tanh(x=cell_t))

        return hidden_t, cell_t
Example #8
    def create_rnn_op(self):
        x = layers.data(
            shape=[self.sent_len, self.batch_size, self.input_dim],
            dtype='float32',
            name='x',
            append_batch_size=False,
            **self.p_info)
        x.stop_gradient = False
        h_boot1 = layers.data(
            shape=[self.batch_size, self.input_dim],
            dtype='float32',
            name='h_boot1',
            append_batch_size=False,
            **self.p_info)
        h_boot1.stop_gradient = False
        h_boot2 = layers.data(
            shape=[self.batch_size, self.input_dim],
            dtype='float32',
            name='h_boot2',
            append_batch_size=False,
            **self.p_info)
        h_boot2.stop_gradient = False

        rnn = layers.StaticRNN(main_program=self.main_program)
        with rnn.step():
            h_pre1 = rnn.memory(init=h_boot1)
            h_pre2 = rnn.memory(init=h_boot2)
            x_t = rnn.step_input(x)

            mem1 = layers.scale(x=h_pre1, scale=1.0, **self.p_info)
            mem2 = layers.scale(x=h_pre2, scale=1.0, **self.p_info)
            out = layers.sums(input=[mem1, x_t, mem2], **self.p_info)

            rnn.update_memory(h_pre1, mem1)
            rnn.update_memory(h_pre2, mem2)
            rnn.output(out)

        return rnn()
Example #9
    def test_read_write(self):
        x = [
            layers.data(name='x0', shape=[100]),
            layers.data(name='x1', shape=[100]),
            layers.data(name='x2', shape=[100])
        ]
        for each_x in x:
            each_x.stop_gradient = False

        tensor = numpy.random.random(size=(100, 100)).astype('float32')
        a_sum, x_sum = _test_read_write(x)

        place = core.CPUPlace()
        exe = Executor(place)
        outs = exe.run(feed={
            'x0': tensor,
            'x1': tensor,
            'x2': tensor
        },
                       fetch_list=[a_sum, x_sum],
                       scope=core.Scope())
        self.assertEqual(outs[0], outs[1])

        total_sum = layers.sums(input=[a_sum, x_sum])
        total_sum_scaled = layers.scale(x=total_sum, scale=1 / 6.0)

        append_backward(total_sum_scaled)

        g_vars = list(
            map(default_main_program().global_block().var,
                [each_x.name + "@GRAD" for each_x in x]))
        g_out = [
            item.sum() for item in exe.run(feed={
                'x0': tensor,
                'x1': tensor,
                'x2': tensor
            },
                                           fetch_list=g_vars)
        ]
        g_out_sum = numpy.array(g_out).sum()

        # Since the final gradient is 1 and the network is linear (array
        # read/write, sums and mean ops only), the summed input gradients
        # should also be 1.
        self.assertAlmostEqual(1.0, g_out_sum, delta=0.1)

        with fluid.dygraph.guard(place):
            tensor1 = fluid.dygraph.to_variable(tensor)
            tensor2 = fluid.dygraph.to_variable(tensor)
            tensor3 = fluid.dygraph.to_variable(tensor)
            x_dygraph = [tensor1, tensor2, tensor3]
            for each_x in x_dygraph:
                each_x.stop_gradient = False
            a_sum_dygraph, x_sum_dygraph = _test_read_write(x_dygraph)
            self.assertEqual(a_sum_dygraph, x_sum_dygraph)

            total_sum_dygraph = layers.sums(
                input=[a_sum_dygraph, x_sum_dygraph])
            total_sum_scaled_dygraph = layers.scale(x=total_sum_dygraph,
                                                    scale=1 / 6.0)
            total_sum_scaled_dygraph.backward()
            g_out_dygraph = [
                item._grad_ivar().numpy().sum() for item in x_dygraph
            ]
            g_out_sum_dygraph = numpy.array(g_out_dygraph).sum()

            self.assertAlmostEqual(1.0, g_out_sum_dygraph, delta=0.1)
Example #10
    def test_simple_forward(self):
        d0 = layers.data("d0",
                         shape=[10],
                         append_batch_size=False,
                         dtype='float32')
        d1 = layers.data("d1",
                         shape=[10],
                         append_batch_size=False,
                         dtype='float32')
        d2 = layers.data("d2",
                         shape=[10],
                         append_batch_size=False,
                         dtype='float32')

        i = layers.zeros(shape=[1], dtype='int64')
        i.stop_gradient = True

        init = layers.zeros(shape=[10], dtype='float32')
        mem_array = layers.array_write(x=init, i=i)
        data_array = layers.array_write(x=d0, i=i)

        i = layers.increment(i)
        layers.array_write(d1, i, array=data_array)

        i = layers.increment(i)
        layers.array_write(d2, i, array=data_array)

        i = layers.zeros(shape=[1], dtype='int64')
        i.stop_gradient = True

        array_len = layers.fill_constant(shape=[1], dtype='int64', value=1)
        array_len.stop_gradient = True
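        # array_len is 1 here, so the outer while loop below runs for a single iteration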
        cond = layers.less_than(x=i, y=array_len)

        j = layers.fill_constant(shape=[1], dtype='int64', value=1)
        j.stop_gradient = True

        array_len2 = layers.fill_constant(shape=[1], dtype='int64', value=3)
        array_len2.stop_gradient = True
        cond2 = layers.less_than(x=j, y=array_len2)

        while_op = layers.While(cond=cond)
        while_op2 = layers.While(cond=cond2)
        with while_op.block():
            d = layers.array_read(array=data_array, i=i)
            prev = layers.array_read(array=mem_array, i=i)
            result = layers.sums(input=[d, prev])

            i = layers.increment(x=i, in_place=True)
            layers.array_write(result, i=i, array=mem_array)
            layers.less_than(x=i, y=array_len, cond=cond)

            with while_op2.block():
                d2 = layers.array_read(array=data_array, i=j)
                prev2 = layers.array_read(array=mem_array, i=j)
                result2 = layers.sums(input=[d2, prev2])

                j = layers.increment(x=j, in_place=True)
                layers.array_write(result2, i=j, array=mem_array)
                layers.less_than(x=j, y=array_len2, cond=cond2)

        sum_result = layers.array_read(array=mem_array, i=j)
        loss = layers.mean(sum_result)

        append_backward(loss)

        cpu = core.CPUPlace()
        exe = Executor(cpu)
        d = []

        for i in range(3):
            d.append(numpy.random.random(size=[10]).astype('float32'))

        outs = exe.run(feed={
            'd0': d[0],
            'd1': d[1],
            'd2': d[2]
        },
                       fetch_list=[sum_result])
        self.assertAlmostEqual(numpy.sum(d), numpy.sum(outs[0]), delta=0.01)
Example #11
    def test_read_write(self):
        x = [
            layers.data(name='x0', shape=[100]),
            layers.data(name='x1', shape=[100]),
            layers.data(name='x2', shape=[100])
        ]

        for each_x in x:
            each_x.stop_gradient = False

        i = layers.zeros(shape=[1], dtype='int64')
        i.stop_gradient = False
        arr = layers.array_write(x=x[0], i=i)
        i = layers.increment(x=i)
        arr = layers.array_write(x=x[1], i=i, array=arr)
        i = layers.increment(x=i)
        arr = layers.array_write(x=x[2], i=i, array=arr)

        i = layers.zeros(shape=[1], dtype='int64')
        i.stop_gradient = False
        a0 = layers.array_read(array=arr, i=i)
        i = layers.increment(x=i)
        a1 = layers.array_read(array=arr, i=i)
        i = layers.increment(x=i)
        a2 = layers.array_read(array=arr, i=i)

        mean_a0 = layers.mean(a0)
        mean_a1 = layers.mean(a1)
        mean_a2 = layers.mean(a2)

        a_sum = layers.sums(input=[mean_a0, mean_a1, mean_a2])

        mean_x0 = layers.mean(x[0])
        mean_x1 = layers.mean(x[1])
        mean_x2 = layers.mean(x[2])

        x_sum = layers.sums(input=[mean_x0, mean_x1, mean_x2])

        scope = core.Scope()
        cpu = core.CPUPlace()

        exe = Executor(cpu)

        tensor = numpy.random.random(size=(100, 100)).astype('float32')

        outs = exe.run(feed={
            'x0': tensor,
            'x1': tensor,
            'x2': tensor
        },
                       fetch_list=[a_sum, x_sum],
                       scope=scope)
        self.assertEqual(outs[0], outs[1])

        total_sum = layers.sums(input=[a_sum, x_sum])
        total_sum_scaled = layers.scale(x=total_sum, scale=1 / 6.0)

        append_backward(total_sum_scaled)

        g_vars = list(
            map(default_main_program().global_block().var,
                [each_x.name + "@GRAD" for each_x in x]))
        g_out = [
            item.sum() for item in exe.run(feed={
                'x0': tensor,
                'x1': tensor,
                'x2': tensor
            },
                                           fetch_list=g_vars)
        ]
        g_out_sum = numpy.array(g_out).sum()

        # Since the final gradient is 1 and the network is linear (array
        # read/write, sums and mean ops only), the summed input gradients
        # should also be 1.
        self.assertAlmostEqual(1.0, g_out_sum, delta=0.1)
Example #12
    def run_main(self, place, with_data_parallel):
        self.place = place
        self.with_data_parallel = with_data_parallel

        if not core.is_compiled_with_cuda() and isinstance(
                self.place, core.CUDAPlace):
            return

        if isinstance(self.place, core.CUDAPlace):
            device_cnt = core.get_cuda_device_count(
            ) if self.with_data_parallel else 1
        else:
            device_cnt = int(
                os.environ.get('CPU_NUM', multiprocessing.cpu_count())
            ) if self.with_data_parallel else 1

        d0 = layers.data("d0",
                         shape=[10],
                         append_batch_size=False,
                         dtype='float32')
        d1 = layers.data("d1",
                         shape=[10],
                         append_batch_size=False,
                         dtype='float32')
        d2 = layers.data("d2",
                         shape=[10],
                         append_batch_size=False,
                         dtype='float32')

        i = layers.zeros(shape=[1], dtype='int64')
        i.stop_gradient = True

        init = layers.zeros(shape=[10], dtype='float32')
        mem_array = layers.array_write(x=init, i=i)
        data_array = layers.array_write(x=d0, i=i)

        i = layers.increment(i)
        layers.array_write(d1, i, array=data_array)

        i = layers.increment(i)
        layers.array_write(d2, i, array=data_array)

        i = layers.zeros(shape=[1], dtype='int64')
        i.stop_gradient = True

        array_len = layers.fill_constant(shape=[1], dtype='int64', value=1)
        array_len.stop_gradient = True
        cond = layers.less_than(x=i, y=array_len)

        j = layers.fill_constant(shape=[1], dtype='int64', value=1)
        j.stop_gradient = True

        array_len2 = layers.fill_constant(shape=[1], dtype='int64', value=3)
        array_len2.stop_gradient = True
        cond2 = layers.less_than(x=j, y=array_len2)

        while_op = layers.While(cond=cond)
        while_op2 = layers.While(cond=cond2)
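        # nested While loops: the inner loop over j executes inside every
        # iteration of the outer loop over i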
        with while_op.block():
            d = layers.array_read(array=data_array, i=i)
            prev = layers.array_read(array=mem_array, i=i)
            d = layers.reshape(d, shape=[10])
            prev = layers.reshape(prev, shape=[10])
            result = layers.sums(input=[d, prev])

            i = layers.increment(x=i, in_place=True)
            layers.array_write(result, i=i, array=mem_array)
            layers.less_than(x=i, y=array_len, cond=cond)
            with while_op2.block():
                d2 = layers.array_read(array=data_array, i=j)
                prev2 = layers.array_read(array=mem_array, i=j)
                d2 = layers.reshape(d2, shape=[10])
                prev2 = layers.reshape(prev2, shape=[10])
                result2 = layers.sums(input=[d2, prev2])

                j = layers.increment(x=j, in_place=True)
                layers.array_write(result2, i=j, array=mem_array)
                layers.less_than(x=j, y=array_len2, cond=cond2)

        sum_result = layers.array_read(array=mem_array, i=j)
        sum_result.persistable = True
        tmp = layers.unsqueeze(sum_result, axes=[0])
        tmp = layers.expand(tmp, expand_times=[10, 1])
        fc = layers.fc(tmp, size=256)
        loss = layers.mean(sum_result)

        optim = fluid.optimizer.Adam(learning_rate=1e-3)
        optim.minimize(loss)

        exe = Executor(self.place)
        exe.run(fluid.default_startup_program())

        prog = fluid.default_main_program()
        if self.with_data_parallel:
            prog = compiler.CompiledProgram(
                fluid.default_main_program()).with_data_parallel(
                    loss_name=loss.name)

        for _ in range(5):
            d = []
            for i in range(3):
                tmp = numpy.random.random(size=[10]).astype('float32')
                if not self.with_data_parallel:
                    d.append(tmp)
                else:
                    d.append(numpy.array([tmp] * device_cnt))

            outs = exe.run(program=prog,
                           feed={
                               'd0': d[0],
                               'd1': d[1],
                               'd2': d[2]
                           },
                           fetch_list=[sum_result])
            self.assertAlmostEqual(numpy.sum(d),
                                   numpy.sum(outs[0]),
                                   delta=0.01)
Example #13
    def test_read_write(self):
        x = [
            layers.data(
                name='x0', shape=[100]), layers.data(
                    name='x1', shape=[100]), layers.data(
                        name='x2', shape=[100])
        ]

        for each_x in x:
            each_x.stop_gradient = False

        i = layers.zeros(shape=[1], dtype='int64')
        i.stop_gradient = False
        arr = layers.array_write(x=x[0], i=i)
        i = layers.increment(x=i)
        arr = layers.array_write(x=x[1], i=i, array=arr)
        i = layers.increment(x=i)
        arr = layers.array_write(x=x[2], i=i, array=arr)

        i = layers.zeros(shape=[1], dtype='int64')
        i.stop_gradient = False
        a0 = layers.array_read(array=arr, i=i)
        i = layers.increment(x=i)
        a1 = layers.array_read(array=arr, i=i)
        i = layers.increment(x=i)
        a2 = layers.array_read(array=arr, i=i)

        mean_a0 = layers.mean(a0)
        mean_a1 = layers.mean(a1)
        mean_a2 = layers.mean(a2)

        a_sum = layers.sums(input=[mean_a0, mean_a1, mean_a2])

        mean_x0 = layers.mean(x[0])
        mean_x1 = layers.mean(x[1])
        mean_x2 = layers.mean(x[2])

        x_sum = layers.sums(input=[mean_x0, mean_x1, mean_x2])

        scope = core.Scope()
        cpu = core.CPUPlace()

        exe = Executor(cpu)

        tensor = numpy.random.random(size=(100, 100)).astype('float32')

        outs = exe.run(feed={'x0': tensor,
                             'x1': tensor,
                             'x2': tensor},
                       fetch_list=[a_sum, x_sum],
                       scope=scope)
        self.assertEqual(outs[0], outs[1])

        total_sum = layers.sums(input=[a_sum, x_sum])
        total_sum_scaled = layers.scale(x=total_sum, scale=1 / 6.0)

        append_backward(total_sum_scaled)

        g_vars = list(
            map(default_main_program().global_block().var,
                [each_x.name + "@GRAD" for each_x in x]))
        g_out = [
            item.sum()
            for item in exe.run(
                feed={'x0': tensor,
                      'x1': tensor,
                      'x2': tensor},
                fetch_list=g_vars)
        ]
        g_out_sum = numpy.array(g_out).sum()

        # Since the final gradient is 1 and the network is linear (array
        # read/write, sums and mean ops only), the summed input gradients
        # should also be 1.
        self.assertAlmostEqual(1.0, g_out_sum, delta=0.1)