def test_sparseblockouter(self):
        o = ftensor4()
        x = ftensor3()
        y = ftensor3()
        xIdx = imatrix()
        yIdx = imatrix()

        out = self.outer_op(o, x, y, xIdx, yIdx)
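        # Per the op's docstring, the block-sparse outer update being tested is
        # roughly
        #     o[xIdx[b, i], yIdx[b, j]] += alpha * outer(x[b, i], y[b, j])
        # for every batch b and every pair of indexed blocks (i, j).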

        f = aesara.function([o, x, y, xIdx, yIdx],
                            out,
                            on_unused_input="warn",
                            mode=self.mode)

        (
            o_val,
            x_val,
            y_val,
            xIdx_val,
            yIdx_val,
        ) = self.outer_data()

        th_out = f(o_val, x_val, y_val, xIdx_val, yIdx_val)
        ref_out = self.outer_numpy(o_val, x_val, y_val, xIdx_val, yIdx_val)

        utt.assert_allclose(ref_out, th_out)
    def test_outer_infershape(self):
        o = ftensor4()
        x = ftensor3()
        y = ftensor3()
        xIdx = imatrix()
        yIdx = imatrix()

        self._compile_and_check(
            [o, x, y, xIdx, yIdx],
            [self.outer_op(o, x, y, xIdx, yIdx)],
            self.outer_data(),
            self.outer_class,
        )
    def test_sparseblockgemvF(self):
        # Test the Fortran order for W (which can happen in the grad of some
        # graphs).

        b = fmatrix()
        W = ftensor4()
        h = ftensor3()
        iIdx = imatrix()
        oIdx = imatrix()

        # The DimShuffle below swaps the last two axes of W inside the graph,
        # so passing np.swapaxes(W_val, 2, 3) at call time hands the op a W
        # whose trailing dimensions are in Fortran-like (non-C-contiguous)
        # order while still computing against the original W_val.
        o = self.gemv_op(
            b.take(oIdx, axis=0),
            DimShuffle((False, False, False, False),
                       (0, 1, 3, 2))(at.as_tensor_variable(W)),
            h,
            iIdx,
            oIdx,
        )

        f = aesara.function([W, h, iIdx, b, oIdx], o, mode=self.mode)

        W_val, h_val, iIdx_val, b_val, oIdx_val = self.gemv_data()

        th_out = f(np.swapaxes(W_val, 2, 3), h_val, iIdx_val, b_val, oIdx_val)
        ref_out = self.gemv_numpy(b_val.take(oIdx_val, axis=0), W_val, h_val,
                                  iIdx_val, oIdx_val)

        utt.assert_allclose(ref_out, th_out)
    def make_node(self, activations, labels, input_lengths):
        t_activations = at.as_tensor_variable(activations)
        # Ensure activations array is C-contiguous
        t_activations = cpu_contiguous(t_activations)

        t_labels = at.as_tensor_variable(labels)
        t_input_lengths = at.as_tensor_variable(input_lengths)

        if t_activations.type.dtype != "float32":
            raise TypeError("activations must use the float32 type!")

        if t_activations.ndim != 3:
            raise ValueError("activations must have 3 dimensions.")

        if t_labels.type.dtype != "int32":
            raise TypeError("labels must use the int32 type!")

        if t_labels.ndim != 2:
            raise ValueError("labels must have 2 dimensions.")

        if t_input_lengths.type.dtype != "int32":
            raise TypeError("input_lengths must use the int32 type!")

        if t_input_lengths.ndim != 1:
            raise ValueError("input_lengths must have 1 dimension.")

        costs = fvector(name="ctc_cost")
        outputs = [costs]
        if self.compute_grad:
            gradients = ftensor3(name="ctc_grad")
            outputs += [gradients]

        return Apply(self,
                     inputs=[t_activations, t_labels, t_input_lengths],
                     outputs=outputs)
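# A minimal, standalone sketch (not part of the test suite) of inputs that
# would satisfy the dtype/ndim checks enforced by make_node above.  The
# semantic meaning of each axis (time, batch, labels) is an assumption based
# on common CTC conventions, and the helper name is illustrative only.
import numpy as np


def _dummy_ctc_inputs(n_steps=10, batch=2, n_labels=5, label_len=3):
    # (time, batch, n_labels + 1 blank) float32 activations, C-contiguous
    activations = np.zeros((n_steps, batch, n_labels + 1), dtype="float32")
    # (batch, label_len) int32 label sequences
    labels = np.zeros((batch, label_len), dtype="int32")
    # (batch,) int32 per-example input lengths
    input_lengths = np.full((batch,), n_steps, dtype="int32")

    assert activations.ndim == 3 and activations.dtype == np.float32
    assert labels.ndim == 2 and labels.dtype == np.int32
    assert input_lengths.ndim == 1 and input_lengths.dtype == np.int32
    return activations, labels, input_lengths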
    def test_Strides3D(self, mode):
        np_func = dict(add=np.cumsum, mul=np.cumprod)[mode]
        op_class = partial(self.op_class, mode=mode)
        x = ftensor3("x")

        for axis in (0, 1, 2, None, -1, -2, -3):
            a = np.random.random((42, 30, 25)).astype("float32")
            cumop_function = aesara.function([x],
                                             op_class(axis=axis)(x),
                                             mode=self.mode)

            slicings = [
                slice(None, None, None),  # Normal strides
                slice(None, None, 2),  # Stepped strides
                slice(None, None, -1),  # Negative strides
            ]

            # Cartesian product of all slicings to test.
            for slicing in product(slicings, repeat=x.ndim):
                f = aesara.function([x],
                                    op_class(axis=axis)(x[slicing]),
                                    mode=self.mode)
                assert [
                    n for n in f.maker.fgraph.toposort()
                    if isinstance(n.op, GpuCumOp)
                ]
                utt.assert_allclose(np_func(a[slicing], axis=axis), f(a))
                utt.assert_allclose(np_func(a[slicing], axis=axis),
                                    cumop_function(a[slicing]))
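# A standalone NumPy-only sketch (no Aesara/GPU involved) of the reference
# behaviour the strides test above relies on: cumsum/cumprod give the same
# result on a non-contiguous slice as on a contiguous copy of that slice.
import numpy as np

arr = np.random.random((8, 6, 5)).astype("float32")
for view in (arr[::2], arr[:, ::-1], arr[..., ::2]):
    for axis in (0, 1, 2, None):
        assert np.allclose(np.cumsum(view, axis=axis),
                           np.cumsum(np.ascontiguousarray(view), axis=axis))
        assert np.allclose(np.cumprod(view, axis=axis),
                           np.cumprod(np.ascontiguousarray(view), axis=axis))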
    def test_GpuCumOp3D(self, mode):
        np_func = dict(add=np.cumsum, mul=np.cumprod)[mode]
        op_class = partial(self.op_class, mode=mode)
        block_max_size = self.max_threads_dim0 * 2
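        # block_max_size mirrors the largest slice a single GPU block is
        # expected to scan in one pass (presumably two elements per thread).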

        x = ftensor3("x")
        for shape_axis, axis in zip([0, 1, 2, 0, 2, 1, 0],
                                    [0, 1, 2, None, -1, -2, -3]):
            f = aesara.function([x], op_class(axis=axis)(x), mode=self.mode)
            assert [
                n for n in f.maker.fgraph.toposort()
                if isinstance(n.op, GpuCumOp)
            ]

            # Extensive testing for the first 1025 sizes
            a_shape = [5, 5, 5]
            a_shape[shape_axis] = 1025
            a = np.random.rand(*a_shape).astype("float32")
            slices = [slice(None), slice(None), slice(None)]
            for i in range(a.shape[shape_axis]):
                slices[shape_axis] = slice(i)
                fa = f(a[slices])
                npa = np_func(a[slices], axis=axis)
                utt.assert_allclose(npa, fa)

            # Use multiple GPU threadblocks (along accumulation axis)
            a_shape = [2, 2, 2]
            a_shape[shape_axis] = block_max_size + 2
            a = np.random.random(a_shape).astype("float32")
            utt.assert_allclose(np_func(a, axis=axis), f(a))

            # Use multiple GPU gridblocks (not along accumulation axis)
            a_shape = [5, 5, 5]
            a_shape[(shape_axis + 1) % 3] = self.max_grid_size1 + 1
            a = np.random.random(a_shape).astype("float32")
            if axis is None:
                # Avoid floating point error
                a = np.sign(a - 0.5).astype("float32")
            utt.assert_allclose(np_func(a, axis=axis), f(a))

            a_shape = [5, 5, 5]
            a_shape[(shape_axis + 2) % 3] = self.max_grid_size1 + 1
            a = np.random.random(a_shape).astype("float32")
            if axis is None:
                # Avoid floating point error
                a = np.sign(a - 0.5).astype("float32")
            utt.assert_allclose(np_func(a, axis=axis), f(a))

            # Use recursive cumop (along accumulation axis)
            a_shape = [3, 3, 3]
            a_shape[shape_axis] = block_max_size * (block_max_size + 1) + 2
            a = np.random.random(a_shape).astype("float32")
            # Avoid floating point error
            a = np.sign(a - 0.5).astype("float32")
            utt.assert_allclose(np_func(a, axis=axis), f(a))
    def test_gemv_infershape(self):
        b = fmatrix()
        W = ftensor4()
        h = ftensor3()
        iIdx = imatrix()
        oIdx = imatrix()

        self._compile_and_check(
            [W, h, iIdx, b, oIdx],
            [self.gemv_op(b.take(oIdx, axis=0), W, h, iIdx, oIdx)],
            self.gemv_data(),
            self.gemv_class,
        )
    def test_dot_infershape(self):
        b = fmatrix()
        W = ftensor4()
        h = ftensor3()
        iIdx = imatrix()
        oIdx = imatrix()

        self._compile_and_check(
            [W, h, iIdx, b, oIdx],
            [sparse_block_dot(W, h, iIdx, b, oIdx)],
            self.gemv_data(),
            self.gemv_class,
        )
    def test_memory_reuse_gpudimshuffle(self):
        # Test the memory pre-allocation feature in scan when one output is
        # the result of a GpuDimshuffle (an optimization in GpuDimshuffle can
        # confuse the memory pre-allocation, making it falsely think that a
        # pre-allocated memory region has been used when it hasn't).
        def inner_fn(seq1, recurrent_out):
            temp = seq1 + recurrent_out.sum()
            output1 = temp.dimshuffle(1, 0)
            output2 = temp.sum() + recurrent_out
            return output1, output2

        input1 = ftensor3()
        init = ftensor3()
        outputs_info = [None, init]

        out, _ = scan(
            inner_fn,
            sequences=[input1],
            outputs_info=outputs_info,
            mode=self.mode_with_gpu,
        )

        out1 = out[0].flatten()
        out2 = out[1].flatten()

        fct = aesara.function([input1, init], [out1, out2],
                              mode=self.mode_with_gpu)

        output = fct(np.ones((2, 1, 1), dtype="float32"),
                     np.ones((1, 1, 1), dtype="float32"))
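        # With all-ones inputs the two scan steps compute:
        #   step 1: temp = 1 + sum(init) = 2, output2 = 2 + 1 = 3
        #   step 2: temp = 1 + 3         = 4, output2 = 4 + 3 = 7
        # hence the expected flattened outputs below.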

        expected_output = (
            np.array([2, 4], dtype="float32"),
            np.array([3, 7], dtype="float32"),
        )
        utt.assert_allclose(output, expected_output)
def test_blocksparse_inplace_gemv_opt():
    b = fmatrix()
    W = ftensor4()
    h = ftensor3()
    iIdx = lmatrix()
    oIdx = lmatrix()

    o = sparse_block_dot(W, h, iIdx, b, oIdx)

    f = aesara.function([W, h, iIdx, b, oIdx], o)

    if aesara.config.mode == "FAST_COMPILE":
        assert not f.maker.fgraph.toposort()[-1].op.inplace
        assert check_stack_trace(f, ops_to_check=[sparse_block_gemv])
    else:
        assert f.maker.fgraph.toposort()[-1].op.inplace
        assert check_stack_trace(f, ops_to_check=[sparse_block_gemv_inplace])
def test_blocksparse_inplace_outer_opt():
    b = fmatrix()
    W = ftensor4()
    h = ftensor3()
    iIdx = lmatrix()
    oIdx = lmatrix()

    o = sparse_block_dot(W, h, iIdx, b, oIdx)

    f = aesara.function([W, h, iIdx, b, oIdx],
                        [o, aesara.gradient.grad(o.sum(), wrt=W)])

    if aesara.config.mode == "FAST_COMPILE":
        assert not f.maker.fgraph.toposort()[-1].op.inplace
        assert check_stack_trace(f, ops_to_check=sparse_block_outer)
    else:
        assert f.maker.fgraph.toposort()[-1].op.inplace
        assert check_stack_trace(f, ops_to_check=sparse_block_outer_inplace)
    def test_sparseblockgemv_grad_shape(self):
        b = fmatrix()
        W = ftensor4()
        h = ftensor3()
        iIdx = imatrix()
        oIdx = imatrix()

        o = self.gemv_op(b.take(oIdx, axis=0), W, h, iIdx, oIdx)
        go = aesara.grad(o.sum(), [b, W, h])

        f = aesara.function([W, h, iIdx, b, oIdx], go, mode=self.mode)

        W_val, h_val, iIdx_val, b_val, oIdx_val = self.gemv_data()

        # Just make sure that it runs correctly and that all the shapes are OK.
        b_g, W_g, h_g = f(W_val, h_val, iIdx_val, b_val, oIdx_val)

        assert b_g.shape == b_val.shape
        assert h_g.shape == h_val.shape
        assert W_g.shape == W_val.shape
    def test_sparseblockgemv(self):
        # Compare the NumPy and Aesara versions of sparse block gemv.

        b = fmatrix()
        W = ftensor4()
        h = ftensor3()
        iIdx = imatrix()
        oIdx = imatrix()

        o = self.gemv_op(b.take(oIdx, axis=0), W, h, iIdx, oIdx)

        f = aesara.function([W, h, iIdx, b, oIdx], o, mode=self.mode)

        W_val, h_val, iIdx_val, b_val, oIdx_val = self.gemv_data()

        th_out = f(W_val, h_val, iIdx_val, b_val, oIdx_val)
        ref_out = self.gemv_numpy(b_val.take(oIdx_val, axis=0), W_val, h_val,
                                  iIdx_val, oIdx_val)

        utt.assert_allclose(ref_out, th_out)
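# A rough, self-contained NumPy sketch of the block-sparse gemv computation
# the tests above compare against, following the op's documented semantics.
# The helper name and the tiny shapes are illustrative only; this is not the
# suite's gemv_numpy/gemv_data.
import numpy as np


def _block_gemv_ref(o, W, h, iIdx, oIdx):
    # o: (batch, oWin, oSize)      pre-filled with the selected biases
    # W: (iBlocks, oBlocks, iSize, oSize)
    # h: (batch, iWin, iSize)
    # iIdx: (batch, iWin), oIdx: (batch, oWin)
    out = o.copy()
    for bb in range(o.shape[0]):
        for j in range(o.shape[1]):
            for i in range(h.shape[1]):
                out[bb, j] += np.dot(h[bb, i], W[iIdx[bb, i], oIdx[bb, j]])
    return out


rng = np.random.default_rng(0)
batch, iWin, oWin, iBlocks, oBlocks, iSize, oSize = 2, 3, 2, 4, 5, 6, 7
W_ = rng.standard_normal((iBlocks, oBlocks, iSize, oSize)).astype("float32")
h_ = rng.standard_normal((batch, iWin, iSize)).astype("float32")
b_ = rng.standard_normal((oBlocks, oSize)).astype("float32")
iIdx_ = rng.integers(0, iBlocks, size=(batch, iWin)).astype("int32")
oIdx_ = rng.integers(0, oBlocks, size=(batch, oWin)).astype("int32")
out = _block_gemv_ref(b_.take(oIdx_, axis=0), W_, h_, iIdx_, oIdx_)
assert out.shape == (batch, oWin, oSize)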
    def test_blocksparse_grad_merge(self):
        b = fmatrix()
        h = ftensor3()
        iIdx = lmatrix()
        oIdx = lmatrix()

        W_val, h_val, iIdx_val, b_val, oIdx_val = self.gemv_data()
        W = gpuarray_shared_constructor(W_val, context=test_ctx_name)

        o = gpu_sparse_block_gemv(b.take(oIdx, axis=0), W, h, iIdx, oIdx)
        gW = aesara.grad(o.sum(), W)

        lr = np.asarray(0.05, dtype="float32")

        upd = W - lr * gW

        f1 = aesara.function([h, iIdx, b, oIdx], updates=[(W, upd)], mode=mode_with_gpu)

        # Make sure the lr update was merged.
        assert isinstance(f1.maker.fgraph.outputs[0].owner.op, GpuSparseBlockOuter)

        # Exclude the merge optimizations.
        mode = mode_with_gpu.excluding("local_merge_blocksparse_alpha")
        mode = mode.excluding("local_merge_blocksparse_output")

        f2 = aesara.function([h, iIdx, b, oIdx], updates=[(W, upd)], mode=mode)

        # Make sure the lr update is not merged.
        assert not isinstance(f2.maker.fgraph.outputs[0].owner.op, GpuSparseBlockOuter)

        f2(h_val, iIdx_val, b_val, oIdx_val)
        W_ref = W.get_value()

        # Reset the shared variable to its original value.
        W.set_value(W_val)
        f1(h_val, iIdx_val, b_val, oIdx_val)
        W_opt = W.get_value()

        utt.assert_allclose(W_ref, W_opt)
    def test_gpu_memory_usage(self):
        # This test validates that the memory usage of the defined aesara
        # function is reasonable when executed on the GPU. It checks for a
        # bug in which one of scan's optimizations was not applied, which
        # made the scan node compute large, unnecessary outputs and brought
        # memory usage on the GPU to ~12G.

        # Dimensionality of input and output data (not one-hot coded)
        n_in = 100
        n_out = 100
        # Number of neurons in hidden layer
        n_hid = 4000

        # Number of minibatches
        mb_size = 2
        # Time steps in minibatch
        mb_length = 200

        # Define input variables
        xin = ftensor3(name="xin")
        yout = ftensor3(name="yout")

        # Initialize the network parameters
        U = aesara.shared(np.zeros((n_in, n_hid), dtype="float32"),
                          name="W_xin_to_l1")
        V = aesara.shared(np.zeros((n_hid, n_hid), dtype="float32"),
                          name="W_l1_to_l1")
        W = aesara.shared(np.zeros((n_hid, n_out), dtype="float32"),
                          name="W_l1_to_l2")
        nparams = [U, V, W]

        # Build the forward pass
        l1_base = dot(xin, U)

        def scan_l(baseline, last_step):
            return baseline + dot(last_step, V)

        zero_output = aet.alloc(np.asarray(0.0, dtype="float32"), mb_size,
                                n_hid)

        l1_out, _ = scan(
            scan_l,
            sequences=[l1_base],
            outputs_info=[zero_output],
            mode=self.mode_with_gpu_nodebug,
        )

        l2_out = dot(l1_out, W)

        # Compute the cost and take the gradient wrt params
        cost = tt_sum((l2_out - yout)**2)
        grads = aesara.grad(cost, nparams)
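        # Plain gradient-descent-style updates with an implicit step size of 1;
        # the exact parameter values do not matter for this test.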
        updates = list(zip(nparams, (n - g for n, g in zip(nparams, grads))))

        # Compile the aesara function
        feval_backprop = aesara.function([xin, yout],
                                         cost,
                                         updates=updates,
                                         mode=self.mode_with_gpu_nodebug)

        # Validate that the PushOutScanOutput optimization has been applied
        # by checking the number of outputs of the grad Scan node in the
        # compiled function.
        nodes = feval_backprop.maker.fgraph.toposort()
        scan_nodes = [n for n in nodes if isinstance(n.op, Scan)]

        # The grad scan is always the 2nd one according to toposort. If the
        # optimization has been applied, it has 2 outputs, otherwise 3.
        grad_scan_node = scan_nodes[1]
        assert len(grad_scan_node.outputs) == 2, len(grad_scan_node.outputs)

        # Call the aesara function to ensure the absence of a memory error
        feval_backprop(
            np.zeros((mb_length, mb_size, n_in), dtype="float32"),
            np.zeros((mb_length, mb_size, n_out), dtype="float32"),
        )