def test_sparseblockouter(self):
    # Compile the outer op and compare its result with the NumPy reference
    # implementation on the data produced by ``self.outer_data``.
    out_var = ftensor4()
    x_var = ftensor3()
    y_var = ftensor3()
    x_idx = imatrix()
    y_idx = imatrix()
    symbolic_inputs = [out_var, x_var, y_var, x_idx, y_idx]

    compiled = aesara.function(
        symbolic_inputs,
        self.outer_op(out_var, x_var, y_var, x_idx, y_idx),
        on_unused_input="warn",
        mode=self.mode,
    )

    o_val, x_val, y_val, x_idx_val, y_idx_val = self.outer_data()
    values = (o_val, x_val, y_val, x_idx_val, y_idx_val)

    actual = compiled(*values)
    expected = self.outer_numpy(*values)

    utt.assert_allclose(expected, actual)
def test_outer_infershape(self):
    # Run the shared ``_compile_and_check`` helper on the outer op, using
    # the values from ``self.outer_data`` and ``self.outer_class``.
    out_var = ftensor4()
    x_var = ftensor3()
    y_var = ftensor3()
    x_idx = imatrix()
    y_idx = imatrix()

    symbolic_inputs = [out_var, x_var, y_var, x_idx, y_idx]
    symbolic_outputs = [self.outer_op(out_var, x_var, y_var, x_idx, y_idx)]

    self._compile_and_check(
        symbolic_inputs,
        symbolic_outputs,
        self.outer_data(),
        self.outer_class,
    )
def test_sparseblockgemvF(self):
    # Exercise the Fortran-ordered layout for W (which can show up in the
    # gradient of some graphs).
    bias = fmatrix()
    weights = ftensor4()
    hidden = ftensor3()
    in_idx = imatrix()
    out_idx = imatrix()

    gathered_bias = bias.take(out_idx, axis=0)
    # Swap the last two axes of W symbolically; the compiled function is fed
    # a W whose last two axes are already swapped (see below).
    swap_last_two = DimShuffle((False, False, False, False), (0, 1, 3, 2))
    weights_swapped = swap_last_two(at.as_tensor_variable(weights))

    result = self.gemv_op(gathered_bias, weights_swapped, hidden, in_idx, out_idx)
    compiled = aesara.function(
        [weights, hidden, in_idx, bias, out_idx], result, mode=self.mode
    )

    W_val, h_val, iIdx_val, b_val, oIdx_val = self.gemv_data()

    actual = compiled(np.swapaxes(W_val, 2, 3), h_val, iIdx_val, b_val, oIdx_val)
    expected = self.gemv_numpy(
        b_val.take(oIdx_val, axis=0), W_val, h_val, iIdx_val, oIdx_val
    )

    utt.assert_allclose(expected, actual)
def make_node(self, activations, labels, input_lengths):
    """Validate the inputs and build the Apply node for this op.

    Each input is converted to a tensor variable; the activations are
    additionally passed through ``cpu_contiguous``.

    Raises
    ------
    TypeError
        If ``activations`` is not float32, or ``labels`` / ``input_lengths``
        are not int32.
    ValueError
        If ``activations`` is not 3-dimensional, ``labels`` is not
        2-dimensional, or ``input_lengths`` is not 1-dimensional.

    Returns
    -------
    Apply
        A node whose outputs are the cost vector and, when
        ``self.compute_grad`` is truthy, a 3D gradient tensor.
    """
    # Ensure the activations array is C-contiguous.
    t_activations = cpu_contiguous(at.as_tensor_variable(activations))
    t_labels = at.as_tensor_variable(labels)
    t_input_lengths = at.as_tensor_variable(input_lengths)

    # (variable, required dtype, dtype error, required ndim, ndim error),
    # checked in this exact order so the first failing rule is reported.
    requirements = (
        (
            t_activations,
            "float32",
            "activations must use the float32 type!",
            3,
            "activations must have 3 dimensions.",
        ),
        (
            t_labels,
            "int32",
            "labels must use the int32 type!",
            2,
            "labels must have 2 dimensions.",
        ),
        (
            t_input_lengths,
            "int32",
            "input_lengths must use the int32 type!",
            1,
            "input_lengths must have 1 dimension.",
        ),
    )
    for variable, dtype, dtype_error, ndim, ndim_error in requirements:
        if variable.type.dtype != dtype:
            raise TypeError(dtype_error)
        if variable.ndim != ndim:
            raise ValueError(ndim_error)

    outputs = [fvector(name="ctc_cost")]
    if self.compute_grad:
        outputs.append(ftensor3(name="ctc_grad"))

    return Apply(
        self,
        inputs=[t_activations, t_labels, t_input_lengths],
        outputs=outputs,
    )
def test_Strides3D(self, mode):
    # Check the cumulative op on sliced 3D inputs, for every axis and every
    # combination of per-dimension slicings, against the NumPy equivalent.
    np_func = {"add": np.cumsum, "mul": np.cumprod}[mode]
    op_class = partial(self.op_class, mode=mode)
    x = ftensor3("x")

    slicings = [
        slice(None, None, None),  # Normal strides
        slice(None, None, 2),  # Stepped strides
        slice(None, None, -1),  # Negative strides
    ]

    for axis in (0, 1, 2, None, -1, -2, -3):
        a = np.random.random((42, 30, 25)).astype("float32")
        cumop_function = aesara.function(
            [x], op_class(axis=axis)(x), mode=self.mode
        )

        # Cartesian product of all slicings to test.
        for slicing in product(slicings, repeat=x.ndim):
            # Here the slicing is part of the compiled graph.
            f = aesara.function(
                [x], op_class(axis=axis)(x[slicing]), mode=self.mode
            )
            gpu_cum_nodes = [
                node
                for node in f.maker.fgraph.toposort()
                if isinstance(node.op, GpuCumOp)
            ]
            assert gpu_cum_nodes

            sliced = a[slicing]
            utt.assert_allclose(np_func(sliced, axis=axis), f(a))
            # Here the input is sliced before it reaches the function.
            utt.assert_allclose(np_func(sliced, axis=axis), cumop_function(sliced))
def test_GpuCumOp3D(self, mode):
    # Compare GpuCumOp with the NumPy cumulative function (cumsum for
    # mode="add", cumprod for mode="mul") on 3D inputs, for every axis value
    # (positive, negative and None). ``shape_axis`` is the dimension whose
    # size is varied for the corresponding ``axis``.
    np_func = dict(add=np.cumsum, mul=np.cumprod)[mode]
    op_class = partial(self.op_class, mode=mode)
    block_max_size = self.max_threads_dim0 * 2

    x = ftensor3("x")
    for shape_axis, axis in zip([0, 1, 2, 0, 2, 1, 0], [0, 1, 2, None, -1, -2, -3]):
        f = aesara.function([x], op_class(axis=axis)(x), mode=self.mode)
        assert [n for n in f.maker.fgraph.toposort() if isinstance(n.op, GpuCumOp)]

        # Extensive testing for the first 1025 sizes
        a_shape = [5, 5, 5]
        a_shape[shape_axis] = 1025
        a = np.random.rand(*a_shape).astype("float32")
        slices = [slice(None), slice(None), slice(None)]
        for i in range(a.shape[shape_axis]):
            slices[shape_axis] = slice(i)
            # Multidimensional indexing needs a tuple: a list of slices was
            # deprecated in NumPy 1.15 and is no longer interpreted as a
            # multidimensional index in recent NumPy releases.
            index = tuple(slices)
            fa = f(a[index])
            npa = np_func(a[index], axis=axis)
            utt.assert_allclose(npa, fa)

        # Use multiple GPU threadblocks (along accumulation axis)
        a_shape = [2, 2, 2]
        a_shape[shape_axis] = block_max_size + 2
        a = np.random.random(a_shape).astype("float32")
        utt.assert_allclose(np_func(a, axis=axis), f(a))

        # Use multiple GPU gridblocks (not along accumulation axis): enlarge
        # each of the two other dimensions in turn.
        for offset in (1, 2):
            a_shape = [5, 5, 5]
            a_shape[(shape_axis + offset) % 3] = self.max_grid_size1 + 1
            a = np.random.random(a_shape).astype("float32")
            if axis is None:
                # Avoid floating point error
                a = np.sign(a - 0.5).astype("float32")
            utt.assert_allclose(np_func(a, axis=axis), f(a))

        # Use recursive cumop (along accumulation axis)
        a_shape = [3, 3, 3]
        a_shape[shape_axis] = block_max_size * (block_max_size + 1) + 2
        a = np.random.random(a_shape).astype("float32")
        a = np.sign(a - 0.5).astype("float32")  # Avoid floating point error
        utt.assert_allclose(np_func(a, axis=axis), f(a))
def test_gemv_infershape(self):
    # Run the shared ``_compile_and_check`` helper on the gemv op, using the
    # values from ``self.gemv_data`` and ``self.gemv_class``.
    bias = fmatrix()
    weights = ftensor4()
    hidden = ftensor3()
    in_idx = imatrix()
    out_idx = imatrix()

    symbolic_inputs = [weights, hidden, in_idx, bias, out_idx]
    gemv_out = self.gemv_op(
        bias.take(out_idx, axis=0), weights, hidden, in_idx, out_idx
    )

    self._compile_and_check(
        symbolic_inputs,
        [gemv_out],
        self.gemv_data(),
        self.gemv_class,
    )
def test_dot_infershape(self):
    # Run the shared ``_compile_and_check`` helper on ``sparse_block_dot``,
    # using the values from ``self.gemv_data`` and ``self.gemv_class``.
    bias = fmatrix()
    weights = ftensor4()
    hidden = ftensor3()
    in_idx = imatrix()
    out_idx = imatrix()

    symbolic_inputs = [weights, hidden, in_idx, bias, out_idx]
    dot_out = sparse_block_dot(weights, hidden, in_idx, bias, out_idx)

    self._compile_and_check(
        symbolic_inputs,
        [dot_out],
        self.gemv_data(),
        self.gemv_class,
    )
def test_memory_reuse_gpudimshuffle(self):
    # Exercise scan's memory pre-allocation when one of the outputs comes
    # from a GpuDimshuffle: an optimization in GpuDimshuffle can make the
    # pre-allocation logic wrongly believe that a pre-allocated memory
    # region has been used when it has not.

    def inner_fn(seq1, recurrent_out):
        shifted = seq1 + recurrent_out.sum()
        return shifted.dimshuffle(1, 0), shifted.sum() + recurrent_out

    input1 = ftensor3()
    init = ftensor3()

    results, _ = scan(
        inner_fn,
        sequences=[input1],
        outputs_info=[None, init],
        mode=self.mode_with_gpu,
    )
    flat_outputs = [results[0].flatten(), results[1].flatten()]

    fct = aesara.function([input1, init], flat_outputs, mode=self.mode_with_gpu)

    seq_value = np.ones((2, 1, 1), dtype="float32")
    init_value = np.ones((1, 1, 1), dtype="float32")
    output = fct(seq_value, init_value)

    expected_output = (
        np.array([2, 4], dtype="float32"),
        np.array([3, 7], dtype="float32"),
    )
    utt.assert_allclose(output, expected_output)
def test_blocksparse_inplace_gemv_opt():
    # The last node of the compiled ``sparse_block_dot`` graph must be
    # in-place, except under FAST_COMPILE where it must not be.
    bias = fmatrix()
    weights = ftensor4()
    hidden = ftensor3()
    in_idx = lmatrix()
    out_idx = lmatrix()

    result = sparse_block_dot(weights, hidden, in_idx, bias, out_idx)
    f = aesara.function([weights, hidden, in_idx, bias, out_idx], result)

    if aesara.config.mode != "FAST_COMPILE":
        assert f.maker.fgraph.toposort()[-1].op.inplace
        assert check_stack_trace(f, ops_to_check=[sparse_block_gemv_inplace])
    else:
        assert not f.maker.fgraph.toposort()[-1].op.inplace
        assert check_stack_trace(f, ops_to_check=[sparse_block_gemv])
def test_blocksparse_inplace_outer_opt():
    # Compile ``sparse_block_dot`` together with its gradient w.r.t. W; the
    # last node of the graph must be in-place, except under FAST_COMPILE
    # where it must not be.
    bias = fmatrix()
    weights = ftensor4()
    hidden = ftensor3()
    in_idx = lmatrix()
    out_idx = lmatrix()

    result = sparse_block_dot(weights, hidden, in_idx, bias, out_idx)
    grad_wrt_weights = aesara.gradient.grad(result.sum(), wrt=weights)
    f = aesara.function(
        [weights, hidden, in_idx, bias, out_idx], [result, grad_wrt_weights]
    )

    if aesara.config.mode != "FAST_COMPILE":
        assert f.maker.fgraph.toposort()[-1].op.inplace
        assert check_stack_trace(f, ops_to_check=sparse_block_outer_inplace)
    else:
        assert not f.maker.fgraph.toposort()[-1].op.inplace
        assert check_stack_trace(f, ops_to_check=sparse_block_outer)
def test_sparseblockgemv_grad_shape(self):
    # Only checks that the gradient graph runs and that each gradient has
    # the same shape as the value it was taken with respect to.
    bias = fmatrix()
    weights = ftensor4()
    hidden = ftensor3()
    in_idx = imatrix()
    out_idx = imatrix()

    gemv_out = self.gemv_op(
        bias.take(out_idx, axis=0), weights, hidden, in_idx, out_idx
    )
    gradients = aesara.grad(gemv_out.sum(), [bias, weights, hidden])

    f = aesara.function(
        [weights, hidden, in_idx, bias, out_idx], gradients, mode=self.mode
    )

    W_val, h_val, iIdx_val, b_val, oIdx_val = self.gemv_data()
    b_g, W_g, h_g = f(W_val, h_val, iIdx_val, b_val, oIdx_val)

    for gradient, reference in ((b_g, b_val), (h_g, h_val), (W_g, W_val)):
        assert gradient.shape == reference.shape
def test_sparseblockgemv(self):
    # Check the aesara implementation of sparseblockgemv against the NumPy
    # one.
    bias = fmatrix()
    weights = ftensor4()
    hidden = ftensor3()
    in_idx = imatrix()
    out_idx = imatrix()

    gemv_out = self.gemv_op(
        bias.take(out_idx, axis=0), weights, hidden, in_idx, out_idx
    )
    compiled = aesara.function(
        [weights, hidden, in_idx, bias, out_idx], gemv_out, mode=self.mode
    )

    W_val, h_val, iIdx_val, b_val, oIdx_val = self.gemv_data()

    actual = compiled(W_val, h_val, iIdx_val, b_val, oIdx_val)
    expected = self.gemv_numpy(
        b_val.take(oIdx_val, axis=0), W_val, h_val, iIdx_val, oIdx_val
    )

    utt.assert_allclose(expected, actual)
def test_blocksparse_grad_merge(self):
    # The learning-rate update of a shared W must give the same result
    # whether or not the blocksparse merge optimizations are applied.
    bias = fmatrix()
    hidden = ftensor3()
    in_idx = lmatrix()
    out_idx = lmatrix()
    symbolic_inputs = [hidden, in_idx, bias, out_idx]

    W_val, h_val, iIdx_val, b_val, oIdx_val = self.gemv_data()
    run_args = (h_val, iIdx_val, b_val, oIdx_val)

    W = gpuarray_shared_constructor(W_val, context=test_ctx_name)

    gemv_out = gpu_sparse_block_gemv(
        bias.take(out_idx, axis=0), W, hidden, in_idx, out_idx
    )
    grad_W = aesara.grad(gemv_out.sum(), W)

    learning_rate = np.asarray(0.05, dtype="float32")
    new_W = W - learning_rate * grad_W

    merged_fn = aesara.function(
        symbolic_inputs, updates=[(W, new_W)], mode=mode_with_gpu
    )
    # Make sure the lr update was merged.
    assert isinstance(
        merged_fn.maker.fgraph.outputs[0].owner.op, GpuSparseBlockOuter
    )

    # Exclude the merge optimizations.
    unmerged_mode = mode_with_gpu.excluding(
        "local_merge_blocksparse_alpha"
    ).excluding("local_merge_blocksparse_output")
    unmerged_fn = aesara.function(
        symbolic_inputs, updates=[(W, new_W)], mode=unmerged_mode
    )
    # Make sure the lr update is not merged.
    assert not isinstance(
        unmerged_fn.maker.fgraph.outputs[0].owner.op, GpuSparseBlockOuter
    )

    unmerged_fn(*run_args)
    W_ref = W.get_value()

    # Put the shared variable back to its initial value before the second run.
    W.set_value(W_val)
    merged_fn(*run_args)
    W_opt = W.get_value()

    utt.assert_allclose(W_ref, W_opt)
def test_gpu_memory_usage(self):
    # Validate that the memory usage of the aesara function defined below is
    # reasonable when it is executed on the GPU. This guards against a bug
    # where one of scan's optimizations was not applied, which made the scan
    # node compute large and unnecessary outputs and brought GPU memory
    # usage to ~12G.

    n_in = 100  # Dimensionality of the input data (not one-hot coded)
    n_out = 100  # Dimensionality of the output data (not one-hot coded)
    n_hid = 4000  # Number of neurons in the hidden layer
    mb_size = 2  # Number of minibatches
    mb_length = 200  # Time steps in a minibatch

    # Input variables
    xin = ftensor3(name="xin")
    yout = ftensor3(name="yout")

    # Network parameters, all initialized to zero
    def zero_shared(shape, name):
        return aesara.shared(np.zeros(shape, dtype="float32"), name=name)

    U = zero_shared((n_in, n_hid), "W_xin_to_l1")
    V = zero_shared((n_hid, n_hid), "W_l1_to_l1")
    W = zero_shared((n_hid, n_out), "W_l1_to_l2")
    nparams = [U, V, W]

    # Forward pass
    l1_base = dot(xin, U)

    def scan_l(baseline, last_step):
        return baseline + dot(last_step, V)

    zero_output = aet.alloc(np.asarray(0.0, dtype="float32"), mb_size, n_hid)
    l1_out, _ = scan(
        scan_l,
        sequences=[l1_base],
        outputs_info=[zero_output],
        mode=self.mode_with_gpu_nodebug,
    )
    l2_out = dot(l1_out, W)

    # Cost and its gradient with respect to the parameters
    cost = tt_sum((l2_out - yout) ** 2)
    grads = aesara.grad(cost, nparams)
    updates = [(param, param - grad) for param, grad in zip(nparams, grads)]

    # Compile the aesara function
    feval_backprop = aesara.function(
        [xin, yout],
        cost,
        updates=updates,
        mode=self.mode_with_gpu_nodebug,
    )

    # Validate that the PushOutScanOutput optimization has been applied by
    # checking the number of outputs of the grad Scan node in the compiled
    # function.
    scan_nodes = [
        node
        for node in feval_backprop.maker.fgraph.toposort()
        if isinstance(node.op, Scan)
    ]

    # The grad scan is always the 2nd one according to toposort. If the
    # optimization has been applied, it has 2 outputs, otherwise 3.
    grad_scan_node = scan_nodes[1]
    n_grad_outputs = len(grad_scan_node.outputs)
    assert n_grad_outputs == 2, n_grad_outputs

    # Call the aesara function to ensure the absence of a memory error
    x_value = np.zeros((mb_length, mb_size, n_in), dtype="float32")
    y_value = np.zeros((mb_length, mb_size, n_out), dtype="float32")
    feval_backprop(x_value, y_value)