def full(self, X, Xs=None):
    X, Xs = self._slice(X, Xs)
    scf_x = self.scaling_func(X, self.args)
    if Xs is None:
        return at.outer(scf_x, scf_x) * self.cov_func(X)
    else:
        scf_xs = self.scaling_func(Xs, self.args)
        return at.outer(scf_x, scf_xs) * self.cov_func(X, Xs)
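# Sketch (not from the source): why at.outer appears here. A scaled covariance
# is k'(x, z) = s(x) * k(x, z) * s(z), and outer(s(X), s(Z)) builds the matrix
# [s(x_i) * s(z_j)] that scales the base covariance elementwise. NumPy
# illustration with a hypothetical scaling function:
import numpy as np

def scaling_func(X):
    return 1.0 + X.ravel() ** 2  # hypothetical s(x)

X = np.linspace(0, 1, 4)[:, None]
K = np.exp(-0.5 * (X - X.T) ** 2)  # base covariance k(x_i, x_j)
s = scaling_func(X)
K_scaled = np.outer(s, s) * K      # [s(x_i) * s(x_j) * k(x_i, x_j)]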
def forward(self):
    z = self.z0  # sxd
    u = self.u_  # d
    w = self.w_  # d
    b = self.b  # .
    h = self.h  # f
    # h(sxd \dot d + .) = s
    if not self.batched:
        hwz = h(z.dot(w) + b)  # s
        # sxd + (s \outer d) = sxd
        z1 = z + aet.outer(hwz, u)  # sxd
        return z1
    else:
        z = z.swapaxes(0, 1)
        # z bxsxd
        # u bxd
        # w bxd
        b = b.dimshuffle(0, "x")  # b bx-
        hwz = h(aet.batched_dot(z, w) + b)  # bxs
        # bxsxd + (bxsx- * bx-xd) = bxsxd
        hwz = hwz.dimshuffle(0, 1, "x")  # bxsx-
        u = u.dimshuffle(0, "x", 1)  # bx-xd
        z1 = z + hwz * u  # bxsxd
        return z1.swapaxes(0, 1)  # sxbxd
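# Sketch (not from the source): the non-batched branch is a planar flow,
# f(z) = z + u * h(w^T z + b), applied to each of the s rows of z;
# outer(hwz, u) adds the direction u to every row, scaled by that row's
# h value. NumPy illustration assuming tanh for h:
import numpy as np

s, d = 5, 3
rng = np.random.default_rng(0)
z = rng.standard_normal((s, d))
u = rng.standard_normal(d)
w = rng.standard_normal(d)
b = 0.1
hwz = np.tanh(z.dot(w) + b)   # shape (s,): h(w^T z_i + b)
z1 = z + np.outer(hwz, u)     # shape (s, d): each row shifted along u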
def test_profiling(self):
    config1 = config.profile
    config2 = config.profile_memory
    config3 = config.profiling__min_peak_memory
    try:
        config.profile = True
        config.profile_memory = True
        config.profiling__min_peak_memory = True

        x = [fvector("val%i" % i) for i in range(3)]

        z = []
        z += [aet.outer(x[i], x[i + 1]).sum(axis=1) for i in range(len(x) - 1)]
        z += [x[i] + x[i + 1] for i in range(len(x) - 1)]

        p = ProfileStats(False, gpu_checks=False)

        if config.mode in ["DebugMode", "DEBUG_MODE", "FAST_COMPILE"]:
            m = "FAST_RUN"
        else:
            m = None

        f = function(x, z, profile=p, name="test_profiling", mode=m)

        inp = [np.arange(1024, dtype="float32") + 1 for i in range(len(x))]
        f(*inp)

        buf = StringIO()
        f.profile.summary(buf)

        # Regression testing for future algorithm speedups
        the_string = buf.getvalue()
        lines1 = [l for l in the_string.split("\n") if "Max if linker" in l]
        lines2 = [l for l in the_string.split("\n") if "Minimum peak" in l]
        if config.device == "cpu":
            assert "CPU: 4112KB (4104KB)" in the_string, (lines1, lines2)
            assert "CPU: 8204KB (8196KB)" in the_string, (lines1, lines2)
            assert "CPU: 8208KB" in the_string, (lines1, lines2)
            assert (
                "Minimum peak from all valid apply node order is 4104KB"
                in the_string
            ), (lines1, lines2)
        else:
            assert "CPU: 16KB (16KB)" in the_string, (lines1, lines2)
            assert "GPU: 8204KB (8204KB)" in the_string, (lines1, lines2)
            assert "GPU: 12300KB (12300KB)" in the_string, (lines1, lines2)
            assert "GPU: 8212KB" in the_string, (lines1, lines2)
            assert (
                "Minimum peak from all valid apply node order is 4116KB"
                in the_string
            ), (lines1, lines2)
    finally:
        config.profile = config1
        config.profile_memory = config2
        config.profiling__min_peak_memory = config3
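# Sketch (assumption: the standard aesara.function API): outside of tests, the
# same profile can be collected by passing profile=True instead of building a
# ProfileStats by hand, then printing the summary the test above parses.
import numpy as np
import aesara
import aesara.tensor as aet

x = aet.fvector("x")
y = aet.fvector("y")
f = aesara.function([x, y], aet.outer(x, y).sum(), profile=True)
f(np.ones(8, dtype="float32"), np.ones(8, dtype="float32"))
f.profile.summary()  # timing and memory breakdown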
def test_A_plus_scaled_outer(self):
    skip_if_blas_ldflags_empty()
    f = self.function(
        [self.A, self.x, self.y], self.A + 0.1 * at.outer(self.x, self.y)
    )
    self.assertFunctionContains(f, CGer(destructive=False))
    self.run_f(f)  # DebugMode tests correctness
def L_op(self, inputs, outputs, output_gradients):
    # Modified from aesara/tensor/slinalg.py
    A, b = inputs
    c = outputs[0]
    c_bar = output_gradients[0]
    # FIXME: could a triangular A use GpuCublasTriangularSolve here?
    # Is there no need to handle A_structure as slinalg.py does?
    trans_solve_op = GpuCusolverSolve("general")
    b_bar = trans_solve_op(A.T, c_bar)
    A_bar = -tensor.outer(b_bar, c) if c.ndim == 1 else -b_bar.dot(c.T)
    return [A_bar, b_bar]
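# Sketch (not from the source): where the outer product comes from. For
# c = A^{-1} b with cotangent c_bar, reverse mode gives b_bar = A^{-T} c_bar
# and A_bar = -b_bar c^T, which for a vector c is -outer(b_bar, c). NumPy
# finite-difference check, assuming the scalar loss L = sum(c):
import numpy as np

rng = np.random.default_rng(0)
A = rng.standard_normal((3, 3)) + 3 * np.eye(3)
b = rng.standard_normal(3)
c = np.linalg.solve(A, b)
c_bar = np.ones(3)                   # dL/dc for L = sum(c)
b_bar = np.linalg.solve(A.T, c_bar)  # A^{-T} c_bar
A_bar = -np.outer(b_bar, c)

eps = 1e-6
E = np.zeros((3, 3))
E[0, 1] = eps
num = (np.linalg.solve(A + E, b).sum() - np.linalg.solve(A - E, b).sum()) / (2 * eps)
assert np.isclose(num, A_bar[0, 1], atol=1e-4)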
def full(self, X, Xs=None):
    X, Xs = self._slice(X, Xs)
    rx = self.lfunc(at.as_tensor_variable(X), self.args)
    if Xs is None:
        rz = self.lfunc(at.as_tensor_variable(X), self.args)
        r2 = self.square_dist(X, X)
    else:
        rz = self.lfunc(at.as_tensor_variable(Xs), self.args)
        r2 = self.square_dist(X, Xs)
    rx2 = at.reshape(at.square(rx), (-1, 1))
    rz2 = at.reshape(at.square(rz), (1, -1))
    return at.sqrt((2.0 * at.outer(rx, rz)) / (rx2 + rz2)) * at.exp(
        -1.0 * r2 / (rx2 + rz2)
    )
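# Sketch (not from the source): this is the Gibbs non-stationary kernel,
# k(x, z) = sqrt(2 l(x) l(z) / (l(x)^2 + l(z)^2)) * exp(-(x - z)^2 / (l(x)^2 + l(z)^2)),
# with at.outer(rx, rz) supplying the l(x_i) * l(z_j) grid. NumPy illustration
# with a hypothetical lengthscale function:
import numpy as np

def lfunc(X):
    return 0.5 + X.ravel() ** 2  # hypothetical l(x) > 0

X = np.linspace(-1, 1, 5)[:, None]
rx = lfunc(X)
r2 = (X - X.T) ** 2
rx2 = (rx ** 2)[:, None]  # column of l(x_i)^2
rz2 = (rx ** 2)[None, :]  # row of l(x_j)^2
K = np.sqrt(2.0 * np.outer(rx, rx) / (rx2 + rz2)) * np.exp(-r2 / (rx2 + rz2))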
def __init__(self, v=None, **kwargs):
    super().__init__(**kwargs)
    v = self.add_param(v, "v")
    self.shared_params = dict(v=v)
    if self.batched:
        vv = v.dimshuffle(0, 1, "x") * v.dimshuffle(0, "x", 1)
        I = aet.eye(self.dim).dimshuffle("x", 0, 1)
        vvn = (1e-10 + (v**2).sum(-1)).dimshuffle(0, "x", "x")
    else:
        vv = aet.outer(v, v)
        I = aet.eye(self.dim)
        vvn = (v**2).sum(-1) + 1e-10
    self.H = I - 2.0 * vv / vvn
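# Sketch (not from the source): H = I - 2 v v^T / (v^T v) is a Householder
# reflection; it is orthogonal and maps v to -v. The 1e-10 term above guards
# the division when v is near zero. NumPy check of the non-batched branch:
import numpy as np

v = np.random.default_rng(0).standard_normal(4)
H = np.eye(4) - 2.0 * np.outer(v, v) / (v ** 2).sum()
assert np.allclose(H @ H.T, np.eye(4))  # orthogonal
assert np.allclose(H @ v, -v)           # reflects v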
def grad(self, inputs, cost_grad):
    """
    In defining the gradient, the Finite Fourier Transform is viewed as
    a complex-differentiable function of a complex variable.
    """
    a = inputs[0]
    n = inputs[1]
    axis = inputs[2]
    grad = cost_grad[0]
    if not isinstance(axis, tensor.TensorConstant):
        raise NotImplementedError(
            "%s: gradient is currently implemented"
            " only for axis being an Aesara constant" % self.__class__.__name__
        )
    axis = int(axis.data)
    # Notice that the number of actual elements in wrto is independent of
    # possible padding or truncation:
    elem = tensor.arange(0, tensor.shape(a)[axis], 1)
    # Accounts for padding:
    freq = tensor.arange(0, n, 1)
    outer = tensor.outer(freq, elem)
    pow_outer = tensor.exp(((-2 * math.pi * 1j) * outer) / (1.0 * n))
    res = tensor.tensordot(grad, pow_outer, (axis, 0))

    # This would be simpler but is not implemented by aesara:
    # res = tensor.switch(tensor.lt(n, tensor.shape(a)[axis]),
    #                     tensor.set_subtensor(res[..., n::], 0, False, False),
    #                     res)
    # Instead we resort to the following to account for truncation:
    flip_shape = list(np.arange(0, a.ndim)[::-1])
    res = res.dimshuffle(flip_shape)
    res = tensor.switch(
        tensor.lt(n, tensor.shape(a)[axis]),
        tensor.set_subtensor(res[n::], 0, False, False),
        res,
    )
    res = res.dimshuffle(flip_shape)

    # Ensures that the gradient shape conforms to the input shape:
    out_shape = (
        list(np.arange(0, axis)) + [a.ndim - 1] + list(np.arange(axis, a.ndim - 1))
    )
    res = res.dimshuffle(*out_shape)
    return [res, None, None]
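# Sketch (not from the source): pow_outer above is the DFT matrix
# W[k, n] = exp(-2*pi*i*k*n / N), generated as the elementwise exp of an
# outer product of the frequency and element index vectors. NumPy check:
import numpy as np

N = 8
W = np.exp(-2j * np.pi * np.outer(np.arange(N), np.arange(N)) / N)
x = np.random.default_rng(0).standard_normal(N)
assert np.allclose(W @ x, np.fft.fft(x))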
def L_op(self, inputs, outputs, output_gradients):
    # Modified from aesara/tensor/slinalg.py
    A, b = inputs
    c = outputs[0]
    c_bar = output_gradients[0]
    trans_solve_op = GpuCublasTriangularSolve(not self.lower)
    b_bar = trans_solve_op(A.T, c_bar)
    A_bar = -tensor.outer(b_bar, c) if c.ndim == 1 else -b_bar.dot(c.T)
    if self.lower:
        A_bar = tensor.tril(A_bar)
    else:
        A_bar = tensor.triu(A_bar)
    return [A_bar, b_bar]
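# Sketch (not from the source): same cotangent algebra as the general solve
# above, but only the used triangle of A affects c, so the gradient is
# projected back onto that triangle with tril/triu. NumPy illustration using
# SciPy's triangular solver (assumption: scipy available):
import numpy as np
from scipy.linalg import solve_triangular

rng = np.random.default_rng(1)
A = np.tril(rng.standard_normal((3, 3))) + 3 * np.eye(3)
b = rng.standard_normal(3)
c = solve_triangular(A, b, lower=True)
c_bar = np.ones(3)
b_bar = solve_triangular(A.T, c_bar, lower=False)  # transpose solve is upper
A_bar = np.tril(-np.outer(b_bar, c))               # keep only the lower triangle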
def test_not_inplace():
    # Test that we can remove optimizers, including inplace optimizers
    nan_detected = [False]

    def detect_nan(fgraph, i, node, fn):
        for output in fn.outputs:
            if np.isnan(output[0]).any():
                print("*** NaN detected ***")
                debugprint(node)
                print("Inputs : %s" % [input[0] for input in fn.inputs])
                print("Outputs: %s" % [output[0] for output in fn.outputs])
                nan_detected[0] = True
                break

    x = vector("x")
    mode = MonitorMode(post_func=detect_nan)
    # mode = mode.excluding('fusion', 'inplace')
    mode = mode.excluding("local_elemwise_fusion", "inplace_elemwise_optimizer")
    o = outer(x, x)
    out = log(o) * o
    f = function([x], [out], mode=mode)

    # Test that the fusion wasn't done
    assert len(f.maker.fgraph.apply_nodes) == 5
    assert not f.maker.fgraph.toposort()[-1].op.destroy_map
    try:
        old_stdout = sys.stdout
        sys.stdout = StringIO()
        f([0, 0])  # log(0) * 0 = -inf * 0 = NaN
    finally:
        sys.stdout = old_stdout

    # Test that we still detect the NaN
    assert nan_detected[0]
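# Sketch (assumption: MonitorMode is importable from aesara.compile.monitormode,
# as the test above implies): the same NaN hook can be used on its own,
# outside the rewrite-exclusion test.
import numpy as np
import aesara
import aesara.tensor as at
from aesara.compile.monitormode import MonitorMode

def detect_nan(fgraph, i, node, fn):
    for output in fn.outputs:
        if np.isnan(output[0]).any():
            print("NaN produced by", node)

x = at.vector("x")
f = aesara.function([x], at.log(x) * x, mode=MonitorMode(post_func=detect_nan))
f(np.array([0.0], dtype=aesara.config.floatX))  # log(0) * 0 = NaN, triggers the hook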
def test_optimization_pipeline_float(self):
    skip_if_blas_ldflags_empty()
    self.manual_setup_method("float32")
    f = self.function([self.x, self.y], aet.outer(self.x, self.y))
    self.assertFunctionContains(f, CGer(destructive=True))
    f(self.xval, self.yval)  # DebugMode tests correctness
def test_int_fails(self):
    self.manual_setup_method("int32")
    f = self.function([self.x, self.y], aet.outer(self.x, self.y))
    self.assertFunctionContains0(f, CGer(destructive=True))
    self.assertFunctionContains0(f, CGer(destructive=False))
def test_outer(self):
    f = self.function([self.x, self.y], tensor.outer(self.x, self.y))
    self.assertFunctionContains(f, ScipyGer(destructive=True))
def flat_outer(a, b):
    return at.outer(a, b).ravel()
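# Sketch (not from the source): for 1-d inputs, outer(a, b).ravel() equals
# the Kronecker product of the two vectors. NumPy check:
import numpy as np

a = np.array([1.0, 2.0])
b = np.array([3.0, 4.0, 5.0])
assert np.array_equal(np.outer(a, b).ravel(), np.kron(a, b))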
def test_A_plus_scaled_outer(self):
    f = self.function(
        [self.A, self.x, self.y], self.A + 0.1 * tensor.outer(self.x, self.y)
    )
    self.assertFunctionContains(f, ScipyGer(destructive=False))
    self.run_f(f)  # DebugMode tests correctness
def test_scaled_A_plus_scaled_outer(self):
    f = self.function(
        [self.A, self.x, self.y],
        0.2 * self.A + 0.1 * tensor.outer(self.x, self.y),
    )
    self.assertFunctionContains(f, gemm_no_inplace)
    self.run_f(f)  # DebugMode tests correctness
def test_optimization_pipeline(self):
    skip_if_blas_ldflags_empty()
    f = self.function([self.x, self.y], at.outer(self.x, self.y))
    self.assertFunctionContains(f, CGer(destructive=True))
    f(self.xval, self.yval)  # DebugMode tests correctness
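# Sketch (not from the source): the CGer/ScipyGer rewrites exercised by these
# tests map A + alpha * outer(x, y) onto the BLAS rank-1 update GER.
# Equivalent NumPy/SciPy illustration (assumption: scipy available):
import numpy as np
from scipy.linalg.blas import dger

x = np.arange(3, dtype="float64")
y = np.arange(4, dtype="float64")
A = np.ones((3, 4), order="F")
out = dger(0.1, x, y, a=A)  # A + 0.1 * outer(x, y), the GER update
assert np.allclose(out, A + 0.1 * np.outer(x, y))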