def test_bin_op(f):
    """Compare a binary op on CUDA ``ph.Tensor`` operands with the same op on NumPy.

    ``f`` is called twice per shape: once with two ``ph.Tensor`` objects that
    were moved with ``.cuda()``, and once with the two NumPy arrays they were
    built from. The first result is brought back with ``.cpu().np()`` and must
    match the second under ``torch.testing.assert_allclose``.
    """
    shapes = ((1,), (2, 2), (128, 128))
    for shape in shapes:
        # Draw the left operand first, then the right one.
        host_operands = [np.random.randn(*shape) for _ in range(2)]
        device_operands = [ph.Tensor(arr).cuda() for arr in host_operands]

        actual = f(*device_operands).cpu().np()
        expected = f(*host_operands)
        torch.testing.assert_allclose(actual, expected)
def test_broadcast():
    """Check ``broadcast_like`` against ``np.broadcast_to`` for several target shapes.

    A one-element array is wrapped in a ``ph.Tensor``, moved with ``.cuda()``
    and broadcast to the shape of a second tensor; the result, read back with
    ``.cpu().np()``, must match NumPy's broadcast of the same source array.
    """
    for target_shape in ((1,), (2, 2), (128, 128)):
        source_np = np.random.randn(1)
        template_np = np.random.randn(*target_shape)

        source_dev = ph.Tensor(source_np).cuda()
        template_dev = ph.Tensor(template_np).cuda()

        actual = source_dev.broadcast_like(template_dev).cpu().np()
        expected = np.broadcast_to(source_np, target_shape)
        torch.testing.assert_allclose(actual, expected)
# Test expression: ((a * b + a) * a) + b, built only from `*` and `+` so it
# works on any operand type that overloads those operators.
def f3(a, b):
    return (a * b + a) * a + b
# Test expression with a long dependency chain: starts from a + b and
# multiplies by `a` one hundred times, i.e. a**100 * (a + b) via repeated `*`.
def f4(a, b):
    c = a + b
    for _ in range(100):
        c = a * c
    return c
# f0, f1 and f2 are defined outside the visible lines.
funcs = [f0, f1, f2, f3, f4]
# For every expression, compare the gradient w.r.t. `a` produced by pyhma with
# the one produced by PyTorch autograd. `a_`, `b_` and `ones` come from code
# outside the visible lines.
for func in funcs:
    # pyhma side: c.grad(a) returns a callable that is applied to a tensor
    # built from `ones`; the result is converted to NumPy.
    a = ph.Tensor(a_)
    b = ph.Tensor(b_)
    c = func(a, b)
    g = c.grad(a)
    a_grad = g(ph.Tensor(ones)).np()
    # PyTorch side: same inputs, only `a` tracks gradients; `ones` is passed
    # to backward() as the upstream gradient.
    a = torch.tensor(a_)
    a.requires_grad = True
    b = torch.tensor(b_)
    c = func(a, b)
    c.backward(torch.tensor(ones))
    torch.testing.assert_allclose(a_grad, a.grad.numpy())
# `_` is used as a real loop variable here: it takes the values 0..3 and sets
# both dimensions of the random float32 matrix. The rest of this loop body is
# not visible here.
for _ in range(4):
    a_np = np.random.randn(_, _).astype(np.float32)
import hma
import pyhma as ph
import numpy as np
# NOTE(review): `torch` is referenced below but no import is visible in this
# part of the file — confirm it is imported elsewhere.
hma.set_debug(True)
# Sweep every (A, B, C) combination of the sizes below and compare each hma
# matrix-multiply variant with the NumPy `@` result on float32 inputs.
for A in [2, 5, 16, 27]:
    for B in [2, 5, 16, 27]:
        for C in [2, 5, 16, 27]:
            # mm_nn: (A, B) with (B, C), checked against anp @ bnp.
            anp = np.random.randn(A, B).astype(np.float32)
            bnp = np.random.randn(B, C).astype(np.float32)
            a = ph.Tensor(anp).cuda()
            b = ph.Tensor(bnp).cuda()
            # The hma op takes a list of cTensor handles; element 0 of its
            # result is wrapped back into a ph.Tensor.
            c = ph.Tensor(hma.mm_nn([a.cTensor, b.cTensor])[0])
            torch.testing.assert_allclose(c.cpu().np(), (anp) @ (bnp))
            # mm_nt: (A, B) with (C, B), checked against anp @ bnp.T.
            anp = np.random.randn(A, B).astype(np.float32)
            bnp = np.random.randn(C, B).astype(np.float32)
            a = ph.Tensor(anp).cuda()
            b = ph.Tensor(bnp).cuda()
            c = ph.Tensor(hma.mm_nt([a.cTensor, b.cTensor])[0])
            torch.testing.assert_allclose(c.cpu().np(), (anp) @ (bnp.T))
            # mm_tn: (B, A) with (B, C). The check for this result is not
            # visible here; presumably it compares against anp.T @ bnp —
            # TODO confirm.
            anp = np.random.randn(B, A).astype(np.float32)
            bnp = np.random.randn(B, C).astype(np.float32)
            a = ph.Tensor(anp).cuda()
            b = ph.Tensor(bnp).cuda()
            c = ph.Tensor(hma.mm_tn([a.cTensor, b.cTensor])[0])
def test_sum():
    """Check ``ph.Tensor.sum`` on CUDA against ``numpy.ndarray.sum``.

    For each shape a random array is wrapped in a ``ph.Tensor``, moved with
    ``.cuda()`` and summed; the result, read back with ``.cpu().np()``, must
    match the NumPy sum of the same array.
    """
    for shape in ((1,), (2, 2), (128, 128)):
        host_values = np.random.randn(*shape)
        device_total = ph.Tensor(host_values).cuda().sum()
        actual = device_total.cpu().np()
        expected = host_values.sum()
        torch.testing.assert_allclose(actual, expected)
def test_unary_op(f, *args):
    """Compare a unary op on a CUDA ``ph.Tensor`` with the same op on NumPy.

    Args:
        f: Callable applied both to a ``ph.Tensor`` and to the NumPy array it
            was built from.
        *args: Extra positional arguments forwarded to ``f`` after the tensor
            or array.

    The extra arguments are passed to BOTH calls. Previously the NumPy
    reference was computed as ``f(an)`` without them, so an op that takes
    additional arguments was compared against a different computation (or
    raised ``TypeError`` for a missing argument).
    """
    for s in [(1,), (2, 2), (128, 128)]:
        an = np.random.randn(*s)
        ap = ph.Tensor(an).cuda()
        cp = f(ap, *args).cpu().np()
        torch.testing.assert_allclose(cp, f(an, *args))
# Correctness check and PyTorch timing baseline for a bias-free linear layer:
# y = x @ w.T with x of shape (N, C_i) and w of shape (C_o, C_i).
device = torch.device('cuda', 0)
N = 64
C_i = 128
C_o = 32
xnp = np.random.randn(N, C_i).astype(np.float32)
wnp = np.random.randn(C_o, C_i).astype(np.float32)
# snp / st / s are not used in the visible lines; they are kept because later
# code may read them.
snp = np.random.randn(N, C_o).astype(np.float32)

# PyTorch reference on the GPU.
xt = torch.tensor(xnp).to(device)
wt = torch.tensor(wnp).to(device)
st = torch.tensor(snp).to(device)
yt = torch.nn.functional.linear(xt, wt)

# Same computation through hma: mm_nt takes a list of cTensor handles and
# element 0 of its result is wrapped back into a ph.Tensor.
x = ph.Tensor(xnp).cuda()
w = ph.Tensor(wnp).cuda()
s = ph.Tensor(snp).cuda()
y = ph.Tensor(hma.mm_nt([x.cTensor, w.cTensor])[0])
ynp = y.cpu().np()
# Reuse the host copy made above instead of copying `y` back a second time.
torch.testing.assert_allclose(ynp, yt.cpu())
print("passed.")

# Time 10000 PyTorch linear calls. The .cpu() copy of the last result happens
# before the clock is read, so it is included in the measured time.
t = time.perf_counter()
for i in range(10000):
    _ = torch.nn.functional.linear(xt, wt)
print(_.cpu().numpy().shape)
print(time.perf_counter() - t)
import numpy as np
import hma
import pyhma as ph
# Part 1: drive the low-level hma ops directly. Each op is called with a list
# of cTensor handles and element 0 of its result is used.
a = np.random.randn(16)
a_ = ph.Tensor(a)
b = hma.to_cuda([a_.cTensor])[0]
c = hma.mul([b, b])[0]
d = hma.to_cpu([c])[0]
# The round trip through to_cuda -> mul -> to_cpu must equal a * a in NumPy.
assert np.allclose(ph.Tensor(d).np(), a * a)
# Part 2: the same kind of check through the ph.Tensor operator interface.
# `a` and `b` are rebound from NumPy arrays to ph.Tensor objects below, so the
# NumPy reference product is computed first.
a = np.random.randn(16)
b = np.random.randn(16)
c_ref = a * b
a = ph.Tensor(a).cuda()
b = ph.Tensor(b).cuda()
c = a * b
assert np.allclose(c.cpu().np(), c_ref)
# Part 3: sum reduction. `_` holds real data here (the 128 random inputs).
# The two sums are only printed side by side, not asserted.
_ = np.random.randn(128)
k = ph.Tensor(_).cuda().sum()
print(k.cpu().np(), _.sum())
import numpy as np
import hma
import pyhma as ph
import torch
import time
# Micro-benchmark: run the update x = (x + x) / 2 `iters` times on a 1x1
# value, once with torch tensors ("PT") and once with ph tensors ("PH"), and
# print the elapsed wall-clock time of each loop.
iters = 10000
a = np.random.randn(1,1)
a_ph = ph.Tensor(a)
a_t = torch.tensor(a)
# h2 is not used in the visible lines.
h2 = ph.Tensor(np.array(2.0)).broadcast_like(a_ph)
# Untimed pass over the torch expression; each result is discarded by binding
# it to `_` (presumably a warm-up — TODO confirm).
for _ in range(iters):
    _ = (a_t + a_t) / 2
# Timed torch loop. The print of the final value runs before the elapsed time
# is computed, so it is included in the reported figure.
t = time.time()
for _ in range(iters):
    a_t = (a_t + a_t) / 2
print(a_t.numpy())
print("PT: ", time.time() - t)
# Timed pyhma loop, measured the same way.
t = time.time()
for _ in range(iters):
    a_ph = (a_ph + a_ph) / 2
print(a_ph.np())
print("PH: ", time.time() - t)
# Settings for a following benchmark whose body is not visible here.
size = 128
iters = 20000
print()