Esempio n. 1
0
    def _oper_cpu(cls, x, pz, ps, parameter):
        """Compute one recurrent forward step on the CPU.

        Args:
            x: Input of this step; ``x.shape[0]`` sizes the default states.
            pz: Previous output, or ``None`` to start from zeros.
            ps: Previous internal state, or ``None`` to start from zeros.
            parameter: Mapping read at the keys ``"w"``, ``"wr"`` and ``"b"``.

        Returns:
            The node made by ``cls._create_node`` from the new output, with
            the intermediates of this step stored on ``attrs``.
        """
        if ps is None:
            prev_state = np.zeros(
                (x.shape[0], parameter["w"].shape[1] // 4), dtype=precision)
        else:
            prev_state = ps

        if pz is None:
            prev_out = np.zeros(
                (x.shape[0], parameter["w"].shape[1] // 4), dtype=precision)
        else:
            prev_out = pz

        pre_act = dot(x, parameter["w"]) + dot(prev_out, parameter["wr"]) + parameter["b"]
        width = pre_act.shape[1] // 4

        # The first `width` columns form the candidate; the remaining columns
        # are the gates, kept together in one array.
        candidate, gates = np.split(pre_act, [width], axis=1)
        candidate = tanh(candidate)
        gates = sigmoid(gates)

        # Gate columns: [0, width) scales the previous state,
        # [width, 2 * width) scales the candidate, the rest scales the output.
        state = gates[:, width:width * 2] * candidate + gates[:, :width] * prev_state
        out = tanh(state) * gates[:, width * 2:]

        ret = cls._create_node(out)
        ret.attrs._x = x
        ret.attrs._p = parameter
        ret.attrs._u = candidate
        ret.attrs._pstate = ps
        ret.attrs._state = state
        ret.attrs._gated = gates
        ret.attrs._dt_d = [parameter[key] for key in ("wr", "w")]
        ret._state = state

        # Record this step's first gate slice on the previous node, if any.
        if isinstance(pz, Node):
            pz.attrs._pfgate = gates[:, :width]

        return ret
Esempio n. 2
0
    def _oper_cpu(cls, x, pz, ps, w, wr, b):
        """Compute one recurrent forward step on the CPU.

        Args:
            x: Input of this step; ``x.shape[0]`` sizes the default states.
            pz: Previous output, or ``None`` to start from zeros.
            ps: Previous internal state, or ``None`` to start from zeros.
            w: Weight applied to ``x``.
            wr: Weight applied to the previous output.
            b: Bias added to the pre-activation.

        Returns:
            The node made by ``cls._create_node`` from the new output, with
            the inputs and intermediates of this step stored on ``attrs``.
        """
        if ps is None:
            prev_state = np.zeros((x.shape[0], w.shape[1] // 4), dtype=precision)
        else:
            prev_state = ps

        if pz is None:
            prev_out = np.zeros((x.shape[0], w.shape[1] // 4), dtype=precision)
        else:
            prev_out = pz

        pre_act = dot(x, w) + dot(prev_out, wr) + b
        width = pre_act.shape[1] // 4

        # Split off the first `width` columns as the candidate; the remaining
        # columns are the gates.
        candidate, gates = np.split(pre_act, [width], axis=1)
        candidate = tanh(candidate)
        gates = sigmoid(gates)

        # Gate columns: [0, width) scales the previous state,
        # [width, 2 * width) scales the candidate, the rest scales the output.
        state = gates[:, width:width * 2] * candidate + gates[:, :width] * prev_state
        out = tanh(state) * gates[:, width * 2:]

        ret = cls._create_node(out)
        ret.attrs._x = x
        ret.attrs._w = w
        ret.attrs._wr = wr
        ret.attrs._b = b
        ret.attrs._pz = pz
        ret.attrs._u = candidate
        ret.attrs._pstate = ps
        ret.attrs._state = state
        ret.attrs._gated = gates
        ret._state = state

        # Record this step's first gate slice on the previous node, if any.
        if isinstance(pz, Node):
            pz.attrs._pfgate = gates[:, :width]

        return ret
Esempio n. 3
0
    def _oper_gpu(cls, x, pz, ps, parameter):
        """Compute one recurrent forward step through the GPU helpers.

        Each gate has its own input weight, recurrent weight, bias and a
        weight multiplied elementwise with the state (previous state for the
        first two gates, new state for the output gate).

        Args:
            x: Input of this step.
            pz: Previous output, or ``None`` to start from zeros.
            ps: Previous internal state, or ``None`` to start from zeros.
            parameter: Mapping read at the keys ``w``, ``wr``, ``b``, ``wf``,
                ``wfr``, ``wfc``, ``bf``, ``wi``, ``wir``, ``wic``, ``bi``,
                ``wo``, ``wor``, ``woc`` and ``bo``.

        Returns:
            The node made by ``cls._create_node`` from the new output, with
            the gates and intermediates of this step stored on ``attrs``.
        """
        if ps is None:
            prev_state = get_gpu(
                np.zeros((x.shape[0], parameter["w"].shape[1]), dtype=precision))
        else:
            prev_state = ps

        if pz is None:
            prev_out = get_gpu(prev_state).zeros_like_me()
        else:
            prev_out = pz

        pre_act = dot(x, parameter["w"]) + dot(prev_out, parameter["wr"]) + parameter["b"]

        forget_gate = sigmoid(
            dot(x, parameter["wf"]) + dot(prev_out, parameter["wfr"])
            + parameter["wfc"] * prev_state + parameter["bf"])
        input_gate = sigmoid(
            dot(x, parameter["wi"]) + dot(prev_out, parameter["wir"])
            + parameter["wic"] * prev_state + parameter["bi"])

        state = input_gate * tanh(pre_act) + forget_gate * prev_state

        # The output gate is the only one that reads the freshly built state.
        output_gate = sigmoid(
            dot(x, parameter["wo"]) + dot(prev_out, parameter["wor"])
            + parameter["bo"] + parameter["woc"] * state)

        out = tanh(state) * output_gate

        ret = cls._create_node(get_gpu(out))
        ret.attrs._x = x
        ret.attrs._p = parameter
        ret.attrs._u = pre_act
        ret.attrs._pgated_f = None
        ret.attrs._pstate = ps
        ret.attrs._state = state
        ret.attrs._gated_o = output_gate
        ret.attrs._gated_f = forget_gate
        ret.attrs._gated_i = input_gate
        ret.attrs._dt_d = [parameter[key] for key in ("wr", "wi", "wf", "wo", "w")]
        ret._state = state

        return ret
Esempio n. 4
0
    def _backward_gpu(self, context, dy):
        """Propagate ``dy`` backwards through this step using the GPU helpers.

        Gradients kept in ``context`` under the keys listed in
        ``attrs._dt_d`` are restored (a zero array is passed as the default),
        combined with ``dy``, and this call's gradients are stored back under
        the same keys before the inputs and parameters are updated.

        Args:
            context: Object providing ``restore(key, default)`` and
                ``store(key, value)``; also forwarded to ``_update_diff``.
            dy: Gradient with respect to this node's output.
        """
        p = self.attrs._p
        s = self.attrs._state
        ps = self.attrs._pstate
        u = self.attrs._u

        go = self.attrs._gated_o
        gf = self.attrs._gated_f
        gi = self.attrs._gated_i
        pgf = get_gpu(gf).zeros_like_me() if self.attrs._pgated_f is None else self.attrs._pgated_f

        # Unpacked in the order of attrs._dt_d.
        drt, dit, dft, doot, dct = (context.restore(dt, get_gpu(dy).zeros_like_me())
                                    for dt in self.attrs._dt_d)

        activated_s = tanh(s)
        activated_u = tanh(u)

        # Parenthesised sum: the former backslash continuations left stray
        # unary "+" operators in front of two of the terms.
        e = (dy
             + get_gpu(dot(drt, p["wr"].T))
             + get_gpu(dot(dit, p["wir"].T))
             + get_gpu(dot(dft, p["wfr"].T))
             + get_gpu(dot(doot, p["wor"].T)))

        do = gate_diff(go) * activated_s * e
        ds = go * activation_diff(activated_s) * e
        dc = ds + pgf * dct + p["wfc"] * dft + p["wic"] * dit + p["woc"] * do

        # Without a previous state the forget-gate gradient is zero.
        df = gate_diff(gf) * ps * dc if ps is not None else get_gpu(gf).zeros_like_me()
        di = gate_diff(gi) * activated_u * dc

        d = gi * activation_diff(activated_u) * dc

        dx = dot(d, p["w"].T) \
            + dot(di, p["wi"].T) \
            + dot(do, p["wo"].T) \
            + dot(df, p["wf"].T)

        # Keep this call's gradients in the context under the same keys.
        for dt_d, dt in zip(self.attrs._dt_d, (d, di, df, do, dc)):
            context.store(dt_d, get_gpu(dt))

        if isinstance(self.attrs._x, Node):
            self.attrs._x._update_diff(context, get_gpu(dx))

        # Input weights use this call's gradients.
        for k, diff in zip(("w", "wo", "wi", "wf"), (d, do, di, df)):
            if isinstance(p[k], Node):
                p[k]._update_diff(context, get_gpu(dot(self.attrs._x.T, diff)))

        # Recurrent weights use the gradients restored from the context.
        for k, diff in zip(("wr", "wor", "wir", "wfr"), (drt, doot, dit, dft)):
            if isinstance(p[k], Node):
                p[k]._update_diff(context, get_gpu(dot(self.T, diff)))

        for k, diff in zip(("wfc", "wic", "woc"), (dft, dit, do)):
            if isinstance(p[k], Node):
                p[k]._update_diff(context, sum(diff * get_gpu(s), axis=0))

        for k, diff in zip(("b", "bf", "bi", "bo"), (d, df, di, do)):
            if isinstance(p[k], Node):
                p[k]._update_diff(context, sum(diff, axis=0))
Esempio n. 5
0
    def _backward_cpu(self, context, dy):
        """Propagate ``dy`` backwards through this step with NumPy.

        Gradients kept in ``context`` under the keys listed in
        ``attrs._dt_d`` are restored (a zero array is passed as the default),
        combined with ``dy``, and this call's gradients are stored back under
        the same keys before the inputs and parameters are updated.

        Args:
            context: Object providing ``restore(key, default)`` and
                ``store(key, value)``; also forwarded to ``_update_diff``.
            dy: Gradient with respect to this node's output.
        """
        params = self.attrs._p
        state = self.attrs._state
        prev_state = self.attrs._pstate
        pre_act = self.attrs._u

        gate_o = self.attrs._gated_o
        gate_f = self.attrs._gated_f
        gate_i = self.attrs._gated_i
        if self.attrs._pgated_f is None:
            carried_gate_f = np.zeros_like(gate_f)
        else:
            carried_gate_f = self.attrs._pgated_f

        # Unpacked in the order of attrs._dt_d.
        kept_d, kept_di, kept_df, kept_do, kept_dc = (
            context.restore(key, np.zeros_like(dy)) for key in self.attrs._dt_d)

        tanh_state = tanh(state)
        tanh_pre = tanh(pre_act)

        err = (dy
               + np.dot(kept_d, params["wr"].T)
               + np.dot(kept_di, params["wir"].T)
               + np.dot(kept_df, params["wfr"].T)
               + np.dot(kept_do, params["wor"].T))

        grad_o = gate_diff(gate_o) * tanh_state * err
        grad_state = gate_o * activation_diff(tanh_state) * err
        grad_c = (grad_state
                  + carried_gate_f * kept_dc
                  + params["wfc"] * kept_df
                  + params["wic"] * kept_di
                  + params["woc"] * grad_o)

        # Without a previous state the forget-gate gradient is zero.
        if prev_state is not None:
            grad_f = gate_diff(gate_f) * prev_state * grad_c
        else:
            grad_f = np.zeros_like(gate_f)
        grad_i = gate_diff(gate_i) * tanh_pre * grad_c
        grad_u = gate_i * activation_diff(tanh_pre) * grad_c

        dx = (np.dot(grad_u, params["w"].T)
              + np.dot(grad_i, params["wi"].T)
              + np.dot(grad_o, params["wo"].T)
              + np.dot(grad_f, params["wf"].T))

        # Keep this call's gradients in the context under the same keys.
        fresh = (grad_u, grad_i, grad_f, grad_o, grad_c)
        for key, value in zip(self.attrs._dt_d, fresh):
            context.store(key, value)

        if isinstance(self.attrs._x, Node):
            self.attrs._x._update_diff(context, dx)

        # Input weights use this call's gradients.
        for name, grad in zip(("w", "wo", "wi", "wf"), (grad_u, grad_o, grad_i, grad_f)):
            if isinstance(params[name], Node):
                params[name]._update_diff(
                    context, np.dot(to_value(self.attrs._x).T, grad))

        # Recurrent weights use the gradients restored from the context.
        for name, grad in zip(("wr", "wor", "wir", "wfr"), (kept_d, kept_do, kept_di, kept_df)):
            if isinstance(params[name], Node):
                params[name]._update_diff(context, np.dot(to_value(self).T, grad))

        for name, grad in zip(("wfc", "wic", "woc"), (kept_df, kept_di, grad_o)):
            if isinstance(params[name], Node):
                params[name]._update_diff(
                    context, np.sum(grad * state, axis=0, keepdims=True))

        for name, grad in zip(("b", "bf", "bi", "bo"), (grad_u, grad_f, grad_i, grad_o)):
            if isinstance(params[name], Node):
                params[name]._update_diff(
                    context, np.sum(grad, axis=0, keepdims=True))
Esempio n. 6
0
    def _backward_gpu(self, context, dy):
        """Propagate ``dy`` backwards through this step via ``cu.culstm_backward``.

        Args:
            context: Object providing ``restore(key, default)`` and
                ``store(key, value)``; also forwarded to ``_update_diff``.
            dy: Gradient with respect to this node's output.
        """
        params = self.attrs._p
        pre_act = self.attrs._u
        tanh_state = tanh(self.attrs._state)
        prev_state = self.attrs._pstate

        kept_dr = context.restore(params["wr"], get_gpu(pre_act).zeros_like_me())
        kept_dou = context.restore(params["w"], get_gpu(dy).zeros_like_me())
        prev_fgate = getattr(self.attrs, "_pfgate", get_gpu(pre_act).zeros_like_me())

        err = get_gpu(dy) + get_gpu(dot(kept_dr, params["wr"].T))

        # Uninitialised buffers handed to the kernel; presumably it fills them
        # with the new gradients (they are read right after the call).
        new_dr, new_dou = (get_gpu(buf).empty_like_me() for buf in (kept_dr, kept_dou))
        kernel_args = (pre_act, new_dr, tanh_state, prev_state,
                       err, prev_fgate, kept_dou, new_dou)
        cu.culstm_backward(*map(get_gpu, kernel_args))

        dx = dot(new_dr, params["w"].T)

        context.store(params["wr"], new_dr)
        context.store(params["w"], new_dou)

        if isinstance(self.attrs._x, Node):
            self.attrs._x._update_diff(context, dx)

        if isinstance(params["w"], Node):
            params["w"]._update_diff(context, dot(self.attrs._x.T, new_dr))

        # The recurrent weight uses the gradient restored from the context.
        if isinstance(params["wr"], Node):
            params["wr"]._update_diff(context, dot(self.T, kept_dr))

        if isinstance(params["b"], Node):
            params["b"]._update_diff(context, sum(new_dr, axis=0))
    def _oper_cpu(cls, x, pz, w, u, b):
        """Compute one gated recurrent forward step on the CPU.

        ``w`` and ``u`` (and ``b`` when given) are split along axis 1 into
        three equal parts, named ``z``, ``r`` and ``h`` here.

        Args:
            x: Input of this step.
            pz: Previous output, or ``None`` to start from a zero ``Variable``.
            w: Weight applied to ``x``.
            u: Weight applied to the previous output.
            b: Optional bias; ``None`` adds zeros instead.

        Returns:
            The node made by ``cls._create_node`` from the new output, with
            the split weights and pre-activations stored on ``attrs``.
        """
        width = w.shape[1] // 3
        cuts = [width, width * 2]
        w_z, w_r, w_h = np.split(w, cuts, axis=1)
        u_z, u_r, u_h = np.split(u, cuts, axis=1)

        if pz is None:
            hminus = Variable(np.zeros((x.shape[0], w.shape[1] // 3), dtype=precision))
        else:
            hminus = pz

        if b is not None:
            b_z, b_r, b_h = np.split(b, cuts, axis=1)
        else:
            b_z, b_r, b_h = (0, 0, 0)

        # Pre-activations: A feeds the interpolation gate, B gates the
        # recurrent part of C, and C is the candidate.
        A = dot(x, w_z) + dot(hminus, u_z) + b_z
        B = dot(x, w_r) + dot(hminus, u_r) + b_r
        C = dot(x, w_h) + sigmoid(B) * dot(hminus, u_h) + b_h

        # Blend the previous output with the new candidate.
        h = sigmoid(A) * hminus + (1 - sigmoid(A)) * tanh(C)

        ret = cls._create_node(h)
        ret.attrs._x = x
        ret.attrs._w = w
        ret.attrs._w_z = w_z
        ret.attrs._w_r = w_r
        ret.attrs._w_h = w_h
        ret.attrs._u = u
        ret.attrs._u_z = u_z
        ret.attrs._u_h = u_h
        ret.attrs._u_r = u_r
        ret.attrs._pz = hminus
        ret.attrs._A = A
        ret.attrs._B = B
        ret.attrs._C = C

        # The bias attribute exists only when a bias was supplied.
        if b is not None:
            ret.attrs._b = b

        return ret
Esempio n. 8
0
    def _backward_cpu(self, context, dy, **kwargs):
        """Propagate ``dy`` backwards through this step with NumPy.

        This variant also handles ``attrs._wc``, the weight multiplied
        elementwise with the state in the gate pre-activations.

        Args:
            context: Object providing ``restore(key, default)`` and
                ``store(key, value)``; also forwarded to ``_update_diff``.
            dy: Two-dimensional gradient with respect to this node's output.
            **kwargs: Accepted and ignored.
        """
        batch, width = dy.shape

        w = self.attrs._w
        wr = self.attrs._wr
        wc = self.attrs._wc
        b = self.attrs._b

        candidate = self.attrs._u
        tanh_state = tanh(self.attrs._state)

        gates = self.attrs._gated
        gate_grad = gate_diff(gates)
        prev_state = self.attrs._pstate

        prev_fgate = self.attrs.get("_pfgate", np.zeros_like(self))

        kept_dou = context.restore(w, np.zeros((batch, width), dtype=dy.dtype))
        kept_dr = context.restore(wr, np.zeros((batch, width * 4), dtype=dy.dtype))

        d_out = dy * tanh_state * gate_grad[:, 2 * width:]
        d_state = (dy * gates[:, 2 * width:] * activation_diff(tanh_state)
                   + d_out * wc[:, 2 * width:])

        # Contributions of the gradients restored from the context.
        d_state += (prev_fgate * kept_dou
                    + kept_dr[:, width:2 * width] * wc[:, :width]
                    + kept_dr[:, 2 * width:3 * width] * wc[:, width:2 * width])

        # Without a previous state the forget-gate gradient is zero.
        if prev_state is not None:
            d_forget = d_state * gate_grad[:, :width] * prev_state
        else:
            d_forget = np.zeros_like(d_state)
        d_input = d_state * gate_grad[:, width:2 * width] * candidate
        d_cand = d_state * activation_diff(candidate) * gates[:, width:2 * width]

        new_dr = np.hstack((d_cand, d_forget, d_input, d_out))

        context.store(wr, new_dr)
        context.store(w, d_state)

        if isinstance(self.attrs._x, Node):
            self.attrs._x._update_diff(context, np.dot(new_dr, w.T))

        if isinstance(w, Node):
            w._update_diff(context, np.dot(self.attrs._x.T, new_dr))

        # The recurrent weight uses the gradient restored from the context.
        if isinstance(wr, Node):
            wr._update_diff(context, np.dot(self.T, kept_dr))

        if isinstance(wc, Node):
            state = self.attrs._state
            dwc = np.zeros(wc.shape, dtype=wc.dtype)
            dwc[:, 2 * width:] = np.sum(d_out * state, axis=0)
            dwc[:, :width] = np.sum(kept_dr[:, width:2 * width] * state, axis=0)
            dwc[:, width:2 * width] = np.sum(
                kept_dr[:, 2 * width:3 * width] * state, axis=0)
            wc._update_diff(context, dwc)

        if isinstance(b, Node):
            b._update_diff(context, np.sum(new_dr, axis=0))

        if isinstance(self.attrs._pz, Node):
            self.attrs._pz._update_diff(context, np.dot(new_dr, wr.T))
    def _backward_cpu(self, context, dy, **kwargs):
        """Propagate ``dy`` backwards through the gated step stored on ``attrs``.

        Uses the pre-activations ``_A``, ``_B``, ``_C`` and the split weights
        saved on ``attrs`` to build gradients for ``_w``, ``_u``, the optional
        ``_b``, the input ``_x`` and the previous output ``_pz``.

        Args:
            context: Forwarded to every ``_update_diff`` call.
            dy: Gradient with respect to this node's output.
            **kwargs: Accepted and ignored.
        """
        attrs = self.attrs
        x = attrs._x
        w_z, w_r, w_h = attrs._w_z, attrs._w_r, attrs._w_h
        A, B, C = attrs._A, attrs._B, attrs._C
        u_z, u_h, u_r = attrs._u_z, attrs._u_h, attrs._u_r
        hminus = attrs._pz

        # Gradients with respect to the three pre-activations.
        grad_a = dy * (hminus - tanh(C)) * sigmoid_diff(A)
        grad_c = dy * (1 - sigmoid(A)) * tanh_diff(C)
        grad_b = grad_c * dot(hminus, u_h) * sigmoid_diff(B)

        # Input gradient: one term per weight part.
        dx = dot(grad_a, w_z.T) + dot(grad_b, w_r.T) + dot(grad_c, w_h.T)

        # Input-weight gradient, re-joined in the z, r, h column order.
        dw = np.concatenate(
            [dot(x.T, grad_a), dot(x.T, grad_b), dot(x.T, grad_c)], axis=1)

        # Bias gradient, summed over the first axis.
        db = np.concatenate(
            [np.sum(grad_a, axis=0, keepdims=True),
             np.sum(grad_b, axis=0, keepdims=True),
             np.sum(grad_c, axis=0, keepdims=True)], axis=1)

        # Recurrent-weight gradient; the h part is scaled by sigmoid(B).
        du = np.concatenate(
            [dot(hminus.T, grad_a),
             dot(hminus.T, grad_b),
             dot(hminus.T, grad_c * sigmoid(B))], axis=1)

        # Gradient flowing to the previous output.
        dpz = (dot(grad_a, u_z.T)
               + dot(grad_b, u_r.T)
               + dot(grad_c * sigmoid(B), u_h.T)
               + dy * sigmoid(A))

        attrs._w._update_diff(context, dw)
        attrs._u._update_diff(context, du)

        if hasattr(attrs, "_b"):
            attrs._b._update_diff(context, db)

        if isinstance(attrs._x, Node):
            attrs._x._update_diff(context, dx)

        if isinstance(attrs._pz, Node):
            attrs._pz._update_diff(context, dpz)
Esempio n. 10
0
    def _oper_cpu(cls, x, pz, ps, w, wr, wc, b):
        """Compute one recurrent forward step on the CPU, with state weights.

        ``wc`` is multiplied elementwise with the state inside each gate's
        pre-activation (previous state for the first two gates, new state for
        the output gate).

        Args:
            x: Input of this step; ``x.shape[0]`` sizes the default states.
            pz: Previous output, or ``None`` to start from zeros.
            ps: Previous internal state, or ``None`` to start from zeros.
            w: Weight applied to ``x``.
            wr: Weight applied to the previous output.
            wc: Weight multiplied with the state; sliced in three column blocks.
            b: Optional bias; skipped when ``None``.

        Returns:
            The node made by ``cls._create_node`` from the new output, with
            the inputs and intermediates of this step stored on ``attrs``.
        """
        if ps is None:
            prev_state = np.zeros((x.shape[0], w.shape[1] // 4), dtype=precision)
        else:
            prev_state = ps

        if pz is None:
            prev_out = np.zeros((x.shape[0], w.shape[1] // 4), dtype=precision)
        else:
            prev_out = pz

        pre_act = np.dot(x, w) + np.dot(prev_out, wr)
        if b is not None:
            pre_act += b

        width = pre_act.shape[1] // 4
        # The first `width` columns form the candidate; the rest feed the gates.
        candidate, gate_in = np.split(pre_act.as_ndarray(), [width], axis=1)
        candidate = tanh(candidate)

        forget_gate = sigmoid(prev_state * wc[:, :width] + gate_in[:, :width])
        input_gate = sigmoid(
            prev_state * wc[:, width:2 * width] + gate_in[:, width:2 * width])
        state = input_gate * candidate + forget_gate * prev_state
        # The output gate is the only one that reads the freshly built state.
        output_gate = sigmoid(state * wc[:, 2 * width:] + gate_in[:, 2 * width:])
        out = tanh(state) * output_gate

        gates = np.hstack((forget_gate, input_gate, output_gate))

        ret = cls._create_node(out)
        ret.attrs._x = x
        ret.attrs._w = w
        ret.attrs._wr = wr
        ret.attrs._wc = wc
        ret.attrs._b = b
        ret.attrs._u = candidate
        ret.attrs._pz = pz
        ret.attrs._pstate = ps
        ret.attrs._state = state
        ret.attrs._gated = gates
        ret._state = state

        # Record this step's first gate slice on the previous node, if any.
        if isinstance(pz, Node):
            pz.attrs._pfgate = gates[:, :width]

        return ret
Esempio n. 11
0
    def _backward_cpu(self, context, dy, **kwargs):
        """Propagate ``dy`` backwards through this step with NumPy.

        Args:
            context: Object providing ``restore(key, default)`` and
                ``store(key, value)``; also forwarded to ``_update_diff``.
            dy: Two-dimensional gradient with respect to this node's output.
            **kwargs: Accepted and ignored.
        """
        batch, width = dy.shape

        w = self.attrs._w
        wr = self.attrs._wr
        b = self.attrs._b

        candidate = self.attrs._u
        tanh_state = tanh(self.attrs._state)

        gates = self.attrs._gated
        gate_grad = gate_diff(gates)
        prev_state = self.attrs._pstate

        kept_dr = context.restore(wr, np.zeros((batch, width * 4), dtype=dy.dtype))
        kept_dou = context.restore(w, np.zeros((batch, width), dtype=dy.dtype))

        prev_fgate = self.attrs.get("_pfgate", np.zeros_like(self))

        d_out = dy * tanh_state * gate_grad[:, 2 * width:]
        d_state = (dy * gates[:, 2 * width:] * activation_diff(tanh_state)
                   + prev_fgate * kept_dou)

        # Without a previous state the forget-gate gradient is zero.
        if prev_state is not None:
            d_forget = d_state * gate_grad[:, :width] * prev_state
        else:
            d_forget = np.zeros_like(d_state)
        d_input = d_state * gate_grad[:, width:2 * width] * candidate
        d_cand = d_state * activation_diff(candidate) * gates[:, width:2 * width]

        new_dr = np.hstack((d_cand, d_forget, d_input, d_out))
        dx = np.dot(new_dr, w.T)

        context.store(wr, new_dr)
        context.store(w, d_state)

        if isinstance(self.attrs._x, Node):
            self.attrs._x._update_diff(context, dx)

        if isinstance(w, Node):
            w._update_diff(context, np.dot(self.attrs._x.T, new_dr))

        # The recurrent weight uses the gradient restored from the context.
        if isinstance(wr, Node):
            wr._update_diff(context, np.dot(self.T, kept_dr))

        if isinstance(b, Node):
            b._update_diff(context, np.sum(new_dr, axis=0, keepdims=True))

        if isinstance(self.attrs._pz, Node):
            self.attrs._pz._update_diff(context, np.dot(new_dr, wr.T))
Esempio n. 12
0
    def _oper_cpu(cls, x, pz, w, u, b):
        """Compute one gated recurrent forward step on the CPU.

        ``w`` and ``u`` (and ``b`` when given) are split along axis 1 into
        three equal parts, named ``z``, ``r`` and ``h`` here. The recurrent
        terms multiply ``hminus`` elementwise with the ``u`` parts.

        Args:
            x: Input of this step.
            pz: Previous output, or ``None`` to start from a zero ``Variable``.
            w: Weight applied to ``x``.
            u: Weight multiplied elementwise with the previous output.
            b: Optional bias; when ``None`` no bias is added and the stored
                ``_b_z``, ``_b_r`` and ``_b_h`` attributes are ``None``.

        Returns:
            The node made by ``cls._create_node`` from the new output, with
            the split parameters and pre-activations stored on ``attrs``.
        """
        # Initialize Variables
        m = w.shape[1] // 3
        w_z, w_r, w_h = np.split(w, [m, m * 2], axis=1)
        u_z, u_r, u_h = np.split(u, [m, m * 2], axis=1)
        hminus = Variable(
            np.zeros((x.shape[0],
                      w.shape[1] // 3), dtype=precision)) if pz is None else pz

        # Perform Forward Calculations
        if b is None:
            # Bind the bias parts anyway: they are stored on attrs below, and
            # leaving them unbound raised UnboundLocalError for b=None.
            b_z = b_r = b_h = None
            A = dot(x, w_z) + hminus * u_z
            B = dot(x, w_r) + u_r * hminus
            C = dot(x, w_h) + sigmoid(B) * u_h * hminus
        else:
            b_z, b_r, b_h = np.split(b, [m, m * 2], axis=1)
            A = dot(x, w_z) + hminus * u_z + b_z
            B = dot(x, w_r) + u_r * hminus + b_r
            C = dot(x, w_h) + sigmoid(B) * u_h * hminus + b_h

        # NOTE(review): the output is sigmoid(A) + tanh(C), with no blend
        # against hminus -- confirm this is the intended formula.
        h = sigmoid(A) + tanh(C)

        # Store Variables for Graph
        ret = cls._create_node(h)
        ret.attrs._x = x
        ret.attrs._w = w
        ret.attrs._w_z = w_z
        ret.attrs._w_r = w_r
        ret.attrs._w_h = w_h
        ret.attrs._b = b
        ret.attrs._b_z = b_z
        ret.attrs._b_r = b_r
        ret.attrs._b_h = b_h
        ret.attrs._u = u
        ret.attrs._u_z = u_z
        ret.attrs._u_h = u_h
        ret.attrs._u_r = u_r
        ret.attrs._pz = hminus
        ret.attrs._A = A
        ret.attrs._B = B
        ret.attrs._C = C

        return ret
Esempio n. 13
0
    def _backward_cpu(self, context, dy):
        """Propagate ``dy`` backwards through this step with NumPy.

        Parameters are read from the ``attrs._p`` mapping at the keys
        ``"w"``, ``"wr"`` and ``"b"``.

        Args:
            context: Object providing ``restore(key, default)`` and
                ``store(key, value)``; also forwarded to ``_update_diff``.
            dy: Two-dimensional gradient with respect to this node's output.
        """
        batch, width = dy.shape
        params = self.attrs._p
        candidate = self.attrs._u
        tanh_state = tanh(self.attrs._state)

        gates = self.attrs._gated
        gate_grad = gate_diff(gates)
        prev_state = self.attrs._pstate

        kept_dr = context.restore(
            params["wr"], np.zeros((batch, width * 4), dtype=dy.dtype))
        kept_dou = context.restore(
            params["w"], np.zeros((batch, width), dtype=dy.dtype))

        prev_fgate = getattr(self.attrs, "_pfgate", np.zeros_like(self))

        # Add the restored gradient pushed back through the recurrent weight.
        err = dy + np.dot(kept_dr, params["wr"].T)

        d_out = err * tanh_state * gate_grad[:, 2 * width:]
        d_state = (err * gates[:, 2 * width:] * activation_diff(tanh_state)
                   + prev_fgate * kept_dou)

        # Without a previous state the forget-gate gradient is zero.
        if prev_state is not None:
            d_forget = d_state * gate_grad[:, :width] * prev_state
        else:
            d_forget = np.zeros_like(d_state)
        d_input = d_state * gate_grad[:, width:2 * width] * candidate
        d_cand = d_state * activation_diff(candidate) * gates[:, width:2 * width]

        new_dr = np.hstack((d_cand, d_forget, d_input, d_out))
        dx = np.dot(new_dr, params["w"].T)

        context.store(params["wr"], new_dr)
        context.store(params["w"], d_state)

        if isinstance(self.attrs._x, Node):
            self.attrs._x._update_diff(context, dx)

        if isinstance(params["w"], Node):
            params["w"]._update_diff(context, np.dot(self.attrs._x.T, new_dr))

        # The recurrent weight uses the gradient restored from the context.
        if isinstance(params["wr"], Node):
            params["wr"]._update_diff(context, np.dot(self.T, kept_dr))

        if isinstance(params["b"], Node):
            params["b"]._update_diff(
                context, np.sum(new_dr, axis=0, keepdims=True))
Esempio n. 14
0
    def _backward_gpu(self, context, dy, **kwargs):
        """Propagate ``dy`` backwards through this step via ``cu.culstm_backward``.

        Args:
            context: Object providing ``restore(key, default)`` and
                ``store(key, value)``; also forwarded to ``_update_diff``.
            dy: Gradient with respect to this node's output.
            **kwargs: Accepted and ignored.
        """
        w = self.attrs._w
        wr = self.attrs._wr
        b = self.attrs._b

        pre_act = self.attrs._u
        tanh_state = tanh(self.attrs._state)
        prev_state = self.attrs._pstate

        kept_dr = context.restore(wr, get_gpu(pre_act).zeros_like_me())
        kept_dou = context.restore(w, get_gpu(dy).zeros_like_me())
        prev_fgate = self.attrs.get("_pfgate", get_gpu(pre_act).zeros_like_me())

        err = get_gpu(dy)

        # Uninitialised buffers handed to the kernel; presumably it fills them
        # with the new gradients (they are read right after the call).
        new_dr, new_dou = (get_gpu(buf).empty_like_me() for buf in (kept_dr, kept_dou))
        kernel_args = (pre_act, new_dr, tanh_state, prev_state,
                       err, prev_fgate, kept_dou, new_dou)
        cu.culstm_backward(*map(get_gpu, kernel_args))

        dx = dot(new_dr, w.T)

        context.store(wr, new_dr)
        context.store(w, new_dou)

        if isinstance(self.attrs._x, Node):
            self.attrs._x._update_diff(context, dx)

        if isinstance(w, Node):
            w._update_diff(context, dot(self.attrs._x.T, new_dr))

        # The recurrent weight uses the gradient restored from the context.
        if isinstance(wr, Node):
            wr._update_diff(context, dot(self.T, kept_dr))

        if isinstance(b, Node):
            b._update_diff(context, sum(new_dr, axis=0))

        if isinstance(self.attrs._pz, Node):
            self.attrs._pz._update_diff(context, dot(new_dr, wr.T))
def tanh_diff(x):
    """Return ``1 - tanh(x) ** 2``, the derivative of ``tanh`` at ``x``."""
    squared = tanh(x) ** 2
    return 1.0 - squared
Esempio n. 16
0
 def func(node):
     """Return ``sum`` applied to ``tanh(node)``."""
     activated = tanh(node)
     return sum(activated)