def prove(Eq):
    """Derive the statement returned by ``apply(n)`` as a chain of steps.

    Every ``Eq << ...`` appends one or more derivation steps to ``Eq``.
    Later steps address earlier ones by position (``Eq[1]``, ``Eq[-1]``,
    ...), so the order of the pushes below is significant.

    Args:
        Eq: proof container supplied by the caller; it supports ``<<`` for
            appending steps and integer indexing for reading them back.
    """
    n = Symbol.n(domain=[2, oo], integer=True)

    Eq << apply(n)

    statement = Eq[0]
    # The right-hand side carries the two index symbols as its 2nd and 3rd args.
    _, i, j = statement.rhs.args

    # Copies of both indices with domain [0, n - 1].
    row = i.copy(domain=[0, n - 1])
    col = j.copy(domain=[0, n - 1])

    # W / V are defined as the two sides of the statement, re-indexed
    # with the domain-restricted copies.
    lhs_expr = statement.lhs._subs(i, row)._subs(j, col)
    W = Symbol.W(definition=lhs_expr)
    rhs_expr = statement.rhs._subs(i, row)._subs(j, col)
    V = Symbol.V(definition=rhs_expr)

    Eq << W.this.definition
    Eq << V.this.definition

    # NOTE(review): h and k use domain [0, n] while the index copies above
    # use [0, n - 1]; confirm this difference is intended.
    h = Symbol.h(integer=True, domain=[0, n])
    k = Symbol.k(integer=True, domain=[0, n])

    # Definitions of the (h, k) entries of V and W.
    Eq << (V[h, k].this.definition, W[h, k].this.definition)

    # Rewrite the right-hand sides of the two newest steps as KroneckerDelta
    # expressions, newest step first.
    newest, previous = Eq[-1], Eq[-2]
    Eq << (newest.this.rhs.as_KroneckerDelta(),
           previous.this.rhs.as_KroneckerDelta())

    # Difference of the two rewritten entry equations.
    Eq << Eq[-2] - Eq[-1]

    Eq << Eq[-1].reference((k,), (h,))

    # Substitute the definitions of W (Eq[1]) and V (Eq[2]).
    substituted = Eq[-1].subs(Eq[1]).subs(Eq[2])
    Eq << substituted

    Eq << Eq[-1].forall((row,), (col,))
def prove(Eq):
    """Derive the statement returned by ``apply(W)`` as a chain of steps.

    ``W`` is a complex symbol of shape ``(n, n)``. Every ``Eq << ...``
    appends one or more steps to ``Eq``; later steps address earlier ones by
    position, so the push order below is significant.

    Args:
        Eq: proof container supplied by the caller; it supports ``<<`` for
            appending steps and integer indexing for reading them back.
    """
    n = Symbol.n(domain=[2, oo], integer=True)
    W = Symbol.W(shape=(n, n), complex=True)

    Eq << apply(W)

    # U / V are defined as the left / right side of the statement.
    statement = Eq[0]
    U = Symbol.U(definition=statement.lhs)
    V = Symbol.V(definition=statement.rhs)

    Eq << U.this.definition
    Eq << V.this.definition

    i = Symbol.i(integer=True, domain=[0, n])
    j = Symbol.j(integer=True, domain=[0, n])

    # Definitions of the (i, j) entries of V and U.
    Eq << (V[i, j].this.definition, U[i, j].this.definition)

    # Rewrite the right-hand sides of the two newest steps as KroneckerDelta
    # expressions, newest step first.
    newest, previous = Eq[-1], Eq[-2]
    Eq << (newest.this.rhs.as_KroneckerDelta(),
           previous.this.rhs.as_KroneckerDelta())

    # Difference of the two rewritten entry equations.
    Eq << Eq[-2] - Eq[-1]

    Eq << Eq[-1].reference((j,), (i,))

    # Substitute the definitions of U (Eq[1]) and V (Eq[2]).
    substituted = Eq[-1].subs(Eq[1]).subs(Eq[2])
    Eq << substituted

    Eq << Eq[-1].forall((j,), (i,))
def apply(n, d):
    """Build the equality stating what row 0 of ``S`` is.

    ``Q``, ``K`` and ``V`` are real symbols of shape ``(n, d)`` and ``S`` is
    defined as ``softmax(Q @ K.T / sqrt(d)) @ V``.

    Args:
        n: first dimension of the symbols created here.
        d: second dimension of the symbols; also used under the square root.

    Returns:
        ``Equality(S[0], softmax(Q[0] @ K.T / sqrt(d)) @ V)``.
    """
    shape = (n, d)

    Q = Symbol.Q(shape=shape, real=True)
    K = Symbol.K(shape=shape, real=True)
    V = Symbol.V(shape=shape, real=True)

    scale = sympy.sqrt(d)

    # S is defined from the full Q; the claim below only involves its row 0.
    full_product = softmax(Q @ K.T / scale) @ V
    S = Symbol.S(shape=shape, definition=full_product)

    first_row = softmax(Q[0] @ K.T / scale) @ V
    return Equality(S[0], first_row)
def prove(Eq):
    """Derive the statement(s) returned by ``apply(n, dx, dz)`` step by step.

    Every ``Eq << ...`` appends derivation steps to ``Eq``. The steps below
    mix absolute positions (``Eq[0]`` ... ``Eq[7]``) with relative ones
    (``Eq[-1]``), so neither the order nor the number of pushes may change.

    Args:
        Eq: proof container supplied by the caller; it supports ``<<`` for
            appending steps, integer indexing, and attribute assignment for
            naming a step (``Eq.V_definition``).
    """
    n = Symbol.n(integer=True)
    dx = Symbol.d_x(integer=True, positive=True)
    dz = Symbol.d_z(integer=True, positive=True)

    # NOTE(review): Eq[0]..Eq[7] are read below before eight explicit pushes
    # have happened, so apply(...) presumably contributes several equations
    # at once -- confirm against apply's return value.
    Eq << apply(n, dx, dz)

    # Index symbols taken from the left-hand side of Eq[2].
    i, j = Eq[2].lhs.indices

    # Substitute the reversed i-th / j-th slices of Eq[0] and Eq[1] into the
    # newest step, one after the other.
    Eq << Eq[-1].subs(Eq[0][i].reversed)
    Eq << Eq[-1].subs(Eq[1][j].reversed)

    # Distribute the second argument of the right-hand side.
    Eq << Eq[-1].this.rhs.args[1].distribute()

    # Element / row forms of earlier steps.
    Eq << Eq[3][i, j]
    Eq << Eq[4][i, j]
    Eq << Eq[6][i]

    # V is defined as the matrix product of all but the first factor of the
    # second right-hand-side argument of the newest step.
    V = Symbol.V(definition=MatMul(*Eq[-1].rhs.args[1].args[1:]))

    # Keep V's definition under a name so it can be reused below.
    Eq.V_definition = V.this.definition

    # Substitute the reversed definition of V into the right-hand side.
    Eq << Eq[-1].this.rhs.subs(Eq.V_definition.reversed)

    # k is passed as the free symbol for the expansions below.
    k = Symbol.k(integer=True)

    Eq << Eq[-1].this.rhs.args[0].expand(free_symbol={k})
    Eq << Eq[-1].this.rhs.args[-1].as_Sum()

    # Substitute the j-th slice of V's definition into the right-hand side.
    Eq << Eq[-1].this.rhs.subs(Eq.V_definition[j])

    Eq << Eq[-1].this.rhs.args[0].expand(free_symbol={k})
    Eq << Eq[-1].this.rhs.args[0].as_Sum()

    # Convert the whole right-hand side to a Sum.
    Eq << Eq[-1].this.rhs.astype(Sum)

    # Collect the summand with respect to the (i, j) element of Eq[4]'s
    # left-hand side.
    α = Eq[4].lhs
    Eq << Eq[-1].this.rhs.function.collect(α[i, j])

    # Expand the definition of the second left-hand-side argument of Eq[7],
    # then convert that left-hand side to Min.
    Eq << Eq[7].this.lhs.args[1].definition
    Eq << Eq[-1].this.lhs.astype(Min)
def prove(Eq):
    """Derive the statement returned by ``apply(n)`` as a chain of steps.

    Every ``Eq << ...`` appends a step to ``Eq``; later steps address
    earlier ones by position, so the push order below is significant.

    Args:
        Eq: proof container supplied by the caller; it supports ``<<`` for
            appending steps and integer indexing for reading them back.
    """
    n = Symbol.n(domain=[2, oo], integer=True)

    Eq << apply(n)

    # U / V are defined as the left / right side of the statement.
    statement = Eq[0]
    U = Symbol.U(definition=statement.lhs)
    V = Symbol.V(definition=statement.rhs)

    Eq << U.this.definition
    Eq << V.this.definition

    i = Symbol.i(integer=True, domain=[0, n])

    # i-th slice of V's definition, then the definition of U[i].
    v_definition = Eq[-1]
    Eq << v_definition[i]
    Eq << U[i].this.definition

    # Rewrite the right-hand side of the V[i] step as a KroneckerDelta
    # expression.
    v_slice = Eq[-2]
    Eq << v_slice.this.rhs.as_KroneckerDelta()

    # Difference of the U[i] step and the rewritten V[i] step.
    Eq << Eq[-2] - Eq[-1]

    Eq << Eq[-1].reference((i,))

    # Substitute the definitions of U (Eq[1]) and V (Eq[2]).
    substituted = Eq[-1].subs(Eq[1]).subs(Eq[2])
    Eq << substituted