Exemple #1
0
 def loss(q, k, scale, proj, attn_dist):
     """Return the mean of ``fat.kl`` between ``attn_dist`` and ``fat.relu_rff_attn`` output.

     ``q`` and ``k`` are forwarded unchanged; the projection handed to
     ``fat.relu_rff_attn`` is the product ``scale * proj``.

     NOTE(review): ``fat`` is defined outside this snippet. Presumably ``kl``
     is a KL divergence and ``relu_rff_attn`` a random-feature attention
     approximation -- confirm against that module.

     Args:
         q: First argument forwarded to ``fat.relu_rff_attn``.
         k: Second argument forwarded to ``fat.relu_rff_attn``.
         scale: Multiplier applied to ``proj`` before the call.
         proj: Projection, used as ``scale * proj``.
         attn_dist: Reference value passed as the first argument of ``fat.kl``.

     Returns:
         The result of calling ``.mean()`` on the ``fat.kl`` output.
     """
     scaled_proj = scale * proj
     # The helper returns a pair; only its first element is used here.
     approx_attn, _unused = fat.relu_rff_attn(q, k, scaled_proj)
     divergence = fat.kl(attn_dist, approx_attn)
     return divergence.mean()
Exemple #2
0
 def loss(q, k, scale, proj, attn_dist):
     """Return the mean of ``fat.kl`` between ``attn_dist`` and ``fat.relu_rff_attn`` output.

     ``q`` and ``k`` are forwarded unchanged. ``proj`` is wrapped in
     ``jax.lax.stop_gradient`` before being passed on, so no gradient flows
     back into ``proj`` through this loss.

     NOTE(review): ``fat`` is defined outside this snippet. Presumably ``kl``
     is a KL divergence and ``relu_rff_attn`` a random-feature attention
     approximation -- confirm against that module.

     Args:
         q: First argument forwarded to ``fat.relu_rff_attn``.
         k: Second argument forwarded to ``fat.relu_rff_attn``.
         scale: Accepted but not read by this variant.
         proj: Projection, passed through ``jax.lax.stop_gradient``.
         attn_dist: Reference value passed as the first argument of ``fat.kl``.

     Returns:
         The result of calling ``.mean()`` on the ``fat.kl`` output.
     """
     frozen_proj = jax.lax.stop_gradient(proj)
     # The helper returns a pair; only its first element is used here.
     approx_attn, _unused = fat.relu_rff_attn(q, k, frozen_proj)
     divergence = fat.kl(attn_dist, approx_attn)
     return divergence.mean()
Exemple #3
0
 def loss(q, k, scale, proj, attn_dist):
     """Return the mean of ``fat.kl`` between ``attn_dist`` and ``fat.rff_attn`` output.

     ``q`` and ``k`` are each passed through ``renorm(..., axis=-1)`` first.
     ``proj`` is wrapped in ``jax.lax.stop_gradient``, so no gradient flows
     back into ``proj`` through this loss.

     NOTE(review): ``fat`` and ``renorm`` are defined outside this snippet.
     Presumably ``renorm`` normalises along the last axis, ``rff_attn`` is a
     random-feature attention approximation and ``kl`` a KL divergence --
     confirm against their definitions.

     Args:
         q: Input renormalised along ``axis=-1`` before use.
         k: Input renormalised along ``axis=-1`` before use.
         scale: Accepted but not read by this variant.
         proj: Projection, passed through ``jax.lax.stop_gradient``.
         attn_dist: Reference value passed as the first argument of ``fat.kl``.

     Returns:
         The result of calling ``.mean()`` on the ``fat.kl`` output.
     """
     q_normed = renorm(q, axis=-1)
     k_normed = renorm(k, axis=-1)
     frozen_proj = jax.lax.stop_gradient(proj)
     # The helper returns a pair; only its first element is used here.
     approx_attn, _unused = fat.rff_attn(q_normed, k_normed, frozen_proj)
     divergence = fat.kl(attn_dist, approx_attn)
     return divergence.mean()
Exemple #4
0
 def loss(q, k, scale, proj, attn_dist):
     """Return the mean of ``fat.kl`` between ``attn_dist`` and ``fat.rff_attn`` output.

     ``q`` and ``k`` are each passed through ``renorm(..., axis=-1)`` first;
     ``proj`` is forwarded to ``fat.rff_attn`` as-is.

     NOTE(review): ``fat`` and ``renorm`` are defined outside this snippet.
     Presumably ``renorm`` normalises along the last axis, ``rff_attn`` is a
     random-feature attention approximation and ``kl`` a KL divergence --
     confirm against their definitions.

     Args:
         q: Input renormalised along ``axis=-1`` before use.
         k: Input renormalised along ``axis=-1`` before use.
         scale: Accepted but not read by this variant.
         proj: Projection, forwarded unchanged.
         attn_dist: Reference value passed as the first argument of ``fat.kl``.

     Returns:
         The result of calling ``.mean()`` on the ``fat.kl`` output.
     """
     q_normed = renorm(q, axis=-1)
     k_normed = renorm(k, axis=-1)
     # The helper returns a pair; only its first element is used here.
     approx_attn, _unused = fat.rff_attn(q_normed, k_normed, proj)
     divergence = fat.kl(attn_dist, approx_attn)
     return divergence.mean()
Exemple #5
0
 def loss(q, k, dummy_proj, attn):
     """Return the mean of ``fat.kl`` between ``attn`` and ``softmax(q @ k.T)``.

     NOTE(review): ``fat`` and ``softmax`` are defined outside this snippet.
     Presumably ``kl`` is a KL divergence and ``softmax`` normalises over its
     default axis -- confirm against their definitions.

     Args:
         q: Left operand of the ``q @ k.T`` product.
         k: Operand whose transpose ``k.T`` is the right-hand factor.
         dummy_proj: Accepted but not read by this function.
         attn: Reference value passed as the first argument of ``fat.kl``.

     Returns:
         The result of calling ``.mean()`` on the ``fat.kl`` output.
     """
     exact_attn = softmax(q @ k.T)
     divergence = fat.kl(attn, exact_attn)
     return divergence.mean()