def infer(self):
    """-> Model with new fields, autoregressive

    len_tgt : i32 ()      steps to unfold aka t
       pred : i32 (b, t)  prediction, hard

    the source is encoded once, then the target is unfolded greedily
    one position per loop iteration; no dropout is applied (`identity`).

    """
    dropout = identity
    with scope('infer'):
        with scope('encode'):
            w = self.position(self.max_src) + self.emb_src(self.src)
            w = self.encode(w, self.mask_src, dropout) # bds
        with scope('decode'):
            # `cap` bounds the number of decoding steps; self.cap is handed to `placeholder`
            # (presumably as its default value -- confirm against the helper)
            cap = placeholder(tf.int32, (), self.cap)
            # log of the lower-triangular mask, sliced to the current length in `body`
            msk = tf.log(tf.expand_dims(causal_mask(cap), axis= 0)) # 1tt
            pos = self.position(cap) # dt
            # loop state: step counter and the sequence so far, seeded with one `bos` column
            i,q = tf.constant(0), tf.zeros_like(self.src[:,:1]) + self.bos
            def body(i, q):
                # re-decodes the whole prefix `q` on every step and appends one new column
                j = i + 1
                x = pos[:,:j] + self.emb_tgt(q) # bdj <- bj
                x = self.decode(x, msk[:,:j,:j], w, self.mask_src, dropout) # bdj
                # argmax over what emb_tgt returns for the last position;
                # presumably emb_tgt doubles as the output projection -- confirm
                p = tf.expand_dims( # b1
                    tf.argmax( # b
                        self.emb_tgt( # bn
                            tf.squeeze( # bd
                                x[:,:,-1:] # bd1 <- bdj
                                , axis= -1))
                        , axis= -1, output_type= tf.int32)
                    , axis= -1)
                return j, tf.concat((q, p), axis= -1) # bk <- bj, b1
            # continue while under `cap` and not every row has `eos` as its latest symbol
            cond = lambda i, q: ((i < cap) & ~ tf.reduce_all(tf.equal(q[:,-1], self.eos)))
            _, p = tf.while_loop(cond, body, (i, q), back_prop= False, swap_memory= True)
            # drop the leading `bos` column
            pred = p[:,1:]
    return Model(self, len_tgt= cap, pred= pred)
def build(self, x, y, weight):
    """Build the graph: inputs, generator/discriminator losses, AUC metrics, train ops.

    x      : passed to `placeholder` for the f32 (?, dim_x) input
    y      : passed to `placeholder` for the f32 (?,) labels
    weight : multiplier of the reconstruction term in the generator loss

    -> AEGAN carrying step, x, y, gx, the three auc tensors, both
       train ops and both losses.

    """
    def mean_xent(labels, logits):
        # mean sigmoid cross entropy over all elements
        return tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits))

    with scope("x"):
        x = placeholder(tf.float32, [None, self.dim_x], x, "x")
    with scope("y"):
        y = placeholder(tf.float32, [None], y, "y")

    gx = self.gen(x)
    dx = self.dis(x)
    dgx = self.dis(gx)

    with scope("loss"):
        # discriminator: real targets are smoothed to 0.9, generated targets are 0
        d_loss_real = mean_xent(tf.ones_like(dx) * 0.9, dx)
        d_loss_fake = mean_xent(tf.zeros_like(dgx), dgx)
        d_loss = d_loss_real + d_loss_fake

        # generator: bernoulli reconstruction error, epsilon keeps the logs finite
        epsilon = 1e-10
        log_lik = x * tf.log(epsilon + gx) + (1 - x) * tf.log(epsilon + 1 - gx)
        loss_rec = tf.reduce_mean(-tf.reduce_sum(log_lik, axis=1))
        rec_term = weight * loss_rec
        adv_term = mean_xent(tf.ones_like(dgx), dgx)
        g_loss = rec_term + adv_term

    with scope("AUC"):
        # only the update op (second element) of each metric is kept
        auc_dgx = tf.metrics.auc(y, tf.nn.sigmoid(dgx))[1]
        auc_dx = tf.metrics.auc(y, tf.nn.sigmoid(dx))[1]
        auc_gx = tf.metrics.auc(y, tf.reduce_mean((x - gx)**2, axis=1))[1]

    g_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="generator")
    d_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="discriminator")

    with scope('train_step'):
        step = tf.train.get_or_create_global_step()
        # one optimizer instance serves both players; both ops are given `step`
        optimizer = tf.train.AdamOptimizer()
        d_step = optimizer.minimize(d_loss, step, var_list=d_vars)
        g_step = optimizer.minimize(g_loss, step, var_list=g_vars)

    return AEGAN(self,
                 step=step, x=x, y=y, gx=gx,
                 auc_dgx=auc_dgx, auc_gx=auc_gx, auc_dx=auc_dx,
                 g_step=g_step, d_step=d_step,
                 g_loss=g_loss, d_loss=d_loss)
def __call__(self, x, dropout, name= None):
    """Apply the gated convolution stack with a residual connection.

    x       : input tensor
    dropout : callable applied to the output of `post`
    name    : optional scope name, defaults to `self.name`

    """
    with scope(name or self.name):
        h = self.ante(x)
        for gate_conv, feat_conv in zip(self.gate, self.conv):
            # pad only the front of the last axis, by `shape()[0] - 1`
            # (presumably the kernel width minus one -- confirm against Conv)
            front = feat_conv.shape()[0] - 1
            h = tf.pad(h, ((0,0),(0,0),(front,0)))
            switch = tf.sigmoid(gate_conv(h))
            h = switch * feat_conv(h)
        residual = dropout(self.post(h))
        return self.norm(x + residual)
def __init__(self, dim_x, dim_d, dim_btlnk, name='encoder'):
    """Create the layers of the encoder under scope `name`.

    lin, nrm   : Linear(dim_d, dim_x) and Normalize(dim_d)
    l_mu, l_lv : two Linear(dim_btlnk, dim_d) heads named 'mu' and 'lv'

    """
    self.name = name
    with scope(self.name):
        self.lin, self.nrm = Linear(dim_d, dim_x, name='lin'), Normalize(dim_d, name='nrm')
        # both heads share the same dimensions and differ only by name
        self.l_mu, self.l_lv = (Linear(dim_btlnk, dim_d, name=head) for head in ('mu', 'lv'))
def __call__(self, x, name=None):
    """Encode `x` and draw a sample.

    -> (z, mu, lv, hl) where hl is the normalized hidden layer, mu and lv
       are the outputs of the two heads, and z = mu + exp(lv / 2) * noise
       with noise drawn from `tf.random_normal`.

    """
    with scope(name or self.name):
        hidden = self.nrm(tf.nn.elu(self.lin(x)))
        mean = self.l_mu(hidden)
        log_var = self.l_lv(hidden)
        scale = tf.exp(0.5 * log_var)
        noise = tf.random_normal(shape=tf.shape(log_var))
        sample = mean + scale * noise
        return sample, mean, log_var, hidden
def __init__(self, dim_x, dim_d, name='discriminator'):
    """Create the layers of the discriminator under scope `name`.

    lin, nrm   : Linear(dim_d, dim_x) and Normalize(dim_d)
    lin2, nrm2 : Linear(dim_d, dim_d) and Normalize(dim_d)
    lex        : Linear(1, dim_d)

    """
    self.name = name
    with scope(self.name):
        self.lin, self.nrm = Linear(dim_d, dim_x, name='lin'), Normalize(dim_d, name='nrm')
        self.lin2, self.nrm2 = Linear(dim_d, dim_d, name='lin2'), Normalize(dim_d, name='nrm2')
        self.lex = Linear(1, dim_d, name='lex')
def __call__(self, x, name=None):
    """Run `x` through `enc`, then `conv_out`, then `dec` and return the result."""
    with scope(name or self.name):
        encoded = self.enc(x)
        # final layer of the encoding half
        bottleneck = self.conv_out(encoded)
        return self.dec(bottleneck)
def __init__(self, dim, name, mid= 128, depth= 2):
    """Create the layers of the gated convolution block under scope `name`.

    ante       : Conv(mid, dim, size= 1)
    gate, conv : `depth` Conv(mid, mid, size= 2) layers each, named gate1.. and conv1..
    post       : Conv(dim, mid, size= 1)
    norm       : Normalize(dim)

    """
    self.name = name
    with scope(self.name):
        layer_ids = range(1, depth + 1)
        self.ante = Conv(mid, dim, size= 1, name= 'ante')
        # all gates are created before any of the convs
        self.gate = tuple(Conv(mid, mid, size= 2, name= "gate{}".format(k)) for k in layer_ids)
        self.conv = tuple(Conv(mid, mid, size= 2, name= "conv{}".format(k)) for k in layer_ids)
        self.post = Conv(dim, mid, size= 1, name= 'post')
        self.norm = Normalize(dim, name= 'norm')
def valid(self, dropout= identity, smooth= None):
    """-> Model with new fields, teacher forcing

       output : f32 (?, dim_tgt)  prediction on logit scale
         prob : f32 (?, dim_tgt)  prediction, soft
         pred : i32 (?,)          prediction, hard
    errt_samp : f32 (?,)          errors
    loss_samp : f32 (?,)          losses
         errt : f32 ()            error rate
         loss : f32 ()            mean loss

    `dropout` is applied to the embeddings and passed on to encode/decode;
    when `smooth` is given it maps the references to soft labels.

    """
    with scope('emb_src_'):
        src_emb = self.position(self.max_src) + dropout(self.emb_src(self.src))
    with scope('emb_tgt_'):
        tgt_emb = self.position(self.max_tgt) + dropout(self.emb_tgt(self.tgt))
    memory = self.encode(src_emb, self.mask_src, dropout, name= 'encode_') # bds
    states = self.decode(tgt_emb, self.mask_tgt, memory, self.mask_src, dropout, name= 'decode_') # bdt
    with scope('logit_'):
        states = tf.transpose(states, (0,2,1)) # btd <- bdt
        states = tf.boolean_mask(states, self.mask) # ?d
        y = self.emb_tgt(states) # ?n
    with scope('prob_'):
        prob = tf.nn.softmax(y, axis= -1)
    with scope('pred_'):
        pred = tf.argmax(y, axis= -1, output_type= tf.int32)
    with scope('errt_'):
        wrong = tf.not_equal(self.true, pred)
        errt_samp = tf.to_float(wrong)
        errt = tf.reduce_mean(errt_samp)
    with scope('loss_'):
        if smooth:
            # dense cross entropy against the smoothed references
            loss_samp = tf.nn.softmax_cross_entropy_with_logits_v2(labels= smooth(self.true), logits= y)
        else:
            loss_samp = tf.nn.sparse_softmax_cross_entropy_with_logits(labels= self.true, logits= y)
        loss = tf.reduce_mean(loss_samp)
    return Model(
        self
        , output= y, prob= prob, pred= pred
        , errt_samp= errt_samp, errt= errt
        , loss_samp= loss_samp, loss= loss)
def __call__(self, x, m, dropout, name= None):
    """Feed `x` through every block in order.

    the first character of a block's name selects the call convention:
    'c' and 'm' blocks take (x, dropout), 's' blocks take (x, x, m, dropout);
    anything else raises TypeError.

    """
    with scope(name or self.name):
        for layer in self.blocks:
            kind = layer.name[0]
            if kind == 's':
                x = layer(x, x, m, dropout)
            elif kind in ('c', 'm'):
                x = layer(x, dropout)
            else:
                raise TypeError('unknown encode block')
    return x
def __init__(self, dim, name):
    """Create three (AttBlock, MlpBlock) pairs named s1, m1, s2, m2, s3, m3 under scope `name`."""
    self.name = name
    with scope(self.name):
        stack = []
        for n in ('1', '2', '3'):
            stack.append(AttBlock(dim, 's' + n))
            stack.append(MlpBlock(dim, 'm' + n))
        self.blocks = tuple(stack)
def __init__(self, dim_x, channel_x, dim_g, extra_layers=0, name='discriminator'):
    """Create an Encoder followed by a single-filter 4x4 'valid' convolution under scope `name`."""
    self.name = name
    with scope(self.name):
        kernel = (4, 4)
        self.enc = Encoder(dim_x, channel_x, dim_g, extra_layers)
        self.conv_out = Conv2D(1, kernel, padding='valid', use_bias=False, name="conv_out")
def __init__(self, dim_x, channel_x, dim_d, extra_layers=0, name="decoder"):
    """Create the transposed-convolution layers of the decoder under scope `name`.

    dim_x        : image size; must be a multiple of 16 that is reached by
                   doubling from 4 (i.e. a power of two)
    channel_x    : number of filters of the final layer `conv_out`
    dim_d        : base filter count; the first layer uses dim_d // 2 doubled
                   once for every doubling from 4 up to dim_x
    extra_layers : number of additional 3x3 stride-1 layers
    name         : scope name

    raises AssertionError when dim_x is not a multiple of 16, and ValueError
    when doubling from 4 cannot reach dim_x (previously an endless loop).

    """
    assert dim_x % 16 == 0, "image size has to be a multiple of 16"
    self.extra_layers = extra_layers
    with scope(name):
        cngf, tisize = dim_d // 2, 4
        # filter count of the first layer: doubles together with the size
        while tisize < dim_x:
            cngf = cngf * 2
            tisize = tisize * 2
        if tisize != dim_x:
            # e.g. 0 or 48 pass the assert but are never hit by 4, 8, 16, ...
            raise ValueError("image size has to be a power of two, got {}".format(dim_x))

        # first layer
        self.conv0 = Conv2DTranspose(cngf, (4, 4), padding='valid', use_bias=False, name="conv0")
        self.bn0 = BatchNormalization(name="bn0")

        # increasing layers: each halves the filter count and doubles the tracked size
        size_now, i = 4, 1
        self.conv, self.bn = {}, {}
        while size_now < dim_x // 2:
            self.conv[i] = Conv2DTranspose(cngf // 2, (4, 4), strides=2, padding='same', use_bias=False, name=f"conv{i}")
            self.bn[i] = BatchNormalization(name=f"bn{i}")
            cngf = cngf // 2
            size_now = size_now * 2
            i += 1

        if self.extra_layers > 0:
            # extra layers keep the current filter count
            # (the original referenced the misspelled name `cnfg` here)
            self.conv_e, self.bn_e = {}, {}
            for j in range(extra_layers):
                self.conv_e[j] = Conv2DTranspose(cngf, (3, 3), padding='same', use_bias=False, name=f"conv_e{j}")
                self.bn_e[j] = BatchNormalization(name=f"bn_e{j}")

        # final layer, expand the size with 2 and set channels to number of channels of x
        self.conv_out = Conv2DTranspose(channel_x, (4, 4), strides=2, padding='same', use_bias=False, name="conv_out")
def data(self, sid, tid, src= None, tgt= None):
    """-> Model with new fields

    position : Sinusoid
        src_ : i32 (b, ?)     source feed, in range `[0, dim_src)`
        tgt_ : i32 (b, ?)     target feed, in range `[0, dim_tgt)`
         src : i32 (b, s)     source with `eos` trimmed among the batch
         tgt : i32 (b, t)     target with `eos` trimmed among the batch, padded with `bos`
        mask : b8  (b, t)     target sequence mask
        true : i32 (?,)       target references
     max_tgt : i32 ()         maximum target length
     max_src : i32 ()         maximum source length
    mask_tgt : f32 (1, t, t)  target attention mask
    mask_src : f32 (b, 1, s)  source attention mask

    `sid` and `tid` index `self.embeds` to select `emb_src` and `emb_tgt`;
    all existing fields of `self` are carried over via `**self`.

    """
    src_ = placeholder(tf.int32, (None, None), src, 'src_')
    tgt_ = placeholder(tf.int32, (None, None), tgt, 'tgt_')
    with scope('src'):
        src, msk, max_src = trim(src_, self.eos)
        # log of the 0/1 mask, with an axis inserted at position 1
        mask_src = tf.log(tf.expand_dims(tf.to_float(msk), axis= 1))
    with scope('tgt'):
        tgt, msk, max_tgt = trim(tgt_, self.eos)
        # one extra leading step: the mask gains a True column in front
        mask = tf.pad(msk, ((0,0),(1,0)), constant_values= True)
        # decoder input: target shifted right by a `bos` column
        btru = tf.pad(tgt, ((0,0),(1,0)), constant_values= self.bos)
        # references: target with an `eos` column appended at the end
        true = tf.pad(tgt, ((0,0),(0,1)), constant_values= self.eos)
        # references are flattened to the positions selected by `mask`
        true, tgt = tf.boolean_mask(true, mask), btru
        max_tgt += 1
        mask_tgt = tf.log(tf.expand_dims(causal_mask(max_tgt), axis= 0))
    return Model(
        position= Sinusoid(self.dim_emb, self.cap)
        , src_= src_, mask_src= mask_src, max_src= max_src, src= src
        , tgt_= tgt_, mask_tgt= mask_tgt, max_tgt= max_tgt, tgt= tgt
        , true= true, mask= mask
        , emb_src = self.embeds[sid]
        , emb_tgt = self.embeds[tid]
        , **self)
def __init__(self, dim_x, channel_x, dim_btlnk, dim_d, dim_g, extra_layers=0, name='generator'):
    """Create Encoder, bottleneck convolution and Decoder under scope `name`.

    enc      : Encoder(dim_x, channel_x, dim_g, extra_layers)
    conv_out : 4x4 'valid' convolution with dim_btlnk filters
    dec      : Decoder(dim_x, channel_x, dim_d, extra_layers)

    """
    self.name = name
    with scope(self.name):
        kernel = (4, 4)
        self.enc = Encoder(dim_x, channel_x, dim_g, extra_layers)
        # resize the layer to channel X 1 X 1
        self.conv_out = Conv2D(dim_btlnk, kernel, padding='valid', use_bias=False, name="conv_out")
        self.dec = Decoder(dim_x, channel_x, dim_d, extra_layers)
def train(self, dropout= 0.1, smooth= 0.1, warmup= 4e3, beta1= 0.9, beta2= 0.98, epsilon= 1e-9):
    """-> Model with new fields, teacher forcing

       step : i64 ()  global update step
         lr : f32 ()  learning rate for the current step
    dropout : the Dropout instance used
     smooth : the Smooth instance used

    along with all the fields from `valid`

    NOTE(review): the optimizer line below is commented out, so no update
    operation `up` is created or returned here, and `beta1`, `beta2` and
    `epsilon` are currently unused -- confirm whether the caller builds
    the update op itself.

    """
    dropout, smooth = Dropout(dropout, (None, self.dim_emb, None)), Smooth(smooth, self.dim_voc)
    # from here on `self` is the Model returned by `valid`
    self = self.valid(dropout= dropout, smooth= smooth)
    with scope('lr'):
        s = tf.train.get_or_create_global_step()
        t = tf.to_float(s + 1)
        # dim_emb^-0.5 * min(t^-0.5, t * warmup^-1.5):
        # grows linearly in t until the two terms meet at t = warmup, then decays as t^-0.5
        lr = (self.dim_emb ** -0.5) * tf.minimum(t ** -0.5, t * (warmup ** -1.5))
    # up = tf.train.AdamOptimizer(lr, beta1, beta2, epsilon).minimize(self.loss, s)
    return Model(self, dropout= dropout, smooth= smooth, step= s, lr= lr)
def __init__(self, dim_x, channel_x, dim_g, extra_layers=0, name="encoder"):
    """Create the convolution layers of the encoder under scope `name`.

    conv0        : 4x4 stride-2 convolution with dim_g filters
    conv_e, bn_e : `extra_layers` 3x3 layers with dim_g filters, only created
                   when extra_layers > 0
    conv, bn     : 4x4 stride-2 layers keyed from 1, each doubling the filter
                   count, added while the tracked size is above 4

    """
    assert dim_x % 16 == 0, "image size has to be a multiple of 16"
    self.extra_layers = extra_layers
    with scope(name):
        # first layer
        self.conv0 = Conv2D(dim_g, (4, 4), strides=2, padding='same', use_bias=False, name="conv0")

        # extra layers
        if self.extra_layers > 0:
            self.conv_e, self.bn_e = {}, {}
            for k in range(self.extra_layers):
                self.conv_e[k] = Conv2D(dim_g, (3, 3), padding='same', use_bias=False, name=f"conv_e{k}")
                self.bn_e[k] = BatchNormalization(name=f"bn_e{k}")

        # decreasing layers: channel increases, size decreases
        self.conv, self.bn = {}, {}
        size, filters, idx = dim_x // 2, dim_g, 1
        while size > 4:
            filters *= 2
            self.conv[idx] = Conv2D(filters, (4, 4), strides=2, padding='same', use_bias=False, name=f"conv{idx}")
            self.bn[idx] = BatchNormalization(name=f"bn{idx}")
            size //= 2
            idx += 1
def build(self, x, y, z):
    """Build the graph: inputs, the shared loss, an AUC metric and one train op.

    x, y, z : passed to `placeholder` for the f32 (?, dim_x) data,
              f32 (?,) labels and f32 (?, dim_z) generator input

    a single loss is used for both players: the discriminator descends it
    while variables whose name starts with "generator" get their gradient
    negated.

    -> GAN carrying step, x, y, z, auc_d, gz and train_step.

    """
    def mean_xent(labels, logits):
        # mean sigmoid cross entropy over all elements
        return tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(labels= labels, logits= logits))

    with scope("x"):
        x = placeholder(tf.float32, [None, self.dim_x], x, "x")
    with scope("y"):
        y = placeholder(tf.float32, [None], y, "y")
    with scope("z"):
        z = placeholder(tf.float32, [None, self.dim_z], z, "z")

    gz = self.gen(z)
    dx = self.dis(x)
    dgz = self.dis(gz)

    with scope("loss"):
        real_term = mean_xent(tf.ones_like(dx), dx)
        fake_term = mean_xent(tf.zeros_like(dgz), dgz)
        loss = real_term + fake_term

    with scope("AUC"):
        # only the update op of the metric is kept
        auc_d = tf.metrics.auc(y, tf.nn.sigmoid(dx))[1]

    with scope("train_step"):
        step = tf.train.get_or_create_global_step()
        optimizer = tf.train.AdamOptimizer()
        signed = []
        for grad, var in optimizer.compute_gradients(loss):
            if var.name.startswith("generator"):
                # the generator ascends the loss the discriminator descends
                grad = -grad
            signed.append((grad, var))
        train_step = optimizer.apply_gradients(signed, step)

    return GAN(self, step=step, x=x, y=y, z=z, auc_d=auc_d, gz=gz, train_step=train_step)
def __init__(self, dim_x, dim_btlnk, name='generator'):
    """Create the layers of the generator under scope `name`.

    lin : Linear(dim_btlnk, dim_x)
    nrm : Normalize(dim_btlnk)
    lex : Linear(dim_x, dim_btlnk)

    """
    self.name = name
    with scope(self.name):
        self.lin, self.nrm = Linear(dim_btlnk, dim_x, name='lin'), Normalize(dim_btlnk, name='nrm')
        self.lex = Linear(dim_x, dim_btlnk, name='lex')
def __call__(self, x, name=None):
    """Two leaky-relu + normalize layers, then `lex` clipped to [0, 1]."""
    with scope(name or self.name):
        first = self.nrm(tf.nn.leaky_relu(self.lin(x)))
        second = self.nrm2(tf.nn.leaky_relu(self.lin2(first)))
        out = self.lex(second)
        return tf.clip_by_value(out, 0.0, 1.0)
def __init__(self, dim, name):
    """Create `lin` (Conv(4*dim, dim)), `lex` (Conv(dim, 4*dim)) and `norm` under scope `name`."""
    self.name = name
    with scope(self.name):
        wide = 4*dim
        self.lin = Conv(wide, dim, name= 'lin')
        self.lex = Conv(dim, wide, name= 'lex')
        self.norm = Normalize(dim)
def __call__(self, x, name=None):
    """relu(lin(x)) -> nrm -> lex -> sigmoid."""
    with scope(name or self.name):
        hidden = self.nrm(tf.nn.relu(self.lin(x)))
        out = self.lex(hidden)
        # sigmoid is used to squash the output; an earlier variant clipped to [0, 1] instead
        return tf.nn.sigmoid(out)
def build(self, x, y, lr_max, mult):
    """Build the graph: inputs, balanced losses and learning rates, AUC metrics, train ops.

    x, y   : passed to `placeholder` for the f32 (?, dim_x) data and f32 (?,) labels
    lr_max : both learning rates are a fraction of this and sum to it
    mult   : forwarded to the `sigmoid` helper as `mult`

    -> DAE carrying step, x, y, gx, dgx, dx, the three auc tensors,
       both losses and both train ops.

    """
    with tf.variable_scope("x"):
        x = placeholder(tf.float32, [None, self.dim_x], x, "x")
    with tf.variable_scope("y"):
        y = placeholder(tf.float32, [None], y, "y")
    gx = self.gen(x)
    dx, dgx = self.dis(x), self.dis(gx)
    with tf.variable_scope("loss"):
        # mean absolute differences:
        # a: x vs dis(x), b: gen(x) vs dis(gen(x)), c: x vs gen(x)
        a = tf.reduce_mean(tf.abs(x - dx))
        b = tf.reduce_mean(tf.abs(gx - dgx))
        c = tf.reduce_mean(tf.abs(x - gx))
        d_vs_g = a - (b + c) / 2  # for balancing the learning rate
        # lr_d and lr_g are complementary fractions of lr_max
        lr_d = sigmoid(d_vs_g, mult=mult)
        lr_g = (tf.constant(1.0) - lr_d) * lr_max
        lr_d = lr_d * lr_max
        # balance parameter for discriminator caring more about autoencoding real, or discriminating fake
        sigma = 0.5
        # weight of the fake term in the discriminator loss, clamped to [0, 0.9]
        # NOTE(review): the original comment said this is held "at less than half",
        # but the clamp below allows up to 0.9 -- confirm which is intended
        w_fake = tf.clip_by_value(
            sigmoid(b * sigma - a, shift=0., mult=mult), 0., 0.9
        )
        d_loss = a - b * w_fake
        # weights for generator: wg_fake and wg_reconstruct sum to one
        wg_fake = tf.clip_by_value(sigmoid(b - c, shift=0., mult=mult), 0., 1.0)
        wg_reconstruct = 1 - wg_fake
        g_loss = b * wg_fake + c * wg_reconstruct
    with tf.variable_scope("AUC"):
        # scores are per-row mean squared differences; only the update ops are kept
        _, auc_dgx = tf.metrics.auc(y, tf.reduce_mean((x - dgx)**2, axis=1))
        _, auc_dx = tf.metrics.auc(y, tf.reduce_mean((x - dx)**2, axis=1))
        _, auc_gx = tf.metrics.auc(y, tf.reduce_mean((x - gx)**2, axis=1))
    with scope('down'):
        g_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="generator")
        d_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="discriminator")
        step = tf.train.get_or_create_global_step()
        # separate optimizers with their own learning rates; both ops are given `step`
        d_step = tf.train.AdamOptimizer(lr_d).minimize(d_loss, step, var_list=d_vars)
        g_step = tf.train.AdamOptimizer(lr_g).minimize(g_loss, step, var_list=g_vars)
    return DAE(self, step=step, x=x, y=y, gx=gx, dgx=dgx, dx=dx, auc_dgx=auc_dgx, auc_gx=auc_gx, auc_dx=auc_dx, g_loss=g_loss, d_loss=d_loss, d_step=d_step, g_step=g_step)
def build(self, x, y, context_weight, loss, lam=0., weight_type="normal"):
    """Build the multi-discriminator graph: inputs, losses, AUC metric, train ops.

    x, y           : passed to `placeholder` for the f32 (?, ?, ?, channel_x)
                     data and f32 (?,) labels
    context_weight : multiplier of the reconstruction term in the generator loss
    loss           : how the per-discriminator generator losses are combined:
                     "mean", "max", or any string containing "softmax";
                     "softmax_self_challenged" additionally trains lambda
    lam            : passed to `placeholder` for lambda, only used by the softmax variants
    weight_type    : 'log' weights by loss ** lambda, anything else by exp(lambda * loss)

    -> MG_GAN carrying lam, step, x, y, gx, auc_gx, g_step, d_step (a list,
       one op per discriminator), the generator loss terms, g_loss, d_loss
       (a list), d_loss_mean and d_max.

    raises ValueError for an unknown `loss`.

    """
    if loss not in ("mean", "max") and "softmax" not in loss:
        # previously surfaced as a NameError on `g_loss` after half the graph was built
        raise ValueError("unknown loss: {!r}".format(loss))

    with scope("x"):
        x = placeholder(tf.float32, [None, None, None, self.channel_x], x, "x")
    with scope("y"):
        y = placeholder(tf.float32, [None], y, "y")

    gx = self.gen(x)
    dx = {k: v(x) for k, v in self.dis.items()}
    dgx = {k: v(gx) for k, v in self.dis.items()}

    with scope("loss"):
        # one loss per discriminator, in the iteration order of `dx`;
        # real targets are smoothed to 0.9, generated targets are 0
        d_loss = [tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(
                labels=tf.ones_like(dx[k])*0.9, logits=dx[k])) \
                  + \
                  tf.reduce_mean(
                      tf.nn.sigmoid_cross_entropy_with_logits(
                          labels=tf.zeros_like(dgx[k]), logits=dgx[k]))
                  for k in dx.keys()]

        # bernoulli reconstruction error, epsilon keeps the logs finite
        epsilon = 1e-10
        loss_rec = tf.reduce_mean(
            -tf.reduce_sum(x * tf.log(epsilon + gx) + (1 - x) * tf.log(epsilon + 1 - gx), axis=1))

        # generator loss against each discriminator
        loss_g_fake = [
            tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(
                    labels=tf.ones_like(dgx_), logits=dgx_))
            for dgx_ in dgx.values()
        ]

        lam = placeholder(tf.float32, None, lam, "lam")  # only for softmax, otherwise dummy
        with scope("lambda"):
            if loss == "softmax_self_challenged":
                # lambda is learnt; softplus keeps the value that is used positive
                trained_l = tf.Variable(initial_value=-2., name='controlled_lambda')
                used_lam = tf.nn.softplus(trained_l, name='used_lambda')
            else:
                used_lam = lam

        if loss == "mean":
            gl_adv = tf.reduce_mean(loss_g_fake)
            g_loss = context_weight * loss_rec + gl_adv
        elif loss == "max":
            # max picks biggest loss = best discriminators feedback is used
            gl_adv = tf.reduce_max(loss_g_fake)
            g_loss = context_weight * loss_rec + gl_adv
        else:
            # any loss name containing "softmax"
            # NOTE(review): `used_lam` is presumably a tensor here, in which case
            # this python comparison does not test its value -- confirm; both
            # weight formulas below reduce to ones for lambda == 0 anyway
            if used_lam == 0.:
                weights = tf.ones_like(loss_g_fake)
            else:
                if weight_type == 'log':
                    weights = tf.pow(loss_g_fake, used_lam)
                else:
                    weights = tf.exp(used_lam * loss_g_fake)
            gl_adv = weighted_arithmetic(weights, loss_g_fake)
            if loss == "softmax":
                g_loss = context_weight * loss_rec + gl_adv
            else:
                # every other softmax variant subtracts a small multiple of lambda
                g_loss = context_weight * loss_rec + gl_adv - 0.001 * used_lam

    with scope("AUC"):
        # score is the mean squared reconstruction error; only the update op is kept
        _, auc_gx = tf.metrics.auc(
            y, tf.reduce_mean((x - gx)**2, axis=(1, 2, 3)))

    g_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="generator")
    if loss == "softmax_self_challenged":
        # the learnt lambda is updated together with the generator
        lambda_var = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="loss/lambda")
        g_vars.extend(lambda_var)
    d_vars = {
        k: tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=f"discriminator_{k}")
        for k in dx
    }

    with scope('train_step'):
        step = tf.train.get_or_create_global_step()
        optimizer = tf.train.AdamOptimizer()
        # pair each loss with the variables of its own discriminator by key;
        # `d_loss` was built in the iteration order of `dx`, so zip lines them up
        # (indexing `d_vars` by position only worked for keys 0..n-1 in order)
        d_step = [
            optimizer.minimize(d_loss_k, var_list=d_vars[k])
            for k, d_loss_k in zip(dx, d_loss)
        ]
        # only the generator step is given the global step
        g_step = optimizer.minimize(g_loss, step, var_list=g_vars)

    return MG_GAN(
        self,
        lam=used_lam,
        step=step,
        x=x,
        y=y,
        gx=gx,
        auc_gx=auc_gx,
        g_step=g_step,
        d_step=d_step,
        gl_rec=context_weight * loss_rec,
        gl_lam=0.001 * used_lam,
        gl_adv=gl_adv,
        g_loss=g_loss,
        d_loss=d_loss,
        d_loss_mean=tf.reduce_mean(d_loss),
        d_max=tf.argmax(d_loss))
def __call__(self, x, name=None):
    """Return sigmoid(conv_out(enc(x)))."""
    with scope(name or self.name):
        features = self.enc(x)
        score = self.conv_out(features)
        return tf.nn.sigmoid(score)
def causal_mask(t, name= 'causal_mask'):
    """returns the causal mask for `t` steps

    a dense (t, t) tensor: the lower triangle (diagonal included) of an
    all-ones matrix.

    """
    with scope(name):
        ones = tf.ones((t, t))
        lower = tf.linalg.LinearOperatorLowerTriangular(ones)
        return lower.to_dense()
def __call__(self, x, dropout, name= None):
    """Residual block: norm(x + dropout(lex(relu(lin(x)))))."""
    with scope(name or self.name):
        hidden = tf.nn.relu(self.lin(x))
        update = dropout(self.lex(hidden))
        return self.norm(x + update)
def __call__(self, x, v, m, dropout, name= None):
    """Residual block: norm(x + dropout(att(x, v, m)))."""
    with scope(name or self.name):
        attended = self.att(x, v, m)
        update = dropout(attended)
        return self.norm(x + update)
def __init__(self, dim, name):
    """Create two Attention layers, `latt` and `ratt`, and `norm` under scope `name`."""
    self.name = name
    with scope(self.name):
        self.latt, self.ratt = (Attention(dim, name= side) for side in ('latt', 'ratt'))
        self.norm = Normalize(dim)
def __call__(self, x, v, m, w, n, dropout, name= None):
    """Residual block over two attentions.

    -> norm(dropout(latt(x, v, m)) + x + dropout(ratt(x, w, n)))

    """
    with scope(name or self.name):
        left = dropout(self.latt(x, v, m))
        right = dropout(self.ratt(x, w, n))
        return self.norm(tf.add_n((left, x, right)))