def forward(self, hidden_states, span_mask, position_ids=None):
    seq_length = span_mask.size(1)
    if position_ids is None:
        position_ids = torch.arange(seq_length, dtype=torch.long, device=span_mask.device)
        position_ids = position_ids.unsqueeze(0).expand_as(span_mask)
    position_embeddings = self.position_embeddings(position_ids)  # bs,sl,hn

    # get span representation
    span_mask = span_mask > -1  # bs,sl

    # forward scan: map every in-span position to the index just left of its span
    fw_idxs = torch.zeros_like(span_mask, dtype=torch.long)  # bs,sl
    for _idx_col in range(1, span_mask.size(1)):
        fw_idxs[:, _idx_col] = torch.where(
            span_mask[:, _idx_col] & ~span_mask[:, _idx_col - 1],
            torch.full_like(fw_idxs[:, _idx_col - 1], _idx_col - 1),
            fw_idxs[:, _idx_col - 1],
        )

    # backward scan: map every in-span position to the index just right of its span
    bw_idxs = torch.full_like(span_mask, span_mask.size(1) - 1, dtype=torch.long)  # bs,sl
    for _idx_col in range(span_mask.size(1) - 2, -1, -1):
        bw_idxs[:, _idx_col] = torch.where(
            span_mask[:, _idx_col] & ~span_mask[:, _idx_col + 1],
            torch.full_like(bw_idxs[:, _idx_col + 1], _idx_col + 1),
            bw_idxs[:, _idx_col + 1],
        )

    # gather the hidden states of the left and right boundary tokens
    fw_idxs = fw_idxs.unsqueeze(-1).expand_as(hidden_states)  # bs,sl,hn
    bw_idxs = bw_idxs.unsqueeze(-1).expand_as(hidden_states)  # bs,sl,hn
    fw_hidden_states = torch.gather(hidden_states, 1, fw_idxs)  # bs,sl,hn
    bw_hidden_states = torch.gather(hidden_states, 1, bw_idxs)  # bs,sl,hn

    # span-boundary representation: left boundary, right boundary, position embedding
    sbo_rep = torch.cat([fw_hidden_states, bw_hidden_states, position_embeddings], dim=-1)
    sbo_rep = sbo_rep * span_mask.to(dtype=sbo_rep.dtype).unsqueeze(-1)  # bs,sl,3*hn

    # two-layer MLP head, then project to the vocabulary
    mid_rep = self.layer_norm1(gelu(self.linear1(sbo_rep)).to(torch.float32))
    pre_logits = self.layer_norm2(gelu(self.linear2(mid_rep)).to(torch.float32))
    logits = self.decoder(pre_logits) + self.bias
    return logits
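# A minimal, self-contained sketch (not part of the original module) illustrating the
# boundary-index computation above on a toy batch. It assumes span_mask holds -1 at
# positions outside any masked span, matching the `span_mask > -1` convention used by
# the forward pass; the in-span values (5, 7) are placeholder target ids.
import torch

span_mask = torch.tensor([[-1, -1, 5, 7, -1, -1]])  # one masked span covering positions 2-3
mask = span_mask > -1                               # bs,sl boolean span indicator

fw_idxs = torch.zeros_like(mask, dtype=torch.long)
for col in range(1, mask.size(1)):
    # at the first position of a span, record the index just left of the span;
    # otherwise carry the previous value forward
    fw_idxs[:, col] = torch.where(
        mask[:, col] & ~mask[:, col - 1],
        torch.full_like(fw_idxs[:, col - 1], col - 1),
        fw_idxs[:, col - 1],
    )

bw_idxs = torch.full_like(mask, mask.size(1) - 1, dtype=torch.long)
for col in range(mask.size(1) - 2, -1, -1):
    # at the last position of a span, record the index just right of the span
    bw_idxs[:, col] = torch.where(
        mask[:, col] & ~mask[:, col + 1],
        torch.full_like(bw_idxs[:, col + 1], col + 1),
        bw_idxs[:, col + 1],
    )

print(fw_idxs)  # tensor([[0, 0, 1, 1, 1, 1]]) -> left boundary (index 1) for span positions 2-3
print(bw_idxs)  # tensor([[4, 4, 4, 4, 5, 5]]) -> right boundary (index 4) for span positions 2-3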
def forward(self, features, **kwargs):
    x = self.dense(features)
    x = gelu(x)
    x = self.dropout(x)
    x = self.layer_norm(x)
    x = self.decoder(x)
    return x
def forward(self, x):
    x = self.dense(x)
    x = gelu(x)
    x = self.layer_norm(x)
    x = self.decoder(x)
    return x
def forward(self, features, weight, **kwargs):
    x = self.dense(features)
    x = gelu(x)
    # x = self.dropout(x)
    x = self.layer_norm(x)
    # project with the externally supplied weight matrix (transposed)
    x = x.matmul(weight.t())
    return x
def forward(self, x):
    for i, layer in enumerate(self.layers):
        x = layer(self.drop_out(x))
        if i < len(self.layers) - 1:
            # apply activation (and optional normalization) between layers
            x = gelu(x)
            if len(self.norm_layers):
                x = self.norm_layers[i](x)
    return x
def forward(self, features, **kwargs):
    x = self.dense(features)
    x = gelu(x)
    x = self.layer_norm(x)
    # project back to size of vocabulary with bias
    x = self.decoder(x) + self.bias
    return x
def forward(self, x):
    return self.w_2(self.dropout(gelu(self.w_1(x))))