def arch_weights(self):
    """Get softmax-normalized weights of alphas."""
    self.alphas_normal = self.get_weights('alphas_normal')
    self.alphas_reduce = self.get_weights('alphas_reduce')
    # Stack the per-edge alpha rows, then normalize each row over the
    # candidate operations.
    alphas_normal = ops.softmax(torch.stack(self.alphas_normal, dim=0), -1)
    alphas_reduce = ops.softmax(torch.stack(self.alphas_reduce, dim=0), -1)
    return [ops.to_numpy(alphas_normal), ops.to_numpy(alphas_reduce)]
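# Hedged sketch: how the alpha normalization above behaves, using plain
# PyTorch in place of the `ops` wrappers (assumed thin aliases). The shapes
# (14 edges x 8 candidate ops, as in DARTS) are illustrative assumptions.
import torch
import torch.nn.functional as F

alphas_normal = [torch.randn(8, requires_grad=True) for _ in range(14)]
stacked = torch.stack(alphas_normal, dim=0)      # (14, 8)
weights = F.softmax(stacked, dim=-1)             # each row now sums to 1
assert torch.allclose(weights.sum(-1), torch.ones(14))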
def call(self, hidden_states, attention_mask):
    """Compute multi-head self-attention over the input hidden states."""
    mixed_query_layer = self.query(hidden_states)
    mixed_key_layer = self.key(hidden_states)
    mixed_value_layer = self.value(hidden_states)
    query_layer = self._transpose_for_scores(mixed_query_layer)
    key_layer = self._transpose_for_scores(mixed_key_layer)
    value_layer = self._transpose_for_scores(mixed_value_layer)
    # Take the dot product between "query" and "key" to get the raw attention scores.
    attention_scores = ops.matmul(query_layer, key_layer.transpose(-1, -2))
    attention_scores = attention_scores / math.sqrt(self.attention_head_size)
    # Apply the attention mask (precomputed for all layers in the BertModel forward() function).
    attention_scores = attention_scores + attention_mask
    # Normalize the attention scores to probabilities.
    attention_probs = ops.softmax(attention_scores, dim=-1)
    # This is actually dropping out entire tokens to attend to, which might
    # seem a bit unusual, but is taken from the original Transformer paper.
    attention_probs = self.dropout(attention_probs)
    context_layer = ops.matmul(attention_probs, value_layer)
    # Merge the attention heads back into a single hidden dimension.
    context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
    new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
    context_layer = context_layer.view(*new_context_layer_shape)
    return context_layer
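# Hedged sketch: the core scaled dot-product attention math from the method
# above in plain PyTorch, without the head split/merge. All shapes and the
# additive-mask convention (0 keeps a position, large negatives hide it) are
# illustrative assumptions.
import math
import torch

batch, heads, seq, head_size = 2, 4, 16, 32
q = torch.randn(batch, heads, seq, head_size)
k = torch.randn(batch, heads, seq, head_size)
v = torch.randn(batch, heads, seq, head_size)
mask = torch.zeros(batch, 1, 1, seq)  # broadcasts over heads and query positions

scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(head_size)
probs = torch.softmax(scores + mask, dim=-1)
context = torch.matmul(probs, v)      # (batch, heads, seq, head_size)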
def call(self, inputs, targets):
    """Compute soft F1 loss.

    :param inputs: predicted logits.
    :param targets: true labels.
    :return: 1 minus the mean soft F1 score.
    """
    y_true = ops.to(ops.one_hot(targets, 2), 'float32')
    y_pred = ops.softmax(inputs, dim=1)
    tp = ops.reduce_sum(y_true * y_pred, dtype='float32')
    # tn = ops.reduce_sum(((1 - y_true) * (1 - y_pred)), dtype='float32')
    fp = ops.reduce_sum(((1 - y_true) * y_pred), dtype='float32')
    fn = ops.reduce_sum((y_true * (1 - y_pred)), dtype='float32')
    precision = tp / (tp + fp + self.epsilon)
    recall = tp / (tp + fn + self.epsilon)
    f1 = 2 * (precision * recall) / (precision + recall + self.epsilon)
    f1 = ops.clamp(f1, min=self.epsilon, max=1 - self.epsilon)
    return 1 - f1.mean()
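# Hedged sketch: the soft F1 computation above on a toy binary batch, with
# plain PyTorch standing in for the `ops` wrappers (assumed thin aliases).
import torch
import torch.nn.functional as F

epsilon = 1e-7
logits = torch.tensor([[2.0, -1.0], [0.5, 1.5], [-1.0, 2.0]])
targets = torch.tensor([0, 1, 1])

y_true = F.one_hot(targets, 2).float()
y_pred = F.softmax(logits, dim=1)
tp = (y_true * y_pred).sum()
fp = ((1 - y_true) * y_pred).sum()
fn = (y_true * (1 - y_pred)).sum()
precision = tp / (tp + fp + epsilon)
recall = tp / (tp + fn + epsilon)
f1 = 2 * precision * recall / (precision + recall + epsilon)
loss = 1 - f1.clamp(epsilon, 1 - epsilon)   # small when predictions match targets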
def call(self, inputs, targets):
    """Compute focal loss.

    :param inputs: predicted logits.
    :param targets: true labels.
    :return: reduced focal loss.
    """
    N = inputs.size(0)
    C = inputs.size(1)
    P = ops.softmax(inputs, dim=1)
    # Build a one-hot mask that selects the probability of the target class.
    class_mask = inputs.data.new(N, C).fill_(0)
    ids = targets.view(-1, 1)
    class_mask.scatter_(1, ids.data, 1.)
    if inputs.is_cuda and not self.alpha.is_cuda:
        self.alpha = self.alpha.cuda()
    alpha = self.alpha[ids.data.view(-1)]
    probs = (P * class_mask).sum(1).view(-1, 1)
    log_p = probs.log()
    # Down-weight well-classified examples by the factor (1 - p)^gamma.
    batch_loss = -alpha * (ops.pow((1 - probs), self.gamma)) * log_p
    if self.size_average:
        loss = batch_loss.mean()
    else:
        loss = batch_loss.sum()
    return loss
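# Hedged sketch: the focal-loss weighting above on a toy batch. The `alpha`
# and `gamma` values are illustrative assumptions; plain PyTorch stands in
# for the `ops` wrappers.
import torch
import torch.nn.functional as F

gamma = 2.0
alpha = torch.tensor([0.25, 0.75])
logits = torch.tensor([[3.0, 0.0], [0.2, 0.1]])   # first sample is confident
targets = torch.tensor([0, 1])

P = F.softmax(logits, dim=1)
probs = P.gather(1, targets.view(-1, 1))          # probability of the true class
weight = alpha[targets].view(-1, 1)
batch_loss = -weight * (1 - probs) ** gamma * probs.log()
# The confident sample contributes far less than the uncertain one.
print(batch_loss.squeeze())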
def calc_alphas(self, alphas, dim=-1, **kwargs):
    """Calculate softmax-normalized alphas for a list of alpha tensors."""
    return [ops.softmax(alpha, dim) for alpha in alphas]
def calc_alphas(self, alphas, dim=-1, **kwargs):
    """Calculate softmax-normalized alphas for a single alpha tensor."""
    return ops.softmax(alphas, dim)
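# Hedged sketch: the two calc_alphas variants agree when the list variant's
# rows are stacked into one tensor, since softmax over the last dim of a 2-D
# tensor is exactly row-wise softmax. Plain PyTorch, illustrative shapes.
import torch
import torch.nn.functional as F

alphas_list = [torch.randn(8) for _ in range(4)]
per_row = torch.stack([F.softmax(a, dim=-1) for a in alphas_list])
stacked = F.softmax(torch.stack(alphas_list), dim=-1)
assert torch.allclose(per_row, stacked)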