def _test_fused_tril_softmax_mask_scale(
    test_case, seq_length, channel, p, diagonal, tril_scale_value
):
    x = np.random.randn(4, seq_length, channel)

    # The fused kernel is only supported on GPU.
    fused_x_tensor = flow.Tensor(x).to("cuda")
    fused_x_tensor.requires_grad = True
    fused_out = flow._C.fused_scale_tril_softmax_mask_scale(
        fused_x_tensor, p=p, diagonal=diagonal, tril_scale_value=tril_scale_value
    )[0]  # The second output is softmax_y.

    # Reference path: tril -> scale -> softmax -> dropout.
    origin_x_tensor = flow.Tensor(x).to("cuda")
    origin_x_tensor.requires_grad = True
    origin_out = flow.tril(origin_x_tensor, diagonal)
    origin_out = origin_out * tril_scale_value
    origin_out = flow.softmax(origin_out, dim=-1)
    origin_out = flow._C.dropout(origin_out, p=p)

    total_out = fused_out.sum() + origin_out.sum()
    total_out.backward()

    test_case.assertTrue(
        np.allclose(fused_out.numpy(), origin_out.numpy(), atol=1e-4, rtol=1e-4)
    )
    test_case.assertTrue(
        np.allclose(
            fused_x_tensor.grad.numpy(),
            origin_x_tensor.grad.numpy(),
            atol=1e-4,
            rtol=1e-4,
        )
    )
def predict(model, text):
    model.eval()
    logits = model(flow.tensor(text))
    logits = flow.softmax(logits)
    label = flow.argmax(logits)
    return label.numpy(), logits.numpy()
def predict(model, text):
    model.eval()
    text = flow.tensor(text).to("cuda")
    text = text.unsqueeze(0)  # unsqueeze is not in-place; keep the returned tensor
    logits = model(text)
    logits = flow.softmax(logits)
    label = flow.argmax(logits)
    return label.numpy(), logits.numpy()
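# Usage sketch for the CUDA predict() above (added; the ids and the model are
# hypothetical): `text` should be an un-batched sequence of token ids, since
# predict() adds the batch dimension itself with unsqueeze(0).
#
#     token_ids = [101, 2769, 872, 102]  # hypothetical tokenizer output
#     label, probs = predict(model, token_ids)
#     print("label:", int(label), "probs:", probs)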
def forward(self, x):
    need_transpose, permute = _softmax_need_transpose(x, self.dim)
    if need_transpose:
        x = x.transpose(perm=permute)
    x = flow.softmax(x)
    res = flow.log(x)
    if need_transpose:
        res = res.transpose(perm=permute)
    return res
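# Added numerical note (a sketch, not from the original source): composing
# flow.log with flow.softmax as in forward() above can underflow to log(0)
# for large-magnitude logits. The log-sum-exp rewrite below computes the same
# quantity stably, assuming OneFlow's PyTorch-style max/sum/exp/log tensor API.
import oneflow as flow

def stable_log_softmax(x, dim=-1):
    # log softmax(x) = (x - m) - log(sum(exp(x - m))), with m = max(x) along dim
    m = x.max(dim=dim, keepdim=True)[0]
    shifted = x - m
    return shifted - shifted.exp().sum(dim=dim, keepdim=True).log()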
def predict(config) -> None:
    """Predict the emotion of the input audio.

    Args:
        config: configuration items, including ``audio_path`` (str), the
            path of the input audio.
    """
    # utils.play_audio(audio_path)
    if config.feature_method == "o":
        of.get_data(
            config,
            config.audio_path,
            config.predict_feature_path_opensmile,
            train=False,
        )
        test_feature = of.load_feature(
            config, config.predict_feature_path_opensmile, train=False
        )
    elif config.feature_method == "l":
        test_feature = lf.get_data(
            config, config.audio_path, config.predict_feature_path_librosa, train=False
        )

    test_feature = test_feature.reshape(1, test_feature.shape[0], test_feature.shape[1])
    test_feature = flow.tensor(test_feature, dtype=flow.float32, device="cuda")
    n_feats = test_feature.shape[2]

    if config.model == "lstm":
        model = lstm_ser(n_feats, config.rnn_size, len(config.class_labels), 1)
    else:
        model = cnn1d_ser(
            1, config.n_kernels, n_feats, config.hidden_size, len(config.class_labels)
        )
    SER_model = model
    SER_model.to("cuda")

    model_path = os.path.join(config.checkpoint_path, config.checkpoint_name)
    SER_model.load_state_dict(flow.load(model_path))

    # no_grad must be used as a context manager; a bare flow.no_grad() call
    # has no effect on the following forward pass.
    with flow.no_grad():
        logits = SER_model(test_feature)

    result = np.argmax(logits.numpy())
    print("Recognition:", config.class_labels[int(result)])

    result_prob = flow.softmax(logits, dim=1)
    utils.radar(result_prob.numpy().squeeze(), config.class_labels)
def _test_fused_scale_mask_softmax(
    test_case, batch_size, num_heads, seq_length, fill_value, scale_value
):
    x = np.random.randn(batch_size, num_heads, seq_length, seq_length)
    # np.bool was removed in NumPy 1.24; use the builtin bool instead.
    mask = np.random.randint(
        0, 2, size=(batch_size, num_heads, seq_length, seq_length), dtype=bool
    )

    fused_x_tensor = flow.tensor(x).to("cuda")
    fused_mask_tensor = flow.tensor(mask, dtype=flow.bool).to("cuda")
    fused_x_tensor.requires_grad = True
    fused_out = flow._C.fused_scale_mask_softmax(
        fused_x_tensor, fused_mask_tensor, fill_value=fill_value, scale=scale_value
    )

    # Reference path: scale the kept positions, fill the masked ones, softmax.
    origin_x_tensor = flow.tensor(x).to("cuda")
    origin_mask_tensor = flow.tensor(mask, dtype=flow.float32).to("cuda")
    origin_x_tensor.requires_grad = True
    origin_out = flow.mul(
        origin_x_tensor, origin_mask_tensor
    ) * scale_value + fill_value * (1.0 - origin_mask_tensor)
    origin_out = flow.softmax(origin_out, dim=-1)

    total_out = fused_out.sum() + origin_out.sum()
    total_out.backward()

    test_case.assertTrue(
        np.allclose(fused_out.numpy(), origin_out.numpy(), atol=1e-4, rtol=1e-4)
    )
    test_case.assertTrue(
        np.allclose(
            fused_x_tensor.grad.numpy(),
            origin_x_tensor.grad.numpy(),
            atol=1e-4,
            rtol=1e-4,
        )
    )
def _scaled_dot_product_attention(
    q: Tensor,
    k: Tensor,
    v: Tensor,
    attn_mask: Optional[Tensor] = None,
    dropout_p: float = 0.0,
) -> Tuple[Tensor, Tensor]:
    B, Nt, E = q.shape
    q = q / math.sqrt(E)
    # (B, Nt, E) x (B, E, Ns) -> (B, Nt, Ns)
    attn = flow.bmm(q, k.transpose(-2, -1))
    if attn_mask is not None:
        attn += attn_mask
    attn = flow.softmax(attn, dim=-1)
    if dropout_p > 0.0:
        attn = flow.nn.functional.dropout(attn, p=dropout_p)
    # (B, Nt, Ns) x (B, Ns, E) -> (B, Nt, E)
    output = flow.bmm(attn, v)
    return output, attn
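# Added usage sketch for _scaled_dot_product_attention (the shapes follow the
# inline comments above and are otherwise assumptions): q is (B, Nt, E), while
# k and v share the source length Ns.
import oneflow as flow

q = flow.randn(2, 5, 8)
k = flow.randn(2, 7, 8)
v = flow.randn(2, 7, 8)
out, attn = _scaled_dot_product_attention(q, k, v, dropout_p=0.0)
assert out.shape == (2, 5, 8)   # one E-dim vector per query position
assert attn.shape == (2, 5, 7)  # each row sums to 1 after the softmax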
def inference_afqmc(args):
    tokenizer = BertTokenizer.from_pretrained("hfl/chinese-roberta-wwm-ext")
    model = ClueAFQMCCPT(args.pretrain_dir, args.n_classes, args.is_train).to(args.device)

    vec = tokenizer(args.text1, args.text2)
    input_ids = vec["input_ids"]
    attention_mask = vec["attention_mask"]
    input_ids = flow.tensor(input_ids, dtype=flow.int32).reshape(1, -1).to(args.device)
    attention_mask = (
        flow.tensor(attention_mask, dtype=flow.int32).reshape(1, -1).to(args.device)
    )

    model.load_state_dict(flow.load(args.model_load_dir))
    model.eval()
    output = model(input_ids, attention_mask)
    output = flow.softmax(output)
    label = flow.argmax(output)
    print("Softmax output:", output.numpy())
    print("Predict:", label.numpy())
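# Invocation sketch (added; each field mirrors an attribute accessed in
# inference_afqmc above, and every value is a placeholder):
#
#     from types import SimpleNamespace
#     args = SimpleNamespace(
#         pretrain_dir="...", n_classes=2, is_train=False, device="cuda",
#         text1="...", text2="...", model_load_dir="...",
#     )
#     inference_afqmc(args)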
def compute_context(self, values, scores, mask=None):
    """
    Args:
        values: [b, t2, v] or [b, nh, t2, v]
        scores: [b, t1, t2] or [b, nh, t1, t2]
        mask:   [b, t1, t2] or [b, 1/nh, t1, t2]
    """
    assert values.dim() == scores.dim()
    if mask is not None:
        scores = flow.masked_fill(scores, mask == 0, -float("inf"))
    weights = flow.softmax(scores, dim=-1)
    context = flow.matmul(weights, values)
    if context.dim() == 4:
        # Merge the heads back into a single feature dimension.
        b, n, t, v = context.size()
        context = context.transpose(1, 2).reshape(b, t, n * v)
    if self.enable_output_proj:
        context = self.output_proj(context)
    return self.dropout(context), weights
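# Tiny worked sketch (added, not from the original source): filling a score
# with -inf before the softmax drives its attention weight to exactly zero,
# which is why compute_context() masks scores rather than weights.
import oneflow as flow

scores = flow.tensor([[1.0, 2.0, 3.0]])
mask = flow.tensor([[1, 1, 0]])
masked = flow.masked_fill(scores, mask == 0, -float("inf"))
weights = flow.softmax(masked, dim=-1)
# weights.numpy() -> approximately [[0.2689, 0.7311, 0.0]]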
def forward(self, hidden_states, attention_mask):
    # hidden_states: [batch_size * seq, hidden_size]
    query_layer = self.transpose_for_scores(self.query(hidden_states))
    key_layer = self.transpose_for_scores(self.key(hidden_states))
    value_layer = self.transpose_for_scores(self.value(hidden_states))

    attention_scores = flow.matmul(query_layer, key_layer, transpose_b=True)
    attention_scores = attention_scores / math.sqrt(self.attention_head_size)
    attention_scores = attention_scores + attention_mask

    # Normalize the attention scores to probabilities.
    attention_probs = flow.softmax(attention_scores, dim=-1)
    # This is actually dropping out entire tokens to attend to, which might
    # seem a bit unusual, but it is taken from the original Transformer paper.
    attention_probs = self.dropout(attention_probs)

    context_layer = flow.matmul(attention_probs, value_layer)
    context_layer = context_layer.permute(0, 2, 1, 3)
    context_layer = flow.reshape(context_layer, [-1, self.all_head_size])
    return context_layer
def _softmax(self, dim=None):
    return flow.softmax(self, dim=dim)
def forward(self, x):
    x = self.feature_extractor(x)
    x = flow.flatten(x, 1)
    logits = self.classifier(x)
    probs = flow.softmax(logits, dim=1)
    return logits, probs
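# Added usage sketch: a minimal module with the same (logits, probs) contract
# as the forward() above. TinyClassifier and its sizes are hypothetical; the
# point is that callers may take argmax over either output, since softmax is
# monotonic along the class dimension.
import oneflow as flow
import oneflow.nn as nn

class TinyClassifier(nn.Module):
    def __init__(self, n_feats=16, n_classes=3):
        super().__init__()
        self.feature_extractor = nn.Identity()
        self.classifier = nn.Linear(n_feats, n_classes)

    def forward(self, x):
        x = self.feature_extractor(x)
        x = flow.flatten(x, 1)
        logits = self.classifier(x)
        probs = flow.softmax(logits, dim=1)
        return logits, probs

model = TinyClassifier()
logits, probs = model(flow.randn(4, 16))
labels = flow.argmax(probs, dim=1)  # identical to argmax over logits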