def forward(self, queries: np.ndarray, key_values: np.ndarray, heads: int,
            lengths: Optional[np.ndarray] = None, bias: Optional[np.ndarray] = None):
    # (n*h, lq, lk)
    logits = npx.interleaved_matmul_encdec_qk(queries, key_values, heads=heads)

    if bias is not None:
        logits = logits + bias

    if lengths is not None:
        # required shape for lengths: (n*h, lq); required dtype: int32
        probs = npx.softmax(logits, axis=-1, length=lengths, use_length=True)
    else:
        probs = npx.softmax(logits, axis=-1)

    probs = npx.dropout(probs, p=self.dropout) if self.dropout > 0.0 else probs

    # key_values: (lk, n, dv * 2)
    # probs: (n*h, lq, lk)
    # result: (n, lq, dv)
    return npx.interleaved_matmul_encdec_valatt(key_values, probs, heads=heads)
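# A minimal shape sketch for the interleaved attention ops used above, with
# illustrative sizes (lq=5, lk=7, batch n=2, heads h=4, per-head dim 16);
# assumes MXNet's numpy interface and the layouts noted in the comments:
# queries (lq, n, h*16), key_values (lk, n, h*16*2) with the key and value
# projections interleaved per head.
from mxnet import np, npx
npx.set_np()

queries = np.random.uniform(size=(5, 2, 4 * 16))
key_values = np.random.uniform(size=(7, 2, 4 * 16 * 2))
logits = npx.interleaved_matmul_encdec_qk(queries, key_values, heads=4)
print(logits.shape)  # (n*h, lq, lk) == (8, 5, 7)
probs = npx.softmax(logits, axis=-1)
result = npx.interleaved_matmul_encdec_valatt(key_values, probs, heads=4)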
def decode_step(self,
                step_input: np.ndarray,
                states: List[np.ndarray],
                vocab_slice_ids: Optional[np.ndarray] = None):
    outputs = []  # type: List[np.ndarray]
    new_states = []  # type: List[np.ndarray]
    factor_outputs = []  # type: List[List[np.ndarray]]
    state_index = 0
    for model, model_state_structure in zip(self._models, self.state_structure()):
        model_states = states[state_index:state_index + len(model_state_structure)]
        state_index += len(model_state_structure)
        logits, model_states, target_factor_outputs = model.decode_step(step_input, model_states, vocab_slice_ids)
        probs = npx.softmax(logits, axis=-1, temperature=self._softmax_temperature)
        outputs.append(probs)
        target_factor_probs = [npx.softmax(tfo, axis=-1) for tfo in target_factor_outputs]
        factor_outputs.append(target_factor_probs)
        new_states += model_states
    scores = self._interpolation(outputs)

    target_factors = None  # type: Optional[np.ndarray]
    if factor_outputs:
        # target factors are greedily 'decoded'.
        factor_predictions = [npx.cast(np.expand_dims(np.argmin(self._interpolation(fs), axis=-1), axis=1),
                                       dtype='int32')
                              for fs in zip(*factor_outputs)]
        if factor_predictions:
            target_factors = factor_predictions[0] if len(factor_predictions) == 1 \
                else np.concatenate(factor_predictions, axis=1)
    return scores, new_states, target_factors
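# A hedged sketch of what self._interpolation may compute (a Sockeye-style
# linear interpolation is assumed here): average the per-model probabilities
# and negate the log, so that lower scores are better and the np.argmin above
# performs the greedy pick over the interpolated factor probabilities.
def linear_interpolation(predictions):
    # predictions: list of per-model probability arrays with identical shapes
    return -np.log(sum(predictions) / len(predictions))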
def masked_softmax(X, valid_len):
    """Perform softmax while masking out padded positions beyond each valid length."""
    # X: 3-D tensor, valid_len: 1-D or 2-D tensor giving the number of valid
    # (non-padded) entries per sequence. Masking is necessary so that padding
    # tokens receive zero attention weight instead of distorting the softmax.
    if valid_len is None:
        return npx.softmax(X)
    else:
        shape = X.shape
        if valid_len.ndim == 1:
            valid_len = valid_len.repeat(shape[1], axis=0)
        else:
            valid_len = valid_len.reshape(-1)
        # Fill masked elements with a large negative value, whose exp is 0
        X = npx.sequence_mask(X.reshape(-1, shape[-1]), valid_len, True,
                              axis=1, value=-1e6)
        return npx.softmax(X).reshape(shape)
def masked_softmax(att_score, mask, axis: int = -1, temperature=None):
    """Ignore the masked elements when calculating the softmax.

    The mask can be broadcastable.

    Parameters
    ----------
    att_score : Symbol or NDArray
        Shape (..., length, ...)
    mask : Symbol or NDArray or None
        Shape (..., length, ...)
        1 --> The element is not masked
        0 --> The element is masked
    axis
        The axis along which to calculate the softmax. att_score.shape[axis] must
        be the same as mask.shape[axis]
    temperature
        The temperature. It scales down the scores before applying the softmax.

    Returns
    -------
    att_weights : Symbol or NDArray
        Shape (..., length, ...)
    """
    if mask is None:
        return npx.softmax(att_score, axis=axis, temperature=temperature)
    else:
        return npx.masked_softmax(att_score, mask=mask.astype(bool),
                                  axis=axis, temperature=temperature)
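# A small usage sketch, assuming MXNet's numpy interface; a 0/1 mask
# broadcasts over the scores and masked positions get zero attention weight.
from mxnet import np, npx
npx.set_np()

att_score = np.random.uniform(size=(2, 3, 4))
mask = np.array([[[1, 1, 1, 0]]])          # broadcasts over batch and rows
weights = masked_softmax(att_score, mask)  # last column is zeroed out
print(weights.sum(axis=-1))                # each unmasked row sums to 1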
def masked_softmax(X, valid_lens):
    """Perform softmax operation by masking elements on the last axis."""
    # `X`: 3D tensor, `valid_lens`: 1D or 2D tensor
    if valid_lens is None:
        return npx.softmax(X)
    else:
        shape = X.shape
        if valid_lens.ndim == 1:
            valid_lens = valid_lens.repeat(shape[1])
        else:
            valid_lens = valid_lens.reshape(-1)
        # On the last axis, replace masked elements with a very large negative
        # value, whose exponentiation outputs 0
        X = npx.sequence_mask(X.reshape(-1, shape[-1]), valid_lens, True,
                              value=-1e6, axis=1)
        return npx.softmax(X).reshape(shape)
def masked_softmax(X, valid_len):
    """Perform softmax by filtering out some elements."""
    # X: 3-D tensor, valid_len: 1-D or 2-D tensor
    if valid_len is None:
        return npx.softmax(X)
    else:
        shape = X.shape
        if valid_len.ndim == 1:
            valid_len = valid_len.repeat(shape[1], axis=0)
        else:
            valid_len = valid_len.reshape(-1)
        # Fill masked elements with a large negative value, whose exp is 0
        X = npx.sequence_mask(X.reshape(-1, shape[-1]), valid_len, True,
                              axis=1, value=-1e6)
        return npx.softmax(X).reshape(shape)
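# A quick usage sketch covering the three valid-length masked_softmax variants
# above (they share the same behavior); assumes MXNet's numpy interface.
from mxnet import np, npx
npx.set_np()

X = np.random.uniform(size=(2, 2, 4))
# The first batch item attends to 2 positions, the second to 3; trailing
# positions receive (near-)zero weight and each row still sums to 1.
print(masked_softmax(X, np.array([2, 3])))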
def test_softmax():
    input_data = np.ones((SMALL_Y, LARGE_X))
    for axis in [0, 1]:
        true_output = np.full((SMALL_Y, LARGE_X), (1 / input_data.shape[axis]))
        output = npx.softmax(input_data, axis=axis)
        assert_almost_equal(output.asnumpy(), true_output, rtol=1e-5, atol=1e-5)
def gumbel_softmax(logits, temperature: float = 1.0, eps: float = 1E-10,
                   hard=True, use_np_gumbel: bool = True):
    r"""Perform the gumbel-softmax trick to generate differentiable one-hot
    vectors from the input logits.

    Here, the gumbel distribution is

    Gumbel(\alpha) = -\log(-\log U) + \log \alpha, in which U is the uniform(0, 1) distribution.

    A nice property of Gumbel is:

    \argmax({Gumbel(\alpha_i)}) \sim multinomial(\alpha_i)

    The Gumbel-Softmax trick is to use the softmax + straight-through estimator
    to produce one-hot vectors that represent the sampling result.

    References:

        1. https://en.wikipedia.org/wiki/Gumbel_distribution
        2. [ICLR2017] Categorical Reparameterization with Gumbel-Softmax

    Parameters
    ----------
    logits
        Logits. Shape (..., V)
    temperature
        The temperature that controls the sharpness of the output distribution:
        lower values produce samples closer to one-hot vectors.
    eps
        The eps for stability of gradient
    hard
        Whether to use the straight-through estimator to produce one-hot vectors.
    use_np_gumbel
        Whether to use the random.gumbel operator

    Returns
    -------
    ret
        The returned output. Shape (..., V)
    """
    # TODO(sxjscience) Investigate the impact of random.gumbel:
    # Actually, random.gumbel has no eps and may have problems in calculating the gradient.
    if use_np_gumbel:
        gumbels = np.random.gumbel(np.zeros_like(logits))
    else:
        u = np.random.uniform(np.zeros_like(logits), 1)
        gumbels = -np.log(-np.log(u + eps) + eps)
    y = npx.softmax((gumbels + logits) / temperature, axis=-1)
    if hard:
        y_hard = np.max(y, axis=-1, keepdims=True) == y
        y_hard = npx.stop_gradient(y_hard - y) + y
        return y_hard
    else:
        return y
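# A minimal usage sketch of gumbel_softmax, assuming MXNet's numpy interface;
# with hard=True the forward pass yields an exact one-hot vector while
# gradients flow through the underlying softmax (straight-through estimator).
from mxnet import np, npx
npx.set_np()

logits = np.array([[1.0, 2.0, 3.0, 0.5]])
sample = gumbel_softmax(logits, temperature=0.5, hard=True)
print(sample)               # e.g. [[0., 0., 1., 0.]] -- stochastic across calls
print(sample.sum(axis=-1))  # rows sum to 1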
def forward(self, A, B):
    # Shape of `A`/`B`: (`batch_size`, no. of words in sequence A/B,
    # `embed_size`)
    # Shape of `f_A`/`f_B`: (`batch_size`, no. of words in sequence A/B,
    # `num_hiddens`)
    f_A = self.f(A)
    f_B = self.f(B)
    # Shape of `e`: (`batch_size`, no. of words in sequence A,
    # no. of words in sequence B)
    e = npx.batch_dot(f_A, f_B, transpose_b=True)
    # Shape of `beta`: (`batch_size`, no. of words in sequence A,
    # `embed_size`), where sequence B is softly aligned with each word
    # (axis 1 of `beta`) in sequence A
    beta = npx.batch_dot(npx.softmax(e), B)
    # Shape of `alpha`: (`batch_size`, no. of words in sequence B,
    # `embed_size`), where sequence A is softly aligned with each word
    # (axis 1 of `alpha`) in sequence B
    alpha = npx.batch_dot(npx.softmax(e.transpose(0, 2, 1)), A)
    return beta, alpha
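# A shape-only sketch of the soft alignment above, with hypothetical sizes
# (batch=2, len_A=3, len_B=4, embed_size=6) and the identity in place of
# the learned MLP self.f:
from mxnet import np, npx
npx.set_np()

A = np.random.uniform(size=(2, 3, 6))
B = np.random.uniform(size=(2, 4, 6))
e = npx.batch_dot(A, B, transpose_b=True)                     # (2, 3, 4)
beta = npx.batch_dot(npx.softmax(e), B)                       # (2, 3, 6): B aligned to A
alpha = npx.batch_dot(npx.softmax(e.transpose(0, 2, 1)), A)   # (2, 4, 6): A aligned to B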
def masked_softmax(att_score, mask, dtype=np.float32, axis: int = -1):
    """Ignore the masked elements when calculating the softmax.

    The mask can be broadcastable.

    Parameters
    ----------
    att_score : Symbol or NDArray
        Shape (..., length, ...)
    mask : Symbol or NDArray or None
        Shape (..., length, ...)
        1 --> The element is not masked
        0 --> The element is masked
    dtype
        data type
    axis
        The axis along which to calculate the softmax. att_score.shape[axis] must
        be the same as mask.shape[axis]

    Returns
    -------
    att_weights : Symbol or NDArray
        Shape (..., length, ...)
    """
    if mask is not None:
        # Fill in the masked scores with a very small value
        neg = -1e18
        if _np.dtype(dtype) == np.float16:
            neg = -1e4
        else:
            try:
                # if AMP (automatic mixed precision) is enabled, -1e18 will cause NaN.
                from mxnet import amp
                if amp.amp._amp_initialized:
                    neg = -1e4
            except ImportError:
                pass
        att_score = np.where(mask, att_score, neg)
        logits = npx.softmax(att_score, axis=axis) * mask
    else:
        logits = npx.softmax(att_score, axis=axis)
    return logits
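# A brief note on the `* mask` step above: multiplying the softmax output by
# the mask guarantees exact zeros at masked positions, and a row that is
# masked everywhere yields all zeros rather than a uniform distribution.
# Sketch assuming MXNet's numpy interface:
from mxnet import np, npx
import numpy as _np
npx.set_np()

scores = np.random.uniform(size=(1, 2, 4))
mask = np.array([[[1, 1, 0, 0], [0, 0, 0, 0]]])
print(masked_softmax(scores, mask))  # second row comes out all zeros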
def predict(X):
    anchors, cls_preds, bbox_preds = net(X.as_in_context(ctx[0]))
    cls_probs = npx.softmax(cls_preds).transpose(0, 2, 1)
    output = npx.multibox_detection(cls_probs, bbox_preds, anchors)
    idx = [i for i, row in enumerate(output[0]) if row[0] != -1]
    return output[0, idx]
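# For reference: each row of `output[0]` returned by npx.multibox_detection
# has the layout [class_id, confidence, xmin, ymin, xmax, ymax]; class_id is
# -1 for boxes dropped as background or removed by non-maximum suppression,
# which is what the `row[0] != -1` filter above discards.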
"""## Classifying the Testing Set and Submitting Results on Kaggle After obtaining a satisfactory model design and hyperparameters, we use all training datasets (including validation sets) to retrain the model and then classify the testing set. Note that predictions are made by the output network we just trained. """ net = get_net(devices) net.hybridize() train(net, train_valid_iter, None, num_epochs, lr, wd, devices, lr_period, lr_decay) preds = [] for data, label in test_iter: output_features = net.features(data.as_in_ctx(devices[0])) output = npx.softmax(net.output_new(output_features)) preds.extend(output.asnumpy()) ids = sorted(os.listdir( os.path.join(data_dir, 'train_valid_test', 'test', 'unknown'))) with open('submission.csv', 'w') as f: f.write('id,' + ','.join(train_valid_ds.synsets) + '\n') for i, output in zip(ids, preds): f.write(i.split('.')[0] + ',' + ','.join( [str(num) for num in output]) + '\n') """After executing the above code, we will generate a "submission.csv" file. The format of this file is consistent with the Kaggle competition requirements. The method for submitting results is similar to method in :numref:`sec_kaggle_house`.
def forward(self, x):
    x = self.dense(x)
    probs = self.action_pred(x)
    values = self.value_pred(x)
    return npx.softmax(probs), values
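# A hedged sketch of the surrounding actor-critic module implied by forward()
# above; the layer names (dense, action_pred, value_pred) come from the source,
# while the layer sizes and activation are illustrative assumptions.
from mxnet.gluon import nn

class ActorCritic(nn.Block):
    def __init__(self, num_actions, num_hiddens=128):
        super().__init__()
        self.dense = nn.Dense(num_hiddens, activation='relu')  # shared trunk
        self.action_pred = nn.Dense(num_actions)               # policy logits
        self.value_pred = nn.Dense(1)                          # state value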