def test_arange(test_case):
    np_out = np.arange(5)
    of_out = flow.arange(0, end=5)
    test_case.assertTrue(np.allclose(of_out.numpy(), np_out))
    np_out2 = np.arange(0, 20, 2)
    of_out2 = flow.arange(0, 20, step=2)
    test_case.assertTrue(np.allclose(of_out2.numpy(), np_out2))
def test_arange_v2(test_case):
    np_out = np.arange(20)
    of_out = flow.arange(start=0, end=20)
    test_case.assertTrue(np.allclose(of_out.numpy(), np_out))
    np_out2 = np.arange(0, 100, 3)
    of_out2 = flow.arange(start=0, end=100, step=3)
    test_case.assertTrue(np.allclose(of_out2.numpy(), np_out2))
def __init__(self, d_model, max_len=5000):
    super(PositionalEncoding, self).__init__()
    # Compute the positional encodings once in log space.
    pe = flow.zeros(max_len, d_model, requires_grad=False)
    position = flow.arange(0, max_len).unsqueeze(1).to(dtype=flow.float32)
    div_term = flow.exp(
        flow.arange(0, d_model, 2).to(dtype=flow.float32)
        * -(math.log(10000.0) / d_model)
    )
    pe[:, 0::2] = flow.sin(position * div_term)
    pe[:, 1::2] = flow.cos(position * div_term)
    pe = pe.unsqueeze(0)
    self.register_buffer("pe", pe)
def __init__(self, d_model, dropout=0.1, max_len=5000):
    super(PositionalEncoding, self).__init__()
    self.dropout = nn.Dropout(p=dropout)
    pe = flow.zeros((max_len, d_model))
    position = flow.arange(0, max_len, dtype=flow.float).unsqueeze(1)
    div_term = flow.exp(
        flow.arange(0, d_model, 2).to(flow.float) * (-math.log(10000.0) / d_model)
    ).unsqueeze(0)
    pe[:, 0::2] = flow.sin(position * div_term)
    pe[:, 1::2] = flow.cos(position * div_term)
    pe = pe.unsqueeze(0).transpose(0, 1)
    self.pe = flow.nn.Parameter(pe, requires_grad=False)
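# Both constructors above build div_term in log space. A minimal standalone
# check (assuming only numpy and oneflow) that the log-space form equals the
# direct 1 / 10000^(2i / d_model) from the original Transformer formulation:
import math
import numpy as np
import oneflow as flow

d_model = 8
div_term = flow.exp(
    flow.arange(0, d_model, 2).to(flow.float) * (-math.log(10000.0) / d_model)
)
direct = 1.0 / np.power(10000.0, np.arange(0, d_model, 2) / d_model)
assert np.allclose(div_term.numpy(), direct)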
def forward(self, targets, memory, memory_mask):
    dec_output = self.embedding(targets)
    if self.relative_positional:
        position = flow.arange(
            -(dec_output.size(1) - 1), dec_output.size(1), device=dec_output.device
        ).reshape(1, -1)
        pos = self.pos_emb._embedding_from_positions(position)
    else:
        dec_output, pos = self.pos_emb(dec_output)
    dec_mask = get_transformer_decoder_mask(targets)
    attn_weights = {}
    for i, block in enumerate(self.blocks):
        dec_output, attn_weight = block(
            dec_output, dec_mask, memory, memory_mask.unsqueeze(1), pos
        )
        attn_weights["dec_block_%d" % i] = attn_weight
    if self.normalize_before:
        dec_output = self.after_norm(dec_output)
    logits = self.output_layer(dec_output)
    return logits, attn_weights
def _test_global_stateful_kernel_with_inpersistent_state(test_case, placement, sbp):
    x = (
        flow.arange(64)
        .reshape(8, 8)
        .to_global(flow.env.all_device_placement("cpu"), flow.sbp.broadcast)
    )
    x = x.to_global(placement, sbp)
    y = flow._C.logical_slice(x, [0, 0], [3, 1], [1, 1])
    y_np = np.array([[0], [8], [16]])
    test_case.assertTrue(
        np.array_equal(
            y.to_global(flow.env.all_device_placement("cpu"), flow.sbp.broadcast)
            .to_local()
            .numpy(),
            y_np,
        )
    )
    x = x.to_global(sbp=flow.sbp.split(1))
    y = flow._C.logical_slice(x, [0, 0], [3, 1], [1, 1])
    test_case.assertTrue(
        np.array_equal(
            y.to_global(flow.env.all_device_placement("cpu"), flow.sbp.broadcast)
            .to_local()
            .numpy(),
            y_np,
        )
    )
def nllloss_1d(self, input, target):
    n = input.shape[0]
    idx = flow.unsqueeze(flow.arange(0, n, 1), dim=1)
    target = flow.unsqueeze(target, dim=1)
    t = flow.cat([idx, target], dim=1)
    res = self._gather_nd_op(input, t)[0]
    return res
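# The [idx, target] pairs above turn a gather-nd into a per-row lookup of
# input[i, target[i]]. A minimal sketch using the public flow.gather_nd
# (the class's _gather_nd_op is assumed to wrap the same primitive):
import numpy as np
import oneflow as flow

logits = flow.tensor([[0.1, 0.9], [0.8, 0.2]])
target = flow.tensor([1, 0])
idx = flow.unsqueeze(flow.arange(0, logits.shape[0], 1), dim=1)  # [[0], [1]]
t = flow.cat([idx, flow.unsqueeze(target, dim=1)], dim=1)  # [[0, 1], [1, 0]]
picked = flow.gather_nd(logits, t)  # logits[i, target[i]]
assert np.allclose(picked.numpy(), np.array([0.9, 0.8], dtype=np.float32))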
def _test_arange_backward(test_case, device):
    np_out = np.arange(13)
    x = flow.arange(13, dtype=flow.float32, device=device)
    x.requires_grad = True
    y = x.sum()
    y.backward()
    test_case.assertTrue(np.allclose(x.grad.numpy(), np.ones(13), 1e-05, 1e-05))
def forward(
    self,
    input_ids: flow.Tensor,
    token_type_ids: Optional[flow.Tensor] = None,
    position_ids: Optional[flow.Tensor] = None,
) -> flow.Tensor:
    input_shape = input_ids.size()
    seq_length = input_shape[1]
    if token_type_ids is None:
        token_type_ids = flow.zeros(input_shape, dtype=flow.long, device=input_ids.device)
    if position_ids is None:
        position_ids = flow.arange(seq_length, dtype=flow.long, device=input_ids.device)
        position_ids = position_ids.unsqueeze(0).expand(input_shape)
    input_embeddings = self.token_embeddings(input_ids)
    token_type_embeddings = self.token_type_embeddings(token_type_ids)
    position_embeddings = self.position_embeddings(position_ids)
    embeddings = input_embeddings + position_embeddings + token_type_embeddings
    embeddings = self.layer_norm(embeddings)
    embeddings = self.dropout(embeddings)
    return embeddings
def __init__(
    self,
    dim,
    window_size,
    num_heads,
    qkv_bias=True,
    qk_scale=None,
    attn_drop=0.0,
    proj_drop=0.0,
):
    super().__init__()
    self.dim = dim
    self.window_size = window_size  # Wh, Ww
    self.num_heads = num_heads
    head_dim = dim // num_heads
    self.scale = qk_scale or head_dim ** -0.5

    # define a parameter table of relative position bias
    # Author zzk: we add trunc normal here!
    self.relative_position_bias_table = nn.Parameter(
        flow.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)
    )  # 2*Wh-1 * 2*Ww-1, nH
    self.relative_position_bias_table.trunc_normal_(std=0.02)

    # get pair-wise relative position index for each token inside the window
    coords_h = flow.arange(self.window_size[0])
    coords_w = flow.arange(self.window_size[1])
    coords = flow.stack(flow.meshgrid(*[coords_h, coords_w]))  # 2, Wh, Ww
    coords_flatten = flow.flatten(coords, 1)  # 2, Wh*Ww
    relative_coords = (
        coords_flatten[:, :, None] - coords_flatten[:, None, :]
    )  # 2, Wh*Ww, Wh*Ww
    relative_coords = relative_coords.permute(1, 2, 0)  # Wh*Ww, Wh*Ww, 2
    relative_coords[:, :, 0] += self.window_size[0] - 1  # shift to start from 0
    relative_coords[:, :, 1] += self.window_size[1] - 1
    relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
    relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
    self.register_buffer("relative_position_index", relative_position_index)

    self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
    self.attn_drop = nn.Dropout(attn_drop)
    self.proj = nn.Linear(dim, dim)
    self.proj_drop = nn.Dropout(proj_drop)
    self.softmax = nn.Softmax(dim=-1)
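# The relative-position bookkeeping is easiest to see on a toy window. A
# standalone sketch for a 2x2 window (expected values worked out by hand):
# each (dh, dw) offset between two window positions maps to a unique id in
# [0, (2*Wh-1) * (2*Ww-1)).
import numpy as np
import oneflow as flow

Wh, Ww = 2, 2
coords = flow.stack(flow.meshgrid(flow.arange(Wh), flow.arange(Ww)))  # 2, Wh, Ww
coords_flatten = flow.flatten(coords, 1)  # 2, Wh*Ww
rel = coords_flatten[:, :, None] - coords_flatten[:, None, :]
rel = rel.permute(1, 2, 0)
rel[:, :, 0] += Wh - 1
rel[:, :, 1] += Ww - 1
rel[:, :, 0] *= 2 * Ww - 1
index = rel.sum(-1)
expected = np.array([[4, 3, 1, 0], [5, 4, 2, 1], [7, 6, 4, 3], [8, 7, 5, 4]])
assert np.array_equal(index.numpy(), expected)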
def _pos_encoding(self, inputs):
    if self.relative_positional:
        enc_output = inputs
        position = flow.arange(
            -(inputs.size(1) - 1), inputs.size(1), device=inputs.device
        ).reshape(1, -1)
        pos = self.pos_emb._embedding_from_positions(position)
    else:
        enc_output, pos = self.pos_emb(inputs)
    return enc_output, pos
def test_tensor_scatter_nd_update_runtime_error(test_case):
    with test_case.assertRaises(Exception) as context:
        x = flow.arange(8, dtype=flow.float32, requires_grad=True)
        indices = flow.tensor([[1], [3], [5]])
        updates = flow.tensor([-1, -2, -3], dtype=flow.float64, requires_grad=True)
        y = flow.tensor_scatter_nd_update(x, indices, updates)
    test_case.assertTrue(
        "The dtype of tensor and updates must be same." in str(context.exception)
    )
def unpad_sequence(
    padded_sequences: Tensor,
    lengths: Tensor,
    batch_first: bool = False,
) -> List[Tensor]:
    """Unpad padded Tensor into a list of variable length Tensors.

    ``unpad_sequence`` unstacks a padded Tensor into a list of variable length Tensors.

    Args:
        padded_sequences (Tensor): padded sequences.
        lengths (Tensor): length of original (unpadded) sequences.
        batch_first (bool, optional): whether batch dimension first or not. Default: False.

    Returns:
        a list of :class:`Tensor` objects

    For example:

    .. code-block:: python

        >>> from oneflow.nn.utils.rnn import pad_sequence, unpad_sequence
        >>> import oneflow as flow
        >>> import numpy as np
        >>> a = flow.ones(25, 300)
        >>> b = flow.ones(22, 300)
        >>> c = flow.ones(15, 300)
        >>> sequences = [a, b, c]
        >>> padded_sequences = pad_sequence(sequences)
        >>> lengths = flow.as_tensor([v.size(0) for v in sequences])
        >>> unpadded_sequences = unpad_sequence(padded_sequences, lengths)
        >>> np.allclose(sequences[0].numpy(), unpadded_sequences[0].numpy())
        True
        >>> np.allclose(sequences[1].numpy(), unpadded_sequences[1].numpy())
        True
        >>> np.allclose(sequences[2].numpy(), unpadded_sequences[2].numpy())
        True

    """
    unpadded_sequences = []
    if not batch_first:
        padded_sequences = padded_sequences.permute((1, 0, 2))
    max_length = padded_sequences.shape[1]
    idx = flow.arange(max_length)
    for seq, length in zip(padded_sequences, lengths):
        mask = idx < length
        unpacked_seq = seq[mask]
        unpadded_sequences.append(unpacked_seq)
    return unpadded_sequences
def invert_permutation(permutation: Optional[Tensor]) -> Optional[Tensor]:
    if permutation is None:
        return None
    return flow.scatter(
        flow.zeros_like(permutation),
        0,
        permutation,
        flow.arange(0, permutation.numel(), device=permutation.device, dtype=flow.int32),
    )
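# flow.scatter writes arange(n) at the slots named by permutation, which is
# exactly the inverse mapping. A quick hand-checked sketch:
import numpy as np
import oneflow as flow

perm = flow.tensor([2, 0, 1], dtype=flow.int32)
inv = invert_permutation(perm)
# perm maps 0->2, 1->0, 2->1, so its inverse maps 0->1, 1->2, 2->0
assert np.array_equal(inv.numpy(), np.array([1, 2, 0]))
assert np.array_equal(inv.numpy()[perm.numpy()], np.arange(3))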
def test_stateful_kernel_with_inpersistent_state(test_case):
    x = flow.arange(4).reshape(2, 2)
    x = x.to_global(flow.env.all_device_placement("cuda"), flow.sbp.split(0))
    y = flow._C.logical_slice(x, [0, 0], [3, 1], [1, 1])
    y_np = np.array([[0], [2], [0]])
    test_case.assertTrue(
        np.array_equal(y.to_global(sbp=flow.sbp.broadcast).to_local().numpy(), y_np)
    )
    x = x.to_global(sbp=flow.sbp.split(1))
    y = flow._C.logical_slice(x, [0, 0], [3, 1], [1, 1])
    test_case.assertTrue(
        np.array_equal(y.to_global(sbp=flow.sbp.broadcast).to_local().numpy(), y_np)
    )
def _test_arange_with_random_data(test_case, placement, sbp):
    start = random(0, 10).to(int).value()
    end = start + random(0, 10).to(int).value()
    step = random(1, max(2, end - start)).to(int).value()
    start = start * 8
    end = end * 8
    x = torch.arange(start=start, end=end, step=step)
    x.oneflow = flow.arange(
        start=start, end=end, step=step, placement=placement, sbp=sbp
    )
    return x
def decode_step(self, preds, memory, memory_mask, cache, scores, flag):
    """Decode an utterance in a stepwise way."""
    batch_size = int(scores.size(0) / self.beam_width)
    batch_log_probs, dec_cache, dec_attn_weights = self.decode(
        preds, memory, memory_mask, cache["decoder"]
    )
    if self.lm is not None:
        batch_lm_log_probs, lm_hidden = self.lm_decode(preds, cache["lm"])
        batch_lm_log_probs = batch_lm_log_probs.squeeze(1)
        batch_log_probs = batch_log_probs + self.lm_weight * batch_lm_log_probs
    else:
        lm_hidden = None
    if batch_log_probs.dim() == 3:
        batch_log_probs = batch_log_probs.squeeze(1)
    last_k_scores, last_k_preds = batch_log_probs.topk(self.beam_width)
    last_k_scores = mask_finished_scores(last_k_scores, flag)
    last_k_preds = mask_finished_preds(last_k_preds, flag)
    # update scores
    scores = scores + last_k_scores
    scores = scores.view(batch_size, self.beam_width * self.beam_width)
    # pruning
    scores, offset_k_indices = flow.topk(scores, k=self.beam_width)
    scores = scores.view(-1, 1)
    device = scores.device
    base_k_indices = (
        flow.arange(batch_size, device=device).view(-1, 1).repeat([1, self.beam_width])
    )
    base_k_indices *= self.beam_width ** 2
    best_k_indices = base_k_indices.view(-1) + offset_k_indices.view(-1)
    # update predictions
    best_k_preds = flow.index_select(
        last_k_preds.view(-1), dim=0, index=best_k_indices
    ).to(flow.int64)
    preds_index = best_k_indices.floor_divide(self.beam_width)
    preds_symbol = flow.index_select(preds, dim=0, index=preds_index)
    preds_symbol = flow.cat([preds_symbol, best_k_preds.view(-1, 1)], dim=1)
    # finished or not
    end_flag = flow.eq(preds_symbol[:, -1], EOS).view(-1, 1).to(flow.uint8)
    return preds_symbol, cache, scores, end_flag
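# The base/offset arithmetic in the pruning step above is the standard
# flattened beam-search indexing: each batch owns a contiguous block of
# beam_width**2 candidates. A toy sketch with made-up offsets (batch_size,
# beam_width, and the offset values chosen only for illustration):
import numpy as np
import oneflow as flow

batch_size, beam_width = 2, 2
offset_k_indices = flow.tensor([[1, 3], [0, 2]])  # pretend output of flow.topk
base_k_indices = (
    flow.arange(batch_size).view(-1, 1).repeat([1, beam_width]) * beam_width ** 2
)
best_k_indices = base_k_indices.view(-1) + offset_k_indices.view(-1)
# batch 0 picks flattened candidates 1 and 3; batch 1 picks 4 and 6
assert np.array_equal(best_k_indices.numpy(), np.array([1, 3, 4, 6]))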
def find_pruneable_heads_and_indices(
    heads: List[int], n_heads: int, head_size: int, already_pruned_heads: Set[int]
) -> Tuple[Set[int], flow.Tensor]:
    mask = flow.ones(n_heads, head_size)
    # Convert to set and remove already pruned heads
    heads = set(heads) - already_pruned_heads
    for head in heads:
        # Compute how many pruned heads are before the head and move the index accordingly
        head = head - sum(1 if h < head else 0 for h in already_pruned_heads)
        mask[head] = 0
    mask = mask.view(-1).contiguous().eq(1)
    index: flow.Tensor = flow.arange(len(mask), dtype=flow.int64)[mask]
    return heads, index
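# A quick sketch of the returned indices: four heads of size two, pruning
# head 1, nothing pruned previously. Head 1 covers flattened positions 2
# and 3, so those drop out of the kept indices.
import numpy as np

heads, index = find_pruneable_heads_and_indices(
    heads=[1], n_heads=4, head_size=2, already_pruned_heads=set()
)
assert heads == {1}
assert np.array_equal(index.numpy(), np.array([0, 1, 4, 5, 6, 7]))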
def forward(self, x: flow.Tensor):
    """Add positional encoding.

    Args:
        x (oneflow.Tensor): Input. Its shape is (batch, time, ...)

    Returns:
        Tuple of the encoded tensor, shape (batch, time, ...), and the
        positional embedding.
    """
    pos = flow.arange(0, x.size(1), device=x.device).reshape(1, -1)  # [1, t]
    posemb = self._embedding_from_positions(pos)  # [1, t, emb_dim]
    if self.scale_learnable:
        x = x + self.alpha * posemb
    else:
        x = x * self.xscale + posemb
    return self.dropout(x), posemb
def _test_arange_with_float_delta(test_case, placement, sbp):
    start = random(0, 10).to(int).value()
    end = start + random(0, 10).to(int).value()
    step = random(1, max(2, end - start)).to(float).value()
    start = start * 8
    end = end * 8
    x = torch.arange(start=start, end=end, step=step, requires_grad=True)
    x.oneflow = flow.arange(
        start=start,
        end=end,
        step=step,
        placement=placement,
        sbp=sbp,
        requires_grad=True,
    )
    return x
def get_extended_attention_mask(
    self,
    attention_mask: flow.Tensor,
    input_shape: Tuple[int],
    device: flow.device,
) -> flow.Tensor:
    # We can provide a self-attention mask of dimensions
    # [batch_size, from_seq_length, to_seq_length] ourselves,
    # in which case we just need to make it broadcastable to all heads.
    if attention_mask.dim() == 3:
        extended_attention_mask = attention_mask[:, None, :, :]
    elif attention_mask.dim() == 2:
        # Provided a padding mask of dimensions [batch_size, seq_length]
        # - if the model is a decoder, apply a causal mask in addition to the padding mask
        # - if the model is an encoder, make the mask broadcastable to
        #   [batch_size, num_heads, seq_length, seq_length]
        if self.is_decoder:
            batch_size, seq_length = input_shape
            seq_ids = flow.arange(seq_length, device=device)
            causal_mask = (
                seq_ids[None, None, :].repeat(batch_size, seq_length, 1)
                <= seq_ids[None, :, None]
            )
            # in case past_key_values are used we need to add a prefix ones mask to the causal mask
            causal_mask = causal_mask.to(attention_mask.dtype)
            if causal_mask.shape[1] < attention_mask.shape[1]:
                prefix_seq_len = attention_mask.shape[1] - causal_mask.shape[1]
                causal_mask = flow.cat(
                    [
                        flow.ones(
                            (batch_size, seq_length, prefix_seq_len),
                            device=device,
                            dtype=causal_mask.dtype,
                        ),
                        causal_mask,
                    ],
                    axis=-1,
                )
            extended_attention_mask = (
                causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
            )
        else:
            extended_attention_mask = attention_mask[:, None, None, :]
    else:
        raise ValueError(
            f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})"
        )
    extended_attention_mask = extended_attention_mask.to(dtype=flow.float)
    extended_attention_mask = (1.0 - extended_attention_mask) * -1e9
    return extended_attention_mask
def _prob_in_top_k(self, clean_values, noisy_values, noise_stddev, noisy_top_values):
    """Helper function to NoisyTopKGating.

    Computes the probability that a value is in the top k, given different random noise.
    This gives us a way of backpropagating from a loss that balances the number
    of times each expert is in the top k experts per example.
    In the case of no noise, pass in None for noise_stddev, and the result will
    not be differentiable.

    Args:
        clean_values: a `Tensor` of shape [batch, n].
        noisy_values: a `Tensor` of shape [batch, n]. Equal to clean values plus
            normally distributed noise with standard deviation noise_stddev.
        noise_stddev: a `Tensor` of shape [batch, n], or None.
        noisy_top_values: a `Tensor` of shape [batch, m].
            "values" output of tf.top_k(noisy_top_values, m). m >= k+1.

    Returns:
        a `Tensor` of shape [batch, n].
    """
    batch = clean_values.size(0)
    m = noisy_top_values.size(1)
    top_values_flat = noisy_top_values.flatten()
    threshold_positions_if_in = (
        flow.arange(batch, device=noisy_values.device) * m + self.k
    )
    threshold_if_in = flow.unsqueeze(
        flow.gather(top_values_flat, 0, threshold_positions_if_in), 1
    )
    is_in = flow.gt(noisy_values, threshold_if_in)
    threshold_positions_if_out = threshold_positions_if_in - 1
    threshold_if_out = flow.unsqueeze(
        flow.gather(top_values_flat, 0, threshold_positions_if_out), 1
    )
    # is each value currently in the top k.
    prob_if_in = cdf((clean_values - threshold_if_in) / noise_stddev)
    prob_if_out = cdf((clean_values - threshold_if_out) / noise_stddev)
    prob = flow.where(is_in, prob_if_in, prob_if_out)
    return prob
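# The cdf helper is not part of this snippet. A minimal sketch assuming it is
# the standard normal CDF (the usual choice for the noisy-top-k gating trick),
# written with oneflow.erf:
import math
import oneflow as flow

def cdf(x):
    # Phi(x) = 0.5 * (1 + erf(x / sqrt(2)))
    return 0.5 * (1.0 + flow.erf(x / math.sqrt(2.0)))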
def select_chunk_states_and_mask_based_index(tensor, tensor_mask, index):
    # tensor: [b, c, t, v]
    # index: [b]
    # return [b, t, v]
    assert tensor.dim() == 4
    assert tensor_mask.dim() == 3
    assert index.dim() == 1
    b, c, t, v = tensor.size()
    base_index = flow.arange(b, device=tensor.device) * c
    indices = base_index + index
    select_tensor = flow.index_select(tensor.reshape(b * c, t, v), 0, indices.long())
    select_tensor_mask = flow.index_select(
        tensor_mask.reshape(b * c, 1, t), 0, indices.long()
    )
    return select_tensor, select_tensor_mask
def __init__(
    self,
    vocab_size,
    type_vocab_size,
    max_position_embeddings,
    hidden_size,
    hidden_dropout_prob,
    seq_length,
):
    super().__init__()
    self.word_embeddings = nn.Embedding(vocab_size, hidden_size)
    self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size)
    self.token_type_embeddings = nn.Embedding(type_vocab_size, hidden_size)
    self.LayerNorm = nn.LayerNorm(hidden_size)
    self.dropout = nn.Dropout(hidden_dropout_prob, inplace=True)
    self.register_buffer(
        "position_ids", flow.arange(max_position_embeddings).unsqueeze(0)
    )
    self.seq_length = seq_length
def forward(self, inputs, mask):
    if self.relative_positional:
        enc_output = inputs
        position = flow.arange(
            -(inputs.size(1) - 1), inputs.size(1), device=inputs.device
        ).reshape(1, -1)
        pos = self.pos_emb._embedding_from_positions(position)
    else:
        enc_output, pos = self.pos_emb(inputs)
    attn_weights = {}
    for i, block in enumerate(self.blocks):
        enc_output, attn_weight = block(enc_output, mask.unsqueeze(1), pos)
        attn_weights["enc_block_%d" % i] = attn_weight
    if self.normalize_before:
        enc_output = self.norm(enc_output)
    return enc_output, mask, attn_weights
def _embedding_from_positions(self, position):
    """Get the absolute positional embedding for the given positions.

    Args:
        position (oneflow.Tensor): Input positions. Its shape is (b, t)

    Returns:
        posemb (oneflow.Tensor): Encoded tensor. Its shape is (b, t, emb_dim)
    """
    batch_size, time_step = position.size()
    posemb = flow.zeros(batch_size, time_step, self.emb_dim, device=position.device)
    div_term = flow.exp(
        flow.arange(0, self.emb_dim, 2, device=position.device, dtype=flow.float32)
        * -(math.log(10000.0) / self.emb_dim)
    )
    posemb[:, :, 0::2] = flow.sin(position.float().unsqueeze(-1) * div_term)
    posemb[:, :, 1::2] = flow.cos(position.float().unsqueeze(-1) * div_term)
    return posemb
def test_arange_graph(test_case):
    of_eager_out = flow.arange(start=0, end=100, step=3, device=flow.device("cuda"))

    class ArangeGraph(flow.nn.Graph):
        def __init__(self):
            super().__init__()

        def build(self):
            return flow.arange(start=0, end=100, step=3, device=flow.device("cuda"))

    arange_g = ArangeGraph()
    of_lazy_out = arange_g()
    test_case.assertTrue(
        np.allclose(of_eager_out.numpy(), of_lazy_out.numpy(), 1e-05, 1e-05)
    )
def lm_rescoring(self, preds, pred_lens):
    # preds: [beam_size, lens]
    # pred_lens: [beam_size]
    if self.lm.model_type == "transformer_lm":
        log_probs = self.lm.predict(preds, last_frame=False)
    else:
        log_probs = []
        hidden = None
        for t in range(preds.size(1)):
            log_prob, hidden = self.lm.predict(preds[:, t].unsqueeze(-1), hidden)
            log_probs.append(log_prob)
        log_probs = flow.cat(log_probs, dim=1)
    rescores = []
    max_length = log_probs.size(1)
    vocab_size = log_probs.size(-1)
    for b in range(preds.size(0)):
        base_index = flow.arange(max_length, device=preds.device)
        bias_index = preds[b].reshape(-1)
        index = base_index * vocab_size + bias_index
        score = flow.index_select(log_probs[b].reshape(-1), dim=-1, index=index)
        label_len = min(int(pred_lens[b]), score.size(0))
        score[label_len - 1 :] = 0
        rescores.append(flow.sum(score) / label_len)
    rescores = flow.tensor(rescores, dtype=flow.float32)
    _, indices = flow.sort(rescores, dim=-1, descending=True)
    sorted_preds = preds[indices]
    sorted_length = pred_lens[indices]
    return sorted_preds, sorted_length
def _make_causal_mask(
    input_ids_shape: flow.Size,
    dtype: flow.dtype,
    device: flow.device,
    past_key_values_length: int = 0,
):
    """Make the causal mask used for uni-directional self-attention."""
    bsz, tgt_len = input_ids_shape
    mask = flow.ones((tgt_len, tgt_len)) * float("-inf")
    mask_cond = flow.arange(mask.size(-1))
    mask = mask.masked_fill(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
    mask = mask.to(dtype)
    if past_key_values_length > 0:
        mask = flow.cat(
            [flow.zeros(tgt_len, past_key_values_length, dtype=dtype), mask], dim=-1
        )
    return (
        mask[None, None, :, :]
        .expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
        .to(device)
    )
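# For tgt_len = 3 and no cached keys, the masked_fill condition zeroes the
# lower triangle (position i may attend to positions j <= i) and leaves -inf
# above the diagonal. A quick hand-checked sketch:
import numpy as np
import oneflow as flow

mask = _make_causal_mask((1, 3), flow.float32, flow.device("cpu"))
expected = np.array(
    [[0.0, -np.inf, -np.inf], [0.0, 0.0, -np.inf], [0.0, 0.0, 0.0]],
    dtype=np.float32,
)
assert np.array_equal(mask.numpy()[0, 0], expected)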
def select_tensor_based_index(tensor, index):
    # tensor: [b, t] or [b, t, v]
    # index: [b]
    # return: one element (or one [v] row) per batch, gathered at index[b]
    assert tensor.dim() >= 2
    assert index.dim() == 1
    batch_size = tensor.size(0)
    tensor_len = tensor.size(1)
    base_index = flow.arange(batch_size, device=tensor.device) * tensor_len
    indices = base_index + index
    if tensor.dim() == 2:
        select_tensor = flow.index_select(
            tensor.reshape(batch_size * tensor_len), 0, indices.long()
        )
    else:
        assert tensor.dim() == 3
        select_tensor = flow.index_select(
            tensor.reshape(batch_size * tensor_len, tensor.size(-1)), 0, indices.long()
        )
    return select_tensor
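# A hand-checked sketch of the flattened-index trick for the 2-D case:
# row b keeps tensor[b, index[b]].
import numpy as np
import oneflow as flow

x = flow.tensor([[10, 11, 12], [20, 21, 22]])
idx = flow.tensor([1, 2])
out = select_tensor_based_index(x, idx)
assert np.array_equal(out.numpy(), np.array([11, 22]))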