def make_pad_mask(lengths: Union[torch.LongTensor, List[int]],
                  xs: Union[torch.FloatTensor, None] = None,
                  length_dim: int = -1):
    """Build a boolean mask that is True at padded positions.

    Args:
        lengths: Valid length of every sequence in the batch, given as a
            tensor, a list, or any other sequence of ints.
        xs: Optional reference tensor. When given, the mask is expanded to
            the shape of ``xs`` and moved to its device; its first
            dimension must equal ``len(lengths)``.
        length_dim: Dimension of ``xs`` that holds the sequence positions.
            Must not be 0; negative values count from the end.

    Returns:
        A bool tensor of shape ``(len(lengths), max(lengths))`` when ``xs``
        is None, otherwise a bool tensor shaped like ``xs``.

    Raises:
        ValueError: If ``length_dim`` is 0.
    """
    if length_dim == 0:
        raise ValueError(f"length_dim cannot be {length_dim}")

    if not isinstance(lengths, list):
        # Tensors/arrays expose ``tolist``; plain sequences such as tuples
        # do not, so fall back to ``list``.
        if hasattr(lengths, "tolist"):
            lengths = lengths.tolist()
        else:
            lengths = list(lengths)
    bs = len(lengths)

    maxlen = int(max(lengths)) if xs is None else xs.shape[length_dim]

    seq_range = torch.arange(0, maxlen, dtype=torch.int64)
    seq_range_expand = seq_range.unsqueeze(0).expand(bs, maxlen)
    # ``torch.tensor`` replaces the legacy ``Tensor.new(data)`` constructor.
    seq_length_expand = torch.tensor(lengths, dtype=torch.int64).unsqueeze(-1)
    mask = seq_range_expand >= seq_length_expand

    if xs is not None:
        assert xs.shape[0] == bs, (xs.shape[0], bs)

        if length_dim < 0:
            length_dim = xs.dim() + length_dim
        # Keep the batch and length axes, insert singleton axes elsewhere
        # so the mask can be broadcast to the shape of ``xs``.
        ind = tuple(
            slice(None) if i in (0, length_dim) else None
            for i in range(xs.dim()))
        mask = mask[ind].expand_as(xs).to(xs.device)
    return mask
def forward(self, src: torch.FloatTensor,
            attn_mask: torch.FloatTensor) -> torch.FloatTensor:
    """Apply multi-head self-attention to ``src``.

    Args:
        src: Input of shape (bsz, seq_len, hid).
        attn_mask: Multiplicative mask with 2, 3 or 4 dimensions. Entries
            equal to 1 keep a score; entries equal to 0 replace it by -1e4
            before the softmax.

    Returns:
        Tensor with the same shape as ``src``.
    """
    # Bring the mask to 4 dims so it broadcasts over batch and heads.
    if attn_mask.dim() == 2:
        attn_mask = attn_mask.unsqueeze(0)
    if attn_mask.dim() == 3:
        attn_mask = attn_mask.unsqueeze(1)

    bsz, seq_len, hidden = src.shape[0], src.shape[1], src.shape[2]
    head_dim = hidden // self.n_head
    split_shape = (bsz, seq_len, self.n_head, head_dim)

    # One linear layer produces q, k and v side by side.
    q, k, v = self.qkv_linear(src).chunk(3, dim=-1)  # each bsz*seq_len*hid
    q = q.contiguous().view(*split_shape).permute(0, 2, 1, 3)  # bsz*n_head*seq_len*h
    k = k.contiguous().view(*split_shape).permute(0, 2, 3, 1)  # bsz*n_head*h*seq_len
    v = v.contiguous().view(*split_shape).permute(0, 2, 1, 3)  # bsz*n_head*seq_len*h

    # Scaled dot-product scores: bsz * n_head * seq_len * seq_len
    scores = torch.matmul(q, k)
    scores = scores * float(head_dim) ** -0.5
    scores = scores * attn_mask + (attn_mask - 1) * 1e4
    probs = F.softmax(scores, dim=-1)
    probs = self.dropout(probs)

    # Weighted sum of values, then merge the heads back together.
    context = torch.matmul(probs, v)
    context = context.permute(0, 2, 1, 3).contiguous().view(src.shape)
    return self.output_linear(context)
def cov(m: torch.FloatTensor, rowvar: bool = True, inplace: bool = False):
    '''Estimate the covariance matrix of a set of variables.

    For samples `X = [x_1, x_2, ... x_N]^T`, element `C_{ij}` of the result
    is the covariance of `x_i` and `x_j`, and `C_{ii}` is the variance of
    `x_i`. The normalisation factor is `1 / (number of observations - 1)`.

    Args:
        m: A 1-D or 2-D tensor of variables and observations. A 1-D tensor
            is treated as a single variable.
        rowvar: If True, each row is a variable and each column an
            observation. If False the roles are swapped (the input is
            transposed unless it has a single row).
        inplace: If True, the mean is subtracted from `m` in place, so the
            caller's tensor is modified.

    Returns:
        The covariance matrix with singleton dimensions squeezed out.

    Raises:
        ValueError: If `m` has more than 2 dimensions.
    '''
    ndim = m.dim()
    if ndim > 2:
        raise ValueError('m has more than 2 dimensions')
    if ndim < 2:
        m = m.view(1, -1)
    if not rowvar and m.size(0) != 1:
        m = m.t()

    # m = m.type(torch.double)  # uncomment this line if desired
    scale = 1.0 / (m.size(1) - 1)
    row_means = torch.mean(m, dim=1, keepdim=True)
    if inplace:
        m -= row_means
    else:
        m = m - row_means
    # if complex: use m.t().conj() instead of m.t()
    return scale * m.matmul(m.t()).squeeze()
def forward(self, x: torch.FloatTensor) -> torch.FloatTensor:  # type: ignore
    """Add the stored bias parameters to `x`.

    `x` must be 2-D or 4-D. The bias is reshaped so that it lines up with
    dimension 1 of `x` and broadcasts over all other dimensions.
    """
    ndim = x.dim()
    assert ndim in [2, 4]

    bias_shape = (1, -1) if ndim == 2 else (1, -1, 1, 1)
    return x + self._bias.t().view(*bias_shape)  # type:ignore
def img_derivative(input: torch.FloatTensor,
                   sobel_kernel: torch.FloatTensor) -> torch.FloatTensor:
    """Convolve a batch of images with a fixed kernel.

    Uses stride 1, zero padding of 1 and no bias. The functional conv is
    used instead of building a ``Conv2d`` module on every call: building the
    module allocates and randomly initialises a weight that is immediately
    discarded, which also advances the global random number generator.

    Args:
        input: 4-D image batch, [N, C, H, W].
        sobel_kernel: 4-D convolution weight. It is cast to the type of
            ``input`` and treated as a constant (no gradient flows to it).

    Returns:
        The convolution result, [N, C, H, W].
    """
    assert input.dim() == 4
    assert sobel_kernel.dim() == 4

    # detach(): the kernel is a fixed filter, not a trainable parameter.
    weight = sobel_kernel.detach().type_as(input)
    return torch.nn.functional.conv2d(
        input, weight, bias=None, stride=1, padding=1)  # [N, C, H, W]
def beam_search(
    self,
    img: FloatTensor,
    beam_size: int = 10,
    max_len: int = 200,
    alpha: float = 1.0,
) -> str:
    """Decode one image with beam search (inference only).

    Parameters
    ----------
    img : FloatTensor
        [1, h, w]
    beam_size : int, optional
        by default 10
    max_len : int, optional
        by default 200
    alpha : float, optional
        exponent of the length normalisation used to pick the best
        hypothesis, by default 1.0

    Returns
    -------
    str
        LaTeX string
    """
    assert img.dim() == 3

    def length_normalised_score(hyp):
        return hyp.score / (len(hyp)**alpha)

    # All-zero mask with the same shape as the image.
    img_mask = torch.zeros_like(img, dtype=torch.long)
    batch = img.unsqueeze(0)
    candidates = self.bttr.beam_search(batch, img_mask, beam_size, max_len)
    winner = max(candidates, key=length_normalised_score)
    return vocab.indices2label(winner.seq)
def rank_by_plackettluce(
        scores: _torch.FloatTensor, n: _torch.LongTensor,
        generator: Optional[_torch.Generator] = None) -> _torch.LongTensor:
    """Samples a ranking from a Plackett-Luce distribution.

    Scores are first passed through ``mask_padded_values`` so that padded
    documents are meant to be placed last.

    Args:
        scores: A tensor of size (batch_size, list_size, 1) or
            (batch_size, list_size) containing scores.
        n: A tensor of size (batch_size) containing list size of each query.
        generator: Optional random generator used for the uniform draw and
            forwarded to ``tiebreak_argsort``.
    """
    if scores.dim() == 3:
        scores = scores.reshape((scores.shape[0], scores.shape[1]))
    masked_scores = mask_padded_values(scores, n)

    # Reservoir sampling: draw Uniform(0, 1) ^ (1 / p) and sort by the
    # result. The computation below is the numerically stable variant of
    # that idea carried out in log-space.
    log_p = _torch.nn.LogSoftmax(dim=1)(masked_scores)
    if generator is None:
        u = _torch.rand(log_p.shape, device=scores.device)
    else:
        u = _torch.rand(log_p.shape, device=scores.device,
                        generator=generator)
    r = _torch.log(-_torch.log(u)) - log_p
    return tiebreak_argsort(r, descending=False, generator=generator)
def forward(self, input: torch.FloatTensor, target: torch.LongTensor):
    """Compute the focal loss.

    :param input: (N, C) where C = number of classes. Inputs with more than
        two dimensions (e.g. N,C,H,W) are flattened to (N*H*W, C).
    :param target: (N) where each value is 0 <= targets[i] <= C-1
    :return: Scalar. The mean of the per-sample losses when
        ``self.size_average`` is true, otherwise their sum.
    """
    if input.dim() > 2:
        n, c = input.size(0), input.size(1)
        # N,C,H,W => N,C,H*W => N,H*W,C => N*H*W,C
        input = input.view(n, c, -1).transpose(1, 2).contiguous().view(-1, c)
    target = target.view(-1, 1)

    # Log-probability (and probability) of the target class per sample.
    logpt = F.log_softmax(input, dim=1).gather(1, target).view(-1)
    pt = logpt.exp()

    if self.alpha is not None:
        # Keep the class weights in the same tensor type as the input.
        if self.alpha.type() != input.data.type():
            self.alpha = self.alpha.type_as(input.data)
        class_weight = self.alpha.gather(0, target.data.view(-1))
        logpt = logpt * class_weight

    loss = -1 * (1 - pt)**self.gamma * logpt
    return loss.mean() if self.size_average else loss.sum()
def forward(
    self,
    s_hidden_states: FloatTensor,
    t_hidden_states: FloatTensor,
    attention_mask: LongTensor = None,
) -> FloatTensor:
    """Cosine loss between student and teacher hidden states.

    Args:
        s_hidden_states: Student hidden states with at most 3 dimensions.
        t_hidden_states: Teacher hidden states.
        attention_mask: Optional mask. When given, the computation is
            delegated to ``_cosine_loss_hf``.

    Returns:
        The value returned by ``self.loss_fn`` (or by ``_cosine_loss_hf``
        when an attention mask is given).

    Raises:
        TypeError: If ``s_hidden_states`` has more than 3 dimensions.
    """
    if s_hidden_states.dim() > 3:
        raise TypeError(
            "Cosine loss can be applied only to flatten hiddens")

    if attention_mask is not None:
        # HF transformers case
        return _cosine_loss_hf(
            s_hidden_states=s_hidden_states,
            t_hidden_states=t_hidden_states,
            attention_mask=attention_mask,
        )

    if self.need_mapping:
        assert s_hidden_states.size(-1) == self.student_hidden_state_dim
        assert t_hidden_states.size(-1) == self.teacher_hidden_state_dim
        s_hidden_states = s_hidden_states.reshape(
            -1, self.student_hidden_state_dim)
        # The teacher states are projected before the comparison.
        t_hidden_states = self.proj(
            t_hidden_states.reshape(-1, self.teacher_hidden_state_dim))
    else:
        hidden_dim = s_hidden_states.size(-1)
        s_hidden_states = s_hidden_states.reshape(-1, hidden_dim)
        t_hidden_states = t_hidden_states.reshape(-1, hidden_dim)

    assert s_hidden_states.shape == t_hidden_states.shape

    # All-ones target, created on the same device as the hidden states so
    # the loss also works for non-CPU inputs.
    target = torch.ones(t_hidden_states.size(0),
                        device=s_hidden_states.device)
    return self.loss_fn(s_hidden_states, t_hidden_states, target)
def format_tensor_img(t_img: torch.FloatTensor, code: str) -> torch.FloatTensor:
    '''
    Convert a tensor image to the requested encoding.

    Args:
        t_img: tensor image with 3 channels first. Must be a
            torch.FloatTensor whose mean lies in [0, 1].
        code: target encoding, one of 'RGB_1' (returned unchanged),
            'RGB_1_norm' (per-channel mean/std normalisation) or
            'BGR_255_norm' (channels reversed, scaled by 255, then
            mean-subtracted).

    Raises:
        NotImplementedError: for any other `code`.
    '''
    assert isinstance(t_img, torch.FloatTensor) and 0 <= t_img.mean() <= 1
    assert t_img.dim() == 3 and t_img.shape[0] == 3

    if code == 'RGB_1':
        return t_img

    if code == 'RGB_1_norm':
        means = [0.485, 0.456, 0.406]
        stds = [0.229, 0.224, 0.225]
        return tvf.normalize(t_img, means, stds)

    if code == 'BGR_255_norm':
        # reverse the channel order and rescale to 0-255 before normalising
        bgr_255 = t_img[[2, 1, 0], :, :] * 255
        return tvf.normalize(bgr_255, [102.9801, 115.9465, 122.7717],
                             [1, 1, 1])

    raise NotImplementedError()
def __viterbi_decode(self, emissions: torch.FloatTensor,
                     mask: torch.ByteTensor) -> List[List[int]]:
    """Return the highest-scoring tag sequence for every batch element.

    Args:
        emissions: Emission scores indexed by time step first; the last
            dimension has size ``self.num_tags``.
        mask: Tensor of shape (seq_length, batch_size); non-zero marks a
            valid time step. The first time step must be valid for every
            sequence. Bool and integer dtypes are accepted.

    Returns:
        One list of tag indices per batch element.
    """
    assert emissions.dim() == 3 and mask.dim() == 2
    assert emissions.shape[:2] == mask.shape
    assert emissions.size(2) == self.num_tags
    # torch.where needs a boolean condition; passing a uint8 mask is
    # deprecated and other integer dtypes are rejected.
    mask = mask.bool()
    assert mask[0].all()

    seq_length, batch_size = mask.shape

    # start_transitions: score of moving from the start state to each tag
    # (the end state is not included).
    score = self.start_transitions + emissions[0]
    history = []

    for i in range(1, seq_length):
        broadcast_score = score.unsqueeze(2)
        broadcast_emissions = emissions[i].unsqueeze(1)
        next_score = broadcast_score + self.transitions + broadcast_emissions
        # Best previous tag for every current tag.
        next_score, indices = next_score.max(dim=1)
        # Only advance the score where this time step is valid.
        score = torch.where(mask[i].unsqueeze(1), next_score, score)
        history.append(indices)

    score += self.end_transitions

    # Index of the last valid time step of every sequence.
    seq_ends = (mask.long().sum(dim=0) - 1).tolist()
    best_tags_list = []

    for idx in range(batch_size):
        _, best_last_tag = score[idx].max(dim=0)
        best_tags = [best_last_tag.item()]

        # Follow the back-pointers from the last valid step to the first.
        for hist in reversed(history[:seq_ends[idx]]):
            best_last_tag = hist[idx][best_tags[-1]]
            best_tags.append(best_last_tag.item())

        best_tags.reverse()
        best_tags_list.append(best_tags)

    return best_tags_list
def forward(self,
            queries: torch.FloatTensor,
            keys: torch.FloatTensor,
            values: torch.FloatTensor,
            mask: torch.ByteTensor = None) -> torch.Tensor:
    """Runs the attention mechanism.

    Args:
        queries (torch.FloatTensor): The queries as (batch_size x Q x dim_model)-tensor.
        keys (torch.FloatTensor): The keys as (batch_size x KV x dim_model)-tensor.
        values (torch.FloatTensor): The values as (batch_size x KV x dim_model)-tensor.
        mask (torch.ByteTensor, optional): An optional binary mask that indicates which key-value pairs to
            consider for each of the queries. If provided, then this has to be a
            (batch_size x Q x KV)-tensor.

    Returns:
        torch.FloatTensor: The values computed by the attention mechanism as
        (batch_size x Q x dim_model)-tensor.
    """
    inputs = (queries, keys, values)

    # All inputs must be float tensors (CPU or CUDA) with three dimensions.
    for tensor in inputs:
        assert isinstance(tensor, (torch.FloatTensor, torch.cuda.FloatTensor))
    for tensor in inputs:
        assert tensor.dim() == 3

    # Batch size and model dimension have to agree across all inputs, and
    # keys and values must come in pairs.
    batch_size, num_queries, dim_model = queries.size()
    assert batch_size == keys.size(0)
    assert batch_size == values.size(0)
    assert dim_model == keys.size(2)
    assert dim_model == values.size(2)
    assert keys.size(1) == values.size(1)

    if mask is not None:
        assert isinstance(mask, (torch.ByteTensor, torch.cuda.ByteTensor))
        assert mask.dim() == 3
        assert batch_size == mask.size(0)
        assert num_queries == mask.size(1)
        assert keys.size(1) == mask.size(2)

    # for each of the attention heads, project inputs to the needed dimensions
    projected_q, projected_k, projected_v = self._project_inputs(
        queries, keys, values)

    # compute attention value
    attended = self._apply_attention(
        projected_q, projected_k, projected_v, mask)

    # project retrieved values to needed dimensions
    return self._project_output(attended)
def eval(self, x: torch.FloatTensor,
         xe: torch.LongTensor) -> torch.FloatTensor:
    """Average, over 20 model samples, of the minimum taken along dim 1.

    `x` must be 2-D with exactly `self.q` rows.
    """
    # XXX: not to be used for population-based optimisation method
    assert x.dim() == 2
    assert x.shape[0] == self.q

    draws = self.model.sample_y(x, xe, n_sample=20)
    per_draw_best, _ = draws.min(dim=1)
    return per_draw_best.mean()
def fit(self, x: torch.FloatTensor):
    """Record the per-column minimum and maximum of a 2-D tensor.

    Stores the bounds in `self.data_lb` / `self.data_ub`, sets
    `self.fitted`, checks that all bounds are finite and returns `self`.
    """
    assert x.dim() == 2
    with torch.no_grad():
        col_min, _ = x.min(dim=0)
        col_max, _ = x.max(dim=0)
        self.data_lb = col_min.detach().clone()
        self.data_ub = col_max.detach().clone()
        self.fitted = True
        assert torch.isfinite(self.data_lb).all()
        assert torch.isfinite(self.data_ub).all()
    return self
def fit(self, x: torch.FloatTensor):
    """Fit a MinMaxScaler on a 2-D tensor and store its parameters.

    The scaler is configured with the feature range
    ``(self.range_lb, self.range_ub)``. Its fitted ``scale_`` and ``min_``
    arrays are stored as float tensors in ``self.scale_`` / ``self.min_``,
    ``self.fitted`` is set, and ``self`` is returned.
    """
    assert x.dim() == 2
    with torch.no_grad():
        scaler = MinMaxScaler((self.range_lb, self.range_ub))
        # Tensor.numpy() only works for CPU tensors, so copy to host first
        # (a no-op when x already lives on the CPU).
        scaler.fit(x.detach().cpu().numpy())
        self.scale_ = torch.FloatTensor(scaler.scale_)
        self.min_ = torch.FloatTensor(scaler.min_)
        self.fitted = True
    return self
def _forward_hidden(self,
                    s_hidden_states: FloatTensor,
                    t_hidden_states: FloatTensor,
                    layer_idx: int = None) -> FloatTensor:
    """Apply ``self.loss_fn`` to one pair of student/teacher hidden states.

    With ``self.need_mapping`` the teacher states are projected with
    ``self.proj`` first. States with at most 3 dimensions are flattened to
    2-D (and L2-normalised when ``self.normalize`` is set); states with more
    dimensions are flattened to 1-D and cannot be normalised or mapped.

    Raises:
        TypeError: If mapping or normalisation is requested for hidden
            states with more than 3 dimensions.
    """
    is_flat = s_hidden_states.dim() <= 3

    if self.need_mapping:
        if not is_flat:
            raise TypeError(
                "MSE loss with mapping can be applied only to flatten hidden state"
            )
        assert s_hidden_states.size(-1) == self.student_hidden_state_dim
        assert t_hidden_states.size(-1) == self.teacher_hidden_state_dim
        s_hidden_states = s_hidden_states.reshape(
            -1, self.student_hidden_state_dim)
        # NOTE(review): the branch is selected by ``self.layer_idx`` while
        # the index used is the ``layer_idx`` argument -- confirm that this
        # is intended.
        if self.layer_idx is not None:
            projector = self.proj[layer_idx]
        else:
            projector = self.proj
        t_hidden_states = projector(
            t_hidden_states.reshape(-1, self.teacher_hidden_state_dim))
    elif is_flat:
        hidden_dim = s_hidden_states.size(-1)
        s_hidden_states = s_hidden_states.reshape(-1, hidden_dim)
        t_hidden_states = t_hidden_states.reshape(-1, hidden_dim)
    else:
        if self.normalize:
            raise TypeError(
                "Normalizing can be applied only to flatten hidden state"
            )
        s_hidden_states = s_hidden_states.flatten()
        t_hidden_states = t_hidden_states.flatten()

    # Only reachable with normalize=True for the two 2-D cases above.
    if self.normalize:
        s_hidden_states = F.normalize(s_hidden_states)
        t_hidden_states = F.normalize(t_hidden_states)

    assert s_hidden_states.shape == t_hidden_states.shape
    return self.loss_fn(s_hidden_states, t_hidden_states)
def forward(self,
            in_sequence: torch.FloatTensor,
            out_sequence: torch.FloatTensor,
            padding_mask: torch.ByteTensor = None) -> torch.FloatTensor:
    """Runs the decoder.

    Args:
        in_sequence (torch.FloatTensor): The input sequence as (batch-size x in-seq-len x dim_model)-tensor.
        out_sequence (torch.FloatTensor): The output sequence as (batch-size x out-seq-len x dim_model)-tensor.
        padding_mask (torch.ByteTensor, optional): Optionally, a padding mask as
            (batch-size x in-seq-len x in-seq-len)-tensor. To that end, ``1``s indicate those positions that are
            part of the according sequence, and ``0``s mark padding tokens.

    Returns:
        FloatTensor: The computed output as (batch_size x out-seq-len x dim_model)-tensor.
    """
    assert in_sequence.dim() == 3
    batch_size, in_len, in_dim = in_sequence.size()
    assert in_dim == self._dim_model
    assert out_sequence.dim() == 3
    assert out_sequence.size(0) == batch_size
    assert out_sequence.size(2) == self._dim_model
    if padding_mask is not None:
        # the mask has to be (batch-size x in-seq-len x in-seq-len)
        assert tuple(padding_mask.size()) == (batch_size, in_len, in_len)

    # the mask is derived from the un-shifted output sequence
    shifted_output_mask = util.create_shifted_output_mask(out_sequence)

    # shift provided target output to the right
    decoded = util.shift_output_sequence(out_sequence)

    # feed the running result through every layer in turn
    for layer in self._layers:
        decoded = layer(in_sequence, decoded, padding_mask,
                        shifted_output_mask)

    return decoded
def masked_topk(
    input_: torch.FloatTensor,
    mask: torch.BoolTensor,
    k: Union[int, torch.LongTensor],
    dim: int = -1,
) -> Tuple[torch.FloatTensor, torch.BoolTensor, torch.LongTensor]:
    """Select top-scoring entries of `input_` along `dim`, honouring `mask`.

    Args:
        input_: Scores to select from.
        mask: Tensor with the same shape as `input_`. It is passed to
            `replace_masked_values` together with the minimum value of the
            input dtype before the top-k selection.
        k: Either one int used for every slice, or a tensor whose shape is
            that of `input_` with dimension `dim` removed.
        dim: Dimension to select along; negative values count from the end.

    Returns:
        A tuple `(top_input, top_mask, top_indices)`. Each has the shape of
        `input_` with dimension `dim` replaced by the largest requested `k`.
        The selected indices are sorted in ascending order along `dim`.

    Raises:
        ValueError: If the shapes of `input_` and `mask` differ, if `dim`
            is out of range, or if a tensor `k` has the wrong shape.
    """
    if input_.size() != mask.size():
        raise ValueError("`input_` and `mask` must have the same shape.")
    if not -input_.dim() <= dim < input_.dim():
        raise ValueError("`dim` must be in `[-input_.dim(), input_.dim())`")
    # Normalise a negative `dim` to its non-negative equivalent.
    dim = (dim + input_.dim()) % input_.dim()

    max_k = k if isinstance(k, int) else k.max()

    # Permutation that moves `dim` to the last position ...
    permutation = list(range(input_.dim()))
    permutation.pop(dim)
    permutation += [dim]

    # ... and the permutation that moves the last position back to `dim`.
    reverse_permutation = list(range(input_.dim() - 1))
    reverse_permutation.insert(dim, -1)

    other_dims_size = list(input_.size())
    other_dims_size.pop(dim)
    permuted_size = other_dims_size + [max_k]  # for restoration

    # Turn `k` into a tensor with one entry per slice.
    if isinstance(k, int):
        k = k * torch.ones(*other_dims_size, dtype=torch.long, device=mask.device)
    else:
        if list(k.size()) != other_dims_size:
            raise ValueError(
                "`k` must have the same shape as `input_` with dimension `dim` removed."
            )

    # Flatten everything to 2-D: one row per slice, `num_items` columns.
    num_items = input_.size(dim)
    input_ = input_.permute(*permutation).reshape(-1, num_items)
    mask = mask.permute(*permutation).reshape(-1, num_items)
    k = k.reshape(-1)

    # presumably pushes masked entries to the bottom of the ranking --
    # depends on `replace_masked_values`, which is defined elsewhere
    input_ = replace_masked_values(input_, mask, min_value_of_dtype(input_.dtype))

    _, top_indices = input_.topk(max_k, 1)

    # NOTE(review): assumed to mark the first k[i] of the max_k slots in
    # row i -- confirm against `get_mask_from_sequence_lengths`.
    top_indices_mask = get_mask_from_sequence_lengths(k, max_k).bool()

    # Slots not selected by `top_indices_mask` are overwritten with the
    # largest index of their row, so every slot still holds a valid index.
    fill_value, _ = top_indices.max(dim=1, keepdim=True)
    top_indices = torch.where(top_indices_mask, top_indices, fill_value)

    # Sort the indices so the selection follows the original order.
    top_indices, _ = top_indices.sort(1)

    # A slot is reported as valid only if it was requested AND the entry it
    # points to is unmasked.
    sequence_mask = mask.gather(1, top_indices)
    top_mask = top_indices_mask & sequence_mask

    top_input = input_.gather(1, top_indices)

    # Undo the flattening and move the selection axis back to `dim`.
    return (
        top_input.reshape(*permuted_size).permute(*reverse_permutation),
        top_mask.reshape(*permuted_size).permute(*reverse_permutation),
        top_indices.reshape(*permuted_size).permute(*reverse_permutation),
    )
def fit(self, x: torch.FloatTensor):
    """Fit a StandardScaler on a 2-D tensor and store mean and std.

    The fitted ``mean_`` and ``scale_`` arrays are stored as flat float
    tensors in ``self.mean`` / ``self.std``. Entries that are not finite
    are replaced by mean 0 and std 1. Returns ``self``.
    """
    assert x.dim() == 2
    with torch.no_grad():
        scaler = StandardScaler()
        # Tensor.numpy() only works for CPU tensors, so copy to host first
        # (a no-op when x already lives on the CPU).
        scaler.fit(x.detach().cpu().numpy())
        self.mean = torch.FloatTensor(scaler.mean_.copy()).view(-1)
        self.std = torch.FloatTensor(scaler.scale_.copy()).view(-1)
        invalid = ~(torch.isfinite(self.mean) & torch.isfinite(self.std))
        # sometimes we face data with some all-NaN columns
        self.mean[invalid] = 0.
        self.std[invalid] = 1.
    return self
def forward(self, sequence: torch.FloatTensor) -> torch.FloatTensor:
    """Runs the feed-forward layer.

    Both layers are applied with dimensions 1 and 2 swapped, i.e. on a
    (batch_size x dim x seq_len) layout; the result is swapped back.

    Args:
        sequence (torch.FloatTensor): The input sequence given as (batch_size x seq_len x dim_model)-tensor.

    Returns:
        torch.FloatTensor: The computed values as (batch_size x seq_len x dim_model)-tensor.
    """
    assert sequence.dim() == 3
    assert sequence.size(2) == self._dim_model

    channels_first = sequence.transpose(1, 2)
    hidden = functional.relu(self._layer_1(channels_first))
    return self._layer_2(hidden).transpose(1, 2)
def rank_by_score(
        scores: _torch.FloatTensor, n: _torch.LongTensor,
        generator: Optional[_torch.Generator] = None) -> _torch.LongTensor:
    """Sorts scores in decreasing order.

    Scores are passed through ``mask_padded_values`` and then ranked with
    ``tiebreak_argsort``; this is meant to place padded documents last and
    to break ties randomly.

    Args:
        scores: A tensor of size (batch_size, list_size, 1) or
            (batch_size, list_size) containing scores.
        n: A tensor of size (batch_size) containing list size of each query.
        generator: Optional random generator forwarded to
            ``tiebreak_argsort``.
    """
    if scores.dim() == 3:
        # Drop the trailing dimension: (batch, list, 1) -> (batch, list).
        batch_size, list_size = scores.shape[0], scores.shape[1]
        scores = scores.reshape((batch_size, list_size))
    masked_scores = mask_padded_values(scores, n)
    return tiebreak_argsort(masked_scores, generator=generator)
def forward(self, sequence: torch.FloatTensor,
            mask: Union[None, torch.ByteTensor]) -> torch.FloatTensor:
    """
    Run the model. With several kernel sizes, the vectors produced by the
    individual kernels are concatenated at the end. Pooling is max pooling.

    :param sequence: input token sequence, shape: (batch_size, seq_len, embedding_dim)
    :param mask: mask, shape: (batch_size, seq_len), or None
    :return: cnn encoding vector, shape: (batch_size, num_filter * len(kernel_sizes))
    """
    assert sequence.dim(
    ) == 3, f"tokens.dim: {sequence.dim()} 与 shape: (batch_size, seq_len, embedding_dim) 不匹配"

    if mask is not None:
        assert mask.dim(
        ) == 2, f"mask.dim: {mask.dim()} 与 shape: (batch_size, seq_len) 不匹配"
        # Zero out the masked tokens so they do not influence the cnn.
        keep = mask.unsqueeze(dim=-1).float()
        sequence = sequence * keep

    # Swap dims 1 and 2, giving shape (batch_size, embedding_dim, seq_len).
    sequence = torch.transpose(sequence, 1, 2)

    # One activated feature map per cnn layer. Different kernel sizes
    # produce different output lengths (new_seq_len_i).
    cnn_vectors = []
    for cnn in self.cnn_layers:
        cnn_vectors.append(self.activtion(cnn(sequence)))

    assert cnn_vectors[0].dim() == 3, \
        f"cnn_vectors.dim: {cnn_vectors[0].dim()} 与 shape: (batch_size, num_filter, new_seq_len) 不匹配"
    assert cnn_vectors[0].size(1) == self.num_filters

    # Max pooling over the last dimension. A plain max is used instead of
    # MaxPool1D, which would need its kernel size set to seq_len.
    max_pooled_cnn_vectors = []
    for cnn_vector in cnn_vectors:
        pooled, _ = cnn_vector.max(dim=-1)
        max_pooled_cnn_vectors.append(pooled)

    assert max_pooled_cnn_vectors[0].dim() == 2, \
        f"max_pooled_cnn_vectors.dim: {max_pooled_cnn_vectors[0].dim()} 与 shape: (batch_size, num_filter) 不匹配"

    # Finally concatenate the pooled vectors (nothing to do for one layer).
    if len(max_pooled_cnn_vectors) > 1:
        return torch.cat(max_pooled_cnn_vectors, dim=-1)
    return max_pooled_cnn_vectors[0]
def tensor_to_np(t_img: torch.FloatTensor, encoding: str, out_format: str):
    '''
    Convert a tensor image to a numpy image (height, width, channel).

    This is sort of the inverse operation of format_tensor_img(). \\
    NOTE: this function is not optimized for speed

    Args:
        t_img: tensor image with 3 channels first; it is cloned, so the
            caller's tensor is left untouched
        encoding: how the tensor image was transformed. Accepted names:
            'RGB_1', 'RGB_1_norm', 'BGR_255_norm' (the last one currently
            raises NotImplementedError)
        out_format: 'RGB_1', 'BGR_1', 'RGB_uint8' or 'BGR_uint8'
    '''
    assert torch.is_tensor(t_img) and t_img.dim() == 3 and t_img.shape[0] == 3
    assert encoding in {'RGB_1', 'RGB_1_norm', 'BGR_255_norm'}
    assert out_format in {'RGB_1', 'BGR_1', 'BGR_uint8', 'RGB_uint8'}

    t_img = t_img.clone()

    # 0. convert everything to RGB_1
    if encoding == 'RGB_1_norm':
        means = [0.485, 0.456, 0.406]
        stds = [0.229, 0.224, 0.225]
        # undo the per-channel normalisation in place on the clone
        for c in range(3):
            t_img[c].mul_(stds[c]).add_(means[c])
    elif encoding != 'RGB_1':
        # 'BGR_255_norm' is not supported yet
        raise NotImplementedError()

    im = t_img.permute(1, 2, 0).numpy()

    # 1. convert RGB_1 to output format
    if out_format == 'RGB_1':
        return im
    if out_format == 'BGR_1':
        return cv2.cvtColor(im, cv2.COLOR_RGB2BGR)
    if out_format == 'RGB_uint8':
        im = cv2.cvtColor(im, cv2.COLOR_RGB2BGR)
        im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
        return (im * 255).astype('uint8')
    if out_format == 'BGR_uint8':
        im = cv2.cvtColor(im, cv2.COLOR_RGB2BGR)
        return (im * 255).astype('uint8')
    raise NotImplementedError()
def forward(self,
            sequence: torch.FloatTensor,
            padding_mask: torch.ByteTensor = None) -> torch.FloatTensor:
    """Runs the encoder.

    Args:
        sequence (torch.FloatTensor): The input sequence as (batch-size x seq-len x dim-model)-tensor.
        padding_mask (torch.ByteTensor, optional): Optionally, a padding mask as
            (batch-size x in-seq-len x in-seq-len)-tensor. To that end, ``1``s indicate those positions that are
            part of the according sequence, and ``0``s mark padding tokens.

    Returns:
        FloatTensor: The encoded sequence as (batch_size x seq_len x dim_model)-tensor.
    """
    assert sequence.dim() == 3
    assert sequence.size(2) == self._dim_model

    # each layer consumes the output of the previous one
    encoded = sequence
    for encoder_layer in self._layers:
        encoded = encoder_layer(encoded, padding_mask)
    return encoded
def bbox_to_mask(bboxes: torch.FloatTensor,
                 bb_format='cxcywhd',
                 mask_size=2048) -> torch.BoolTensor:
    '''
    Convert bounding boxes to binary masks

    Args:
        bboxes: bounding boxes, shape [N, bb_param]
        bb_format: box parameterisation; only 'cxcywhd' (5 parameters, the
            last one an angle in degrees) is implemented
        mask_size: forwarded to vertex2masks

    Return:
        masks: shape [N, mask_size, mask_size]
    '''
    assert isinstance(bboxes, torch.FloatTensor) and bboxes.dim() == 2

    if bb_format != 'cxcywhd':
        raise NotImplementedError()

    assert bboxes.shape[1] == 5
    # work on a copy so the caller's boxes keep their angle in degrees
    boxes_rad = bboxes.clone()
    boxes_rad[:, 4] = boxes_rad[:, 4] / 180 * pi
    vertices = xywha2vertex(boxes_rad, is_degree=False)
    return vertex2masks(vertices, mask_size=mask_size)
def _viterbi_decode(self, emissions: torch.FloatTensor,
                    mask: torch.ByteTensor) -> List[List[int]]:
    """Viterbi decoding with transitions mapped through label embeddings.

    Start, pairwise and end transition scores are obtained by multiplying
    ``self.start_transitions`` / ``self.transitions`` /
    ``self.end_transitions`` with ``self.labelembedding.weight``.

    Args:
        emissions: (seq_length, batch_size, num_tags)
        mask: (seq_length, batch_size); non-zero marks a valid time step.
            The first time step must be valid for every sequence. Bool and
            integer dtypes are accepted.

    Returns:
        One list of tag indices per batch element.
    """
    assert emissions.dim() == 3 and mask.dim() == 2
    assert emissions.shape[:2] == mask.shape
    assert emissions.size(2) == self.num_tags
    # torch.where needs a boolean condition; passing a uint8 mask is
    # deprecated and other integer dtypes are rejected.
    mask = mask.bool()
    assert mask[0].all()

    seq_length, batch_size = mask.shape

    # Start transition and first emission
    # shape: (batch_size, num_tags)
    temp1 = torch.matmul(self.start_transitions, self.transitions)
    temp2 = torch.transpose(self.labelembedding.weight, 0, 1)
    temp = torch.matmul(temp1, temp2).squeeze()
    score = temp + emissions[0]
    history = []

    # Pairwise transition scores between tags.
    trans = torch.matmul(
        torch.matmul(self.labelembedding.weight, self.transitions),
        torch.transpose(self.labelembedding.weight, 0, 1))

    # score is a tensor of size (batch_size, num_tags) where for every batch,
    # value at column j stores the score of the best tag sequence so far that
    # ends with tag j
    # history saves where the best tags candidate transitioned from; this is
    # used when we trace back the best tag sequence

    # Viterbi algorithm recursive case: we compute the score of the best tag
    # sequence for every possible next tag
    for i in range(1, seq_length):
        # Broadcast viterbi score for every possible next tag
        # shape: (batch_size, num_tags, 1)
        broadcast_score = score.unsqueeze(2)

        # Broadcast emission score for every possible current tag
        # shape: (batch_size, 1, num_tags)
        broadcast_emission = emissions[i].unsqueeze(1)

        # Score of every (previous tag i, current tag j) combination
        # shape: (batch_size, num_tags, num_tags)
        next_score = broadcast_score + trans + broadcast_emission

        # Find the maximum score over all possible previous tags
        # shape: (batch_size, num_tags)
        next_score, indices = next_score.max(dim=1)

        # Set score to the next score if this timestep is valid (mask == 1)
        # and save the index that produces the next score
        # shape: (batch_size, num_tags)
        score = torch.where(mask[i].unsqueeze(1), next_score, score)
        history.append(indices)

    # End transition score
    # shape: (batch_size, num_tags)
    temp1 = torch.matmul(self.labelembedding.weight, self.transitions)
    temp = torch.matmul(temp1, self.end_transitions).squeeze()
    score += temp

    # Now, compute the best path for each sample
    # Index of the last valid time step of every sequence.
    seq_ends = (mask.long().sum(dim=0) - 1).tolist()
    best_tags_list = []

    for idx in range(batch_size):
        # Find the tag which maximizes the score at the last timestep; this
        # is our best tag for the last timestep
        _, best_last_tag = score[idx].max(dim=0)
        best_tags = [best_last_tag.item()]

        # We trace back where the best last tag comes from, append that to
        # our best tag sequence, and trace it back again, and so on
        for hist in reversed(history[:seq_ends[idx]]):
            best_last_tag = hist[idx][best_tags[-1]]
            best_tags.append(best_last_tag.item())

        # Reverse the order because we start from the last timestep
        best_tags.reverse()
        best_tags_list.append(best_tags)

    return best_tags_list
def forward(
        self,
        source: torch.FloatTensor,  # [batch, tgt_len, dim]
        memory_bank_list: List[
            torch.FloatTensor],  # [num_srcs] x [batch, src_len, dim]
        memory_lengths_list: List[
            torch.FloatTensor] = None,  # [num_srcs] x [batch]
        coverage=None
) -> Tuple[torch.FloatTensor, torch.FloatTensor]:
    """Attend over the concatenation of several memory banks.

    Args:
        source: Query vectors, [batch, tgt_len, dim], or [batch, dim] for a
            single decoding step.
        memory_bank_list: One [batch, src_len, dim] tensor per source; they
            are concatenated along the length dimension.
        memory_lengths_list: Optional per-source lengths used to mask out
            padded positions before normalisation.
        coverage: Must be None.

    Returns:
        ``(attn_h, align_vectors)``. For one-step input the shapes are
        [batch, dim] and [batch, src_len]; otherwise
        [tgt_len, batch, dim] and [tgt_len, batch, src_len], where src_len
        is the total length of the concatenated memory banks.
    """
    assert coverage is None

    # one step input
    one_step = source.dim() == 2
    if one_step:
        source = source.unsqueeze(1)

    # Join memory bank
    memory_bank = torch.cat(memory_bank_list, dim=1)

    batch, source_l, dim = memory_bank.size()
    batch_, target_l, dim_ = source.size()
    aeq(batch, batch_)
    aeq(dim, dim_)
    aeq(self.dim, dim)

    # Only reachable when assertions are disabled.
    if coverage is not None:
        batch_, source_l_ = coverage.size()
        aeq(batch, batch_)
        aeq(source_l, source_l_)

        cover = coverage.view(-1).unsqueeze(1)
        memory_bank += self.linear_cover(cover).view_as(memory_bank)
        memory_bank = torch.tanh(memory_bank)

    # compute attention scores, as in Luong et al.
    align = self.score(source, memory_bank)

    if memory_lengths_list is not None:
        mask = torch.cat([
            sequence_mask(memory_lengths,
                          max_len=memory_bank_list[src_i].size(1))
            for src_i, memory_lengths in enumerate(memory_lengths_list)
        ], dim=1)
        mask = mask.unsqueeze(1)  # Make it broadcastable.
        # ``1 - mask`` is not supported for boolean masks; converting to
        # bool and inverting works for both bool and 0/1 integer masks.
        align.masked_fill_(~mask.bool(), -float('inf'))

    # Softmax or sparsemax to normalize attention weights
    flat_align = align.view(batch * target_l, source_l)
    if self.attn_func == "softmax":
        align_vectors = F.softmax(flat_align, -1)
    else:
        align_vectors = sparsemax(flat_align, -1)
    align_vectors = align_vectors.view(batch, target_l, source_l)

    # each context vector c_t is the weighted average
    # over all the source hidden states
    c = torch.bmm(align_vectors, memory_bank)

    # concatenate
    concat_c = torch.cat([c, source], 2).view(batch * target_l, dim * 2)
    attn_h = self.linear_out(concat_c).view(batch, target_l, dim)
    if self.attn_type in ["general", "dot"]:
        attn_h = torch.tanh(attn_h)

    if one_step:
        attn_h = attn_h.squeeze(1)
        align_vectors = align_vectors.squeeze(1)

        # Check output sizes
        batch_, dim_ = attn_h.size()
        aeq(batch, batch_)
        aeq(dim, dim_)
        batch_, source_l_ = align_vectors.size()
        aeq(batch, batch_)
        aeq(source_l, source_l_)
    else:
        attn_h = attn_h.transpose(0, 1).contiguous()
        align_vectors = align_vectors.transpose(0, 1).contiguous()

        # Check output sizes
        target_l_, batch_, dim_ = attn_h.size()
        aeq(target_l, target_l_)
        aeq(batch, batch_)
        aeq(dim, dim_)
        target_l_, batch_, source_l_ = align_vectors.size()
        aeq(target_l, target_l_)
        aeq(batch, batch_)
        aeq(source_l, source_l_)

    return attn_h, align_vectors
def _viterbi_decode(self, emissions: torch.FloatTensor,
                    mask: torch.ByteTensor) -> List[List[int]]:
    """Return the highest-scoring tag sequence for every batch element.

    Args:
        emissions: (seq_length, batch_size, num_tags)
        mask: (seq_length, batch_size); non-zero marks a valid time step.
            The first time step must be valid for every sequence. The mask
            is converted to a boolean tensor on the device of ``emissions``.

    Returns:
        One list of tag indices per batch element.
    """
    assert emissions.dim() == 3 and mask.dim() == 2
    assert emissions.shape[:2] == mask.shape
    assert emissions.size(2) == self.num_tags
    # Follow the device of the emissions instead of forcing CUDA, and use a
    # boolean mask because torch.where needs a boolean condition.
    mask = torch.as_tensor(mask, dtype=torch.bool, device=emissions.device)
    assert mask[0].all()

    seq_length, batch_size = mask.shape

    # start_transitions: score of moving from <start> to each tag (<end> is
    # not included). Adding the emission score of the first token of every
    # sentence broadcasts over the batch,
    # e.g. emissions [62, 32, 3] and start_transitions [3].
    score = self.start_transitions + emissions[0]
    history = []

    for i in range(1, seq_length):
        # Previous scores, one column per possible next tag,
        # e.g. [32, 3] -> [32, 3, 1]
        broadcast_score = score.unsqueeze(2)

        # Emission scores of the i-th token of every sentence,
        # e.g. [32, 3] -> [32, 1, 3]
        broadcast_emission = emissions[i].unsqueeze(1)

        # previous score + transition score + emission score
        next_score = broadcast_score + self.transitions + broadcast_emission

        # For every current tag keep the best previous tag and its score.
        next_score, indices = next_score.max(dim=1)

        # Padded positions do not change the score.
        score = torch.where(mask[i].unsqueeze(1), next_score, score)
        history.append(indices)

    # After the last token, add the transition to <end>.
    score += self.end_transitions

    # Index of the last valid token of every sentence.
    seq_ends = (mask.long().sum(dim=0) - 1).tolist()
    best_tags_list = []

    for idx in range(batch_size):
        # Best tag of the last valid token of this sentence.
        _, best_last_tag = score[idx].max(dim=0)
        best_tags = [best_last_tag.item()]

        # history[:seq_ends[idx]] drops the padded steps; walking it in
        # reverse follows the back-pointers from the last token to the
        # first.
        for hist in reversed(history[:seq_ends[idx]]):
            best_last_tag = hist[idx][best_tags[-1]]
            best_tags.append(best_last_tag.item())

        best_tags.reverse()
        best_tags_list.append(best_tags)

    return best_tags_list
def _viterbi_decode(self, emissions: torch.FloatTensor,
                    mask: torch.ByteTensor) -> List[List[int]]:
    """Viterbi decoding that can also return scores.

    Args:
        emissions: (seq_length, batch_size, num_tags)
        mask: (seq_length, batch_size); non-zero marks a valid time step.
            The first time step must be valid for every sequence. Bool and
            integer dtypes are accepted.

    Returns:
        With ``self.score_reduction == 'skip'`` only the list of best tag
        sequences. Otherwise a tuple ``(best_tags_list, score)`` where
        ``score`` depends on ``self.score_reduction``: ``'tags'`` gives the
        per-step scores of shape (batch_size, seq_length, num_tags),
        ``'none'`` the final scores of shape (batch_size, num_tags), and
        ``'sum'``/``'max'``/``'min'``/``'mean'`` a reduced scalar.
    """
    assert emissions.dim() == 3 and mask.dim() == 2
    assert emissions.shape[:2] == mask.shape
    assert emissions.size(2) == self.num_tags
    # torch.where needs a boolean condition; passing a uint8 mask is
    # deprecated and other integer dtypes are rejected.
    mask = mask.bool()
    assert mask[0].all()

    seq_length, batch_size = mask.shape

    # Start transition and first emission
    # shape: (batch_size, num_tags)
    score = self.start_transitions + emissions[0]
    history = []

    # Keep the scores to later return them if the user wants them based on
    # the score reduction, which will have shape:
    # (batch_size, seq_len, num_tags)
    tag_scores = [] if self.score_reduction == 'tags' else None

    # score is a tensor of size (batch_size, num_tags) where for every batch,
    # value at column j stores the score of the best tag sequence so far that
    # ends with tag j
    # history saves where the best tags candidate transitioned from; this is
    # used when we trace back the best tag sequence

    # Viterbi algorithm recursive case: we compute the score of the best tag
    # sequence for every possible next tag
    for i in range(1, seq_length):
        # Broadcast viterbi score for every possible next tag
        # shape: (batch_size, num_tags, 1)
        broadcast_score = score.unsqueeze(2)

        # Broadcast emission score for every possible current tag
        # shape: (batch_size, 1, num_tags)
        broadcast_emission = emissions[i].unsqueeze(1)

        # Score of every (previous tag i, current tag j) combination
        # shape: (batch_size, num_tags, num_tags)
        next_score = broadcast_score + self.transitions + broadcast_emission

        # Find the maximum score over all possible previous tags
        # shape: (batch_size, num_tags)
        next_score, indices = next_score.max(dim=1)

        # Set score to the next score if this timestep is valid (mask == 1)
        # and save the index that produces the next score
        # shape: (batch_size, num_tags)
        score = torch.where(mask[i].unsqueeze(1), next_score, score)

        # Add scores to cat them later based on the score reduction
        # shape: (batch_size, num_tags)
        if tag_scores is not None:
            tag_scores.append(score.detach().unsqueeze(1))

        history.append(indices)

    # End transition score
    # shape: (batch_size, num_tags)
    score += self.end_transitions

    # Add the final transition score to our returned scores
    # shape: (batch_size, num_tags)
    if tag_scores is not None:
        tag_scores.append(score.detach().unsqueeze(1))

    # Now, compute the best path for each sample
    # Index of the last valid time step of every sequence.
    seq_ends = (mask.long().sum(dim=0) - 1).tolist()
    best_tags_list = []

    for idx in range(batch_size):
        # Find the tag which maximizes the score at the last timestep; this
        # is our best tag for the last timestep
        _, best_last_tag = score[idx].max(dim=0)
        best_tags = [best_last_tag.item()]

        # We trace back where the best last tag comes from, append that to
        # our best tag sequence, and trace it back again, and so on
        for hist in reversed(history[:seq_ends[idx]]):
            best_last_tag = hist[idx][best_tags[-1]]
            best_tags.append(best_last_tag.item())

        # Reverse the order because we start from the last timestep
        best_tags.reverse()
        best_tags_list.append(best_tags)

    # If the user wants scores, return them in the desired format
    if self.score_reduction == 'skip':
        return best_tags_list

    if self.score_reduction == 'tags':
        # shape: (batch_size, seq_length, num_tags)
        score = torch.cat(tag_scores, 1)
    elif self.score_reduction != 'none':
        # Best final score per sequence, then reduced over the batch.
        score = score.max(dim=1)[0]
        score = {
            'sum': score.sum(),
            'max': score.max(),
            'min': score.min(),
            'mean': score.mean()
        }[self.score_reduction]
        # NOTE(review): the nesting of this division was reconstructed from
        # whitespace-collapsed source; confirm it belongs to this branch.
        score = score / batch_size
    return best_tags_list, score
def _audio_postprocess(self, feats: torch.FloatTensor) -> torch.FloatTensor:
    """Collapse 2-D features to 1-D and layer-normalise them.

    A 2-D input is averaged over its last dimension. The resulting 1-D
    tensor is normalised over its full length.
    """
    # ``dim`` is a method: it has to be called, otherwise the comparison is
    # always False and 2-D input trips the assertion below.
    if feats.dim() == 2:
        feats = feats.mean(-1)

    assert feats.dim() == 1, feats.dim()

    return F.layer_norm(feats, feats.shape)