def buffered_mask(self, tensor):
    dim = tensor.size(-1)
    if self._mask is None:
        self._mask = torch.triu(utils.fill_with_neg_inf(tensor.new(dim, dim)), 1)
    if self._mask.size(0) < dim:
        self._mask = torch.triu(utils.fill_with_neg_inf(self._mask.resize_(dim, dim)), 1)
    return self._mask[:dim, :dim]
def forward(self, input_features, adj):
    #x = self.conv1(input_features, adj)
    #x = self.bn1(x)
    #x = self.act(x)
    #x = self.conv2(x, adj)
    #x = self.bn2(x)

    # pool over all nodes
    #graph_h = self.pool_graph(x)
    graph_h = input_features.view(-1, self.max_num_nodes * self.max_num_nodes)
    # vae
    h_decode, z_mu, z_lsgms = self.vae(graph_h)
    out = F.sigmoid(h_decode)
    out_tensor = out.cpu().data
    recon_adj_lower = self.recover_adj_lower(out_tensor)
    recon_adj_tensor = self.recover_full_adj_from_lower(recon_adj_lower)

    # set the matching features to be the node degrees
    out_features = torch.sum(recon_adj_tensor, 1)

    adj_data = adj.cpu().data[0]
    adj_features = torch.sum(adj_data, 1)

    S = self.edge_similarity_matrix(adj_data, recon_adj_tensor, adj_features, out_features,
                                    self.deg_feature_similarity)

    # initialization strategies
    init_corr = 1 / self.max_num_nodes
    init_assignment = torch.ones(self.max_num_nodes, self.max_num_nodes) * init_corr
    #init_assignment = torch.FloatTensor(4, 4)
    #init.uniform(init_assignment)
    assignment = self.mpm(init_assignment, S)
    #print('Assignment: ', assignment)

    # matching
    # use the negative of the assignment score since the algorithm finds min cost flow
    row_ind, col_ind = scipy.optimize.linear_sum_assignment(-assignment.numpy())
    print('row: ', row_ind)
    print('col: ', col_ind)
    # order row index according to col index
    #adj_permuted = self.permute_adj(adj_data, row_ind, col_ind)
    adj_permuted = adj_data
    adj_vectorized = adj_permuted[torch.triu(torch.ones(self.max_num_nodes, self.max_num_nodes)) == 1].squeeze_()
    adj_vectorized_var = Variable(adj_vectorized).cuda()

    #print(adj)
    #print('permuted: ', adj_permuted)
    #print('recon: ', recon_adj_tensor)
    adj_recon_loss = self.adj_recon_loss(adj_vectorized_var, out[0])
    print('recon: ', adj_recon_loss)
    print(adj_vectorized_var)
    print(out[0])

    loss_kl = -0.5 * torch.sum(1 + z_lsgms - z_mu.pow(2) - z_lsgms.exp())
    loss_kl /= self.max_num_nodes * self.max_num_nodes  # normalize
    print('kl: ', loss_kl)

    loss = adj_recon_loss + loss_kl
    return loss
def get_subsequent_mask(seq):
    """ For preventing lookahead """
    batch_size, seq_len = seq.size()
    subsequent_mask = torch.triu(
        torch.ones((seq_len, seq_len), device=seq.device, dtype=torch.uint8), diagonal=1)
    subsequent_mask = subsequent_mask.unsqueeze(0).expand(batch_size, -1, -1)  # size: [batch_size x seq_len x seq_len]
    return subsequent_mask
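# Usage sketch (added for illustration, not part of the original source; assumes the
# get_subsequent_mask defined above is in scope). A 1 marks a future position to be blocked.
import torch

seq = torch.zeros(2, 4, dtype=torch.long)   # dummy batch: 2 sequences of length 4
mask = get_subsequent_mask(seq)
print(mask.shape)   # torch.Size([2, 4, 4])
print(mask[0])
# tensor([[0, 1, 1, 1],
#         [0, 0, 1, 1],
#         [0, 0, 0, 1],
#         [0, 0, 0, 0]], dtype=torch.uint8)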
def __call__(self, y_pred, y_true=None):
    """ y_pred should be two projections """
    covar_mat = th.abs(th_matrixcorr(y_pred[0].data, y_pred[1].data))
    upper_sum = th.sum(th.triu(covar_mat, 1))
    lower_sum = th.sum(th.tril(covar_mat, -1))
    self.anticorr_sum += upper_sum
    self.anticorr_sum += lower_sum
    self.total_count += covar_mat.size(0) * (covar_mat.size(1) - 1)
    return self.anticorr_sum / self.total_count
def btriunpack(LU_data, LU_pivots, unpack_data=True, unpack_pivots=True):
    r"""Unpacks the data and pivots from a batched LU factorization (btrifact) of a tensor.

    Returns a tuple indexed by:
      0: The pivots.
      1: The L tensor.
      2: The U tensor.

    Arguments:
        LU_data (Tensor): the packed LU factorization data
        LU_pivots (Tensor): the packed LU factorization pivots
        unpack_data (bool): flag indicating if the data should be unpacked
        unpack_pivots (bool): flag indicating if the pivots should be unpacked

    Example::

        >>> A = torch.randn(2, 3, 3)
        >>> A_LU, pivots = A.btrifact()
        >>> P, A_L, A_U = torch.btriunpack(A_LU, pivots)
        >>>
        >>> # test that (P, A_L, A_U) gives LU factorization
        >>> A_ = torch.bmm(P, torch.bmm(A_L, A_U))
        >>> assert torch.equal(A_, A)  # can recover A
    """
    nBatch, sz, _ = LU_data.size()

    if unpack_data:
        I_U = torch.triu(torch.ones(sz, sz)).type_as(LU_data).byte().unsqueeze(0).expand(nBatch, sz, sz)
        I_L = 1 - I_U
        L = LU_data.new(LU_data.size()).zero_()
        U = LU_data.new(LU_data.size()).zero_()
        I_diag = torch.eye(sz).type_as(LU_data).byte().unsqueeze(0).expand(nBatch, sz, sz)
        L[I_diag] = 1.0
        L[I_L] = LU_data[I_L]
        U[I_U] = LU_data[I_U]
    else:
        L = U = None

    if unpack_pivots:
        P = torch.eye(sz).type_as(LU_data).unsqueeze(0).repeat(nBatch, 1, 1)
        for i in range(nBatch):
            for j in range(sz):
                k = LU_pivots[i, j] - 1
                t = P[i, :, j].clone()
                P[i, :, j] = P[i, :, k]
                P[i, :, k] = t
    else:
        P = None

    return P, L, U
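# Note (added, hedged): `btrifact`/`btriunpack` belong to an older PyTorch API and were later
# removed. On a recent PyTorch (assumption: >= 1.13) the same round trip can be written with the
# torch.linalg LU routines instead of the manual triu/tril unpacking above.
import torch

A = torch.randn(2, 3, 3)
LU, pivots = torch.linalg.lu_factor(A)     # packed LU, analogous to btrifact
P, L, U = torch.lu_unpack(LU, pivots)      # analogous to btriunpack
assert torch.allclose(P @ L @ U, A, atol=1e-5)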
def forward(
    self,
    query,
    key,
    value,
    mask_future_timesteps=False,
    key_padding_mask=None,
    use_scalar_bias=False,
):
    """Input shape: Time x Batch x Channel

    Self-attention can be implemented by passing in the same arguments for
    query, key and value. Future timesteps can be masked with the
    `mask_future_timesteps` argument. Padding elements can be excluded from
    the key by passing a binary ByteTensor (`key_padding_mask`) with shape:
    batch x src_len, where padding elements are indicated by 1s.
    """
    src_len, bsz, out_channels = key.size()
    tgt_len = query.size(0)
    assert list(query.size()) == [tgt_len, bsz, out_channels]
    assert key.size() == value.size()

    if key_padding_mask is not None:
        assert key_padding_mask.size(0) == bsz
        assert key_padding_mask.size(1) == src_len

    if self.downsample:
        size = bsz
    else:
        size = bsz * self.num_heads

    k = key
    v = value
    q = query
    if self.project_input:
        q = self.in_proj_q(q)
        k = self.in_proj_k(k)
        v = self.in_proj_v(v)
        src_len = k.size()[0]
    q *= self.scaling

    if not self.downsample:
        q = q.view(tgt_len, size, self.head_dim)
        k = k.view(src_len, size, self.head_dim)
        v = v.view(src_len, size, self.head_dim)

    q = q.transpose(0, 1)
    k = k.transpose(0, 1)
    v = v.transpose(0, 1)

    attn_weights = torch.bmm(q, k.transpose(1, 2))
    if mask_future_timesteps:
        assert query.size() == key.size(), \
            'mask_future_timesteps only applies to self-attention'
        attn_weights *= torch.tril(
            attn_weights.data.new([1]).expand(tgt_len, tgt_len).clone(),
            diagonal=-1,
        )[:, ::self.head_index + 1 if self.downsample else 1].unsqueeze(0)
        attn_weights += torch.triu(
            attn_weights.data.new([-math.inf]).expand(tgt_len, tgt_len).clone(),
            diagonal=0
        )[:, ::self.head_index + 1 if self.downsample else 1].unsqueeze(0)
    tgt_size = tgt_len
    if use_scalar_bias:
        attn_weights = scalar_bias(attn_weights, 2)
        v = scalar_bias(v, 1)
        tgt_size += 1

    if key_padding_mask is not None:
        # don't attend to padding symbols
        if key_padding_mask.max() > 0:
            if self.downsample:
                attn_weights = attn_weights.view(bsz, 1, tgt_len, src_len)
            else:
                attn_weights = attn_weights.view(size, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights.masked_fill(
                key_padding_mask.unsqueeze(1).unsqueeze(2),
                -math.inf,
            )
            attn_weights = attn_weights.view(size, tgt_len, src_len)
    attn_weights = F.softmax(attn_weights, dim=-1)
    attn_weights = F.dropout(attn_weights, p=self.dropout, training=self.training)

    attn = torch.bmm(attn_weights, v)
    if self.downsample:
        attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, self.head_dim)
    else:
        attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, self.embed_dim)

    attn = self.out_proj(attn)

    return attn, attn_weights
def subsequent_mask(size):
    attn_shape = (1, size, size)
    mask = torch.triu(torch.ones(attn_shape, dtype=torch.float), diagonal=1)
    return mask == 0
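# Usage sketch (added for illustration, not part of the original source; assumes subsequent_mask
# above is in scope). Here True means "allowed to attend", i.e. the current position and earlier ones.
import torch

m = subsequent_mask(3)
print(m)
# tensor([[[ True, False, False],
#          [ True,  True, False],
#          [ True,  True,  True]]])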
def _forward(self, dec_inp, mems=None):
    qlen, bsz = dec_inp.size()

    word_emb = self.word_emb(dec_inp)

    mlen = mems[0].size(0) if mems is not None else 0
    klen = mlen + qlen
    if self.same_length:
        all_ones = word_emb.new_ones(qlen, klen)
        mask_len = klen - self.mem_len
        if mask_len > 0:
            mask_shift_len = qlen - mask_len
        else:
            mask_shift_len = qlen
        dec_attn_mask = (torch.triu(all_ones, 1 + mlen)
                         + torch.tril(all_ones, -mask_shift_len)).bool()[:, :, None]  # -1
    else:
        dec_attn_mask = torch.triu(
            word_emb.new_ones(qlen, klen), diagonal=1 + mlen).bool()[:, :, None]

    hids = []
    if self.attn_type == 0:  # default
        pos_seq = torch.arange(klen - 1, -1, -1.0, device=word_emb.device,
                               dtype=word_emb.dtype)
        if self.clamp_len > 0:
            pos_seq.clamp_(max=self.clamp_len)
        pos_emb = self.pos_emb(pos_seq)

        core_out = self.drop(word_emb)
        pos_emb = self.drop(pos_emb)

        hids.append(core_out)
        for i, layer in enumerate(self.layers):
            mems_i = None if mems is None else mems[i]
            core_out = layer(core_out, pos_emb, dec_attn_mask=dec_attn_mask, mems=mems_i)
            hids.append(core_out)
    elif self.attn_type == 1:  # learnable
        core_out = self.drop(word_emb)
        hids.append(core_out)
        for i, layer in enumerate(self.layers):
            if self.clamp_len > 0:
                r_emb = self.r_emb[i][-self.clamp_len:]
                r_bias = self.r_bias[i][-self.clamp_len:]
            else:
                r_emb, r_bias = self.r_emb[i], self.r_bias[i]

            mems_i = None if mems is None else mems[i]
            core_out = layer(core_out, r_emb, self.r_w_bias[i],
                             r_bias, dec_attn_mask=dec_attn_mask, mems=mems_i)
            hids.append(core_out)
    elif self.attn_type == 2:  # absolute
        pos_seq = torch.arange(klen - 1, -1, -1.0, device=word_emb.device,
                               dtype=word_emb.dtype)
        if self.clamp_len > 0:
            pos_seq.clamp_(max=self.clamp_len)
        pos_emb = self.pos_emb(pos_seq)

        core_out = self.drop(word_emb + pos_emb[-qlen:])

        hids.append(core_out)
        for i, layer in enumerate(self.layers):
            mems_i = None if mems is None else mems[i]
            if mems_i is not None and i == 0:
                mems_i += pos_emb[:mlen]
            core_out = layer(core_out, dec_attn_mask=dec_attn_mask, mems=mems_i)
            hids.append(core_out)
    elif self.attn_type == 3:
        core_out = self.drop(word_emb)

        hids.append(core_out)
        for i, layer in enumerate(self.layers):
            mems_i = None if mems is None else mems[i]
            if mems_i is not None and mlen > 0:
                cur_emb = self.r_emb[i][:-qlen]
                cur_size = cur_emb.size(0)
                if cur_size < mlen:
                    cur_emb_pad = cur_emb[0:1].expand(mlen - cur_size, -1, -1)
                    cur_emb = torch.cat([cur_emb_pad, cur_emb], 0)
                else:
                    cur_emb = cur_emb[-mlen:]
                mems_i += cur_emb.view(mlen, 1, -1)
            core_out += self.r_emb[i][-qlen:].view(qlen, 1, -1)

            core_out = layer(core_out, dec_attn_mask=dec_attn_mask, mems=mems_i)
            hids.append(core_out)

    core_out = self.drop(core_out)

    new_mems = self._update_mems(hids, mems, mlen, qlen)

    return core_out, new_mems
def make_mask(n_head, length):
    a = torch.ones(1, n_head, length, length)
    b = torch.triu(a, diagonal=0)
    return b.transpose(2, 3)
def _generate_square_subsequent_mask(self, sz):
    # triu returns the upper triangular part of a matrix (2-D tensor) or batch of matrices (see section below)
    mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask
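# Usage sketch (added for illustration, not part of the original source): the additive mask is
# 0.0 where attention is allowed and -inf where it is blocked, so it can simply be added to the
# attention scores before the softmax.
import torch

sz = 3
mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
print(mask)
# tensor([[0., -inf, -inf],
#         [0., 0., -inf],
#         [0., 0., 0.]])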
def buffered_future_mask(self, tensor):
    dim = tensor.size(0)
    if (
        not hasattr(self, '_future_mask')
        or self._future_mask is None
        or self._future_mask.device != tensor.device
        or self._future_mask.size(0) < dim
    ):
        self._future_mask = torch.triu(utils.fill_with_neg_inf(tensor.new(dim, dim)), 1)
    return self._future_mask[:dim, :dim]
def get_mask(self, seq):
    mask = (torch.triu(torch.ones(seq.size(1), seq.size(1))) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(
        mask == 1, float(0.0))
    return mask.to(self._config.device)
    device = src.device
    mask = self._generate_square_subsequent_mask(len(src)).to(device)
    self.src_mask = mask

    src = self.encoder(src) * math.sqrt(self.ninp)
    src = self.pos_encoder(src)
    output = self.transformer_encoder(src, self.src_mask)
    output = self.decoder(output)
    return output

"""#### Masking

By passing the mask into the transformer_encoder forward() function, attention is only
calculated over the earlier positions in the sequence.
"""

# triu returns the upper triangular part of a matrix (2-D tensor) or batch of matrices (see section below)
torch.triu(torch.ones(3, 3))

# Masking
def masking():
    sz = 4
    mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask

masking()

"""### Positional Encoding

The Transformer takes the base architecture of the Seq2Seq model (encoder - decoder).
However, the Transformer does not use a recurrent model, so we need a module that captures
the sequence information of the input/output.
"""
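# Sketch (added for illustration; names are placeholders, not from the original notebook): the
# additive mask produced by masking() in the masking section above is passed as the `mask`
# argument of nn.TransformerEncoder, so each position only attends to itself and earlier positions.
import torch
import torch.nn as nn

encoder_layer = nn.TransformerEncoderLayer(d_model=16, nhead=2)
encoder = nn.TransformerEncoder(encoder_layer, num_layers=1)
x = torch.randn(4, 2, 16)           # (seq_len, batch, d_model)
out = encoder(x, mask=masking())    # masking() returns a 4x4 additive mask matching seq_len=4
print(out.shape)                    # torch.Size([4, 2, 16])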
def forward_seq2seq(self, batch, target_masking=None, zero_encoder=False):
    """
    Inputs Shapes:
        input: (Variable) batch_size x len_tgt (to be transposed)
        context: (Variable) batch_size x len_src x d_model
        mask_src: (Tensor) batch_size x len_src

    Outputs Shapes:
        out: batch_size x len_tgt x d_model
        coverage: batch_size x len_tgt x len_src
    """
    src = batch.get('source')
    tgt = batch.get('target_input')

    input = torch.cat([src, tgt], dim=0)

    """ Embedding: batch_size x len_tgt x d_model """
    # we work with two embeddings at the same time
    src_emb = embedded_dropout(self.src_word_lut, src,
                               dropout=self.word_dropout if self.training else 0)
    tgt_emb = embedded_dropout(self.tgt_word_lut, tgt,
                               dropout=self.word_dropout if self.training else 0)

    # concatenate the embeddings along the time dimension
    emb = torch.cat([src_emb, tgt_emb], dim=0)

    # add dropout and scale
    emb = self.preprocess_layer(emb)
    emb = emb * math.sqrt(self.model_size)

    klen, batch_size = emb.size(0), emb.size(1)

    # prepare positional encoding:
    pos_seq = torch.arange(klen - 1, -1, -1.0, device=emb.device, dtype=emb.dtype)
    # pos_seq = torch.arange(0, klen, device=emb.device, dtype=emb.dtype)
    pos_emb = self.preprocess_layer(self.positional_encoder(pos_seq))

    if self.use_feature:
        raise NotImplementedError  # no features/attributes for the moment

    # attention masking
    qlen = klen
    mlen = 0  # we don't have any memory in this mode

    # print(input)
    dec_attn_mask = torch.triu(
        emb.new_ones(qlen, klen), diagonal=1 + mlen).byte()[:, :, None]  # size T x T ?

    pad_mask = input.eq(onmt.Constants.PAD).byte().unsqueeze(1)  # size 1 x T x B
    # pad_mask = input.new(*input.size()).zero_()

    mask = dec_attn_mask + pad_mask
    mask = torch.gt(mask, 0).bool()
    # mask = dec_attn_mask
    mask = mask.bool()

    output = emb

    for i, layer in enumerate(self.layer_modules):
        output, coverage = layer(output, pos_emb, self.r_w_bias, self.r_r_bias, mask)  # batch_size x len_src x d_model

    # From Google T2T:
    # if normalization is done in layer_preprocess, then it should also be done
    # on the output, since the output can grow very large, being the sum of
    # a whole stack of unnormalized layer outputs.
    output = self.postprocess_layer(output)

    all_output = output

    src_len = src.size(0)
    context = output[src_len:, :, :]

    tgt_len = tgt.size(0)
    tgt_hiddens = output[:tgt_len, :, :]

    # output_dict = {'hidden': output, 'coverage': coverage, 'context': context}
    output_dict = defaultdict(lambda: None)
    output_dict['hidden'] = tgt_hiddens
    output_dict['encoder'] = context
    output_dict['src_mask'] = mask[src_len:, :, :]

    output = tgt_hiddens

    # this step removes the padding to reduce the load for the final layer
    if target_masking is not None:
        output = output.contiguous().view(-1, output.size(-1))

        mask = target_masking
        """ We remove all positions with PAD """
        flattened_mask = mask.view(-1)

        non_pad_indices = torch.nonzero(flattened_mask).squeeze(1)

        output = output.index_select(0, non_pad_indices)

    # final layer: computing softmax
    logprobs = self.generator[0](output)
    output_dict['logprobs'] = logprobs

    # return output, None
    return output_dict
def get_subsequent_mask(seq):
    ''' For masking out the subsequent info. '''
    sz_b, len_s = seq.size()
    subsequent_mask = (1 - torch.triu(
        torch.ones((1, len_s, len_s), device=seq.device), diagonal=1)).bool()
    return subsequent_mask
def generate_square_subsequent_mask(size: int):
    """Generate a triangular (size, size) mask."""
    mask = (torch.triu(torch.ones(size, size)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float("-inf")).masked_fill(mask == 1, float(0.0))
    return mask
def _forward(self, inputs, memory_bank, src_pad_mask, tgt_pad_mask,
             layer_cache=None, step=None, future=False):
    """A naive forward pass for transformer decoder.

    # T: could be 1 in the case of stepwise decoding or tgt_len

    Args:
        inputs (FloatTensor): ``(batch_size, T, model_dim)``
        memory_bank (FloatTensor): ``(batch_size, src_len, model_dim)``
        src_pad_mask (LongTensor): ``(batch_size, 1, src_len)``
        tgt_pad_mask (LongTensor): ``(batch_size, 1, T)``
        layer_cache (dict or None): cached layer info when stepwise decode
        step (int or None): stepwise decoding counter
        future (bool): If set True, do not apply future_mask.

    Returns:
        (FloatTensor, FloatTensor):

        * output ``(batch_size, T, model_dim)``
        * attns ``(batch_size, head, T, src_len)``
    """
    dec_mask = None

    if step is None:
        tgt_len = tgt_pad_mask.size(-1)
        if not future:  # apply future_mask, result mask in (B, T, T)
            future_mask = torch.ones(
                [tgt_len, tgt_len],
                device=tgt_pad_mask.device,
                dtype=torch.uint8)
            future_mask = torch.triu(future_mask, 1).view(1, tgt_len, tgt_len)
            # future_mask = future_mask.triu_(1).view(1, tgt_len, tgt_len)
            # BoolTensor was introduced in pytorch 1.2
            try:
                future_mask = future_mask.bool()
            except AttributeError:
                pass
            dec_mask = torch.gt(tgt_pad_mask + future_mask, 0)
        else:  # only mask padding, result mask in (B, 1, T)
            dec_mask = tgt_pad_mask

    input_norm = self.layer_norm_1(inputs)

    if isinstance(self.self_attn, MultiHeadedAttention):
        query, _ = self.self_attn(input_norm, input_norm, input_norm,
                                  mask=dec_mask,
                                  layer_cache=layer_cache,
                                  attn_type="self")
    elif isinstance(self.self_attn, AverageAttention):
        query, _ = self.self_attn(input_norm, mask=dec_mask,
                                  layer_cache=layer_cache, step=step)

    query = self.drop(query) + inputs

    query_norm = self.layer_norm_2(query)
    mid, attns = self.context_attn(memory_bank, memory_bank, query_norm,
                                   mask=src_pad_mask,
                                   layer_cache=layer_cache,
                                   attn_type="context")
    output = self.feed_forward(self.drop(mid) + query)

    return output, attns
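# Sketch (added for illustration, not from the original source): combining a padding mask with a
# future mask the same way as above. A position is blocked if it is either padding or a future step.
import torch

tgt_pad_mask = torch.tensor([[[False, False, True]]])            # (B=1, 1, T=3): last token is PAD
future_mask = torch.triu(torch.ones(3, 3, dtype=torch.uint8), 1).view(1, 3, 3).bool()
dec_mask = torch.gt(tgt_pad_mask + future_mask, 0)
print(dec_mask[0])
# tensor([[False,  True,  True],
#         [False, False,  True],
#         [False, False,  True]])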
def __init__(self, B, L, device="cpu"):
    mask_shape = [B, 1, L, L]
    with torch.no_grad():
        self._mask = torch.triu(torch.ones(mask_shape, dtype=torch.bool), diagonal=1).to(device)
def batch_head_stats(attn_variables, triu_masking=False):
    # Retrieve context (shape bsz x nheads x L x dhead), mask (shape bsz x L)
    # and weights (shape bsz x nheads x L x l)
    ctx = attn_variables["context"].detach()
    in_mask = attn_variables["in_mask"]
    out_mask = attn_variables["out_mask"]
    p = attn_variables["weights"].detach()
    logp = torch.log(p)
    device = p.device
    # Results
    results = {}
    # Triu mask for self-attention
    triu_mask = torch.triu(p.new_ones((p.size(2), p.size(3))), 1).byte()
    # Reverse mask
    if in_mask is not None:
        in_mask = torch.eq(in_mask, 0.0).float()
    else:
        in_mask = torch.ones(p.size(0), p.size(3)).to(ctx.device)
    # Reverse mask
    if out_mask is not None:
        out_mask = torch.eq(out_mask, 0.0).float()
    else:
        out_mask = torch.ones(ctx.size(0), ctx.size(2)).to(ctx.device)

    def reduce_head(x):
        return (x * out_mask.unsqueeze(1)).sum(0).sum(-1).detach().cpu()

    def reduce_head_pairs(x):
        return (x * out_mask.unsqueeze(1).unsqueeze(1)).sum(0).sum(-1).detach().cpu()

    # p_mask has shape bsz x -1 x -1 x l
    p_mask = in_mask.unsqueeze(1).unsqueeze(1)
    # Entropy
    plogp = p * logp
    plogp[p == 0] = 0
    if triu_masking:
        plogp.masked_fill_(triu_mask.unsqueeze(0).unsqueeze(0), 0)
    # plogp.masked_fill_(p_mask.eq(0), 0)
    H_p = -plogp.sum(-1)
    results["entropy"] = reduce_head(H_p)
    # Cross entropy
    plogq = torch.einsum("bilk,bjlk->bijlk", [p, logp])
    plogq.masked_fill_((p == 0).unsqueeze(1), 0)
    if triu_masking:
        plogq.masked_fill_(triu_mask.unsqueeze(0).unsqueeze(0).unsqueeze(0), 0)
    H_pq = -plogq.sum(-1)
    # Avg KL (bsz x nhead x L)
    avg_KL = (H_pq - H_p.unsqueeze(2))
    results["kl"] = reduce_head_pairs(avg_KL)
    if (results["kl"] == float('inf')).any():
        print(triu_mask)
        print(p)
        print(avg_KL)
        exit()
    # avg output disagreement
    out = ctx / (torch.sqrt((ctx**2).sum(-1, keepdim=True)) + 1e-20)
    out_dis = torch.einsum("bild,bjld->bijl", [out, out]) / out.size(1)**2
    results["out_dis"] = reduce_head_pairs(out_dis)
    # avg attn disagreement
    attn_dis = torch.einsum("bilk,bjlk->bijl", [p, p]) / p.size(1)**2
    results["attn_dis"] = reduce_head_pairs(attn_dis)
    # Avg attn offset
    self_pos = torch.arange(p.size(2)).to(device).float().view(1, 1, -1)
    if triu_masking:
        masked_p = torch.where(triu_mask.unsqueeze(0).unsqueeze(0), -p.new_ones(p.size()), p)
    else:
        masked_p = p
    attn_pos = masked_p.argmax(dim=-1).float()
    attn_offset = self_pos - attn_pos
    results["attn_pos"] = reduce_head(attn_pos)
    results["attn_offset"] = reduce_head(attn_offset)
    # Avg attn distance
    attn_dist = torch.abs(attn_offset)
    results["attn_dist"] = reduce_head(attn_dist)
    # Avg squared attn offset
    results["attn_offset_sq"] = reduce_head(attn_offset**2)
    results["attn_pos_sq"] = reduce_head(attn_pos**2)
    # Denominator
    denom = out_mask.sum().detach().cpu().data
    return results, denom
def create_and_check_xlnet_model_use_mems(
    self,
    config,
    input_ids_1,
    input_ids_2,
    input_ids_q,
    perm_mask,
    input_mask,
    target_mapping,
    segment_ids,
    lm_labels,
    sequence_labels,
    is_impossible_labels,
    token_labels,
):
    model = XLNetModel(config=config)
    model.to(torch_device)
    model.eval()

    # first forward pass
    causal_mask = torch.ones(
        input_ids_1.shape[0],
        input_ids_1.shape[1],
        input_ids_1.shape[1],
        dtype=torch.float,
        device=torch_device,
    )
    causal_mask = torch.triu(causal_mask, diagonal=0)
    outputs_cache = model(input_ids_1, use_mems=True, perm_mask=causal_mask)
    outputs_no_cache = model(input_ids_1, use_mems=False, perm_mask=causal_mask)
    outputs_conf = model(input_ids_1)

    self.parent.assertTrue(len(outputs_cache) == len(outputs_conf))
    self.parent.assertTrue(len(outputs_cache) == len(outputs_no_cache) + 1)

    output, mems = outputs_cache.to_tuple()

    # create hypothetical next token and extend to next_input_ids
    next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)

    # append to next input_ids and token_type_ids
    next_input_ids = torch.cat([input_ids_1, next_tokens], dim=-1)

    # causal mask
    causal_mask = torch.ones(
        input_ids_1.shape[0],
        input_ids_1.shape[1] + 1,
        input_ids_1.shape[1] + 1,
        dtype=torch.float,
        device=torch_device,
    )
    causal_mask = torch.triu(causal_mask, diagonal=0)
    single_mask = torch.ones(input_ids_1.shape[0], 1, 1, dtype=torch.float, device=torch_device)

    # second forward pass
    output_from_no_past = model(next_input_ids, perm_mask=causal_mask)["last_hidden_state"]
    output_from_past = model(next_tokens, mems=mems, perm_mask=single_mask)["last_hidden_state"]

    # select random slice
    random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
    output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach()
    output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()

    # test that outputs are equal for slice
    self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
def forward(
    self,
    input_ids=None,
    past=None,
    attention_mask=None,
    token_type_ids=None,
    position_ids=None,
    head_mask=None,
    inputs_embeds=None,
    use_cache=True,
):
    r"""
    Return:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration
        (:class:`~transformers.CTRLConfig`) and inputs:
        last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the last layer of the model.
        past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape
        :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
            Contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `past` input) to speed up sequential decoding.
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of
            each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`.
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
            Attention weights after the attention softmax, used to compute the weighted average in the
            self-attention heads.

    Examples::

        from transformers import CTRLTokenizer, CTRLModel
        import torch

        tokenizer = CTRLTokenizer.from_pretrained('ctrl')
        model = CTRLModel.from_pretrained('ctrl')

        input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids)

        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
    """
    if input_ids is not None and inputs_embeds is not None:
        raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
    elif input_ids is not None:
        input_shape = input_ids.size()
        input_ids = input_ids.view(-1, input_shape[-1])
        batch_size = input_ids.shape[0]
    elif inputs_embeds is not None:
        input_shape = inputs_embeds.size()[:-1]
        batch_size = inputs_embeds.shape[0]
    else:
        raise ValueError("You have to specify either input_ids or inputs_embeds")

    if past is None:
        past_length = 0
        past = [None] * len(self.h)
    else:
        past_length = past[0][0].size(-2)
    if position_ids is None:
        device = input_ids.device if input_ids is not None else inputs_embeds.device
        position_ids = torch.arange(past_length, input_shape[-1] + past_length,
                                    dtype=torch.long, device=device)
        position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])

    # Attention mask.
    if attention_mask is not None:
        assert batch_size > 0, "batch_size has to be defined and > 0"
        attention_mask = attention_mask.view(batch_size, -1)
        # We create a 3D attention mask from a 2D tensor mask.
        # Sizes are [batch_size, 1, 1, to_seq_length]
        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
        # This attention mask is simpler than the triangular masking of causal attention
        # used in OpenAI GPT; we just need to prepare the broadcast dimension here.
        attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)

        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and -10000.0 for masked positions.
        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.
        attention_mask = attention_mask.to(dtype=self.dtype)  # fp16 compatibility
        attention_mask = (1.0 - attention_mask) * -10000.0

    # Prepare head mask if needed
    head_mask = self.get_head_mask(head_mask, self.config.n_layer)

    if token_type_ids is not None:
        token_type_ids = token_type_ids.view(-1, input_shape[-1])
        token_type_embeds = self.w(token_type_ids)
        token_type_embeds *= np.sqrt(self.d_model_size)
    else:
        token_type_embeds = 0
    position_ids = position_ids.view(-1, input_shape[-1])

    if inputs_embeds is None:
        inputs_embeds = self.w(input_ids)
    # inputs_embeds = embedded.unsqueeze(0) if len(input_ids.shape) < 2 else embedded
    seq_len = input_shape[-1]
    mask = torch.triu(torch.ones(seq_len + past_length, seq_len + past_length), 1).to(inputs_embeds.device)

    inputs_embeds *= np.sqrt(self.d_model_size)

    pos_embeds = self.pos_encoding[position_ids, :].to(inputs_embeds.device)

    hidden_states = inputs_embeds + pos_embeds + token_type_embeds

    hidden_states = self.dropout(hidden_states)

    output_shape = input_shape + (inputs_embeds.size(-1),)
    presents = ()
    all_hidden_states = ()
    all_attentions = []
    for i, (h, layer_past) in enumerate(zip(self.h, past)):
        if self.output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),)
        outputs = h(
            hidden_states,
            mask,
            layer_past=layer_past,
            attention_mask=attention_mask,
            head_mask=head_mask[i],
            use_cache=use_cache,
        )
        hidden_states, present = outputs[:2]
        if use_cache is True:
            presents = presents + (present,)

        if self.output_attentions:
            all_attentions.append(outputs[2])

    hidden_states = self.layernorm(hidden_states)
    hidden_states = hidden_states.view(*output_shape)
    if self.output_hidden_states:
        all_hidden_states = all_hidden_states + (hidden_states,)

    outputs = (hidden_states,)
    if use_cache is True:
        outputs = outputs + (presents,)
    if self.output_hidden_states:
        outputs = outputs + (all_hidden_states,)
    if self.output_attentions:
        # let the number of heads free (-1) so we can extract attention even after head pruning
        attention_output_shape = input_shape[:-1] + (-1,) + all_attentions[0].shape[-2:]
        all_attentions = tuple(t.view(*attention_output_shape) for t in all_attentions)
        outputs = outputs + (all_attentions,)
    return outputs
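# Sketch (added for illustration, not from the original source): turning a 1/0 "keep" mask into an
# additive bias, as in the comment above. Kept positions get 0.0 added to their attention scores;
# masked positions get -10000.0, which effectively removes them after the softmax.
import torch

keep = torch.tensor([[1.0, 1.0, 0.0]])                    # (batch, seq): 1 = attend, 0 = padding
bias = (1.0 - keep.unsqueeze(1).unsqueeze(2)) * -10000.0
print(bias.shape)      # torch.Size([1, 1, 1, 3]); broadcasts over heads and query positions
print(bias.flatten())  # values: [-0., -0., -10000.]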
def conditional_corrcoeff(
    density: Any,
    limits: Tensor,
    condition: Tensor,
    subset: Optional[List[int]] = None,
    resolution: int = 50,
    warn_about_deprecation: bool = True,
) -> Tensor:
    r"""Returns the conditional correlation matrix of a distribution.

    To compute the conditional distribution, we condition all but two parameters to
    values from `condition`, and then compute the Pearson correlation coefficient
    $\rho$ between the remaining two parameters under the distribution `density`. We
    do so for any pair of parameters specified in `subset`, thus creating a matrix
    containing conditional correlations between any pair of parameters.

    If `condition` is a batch of conditions, this function computes the conditional
    correlation matrix for each one of them and returns the mean.

    Args:
        density: Probability density function with `.log_prob()` function.
        limits: Limits within which to evaluate the `density`.
        condition: Values to condition the `density` on. If a batch of conditions is
            passed, we compute the conditional correlation matrix for each of them and
            return the average conditional correlation matrix.
        subset: Evaluate the conditional distribution only on a subset of dimensions.
            If `None` this function uses all dimensions.
        resolution: Number of grid points on which the conditional distribution is
            evaluated. A higher value increases the accuracy of the estimated
            correlation but also increases the computational cost.
        warn_about_deprecation: With sbi v0.15.0, we deprecated the import of this
            function from `sbi.utils`. Instead, it should be imported from
            `sbi.analysis`.

    Returns:
        Average conditional correlation matrix of shape either `(num_dim, num_dim)` or
        `(len(subset), len(subset))` if `subset` was specified.
    """
    if warn_about_deprecation:
        warn(
            "Importing `conditional_corrcoeff` from `sbi.utils` is deprecated since "
            "sbi v0.15.0. Instead, use "
            "`from sbi.analysis import conditional_corrcoeff`."
        )

    condition = ensure_theta_batched(condition)

    if subset is None:
        subset = range(condition.shape[1])

    correlation_matrices = []
    for cond in condition:
        correlation_matrices.append(
            torch.stack(
                [
                    _compute_corrcoeff(
                        eval_conditional_density(
                            density,
                            cond,
                            limits,
                            dim1=dim1,
                            dim2=dim2,
                            resolution=resolution,
                            warn_about_deprecation=False,
                        ),
                        limits[[dim1, dim2]],
                    )
                    for dim1 in subset
                    for dim2 in subset
                    if dim1 < dim2
                ]
            )
        )

    average_correlations = torch.mean(torch.stack(correlation_matrices), dim=0)

    # `average_correlations` is still a vector containing the upper triangular entries.
    # Below, assemble them into a matrix:
    av_correlation_matrix = torch.zeros((len(subset), len(subset)))
    triu_indices = torch.triu_indices(row=len(subset), col=len(subset), offset=1)
    av_correlation_matrix[triu_indices[0], triu_indices[1]] = average_correlations

    # Make the matrix symmetric by copying the upper triangle to the lower triangle.
    av_correlation_matrix = torch.triu(av_correlation_matrix) + torch.tril(
        av_correlation_matrix.T
    )

    av_correlation_matrix.fill_diagonal_(1.0)
    return av_correlation_matrix
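# Sketch (added for illustration, not from the original source): the triu_indices / triu / tril
# pattern used above, shown on a toy vector of three pairwise correlations for 3 parameters.
import torch

vals = torch.tensor([0.1, 0.2, 0.3])          # correlations for pairs (0,1), (0,2), (1,2)
m = torch.zeros(3, 3)
idx = torch.triu_indices(row=3, col=3, offset=1)
m[idx[0], idx[1]] = vals                      # fill the strict upper triangle
m = torch.triu(m) + torch.tril(m.T)           # mirror it to the lower triangle
m.fill_diagonal_(1.0)
print(m)
# tensor([[1.0000, 0.1000, 0.2000],
#         [0.1000, 1.0000, 0.3000],
#         [0.2000, 0.3000, 1.0000]])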
def masking():
    sz = 4
    mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask
def get_subsequent_mask(seq):
    sz_b, len_s = seq.size()
    subsequent_mask = torch.triu(torch.ones((1, len_s, len_s)), diagonal=1).bool()
    return subsequent_mask
def get_mask(size):
    weights = torch.triu(torch.ones((size, size), dtype=torch.bool), 1)
    return weights
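# Sketch (added for illustration, not from the original source): a boolean upper-triangular mask
# like the one returned above is typically consumed with masked_fill on the raw attention scores,
# where True marks a future position to be hidden.
import torch

size = 4
scores = torch.randn(size, size)                                   # raw attention scores
future = torch.triu(torch.ones((size, size), dtype=torch.bool), 1)
scores = scores.masked_fill(future, float('-inf'))
weights = torch.softmax(scores, dim=-1)
print(weights[0])   # first query position can only attend to itself: tensor([1., 0., 0., 0.])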
def forward(
    self,
    input_ids=None,
    mems=None,
    head_mask=None,
    inputs_embeds=None,
    output_attentions=None,
    output_hidden_states=None,
    return_tuple=None,
):
    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
    output_hidden_states = (
        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
    )
    return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple

    # the original code for Transformer-XL used shapes [len, bsz] but we want a unified
    # interface in the library, so we transpose here from shape [bsz, len] to shape [len, bsz]
    if input_ids is not None and inputs_embeds is not None:
        raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
    elif input_ids is not None:
        input_ids = input_ids.transpose(0, 1).contiguous()
        qlen, bsz = input_ids.size()
    elif inputs_embeds is not None:
        inputs_embeds = inputs_embeds.transpose(0, 1).contiguous()
        qlen, bsz = inputs_embeds.shape[0], inputs_embeds.shape[1]
    else:
        raise ValueError("You have to specify either input_ids or inputs_embeds")

    if mems is None:
        mems = self.init_mems(bsz)

    # Prepare head mask if needed
    # 1.0 in head_mask indicates we keep the head
    # attention_probs has shape bsz x n_heads x N x N
    # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] (a head_mask for each layer)
    # and head_mask is converted to shape [num_hidden_layers x qlen x klen x bsz x n_head]
    if head_mask is not None:
        if head_mask.dim() == 1:
            head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(0).unsqueeze(0)
            head_mask = head_mask.expand(self.n_layer, -1, -1, -1, -1)
        elif head_mask.dim() == 2:
            head_mask = head_mask.unsqueeze(1).unsqueeze(1).unsqueeze(1)
        head_mask = head_mask.to(dtype=next(self.parameters()).dtype)  # switch to float if needed + fp16 compatibility
    else:
        head_mask = [None] * self.n_layer

    if inputs_embeds is not None:
        word_emb = inputs_embeds
    else:
        word_emb = self.word_emb(input_ids)

    mlen = mems[0].size(0) if mems is not None else 0
    klen = mlen + qlen
    if self.same_length:
        all_ones = word_emb.new_ones((qlen, klen), dtype=torch.uint8)
        mask_len = klen - self.mem_len
        if mask_len > 0:
            mask_shift_len = qlen - mask_len
        else:
            mask_shift_len = qlen
        dec_attn_mask = (torch.triu(all_ones, 1 + mlen)
                         + torch.tril(all_ones, -mask_shift_len))[:, :, None]  # -1
    else:
        dec_attn_mask = torch.triu(
            word_emb.new_ones((qlen, klen), dtype=torch.uint8), diagonal=1 + mlen)[:, :, None]

    hids = []
    attentions = [] if output_attentions else None
    if self.attn_type == 0:  # default
        pos_seq = torch.arange(klen - 1, -1, -1.0, device=word_emb.device, dtype=word_emb.dtype)
        if self.clamp_len > 0:
            pos_seq.clamp_(max=self.clamp_len)
        pos_emb = self.pos_emb(pos_seq)

        core_out = self.drop(word_emb)
        pos_emb = self.drop(pos_emb)

        for i, layer in enumerate(self.layers):
            hids.append(core_out)
            mems_i = None if mems is None else mems[i]
            layer_outputs = layer(
                core_out,
                pos_emb,
                dec_attn_mask=dec_attn_mask,
                mems=mems_i,
                head_mask=head_mask[i],
                output_attentions=output_attentions,
            )
            core_out = layer_outputs[0]
            if output_attentions:
                attentions.append(layer_outputs[1])
    else:  # learnable embeddings and absolute embeddings
        # Removed to avoid maintaining dead code - not used in our pretrained checkpoint
        raise NotImplementedError

    core_out = self.drop(core_out)

    new_mems = self._update_mems(hids, mems, mlen, qlen)

    if output_hidden_states:
        # Add last layer and transpose to library standard shape [bsz, len, hidden_dim]
        hids.append(core_out)
        hids = tuple(t.transpose(0, 1).contiguous() for t in hids)
    else:
        hids = None
    if output_attentions:
        # Transpose to library standard shape [bsz, n_heads, query_seq_len, key_seq_len]
        attentions = tuple(t.permute(2, 3, 0, 1).contiguous() for t in attentions)
    # We transpose back here to shape [bsz, len, hidden_dim]
    core_out = core_out.transpose(0, 1).contiguous()

    if return_tuple:
        return tuple(v for v in [core_out, new_mems, hids, attentions] if v is not None)

    return TransfoXLModelOutput(
        last_hidden_state=core_out,
        mems=new_mems,
        hidden_states=hids,
        attentions=attentions,
    )
def _assemble_W(self):
    """Assemble W from its pieces (P, L, U, S)."""
    L = torch.tril(self.L, diagonal=-1) + torch.diag(torch.ones(self.dim))
    U = torch.triu(self.U, diagonal=1)
    W = self.P @ L @ (U + torch.diag(self.S))
    return W
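# Sketch (added for illustration, not from the original source): the P·L·U parameterization used
# in _assemble_W (as in Glow-style invertible 1x1 convolutions) makes the log-determinant cheap:
# |det W| = prod|S|, because P is a permutation, L has a unit diagonal, and U is strictly upper.
import torch

dim = 4
P = torch.eye(dim)[torch.randperm(dim)]                           # permutation matrix
L = torch.tril(torch.randn(dim, dim), diagonal=-1) + torch.eye(dim)
S = torch.randn(dim)
U = torch.triu(torch.randn(dim, dim), diagonal=1)
W = P @ L @ (U + torch.diag(S))

_, logabsdet = torch.slogdet(W)
assert torch.allclose(logabsdet, S.abs().log().sum(), atol=1e-4)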
def get_att(head, seq_len):
    att = torch.triu(torch.ones(seq_len, seq_len).byte(), diagonal=1)
    for _ in range(2):
        att = torch.unsqueeze(att, dim=0)
    return att.repeat(1, head, 1, 1)
def _generate_square_subsequent_mask(self, sz):
    mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(
        mask == 1, float(0.0))
    return mask
def recover_adj_lower(self, l):
    # NOTE: Assumes 1 per minibatch
    adj = torch.zeros(self.max_num_nodes, self.max_num_nodes)
    adj[torch.triu(torch.ones(self.max_num_nodes, self.max_num_nodes)) == 1] = l
    return adj
def generate_square_subsequent_mask(sz):
    return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)
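# Sketch (added for illustration, not from the original source): this one-liner produces the same
# additive mask as the masked_fill variants above (0.0 on and below the diagonal, -inf above it).
import torch

sz = 4
a = torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)
b = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
b = b.float().masked_fill(b == 0, float('-inf')).masked_fill(b == 1, float(0.0))
assert torch.equal(a, b)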
def generate_square_subsequent_mask(self, sz):
    mask = torch.triu(torch.ones(sz, sz), 1)
    mask = mask.masked_fill(mask == 1, float('-inf'))
    return mask
def convert_examples_to_features(examples, seq_length, tokenizer):
    """Loads a data file into a list of `InputBatch`s."""

    features = []
    attention_mask = torch.tril(
        torch.ones(seq_length, seq_length, dtype=torch.long), -1) + torch.triu(
        torch.ones(seq_length, seq_length, dtype=torch.long), 1)
    for (ex_index, example) in enumerate(examples):
        tokens_a = example.text_a
        labels = example.labels

        tokens_b = None
        if example.text_b:
            tokens_b = tokenizer.tokenize(example.text_b)

        if tokens_b:
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > seq_length - 2:
                tokens_a = tokens_a[0:(seq_length - 2)]
            if len(labels) > seq_length - 2:
                labels = labels[0:(seq_length - 2)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0     0   0   0  0     0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens = []
        input_type_ids = []
        targets = []
        tokens.append("[CLS]")
        targets.append("[PAD]")
        input_type_ids.append(0)
        for i, token in enumerate(tokens_a):
            tokens.append(token)
            input_type_ids.append(0)
            targets.append(labels[i])
        tokens.append("[SEP]")
        targets.append("[PAD]")
        input_type_ids.append(0)

        if tokens_b:
            for token in tokens_b:
                tokens.append(token)
                input_type_ids.append(1)
            tokens.append("[SEP]")
            input_type_ids.append(1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        target_ids = tokenizer.convert_tokens_to_ids(targets)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        while len(input_ids) < seq_length:
            input_ids.append(0)
            input_mask.append(0)
            target_ids.append(0)
            input_type_ids.append(0)

        assert len(input_ids) == seq_length
        assert len(input_mask) == seq_length
        assert len(target_ids) == seq_length
        assert len(input_type_ids) == seq_length
        # input_mask = (torch.tensor(input_mask, dtype=torch.long).unsqueeze(0) * attention_mask).tolist()
        candidates = example.candidates
        if ex_index < 0:
            logger.info("*** Example ***")
            logger.info("unique_id: %s" % (example.unique_id))
            logger.info("tokens: %s" % " ".join([str(x) for x in tokens]))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("target_ids: %s" % " ".join([str(x) for x in target_ids]))
            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            logger.info("input_type_ids: %s" % " ".join([str(x) for x in input_type_ids]))
        if False:
            logger.info("*** Example ***")
            logger.info("unique_id: %s" % (example.unique_id))
            logger.info("tokens: %s" % " ".join([str(x) for x in labels]))

        features.append(
            InputFeatures(
                unique_id=example.unique_id,
                target_ids=target_ids,
                input_ids=input_ids,
                input_mask=input_mask,
                input_type_ids=input_type_ids,
                candidates=candidates,
            ))
    return features
def masking(self, weight):
    # pdb.set_trace()
    mask = torch.triu(torch.ones_like(weight)).transpose(2, 3)
    weight[mask == 0] = float('-inf')
    return weight
def forward(  # type: ignore
    self,
    token_ids: torch.LongTensor,
    type_ids: torch.LongTensor,
    offsets: torch.LongTensor,
    wordpiece_mask: torch.BoolTensor,
    pos_tags: torch.LongTensor,
    word_mask: torch.BoolTensor,
    subtree_spans: torch.LongTensor = None,
):
    """
    todo implement docstring
    Args:
        token_ids: [batch_size, num_word_pieces]
        type_ids: [batch_size, num_word_pieces]
        offsets: [batch_size, num_words, 2]
        wordpiece_mask: [batch_size, num_word_pieces]
        pos_tags: [batch_size, num_words]
        word_mask: [batch_size, num_words]
        subtree_spans: [batch_size, num_words, 2]
    Returns:
        span_start_logits: [batch_size, num_words, num_words]
        span_end_logits: [batch_size, num_words, num_words]
    """
    # [bsz, seq_len, hidden]
    embedded_text_input = self.get_word_embedding(
        token_ids=token_ids,
        offsets=offsets,
        wordpiece_mask=wordpiece_mask,
        type_ids=type_ids,
    )
    if self.pos_embedding is not None:
        embedded_pos_tags = self.pos_embedding(pos_tags)
        embedded_text_input = torch.cat([embedded_text_input, embedded_pos_tags], -1)
        if self.fuse_layer is not None:
            embedded_text_input = self.fuse_layer(embedded_text_input)
    # todo compare normal dropout with InputVariationalDropout
    embedded_text_input = self._dropout(embedded_text_input)

    if self.additional_encoder is not None:
        if self.config.additional_layer_type == "transformer":
            extended_attention_mask = self.bert.get_extended_attention_mask(
                word_mask, word_mask.size(), word_mask.device)
            encoded_text = self.additional_encoder(
                hidden_states=embedded_text_input,
                attention_mask=extended_attention_mask)[0]
        else:
            encoded_text = self.additional_encoder(
                inputs=embedded_text_input, mask=word_mask)
    else:
        encoded_text = embedded_text_input

    batch_size, seq_len, encoding_dim = encoded_text.size()

    # [bsz, seq_len, dim]
    subtree_start_representation = self._dropout(self.subtree_start_feedforward(encoded_text))
    subtree_end_representation = self._dropout(self.subtree_end_feedforward(encoded_text))
    # [bsz, seq_len, seq_len]
    span_start_scores = self.subtree_start_attention(
        subtree_start_representation, subtree_start_representation)
    span_end_scores = self.subtree_end_attention(
        subtree_end_representation, subtree_end_representation)

    # the start of a word's span should be less than or equal to the word position
    start_mask = word_mask.unsqueeze(-1) & (~torch.triu(span_start_scores.bool(), 1))
    # the end of a word's span should be greater than or equal to the word position
    end_mask = word_mask.unsqueeze(-1) & torch.triu(span_end_scores.bool())
    minus_inf = -1e8
    span_start_scores = span_start_scores + (~start_mask).float() * minus_inf
    span_end_scores = span_end_scores + (~end_mask).float() * minus_inf

    output = (F.log_softmax(span_start_scores, dim=-1),
              F.log_softmax(span_end_scores, dim=-1))

    if subtree_spans is not None:
        start_loss = F.cross_entropy(
            span_start_scores.view(batch_size * seq_len, -1),
            subtree_spans[:, :, 0].view(-1))
        end_loss = F.cross_entropy(
            span_end_scores.view(batch_size * seq_len, -1),
            subtree_spans[:, :, 1].view(-1))
        span_loss = start_loss + end_loss
        output = output + (span_loss,)

    return output
def half_and_half(a, b):
    a = torch.stack([torch.triu(x) for x in a], 0)
    b = torch.stack([torch.tril(x, diagonal=-1) for x in b], 0)
    return a + b
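# Sketch (added for illustration, not from the original source; assumes half_and_half above is in
# scope): it keeps the upper triangle (including the diagonal) of each matrix in `a` and the strict
# lower triangle of the matching matrix in `b`.
import torch

a = torch.ones(1, 3, 3)
b = torch.full((1, 3, 3), 2.0)
print(half_and_half(a, b)[0])
# tensor([[1., 1., 1.],
#         [2., 1., 1.],
#         [2., 2., 1.]])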
def sequence_mask(seq):
    batch_size, seq_len = seq.size()
    mask = torch.triu(torch.ones((seq_len, seq_len), dtype=torch.uint8), diagonal=1)
    mask = mask.unsqueeze(0).expand(batch_size, -1, -1)
    return mask
print(torch.rand(5))  # tensor([ 0.4177, 0.4903, 0.5730, 0.1205, 0.1452]); it is a vector
print(torch.randint(10, (2, 2)))  # tensor([[3., 3.], [8., 2.]]); still floats
print(torch.randint(10, (2, 2), dtype=torch.long))  # tensor([[8, 2], [8, 5]])

# arange & linspace
x = torch.arange(1, 8); print(x)  # tensor([1, 2, 3, 4, 5, 6, 7])
x = torch.linspace(-1, 1, 10); print(x, x.shape)  # torch.Size([10])
x = torch.full((2, 3), 0.1); print(x)  # tensor([[ 0.1000, 0.1000, 0.1000], [ 0.1000, 0.1000, 0.1000]])

x = torch.ones(2, 3); print(x)
print(x.zero_())  # sets every element to 0; this modifies the original tensor in place
x = torch.zeros(2, 3); print(x)
print(x.random_(2))  # discrete uniform distribution over [from, to - 1]; here uniform over {0, 1}
x = torch.empty(2, 3); print(x)  # returns a tensor filled with uninitialized data

## triu(input, diagonal=0, out=None): zeroes out the lower triangle
a = torch.randn(3, 3); print(a)
print(torch.triu(a))  # diagonal=0: all elements on and above the main diagonal are retained; the diagonal is kept
print(torch.triu(a, diagonal=1))  # a positive value excludes just as many diagonals above the main diagonal; the diagonal is zeroed as well
print(torch.triu(a, diagonal=-1))  # a negative value includes just as many diagonals below the main diagonal

## squeeze
x = torch.unsqueeze(torch.linspace(-1, 1, 10), dim=0); print(x.shape, x)  # torch.Size([1, 10])
x = torch.unsqueeze(torch.linspace(-1, 1, 10), dim=1); print(x.shape, x)  # torch.Size([10, 1]); new tensor with a dimension of size one inserted at the specified position
x = torch.unsqueeze(torch.linspace(-1, 1, 10), dim=-1); print(x.shape, x)  # torch.Size([10, 1])
x = torch.unsqueeze(torch.rand(2, 4), dim=1); print(x.shape)  # torch.Size([2, 1, 4])
x = torch.zeros(2, 1, 2, 1, 2); print(x.shape)  # torch.Size([2, 1, 2, 1, 2])
y = torch.squeeze(x); print(y.shape)  # torch.Size([2, 2, 2]); all dimensions of size 1 are removed
y = torch.squeeze(x, 0); print(y.shape)  # torch.Size([2, 1, 2, 1, 2]); only operates on dim 0
y = torch.squeeze(x, 1); print(y.shape)  # torch.Size([2, 2, 1, 2]); only operates on dim 1

## Operations; the difference from numpy is that torch expects every variable to be a tensor
# Sum and index-based sum: torch.sum(), torch.Tensor.index_add_()
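# Sketch (added for illustration, not from the original notes): torch.sum and index_add_ as
# referenced in the last line above.
import torch

x = torch.arange(6.).reshape(2, 3)
print(torch.sum(x))          # tensor(15.)
print(torch.sum(x, dim=0))   # tensor([3., 5., 7.])

t = torch.zeros(3)
t.index_add_(0, torch.tensor([0, 2]), torch.tensor([1.0, 5.0]))  # add values at indices 0 and 2
print(t)                     # tensor([1., 0., 5.])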