Example no. 1
 def buffered_mask(self, tensor):
     dim = tensor.size(-1)
     if self._mask is None:
         self._mask = torch.triu(utils.fill_with_neg_inf(tensor.new(dim, dim)), 1)
     if self._mask.size(0) < dim:
         self._mask = torch.triu(utils.fill_with_neg_inf(self._mask.resize_(dim, dim)), 1)
     return self._mask[:dim, :dim]
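For context, a minimal self-contained sketch (not taken from the repository above) of how such a cached upper-triangular -inf mask is typically added to raw attention scores before the softmax:

import torch
import torch.nn.functional as F

def causal_attention(q, k, v):
    # q, k, v: (seq_len, head_dim); each position may attend only to itself and the past
    scores = q @ k.t() / k.size(-1) ** 0.5
    neg_inf_mask = torch.triu(torch.full_like(scores, float('-inf')), diagonal=1)
    weights = F.softmax(scores + neg_inf_mask, dim=-1)
    return weights @ v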
Example no. 2
    def forward(self, input_features, adj):
        #x = self.conv1(input_features, adj)
        #x = self.bn1(x)
        #x = self.act(x)
        #x = self.conv2(x, adj)
        #x = self.bn2(x)

        # pool over all nodes 
        #graph_h = self.pool_graph(x)
        graph_h = input_features.view(-1, self.max_num_nodes * self.max_num_nodes)
        # vae
        h_decode, z_mu, z_lsgms = self.vae(graph_h)
        out = F.sigmoid(h_decode)
        out_tensor = out.cpu().data
        recon_adj_lower = self.recover_adj_lower(out_tensor)
        recon_adj_tensor = self.recover_full_adj_from_lower(recon_adj_lower)

        # set matching features to be the node degree
        out_features = torch.sum(recon_adj_tensor, 1)

        adj_data = adj.cpu().data[0]
        adj_features = torch.sum(adj_data, 1)

        S = self.edge_similarity_matrix(adj_data, recon_adj_tensor, adj_features, out_features,
                self.deg_feature_similarity)

        # initialization strategies
        init_corr = 1 / self.max_num_nodes
        init_assignment = torch.ones(self.max_num_nodes, self.max_num_nodes) * init_corr
        #init_assignment = torch.FloatTensor(4, 4)
        #init.uniform(init_assignment)
        assignment = self.mpm(init_assignment, S)
        #print('Assignment: ', assignment)

        # matching
        # use negative of the assignment score since the alg finds min cost flow
        row_ind, col_ind = scipy.optimize.linear_sum_assignment(-assignment.numpy())
        print('row: ', row_ind)
        print('col: ', col_ind)
        # order row index according to col index
        #adj_permuted = self.permute_adj(adj_data, row_ind, col_ind)
        adj_permuted = adj_data
        adj_vectorized = adj_permuted[torch.triu(torch.ones(self.max_num_nodes,self.max_num_nodes) )== 1].squeeze_()
        adj_vectorized_var = Variable(adj_vectorized).cuda()

        #print(adj)
        #print('permuted: ', adj_permuted)
        #print('recon: ', recon_adj_tensor)
        adj_recon_loss = self.adj_recon_loss(adj_vectorized_var, out[0])
        print('recon: ', adj_recon_loss)
        print(adj_vectorized_var)
        print(out[0])

        loss_kl = -0.5 * torch.sum(1 + z_lsgms - z_mu.pow(2) - z_lsgms.exp())
        loss_kl /= self.max_num_nodes * self.max_num_nodes # normalize
        print('kl: ', loss_kl)

        loss = adj_recon_loss + loss_kl

        return loss
def get_subsequent_mask(seq):
	"""
	For preventing lookahead
	"""
	batch_size,seq_len = seq.size()
	subsequent_mask = torch.triu(torch.ones((seq_len,seq_len),device=seq.device,dtype=torch.uint8),diagonal=1)
	subsequent_mask = subsequent_mask.unsqueeze(0).expand(batch_size,-1,-1) # size [batch_size x seq_len x seq_len]
	return subsequent_mask
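For a toy input of shape (1, 3), the mask returned above is (1 marks a blocked future position):

# tensor([[[0, 1, 1],
#          [0, 0, 1],
#          [0, 0, 0]]], dtype=torch.uint8)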
Example no. 4
 def __call__(self, y_pred, y_true=None):
     """
     y_pred should be two projections
     """
     covar_mat = th.abs(th_matrixcorr(y_pred[0].data, y_pred[1].data))
     upper_sum = th.sum(th.triu(covar_mat,1))
     lower_sum = th.sum(th.tril(covar_mat,-1))
     self.anticorr_sum += upper_sum
     self.anticorr_sum += lower_sum
     self.total_count += covar_mat.size(0)*(covar_mat.size(1) - 1)
     return self.anticorr_sum / self.total_count
Example no. 5
def btriunpack(LU_data, LU_pivots, unpack_data=True, unpack_pivots=True):
    r"""Unpacks the data and pivots from a batched LU factorization (btrifact) of a tensor.

    Returns a tuple indexed by:
      0: The pivots.
      1: The L tensor.
      2: The U tensor.

    Arguments:
        LU_data (Tensor): the packed LU factorization data
        LU_pivots (Tensor): the packed LU factorization pivots
        unpack_data (bool): flag indicating if the data should be unpacked
        unpack_pivots (bool): flag indicating if the pivots should be unpacked

    Example::

        >>> A = torch.randn(2, 3, 3)
        >>> A_LU, pivots = A.btrifact()
        >>> P, A_L, A_U = torch.btriunpack(A_LU, pivots)
        >>>
        >>> # test that (P, A_L, A_U) gives LU factorization
        >>> A_ = torch.bmm(P, torch.bmm(A_L, A_U))
        >>> assert torch.equal(A_, A) == True  # can recover A
    """

    nBatch, sz, _ = LU_data.size()

    if unpack_data:
        I_U = torch.triu(torch.ones(sz, sz)).type_as(LU_data).byte().unsqueeze(0).expand(nBatch, sz, sz)
        I_L = 1 - I_U
        L = LU_data.new(LU_data.size()).zero_()
        U = LU_data.new(LU_data.size()).zero_()
        I_diag = torch.eye(sz).type_as(LU_data).byte().unsqueeze(0).expand(nBatch, sz, sz)
        L[I_diag] = 1.0
        L[I_L] = LU_data[I_L]
        U[I_U] = LU_data[I_U]
    else:
        L = U = None

    if unpack_pivots:
        P = torch.eye(sz).type_as(LU_data).unsqueeze(0).repeat(nBatch, 1, 1)
        for i in range(nBatch):
            for j in range(sz):
                k = LU_pivots[i, j] - 1
                t = P[i, :, j].clone()
                P[i, :, j] = P[i, :, k]
                P[i, :, k] = t
    else:
        P = None

    return P, L, U
    def forward(
        self, query, key, value, mask_future_timesteps=False,
        key_padding_mask=None, use_scalar_bias=False,
    ):
        """Input shape: Time x Batch x Channel
        Self-attention can be implemented by passing in the same arguments for
        query, key and value. Future timesteps can be masked with the
        `mask_future_timesteps` argument. Padding elements can be excluded from
        the key by passing a binary ByteTensor (`key_padding_mask`) with shape:
        batch x src_len, where padding elements are indicated by 1s.
        """
        src_len, bsz, out_channels = key.size()
        tgt_len = query.size(0)
        assert list(query.size()) == [tgt_len, bsz, out_channels]
        assert key.size() == value.size()

        if key_padding_mask is not None:
            assert key_padding_mask.size(0) == bsz
            assert key_padding_mask.size(1) == src_len

        if self.downsample:
            size = bsz
        else:
            size = bsz * self.num_heads

        k = key
        v = value
        q = query
        if self.project_input:
            q = self.in_proj_q(q)
            k = self.in_proj_k(k)
            v = self.in_proj_v(v)
            src_len = k.size()[0]
        q *= self.scaling

        if not self.downsample:
            q = q.view(tgt_len, size, self.head_dim)
            k = k.view(src_len, size, self.head_dim)
            v = v.view(src_len, size, self.head_dim)

        q = q.transpose(0, 1)
        k = k.transpose(0, 1)
        v = v.transpose(0, 1)

        attn_weights = torch.bmm(q, k.transpose(1, 2))
        if mask_future_timesteps:
            assert query.size() == key.size(), \
                'mask_future_timesteps only applies to self-attention'
            attn_weights *= torch.tril(
                attn_weights.data.new([1]).expand(tgt_len, tgt_len).clone(),
                diagonal=-1,
            )[:, ::self.head_index + 1 if self.downsample else 1].unsqueeze(0)
            attn_weights += torch.triu(
                attn_weights.data.new([-math.inf]).expand(tgt_len, tgt_len).clone(),
                diagonal=0
            )[:, ::self.head_index + 1 if self.downsample else 1].unsqueeze(0)
        tgt_size = tgt_len
        if use_scalar_bias:
            attn_weights = scalar_bias(attn_weights, 2)
            v = scalar_bias(v, 1)
            tgt_size += 1

        if key_padding_mask is not None:
            # don't attend to padding symbols
            if key_padding_mask.max() > 0:
                if self.downsample:
                    attn_weights = attn_weights.view(bsz, 1, tgt_len, src_len)
                else:
                    attn_weights = attn_weights.view(size, self.num_heads, tgt_len, src_len)
                attn_weights = attn_weights.masked_fill(
                    key_padding_mask.unsqueeze(1).unsqueeze(2),
                    -math.inf,
                )
                attn_weights = attn_weights.view(size, tgt_len, src_len)
        attn_weights = F.softmax(attn_weights, dim=-1)
        attn_weights = F.dropout(attn_weights, p=self.dropout, training=self.training)

        attn = torch.bmm(attn_weights, v)
        if self.downsample:
            attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, self.head_dim)
        else:
            attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, self.embed_dim)

        attn = self.out_proj(attn)

        return attn, attn_weights
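The future-timestep branch above keeps strictly-past positions with a multiplicative lower-triangular mask and pushes the diagonal and everything above it to -inf with an additive mask; a minimal standalone sketch of that two-step masking (ignoring the downsampling stride and the scalar bias) might look like:

import math
import torch

def mask_current_and_future(scores):
    # scores: (batch, tgt_len, tgt_len) raw self-attention scores
    tgt_len = scores.size(-1)
    keep = torch.tril(scores.new_ones(tgt_len, tgt_len), diagonal=-1)               # 1 strictly below the diagonal
    block = torch.triu(scores.new_full((tgt_len, tgt_len), -math.inf), diagonal=0)  # -inf on and above it
    return scores * keep + block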
Example no. 7
def subsequent_mask(size):
    attn_shape = (1, size, size)
    mask = torch.triu(torch.ones(attn_shape, dtype=torch.float), diagonal=1)
    return mask == 0
Example no. 8
    def _forward(self, dec_inp, mems=None):
        qlen, bsz = dec_inp.size()

        word_emb = self.word_emb(dec_inp)

        mlen = mems[0].size(0) if mems is not None else 0
        klen = mlen + qlen
        if self.same_length:
            all_ones = word_emb.new_ones(qlen, klen)
            mask_len = klen - self.mem_len
            if mask_len > 0:
                mask_shift_len = qlen - mask_len
            else:
                mask_shift_len = qlen
            dec_attn_mask = (
                torch.triu(all_ones, 1 + mlen) +
                torch.tril(all_ones, -mask_shift_len)).bool()[:, :, None]  # -1
        else:
            dec_attn_mask = torch.triu(word_emb.new_ones(qlen, klen),
                                       diagonal=1 + mlen).bool()[:, :, None]

        hids = []
        if self.attn_type == 0:  # default
            pos_seq = torch.arange(klen - 1,
                                   -1,
                                   -1.0,
                                   device=word_emb.device,
                                   dtype=word_emb.dtype)
            if self.clamp_len > 0:
                pos_seq.clamp_(max=self.clamp_len)
            pos_emb = self.pos_emb(pos_seq)

            core_out = self.drop(word_emb)
            pos_emb = self.drop(pos_emb)

            hids.append(core_out)
            for i, layer in enumerate(self.layers):
                mems_i = None if mems is None else mems[i]
                core_out = layer(core_out,
                                 pos_emb,
                                 dec_attn_mask=dec_attn_mask,
                                 mems=mems_i)
                hids.append(core_out)
        elif self.attn_type == 1:  # learnable
            core_out = self.drop(word_emb)
            hids.append(core_out)
            for i, layer in enumerate(self.layers):
                if self.clamp_len > 0:
                    r_emb = self.r_emb[i][-self.clamp_len:]
                    r_bias = self.r_bias[i][-self.clamp_len:]
                else:
                    r_emb, r_bias = self.r_emb[i], self.r_bias[i]

                mems_i = None if mems is None else mems[i]
                core_out = layer(core_out,
                                 r_emb,
                                 self.r_w_bias[i],
                                 r_bias,
                                 dec_attn_mask=dec_attn_mask,
                                 mems=mems_i)
                hids.append(core_out)
        elif self.attn_type == 2:  # absolute
            pos_seq = torch.arange(klen - 1,
                                   -1,
                                   -1.0,
                                   device=word_emb.device,
                                   dtype=word_emb.dtype)
            if self.clamp_len > 0:
                pos_seq.clamp_(max=self.clamp_len)
            pos_emb = self.pos_emb(pos_seq)

            core_out = self.drop(word_emb + pos_emb[-qlen:])

            hids.append(core_out)
            for i, layer in enumerate(self.layers):
                mems_i = None if mems is None else mems[i]
                if mems_i is not None and i == 0:
                    mems_i += pos_emb[:mlen]
                core_out = layer(core_out,
                                 dec_attn_mask=dec_attn_mask,
                                 mems=mems_i)
                hids.append(core_out)
        elif self.attn_type == 3:
            core_out = self.drop(word_emb)

            hids.append(core_out)
            for i, layer in enumerate(self.layers):
                mems_i = None if mems is None else mems[i]
                if mems_i is not None and mlen > 0:
                    cur_emb = self.r_emb[i][:-qlen]
                    cur_size = cur_emb.size(0)
                    if cur_size < mlen:
                        cur_emb_pad = cur_emb[0:1].expand(
                            mlen - cur_size, -1, -1)
                        cur_emb = torch.cat([cur_emb_pad, cur_emb], 0)
                    else:
                        cur_emb = cur_emb[-mlen:]
                    mems_i += cur_emb.view(mlen, 1, -1)
                core_out += self.r_emb[i][-qlen:].view(qlen, 1, -1)

                core_out = layer(core_out,
                                 dec_attn_mask=dec_attn_mask,
                                 mems=mems_i)
                hids.append(core_out)

        core_out = self.drop(core_out)

        new_mems = self._update_mems(hids, mems, mlen, qlen)

        return core_out, new_mems
Example no. 9
def make_mask(n_head, length):
    a = torch.ones(1, n_head, length, length)
    b = torch.triu(a, diagonal=0)
    return b.transpose(2, 3)
Example no. 10
 def _generate_square_subsequent_mask(self, sz):
     #triu returns the upper triangular part of a matrix (2-D tensor) or batch of matrices (see section below)
     mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
     mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
     return mask
 def buffered_future_mask(self, tensor):
     dim = tensor.size(0)
     if not hasattr(self, '_future_mask') or self._future_mask is None or self._future_mask.device != tensor.device or self._future_mask.size(0) < dim:
         self._future_mask = torch.triu(utils.fill_with_neg_inf(tensor.new(dim, dim)), 1)
     return self._future_mask[:dim, :dim]
 def get_mask(self, seq):
     mask = (torch.triu(torch.ones(seq.size(1),
                                   seq.size(1))) == 1).transpose(0, 1)
     mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(
         mask == 1, float(0.0))
     return mask.to(self._config.device)
Example no. 13
            device = src.device
            mask = self._generate_square_subsequent_mask(len(src)).to(device)
            self.src_mask = mask

        src = self.encoder(src) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, self.src_mask)
        output = self.decoder(output)
        return output

"""#### Masking
By passing the mask into the transformer_encoder forward() function, attention is only computed over earlier positions in the sequence.
"""

#triu returns the upper triangular part of a matrix (2-D tensor) or batch of matrices (see section below)
torch.triu(torch.ones(3, 3))

# Masking
def masking():
  sz = 4
  mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
  mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
  
  return mask

masking()
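In a notebook cell, the masking() call above displays the additive mask for sz = 4: zeros on and below the diagonal, -inf above it.

# tensor([[0., -inf, -inf, -inf],
#         [0., 0., -inf, -inf],
#         [0., 0., 0., -inf],
#         [0., 0., 0., 0.]])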

"""### Positional Encoding

The Transformer takes the base architecture of a Seq2Seq model (Encoder - Decoder). However, it does not use a recurrent model, so we need a module that captures the order of the input/output sequences.
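A standard sinusoidal positional-encoding sketch along these lines (illustrative only; the notebook's own implementation is not part of this excerpt, and the sketch assumes an even d_model):

import math
import torch

def sinusoidal_positional_encoding(max_len, d_model):
    # pe[pos, 2i] = sin(pos / 10000^(2i/d_model)), pe[pos, 2i+1] = cos(pos / 10000^(2i/d_model))
    position = torch.arange(max_len, dtype=torch.float).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float) * (-math.log(10000.0) / d_model))
    pe = torch.zeros(max_len, d_model)
    pe[:, 0::2] = torch.sin(position * div_term)
    pe[:, 1::2] = torch.cos(position * div_term)
    return pe  # (max_len, d_model), added to the scaled token embeddings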
Example no. 14
    def forward_seq2seq(self, batch, target_masking=None, zero_encoder=False):
        """
        Inputs Shapes:
            input: (Variable) batch_size x len_tgt (wanna transpose)
            context: (Variable) batch_size x len_src x d_model
            mask_src (Tensor) batch_size x len_src
        Outputs Shapes:
            out: batch_size x len_tgt x d_model
            coverage: batch_size x len_tgt x len_src

        """
        src = batch.get('source')
        tgt = batch.get('target_input')
        input = torch.cat([src, tgt], dim=0)

        """ Embedding: batch_size x len_tgt x d_model """

        # we work with two embeddings at the same time
        src_emb = embedded_dropout(self.src_word_lut, src, dropout=self.word_dropout if self.training else 0)
        tgt_emb = embedded_dropout(self.tgt_word_lut, tgt, dropout=self.word_dropout if self.training else 0)

        # Concatenate the embeddings by time dimension
        emb = torch.cat([src_emb, tgt_emb], dim=0)

        # Add dropout and scale
        emb = self.preprocess_layer(emb)
        emb = emb * math.sqrt(self.model_size)

        klen, batch_size = emb.size(0), emb.size(1)

        # Prepare positional encoding:
        pos_seq = torch.arange(klen - 1, -1, -1.0, device=emb.device, dtype=emb.dtype)

        # pos_seq = torch.arange(0, klen, device=emb.device, dtype=emb.dtype)
        pos_emb = self.preprocess_layer(self.positional_encoder(pos_seq))

        if self.use_feature:
            raise NotImplementedError  # No feature/attributes for the moment

        # attention masking
        qlen = klen

        mlen = 0  # we don't have any memory in this mode

        # print(input)
        dec_attn_mask = torch.triu(
            emb.new_ones(qlen, klen), diagonal=1 + mlen).byte()[:, :, None]  #  Size T x T ?
        pad_mask = input.eq(onmt.Constants.PAD).byte().unsqueeze(1)  # Size 1 x T x B
        # pad_mask = input.new(*input.size()).zero_()
        mask = dec_attn_mask + pad_mask

        mask = torch.gt(mask, 0).bool()
        # mask = dec_attn_mask
        mask = mask.bool()
        output = emb

        for i, layer in enumerate(self.layer_modules):
            output, coverage = layer(output, pos_emb, self.r_w_bias, self.r_r_bias, mask)  # batch_size x len_src x d_model

        # From Google T2T
        # if normalization is done in layer_preprocess, then it should also be done
        # on the output, since the output can grow very large, being the sum of
        # a whole stack of unnormalized layer outputs.
        output = self.postprocess_layer(output)

        all_output = output

        src_len = src.size(0)
        context = output[src_len:, :, :]

        tgt_len = tgt.size(0)
        tgt_hiddens = output[:tgt_len, :, :]
        # output_dict = {'hidden': output, 'coverage': coverage, 'context': context}

        output_dict = defaultdict(lambda: None)
        output_dict['hidden'] = tgt_hiddens
        output_dict['encoder'] = context
        output_dict['src_mask'] = mask[src_len:, :, :]

        output = tgt_hiddens

        # This step removes the padding to reduce the load for the final layer
        if target_masking is not None:
            output = output.contiguous().view(-1, output.size(-1))

            mask = target_masking
            """ We remove all positions with PAD """
            flattened_mask = mask.view(-1)

            non_pad_indices = torch.nonzero(flattened_mask).squeeze(1)

            output = output.index_select(0, non_pad_indices)

        # final layer: computing softmax
        logprobs = self.generator[0](output)
        output_dict['logprobs'] = logprobs

        # return output, None
        return output_dict
def get_subsequent_mask(seq):
    ''' For masking out the subsequent info. '''
    sz_b, len_s = seq.size()
    subsequent_mask = (1 - torch.triu(
        torch.ones((1, len_s, len_s), device=seq.device), diagonal=1)).bool()
    return subsequent_mask
def generate_square_subsequent_mask(size: int):
    """Generate a triangular (size, size) mask."""
    mask = (torch.triu(torch.ones(size, size)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float("-inf")).masked_fill(mask == 1, float(0.0))
    return mask
Example no. 17
    def _forward(self,
                 inputs,
                 memory_bank,
                 src_pad_mask,
                 tgt_pad_mask,
                 layer_cache=None,
                 step=None,
                 future=False):
        """ A naive forward pass for transformer decoder.

        # T: could be 1 in the case of stepwise decoding or tgt_len

        Args:
            inputs (FloatTensor): ``(batch_size, T, model_dim)``
            memory_bank (FloatTensor): ``(batch_size, src_len, model_dim)``
            src_pad_mask (LongTensor): ``(batch_size, 1, src_len)``
            tgt_pad_mask (LongTensor): ``(batch_size, 1, T)``
            layer_cache (dict or None): cached layer info when stepwise decode
            step (int or None): stepwise decoding counter
            future (bool): If set True, do not apply future_mask.

        Returns:
            (FloatTensor, FloatTensor):

            * output ``(batch_size, T, model_dim)``
            * attns ``(batch_size, head, T, src_len)``

        """
        dec_mask = None

        if step is None:
            tgt_len = tgt_pad_mask.size(-1)
            if not future:  # apply future_mask, result mask in (B, T, T)
                future_mask = torch.ones([tgt_len, tgt_len],
                                         device=tgt_pad_mask.device,
                                         dtype=torch.uint8)
                future_mask = torch.triu(future_mask,
                                         1).view(1, tgt_len, tgt_len)
                #future_mask = future_mask.triu_(1).view(1, tgt_len, tgt_len)
                # BoolTensor was introduced in pytorch 1.2
                try:
                    future_mask = future_mask.bool()
                except AttributeError:
                    pass
                dec_mask = torch.gt(tgt_pad_mask + future_mask, 0)
            else:  # only mask padding, result mask in (B, 1, T)
                dec_mask = tgt_pad_mask

        input_norm = self.layer_norm_1(inputs)

        if isinstance(self.self_attn, MultiHeadedAttention):
            query, _ = self.self_attn(input_norm,
                                      input_norm,
                                      input_norm,
                                      mask=dec_mask,
                                      layer_cache=layer_cache,
                                      attn_type="self")
        elif isinstance(self.self_attn, AverageAttention):
            query, _ = self.self_attn(input_norm,
                                      mask=dec_mask,
                                      layer_cache=layer_cache,
                                      step=step)

        query = self.drop(query) + inputs

        query_norm = self.layer_norm_2(query)
        mid, attns = self.context_attn(memory_bank,
                                       memory_bank,
                                       query_norm,
                                       mask=src_pad_mask,
                                       layer_cache=layer_cache,
                                       attn_type="context")
        output = self.feed_forward(self.drop(mid) + query)

        return output, attns
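As a toy illustration of how the padding mask and the future mask combine into dec_mask above (shapes assumed for the example: batch of 2, target length 4, last two positions of the second sequence are padding):

import torch

tgt_len = 4
tgt_pad_mask = torch.tensor([[[0, 0, 0, 0]],
                             [[0, 0, 1, 1]]], dtype=torch.uint8)               # (batch, 1, T)
future_mask = torch.triu(torch.ones(tgt_len, tgt_len, dtype=torch.uint8), 1)   # (T, T)
dec_mask = torch.gt(tgt_pad_mask + future_mask.view(1, tgt_len, tgt_len), 0)   # (batch, T, T), True = masked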
Example no. 18
 def __init__(self, B, L, device="cpu"):
     mask_shape = [B, 1, L, L]
     with torch.no_grad():
         self._mask = torch.triu(torch.ones(mask_shape, dtype=torch.bool), diagonal=1).to(device)
Example no. 19
def batch_head_stats(attn_variables, triu_masking=False):
    # Retrieve context (shape bsz x nheads x L x dhead), mask (shape bsz x L) and weights (shape bsz x nheads x L x l)
    ctx = attn_variables["context"].detach()
    in_mask = attn_variables["in_mask"]
    out_mask = attn_variables["out_mask"]
    p = attn_variables["weights"].detach()
    logp = torch.log(p)
    device = p.device
    # Results
    results = {}
    # Triu mask for self att
    triu_mask = torch.triu(p.new_ones((p.size(2), p.size(3))), 1).byte()
    # Reverse mask
    if in_mask is not None:
        in_mask = torch.eq(in_mask, 0.0).float()
    else:
        in_mask = torch.ones(p.size(0), p.size(3)).to(ctx.device)

    # Reverse mask
    if out_mask is not None:
        out_mask = torch.eq(out_mask, 0.0).float()
    else:
        out_mask = torch.ones(ctx.size(0), ctx.size(2)).to(ctx.device)

    def reduce_head(x):
        return (x * out_mask.unsqueeze(1)).sum(0).sum(-1).detach().cpu()

    def reduce_head_pairs(x):
        return (
            x *
            out_mask.unsqueeze(1).unsqueeze(1)).sum(0).sum(-1).detach().cpu()

    # p_mask has shape bsz x -1 x -1 x l
    p_mask = in_mask.unsqueeze(1).unsqueeze(1)
    # Entropy
    plogp = p * logp
    plogp[p == 0] = 0
    if triu_masking:
        plogp.masked_fill_(triu_mask.unsqueeze(0).unsqueeze(0), 0)
    #plogp.masked_fill_(p_mask.eq(0), 0)
    H_p = -plogp.sum(-1)
    results["entropy"] = reduce_head(H_p)
    # Cross entropy
    plogq = torch.einsum("bilk,bjlk->bijlk", [p, logp])
    plogq.masked_fill_((p == 0).unsqueeze(1), 0)
    if triu_masking:
        plogq.masked_fill_(triu_mask.unsqueeze(0).unsqueeze(0).unsqueeze(0), 0)
    H_pq = -plogq.sum(-1)
    # Avg KL (bsz x nhead x L)
    avg_KL = (H_pq - H_p.unsqueeze(2))
    results["kl"] = reduce_head_pairs(avg_KL)
    if (results["kl"] == float('inf')).any():
        print(triu_mask)
        print(p)
        print(avg_KL)
        exit()
    # avg output disagreement
    out = ctx / (torch.sqrt((ctx**2).sum(-1, keepdim=True)) + 1e-20)
    out_dis = torch.einsum("bild,bjld->bijl", [out, out]) / out.size(1)**2
    results["out_dis"] = reduce_head_pairs(out_dis)
    # avg attn disagreement
    attn_dis = torch.einsum("bilk,bjlk->bijl", [p, p]) / p.size(1)**2
    results["attn_dis"] = reduce_head_pairs(attn_dis)
    # Avg attn offset
    self_pos = torch.arange(p.size(2)).to(device).float().view(1, 1, -1)
    if triu_masking:
        masked_p = torch.where(
            triu_mask.unsqueeze(0).unsqueeze(0), -p.new_ones(p.size()), p)
    else:
        masked_p = p
    attn_pos = masked_p.argmax(dim=-1).float()
    attn_offset = self_pos - attn_pos
    results["attn_pos"] = reduce_head(attn_pos)
    results["attn_offset"] = reduce_head(attn_offset)
    # Avg attn offset
    attn_dist = torch.abs(attn_offset)
    results["attn_dist"] = reduce_head(attn_dist)
    # Avg squared attn offset
    results["attn_offset_sq"] = reduce_head(attn_offset**2)
    results["attn_pos_sq"] = reduce_head(attn_pos**2)
    # Denominator
    denom = out_mask.sum().detach().cpu().data
    return results, denom
    def create_and_check_xlnet_model_use_mems(
        self,
        config,
        input_ids_1,
        input_ids_2,
        input_ids_q,
        perm_mask,
        input_mask,
        target_mapping,
        segment_ids,
        lm_labels,
        sequence_labels,
        is_impossible_labels,
        token_labels,
    ):
        model = XLNetModel(config=config)
        model.to(torch_device)
        model.eval()

        # first forward pass
        causal_mask = torch.ones(
            input_ids_1.shape[0],
            input_ids_1.shape[1],
            input_ids_1.shape[1],
            dtype=torch.float,
            device=torch_device,
        )
        causal_mask = torch.triu(causal_mask, diagonal=0)
        outputs_cache = model(input_ids_1, use_mems=True, perm_mask=causal_mask)
        outputs_no_cache = model(input_ids_1, use_mems=False, perm_mask=causal_mask)
        outputs_conf = model(input_ids_1)

        self.parent.assertTrue(len(outputs_cache) == len(outputs_conf))
        self.parent.assertTrue(len(outputs_cache) == len(outputs_no_cache) + 1)

        output, mems = outputs_cache.to_tuple()

        # create hypothetical next token and extent to next_input_ids
        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)

        # append to next input_ids and token_type_ids
        next_input_ids = torch.cat([input_ids_1, next_tokens], dim=-1)

        # causal mask
        causal_mask = torch.ones(
            input_ids_1.shape[0],
            input_ids_1.shape[1] + 1,
            input_ids_1.shape[1] + 1,
            dtype=torch.float,
            device=torch_device,
        )
        causal_mask = torch.triu(causal_mask, diagonal=0)
        single_mask = torch.ones(input_ids_1.shape[0], 1, 1, dtype=torch.float, device=torch_device)

        # second forward pass
        output_from_no_past = model(next_input_ids, perm_mask=causal_mask)["last_hidden_state"]
        output_from_past = model(next_tokens, mems=mems, perm_mask=single_mask)["last_hidden_state"]

        # select random slice
        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
        output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach()
        output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()

        # test that outputs are equal for slice
        self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
Example no. 21
    def forward(
        self,
        input_ids=None,
        past=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        use_cache=True,
    ):
        r"""
    Return:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.CTRLConfig`) and inputs:
        last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the last layer of the model.
        past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
            Contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `past` input) to speed up sequential decoding.
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        from transformers import CTRLTokenizer, CTRLModel
        import torch

        tokenizer = CTRLTokenizer.from_pretrained('ctrl')
        model = CTRLModel.from_pretrained('ctrl')

        input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids)

        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

        """

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError(
                "You cannot specify both input_ids and inputs_embeds at the same time"
            )
        elif input_ids is not None:
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
            batch_size = input_ids.shape[0]
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
            batch_size = inputs_embeds.shape[0]
        else:
            raise ValueError(
                "You have to specify either input_ids or inputs_embeds")

        if past is None:
            past_length = 0
            past = [None] * len(self.h)
        else:
            past_length = past[0][0].size(-2)
        if position_ids is None:
            device = input_ids.device if input_ids is not None else inputs_embeds.device
            position_ids = torch.arange(past_length,
                                        input_shape[-1] + past_length,
                                        dtype=torch.long,
                                        device=device)
            position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])

        # Attention mask.
        if attention_mask is not None:
            assert batch_size > 0, "batch_size has to be defined and > 0"
            attention_mask = attention_mask.view(batch_size, -1)
            # We create a 3D attention mask from a 2D tensor mask.
            # Sizes are [batch_size, 1, 1, to_seq_length]
            # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
            # this attention mask is more simple than the triangular masking of causal attention
            # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
            attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)

            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
            # masked positions, this operation will create a tensor which is 0.0 for
            # positions we want to attend and -10000.0 for masked positions.
            # Since we are adding it to the raw scores before the softmax, this is
            # effectively the same as removing these entirely.
            attention_mask = attention_mask.to(
                dtype=self.dtype)  # fp16 compatibility
            attention_mask = (1.0 - attention_mask) * -10000.0

        # Prepare head mask if needed
        head_mask = self.get_head_mask(head_mask, self.config.n_layer)

        if token_type_ids is not None:
            token_type_ids = token_type_ids.view(-1, input_shape[-1])
            token_type_embeds = self.w(token_type_ids)
            token_type_embeds *= np.sqrt(self.d_model_size)
        else:
            token_type_embeds = 0
        position_ids = position_ids.view(-1, input_shape[-1])

        if inputs_embeds is None:
            inputs_embeds = self.w(input_ids)
        # inputs_embeds = embedded.unsqueeze(0) if len(input_ids.shape)<2 else embedded
        seq_len = input_shape[-1]
        mask = torch.triu(
            torch.ones(seq_len + past_length, seq_len + past_length),
            1).to(inputs_embeds.device)

        inputs_embeds *= np.sqrt(self.d_model_size)

        pos_embeds = self.pos_encoding[position_ids, :].to(
            inputs_embeds.device)

        hidden_states = inputs_embeds + pos_embeds + token_type_embeds

        hidden_states = self.dropout(hidden_states)

        output_shape = input_shape + (inputs_embeds.size(-1), )
        presents = ()
        all_hidden_states = ()
        all_attentions = []
        for i, (h, layer_past) in enumerate(zip(self.h, past)):
            if self.output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states.view(
                    *output_shape), )
            outputs = h(
                hidden_states,
                mask,
                layer_past=layer_past,
                attention_mask=attention_mask,
                head_mask=head_mask[i],
                use_cache=use_cache,
            )
            hidden_states, present = outputs[:2]
            if use_cache is True:
                presents = presents + (present, )

            if self.output_attentions:
                all_attentions.append(outputs[2])

        hidden_states = self.layernorm(hidden_states)
        hidden_states = hidden_states.view(*output_shape)
        if self.output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states, )

        outputs = (hidden_states, )
        if use_cache is True:
            outputs = outputs + (presents, )
        if self.output_hidden_states:
            outputs = outputs + (all_hidden_states, )
        if self.output_attentions:
            # let the number of heads free (-1) so we can extract attention even after head pruning
            attention_output_shape = input_shape[:-1] + (
                -1, ) + all_attentions[0].shape[-2:]
            all_attentions = tuple(
                t.view(*attention_output_shape) for t in all_attentions)
            outputs = outputs + (all_attentions, )
        return outputs
Example no. 22
def conditional_corrcoeff(
    density: Any,
    limits: Tensor,
    condition: Tensor,
    subset: Optional[List[int]] = None,
    resolution: int = 50,
    warn_about_deprecation: bool = True,
) -> Tensor:
    r"""
    Returns the conditional correlation matrix of a distribution.

    To compute the conditional distribution, we condition all but two parameters to
    values from `condition`, and then compute the Pearson correlation
    coefficient $\rho$ between the remaining two parameters under the distribution
    `density`. We do so for any pair of parameters specified in `subset`, thus
    creating a matrix containing conditional correlations between any pair of
    parameters.

    If `condition` is a batch of conditions, this function computes the conditional
    correlation matrix for each one of them and returns the mean.

    Args:
        density: Probability density function with `.log_prob()` function.
        limits: Limits within which to evaluate the `density`.
        condition: Values to condition the `density` on. If a batch of conditions is
            passed, we compute the conditional correlation matrix for each of them and
            return the average conditional correlation matrix.
        subset: Evaluate the conditional distribution only on a subset of dimensions.
            If `None` this function uses all dimensions.
        resolution: Number of grid points on which the conditional distribution is
            evaluated. A higher value increases the accuracy of the estimated
            correlation but also increases the computational cost.
        warn_about_deprecation: With sbi v0.15.0, we deprecated the import of this
            function from `sbi.utils`. Instead, it should be imported from
            `sbi.analysis`.

    Returns: Average conditional correlation matrix of shape either `(num_dim, num_dim)`
    or `(len(subset), len(subset))` if `subset` was specified.
    """

    if warn_about_deprecation:
        warn(
            "Importing `conditional_corrcoeff` from `sbi.utils` is deprecated since "
            "sbi v0.15.0. Instead, use "
            "`from sbi.analysis import conditional_corrcoeff`."
        )

    condition = ensure_theta_batched(condition)

    if subset is None:
        subset = range(condition.shape[1])

    correlation_matrices = []
    for cond in condition:
        correlation_matrices.append(
            torch.stack(
                [
                    _compute_corrcoeff(
                        eval_conditional_density(
                            density,
                            cond,
                            limits,
                            dim1=dim1,
                            dim2=dim2,
                            resolution=resolution,
                            warn_about_deprecation=False,
                        ),
                        limits[[dim1, dim2]],
                    )
                    for dim1 in subset
                    for dim2 in subset
                    if dim1 < dim2
                ]
            )
        )

    average_correlations = torch.mean(torch.stack(correlation_matrices), dim=0)

    # `average_correlations` is still a vector containing the upper triangular entries.
    # Below, assemble them into a matrix:
    av_correlation_matrix = torch.zeros((len(subset), len(subset)))
    triu_indices = torch.triu_indices(row=len(subset), col=len(subset), offset=1)
    av_correlation_matrix[triu_indices[0], triu_indices[1]] = average_correlations

    # Make the matrix symmetric by copying upper diagonal to lower diagonal.
    av_correlation_matrix = torch.triu(av_correlation_matrix) + torch.tril(
        av_correlation_matrix.T
    )

    av_correlation_matrix.fill_diagonal_(1.0)
    return av_correlation_matrix
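The final assembly step (upper-triangular vector to symmetric correlation matrix) can be reproduced in isolation with toy numbers:

import torch

n = 3
upper = torch.tensor([0.2, 0.5, -0.1])                  # entries above the diagonal, row-major
mat = torch.zeros(n, n)
idx = torch.triu_indices(row=n, col=n, offset=1)
mat[idx[0], idx[1]] = upper
mat = torch.triu(mat) + torch.tril(mat.T)               # mirror the upper triangle to the lower one
mat.fill_diagonal_(1.0)
# -> [[1.0, 0.2, 0.5], [0.2, 1.0, -0.1], [0.5, -0.1, 1.0]]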
Example no. 23
def masking():
  sz = 4
  mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
  mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
  
  return mask
Example no. 24
def get_subsequent_mask(seq):
    sz_b, len_s = seq.size()
    subsequent_mask = torch.triu(torch.ones((1, len_s, len_s)),
                                 diagonal=1).bool()
    return subsequent_mask
Example no. 25
 def get_mask(size):
     weights = torch.triu(torch.ones((size, size), dtype = torch.bool), 1)
     return weights
Example no. 26
    def forward(
        self,
        input_ids=None,
        mems=None,
        head_mask=None,
        inputs_embeds=None,
        output_attentions=None,
        output_hidden_states=None,
        return_tuple=None,
    ):
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (output_hidden_states
                                if output_hidden_states is not None else
                                self.config.output_hidden_states)
        return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple

        # the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library
        # so we transpose here from shape [bsz, len] to shape [len, bsz]
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError(
                "You cannot specify both input_ids and inputs_embeds at the same time"
            )
        elif input_ids is not None:
            input_ids = input_ids.transpose(0, 1).contiguous()
            qlen, bsz = input_ids.size()
        elif inputs_embeds is not None:
            inputs_embeds = inputs_embeds.transpose(0, 1).contiguous()
            qlen, bsz = inputs_embeds.shape[0], inputs_embeds.shape[1]
        else:
            raise ValueError(
                "You have to specify either input_ids or inputs_embeds")

        if mems is None:
            mems = self.init_mems(bsz)

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] (a head_mask for each layer)
        # and head_mask is converted to shape [num_hidden_layers x qlen x klen x bsz x n_head]
        if head_mask is not None:
            if head_mask.dim() == 1:
                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(
                    0).unsqueeze(0)
                head_mask = head_mask.expand(self.n_layer, -1, -1, -1, -1)
            elif head_mask.dim() == 2:
                head_mask = head_mask.unsqueeze(1).unsqueeze(1).unsqueeze(1)
            head_mask = head_mask.to(dtype=next(self.parameters(
            )).dtype)  # switch to float if needed + fp16 compatibility
        else:
            head_mask = [None] * self.n_layer

        if inputs_embeds is not None:
            word_emb = inputs_embeds
        else:
            word_emb = self.word_emb(input_ids)

        mlen = mems[0].size(0) if mems is not None else 0
        klen = mlen + qlen
        if self.same_length:
            all_ones = word_emb.new_ones((qlen, klen), dtype=torch.uint8)
            mask_len = klen - self.mem_len
            if mask_len > 0:
                mask_shift_len = qlen - mask_len
            else:
                mask_shift_len = qlen
            dec_attn_mask = (torch.triu(all_ones, 1 + mlen) +
                             torch.tril(all_ones, -mask_shift_len))[:, :,
                                                                    None]  # -1
        else:
            dec_attn_mask = torch.triu(word_emb.new_ones((qlen, klen),
                                                         dtype=torch.uint8),
                                       diagonal=1 + mlen)[:, :, None]

        hids = []
        attentions = [] if output_attentions else None
        if self.attn_type == 0:  # default
            pos_seq = torch.arange(klen - 1,
                                   -1,
                                   -1.0,
                                   device=word_emb.device,
                                   dtype=word_emb.dtype)
            if self.clamp_len > 0:
                pos_seq.clamp_(max=self.clamp_len)
            pos_emb = self.pos_emb(pos_seq)

            core_out = self.drop(word_emb)
            pos_emb = self.drop(pos_emb)

            for i, layer in enumerate(self.layers):
                hids.append(core_out)
                mems_i = None if mems is None else mems[i]
                layer_outputs = layer(
                    core_out,
                    pos_emb,
                    dec_attn_mask=dec_attn_mask,
                    mems=mems_i,
                    head_mask=head_mask[i],
                    output_attentions=output_attentions,
                )
                core_out = layer_outputs[0]
                if output_attentions:
                    attentions.append(layer_outputs[1])
        else:  # learnable embeddings and absolute embeddings
            raise NotImplementedError  # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint

        core_out = self.drop(core_out)

        new_mems = self._update_mems(hids, mems, mlen, qlen)

        if output_hidden_states:
            # Add last layer and transpose to library standard shape [bsz, len, hidden_dim]
            hids.append(core_out)
            hids = tuple(t.transpose(0, 1).contiguous() for t in hids)
        else:
            hids = None
        if output_attentions:
            # Transpose to library standard shape [bsz, n_heads, query_seq_len, key_seq_len]
            attentions = tuple(
                t.permute(2, 3, 0, 1).contiguous() for t in attentions)
        # We transpose back here to shape [bsz, len, hidden_dim]
        core_out = core_out.transpose(0, 1).contiguous()

        if return_tuple:
            return tuple(v for v in [core_out, new_mems, hids, attentions]
                         if v is not None)

        return TransfoXLModelOutput(
            last_hidden_state=core_out,
            mems=new_mems,
            hidden_states=hids,
            attentions=attentions,
        )
Example no. 27
 def _assemble_W(self):
     """ assemble W from its pieces (P, L, U, S) """
     L = torch.tril(self.L, diagonal=-1) + torch.diag(torch.ones(self.dim))
     U = torch.triu(self.U, diagonal=1)
     W = self.P @ L @ (U + torch.diag(self.S))
     return W
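Because L is built with a unit diagonal and U + diag(S) is upper triangular with diagonal S, |det W| reduces to the product of |S|; a quick numerical check of the assembly above (illustrative values, not from the original repository):

import torch

dim = 4
P = torch.eye(dim)[torch.randperm(dim)]                             # random permutation matrix
L = torch.tril(torch.randn(dim, dim), diagonal=-1) + torch.eye(dim)
U = torch.triu(torch.randn(dim, dim), diagonal=1)
S = torch.randn(dim)
W = P @ L @ (U + torch.diag(S))
assert torch.allclose(torch.slogdet(W)[1], S.abs().log().sum(), atol=1e-3)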
Example no. 28
def get_att(head, seq_len):
    att = torch.triu(torch.ones(seq_len, seq_len).byte(), diagonal=1)
    for _ in range(2):
        att = torch.unsqueeze(att, dim=0)
    return att.repeat(1, head, 1, 1)
 def _generate_square_subsequent_mask(self, sz):
     mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
     mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(
         mask == 1, float(0.0))
     return mask
Example no. 30
 def recover_adj_lower(self, l):
     # NOTE: Assumes 1 per minibatch
     adj = torch.zeros(self.max_num_nodes, self.max_num_nodes)
     adj[torch.triu(torch.ones(self.max_num_nodes, self.max_num_nodes)) == 1] = l
     return adj
 def generate_square_subsequent_mask(sz):
     return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)
Example no. 32
 def generate_square_subsequent_mask(self, sz):
     mask = torch.triu(torch.ones(sz, sz), 1)
     mask = mask.masked_fill(mask == 1, float('-inf'))
     return mask
Example no. 33
def convert_examples_to_features(examples, seq_length, tokenizer):
    """Loads a data file into a list of `InputBatch`s."""

    features = []
    attention_mask = torch.tril(
        torch.ones(seq_length, seq_length, dtype=torch.long), -1) + torch.triu(
            torch.ones(seq_length, seq_length, dtype=torch.long), 1)

    for (ex_index, example) in enumerate(examples):
        tokens_a = example.text_a
        labels = example.labels

        tokens_b = None
        if example.text_b:
            tokens_b = tokenizer.tokenize(example.text_b)

        if tokens_b:
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > seq_length - 2:
                tokens_a = tokens_a[0:(seq_length - 2)]
            if len(labels) > seq_length - 2:
                labels = labels[0:(seq_length - 2)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids:   0   0  0    0    0     0      0   0    1  1  1   1  1   1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids:   0   0   0   0  0     0   0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens = []
        input_type_ids = []
        targets = []
        tokens.append("[CLS]")
        targets.append("[PAD]")
        input_type_ids.append(0)
        for i, token in enumerate(tokens_a):
            tokens.append(token)
            input_type_ids.append(0)
            targets.append(labels[i])
        tokens.append("[SEP]")
        targets.append("[PAD]")
        input_type_ids.append(0)

        if tokens_b:
            for token in tokens_b:
                tokens.append(token)
                input_type_ids.append(1)
            tokens.append("[SEP]")
            input_type_ids.append(1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        target_ids = tokenizer.convert_tokens_to_ids(targets)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        while len(input_ids) < seq_length:
            input_ids.append(0)
            input_mask.append(0)
            target_ids.append(0)
            input_type_ids.append(0)

        assert len(input_ids) == seq_length
        assert len(input_mask) == seq_length
        assert len(target_ids) == seq_length
        assert len(input_type_ids) == seq_length
        #input_mask=(torch.tensor(input_mask, dtype=torch.long).unsqueeze(0)*attention_mask).tolist()

        candidates = example.candidates

        if ex_index < 0:
            logger.info("*** Example ***")
            logger.info("unique_id: %s" % (example.unique_id))
            logger.info("tokens: %s" % " ".join([str(x) for x in tokens]))
            logger.info("input_ids: %s" % " ".join([str(x)
                                                    for x in input_ids]))
            logger.info("target_ids: %s" %
                        " ".join([str(x) for x in target_ids]))
            logger.info("input_mask: %s" %
                        " ".join([str(x) for x in input_mask]))
            logger.info("input_type_ids: %s" %
                        " ".join([str(x) for x in input_type_ids]))
        if False:
            logger.info("*** Example ***")
            logger.info("unique_id: %s" % (example.unique_id))
            logger.info("tokens: %s" % " ".join([str(x) for x in labels]))

        features.append(
            InputFeatures(
                unique_id=example.unique_id,
                target_ids=target_ids,
                input_ids=input_ids,
                input_mask=input_mask,
                input_type_ids=input_type_ids,
                candidates=candidates,
            ))
    return features
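The attention_mask built at the top of this function (though its application at the commented-out line is disabled) is 1 everywhere except on the diagonal; for seq_length = 3:

# tensor([[0, 1, 1],
#         [1, 0, 1],
#         [1, 1, 0]])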
Example no. 34
    def masking(self, weight):
        #         pdb.set_trace()
        mask = torch.triu(torch.ones_like(weight)).transpose(2, 3)
        weight[mask == 0] = float('-inf')

        return weight
Example no. 35
    def forward(
        self,  # type: ignore
        token_ids: torch.LongTensor,
        type_ids: torch.LongTensor,
        offsets: torch.LongTensor,
        wordpiece_mask: torch.BoolTensor,
        pos_tags: torch.LongTensor,
        word_mask: torch.BoolTensor,
        subtree_spans: torch.LongTensor = None,
    ):
        """  todo implement docstring
        Args:
            token_ids: [batch_size, num_word_pieces]
            type_ids: [batch_size, num_word_pieces]
            offsets: [batch_size, num_words, 2]
            wordpiece_mask: [batch_size, num_word_pieces]
            pos_tags: [batch_size, num_words]
            word_mask: [batch_size, num_words]
            subtree_spans: [batch_size, num_words, 2]
        Returns:
            span_start_logits: [batch_size, num_words, num_words]
            span_end_logits: [batch_size, num_words, num_words]

        """
        # [bsz, seq_len, hidden]
        embedded_text_input = self.get_word_embedding(
            token_ids=token_ids,
            offsets=offsets,
            wordpiece_mask=wordpiece_mask,
            type_ids=type_ids,
        )
        if self.pos_embedding is not None:
            embedded_pos_tags = self.pos_embedding(pos_tags)
            embedded_text_input = torch.cat(
                [embedded_text_input, embedded_pos_tags], -1)
            if self.fuse_layer is not None:
                embedded_text_input = self.fuse_layer(embedded_text_input)
        # todo compare normal dropout with InputVariationalDropout
        embedded_text_input = self._dropout(embedded_text_input)

        if self.additional_encoder is not None:
            if self.config.additional_layer_type == "transformer":
                extended_attention_mask = self.bert.get_extended_attention_mask(
                    word_mask, word_mask.size(), word_mask.device)
                encoded_text = self.additional_encoder(
                    hidden_states=embedded_text_input,
                    attention_mask=extended_attention_mask)[0]
            else:
                encoded_text = self.additional_encoder(
                    inputs=embedded_text_input, mask=word_mask)
        else:
            encoded_text = embedded_text_input

        batch_size, seq_len, encoding_dim = encoded_text.size()

        # [bsz, seq_len, dim]
        subtree_start_representation = self._dropout(
            self.subtree_start_feedforward(encoded_text))
        subtree_end_representation = self._dropout(
            self.subtree_end_feedforward(encoded_text))
        # [bsz, seq_len, seq_len]
        span_start_scores = self.subtree_start_attention(
            subtree_start_representation, subtree_start_representation)
        span_end_scores = self.subtree_end_attention(
            subtree_end_representation, subtree_end_representation)

        # start of word should be less than or equal to it
        start_mask = word_mask.unsqueeze(-1) & (
            ~torch.triu(span_start_scores.bool(), 1))
        # end of word should be greater than or equal to it.
        end_mask = word_mask.unsqueeze(-1) & torch.triu(span_end_scores.bool())

        minus_inf = -1e8
        span_start_scores = span_start_scores + (
            ~start_mask).float() * minus_inf
        span_end_scores = span_end_scores + (~end_mask).float() * minus_inf

        output = (F.log_softmax(span_start_scores,
                                dim=-1), F.log_softmax(span_end_scores,
                                                       dim=-1))

        if subtree_spans is not None:

            start_loss = F.cross_entropy(
                span_start_scores.view(batch_size * seq_len, -1),
                subtree_spans[:, :, 0].view(-1))
            end_loss = F.cross_entropy(
                span_end_scores.view(batch_size * seq_len, -1),
                subtree_spans[:, :, 1].view(-1))
            span_loss = start_loss + end_loss
            output = output + (span_loss, )

        return output
Example no. 36
def half_and_half(a, b):
    a = torch.stack([torch.triu(x) for x in a], 0)
    b = torch.stack([torch.tril(x, diagonal=-1) for x in b], 0)
    return a + b
Example no. 37
def sequence_mask(seq):
    batch_size, seq_len = seq.size()
    mask = torch.triu(torch.ones((seq_len, seq_len), dtype=torch.uint8),
                      diagonal=1)
    mask = mask.unsqueeze(0).expand(batch_size, -1, -1)
    return mask
Example no. 38
print(torch.rand(5))  # tensor([ 0.4177,  0.4903,  0.5730,  0.1205,  0.1452]); It is a Vector
print(torch.randint(10, (2, 2)))  # tensor([[3., 3.], [8., 2.]]); still real numbers (floats)
print(torch.randint(10, (2, 2), dtype=torch.long))  # tensor([[8, 2], [8, 5]])
# arange & linspace
x = torch.arange(1, 8); print(x)  # tensor([1, 2, 3, 4, 5, 6, 7])
x = torch.linspace(-1, 1, 10); print(x, x.shape)  # torch.Size([10])
x = torch.full((2, 3), 0.1); print(x)  # tensor([[ 0.1000,  0.1000,  0.1000], [ 0.1000,  0.1000,  0.1000]])
x = torch.ones(2, 3); print(x)
print(x.zero_())  # sets every element to 0; this modifies the original tensor in place
x = torch.zeros(2, 3); print(x)
print(x.random_(2))  # discrete uniform distribution over [from, to - 1]; here a uniform distribution over [0, 1]
x = torch.empty(2, 3); print(x)  # Returns a tensor filled with uninitialized data

## triu(input, diagonal=0, out=None): zeros out the lower triangle
a = torch.randn(3, 3); print(a)
print(torch.triu(a))  # diagonal=0: all elements on and above the main diagonal are retained; the diagonal is not zeroed
print(torch.triu(a, diagonal=1))  # a positive value excludes just as many diagonals above the main diagonal; the diagonal is also zeroed
print(torch.triu(a, diagonal=-1))  # a negative value includes just as many diagonals below the main diagonal

## squeeze
x = torch.unsqueeze(torch.linspace(-1, 1, 10), dim=0); print(x.shape, x)  # torch.Size([1, 10])
x = torch.unsqueeze(torch.linspace(-1, 1, 10), dim=1); print(x.shape, x)  # torch.Size([10, 1]); new tensor with a dimension of size one inserted at the specified position.
x = torch.unsqueeze(torch.linspace(-1, 1, 10), dim=-1); print(x.shape, x)  # torch.Size([10, 1]);
x = torch.unsqueeze(torch.rand(2, 4), dim=1); print(x.shape)  # torch.Size([2, 1, 4])
x = torch.zeros(2, 1, 2, 1, 2); print(x.shape)  # torch.Size([2, 1, 2, 1, 2])
y = torch.squeeze(x); print(y.shape)  # torch.Size([2, 2, 2]); all dimensions of size 1 are removed
y = torch.squeeze(x, 0); print(y.shape)  # torch.Size([2, 1, 2, 1, 2]); only operates on dim 0
y = torch.squeeze(x, 1); print(y.shape)  # torch.Size([2, 2, 1, 2]); only operates on dim 1

## Operations; the difference from np is that torch requires all variables to be tensors
# Summation and indexed summation: torch.sum(), torch.Tensor.index_add_()
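An illustrative continuation in the same style (not part of the original listing):
x = torch.ones(2, 3); print(torch.sum(x))  # tensor(6.)
print(torch.sum(x, dim=0))  # tensor([2., 2., 2.])
t = torch.zeros(3, 3)
print(t.index_add_(0, torch.tensor([0, 2]), torch.ones(2, 3)))  # adds a row of ones to rows 0 and 2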