Code Example #1
    def __call__(self, start_logits: torch.Tensor, end_logits: torch.Tensor,
                 match_logits: torch.Tensor,
                 mask: torch.BoolTensor) -> torch.LongTensor:

        mask = mask.bool()
        batch_size, seq_len = start_logits.size()

        # match label pred, [batch_size, seq_len, seq_len]
        match_preds = match_logits > 0

        # For masking, applying the mask to either match_preds or to start/end alone would suffice
        match_preds = match_preds \
                      & mask.unsqueeze(-1).expand(-1, -1, seq_len) \
                      & mask.unsqueeze(1).expand(-1, seq_len, -1)

        # [batch_size, seq_len]
        start_preds = start_logits > 0

        start_preds = start_preds & mask

        # [batch_size, seq_len]
        end_preds = end_logits > 0
        end_preds = end_preds & mask

        # final match label result
        match_preds = (match_preds
                       & start_preds.unsqueeze(-1).expand(-1, -1, seq_len)
                       & end_preds.unsqueeze(1).expand(-1, seq_len, -1))

        return match_preds
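
Below is a minimal, self-contained sketch of the same decoding rule with made-up logits and mask; the threshold of 0 and the shape conventions follow the method above.

import torch

batch_size, seq_len = 2, 4
start_logits = torch.randn(batch_size, seq_len)
end_logits = torch.randn(batch_size, seq_len)
match_logits = torch.randn(batch_size, seq_len, seq_len)
mask = torch.tensor([[1, 1, 1, 0],
                     [1, 1, 0, 0]], dtype=torch.bool)

# A span (i, j) is predicted only if match_logits[b, i, j] > 0,
# start_logits[b, i] > 0, end_logits[b, j] > 0, and both positions are unmasked.
match_preds = (match_logits > 0) & mask.unsqueeze(-1) & mask.unsqueeze(1)
span_preds = (match_preds
              & ((start_logits > 0) & mask).unsqueeze(-1)
              & ((end_logits > 0) & mask).unsqueeze(1))
print(span_preds.nonzero())  # each row is (batch_index, span_start, span_end)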
Code Example #2
    def _construct_loss(
        self,
        arc_scores: torch.Tensor,
        arc_tag_logits: torch.Tensor,
        arc_tags: torch.Tensor,
        mask: torch.BoolTensor,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Computes the arc and tag loss for an adjacency matrix.

        # Parameters

        arc_scores : `torch.Tensor`, required.
            A tensor of shape (batch_size, sequence_length, sequence_length) used to generate a
            binary classification decision for whether an edge is present between two words.
        arc_tag_logits : `torch.Tensor`, required.
            A tensor of shape (batch_size, sequence_length, sequence_length, num_tags) used to generate
            a distribution over edge tags for a given edge.
        arc_tags : `torch.Tensor`, required.
            A tensor of shape (batch_size, sequence_length, sequence_length).
            The labels for every arc.
        mask : `torch.BoolTensor`, required.
            A mask of shape (batch_size, sequence_length), denoting unpadded
            elements in the sequence.

        # Returns

        arc_nll : `torch.Tensor`, required.
            The negative log likelihood from the arc loss.
        tag_nll : `torch.Tensor`, required.
            The negative log likelihood from the arc tag loss.
        """
        arc_indices = (arc_tags != -1).float()
        # Make the arc tags not have negative values anywhere
        # (by default, no edge is indicated with -1).
        arc_tags = arc_tags * arc_indices
        arc_nll = self._arc_loss(
            arc_scores, arc_indices) * mask.unsqueeze(1) * mask.unsqueeze(2)
        # We want the mask for the tags to only include the unmasked words
        # and we only care about the loss with respect to the gold arcs.
        tag_mask = mask.unsqueeze(1) * mask.unsqueeze(2) * arc_indices

        batch_size, sequence_length, _, num_tags = arc_tag_logits.size()
        original_shape = [batch_size, sequence_length, sequence_length]
        reshaped_logits = arc_tag_logits.view(-1, num_tags)
        reshaped_tags = arc_tags.view(-1)
        tag_nll = (self._tag_loss(reshaped_logits,
                                  reshaped_tags.long()).view(original_shape) *
                   tag_mask)

        valid_positions = tag_mask.sum()

        arc_nll = arc_nll.sum() / valid_positions.float()
        tag_nll = tag_nll.sum() / valid_positions.float()
        return arc_nll, tag_nll
Code Example #3
    def forward(self, tokens: torch.Tensor, mask: torch.BoolTensor = None):
        if mask is not None:
            tokens = tokens * mask.unsqueeze(-1)

        # Our input has shape `(batch_size, num_tokens, embedding_dim)`, so we sum out the `num_tokens`
        # dimension.
        summed = tokens.sum(1)

        if self._averaged:
            if mask is not None:
                lengths = get_lengths_from_binary_sequence_mask(mask)
                length_mask = lengths > 0

                # Set any length 0 to 1, to avoid dividing by zero.
                lengths = torch.max(lengths, lengths.new_ones(1))
            else:
                lengths = tokens.new_full((1, ), fill_value=tokens.size(1))
                length_mask = None

            summed = summed / lengths.unsqueeze(-1).float()

            if length_mask is not None:
                summed = summed * (length_mask > 0).unsqueeze(-1)

        return summed
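
For reference, the averaged branch above reduces to the following self-contained computation, assuming `get_lengths_from_binary_sequence_mask` simply returns `mask.sum(-1)`:

import torch

tokens = torch.randn(2, 4, 5)   # (batch_size, num_tokens, embedding_dim)
mask = torch.tensor([[1, 1, 1, 0],
                     [1, 0, 0, 0]], dtype=torch.bool)

# Zero out padded tokens, sum over the token dimension, then divide by the
# number of real tokens (clamped to at least 1 to avoid dividing by zero).
lengths = mask.sum(dim=-1)
averaged = (tokens * mask.unsqueeze(-1)).sum(1) / torch.clamp(lengths, min=1).unsqueeze(-1).float()
print(averaged.shape)  # torch.Size([2, 5])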
Code Example #4
File: cnn_encoder.py  Project: shimengfeng/allennlp
    def forward(self, tokens: torch.Tensor, mask: torch.BoolTensor):
        if mask is not None:
            tokens = tokens * mask.unsqueeze(-1)

        # Our input is expected to have shape `(batch_size, num_tokens, embedding_dim)`.  The
        # convolution layers expect input of shape `(batch_size, in_channels, sequence_length)`,
        # where the conv layer `in_channels` is our `embedding_dim`.  We thus need to transpose the
        # tensor first.
        tokens = torch.transpose(tokens, 1, 2)
        # Each convolution layer returns output of size `(batch_size, num_filters, pool_length)`,
        # where `pool_length = num_tokens - ngram_size + 1`.  We then do an activation function,
        # then do max pooling over each filter for the whole input sequence.  Because our max
        # pooling is simple, we just use `torch.max`.  The resultant tensor of has shape
        # `(batch_size, num_conv_layers * num_filters)`, which then gets projected using the
        # projection layer, if requested.

        filter_outputs = []
        for i in range(len(self._convolution_layers)):
            convolution_layer = getattr(self, "conv_layer_{}".format(i))
            filter_outputs.append(
                self._activation(convolution_layer(tokens)).max(dim=2)[0])

        # Now we have a list of `num_conv_layers` tensors of shape `(batch_size, num_filters)`.
        # Concatenating them gives us a tensor of shape `(batch_size, num_filters * num_conv_layers)`.
        maxpool_output = (torch.cat(filter_outputs, dim=1)
                          if len(filter_outputs) > 1 else filter_outputs[0])

        if self.projection_layer:
            result = self.projection_layer(maxpool_output)
        else:
            result = maxpool_output
        return result
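
The forward pass above assumes one `Conv1d` per n-gram size was registered under the attribute names `conv_layer_{i}`, plus an optional projection layer. A minimal constructor sketch consistent with that usage (the parameter names here are assumptions, not the exact AllenNLP `CnnEncoder` signature):

import torch
from torch import nn

class SimpleCnnEncoder(nn.Module):
    def __init__(self, embedding_dim, num_filters, ngram_filter_sizes=(2, 3, 4, 5),
                 activation=torch.relu, projection_dim=None):
        super().__init__()
        self._activation = activation
        self._convolution_layers = [
            nn.Conv1d(in_channels=embedding_dim, out_channels=num_filters,
                      kernel_size=ngram_size)
            for ngram_size in ngram_filter_sizes
        ]
        # Register each conv under the attribute name the forward pass looks up.
        for i, conv_layer in enumerate(self._convolution_layers):
            self.add_module("conv_layer_{}".format(i), conv_layer)
        maxpool_output_dim = num_filters * len(ngram_filter_sizes)
        self.projection_layer = (nn.Linear(maxpool_output_dim, projection_dim)
                                 if projection_dim else None)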
Code Example #5
    def forward(
        self,
        previous_state: Dict[str, torch.Tensor],
        encoder_outputs: torch.Tensor,
        source_mask: torch.BoolTensor,
        previous_steps_predictions: torch.Tensor,
        previous_steps_mask: Optional[torch.BoolTensor] = None,
    ) -> Tuple[Dict[str, torch.Tensor], torch.Tensor]:

        source_mask = source_mask.unsqueeze(-2)
        future_mask = Variable(
            subsequent_mask(previous_steps_predictions.size(-2),
                            device=source_mask.device).type_as(
                                source_mask.data))
        if previous_steps_mask is None:
            previous_steps_mask = future_mask
        else:
            previous_steps_mask = previous_steps_mask.unsqueeze(
                -2) & future_mask
        previous_steps_predictions = previous_steps_predictions * self._embed_scale
        if self._positional_embedder:
            previous_steps_predictions = self._positional_embedder(
                previous_steps_predictions)
        previous_steps_predictions = self._dropout(previous_steps_predictions)
        decoded = self._self_attention(previous_steps_predictions,
                                       encoder_outputs, source_mask,
                                       previous_steps_mask)
        return {}, decoded
Code Example #6
    def forward(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        mask: torch.BoolTensor = None,
    ) -> torch.Tensor:
        if mask is not None:
            # Same mask applied to all h heads.
            # Shape (batch_size, num_heads, timesteps, timesteps)
            mask = mask.unsqueeze(1).expand([-1, self.num_heads, -1, -1])

        nbatches = query.size(0)

        # 1) Do all the linear projections in batch from d_model => h x d_k
        query, key, value = [
            layer(x).view(nbatches, -1, self.num_heads,
                          self.d_k).transpose(1, 2)
            for layer, x in zip(self.linears, (query, key, value))
        ]

        # 2) Apply attention on all the projected vectors in batch.
        x, _ = attention(query, key, value, mask=mask, dropout=self.dropout)

        # 3) "Concat" using a view and apply a final linear.
        x = x.transpose(1, 2).contiguous().view(nbatches, -1,
                                                self.num_heads * self.d_k)
        return self.linears[-1](x)
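
The `attention` helper used above is not shown. A standard scaled dot-product sketch that matches the call `attention(query, key, value, mask=mask, dropout=self.dropout)`, assuming the mask is `True` for positions that may be attended to:

import math
import torch

def attention(query, key, value, mask=None, dropout=None):
    """Scaled dot-product attention; returns (weighted values, attention weights)."""
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        # Positions where mask is False get -inf scores and therefore ~0 attention.
        scores = scores.masked_fill(~mask, float("-inf"))
    p_attn = torch.softmax(scores, dim=-1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn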
Code Example #7
def masked_log_softmax(vector: torch.Tensor, mask: torch.BoolTensor, dim: int = -1) -> torch.Tensor:
    """
    `torch.nn.functional.log_softmax(vector)` does not work if some elements of `vector` should be
    masked.  This performs a log_softmax on just the non-masked portions of `vector`.  Passing
    `None` in for the mask is also acceptable; you'll just get a regular log_softmax.
    `vector` can have an arbitrary number of dimensions; the only requirement is that `mask` is
    broadcastable to `vector's` shape.  If `mask` has fewer dimensions than `vector`, we will
    unsqueeze on dimension 1 until they match.  If you need a different unsqueezing of your mask,
    do it yourself before passing the mask into this function.
    In the case that the input vector is completely masked, the return value of this function is
    arbitrary, but not `nan`.  You should be masking the result of whatever computation comes out
    of this in that case, anyway, so the specific values returned shouldn't matter.  Also, the way
    that we deal with this case relies on having single-precision floats; mixing half-precision
    floats with fully-masked vectors will likely give you `nans`.
    If your logits are all extremely negative (i.e., the max value in your logit vector is -50 or
    lower), the way we handle masking here could mess you up.  But if you've got logit values that
    extreme, you've got bigger problems than this.
    """
    if mask is not None:
        while mask.dim() < vector.dim():
            mask = mask.unsqueeze(1)
        # vector + mask.log() is an easy way to zero out masked elements in logspace, but it
        # results in nans when the whole vector is masked.  We need a very small value instead of a
        # zero in the mask for these cases.
        vector = vector + (mask + 1e-30).log()
    return torch.nn.functional.log_softmax(vector, dim=dim)
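
A quick check of the behaviour the docstring describes (the printed numbers are approximate):

import torch

logits = torch.tensor([[1.0, 2.0, 3.0]])
mask = torch.tensor([[True, True, False]])
print(masked_log_softmax(logits, mask))
# The unmasked positions renormalize among themselves (about -1.31 and -0.31),
# while the masked position is pushed to a very large negative value (about -68),
# i.e. essentially zero probability.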
Code Example #8
    def forward(self,
                tensors: List[torch.Tensor],
                mask: torch.BoolTensor = None) -> torch.Tensor:
        """
        Compute a weighted average of the `tensors`.  The input tensors can be any shape
        with at least two dimensions, but must all be the same shape.

        When `do_layer_norm=True`, the `mask` is required input.  If the `tensors` are
        dimensioned  `(dim_0, ..., dim_{n-1}, dim_n)`, then the `mask` is dimensioned
        `(dim_0, ..., dim_{n-1})`, as in the typical case with `tensors` of shape
        `(batch_size, timesteps, dim)` and `mask` of shape `(batch_size, timesteps)`.

        When `do_layer_norm=False` the `mask` is ignored.
        """
        assert (len(tensors) == self.mixture_size)

        def _do_layer_norm(tensor, broadcast_mask, num_elements_not_masked):
            tensor_masked = tensor * broadcast_mask
            mean = torch.sum(tensor_masked) / num_elements_not_masked
            sum_value = torch.sum(((tensor_masked - mean) * broadcast_mask)**2)
            # sum_value_max= torch.finfo(sum_value.dtype).max
            # sum_value = sum_value if sum_value < sum_value_max else sum_value_max
            variance = (sum_value / num_elements_not_masked)
            return (tensor -
                    mean) / torch.sqrt(variance +
                                       tiny_value_of_dtype(variance.dtype))

        normed_weights = torch.nn.functional.softmax(torch.cat(
            [parameter for parameter in self.scalar_parameters]),
                                                     dim=0)

        if torch.cuda.is_available():
            normed_weights = normed_weights.cuda()
            self.gamma = self.gamma.to('cuda')
        normed_weights = torch.split(normed_weights, split_size_or_sections=1)
        if not self.do_layer_norm:
            pieces = []
            for weight, tensor in zip(normed_weights, tensors):
                pieces.append(weight * tensor)

            return self.gamma * sum(pieces)

        else:
            broadcast_mask = mask.unsqueeze(-1)
            input_dim = tensors[0].size(-1)
            # Padded positions are 0 and real tokens are 1, so sum(mask) is the total number of tokens; input_dim is e.g. 768
            num_elements_not_masked = torch.sum(mask) * input_dim

            pieces = []
            for weight, tensor in zip(normed_weights, tensors):
                pieces.append(weight * _do_layer_norm(tensor, broadcast_mask,
                                                      num_elements_not_masked))
            # print("gamma:",self.gamma)
            # print("gamma:",self.gamma.grad)
            # print("gamma:",self.gamma.requires_grad)
            # print("normed_weights:", (normed_weights[0]))
            # print("normed_weights:", (normed_weights[0]).requires_grad)
            # print("normed_weights:", (normed_weights[0]).grad)

            return self.gamma * sum(pieces)
Code Example #9
def batched_index_or(batched_set : torch.BoolTensor, mapping : torch.BoolTensor) -> torch.BoolTensor:
    """
    batched_set : shape (batch_size, set capacity)
    mapping : shape (batch_size, set capacity, "constants")
    returns a bool tensor R of shape (batch_size, "constants")
    R[b,c] = True iff \exists l, batched_set[b,l] AND mapping[b,l,c]
    """
    # torch.bmm does not support bool tensors, so cast to float before multiplying.
    result = torch.bmm(batched_set.unsqueeze(1).float(), mapping.float()).squeeze(1)  # shape (batch_size, "constants")
    return result > 0
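
A tiny worked example of the semantics in the docstring, with made-up inputs:

import torch

# batch of 1, set capacity 3, 4 "constants"
batched_set = torch.tensor([[True, False, True]])
mapping = torch.tensor([[[1, 0, 0, 0],
                         [0, 1, 0, 0],
                         [0, 0, 1, 1]]], dtype=torch.bool)
print(batched_index_or(batched_set, mapping))
# tensor([[ True, False,  True,  True]])
# Constant 0 is reachable through set element 0 and constants 2, 3 through
# element 2; constant 1 would need element 1, which is not in the set.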
Code Example #10
    def _greedy_decode(
        self,
        head_tag_representation: torch.Tensor,
        child_tag_representation: torch.Tensor,
        attended_arcs: torch.Tensor,
        mask: torch.BoolTensor,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Decodes the head and head tag predictions by decoding the unlabeled arcs
        independently for each word and then again, predicting the head tags of
        these greedily chosen arcs independently. Note that this method of decoding
        is not guaranteed to produce trees (i.e. there may be multiple roots,
        or cycles when children are attached to their parents).

        # Parameters

        head_tag_representation : `torch.Tensor`, required.
            A tensor of shape (batch_size, sequence_length, tag_representation_dim),
            which will be used to generate predictions for the dependency tags
            for the given arcs.
        child_tag_representation : `torch.Tensor`, required
            A tensor of shape (batch_size, sequence_length, tag_representation_dim),
            which will be used to generate predictions for the dependency tags
            for the given arcs.
        attended_arcs : `torch.Tensor`, required.
            A tensor of shape (batch_size, sequence_length, sequence_length) used to generate
            a distribution over attachments of a given word to all other words.

        # Returns

        heads : `torch.Tensor`
            A tensor of shape (batch_size, sequence_length) representing the
            greedily decoded heads of each word.
        head_tags : `torch.Tensor`
            A tensor of shape (batch_size, sequence_length) representing the
            dependency tags of the greedily decoded heads of each word.
        """
        # Mask the diagonal, because the head of a word can't be itself.
        attended_arcs = attended_arcs + torch.diag(
            attended_arcs.new(mask.size(1)).fill_(-numpy.inf)
        )
        # Mask padded tokens, because we only want to consider actual words as heads.
        if mask is not None:
            minus_mask = ~mask.unsqueeze(2)
            attended_arcs.masked_fill_(minus_mask, -numpy.inf)

        # Compute the heads greedily.
        # shape (batch_size, sequence_length)
        _, heads = attended_arcs.max(dim=2)

        # Given the greedily predicted heads, decode their dependency tags.
        # shape (batch_size, sequence_length, num_head_tags)
        head_tag_logits = self._get_head_tags(
            head_tag_representation, child_tag_representation, heads
        )
        _, head_tags = head_tag_logits.max(dim=2)
        return heads, head_tags
Code Example #11
    def forward(self, query: torch.Tensor, key: torch.Tensor,
                value: torch.Tensor, mask: torch.BoolTensor):
        """[summary]

        Parameters
        ----------
        query : torch.Tensor
            [shape : (batch_size, query_len, hidden_dim)]
        key : torch.Tensor
            [shape : (batch_size, key_len, hidden_dim)]
        value : torch.Tensor
            [shape : (batch_size, key_len, hidden_dim)]
        mask : torch.BoolTensor
            [shape : (batch_size, query_len, key_len)]
        """
        batch_size = query.shape[0]
        Q, K, V = self.fc_query(query), self.fc_key(key), self.fc_value(value)

        #query = [batch size, query len, hid dim]
        #key = [batch size, key len, hid dim]
        #value = [batch size, key len, hid dim]

        Q = Q.view(batch_size, -1, self.n_heads,
                   self.head_dim).permute(0, 2, 1, 3)
        K = K.view(batch_size, -1, self.n_heads,
                   self.head_dim).permute(0, 2, 1, 3)
        V = V.view(batch_size, -1, self.n_heads,
                   self.head_dim).permute(0, 2, 1, 3)

        #Q = [batch size, n heads, query len, head dim]
        #K = [batch size, n heads, key len, head dim]
        #V = [batch size, n heads, value len, head dim]

        energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale

        #energy = [batch size, n heads, query len, key len]
        if mask is not None:
            # mask: (batch_size, query_len, key_len); True marks positions to mask out
            mask = mask.unsqueeze(1).expand(batch_size, self.n_heads, -1, -1)
            energy = energy.masked_fill(mask, float("-inf"))

        score = torch.softmax(energy, dim=-1)
        #score = [batch size, n heads, query len, key len]

        res = torch.matmul(self.dropout(score), V)
        #res = [batch size, n heads, query len, head dim]

        res = res.permute(0, 2, 1, 3).contiguous()
        #res = [batch size, query len, n heads, head dim]

        res = res.view(batch_size, -1, self.hidden_dim)
        #res = [batch size, query len, hidden dim]

        res = self.fc_output(res)
        #res = [batch size, query len, hidden dim]
        return res, score.mean(dim=1)  # score [batch_size, query len, key len]
Code Example #12
    def forward(self, tensor: torch.Tensor,
                mask: torch.BoolTensor) -> torch.Tensor:

        broadcast_mask = mask.unsqueeze(-1)
        num_elements = broadcast_mask.sum() * self.size
        mean = (tensor * broadcast_mask).sum() / num_elements
        masked_centered = (tensor - mean) * broadcast_mask
        std = torch.sqrt((masked_centered * masked_centered).sum() /
                         num_elements + self.eps)
        return self.gamma * (tensor - mean) / (std + self.eps) + self.beta
Code Example #13
File: poolers.py  Project: valterlej/coot-videotext
    def forward(self, features: torch.FloatTensor, mask: torch.BoolTensor,
                _lengths: torch.LongTensor):
        """
        Args:
            features: Input features, shape (batch_size, seq_len, feat_dim)
            mask: Input mask, shape (batch_size, seq_len)
            _lengths: Input lengths, unused, shape (batch_size)

        Returns:
            Pooled features of shape (batch_size, feat_dim) and the attention weights used for pooling.
        """
        # print(f"genpool input {features.shape}")
        _batch_size, seq_len, input_dim = features.shape
        # apply first FCs, one for each head

        # features (batch, seq_len, d_input)
        # weight1 (num_heads, d_input, d_head)
        b1 = torch.matmul(features.unsqueeze(1),
                          self.genpool_w1_head.unsqueeze(0))
        b1 += self.genpool_b1_head.unsqueeze(1).unsqueeze(0)
        # output (batch, num_heads, seq_len, d_head)

        # dropout + activation
        # apply nonlinear activation
        b1 = self.activation(self.dropout1(b1))

        # apply second FCs, one for each head
        # weight2 (num_heads, d_head, d_head_output)
        b1 = torch.matmul(b1, self.genpool_w2_head.unsqueeze(0))
        b1 += self.genpool_b2_head.unsqueeze(1).unsqueeze(0)
        # output (batch, num_heads, seq_len, d_head_output)

        # dropout
        b1 = self.dropout2(b1)

        # set pre-softmax activations for masked sequence elements to -inf
        # mask shape (batch, seq_len)
        b1.masked_fill_(mask.unsqueeze(1).unsqueeze(-1), -INF)

        # now softmax individually per head over the sequence
        smweights = self.softmax(b1 / self.softmax_temp)
        # shape (batch, num_heads, seq_len, d_head_output)

        # drop attentions
        smweights = self.dropout3(smweights)

        # multiply input features with softmax weights for all heads
        smweights = smweights.transpose(1, 2).reshape(-1, seq_len, input_dim)
        # shape (batch, seq_len, input_dim)

        # use the attention weights to pool over the sequence and done
        pooled = (features * smweights).sum(dim=1)

        # return
        return pooled, smweights
Code Example #14
    def forward(self, tensor: torch.Tensor,
                mask: torch.BoolTensor) -> torch.Tensor:

        broadcast_mask = mask.unsqueeze(-1)
        num_elements = broadcast_mask.sum() * self.size
        mean = (tensor * broadcast_mask).sum() / num_elements
        masked_centered = (tensor - mean) * broadcast_mask
        std = torch.sqrt((masked_centered * masked_centered).sum() /
                         num_elements + util.tiny_value_of_dtype(tensor.dtype))
        return (self.gamma * (tensor - mean) /
                (std + util.tiny_value_of_dtype(tensor.dtype)) + self.beta)
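
A functional sketch of the same normalization without the learned `gamma`/`beta`, which makes it easier to see that a single global mean/std is computed over all unmasked timesteps; the `eps` here stands in for `util.tiny_value_of_dtype`:

import torch

def masked_global_layer_norm(tensor, mask, eps=1e-13):
    broadcast_mask = mask.unsqueeze(-1)
    num_elements = broadcast_mask.sum() * tensor.size(-1)
    mean = (tensor * broadcast_mask).sum() / num_elements
    centered = (tensor - mean) * broadcast_mask
    std = torch.sqrt((centered * centered).sum() / num_elements + eps)
    return (tensor - mean) / (std + eps)

x = torch.randn(2, 3, 4)
mask = torch.tensor([[True, True, False],
                     [True, False, False]])
print(masked_global_layer_norm(x, mask).shape)  # torch.Size([2, 3, 4])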
Code Example #15
    def forward(self,
                tensors: List[torch.Tensor],
                mask: torch.BoolTensor = None) -> torch.Tensor:
        """
        Compute a weighted average of the `tensors`.  The input tensors can be any shape
        with at least two dimensions, but must all be the same shape.

        When `do_layer_norm=True`, the `mask` is required input.  If the `tensors` are
        dimensioned  `(dim_0, ..., dim_{n-1}, dim_n)`, then the `mask` is dimensioned
        `(dim_0, ..., dim_{n-1})`, as in the typical case with `tensors` of shape
        `(batch_size, timesteps, dim)` and `mask` of shape `(batch_size, timesteps)`.

        When `do_layer_norm=False` the `mask` is ignored.
        """
        if len(tensors) != self.mixture_size:
            raise ConfigurationError(
                "{} tensors were passed, but the module was initialized to "
                "mix {} tensors.".format(len(tensors), self.mixture_size))

        def _do_layer_norm(tensor, broadcast_mask, num_elements_not_masked):
            tensor_masked = tensor * broadcast_mask
            mean = torch.sum(tensor_masked) / num_elements_not_masked
            variance = (torch.sum(
                ((tensor_masked - mean) * broadcast_mask)**2) /
                        num_elements_not_masked)
            return (tensor - mean) / torch.sqrt(
                variance + util.tiny_value_of_dtype(variance.dtype))

        normed_weights = torch.nn.functional.softmax(torch.cat(
            [parameter for parameter in self.scalar_parameters]),
                                                     dim=0)
        normed_weights = torch.split(normed_weights, split_size_or_sections=1)

        if not self.do_layer_norm:
            pieces = []
            for weight, tensor in zip(normed_weights, tensors):
                pieces.append(weight * tensor)
            return self.gamma * sum(pieces)

        else:
            assert mask is not None
            broadcast_mask = mask.unsqueeze(-1)
            input_dim = tensors[0].size(-1)
            num_elements_not_masked = torch.sum(mask) * input_dim

            pieces = []
            for weight, tensor in zip(normed_weights, tensors):
                pieces.append(weight * _do_layer_norm(tensor, broadcast_mask,
                                                      num_elements_not_masked))
            return self.gamma * sum(pieces)
Code Example #16
    def forward(self, tokens: torch.Tensor, mask: torch.BoolTensor = None):
        if self._dropout:
            tokens = self._dropout(tokens)

        # Our input has shape `(batch_size, num_tokens, embedding_dim)`; we pool over the
        # `num_tokens` dimension.
        if self._pool == 'max':
            if mask is not None:
                # Fill masked positions with a very negative value so they never win the max.
                tokens.masked_fill_(~mask.unsqueeze(-1), -19260817)
            ret = torch.max(tokens, dim=1)[0]
            return ret
        elif self._pool == 'sum':
            if mask is not None:
                tokens = tokens * mask.unsqueeze(-1)
            ret = tokens.sum(1)
            return ret
        elif self._pool == 'mean':
            if mask is not None:
                tokens = tokens * mask.unsqueeze(-1)
            summed = tokens.sum(1)
            if mask is not None:
                lengths = get_lengths_from_binary_sequence_mask(mask)
                length_mask = lengths > 0

                # Set any length 0 to 1, to avoid dividing by zero.
                lengths = torch.max(lengths, lengths.new_ones(1))
            else:
                lengths = tokens.new_full((1, ), fill_value=tokens.size(1))
                length_mask = None

            summed = summed / lengths.unsqueeze(-1).float()

            if length_mask is not None:
                summed = summed * (length_mask > 0).unsqueeze(-1)
            return summed
        else:
            raise NotImplementedError
Code Example #17
    def forward(self,
                inputs: torch.Tensor,
                mask: torch.BoolTensor = None) -> torch.Tensor:
        batch_size, seq_len, _ = inputs.size()
        state = torch.zeros(batch_size, self.hidden_dim, device=inputs.device)
        norm = torch.cat([param.flatten()
                          for param in self.parameters()]).norm(p=2)

        states = []
        for time in range(seq_len):
            inp = inputs[:, time, :]
            preact = self.in_proj(inp) + self.hid_proj(state)
            state = torch.tanh(self.scale / norm * preact)
            states.append(state)
        stacked = torch.stack(states, dim=1)
        if mask is None:
            return stacked
        return stacked * mask.unsqueeze(dim=-1)
Code Example #18
File: hardmaxattention.py  Project: yang-233/mmsa
    def forward(self,
                query: torch.Tensor,
                key: torch.Tensor,
                value: torch.Tensor,
                mask: torch.BoolTensor = None):
        """[summary]

        Parameters
        ----------
        query : torch.Tensor
            [shape : (batch_size, query_len, hidden_dim)]
        key : torch.Tensor
            [shape : (batch_size, key_len, hidden_dim)]
        value : torch.Tensor
            [shape : (batch_size, key_len, hidden_dim)]
        mask : torch.BoolTensor
            [shape : (batch_size, query_len, key_len)]
        """
        batch_size = query.shape[0]
        Q, K, V = self.fc_query(query), self.fc_key(key), self.fc_value(value)
        #query = [batch size, query len, hid dim]
        #key = [batch size, key len, hid dim]
        #value = [batch size, key len, hid dim]

        K = K.permute(0, 2, 1)
        #Q = [batch size, query len, hidden dim]
        #K = [batch size, hidden dim, key len]
        #V = [batch size, key len, hidden dim]

        energy = torch.matmul(Q, K) / self.scale
        #energy = [batch size, query len, key len]

        if mask is not None:
            # mask: (batch_size, query_len, key_len), matching energy; True marks
            # positions to be masked out.
            energy = energy.masked_fill(mask, float("-inf"))

        score = _hardmax(energy)  #[batch size, query len, key len]

        res = torch.matmul(score, V)
        #res = [batch size, query len, hidden dim]

        res = self.fc_output(res)
        #res = [batch size, query len, hidden dim]
        return res, score
Code Example #19
    def forward(self, inputs: torch.Tensor, mask: torch.BoolTensor = None) -> torch.Tensor:
        """
        # Parameters

        inputs : `torch.Tensor`, required.
            A tensor of shape (batch_size, timesteps, input_dim)
        mask : `torch.BoolTensor`, optional (default = None).
            A tensor of shape (batch_size, timesteps).

        # Returns

        A tensor of shape (batch_size, timesteps, output_dim).
        """
        if mask is None:
            return self._feedforward(inputs)
        else:
            outputs = self._feedforward(inputs)
            return outputs * mask.unsqueeze(dim=-1)
Code Example #20
File: tmp.py  Project: PYART0/PyART-demo
def masked_softmax(
    vector: torch.Tensor, mask: torch.BoolTensor, dim: int = -1, memory_efficient: bool = False,
) -> torch.Tensor:
    if mask is None:
        result = torch.nn.functional.softmax(vector, dim=dim)
    else:
        while mask.dim() < vector.dim():
            mask = mask.unsqueeze(1)
        if not memory_efficient:
            result = torch.nn.functional.softmax(vector * mask, dim=dim)
            result = result * mask
            result = result / (
                result.sum(dim=dim, keepdim=True) + tiny_value_of_dtype(result.dtype)
            )
        else:
            masked_vector = vector.masked_fill(~mask, min_value_of_dtype(vector.dtype))
            result = torch.nn.functional.softmax(masked_vector, dim=dim)
    return result
Code Example #21
    def _greedy_decode(
            arc_scores: torch.Tensor, arc_tag_logits: torch.Tensor,
            mask: torch.BoolTensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Decodes the head and head tag predictions by decoding the unlabeled arcs
        independently for each word and then again, predicting the head tags of
        these greedily chosen arcs independently.

        # Parameters

        arc_scores : `torch.Tensor`, required.
            A tensor of shape (batch_size, sequence_length, sequence_length) used to generate
            a distribution over attachments of a given word to all other words.
        arc_tag_logits : `torch.Tensor`, required.
            A tensor of shape (batch_size, sequence_length, sequence_length, num_tags) used to
            generate a distribution over tags for each arc.
        mask : `torch.BoolTensor`, required.
            A mask of shape (batch_size, sequence_length).

        # Returns

        arc_probs : `torch.Tensor`
            A tensor of shape (batch_size, sequence_length, sequence_length) representing the
            probability of an arc being present for this edge.
        arc_tag_probs : `torch.Tensor`
            A tensor of shape (batch_size, sequence_length, sequence_length, num_tags)
            representing the distribution over edge tags for a given edge.
        """
        # Mask the diagonal, because we don't want self edges.
        inf_diagonal_mask = torch.diag(
            arc_scores.new(mask.size(1)).fill_(-numpy.inf))
        arc_scores = arc_scores + inf_diagonal_mask
        # shape (batch_size, sequence_length, sequence_length, num_tags)
        arc_tag_logits = arc_tag_logits + inf_diagonal_mask.unsqueeze(
            0).unsqueeze(-1)
        # Mask padded tokens, because we only want to consider actual word -> word edges.
        minus_mask = ~mask.unsqueeze(2)
        arc_scores.masked_fill_(minus_mask, -numpy.inf)
        arc_tag_logits.masked_fill_(minus_mask.unsqueeze(-1), -numpy.inf)
        # shape (batch_size, sequence_length, sequence_length)
        arc_probs = arc_scores.sigmoid()
        # shape (batch_size, sequence_length, sequence_length, num_tags)
        arc_tag_probs = torch.nn.functional.softmax(arc_tag_logits, dim=-1)
        return arc_probs, arc_tag_probs
Code Example #22
    def forward(
        self,
        sequence_tensor: torch.FloatTensor,
        span_indices: torch.LongTensor,
        span_indices_mask: torch.BoolTensor = None,
    ) -> torch.FloatTensor:
        # shape (batch_size, sequence_length, 1)

        global_attention_logits = torch.matmul(
            sequence_tensor,
            torch.zeros(self.input_dim, 1).to(sequence_tensor.device))

        # shape (batch_size, sequence_length, embedding_dim + 1)
        concat_tensor = torch.cat([sequence_tensor, global_attention_logits],
                                  -1)

        concat_output, span_mask = util.batched_span_select(
            concat_tensor, span_indices)

        # Shape: (batch_size, num_spans, max_batch_span_width, embedding_dim)
        span_embeddings = concat_output[:, :, :, :-1]
        # Shape: (batch_size, num_spans, max_batch_span_width)
        span_attention_logits = concat_output[:, :, :, -1]

        # Shape: (batch_size, num_spans, max_batch_span_width)
        span_attention_weights = util.masked_softmax(span_attention_logits,
                                                     span_mask)

        # Do a weighted sum of the embedded spans with
        # respect to the normalised attention distributions.
        # Shape: (batch_size, num_spans, embedding_dim)
        attended_text_embeddings = util.weighted_sum(span_embeddings,
                                                     span_attention_weights)

        if span_indices_mask is not None:
            # Above we were masking the widths of spans with respect to the max
            # span width in the batch. Here we are masking the spans which were
            # originally passed in as padding.
            return attended_text_embeddings * span_indices_mask.unsqueeze(-1)

        return attended_text_embeddings
Code Example #23
    def forward(
        self,
        q: torch.Tensor,
        k: torch.Tensor,
        v: torch.Tensor,
        mask: torch.BoolTensor = None,
    ) -> torch.Tensor:
        """
        Args:
            q (torch.Tensor): [batch size, len_q, d_model]
            k (torch.Tensor): [batch size, len_k, d_model]
            v (torch.Tensor): [batch size, len_v, d_model]
            mask (torch.BoolTensor, optional): [batch size, len_q, len_k]. Defaults to None.

        Returns:
            torch.Tensor: [batch size, len_q, d_v * n_head]
        """
        batch_size, len_q, len_k, len_v = q.size(0), q.size(1), k.size(
            1), v.size(1)

        residual = q

        qs = self.w_qs(q).view(batch_size, len_q, self.n_head, self.d_k)
        ks = self.w_ks(k).view(batch_size, len_k, self.n_head, self.d_k)
        vs = self.w_vs(v).view(batch_size, len_v, self.n_head, self.d_v)

        # output = [batch size, n_head, len_q, d_v]
        if mask is not None:
            mask = mask.unsqueeze(1).repeat(1, self.n_head, 1, 1)

        output, _ = self.attn(
            qs.transpose(1, 2),
            ks.transpose(1, 2),
            vs.transpose(1, 2),
            mask=mask,
        )

        # [batch size, len_q, n_head * d_v]
        output = output.transpose(1, 2).reshape(batch_size, len_q, -1)
        output = self.layer_norm(output + residual)

        return output
Code Example #24
    def forward(self, inputs: torch.Tensor, mask: torch.BoolTensor = None) -> torch.Tensor:
        """
        # Parameters

        inputs : `torch.Tensor`, required.
            A tensor of shape (batch_size, timesteps, input_dim)
        mask : `torch.BoolTensor`, optional (default = `None`).
            A tensor of shape (batch_size, timesteps).

        # Returns

        A tensor of shape (batch_size, timesteps, output_dim),
        where output_dim = input_dim.
        """
        if mask is None:
            return inputs
        else:
            # We should mask out the output instead of the input.
            # But here, output = input, so we directly mask out the input.
            return inputs * mask.unsqueeze(dim=-1)
Code Example #25
    def forward(self, token_embeddings: torch.Tensor, mask: torch.BoolTensor):

        # Convolutions need transposed input
        transposed_embeddings = torch.transpose(token_embeddings, 1, 2)

        # Broadcast the mask to the feature dimension; masked_fill needs its inverse.
        mask_for_fill = ~mask.unsqueeze(1)

        if self._return_all_layers:
            # outputs will be [[all forward layers], [all backward layers]]
            layer_outputs: List[List[torch.Tensor]] = [[], []]
        else:
            # outputs will be [forward final layer, backward final layer]
            outputs: List[torch.Tensor] = []

        for k, blocks in enumerate([self._forward_residual_blocks, self._backward_residual_blocks]):
            out = transposed_embeddings
            for block in blocks:
                out = block(out.masked_fill(mask_for_fill, 0.0))
                if self._return_all_layers:
                    layer_outputs[k].append(out)
            if not self._return_all_layers:
                outputs.append(out)

        if self._return_all_layers:
            return [
                torch.cat([fwd, bwd], dim=1).transpose(1, 2)
                for fwd, bwd in zip(*layer_outputs)
            ]
        # Concatenate forward and backward, then transpose back
        return torch.cat(outputs, dim=1).transpose(1, 2)
Code Example #26
    def get_attention_masks(self, mask: torch.BoolTensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Returns 2 masks of shape (batch_size, timesteps, timesteps) representing
        1) non-padded elements, and
        2) elements of the sequence which are permitted to be involved in attention at a given timestep.
        """
        device = mask.device
        # Forward case:
        timesteps = mask.size(1)
        # Shape (1, timesteps, timesteps)
        subsequent = subsequent_mask(timesteps, device)
        # Broadcasted logical and - we want zero
        # elements where either we have padding from the mask,
        # or we aren't allowed to use the timesteps.
        # Shape (batch_size, timesteps, timesteps)
        forward_mask = mask.unsqueeze(-1) & subsequent
        # Backward case - exactly the same, but transposed.
        backward_mask = forward_mask.transpose(1, 2)

        return forward_mask, backward_mask
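
The `subsequent_mask` helper is assumed above; a plausible minimal implementation (position `i` may attend to positions `<= i`) would be:

import torch

def subsequent_mask(size, device="cpu"):
    # Lower-triangular (1, size, size) bool matrix: True where attention is allowed.
    return torch.tril(torch.ones(size, size, dtype=torch.bool, device=device)).unsqueeze(0)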
Code Example #27
def masked_softmax(
    vector: torch.Tensor,
    mask: torch.BoolTensor,
    dim: int = -1,
    memory_efficient: bool = False,
) -> torch.Tensor:
    """
    `torch.nn.functional.softmax(vector)` does not work if some elements of `vector` should be
    masked.  This performs a softmax on just the non-masked portions of `vector`.  Passing
    `None` in for the mask is also acceptable; you'll just get a regular softmax.

    `vector` can have an arbitrary number of dimensions; the only requirement is that `mask` is
    broadcastable to `vector's` shape.  If `mask` has fewer dimensions than `vector`, we will
    unsqueeze on dimension 1 until they match.  If you need a different unsqueezing of your mask,
    do it yourself before passing the mask into this function.

    If `memory_efficient` is set to true, we will simply use a very large negative number for those
    masked positions so that the probabilities of those positions would be approximately 0.
    This is not accurate in math, but works for most cases and consumes less memory.

    In the case that the input vector is completely masked and `memory_efficient` is false, this function
    returns an array of `0.0`. This behavior may cause `NaN` if this is used as the last layer of
    a model that uses categorical cross-entropy loss. Instead, if `memory_efficient` is true, this function
    will treat every element as equal, and do softmax over equal numbers.
    """
    if mask is None:
        result = torch.nn.functional.softmax(vector, dim=dim)
    else:
        while mask.dim() < vector.dim():
            mask = mask.unsqueeze(1)
        if not memory_efficient:
            # To limit numerical errors from large vector elements outside the mask, we zero these out.
            result = torch.nn.functional.softmax(vector * mask, dim=dim)
            result = result * mask
            result = result / (
                result.sum(dim=dim, keepdim=True) + tiny_value_of_dtype(result.dtype)
            )
        else:
            masked_vector = vector.masked_fill(~mask, min_value_of_dtype(vector.dtype))
            result = torch.nn.functional.softmax(masked_vector, dim=dim)
    return result
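
A usage sketch contrasting the two modes, assuming `tiny_value_of_dtype` and `min_value_of_dtype` are in scope; the printed numbers are approximate:

import torch

logits = torch.tensor([[2.0, 1.0, 0.5]])
mask = torch.tensor([[True, True, False]])

print(masked_softmax(logits, mask))
# tensor([[0.7311, 0.2689, 0.0000]]) -- the masked position gets exactly zero

print(masked_softmax(logits, mask, memory_efficient=True))
# The masked position instead gets a vanishingly small, but nonzero, probability.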
Code Example #28
    def forward(self, key: torch.Tensor, mask: torch.BoolTensor = None):
        """[summary]
        Parameters
        ----------
        query : torch.Tensor
            [shape : (batch_size, query_len, hidden_dim)]
        key : torch.Tensor
            [shape : (batch_size, key_len, hidden_dim)]
        mask : torch.BoolTensor
            [shape : (batch_size, query_len, key_len)]
        """
        batch_size = key.shape[0]
        query = self.query.expand((batch_size, 1, self.hidden_dim))

        Q, K = self.fc_query(query), self.fc_key(key)

        #query = [batch size, query len, hid dim]
        #key = [batch size, key len, hid dim]

        Q = Q.view(batch_size, -1, self.n_heads,
                   self.head_dim).permute(0, 2, 1, 3)
        #Q = [batch size, n heads, query len, head dim]
        K = K.view(batch_size, -1, self.n_heads, self.head_dim)
        #K = [batch size, key len, nheads, head dim]

        energy = torch.matmul(Q, K.permute(0, 2, 3, 1)) / self.scale

        #energy = [batch size, n heads, query len, key len]
        if mask is not None:
            # mask (batch_size, query_len, key_len)
            mask = mask.unsqueeze(1).expand(batch_size, self.n_heads, -1, -1)
            energy = energy.masked_fill(mask, float("-inf"))

        score = torch.softmax(energy, dim=-1)
        #score = [batch size, n heads, query len, key len]
        score = score.mean(dim=1)
        return score
Code Example #29
    def forward(self, token_embeddings: torch.Tensor, mask: torch.BoolTensor):

        # Convolutions need transposed input
        transposed_embeddings = torch.transpose(token_embeddings, 1, 2)

        # We need to broadcast the mask to feature dimension,
        # and to use masked_fill_ we need the inverse of the mask.
        mask_for_fill = ~mask.unsqueeze(1)

        if self._return_all_layers:
            # outputs will be [[all forward layers], [all backward layers]]
            layer_outputs: List[List[torch.Tensor]] = [[], []]
        else:
            # outputs will be [forward final layer, backward final layer]
            outputs: List[torch.Tensor] = []

        for k, blocks in enumerate(
            [self._forward_residual_blocks, self._backward_residual_blocks]):
            out = transposed_embeddings
            # Due to zero padding for backward sequences, we need
            # to ensure that the input has zeros everywhere where
            # there isn't a mask.
            for block in blocks:
                out = block(out.masked_fill(mask_for_fill, 0.0))
                if self._return_all_layers:
                    layer_outputs[k].append(out)
            if not self._return_all_layers:
                outputs.append(out)

        if self._return_all_layers:
            return [
                torch.cat([fwd, bwd], dim=1).transpose(1, 2)
                for fwd, bwd in zip(*layer_outputs)
            ]
        else:
            # Concatenate forward and backward, then transpose back
            return torch.cat(outputs, dim=1).transpose(1, 2)
Code Example #30
File: losses.py  Project: Chung-I/tsm-rnnt
    def _calculate_edit_distance(self, output_symbols: torch.LongTensor,
                                 targets: torch.LongTensor,
                                 mask: torch.BoolTensor) -> torch.FloatTensor:
        batch_size, max_pred_len = output_symbols.size()
        _, max_len = targets.size()

        distances = output_symbols.new_zeros(batch_size, max_pred_len, max_len)
        distances[:, :, 0] = torch.arange(max_pred_len)
        distances[:, 0, :] = torch.arange(max_len)
        distances = distances.float()

        for i in range(1, max_pred_len):
            for j in range(1, max_len):
                diagonal = distances[:, i-1, j-1] + \
                    self.dsub * (output_symbols[:, i-1] != targets[:, j-1]).float()
                comp = torch.stack(
                    (diagonal, distances[:, i - 1, j] + self.dins,
                     distances[:, i, j - 1] + self.ddel),
                    dim=-1)
                distances[:, i, j], _ = torch.min(comp, dim=-1)

        #edit_distance_mask = self._get_edit_distance_mask(mask, output_symbols)
        distances = distances.masked_fill(~mask.unsqueeze(1), float('inf'))
        return distances
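
A standalone sketch of the same dynamic program with the costs (`dsub`, `dins`, `ddel`, attributes of the class above) made explicit arguments, plus a tiny worked example:

import torch

def edit_distance_table(output_symbols, targets, dsub=1.0, dins=1.0, ddel=1.0):
    batch_size, max_pred_len = output_symbols.size()
    _, max_len = targets.size()
    distances = output_symbols.new_zeros(batch_size, max_pred_len, max_len).float()
    distances[:, :, 0] = torch.arange(max_pred_len)
    distances[:, 0, :] = torch.arange(max_len)
    for i in range(1, max_pred_len):
        for j in range(1, max_len):
            sub = distances[:, i - 1, j - 1] + dsub * (output_symbols[:, i - 1] != targets[:, j - 1]).float()
            ins = distances[:, i - 1, j] + dins
            dele = distances[:, i, j - 1] + ddel
            distances[:, i, j] = torch.min(torch.stack((sub, ins, dele), dim=-1), dim=-1).values
    return distances

# Cell [i, j] holds the distance between the first i predicted symbols and the
# first j target symbols, so the last cell below compares the length-3 prefixes
# [5, 6, 7] and [5, 9, 7]: a single substitution.
pred = torch.tensor([[5, 6, 7, 8]])
gold = torch.tensor([[5, 9, 7, 8]])
print(edit_distance_table(pred, gold)[0, -1, -1])  # tensor(1.)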