コード例 #1
0
def test_label_encoder_known(label_encoder):
    input_ = 'symbols/namesake/named_after'
    sample = [
        'people/deceased_person/place_of_death',
        'symbols/name_source/namesakes'
    ]
    sample.append(input_)
    label_encoder = LabelEncoder(sample)
    output = label_encoder.encode(input_)
    assert label_encoder.decode(output) == input_
コード例 #2
0
class ActionSequenceEncoder:
    def __init__(self, samples: Samples, token_threshold: int):
        reserved_labels: List[Union[Unknown,
                                    CloseVariadicFieldRule]] = [Unknown()]
        reserved_labels.append(CloseVariadicFieldRule())
        self._rule_encoder = LabelEncoder(samples.rules,
                                          reserved_labels=reserved_labels,
                                          unknown_index=0)
        self._node_type_encoder = LabelEncoder(samples.node_types)
        reserved_labels = [Unknown()]
        self._token_encoder = LabelEncoder(samples.tokens,
                                           min_occurrences=token_threshold,
                                           reserved_labels=reserved_labels,
                                           unknown_index=0)
        self.value_to_idx: Dict[str, List[int]] = {}
        for kind, value in self._token_encoder.vocab[len(reserved_labels):]:
            idx = self._token_encoder.encode((kind, value))
            if value not in self.value_to_idx:
                self.value_to_idx[value] = []
            self.value_to_idx[value].append(idx)

    def decode(self, tensor: torch.LongTensor, reference: List[Token]) \
            -> Optional[ActionSequence]:
        """
        Return the action sequence corresponding to the tensor

        Parameters
        ----------
        tensor: torch.LongTensor
            The encoded tensor with the shape of
            (len(action_sequence), 3). Each action will be encoded by the tuple
            of (ID of the applied rule, ID of the inserted token,
            the index of the word copied from the reference).
            The padding value should be -1.
        reference

        Returns
        -------
        Optional[action_sequence]
            The action sequence corresponding to the tensor
            None if the action sequence cannot be generated.
        """

        retval = ActionSequence()
        for i in range(tensor.shape[0]):
            if tensor[i, 0] > 0:
                # ApplyRule
                rule = self._rule_encoder.decode(tensor[i, 0])
                retval.eval(ApplyRule(rule))
            elif tensor[i, 1] > 0:
                # GenerateToken
                kind, value = self._token_encoder.decode(tensor[i, 1])
                retval.eval(GenerateToken(kind, value))
            elif tensor[i, 2] >= 0:
                # GenerateToken (Copy)
                index = int(tensor[i, 2].numpy())
                if index >= len(reference):
                    logger.debug("reference index is out-of-bounds")
                    return None
                token = reference[index]
                retval.eval(GenerateToken(token.kind, token.raw_value))
            else:
                logger.debug("invalid actions")
                return None

        return retval

    def encode_action(self,
                      action_sequence: ActionSequence,
                      reference: List[Token]) \
            -> Optional[torch.Tensor]:
        """
        Return the tensor encoded the action sequence

        Parameters
        ----------
        action_sequence: action_sequence
            The action_sequence containing action sequence to be encoded
        reference

        Returns
        -------
        Optional[torch.Tensor]
            The encoded tensor. The shape of tensor is
            (len(action_sequence) + 1, 4). Each action will be encoded by
            the tuple of (ID of the node types, ID of the applied rule,
            ID of the inserted token, the index of the word copied from
            the reference. The padding value should be -1.
            None if the action sequence cannot be encoded.
        """
        reference_value = [token.raw_value for token in reference]
        action = \
            torch.ones(len(action_sequence.action_sequence) + 1, 4).long() \
            * -1
        for i in range(len(action_sequence.action_sequence)):
            a = action_sequence.action_sequence[i]
            parent = action_sequence.parent(i)
            if parent is not None:
                parent_action = \
                    cast(ApplyRule,
                         action_sequence.action_sequence[parent.action])
                parent_rule = cast(ExpandTreeRule, parent_action.rule)
                action[i, 0] = self._node_type_encoder.encode(
                    parent_rule.children[parent.field][1])

            if isinstance(a, ApplyRule):
                rule = a.rule
                action[i, 1] = self._rule_encoder.encode(rule)
            else:
                encoded_token = \
                    int(self._token_encoder.encode((a.kind, a.value)).numpy())

                if encoded_token != 0:
                    action[i, 2] = encoded_token

                # Unknown token
                if a.value in reference_value:
                    # TODO use kind in reference
                    action[i, 3] = \
                        reference_value.index(cast(str, a.value))

                if encoded_token == 0 and \
                        a.value not in reference_value:
                    logger.debug("cannot encode token")
                    return None

        head = action_sequence.head
        length = len(action_sequence.action_sequence)
        if head is not None:
            head_action = \
                cast(ApplyRule,
                     action_sequence.action_sequence[head.action])
            head_rule = cast(ExpandTreeRule, head_action.rule)
            action[length, 0] = self._node_type_encoder.encode(
                head_rule.children[head.field][1])

        return action

    def encode_raw_value(self, text: str) -> List[int]:
        if text in self.value_to_idx:
            return self.value_to_idx[text]
        else:
            return [self._token_encoder.encode(Unknown()).item()]

    def batch_encode_raw_value(self, texts: List[str]) -> List[List[int]]:
        return [self.encode_raw_value(text) for text in texts]

    def encode_parent(self, action_sequence) -> torch.Tensor:
        """
        Return the tensor encoded the action sequence

        Parameters
        ----------
        action_sequence: action_sequence
            The action_sequence containing action sequence to be encoded

        Returns
        -------
        torch.Tensor
            The encoded tensor. The shape of `action` tensor is
            (len(action_sequence) + 1, 4). Each action will be encoded by
            the tuple of (ID of the parent node types, ID of the
            parent-action's rule, the index of the parent action,
            the index of the field).
            The padding value should be -1.
        """
        parent_tensor = \
            torch.ones(len(action_sequence.action_sequence) + 1, 4).long() \
            * -1

        for i in range(len(action_sequence.action_sequence)):
            parent = action_sequence.parent(i)
            if parent is not None:
                parent_action = \
                    cast(ApplyRule,
                         action_sequence.action_sequence[parent.action])
                parent_rule = cast(ExpandTreeRule, parent_action.rule)
                parent_tensor[i, 0] = \
                    self._node_type_encoder.encode(parent_rule.parent)
                parent_tensor[i, 1] = self._rule_encoder.encode(parent_rule)
                parent_tensor[i, 2] = parent.action
                parent_tensor[i, 3] = parent.field

        head = action_sequence.head
        length = len(action_sequence.action_sequence)
        if head is not None:
            head_action = \
                cast(ApplyRule,
                     action_sequence.action_sequence[head.action])
            head_rule = cast(ExpandTreeRule, head_action.rule)
            parent_tensor[length, 0] = \
                self._node_type_encoder.encode(head_rule.parent)
            parent_tensor[length, 1] = self._rule_encoder.encode(head_rule)
            parent_tensor[length, 2] = head.action
            parent_tensor[length, 3] = head.field

        return parent_tensor

    def encode_tree(self, action_sequence: ActionSequence) \
            -> Union[torch.Tensor, torch.Tensor]:
        """
        Return the tensor adjacency matrix of the action sequence

        Parameters
        ----------
        action_sequence: action_sequence
            The action_sequence containing action sequence to be encoded

        Returns
        -------
        depth: torch.Tensor
            The depth of each action. The shape is (len(action_sequence),).
        adjacency_matrix: torch.Tensor
            The encoded tensor. The shape of tensor is
            (len(action_sequence), len(action_sequence)). If i th action is
            a parent of j th action, (i, j) element will be 1. the element
            will be 0 otherwise.
        """
        L = len(action_sequence.action_sequence)
        depth = torch.zeros(L)
        m = torch.zeros(L, L)

        for i in range(L):
            p = action_sequence.parent(i)
            if p is not None:
                depth[i] = depth[p.action] + 1
                m[p.action, i] = 1

        return depth, m

    def encode_each_action(self,
                           action_sequence: ActionSequence,
                           reference: List[Token],
                           max_arity: int) \
            -> torch.Tensor:
        """
        Return the tensor encoding the each action

        Parameters
        ----------
        action_sequence: action_sequence
            The action_sequence containing action sequence to be encoded
        reference
        max_arity: int

        Returns
        -------
        torch.Tensor
            The encoded tensor. The shape of tensor is
            (len(action_sequence), max_arity + 1, 3).
            [:, 0, 0] encodes the parent node type. [:, i, 0] encodes
            the node type of (i - 1)-th child node. [:, i, 1] encodes
            the token of (i - 1)-th child node. [:, i, 2] encodes the reference
            index of (i - 1)-th child node.
            The padding value is -1.
        """
        L = len(action_sequence.action_sequence)
        reference_value = [token.raw_value for token in reference]
        retval = torch.ones(L, max_arity + 1, 3).long() * -1
        for i, action in enumerate(action_sequence.action_sequence):
            if isinstance(action, ApplyRule):
                if isinstance(action.rule, ExpandTreeRule):
                    # Encode parent
                    retval[i, 0, 0] = \
                        self._node_type_encoder.encode(action.rule.parent)
                    # Encode children
                    for j, (_, child) in enumerate(
                            action.rule.children[:max_arity]):
                        retval[i, j + 1, 0] = \
                            self._node_type_encoder.encode(child)
            else:
                gentoken: GenerateToken = action
                kind = gentoken.kind
                value = gentoken.value
                encoded_token = \
                    int(self._token_encoder.encode((kind, value)).numpy())

                if encoded_token != 0:
                    retval[i, 1, 1] = encoded_token

                if value in reference_value:
                    # TODO use kind in reference
                    retval[i, 1, 2] = \
                        reference_value.index(cast(str, value))

        return retval

    def encode_path(self, action_sequence: ActionSequence, max_depth: int) \
            -> torch.Tensor:
        """
        Return the tensor encoding the each action

        Parameters
        ----------
        action_sequence: action_sequence
            The action_sequence containing action sequence to be encoded
        max_depth: int

        Returns
        -------
        torch.Tensor
            The encoded tensor. The shape of tensor is
            (len(action_sequence), max_depth).
            [i, :] encodes the path from the root node to i-th node.
            Each node represented by the rule id.
            The padding value is -1.
        """
        L = len(action_sequence.action_sequence)
        retval = torch.ones(L, max_depth).long() * -1
        for i in range(L):
            parent_opt = action_sequence.parent(i)
            if parent_opt is not None:
                p = action_sequence.action_sequence[parent_opt.action]
                if isinstance(p, ApplyRule):
                    retval[i, 0] = self._rule_encoder.encode(p.rule)
                retval[i, 1:] = retval[parent_opt.action, :max_depth - 1]

        return retval