Code example #1
    def test_spans_to_tokens_indexes(self):
        # Given
        spans = [{
            START: 0,
            END: 1
        }, {
            START: 2,
            END: 6
        }, {
            START: 5,
            END: 6
        }, {
            START: 9,
            END: 15
        }]
        tokens = [
            Token(value="abc", start=0, end=3),
            Token(value="def", start=4, end=7),
            Token(value="ghi", start=10, end=13)
        ]

        # When
        indexes = _spans_to_tokens_indexes(spans, tokens)

        # Then
        expected_indexes = [[0], [0, 1], [1], [2]]
        self.assertListEqual(indexes, expected_indexes)
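A minimal sketch of the helper under test, assuming `_spans_to_tokens_indexes` returns, for each span, the indexes of every token whose character range overlaps it (the overlap semantics are inferred from the expected output above, not confirmed by the library):

    def _spans_to_tokens_indexes(spans, tokens):
        # Collect, for each span, the indexes of all overlapping tokens
        indexes = []
        for span in spans:
            span_indexes = [
                i for i, token in enumerate(tokens)
                if span[START] < token.end and span[END] > token.start
            ]
            indexes.append(span_indexes)
        return indexes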
Code example #2
    def test_utterance_to_sample(self, mocked_positive_tagging):
        # Given
        language = LANGUAGE_EN

        def mock_positive_tagging(_, slot, slot_size):
            return [INSIDE_PREFIX + slot for _ in range(slot_size)]

        mocked_positive_tagging.side_effect = mock_positive_tagging
        slot_name = "animal"
        query_data = [{
            "text": "i am a "
        }, {
            "text": "beautiful bird",
            "slot_name": slot_name
        }]
        expected_tagging = [
            OUTSIDE, OUTSIDE, OUTSIDE, INSIDE_PREFIX + slot_name,
            INSIDE_PREFIX + slot_name
        ]
        expected_tokens = [
            Token(value='i', start=0, end=1),
            Token(value='am', start=2, end=4),
            Token(value='a', start=5, end=6),
            Token(value='beautiful', start=7, end=16),
            Token(value='bird', start=17, end=21)
        ]
        expected_sample = {"tokens": expected_tokens, "tags": expected_tagging}

        # When
        sample = utterance_to_sample(query_data, TaggingScheme.IO, language)

        # Then
        self.assertEqual(sample, expected_sample)
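The test mocks `positive_tagging`, so the IO-scheme constants never surface directly. Assuming the conventional values `OUTSIDE = "O"` and `INSIDE_PREFIX = "I-"` (an assumption about the library's internals), the expected tagging reads as one tag per token, aligned with `expected_tokens`:

    # Hypothetical rendering, assuming OUTSIDE = "O" and INSIDE_PREFIX = "I-"
    expected_tagging = ["O", "O", "O", "I-animal", "I-animal"]
    #  aligned with:    i    am   a    beautiful   bird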
Code example #3
    def test_should_tokenize_symbols(self):
        # Given
        language = LANGUAGE_EN
        text = "$$ % !!"

        # When
        tokens = tokenize(text, language)

        # Then
        expected_tokens = [
            Token(value='$$', start=0, end=2),
            Token(value='%', start=3, end=4),
            Token(value='!!', start=5, end=7)
        ]
        self.assertListEqual(tokens, expected_tokens)
Code example #4
    def test_should_tokenize_literals(self):
        # Given
        language = LANGUAGE_EN
        text = "Hello Beautiful World"

        # When
        tokens = tokenize(text, language)

        # Then
        expected_tokens = [
            Token(value='Hello', start=0, end=5),
            Token(value='Beautiful', start=6, end=15),
            Token(value='World', start=16, end=21)
        ]
        self.assertListEqual(tokens, expected_tokens)
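In both tokenization tests the start/end offsets slice straight back into the original string. A hypothetical round-trip check (not part of the test suite):

    text = "Hello Beautiful World"
    for token in tokenize(text, LANGUAGE_EN):
        # every token's offsets recover its surface form
        assert text[token.start:token.end] == token.value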
Code example #5
File: feature_factory.py  Project: wangdf62/snips-nlu
    def _transform(self, tokens):
        # Stem or normalize each token value, depending on the configuration
        if self.use_stemming:
            light_tokens = (stem_token(t, self.language) for t in tokens)
        else:
            light_tokens = (normalize_token(t) for t in tokens)
        # Re-index: offsets assume the light values are joined by single spaces
        current_index = 0
        transformed_tokens = []
        for light_token in light_tokens:
            transformed_token = Token(value=light_token,
                                      start=current_index,
                                      end=current_index + len(light_token))
            transformed_tokens.append(transformed_token)
            current_index = transformed_token.end + 1
        return transformed_tokens
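A design note on the loop above: `_transform` deliberately discards the original character offsets and re-indexes the light (stemmed or normalized) values as if they were joined by single spaces. A hypothetical illustration, assuming normalization simply lowercases the value:

    tokens = [Token(value="Dogs", start=0, end=4),
              Token(value="bark", start=7, end=11)]
    # _transform would yield contiguous offsets, one space apart:
    # [Token(value="dogs", start=0, end=4),
    #  Token(value="bark", start=5, end=9)]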
Code example #6
    def test_log_inference_weights(self):
        # Given
        self.maxDiff = None  # pylint: disable=invalid-name
        text = "this is a slot in a text"
        tokens = [
            Token("this", 0, 0),
            Token("is", 0, 0),
            Token("a", 0, 0),
            Token("slot", 0, 0),
            Token("in", 0, 0),
            Token("a", 0, 0),
            Token("text", 0, 0),
        ]
        features = [
            {
                "ngram_1": "this",
                "is_first": "1",
            },
            {
                "ngram_1": "is",
                "common": "1",
            },
            {
                "ngram_1": "a"
            },
            {
                "ngram_1": "slot",
            },
            {
                "ngram_1": "in",
            },
            {
                "ngram_1": "a",
            },
            {
                "ngram_1": "text",
            },
        ]
        tags = ["O", "O", "B-slot", "I-slot", "O", "O", "O"]
        tags = [_encode_tag(t) for t in tags]

        transitions_weights = {
            (_encode_tag("O"), _encode_tag("O")): 2,
            (_encode_tag("O"), _encode_tag("B-slot")): 1,
            (_encode_tag("B-slot"), _encode_tag("I-slot")): 2,
            (_encode_tag("B-slot"), _encode_tag("O")): 1.5,
        }

        states_weights = {
            ("ngram_1:this", _encode_tag("O")): 5,
            ("ngram_1:this", _encode_tag("B-slot")): -2,
            ("ngram_1:slot", _encode_tag("B-slot")): 5,
            ("ngram_1:slot", _encode_tag("I-slot")): -3,
            ("ngram_1:slot", _encode_tag("O")): -1
        }

        # pylint: disable=super-init-not-called
        class MockedSlotFiller(CRFSlotFiller):
            def __init__(self, transition_features, state_features):
                mocked_model = MagicMock()
                type(mocked_model).transition_features_ = PropertyMock(
                    return_value=transition_features)
                type(mocked_model).state_features_ = PropertyMock(
                    return_value=state_features)
                self.crf_model = mocked_model
                self.slot_name_mapping = 1

            def __del__(self):
                pass

        slot_filler = MockedSlotFiller(transitions_weights, states_weights)

        # When
        log = slot_filler.log_inference_weights(text=text,
                                                tokens=tokens,
                                                features=features,
                                                tags=tags)

        # Then
        expected_log = """Feature weights for "this is a slot in a text":

# Token "this" (tagged as O):

Transition weights to next tag:
- (O, O) -> 2
- (B-slot, O) -> 1.5

Feature weights:
- (ngram_1:this, O) -> 5
- (ngram_1:this, B-slot) -> -2

Features not seen at train time:
- is_first:1


# Token "is" (tagged as O):

Transition weights from previous tag:
- (O, O) -> 2
- (O, B-slot) -> 1

Transition weights to next tag:
- (O, B-slot) -> 1

No feature weights !

Features not seen at train time:
- common:1
- ngram_1:is


# Token "a" (tagged as B-slot):

Transition weights from previous tag:
- (O, O) -> 2
- (O, B-slot) -> 1

Transition weights to next tag:
- (B-slot, I-slot) -> 2

No feature weights !

Features not seen at train time:
- ngram_1:a


# Token "slot" (tagged as I-slot):

Transition weights from previous tag:
- (B-slot, I-slot) -> 2
- (B-slot, O) -> 1.5

Transition weights to next tag:
- (O, O) -> 2
- (B-slot, O) -> 1.5

Feature weights:
- (ngram_1:slot, B-slot) -> 5
- (ngram_1:slot, I-slot) -> -3
- (ngram_1:slot, O) -> -1


# Token "in" (tagged as O):

No transition from previous tag seen at train time !

Transition weights to next tag:
- (O, O) -> 2
- (B-slot, O) -> 1.5

No feature weights !

Features not seen at train time:
- ngram_1:in


# Token "a" (tagged as O):

Transition weights from previous tag:
- (O, O) -> 2
- (O, B-slot) -> 1

Transition weights to next tag:
- (O, O) -> 2
- (B-slot, O) -> 1.5

No feature weights !

Features not seen at train time:
- ngram_1:a


# Token "text" (tagged as O):

Transition weights from previous tag:
- (O, O) -> 2
- (O, B-slot) -> 1

No feature weights !

Features not seen at train time:
- ngram_1:text"""
        self.assertEqual(expected_log, log)
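The `_encode_tag` helper used throughout this test belongs to the CRF slot filler's internals. A plausible sketch (an assumption; the actual helper may differ) is a base64 round-trip that keeps arbitrary tag names safe for the underlying CRF library, which would be consistent with the decoded tag names shown in the expected log:

    import base64

    def _encode_tag(tag):
        # hypothetical: encode tags so arbitrary names survive the CRF layer
        return base64.b64encode(tag.encode("utf8"))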