Example #1
    def compute_features(self, tokens, drop_out=False):
        """Compute features on the provided tokens

        The *drop_out* parameter enables drop-out on features that have a
        positive drop-out ratio. This should only be used during training.
        """

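        # Use stems when a stemming resource exists for the language;
        # otherwise fall back to each token's normalized value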
        if resource_exists(self.language, STEMS):
            tokens = [
                Token(t.value, t.start, t.end,
                      stem=stem(t.normalized_value, self.language))
                for t in tokens]
        else:
            tokens = [Token(t.value, t.start, t.end, stem=t.normalized_value)
                      for t in tokens]
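        # Per-token cache shared by all feature computations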
        cache = [{TOKEN_NAME: token} for token in tokens]
        features = []
        random_state = check_random_state(self.config.random_seed)
        for i in range(len(tokens)):
            token_features = UnupdatableDict()
            for feature in self.features:
                f_drop_out = feature.drop_out
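                # During training, randomly skip this feature according
                # to its drop-out ratio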
                if drop_out and random_state.rand() < f_drop_out:
                    continue
                value = feature.compute(i, cache)
                if value is not None:
                    token_features[feature.name] = value
            features.append(token_features)
        return features
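The drop-out loop above is the heart of this method: during training, every feature whose drop_out ratio is positive is skipped with that probability, which acts as feature-level regularization. Below is a minimal self-contained sketch of the same pattern; the toy Feature class and plain dicts stand in for the snips-nlu internals (resource_exists, the stem handling, UnupdatableDict), so the names here are illustrative only:

import numpy as np

class Feature:
    # Toy stand-in for the library's feature objects (illustration only)
    def __init__(self, name, fn, drop_out=0.0):
        self.name = name
        self.compute = fn
        self.drop_out = drop_out

def compute_features(tokens, features, drop_out=False, seed=42):
    random_state = np.random.RandomState(seed)
    all_features = []
    for i in range(len(tokens)):
        token_features = {}
        for feature in features:
            # Skip the feature with probability drop_out during training
            if drop_out and random_state.rand() < feature.drop_out:
                continue
            value = feature.compute(i, tokens)
            if value is not None:
                token_features[feature.name] = value
        all_features.append(token_features)
    return all_features

features = [
    Feature("is_digit", lambda i, toks: "1" if toks[i].isdigit() else None),
    Feature("prefix_2", lambda i, toks: toks[i][:2], drop_out=0.5),
]
print(compute_features(["hello", "42"], features, drop_out=True))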
Example #2
    def test_spans_to_tokens_indexes(self):
        # Given
        spans = [{
            START: 0,
            END: 1
        }, {
            START: 2,
            END: 6
        }, {
            START: 5,
            END: 6
        }, {
            START: 9,
            END: 15
        }]
        tokens = [
            Token(value="abc", start=0, end=3, stem="abc"),
            Token(value="def", start=4, end=7, stem="def"),
            Token(value="ghi", start=10, end=13, stem="ghi")
        ]

        # When
        indexes = _spans_to_tokens_indexes(spans, tokens)

        # Then
        expected_indexes = [[0], [0, 1], [1], [2]]
        self.assertListEqual(indexes, expected_indexes)
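The expected output pins down the semantics: each span maps to the indexes of every token it overlaps, so the span (2, 6) crosses both "abc" (0, 3) and "def" (4, 7) and yields [0, 1]. Here is a minimal sketch consistent with this test, assuming half-open ranges on both sides; the library's actual implementation may differ in its details:

from collections import namedtuple

Token = namedtuple("Token", ["value", "start", "end"])  # simplified stand-in
START, END = "start", "end"  # assumed values of the span keys

def _spans_to_tokens_indexes(spans, tokens):
    indexes = []
    for span in spans:
        # A token matches a span iff the two half-open ranges intersect
        indexes.append([
            i for i, token in enumerate(tokens)
            if span[START] < token.end and token.start < span[END]
        ])
    return indexes

tokens = [Token("abc", 0, 3), Token("def", 4, 7), Token("ghi", 10, 13)]
spans = [{START: 0, END: 1}, {START: 2, END: 6},
         {START: 5, END: 6}, {START: 9, END: 15}]
print(_spans_to_tokens_indexes(spans, tokens))  # [[0], [0, 1], [1], [2]]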
Example #3
    def test_utterance_to_sample(self, mocked_positive_tagging):
        # Given
        language = LANGUAGE_EN

        def mock_positive_tagging(_, slot, slot_size):
            return [INSIDE_PREFIX + slot for _ in range(slot_size)]

        mocked_positive_tagging.side_effect = mock_positive_tagging
        slot_name = "animal"
        query_data = [{
            "text": "i am a "
        }, {
            "text": "beautiful bird",
            "slot_name": slot_name
        }]
        expected_tagging = [
            OUTSIDE, OUTSIDE, OUTSIDE, INSIDE_PREFIX + slot_name,
            INSIDE_PREFIX + slot_name
        ]
        expected_tokens = [
            Token(value='i', start=0, end=1),
            Token(value='am', start=2, end=4),
            Token(value='a', start=5, end=6),
            Token(value='beautiful', start=7, end=16),
            Token(value='bird', start=17, end=21)
        ]
        expected_sample = {"tokens": expected_tokens, "tags": expected_tagging}

        # When
        sample = utterance_to_sample(query_data, TaggingScheme.IO, language)

        # Then
        self.assertEqual(sample, expected_sample)
Example #4
    def test_should_tokenize_symbols(self):
        # Given
        language = LANGUAGE_EN
        text = "$$ % !!"

        # When
        tokens = tokenize(text, language)

        # Then
        expected_tokens = [
            Token(value='$$', start=0, end=2, stem=None),
            Token(value='%', start=3, end=4, stem=None),
            Token(value='!!', start=5, end=7, stem=None)
        ]
        self.assertListEqual(tokens, expected_tokens)
Example #5
    def test_should_tokenize_literals(self):
        # Given
        language = LANGUAGE_EN
        text = "Hello Beautiful World"

        # When
        tokens = tokenize(text, language)

        # Then
        expected_tokens = [
            Token(value='Hello', start=0, end=5, stem=None),
            Token(value='Beautiful', start=6, end=15, stem=None),
            Token(value='World', start=16, end=21, stem=None)
        ]
        self.assertListEqual(tokens, expected_tokens)
Example #6
def utterance_to_sample(query_data, tagging_scheme, language):
    tokens, tags = [], []
    current_length = 0
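    # Walk the utterance chunk by chunk, shifting token offsets so they
    # index into the full utterance text rather than the chunk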
    for chunk in query_data:
        chunk_tokens = tokenize(chunk[TEXT], language)
        tokens += [Token(t.value, current_length + t.start,
                         current_length + t.end) for t in chunk_tokens]
        current_length += len(chunk[TEXT])
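        # Chunks without a slot get negative (OUTSIDE) tags; slot chunks
        # get positive tags according to the tagging scheme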
        if SLOT_NAME not in chunk:
            tags += negative_tagging(len(chunk_tokens))
        else:
            tags += positive_tagging(tagging_scheme, chunk[SLOT_NAME],
                                     len(chunk_tokens))
    return {TOKENS: tokens, TAGS: tags}
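Applied to the query_data from Example #3 under TaggingScheme.IO, this produces three OUTSIDE tags for the tokens of "i am a " and two I-animal tags for "beautiful bird", exactly the sample asserted there. Note that current_length accumulates raw chunk lengths, trailing whitespace included, which is what places "beautiful" at offset 7 in the full utterance.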