def test_spans_to_tokens_indexes(self):
    """_spans_to_tokens_indexes should map each span to the indexes of all
    tokens it overlaps."""
    # Given
    span_bounds = [(0, 1), (2, 6), (5, 6), (9, 15)]
    spans = [{START: s, END: e} for (s, e) in span_bounds]
    tokens = [
        Token(value="abc", start=0, end=3),
        Token(value="def", start=4, end=7),
        Token(value="ghi", start=10, end=13)
    ]

    # When
    result = _spans_to_tokens_indexes(spans, tokens)

    # Then
    self.assertListEqual(result, [[0], [0, 1], [1], [2]])
def test_utterance_to_sample(self, mocked_positive_tagging):
    """utterance_to_sample should tokenize the utterance and produce one tag
    per token, using the positive tagging for slot chunks."""
    # Given
    language = LANGUAGE_EN

    def fake_positive_tagging(_, slot, slot_size):
        # One INSIDE-prefixed tag per token of the slot chunk
        return [INSIDE_PREFIX + slot] * slot_size

    mocked_positive_tagging.side_effect = fake_positive_tagging
    slot_name = "animal"
    query_data = [
        {"text": "i am a "},
        {"text": "beautiful bird", "slot_name": slot_name}
    ]
    inside_tag = INSIDE_PREFIX + slot_name
    expected_sample = {
        "tokens": [
            Token(value='i', start=0, end=1),
            Token(value='am', start=2, end=4),
            Token(value='a', start=5, end=6),
            Token(value='beautiful', start=7, end=16),
            Token(value='bird', start=17, end=21)
        ],
        "tags": [OUTSIDE, OUTSIDE, OUTSIDE, inside_tag, inside_tag]
    }

    # When
    sample = utterance_to_sample(query_data, TaggingScheme.IO, language)

    # Then
    self.assertEqual(sample, expected_sample)
def test_should_tokenize_symbols(self):
    """tokenize should split symbol runs on whitespace and keep offsets."""
    # Given
    language = LANGUAGE_EN
    text = "$$ % !!"

    # When
    tokens = tokenize(text, language)

    # Then
    self.assertListEqual(tokens, [
        Token(value='$$', start=0, end=2),
        Token(value='%', start=3, end=4),
        Token(value='!!', start=5, end=7)
    ])
def test_should_tokenize_literals(self):
    """tokenize should split plain words on whitespace and keep offsets."""
    # Given
    language = LANGUAGE_EN
    text = "Hello Beautiful World"

    # When
    tokens = tokenize(text, language)

    # Then
    self.assertListEqual(tokens, [
        Token(value='Hello', start=0, end=5),
        Token(value='Beautiful', start=6, end=15),
        Token(value='World', start=16, end=21)
    ])
def _transform(self, tokens):
    """Return new tokens whose values are stemmed (if ``use_stemming``) or
    normalized, with start/end offsets recomputed as if the values were laid
    out left to right separated by a one-character gap."""
    if self.use_stemming:
        values = (stem_token(tok, self.language) for tok in tokens)
    else:
        values = (normalize_token(tok) for tok in tokens)

    result = []
    offset = 0
    for value in values:
        end = offset + len(value)
        result.append(Token(value=value, start=offset, end=end))
        offset = end + 1  # one-character gap between consecutive tokens
    return result
def test_log_inference_weights(self):
    """log_inference_weights should render, per token, the transition weights
    from/to the neighbouring tags and the state-feature weights, flagging
    features and transitions unseen at train time."""
    # Given
    self.maxDiff = None  # pylint: disable=invalid-name
    text = "this is a slot in a text"
    # Offsets are irrelevant to weight logging, hence the dummy 0/0 bounds.
    tokens = [
        Token("this", 0, 0),
        Token("is", 0, 0),
        Token("a", 0, 0),
        Token("slot", 0, 0),
        Token("in", 0, 0),
        Token("a", 0, 0),
        Token("text", 0, 0),
    ]
    # One feature dict per token; only "ngram_1:this" and "ngram_1:slot"
    # appear in states_weights below, everything else is "not seen".
    features = [
        {
            "ngram_1": "this",
            "is_first": "1",
        },
        {
            "ngram_1": "is",
            "common": "1",
        },
        {
            "ngram_1": "a"
        },
        {
            "ngram_1": "slot",
        },
        {
            "ngram_1": "in",
        },
        {
            "ngram_1": "a",
        },
        {
            "ngram_1": "text",
        },
    ]
    tags = ["O", "O", "B-slot", "I-slot", "O", "O", "O"]
    tags = [_encode_tag(t) for t in tags]
    # Keys are (previous_tag, tag) pairs in encoded form.
    transitions_weights = {
        (_encode_tag("O"), _encode_tag("O")): 2,
        (_encode_tag("O"), _encode_tag("B-slot")): 1,
        (_encode_tag("B-slot"), _encode_tag("I-slot")): 2,
        (_encode_tag("B-slot"), _encode_tag("O")): 1.5,
    }
    # Keys are ("feature:value", encoded_tag) pairs.
    states_weights = {
        ("ngram_1:this", _encode_tag("O")): 5,
        ("ngram_1:this", _encode_tag("B-slot")): -2,
        ("ngram_1:slot", _encode_tag("B-slot")): 5,
        ("ngram_1:slot", _encode_tag("I-slot")): -3,
        ("ngram_1:slot", _encode_tag("O")): -1
    }

    # pylint: disable=super-init-not-called
    class MockedSlotFiller(CRFSlotFiller):
        # Deliberately skips CRFSlotFiller.__init__; only the attributes
        # read by log_inference_weights are provided.
        def __init__(self, transition_features, state_features):
            mocked_model = MagicMock()
            type(mocked_model).transition_features_ = PropertyMock(
                return_value=transition_features)
            type(mocked_model).state_features_ = PropertyMock(
                return_value=state_features)
            self.crf_model = mocked_model
            # Dummy value; presumably only needs to be set so attribute
            # access succeeds — TODO confirm against CRFSlotFiller.
            self.slot_name_mapping = 1

        def __del__(self):
            # No-op to keep the base-class destructor (which presumably
            # cleans up real model resources) from running on this mock.
            pass

    slot_filler = MockedSlotFiller(transitions_weights, states_weights)

    # When
    log = slot_filler.log_inference_weights(text=text, tokens=tokens,
                                            features=features, tags=tags)

    # Then
    # NOTE(review): the exact newline/blank-line layout of this literal was
    # lost in a whitespace-mangled copy of the source and has been
    # reconstructed; confirm it against the actual log_inference_weights
    # output / upstream test before relying on it.
    expected_log = """Feature weights for "this is a slot in a text":

# Token "this" (tagged as O):

Transition weights to next tag:
- (O, O) -> 2
- (B-slot, O) -> 1.5

Feature weights:
- (ngram_1:this, O) -> 5
- (ngram_1:this, B-slot) -> -2

Features not seen at train time:
- is_first:1

# Token "is" (tagged as O):

Transition weights from previous tag:
- (O, O) -> 2
- (O, B-slot) -> 1

Transition weights to next tag:
- (O, B-slot) -> 1

No feature weights !

Features not seen at train time:
- common:1
- ngram_1:is

# Token "a" (tagged as B-slot):

Transition weights from previous tag:
- (O, O) -> 2
- (O, B-slot) -> 1

Transition weights to next tag:
- (B-slot, I-slot) -> 2

No feature weights !

Features not seen at train time:
- ngram_1:a

# Token "slot" (tagged as I-slot):

Transition weights from previous tag:
- (B-slot, I-slot) -> 2
- (B-slot, O) -> 1.5

Transition weights to next tag:
- (O, O) -> 2
- (B-slot, O) -> 1.5

Feature weights:
- (ngram_1:slot, B-slot) -> 5
- (ngram_1:slot, I-slot) -> -3
- (ngram_1:slot, O) -> -1

# Token "in" (tagged as O):

No transition from previous tag seen at train time !

Transition weights to next tag:
- (O, O) -> 2
- (B-slot, O) -> 1.5

No feature weights !

Features not seen at train time:
- ngram_1:in

# Token "a" (tagged as O):

Transition weights from previous tag:
- (O, O) -> 2
- (O, B-slot) -> 1

Transition weights to next tag:
- (O, O) -> 2
- (B-slot, O) -> 1.5

No feature weights !

Features not seen at train time:
- ngram_1:a

# Token "text" (tagged as O):

Transition weights from previous tag:
- (O, O) -> 2
- (O, B-slot) -> 1

No feature weights !

Features not seen at train time:
- ngram_1:text"""
    self.assertEqual(expected_log, log)