Example #1
def align_tokens(tokens_in: List[Text], token_end: int,
                 token_start: int) -> List[Token]:
    """Align sub-tokens of Language model with tokens return by the WhitespaceTokenizer.

    As a language model might split a single word into multiple tokens, we need to make
    sure that the start and end value of first and last sub-token matches the
    start and end value of the token return by the WhitespaceTokenizer as the
    entities are using those start and end values.
    """

    tokens_out = []

    current_token_offset = token_start

    for index, string in enumerate(tokens_in):
        if index == 0:
            if index == len(tokens_in) - 1:
                s_token_end = token_end
            else:
                s_token_end = current_token_offset + len(string)
            tokens_out.append(Token(string, token_start, end=s_token_end))
        elif index == len(tokens_in) - 1:
            tokens_out.append(
                Token(string, current_token_offset, end=token_end))
        else:
            tokens_out.append(
                Token(string,
                      current_token_offset,
                      end=current_token_offset + len(string)))

        current_token_offset += len(string)

    return tokens_out
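A minimal usage sketch of align_tokens (the sub-tokens and offsets are illustrative; it assumes rasa is installed so that Token can be imported as in the later examples):

from rasa.nlu.tokenizers.tokenizer import Token

# "sentence", spanning characters 10-18 of the original text, was split by the
# language model into the sub-tokens "sen" and "tence"; align their offsets to
# the start and end of the whitespace token
aligned = align_tokens(["sen", "tence"], token_end=18, token_start=10)

assert [(t.text, t.start, t.end) for t in aligned] == [("sen", 10, 13), ("tence", 13, 18)]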
Example #2
def test_tokens_comparison():
    x = Token("hello", 0)
    y = Token("Hello", 0)

    assert x == x
    assert y < x

    assert x != 1

    with pytest.raises(TypeError):
        assert y < "a"
Example #3
def test_tokens_comparison():
    from rasa.nlu.tokenizers.tokenizer import Token

    x = Token("hello", 0)
    y = Token("Hello", 0)

    assert x == x
    assert y < x

    assert x != 1

    with pytest.raises(TypeError):
        assert y < "a"
Example #4
def test_count_vector_featurizer_using_tokens(tokens, expected):
    from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import (
        CountVectorsFeaturizer, )

    ftr = CountVectorsFeaturizer({
        "token_pattern": r"(?u)\b\w+\b",
        "return_sequence": True
    })

    # use an empty string instead of a real text string to make sure the
    # count vector can only come from the `tokens` feature;
    # using `message.text` would not give the correct result

    tokens_feature = [Token(i, 0) for i in tokens]

    train_message = Message("")
    train_message.set("tokens", tokens_feature)
    # this is needed for a valid training example
    train_message.set("intent", "bla")
    data = TrainingData([train_message])

    ftr.train(data)

    test_message = Message("")
    test_message.set("tokens", tokens_feature)

    ftr.process(test_message)

    assert np.all(
        test_message.get("text_sparse_features").toarray()[0] == expected)
Example #5
def test_lookup_tables_without_use_word_boundaries(sentence, tokens, expected,
                                                   labeled_tokens):
    from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer
    from rasa.nlu.tokenizers.tokenizer import Token

    lookups = [
        {
            "name": "cites",
            "elements": ["北京", "上海", "广州", "深圳", "杭州"],
        },
        {
            "name": "dates",
            "elements": ["昨天", "今天", "明天", "后天"],
        },
    ]
    ftr = RegexFeaturizer({"use_word_boundaries": False})
    training_data = TrainingData()
    training_data.lookup_tables = lookups
    ftr.train(training_data)

    # adds tokens to the message
    message = Message(data={TEXT: sentence})
    message.set(TOKENS_NAMES[TEXT],
                [Token(word, start) for (word, start) in tokens])

    sequence_features, sentence_features = ftr._features_for_patterns(
        message, TEXT)
    assert np.allclose(sequence_features.toarray(), expected[:-1], atol=1e-10)
    assert np.allclose(sentence_features.toarray(), expected[-1], atol=1e-10)

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)
Example #6
def test_encode_entities__with_entity_roles_and_groups():

    # create fake message that has been tokenized and entities have been extracted
    text = "I am flying from London to Paris"
    tokens = [
        Token(text=match.group(), start=match.start())
        for match in re.finditer(r"\S+", text)
    ]
    entity_tags = ["city", f"city{ENTITY_LABEL_SEPARATOR}to"]
    entities = [
        {
            ENTITY_ATTRIBUTE_TYPE: entity_tags[0],
            ENTITY_ATTRIBUTE_VALUE: "London",
            ENTITY_ATTRIBUTE_START: 17,
            ENTITY_ATTRIBUTE_END: 23,
        },
        {
            ENTITY_ATTRIBUTE_TYPE: entity_tags[1],
            ENTITY_ATTRIBUTE_VALUE: "Paris",
            ENTITY_ATTRIBUTE_START: 27,
            ENTITY_ATTRIBUTE_END: 32,
        },
    ]
    message = Message({
        TEXT: text,
        TOKENS_NAMES[TEXT]: tokens,
        ENTITIES: entities
    })

    # create a lookup table that has seen this message
    precomputations = MessageContainerForCoreFeaturization()
    precomputations.add(message)

    # instantiate matching domain and single state featurizer
    domain = Domain(
        intents=[],
        entities=entity_tags,
        slots=[],
        responses={},
        forms={},
        action_names=[],
    )
    f = SingleStateFeaturizer()
    f.prepare_for_training(domain)

    # encode!
    encoded = f.encode_entities(entity_data={
        TEXT: text,
        ENTITIES: entities
    },
                                precomputations=precomputations)

    # check
    assert len(f.entity_tag_specs) == 1
    tags_to_ids = f.entity_tag_specs[0].tags_to_ids
    for idx, entity_tag in enumerate(entity_tags):
        tags_to_ids[entity_tag] = idx + 1  # hence, city -> 1, city#to -> 2
    assert sorted(list(encoded.keys())) == [ENTITY_TAGS]
    assert np.all(encoded[ENTITY_TAGS][0].features == [[0], [0], [0], [0], [1],
                                                       [0], [2]])
Example #7
async def _get_e2e_entity_evaluation_result(
    processor: "MessageProcessor",
    tracker: DialogueStateTracker,
    prediction: PolicyPrediction,
) -> Optional[EntityEvaluationResult]:
    previous_event = tracker.events[-1]

    if isinstance(previous_event, SlotSet):
        # UserUttered events with entities can be followed by SlotSet events
        # if slots are defined in the domain
        previous_event = tracker.get_last_event_for(
            (UserUttered, ActionExecuted))

    if isinstance(previous_event, UserUttered):
        entities_predicted_by_policies = [
            entity for prediction_event in prediction.events
            if isinstance(prediction_event, EntitiesAdded)
            for entity in prediction_event.entities
        ]
        entity_targets = previous_event.entities
        if entity_targets or entities_predicted_by_policies:
            text = previous_event.text
            if text:
                parsed_message = await processor.parse_message(
                    UserMessage(text=text))
                if parsed_message:
                    tokens = [
                        Token(text[start:end], start, end)
                        for start, end in parsed_message.get(
                            TOKENS_NAMES[TEXT], [])
                    ]
                    return EntityEvaluationResult(
                        entity_targets, entities_predicted_by_policies, tokens,
                        text)
    return None
Example #8
    def _token_from_offset(
        self, text: bytes, offset: int, encoded_sentence: bytes
    ) -> Token:
        return Token(
            text.decode(DEFAULT_ENCODING),
            self._byte_to_char_offset(encoded_sentence, offset),
        )
Example #9
def test_create_train_load_and_process(
    create_lexical_syntactic_featurizer: Callable[
        [Dict[Text, Any]], LexicalSyntacticFeaturizer
    ],
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
    resource_lexical_syntactic_featurizer: Resource,
    feature_config: List[Text],
) -> None:

    config = {"alias": "lsf", "features": feature_config}
    featurizer = create_lexical_syntactic_featurizer(config)

    sentence = "Hello how are you"
    tokens = [
        Token(text=match[0], start=match.start())
        for match in re.finditer(r"\w+", sentence)
    ]
    message = Message(data={TOKENS_NAMES[TEXT]: tokens})

    featurizer.train(TrainingData([message]))

    loaded_featurizer = LexicalSyntacticFeaturizer.load(
        config={**LexicalSyntacticFeaturizer.get_default_config(), **config},
        model_storage=default_model_storage,
        execution_context=default_execution_context,
        resource=resource_lexical_syntactic_featurizer,
    )

    assert loaded_featurizer._feature_to_idx_dict == featurizer._feature_to_idx_dict
Example #10
def test_only_featurizes_text_attribute(
    create_lexical_syntactic_featurizer: Callable[
        [Dict[Text, Any]], LexicalSyntacticFeaturizer
    ]
):
    # build a message with tokens for lots of attributes
    sentence = "hello goodbye hello"
    tokens = [
        Token(text=match[0], start=match.start())
        for match in re.finditer(r"\w+", sentence)
    ]
    message_data = {}
    for attribute in MESSAGE_ATTRIBUTES + DENSE_FEATURIZABLE_ATTRIBUTES:
        message_data[attribute] = sentence
        message_data[TOKENS_NAMES[attribute]] = tokens
    message = Message(data=message_data)

    # train and process
    featurizer = create_lexical_syntactic_featurizer(
        {"alias": "lsf", "features": [["BOS"]]}
    )
    featurizer.train(TrainingData([message]))
    featurizer.process([message])
    assert len(message.features) == 1
    assert message.features[0].attribute == TEXT
Example #11
def test_count_vector_featurizer_using_tokens(tokens, expected):

    ftr = CountVectorsFeaturizer()

    # use an empty string instead of a real text string to make sure the
    # count vector can only come from the `tokens` feature;
    # using `message.text` would not give the correct result

    tokens_feature = [Token(i, 0) for i in tokens]

    train_message = Message("")
    train_message.set(TOKENS_NAMES[TEXT], tokens_feature)

    data = TrainingData([train_message])

    ftr.train(data)

    test_message = Message("")
    test_message.set(TOKENS_NAMES[TEXT], tokens_feature)

    ftr.process(test_message)

    seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [])
    assert np.all(seq_vec.toarray()[0] == expected)
    assert sen_vec is not None
Example #12
def test_count_vector_featurizer_using_tokens(
    tokens: List[Text],
    expected: List[List[int]],
    create_featurizer: Callable[..., CountVectorsFeaturizerGraphComponent],
):
    ftr = create_featurizer()

    # use an empty string instead of a real text string to make sure the
    # count vector can only come from the `tokens` feature;
    # using `message.text` would not give the correct result

    tokens_feature = [Token(i, 0) for i in tokens]

    train_message = Message(data={TEXT: ""})
    train_message.set(TOKENS_NAMES[TEXT], tokens_feature)

    data = TrainingData([train_message])

    ftr.train(data)
    ftr.process_training_data(data)

    seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features
    assert np.all(seq_vec.toarray()[0] == expected)
    assert sen_vec is not None
Example #13
def test_count_vector_featurizer_using_tokens(tokens, expected):

    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})

    # use an empty string instead of a real text string to make sure the
    # count vector can only come from the `tokens` feature;
    # using `message.text` would not give the correct result

    tokens_feature = [Token(i, 0) for i in tokens]

    train_message = Message("")
    train_message.set(TOKENS_NAMES[TEXT_ATTRIBUTE], tokens_feature)

    data = TrainingData([train_message])

    ftr.train(data)

    test_message = Message("")
    test_message.set(TOKENS_NAMES[TEXT_ATTRIBUTE], tokens_feature)

    ftr.process(test_message)

    assert np.all(
        test_message.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]).toarray()[0] ==
        expected)
Example #14
def test_process_multiple_messages(
    create_lexical_syntactic_featurizer: Callable[
        [Dict[Text, Any]], LexicalSyntacticFeaturizer
    ]
):
    # build a message with tokens for lots of attributes
    multiple_messages = []
    for sentence in ["hello", "hello there"]:
        tokens = [
            Token(text=match[0], start=match.start())
            for match in re.finditer(r"\w+", sentence)
        ]

        multiple_messages.append(Message(data={TOKENS_NAMES[TEXT]: tokens}))

    # train and process
    featurizer = create_lexical_syntactic_featurizer(
        {"alias": "lsf", "features": [["prefix2"]]}
    )
    featurizer.train(TrainingData(multiple_messages))
    featurizer.process(multiple_messages)
    for message in multiple_messages:
        assert len(message.features) == 1
        assert message.features[0].attribute == TEXT

    # we know both texts were used for training if more than one feature has been
    # extracted, e.g. for the first message from which only the prefix "he" can be
    # extracted
    assert multiple_messages[0].features[0].features.shape[-1] > 1
Example #15
def test_warn_if_part_of_speech_features_cannot_be_computed(
    create_lexical_syntactic_featurizer: Callable[
        [Dict[Text, Any]], LexicalSyntacticFeaturizer
    ],
    sentence: Text,
    feature_config: Dict[Text, Any],
    expected_features: np.ndarray,
):

    featurizer = create_lexical_syntactic_featurizer(
        {"alias": "lsf", "features": feature_config}
    )

    # build the message - with tokens but *no* part-of-speech tags
    tokens = [
        Token(text=match[0], start=match.start())
        for match in re.finditer(r"\w+", sentence)
    ]
    message = Message(data={TOKENS_NAMES[TEXT]: tokens})

    # train
    with pytest.warns(
        UserWarning,
        match="Expected training data to include tokens with part-of-speech tags",
    ):
        featurizer.train(TrainingData([message]))
    assert not message.features

    # process
    with pytest.warns(None) as records:
        featurizer.process([message])
    assert len(records) == 0
    assert len(message.features) == 1
    feature = message.features[0]
    assert np.all(feature.features.todense() == expected_features)
Example #16
    def tokenize(self, message: Message, attribute: Text) -> List[Token]:
        doc = self.get_doc(message, attribute)

        return [
            Token(t.text,
                  t.idx,
                  lemma=t.lemma_,
                  data={POS_TAG_KEY: self._tag_of_token(t)}) for t in doc
        ]
Example #17
    def tokenize(self, message: Message, attribute: Text) -> List[Token]:
        import jieba

        text = message.get(attribute)

        tokenized = jieba.tokenize(text)
        tokens = [Token(word, start) for (word, start, end) in tokenized]

        return tokens
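For reference, a small sketch (assuming the jieba package is installed) of the (word, start, end) tuples that jieba.tokenize yields and that the list comprehension above unpacks into Token objects; the sentence is illustrative:

import jieba

text = "我想去北京"
for word, start, end in jieba.tokenize(text):
    # each tuple carries the word plus its character offsets in the original text,
    # so Token(word, start) preserves the alignment with the raw message text
    print(word, start, end)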
Example #18
    def tokenize(self, message: Message, attribute: Text) -> List[Token]:
        text = message.get(attribute)
        # character-level tokenization: one token per character
        tokenized = list(text)
        tokens = []
        offset = 0
        for word in tokenized:
            tokens.append(Token(word, offset))
            offset += len(word)

        return tokens
Example #19
    def tokenize(self, message: Message, attribute: Text) -> List[Token]:
        """Tokenizes the text of the provided attribute of the incoming message."""
        import jieba

        text = message.get(attribute)

        tokenized = jieba.tokenize(text)
        tokens = [Token(word, start) for (word, start, end) in tokenized]

        return self._apply_token_pattern(tokens)
Example #20
def test_ckip_featurizer(mock_POS_class):
    expected_pos_list = [[
        'Nd', 'Nd', 'VC', 'Di', 'Na', 'Na', 'VC', 'Di', 'Neu', 'Nf'
    ]]
    mock_POS_inst = mock_POS_class.return_value
    mock_POS_inst.return_value = expected_pos_list

    msg = Message.build(text="昨天晚上吃了牛肉燴飯花了120元", intent="eat_dinner")
    msg.set("tokens", [
        Token("昨天", 0),
        Token("晚上", 2),
        Token("吃", 4),
        Token("了", 5),
        Token("牛肉", 6),
        Token("燴飯", 8),
        Token("花", 10),
        Token("了", 11),
        Token("120", 12),
        Token("元", 15)
    ])

    from rukip.featurizer import CKIPFeaturizer
    component_config = {"model_path": "./data"}

    ckip_featurizer = CKIPFeaturizer(component_config)
    ner_features = ckip_featurizer.gen_ner_features(msg)
    assert ner_features == [['昨天', 'Nd'], ['晚上', 'Nd'], ['吃',
                                                         'VC'], ['了', 'Di'],
                            ['牛肉', 'Na'], ['燴飯', 'Na'], ['花', 'VC'],
                            ['了', 'Di'], ['120', 'Neu'], ['元', 'Nf']]

    component_config = {"model_path": "./data", "token_features": ["pos"]}
    ckip_featurizer = CKIPFeaturizer(component_config)
    ner_features = ckip_featurizer.gen_ner_features(msg)
    assert ner_features == [['Nd'], ['Nd'], ['VC'], ['Di'], ['Na'], ['Na'],
                            ['VC'], ['Di'], ['Neu'], ['Nf']]

    component_config = {"model_path": "./data", "token_features": ["word"]}
    ckip_featurizer = CKIPFeaturizer(component_config)
    ner_features = ckip_featurizer.gen_ner_features(msg)
    assert ner_features == [['昨天'], ['晚上'], ['吃'], ['了'], ['牛肉'], ['燴飯'],
                            ['花'], ['了'], ['120'], ['元']]
Example #21
    def tokenize(self, message: Message, attribute: Text) -> List[Token]:
        text = message.get(attribute)

        doc = self.nlp(text)
        tokens = [
            Token(
                text=t.text,
                start=t.idx,
            ) for t in doc if t.text and t.text.strip()
        ]
        return self._apply_token_pattern(tokens)
Example #22
    def tokenize(self, message: Message, attribute: Text) -> List[Token]:
        import jieba

        text = message.get(attribute)
        # lowercase the text only if the tokenizer is configured as case insensitive
        if not self.component_config.get("case_sensitive", True):
            text = text.lower()
        tokenized = jieba.tokenize(text)
        tokens = [Token(word, start) for (word, start, end) in tokenized]

        return self._apply_token_pattern(tokens)
Example #23
    def tokenize(self, message: Message, attribute: Text) -> List[Token]:
        from janome.tokenizer import Tokenizer
        text = message.get(attribute)
        text = self.removePunctuation(text)

        tokenizer = Tokenizer()
        tokenized = tokenizer.tokenize(text)
        tokens = []
        for token in tokenized:
            tokens.append(Token(token.node.surface, token.node.pos - 1))

        return self._apply_token_pattern(tokens)
Example #24
    def tokenize(self,
                 text: Text,
                 attribute: Text = TEXT_ATTRIBUTE) -> List[Token]:
        import jieba

        text = self.preprocess_text(text, attribute)
        tokenized = jieba.tokenize(text)
        tokens = [Token(word, start) for (word, start, end) in tokenized]

        self.add_cls_token(tokens, attribute)

        return tokens
Example #25
def test_align_token_features_convert():
    tokens = [
        Token("This", 0, data={NUMBER_OF_SUB_TOKENS: 1}),
        Token("is", 5, data={NUMBER_OF_SUB_TOKENS: 1}),
        Token("a", 8, data={NUMBER_OF_SUB_TOKENS: 1}),
        Token("sentence", 10, data={NUMBER_OF_SUB_TOKENS: 2}),
        Token("embedding", 19, data={NUMBER_OF_SUB_TOKENS: 4}),
    ]

    seq_dim = sum(t.get(NUMBER_OF_SUB_TOKENS) for t in tokens)
    token_features = np.random.rand(1, seq_dim, 64)

    actual_features = train_utils.align_token_features([tokens], token_features)

    assert np.all(actual_features[0][0] == token_features[0][0])
    assert np.all(actual_features[0][1] == token_features[0][1])
    assert np.all(actual_features[0][2] == token_features[0][2])
    # sentence is split into 2 sub-tokens
    assert np.all(actual_features[0][3] == np.mean(token_features[0][3:5], axis=0))
    # embedding is split into 4 sub-tokens
    assert np.all(actual_features[0][4] == np.mean(token_features[0][5:10], axis=0))
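The test above relies on sub-token features being averaged back into one vector per whitespace token. A minimal sketch of that averaging idea with a hypothetical helper (not rasa's align_token_features implementation):

import numpy as np

def pool_sub_token_features(sub_token_features: np.ndarray, sub_token_counts) -> np.ndarray:
    """Average consecutive sub-token rows into one row per original token."""
    pooled = []
    offset = 0
    for count in sub_token_counts:
        pooled.append(sub_token_features[offset:offset + count].mean(axis=0))
        offset += count
    return np.stack(pooled)

sub_token_features = np.random.rand(9, 64)  # 1 + 1 + 1 + 2 + 4 sub-tokens, 64 dims each
token_level = pool_sub_token_features(sub_token_features, [1, 1, 1, 2, 4])
assert token_level.shape == (5, 64)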
Example #26
    def tokenize(self, message: Message, attribute: Text) -> List[Token]:
        doc = self.get_doc(message, attribute)

        tokens = [
            Token(t.text,
                  t.idx,
                  lemma=t.lemma_,
                  data={POS_TAG_KEY: self._tag_of_token(t)}) for t in doc
            if t.text and t.text.strip()
        ]

        return self._apply_token_pattern(tokens)
Example #27
    def tokenize(self, message: Message, attribute: Text) -> List[Token]:
        text = message.get(attribute)
        encoded_input = self.tokenizer(text,
                                       return_offsets_mapping=True,
                                       add_special_tokens=False)
        token_position_pair = zip(encoded_input.tokens(),
                                  encoded_input["offset_mapping"])

        return [
            Token(text=token_text, start=position[0], end=position[1])
            for token_text, position in token_position_pair
        ]
Example #28
    def tokenize(self, text, msg_tokens):
        words = []
        for token in msg_tokens.entities:
            words.append(token.value)

        running_offset = 0
        tokens = []
        for word in words:
            word_offset = text.index(word, running_offset)
            word_len = len(word)
            running_offset = word_offset + word_len
            tokens.append(Token(word, word_offset))
        return tokens
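The running offset passed to text.index is what keeps repeated words from all resolving to the first occurrence. A standalone sketch with hypothetical inputs:

text = "hello there hello"
words = ["hello", "there", "hello"]

running_offset = 0
offsets = []
for word in words:
    word_offset = text.index(word, running_offset)
    running_offset = word_offset + len(word)
    offsets.append(word_offset)

assert offsets == [0, 6, 12]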
Example #29
    def tokenize(self, message: Message, attribute: Text) -> List[Token]:
        import MicroTokenizer

        text = message.get(attribute)

        tokenized = MicroTokenizer.cut(text)

        tokens = []
        offset = 0
        for word in tokenized:
            tokens.append(Token(word, offset))
            offset += len(word)

        return tokens
Example #30
    def tokenize(self, message: Message, attribute: Text) -> List[Token]:
        text = message.get(attribute)
        if self.lang in ('zh', 'ja'):
            r = query_data_by_url(
                cf.servant_by_lang(self.lang), 'tokens',
                {'lang': self.lang, 'sents': text})
            words = r['data']
            running_offset = 0
            tokens = []
            for word in words:
                word_offset = text.index(word, running_offset)
                word_len = len(word)
                running_offset = word_offset + word_len
                tokens.append(Token(word, word_offset))
            return tokens
        return super().tokenize(message, attribute)