Example #1
0
def test_encode_entities__with_entity_roles_and_groups():

    # create fake message that has been tokenized and entities have been extracted
    text = "I am flying from London to Paris"
    tokens = [
        Token(text=match.group(), start=match.start())
        for match in re.finditer(r"\S+", text)
    ]
    entity_tags = ["city", f"city{ENTITY_LABEL_SEPARATOR}to"]
    entities = [
        {
            ENTITY_ATTRIBUTE_TYPE: entity_tags[0],
            ENTITY_ATTRIBUTE_VALUE: "London",
            ENTITY_ATTRIBUTE_START: 17,
            ENTITY_ATTRIBUTE_END: 23,
        },
        {
            ENTITY_ATTRIBUTE_TYPE: entity_tags[1],
            ENTITY_ATTRIBUTE_VALUE: "Paris",
            ENTITY_ATTRIBUTE_START: 27,
            ENTITY_ATTRIBUTE_END: 32,
        },
    ]
    message = Message({
        TEXT: text,
        TOKENS_NAMES[TEXT]: tokens,
        ENTITIES: entities
    })

    # create a lookup table that has seen this message
    precomputations = MessageContainerForCoreFeaturization()
    precomputations.add(message)

    # instantiate matching domain and single state featurizer
    domain = Domain(
        intents=[],
        entities=entity_tags,
        slots=[],
        responses={},
        forms={},
        action_names=[],
    )
    f = SingleStateFeaturizer()
    f.prepare_for_training(domain)

    # encode!
    encoded = f.encode_entities(entity_data={
        TEXT: text,
        ENTITIES: entities
    },
                                precomputations=precomputations)

    # check
    assert len(f.entity_tag_specs) == 1
    tags_to_ids = f.entity_tag_specs[0].tags_to_ids
    for idx, entity_tag in enumerate(entity_tags):
        tags_to_ids[entity_tag] = idx + 1  # hence, city -> 1, city#to -> 2
    assert sorted(list(encoded.keys())) == [ENTITY_TAGS]
    assert np.all(encoded[ENTITY_TAGS][0].features == [[0], [0], [0], [0], [1],
                                                       [0], [2]])
Example #2
0
def test_single_state_featurizer_with_entity_roles_and_groups(
    unpacked_trained_moodbot_path: Text,
):
    from rasa.core.agent import Agent

    interpreter = Agent.load(unpacked_trained_moodbot_path).interpreter
    # TODO roles and groups are not supported in e2e yet
    domain = Domain(
        intents=[],
        entities=["city", f"city{ENTITY_LABEL_SEPARATOR}to"],
        slots=[],
        responses={},
        forms={},
        action_names=[],
    )
    f = SingleStateFeaturizer()
    f.prepare_for_training(domain, RegexInterpreter())
    encoded = f.encode_entities(
        {
            TEXT: "I am flying from London to Paris",
            ENTITIES: [
                {
                    ENTITY_ATTRIBUTE_TYPE: "city",
                    ENTITY_ATTRIBUTE_VALUE: "London",
                    ENTITY_ATTRIBUTE_START: 17,
                    ENTITY_ATTRIBUTE_END: 23,
                },
                {
                    ENTITY_ATTRIBUTE_TYPE: f"city{ENTITY_LABEL_SEPARATOR}to",
                    ENTITY_ATTRIBUTE_VALUE: "Paris",
                    ENTITY_ATTRIBUTE_START: 27,
                    ENTITY_ATTRIBUTE_END: 32,
                },
            ],
        },
        interpreter=interpreter,
    )
    assert sorted(list(encoded.keys())) == sorted([ENTITY_TAGS])
    assert np.all(
        encoded[ENTITY_TAGS][0].features == [[0], [0], [0], [0], [1], [0], [2]]
    )
def test_encode_entities__with_bilou_entity_roles_and_groups():

    # Instantiate domain and configure the single state featurizer for this domain.
    # Note that there are 2 entity tags here.
    entity_tags = ["city", f"city{ENTITY_LABEL_SEPARATOR}to"]
    domain = Domain(
        intents=[],
        entities=entity_tags,
        slots=[],
        responses={},
        forms={},
        action_names=[],
        data={},
    )
    f = SingleStateFeaturizer()
    f.prepare_for_training(domain, bilou_tagging=True)

    # (1) example with both entities

    # create message that has been tokenized and where entities have been extracted
    text = "I am flying from London to Paris"
    tokens = [
        Token(text=match.group(), start=match.start())
        for match in re.finditer(r"\S+", text)
    ]
    entities = [
        {
            ENTITY_ATTRIBUTE_TYPE: entity_tags[0],
            ENTITY_ATTRIBUTE_VALUE: "London",
            ENTITY_ATTRIBUTE_START: 17,
            ENTITY_ATTRIBUTE_END: 23,
        },
        {
            ENTITY_ATTRIBUTE_TYPE: entity_tags[1],
            ENTITY_ATTRIBUTE_VALUE: "Paris",
            ENTITY_ATTRIBUTE_START: 27,
            ENTITY_ATTRIBUTE_END: 32,
        },
    ]
    message = Message({TEXT: text, TOKENS_NAMES[TEXT]: tokens, ENTITIES: entities})

    # create a lookup table that has seen this message
    precomputations = MessageContainerForCoreFeaturization()
    precomputations.add(message)

    # encode!
    encoded = f.encode_entities(
        {TEXT: text, ENTITIES: entities},
        precomputations=precomputations,
        bilou_tagging=True,
    )
    assert sorted(list(encoded.keys())) == sorted([ENTITY_TAGS])
    assert np.all(
        encoded[ENTITY_TAGS][0].features == [[0], [0], [0], [0], [4], [0], [8]]
    )

    # (2) example with only the "city" entity

    # create message that has been tokenized and where entities have been extracted
    text = "I am flying to Saint Petersburg"
    tokens = [
        Token(text=match.group(), start=match.start())
        for match in re.finditer(r"\S+", text)
    ]
    entities = [
        {
            ENTITY_ATTRIBUTE_TYPE: "city",
            ENTITY_ATTRIBUTE_VALUE: "Saint Petersburg",
            ENTITY_ATTRIBUTE_START: 15,
            ENTITY_ATTRIBUTE_END: 31,
        }
    ]
    message = Message({TEXT: text, TOKENS_NAMES[TEXT]: tokens, ENTITIES: entities})

    # create a lookup table that has seen this message
    precomputations = MessageContainerForCoreFeaturization()
    precomputations.add(message)

    # encode!
    encoded = f.encode_entities(
        {TEXT: text, ENTITIES: entities},
        precomputations=precomputations,
        bilou_tagging=True,
    )
    assert sorted(list(encoded.keys())) == sorted([ENTITY_TAGS])
    assert np.all(encoded[ENTITY_TAGS][0].features == [[0], [0], [0], [0], [1], [3]])