Example #1
0
def test_load_multi_file_training_data(domain: Domain):
    """Check that the multi-file story resource loads like the single story file.

    The stories are loaded once from ``stories.yml`` and once from
    ``data/test_multifile_yaml_stories``, both with ``augmentation_factor=0``.
    The serialized states/labels, the surfaced features for intents, action
    names and entities, and the label ids of both loads must be equal.
    """

    def _load_sorted_trackers(resource):
        # Sort by sender id so both loads are compared in the same order.
        return sorted(
            training.load_data(resource, domain, augmentation_factor=0),
            key=lambda t: t.sender_id,
        )

    def _serialized_examples(tracker_featurizer, loaded_trackers):
        # One JSON string per training example, sorted so that the comparison
        # does not depend on the order in which examples were generated.
        states, actions = tracker_featurizer.training_states_and_labels(
            loaded_trackers, domain
        )
        return sorted(
            (
                json.dumps(sts + acts, sort_keys=True)
                for sts, acts in zip(states, actions)
            ),
            reverse=True,
        )

    featurizer = MaxHistoryTrackerFeaturizer(SingleStateFeaturizer(), max_history=2)
    trackers = _load_sorted_trackers("data/test_yaml_stories/stories.yml")
    hashed = _serialized_examples(featurizer, trackers)
    data, label_ids, _ = featurizer.featurize_trackers(
        trackers, domain, precomputations=None
    )

    featurizer_mul = MaxHistoryTrackerFeaturizer(SingleStateFeaturizer(), max_history=2)
    trackers_mul = _load_sorted_trackers("data/test_multifile_yaml_stories")
    # Use the featurizer created for the multi-file data here; the original
    # code reused the single-file featurizer for this step.
    hashed_mul = _serialized_examples(featurizer_mul, trackers_mul)
    data_mul, label_ids_mul, _ = featurizer_mul.featurize_trackers(
        trackers_mul, domain, precomputations=None
    )

    assert hashed == hashed_mul

    # we check for intents, action names and entities -- the features which
    # are included in the story files
    data = _surface_attributes(data)
    data_mul = _surface_attributes(data_mul)

    for attribute in [INTENT, ACTION_NAME, ENTITIES]:
        if attribute not in data or attribute not in data_mul:
            continue
        features = data[attribute]
        features_mul = data_mul[attribute]
        assert len(features) == len(features_mul)

        for idx_tracker, tracker_features in enumerate(features):
            for idx_dialogue, turns in enumerate(tracker_features):
                turns_mul = features_mul[idx_tracker][idx_dialogue]
                if turns is None or turns_mul is None:
                    # A missing feature must be missing on both sides.
                    assert turns == turns_mul
                    continue
                for idx_turn, turn in enumerate(turns):
                    assert np.all((turn == turns_mul[idx_turn]).data)

    assert np.all(label_ids == label_ids_mul)
Example #2
0
def test_can_read_test_story_with_entities(domain: Domain):
    """Verify the events parsed from the story combining ``or`` and entities.

    Two trackers are expected: the first ends with a plain ``greet`` turn, the
    second with a ``greet`` message carrying a ``name`` entity followed by the
    matching slot event.
    """
    loaded = training.load_data(
        "data/test_yaml_stories/story_with_or_and_entities.yml",
        domain,
        use_story_concatenation=False,
        tracker_limit=1000,
        remove_duplicates=False,
    )
    assert len(loaded) == 2

    # --- first tracker: greet without entities ---
    plain_greet = UserUttered(
        intent={"name": "greet", "confidence": 1.0},
        parse_data={
            "text": "/greet",
            "intent_ranking": [{"confidence": 1.0, "name": "greet"}],
            "intent": {"confidence": 1.0, "name": "greet"},
            "entities": [],
        },
    )
    first_events = loaded[0].events
    assert first_events[-3] == plain_greet
    assert first_events[-2] == ActionExecuted("utter_greet")
    assert first_events[-1] == ActionExecuted("action_listen")

    # --- second tracker: greet with a ``name`` entity, then the slot event ---
    greet_with_name = UserUttered(
        intent={"name": "greet", "confidence": 1.0},
        entities=[{"entity": "name", "value": "peter"}],
        parse_data={
            "text": "/greet",
            "intent_ranking": [{"confidence": 1.0, "name": "greet"}],
            "intent": {"confidence": 1.0, "name": "greet"},
            "entities": [{"entity": "name", "value": "peter"}],
        },
    )
    second_events = loaded[1].events
    assert second_events[-4] == greet_with_name
    assert second_events[-3] == SlotSet(key="name", value="peter")
    assert second_events[-2] == ActionExecuted("utter_greet")
    assert second_events[-1] == ActionExecuted("action_listen")
Example #3
0
def train_trackers(
    domain: Domain, stories_file: Text, augmentation_factor: int = 20
) -> List[TrackerWithCachedStates]:
    """Load the training trackers for ``stories_file`` using ``domain``.

    Args:
        domain: Domain passed through to ``training.load_data``.
        stories_file: Story resource to load.
        augmentation_factor: Forwarded unchanged to ``training.load_data``.

    Returns:
        Whatever ``training.load_data`` returns for these arguments.
    """
    trackers = training.load_data(
        stories_file, domain, augmentation_factor=augmentation_factor
    )
    return trackers
def test_generate_training_data_with_unused_checkpoints(domain: Domain):
    """Stories with unused checkpoints should yield exactly two trackers."""
    stories_path = "data/test_yaml_stories/stories_unused_checkpoints.yml"
    generated = training.load_data(stories_path, domain)

    # The file contains 3 training stories:
    #   - 2 with unused end checkpoints, which end up as training trackers
    #   - 1 with an unused start checkpoint, which is ignored
    expected_tracker_count = 2
    assert len(generated) == expected_tracker_count
Example #5
0
def test_can_read_test_story(domain: Domain):
    """Load ``stories.yml`` and verify the events of its five-event tracker."""
    load_options = {
        "use_story_concatenation": False,
        "tracker_limit": 1000,
        "remove_duplicates": False,
    }
    loaded = training.load_data(
        "data/test_yaml_stories/stories.yml", domain, **load_options
    )
    assert len(loaded) == 7

    # The tracker under test should be the story
    # simple_story_with_only_end -> show_it_all. Generated stories come back in
    # an unstable order, so it is picked by its number of events instead of by
    # position.
    five_event_trackers = [
        candidate for candidate in loaded if len(candidate.events) == 5
    ]
    story = five_event_trackers[0]

    expected_message = UserUttered(
        intent={INTENT_NAME_KEY: "simple", "confidence": 1.0},
        parse_data={
            "text": "/simple",
            "intent_ranking": [{"confidence": 1.0, INTENT_NAME_KEY: "simple"}],
            "intent": {"confidence": 1.0, INTENT_NAME_KEY: "simple"},
            "entities": [],
        },
    )

    assert story.events[0] == ActionExecuted("action_listen")
    assert story.events[1] == expected_message
    assert story.events[2] == ActionExecuted("utter_default")
    assert story.events[3] == ActionExecuted("utter_greet")
    assert story.events[4] == ActionExecuted("action_listen")
Example #6
0
def test_parsing_of_e2e_stories(domain: Domain):
    """Verify the full event sequence parsed from the hybrid end-to-end story."""

    def _listen():
        # A fresh listen event for every position in the expected sequence.
        return ActionExecuted(ACTION_LISTEN_NAME)

    loaded = training.load_data(
        "data/test_yaml_stories/stories_hybrid_e2e.yml",
        domain,
        use_story_concatenation=False,
        tracker_limit=1000,
        remove_duplicates=False,
    )

    assert len(loaded) == 1

    kenyan_entity = {"start": 19, "end": 25, "value": "Kenyan", "entity": "cuisine"}
    expected_events = [
        _listen(),
        # intent-based turn
        UserUttered(intent={"name": "simple"}),
        ActionExecuted("utter_greet"),
        _listen(),
        # text-based user turn answered by a text-based bot turn
        UserUttered(
            "I am looking for a Kenyan restaurant",
            {"name": None},
            entities=[kenyan_entity],
        ),
        ActionExecuted("", action_text="good for you"),
        _listen(),
        # intent-based turn
        UserUttered(intent={"name": "goodbye"}),
        ActionExecuted("utter_goodbye"),
        _listen(),
        # text-based turn without entities
        UserUttered("One more thing", {"name": None}),
        ActionExecuted("", action_text="What?"),
        _listen(),
    ]

    parsed_events = list(loaded[0].events)
    assert parsed_events == expected_events
Example #7
0
def test_can_read_test_story_with_checkpoint_after_or(domain: Domain):
    """The checkpoint-after-``or`` story file should produce two trackers."""
    stories_path = "data/test_yaml_stories/stories_checkpoint_after_or.yml"
    load_options = {
        "use_story_concatenation": False,
        "tracker_limit": 1000,
        "remove_duplicates": False,
    }

    loaded = training.load_data(stories_path, domain, **load_options)

    assert len(loaded) == 2
Example #8
0
def test_or_statement_story_with_or_slot_was_set(domain: Domain):
    """An ``or`` over slot values should give one tracker per slot value."""
    loaded = training.load_data(
        "data/test_yaml_stories/story_with_or_slot_was_set.yml",
        domain,
        use_story_concatenation=False,
        tracker_limit=1000,
        remove_duplicates=False,
    )
    assert len(loaded) == 2

    # The event at index 3 of each tracker is the slot event of its branch.
    peter_branch = loaded[0]
    bob_branch = loaded[1]
    assert peter_branch.events[3] == SlotSet(key="name", value="peter")
    assert bob_branch.events[3] == SlotSet(key="name", value="bob")
Example #9
0
def test_yaml_wrong_yaml_format_warning(domain: Domain):
    """Loading the malformed YAML story file must raise ``YamlSyntaxException``."""
    broken_stories = "data/test_wrong_yaml_stories/wrong_yaml.yml"
    load_options = {
        "use_story_concatenation": False,
        "tracker_limit": 1000,
        "remove_duplicates": False,
    }

    with pytest.raises(YamlSyntaxException):
        # The return value is irrelevant; only the raised exception matters.
        training.load_data(broken_stories, domain, **load_options)
Example #10
0
def test_can_read_test_story_with_slots(domain: Domain):
    """The single loaded story should end with a slot event and a listen action."""
    loaded = training.load_data(
        "data/test_yaml_stories/simple_story_with_only_end.yml",
        domain,
        use_story_concatenation=False,
        tracker_limit=1000,
        remove_duplicates=False,
    )
    assert len(loaded) == 1

    story_events = loaded[0].events
    expected_slot_event = SlotSet(key="name", value="peter")
    expected_final_action = ActionExecuted("action_listen")

    assert story_events[-2] == expected_slot_event
    assert story_events[-1] == expected_final_action
Example #11
0
def test_yaml_slot_without_value_is_parsed(domain: Domain):
    """A slot listed without a value should be parsed with the default text value."""
    loaded = training.load_data(
        "data/test_yaml_stories/story_with_slot_was_set.yml",
        domain,
        use_story_concatenation=False,
        tracker_limit=1000,
        remove_duplicates=False,
    )

    # The second-to-last event of the first tracker is the slot event.
    slot_event = loaded[0].events[-2]
    assert slot_event == SlotSet(key="name", value=DEFAULT_VALUE_TEXT_SLOTS)
Example #12
0
def test_yaml_slot_different_types(domain: Domain):
    """Slots with list, bool and text values should be parsed with those values.

    Warnings emitted while loading the story file are recorded and ignored;
    only the parsed slot events are asserted.
    """
    # Imported locally so the block does not depend on a module-level import.
    import warnings

    # ``pytest.warns(None)`` is deprecated since pytest 7 and raises a
    # ``TypeError`` in pytest 8. The stdlib recorder below has the same effect:
    # every warning is captured, and none is required or turned into a failure.
    with warnings.catch_warnings(record=True):
        warnings.simplefilter("always")
        tracker = training.load_data(
            "data/test_yaml_stories/story_slot_different_types.yml",
            domain,
            use_story_concatenation=False,
            tracker_limit=1000,
            remove_duplicates=False,
        )

    assert tracker[0].events[3] == SlotSet(key="list_slot", value=["value1", "value2"])
    assert tracker[0].events[4] == SlotSet(key="bool_slot", value=True)
    assert tracker[0].events[5] == SlotSet(key="text_slot", value="some_text")
Example #13
0
def trained_ted(
    tmp_path_factory: TempPathFactory, moodbot_domain_path: Path,
) -> TEDPolicyGraphComponent:
    """Build a ``TEDPolicyGraphComponent`` and train it on the moodbot stories.

    The component uses its default config with ``EPOCHS`` overridden to 1 and
    a local model storage located in a fresh temporary directory.

    Returns:
        The policy after ``train`` has been called on the loaded trackers.
    """
    stories_path = "data/test_moodbot/data/stories.yml"
    moodbot_domain = Domain.load(moodbot_domain_path)
    moodbot_trackers = training.load_data(str(stories_path), moodbot_domain)

    config = {**TEDPolicyGraphComponent.get_default_config(), EPOCHS: 1}
    storage = LocalModelStorage.create(tmp_path_factory.mktemp("storage"))
    resource = Resource("ted")
    context = ExecutionContext(GraphSchema({}))

    ted = TEDPolicyGraphComponent.create(config, storage, resource, context)
    ted.train(moodbot_trackers, moodbot_domain)
    return ted
Example #14
0
def test_yaml_intent_with_leading_slash_warning(domain: Domain):
    """An intent written with a leading slash should warn once and still parse."""
    stories_path = "data/test_wrong_yaml_stories/intent_with_leading_slash.yml"
    load_options = {
        "use_story_concatenation": False,
        "tracker_limit": 1000,
        "remove_duplicates": False,
    }

    with pytest.warns(UserWarning) as captured:
        loaded = training.load_data(stories_path, domain, **load_options)

    # Exactly one warning is expected: the one about the leading slash.
    assert len(captured) == 1

    expected_message = UserUttered(intent={"name": "simple"})
    assert loaded[0].latest_message == expected_message
Example #15
0
def test_generate_training_data_original_and_augmented_trackers(domain: Domain):
    """Check the number of original and total trackers after augmentation."""

    def _is_original(candidate) -> bool:
        # A tracker counts as original unless it has a truthy ``is_augmented``.
        return not (hasattr(candidate, "is_augmented") and candidate.is_augmented)

    all_trackers = training.load_data(
        "data/test_yaml_stories/stories_defaultdomain.yml",
        domain,
        augmentation_factor=3,
    )

    originals = list(filter(_is_original, all_trackers))

    # Four non-augmented trackers are expected. An augmentation factor of 3 is
    # taken to allow at most 3 * 10 augmented stories, so the total may not
    # exceed augmented + original = 30 + 4 = 34.
    assert len(originals) == 4
    assert len(all_trackers) <= 34
Example #16
0
    def load_data(
        self,
        training_resource: Union[Text, TrainingDataImporter],
        remove_duplicates: bool = True,
        unique_last_num_states: Optional[int] = None,
        augmentation_factor: int = 50,
        tracker_limit: Optional[int] = None,
        use_story_concatenation: bool = True,
        debug_plots: bool = False,
        exclusion_percentage: Optional[int] = None,
    ) -> List["TrackerWithCachedStates"]:
        """Load training data from a resource.

        When ``unique_last_num_states`` is not given and every featurizer uses
        a max history, it is set to ``self._max_history()``. When it is given
        but smaller than that value, a warning about possible data loss is
        raised. All options are then forwarded to ``training.load_data``
        together with ``self.domain``.
        """
        history_limit = self._max_history()

        if unique_last_num_states is not None:
            if unique_last_num_states < history_limit:
                # An explicit value below the max history may lose data.
                message = (
                    f"unique_last_num_states={unique_last_num_states} "
                    f"but maximum max_history={history_limit}. "
                    "Possibility of data loss. It is recommended to set "
                    "unique_last_num_states to at least maximum max_history."
                )
                rasa.shared.utils.io.raise_warning(message)
        elif self._are_all_featurizers_using_a_max_history():
            # Speeds up data generation: detect the value automatically when
            # it was not set and all featurizers are max-history based.
            unique_last_num_states = history_limit

        positional_options = (
            remove_duplicates,
            unique_last_num_states,
            augmentation_factor,
            tracker_limit,
            use_story_concatenation,
            debug_plots,
        )
        return training.load_data(
            training_resource,
            self.domain,
            *positional_options,
            exclusion_percentage=exclusion_percentage,
        )
Example #17
0
def test_generate_training_data_with_cycles(domain: Domain):
    """Check tracker count and label distribution for the story file with a cycle."""
    cycle_featurizer = MaxHistoryTrackerFeaturizer(
        SingleStateFeaturizer(), max_history=4
    )
    loaded = training.load_data(
        "data/test_yaml_stories/stories_with_cycle.yml", domain, augmentation_factor=0,
    )

    _, label_ids, _ = cycle_featurizer.featurize_trackers(
        loaded, domain, precomputations=None
    )

    # The number of trackers depends on the story graph, which is not built
    # deterministically, but it should always be 3 or 4.
    tracker_count = len(loaded)
    assert tracker_count in (3, 4)

    # With 4 trackers there is one more example for the label counted below
    # under key 14. The label keys shift whenever new default actions are added.
    variable_label_count = tracker_count - 1

    flattened_label_ids = []
    for tracker_label_ids in label_ids:
        flattened_label_ids.extend(tracker_label_ids)

    expected_counts = {0: 6, 15: 3, 14: variable_label_count, 1: 2, 16: 1}
    assert Counter(flattened_label_ids) == expected_counts
Example #18
0
 def load_data(
     self,
     training_resource: Union[Text, TrainingDataImporter],
     remove_duplicates: bool = True,
     unique_last_num_states: Optional[int] = None,
     augmentation_factor: int = 50,
     tracker_limit: Optional[int] = None,
     use_story_concatenation: bool = True,
     debug_plots: bool = False,
     exclusion_percentage: Optional[int] = None,
 ) -> List["TrackerWithCachedStates"]:
     """Load training data from a resource.

     Forwards every argument unchanged to ``training.load_data``, adding
     ``self.domain`` as the second positional argument.
     """
     generation_options = {
         "augmentation_factor": augmentation_factor,
         "tracker_limit": tracker_limit,
         "use_story_concatenation": use_story_concatenation,
         "debug_plots": debug_plots,
         "exclusion_percentage": exclusion_percentage,
     }
     return training.load_data(
         training_resource,
         self.domain,
         remove_duplicates,
         unique_last_num_states,
         **generation_options,
     )
Example #19
0
def test_load_training_data_reader_not_found_throws(tmp_path: Path, domain: Domain):
    """Loading a directory whose only file is an empty ``file`` must raise."""
    placeholder = tmp_path / "file"
    placeholder.touch()
    resource = str(tmp_path)

    with pytest.raises(Exception):
        training.load_data(resource, domain)