Example #1
0
def test_load_multi_file_training_data(domain: Domain):
    featurizer = MaxHistoryTrackerFeaturizer(SingleStateFeaturizer(), max_history=2)
    trackers = training.load_data(
        "data/test_yaml_stories/stories.yml", domain, augmentation_factor=0
    )
    trackers = sorted(trackers, key=lambda t: t.sender_id)

    (tr_as_sts, tr_as_acts) = featurizer.training_states_and_labels(trackers, domain)
    hashed = []
    for sts, acts in zip(tr_as_sts, tr_as_acts):
        hashed.append(json.dumps(sts + acts, sort_keys=True))
    hashed = sorted(hashed, reverse=True)

    data, label_ids, _ = featurizer.featurize_trackers(
        trackers, domain, precomputations=None
    )

    featurizer_mul = MaxHistoryTrackerFeaturizer(SingleStateFeaturizer(), max_history=2)
    trackers_mul = training.load_data(
        "data/test_multifile_yaml_stories", domain, augmentation_factor=0
    )
    trackers_mul = sorted(trackers_mul, key=lambda t: t.sender_id)

    (tr_as_sts_mul, tr_as_acts_mul) = featurizer.training_states_and_labels(
        trackers_mul, domain
    )
    hashed_mul = []
    for sts_mul, acts_mul in zip(tr_as_sts_mul, tr_as_acts_mul):
        hashed_mul.append(json.dumps(sts_mul + acts_mul, sort_keys=True))
    hashed_mul = sorted(hashed_mul, reverse=True)

    data_mul, label_ids_mul, _ = featurizer_mul.featurize_trackers(
        trackers_mul, domain, precomputations=None
    )

    assert hashed == hashed_mul
    # we check for intents, action names and entities -- the features which
    # are included in the story files

    data = _surface_attributes(data)
    data_mul = _surface_attributes(data_mul)

    for attribute in [INTENT, ACTION_NAME, ENTITIES]:
        if attribute not in data or attribute not in data_mul:
            continue
        assert len(data.get(attribute)) == len(data_mul.get(attribute))

        for idx_tracker in range(len(data.get(attribute))):
            for idx_dialogue in range(len(data.get(attribute)[idx_tracker])):
                f1 = data.get(attribute)[idx_tracker][idx_dialogue]
                f2 = data_mul.get(attribute)[idx_tracker][idx_dialogue]
                if f1 is None or f2 is None:
                    assert f1 == f2
                    continue
                for idx_turn in range(len(f1)):
                    f1 = data.get(attribute)[idx_tracker][idx_dialogue][idx_turn]
                    f2 = data_mul.get(attribute)[idx_tracker][idx_dialogue][idx_turn]
                    assert np.all((f1 == f2).data)

    assert np.all(label_ids == label_ids_mul)
def test_featurize_trackers_with_max_history_tracker_featurizer(
        moodbot_domain: Domain):
    state_featurizer = SingleStateFeaturizer()
    tracker_featurizer = MaxHistoryTrackerFeaturizer(state_featurizer)

    tracker = tracker_from_dialogue_file("data/test_dialogues/moodbot.json",
                                         moodbot_domain)
    state_features, labels, entity_tags = tracker_featurizer.featurize_trackers(
        [tracker], moodbot_domain, RegexInterpreter())

    assert state_features is not None
    assert len(state_features) == 7
    assert labels is not None
    assert len(labels) == 7
    # moodbot doesn't contain e2e entities
    assert not any([any(turn_tags) for turn_tags in entity_tags])
Example #3
0
def test_generate_training_data_with_cycles(domain: Domain):
    featurizer = MaxHistoryTrackerFeaturizer(SingleStateFeaturizer(), max_history=4)
    training_trackers = training.load_data(
        "data/test_yaml_stories/stories_with_cycle.yml", domain, augmentation_factor=0,
    )

    _, label_ids, _ = featurizer.featurize_trackers(
        training_trackers, domain, precomputations=None
    )

    # how many there are depends on the graph which is not created in a
    # deterministic way but should always be 3 or 4
    assert len(training_trackers) == 3 or len(training_trackers) == 4

    # if we have 4 trackers, there is going to be one example more for label 10
    num_tens = len(training_trackers) - 1
    # if new default actions are added the keys of the actions will be changed

    all_label_ids = [id for ids in label_ids for id in ids]
    assert Counter(all_label_ids) == {0: 6, 15: 3, 14: num_tens, 1: 2, 16: 1}
Example #4
0
async def test_generate_training_data_with_cycles(stories_file: Text,
                                                  default_domain: Domain):
    featurizer = MaxHistoryTrackerFeaturizer(SingleStateFeaturizer(),
                                             max_history=4)
    training_trackers = await training.load_data(stories_file,
                                                 default_domain,
                                                 augmentation_factor=0)

    training_data, label_ids = featurizer.featurize_trackers(
        training_trackers, default_domain, interpreter=RegexInterpreter())

    # how many there are depends on the graph which is not created in a
    # deterministic way but should always be 3 or 4
    assert len(training_trackers) == 3 or len(training_trackers) == 4

    # if we have 4 trackers, there is going to be one example more for label 10
    num_tens = len(training_trackers) - 1
    # if new default actions are added the keys of the actions will be changed

    all_label_ids = [id for ids in label_ids for id in ids]
    assert Counter(all_label_ids) == {0: 6, 12: num_tens, 14: 1, 1: 2, 13: 3}
Example #5
0
async def test_load_multi_file_training_data(
    stories_resources: List, default_domain: Domain
):
    # the stories file in `data/test_multifile_stories` is the same as in
    # `data/test_stories/stories.md`, but split across multiple files
    featurizer = MaxHistoryTrackerFeaturizer(SingleStateFeaturizer(), max_history=2)
    trackers = await training.load_data(
        stories_resources[0], default_domain, augmentation_factor=0
    )
    (tr_as_sts, tr_as_acts) = featurizer.training_states_and_actions(
        trackers, default_domain
    )
    hashed = []
    for sts, acts in zip(tr_as_sts, tr_as_acts):
        hashed.append(json.dumps(sts + acts, sort_keys=True))
    hashed = sorted(hashed, reverse=True)

    data, label_ids = featurizer.featurize_trackers(
        trackers, default_domain, interpreter=RegexInterpreter()
    )

    featurizer_mul = MaxHistoryTrackerFeaturizer(SingleStateFeaturizer(), max_history=2)
    trackers_mul = await training.load_data(
        stories_resources[1], default_domain, augmentation_factor=0
    )
    (tr_as_sts_mul, tr_as_acts_mul) = featurizer.training_states_and_actions(
        trackers_mul, default_domain
    )
    hashed_mul = []
    for sts_mul, acts_mul in zip(tr_as_sts_mul, tr_as_acts_mul):
        hashed_mul.append(json.dumps(sts_mul + acts_mul, sort_keys=True))
    hashed_mul = sorted(hashed_mul, reverse=True)

    data_mul, label_ids_mul = featurizer_mul.featurize_trackers(
        trackers_mul, default_domain, interpreter=RegexInterpreter()
    )

    assert hashed == hashed_mul
    # we check for intents, action names and entities -- the features which
    # are included in the story files

    data = surface_attributes(data)
    data_mul = surface_attributes(data_mul)

    for attribute in [INTENT, ACTION_NAME, ENTITIES]:
        if attribute not in data or attribute not in data_mul:
            continue
        assert len(data.get(attribute)) == len(data_mul.get(attribute))

        for idx_tracker in range(len(data.get(attribute))):
            for idx_dialogue in range(len(data.get(attribute)[idx_tracker])):
                f1 = data.get(attribute)[idx_tracker][idx_dialogue]
                f2 = data_mul.get(attribute)[idx_tracker][idx_dialogue]
                if f1 is None or f2 is None:
                    assert f1 == f2
                    continue
                for idx_turn in range(len(f1)):
                    f1 = data.get(attribute)[idx_tracker][idx_dialogue][idx_turn]
                    f2 = data_mul.get(attribute)[idx_tracker][idx_dialogue][idx_turn]
                    assert np.all((f1 == f2).data)

    assert np.all(label_ids == label_ids_mul)