def test_load_multi_file_training_data(domain: Domain):
    """Check that stories split over several files load like a single file.

    The stories are loaded once from ``data/test_yaml_stories/stories.yml``
    and once from the ``data/test_multifile_yaml_stories`` resource. Both
    loads must produce the same training states/labels, the same featurized
    intent / action name / entity attributes, and the same label ids.
    """
    featurizer = MaxHistoryTrackerFeaturizer(SingleStateFeaturizer(), max_history=2)
    trackers = training.load_data(
        "data/test_yaml_stories/stories.yml", domain, augmentation_factor=0
    )
    # Sort by sender id so both loads are compared in the same tracker order.
    trackers = sorted(trackers, key=lambda t: t.sender_id)

    tr_as_sts, tr_as_acts = featurizer.training_states_and_labels(trackers, domain)
    # Serialise every (states + labels) example and sort the strings so the
    # comparison does not depend on the order in which examples were generated.
    hashed = sorted(
        (
            json.dumps(sts + acts, sort_keys=True)
            for sts, acts in zip(tr_as_sts, tr_as_acts)
        ),
        reverse=True,
    )

    data, label_ids, _ = featurizer.featurize_trackers(
        trackers, domain, precomputations=None
    )

    featurizer_mul = MaxHistoryTrackerFeaturizer(SingleStateFeaturizer(), max_history=2)
    trackers_mul = training.load_data(
        "data/test_multifile_yaml_stories", domain, augmentation_factor=0
    )
    trackers_mul = sorted(trackers_mul, key=lambda t: t.sender_id)

    # Use the featurizer dedicated to the multi-file data for every step of
    # the multi-file pipeline (it is configured identically to `featurizer`).
    tr_as_sts_mul, tr_as_acts_mul = featurizer_mul.training_states_and_labels(
        trackers_mul, domain
    )
    hashed_mul = sorted(
        (
            json.dumps(sts_mul + acts_mul, sort_keys=True)
            for sts_mul, acts_mul in zip(tr_as_sts_mul, tr_as_acts_mul)
        ),
        reverse=True,
    )

    data_mul, label_ids_mul, _ = featurizer_mul.featurize_trackers(
        trackers_mul, domain, precomputations=None
    )

    assert hashed == hashed_mul

    # we check for intents, action names and entities -- the features which
    # are included in the story files
    data = _surface_attributes(data)
    data_mul = _surface_attributes(data_mul)

    for attribute in [INTENT, ACTION_NAME, ENTITIES]:
        if attribute not in data or attribute not in data_mul:
            continue
        features = data.get(attribute)
        features_mul = data_mul.get(attribute)
        assert len(features) == len(features_mul)

        for idx_tracker, tracker_features in enumerate(features):
            for idx_dialogue, f1 in enumerate(tracker_features):
                f2 = features_mul[idx_tracker][idx_dialogue]
                if f1 is None or f2 is None:
                    # A missing feature must be missing on both sides.
                    assert f1 == f2
                    continue
                for idx_turn, turn_feature in enumerate(f1):
                    # NOTE(review): `.data` is read from the comparison result,
                    # so the per-turn features are presumably sparse matrices
                    # -- confirm against `_surface_attributes`.
                    assert np.all((turn_feature == f2[idx_turn]).data)

    assert np.all(label_ids == label_ids_mul)
def test_featurize_trackers_with_max_history_tracker_featurizer(
    moodbot_domain: Domain,
):
    """Featurize the moodbot dialogue with a max-history tracker featurizer.

    The dialogue stored in ``data/test_dialogues/moodbot.json`` is expected to
    yield seven sets of state features and seven labels, and no entity tags.
    """
    featurizer = MaxHistoryTrackerFeaturizer(SingleStateFeaturizer())
    moodbot_tracker = tracker_from_dialogue_file(
        "data/test_dialogues/moodbot.json", moodbot_domain
    )

    featurized = featurizer.featurize_trackers(
        [moodbot_tracker], moodbot_domain, RegexInterpreter()
    )
    state_features, labels, entity_tags = featurized

    assert state_features is not None
    assert len(state_features) == 7
    assert labels is not None
    assert len(labels) == 7

    # moodbot doesn't contain e2e entities, so no turn may carry a truthy tag
    for turn_tags in entity_tags:
        assert not any(turn_tags)
def test_generate_training_data_with_cycles(domain: Domain):
    """Featurize stories that contain a cycle and check the label distribution.

    Loads ``data/test_yaml_stories/stories_with_cycle.yml`` without
    augmentation, featurizes the resulting trackers with a max history of 4
    and compares the count of every label id against the expected values.
    """
    featurizer = MaxHistoryTrackerFeaturizer(SingleStateFeaturizer(), max_history=4)
    training_trackers = training.load_data(
        "data/test_yaml_stories/stories_with_cycle.yml",
        domain,
        augmentation_factor=0,
    )

    featurized = featurizer.featurize_trackers(
        training_trackers, domain, precomputations=None
    )
    _, label_ids, _ = featurized

    # how many there are depends on the graph which is not created in a
    # deterministic way but should always be 3 or 4
    tracker_count = len(training_trackers)
    assert tracker_count in (3, 4)

    # with 4 trackers there is one more example for the label counted below
    # (label id 14)
    num_tens = tracker_count - 1

    # if new default actions are added the keys of the actions will be changed
    label_counts = Counter()
    for tracker_label_ids in label_ids:
        label_counts.update(tracker_label_ids)

    assert label_counts == {0: 6, 15: 3, 14: num_tens, 1: 2, 16: 1}
async def test_generate_training_data_with_cycles(
    stories_file: Text, default_domain: Domain
):
    """Featurize the stories in ``stories_file`` and check the label counts.

    The trackers are loaded without augmentation and featurized with a max
    history of 4; the count of every label id is then compared against the
    expected values.
    """
    featurizer = MaxHistoryTrackerFeaturizer(SingleStateFeaturizer(), max_history=4)
    training_trackers = await training.load_data(
        stories_file, default_domain, augmentation_factor=0
    )

    # only the label ids are inspected; the featurized states are ignored
    _, label_ids = featurizer.featurize_trackers(
        training_trackers, default_domain, interpreter=RegexInterpreter()
    )

    # how many there are depends on the graph which is not created in a
    # deterministic way but should always be 3 or 4
    tracker_count = len(training_trackers)
    assert tracker_count in (3, 4)

    # with 4 trackers there is one more example for the label counted below
    # (label id 12)
    num_tens = tracker_count - 1

    # if new default actions are added the keys of the actions will be changed
    label_counts = Counter()
    for tracker_label_ids in label_ids:
        label_counts.update(tracker_label_ids)

    assert label_counts == {0: 6, 12: num_tens, 14: 1, 1: 2, 13: 3}
async def test_load_multi_file_training_data(
    stories_resources: List, default_domain: Domain
):
    """Check that two story resources produce identical training data.

    ``stories_resources[0]`` and ``stories_resources[1]`` are loaded and
    featurized independently. Both must yield the same training
    states/actions, the same featurized intent / action name / entity
    attributes, and the same label ids.
    """
    # the stories file in `data/test_multifile_stories` is the same as in
    # `data/test_stories/stories.md`, but split across multiple files
    featurizer = MaxHistoryTrackerFeaturizer(SingleStateFeaturizer(), max_history=2)
    trackers = await training.load_data(
        stories_resources[0], default_domain, augmentation_factor=0
    )

    tr_as_sts, tr_as_acts = featurizer.training_states_and_actions(
        trackers, default_domain
    )
    # Serialise every (states + actions) example and sort the strings so the
    # comparison does not depend on the order in which examples were generated.
    hashed = sorted(
        (
            json.dumps(sts + acts, sort_keys=True)
            for sts, acts in zip(tr_as_sts, tr_as_acts)
        ),
        reverse=True,
    )

    data, label_ids = featurizer.featurize_trackers(
        trackers, default_domain, interpreter=RegexInterpreter()
    )

    featurizer_mul = MaxHistoryTrackerFeaturizer(SingleStateFeaturizer(), max_history=2)
    trackers_mul = await training.load_data(
        stories_resources[1], default_domain, augmentation_factor=0
    )

    # Use the featurizer dedicated to the second resource for every step of
    # its pipeline (it is configured identically to `featurizer`).
    tr_as_sts_mul, tr_as_acts_mul = featurizer_mul.training_states_and_actions(
        trackers_mul, default_domain
    )
    hashed_mul = sorted(
        (
            json.dumps(sts_mul + acts_mul, sort_keys=True)
            for sts_mul, acts_mul in zip(tr_as_sts_mul, tr_as_acts_mul)
        ),
        reverse=True,
    )

    data_mul, label_ids_mul = featurizer_mul.featurize_trackers(
        trackers_mul, default_domain, interpreter=RegexInterpreter()
    )

    assert hashed == hashed_mul

    # we check for intents, action names and entities -- the features which
    # are included in the story files
    data = surface_attributes(data)
    data_mul = surface_attributes(data_mul)

    for attribute in [INTENT, ACTION_NAME, ENTITIES]:
        if attribute not in data or attribute not in data_mul:
            continue
        features = data.get(attribute)
        features_mul = data_mul.get(attribute)
        assert len(features) == len(features_mul)

        for idx_tracker, tracker_features in enumerate(features):
            for idx_dialogue, f1 in enumerate(tracker_features):
                f2 = features_mul[idx_tracker][idx_dialogue]
                if f1 is None or f2 is None:
                    # A missing feature must be missing on both sides.
                    assert f1 == f2
                    continue
                for idx_turn, turn_feature in enumerate(f1):
                    # NOTE(review): `.data` is read from the comparison result,
                    # so the per-turn features are presumably sparse matrices
                    # -- confirm against `surface_attributes`.
                    assert np.all((turn_feature == f2[idx_turn]).data)

    assert np.all(label_ids == label_ids_mul)