def _load_model_class(
    cls,
    tf_model_file: Text,
    model_data_example: RasaModelData,
    label_data: RasaModelData,
    entity_tag_specs: List[EntityTagSpec],
    config: Dict[Text, Any],
    finetune_mode: bool = False,
) -> "RasaModel":
    """Loads the model class selected by `config[USE_TEXT_AS_LABEL]`.

    Besides the full data example, a reduced example holding only the
    features whose name contains `TEXT` is built and passed to `load`.

    Args:
        tf_model_file: Location of the stored model, forwarded to `load`.
        model_data_example: Data example; also supplies the data signature.
        label_data: Label data forwarded to the model.
        entity_tag_specs: Entity tag specifications forwarded to the model.
        config: Model configuration; the model receives a deep copy.
        finetune_mode: Forwarded unchanged to `load`.

    Returns:
        The object returned by the model class' `load`.
    """
    label_key = model_data_example.label_key

    # Keep only the text related features for the prediction example.
    text_features = {}
    for feature_name, features in model_data_example.items():
        if TEXT in feature_name:
            text_features[feature_name] = features
    predict_data_example = RasaModelData(label_key=label_key, data=text_features)

    model_class = cls.model_class(config[USE_TEXT_AS_LABEL])
    return model_class.load(
        tf_model_file,
        model_data_example,
        predict_data_example,
        data_signature=model_data_example.get_signature(),
        label_data=label_data,
        entity_tag_specs=entity_tag_specs,
        config=copy.deepcopy(config),
        finetune_mode=finetune_mode,
    )
def __init__(
    self,
    name: Text,
    config: Dict[Text, Any],
    data_signature: Dict[Text, Dict[Text, List[FeatureSignature]]],
    label_data: RasaModelData,
) -> None:
    """Stores configuration and signatures and converts the label data.

    Args:
        name: Name handed to the parent constructor.
        config: Model configuration; seed, tensorboard and checkpoint
            settings are read from it for the parent constructor.
        data_signature: Signature of the model data.
        label_data: Label data; its signature and a prepared batch of it
            are kept on the instance.
    """
    super().__init__(
        name=name,
        random_seed=config[RANDOM_SEED],
        tensorboard_log_dir=config[TENSORBOARD_LOG_DIR],
        tensorboard_log_level=config[TENSORBOARD_LOG_LEVEL],
        checkpoint_model=config[CHECKPOINT_MODEL],
    )

    self.config = config
    self.data_signature = data_signature
    self.label_signature = label_data.get_signature()

    # The check runs only after both signatures are in place.
    self._check_data()

    prepared_label_batch = label_data.prepare_batch()
    self.tf_label_data = self.batch_to_model_data_format(
        prepared_label_batch, self.label_signature
    )

    # set up tf layers
    self._tf_layers: Dict[Text, tf.keras.layers.Layer] = {}
def test_train_val_split(model_data: RasaModelData):
    """Checks structure, dtypes and sizes of a `split(2, 42)` result."""
    train_split, test_split = model_data.split(2, 42)

    for key, attribute_data in model_data.items():
        assert len(attribute_data) == len(train_split.get(key))
        assert len(attribute_data) == len(test_split.get(key))

        for sub_key, feature_list in attribute_data.items():
            assert len(feature_list) == len(train_split.get(key, sub_key))
            assert len(feature_list) == len(test_split.get(key, sub_key))

            for index, feature in enumerate(feature_list):
                original_item = feature[0]
                train_item = train_split.get(key, sub_key)[index][0]
                if isinstance(original_item, list):
                    # One more nesting level: compare the first inner entry.
                    original_item = original_item[0]
                    train_item = train_item[0]
                assert original_item.dtype == train_item.dtype

    # The train split is expected to hold 3 examples, the test split 2.
    for split, expected_examples in ((train_split, 3), (test_split, 2)):
        for attribute_data in split.values():
            for feature_list in attribute_data.values():
                for feature in feature_list:
                    assert np.array(feature).shape[0] == expected_examples
def test_update_key(model_data: RasaModelData):
    """Checks that `update_key` moves a feature and drops the emptied key."""
    old_key, new_key, sub_key = "label", "intent", "ids"

    assert model_data.does_feature_exist(old_key, sub_key)

    model_data.update_key(old_key, sub_key, new_key, sub_key)

    assert not model_data.does_feature_exist(old_key, sub_key)
    assert model_data.does_feature_exist(new_key, sub_key)
    assert old_key not in model_data.data
def test_split_data_by_label(model_data: RasaModelData):
    """Checks that splitting by label ids yields one chunk per label id."""
    intent_ids = model_data.get("intent", "ids")[0]
    unique_label_ids = np.array([0, 1])

    chunks = model_data._split_by_label_ids(
        model_data.data, intent_ids, unique_label_ids
    )

    assert len(chunks) == 2
    for chunk in chunks:
        # Every chunk must contain a single distinct label id.
        distinct_ids = set(chunk.get("intent", "ids")[0])
        assert len(distinct_ids) == 1
def test_not_balance_model_data(model_data: RasaModelData):
    """Checks that `_balanced_data` leaves the entity tag ids unchanged."""
    entity_model_data = RasaModelData(
        label_key="entities", label_sub_key="tag_ids", data=model_data.data
    )

    balanced = entity_model_data._balanced_data(entity_model_data.data, 2, False)

    expected_tag_ids = entity_model_data.get("entities", "tag_ids")
    assert np.all(balanced["entities"]["tag_ids"] == expected_tag_ids)
def _create_label_data(self, domain: Domain) -> RasaModelData:
    """Builds label data from the featurizer's encoding of all actions.

    Args:
        domain: Domain passed to the state featurizer.

    Returns:
        Model data holding the float32 encoded labels under `LABEL_FEATURES`.
    """
    # encode all label_ids with policies' featurizer
    encoded_labels = self.featurizer.state_featurizer.create_encoded_all_actions(
        domain
    ).astype(np.float32)

    label_data = RasaModelData()
    label_data.add_features(LABEL_FEATURES, [encoded_labels])
    return label_data
def load(cls, path: Text) -> "TEDPolicy":
    """Loads a policy from the storage.

    **Needs to load its featurizer**

    Args:
        path: Directory the policy was stored in.

    Returns:
        A policy with the loaded model, or a policy that only carries the
        featurizer when no data example file is present.

    Raises:
        Exception: If `path` does not exist.
    """
    if not os.path.exists(path):
        raise Exception(
            f"Failed to load TED policy model. Path "
            f"'{os.path.abspath(path)}' doesn't exist."
        )

    model_path = Path(path)

    def _stored_file(suffix: Text) -> Path:
        # All artifacts share the same base name and differ in their suffix.
        return model_path / f"{SAVE_MODEL_FILE_NAME}.{suffix}"

    tf_model_file = _stored_file("tf_model")
    featurizer = TrackerFeaturizer.load(path)

    if not _stored_file("data_example.pkl").is_file():
        return cls(featurizer=featurizer)

    loaded_data = io_utils.json_unpickle(_stored_file("data_example.pkl"))
    label_data = io_utils.json_unpickle(_stored_file("label_data.pkl"))
    meta = io_utils.pickle_load(_stored_file("meta.pkl"))
    priority = io_utils.json_unpickle(_stored_file("priority.pkl"))

    model_data_example = RasaModelData(label_key=LABEL_IDS, data=loaded_data)
    meta = train_utils.update_similarity_type(meta)

    uses_max_history = isinstance(featurizer, MaxHistoryTrackerFeaturizer)
    model = TED.load(
        str(tf_model_file),
        model_data_example,
        data_signature=model_data_example.get_signature(),
        config=meta,
        max_history_tracker_featurizer_used=uses_max_history,
        label_data=label_data,
    )

    # build the graph for prediction
    dialogue_features = {}
    for feature_name, features in model_data_example.items():
        if DIALOGUE in feature_name:
            dialogue_features[feature_name] = features
    predict_data_example = RasaModelData(label_key=LABEL_IDS, data=dialogue_features)
    model.build_for_predict(predict_data_example)

    return cls(featurizer=featurizer, priority=priority, model=model, **meta)
def test_session_data_for_ids(model_data: RasaModelData):
    """Checks that `_data_for_ids` keeps exactly the requested examples."""
    selected_ids = np.array([0, 1])
    filtered_data = model_data._data_for_ids(model_data.data, selected_ids)

    for features in filtered_data.values():
        for feature in features:
            assert feature.shape[0] == 2

    first_key = list(model_data.keys())[0]

    # The two selected examples must match the first two original examples.
    for example_index in (0, 1):
        filtered_example = np.array(filtered_data[first_key][0][example_index])
        original_example = np.array(model_data.get(first_key)[0][example_index])
        assert np.all(filtered_example == original_example)
def _check_input_dimension_consistency(self, model_data: RasaModelData) -> None:
    """Checks if features have same dimensionality if hidden layers are shared.

    Args:
        model_data: Model data whose text and label feature dimensions are
            compared.

    Raises:
        ValueError: If hidden layers are shared and the dimensions differ.
    """
    if not self.component_config.get(SHARE_HIDDEN_LAYERS):
        return

    text_dimension = model_data.feature_dimension(TEXT_FEATURES)
    label_dimension = model_data.feature_dimension(LABEL_FEATURES)
    if text_dimension == label_dimension:
        return

    raise ValueError(
        "If embeddings are shared text features and label features "
        "must coincide. Check the output dimensions of previous components."
    )
def test_split_data_by_label(model_data: RasaModelData):
    """Checks chunk count, label purity and layout of a split by label ids."""
    label_ids = model_data.get("label", "ids")[0]
    unique_label_ids = np.array([0, 1])

    chunks = model_data._split_by_label_ids(
        model_data.data, label_ids, unique_label_ids
    )

    assert len(chunks) == 2
    for chunk in chunks:
        distinct_ids = set(chunk.get("label", "ids")[0])
        assert len(distinct_ids) == 1

    # The first chunk keeps the feature layout and holds two examples.
    first_chunk = chunks[0]
    for key, attribute_data in first_chunk.items():
        for sub_key, features in attribute_data.items():
            assert len(features) == len(model_data.data[key][sub_key])
            assert len(features[0]) == 2
def test_shuffle_session_data(model_data: RasaModelData):
    """Checks that `_shuffled_data` does not modify the original data."""

    def _values_as_array(container) -> np.ndarray:
        return np.array(list(container.values()))

    snapshot = copy.copy(model_data)

    # precondition
    assert np.all(_values_as_array(snapshot) == _values_as_array(model_data))

    shuffled = model_data._shuffled_data(model_data.data)

    # check that original data didn't change
    assert np.all(_values_as_array(snapshot) == _values_as_array(model_data))

    # check that new data is different
    # NOTE(review): unlike the checks above, `values()` is not wrapped in
    # `list(...)` here. If both calls return dict views, `np.array` stores each
    # view as a single object and `!=` compares by identity, so this assertion
    # would always pass - confirm and consider comparing the actual features.
    assert np.all(np.array(model_data.values()) != np.array(shuffled.values()))
def test_split_data_by_none_label(model_data: RasaModelData):
    """Checks that `split` still works when no label key is set."""
    model_data.label_key = None

    splits = model_data.split(2, 42)
    assert len(splits) == 2

    train_data, test_data = splits[0], splits[1]

    # train data should have 3 examples
    train_intent_ids = train_data.get("intent_ids")[0]
    assert len(train_intent_ids) == 3

    # test data should have 2 examples
    test_intent_ids = test_data.get("intent_ids")[0]
    assert len(test_intent_ids) == 2
def _assemble_label_data(self, attribute_data: Data, domain: Domain) -> RasaModelData:
    """Constructs data regarding labels to be fed to the model.

    The resultant model data can possibly contain one or both of the
    keys - [`label_action_name`, `label_action_text`] but will definitely
    contain the `label` key. `label_action_*` will contain the sequence,
    sentence and mask features for corresponding labels and `label` will
    contain the numerical label ids.

    Args:
        attribute_data: Feature data for all labels.
        domain: Domain of the assistant.

    Returns:
        Features of labels ready to be fed to the model.
    """
    action_text_key = f"{LABEL}_{ACTION_TEXT}"

    label_data = RasaModelData()
    label_data.add_data(attribute_data, key_prefix=f"{LABEL_KEY}_")
    label_data.add_lengths(action_text_key, SEQUENCE_LENGTH, action_text_key, SEQUENCE)

    # One id per action in the domain, stored as a column vector.
    id_column = np.expand_dims(np.arange(domain.num_actions), -1)
    label_id_features = [FeatureArray(id_column, number_of_dimensions=2)]
    label_data.add_features(LABEL_KEY, LABEL_SUB_KEY, label_id_features)

    return label_data
def _assemble_label_data(
    self, attribute_data: Data, domain: Domain
) -> RasaModelData:
    """Constructs data regarding labels to be fed to the model.

    The resultant model data should contain the keys `label_intent`, `label`.
    `label_intent` will contain the sequence, sentence and mask features
    for all intent labels and `label` will contain the numerical label ids.

    Args:
        attribute_data: Feature data for all intent labels.
        domain: Domain of the assistant.

    Returns:
        Features of labels ready to be fed to the model.
    """
    intent_key = f"{LABEL}_{INTENT}"

    label_data = RasaModelData()
    label_data.add_data(attribute_data, key_prefix=f"{LABEL_KEY}_")
    label_data.add_lengths(intent_key, SEQUENCE_LENGTH, intent_key, SEQUENCE)

    # One id per intent in the domain, stored as a column vector.
    id_column = np.expand_dims(np.arange(len(domain.intents)), -1)
    label_id_features = [FeatureArray(id_column, number_of_dimensions=2)]
    label_data.add_features(LABEL_KEY, LABEL_SUB_KEY, label_id_features)

    return label_data
def test_train_val_split(model_data: RasaModelData):
    """Checks structure, dtypes and sizes of a `split(2, 42)` result."""
    train_split, test_split = model_data.split(2, 42)

    for key, features in model_data.items():
        assert len(features) == len(train_split.get(key))
        assert len(features) == len(test_split.get(key))

        for index, feature in enumerate(features):
            train_feature = train_split.get(key)[index]
            assert feature[0].dtype == train_feature[0].dtype

    # The train split is expected to hold 3 examples, the test split 2.
    for split, expected_examples in ((train_split, 3), (test_split, 2)):
        for features in split.values():
            for feature in features:
                assert feature.shape[0] == expected_examples
def _instantiate_model_class(self, model_data: RasaModelData) -> "RasaModel":
    """Creates the model matching `self.use_text_as_label`.

    Args:
        model_data: Model data that supplies the data signature.

    Returns:
        A new instance of the selected model class.
    """
    selected_model_class = self.model_class(self.use_text_as_label)
    return selected_model_class(
        data_signature=model_data.get_signature(),
        label_data=self._label_data,
        entity_tag_specs=self._entity_tag_specs,
        config=self.component_config,
    )
def preprocess_train_data(self, training_data: TrainingData) -> RasaModelData:
    """Prepares data for training.

    Performs sanity checks on training data, extracts encodings for labels.

    Args:
        training_data: Training data to preprocess; restricted to
            `self.retrieval_intent` first when that is set.

    Returns:
        The model data, or empty model data when no labels were found.
    """
    if self.retrieval_intent:
        training_data = training_data.filter_by_intent(self.retrieval_intent)

    label_to_index = self._label_id_index_mapping(training_data, attribute=RESPONSE)
    if not label_to_index:
        # no labels are present to train
        return RasaModelData()

    self.index_label_id_mapping = self._invert_mapping(label_to_index)
    self._label_data = self._create_label_data(
        training_data, label_to_index, attribute=RESPONSE
    )

    model_data = self._create_model_data(
        training_data.intent_examples, label_to_index, label_attribute=RESPONSE
    )
    self._check_input_dimension_consistency(model_data)

    return model_data
def _update_data_signatures(self, model_data: RasaModelData) -> None:
    """Refreshes the stored data signature and its prediction subset.

    Args:
        model_data: Model data whose signature replaces the stored one.
    """
    self.data_signature = model_data.get_signature()

    # Prediction only uses the features whose name contains `TEXT`.
    text_signature = {}
    for feature_name, features in self.data_signature.items():
        if TEXT in feature_name:
            text_signature[feature_name] = features
    self.predict_data_signature = text_signature
def test_session_data_for_ids(model_data: RasaModelData):
    """Checks that `_data_for_ids` keeps exactly the requested examples."""
    selected_ids = np.array([0, 1])
    filtered_data = model_data._data_for_ids(model_data.data, selected_ids)

    for attribute_data in filtered_data.values():
        for features in attribute_data.values():
            for feature in features:
                assert feature.shape[0] == 2

    key = model_data.keys()[0]
    sub_key = model_data.keys(key)[0]

    # The two selected examples must match the first two original examples.
    for example_index in (0, 1):
        filtered_example = np.array(filtered_data[key][sub_key][0][example_index])
        original_example = np.array(model_data.get(key, sub_key)[0][example_index])
        assert np.all(filtered_example == original_example)
def __init__(
    self,
    data_signature: Dict[Text, List[FeatureSignature]],
    config: Dict[Text, Any],
    max_history_tracker_featurizer_used: bool,
    label_data: RasaModelData,
) -> None:
    """Sets up data signatures, optimizer, label data, metrics and layers.

    Args:
        data_signature: Signature of the model data.
        config: Model configuration; seed and tensorboard settings are read
            from it for the parent constructor.
        max_history_tracker_featurizer_used: Stored on the instance as is.
        label_data: Label data; a prepared batch of it is converted with
            `batch_to_model_data_format` and kept on the instance.
    """
    super().__init__(
        name="TED",
        random_seed=config[RANDOM_SEED],
        tensorboard_log_dir=config[TENSORBOARD_LOG_DIR],
        tensorboard_log_level=config[TENSORBOARD_LOG_LEVEL],
    )

    self.config = config
    self.max_history_tracker_featurizer_used = max_history_tracker_featurizer_used

    # data
    self.data_signature = data_signature
    self._check_data()

    # Prediction only uses the features whose name contains `DIALOGUE`.
    self.predict_data_signature = {
        feature_name: features
        for feature_name, features in data_signature.items()
        if DIALOGUE in feature_name
    }

    # optimizer
    self.optimizer = tf.keras.optimizers.Adam()

    self.all_labels_embed = None

    label_batch = label_data.prepare_batch()
    self.tf_label_data = self.batch_to_model_data_format(
        label_batch, label_data.get_signature()
    )

    # metrics
    self.action_loss = tf.keras.metrics.Mean(name="loss")
    self.action_acc = tf.keras.metrics.Mean(name="acc")
    self.metrics_to_log += ["loss", "acc"]

    # set up tf layers
    # `Dict` takes key and value types separated by a comma; the previous
    # `Dict[Text : ...]` was a slice subscript and not a valid type.
    self._tf_layers: Dict[Text, tf.keras.layers.Layer] = {}
    self._prepare_layers()
def test_sort(model_data: RasaModelData):
    """Checks that `sort` orders the top-level keys alphabetically."""
    keys_before_sorting = ["text", "action_text", "dialogue", "label", "entities"]
    keys_after_sorting = ["action_text", "dialogue", "entities", "label", "text"]

    assert list(model_data.data.keys()) == keys_before_sorting

    model_data.sort()

    assert list(model_data.data.keys()) == keys_after_sorting
def _construct_model_initialization_data(
    cls, loaded_data: Dict[Text, Dict[Text, List[FeatureArray]]]
) -> Tuple[RasaModelData, RasaModelData]:
    """Builds the full data example and its prediction-only counterpart.

    Args:
        loaded_data: Feature data to wrap.

    Returns:
        The example holding all of `loaded_data` and the example restricted
        to the features listed in `PREDICTION_FEATURES`.
    """
    model_data_example = RasaModelData(
        label_key=LABEL_KEY, label_sub_key=LABEL_SUB_KEY, data=loaded_data
    )

    # we need to remove label features for prediction if they are present
    prediction_features = {}
    for feature_name, features in model_data_example.items():
        if feature_name in PREDICTION_FEATURES:
            prediction_features[feature_name] = features

    predict_data_example = RasaModelData(
        label_key=LABEL_KEY, label_sub_key=LABEL_SUB_KEY, data=prediction_features
    )

    return model_data_example, predict_data_example
def predict(self, predict_data: RasaModelData) -> Dict[Text, tf.Tensor]:
    """Runs the prediction function on one batch made from `predict_data`.

    The prediction function is built first if it does not exist yet.

    Args:
        predict_data: Data to predict on.

    Returns:
        The output of the prediction function.
    """
    prediction_graph_missing = self._predict_function is None
    if prediction_graph_missing:
        logger.debug("There is no tensorflow prediction graph.")
        self.build_for_predict(predict_data)

    # Prepare a single batch of the size of the input
    single_batch = predict_data.prepare_batch()

    self._training = False  # needed for eager mode
    return self._predict_function(single_batch)
def predict(self, predict_data: RasaModelData) -> Dict[Text, tf.Tensor]:
    """Runs the prediction function on the first batch of `predict_data`.

    The prediction function is built first if it does not exist yet.

    Args:
        predict_data: Data to predict on.

    Returns:
        The output of the prediction function.
    """
    prediction_graph_missing = self._predict_function is None
    if prediction_graph_missing:
        logger.debug("There is no tensorflow prediction graph.")
        self.build_for_predict(predict_data)

    # Only the first batch of the batch-size-1 dataset is used.
    dataset_iterator = iter(predict_data.as_tf_dataset(batch_size=1))
    first_batch = next(dataset_iterator)

    self._training = False  # needed for eager mode
    return self._predict_function(first_batch)
async def model_data() -> RasaModelData:
    """Creates model data with five randomly generated examples.

    The per-example arrays differ in their first dimension, so the outer
    arrays are created with `dtype=object`. Without it NumPy warns about
    ragged nested sequences and, from version 1.24 on, raises a `ValueError`.

    Returns:
        Model data with text features (dense and sparse), intent features,
        intent ids and entity tag ids; `intent`/`ids` is the label.
    """
    # Number of rows of each of the five generated examples.
    rows_per_example = (5, 2, 3, 1, 3)

    dense_text_features = np.array(
        [np.random.rand(rows, 14) for rows in rows_per_example], dtype=object
    )
    sparse_text_features = np.array(
        [
            scipy.sparse.csr_matrix(np.random.randint(5, size=(rows, 10)))
            for rows in rows_per_example
        ]
    )
    intent_features = np.array(
        [np.random.randint(2, size=(rows, 10)) for rows in rows_per_example],
        dtype=object,
    )
    entity_tag_ids = np.array(
        [
            np.array([[0], [1], [1], [0], [2]]),
            np.array([[2], [0]]),
            np.array([[0], [1], [1]]),
            np.array([[0], [1]]),
            np.array([[0], [0], [0]]),
        ],
        dtype=object,
    )

    return RasaModelData(
        label_key="intent",
        label_sub_key="ids",
        data={
            "text_features": {
                "sentence": [dense_text_features, sparse_text_features]
            },
            "intent_features": {"sentence": [intent_features]},
            "intent": {"ids": [np.array([0, 1, 0, 1, 1])]},
            "entities": {"tag_ids": [entity_tag_ids]},
        },
    )
def test_batch_inference(
    batch_size: int,
    number_of_data_points: int,
    expected_number_of_batch_iterations: int,
):
    """Checks how `run_inference` combines the outputs of all batches."""
    model = RasaModel()

    def _fake_batch_predict(
        batch_in: Tuple[np.ndarray],
    ) -> Dict[Text, Union[np.ndarray, Dict[Text, np.ndarray]]]:
        # One output echoes the input batch, the other is constant per batch.
        constant_output = tf.constant(np.array([[1, 2]]), dtype=tf.int32)
        return {
            "dummy_output": batch_in[0],
            "non_input_affected_output": constant_output,
        }

    # Monkeypatch batch predict so that run_inference interface can be tested
    model.batch_predict = _fake_batch_predict

    # Create dummy model data to pass to model
    sentence_features = FeatureArray(
        np.random.rand(number_of_data_points, 2), number_of_dimensions=2
    )
    model_data = RasaModelData(
        label_key=LABEL,
        label_sub_key=IDS,
        data={TEXT: {SENTENCE: [sentence_features]}},
    )

    output = model.run_inference(model_data, batch_size=batch_size)

    # Firstly, the number of data points in dummy_output should be equal
    # to the number of data points sent as input.
    assert output["dummy_output"].shape[0] == number_of_data_points

    # Secondly, the constant output contributes one row per batch, so its
    # first dimension should equal the number of batches passed to the model.
    expected_shape = (expected_number_of_batch_iterations, 2)
    assert output["non_input_affected_output"].shape == expected_shape
def _load_model(
    cls,
    index_label_id_mapping: Dict[int, Text],
    index_tag_id_mapping: Dict[int, Text],
    label_data: RasaModelData,
    meta: Dict[Text, Any],
    data_example: Dict[Text, List[np.ndarray]],
    model_dir: Text,
):
    """Loads the stored model and builds its prediction graph.

    Args:
        index_label_id_mapping: Mapping forwarded to the model.
        index_tag_id_mapping: Mapping forwarded to the model.
        label_data: Label data forwarded to the model.
        meta: Metadata; provides the file name and serves as model config.
        data_example: Feature data used to create the data examples.
        model_dir: Directory that contains the stored model.

    Returns:
        The loaded model, already built for prediction.
    """
    tf_model_file = os.path.join(model_dir, meta.get("file") + ".tf_model")

    # The label key is only set if intent classification is enabled.
    if meta[INTENT_CLASSIFICATION]:
        label_key = LABEL_IDS
    else:
        label_key = None

    model_data_example = RasaModelData(label_key=label_key, data=data_example)

    model_class = cls.model_class(meta)
    model = model_class.load(
        tf_model_file,
        model_data_example,
        data_signature=model_data_example.get_signature(),
        label_data=label_data,
        index_label_id_mapping=index_label_id_mapping,
        index_tag_id_mapping=index_tag_id_mapping,
        config=meta,
    )

    # build the graph for prediction
    text_features = {}
    for feature_name, features in model_data_example.items():
        if TEXT in feature_name:
            text_features[feature_name] = features
    predict_data_example = RasaModelData(label_key=label_key, data=text_features)
    model.build_for_predict(predict_data_example)

    return model
def __init__(
    self,
    data_signature: Dict[Text, List[FeatureSignature]],
    label_data: RasaModelData,
    index_label_id_mapping: Optional[Dict[int, Text]],
    index_tag_id_mapping: Optional[Dict[int, Text]],
    config: Dict[Text, Any],
) -> None:
    """Sets up signatures, label data, label counts and training objects.

    Args:
        data_signature: Signature of the model data.
        label_data: Label data; a prepared batch of it is converted with
            `batch_to_model_data_format` and kept on the instance.
        index_label_id_mapping: Its size is the number of intents; `None`
            counts as zero.
        index_tag_id_mapping: Its size is the number of tags; `None` counts
            as zero.
        config: Model configuration.
    """
    super().__init__(
        name="CRFTransformer",
        random_seed=config[RANDOM_SEED],
        tensorboard_log_dir=config[TENSORBOARD_LOG_DIR],
        tensorboard_log_level=config[TENSORBOARD_LOG_LEVEL],
    )

    self.config = config
    self.data_signature = data_signature
    self._check_data()

    # Prediction only uses the features whose name contains `TEXT`.
    text_signature = {}
    for feature_name, features in data_signature.items():
        if TEXT in feature_name:
            text_signature[feature_name] = features
    self.predict_data_signature = text_signature

    prepared_label_batch = label_data.prepare_batch()
    self.tf_label_data = self.batch_to_model_data_format(
        prepared_label_batch, label_data.get_signature()
    )

    if index_label_id_mapping is not None:
        self._num_intents = len(index_label_id_mapping)
    else:
        self._num_intents = 0

    if index_tag_id_mapping is not None:
        self._num_tags = len(index_tag_id_mapping)
    else:
        self._num_tags = 0

    # tf objects, training
    self._prepare_layers()
    adam_optimizer = tf.keras.optimizers.Adam(config[LEARNING_RATE])
    self._set_optimizer(adam_optimizer)
    self._create_metrics()
    self._update_metrics_to_log()
def preprocess_train_data(self, training_data: TrainingData) -> RasaModelData:
    """Prepares data for training.

    Performs sanity checks on training data, extracts encodings for labels.

    Args:
        training_data: Training data to be preprocessed.

    Returns:
        The model data, or empty model data when no labels were found.
    """
    # Collect all retrieval intents present in the data before filtering
    self.all_retrieval_intents = list(training_data.retrieval_intents)

    if not self.retrieval_intent:
        # retrieval intent was left to its default value
        logger.info(
            "Retrieval intent parameter was left to its default value. This "
            "response selector will be trained on training examples combining "
            "all retrieval intents."
        )
    else:
        training_data = training_data.filter_training_examples(
            lambda example: self.retrieval_intent == example.get(INTENT)
        )

    if self.use_text_as_label:
        label_attribute = RESPONSE
    else:
        label_attribute = INTENT_RESPONSE_KEY

    label_to_index = self._label_id_index_mapping(
        training_data, attribute=label_attribute
    )

    # Responses are taken from the (possibly filtered) training data.
    self.responses = training_data.responses

    if not label_to_index:
        # no labels are present to train
        return RasaModelData()

    self.index_label_id_mapping = self._invert_mapping(label_to_index)
    self._label_data = self._create_label_data(
        training_data, label_to_index, attribute=label_attribute
    )

    model_data = self._create_model_data(
        training_data.intent_examples,
        label_to_index,
        label_attribute=label_attribute,
    )
    self._check_input_dimension_consistency(model_data)

    return model_data