Example #1
0
def test_data_generator_with_increasing_batch_size(model_data: RasaModelData):
    """Check per-epoch batch sizes when `batch_size` is given as `[1, 2]`.

    A balanced, shuffled generator is created for two epochs. For each epoch
    the number of batches and the size of every batch are compared against the
    expected values, and the iterator has to be exhausted afterwards.
    """
    epochs = 2
    # one list of expected batch sizes per epoch
    expected_batch_sizes = [[1, 1, 1, 1, 1], [2, 2, 1]]

    data_generator = RasaBatchDataGenerator(
        model_data,
        batch_size=[1, 2],
        epochs=epochs,
        batch_strategy="balanced",
        shuffle=True,
    )

    for _epoch in range(epochs):
        sizes_in_epoch = expected_batch_sizes[_epoch]
        iterator = iter(data_generator)

        assert len(data_generator) == len(sizes_in_epoch)

        for expected_size in sizes_in_epoch:
            batch, _ = next(iterator)

            assert len(batch) == 11
            assert len(batch[0]) == expected_size

        # all batches of this epoch were consumed
        with pytest.raises(StopIteration):
            next(iterator)

        # let the generator move on to the next epoch
        data_generator.on_epoch_end()
Example #2
0
    def rasa_predict(self, model_data: RasaModelData) -> Dict[Text, tf.Tensor]:
        """Runs a prediction on the given data, wrapping the predict step lazily.

        When the model does not run eagerly, `predict_step` is wrapped in a
        `tf.function` on the first call and the wrapped function is reused on
        later calls.

        Args:
            model_data: The model data to use for prediction.

        Returns:
            Prediction output.
        """
        self._training = False

        # The model may be used for prediction without having been loaded, e.g.
        # directly after training; in that case it is prepared exactly once.
        if not self.prepared_for_prediction:
            self.prepare_for_predict()
            self.prepared_for_prediction = True

        batch_in = RasaBatchDataGenerator.prepare_batch(model_data.data)

        if self._run_eagerly:
            predict_fn = self.predict_step
        else:
            if self._tf_predict_step is None:
                # wrap the predict step once, using the signature of this batch
                self._tf_predict_step = tf.function(
                    self.predict_step,
                    input_signature=self._dynamic_signature(batch_in),
                )
            predict_fn = self._tf_predict_step

        return tf_utils.to_numpy_or_python_type(predict_fn(batch_in))
Example #3
0
def create_data_generators(
    model_data: RasaModelData,
    batch_sizes: Union[int, List[int]],
    epochs: int,
    batch_strategy: Text = SEQUENCE,
    eval_num_examples: int = 0,
    random_seed: Optional[int] = None,
    shuffle: bool = True,
) -> Tuple[RasaBatchDataGenerator, Optional[RasaBatchDataGenerator]]:
    """Build the training data generator and, if requested, a validation one.

    If `eval_num_examples` is greater than zero, `model_data` is split and a
    validation generator is created from the held-out part; both generators
    share the same batch sizes, epochs, batch strategy and shuffle setting.

    Args:
        model_data: The model data to use.
        batch_sizes: The batch size(s).
        epochs: The number of epochs to train.
        batch_strategy: The batch strategy to use.
        eval_num_examples: Number of examples to use for validation data.
        random_seed: The random seed.
        shuffle: Whether to shuffle data inside the data generator.

    Returns:
        The training data generator and optional validation data generator.
    """

    def _build_generator(data: RasaModelData) -> RasaBatchDataGenerator:
        # both generators are configured identically; only the data differs
        return RasaBatchDataGenerator(
            data,
            batch_size=batch_sizes,
            epochs=epochs,
            batch_strategy=batch_strategy,
            shuffle=shuffle,
        )

    if eval_num_examples > 0:
        training_model_data, evaluation_model_data = model_data.split(
            eval_num_examples, random_seed
        )
        validation_data_generator = _build_generator(evaluation_model_data)
    else:
        training_model_data = model_data
        validation_data_generator = None

    return _build_generator(training_model_data), validation_data_generator
Example #4
0
    def _compile_and_fit(
        self, data_example: Dict[Text, Dict[Text, List[FeatureArray]]]
    ) -> None:
        """Compiles the model and fits it once on the given data example.

        Args:
            data_example: a data example that is stored with the ML component.
        """
        optimizer = tf.keras.optimizers.Adam(self.config[LEARNING_RATE])
        self.compile(optimizer=optimizer)

        # label keys are only passed on when intent classification is enabled
        if self.config[INTENT_CLASSIFICATION]:
            label_key, label_sub_key = LABEL_KEY, LABEL_SUB_KEY
        else:
            label_key = label_sub_key = None

        model_data = RasaModelData(
            label_key=label_key,
            label_sub_key=label_sub_key,
            data=data_example,
        )
        self._update_data_signatures(model_data)

        self.fit(RasaBatchDataGenerator(model_data, batch_size=1), verbose=False)
Example #5
0
    def load(
        cls,
        model_file_name: Text,
        model_data_example: RasaModelData,
        predict_data_example: Optional[RasaModelData] = None,
        finetune_mode: bool = False,
        *args: Any,
        **kwargs: Any,
    ) -> "RasaModel":
        """Creates a model and restores its weights from the given file.

        The model is instantiated with `*args` / `**kwargs`, compiled, fitted on
        `model_data_example` and only then populated with the stored weights.

        Args:
            model_file_name: Path to file containing model weights.
            model_data_example: Example data point to construct the model architecture.
            predict_data_example: Example data point to speed up prediction during
              inference.
            finetune_mode: Indicates whether to load the model for further finetuning.
            *args: Any other non key-worded arguments.
            **kwargs: Any other key-worded arguments.

        Returns:
            Loaded model with weights appropriately set.
        """
        logger.debug(
            f"Loading the model from {model_file_name} "
            f"with finetune_mode={finetune_mode}..."
        )

        # start from an empty model
        model = cls(*args, **kwargs)

        # the learning rate comes from the `config` keyword argument if present
        model_config = kwargs.get("config", {})
        optimizer = tf.keras.optimizers.Adam(model_config.get(LEARNING_RATE, 0.001))
        model.compile(optimizer=optimizer)

        # need to train on 1 example to build weights of the correct size
        model.fit(
            RasaBatchDataGenerator(model_data_example, batch_size=1), verbose=False
        )

        # replace the freshly built weights with the trained ones
        model.load_weights(model_file_name)

        # Predicting on one data example now speeds up prediction during
        # inference: the first prediction always takes a bit longer to trace
        # the tf function. Skipped when finetuning or without an example.
        skip_warm_up = finetune_mode or not predict_data_example
        if not skip_warm_up:
            model.run_inference(predict_data_example)

        logger.debug("Finished loading the model.")
        return model
Example #6
0
def test_data_generator_with_fixed_batch_size(model_data: RasaModelData):
    """Check the batch sizes of a single epoch with a fixed `batch_size` of 2.

    A balanced, shuffled generator is created; the number of batches and the
    size of every batch are compared against the expected values, and the
    iterator has to be exhausted afterwards.
    """
    expected_batch_sizes = [2, 2, 1]

    data_generator = RasaBatchDataGenerator(
        model_data,
        batch_size=2,
        epochs=1,
        batch_strategy="balanced",
        shuffle=True,
    )
    iterator = iter(data_generator)

    assert len(data_generator) == len(expected_batch_sizes)

    for expected_size in expected_batch_sizes:
        batch, _ = next(iterator)
        assert len(batch) == 11
        assert len(batch[0]) == expected_size

    # all batches were consumed
    with pytest.raises(StopIteration):
        next(iterator)
Example #7
0
    async def test_gen_batch(self, trained_policy: TEDPolicy,
                             default_domain: Domain, stories_path: Path):
        """Check the first batch generated from the policy's training data.

        Model data is created from trackers trained on `stories_path`. The first
        batch is inspected once for an unshuffled "sequence" generator and once
        for a shuffled "balanced" generator.
        """
        trackers = await tests.core.test_policies.train_trackers(
            default_domain, stories_path, augmentation_factor=0
        )
        interpreter = RegexInterpreter()
        features, label_ids, entity_tags = trained_policy._featurize_for_training(
            trackers, default_domain, interpreter
        )
        _label_data, all_labels = trained_policy._create_label_data(
            default_domain, interpreter
        )
        model_data = trained_policy._create_model_data(
            features, label_ids, entity_tags, all_labels
        )
        batch_size = 2

        def check_first_batch(shuffle, batch_strategy, check_masks):
            # Draw the first batch of a fresh generator and verify its shapes.
            generator = RasaBatchDataGenerator(
                model_data,
                batch_size=batch_size,
                shuffle=shuffle,
                batch_strategy=batch_strategy,
            )
            batch_in, _ = next(iter(generator))

            # model data keys were sorted, so the order is alphabetical
            (
                action_name_mask,
                _action_name_sentence_indices,
                _action_name_sentence_data,
                action_name_sentence_shape,
                dialogue_length,
                entities_mask,
                _entities_sentence_indices,
                _entities_sentence_data,
                entities_sentence_shape,
                intent_mask,
                _intent_sentence_indices,
                _intent_sentence_data,
                intent_sentence_shape,
                label_ids_in_batch,
                slots_mask,
                _slots_sentence_indices,
                _slots_sentence_data,
                slots_sentence_shape,
            ) = batch_in

            assert label_ids_in_batch.shape[0] == batch_size
            assert dialogue_length.shape[0] == batch_size

            if check_masks:
                # batch and dialogue dimensions are NOT combined for masks
                for mask in (slots_mask, intent_mask, entities_mask,
                             action_name_mask):
                    assert mask.shape[0] == batch_size

            sentence_shapes = (
                intent_sentence_shape,
                action_name_sentence_shape,
                entities_sentence_shape,
                slots_sentence_shape,
            )
            # some features might be "fake", so their sequence length is `0`
            seq_len = max([shape[1] for shape in sentence_shapes])
            for shape in sentence_shapes:
                assert shape[1] == seq_len or shape[1] == 0

        check_first_batch(shuffle=False, batch_strategy="sequence",
                          check_masks=True)
        # the mask dimensions are only checked for the "sequence" strategy
        check_first_batch(shuffle=True, batch_strategy="balanced",
                          check_masks=False)