Beispiel #1
0
def test_train_val_split(session_data: SessionDataType):
    """Check the sizes and dtypes of the splits returned by ``train_val_split``.

    The split is requested with 2 validation examples, seed 42 and
    ``"intent_ids"`` as the label key. For every key of ``session_data`` both
    splits must hold as many arrays as the input, the training arrays must
    keep the dtype of the input's first entry, and every array must have
    3 rows in the training split and 2 rows in the validation split.
    """
    num_val_examples, seed, label_key = 2, 42, "intent_ids"
    train_split, val_split = train_val_split(
        session_data, num_val_examples, seed, label_key
    )

    for key, arrays in session_data.items():
        # neither split may drop or add arrays for a key
        assert len(arrays) == len(train_split[key])
        assert len(arrays) == len(val_split[key])
        # splitting must not change the element dtype
        for idx, array in enumerate(arrays):
            assert array[0].dtype == train_split[key][idx][0].dtype

    # rows that stay in the training split
    for train_arrays in train_split.values():
        for train_array in train_arrays:
            assert train_array.shape[0] == 3

    # rows moved to the validation split
    for val_arrays in val_split.values():
        for val_array in val_arrays:
            assert val_array.shape[0] == 2
Beispiel #2
0
    def train(
        self,
        training_trackers: List["DialogueStateTracker"],
        domain: "Domain",
        **kwargs: Any,
    ) -> None:
        """Featurize the trackers, then build and train the TensorFlow graph.

        Args:
            training_trackers: trackers handed to ``featurize_for_training``.
            domain: domain used for featurization, for encoding all actions
                and for capping ``num_neg`` at ``domain.num_actions - 1``.
            **kwargs: forwarded unchanged to ``featurize_for_training``.

        The graph, session, iterator, train op, prediction tensor and
        attention weights are stored on ``self``.
        """
        logger.debug("Started training embedding policy.")

        # seed numpy before any data handling
        np.random.seed(self.random_seed)

        # turn the trackers into featurized training data
        featurized = self.featurize_for_training(
            training_trackers, domain, **kwargs
        )

        # encode every label id with the policy's state featurizer
        self._encoded_all_label_ids = (
            self.featurizer.state_featurizer.create_encoded_all_actions(domain)
        )

        # the number of negatives is capped at the number of label ids - 1
        logger.debug(
            "Check if num_neg {} is smaller "
            "than number of label_ids {}, "
            "else set num_neg to the number of label_ids - 1"
            "".format(self.num_neg, domain.num_actions)
        )
        # noinspection PyAttributeOutsideInit
        self.num_neg = min(self.num_neg, domain.num_actions - 1)

        # data that is actually fed to the tf session
        train_session_data = self._create_session_data(
            featurized.X, featurized.y
        )

        # optionally hold out examples for evaluation
        eval_session_data = None
        if self.evaluate_on_num_examples:
            train_session_data, eval_session_data = train_utils.train_val_split(
                train_session_data,
                self.evaluate_on_num_examples,
                self.random_seed,
                label_key="action_ids",
            )

        self.graph = tf.Graph()
        with self.graph.as_default():
            # seed tf inside the new graph
            tf.set_random_seed(self.random_seed)

            # fed at run time, which allows increasing the batch size
            batch_size_ph = tf.placeholder(tf.int64)

            iterator_and_init_ops = train_utils.create_iterator_init_datasets(
                train_session_data,
                eval_session_data,
                batch_size_ph,
                self.batch_strategy,
                label_key="action_ids",
            )
            self._iterator, train_init_op, eval_init_op = iterator_and_init_ops

            self._is_training = tf.placeholder_with_default(False, shape=())

            loss_op, acc_op = self._build_tf_train_graph()

            # optimizer used for training
            self._train_op = tf.train.AdamOptimizer().minimize(loss_op)

            # run the training loop in a fresh session
            self.session = tf.Session(config=self._tf_config)
            train_utils.train_tf_dataset(
                train_init_op,
                eval_init_op,
                batch_size_ph,
                loss_op,
                acc_op,
                self._train_op,
                self.session,
                self._is_training,
                self.epochs,
                self.batch_size,
                self.evaluate_on_num_examples,
                self.evaluate_every_num_epochs,
            )

            # rebuild the graph for prediction
            self.pred_confidence = self._build_tf_pred_graph(train_session_data)

            self.attention_weights = train_utils.extract_attention(
                self.attention_weights
            )
Beispiel #3
0
    def train(
        self,
        training_data: "TrainingData",
        cfg: Optional["RasaNLUModelConfig"] = None,
        **kwargs: Any,
    ) -> None:
        """Preprocess the data set, then build and train the TensorFlow graph.

        Args:
            training_data: data set handed to ``preprocess_train_data``.
            cfg: not used by this method.
            **kwargs: not used by this method.

        Logs an error and returns without training when
        ``_check_enough_labels`` rejects the preprocessed data. Otherwise the
        graph, session, iterator, train op and prediction tensor are stored
        on ``self``.
        """
        logger.debug("Started training embedding classifier.")

        # seed numpy before any data handling
        np.random.seed(self.random_seed)

        train_session_data = self.preprocess_train_data(training_data)

        # bail out early when the label check fails
        if not self._check_enough_labels(train_session_data):
            logger.error(
                "Can not train a classifier. "
                "Need at least 2 different classes. "
                "Skipping training of classifier."
            )
            return

        # optionally hold out examples for evaluation
        eval_session_data = None
        if self.evaluate_on_num_examples:
            train_session_data, eval_session_data = train_utils.train_val_split(
                train_session_data,
                self.evaluate_on_num_examples,
                self.random_seed,
                label_key="label_ids",
            )

        self.graph = tf.Graph()
        with self.graph.as_default():
            # seed tf inside the new graph
            tf.set_random_seed(self.random_seed)

            # fed at run time, which allows increasing the batch size
            batch_size_ph = tf.placeholder(tf.int64)

            iterator_and_init_ops = train_utils.create_iterator_init_datasets(
                train_session_data,
                eval_session_data,
                batch_size_ph,
                self.batch_in_strategy,
                label_key="label_ids",
            )
            self._iterator, train_init_op, eval_init_op = iterator_and_init_ops

            self._is_training = tf.placeholder_with_default(False, shape=())

            loss_op, acc_op = self._build_tf_train_graph(train_session_data)

            # optimizer used for training
            self._train_op = tf.train.AdamOptimizer().minimize(loss_op)

            # run the training loop in a fresh session
            self.session = tf.Session(config=self._tf_config)
            train_utils.train_tf_dataset(
                train_init_op,
                eval_init_op,
                batch_size_ph,
                loss_op,
                acc_op,
                self._train_op,
                self.session,
                self._is_training,
                self.epochs,
                self.batch_in_size,
                self.evaluate_on_num_examples,
                self.evaluate_every_num_epochs,
            )

            # rebuild the graph for prediction
            self.pred_confidence = self._build_tf_pred_graph(train_session_data)
Beispiel #4
0
def test_train_val_split_incorrect_size(session_data: SessionDataType, size):
    """Expect ``train_val_split`` to raise ``ValueError`` for the given ``size``.

    The split is requested with seed 42 and ``"intent_ids"`` as the label key.
    """
    seed, label_key = 42, "intent_ids"
    with pytest.raises(ValueError):
        train_val_split(session_data, size, seed, label_key)