Example #1
0
    def fit(self, dataset, intent):
        """Train the slot filler on one intent of the dataset

        The dataset is validated, the builtin and custom entity parser
        fitting hooks are called, the utterances of the intent are augmented
        and converted into CRF samples, every feature factory is fitted and
        the CRF model is finally trained on the computed features and tags.

        Args:
            dataset (dict): A valid Snips dataset
            intent (str): Name of the intent whose utterances are used to
                train the slot filler

        Returns:
            :class:`CRFSlotFiller`: This very instance. When the intent has
            no slot, it is returned before any CRF model gets created.
        """
        logger.debug("Fitting %s slot filler...", intent)
        dataset = validate_and_format_dataset(dataset)
        self.fit_builtin_entity_parser_if_needed(dataset)
        self.fit_custom_entity_parser_if_needed(dataset)
        self.language = dataset[LANGUAGE]
        self.intent = intent
        self.slot_name_mapping = get_slot_name_mapping(dataset, intent)

        # An intent without slots does not require any CRF training
        if not self.slot_name_mapping:
            return self

        rng = check_random_state(self.config.random_seed)
        augmentation_kwargs = self.config.data_augmentation_config.to_dict()
        utterances = augment_utterances(
            dataset, self.intent, language=self.language, random_state=rng,
            **augmentation_kwargs)

        samples = []
        for utterance in utterances:
            samples.append(utterance_to_sample(
                utterance[DATA], self.config.tagging_scheme, self.language))

        for feature_factory in self.features_factories:
            feature_factory.fit(dataset, intent)

        features = [self.compute_features(sample[TOKENS], drop_out=True)
                    for sample in samples]
        tags = [list(sample[TAGS]) for sample in samples]

        # The training data goes through _ensure_safe so that the OUTSIDE
        # label is learnt, which avoids a segfault at inference time
        features, tags = _ensure_safe(features, tags)

        # Tags are encoded in order to be ascii
        encoded_tags = [[_encode_tag(tag) for tag in sequence]
                        for sequence in tags]

        self.crf_model = _get_crf_model(self.config.crf_args)
        self.crf_model.fit(features, encoded_tags)

        logger.debug("Most relevant features for %s:\n%s", self.intent,
                     DifferedLoggingMessage(self.log_weights))
        return self
Example #2
0
    def test_utterance_to_sample(self, mocked_positive_tagging):
        """Check tokens and IO tags built from a two-chunk utterance"""
        # Given
        def mock_positive_tagging(_, slot, slot_size):
            # One INSIDE tag per token of the slot
            return [INSIDE_PREFIX + slot] * slot_size

        mocked_positive_tagging.side_effect = mock_positive_tagging

        language = LANGUAGE_EN
        slot_name = "animal"
        inside_tag = INSIDE_PREFIX + slot_name
        query_data = [
            {"text": "i am a "},
            {"text": "beautiful bird", "slot_name": slot_name},
        ]
        expected_sample = {
            "tokens": [
                Token(value='i', start=0, end=1),
                Token(value='am', start=2, end=4),
                Token(value='a', start=5, end=6),
                Token(value='beautiful', start=7, end=16),
                Token(value='bird', start=17, end=21),
            ],
            "tags": [OUTSIDE, OUTSIDE, OUTSIDE, inside_tag, inside_tag],
        }

        # When
        sample = utterance_to_sample(query_data, TaggingScheme.IO, language)

        # Then
        self.assertEqual(sample, expected_sample)
Example #3
0
    def test_utterance_to_sample_with_partial_slots(self,
                                                    mocked_positive_tagging):
        """Check the sample built when a slot chunk starts inside a word

        The first chunk ends with "b" and the slot chunk starts with
        "eautiful", so both pieces are expected as separate tokens.
        """
        # Given
        def mock_positive_tagging(_, slot, slot_size):
            # One INSIDE tag per token of the slot
            return [INSIDE_PREFIX + slot] * slot_size

        mocked_positive_tagging.side_effect = mock_positive_tagging

        language = LANGUAGE_EN
        slot_name = "animal"
        inside_tag = INSIDE_PREFIX + slot_name
        query_data = [
            {"text": "i am a b"},
            {"text": "eautiful bird", "slot_name": slot_name},
        ]
        expected_sample = {
            "tokens": [
                Token(value='i', start=0, end=1),
                Token(value='am', start=2, end=4),
                Token(value='a', start=5, end=6),
                Token(value='b', start=7, end=8),
                Token(value='eautiful', start=8, end=16),
                Token(value='bird', start=17, end=21),
            ],
            "tags": [OUTSIDE, OUTSIDE, OUTSIDE, OUTSIDE,
                     inside_tag, inside_tag],
        }

        # When
        sample = utterance_to_sample(query_data, TaggingScheme.IO, language)

        # Then
        mocked_positive_tagging.assert_called()
        self.assertEqual(sample, expected_sample)
Example #4
0
    def fit(self, dataset, intent, verbose=False):
        """Train the slot filler on one intent of the dataset

        The dataset is validated, the utterances of the intent are augmented
        and converted into CRF samples, every feature factory is fitted and
        the CRF model is then trained on the computed features and tags.

        Args:
            dataset (dict): A valid Snips dataset
            intent (str): Name of the intent whose utterances are used to
                train the slot filler
            verbose (bool, optional): When *True*, the weights of the CRF
                are printed after the training

        Returns:
            :class:`CRFSlotFiller`: This very instance, once trained
        """
        dataset = validate_and_format_dataset(dataset)
        self.intent = intent
        self.slot_name_mapping = get_slot_name_mapping(dataset, intent)
        self.language = dataset[LANGUAGE]

        rng = check_random_state(self.config.random_seed)
        augmentation_kwargs = self.config.data_augmentation_config.to_dict()
        utterances = augment_utterances(
            dataset, self.intent, language=self.language, random_state=rng,
            **augmentation_kwargs)

        samples = []
        for utterance in utterances:
            samples.append(utterance_to_sample(
                utterance[DATA], self.config.tagging_scheme, self.language))

        for feature_factory in self.features_factories:
            feature_factory.fit(dataset, intent)

        features = [self.compute_features(sample[TOKENS], drop_out=True)
                    for sample in samples]
        # Tags are encoded in order to be ascii
        encoded_tags = [[_encode_tag(tag) for tag in sample[TAGS]]
                        for sample in samples]

        self.crf_model = _get_crf_model(self.config.crf_args)
        self.crf_model.fit(features, encoded_tags)

        if verbose:
            self.print_weights()
        return self
Example #5
0
    def fit(self, dataset, intent):
        """Train the slot filler on one intent of the dataset

        The dataset is validated, the utterances of the intent are augmented
        and converted into CRF samples, every feature factory is fitted and
        the CRF model is then trained on the computed features and tags.

        Args:
            dataset (dict): A valid Snips dataset
            intent (str): Name of the intent whose utterances are used to
                train the slot filler

        Returns:
            :class:`CRFSlotFiller`: This very instance, once trained
        """
        logger.debug("Fitting %s slot filler...", intent)
        dataset = validate_and_format_dataset(dataset)
        self.intent = intent
        self.slot_name_mapping = get_slot_name_mapping(dataset, intent)
        self.language = dataset[LANGUAGE]

        rng = check_random_state(self.config.random_seed)
        augmentation_kwargs = self.config.data_augmentation_config.to_dict()
        utterances = augment_utterances(
            dataset, self.intent, language=self.language, random_state=rng,
            **augmentation_kwargs)

        samples = []
        for utterance in utterances:
            samples.append(utterance_to_sample(
                utterance[DATA], self.config.tagging_scheme, self.language))

        for feature_factory in self.features_factories:
            feature_factory.fit(dataset, intent)

        features = [self.compute_features(sample[TOKENS], drop_out=True)
                    for sample in samples]
        # Tags are encoded in order to be ascii
        encoded_tags = [[_encode_tag(tag) for tag in sample[TAGS]]
                        for sample in samples]

        self.crf_model = _get_crf_model(self.config.crf_args)
        self.crf_model.fit(features, encoded_tags)

        # The weights message is wrapped in DifferedLoggingMessage rather
        # than being passed as an already formatted string
        logger.debug("Most relevant features for %s:\n%s", self.intent,
                     DifferedLoggingMessage(self.log_weights))
        return self
    def fit(self, dataset, intent, verbose=False):
        """Train the slot filler on one intent of the dataset

        Steps: dataset validation, augmentation of the intent utterances,
        conversion to CRF samples, fitting of the feature factories and
        training of the CRF model on the computed features and tags.

        Args:
            dataset (dict): A valid Snips dataset
            intent (str): Name of the intent whose utterances are used to
                train the slot filler
            verbose (bool, optional): When *True*, the weights of the CRF
                are printed after the training

        Returns:
            :class:`CRFSlotFiller`: This very instance, once trained
        """
        dataset = validate_and_format_dataset(dataset)
        self.intent = intent
        self.slot_name_mapping = get_slot_name_mapping(dataset, intent)
        self.language = dataset[LANGUAGE]

        seeded_state = check_random_state(self.config.random_seed)
        extra_options = self.config.data_augmentation_config.to_dict()
        augmented = augment_utterances(
            dataset,
            self.intent,
            language=self.language,
            random_state=seeded_state,
            **extra_options)

        training_samples = []
        for utterance in augmented:
            training_samples.append(
                utterance_to_sample(utterance[DATA],
                                    self.config.tagging_scheme,
                                    self.language))

        for feature_factory in self.features_factories:
            feature_factory.fit(dataset, intent)

        inputs = [
            self.compute_features(sample[TOKENS], drop_out=True)
            for sample in training_samples
        ]
        # Tags are encoded in order to be ascii
        targets = [
            [_encode_tag(tag) for tag in sample[TAGS]]
            for sample in training_samples
        ]

        self.crf_model = _get_crf_model(self.config.crf_args)
        self.crf_model.fit(inputs, targets)

        if verbose:
            self.print_weights()
        return self