def __init__(self, config, **shared):
    if config is None:
        self.config = self.default_config()
    elif isinstance(config, ProcessingUnitConfig):
        self.config = config
    elif isinstance(config, dict):
        self.config = self.config_type.from_dict(config)
    else:
        raise ValueError("Unexpected config type: %s" % type(config))
    if self.config is not None:
        self.config.set_unit_name(self.unit_name)
    self.builtin_entity_parser = shared.get(BUILTIN_ENTITY_PARSER)
    self.custom_entity_parser = shared.get(CUSTOM_ENTITY_PARSER)
    self.resources = shared.get(RESOURCES)
    self.random_state = check_random_state(shared.get(RANDOM_STATE))
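# A usage sketch of the three config forms accepted above, assuming the
# LogRegIntentClassifier subclass of this processing unit; each form is
# normalized into a config object. The dict round-trip via to_dict() is
# an assumption based on the config classes' from_dict/to_dict pairing.
from snips_nlu.intent_classifier import LogRegIntentClassifier
from snips_nlu.pipeline.configs import LogRegIntentClassifierConfig

clf = LogRegIntentClassifier()                                  # None -> default_config()
clf = LogRegIntentClassifier(LogRegIntentClassifierConfig())    # config instance
clf = LogRegIntentClassifier(
    LogRegIntentClassifierConfig().to_dict())                   # plain dict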
def fit(self, dataset): """Fits the intent classifier with a valid Snips dataset Returns: :class:`LogRegIntentClassifier`: The same instance, trained """ logger.debug("Fitting LogRegIntentClassifier...") dataset = validate_and_format_dataset(dataset) self.load_resources_if_needed(dataset[LANGUAGE]) self.fit_builtin_entity_parser_if_needed(dataset) self.fit_custom_entity_parser_if_needed(dataset) language = dataset[LANGUAGE] random_state = check_random_state(self.config.random_seed) data_augmentation_config = self.config.data_augmentation_config utterances, classes, intent_list = build_training_data( dataset, language, data_augmentation_config, self.resources, random_state) self.intent_list = intent_list if len(self.intent_list) <= 1: return self self.featurizer = Featurizer( config=self.config.featurizer_config, builtin_entity_parser=self.builtin_entity_parser, custom_entity_parser=self.custom_entity_parser, resources=self.resources ) self.featurizer.language = language none_class = max(classes) try: self.featurizer = self.featurizer.fit( dataset, utterances, classes, none_class) except _EmptyDatasetUtterancesError: self.featurizer = None return self x = self.featurizer.transform(utterances) alpha = get_regularization_factor(dataset) self.classifier = SGDClassifier(random_state=random_state, alpha=alpha, **LOG_REG_ARGS) self.classifier.fit(x, classes) logger.debug("%s", DifferedLoggingMessage(self.log_best_features)) return self
def compute_features(self, tokens, drop_out=False):
    """Computes features on the provided tokens

    The *drop_out* parameter enables drop-out on features that have a
    positive drop-out ratio. This should only be used during training.
    """
    cache = [{TOKEN_NAME: token} for token in tokens]
    features = []
    random_state = check_random_state(self.config.random_seed)
    for i in range(len(tokens)):
        token_features = UnupdatableDict()
        for feature in self.features:
            f_drop_out = feature.drop_out
            if drop_out and random_state.rand() < f_drop_out:
                continue
            value = feature.compute(i, cache)
            if value is not None:
                token_features[feature.name] = value
        features.append(token_features)
    return features
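# An isolated illustration of the drop-out test above: each feature with
# ratio r is skipped with probability r on a given token. The feature
# names here are illustrative, not the library's; check_random_state is
# sklearn's, which the snippets above mirror.
from sklearn.utils import check_random_state

rng = check_random_state(42)
drop_out_ratio = 0.3
kept = [name for name in ("ngram_1", "is_digit", "word_cluster")
        if not rng.rand() < drop_out_ratio]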
def train(dataset_path, output_path, config_path=None, verbose=False,
          random_seed=None):
    """Train an NLU engine on the provided dataset"""
    import json
    import logging
    from pathlib import Path
    from snips_nlu import SnipsNLUEngine
    from snips_nlu.cli.utils import set_nlu_logger
    from snips_nlu.common.utils import check_random_state

    if verbose == 1:
        set_nlu_logger(logging.INFO)
    elif verbose >= 2:
        set_nlu_logger(logging.DEBUG)

    with Path(dataset_path).open("r", encoding="utf8") as f:
        dataset = json.load(f)

    config = None
    if config_path is not None:
        with Path(config_path).open("r", encoding="utf8") as f:
            config = json.load(f)

    random_state = check_random_state(random_seed)

    print("Create and train the engine...")
    engine = SnipsNLUEngine(config, random_state=random_state).fit(dataset)

    print("Persisting the engine...")
    engine.persist(output_path)

    print("Saved the trained engine to %s" % output_path)
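# A hedged example of calling the training entry point directly from
# Python; the paths are hypothetical. The same operation is exposed on
# the command line as `snips-nlu train <dataset_path> <output_path>`.
train("path/to/dataset.json", "path/to/trained_engine",
      config_path=None, verbose=1, random_seed=42)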
def fit(self, dataset, intent): """Fits the slot filler Args: dataset (dict): A valid Snips dataset intent (str): The specific intent of the dataset to train the slot filler on Returns: :class:`CRFSlotFiller`: The same instance, trained """ logger.debug("Fitting %s slot filler...", intent) dataset = validate_and_format_dataset(dataset) self.load_resources_if_needed(dataset[LANGUAGE]) self.fit_builtin_entity_parser_if_needed(dataset) self.fit_custom_entity_parser_if_needed(dataset) for factory in self.features_factories: factory.custom_entity_parser = self.custom_entity_parser factory.builtin_entity_parser = self.builtin_entity_parser factory.resources = self.resources self.language = dataset[LANGUAGE] self.intent = intent self.slot_name_mapping = get_slot_name_mapping(dataset, intent) if not self.slot_name_mapping: # No need to train the CRF if the intent has no slots return self random_state = check_random_state(self.config.random_seed) augmented_intent_utterances = augment_utterances( dataset, self.intent, language=self.language, resources=self.resources, random_state=random_state, **self.config.data_augmentation_config.to_dict()) crf_samples = [ utterance_to_sample(u[DATA], self.config.tagging_scheme, self.language) for u in augmented_intent_utterances ] for factory in self.features_factories: factory.fit(dataset, intent) # Ensure that X, Y are safe and that the OUTSIDE label is learnt to # avoid segfault at inference time # pylint: disable=C0103 X = [ self.compute_features(sample[TOKENS], drop_out=True) for sample in crf_samples ] Y = [[tag for tag in sample[TAGS]] for sample in crf_samples] X, Y = _ensure_safe(X, Y) # ensure ascii tags Y = [[_encode_tag(tag) for tag in y] for y in Y] # pylint: enable=C0103 self.crf_model = _get_crf_model(self.config.crf_args) self.crf_model.fit(X, Y) logger.debug("Most relevant features for %s:\n%s", self.intent, DifferedLoggingMessage(self.log_weights)) return self
def __init__(self, factory_config, **shared):
    self.factory_config = factory_config
    self.resources = shared.get(RESOURCES)
    self.builtin_entity_parser = shared.get(BUILTIN_ENTITY_PARSER)
    self.custom_entity_parser = shared.get(CUSTOM_ENTITY_PARSER)
    self.random_state = check_random_state(shared.get(RANDOM_STATE))
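# How the shared RANDOM_STATE value is normalized, following sklearn's
# check_random_state semantics: an int seeds a fresh RandomState, an
# existing RandomState passes through unchanged, and None yields numpy's
# global RandomState.
from sklearn.utils import check_random_state

rng = check_random_state(1234)
assert check_random_state(rng) is rng       # instance passes through
default_rng = check_random_state(None)      # numpy's global RandomState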