def test_flexible_nlu_pipeline():
    message = Message("This is a test message.", data={"intent": "test"})
    training_data = TrainingData([message, message, message, message, message])

    tokenizer = WhitespaceTokenizer()
    tokenizer.train(training_data)

    featurizer = CountVectorsFeaturizer(
        component_config={FEATURIZER_CLASS_ALIAS: "cvf_word"}
    )
    featurizer.train(training_data)

    featurizer = CountVectorsFeaturizer(
        component_config={
            FEATURIZER_CLASS_ALIAS: "cvf_char",
            "min_ngram": 1,
            "max_ngram": 3,
            "analyzer": "char_wb",
        }
    )
    featurizer.train(training_data)

    featurizer = LexicalSyntacticFeaturizer({})
    featurizer.train(training_data)

    assert len(message.features) == 6
    assert message.features[0].origin == "cvf_word"
    assert message.features[0].type == FEATURE_TYPE_SEQUENCE
    assert message.features[1].origin == "cvf_word"
    assert message.features[1].type == FEATURE_TYPE_SENTENCE
    # cvf word is also extracted for the intent
    assert message.features[2].origin == "cvf_word"
    assert message.features[2].type == FEATURE_TYPE_SEQUENCE
    assert message.features[3].origin == "cvf_char"
    assert message.features[3].type == FEATURE_TYPE_SEQUENCE
    assert message.features[4].origin == "cvf_char"
    assert message.features[4].type == FEATURE_TYPE_SENTENCE
    assert message.features[5].origin == "LexicalSyntacticFeaturizer"
    assert message.features[5].type == FEATURE_TYPE_SEQUENCE

    sequence_feature_dim = (
        message.features[0].features.shape[1] + message.features[5].features.shape[1]
    )
    sentence_feature_dim = message.features[0].features.shape[1]

    classifier = DIETClassifier(
        component_config={FEATURIZERS: ["cvf_word", "LexicalSyntacticFeaturizer"]}
    )
    model_data = classifier.preprocess_train_data(training_data)

    assert len(model_data.get(TEXT_SENTENCE_FEATURES)) == 1
    assert len(model_data.get(TEXT_SEQUENCE_FEATURES)) == 1
    assert len(model_data.get(LABEL_SEQUENCE_FEATURES)) == 1
    assert len(model_data.get(LABEL_SENTENCE_FEATURES)) == 0
    assert model_data.get(TEXT_SEQUENCE_FEATURES)[0][0].shape == (
        5,
        sequence_feature_dim,
    )
    assert model_data.get(TEXT_SENTENCE_FEATURES)[0][0].shape == (
        1,
        sentence_feature_dim,
    )
    assert model_data.get(LABEL_SEQUENCE_FEATURES)[0][0].shape == (1, 1)
def test_model_data_signature_with_entities(
    messages: List[Message], entity_expected: bool
):
    classifier = DIETClassifier({"BILOU_flag": False})
    training_data = TrainingData(messages)

    # create tokens for entity parsing inside DIET
    tokenizer = WhitespaceTokenizer()
    tokenizer.train(training_data)

    model_data = classifier.preprocess_train_data(training_data)
    entity_exists = "entities" in model_data.get_signature().keys()
    assert entity_exists == entity_expected
def test_flexible_nlu_pipeline():
    message = Message("This is a test message.", data={"intent": "test"})
    training_data = TrainingData([message, message, message, message, message])

    tokenizer = WhitespaceTokenizer()
    tokenizer.train(training_data)

    featurizer = CountVectorsFeaturizer(
        component_config={FEATURIZER_CLASS_ALIAS: "cvf_word"}
    )
    featurizer.train(training_data)

    featurizer = CountVectorsFeaturizer(
        component_config={
            FEATURIZER_CLASS_ALIAS: "cvf_char",
            "min_ngram": 1,
            "max_ngram": 3,
            "analyzer": "char_wb",
        }
    )
    featurizer.train(training_data)

    featurizer = LexicalSyntacticFeaturizer({})
    featurizer.train(training_data)

    assert len(message.features) == 4
    assert message.features[0].origin == "cvf_word"
    # cvf word is also extracted for the intent
    assert message.features[1].origin == "cvf_word"
    assert message.features[2].origin == "cvf_char"
    assert message.features[3].origin == "LexicalSyntacticFeaturizer"

    feature_dim = (
        message.features[0].features.shape[1] + message.features[3].features.shape[1]
    )

    classifier = DIETClassifier(
        component_config={FEATURIZERS: ["cvf_word", "LexicalSyntacticFeaturizer"]}
    )
    model_data = classifier.preprocess_train_data(training_data)

    assert len(model_data.get("text_features")) == 1
    assert len(model_data.get("label_features")) == 1
    assert model_data.get("text_features")[0][0].shape == (6, feature_dim)
    assert model_data.get("label_features")[0][0].shape == (1, 1)
def test_removing_label_sparse_feature_sizes(
    initial_sparse_feature_sizes: Dict[Text, Dict[Text, List[int]]],
    final_sparse_feature_sizes: Dict[Text, Dict[Text, List[int]]],
    label_attribute: Text,
):
    """Tests if the label attribute is removed from the sparse feature sizes collection."""
    feature_sizes = DIETClassifier._remove_label_sparse_feature_sizes(
        sparse_feature_sizes=initial_sparse_feature_sizes,
        label_attribute=label_attribute,
    )
    assert feature_sizes == final_sparse_feature_sizes
def inner(
    diet: DIETClassifier,
    pipeline: Optional[List[Dict[Text, Any]]] = None,
    training_data: str = nlu_data_path,
    message_text: Text = "Rasa is great!",
    expect_intent: bool = True,
) -> Tuple[DIETClassifier, Message]:
    if not pipeline:
        pipeline = [
            {"component": WhitespaceTokenizer},
            {"component": CountVectorsFeaturizer},
        ]

    training_data, loaded_pipeline = train_and_preprocess(pipeline, training_data)
    diet.train(training_data=training_data)

    message = Message(data={TEXT: message_text})
    message = process_message(loaded_pipeline, message)

    message2 = copy.deepcopy(message)
    classified_message = diet.process([message])[0]

    if expect_intent:
        assert classified_message.data["intent"]["name"]

    loaded_diet = create_diet(diet.component_config, load=True)

    classified_message2 = loaded_diet.process([message2])[0]

    assert classified_message2.fingerprint() == classified_message.fingerprint()

    return loaded_diet, classified_message
def test_compute_default_label_features():
    label_features = [
        Message(data={TEXT: "test a"}),
        Message(data={TEXT: "test b"}),
        Message(data={TEXT: "test c"}),
        Message(data={TEXT: "test d"}),
    ]

    output = DIETClassifier._compute_default_label_features(label_features)
    output = output[0]

    for i, o in enumerate(output):
        assert isinstance(o, np.ndarray)
        assert o[0][i] == 1
        assert o.shape == (1, len(label_features))
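# Illustration only (not part of the original suite): a standalone sketch of what
# the "default" label features checked above amount to when no label featurizer is
# present -- label i becomes a one-hot row of length num_labels, stored as a
# (1, num_labels) array per label. Assumes numpy is imported as `np`, as above.
def test_default_label_features_one_hot_sketch():
    num_labels = 4
    sketch = [np.eye(num_labels)[i].reshape(1, num_labels) for i in range(num_labels)]

    for i, o in enumerate(sketch):
        assert o.shape == (1, num_labels)
        assert o[0][i] == 1.0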
def inner(
    config: Dict[Text, Any], load: bool = False, finetune: bool = False
) -> DIETClassifier:
    if load:
        constructor = DIETClassifier.load
    else:
        constructor = DIETClassifier.create

    default_execution_context.is_finetuning = finetune
    return constructor(
        config=rasa.utils.common.override_defaults(
            DIETClassifier.get_default_config(), config
        ),
        model_storage=default_model_storage,
        execution_context=default_execution_context,
        resource=default_diet_resource,
    )
def inner(
    config: Dict[Text, Any], load: bool = False, finetune: bool = False
) -> DIETClassifier:
    if load:
        constructor = DIETClassifier.load
    else:
        constructor = DIETClassifier.create

    default_execution_context.is_finetuning = finetune
    return constructor(
        config={**DIETClassifier.get_default_config(), **config},
        model_storage=default_model_storage,
        execution_context=default_execution_context,
        resource=default_diet_resource,
    )
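# Illustration only (not part of the original suite): a minimal sketch of how the
# `create_diet` factory fixture built from the function above is typically
# consumed in a test. The config values are arbitrary; `EPOCHS` and
# `ENTITY_RECOGNITION` are the same config constants used elsewhere in this file,
# and the surrounding fixtures are assumed to exist.
def test_create_diet_factory_sketch(create_diet):
    diet = create_diet({EPOCHS: 1, ENTITY_RECOGNITION: False})
    assert isinstance(diet, DIETClassifier)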
def get_default_config() -> Dict[Text, Any]:
    """The component's default config (see parent class for full docstring)."""
    return {
        **DIETClassifier.get_default_config(),
        # ## Architecture of the used neural network
        # Hidden layer sizes for layers before the embedding layers for user message
        # and labels.
        # The number of hidden layers is equal to the length of the corresponding
        # list.
        HIDDEN_LAYERS_SIZES: {TEXT: [256, 128], LABEL: [256, 128]},
        # Whether to share the hidden layer weights between input words
        # and responses
        SHARE_HIDDEN_LAYERS: False,
        # Number of units in transformer
        TRANSFORMER_SIZE: None,
        # Number of transformer layers
        NUM_TRANSFORMER_LAYERS: 0,
        # Number of attention heads in transformer
        NUM_HEADS: 4,
        # If 'True' use key relative embeddings in attention
        KEY_RELATIVE_ATTENTION: False,
        # If 'True' use value relative embeddings in attention
        VALUE_RELATIVE_ATTENTION: False,
        # Max position for relative embeddings. Only in effect if key-
        # or value relative attention are turned on
        MAX_RELATIVE_POSITION: 5,
        # Use a unidirectional or bidirectional encoder.
        UNIDIRECTIONAL_ENCODER: False,
        # ## Training parameters
        # Initial and final batch sizes:
        # Batch size will be linearly increased for each epoch.
        BATCH_SIZES: [64, 256],
        # Strategy used when creating batches.
        # Can be either 'sequence' or 'balanced'.
        BATCH_STRATEGY: BALANCED,
        # Number of epochs to train
        EPOCHS: 300,
        # Set random seed to any 'int' to get reproducible results
        RANDOM_SEED: None,
        # Initial learning rate for the optimizer
        LEARNING_RATE: 0.001,
        # ## Parameters for embeddings
        # Dimension size of embedding vectors
        EMBEDDING_DIMENSION: 20,
        # Default dense dimension to use if no dense features are present.
        DENSE_DIMENSION: {TEXT: 512, LABEL: 512},
        # Default dimension to use for concatenating sequence and sentence features.
        CONCAT_DIMENSION: {TEXT: 512, LABEL: 512},
        # The number of incorrect labels. The algorithm will minimize
        # their similarity to the user input during training.
        NUM_NEG: 20,
        # Type of similarity measure to use, either 'auto' or 'cosine' or 'inner'.
        SIMILARITY_TYPE: AUTO,
        # The type of the loss function, either 'cross_entropy' or 'margin'.
        LOSS_TYPE: CROSS_ENTROPY,
        # Number of top actions for which confidences should be predicted.
        # Set to 0 if confidences for all intents should be reported.
        RANKING_LENGTH: 10,
        # Determines whether the confidences of the chosen top actions should be
        # renormalized so that they sum up to 1. By default, we do not renormalize
        # and return the confidences for the top actions as is.
        # Note that renormalization only makes sense if confidences are generated
        # via `softmax`.
        RENORMALIZE_CONFIDENCES: False,
        # Indicates how similar the algorithm should try to make embedding vectors
        # for correct labels.
        # Should be 0.0 < ... < 1.0 for 'cosine' similarity type.
        MAX_POS_SIM: 0.8,
        # Maximum negative similarity for incorrect labels.
        # Should be -1.0 < ... < 1.0 for 'cosine' similarity type.
        MAX_NEG_SIM: -0.4,
        # If 'True' the algorithm only minimizes maximum similarity over
        # incorrect intent labels, used only if 'loss_type' is set to 'margin'.
        USE_MAX_NEG_SIM: True,
        # Scale loss inverse proportionally to confidence of correct prediction
        SCALE_LOSS: True,
        # ## Regularization parameters
        # The scale of regularization
        REGULARIZATION_CONSTANT: 0.002,
        # Fraction of trainable weights in internal layers.
        CONNECTION_DENSITY: 1.0,
        # The scale of how important it is to minimize the maximum similarity
        # between embeddings of different labels.
        NEGATIVE_MARGIN_SCALE: 0.8,
        # Dropout rate for encoder
        DROP_RATE: 0.2,
        # Dropout rate for attention
        DROP_RATE_ATTENTION: 0,
        # If 'True' apply dropout to sparse input tensors
        SPARSE_INPUT_DROPOUT: False,
        # If 'True' apply dropout to dense input tensors
        DENSE_INPUT_DROPOUT: False,
        # ## Evaluation parameters
        # How often to calculate validation accuracy.
        # Small values may hurt performance, e.g. model accuracy.
        EVAL_NUM_EPOCHS: 20,
        # How many examples to use for the hold-out validation set.
        # Large values may hurt performance, e.g. model accuracy.
        EVAL_NUM_EXAMPLES: 0,
        # ## Selector config
        # If 'True' random tokens of the input message will be masked and the model
        # should predict those tokens.
        MASKED_LM: False,
        # Name of the intent for which this response selector is to be trained
        RETRIEVAL_INTENT: None,
        # Boolean flag to check if the actual text of the response
        # should be used as the ground truth label for training the model.
        USE_TEXT_AS_LABEL: False,
        # If you want to use tensorboard to visualize training
        # and validation metrics,
        # set this option to a valid output directory.
        TENSORBOARD_LOG_DIR: None,
        # Define when training metrics for tensorboard should be logged.
        # Either after every epoch or for every training step.
        # Valid values: 'epoch' and 'batch'
        TENSORBOARD_LOG_LEVEL: "epoch",
        # Specify what features to use as sequence and sentence features.
        # By default all features in the pipeline are used.
        FEATURIZERS: [],
        # Perform model checkpointing
        CHECKPOINT_MODEL: False,
        # If 'True' applies sigmoid on all similarity terms and adds it
        # to the loss function to ensure that similarity values are
        # approximately bounded. Used inside cross-entropy loss only.
        CONSTRAIN_SIMILARITIES: False,
        # Model confidence to be returned during inference. Currently, the only
        # possible value is `softmax`.
        MODEL_CONFIDENCE: SOFTMAX,
    }
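# Illustration only: overriding a few of the defaults above can follow the same
# dict-merge pattern used in the factory earlier in this section. The function
# name is hypothetical and the chosen values are arbitrary examples, not
# recommendations.
def example_config_override() -> Dict[Text, Any]:
    return {
        **get_default_config(),
        EPOCHS: 100,                   # train for fewer epochs than the default 300
        RETRIEVAL_INTENT: "faq",       # restrict training to one retrieval intent
        CONSTRAIN_SIMILARITIES: True,  # bound similarity values in the loss
    }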
    assert result["predictions"][1] == prediction

    assert os.path.exists(
        os.path.join(report_folder, "response_selection_confusion_matrix.png")
    )
    assert os.path.exists(
        os.path.join(report_folder, "response_selection_histogram.png")
    )
    assert not os.path.exists(
        os.path.join(report_folder, "response_selection_errors.json")
    )
    assert os.path.exists(
        os.path.join(report_folder, "response_selection_successes.json")
    )


@pytest.mark.parametrize(
    "components, expected_extractors",
    [
        ([DIETClassifier({ENTITY_RECOGNITION: False})], set()),
        ([DIETClassifier({ENTITY_RECOGNITION: True})], {"DIETClassifier"}),
        ([CRFEntityExtractor()], {"CRFEntityExtractor"}),
        (
            [SpacyEntityExtractor(), CRFEntityExtractor()],
            {"SpacyEntityExtractor", "CRFEntityExtractor"},
        ),
        ([ResponseSelector()], set()),
    ],
)
def test_get_entity_extractors(components, expected_extractors):
    mock_interpreter = Interpreter(components, None)
    extractors = get_entity_extractors(mock_interpreter)
    assert extractors == expected_extractors
def test_check_labels_features_exist(messages, expected):
    attribute = TEXT
    classifier = DIETClassifier()
    assert classifier._check_labels_features_exist(messages, attribute) == expected