Example 1
def test_flexible_nlu_pipeline():
    message = Message("This is a test message.", data={"intent": "test"})
    training_data = TrainingData([message, message, message, message, message])

    tokenizer = WhitespaceTokenizer()
    tokenizer.train(training_data)

    featurizer = CountVectorsFeaturizer(
        component_config={FEATURIZER_CLASS_ALIAS: "cvf_word"})
    featurizer.train(training_data)

    featurizer = CountVectorsFeaturizer(
        component_config={
            FEATURIZER_CLASS_ALIAS: "cvf_char",
            "min_ngram": 1,
            "max_ngram": 3,
            "analyzer": "char_wb",
        })
    featurizer.train(training_data)

    featurizer = LexicalSyntacticFeaturizer({})
    featurizer.train(training_data)

    assert len(message.features) == 6
    assert message.features[0].origin == "cvf_word"
    assert message.features[0].type == FEATURE_TYPE_SEQUENCE
    assert message.features[1].origin == "cvf_word"
    assert message.features[1].type == FEATURE_TYPE_SENTENCE
    # cvf word is also extracted for the intent
    assert message.features[2].origin == "cvf_word"
    assert message.features[2].type == FEATURE_TYPE_SEQUENCE
    assert message.features[3].origin == "cvf_char"
    assert message.features[3].type == FEATURE_TYPE_SEQUENCE
    assert message.features[4].origin == "cvf_char"
    assert message.features[4].type == FEATURE_TYPE_SENTENCE
    assert message.features[5].origin == "LexicalSyntacticFeaturizer"
    assert message.features[5].type == FEATURE_TYPE_SEQUENCE

    sequence_feature_dim = (message.features[0].features.shape[1] +
                            message.features[5].features.shape[1])
    sentence_feature_dim = message.features[0].features.shape[1]

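    # train DIET only on the features produced by "cvf_word" and LexicalSyntacticFeaturizer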
    classifier = DIETClassifier(component_config={
        FEATURIZERS: ["cvf_word", "LexicalSyntacticFeaturizer"]
    })
    model_data = classifier.preprocess_train_data(training_data)

    assert len(model_data.get(TEXT_SENTENCE_FEATURES)) == 1
    assert len(model_data.get(TEXT_SEQUENCE_FEATURES)) == 1
    assert len(model_data.get(LABEL_SEQUENCE_FEATURES)) == 1
    assert len(model_data.get(LABEL_SENTENCE_FEATURES)) == 0
    assert model_data.get(TEXT_SEQUENCE_FEATURES)[0][0].shape == (
        5,
        sequence_feature_dim,
    )
    assert model_data.get(TEXT_SENTENCE_FEATURES)[0][0].shape == (
        1,
        sentence_feature_dim,
    )
    assert model_data.get(LABEL_SEQUENCE_FEATURES)[0][0].shape == (1, 1)
Example 2
def test_model_data_signature_with_entities(messages: List[Message],
                                            entity_expected: bool):
    classifier = DIETClassifier({"BILOU_flag": False})
    training_data = TrainingData(messages)

    # create tokens for entity parsing inside DIET
    tokenizer = WhitespaceTokenizer()
    tokenizer.train(training_data)

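    # the model data signature should expose an "entities" key only when entity labels are present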
    model_data = classifier.preprocess_train_data(training_data)
    entity_exists = "entities" in model_data.get_signature().keys()
    assert entity_exists == entity_expected
Example 3
def test_flexible_nlu_pipeline():
    message = Message("This is a test message.", data={"intent": "test"})
    training_data = TrainingData([message, message, message, message, message])

    tokenizer = WhitespaceTokenizer()
    tokenizer.train(training_data)

    featurizer = CountVectorsFeaturizer(
        component_config={FEATURIZER_CLASS_ALIAS: "cvf_word"}
    )
    featurizer.train(training_data)

    featurizer = CountVectorsFeaturizer(
        component_config={
            FEATURIZER_CLASS_ALIAS: "cvf_char",
            "min_ngram": 1,
            "max_ngram": 3,
            "analyzer": "char_wb",
        }
    )
    featurizer.train(training_data)

    featurizer = LexicalSyntacticFeaturizer({})
    featurizer.train(training_data)

    assert len(message.features) == 4
    assert message.features[0].origin == "cvf_word"
    # cvf word is also extracted for the intent
    assert message.features[1].origin == "cvf_word"
    assert message.features[2].origin == "cvf_char"
    assert message.features[3].origin == "LexicalSyntacticFeaturizer"

    feature_dim = (
        message.features[0].features.shape[1] + message.features[3].features.shape[1]
    )

    classifier = DIETClassifier(
        component_config={FEATURIZERS: ["cvf_word", "LexicalSyntacticFeaturizer"]}
    )
    model_data = classifier.preprocess_train_data(training_data)

    assert len(model_data.get("text_features")) == 1
    assert len(model_data.get("label_features")) == 1
    assert model_data.get("text_features")[0][0].shape == (6, feature_dim)
    assert model_data.get("label_features")[0][0].shape == (1, 1)
Example 4
def test_removing_label_sparse_feature_sizes(
    initial_sparse_feature_sizes: Dict[Text, Dict[Text, List[int]]],
    final_sparse_feature_sizes: Dict[Text, Dict[Text, List[int]]],
    label_attribute: Text,
):
    """Tests if label attribute is removed from sparse feature sizes collection."""
    feature_sizes = DIETClassifier._remove_label_sparse_feature_sizes(
        sparse_feature_sizes=initial_sparse_feature_sizes,
        label_attribute=label_attribute,
    )
    assert feature_sizes == final_sparse_feature_sizes
Example 5
    def inner(
        diet: DIETClassifier,
        pipeline: Optional[List[Dict[Text, Any]]] = None,
        training_data: str = nlu_data_path,
        message_text: Text = "Rasa is great!",
        expect_intent: bool = True,
    ) -> Tuple[DIETClassifier, Message]:

        if not pipeline:
            pipeline = [
                {"component": WhitespaceTokenizer},
                {"component": CountVectorsFeaturizer},
            ]

        training_data, loaded_pipeline = train_and_preprocess(pipeline, training_data)

        diet.train(training_data=training_data)

        message = Message(data={TEXT: message_text})
        message = process_message(loaded_pipeline, message)

        message2 = copy.deepcopy(message)

        classified_message = diet.process([message])[0]

        if expect_intent:
            assert classified_message.data["intent"]["name"]

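        # re-load the persisted model and check it reproduces the same classification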
        loaded_diet = create_diet(diet.component_config, load=True)

        classified_message2 = loaded_diet.process([message2])[0]

        assert classified_message2.fingerprint() == classified_message.fingerprint()

        return loaded_diet, classified_message
Example 6
def test_compute_default_label_features():
    label_features = [
        Message(data={TEXT: "test a"}),
        Message(data={TEXT: "test b"}),
        Message(data={TEXT: "test c"}),
        Message(data={TEXT: "test d"}),
    ]

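    # default label features are one-hot vectors, one position per label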
    output = DIETClassifier._compute_default_label_features(label_features)

    output = output[0]

    for i, o in enumerate(output):
        assert isinstance(o, np.ndarray)
        assert o[0][i] == 1
        assert o.shape == (1, len(label_features))
Example 7
    def inner(config: Dict[Text, Any],
              load: bool = False,
              finetune: bool = False) -> DIETClassifier:
        if load:
            constructor = DIETClassifier.load
        else:
            constructor = DIETClassifier.create

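        # mark the execution context so the component is created or loaded in fine-tuning mode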
        default_execution_context.is_finetuning = finetune
        return constructor(
            config=rasa.utils.common.override_defaults(
                DIETClassifier.get_default_config(), config),
            model_storage=default_model_storage,
            execution_context=default_execution_context,
            resource=default_diet_resource,
        )
Example 8
    def inner(config: Dict[Text, Any],
              load: bool = False,
              finetune: bool = False) -> DIETClassifier:
        if load:
            constructor = DIETClassifier.load
        else:
            constructor = DIETClassifier.create

        default_execution_context.is_finetuning = finetune
        return constructor(
            config={
                **DIETClassifier.get_default_config(),
                **config
            },
            model_storage=default_model_storage,
            execution_context=default_execution_context,
            resource=default_diet_resource,
        )
Example 9
    def get_default_config() -> Dict[Text, Any]:
        """The component's default config (see parent class for full docstring)."""
        return {
            **DIETClassifier.get_default_config(),
            # ## Architecture of the used neural network
            # Hidden layer sizes for layers before the embedding layers for user message
            # and labels.
            # The number of hidden layers is equal to the length of the corresponding
            # list.
            HIDDEN_LAYERS_SIZES: {TEXT: [256, 128], LABEL: [256, 128]},
            # Whether to share the hidden layer weights between input words
            # and responses
            SHARE_HIDDEN_LAYERS: False,
            # Number of units in transformer
            TRANSFORMER_SIZE: None,
            # Number of transformer layers
            NUM_TRANSFORMER_LAYERS: 0,
            # Number of attention heads in transformer
            NUM_HEADS: 4,
            # If 'True' use key relative embeddings in attention
            KEY_RELATIVE_ATTENTION: False,
            # If 'True' use value relative embeddings in attention
            VALUE_RELATIVE_ATTENTION: False,
            # Max position for relative embeddings. Only in effect if key-
            # or value relative attention are turned on
            MAX_RELATIVE_POSITION: 5,
            # Use a unidirectional or bidirectional encoder.
            UNIDIRECTIONAL_ENCODER: False,
            # ## Training parameters
            # Initial and final batch sizes:
            # Batch size will be linearly increased for each epoch.
            BATCH_SIZES: [64, 256],
            # Strategy used when creating batches.
            # Can be either 'sequence' or 'balanced'.
            BATCH_STRATEGY: BALANCED,
            # Number of epochs to train
            EPOCHS: 300,
            # Set random seed to any 'int' to get reproducible results
            RANDOM_SEED: None,
            # Initial learning rate for the optimizer
            LEARNING_RATE: 0.001,
            # ## Parameters for embeddings
            # Dimension size of embedding vectors
            EMBEDDING_DIMENSION: 20,
            # Default dense dimension to use if no dense features are present.
            DENSE_DIMENSION: {TEXT: 512, LABEL: 512},
            # Default dimension to use for concatenating sequence and sentence features.
            CONCAT_DIMENSION: {TEXT: 512, LABEL: 512},
            # The number of incorrect labels. The algorithm will minimize
            # their similarity to the user input during training.
            NUM_NEG: 20,
            # Type of similarity measure to use, either 'auto' or 'cosine' or 'inner'.
            SIMILARITY_TYPE: AUTO,
            # The type of the loss function, either 'cross_entropy' or 'margin'.
            LOSS_TYPE: CROSS_ENTROPY,
            # Number of top actions for which confidences should be predicted.
            # Set to 0 if confidences for all intents should be reported.
            RANKING_LENGTH: 10,
            # Determines whether the confidences of the chosen top actions should be
            # renormalized so that they sum up to 1. By default, we do not renormalize
            # and return the confidences for the top actions as is.
            # Note that renormalization only makes sense if confidences are generated
            # via `softmax`.
            RENORMALIZE_CONFIDENCES: False,
            # Indicates how similar the algorithm should try to make embedding vectors
            # for correct labels.
            # Should be 0.0 < ... < 1.0 for 'cosine' similarity type.
            MAX_POS_SIM: 0.8,
            # Maximum negative similarity for incorrect labels.
            # Should be -1.0 < ... < 1.0 for 'cosine' similarity type.
            MAX_NEG_SIM: -0.4,
            # If 'True' the algorithm only minimizes maximum similarity over
            # incorrect intent labels, used only if 'loss_type' is set to 'margin'.
            USE_MAX_NEG_SIM: True,
            # Scale loss inverse proportionally to confidence of correct prediction
            SCALE_LOSS: True,
            # ## Regularization parameters
            # The scale of regularization
            REGULARIZATION_CONSTANT: 0.002,
            # Fraction of trainable weights in internal layers.
            CONNECTION_DENSITY: 1.0,
            # The scale of how important it is to minimize the maximum similarity
            # between embeddings of different labels.
            NEGATIVE_MARGIN_SCALE: 0.8,
            # Dropout rate for encoder
            DROP_RATE: 0.2,
            # Dropout rate for attention
            DROP_RATE_ATTENTION: 0,
            # If 'True' apply dropout to sparse input tensors
            SPARSE_INPUT_DROPOUT: False,
            # If 'True' apply dropout to dense input tensors
            DENSE_INPUT_DROPOUT: False,
            # ## Evaluation parameters
            # How often to calculate validation accuracy.
            # Small values may hurt performance, e.g. model accuracy.
            EVAL_NUM_EPOCHS: 20,
            # How many examples to use for hold out validation set
            # Large values may hurt performance, e.g. model accuracy.
            EVAL_NUM_EXAMPLES: 0,
            # ## Selector config
            # If 'True' random tokens of the input message will be masked and the model
            # should predict those tokens.
            MASKED_LM: False,
            # Name of the intent for which this response selector is to be trained
            RETRIEVAL_INTENT: None,
            # Boolean flag to check if actual text of the response
            # should be used as ground truth label for training the model.
            USE_TEXT_AS_LABEL: False,
            # If you want to use tensorboard to visualize training
            # and validation metrics,
            # set this option to a valid output directory.
            TENSORBOARD_LOG_DIR: None,
            # Define when training metrics for tensorboard should be logged.
            # Either after every epoch or for every training step.
            # Valid values: 'epoch' and 'batch'
            TENSORBOARD_LOG_LEVEL: "epoch",
            # Specify what features to use as sequence and sentence features.
            # By default all features in the pipeline are used.
            FEATURIZERS: [],
            # Perform model checkpointing
            CHECKPOINT_MODEL: False,
            # If 'True' applies sigmoid on all similarity terms and adds it
            # to the loss function to ensure that similarity values are
            # approximately bounded. Used inside cross-entropy loss only.
            CONSTRAIN_SIMILARITIES: False,
            # Model confidence to be returned during inference. Currently, the only
            # possible value is `softmax`.
            MODEL_CONFIDENCE: SOFTMAX,
        }
Example 10
    assert result["predictions"][1] == prediction

    assert os.path.exists(
        os.path.join(report_folder, "response_selection_confusion_matrix.png"))
    assert os.path.exists(
        os.path.join(report_folder, "response_selection_histogram.png"))
    assert not os.path.exists(
        os.path.join(report_folder, "response_selection_errors.json"))
    assert os.path.exists(
        os.path.join(report_folder, "response_selection_successes.json"))


@pytest.mark.parametrize(
    "components, expected_extractors",
    [
        ([DIETClassifier({ENTITY_RECOGNITION: False})], set()),
        ([DIETClassifier({ENTITY_RECOGNITION: True})], {"DIETClassifier"}),
        ([CRFEntityExtractor()], {"CRFEntityExtractor"}),
        (
            [SpacyEntityExtractor(), CRFEntityExtractor()],
            {"SpacyEntityExtractor", "CRFEntityExtractor"},
        ),
        ([ResponseSelector()], set()),
    ],
)
def test_get_entity_extractors(components, expected_extractors):
    mock_interpreter = Interpreter(components, None)
    extractors = get_entity_extractors(mock_interpreter)

    assert extractors == expected_extractors
Example 11
def test_check_labels_features_exist(messages, expected):
    attribute = TEXT
    classifier = DIETClassifier()
    assert classifier._check_labels_features_exist(messages, attribute) == expected