def compute_cross_val_metrics(dataset,
                              engine_class,
                              nb_folds=5,
                              train_size_ratio=1.0,
                              drop_entities=False,
                              include_slot_metrics=True,
                              slot_matching_lambda=None,
                              progression_handler=None):
    """Compute end-to-end metrics on the dataset using cross validation

    Args:
        dataset (dict or str): Dataset or path to dataset
        engine_class: Python class to use for training and inference; this
            class must inherit from `Engine`
        nb_folds (int, optional): Number of folds to use for cross validation
        train_size_ratio (float, optional): Ratio of intent utterances to use
            for training (default=1.0)
        drop_entities (bool, optional): Whether or not to remove all entity
            values from the training data (default=False)
        include_slot_metrics (bool, optional): If false, the slot metrics and
            the slot parsing errors will not be reported (default=True)
        slot_matching_lambda (lambda, optional):
            lambda expected_slot, actual_slot -> bool,
            if defined, this function will be used to match slots when computing
            metrics, otherwise exact match will be used.
            `expected_slot` corresponds to the slot as defined in the dataset,
            and `actual_slot` corresponds to the slot as returned by the NLU
        progression_handler (lambda, optional): handler called at each
            progression (%) step

    Returns:
        dict: Metrics results containing the following data

            - "metrics": the computed metrics
            - "parsing_errors": the list of parsing errors

    """

    if isinstance(dataset, basestring):
        with io.open(dataset, encoding="utf8") as f:
            dataset = json.load(f)

    try:
        splits = create_shuffle_stratified_splits(dataset, nb_folds,
                                                  train_size_ratio,
                                                  drop_entities)
    except NotEnoughDataError as e:
        print("Skipping metrics computation because of: %s" % e.message)
        return {METRICS: None, PARSING_ERRORS: []}

    intent_list = sorted(dataset["intents"])
    global_metrics = dict()
    global_confusion_matrix = None
    global_errors = []
    total_splits = len(splits)
    for split_index, (train_dataset, test_utterances) in enumerate(splits):
        engine = engine_class()
        engine.fit(train_dataset)
        split_metrics, errors, confusion_matrix = compute_engine_metrics(
            engine, test_utterances, intent_list, include_slot_metrics,
            slot_matching_lambda)
        global_metrics = aggregate_metrics(global_metrics, split_metrics,
                                           include_slot_metrics)
        global_confusion_matrix = aggregate_matrices(global_confusion_matrix,
                                                     confusion_matrix)
        global_errors += errors
        if progression_handler is not None:
            progression_handler(float(split_index + 1) / float(total_splits))

    global_metrics = compute_precision_recall_f1(global_metrics)

    nb_utterances = {
        intent: len(data[UTTERANCES])
        for intent, data in dataset[INTENTS].items()
    }
    for intent, metrics in global_metrics.items():
        metrics[INTENT_UTTERANCES] = nb_utterances.get(intent, 0)

    return {
        METRICS: global_metrics,
        PARSING_ERRORS: global_errors,
        CONFUSION_MATRIX: global_confusion_matrix
    }
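

# Illustrative usage sketch (added, not part of the original module): shows
# how the cross-validation helper above might be called. `engine_class` is
# assumed to be any class implementing the expected `Engine` interface (a
# no-argument constructor plus fit/parse), and `dataset_path` points to a
# dataset JSON file.
def _example_cross_val_usage(dataset_path, engine_class):
    results = compute_cross_val_metrics(
        dataset=dataset_path,
        engine_class=engine_class,
        nb_folds=5,
        train_size_ratio=1.0,
        include_slot_metrics=True)
    if results[METRICS] is None:
        # Not enough data to build the stratified splits
        return
    for intent, intent_metrics in results[METRICS].items():
        print(intent, intent_metrics)
    print(results[CONFUSION_MATRIX])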


def compute_train_test_metrics(train_dataset,
                               test_dataset,
                               engine_class,
                               include_slot_metrics=True,
                               slot_matching_lambda=None):
    """Compute end-to-end metrics on `test_dataset` after having trained on
    `train_dataset`

    Args:
        train_dataset (dict or str): Dataset or path to dataset used for
            training
        test_dataset (dict or str): dataset or path to dataset used for testing
        engine_class: Python class to use for training and inference; this
            class must inherit from `Engine`
        include_slot_metrics (bool, optional): If false, the slot metrics and
            the slot parsing errors will not be reported (default=True)
        slot_matching_lambda (lambda, optional):
            lambda expected_slot, actual_slot -> bool,
            if defined, this function will be used to match slots when computing
            metrics, otherwise exact match will be used.
            `expected_slot` corresponds to the slot as defined in the dataset,
            and `actual_slot` corresponds to the slot as returned by the NLU

    Returns:
        dict: Metrics results containing the following data

            - "metrics": the computed metrics
            - "parsing_errors": the list of parsing errors
            - "confusion_matrix": the computed confusion matrix
    """

    if isinstance(train_dataset, basestring):
        with io.open(train_dataset, encoding="utf8") as f:
            train_dataset = json.load(f)

    if isinstance(test_dataset, basestring):
        with io.open(test_dataset, encoding="utf8") as f:
            test_dataset = json.load(f)

    intent_list = set(train_dataset["intents"])
    intent_list.update(test_dataset["intents"])
    intent_list = sorted(intent_list)

    engine = engine_class()
    engine.fit(train_dataset)
    test_utterances = [
        (intent_name, utterance)
        for intent_name, intent_data in test_dataset[INTENTS].items()
        for utterance in intent_data[UTTERANCES]
    ]
    metrics, errors, confusion_matrix = compute_engine_metrics(
        engine, test_utterances, intent_list, include_slot_metrics,
        slot_matching_lambda)
    metrics = compute_precision_recall_f1(metrics)
    nb_utterances = {
        intent: len(data[UTTERANCES])
        for intent, data in train_dataset[INTENTS].items()
    }
    for intent, intent_metrics in metrics.items():
        intent_metrics[INTENT_UTTERANCES] = nb_utterances.get(intent, 0)
    return {
        METRICS: metrics,
        PARSING_ERRORS: errors,
        CONFUSION_MATRIX: confusion_matrix
    }
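

# Illustrative usage sketch (added, not part of the original module): a
# typical train/test evaluation with a lenient slot matching function, as
# described in the docstring above. `engine_class` is assumed to be any
# class implementing the expected `Engine` interface, and the two paths
# point to dataset JSON files.
def _example_train_test_usage(train_path, test_path, engine_class):
    # Consider two slots equal when the expected chunk text matches the raw
    # value returned by the NLU, regardless of ranges or entity names
    def lenient_slots_match(expected_slot, actual_slot):
        return expected_slot["text"] == actual_slot["rawValue"]

    results = compute_train_test_metrics(
        train_dataset=train_path,
        test_dataset=test_path,
        engine_class=engine_class,
        include_slot_metrics=True,
        slot_matching_lambda=lenient_slots_match)
    for intent, intent_metrics in results[METRICS].items():
        print(intent, intent_metrics)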
Example #3
def test_should_compute_engine_metrics():
    # Given
    def create_utterance(intent_name, slot_name, slot_value):
        utterance = {
            "data": [
                {
                    "text": "this is an utterance with ",
                },
                {
                    "text": slot_value,
                    "slot_name": slot_name,
                    "entity": slot_name
                },
            ]
        }
        return intent_name, utterance

    def create_parsing_output(intent_name, slot_name, slot_value):
        return {
            "text":
            "this is an utterance with %s" % slot_value,
            "intent": {
                "intentName": intent_name,
                "probability": 1.0
            },
            "slots": [{
                "rawValue": slot_value,
                "range": {
                    "start": 26,
                    "end": 26 + len(slot_value)
                },
                "entity": slot_name,
                "slotName": slot_name,
            }],
        }

    utterances = [
        create_utterance("intent1", "slot1", "value1"),
        create_utterance("intent1", "slot1", "value2"),
        create_utterance("intent1", "slot2", "value3"),
        create_utterance("intent2", "slot3", "value4"),
        create_utterance("intent2", "slot3", "value5"),
    ]

    # Fake engine that replays scripted parsing outputs in a fixed order,
    # one per test utterance
    class TestEngine:
        def __init__(self):
            self.utterance_index = 0

        def parse(self, text):
            res = None
            if self.utterance_index == 0:
                res = create_parsing_output("intent1", "slot1", "value1")
            if self.utterance_index == 1:
                res = create_parsing_output("intent2", "slot3", "value4")
            if self.utterance_index == 2:
                res = create_parsing_output("intent1", "slot1", "value1")
            if self.utterance_index == 3:
                res = create_parsing_output("intent2", "slot3", "value4")
            if self.utterance_index == 4:
                res = create_parsing_output("intent2", "slot3", "value4")
            self.utterance_index += 1
            return res

    engine = TestEngine()

    # Custom slot matching: compare the expected slot text with the raw
    # value returned by the engine
    def slots_match(lhs, rhs):
        return lhs[TEXT] == rhs["rawValue"]

    # When
    metrics, errors, confusion_matrix = compute_engine_metrics(
        engine=engine,
        test_utterances=utterances,
        intent_list=["intent1", "intent2"],
        include_slot_metrics=True,
        slot_matching_lambda=slots_match,
        intents_filter=None,
    )

    # Then
    expected_metrics = {
        "intent1": {
            "exact_parsings": 1,
            "slots": {
                "slot1": {
                    "false_positive": 1,
                    "true_positive": 1,
                    "false_negative": 0
                },
                "slot2": {
                    "false_positive": 0,
                    "true_positive": 0,
                    "false_negative": 1
                },
            },
            "intent": {
                "false_positive": 0,
                "true_positive": 2,
                "false_negative": 1
            },
        },
        "intent2": {
            "exact_parsings": 1,
            "slots": {
                "slot3": {
                    "false_positive": 1,
                    "true_positive": 1,
                    "false_negative": 1
                }
            },
            "intent": {
                "false_positive": 1,
                "true_positive": 2,
                "false_negative": 0
            },
        },
    }
    expected_errors = [
        {
            "expected_output": {
                "input":
                "this is an utterance with value2",
                "slots": [{
                    "range": {
                        "start": 26,
                        "end": 32
                    },
                    "slotName": "slot1",
                    "rawValue": "value2",
                    "entity": "slot1",
                }],
                "intent": {
                    "intentName": "intent1",
                    "probability": 1.0
                },
            },
            "nlu_output": {
                "text":
                "this is an utterance with value4",
                "slots": [{
                    "slotName": "slot3",
                    "range": {
                        "start": 26,
                        "end": 32
                    },
                    "rawValue": "value4",
                    "entity": "slot3",
                }],
                "intent": {
                    "intentName": "intent2",
                    "probability": 1.0
                },
            },
        },
        {
            "expected_output": {
                "input":
                "this is an utterance with value3",
                "slots": [{
                    "range": {
                        "start": 26,
                        "end": 32
                    },
                    "slotName": "slot2",
                    "rawValue": "value3",
                    "entity": "slot2",
                }],
                "intent": {
                    "intentName": "intent1",
                    "probability": 1.0
                },
            },
            "nlu_output": {
                "text":
                "this is an utterance with value1",
                "slots": [{
                    "slotName": "slot1",
                    "range": {
                        "start": 26,
                        "end": 32
                    },
                    "rawValue": "value1",
                    "entity": "slot1",
                }],
                "intent": {
                    "intentName": "intent1",
                    "probability": 1.0
                },
            },
        },
        {
            "expected_output": {
                "input":
                "this is an utterance with value5",
                "slots": [{
                    "range": {
                        "start": 26,
                        "end": 32
                    },
                    "slotName": "slot3",
                    "rawValue": "value5",
                    "entity": "slot3",
                }],
                "intent": {
                    "intentName": "intent2",
                    "probability": 1.0
                },
            },
            "nlu_output": {
                "text":
                "this is an utterance with value4",
                "slots": [{
                    "slotName": "slot3",
                    "range": {
                        "start": 26,
                        "end": 32
                    },
                    "rawValue": "value4",
                    "entity": "slot3",
                }],
                "intent": {
                    "intentName": "intent2",
                    "probability": 1.0
                },
            },
        },
    ]

    expected_confusion_matrix = {
        "intents": ["intent1", "intent2", "null"],
        "matrix": [
            [2, 1, 0],
            [0, 2, 0],
            [0, 0, 0],
        ],
    }

    assert expected_metrics == metrics
    assert expected_errors == errors
    assert expected_confusion_matrix == confusion_matrix
Example #4
def test_should_compute_engine_metrics_with_intents_filter():
    # Given
    def create_utterance(intent_name, text):
        return intent_name, {"data": [{"text": text}]}

    def create_parsing_output(intent_name, text):
        return {
            "text": text,
            "intent": {
                "intentName": intent_name,
                "probability": 1.0
            },
            "slots": [],
        }

    utterances = [
        create_utterance("intent1", "first utterance intent1"),
        create_utterance("intent1", "second utterance intent1"),
        create_utterance("intent1", "third utterance intent1"),
        create_utterance("intent1", "ambiguous utterance intent1 and intent3"),
        create_utterance("intent2", "first utterance intent2"),
        create_utterance("intent2", "second utterance intent2"),
        create_utterance("intent2", "ambiguous utterance intent2 and intent3"),
    ]

    # Engine exposing the intents filter through its parse API
    class EngineWithFilterAPI:
        def parse(self, text, intents_filter=None):
            intent = None
            for intent_name in ["intent3", "intent1", "intent2"]:
                if intent_name in text:
                    intent = intent_name
                    break

            if intents_filter is not None and intent not in intents_filter:
                intent = None
            return create_parsing_output(intent, text)

    # Engine exposing the intents filter as an instance attribute
    class EngineWithFilterProp:
        def __init__(self):
            self.intents_filter = ["intent1", "intent2"]

        def parse(self, text):
            intent = None
            for intent_name in ["intent3", "intent1", "intent2"]:
                if intent_name in text:
                    intent = intent_name
                    break

            if (self.intents_filter is not None
                    and intent not in self.intents_filter):
                intent = None
            return create_parsing_output(intent, text)

    engine_with_filter_api = EngineWithFilterAPI()
    engine_with_filter_prop = EngineWithFilterProp()

    # When
    metrics1, _, _ = compute_engine_metrics(
        engine=engine_with_filter_api,
        test_utterances=utterances,
        intent_list=["intent1", "intent2", "intent3"],
        include_slot_metrics=False,
        intents_filter=["intent1", "intent2"],
    )
    metrics2, _, _ = compute_engine_metrics(
        engine=engine_with_filter_prop,
        test_utterances=utterances,
        intent_list=["intent1", "intent2", "intent3"],
        include_slot_metrics=False,
        intents_filter=["intent1", "intent2"],
    )

    # Then
    expected_metrics = {
        "intent1": {
            "exact_parsings": 3,
            "intent": {
                "false_positive": 0,
                "true_positive": 3,
                "false_negative": 1
            },
        },
        "intent2": {
            "exact_parsings": 2,
            "intent": {
                "false_positive": 0,
                "true_positive": 2,
                "false_negative": 1,
            },
        },
        "null": {
            "exact_parsings": 0,
            "intent": {
                "false_positive": 2,
                "true_positive": 0,
                "false_negative": 0,
            },
        },
    }

    assert expected_metrics == metrics1
    assert expected_metrics == metrics2
Example #5
    def test_should_compute_engine_metrics(self):
        # Given
        def create_utterance(intent_name, slot_name, slot_value):
            utterance = {
                "data": [{
                    "text": "this is an utterance with ",
                }, {
                    "text": slot_value,
                    "slot_name": slot_name,
                    "entity": slot_name
                }]
            }
            return intent_name, utterance

        def create_parsing_output(intent_name, slot_name, slot_value):
            return {
                "text":
                "this is an utterance with %s" % slot_value,
                "intent": {
                    "intentName": intent_name,
                    "probability": 1.0
                },
                "slots": [{
                    "rawValue": slot_value,
                    "range": {
                        "start": 26,
                        "end": 26 + len(slot_value)
                    },
                    "entity": slot_name,
                    "slotName": slot_name
                }]
            }

        utterances = [
            create_utterance("intent1", "slot1", "value1"),
            create_utterance("intent1", "slot1", "value2"),
            create_utterance("intent1", "slot2", "value3"),
            create_utterance("intent2", "slot3", "value4"),
            create_utterance("intent2", "slot3", "value5"),
        ]

        # Fake engine that replays scripted parsing outputs in a fixed order
        class TestEngine(object):
            def __init__(self):
                self.utterance_index = 0

            def parse(self, text):
                res = None
                if self.utterance_index == 0:
                    res = create_parsing_output("intent1", "slot1", "value1")
                if self.utterance_index == 1:
                    res = create_parsing_output("intent2", "slot3", "value4")
                if self.utterance_index == 2:
                    res = create_parsing_output("intent1", "slot1", "value1")
                if self.utterance_index == 3:
                    res = create_parsing_output("intent2", "slot3", "value4")
                if self.utterance_index == 4:
                    res = create_parsing_output("intent2", "slot3", "value4")
                self.utterance_index += 1
                return res

        engine = TestEngine()

        def slots_match(lhs, rhs):
            return lhs[TEXT] == rhs["rawValue"]

        # When
        metrics, errors = compute_engine_metrics(
            engine=engine,
            test_utterances=utterances,
            slot_matching_lambda=slots_match)

        # Then
        expected_metrics = {
            "intent1": {
                "slots": {
                    "slot1": {
                        "false_positive": 1,
                        "true_positive": 1,
                        "false_negative": 0
                    },
                    "slot2": {
                        "false_positive": 0,
                        "true_positive": 0,
                        "false_negative": 1
                    }
                },
                "intent": {
                    "false_positive": 0,
                    "true_positive": 2,
                    "false_negative": 1
                }
            },
            "intent2": {
                "slots": {
                    "slot3": {
                        "false_positive": 1,
                        "true_positive": 1,
                        "false_negative": 1
                    }
                },
                "intent": {
                    "false_positive": 1,
                    "true_positive": 2,
                    "false_negative": 0
                }
            }
        }
        expected_errors = [{
            "expected_output": {
                "input":
                "this is an utterance with value2",
                "slots": [{
                    "range": {
                        "start": 26,
                        "end": 32
                    },
                    "slotName": "slot1",
                    "rawValue": "value2",
                    "entity": "slot1"
                }],
                "intent": {
                    "intentName": "intent1",
                    "probability": 1.0
                }
            },
            "nlu_output": {
                "text":
                "this is an utterance with value4",
                "slots": [{
                    "slotName": "slot3",
                    "range": {
                        "start": 26,
                        "end": 32
                    },
                    "rawValue": "value4",
                    "entity": "slot3"
                }],
                "intent": {
                    "intentName": "intent2",
                    "probability": 1.0
                }
            }
        }, {
            "expected_output": {
                "input":
                "this is an utterance with value3",
                "slots": [{
                    "range": {
                        "start": 26,
                        "end": 32
                    },
                    "slotName": "slot2",
                    "rawValue": "value3",
                    "entity": "slot2"
                }],
                "intent": {
                    "intentName": "intent1",
                    "probability": 1.0
                }
            },
            "nlu_output": {
                "text":
                "this is an utterance with value1",
                "slots": [{
                    "slotName": "slot1",
                    "range": {
                        "start": 26,
                        "end": 32
                    },
                    "rawValue": "value1",
                    "entity": "slot1"
                }],
                "intent": {
                    "intentName": "intent1",
                    "probability": 1.0
                }
            }
        }, {
            "expected_output": {
                "input":
                "this is an utterance with value5",
                "slots": [{
                    "range": {
                        "start": 26,
                        "end": 32
                    },
                    "slotName": "slot3",
                    "rawValue": "value5",
                    "entity": "slot3"
                }],
                "intent": {
                    "intentName": "intent2",
                    "probability": 1.0
                }
            },
            "nlu_output": {
                "text":
                "this is an utterance with value4",
                "slots": [{
                    "slotName": "slot3",
                    "range": {
                        "start": 26,
                        "end": 32
                    },
                    "rawValue": "value4",
                    "entity": "slot3"
                }],
                "intent": {
                    "intentName": "intent2",
                    "probability": 1.0
                }
            }
        }]
        self.assertDictEqual(expected_metrics, metrics)
        self.assertListEqual(expected_errors, errors)