Esempio n. 1
0
    def test_get_string_variations_should_not_generate_number_variations(self):
        """The builtin parser must only be queried when `numbers` is enabled."""
        # Given: a parser whose `parse` method is a spy returning no entity
        parse_spy = MagicMock(return_value=[])
        parser = MagicMock()
        parser.parse = parse_spy

        # When number variations are disabled, Then the parser is never used
        get_string_variations("", "en", parser, numbers=False)
        parse_spy.assert_not_called()

        # When number variations are enabled, Then the parser is used
        get_string_variations("", "en", parser, numbers=True)
        self.assertGreater(parse_spy.call_count, 0)
Esempio n. 2
0
    def test_get_string_variations(self):
        """Check the full set of variations generated for "a and b 2"."""
        # Given
        text = "a and b 2"

        # When
        actual = get_string_variations(text, LANGUAGE_EN)

        # Then
        expected = {
            # lowercase letters, number as digit
            "a and b 2",
            "a & b 2",
            "a b 2",
            "a  b 2",
            # lowercase letters, number spelled out
            "a and b two",
            "a & b two",
            "a b two",
            "a  b two",
            # uppercase letters, number as digit
            "A and B 2",
            "A And B 2",
            "A & B 2",
            "A B 2",
            "A  B 2",
            # uppercase letters, number spelled out
            "A and B two",
            "A And B two",
            "A & B two",
            "A B two",
            "A  B two",
        }
        self.assertSetEqual(actual, expected)
Esempio n. 3
0
def add_variation_if_needed(utterances, variation, utterance, language):
    """Map each string variation of `variation` to `utterance`.

    Variations that are already keys of `utterances` keep their current
    value. The mapping is updated in place and returned; nothing is done when
    `variation` is falsy.
    """
    if variation:
        for candidate in get_string_variations(variation, language):
            # setdefault only writes when the key is missing
            utterances.setdefault(candidate, utterance)
    return utterances
Esempio n. 4
0
def add_variation_if_needed(utterances, variation, utterance, language):
    """Add the variations of `variation` to `utterances` without overriding.

    Each variation returned by `get_string_variations` that is not yet a key
    of `utterances` is associated with `utterance`. The mapping is modified in
    place and returned. A falsy `variation` leaves the mapping unchanged.
    """
    if not variation:
        return utterances
    missing = [
        candidate
        for candidate in get_string_variations(variation, language)
        if candidate not in utterances
    ]
    utterances.update((candidate, utterance) for candidate in missing)
    return utterances
Esempio n. 5
0
    def test_should_variate_case_and_normalization(self):
        """Check the case and accent variations generated for "Küche"."""
        # Given
        text = "Küche"

        # When
        actual = get_string_variations(text, LANGUAGE_EN)

        # Then
        expected = {"Küche", "Kuche", "küche", "kuche"}
        self.assertSetEqual(actual, expected)
Esempio n. 6
0
    def test_get_france_24(self):
        """Check the case and number variations generated for "france 24"."""
        # Given
        text = "france 24"

        # When
        actual = get_string_variations(text, LANGUAGE_FR)

        # Then
        expected = {
            # number as digits
            "france 24",
            "France 24",
            # number spelled out with a hyphen
            "france vingt-quatre",
            "France vingt-quatre",
            # number spelled out with a space
            "france vingt quatre",
            "France vingt quatre",
        }
        self.assertSetEqual(actual, expected)
Esempio n. 7
0
def _validate_and_format_custom_entity(entity, utterance_entities, language,
                                       builtin_entity_parser):
    """Check a custom entity declaration and build its formatted version.

    The input `entity` dict is validated and normalized in place: a missing
    matching strictness is filled in, values and synonyms are stripped, and
    entries with an empty value as well as empty synonyms are dropped.

    The returned dict holds the automatically-extensible flag, the matching
    strictness, the license info when present, the result of the
    capitalization check and the `UTTERANCES` mapping, which associates each
    accepted string (entity values, synonyms, non-colliding variations,
    utterance entities and their variations) with its reference value.

    Args:
        entity: custom entity declaration, as a dict
        utterance_entities: collection of entity strings, presumably the
            ones found in the dataset utterances -- confirm against callers
        language: language forwarded to the capitalization and variation
            helpers
        builtin_entity_parser: parser forwarded to `get_string_variations`

    Returns:
        dict: the formatted entity
    """
    validate_type(entity, dict, object_label="entity")

    # TODO: this is here temporarily, only to allow backward compatibility
    if MATCHING_STRICTNESS not in entity:
        entity[MATCHING_STRICTNESS] = entity.get("parser_threshold", 1.0)

    validate_keys(
        entity,
        [USE_SYNONYMS, AUTOMATICALLY_EXTENSIBLE, DATA, MATCHING_STRICTNESS],
        object_label="custom entity")
    validate_type(entity[USE_SYNONYMS], bool, object_label="use_synonyms")
    validate_type(entity[AUTOMATICALLY_EXTENSIBLE], bool,
                  object_label="automatically_extensible")
    validate_type(entity[DATA], list, object_label="entity data")
    validate_type(entity[MATCHING_STRICTNESS], (float, int),
                  object_label="matching_strictness")

    formatted_entity = {
        AUTOMATICALLY_EXTENSIBLE: entity[AUTOMATICALLY_EXTENSIBLE],
        MATCHING_STRICTNESS: entity[MATCHING_STRICTNESS],
    }
    if LICENSE_INFO in entity:
        formatted_entity[LICENSE_INFO] = entity[LICENSE_INFO]
    use_synonyms = entity[USE_SYNONYMS]

    # Validate the entries and keep only the ones with a non-empty value
    kept_entries = []
    for entry in entity[DATA]:
        validate_type(entry, dict, object_label="entity entry")
        validate_keys(entry, [VALUE, SYNONYMS], object_label="entity entry")
        entry[VALUE] = entry[VALUE].strip()
        if entry[VALUE]:
            validate_type(entry[SYNONYMS], list,
                          object_label="entity synonyms")
            stripped_synonyms = (s.strip() for s in entry[SYNONYMS])
            entry[SYNONYMS] = [s for s in stripped_synonyms if s]
            kept_entries.append(entry)
    entity[DATA] = kept_entries

    # Capitalization is computed before any variation is generated
    formatted_entity[CAPITALIZE] = _has_any_capitalization(utterance_entities,
                                                           language)

    # Map original values and synonyms; the first mapping of a synonym wins
    # whereas an entity value always maps to itself
    utterances = dict()
    for entry in entity[DATA]:
        reference = entry[VALUE]
        utterances[reference] = reference
        synonyms = entry[SYNONYMS] if use_synonyms else []
        for synonym in synonyms:
            utterances.setdefault(synonym, reference)

    all_original_values = _extract_entity_values(entity)

    # Number variations in entities values are expensive since each entity
    # value is parsed with the builtin entity parser before creating the
    # variations. We avoid generating these variations if there's enough
    # entity values. The other variations have their own threshold.
    nb_entries = len(entity[DATA])
    with_basic_variations = nb_entries < VARIATIONS_GENERATION_THRESHOLD
    variations_args = {
        "case": with_basic_variations,
        "and_": with_basic_variations,
        "punctuation": with_basic_variations,
        "numbers": nb_entries < NUMBER_VARIATIONS_THRESHOLD,
    }

    # Generate the variations of each value (and of its synonyms if enabled)
    variations_by_value = dict()
    for entry in entity[DATA]:
        reference = entry[VALUE]
        sources = {reference}
        if use_synonyms:
            sources.update(set(entry[SYNONYMS]))
        variations_by_value[reference] = {
            generated
            for source in sources
            for generated in get_string_variations(
                source, language, builtin_entity_parser, **variations_args)
        }

    # Count in how many entity values each variation shows up
    occurrences = Counter()
    for generated_set in itervalues(variations_by_value):
        occurrences.update(generated_set)

    # Keep the variations which are neither an original value nor shared
    # between several entity values
    non_colliding_variations = dict()
    for reference, generated_set in iteritems(variations_by_value):
        non_colliding_variations[reference] = [
            generated for generated in generated_set
            if generated not in all_original_values
            and occurrences[generated] == 1
        ]

    for entry in entity[DATA]:
        utterances = _add_entity_variations(
            utterances, non_colliding_variations, entry[VALUE])

    # Merge utterances entities
    utterance_entities_variations = {
        ent: get_string_variations(
            ent, language, builtin_entity_parser, **variations_args)
        for ent in utterance_entities
    }

    for original_ent, ent_variations in iteritems(
            utterance_entities_variations):
        if not original_ent or original_ent in utterances:
            continue
        utterances[original_ent] = original_ent
        for candidate in ent_variations:
            # A variation which is itself an utterance entity is skipped
            if not candidate or candidate in utterance_entities:
                continue
            utterances.setdefault(candidate, original_ent)

    formatted_entity[UTTERANCES] = utterances
    return formatted_entity
Esempio n. 8
0
def validate_and_format_custom_entity(entity, queries_entities, language,
                                      builtin_entity_parser):
    """Validate a custom entity declaration and build its formatted version.

    The input `entity` dict is validated and normalized in place: a missing
    matching strictness is filled in, values and synonyms are stripped, and
    entries with an empty value as well as empty synonyms are dropped.

    The returned dict holds the automatically-extensible flag, the matching
    strictness, the result of the capitalization check and the `UTTERANCES`
    mapping, which associates each accepted string (entity values, synonyms,
    non-colliding variations, queries entities and their variations) with its
    reference value.

    Args:
        entity: custom entity declaration, as a dict
        queries_entities: collection of entity strings, presumably the ones
            found in the dataset queries -- confirm against callers
        language: language forwarded to the capitalization and variation
            helpers
        builtin_entity_parser: parser forwarded to `get_string_variations`

    Returns:
        dict: the formatted entity
    """
    validate_type(entity, dict)

    # TODO: this is here temporarily, only to allow backward compatibility
    if MATCHING_STRICTNESS not in entity:
        entity[MATCHING_STRICTNESS] = entity.get("parser_threshold", 1.0)

    mandatory_keys = [
        USE_SYNONYMS, AUTOMATICALLY_EXTENSIBLE, DATA, MATCHING_STRICTNESS
    ]
    validate_keys(entity, mandatory_keys, object_label="entity")
    validate_type(entity[USE_SYNONYMS], bool)
    validate_type(entity[AUTOMATICALLY_EXTENSIBLE], bool)
    validate_type(entity[DATA], list)
    # NOTE(review): only float is accepted, so an integer strictness such as
    # 1 presumably fails this check -- confirm whether ints should be allowed
    validate_type(entity[MATCHING_STRICTNESS], float)

    formatted_entity = dict()
    formatted_entity[AUTOMATICALLY_EXTENSIBLE] = entity[
        AUTOMATICALLY_EXTENSIBLE]
    formatted_entity[MATCHING_STRICTNESS] = entity[MATCHING_STRICTNESS]
    use_synonyms = entity[USE_SYNONYMS]

    # Validate format and filter out unused data
    valid_entity_data = []
    for entry in entity[DATA]:
        validate_type(entry, dict)
        validate_keys(entry, [VALUE, SYNONYMS], object_label="entity entry")
        entry[VALUE] = entry[VALUE].strip()
        if not entry[VALUE]:
            continue
        validate_type(entry[SYNONYMS], list)
        entry[SYNONYMS] = [s.strip() for s in entry[SYNONYMS] if s.strip()]
        valid_entity_data.append(entry)
    entity[DATA] = valid_entity_data

    # Compute capitalization before normalizing
    # Normalization lowercase and hence lead to bad capitalization calculation
    formatted_entity[CAPITALIZE] = has_any_capitalization(
        queries_entities, language)

    validated_utterances = dict()
    # Map original values and synonyms. Empty values and synonyms have been
    # filtered out above, hence no emptiness check is needed here.
    for data in entity[DATA]:
        ent_value = data[VALUE]
        validated_utterances[ent_value] = ent_value
        if use_synonyms:
            for synonym in data[SYNONYMS]:
                if synonym not in validated_utterances:
                    validated_utterances[synonym] = ent_value

    # Add variations if not colliding
    all_original_values = _extract_entity_values(entity)
    variations_by_value = dict()
    for data in entity[DATA]:
        ent_value = data[VALUE]
        values_to_variate = {ent_value}
        if use_synonyms:
            values_to_variate.update(set(data[SYNONYMS]))
        variations_by_value[ent_value] = set(
            v for value in values_to_variate for v in get_string_variations(
                value, language, builtin_entity_parser))
    # Number of entity values in which each variation shows up
    variation_counter = Counter(
        [v for value_variations in itervalues(variations_by_value)
         for v in value_variations])
    # A variation is kept only if it is not an original value and if it
    # belongs to a single entity value
    non_colliding_variations = {
        value: [
            v for v in value_variations
            if v not in all_original_values and variation_counter[v] == 1
        ]
        for value, value_variations in iteritems(variations_by_value)
    }

    for entry in entity[DATA]:
        entry_value = entry[VALUE]
        validated_utterances = add_entity_variations(validated_utterances,
                                                     non_colliding_variations,
                                                     entry_value)

    # Merge queries entities
    queries_entities_variations = {
        ent: get_string_variations(ent, language, builtin_entity_parser)
        for ent in queries_entities
    }
    for original_ent, ent_variations in iteritems(
            queries_entities_variations):
        if not original_ent or original_ent in validated_utterances:
            continue
        validated_utterances[original_ent] = original_ent
        for variation in ent_variations:
            # A variation which is itself a queries entity must not be mapped
            # to another entity: it gets mapped to itself when its own turn
            # comes. Without this check the result depends on the iteration
            # order. The keys of `queries_entities_variations` are exactly
            # the queries entities.
            if variation and variation not in validated_utterances \
                    and variation not in queries_entities_variations:
                validated_utterances[variation] = original_ent
    formatted_entity[UTTERANCES] = validated_utterances
    return formatted_entity