Beispiel #1
0
def validate_and_format_dataset(dataset):
    """Validate a dataset and return a formatted copy of it.

    A dataset already flagged with VALIDATED is returned untouched, which
    makes repeated calls harmless. Otherwise the work is done on a copy
    (deep copy followed by a JSON round trip), so the caller's object is not
    modified.

    Args:
        dataset: mapping expected to provide the INTENTS, ENTITIES and
            LANGUAGE keys.

    Returns:
        The input itself when it is already flagged as validated, else a new
        dict whose entities have been replaced by their formatted version
        and which carries the VALIDATED flag.

    Raises:
        ValueError: if the language is not one of get_all_languages().
        Structural checks are delegated to the validate_* helpers.
    """
    if dataset.get(VALIDATED, False):
        # Already processed: nothing left to do
        return dataset

    # Never touch the caller's object; the JSON round trip also normalizes
    # the content to plain JSON types
    dataset = json.loads(json.dumps(deepcopy(dataset)))
    validate_type(dataset, dict)
    for mandatory_key in (INTENTS, ENTITIES, LANGUAGE):
        validate_key(dataset, mandatory_key, object_label="dataset")

    entities = dataset[ENTITIES]
    validate_type(entities, dict)
    intents = dataset[INTENTS]
    validate_type(intents, dict)

    language = dataset[LANGUAGE]
    validate_type(language, str)
    if language not in get_all_languages():
        raise ValueError("Unknown language: '%s'" % language)

    for intent in itervalues(intents):
        validate_and_format_intent(intent, entities)

    queries_entities_values = extract_queries_entities(dataset)

    # Only existing keys are reassigned, so iterating while writing is safe
    for entity_name, entity in iteritems(entities):
        queries_entities = queries_entities_values[entity_name]
        if is_builtin_entity(entity_name):
            formatted = validate_and_format_builtin_entity(
                entity, queries_entities)
        else:
            formatted = validate_and_format_custom_entity(
                entity, queries_entities, language)
        entities[entity_name] = formatted

    dataset[VALIDATED] = True
    return dataset
Beispiel #2
0
def validate_and_format_dataset(dataset):
    """Validate a dataset and return a formatted copy of it.

    A dataset already flagged with VALIDATED is returned untouched, which
    makes repeated calls harmless. Otherwise the work is done on a copy
    (deep copy followed by a JSON round trip), so the caller's object is not
    modified.

    Args:
        dataset: mapping expected to provide the INTENTS, ENTITIES and
            LANGUAGE keys.

    Returns:
        The input itself when it is already flagged as validated, else a new
        dict whose entities have been replaced by their formatted version
        and which carries the VALIDATED flag.

    Raises:
        ValueError: if the language is not one of get_all_languages().
        Structural checks are delegated to the validate_* helpers.
    """
    if dataset.get(VALIDATED, False):
        # Already processed: nothing left to do
        return dataset

    # Never touch the caller's object; the JSON round trip also normalizes
    # the content to plain JSON types
    dataset = json.loads(json.dumps(deepcopy(dataset)))
    validate_type(dataset, dict)
    for mandatory_key in (INTENTS, ENTITIES, LANGUAGE):
        validate_key(dataset, mandatory_key, object_label="dataset")

    entities = dataset[ENTITIES]
    validate_type(entities, dict)
    intents = dataset[INTENTS]
    validate_type(intents, dict)

    language = dataset[LANGUAGE]
    validate_type(language, str)
    if language not in get_all_languages():
        raise ValueError("Unknown language: '%s'" % language)

    for intent in itervalues(intents):
        validate_and_format_intent(intent, entities)

    queries_entities_values = extract_queries_entities(dataset)

    # Only existing keys are reassigned, so iterating while writing is safe
    for entity_name, entity in iteritems(entities):
        queries_entities = queries_entities_values[entity_name]
        if is_builtin_entity(entity_name):
            formatted = validate_and_format_builtin_entity(
                entity, queries_entities)
        else:
            formatted = validate_and_format_custom_entity(
                entity, queries_entities, language)
        entities[entity_name] = formatted

    dataset[VALIDATED] = True
    return dataset
Beispiel #3
0
def validate_and_format_intent(intent, entities):
    """Validate the structure of an intent and return it.

    Each utterance must provide a DATA list of chunks and each chunk a TEXT
    key. A chunk carrying ENTITY or SLOT_NAME must carry both, and its
    entity must either be a builtin one or be a key of `entities`.

    Args:
        intent: intent mapping holding the UTTERANCES list.
        entities: entities of the dataset, used to look up the custom
            entities referenced by the chunks.

    Returns:
        The very same `intent` object that was passed in.
    """
    validate_type(intent, dict)
    validate_key(intent, UTTERANCES, object_label="intent dict")
    utterances = intent[UTTERANCES]
    validate_type(utterances, list)

    for utterance in utterances:
        validate_type(utterance, dict)
        validate_key(utterance, DATA, object_label="utterance")
        chunks = utterance[DATA]
        validate_type(chunks, list)

        for chunk in chunks:
            validate_type(chunk, dict)
            validate_key(chunk, TEXT, object_label="chunk")
            if ENTITY not in chunk and SLOT_NAME not in chunk:
                # Plain text chunk: nothing more to check
                continue
            validate_keys(chunk, [ENTITY, SLOT_NAME], object_label="chunk")
            entity_name = chunk[ENTITY]
            # Builtin entities are not looked up in the dataset entities
            if not is_builtin_entity(entity_name):
                validate_key(entities, entity_name, object_label=ENTITIES)
    return intent
Beispiel #4
0
def validate_and_format_intent(intent, entities):
    """Validate the structure of an intent and return it.

    Each utterance must provide a DATA list of chunks and each chunk a TEXT
    key. A chunk carrying ENTITY or SLOT_NAME must carry both, and its
    entity must either be a builtin one or be a key of `entities`.

    Args:
        intent: intent mapping holding the UTTERANCES list.
        entities: entities of the dataset, used to look up the custom
            entities referenced by the chunks.

    Returns:
        The very same `intent` object that was passed in.
    """
    validate_type(intent, dict)
    validate_key(intent, UTTERANCES, object_label="intent dict")
    utterances = intent[UTTERANCES]
    validate_type(utterances, list)

    for utterance in utterances:
        validate_type(utterance, dict)
        validate_key(utterance, DATA, object_label="utterance")
        chunks = utterance[DATA]
        validate_type(chunks, list)

        for chunk in chunks:
            validate_type(chunk, dict)
            validate_key(chunk, TEXT, object_label="chunk")
            if ENTITY not in chunk and SLOT_NAME not in chunk:
                # Plain text chunk: nothing more to check
                continue
            validate_keys(chunk, [ENTITY, SLOT_NAME], object_label="chunk")
            entity_name = chunk[ENTITY]
            # Builtin entities are not looked up in the dataset entities
            if not is_builtin_entity(entity_name):
                validate_key(entities, entity_name, object_label=ENTITIES)
    return intent
Beispiel #5
0
def validate_and_format_builtin_entity(entity, queries_entities):
    """Validate a builtin entity and build its formatted version.

    Args:
        entity: the builtin entity; only its type is checked.
        queries_entities: iterable of the entity values found in the
            queries.

    Returns:
        A new dict with a single UTTERANCES key mapping to the set of
        `queries_entities`.
    """
    validate_type(entity, dict)
    # Content of the input entity is dropped, only the query values are kept
    utterances = set(queries_entities)
    return {UTTERANCES: utterances}
Beispiel #6
0
def validate_and_format_custom_entity(entity, queries_entities, language):
    """Validate a custom entity and build its formatted version.

    Note that `entity` is modified in place: values and synonyms are
    stripped, empty synonyms are removed and entries whose value is empty
    after stripping are dropped from entity[DATA].

    Args:
        entity: custom entity providing the USE_SYNONYMS,
            AUTOMATICALLY_EXTENSIBLE and DATA keys.
        queries_entities: iterable of the entity values found in the
            queries.
        language: language forwarded to the normalization helpers.

    Returns:
        A new dict holding the AUTOMATICALLY_EXTENSIBLE, CAPITALIZE and
        UTTERANCES keys.
    """
    validate_type(entity, dict)
    validate_keys(entity, [USE_SYNONYMS, AUTOMATICALLY_EXTENSIBLE, DATA],
                  object_label="entity")
    validate_type(entity[USE_SYNONYMS], bool)
    validate_type(entity[AUTOMATICALLY_EXTENSIBLE], bool)
    validate_type(entity[DATA], list)

    use_synonyms = entity[USE_SYNONYMS]
    formatted_entity = {
        AUTOMATICALLY_EXTENSIBLE: entity[AUTOMATICALLY_EXTENSIBLE]
    }

    # Check every entry, strip its strings and keep the non-empty ones
    kept_entries = []
    for entry in entity[DATA]:
        validate_type(entry, dict)
        validate_keys(entry, [VALUE, SYNONYMS], object_label="entity entry")
        stripped_value = entry[VALUE].strip()
        entry[VALUE] = stripped_value
        if stripped_value:
            validate_type(entry[SYNONYMS], list)
            stripped_synonyms = (s.strip() for s in entry[SYNONYMS])
            entry[SYNONYMS] = [s for s in stripped_synonyms if s]
            kept_entries.append(entry)
    entity[DATA] = kept_entries

    # Capitalization has to be computed before the normalization below,
    # since normalizing lowercases the values
    formatted_entity[CAPITALIZE] = has_any_capitalization(
        queries_entities, language)

    # Register each value, followed by its synonyms when they are enabled
    utterances = dict()
    for entry in kept_entries:
        reference_value = entry[VALUE]
        candidates = [reference_value]
        if use_synonyms:
            candidates.extend(entry[SYNONYMS])
        for candidate in candidates:
            utterances = add_variation_if_needed(
                utterances, candidate, reference_value, language)
    formatted_entity[UTTERANCES] = utterances

    # Finally merge the values seen in the queries
    for query_value in queries_entities:
        formatted_entity = add_entity_value_if_missing(
            query_value, formatted_entity, language)

    return formatted_entity
Beispiel #7
0
def validate_and_format_builtin_entity(entity):
    """Run the dict type check on a builtin entity and return it unchanged"""
    # No formatting is applied here: the same object is handed back
    validate_type(entity, dict)
    return entity
Beispiel #8
0
def validate_and_format_custom_entity(entity, queries_entities, language,
                                      builtin_entity_parser):
    """Validate a custom entity and build its formatted version.

    Note that `entity` is modified in place: MATCHING_STRICTNESS is filled
    in when missing, values and synonyms are stripped, empty synonyms are
    removed and entries whose value is empty after stripping are dropped
    from entity[DATA].

    Args:
        entity: custom entity providing the USE_SYNONYMS,
            AUTOMATICALLY_EXTENSIBLE and DATA keys, and optionally
            MATCHING_STRICTNESS (or the legacy "parser_threshold").
        queries_entities: iterable of the entity values found in the
            queries.
        language: language forwarded to the helpers.
        builtin_entity_parser: parser forwarded to get_string_variations.

    Returns:
        A new dict holding the AUTOMATICALLY_EXTENSIBLE,
        MATCHING_STRICTNESS, CAPITALIZE and UTTERANCES keys, the latter
        mapping every known string to the entity value it stands for.
    """
    validate_type(entity, dict)

    # TODO: this is here temporarily, only to allow backward compatibility
    if MATCHING_STRICTNESS not in entity:
        entity[MATCHING_STRICTNESS] = entity.get("parser_threshold", 1.0)

    validate_keys(
        entity,
        [USE_SYNONYMS, AUTOMATICALLY_EXTENSIBLE, DATA, MATCHING_STRICTNESS],
        object_label="entity")
    validate_type(entity[USE_SYNONYMS], bool)
    validate_type(entity[AUTOMATICALLY_EXTENSIBLE], bool)
    validate_type(entity[DATA], list)
    validate_type(entity[MATCHING_STRICTNESS], float)

    use_synonyms = entity[USE_SYNONYMS]
    formatted_entity = dict()
    formatted_entity[AUTOMATICALLY_EXTENSIBLE] = entity[
        AUTOMATICALLY_EXTENSIBLE]
    formatted_entity[MATCHING_STRICTNESS] = entity[MATCHING_STRICTNESS]

    # Check every entry, strip its strings and keep the non-empty ones
    kept_entries = []
    for entry in entity[DATA]:
        validate_type(entry, dict)
        validate_keys(entry, [VALUE, SYNONYMS], object_label="entity entry")
        stripped_value = entry[VALUE].strip()
        entry[VALUE] = stripped_value
        if stripped_value:
            validate_type(entry[SYNONYMS], list)
            stripped_synonyms = (s.strip() for s in entry[SYNONYMS])
            entry[SYNONYMS] = [s for s in stripped_synonyms if s]
            kept_entries.append(entry)
    entity[DATA] = kept_entries

    # Capitalization has to be computed before any normalization, since
    # normalizing lowercases the values
    formatted_entity[CAPITALIZE] = has_any_capitalization(
        queries_entities, language)

    # Map original values and synonyms; a value always maps to itself while
    # a synonym never overrides an existing mapping
    utterances = dict()
    for entry in kept_entries:
        entry_value = entry[VALUE]
        if not entry_value:
            continue
        utterances[entry_value] = entry_value
        if use_synonyms:
            for synonym in entry[SYNONYMS]:
                if synonym:
                    utterances.setdefault(synonym, entry_value)

    # Compute the string variations of each value (and of its synonyms)
    all_original_values = _extract_entity_values(entity)
    variations_by_value = dict()
    for entry in kept_entries:
        entry_value = entry[VALUE]
        strings_to_vary = {entry_value}
        if use_synonyms:
            strings_to_vary.update(set(entry[SYNONYMS]))
        variations_by_value[entry_value] = set(
            variation
            for string in strings_to_vary
            for variation in get_string_variations(
                string, language, builtin_entity_parser))

    # A variation is kept only if it is not an original value and if it was
    # produced for a single entity value
    variation_counter = Counter(
        variation
        for value_variations in itervalues(variations_by_value)
        for variation in value_variations)
    non_colliding_variations = dict()
    for entry_value, value_variations in iteritems(variations_by_value):
        non_colliding_variations[entry_value] = [
            variation for variation in value_variations
            if variation not in all_original_values
            and variation_counter[variation] == 1
        ]

    for entry in kept_entries:
        utterances = add_entity_variations(
            utterances, non_colliding_variations, entry[VALUE])

    # Merge queries entities: unknown values are added along with those of
    # their variations that are not mapped yet
    queries_entities_variations = {
        query_value: get_string_variations(
            query_value, language, builtin_entity_parser)
        for query_value in queries_entities
    }
    for query_value, query_variations in iteritems(
            queries_entities_variations):
        if query_value and query_value not in utterances:
            utterances[query_value] = query_value
            for variation in query_variations:
                if variation:
                    utterances.setdefault(variation, query_value)

    formatted_entity[UTTERANCES] = utterances
    return formatted_entity
Beispiel #9
0
def validate_and_format_builtin_entity(entity, queries_entities):
    """Validate a builtin entity and build its formatted version.

    Args:
        entity: the builtin entity; only its type is checked.
        queries_entities: iterable of the entity values found in the
            queries.

    Returns:
        A new dict with a single UTTERANCES key mapping to the set of
        `queries_entities`.
    """
    validate_type(entity, dict)
    # Content of the input entity is dropped, only the query values are kept
    utterances = set(queries_entities)
    return {UTTERANCES: utterances}
Beispiel #10
0
def validate_and_format_custom_entity(entity, queries_entities, language):
    """Validate a custom entity and build its formatted version.

    Note that `entity` is modified in place: values and synonyms are
    stripped, empty synonyms are removed and entries whose value is empty
    after stripping are dropped from entity[DATA].

    Args:
        entity: custom entity providing the USE_SYNONYMS,
            AUTOMATICALLY_EXTENSIBLE and DATA keys.
        queries_entities: iterable of the entity values found in the
            queries.
        language: language forwarded to the normalization helpers.

    Returns:
        A new dict holding the AUTOMATICALLY_EXTENSIBLE, CAPITALIZE and
        UTTERANCES keys.
    """
    validate_type(entity, dict)
    validate_keys(entity, [USE_SYNONYMS, AUTOMATICALLY_EXTENSIBLE, DATA],
                  object_label="entity")
    validate_type(entity[USE_SYNONYMS], bool)
    validate_type(entity[AUTOMATICALLY_EXTENSIBLE], bool)
    validate_type(entity[DATA], list)

    use_synonyms = entity[USE_SYNONYMS]
    formatted_entity = {
        AUTOMATICALLY_EXTENSIBLE: entity[AUTOMATICALLY_EXTENSIBLE]
    }

    # Check every entry, strip its strings and keep the non-empty ones
    kept_entries = []
    for entry in entity[DATA]:
        validate_type(entry, dict)
        validate_keys(entry, [VALUE, SYNONYMS], object_label="entity entry")
        stripped_value = entry[VALUE].strip()
        entry[VALUE] = stripped_value
        if stripped_value:
            validate_type(entry[SYNONYMS], list)
            stripped_synonyms = (s.strip() for s in entry[SYNONYMS])
            entry[SYNONYMS] = [s for s in stripped_synonyms if s]
            kept_entries.append(entry)
    entity[DATA] = kept_entries

    # Capitalization has to be computed before the normalization below,
    # since normalizing lowercases the values
    formatted_entity[CAPITALIZE] = has_any_capitalization(
        queries_entities, language)

    # Register each value, followed by its synonyms when they are enabled
    utterances = dict()
    for entry in kept_entries:
        reference_value = entry[VALUE]
        candidates = [reference_value]
        if use_synonyms:
            candidates.extend(entry[SYNONYMS])
        for candidate in candidates:
            utterances = add_variation_if_needed(
                utterances, candidate, reference_value, language)
    formatted_entity[UTTERANCES] = utterances

    # Finally merge the values seen in the queries
    for query_value in queries_entities:
        formatted_entity = add_entity_value_if_missing(
            query_value, formatted_entity, language)

    return formatted_entity