# Excerpt from snips_nlu's deterministic (regex-based) intent parser.
# Imports are omitted in this excerpt: iteritems, is_builtin_entity,
# tokenize_light, regex_escape, WHITESPACE_PATTERN, the dataset-key
# constants (ENTITIES, UTTERANCES, DATA, SLOT_NAME, ENTITY, TEXT) and the
# other private helpers used below are snips_nlu internals.
def _get_joined_entity_utterances(dataset, language):
    joined_entity_utterances = dict()
    for entity_name, entity in iteritems(dataset[ENTITIES]):
        # matches are performed in a case insensitive manner
        utterances = set(u.lower() for u in entity[UTTERANCES])
        patterns = []
        if is_builtin_entity(entity_name):
            # We add a placeholder value for builtin entities
            placeholder = _get_entity_name_placeholder(entity_name, language)
            patterns.append(regex_escape(placeholder))
        else:
            for utterance in utterances:
                tokens = tokenize_light(utterance, language)
                pattern = WHITESPACE_PATTERN.join(regex_escape(t)
                                                  for t in tokens)
                patterns.append(pattern)
        # Drop empty patterns, then sort longest first so that longer
        # utterances are preferred over their prefixes during matching
        patterns = (p for p in patterns if p)
        joined_entity_utterances[entity_name] = r"|".join(
            sorted(patterns, key=len, reverse=True))
    return joined_entity_utterances
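

# A minimal, self-contained sketch of the technique above, using only the
# stdlib (re.escape stands in for snips_nlu's regex_escape, and the toy
# data below is an assumption of this sketch): escape each utterance, then
# join the escaped variants into one alternation, longest first so that
# "new york city" is tried before its prefix "new york".
if __name__ == "__main__":
    import re

    utterances = {"new york", "new york city", "paris"}
    escaped = [re.escape(u) for u in utterances]
    alternation = "|".join(sorted(escaped, key=len, reverse=True))
    match = re.match(alternation, "new york city marathon")
    assert match is not None and match.group(0) == "new york city"
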
def _query_to_pattern(query, joined_entity_utterances,
                      group_names_to_slot_names, language):
    pattern = []
    for chunk in query[DATA]:
        if SLOT_NAME in chunk:
            # Slot chunks become named capture groups whose alternatives
            # are the joined utterances of the chunk's entity
            max_index = _generate_new_index(group_names_to_slot_names)
            slot_name = chunk[SLOT_NAME]
            entity = chunk[ENTITY]
            group_names_to_slot_names[max_index] = slot_name
            pattern.append(
                r"(?P<%s>%s)" % (max_index, joined_entity_utterances[entity]))
        else:
            # Text chunks are tokenized and each token escaped literally
            tokens = tokenize_light(chunk[TEXT], language)
            pattern += [regex_escape(t) for t in tokens]

    # Tolerate ignored characters (whitespace, punctuation) between tokens
    # and at both ends of the utterance
    ignored_char_pattern = get_ignored_characters_pattern(language)
    pattern = r"^%s%s%s$" % (ignored_char_pattern,
                             ignored_char_pattern.join(pattern),
                             ignored_char_pattern)
    return pattern, group_names_to_slot_names
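

# A toy end-to-end sketch of the same idea, stdlib only (the plain-string
# chunk keys "text", "slot_name" and "entity" mirror the snips_nlu dataset
# format but are an assumption of this sketch): text chunks are escaped
# literally, slot chunks become named capture groups, and group names are
# mapped back to slot names after matching.
if __name__ == "__main__":
    import re

    joined_utterances = {"location": "london|paris"}
    chunks = [
        {"text": "weather in "},                          # plain text chunk
        {"slot_name": "location", "entity": "location"},  # slot chunk
    ]
    parts, group_to_slot = [], {}
    for i, chunk in enumerate(chunks):
        if "slot_name" in chunk:
            group = "group_%d" % i
            group_to_slot[group] = chunk["slot_name"]
            parts.append(
                r"(?P<%s>%s)" % (group, joined_utterances[chunk["entity"]]))
        else:
            parts.append(re.escape(chunk["text"]))
    m = re.match(r"^%s$" % "".join(parts), "weather in london")
    slots = {group_to_slot[name]: value
             for name, value in m.groupdict().items() if value is not None}
    assert slots == {"location": "london"}
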
Example #4
from __future__ import unicode_literals

import re
import string

from num2words import num2words

from snips_nlu.utils import regex_escape

SPACE = " "
WHITE_SPACES = "%s\t\n\r\f\v" % SPACE  # equivalent of r"\s"
COMMONLY_IGNORED_CHARACTERS = "%s%s" % (WHITE_SPACES, string.punctuation)
COMMONLY_IGNORED_CHARACTERS_PATTERN = r"[%s]*" % regex_escape(
    COMMONLY_IGNORED_CHARACTERS)

# Module-level caches used by helpers further down in the original file
# (truncated in this excerpt)
_PUNCTUATION_REGEXES = dict()
_NUM2WORDS_SUPPORT = dict()


# pylint:disable=unused-argument
def get_default_sep(language):
    return " "


# pylint:enable=unused-argument

# pylint:disable=unused-argument
def get_punctuation(language):
    return string.punctuation
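

# A quick self-contained check of what COMMONLY_IGNORED_CHARACTERS_PATTERN
# buys you, rebuilt here with the stdlib re.escape instead of snips_nlu's
# regex_escape (an assumption of this sketch): joining literal token
# patterns with it makes matching robust to extra whitespace and
# punctuation between and around the tokens.
if __name__ == "__main__":
    import re

    ignored = r"[%s]*" % re.escape(WHITE_SPACES + string.punctuation)
    pattern = r"^%s%s%s$" % (ignored, ignored.join(["hello", "world"]),
                             ignored)
    for text in ("hello world", "hello,   world!", "...hello  world..."):
        assert re.match(pattern, text) is not None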