Example #1
def _slot_matching_lambda(lhs_slot, rhs_slot):
    lhs_value = lhs_slot["text"]
    rhs_value = rhs_slot["rawValue"]
    if lhs_slot["entity"] != "snips/datetime":
        return lhs_value == rhs_value
    else:
        # Allow fuzzy matching when comparing datetimes
        lhs_tokens = tokenize_light(lhs_value, LANGUAGE_EN)
        rhs_tokens = tokenize_light(rhs_value, LANGUAGE_EN)
        if lhs_tokens and lhs_tokens[0].lower() in SKIPPED_DATE_PREFIXES:
            lhs_tokens = lhs_tokens[1:]
        if rhs_tokens and rhs_tokens[0].lower() in SKIPPED_DATE_PREFIXES:
            rhs_tokens = rhs_tokens[1:]
        return lhs_tokens == rhs_tokens
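
To illustrate the fuzzy datetime comparison above, here is a minimal, self-contained sketch. It mimics the prefix-skipping logic with plain whitespace splitting and a hypothetical prefix set; the real SKIPPED_DATE_PREFIXES and tokenize_light come from the library.

SKIPPED_DATE_PREFIXES_SKETCH = {"at", "in", "on"}  # hypothetical values

def _fuzzy_datetime_match(lhs_value, rhs_value):
    # Whitespace splitting stands in for tokenize_light
    lhs_tokens = lhs_value.split()
    rhs_tokens = rhs_value.split()
    if lhs_tokens and lhs_tokens[0].lower() in SKIPPED_DATE_PREFIXES_SKETCH:
        lhs_tokens = lhs_tokens[1:]
    if rhs_tokens and rhs_tokens[0].lower() in SKIPPED_DATE_PREFIXES_SKETCH:
        rhs_tokens = rhs_tokens[1:]
    return lhs_tokens == rhs_tokens

assert _fuzzy_datetime_match("at 8 pm", "8 pm")   # leading preposition ignored
assert not _fuzzy_datetime_match("8 pm", "9 pm")  # different values still differ
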
def generate_noise_utterances(augmented_utterances, noise, num_intents,
                              data_augmentation_config, language,
                              random_state):
    if not augmented_utterances or not num_intents:
        return []
    avg_num_utterances = len(augmented_utterances) / float(num_intents)
    if data_augmentation_config.unknown_words_replacement_string is not None:
        noise = generate_smart_noise(
            noise, augmented_utterances,
            data_augmentation_config.unknown_words_replacement_string,
            language)

    noise_size = min(
        int(data_augmentation_config.noise_factor * avg_num_utterances),
        len(noise))
    utterances_lengths = [
        len(tokenize_light(get_text_from_chunks(u[DATA]), language))
        for u in augmented_utterances
    ]
    mean_utterances_length = np.mean(utterances_lengths)
    std_utterances_length = np.std(utterances_lengths)
    noise_it = get_noise_it(noise, mean_utterances_length,
                            std_utterances_length, random_state)
    # Remove duplicate 'unknownword unknownword'
    return [
        text_to_utterance(UNKNOWNWORD_REGEX.sub(UNKNOWNWORD, next(noise_it)))
        for _ in range(noise_size)
    ]
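
A quick worked example of the noise sizing above, with hypothetical numbers: the amount of generated noise is proportional to the average number of utterances per intent, capped by the size of the noise corpus.

num_intents = 3
num_augmented_utterances = 60        # hypothetical total across intents
noise = ["word"] * 1000              # hypothetical noise corpus
noise_factor = 0.5
avg_num_utterances = num_augmented_utterances / float(num_intents)  # 20.0
noise_size = min(int(noise_factor * avg_num_utterances), len(noise))
print(noise_size)                    # 10
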
    def _utterance_to_pattern(self, utterance, stop_words,
                              entity_placeholders):
        from snips_nlu_utils import normalize

        slot_names_count = defaultdict(int)
        pattern = []
        for chunk in utterance[DATA]:
            if SLOT_NAME in chunk:
                slot_name = chunk[SLOT_NAME]
                slot_names_count[slot_name] += 1
                group_name = self.slot_names_to_group_names[slot_name]
                count = slot_names_count[slot_name]
                if count > 1:
                    group_name = "%s_%s" % (group_name, count)
                placeholder = entity_placeholders[chunk[ENTITY]]
                pattern.append(r"(?P<%s>%s)" % (group_name, placeholder))
            else:
                tokens = tokenize_light(chunk[TEXT], self.language)
                pattern += [
                    regex_escape(t.lower()) for t in tokens
                    if normalize(t) not in stop_words
                ]

        pattern = r"^%s%s%s$" % (WHITESPACE_PATTERN,
                                 WHITESPACE_PATTERN.join(pattern),
                                 WHITESPACE_PATTERN)
        return pattern
    def _init_vectorizer(self, language):
        from sklearn.feature_extraction.text import (TfidfVectorizer as
                                                     SklearnTfidfVectorizer)

        self._tfidf_vectorizer = SklearnTfidfVectorizer(
            tokenizer=lambda x: tokenize_light(x, language))
        return self
Example #5
def generate_smart_noise(noise, augmented_utterances, replacement_string,
                         language):
    text_utterances = [get_text_from_chunks(u[DATA])
                       for u in augmented_utterances]
    vocab = [w for u in text_utterances for w in tokenize_light(u, language)]
    vocab = set(vocab)
    return [w if w in vocab else replacement_string for w in noise]
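
The effect of generate_smart_noise is easy to see with a stand-in sketch: every noise word that does not appear in the training vocabulary is mapped to the replacement string (the vocabulary and noise below are hypothetical).

vocab = {"turn", "on", "the", "lights"}          # hypothetical training vocabulary
noise = ["turn", "banana", "lights", "wizard"]   # hypothetical noise words
replacement_string = "unknownword"
smart_noise = [w if w in vocab else replacement_string for w in noise]
print(smart_noise)   # ['turn', 'unknownword', 'lights', 'unknownword']
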
Example #6
    def _preprocess_text(self, txt, intent):
        """Replaces stop words and characters that are tokenized out with
        whitespace"""
        stop_words = self._get_intent_stop_words(intent)
        tokens = tokenize_light(txt, self.language)
        cleaned_string = " ".join(
            [tkn for tkn in tokens if normalize(tkn) not in stop_words])
        return cleaned_string.lower()
Example #7
    def fit(self, dataset, utterances, classes):
        utterances_texts = (get_text_from_chunks(u[DATA]) for u in utterances)
        if not any(tokenize_light(q, self.language) for q in utterances_texts):
            return None

        utterances_to_features = _get_utterances_to_features_names(
            dataset, self.language)
        normalized_utterances_to_features = defaultdict(set)
        for k, v in iteritems(utterances_to_features):
            normalized_utterances_to_features[_normalize_stem(
                k, self.language)].update(v)
        if self.unknown_words_replacement_string is not None \
                and self.unknown_words_replacement_string in \
                normalized_utterances_to_features:
            normalized_utterances_to_features.pop(
                self.unknown_words_replacement_string)
        self.entity_utterances_to_feature_names = dict(
            normalized_utterances_to_features)

        preprocessed_utterances = self.preprocess_utterances(utterances)
        # pylint: disable=C0103
        X_train_tfidf = self.tfidf_vectorizer.fit_transform(
            preprocessed_utterances)
        # pylint: enable=C0103
        features_idx = {
            self.tfidf_vectorizer.vocabulary_[word]: word
            for word in self.tfidf_vectorizer.vocabulary_
        }

        stop_words = get_stop_words(self.language)

        _, pval = chi2(X_train_tfidf, classes)
        self.best_features = [
            i for i, v in enumerate(pval) if v < self.config.pvalue_threshold
        ]
        if not self.best_features:
            self.best_features = [
                idx for idx, val in enumerate(pval) if val == pval.min()
            ]

        feature_names = {}
        for utterance_index in self.best_features:
            feature_names[utterance_index] = {
                "word": features_idx[utterance_index],
                "pval": pval[utterance_index]
            }

        for feat in feature_names:
            if feature_names[feat]["word"] in stop_words:
                if feature_names[feat]["pval"] > \
                        self.config.pvalue_threshold / 2.0:
                    self.best_features.remove(feat)

        return self
Example #8
    def _enrich_utterance(self, x, builtin_ents, custom_ents):
        utterance = get_text_from_chunks(x[DATA])
        all_entities = builtin_ents + custom_ents
        placeholder_fn = self._placeholder_fn
        # Replace entities with placeholders
        enriched_utterance = replace_entities_with_placeholders(
            utterance, all_entities, placeholder_fn)[1]
        # Tokenize
        enriched_utterance = tokenize_light(enriched_utterance, self.language)
        # Remove the unknownword strings if needed
        if self.config.unknown_words_replacement_string:
            enriched_utterance = [
                t for t in enriched_utterance
                if t != self.config.unknown_words_replacement_string
            ]
        return enriched_utterance
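
A stand-alone sketch of the enrichment result above, assuming the datetime value has already been replaced by its placeholder: the text is tokenized (plain whitespace splitting here) and unknown-word markers are filtered out.

enriched = "set an alarm for SNIPSDATETIME unknownword"  # hypothetical placeholder
unknown_words_replacement_string = "unknownword"
tokens = [t for t in enriched.split()
          if t != unknown_words_replacement_string]
print(tokens)   # ['set', 'an', 'alarm', 'for', 'SNIPSDATETIME']
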
    @classmethod
    def from_path(cls, path, **shared):
        import numpy as np
        import scipy.sparse as sp
        from sklearn.feature_extraction.text import (TfidfTransformer,
                                                     TfidfVectorizer as
                                                     SklearnTfidfVectorizer)

        path = Path(path)

        model_path = path / "vectorizer.json"
        if not model_path.exists():
            raise LoadingError("Missing vectorizer model file: %s" %
                               model_path.name)
        with model_path.open("r", encoding="utf-8") as f:
            vectorizer_dict = json.load(f)

        vectorizer = cls(vectorizer_dict["config"], **shared)
        vectorizer._language = vectorizer_dict["language_code"]

        builtin_entity_scope = vectorizer_dict["builtin_entity_scope"]
        if builtin_entity_scope is not None:
            builtin_entity_scope = set(builtin_entity_scope)
        vectorizer.builtin_entity_scope = builtin_entity_scope

        vectorizer_ = vectorizer_dict["vectorizer"]
        if vectorizer_:
            vocab = vectorizer_["vocab"]
            idf_diag_data = vectorizer_["idf_diag"]
            idf_diag_data = np.array(idf_diag_data)

            idf_diag_shape = (len(idf_diag_data), len(idf_diag_data))
            row = list(range(idf_diag_shape[0]))
            col = list(range(idf_diag_shape[0]))
            idf_diag = sp.csr_matrix((idf_diag_data, (row, col)),
                                     shape=idf_diag_shape)

            tfidf_transformer = TfidfTransformer()
            tfidf_transformer._idf_diag = idf_diag

            vectorizer_ = SklearnTfidfVectorizer(
                tokenizer=lambda x: tokenize_light(x, vectorizer._language))
            vectorizer_.vocabulary_ = vocab

            vectorizer_._tfidf = tfidf_transformer

        vectorizer._tfidf_vectorizer = vectorizer_
        return vectorizer
Example #10
def _create_custom_entity_parser_configuration(
        entities, stopwords_fraction, language):
    """Dynamically creates the gazetteer parser configuration.

    Args:
        entities (dict): entities of the dataset
        stopwords_fraction (float): fraction of the entity values' vocabulary
            to treat as stop words (the top n_vocabulary * stopwords_fraction
            most frequent words are considered stop words)
        language (str): language of the entities

    Returns: the parser configuration as dictionary
    """

    if not 0 < stopwords_fraction < 1:
        raise ValueError("stopwords_fraction must be in ]0.0, 1.0[")

    parser_configurations = []
    for entity_name, entity in sorted(iteritems(entities)):
        vocabulary = set(
            t for raw_value in entity[UTTERANCES]
            for t in tokenize_light(raw_value, language)
        )
        num_stopwords = int(stopwords_fraction * len(vocabulary))
        config = {
            "entity_identifier": entity_name,
            "entity_parser": {
                "threshold": entity[MATCHING_STRICTNESS],
                "n_gazetteer_stop_words": num_stopwords,
                "gazetteer": [
                    {
                        "raw_value": k,
                        "resolved_value": v
                    } for k, v in sorted(iteritems(entity[UTTERANCES]))
                ]
            }
        }
        if LICENSE_INFO in entity:
            config["entity_parser"][LICENSE_INFO] = entity[LICENSE_INFO]
        parser_configurations.append(config)

    configuration = {
        "entity_parsers": parser_configurations
    }

    return configuration
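
A worked example of the stop-word count computed above, with hypothetical numbers: an entity whose values tokenize into a 40-word vocabulary, combined with stopwords_fraction=0.1, marks its 4 most frequent words as gazetteer stop words.

vocabulary_size = 40        # hypothetical entity vocabulary size
stopwords_fraction = 0.1
num_stopwords = int(stopwords_fraction * vocabulary_size)
print(num_stopwords)        # 4
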
def _query_to_pattern(query, joined_entity_utterances,
                      group_names_to_slot_names, language):
    pattern = []
    for chunk in query[DATA]:
        if SLOT_NAME in chunk:
            max_index = _generate_new_index(group_names_to_slot_names)
            slot_name = chunk[SLOT_NAME]
            entity = chunk[ENTITY]
            group_names_to_slot_names[max_index] = slot_name
            pattern.append(
                r"(?P<%s>%s)" % (max_index, joined_entity_utterances[entity]))
        else:
            tokens = tokenize_light(chunk[TEXT], language)
            pattern += [regex_escape(t) for t in tokens]

    pattern = r"^%s%s%s$" % (WHITESPACE_PATTERN,
                             WHITESPACE_PATTERN.join(pattern),
                             WHITESPACE_PATTERN)
    return pattern, group_names_to_slot_names
Example #12
    def fit_transform(self, dataset, utterances, classes, none_class):
        dataset = validate_and_format_dataset(dataset)
        self.language = dataset[LANGUAGE]

        utterances_texts = (get_text_from_chunks(u[DATA]) for u in utterances)
        if not any(tokenize_light(q, self.language) for q in utterances_texts):
            raise _EmptyDatasetUtterancesError(
                "Tokenized utterances are empty")

        x_tfidf = self._fit_transform_tfidf_vectorizer(utterances, classes,
                                                       dataset)
        x = x_tfidf
        if self.config.added_cooccurrence_feature_ratio:
            self._fit_cooccurrence_vectorizer(utterances, classes, none_class,
                                              dataset)
            x_cooccurrence = self.cooccurrence_vectorizer.transform(utterances)
            x = sp.hstack((x_tfidf, x_cooccurrence))

        return x
Example #13
def _preprocess_utterance(utterance, language, builtin_entity_parser,
                          custom_entity_parser, word_clusters_name,
                          use_stemming, unknownword_replacement_string):
    utterance_text = get_text_from_chunks(utterance[DATA])
    utterance_tokens = tokenize_light(utterance_text, language)
    word_clusters_features = _get_word_cluster_features(
        utterance_tokens, word_clusters_name, language)
    normalized_stemmed_tokens = [_normalize_stem(t, language, use_stemming)
                                 for t in utterance_tokens]

    custom_entities = custom_entity_parser.parse(
        " ".join(normalized_stemmed_tokens))
    custom_entities = [e for e in custom_entities
                       if e["value"] != unknownword_replacement_string]
    custom_entities_features = [
        _entity_name_to_feature(e[ENTITY_KIND], language)
        for e in custom_entities]

    builtin_entities = builtin_entity_parser.parse(
        utterance_text, use_cache=True)
    builtin_entities_features = [
        _builtin_entity_to_feature(ent[ENTITY_KIND], language)
        for ent in builtin_entities
    ]

    # We remove values of builtin slots from the utterance to avoid learning
    # specific samples such as '42' or 'tomorrow'
    filtered_normalized_stemmed_tokens = [
        _normalize_stem(chunk[TEXT], language, use_stemming)
        for chunk in utterance[DATA]
        if ENTITY not in chunk or not is_builtin_entity(chunk[ENTITY])
    ]

    features = get_default_sep(language).join(
        filtered_normalized_stemmed_tokens)
    if builtin_entities_features:
        features += " " + " ".join(sorted(builtin_entities_features))
    if custom_entities_features:
        features += " " + " ".join(sorted(custom_entities_features))
    if word_clusters_features:
        features += " " + " ".join(sorted(word_clusters_features))

    return features
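
To make the final assembly step concrete, here is a small stand-in sketch with hypothetical token and feature values, showing how the filtered tokens are joined and the sorted entity and word-cluster features are appended.

filtered_tokens = ["set", "an", "alarm", "for"]                    # hypothetical
builtin_entities_features = ["builtinentityfeaturesnipsdatetime"]  # hypothetical
custom_entities_features = []
word_clusters_features = ["cluster_77"]                            # hypothetical
features = " ".join(filtered_tokens)
if builtin_entities_features:
    features += " " + " ".join(sorted(builtin_entities_features))
if custom_entities_features:
    features += " " + " ".join(sorted(custom_entities_features))
if word_clusters_features:
    features += " " + " ".join(sorted(word_clusters_features))
print(features)
# set an alarm for builtinentityfeaturesnipsdatetime cluster_77
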
def _get_joined_entity_utterances(dataset, language):
    joined_entity_utterances = dict()
    for entity_name, entity in iteritems(dataset[ENTITIES]):
        # matches are performed in a case insensitive manner
        utterances = set(u.lower() for u in entity[UTTERANCES])
        patterns = []
        if is_builtin_entity(entity_name):
            # We add a placeholder value for builtin entities
            placeholder = _get_entity_name_placeholder(entity_name, language)
            patterns.append(regex_escape(placeholder))
        else:
            for utterance in utterances:
                tokens = tokenize_light(utterance, language)
                pattern = WHITESPACE_PATTERN.join(regex_escape(t)
                                                  for t in tokens)
                patterns.append(pattern)
        patterns = (p for p in patterns if p)
        joined_entity_utterances[entity_name] = r"|".join(
            sorted(patterns, key=len, reverse=True))
    return joined_entity_utterances
def get_string_variations(string, language):
    variations = {string}
    variations.update(flatten(case_variations(v) for v in variations))
    variations.update(flatten(normalization_variations(v) for v in variations))
    # We re-generate case variations as normalization can produce new
    # variations
    variations.update(flatten(case_variations(v) for v in variations))
    variations.update(flatten(and_variations(v, language) for v in variations))
    variations.update(
        flatten(punctuation_variations(v, language) for v in variations))
    variations.update(
        flatten(numbers_variations(v, language) for v in variations))
    # Add single space variations
    single_space_variations = set(" ".join(v.split()) for v in variations)
    variations.update(single_space_variations)
    # Add tokenized variations
    tokenized_variations = set(
        get_default_sep(language).join(tokenize_light(v, language))
        for v in variations)
    variations.update(tokenized_variations)
    return variations
Example #16
    def _preprocess(self, utterances, training=False):
        normalized_utterances = deepcopy(utterances)
        for u in normalized_utterances:
            for chunk in u[DATA]:
                chunk[TEXT] = _normalize_stem(chunk[TEXT], self.language,
                                              self.resources,
                                              self.config.use_stemming)

        if training:
            builtin_ents, custom_ents = zip(
                *[_entities_from_utterance(u) for u in utterances])
        else:
            # Extract builtin entities on unnormalized utterances
            builtin_ents = [
                self.builtin_entity_parser.parse(get_text_from_chunks(u[DATA]),
                                                 self.builtin_entity_scope,
                                                 use_cache=True)
                for u in utterances
            ]
            # Extract custom entities on normalized utterances
            custom_ents = [
                self.custom_entity_parser.parse(get_text_from_chunks(u[DATA]),
                                                use_cache=True)
                for u in normalized_utterances
            ]
        if self.config.word_clusters_name:
            # Extract word clusters on unnormalized utterances
            original_utterances_text = [
                get_text_from_chunks(u[DATA]) for u in utterances
            ]
            w_clusters = [
                _get_word_cluster_features(
                    tokenize_light(u.lower(), self.language),
                    self.config.word_clusters_name, self.resources)
                for u in original_utterances_text
            ]
        else:
            w_clusters = [None for _ in normalized_utterances]

        return normalized_utterances, builtin_ents, custom_ents, w_clusters
Example #17
    def fit(self, dataset, utterances, classes):
        self.fit_builtin_entity_parser_if_needed(dataset)
        self.fit_custom_entity_parser_if_needed(dataset)

        utterances_texts = (get_text_from_chunks(u[DATA]) for u in utterances)
        if not any(tokenize_light(q, self.language) for q in utterances_texts):
            return None

        preprocessed_utterances = self.preprocess_utterances(utterances)
        # pylint: disable=C0103
        X_train_tfidf = self.tfidf_vectorizer.fit_transform(
            preprocessed_utterances)
        # pylint: enable=C0103
        features_idx = {self.tfidf_vectorizer.vocabulary_[word]: word for word
                        in self.tfidf_vectorizer.vocabulary_}

        stop_words = get_stop_words(self.language)

        _, pval = chi2(X_train_tfidf, classes)
        self.best_features = [i for i, v in enumerate(pval) if
                              v < self.config.pvalue_threshold]
        if not self.best_features:
            self.best_features = [idx for idx, val in enumerate(pval) if
                                  val == pval.min()]

        feature_names = {}
        for utterance_index in self.best_features:
            feature_names[utterance_index] = {
                "word": features_idx[utterance_index],
                "pval": pval[utterance_index]}

        for feat in feature_names:
            if feature_names[feat]["word"] in stop_words:
                if feature_names[feat]["pval"] > \
                        self.config.pvalue_threshold / 2.0:
                    self.best_features.remove(feat)

        return self
Example #18
def _preprocess_utterance(utterance, language,
                          entity_utterances_to_features_names,
                          word_clusters_name):
    utterance_text = get_text_from_chunks(utterance[DATA])
    utterance_tokens = tokenize_light(utterance_text, language)
    word_clusters_features = _get_word_cluster_features(
        utterance_tokens, word_clusters_name, language)
    normalized_stemmed_tokens = [
        _normalize_stem(t, language) for t in utterance_tokens
    ]
    entities_features = _get_dataset_entities_features(
        normalized_stemmed_tokens, entity_utterances_to_features_names)

    builtin_entities = get_builtin_entities(utterance_text,
                                            language,
                                            use_cache=True)
    builtin_entities_features = [
        _builtin_entity_to_feature(ent[ENTITY_KIND], language)
        for ent in builtin_entities
    ]

    # We remove values of builtin slots from the utterance to avoid learning
    # specific samples such as '42' or 'tomorrow'
    filtered_normalized_stemmed_tokens = [
        _normalize_stem(chunk[TEXT], language) for chunk in utterance[DATA]
        if ENTITY not in chunk or not is_builtin_entity(chunk[ENTITY])
    ]

    features = get_default_sep(language).join(
        filtered_normalized_stemmed_tokens)
    if builtin_entities_features:
        features += " " + " ".join(sorted(builtin_entities_features))
    if entities_features:
        features += " " + " ".join(sorted(entities_features))
    if word_clusters_features:
        features += " " + " ".join(sorted(word_clusters_features))

    return features
Example #19
def _builtin_entity_to_feature(builtin_entity_label, language):
    return "builtinentityfeature%s" % "".join(tokenize_light(
        builtin_entity_label.lower(), language))
Example #20
def _entity_name_to_feature(entity_name, language):
    return "entityfeature%s" % "".join(tokenize_light(
        entity_name.lower(), language))
Example #21
def _get_tfidf_vectorizer(language, sublinear_tf=False):
    return TfidfVectorizer(tokenizer=lambda x: tokenize_light(x, language),
                           sublinear_tf=sublinear_tf)
def _get_entity_name_placeholder(entity_label, language):
    return "%%%s%%" % "".join(tokenize_light(entity_label, language)).upper()
def _has_any_capitalization(entity_utterances, language):
    for utterance in entity_utterances:
        tokens = tokenize_light(utterance, language)
        if any(t.isupper() or t.istitle() for t in tokens):
            return True
    return False
Example #24
def stem_function(text, language):
    return get_default_sep(language).join(
        [_stem(t) for t in tokenize_light(text, language)])
Example #25
def capitalize(text, language, resources):
    tokens = tokenize_light(text, language)
    stop_words = get_stop_words(resources)
    return get_default_sep(language).join(
        t.title() if t.lower() not in stop_words else t.lower()
        for t in tokens)
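
A stand-in sketch of capitalize() above, with a hypothetical stop-word set and whitespace splitting in place of the language resources: every non-stop-word token is title-cased, stop words stay lower-cased.

stop_words = {"the", "of"}               # hypothetical stop words
text = "the empire state building"
tokens = text.split()
print(" ".join(t.title() if t.lower() not in stop_words else t.lower()
               for t in tokens))
# the Empire State Building
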
Example #26
    def _placeholder_fn(self, entity_name):
        return "".join(tokenize_light(str(entity_name),
                                      str(self.language))).upper()
Example #27
def placeholder_fn(x):
    return "%%%s%%" % "".join(tokenize_light(x, "en")).upper()
Example #28
    def _init_vectorizer(self, language):
        self._tfidf_vectorizer = SklearnTfidfVectorizer(
            tokenizer=lambda x: tokenize_light(x, language))
        return self
def get_string_variations(string, language, builtin_entity_parser,
                          numbers=True, case=True, and_=True,
                          punctuation=True):
    # NB: the signature and the two initial steps are inferred from the flags
    # used in the body below (case, and_, punctuation, numbers) and from the
    # simpler variant of get_string_variations shown earlier
    variations = {string}
    if case:
        variations.update(flatten(case_variations(v) for v in variations))
    variations.update(flatten(normalization_variations(v) for v in variations))
    # We re-generate case variations as normalization can produce new
    # variations
    if case:
        variations.update(flatten(case_variations(v) for v in variations))
    if and_:
        variations.update(
            flatten(and_variations(v, language) for v in variations))
    if punctuation:
        variations.update(
            flatten(punctuation_variations(v, language) for v in variations))

    # Special case for number variations, which are slow to generate because
    # the BuiltinEntityParser runs on each variation
    if numbers:
        variations.update(
            flatten(
                numbers_variations(v, language, builtin_entity_parser)
                for v in variations))

    # Add single space variations
    single_space_variations = set(" ".join(v.split()) for v in variations)
    variations.update(single_space_variations)
    # Add tokenized variations
    tokenized_variations = set(
        get_default_sep(language).join(tokenize_light(v, language))
        for v in variations)
    variations.update(tokenized_variations)
    return variations