Code example #1
def _slot_matching_lambda(lhs_slot, rhs_slot):
    lhs_value = lhs_slot["text"]
    rhs_value = rhs_slot["rawValue"]
    if lhs_slot["entity"] != "snips/datetime":
        return lhs_value == rhs_value
    else:
        # Allow fuzzy matching when comparing datetimes
        lhs_tokens = tokenize_light(lhs_value, LANGUAGE_EN)
        rhs_tokens = tokenize_light(rhs_value, LANGUAGE_EN)
        if lhs_tokens and lhs_tokens[0].lower() in SKIPPED_DATE_PREFIXES:
            lhs_tokens = lhs_tokens[1:]
        if rhs_tokens and rhs_tokens[0].lower() in SKIPPED_DATE_PREFIXES:
            rhs_tokens = rhs_tokens[1:]
        return lhs_tokens == rhs_tokens
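A quick illustration of the fuzzy datetime comparison above. The slot dictionaries and the leading word "at" are hypothetical, and this assumes SKIPPED_DATE_PREFIXES contains such prefixes; treat it as a sketch against the function above, not a test from the project:

lhs_slot = {"text": "at 9 pm", "entity": "snips/datetime"}   # hypothetical parsed slot
rhs_slot = {"rawValue": "9 pm"}                               # hypothetical expected slot
# After the assumed prefix "at" is skipped, both sides tokenize to ["9", "pm"]
assert _slot_matching_lambda(lhs_slot, rhs_slot)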
Code example #2
def generate_noise_utterances(augmented_utterances, num_intents,
                              data_augmentation_config, language,
                              random_state):
    if not augmented_utterances or not num_intents:
        return []
    avg_num_utterances = len(augmented_utterances) / float(num_intents)
    if data_augmentation_config.unknown_words_replacement_string is not None:
        noise = generate_smart_noise(
            augmented_utterances,
            data_augmentation_config.unknown_words_replacement_string,
            language)
    else:
        noise = get_noises(language)

    noise_size = min(
        int(data_augmentation_config.noise_factor * avg_num_utterances),
        len(noise))
    utterances_lengths = [
        len(tokenize_light(get_text_from_chunks(u[DATA]), language))
        for u in augmented_utterances
    ]
    mean_utterances_length = np.mean(utterances_lengths)
    std_utterances_length = np.std(utterances_lengths)
    noise_it = get_noise_it(noise, mean_utterances_length,
                            std_utterances_length, random_state)
    # Remove duplicate 'unknownword unknownword'
    return [
        UNKNOWNWORD_REGEX.sub(UNKNOWNWORD, next(noise_it))
        for _ in range(noise_size)
    ]
Code example #3
def generate_noise_utterances(augmented_utterances, num_intents,
                              data_augmentation_config, language,
                              random_state):
    if not augmented_utterances or not num_intents:
        return []
    avg_num_utterances = len(augmented_utterances) / float(num_intents)
    if data_augmentation_config.unknown_words_replacement_string is not None:
        noise = generate_smart_noise(
            augmented_utterances,
            data_augmentation_config.unknown_words_replacement_string,
            language)
    else:
        noise = get_noises(language)

    noise_size = min(
        int(data_augmentation_config.noise_factor * avg_num_utterances),
        len(noise))
    utterances_lengths = [
        len(tokenize_light(get_text_from_chunks(u[DATA]), language))
        for u in augmented_utterances]
    mean_utterances_length = np.mean(utterances_lengths)
    std_utterances_length = np.std(utterances_lengths)
    noise_it = get_noise_it(noise, mean_utterances_length,
                            std_utterances_length, random_state)
    # Remove duplicate 'unknownword unknownword'
    return [UNKNOWNWORD_REGEX.sub(UNKNOWNWORD, next(noise_it))
            for _ in range(noise_size)]
Code example #4
def generate_smart_noise(augmented_utterances, replacement_string, language):
    text_utterances = [get_text_from_chunks(u[DATA])
                       for u in augmented_utterances]
    vocab = [w for u in text_utterances for w in tokenize_light(u, language)]
    vocab = set(vocab)
    noise = get_noises(language)
    return [w if w in vocab else replacement_string for w in noise]
Code example #5
File: featurizer.py  Project: warp-x/snips-nlu
    def fit(self, dataset, utterances, classes):
        utterances_texts = (get_text_from_chunks(u[DATA]) for u in utterances)
        if not any(tokenize_light(q, self.language) for q in utterances_texts):
            return None

        utterances_to_features = _get_utterances_to_features_names(
            dataset, self.language)
        normalized_utterances_to_features = defaultdict(set)
        for k, v in iteritems(utterances_to_features):
            normalized_utterances_to_features[_normalize_stem(
                k, self.language)].update(v)
        if self.unknown_words_replacement_string is not None \
                and self.unknown_words_replacement_string in \
                normalized_utterances_to_features:
            normalized_utterances_to_features.pop(
                self.unknown_words_replacement_string)
        self.entity_utterances_to_feature_names = dict(
            normalized_utterances_to_features)

        preprocessed_utterances = self.preprocess_utterances(utterances)
        # pylint: disable=C0103
        X_train_tfidf = self.tfidf_vectorizer.fit_transform(
            preprocessed_utterances)
        # pylint: enable=C0103
        features_idx = {
            self.tfidf_vectorizer.vocabulary_[word]: word
            for word in self.tfidf_vectorizer.vocabulary_
        }

        stop_words = get_stop_words(self.language)

        _, pval = chi2(X_train_tfidf, classes)
        self.best_features = [
            i for i, v in enumerate(pval) if v < self.config.pvalue_threshold
        ]
        if not self.best_features:
            self.best_features = [
                idx for idx, val in enumerate(pval) if val == pval.min()
            ]

        feature_names = {}
        for utterance_index in self.best_features:
            feature_names[utterance_index] = {
                "word": features_idx[utterance_index],
                "pval": pval[utterance_index]
            }

        for feat in feature_names:
            if feature_names[feat]["word"] in stop_words:
                if feature_names[feat]["pval"] > \
                        self.config.pvalue_threshold / 2.0:
                    self.best_features.remove(feat)

        return self
Code example #6
def _preprocess_utterance(utterance, language,
                          entity_utterances_to_features_names,
                          word_clusters_name):
    utterance_tokens = tokenize_light(utterance, language)
    word_clusters_features = _get_word_cluster_features(
        utterance_tokens, word_clusters_name, language)
    normalized_stemmed_tokens = [
        _normalize_stem(t, language) for t in utterance_tokens
    ]
    entities_features = _get_dataset_entities_features(
        normalized_stemmed_tokens, entity_utterances_to_features_names)

    builtin_entities = get_builtin_entities(utterance, language)
    entities_ranges = (e[RES_MATCH_RANGE] for e in sorted(
        builtin_entities, key=lambda e: e[RES_MATCH_RANGE][START]))
    builtin_entities_features = [
        _builtin_entity_to_feature(ent[ENTITY_KIND], language)
        for ent in builtin_entities
    ]

    # We remove builtin entities from the utterance to avoid learning specific
    # examples such as '42'
    filtered_utterance = _remove_ranges(utterance, entities_ranges)
    filtered_utterance_tokens = tokenize_light(filtered_utterance, language)
    filtered_normalized_stemmed_tokens = [
        _normalize_stem(t, language) for t in filtered_utterance_tokens
    ]

    features = get_default_sep(language).join(
        filtered_normalized_stemmed_tokens)
    if builtin_entities_features:
        features += " " + " ".join(sorted(builtin_entities_features))
    if entities_features:
        features += " " + " ".join(sorted(entities_features))
    if word_clusters_features:
        features += " " + " ".join(sorted(word_clusters_features))

    return features
Code example #7
def _preprocess_utterance(utterance, language,
                          entity_utterances_to_features_names,
                          word_clusters_name):
    utterance_tokens = tokenize_light(utterance, language)
    word_clusters_features = _get_word_cluster_features(
        utterance_tokens, word_clusters_name, language)
    normalized_stemmed_tokens = [_normalize_stem(t, language)
                                 for t in utterance_tokens]
    entities_features = _get_dataset_entities_features(
        normalized_stemmed_tokens, entity_utterances_to_features_names)

    builtin_entities = get_builtin_entities(utterance, language)
    entities_ranges = (
        e[RES_MATCH_RANGE] for e in
        sorted(builtin_entities, key=lambda e: e[RES_MATCH_RANGE][START])
    )
    builtin_entities_features = [
        _builtin_entity_to_feature(ent[ENTITY_KIND], language)
        for ent in builtin_entities
    ]

    # We remove builtin entities from the utterance to avoid learning specific
    # examples such as '42'
    filtered_utterance = _remove_ranges(utterance, entities_ranges)
    filtered_utterance_tokens = tokenize_light(filtered_utterance, language)
    filtered_normalized_stemmed_tokens = [_normalize_stem(t, language)
                                          for t in filtered_utterance_tokens]

    features = get_default_sep(language).join(
        filtered_normalized_stemmed_tokens)
    if builtin_entities_features:
        features += " " + " ".join(sorted(builtin_entities_features))
    if entities_features:
        features += " " + " ".join(sorted(entities_features))
    if word_clusters_features:
        features += " " + " ".join(sorted(word_clusters_features))

    return features
Code example #8
    def fit(self, dataset, utterances, classes):
        utterances_to_features = _get_utterances_to_features_names(
            dataset, self.language)
        normalized_utterances_to_features = defaultdict(set)
        for k, v in iteritems(utterances_to_features):
            normalized_utterances_to_features[
                _normalize_stem(k, self.language)].update(v)
        if self.unknown_words_replacement_string is not None \
                and self.unknown_words_replacement_string in \
                normalized_utterances_to_features:
            normalized_utterances_to_features.pop(
                self.unknown_words_replacement_string)
        self.entity_utterances_to_feature_names = dict(
            normalized_utterances_to_features)

        if all(not "".join(tokenize_light(q, self.language)) for q in
               utterances):
            return None
        preprocessed_utterances = self.preprocess_utterances(utterances)
        # pylint: disable=C0103
        X_train_tfidf = self.tfidf_vectorizer.fit_transform(
            preprocessed_utterances)
        # pylint: enable=C0103
        list_index_words = {
            self.tfidf_vectorizer.vocabulary_[word]: word
            for word in self.tfidf_vectorizer.vocabulary_
        }

        stop_words = get_stop_words(self.language)

        _, pval = chi2(X_train_tfidf, classes)
        self.best_features = [i for i, v in enumerate(pval) if
                              v < self.config.pvalue_threshold]
        if not self.best_features:
            self.best_features = [idx for idx, val in enumerate(pval) if
                                  val == pval.min()]

        feature_names = {}
        for utterance_index in self.best_features:
            feature_names[utterance_index] = {
                "word": list_index_words[utterance_index],
                "pval": pval[utterance_index]}

        for feat in feature_names:
            if feature_names[feat]["word"] in stop_words:
                if feature_names[feat]["pval"] > \
                        self.config.pvalue_threshold / 2.0:
                    self.best_features.remove(feat)

        return self
Code example #9
File: featurizer.py  Project: lym0302/snips-nlu
def _preprocess_query(query, language, entity_utterances_to_features_names):
    query_tokens = tokenize_light(query, language)
    word_clusters_features = _get_word_cluster_features(query_tokens, language)
    normalized_stemmed_tokens = [_normalize_stem(t, language)
                                 for t in query_tokens]
    entities_features = _get_dataset_entities_features(
        normalized_stemmed_tokens, entity_utterances_to_features_names)

    features = get_default_sep(language).join(normalized_stemmed_tokens)
    if entities_features:
        features += " " + " ".join(sorted(entities_features))
    if word_clusters_features:
        features += " " + " ".join(sorted(word_clusters_features))
    return features
Code example #10
File: featurizer.py  Project: xzm2004260/snips-nlu
def _preprocess_query(query, language, entity_utterances_to_features_names):
    query_tokens = tokenize_light(query, language)
    word_clusters_features = _get_word_cluster_features(query_tokens, language)
    normalized_stemmed_tokens = [
        _normalize_stem(t, language) for t in query_tokens
    ]
    entities_features = _get_dataset_entities_features(
        normalized_stemmed_tokens, entity_utterances_to_features_names)

    features = get_default_sep(language).join(normalized_stemmed_tokens)
    if entities_features:
        features += " " + " ".join(sorted(entities_features))
    if word_clusters_features:
        features += " " + " ".join(sorted(word_clusters_features))
    return features
Code example #11
File: featurizer.py  Project: xzm2004260/snips-nlu
    def fit(self, dataset, queries, y):
        utterances_to_features = _get_utterances_to_features_names(
            dataset, self.language)
        normalized_utterances_to_features = defaultdict(set)
        for k, v in iteritems(utterances_to_features):
            normalized_utterances_to_features[_normalize_stem(
                k, self.language)].update(v)
        if self.unknown_words_replacement_string is not None \
                and self.unknown_words_replacement_string in \
                normalized_utterances_to_features:
            normalized_utterances_to_features.pop(
                self.unknown_words_replacement_string)
        self.entity_utterances_to_feature_names = dict(
            normalized_utterances_to_features)

        if all(not "".join(tokenize_light(q, self.language)) for q in queries):
            return None
        preprocessed_queries = self.preprocess_queries(queries)
        # pylint: disable=C0103
        X_train_tfidf = self.tfidf_vectorizer.fit_transform(
            preprocessed_queries)
        # pylint: enable=C0103
        list_index_words = {
            self.tfidf_vectorizer.vocabulary_[x]: x
            for x in self.tfidf_vectorizer.vocabulary_
        }

        stop_words = get_stop_words(self.language)

        _, pval = chi2(X_train_tfidf, y)
        self.best_features = [
            i for i, v in enumerate(pval) if v < self.pvalue_threshold
        ]
        if not self.best_features:
            self.best_features = [
                idx for idx, val in enumerate(pval) if val == pval.min()
            ]

        feature_names = {}
        for i in self.best_features:
            feature_names[i] = {'word': list_index_words[i], 'pval': pval[i]}

        for feat in feature_names:
            if feature_names[feat]['word'] in stop_words:
                if feature_names[feat]['pval'] > self.pvalue_threshold / 2.0:
                    self.best_features.remove(feat)

        return self
Code example #12
def _query_to_pattern(query, joined_entity_utterances,
                      group_names_to_slot_names, language):
    pattern = []
    for chunk in query[DATA]:
        if SLOT_NAME in chunk:
            max_index = _generate_new_index(group_names_to_slot_names)
            slot_name = chunk[SLOT_NAME]
            entity = chunk[ENTITY]
            group_names_to_slot_names[max_index] = slot_name
            pattern.append(
                r"(?P<%s>%s)" % (max_index, joined_entity_utterances[entity]))
        else:
            tokens = tokenize_light(chunk[TEXT], language)
            pattern += [regex_escape(t) for t in tokens]
    ignored_char_pattern = get_ignored_characters_pattern(language)
    pattern = r"^%s%s%s$" % (ignored_char_pattern,
                             ignored_char_pattern.join(pattern),
                             ignored_char_pattern)
    return pattern, group_names_to_slot_names
Code example #13
def _query_to_pattern(query, joined_entity_utterances,
                      group_names_to_slot_names, language):
    pattern = []
    for chunk in query[DATA]:
        if SLOT_NAME in chunk:
            max_index = _generate_new_index(group_names_to_slot_names)
            slot_name = chunk[SLOT_NAME]
            entity = chunk[ENTITY]
            group_names_to_slot_names[max_index] = slot_name
            pattern.append(
                r"(?P<%s>%s)" % (max_index, joined_entity_utterances[entity]))
        else:
            tokens = tokenize_light(chunk[TEXT], language)
            pattern += [regex_escape(t) for t in tokens]

    pattern = r"^%s%s%s$" % (WHITESPACE_PATTERN,
                             WHITESPACE_PATTERN.join(pattern),
                             WHITESPACE_PATTERN)
    return pattern, group_names_to_slot_names
Code example #14
def _get_joined_entity_utterances(dataset, language):
    joined_entity_utterances = dict()
    for entity_name, entity in iteritems(dataset[ENTITIES]):
        # matches are performed in a case insensitive manner
        utterances = set(u.lower() for u in entity[UTTERANCES])
        patterns = []
        for utterance in utterances:
            tokens = tokenize_light(utterance, language)
            pattern = WHITESPACE_PATTERN.join(regex_escape(t) for t in tokens)
            patterns.append(pattern)

        # We also add a placeholder value for builtin entities
        if is_builtin_entity(entity_name):
            placeholder = _get_entity_name_placeholder(entity_name, language)
            patterns.append(regex_escape(placeholder))

        patterns = (p for p in patterns if p)
        joined_entity_utterances[entity_name] = r"|".join(
            sorted(patterns, key=len, reverse=True))
    return joined_entity_utterances
Code example #15
def get_string_variations(string, language):
    variations = {string}
    variations.update(flatten(case_variations(v) for v in variations))
    variations.update(flatten(normalization_variations(v) for v in variations))
    # We re-generate case variations as normalization can produce new
    # variations
    variations.update(flatten(case_variations(v) for v in variations))
    variations.update(flatten(and_variations(v, language) for v in variations))
    variations.update(
        flatten(punctuation_variations(v, language) for v in variations))
    variations.update(
        flatten(numbers_variations(v, language) for v in variations))
    # Add single space variations
    single_space_variations = set(" ".join(v.split()) for v in variations)
    variations.update(single_space_variations)
    # Add tokenized variations
    tokenized_variations = set(
        get_default_sep(language).join(tokenize_light(v, language)) for v in
        variations)
    variations.update(tokenized_variations)
    return variations
Code example #16
def get_string_variations(string, language):
    variations = {string}
    variations.update(flatten(case_variations(v) for v in variations))
    variations.update(flatten(normalization_variations(v) for v in variations))
    # We re-generate case variations as normalization can produce new
    # variations
    variations.update(flatten(case_variations(v) for v in variations))
    variations.update(flatten(and_variations(v, language) for v in variations))
    variations.update(
        flatten(punctuation_variations(v, language) for v in variations))
    variations.update(
        flatten(numbers_variations(v, language) for v in variations))
    # Add single space variations
    single_space_variations = set(" ".join(v.split()) for v in variations)
    variations.update(single_space_variations)
    # Add tokenized variations
    tokenized_variations = set(
        get_default_sep(language).join(tokenize_light(v, language))
        for v in variations)
    variations.update(tokenized_variations)
    return variations
Code example #17
File: featurizer.py  Project: warp-x/snips-nlu
def _preprocess_utterance(utterance, language,
                          entity_utterances_to_features_names,
                          word_clusters_name):
    utterance_text = get_text_from_chunks(utterance[DATA])
    utterance_tokens = tokenize_light(utterance_text, language)
    word_clusters_features = _get_word_cluster_features(
        utterance_tokens, word_clusters_name, language)
    normalized_stemmed_tokens = [
        _normalize_stem(t, language) for t in utterance_tokens
    ]
    entities_features = _get_dataset_entities_features(
        normalized_stemmed_tokens, entity_utterances_to_features_names)

    builtin_entities = get_builtin_entities(utterance_text,
                                            language,
                                            use_cache=True)
    builtin_entities_features = [
        _builtin_entity_to_feature(ent[ENTITY_KIND], language)
        for ent in builtin_entities
    ]

    # We remove values of builtin slots from the utterance to avoid learning
    # specific samples such as '42' or 'tomorrow'
    filtered_normalized_stemmed_tokens = [
        _normalize_stem(chunk[TEXT], language) for chunk in utterance[DATA]
        if ENTITY not in chunk or not is_builtin_entity(chunk[ENTITY])
    ]

    features = get_default_sep(language).join(
        filtered_normalized_stemmed_tokens)
    if builtin_entities_features:
        features += " " + " ".join(sorted(builtin_entities_features))
    if entities_features:
        features += " " + " ".join(sorted(entities_features))
    if word_clusters_features:
        features += " " + " ".join(sorted(word_clusters_features))

    return features
Code example #18
def has_any_capitalization(entity_utterances, language):
    for utterance in entity_utterances:
        tokens = tokenize_light(utterance, language)
        if any(t.isupper() or t.istitle() for t in tokens):
            return True
    return False
Code example #19
def _get_entity_name_placeholder(entity_label, language):
    return "%%%s%%" % "".join(
        tokenize_light(entity_label, language)).upper()
Code example #20
def capitalize(text, language):
    tokens = tokenize_light(text, language)
    stop_words = get_stop_words(language)
    return get_default_sep(language).join(
        t.title() if t.lower() not in stop_words
        else t.lower() for t in tokens)
Code example #21
File: featurizer.py  Project: lym0302/snips-nlu
def _get_tfidf_vectorizer(language, extra_args=None):
    if extra_args is None:
        extra_args = dict()
    return TfidfVectorizer(tokenizer=lambda x: tokenize_light(x, language),
                           **extra_args)
Code example #22
def stem_function(text, language):
    return get_default_sep(language).join(
        [_stem(t) for t in tokenize_light(text, language)])
Code example #23
File: featurizer.py  Project: xzm2004260/snips-nlu
def _get_tfidf_vectorizer(language, extra_args=None):
    if extra_args is None:
        extra_args = dict()
    return TfidfVectorizer(tokenizer=lambda x: tokenize_light(x, language),
                           **extra_args)
Code example #24
File: featurizer.py  Project: warp-x/snips-nlu
def _get_tfidf_vectorizer(language, sublinear_tf=False):
    return TfidfVectorizer(tokenizer=lambda x: tokenize_light(x, language),
                           sublinear_tf=sublinear_tf)
Code example #25
File: featurizer.py  Project: warp-x/snips-nlu
def _entity_name_to_feature(entity_name, language):
    return "entityfeature%s" % "".join(
        tokenize_light(entity_name, language=language))
Code example #26
def _get_tfidf_vectorizer(language, sublinear_tf=False):
    return TfidfVectorizer(tokenizer=lambda x: tokenize_light(x, language),
                           sublinear_tf=sublinear_tf)
Code example #27
File: dataset.py  Project: ravindraprasad75/snips-nlu
def has_any_capitalization(entity_utterances, language):
    for utterance in entity_utterances:
        tokens = tokenize_light(utterance, language)
        if any(t.isupper() or t.istitle() for t in tokens):
            return True
    return False
Code example #28
def _get_builtin_entity_name(entity_label, language):
    return "%%%s%%" % "".join(
        tokenize_light(entity_label, language)).upper()
Code example #29
def _entity_name_to_feature(entity_name, language):
    return "entityfeature%s" % "".join(tokenize_light(
        entity_name, language=language))
Code example #30
File: preprocessing.py  Project: zengfy2017/snips-nlu
def stem(string, language):
    tokens = tokenize_light(string, language)
    stemmed_tokens = [_stem(token, language) for token in tokens]
    return ' '.join(stemmed_tokens)
Code example #31
def capitalize(text, language):
    tokens = tokenize_light(text, language)
    stop_words = get_stop_words(language)
    return get_default_sep(language).join(
        t.title() if t.lower() not in stop_words
        else t.lower() for t in tokens)
Code example #32
File: featurizer.py  Project: warp-x/snips-nlu
def _builtin_entity_to_feature(builtin_entity_label, language):
    return "builtinentityfeature%s" % "".join(
        tokenize_light(builtin_entity_label, language=language))
Code example #33
def _get_builtin_entity_name(entity_label, language):
    return "%%%s%%" % "".join(tokenize_light(entity_label, language)).upper()
Code example #34
def _builtin_entity_to_feature(builtin_entity_label, language):
    return "builtinentityfeature%s" % "".join(tokenize_light(
        builtin_entity_label, language=language))
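All of the examples above build on tokenize_light, which splits a raw string into a list of token strings for a given language code. A minimal usage sketch, assuming the import path of recent snips-nlu releases (older versions expose the same helper from snips_nlu.tokenization):

from snips_nlu.preprocessing import tokenize_light

tokens = tokenize_light("Set a timer for 10 minutes", "en")
# Expected: a plain list of token strings, e.g. ["Set", "a", "timer", "for", "10", "minutes"]
print(tokens)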