def _preprocess(self, x):
    """Parse builtin and custom entities for each raw utterance.

    Entities are extracted on the unnormalized text so the parsers see
    the utterances exactly as written.

    :return: tuple ``(x, builtin_entities, custom_entities)`` where the
        two entity lists are aligned with ``x``
    """
    builtin_ents = []
    custom_ents = []
    for utterance in x:
        text = get_text_from_chunks(utterance[DATA])
        builtin_ents.append(
            self.builtin_entity_parser.parse(
                text, self.builtin_entity_scope, use_cache=True))
        custom_ents.append(
            self.custom_entity_parser.parse(text, use_cache=True))
    return x, builtin_ents, custom_ents
def generate_noise_utterances(augmented_utterances, noise, num_intents,
                              data_augmentation_config, language,
                              random_state):
    """Build noise utterances whose lengths match the dataset statistics.

    The number of generated utterances is ``noise_factor`` times the
    average number of utterances per intent, capped by the noise corpus
    size. Returns a list of noise utterances in the chunked format.
    """
    if not augmented_utterances or not num_intents:
        return []
    avg_num_utterances = len(augmented_utterances) / float(num_intents)
    replacement = data_augmentation_config.unknown_words_replacement_string
    if replacement is not None:
        noise = generate_smart_noise(noise, augmented_utterances,
                                     replacement, language)
    noise_size = min(
        int(data_augmentation_config.noise_factor * avg_num_utterances),
        len(noise))
    lengths = []
    for u in augmented_utterances:
        tokens = tokenize_light(get_text_from_chunks(u[DATA]), language)
        lengths.append(len(tokens))
    noise_it = get_noise_it(noise, np.mean(lengths), np.std(lengths),
                            random_state)
    # Collapse consecutive unknown-word markers into a single one
    noisy_utterances = []
    for _ in range(noise_size):
        sample = UNKNOWNWORD_REGEX.sub(UNKNOWNWORD, next(noise_it))
        noisy_utterances.append(text_to_utterance(sample))
    return noisy_utterances
def _preprocess(self, x, training=False):
    """Collect builtin and custom entities for each utterance.

    At training time entities are read directly from the annotated
    utterances; otherwise they are extracted by the entity parsers on
    the unnormalized text.

    :return: tuple ``(x, builtin_entities, custom_entities)``
    """
    if training:
        entity_pairs = [_entities_from_utterance(u) for u in x]
        builtin_ents, custom_ents = zip(*entity_pairs)
    else:
        builtin_ents = []
        custom_ents = []
        for utterance in x:
            text = get_text_from_chunks(utterance[DATA])
            builtin_ents.append(
                self.builtin_entity_parser.parse(
                    text, self.builtin_entity_scope, use_cache=True))
            custom_ents.append(
                self.custom_entity_parser.parse(text, use_cache=True))
    return x, builtin_ents, custom_ents
def generate_smart_noise(augmented_utterances, replacement_string, language):
    """Return the noise corpus with out-of-vocabulary words replaced.

    Every noise word absent from the vocabulary of the augmented
    utterances is mapped to ``replacement_string``, so the classifier
    learns a single "unknown word" token rather than arbitrary noise
    words.

    :param augmented_utterances: utterances in the chunked ``DATA`` format
    :param replacement_string: token substituted for unknown words
    :param language: ISO language code used for tokenization
    :return: list of noise words, with unknown words replaced
    """
    text_utterances = (get_text_from_chunks(u[DATA])
                       for u in augmented_utterances)
    # Build the vocabulary once, directly as a set, for O(1) membership
    # tests (the original built an intermediate list first)
    vocab = {w for u in text_utterances for w in tokenize_light(u, language)}
    noise = get_noises(language)
    return [w if w in vocab else replacement_string for w in noise]
def generate_smart_noise(noise, augmented_utterances, replacement_string,
                         language):
    """Map out-of-vocabulary noise words to the replacement string.

    :param noise: list of noise words
    :param augmented_utterances: utterances in the chunked ``DATA`` format
    :param replacement_string: token substituted for unknown words
    :param language: ISO language code used for tokenization
    :return: ``noise`` with every word absent from the utterance
        vocabulary replaced by ``replacement_string``
    """
    text_utterances = (get_text_from_chunks(u[DATA])
                       for u in augmented_utterances)
    # Build the vocabulary once, directly as a set, for O(1) membership
    # tests (the original built an intermediate list first)
    vocab = {w for u in text_utterances for w in tokenize_light(u, language)}
    return [w if w in vocab else replacement_string for w in noise]
def generate_noise_utterances(augmented_utterances, num_intents,
                              data_augmentation_config, language,
                              random_state):
    """Generate noise texts sized to the dataset's length statistics.

    The noise corpus is either the smart noise (unknown words replaced)
    or the raw language noise, depending on the augmentation config.
    Returns a list of noise strings.
    """
    if not augmented_utterances or not num_intents:
        return []
    avg_num_utterances = len(augmented_utterances) / float(num_intents)
    replacement = data_augmentation_config.unknown_words_replacement_string
    if replacement is not None:
        noise = generate_smart_noise(augmented_utterances, replacement,
                                     language)
    else:
        noise = get_noises(language)
    noise_size = min(
        int(data_augmentation_config.noise_factor * avg_num_utterances),
        len(noise))
    lengths = []
    for u in augmented_utterances:
        text = get_text_from_chunks(u[DATA])
        lengths.append(len(tokenize_light(text, language)))
    noise_it = get_noise_it(noise, np.mean(lengths), np.std(lengths),
                            random_state)
    # Collapse repeated unknown-word markers into a single one
    return [UNKNOWNWORD_REGEX.sub(UNKNOWNWORD, next(noise_it))
            for _ in range(noise_size)]
def build_training_data(dataset, language, data_augmentation_config,
                        random_state):
    """Build the (texts, classes, intent mapping) triple used for training.

    Each intent is augmented to at least ``min_utterances`` utterances,
    optional noise utterances are appended under an extra "noise" class,
    and the mapping from class index back to intent name is returned
    (``None`` for the noise class).

    :return: tuple ``(utterance_texts, class_array, intent_mapping)``
    """
    # Map each intent to a class index; the manual counter loop was
    # replaced by enumerate over the sorted intent names. The next free
    # index is reserved for the noise class.
    intents = dataset[INTENTS]
    classes_mapping = {intent: i
                       for i, intent in enumerate(sorted(intents))}
    noise_class = len(classes_mapping)

    # Computing dataset statistics
    nb_utterances = [len(intent[UTTERANCES]) for intent in itervalues(intents)]

    augmented_utterances = []
    utterance_classes = []
    for nb_utterance, intent_name in zip(nb_utterances, intents):
        min_utterances_to_generate = max(
            data_augmentation_config.min_utterances, nb_utterance)
        utterances = augment_utterances(
            dataset, intent_name, language=language,
            min_utterances=min_utterances_to_generate,
            capitalization_ratio=0.0, random_state=random_state)
        augmented_utterances += utterances
        utterance_classes += [classes_mapping[intent_name]
                              for _ in range(len(utterances))]
    augmented_utterances = add_unknown_word_to_utterances(
        augmented_utterances,
        data_augmentation_config.unknown_words_replacement_string,
        data_augmentation_config.unknown_word_prob,
        random_state
    )

    # Adding noise
    noisy_utterances = generate_noise_utterances(
        augmented_utterances, len(intents), data_augmentation_config,
        language, random_state)
    augmented_utterances = [get_text_from_chunks(u[DATA])
                            for u in augmented_utterances]

    augmented_utterances += noisy_utterances
    utterance_classes += [noise_class for _ in noisy_utterances]
    if noisy_utterances:
        classes_mapping[NOISE_NAME] = noise_class

    # Invert the mapping: class index -> intent name (None for noise)
    nb_classes = len(set(itervalues(classes_mapping)))
    intent_mapping = [None for _ in range(nb_classes)]
    for intent, intent_class in iteritems(classes_mapping):
        if intent == NOISE_NAME:
            intent_mapping[intent_class] = None
        else:
            intent_mapping[intent_class] = intent

    return augmented_utterances, np.array(utterance_classes), intent_mapping
def fit(self, dataset, utterances, classes):
    """Fit the featurizer: learn the entity-feature mapping, the tf-idf
    vocabulary, and select the best features with a chi2 test.

    :param dataset: dataset used to map entity utterances to feature names
    :param utterances: training utterances in the chunked ``DATA`` format
    :param classes: class index for each utterance, aligned with
        ``utterances``
    :return: ``self`` once fitted, or ``None`` when no utterance yields
        any token (nothing can be learned)
    """
    utterances_texts = (get_text_from_chunks(u[DATA]) for u in utterances)
    if not any(tokenize_light(q, self.language) for q in utterances_texts):
        return None
    utterances_to_features = _get_utterances_to_features_names(
        dataset, self.language)
    # Group feature names under the normalized/stemmed form of each
    # entity utterance
    normalized_utterances_to_features = defaultdict(set)
    for k, v in iteritems(utterances_to_features):
        normalized_utterances_to_features[_normalize_stem(
            k, self.language)].update(v)
    # The unknown-word marker itself must not contribute entity features
    if self.unknown_words_replacement_string is not None \
            and self.unknown_words_replacement_string in \
            normalized_utterances_to_features:
        normalized_utterances_to_features.pop(
            self.unknown_words_replacement_string)
    self.entity_utterances_to_feature_names = dict(
        normalized_utterances_to_features)

    preprocessed_utterances = self.preprocess_utterances(utterances)
    # pylint: disable=C0103
    X_train_tfidf = self.tfidf_vectorizer.fit_transform(
        preprocessed_utterances)
    # pylint: enable=C0103
    # Reverse vocabulary mapping: column index -> word
    features_idx = {
        self.tfidf_vectorizer.vocabulary_[word]: word
        for word in self.tfidf_vectorizer.vocabulary_
    }
    stop_words = get_stop_words(self.language)
    # Keep features whose chi2 p-value is below the threshold; fall back
    # to the most significant features when none passes
    _, pval = chi2(X_train_tfidf, classes)
    self.best_features = [
        i for i, v in enumerate(pval) if v < self.config.pvalue_threshold
    ]
    if not self.best_features:
        self.best_features = [
            idx for idx, val in enumerate(pval) if val == pval.min()
        ]
    feature_names = {}
    for utterance_index in self.best_features:
        feature_names[utterance_index] = {
            "word": features_idx[utterance_index],
            "pval": pval[utterance_index]
        }
    # Stop words must clear a stricter (halved) p-value threshold to be
    # kept as features
    for feat in feature_names:
        if feature_names[feat]["word"] in stop_words:
            if feature_names[feat]["pval"] > \
                    self.config.pvalue_threshold / 2.0:
                self.best_features.remove(feat)
    return self
def test_should_build_training_data_with_noise(self,
                                               mocked_augment_utterances,
                                               mocked_get_noises):
    """Noise utterances should be appended after the intent utterances
    and mapped to an extra class resolving to None in the mapping."""
    # Given
    mocked_noises = ["mocked_noise_%s" % i for i in range(100)]
    mocked_get_noises.return_value = mocked_noises
    mocked_augment_utterances.side_effect = get_mocked_augment_utterances
    num_intents = 3
    utterances_length = 5
    num_queries_per_intent = 3
    fake_utterance = {
        "data": [
            {"text": " ".join("1" for _ in range(utterances_length))}
        ]
    }
    dataset = {
        "intents": {
            str(i): {"utterances": [fake_utterance] * num_queries_per_intent}
            for i in range(num_intents)
        }
    }
    random_state = np.random.RandomState(1)

    # When
    np.random.seed(42)
    noise_factor = 2
    augmentation_config = IntentClassifierDataAugmentationConfig(
        noise_factor=noise_factor, unknown_word_prob=0,
        unknown_words_replacement_string=None)
    utterances, _, intent_mapping = build_training_data(
        dataset, LANGUAGE_EN, augmentation_config, random_state)

    # Then
    expected_utterances = [
        get_text_from_chunks(utterance[DATA])
        for intent in itervalues(dataset[INTENTS])
        for utterance in intent[UTTERANCES]
    ]
    # Replay the noise generation with the same seed/state to predict
    # the exact noise utterances that were appended
    np.random.seed(42)
    noise = list(mocked_noises)
    noise_size = int(min(noise_factor * num_queries_per_intent, len(noise)))
    noise_it = get_noise_it(mocked_noises, utterances_length, 0,
                            random_state)
    expected_utterances += [next(noise_it) for _ in range(noise_size)]
    expected_intent_mapping = sorted(dataset["intents"])
    expected_intent_mapping.append(None)
    self.assertListEqual(utterances, expected_utterances)
    self.assertListEqual(intent_mapping, expected_intent_mapping)
def _preprocess(self, utterances, training=False):
    """Normalize utterances and gather entity and word-cluster features.

    :param utterances: utterances in the chunked ``DATA`` format
    :param training: when True, entities are read from the utterance
        annotations instead of being parsed
    :return: tuple ``(normalized_utterances, builtin_entities,
        custom_entities, word_clusters)``, all aligned with ``utterances``
    """
    # Work on a deep copy: chunk texts are normalized/stemmed in place
    normalized_utterances = deepcopy(utterances)
    for u in normalized_utterances:
        for chunk in u[DATA]:
            chunk[TEXT] = _normalize_stem(chunk[TEXT], self.language,
                                          self.resources,
                                          self.config.use_stemming)

    if training:
        builtin_ents, custom_ents = zip(
            *[_entities_from_utterance(u) for u in utterances])
    else:
        # Extract builtin entities on the unnormalized utterances
        builtin_ents = [
            self.builtin_entity_parser.parse(get_text_from_chunks(u[DATA]),
                                             self.builtin_entity_scope,
                                             use_cache=True)
            for u in utterances
        ]
        # Extract custom entities on the normalized utterances
        custom_ents = [
            self.custom_entity_parser.parse(get_text_from_chunks(u[DATA]),
                                            use_cache=True)
            for u in normalized_utterances
        ]
    if self.config.word_clusters_name:
        # Extract word clusters on the unnormalized (lowercased) utterances
        original_utterances_text = [
            get_text_from_chunks(u[DATA]) for u in utterances
        ]
        w_clusters = [
            _get_word_cluster_features(
                tokenize_light(u.lower(), self.language),
                self.config.word_clusters_name,
                self.resources)
            for u in original_utterances_text
        ]
    else:
        w_clusters = [None for _ in normalized_utterances]
    return normalized_utterances, builtin_ents, custom_ents, w_clusters
def test_should_build_training_data_with_noise(
        self, mocked_augment_utterances, mocked_get_noises):
    """Noise utterances should be appended after the intent utterances
    and mapped to an extra class resolving to None in the mapping."""
    # Given
    mocked_noises = ["mocked_noise_%s" % i for i in range(100)]
    mocked_get_noises.return_value = mocked_noises
    mocked_augment_utterances.side_effect = get_mocked_augment_utterances
    num_intents = 3
    utterances_length = 5
    num_queries_per_intent = 3
    fake_utterance = {
        "data": [
            {"text": " ".join("1" for _ in range(utterances_length))}
        ]
    }
    dataset = {
        "intents": {
            str(i): {
                "utterances": [fake_utterance] * num_queries_per_intent
            } for i in range(num_intents)
        }
    }
    random_state = np.random.RandomState(1)

    # When
    np.random.seed(42)
    noise_factor = 2
    data_augmentation_config = IntentClassifierDataAugmentationConfig(
        noise_factor=noise_factor, unknown_word_prob=0,
        unknown_words_replacement_string=None)
    utterances, _, intent_mapping = build_training_data(
        dataset, LANGUAGE_EN, data_augmentation_config, random_state)

    # Then
    expected_utterances = [get_text_from_chunks(utterance[DATA])
                           for intent in itervalues(dataset[INTENTS])
                           for utterance in intent[UTTERANCES]]
    # Replay the noise generation with the same seed/state to predict
    # the exact noise utterances that were appended
    np.random.seed(42)
    noise = list(mocked_noises)
    noise_size = int(min(noise_factor * num_queries_per_intent,
                         len(noise)))
    noise_it = get_noise_it(mocked_noises, utterances_length, 0,
                            random_state)
    noisy_utterances = [next(noise_it) for _ in range(noise_size)]
    expected_utterances += list(noisy_utterances)
    expected_intent_mapping = sorted(dataset["intents"])
    expected_intent_mapping.append(None)
    self.assertListEqual(utterances, expected_utterances)
    self.assertListEqual(intent_mapping, expected_intent_mapping)
def _enrich_utterance(self, x, builtin_ents, custom_ents):
    """Tokenize an utterance after replacing its entities with
    placeholders.

    :return: list of tokens, with unknown-word marker tokens removed
        when a replacement string is configured
    """
    raw_text = get_text_from_chunks(x[DATA])
    # Substitute every entity occurrence with its placeholder token
    with_placeholders = replace_entities_with_placeholders(
        raw_text, builtin_ents + custom_ents, self._placeholder_fn)[1]
    tokens = tokenize_light(with_placeholders, self.language)
    # Drop the unknown-word marker tokens when one is configured
    replacement = self.config.unknown_words_replacement_string
    if replacement:
        tokens = [t for t in tokens if t != replacement]
    return tokens
def fit_transform(self, dataset, utterances, classes, none_class):
    """Fit the vectorizers on the utterances and return the feature
    matrix.

    :raises _EmptyDatasetUtterancesError: when no utterance yields any
        token
    :return: sparse matrix of tf-idf features, horizontally stacked with
        cooccurrence features when enabled in the config
    """
    dataset = validate_and_format_dataset(dataset)
    self.language = dataset[LANGUAGE]

    texts = (get_text_from_chunks(u[DATA]) for u in utterances)
    if not any(tokenize_light(t, self.language) for t in texts):
        raise _EmptyDatasetUtterancesError(
            "Tokenized utterances are empty")

    x = self._fit_transform_tfidf_vectorizer(utterances, classes, dataset)
    if self.config.added_cooccurrence_feature_ratio:
        self._fit_cooccurrence_vectorizer(utterances, classes, none_class,
                                          dataset)
        x = sp.hstack(
            (x, self.cooccurrence_vectorizer.transform(utterances)))
    return x
def _preprocess_utterance(utterance, language, builtin_entity_parser,
                          custom_entity_parser, word_clusters_name,
                          use_stemming, unknownword_replacement_string):
    """Turn an annotated utterance into a single feature string.

    The string is the normalized utterance (builtin-slot values removed)
    followed by the sorted builtin-entity, custom-entity and word-cluster
    features.
    """
    raw_text = get_text_from_chunks(utterance[DATA])
    tokens = tokenize_light(raw_text, language)
    cluster_features = _get_word_cluster_features(
        tokens, word_clusters_name, language)
    normalized_tokens = [_normalize_stem(t, language, use_stemming)
                         for t in tokens]

    # Custom entities are parsed on the normalized text; the unknown-word
    # marker itself must not produce an entity feature
    custom_entities = custom_entity_parser.parse(
        " ".join(normalized_tokens))
    custom_entity_features = [
        _entity_name_to_feature(e[ENTITY_KIND], language)
        for e in custom_entities
        if e["value"] != unknownword_replacement_string]

    builtin_entities = builtin_entity_parser.parse(raw_text, use_cache=True)
    builtin_entity_features = [
        _builtin_entity_to_feature(ent[ENTITY_KIND], language)
        for ent in builtin_entities]

    # We remove values of builtin slots from the utterance to avoid
    # learning specific samples such as '42' or 'tomorrow'
    kept_chunks = [
        _normalize_stem(chunk[TEXT], language, use_stemming)
        for chunk in utterance[DATA]
        if ENTITY not in chunk or not is_builtin_entity(chunk[ENTITY])]

    features = get_default_sep(language).join(kept_chunks)
    for extra_features in (builtin_entity_features, custom_entity_features,
                           cluster_features):
        if extra_features:
            features += " " + " ".join(sorted(extra_features))
    return features
def test_should_build_training_data_with_no_stemming_no_noise(
        self, mocked_augment_utterances):
    """With noise_factor=0 the training data should contain only the raw
    utterance texts and one class per dataset intent."""
    # Given
    dataset = SAMPLE_DATASET
    mocked_augment_utterances.side_effect = get_mocked_augment_utterances
    random_state = np.random.RandomState(1)

    # When
    config = IntentClassifierDataAugmentationConfig(noise_factor=0)
    utterances, _, intent_mapping = build_training_data(
        dataset, LANGUAGE_EN, config, random_state)

    # Then
    expected_utterances = [
        get_text_from_chunks(utterance[DATA])
        for intent in itervalues(dataset[INTENTS])
        for utterance in intent[UTTERANCES]
    ]
    expected_intent_mapping = [u'dummy_intent_1', u'dummy_intent_2']
    self.assertListEqual(utterances, expected_utterances)
    self.assertListEqual(expected_intent_mapping, intent_mapping)
def test_should_build_training_data_with_no_stemming_no_noise(
        self, mocked_augment_utterances):
    """With noise_factor=0 the training data should contain only the raw
    utterance texts and one class per dataset intent."""
    # Given
    dataset = SAMPLE_DATASET
    mocked_augment_utterances.side_effect = get_mocked_augment_utterances
    random_state = np.random.RandomState(1)

    # When
    data_augmentation_config = IntentClassifierDataAugmentationConfig(
        noise_factor=0)
    utterances, _, intent_mapping = build_training_data(
        dataset, LANGUAGE_EN, data_augmentation_config, random_state)

    # Then
    expected_utterances = [
        get_text_from_chunks(utterance[DATA])
        for intent in itervalues(dataset[INTENTS])
        for utterance in intent[UTTERANCES]
    ]
    expected_intent_mapping = [u'dummy_intent_1', u'dummy_intent_2']
    self.assertListEqual(utterances, expected_utterances)
    self.assertListEqual(expected_intent_mapping, intent_mapping)
def fit(self, dataset, utterances, classes):
    """Fit the featurizer: fit the entity parsers, learn the tf-idf
    vocabulary, and select the best features with a chi2 test.

    :param dataset: dataset used to fit the entity parsers
    :param utterances: training utterances in the chunked ``DATA`` format
    :param classes: class index for each utterance, aligned with
        ``utterances``
    :return: ``self`` once fitted, or ``None`` when no utterance yields
        any token (nothing can be learned)
    """
    self.fit_builtin_entity_parser_if_needed(dataset)
    self.fit_custom_entity_parser_if_needed(dataset)
    utterances_texts = (get_text_from_chunks(u[DATA]) for u in utterances)
    if not any(tokenize_light(q, self.language) for q in utterances_texts):
        return None
    preprocessed_utterances = self.preprocess_utterances(utterances)
    # pylint: disable=C0103
    X_train_tfidf = self.tfidf_vectorizer.fit_transform(
        preprocessed_utterances)
    # pylint: enable=C0103
    # Reverse vocabulary mapping: column index -> word
    features_idx = {self.tfidf_vectorizer.vocabulary_[word]: word
                    for word in self.tfidf_vectorizer.vocabulary_}
    stop_words = get_stop_words(self.language)
    # Keep features whose chi2 p-value is below the threshold; fall back
    # to the most significant features when none passes
    _, pval = chi2(X_train_tfidf, classes)
    self.best_features = [i for i, v in enumerate(pval)
                          if v < self.config.pvalue_threshold]
    if not self.best_features:
        self.best_features = [idx for idx, val in enumerate(pval)
                              if val == pval.min()]
    feature_names = {}
    for utterance_index in self.best_features:
        feature_names[utterance_index] = {
            "word": features_idx[utterance_index],
            "pval": pval[utterance_index]}
    # Stop words must clear a stricter (halved) p-value threshold to be
    # kept as features
    for feat in feature_names:
        if feature_names[feat]["word"] in stop_words:
            if feature_names[feat]["pval"] > \
                    self.config.pvalue_threshold / 2.0:
                self.best_features.remove(feat)
    return self
def _preprocess_utterance(utterance, language,
                          entity_utterances_to_features_names,
                          word_clusters_name):
    """Turn an annotated utterance into a single feature string.

    The string is the normalized utterance (builtin-slot values removed)
    followed by the sorted builtin-entity, dataset-entity and
    word-cluster features.
    """
    raw_text = get_text_from_chunks(utterance[DATA])
    tokens = tokenize_light(raw_text, language)
    cluster_features = _get_word_cluster_features(
        tokens, word_clusters_name, language)
    normalized_tokens = [_normalize_stem(t, language) for t in tokens]
    entities_features = _get_dataset_entities_features(
        normalized_tokens, entity_utterances_to_features_names)

    builtin_entities = get_builtin_entities(raw_text, language,
                                            use_cache=True)
    builtin_entities_features = [
        _builtin_entity_to_feature(ent[ENTITY_KIND], language)
        for ent in builtin_entities
    ]

    # We remove values of builtin slots from the utterance to avoid
    # learning specific samples such as '42' or 'tomorrow'
    kept_chunks = [
        _normalize_stem(chunk[TEXT], language)
        for chunk in utterance[DATA]
        if ENTITY not in chunk or not is_builtin_entity(chunk[ENTITY])
    ]

    features = get_default_sep(language).join(kept_chunks)
    for extra_features in (builtin_entities_features, entities_features,
                           cluster_features):
        if extra_features:
            features += " " + " ".join(sorted(extra_features))
    return features