def test_all_languages_should_have_stop_words(self):
    """Check that stop words can be loaded for every supported language.

    The capitalization feature of the CRF assumes stop words are available
    for all languages, so a missing stop-words resource must fail the test
    suite loudly.
    """
    # Given
    for language in get_all_languages():
        try:
            # When/Then
            get_stop_words(language)
        # Catch Exception rather than a bare except so that
        # KeyboardInterrupt/SystemExit are not converted into a test failure
        except Exception:  # pylint: disable=broad-except
            self.fail("%s has no stop words" % language)
def language(self, value):
    """Set the language and refresh the cached stop-word set accordingly.

    A ``None`` language resets the stop words to ``None``. Otherwise the
    stop words are loaded from the resources when the config asks to
    ignore stop words, and left as an empty set when it does not.
    """
    self._language = value
    if value is None:
        stop_words = None
    else:
        should_load = self.config.ignore_stop_words
        stop_words = get_stop_words(self.resources) if should_load else set()
    self._stop_words = stop_words
def fit(self, dataset, utterances, classes):
    """Fit the featurizer on annotated utterances.

    Builds the entity-utterance -> feature-names mapping, fits the TF-IDF
    vectorizer and selects the best features with a chi2 test against
    ``classes``. Returns ``self`` when fitted, or ``None`` when no
    utterance contains any token (nothing to fit on).
    """
    utterances_texts = (get_text_from_chunks(u[DATA]) for u in utterances)
    # Bail out early when every utterance tokenizes to nothing
    if not any(tokenize_light(q, self.language) for q in utterances_texts):
        return None
    utterances_to_features = _get_utterances_to_features_names(
        dataset, self.language)
    # Group feature names under the normalized/stemmed form of each
    # entity utterance so look-ups are stem-insensitive
    normalized_utterances_to_features = defaultdict(set)
    for k, v in iteritems(utterances_to_features):
        normalized_utterances_to_features[_normalize_stem(
            k, self.language)].update(v)
    # The replacement token for unknown words must not act as an entity
    # feature, so drop it from the mapping if present
    if self.unknown_words_replacement_string is not None \
            and self.unknown_words_replacement_string in \
            normalized_utterances_to_features:
        normalized_utterances_to_features.pop(
            self.unknown_words_replacement_string)
    self.entity_utterances_to_feature_names = dict(
        normalized_utterances_to_features)
    preprocessed_utterances = self.preprocess_utterances(utterances)
    # pylint: disable=C0103
    X_train_tfidf = self.tfidf_vectorizer.fit_transform(
        preprocessed_utterances)
    # pylint: enable=C0103
    # Invert the vectorizer vocabulary: column index -> word
    features_idx = {
        self.tfidf_vectorizer.vocabulary_[word]: word
        for word in self.tfidf_vectorizer.vocabulary_
    }
    stop_words = get_stop_words(self.language)
    _, pval = chi2(X_train_tfidf, classes)
    # Keep features whose chi2 p-value is below the configured threshold
    self.best_features = [
        i for i, v in enumerate(pval) if v < self.config.pvalue_threshold
    ]
    if not self.best_features:
        # Fallback: never end up with zero features, keep the most
        # significant one(s)
        self.best_features = [
            idx for idx, val in enumerate(pval) if val == pval.min()
        ]
    feature_names = {}
    for utterance_index in self.best_features:
        feature_names[utterance_index] = {
            "word": features_idx[utterance_index],
            "pval": pval[utterance_index]
        }
    # Drop stop words that are only weakly significant (p-value above half
    # the threshold); strongly significant stop words are kept
    for feat in feature_names:
        if feature_names[feat]["word"] in stop_words:
            if feature_names[feat]["pval"] > \
                    self.config.pvalue_threshold / 2.0:
                self.best_features.remove(feat)
    return self
def fit(self, dataset, utterances, classes):
    """Fit the featurizer on the given utterances.

    Builds the entity-utterance -> feature-names mapping, fits the TF-IDF
    vectorizer and selects the best features with a chi2 test against
    ``classes``. Returns ``self`` when fitted, or ``None`` when every
    utterance tokenizes to an empty string.
    """
    utterances_to_features = _get_utterances_to_features_names(
        dataset, self.language)
    # Group feature names under the normalized/stemmed form of each
    # entity utterance so look-ups are stem-insensitive
    normalized_utterances_to_features = defaultdict(set)
    for k, v in iteritems(utterances_to_features):
        normalized_utterances_to_features[
            _normalize_stem(k, self.language)].update(v)
    # The replacement token for unknown words must not act as an entity
    # feature, so drop it from the mapping if present
    if self.unknown_words_replacement_string is not None \
            and self.unknown_words_replacement_string in \
            normalized_utterances_to_features:
        normalized_utterances_to_features.pop(
            self.unknown_words_replacement_string)
    self.entity_utterances_to_feature_names = dict(
        normalized_utterances_to_features)
    # NOTE(review): here utterances are assumed to be raw texts passed
    # directly to tokenize_light — confirm against callers
    if all(not "".join(tokenize_light(q, self.language))
           for q in utterances):
        return None
    preprocessed_utterances = self.preprocess_utterances(utterances)
    # pylint: disable=C0103
    X_train_tfidf = self.tfidf_vectorizer.fit_transform(
        preprocessed_utterances)
    # pylint: enable=C0103
    # Invert the vectorizer vocabulary: column index -> word
    list_index_words = {
        self.tfidf_vectorizer.vocabulary_[word]: word
        for word in self.tfidf_vectorizer.vocabulary_
    }
    stop_words = get_stop_words(self.language)
    _, pval = chi2(X_train_tfidf, classes)
    # Keep features whose chi2 p-value is below the configured threshold
    self.best_features = [i for i, v in enumerate(pval)
                          if v < self.config.pvalue_threshold]
    if not self.best_features:
        # Fallback: never end up with zero features, keep the most
        # significant one(s)
        self.best_features = [idx for idx, val in enumerate(pval)
                              if val == pval.min()]
    feature_names = {}
    for utterance_index in self.best_features:
        feature_names[utterance_index] = {
            "word": list_index_words[utterance_index],
            "pval": pval[utterance_index]}
    # Drop stop words that are only weakly significant (p-value above half
    # the threshold); strongly significant stop words are kept
    for feat in feature_names:
        if feature_names[feat]["word"] in stop_words:
            if feature_names[feat]["pval"] > \
                    self.config.pvalue_threshold / 2.0:
                self.best_features.remove(feat)
    return self
def _extract_word_pairs(self, utterance): if self.config.filter_stop_words: stop_words = get_stop_words(self.resources) utterance = [t for t in utterance if t not in stop_words] pairs = set() for j, w1 in enumerate(utterance): max_index = None if self.config.window_size is not None: max_index = j + self.config.window_size + 1 for w2 in utterance[j + 1:max_index]: key = (w1, w2) if not self.config.keep_order: key = tuple(sorted(key)) pairs.add(key) return pairs
def fit(self, dataset, queries, y):
    """Fit the featurizer on the given queries and their labels ``y``.

    Builds the entity-utterance -> feature-names mapping, fits the TF-IDF
    vectorizer and selects the best features with a chi2 test against
    ``y``. Returns ``self`` when fitted, or ``None`` when every query
    tokenizes to an empty string.
    """
    utterances_to_features = _get_utterances_to_features_names(
        dataset, self.language)
    # Group feature names under the normalized/stemmed form of each
    # entity utterance so look-ups are stem-insensitive
    normalized_utterances_to_features = defaultdict(set)
    for k, v in iteritems(utterances_to_features):
        normalized_utterances_to_features[_normalize_stem(
            k, self.language)].update(v)
    # The replacement token for unknown words must not act as an entity
    # feature, so drop it from the mapping if present
    if self.unknown_words_replacement_string is not None \
            and self.unknown_words_replacement_string in \
            normalized_utterances_to_features:
        normalized_utterances_to_features.pop(
            self.unknown_words_replacement_string)
    self.entity_utterances_to_feature_names = dict(
        normalized_utterances_to_features)
    # Bail out when no query contains any token (nothing to fit on)
    if all(not "".join(tokenize_light(q, self.language))
           for q in queries):
        return None
    preprocessed_queries = self.preprocess_queries(queries)
    # pylint: disable=C0103
    X_train_tfidf = self.tfidf_vectorizer.fit_transform(
        preprocessed_queries)
    # pylint: enable=C0103
    # Invert the vectorizer vocabulary: column index -> word
    list_index_words = {
        self.tfidf_vectorizer.vocabulary_[x]: x
        for x in self.tfidf_vectorizer.vocabulary_
    }
    stop_words = get_stop_words(self.language)
    _, pval = chi2(X_train_tfidf, y)
    # Keep features whose chi2 p-value is below the threshold
    self.best_features = [
        i for i, v in enumerate(pval) if v < self.pvalue_threshold
    ]
    if not self.best_features:
        # Fallback: never end up with zero features, keep the most
        # significant one(s)
        self.best_features = [
            idx for idx, val in enumerate(pval) if val == pval.min()
        ]
    feature_names = {}
    for i in self.best_features:
        feature_names[i] = {'word': list_index_words[i], 'pval': pval[i]}
    # Drop stop words that are only weakly significant (p-value above half
    # the threshold); strongly significant stop words are kept
    for feat in feature_names:
        if feature_names[feat]['word'] in stop_words:
            if feature_names[feat]['pval'] > self.pvalue_threshold / 2.0:
                self.best_features.remove(feat)
    return self
def fit(self, dataset, utterances, classes):
    """Fit the featurizer on annotated utterances.

    Fits the entity parsers if needed, then the TF-IDF vectorizer, and
    selects the best features with a chi2 test against ``classes``.
    Returns ``self`` when fitted, or ``None`` when no utterance contains
    any token (nothing to fit on).
    """
    self.fit_builtin_entity_parser_if_needed(dataset)
    self.fit_custom_entity_parser_if_needed(dataset)
    utterances_texts = (get_text_from_chunks(u[DATA]) for u in utterances)
    # Bail out early when every utterance tokenizes to nothing
    if not any(tokenize_light(q, self.language) for q in utterances_texts):
        return None
    preprocessed_utterances = self.preprocess_utterances(utterances)
    # pylint: disable=C0103
    X_train_tfidf = self.tfidf_vectorizer.fit_transform(
        preprocessed_utterances)
    # pylint: enable=C0103
    # Invert the vectorizer vocabulary: column index -> word
    features_idx = {self.tfidf_vectorizer.vocabulary_[word]: word
                    for word in self.tfidf_vectorizer.vocabulary_}
    stop_words = get_stop_words(self.language)
    _, pval = chi2(X_train_tfidf, classes)
    # Keep features whose chi2 p-value is below the configured threshold
    self.best_features = [i for i, v in enumerate(pval)
                          if v < self.config.pvalue_threshold]
    if not self.best_features:
        # Fallback: never end up with zero features, keep the most
        # significant one(s)
        self.best_features = [idx for idx, val in enumerate(pval)
                              if val == pval.min()]
    feature_names = {}
    for utterance_index in self.best_features:
        feature_names[utterance_index] = {
            "word": features_idx[utterance_index],
            "pval": pval[utterance_index]}
    # Drop stop words that are only weakly significant (p-value above half
    # the threshold); strongly significant stop words are kept
    for feat in feature_names:
        if feature_names[feat]["word"] in stop_words:
            if feature_names[feat]["pval"] > \
                    self.config.pvalue_threshold / 2.0:
                self.best_features.remove(feat)
    return self
def capitalize(text, language, resources):
    """Title-case the tokens of ``text``, leaving stop words lower-cased.

    The tokens are re-joined with the language's default separator.
    """
    stop_words = get_stop_words(resources)

    def _recase(token):
        lowered = token.lower()
        # Stop words stay lower case, everything else gets title case
        return lowered if lowered in stop_words else token.title()

    separator = get_default_sep(language)
    return separator.join(
        _recase(token) for token in tokenize_light(text, language))
def capitalize(text, language):
    """Title-case the tokens of ``text``, leaving stop words lower-cased.

    The tokens are re-joined with the language's default separator.
    """
    stop_words = get_stop_words(language)
    recased = []
    for token in tokenize_light(text, language):
        lowered = token.lower()
        # Stop words stay lower case, everything else gets title case
        recased.append(lowered if lowered in stop_words else token.title())
    return get_default_sep(language).join(recased)