def _fit_transform_tfidf_vectorizer(self, x, y, dataset):
    """Fit the tf-idf vectorizer on *(x, y)*, prune its vocabulary via a
    chi2 feature-selection pass, and return the tf-idf features for *x*.

    Ngrams whose chi2 p-value is below ``config.pvalue_threshold`` are
    kept; if none qualifies, the ngrams with the smallest p-value are
    kept instead so the vocabulary is never emptied.

    Raises:
        _EmptyDatasetUtterancesError: if fitting yields an empty
            vocabulary (empty dataset or empty utterances).
    """
    self.tfidf_vectorizer = TfidfVectorizer(
        config=self.config.tfidf_vectorizer_config,
        builtin_entity_parser=self.builtin_entity_parser,
        custom_entity_parser=self.custom_entity_parser,
        resources=self.resources)
    x_tfidf = self.tfidf_vectorizer.fit_transform(x, dataset)

    if not self.tfidf_vectorizer.vocabulary:
        raise _EmptyDatasetUtterancesError(
            "Dataset is empty or with empty utterances")

    _, tfidf_pval = chi2(x_tfidf, y)
    threshold = self.config.pvalue_threshold
    best_tfidf_features = {
        feat for feat, pval in enumerate(tfidf_pval) if pval < threshold}
    if not best_tfidf_features:
        # Nothing passed the threshold: fall back to the most
        # significant feature(s) so the vocabulary stays non-empty
        min_pval = tfidf_pval.min()
        best_tfidf_features = {
            feat for feat, pval in enumerate(tfidf_pval)
            if pval == min_pval}

    best_ngrams = [
        ngram
        for ngram, feat in iteritems(self.tfidf_vectorizer.vocabulary)
        if feat in best_tfidf_features]
    self.tfidf_vectorizer.limit_vocabulary(best_ngrams)

    # We can't return x_tfidf[:best_tfidf_features] because of the
    # normalization in the transform of the tfidf_vectorizer
    # this would lead to inconsistent result between: fit_transform(x, y)
    # and fit(x, y).transform(x)
    return self.tfidf_vectorizer.transform(x)
def fit_transform(self, dataset, utterances, classes, none_class):
    """Fit the featurizer on the dataset and return the feature matrix.

    The matrix always contains the chi2-pruned tf-idf features; when
    ``config.added_cooccurrence_feature_ratio`` is truthy, cooccurrence
    features are fitted as well and stacked horizontally.

    Raises:
        _EmptyDatasetUtterancesError: if every utterance tokenizes to
            nothing in the dataset language.
    """
    dataset = validate_and_format_dataset(dataset)
    self.language = dataset[LANGUAGE]

    texts = (get_text_from_chunks(u[DATA]) for u in utterances)
    if not any(tokenize_light(text, self.language) for text in texts):
        raise _EmptyDatasetUtterancesError(
            "Tokenized utterances are empty")

    x = self._fit_transform_tfidf_vectorizer(utterances, classes, dataset)
    if self.config.added_cooccurrence_feature_ratio:
        self._fit_cooccurrence_vectorizer(
            utterances, classes, none_class, dataset)
        x_cooccurrence = self.cooccurrence_vectorizer.transform(utterances)
        x = sp.hstack((x, x_cooccurrence))
    return x