def _freeze_vocabulary(self, X=None):
    if not self.fixed_vocabulary:
        frozen = marisa_trie.Trie(six.iterkeys(self.vocabulary_))
        if X is not None:
            X = self._reorder_features(X, self.vocabulary_, frozen)
        self.vocabulary_ = frozen
        self.fixed_vocabulary = True
        del self.stop_words_
    return X
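# A minimal, hypothetical sketch (not part of the source above) of what the
# marisa_trie.Trie freeze buys: the trie acts as a read-only mapping from
# term to integer id, so it can stand in for the vocabulary dict while using
# much less memory. The example vocabulary below is illustrative only.
import marisa_trie

vocab = {"cat": 0, "dog": 1, "mat": 2}
frozen = marisa_trie.Trie(vocab.keys())

print("cat" in frozen)   # membership test works like a dict
print(frozen["cat"])     # integer id assigned by the trie; it may differ
                         # from the old dict value, which is why
                         # _freeze_vocabulary reorders features above.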
def test_metaestimator_delegation():
    # Ensures specified metaestimators have methods iff subestimator does
    def hides(method):
        @property
        def wrapper(obj):
            if obj.hidden_method == method.__name__:
                raise AttributeError('%r is hidden' % obj.hidden_method)
            return functools.partial(method, obj)
        return wrapper

    class SubEstimator(BaseEstimator):
        def __init__(self, param=1, hidden_method=None):
            self.param = param
            self.hidden_method = hidden_method

        def fit(self, X, y=None, *args, **kwargs):
            self.coef_ = np.arange(X.shape[1])
            return True

        def _check_fit(self):
            check_is_fitted(self, 'coef_')

        @hides
        def inverse_transform(self, X, *args, **kwargs):
            self._check_fit()
            return X

        @hides
        def transform(self, X, *args, **kwargs):
            self._check_fit()
            return X

        @hides
        def predict(self, X, *args, **kwargs):
            self._check_fit()
            return np.ones(X.shape[0])

        @hides
        def predict_proba(self, X, *args, **kwargs):
            self._check_fit()
            return np.ones(X.shape[0])

        @hides
        def predict_log_proba(self, X, *args, **kwargs):
            self._check_fit()
            return np.ones(X.shape[0])

        @hides
        def decision_function(self, X, *args, **kwargs):
            self._check_fit()
            return np.ones(X.shape[0])

        @hides
        def score(self, X, y, *args, **kwargs):
            self._check_fit()
            return 1.0

    methods = [k for k in iterkeys(SubEstimator.__dict__)
               if not k.startswith('_') and not k.startswith('fit')]
    methods.sort()

    for delegator_data in DELEGATING_METAESTIMATORS:
        delegate = SubEstimator()
        delegator = delegator_data.construct(delegate)
        for method in methods:
            if method in delegator_data.skip_methods:
                continue
            assert hasattr(delegate, method)
            assert hasattr(delegator, method), (
                "%s does not have method %r when its delegate does"
                % (delegator_data.name, method))
            # delegation before fit raises a NotFittedError
            if method == 'score':
                assert_raises(NotFittedError, getattr(delegator, method),
                              delegator_data.fit_args[0],
                              delegator_data.fit_args[1])
            else:
                assert_raises(NotFittedError, getattr(delegator, method),
                              delegator_data.fit_args[0])

        delegator.fit(*delegator_data.fit_args)
        for method in methods:
            if method in delegator_data.skip_methods:
                continue
            # smoke test delegation
            if method == 'score':
                getattr(delegator, method)(delegator_data.fit_args[0],
                                           delegator_data.fit_args[1])
            else:
                getattr(delegator, method)(delegator_data.fit_args[0])

        for method in methods:
            if method in delegator_data.skip_methods:
                continue
            delegate = SubEstimator(hidden_method=method)
            delegator = delegator_data.construct(delegate)
            assert not hasattr(delegate, method)
            assert not hasattr(delegator, method), (
                "%s has method %r when its delegate does not"
                % (delegator_data.name, method))
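# A standalone sketch (not part of the test above) of the trick the `hides`
# decorator relies on: a property that raises AttributeError makes hasattr()
# return False, so a delegating metaestimator can pretend the method does not
# exist. The class name below is illustrative only.
class Example(object):
    hidden_method = 'transform'

    @property
    def transform(self):
        if self.hidden_method == 'transform':
            raise AttributeError("'transform' is hidden")
        return lambda X: X

print(hasattr(Example(), 'transform'))  # False: hasattr swallows the
                                        # AttributeError raised by the property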
def fit_transform(self, raw_documents, y=None):
    """Learn the vocabulary dictionary and return the count vectors.

    This is more efficient than calling fit followed by transform.

    Parameters
    ----------
    raw_documents : iterable
        An iterable which yields either str, unicode or file objects.

    Returns
    -------
    vectors : array, [n_samples, n_features]
    """
    # We intentionally don't call the transform method to make
    # fit_transform overridable without unwanted side effects in
    # TfidfVectorizer.
    fixed_vocab = self.fixed_vocabulary
    if fixed_vocab:
        vocab = self.vocabulary_
        vocab_max_ind = max(six.itervalues(self.vocabulary_)) + 1
    else:
        vocab = {}
        vocab_max_ind = 0

    # Result of document conversion to term count arrays.
    row_ind = _make_int_array()
    col_ind = _make_int_array()
    feature_values = _make_int_array()
    term_counts = Counter()

    # term counts across entire corpus (count each term maximum once per
    # document)
    document_counts = Counter()

    analyze = self.build_analyzer()

    for n_doc, doc in enumerate(raw_documents):
        term_count_current = Counter(analyze(doc))
        term_counts.update(term_count_current)

        if not fixed_vocab:
            for term in six.iterkeys(term_count_current):
                if term not in vocab:
                    vocab[term] = vocab_max_ind
                    vocab_max_ind += 1

        document_counts.update(six.iterkeys(term_count_current))

        for term, count in six.iteritems(term_count_current):
            if term in vocab:
                row_ind.append(n_doc)
                col_ind.append(vocab[term])
                feature_values.append(count)
    n_doc += 1

    if fixed_vocab:
        # XXX max_df, min_df and max_features have no effect
        # with a fixed vocabulary.
        i_indices = row_ind
        j_indices = col_ind
        values = feature_values
    else:
        max_features = self.max_features
        max_df = self.max_df
        min_df = self.min_df

        max_doc_count = (max_df
                         if isinstance(max_df, numbers.Integral)
                         else max_df * n_doc)
        min_doc_count = (min_df
                         if isinstance(min_df, numbers.Integral)
                         else min_df * n_doc)

        # filter out stop words: terms that occur in almost all documents
        if max_doc_count < n_doc or min_doc_count > 1:
            stop_words = set(t for t, dc in six.iteritems(document_counts)
                             if not min_doc_count <= dc <= max_doc_count)
        else:
            stop_words = set()

        # list the terms that should be part of the vocabulary
        if max_features is None:
            terms = set(term_counts) - stop_words
        else:
            # extract the most frequent terms for the vocabulary
            terms = set()
            for t, tc in term_counts.most_common():
                if t not in stop_words:
                    terms.add(t)
                if len(terms) >= max_features:
                    break

        # store the learned stop words to make it easier to debug the
        # value of max_df
        self.stop_words_ = stop_words

        # free memory
        term_counts.clear()
        document_counts.clear()

        # store map from term name to feature integer index: we sort the
        # terms to have reproducible outcome for the vocabulary structure:
        # otherwise the mapping from feature name to indices might depend
        # on the memory layout of the machine. Furthermore sorted terms
        # might make it possible to perform binary search in the feature
        # names array.
        terms = sorted(terms)

        # reorder term indices
        reorder_indices = dict((vocab[term], i)
                               for i, term in enumerate(terms))
        self.vocabulary_ = dict(((t, i) for i, t in enumerate(terms)))

        # create term count arrays with new vocabulary structure
        i_indices = _make_int_array()
        j_indices = _make_int_array()
        values = _make_int_array()

        for i, col in enumerate(col_ind):
            if col in reorder_indices:
                i_indices.append(row_ind[i])
                j_indices.append(reorder_indices[col_ind[i]])
                values.append(feature_values[i])

        # free memory
        del reorder_indices
        del row_ind
        del col_ind
        del feature_values

    if not vocab:
        msg = "Empty vocabulary; "
        if fixed_vocab:
            msg += "%r passed to constructor." % vocab
        else:
            msg += "perhaps your documents contain stop words only?"
        raise ValueError(msg)

    # the term_counts and document_counts might be useful statistics, are
    # we really sure we want to drop them? They take some memory but
    # can be useful for corpus introspection
    return self._term_counts_to_matrix(n_doc, i_indices, j_indices, values)
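# A minimal usage sketch (not part of the source above), assuming the public
# scikit-learn CountVectorizer API; the tiny corpus is illustrative only.
from sklearn.feature_extraction.text import CountVectorizer

docs = ["the cat sat on the mat",
        "the dog sat on the log",
        "cats and dogs"]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(docs)  # sparse matrix, shape (n_samples, n_features)

# vocabulary_ maps each term to its sorted, reproducible column index
print(sorted(vectorizer.vocabulary_.items(), key=lambda kv: kv[1]))
print(X.toarray())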
def _freeze_vocabulary(self, X=None):
    if not self.fixed_vocabulary_:
        self.vocabulary_ = marisa_trie.Trie(six.iterkeys(self.vocabulary_))
        self.fixed_vocabulary_ = True
        del self.stop_words_
def fit_transform(self, raw_documents, y=None):
    """Learn the vocabulary dictionary and return the count vectors.

    This is more efficient than calling fit followed by transform.

    Parameters
    ----------
    raw_documents : iterable
        An iterable which yields either str, unicode or file objects.

    Returns
    -------
    vectors : array, [n_samples, n_features]
    """
    if self.fixed_vocabulary:
        # No need to fit anything, directly perform the transformation.
        # We intentionally don't call the transform method to make
        # fit_transform overridable without unwanted side effects in
        # TfidfVectorizer.
        analyze = self.build_analyzer()
        term_counts_per_doc = (Counter(analyze(doc))
                               for doc in raw_documents)
        return self._term_count_dicts_to_matrix(term_counts_per_doc)

    self.vocabulary_ = {}

    # result of document conversion to term count dicts
    term_counts_per_doc = []
    term_counts = Counter()

    # term counts across entire corpus (count each term maximum once per
    # document)
    document_counts = Counter()

    analyze = self.build_analyzer()

    # TODO: parallelize the following loop with joblib?
    # (see XXX up ahead)
    for doc in raw_documents:
        term_count_current = Counter(analyze(doc))
        term_counts.update(term_count_current)
        document_counts.update(six.iterkeys(term_count_current))
        term_counts_per_doc.append(term_count_current)

    n_doc = len(term_counts_per_doc)
    max_features = self.max_features
    max_df = self.max_df
    min_df = self.min_df

    max_doc_count = (max_df
                     if isinstance(max_df, numbers.Integral)
                     else max_df * n_doc)
    min_doc_count = (min_df
                     if isinstance(min_df, numbers.Integral)
                     else min_df * n_doc)

    # filter out stop words: terms that occur in almost all documents
    if max_doc_count < n_doc or min_doc_count > 1:
        stop_words = set(t for t, dc in six.iteritems(document_counts)
                         if dc > max_doc_count or dc < min_doc_count)
    else:
        stop_words = set()

    # list the terms that should be part of the vocabulary
    if max_features is None:
        terms = set(term_counts) - stop_words
    else:
        # extract the most frequent terms for the vocabulary
        terms = set()
        for t, tc in term_counts.most_common():
            if t not in stop_words:
                terms.add(t)
            if len(terms) >= max_features:
                break

    # store the learned stop words to make it easier to debug the value of
    # max_df
    self.stop_words_ = stop_words

    # store map from term name to feature integer index: we sort the terms
    # to have reproducible outcome for the vocabulary structure: otherwise
    # the mapping from feature name to indices might depend on the memory
    # layout of the machine. Furthermore sorted terms might make it
    # possible to perform binary search in the feature names array.
    vocab = dict(((t, i) for i, t in enumerate(sorted(terms))))
    if not vocab:
        raise ValueError("empty vocabulary; training set may have"
                         " contained only stop words or min_df (resp. "
                         "max_df) may be too high (resp. too low).")
    self.vocabulary_ = vocab

    # the term_counts and document_counts might be useful statistics, are
    # we really sure we want to drop them? They take some memory but
    # can be useful for corpus introspection
    return self._term_count_dicts_to_matrix(term_counts_per_doc)
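# A standalone sketch (not part of the source above) of the max_df/min_df
# document-frequency filtering used in fit_transform, with made-up counts.
import numbers
from collections import Counter

document_counts = Counter({"the": 5, "cat": 3, "rare": 1})
n_doc = 5
max_df, min_df = 0.9, 2   # float -> fraction of documents, int -> absolute count

max_doc_count = (max_df if isinstance(max_df, numbers.Integral)
                 else max_df * n_doc)   # 4.5
min_doc_count = (min_df if isinstance(min_df, numbers.Integral)
                 else min_df * n_doc)   # 2

stop_words = set(t for t, dc in document_counts.items()
                 if dc > max_doc_count or dc < min_doc_count)
print(stop_words)  # {'the', 'rare'}: 'the' is too frequent, 'rare' too infrequent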