def _freeze_vocabulary(self, X=None):
    if not self.fixed_vocabulary:
        # Replace the mutable dict vocabulary with a compact, read-only
        # marisa trie; the trie assigns new term ids, so any matrix built
        # with the old ids must have its columns reordered to match.
        frozen = marisa_trie.Trie(six.iterkeys(self.vocabulary_))
        if X is not None:
            X = self._reorder_features(X, self.vocabulary_, frozen)
        self.vocabulary_ = frozen
        self.fixed_vocabulary = True
        del self.stop_words_
    return X
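
The _reorder_features helper is not shown in this excerpt. A minimal sketch of
what it plausibly does, under the assumption that the vocabulary values are the
column indices of X: marisa_trie.Trie assigns its own integer ids to the keys,
and those ids generally differ from the dict's incrementally assigned values,
so the columns of X must be permuted to match the frozen ids.

# Hypothetical sketch, not the project's actual helper.
import numpy as np
import scipy.sparse as sp

def reorder_features(X, old_vocab, new_vocab):
    # new_order[j] = old column index of the term that now maps to column j
    new_order = np.empty(len(old_vocab), dtype=np.intp)
    for term, old_index in old_vocab.items():
        new_order[new_vocab[term]] = old_index
    return sp.csr_matrix(X)[:, new_order]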
Example #2
def test_metaestimator_delegation():
    # Ensures specified metaestimators have methods iff subestimator does
    def hides(method):
        @property
        def wrapper(obj):
            if obj.hidden_method == method.__name__:
                raise AttributeError('%r is hidden' % obj.hidden_method)
            return functools.partial(method, obj)
        return wrapper

    class SubEstimator(BaseEstimator):
        def __init__(self, param=1, hidden_method=None):
            self.param = param
            self.hidden_method = hidden_method

        def fit(self, X, y=None, *args, **kwargs):
            self.coef_ = np.arange(X.shape[1])
            return True

        def _check_fit(self):
            check_is_fitted(self, 'coef_')

        @hides
        def inverse_transform(self, X, *args, **kwargs):
            self._check_fit()
            return X

        @hides
        def transform(self, X, *args, **kwargs):
            self._check_fit()
            return X

        @hides
        def predict(self, X, *args, **kwargs):
            self._check_fit()
            return np.ones(X.shape[0])

        @hides
        def predict_proba(self, X, *args, **kwargs):
            self._check_fit()
            return np.ones(X.shape[0])

        @hides
        def predict_log_proba(self, X, *args, **kwargs):
            self._check_fit()
            return np.ones(X.shape[0])

        @hides
        def decision_function(self, X, *args, **kwargs):
            self._check_fit()
            return np.ones(X.shape[0])

        @hides
        def score(self, X, y, *args, **kwargs):
            self._check_fit()
            return 1.0

    methods = [k for k in iterkeys(SubEstimator.__dict__)
               if not k.startswith('_') and not k.startswith('fit')]
    methods.sort()

    for delegator_data in DELEGATING_METAESTIMATORS:
        delegate = SubEstimator()
        delegator = delegator_data.construct(delegate)
        for method in methods:
            if method in delegator_data.skip_methods:
                continue
            assert hasattr(delegate, method)
            assert hasattr(delegator, method), (
                    "%s does not have method %r when its delegate does"
                    % (delegator_data.name, method))
            # delegation before fit raises a NotFittedError
            if method == 'score':
                assert_raises(NotFittedError, getattr(delegator, method),
                              delegator_data.fit_args[0],
                              delegator_data.fit_args[1])
            else:
                assert_raises(NotFittedError, getattr(delegator, method),
                              delegator_data.fit_args[0])

        delegator.fit(*delegator_data.fit_args)
        for method in methods:
            if method in delegator_data.skip_methods:
                continue
            # smoke test delegation
            if method == 'score':
                getattr(delegator, method)(delegator_data.fit_args[0],
                                           delegator_data.fit_args[1])
            else:
                getattr(delegator, method)(delegator_data.fit_args[0])

        for method in methods:
            if method in delegator_data.skip_methods:
                continue
            delegate = SubEstimator(hidden_method=method)
            delegator = delegator_data.construct(delegate)
            assert not hasattr(delegate, method)
            assert not hasattr(delegator, method), (
                    "%s has method %r when its delegate does not"
                    % (delegator_data.name, method))
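
The hides decorator above exploits a standard Python behavior: hasattr()
returns False whenever attribute access raises AttributeError, even when the
error comes from inside a property. A standalone illustration of just that
mechanism:

# Minimal illustration (separate from the test above): a property that
# raises AttributeError makes hasattr() report False, which is how a
# metaestimator can conditionally expose a delegated method.
class Hider(object):
    @property
    def transform(self):
        raise AttributeError("transform is hidden")

assert not hasattr(Hider(), 'transform')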
Example #3
    def fit_transform(self, raw_documents, y=None):
        """Learn the vocabulary dictionary and return the count vectors.

        This is more efficient than calling fit followed by transform.

        Parameters
        ----------
        raw_documents : iterable
            An iterable which yields either str, unicode or file objects.

        Returns
        -------
        vectors : sparse matrix, [n_samples, n_features]
            Document-term matrix of raw token counts.
        """
        # We intentionally don't call the transform method to make
        # fit_transform overridable without unwanted side effects in
        # TfidfVectorizer.
        fixed_vocab = self.fixed_vocabulary

        if fixed_vocab:
            vocab = self.vocabulary_
            vocab_max_ind = max(six.itervalues(self.vocabulary_)) + 1
        else:
            vocab = {}
            vocab_max_ind = 0

        # Result of document conversion to term count arrays.
        row_ind = _make_int_array()
        col_ind = _make_int_array()
        feature_values = _make_int_array()
        term_counts = Counter()

        # term counts across entire corpus (count each term maximum once per
        # document)
        document_counts = Counter()

        analyze = self.build_analyzer()

        n_doc = -1  # stays -1 if raw_documents turns out to be empty
        for n_doc, doc in enumerate(raw_documents):
            term_count_current = Counter(analyze(doc))
            term_counts.update(term_count_current)

            if not fixed_vocab:
                for term in six.iterkeys(term_count_current):
                    if term not in vocab:
                        vocab[term] = vocab_max_ind
                        vocab_max_ind += 1

            document_counts.update(six.iterkeys(term_count_current))

            for term, count in six.iteritems(term_count_current):
                if term in vocab:
                    row_ind.append(n_doc)
                    col_ind.append(vocab[term])
                    feature_values.append(count)
        n_doc += 1  # turn the last index into a document count

        if fixed_vocab:
            # XXX max_df, min_df and max_features have no effect
            # with a fixed vocabulary.
            i_indices = row_ind
            j_indices = col_ind
            values = feature_values
        else:
            max_features = self.max_features
            max_df = self.max_df
            min_df = self.min_df

            max_doc_count = (max_df if isinstance(max_df, numbers.Integral)
                                    else max_df * n_doc)
            min_doc_count = (min_df if isinstance(min_df, numbers.Integral)
                                    else min_df * n_doc)

            # filter out stop words: terms that occur in almost all documents
            if max_doc_count < n_doc or min_doc_count > 1:
                stop_words = set(t for t, dc in six.iteritems(document_counts)
                                   if not min_doc_count <= dc <= max_doc_count)
            else:
                stop_words = set()

            # list the terms that should be part of the vocabulary
            if max_features is None:
                terms = set(term_counts) - stop_words
            else:
                # extract the most frequent terms for the vocabulary
                terms = set()
                for t, tc in term_counts.most_common():
                    if t not in stop_words:
                        terms.add(t)
                    if len(terms) >= max_features:
                        break

            # store the learned stop words to make it easier to debug the value
            # of max_df
            self.stop_words_ = stop_words

            # free memory
            term_counts.clear()
            document_counts.clear()

            # store map from term name to feature integer index: we sort the
            # terms to have a reproducible outcome for the vocabulary
            # structure: otherwise the mapping from feature name to indices
            # might depend on the memory layout of the machine. Furthermore,
            # sorted terms might make it possible to perform binary search
            # in the feature names array.
            terms = sorted(terms)

            # reorder term indices
            reorder_indices = dict((vocab[term], i)
                                   for i, term in enumerate(terms))
            self.vocabulary_ = dict((t, i) for i, t in enumerate(terms))

            # create term count arrays with new vocabulary structure
            i_indices = _make_int_array()
            j_indices = _make_int_array()
            values = _make_int_array()
            for i, col in enumerate(col_ind):
                if col in reorder_indices:
                    i_indices.append(row_ind[i])
                    j_indices.append(reorder_indices[col_ind[i]])
                    values.append(feature_values[i])

            # free memory
            del reorder_indices
            del row_ind
            del col_ind
            del feature_values

        if not vocab:
            msg = "Empty vocabulary; "
            if fixed_vocab:
                msg += "%r passed to constructor." % vocab
            else:
                msg += "perhaps your documents contain stop words only?"
            raise ValueError(msg)

        # the term_counts and document_counts might be useful statistics; are
        # we really sure we want to drop them? They take some memory but can
        # be useful for corpus introspection.
        return self._term_counts_to_matrix(n_doc, i_indices, j_indices, values)
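
A usage sketch of the max_features path above, assuming the public
scikit-learn CountVectorizer interface; the token_pattern override is only
there so that single-letter tokens are kept:

# Hedged usage sketch: max_features keeps only the most frequent terms.
from sklearn.feature_extraction.text import CountVectorizer

docs = ["a a a b b c", "a b c d"]
vectorizer = CountVectorizer(max_features=2, token_pattern=r"\w")
X = vectorizer.fit_transform(docs)
print(sorted(vectorizer.vocabulary_))  # ['a', 'b'] -- the two most frequent
print(X.shape)                         # (2, 2)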
Example #4
def _freeze_vocabulary(self, X=None):
    if not self.fixed_vocabulary_:
        # Simpler variant: replace the dict with a compact marisa trie
        # without reordering any previously transformed matrix.
        self.vocabulary_ = marisa_trie.Trie(six.iterkeys(self.vocabulary_))
        self.fixed_vocabulary_ = True
        del self.stop_words_
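
Unlike the first snippet, this variant does not reorder an already-transformed
matrix; the trie's ids simply replace the dict's. The motivation for freezing
is memory: a marisa trie stores a large key set far more compactly than a
dict. A rough sketch of that saving (requires the third-party marisa-trie
package):

# Rough sketch of the memory motivation; pip install marisa-trie first.
import sys
import marisa_trie

vocab = {"term%06d" % i: i for i in range(100000)}
trie = marisa_trie.Trie(vocab)   # iterating a dict yields its keys
print(sys.getsizeof(vocab))      # the dict's hash table alone: several MB
print(len(trie))                 # same 100000 keys in compact trie storage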
Example #5
    def fit_transform(self, raw_documents, y=None):
        """Learn the vocabulary dictionary and return the count vectors.

        This is more efficient than calling fit followed by transform.

        Parameters
        ----------
        raw_documents : iterable
            An iterable which yields either str, unicode or file objects.

        Returns
        -------
        vectors : sparse matrix, [n_samples, n_features]
            Document-term matrix of raw token counts.
        """
        if self.fixed_vocabulary:
            # No need to fit anything, directly perform the transformation.
            # We intentionally don't call the transform method to make
            # fit_transform overridable without unwanted side effects in
            # TfidfVectorizer.
            analyze = self.build_analyzer()
            term_counts_per_doc = (Counter(analyze(doc))
                                   for doc in raw_documents)
            return self._term_count_dicts_to_matrix(term_counts_per_doc)

        self.vocabulary_ = {}
        # result of document conversion to term count dicts
        term_counts_per_doc = []
        term_counts = Counter()

        # term counts across entire corpus (count each term maximum once per
        # document)
        document_counts = Counter()

        analyze = self.build_analyzer()

        # TODO: parallelize the following loop with joblib?
        # (see XXX up ahead)
        for doc in raw_documents:
            term_count_current = Counter(analyze(doc))
            term_counts.update(term_count_current)

            document_counts.update(six.iterkeys(term_count_current))

            term_counts_per_doc.append(term_count_current)

        n_doc = len(term_counts_per_doc)
        max_features = self.max_features
        max_df = self.max_df
        min_df = self.min_df

        max_doc_count = (max_df
                         if isinstance(max_df, numbers.Integral)
                         else max_df * n_doc)
        min_doc_count = (min_df
                         if isinstance(min_df, numbers.Integral)
                         else min_df * n_doc)

        # filter out stop words: terms that occur in almost all documents
        if max_doc_count < n_doc or min_doc_count > 1:
            stop_words = set(t for t, dc in six.iteritems(document_counts)
                             if dc > max_doc_count or dc < min_doc_count)
        else:
            stop_words = set()

        # list the terms that should be part of the vocabulary
        if max_features is None:
            terms = set(term_counts) - stop_words
        else:
            # extract the most frequent terms for the vocabulary
            terms = set()
            for t, tc in term_counts.most_common():
                if t not in stop_words:
                    terms.add(t)
                if len(terms) >= max_features:
                    break

        # store the learned stop words to make it easier to debug the value of
        # max_df
        self.stop_words_ = stop_words

        # store map from term name to feature integer index: we sort the
        # terms to have a reproducible outcome for the vocabulary structure:
        # otherwise the mapping from feature name to indices might depend on
        # the memory layout of the machine. Furthermore, sorted terms might
        # make it possible to perform binary search in the feature names
        # array.
        vocab = dict((t, i) for i, t in enumerate(sorted(terms)))
        if not vocab:
            raise ValueError("empty vocabulary; training set may have"
                             " contained only stop words or min_df (resp. "
                             "max_df) may be too high (resp. too low).")
        self.vocabulary_ = vocab

        # the term_counts and document_counts might be useful statistics; are
        # we really sure we want to drop them? They take some memory but can
        # be useful for corpus introspection.
        return self._term_count_dicts_to_matrix(term_counts_per_doc)
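
An illustration of the max_df/min_df filtering described in the comments
above, again assuming the public CountVectorizer interface:

# With max_df=0.9 and four documents, max_doc_count is 3.6, so a term
# appearing in all four documents (df=4) becomes a corpus-specific stop word.
from sklearn.feature_extraction.text import CountVectorizer

docs = ["the cat", "the dog", "the bird", "the fish"]
vectorizer = CountVectorizer(max_df=0.9)
vectorizer.fit(docs)
print(sorted(vectorizer.vocabulary_))  # ['bird', 'cat', 'dog', 'fish']
print(vectorizer.stop_words_)          # {'the'}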