Code Example #1
File: pipe.py  Project: yejiachen/seglearn
    def set_params(self, **params):
        """
        Set the parameters of this estimator.
        Valid parameter keys can be listed with ``get_params()``.

        Returns
        -------
        self
        """
        items = self.steps
        names, _ = zip(*items)

        keys = list(six.iterkeys(params))

        for name in keys:
            if '__' not in name and name in names:
                # replace an estimator
                self._replace_estimator('steps', name, params.pop(name))

            elif callable(params[name]):
                # use a callable or function to set parameters
                params[name] = params[name](params)

            elif params[name] in keys:
                # set one arg from another
                params[name] = params[params[name]]

        BaseEstimator.set_params(self, **params)
        return self
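
The snippet above appears to build on the standard scikit-learn set_params behaviour (replace a named step, set nested step parameters) and adds support for callable and cross-referenced values. For reference, a minimal sketch of the baseline usage on a plain scikit-learn Pipeline; the step names 'scaler' and 'svc' are made up for illustration:

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC, LinearSVC

# Hypothetical two-step pipeline; step names are illustrative only.
pipe = Pipeline([('scaler', StandardScaler()), ('svc', SVC())])

pipe.set_params(svc__C=10.0)      # set a nested parameter on the 'svc' step
pipe.set_params(svc=LinearSVC())  # replace the 'svc' step by name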
Code Example #2
    def set_params(self, **params):
        """
        """
        valid_params = self.get_params(deep=False)
        # 1. replace `config`
        if 'config' in params:
            setattr(self, 'config', params.pop('config'))

        # 2. replace individual layer or non-layer parameters
        named_layers = self.named_layers
        names = []
        named_layers_dict = {}
        if named_layers:
            names, _ = zip(*named_layers)
            named_layers_dict = dict(named_layers)
        for name in list(six.iterkeys(params)):
            if '__' not in name:
                for i, layer_name in enumerate(names):
                    # replace layer
                    if layer_name == name:
                        new_val = params.pop(name)
                        if new_val is None:
                            del self.config['layers'][i]
                        else:
                            self.config['layers'][i] = new_val
                        break
                else:
                    # replace non-layer parameter
                    if name not in valid_params:
                        raise ValueError(
                            "Invalid parameter %s for estimator %s. "
                            "Check the list of available parameters "
                            "with `estimator.get_params().keys()`."
                            % (name, self))
                    setattr(self, name, params.pop(name))

            elif not name.startswith('layers'):
                # suppose all other parameters are layers parameters,
                # raise error otherwise
                raise ValueError("Invalid parameter %s for estimator %s. "
                                 "Check the list of available parameters "
                                 "with `estimator.get_params().keys()`." %
                                 (name, self))

        # 3. replace layer parameter
        search_params = [SearchParam(k, v) for k, v in six.iteritems(params)]
        search_params = sorted(search_params, key=lambda x: x.depth)

        for param in search_params:
            update = param.to_dict()
            try:
                _update_dict(named_layers_dict, update)
            except KeyError:
                raise ValueError("Invalid parameter %s for estimator %s. "
                                 "Check the list of available parameters "
                                 "with `estimator.get_params().keys()`." %
                                 (param.s_param, self))

        return self
Code Example #3
    def set_params(self, **params):

        for key in list(six.iterkeys(params)):
            if not key.startswith('layers'):
                raise ValueError("Only layer structure parameters are "
                                 "not searchable!")
        # 1. replace `layers`
        if 'layers' in params:
            setattr(self, 'layers', params.pop('layers'))

        # 2. replace individual layer
        layers = self.layers
        named_layers = self.named_layers
        names = []
        named_layers_dict = {}
        if named_layers:
            names, _ = zip(*named_layers)
            named_layers_dict = dict(named_layers)
        for name in list(six.iterkeys(params)):
            if '__' not in name:
                for i, layer_name in enumerate(names):
                    if layer_name == name:
                        new_val = params.pop(name)
                        if new_val is None:
                            del layers[i]
                        else:
                            layers[i] = new_val
                        break
                setattr(self, 'layers', layers)

        # 3. replace other layer parameter
        search_params = [SearchParam(k, v) for k, v in six.iteritems(params)]
        search_params = sorted(search_params, key=lambda x: x.depth)

        for param in search_params:
            update = param.to_dict()
            try:
                _update_dict(named_layers_dict, update)
            except KeyError:
                raise ValueError("Invalid parameter %s for estimator %s. "
                                 "Check the list of available parameters "
                                 "with `estimator.get_params().keys()`." %
                                 (param.s_param, self))

        return self
Code Example #4
    def _freeze_vocabulary(self, X=None):
        if not self.fixed_vocabulary_:
            frozen = marisa_trie.Trie(six.iterkeys(self.vocabulary_))
            if X is not None:
                X = self._reorder_features(X, self.vocabulary_, frozen)
            self.vocabulary_ = frozen
            self.fixed_vocabulary_ = True
            del self.stop_words_
        return X
Code Example #5
    def _freeze_vocabulary(self, X=None):
        if not self.fixed_vocabulary:
            frozen = marisa_trie.Trie(six.iterkeys(self.vocabulary_))
            if X is not None:
                X = self._reorder_features(X, self.vocabulary_, frozen)
            self.vocabulary_ = frozen
            self.fixed_vocabulary = True
            del self.stop_words_
        return X
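
Both variants above hand the vocabulary keys to marisa_trie.Trie and then reorder the feature columns. A small standalone sketch of why the reordering is needed: the trie assigns its own integer ids to keys, which generally differ from the indices in the original vocabulary dict (the vocabulary below is made up):

import marisa_trie

vocab = {'banana': 0, 'apple': 1, 'cherry': 2}   # hypothetical vocabulary
frozen = marisa_trie.Trie(vocab.keys())

# The trie's ids usually do not match the original indices, which is why the
# methods above call _reorder_features before storing the frozen vocabulary.
print({term: frozen[term] for term in vocab})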
Code Example #6
File: FeaturePipeline.py  Project: nonabelian/leapy
    def _set_params(self, attr, **params):
        # Ensure strict ordering of parameter setting:
        # 1. All steps
        if attr in params:
            setattr(self, attr, params.pop(attr))
        # 2. Step replacement
        items = getattr(self, attr)
        names = []
        if items:
            names, _, _ = zip(*items)
        for name in list(six.iterkeys(params)):
            if '__' not in name and name in names:
                self._replace_estimator(attr, name, params.pop(name))
        # 3. Step parameters and other initialisation arguments
        super().set_params(**params)
        return self
Code Example #7
def check_params(params, fn):
    """
    Check whether params are valid for function(s)

    Parameters
    ----------
    params : dict
    fn : function or iterable of functions
    """
    if not isinstance(fn, (list, tuple)):
        fn = [fn]
    for p in list(six.iterkeys(params)):
        for f in fn:
            if has_arg(f, p):
                break
        else:
            raise ValueError(
                "{} is not a legal parameter".format(p))
Code Example #8
    def _freeze_vocabulary(self, X=None):
        if not self.fixed_vocabulary_:
            self.vocabulary_ = marisa_trie.Trie(six.iterkeys(self.vocabulary_))
            self.fixed_vocabulary_ = True
            del self.stop_words_
Code Example #9
def test_metaestimator_delegation():
    # Ensures specified metaestimators have methods iff subestimator does
    def hides(method):
        @property
        def wrapper(obj):
            if obj.hidden_method == method.__name__:
                raise AttributeError('%r is hidden' % obj.hidden_method)
            return functools.partial(method, obj)

        return wrapper

    class SubEstimator(BaseEstimator):
        def __init__(self, param=1, hidden_method=None):
            self.param = param
            self.hidden_method = hidden_method

        def fit(self, X, y=None, *args, **kwargs):
            self.coef_ = np.arange(X.shape[1])
            return True

        def _check_fit(self):
            check_is_fitted(self, 'coef_')

        @hides
        def inverse_transform(self, X, *args, **kwargs):
            self._check_fit()
            return X

        @hides
        def transform(self, X, *args, **kwargs):
            self._check_fit()
            return X

        @hides
        def predict(self, X, *args, **kwargs):
            self._check_fit()
            return np.ones(X.shape[0])

        @hides
        def predict_proba(self, X, *args, **kwargs):
            self._check_fit()
            return np.ones(X.shape[0])

        @hides
        def predict_log_proba(self, X, *args, **kwargs):
            self._check_fit()
            return np.ones(X.shape[0])

        @hides
        def decision_function(self, X, *args, **kwargs):
            self._check_fit()
            return np.ones(X.shape[0])

        @hides
        def score(self, X, *args, **kwargs):
            self._check_fit()
            return 1.0

    methods = [
        k for k in iterkeys(SubEstimator.__dict__)
        if not k.startswith('_') and not k.startswith('fit')
    ]
    methods.sort()

    for delegator_data in DELEGATING_METAESTIMATORS:
        delegate = SubEstimator()
        delegator = delegator_data.construct(delegate)
        for method in methods:
            if method in delegator_data.skip_methods:
                continue
            assert_true(hasattr(delegate, method))
            assert_true(
                hasattr(delegator, method),
                msg="%s does not have method %r when its delegate does" %
                (delegator_data.name, method))
            # delegation before fit raises a NotFittedError
            assert_raises(NotFittedError, getattr(delegator, method),
                          delegator_data.fit_args[0])

        delegator.fit(*delegator_data.fit_args)
        for method in methods:
            if method in delegator_data.skip_methods:
                continue
            # smoke test delegation
            getattr(delegator, method)(delegator_data.fit_args[0])

        for method in methods:
            if method in delegator_data.skip_methods:
                continue
            delegate = SubEstimator(hidden_method=method)
            delegator = delegator_data.construct(delegate)
            assert_false(hasattr(delegate, method))
            assert_false(hasattr(delegator, method),
                         msg="%s has method %r when its delegate does not" %
                         (delegator_data.name, method))
Code Example #10
def test_metaestimator_delegation():
    # Ensures specified metaestimators have methods iff subestimator does
    def hides(method):
        @property
        def wrapper(obj):
            if obj.hidden_method == method.__name__:
                raise AttributeError('%r is hidden' % obj.hidden_method)
            return functools.partial(method, obj)
        return wrapper

    class SubEstimator(BaseEstimator):
        def __init__(self, param=1, hidden_method=None):
            self.param = param
            self.hidden_method = hidden_method

        def fit(self, X, y=None, *args, **kwargs):
            self.coef_ = np.arange(X.shape[1])
            return True

        def _check_fit(self):
            check_is_fitted(self, 'coef_')

        @hides
        def inverse_transform(self, X, *args, **kwargs):
            self._check_fit()
            return X

        @hides
        def transform(self, X, *args, **kwargs):
            self._check_fit()
            return X

        @hides
        def predict(self, X, *args, **kwargs):
            self._check_fit()
            return np.ones(X.shape[0])

        @hides
        def predict_proba(self, X, *args, **kwargs):
            self._check_fit()
            return np.ones(X.shape[0])

        @hides
        def predict_log_proba(self, X, *args, **kwargs):
            self._check_fit()
            return np.ones(X.shape[0])

        @hides
        def decision_function(self, X, *args, **kwargs):
            self._check_fit()
            return np.ones(X.shape[0])

        @hides
        def score(self, X, y, *args, **kwargs):
            self._check_fit()
            return 1.0

    methods = [k for k in iterkeys(SubEstimator.__dict__)
               if not k.startswith('_') and not k.startswith('fit')]
    methods.sort()

    for delegator_data in DELEGATING_METAESTIMATORS:
        delegate = SubEstimator()
        delegator = delegator_data.construct(delegate)
        for method in methods:
            if method in delegator_data.skip_methods:
                continue
            assert hasattr(delegate, method)
            assert hasattr(delegator, method), (
                    "%s does not have method %r when its delegate does"
                    % (delegator_data.name, method))
            # delegation before fit raises a NotFittedError
            if method == 'score':
                assert_raises(NotFittedError, getattr(delegator, method),
                              delegator_data.fit_args[0],
                              delegator_data.fit_args[1])
            else:
                assert_raises(NotFittedError, getattr(delegator, method),
                              delegator_data.fit_args[0])

        delegator.fit(*delegator_data.fit_args)
        for method in methods:
            if method in delegator_data.skip_methods:
                continue
            # smoke test delegation
            if method == 'score':
                getattr(delegator, method)(delegator_data.fit_args[0],
                                           delegator_data.fit_args[1])
            else:
                getattr(delegator, method)(delegator_data.fit_args[0])

        for method in methods:
            if method in delegator_data.skip_methods:
                continue
            delegate = SubEstimator(hidden_method=method)
            delegator = delegator_data.construct(delegate)
            assert not hasattr(delegate, method)
            assert not hasattr(delegator, method), (
                    "%s has method %r when its delegate does not"
                    % (delegator_data.name, method))
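
The key trick in both tests is the hides decorator: raising AttributeError from inside a property getter makes hasattr() report the attribute as missing, so the sub-estimator can selectively "lose" methods. A minimal standalone illustration with made-up names:

class Widget:
    def __init__(self, hide_render=False):
        self.hide_render = hide_render

    @property
    def render(self):
        # hasattr() treats an AttributeError raised by a property getter
        # as if the attribute did not exist at all.
        if self.hide_render:
            raise AttributeError('render is hidden')
        return lambda: 'rendered'

print(hasattr(Widget(), 'render'))                   # True
print(hasattr(Widget(hide_render=True), 'render'))   # False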
Code Example #11
File: text.py  Project: janardhanv/scikit-learn
    def fit_transform(self, raw_documents, y=None):
        """Learn the vocabulary dictionary and return the count vectors.

        This is more efficient than calling fit followed by transform.

        Parameters
        ----------
        raw_documents : iterable
            An iterable which yields either str, unicode or file objects.

        Returns
        -------
        vectors : array, [n_samples, n_features]
        """
        # We intentionally don't call the transform method to make
        # fit_transform overridable without unwanted side effects in
        # TfidfVectorizer.
        fixed_vocab = self.fixed_vocabulary

        if fixed_vocab:
            vocab = self.vocabulary_
            vocab_max_ind = max(six.itervalues(self.vocabulary_)) + 1
        else:
            vocab = {}
            vocab_max_ind = 0

        # Result of document conversion to term count arrays.
        row_ind = _make_int_array()
        col_ind = _make_int_array()
        feature_values = _make_int_array()
        term_counts = Counter()

        # term counts across entire corpus (count each term maximum once per
        # document)
        document_counts = Counter()

        analyze = self.build_analyzer()

        for n_doc, doc in enumerate(raw_documents):
            term_count_current = Counter(analyze(doc))
            term_counts.update(term_count_current)

            if not fixed_vocab:
                for term in six.iterkeys(term_count_current):
                    if term not in vocab:
                        vocab[term] = vocab_max_ind
                        vocab_max_ind += 1

            document_counts.update(six.iterkeys(term_count_current))

            for term, count in six.iteritems(term_count_current):
                if term in vocab:
                    row_ind.append(n_doc)
                    col_ind.append(vocab[term])
                    feature_values.append(count)
        n_doc += 1

        if fixed_vocab:
            # XXX max_df, min_df and max_features have no effect
            # with a fixed vocabulary.
            i_indices = row_ind
            j_indices = col_ind
            values = feature_values
        else:
            max_features = self.max_features
            max_df = self.max_df
            min_df = self.min_df

            max_doc_count = (max_df if isinstance(max_df, numbers.Integral)
                                    else max_df * n_doc)
            min_doc_count = (min_df if isinstance(min_df, numbers.Integral)
                                    else min_df * n_doc)

            # filter out stop words: terms that occur in almost all documents
            if max_doc_count < n_doc or min_doc_count > 1:
                stop_words = set(t for t, dc in six.iteritems(document_counts)
                                   if not min_doc_count <= dc <= max_doc_count)
            else:
                stop_words = set()

            # list the terms that should be part of the vocabulary
            if max_features is None:
                terms = set(term_counts) - stop_words
            else:
                # extract the most frequent terms for the vocabulary
                terms = set()
                for t, tc in term_counts.most_common():
                    if t not in stop_words:
                        terms.add(t)
                    if len(terms) >= max_features:
                        break

            # store the learned stop words to make it easier to debug the value
            # of max_df
            self.stop_words_ = stop_words

            # free memory
            term_counts.clear()
            document_counts.clear()

            # store map from term name to feature integer index: we sort the
            # terms to have reproducible outcome for the vocabulary structure:
            # otherwise the mapping from feature name to indices might depend
            # on the memory layout of the machine. Furthermore sorted terms
            # might make it possible to perform binary search in the feature
            # names array.
            terms = sorted(terms)

            # reorder term indices
            reorder_indices = dict((vocab[term], i)
                                   for i, term in enumerate(terms))
            self.vocabulary_ = dict(((t, i) for i, t in enumerate(terms)))

            # create term count arrays with new vocabulary structure
            i_indices = _make_int_array()
            j_indices = _make_int_array()
            values = _make_int_array()
            for i, col in enumerate(col_ind):
                if col in reorder_indices:
                    i_indices.append(row_ind[i])
                    j_indices.append(reorder_indices[col_ind[i]])
                    values.append(feature_values[i])

            # free memory
            del reorder_indices
            del row_ind
            del col_ind
            del feature_values

        if not vocab:
            msg = "Empty vocabulary; "
            if fixed_vocab:
                msg += "%r passed to constructor." % vocab
            else:
                msg += "perhaps your documents contain stop words only?"
            raise ValueError(msg)

        # the term_counts and document_counts might be useful statistics, are
        # we really sure we want to drop them? They take some memory but
        # can be useful for corpus introspection
        return self._term_counts_to_matrix(n_doc, i_indices, j_indices, values)
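
For comparison with this internal implementation, the public behaviour it backs can be exercised through CountVectorizer in current scikit-learn (a minimal sketch with made-up documents):

from sklearn.feature_extraction.text import CountVectorizer

docs = ['the cat sat on the mat', 'the dog sat']
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(docs)      # sparse matrix of term counts
print(X.shape)                          # (2, n_features)
print(sorted(vectorizer.vocabulary_))   # alphabetically sorted terms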
Code Example #12
File: utils.py  Project: KenHollandWHY/kaggle
    def _freeze_vocabulary(self, X=None):
        if not self.fixed_vocabulary_:
            self.vocabulary_ = marisa_trie.Trie(six.iterkeys(self.vocabulary_))
            self.fixed_vocabulary_ = True
            del self.stop_words_
Code Example #13
File: text.py  Project: jinbochen/scikit-learn
    def fit_transform(self, raw_documents, y=None):
        """Learn the vocabulary dictionary and return the count vectors.

        This is more efficient than calling fit followed by transform.

        Parameters
        ----------
        raw_documents : iterable
            An iterable which yields either str, unicode or file objects.

        Returns
        -------
        vectors : array, [n_samples, n_features]
        """
        if self.fixed_vocabulary:
            # No need to fit anything, directly perform the transformation.
            # We intentionally don't call the transform method to make
            # fit_transform overridable without unwanted side effects in
            # TfidfVectorizer
            analyze = self.build_analyzer()
            term_counts_per_doc = (Counter(analyze(doc))
                                   for doc in raw_documents)
            return self._term_count_dicts_to_matrix(term_counts_per_doc)

        self.vocabulary_ = {}
        # result of document conversion to term count dicts
        term_counts_per_doc = []
        term_counts = Counter()

        # term counts across entire corpus (count each term maximum once per
        # document)
        document_counts = Counter()

        analyze = self.build_analyzer()

        # TODO: parallelize the following loop with joblib?
        # (see XXX up ahead)
        for doc in raw_documents:
            term_count_current = Counter(analyze(doc))
            term_counts.update(term_count_current)

            document_counts.update(six.iterkeys(term_count_current))

            term_counts_per_doc.append(term_count_current)

        n_doc = len(term_counts_per_doc)
        max_features = self.max_features
        max_df = self.max_df
        min_df = self.min_df

        max_doc_count = (max_df
                         if isinstance(max_df, numbers.Integral)
                         else max_df * n_doc)
        min_doc_count = (min_df
                         if isinstance(min_df, numbers.Integral)
                         else min_df * n_doc)

        # filter out stop words: terms that occur in almost all documents
        if max_doc_count < n_doc or min_doc_count > 1:
            stop_words = set(t for t, dc in six.iteritems(document_counts)
                             if dc > max_doc_count or dc < min_doc_count)
        else:
            stop_words = set()

        # list the terms that should be part of the vocabulary
        if max_features is None:
            terms = set(term_counts) - stop_words
        else:
            # extract the most frequent terms for the vocabulary
            terms = set()
            for t, tc in term_counts.most_common():
                if t not in stop_words:
                    terms.add(t)
                if len(terms) >= max_features:
                    break

        # store the learned stop words to make it easier to debug the value of
        # max_df
        self.stop_words_ = stop_words

        # store map from term name to feature integer index: we sort the term
        # to have reproducible outcome for the vocabulary structure: otherwise
        # the mapping from feature name to indices might depend on the memory
        # layout of the machine. Furthermore sorted terms might make it
        # possible to perform binary search in the feature names array.
        vocab = dict(((t, i) for i, t in enumerate(sorted(terms))))
        if not vocab:
            raise ValueError("empty vocabulary; training set may have"
                             " contained only stop words or min_df (resp. "
                             "max_df) may be too high (resp. too low).")
        self.vocabulary_ = vocab

        # the term_counts and document_counts might be useful statistics, are
        # we really sure we want to drop them? They take some memory but
        # can be useful for corpus introspection
        return self._term_count_dicts_to_matrix(term_counts_per_doc)