def set_params(self, **params): """ Set the parameters of this estimator. Valid parameter keys can be listed with ``get_params()``. Returns ------- self """ items = self.steps names, _ = zip(*items) keys = list(six.iterkeys(params)) for name in keys: if '__' not in name and name in names: # replace an estimator self._replace_estimator('steps', name, params.pop(name)) if callable(params[name]): # use a callable or function to set parameters params[name] = params[name](params) elif params[name] in keys: # set one arg from another params[name] = params[params[name]] BaseEstimator.set_params(self, **params) return self
def set_params(self, **params): """ """ valid_params = self.get_params(deep=False) # 1. replace `config` if 'config' in params: setattr(self, 'config', params.pop('config')) # 2. replace individual layer or non-layer parameters named_layers = self.named_layers names = [] named_layers_dict = {} if named_layers: names, _ = zip(*named_layers) named_layers_dict = dict(named_layers) for name in list(six.iterkeys(params)): if '__' not in name: for i, layer_name in enumerate(names): # replace layer if layer_name == name: new_val = params.pop(name) if new_val is None: del self.config['layers'][i] else: self.config['layers'][i] = new_val break else: # replace non-layer parameter if name not in valid_params: raise ValueError( "Invalid parameter %s for estimator %s. " "Check the list of available parameters " "with `estimator.get_params().keys()`." % (name, self)) setattr(self, name, params.pop(name)) elif not name.startswith('layers'): # suppose all other parameters are layers parameters, # raise error otherwise raise ValueError("Invalid parameter %s for estimator %s. " "Check the list of available parameters " "with `estimator.get_params().keys()`." % (name, self)) # 3. replace layer parameter search_params = [SearchParam(k, v) for k, v in six.iteritems(params)] search_params = sorted(search_params, key=lambda x: x.depth) for param in search_params: update = param.to_dict() try: _update_dict(named_layers_dict, update) except KeyError: raise ValueError("Invalid parameter %s for estimator %s. " "Check the list of available parameters " "with `estimator.get_params().keys()`." % (param.s_param, self)) return self
def set_params(self, **params):
    for key in list(six.iterkeys(params)):
        if not key.startswith('layers'):
            raise ValueError("Only layer structure parameters are "
                             "searchable!")

    # 1. replace `layers`
    if 'layers' in params:
        setattr(self, 'layers', params.pop('layers'))

    # 2. replace individual layer
    layers = self.layers
    named_layers = self.named_layers
    names = []
    named_layers_dict = {}
    if named_layers:
        names, _ = zip(*named_layers)
        named_layers_dict = dict(named_layers)
    for name in list(six.iterkeys(params)):
        if '__' not in name:
            for i, layer_name in enumerate(names):
                if layer_name == name:
                    new_val = params.pop(name)
                    if new_val is None:
                        del layers[i]
                    else:
                        layers[i] = new_val
                    break
    setattr(self, 'layers', layers)

    # 3. replace other layer parameters
    search_params = [SearchParam(k, v) for k, v in six.iteritems(params)]
    search_params = sorted(search_params, key=lambda x: x.depth)
    for param in search_params:
        update = param.to_dict()
        try:
            _update_dict(named_layers_dict, update)
        except KeyError:
            raise ValueError("Invalid parameter %s for estimator %s. "
                             "Check the list of available parameters "
                             "with `estimator.get_params().keys()`."
                             % (param.s_param, self))

    return self
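
# Hedged sketch of the nested-update idea that `SearchParam.to_dict()` and
# `_update_dict` above appear to rely on: a key like 'dense_1__config__units'
# is split on '__' and applied as a nested dict update. `to_update` and
# `nested_update` are hypothetical helpers written only for this example.
def to_update(s_param, value):
    # 'a__b__c', 5  ->  {'a': {'b': {'c': 5}}}
    update = value
    for part in reversed(s_param.split('__')):
        update = {part: update}
    return update

def nested_update(target, update):
    # recursively merge `update` into `target`; unknown keys raise KeyError,
    # mirroring how the snippet above turns a KeyError into a ValueError
    for key, value in update.items():
        if key not in target:
            raise KeyError(key)
        if isinstance(value, dict) and isinstance(target[key], dict):
            nested_update(target[key], value)
        else:
            target[key] = value

layers = {'dense_1': {'config': {'units': 32, 'activation': 'relu'}}}
nested_update(layers, to_update('dense_1__config__units', 64))
print(layers)   # {'dense_1': {'config': {'units': 64, 'activation': 'relu'}}}
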
def _freeze_vocabulary(self, X=None):
    if not self.fixed_vocabulary_:
        frozen = marisa_trie.Trie(six.iterkeys(self.vocabulary_))
        if X is not None:
            X = self._reorder_features(X, self.vocabulary_, frozen)
        self.vocabulary_ = frozen
        self.fixed_vocabulary_ = True
        del self.stop_words_
    return X
def _freeze_vocabulary(self, X=None):
    if not self.fixed_vocabulary:
        frozen = marisa_trie.Trie(six.iterkeys(self.vocabulary_))
        if X is not None:
            X = self._reorder_features(X, self.vocabulary_, frozen)
        self.vocabulary_ = frozen
        self.fixed_vocabulary = True
        del self.stop_words_
    return X
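
# Hedged, self-contained sketch of the column reordering that
# `_reorder_features` above is presumably doing: when the vocabulary is
# rebuilt (e.g. as a marisa_trie.Trie), each term may receive a new integer
# id, so the columns of an already-built count matrix must be permuted to
# match. Plain dicts stand in for the trie here; `reorder_columns` is a
# hypothetical helper written only for this example.
import numpy as np
import scipy.sparse as sp

def reorder_columns(X, old_vocab, new_vocab):
    # column j of the result is the old column of the term whose new id is j
    cols = np.empty(len(old_vocab), dtype=np.intp)
    for term, old_idx in old_vocab.items():
        cols[new_vocab[term]] = old_idx
    return X[:, cols]

old_vocab = {'cat': 0, 'dog': 1, 'bird': 2}
new_vocab = {'bird': 0, 'cat': 1, 'dog': 2}          # e.g. trie-assigned ids
X = sp.csr_matrix(np.array([[3, 1, 0],
                            [0, 2, 5]]))
X_new = reorder_columns(X, old_vocab, new_vocab)
print(X_new.toarray())   # columns now follow new_vocab: [[0 3 1], [5 0 2]]
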
def _set_params(self, attr, **params):
    # Ensure strict ordering of parameter setting:
    # 1. All steps
    if attr in params:
        setattr(self, attr, params.pop(attr))
    # 2. Step replacement
    items = getattr(self, attr)
    names = []
    if items:
        names, _, _ = zip(*items)
    for name in list(six.iterkeys(params)):
        if '__' not in name and name in names:
            self._replace_estimator(attr, name, params.pop(name))
    # 3. Step parameters and other initialisation arguments
    super().set_params(**params)
    return self
def check_params(params, fn):
    """Check whether params are valid for function(s).

    Parameters
    ----------
    params : dict
    fn : function or iterable of functions
    """
    if not isinstance(fn, (list, tuple)):
        fn = [fn]
    for p in list(six.iterkeys(params)):
        for f in fn:
            if has_arg(f, p):
                break
        else:
            raise ValueError(
                "{} is not a legal parameter".format(p))
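
# Hedged, self-contained sketch of the check above. `accepts_arg` is a local
# stand-in for the `has_arg` helper the snippet imports from elsewhere; it
# simply asks whether a function's signature names the given parameter.
# `check_params_sketch` and `build` are illustrative names for this example.
import inspect

def accepts_arg(fn, name):
    try:
        sig = inspect.signature(fn)
    except (TypeError, ValueError):
        return False
    return name in sig.parameters

def check_params_sketch(params, fn):
    """Raise if a key in `params` is accepted by none of the functions."""
    if not isinstance(fn, (list, tuple)):
        fn = [fn]
    for p in params:
        if not any(accepts_arg(f, p) for f in fn):
            raise ValueError("{} is not a legal parameter".format(p))

def build(units, activation='relu'):
    return (units, activation)

check_params_sketch({'units': 32}, build)        # passes silently
# check_params_sketch({'dropout': 0.5}, build)   # would raise ValueError
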
def _freeze_vocabulary(self, X=None):
    if not self.fixed_vocabulary_:
        self.vocabulary_ = marisa_trie.Trie(six.iterkeys(self.vocabulary_))
        self.fixed_vocabulary_ = True
        del self.stop_words_
def test_metaestimator_delegation():
    # Ensures specified metaestimators have methods iff subestimator does
    def hides(method):
        @property
        def wrapper(obj):
            if obj.hidden_method == method.__name__:
                raise AttributeError('%r is hidden' % obj.hidden_method)
            return functools.partial(method, obj)
        return wrapper

    class SubEstimator(BaseEstimator):
        def __init__(self, param=1, hidden_method=None):
            self.param = param
            self.hidden_method = hidden_method

        def fit(self, X, y=None, *args, **kwargs):
            self.coef_ = np.arange(X.shape[1])
            return True

        def _check_fit(self):
            check_is_fitted(self, 'coef_')

        @hides
        def inverse_transform(self, X, *args, **kwargs):
            self._check_fit()
            return X

        @hides
        def transform(self, X, *args, **kwargs):
            self._check_fit()
            return X

        @hides
        def predict(self, X, *args, **kwargs):
            self._check_fit()
            return np.ones(X.shape[0])

        @hides
        def predict_proba(self, X, *args, **kwargs):
            self._check_fit()
            return np.ones(X.shape[0])

        @hides
        def predict_log_proba(self, X, *args, **kwargs):
            self._check_fit()
            return np.ones(X.shape[0])

        @hides
        def decision_function(self, X, *args, **kwargs):
            self._check_fit()
            return np.ones(X.shape[0])

        @hides
        def score(self, X, *args, **kwargs):
            self._check_fit()
            return 1.0

    methods = [
        k for k in iterkeys(SubEstimator.__dict__)
        if not k.startswith('_') and not k.startswith('fit')
    ]
    methods.sort()

    for delegator_data in DELEGATING_METAESTIMATORS:
        delegate = SubEstimator()
        delegator = delegator_data.construct(delegate)
        for method in methods:
            if method in delegator_data.skip_methods:
                continue
            assert_true(hasattr(delegate, method))
            assert_true(hasattr(delegator, method),
                        msg="%s does not have method %r when its delegate does"
                            % (delegator_data.name, method))
            # delegation before fit raises a NotFittedError
            assert_raises(NotFittedError, getattr(delegator, method),
                          delegator_data.fit_args[0])

        delegator.fit(*delegator_data.fit_args)
        for method in methods:
            if method in delegator_data.skip_methods:
                continue
            # smoke test delegation
            getattr(delegator, method)(delegator_data.fit_args[0])

        for method in methods:
            if method in delegator_data.skip_methods:
                continue
            delegate = SubEstimator(hidden_method=method)
            delegator = delegator_data.construct(delegate)
            assert_false(hasattr(delegate, method))
            assert_false(hasattr(delegator, method),
                         msg="%s has method %r when its delegate does not"
                             % (delegator_data.name, method))
def test_metaestimator_delegation():
    # Ensures specified metaestimators have methods iff subestimator does
    def hides(method):
        @property
        def wrapper(obj):
            if obj.hidden_method == method.__name__:
                raise AttributeError('%r is hidden' % obj.hidden_method)
            return functools.partial(method, obj)
        return wrapper

    class SubEstimator(BaseEstimator):
        def __init__(self, param=1, hidden_method=None):
            self.param = param
            self.hidden_method = hidden_method

        def fit(self, X, y=None, *args, **kwargs):
            self.coef_ = np.arange(X.shape[1])
            return True

        def _check_fit(self):
            check_is_fitted(self, 'coef_')

        @hides
        def inverse_transform(self, X, *args, **kwargs):
            self._check_fit()
            return X

        @hides
        def transform(self, X, *args, **kwargs):
            self._check_fit()
            return X

        @hides
        def predict(self, X, *args, **kwargs):
            self._check_fit()
            return np.ones(X.shape[0])

        @hides
        def predict_proba(self, X, *args, **kwargs):
            self._check_fit()
            return np.ones(X.shape[0])

        @hides
        def predict_log_proba(self, X, *args, **kwargs):
            self._check_fit()
            return np.ones(X.shape[0])

        @hides
        def decision_function(self, X, *args, **kwargs):
            self._check_fit()
            return np.ones(X.shape[0])

        @hides
        def score(self, X, y, *args, **kwargs):
            self._check_fit()
            return 1.0

    methods = [k for k in iterkeys(SubEstimator.__dict__)
               if not k.startswith('_') and not k.startswith('fit')]
    methods.sort()

    for delegator_data in DELEGATING_METAESTIMATORS:
        delegate = SubEstimator()
        delegator = delegator_data.construct(delegate)
        for method in methods:
            if method in delegator_data.skip_methods:
                continue
            assert hasattr(delegate, method)
            assert hasattr(delegator, method), (
                "%s does not have method %r when its delegate does"
                % (delegator_data.name, method))
            # delegation before fit raises a NotFittedError
            if method == 'score':
                assert_raises(NotFittedError, getattr(delegator, method),
                              delegator_data.fit_args[0],
                              delegator_data.fit_args[1])
            else:
                assert_raises(NotFittedError, getattr(delegator, method),
                              delegator_data.fit_args[0])

        delegator.fit(*delegator_data.fit_args)
        for method in methods:
            if method in delegator_data.skip_methods:
                continue
            # smoke test delegation
            if method == 'score':
                getattr(delegator, method)(delegator_data.fit_args[0],
                                           delegator_data.fit_args[1])
            else:
                getattr(delegator, method)(delegator_data.fit_args[0])

        for method in methods:
            if method in delegator_data.skip_methods:
                continue
            delegate = SubEstimator(hidden_method=method)
            delegator = delegator_data.construct(delegate)
            assert not hasattr(delegate, method)
            assert not hasattr(delegator, method), (
                "%s has method %r when its delegate does not"
                % (delegator_data.name, method))
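
# Hedged, self-contained sketch of the hiding trick the two tests above
# exercise: a property that raises AttributeError makes `hasattr()` report a
# method as missing, so a delegating wrapper can expose a method only when
# its sub-estimator does. `Sub` and `Wrapper` are illustrative names.
import functools

def hides(method):
    @property
    def wrapper(obj):
        if obj.hidden_method == method.__name__:
            raise AttributeError('%r is hidden' % obj.hidden_method)
        return functools.partial(method, obj)
    return wrapper

class Sub(object):
    def __init__(self, hidden_method=None):
        self.hidden_method = hidden_method

    @hides
    def predict(self, X):
        return [0 for _ in X]

class Wrapper(object):
    def __init__(self, delegate):
        self.delegate = delegate

    @property
    def predict(self):
        # only expose predict if the delegate exposes it; an AttributeError
        # raised here propagates, so hasattr(wrapper, 'predict') is False
        return self.delegate.predict

print(hasattr(Sub(), 'predict'))                                   # True
print(hasattr(Sub(hidden_method='predict'), 'predict'))            # False
print(hasattr(Wrapper(Sub(hidden_method='predict')), 'predict'))   # False
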
def fit_transform(self, raw_documents, y=None):
    """Learn the vocabulary dictionary and return the count vectors.

    This is more efficient than calling fit followed by transform.

    Parameters
    ----------
    raw_documents : iterable
        An iterable which yields either str, unicode or file objects.

    Returns
    -------
    vectors : array, [n_samples, n_features]
    """
    # We intentionally don't call the transform method to make
    # fit_transform overridable without unwanted side effects in
    # TfidfVectorizer.
    fixed_vocab = self.fixed_vocabulary
    if fixed_vocab:
        vocab = self.vocabulary_
        vocab_max_ind = max(six.itervalues(self.vocabulary_)) + 1
    else:
        vocab = {}
        vocab_max_ind = 0

    # Result of document conversion to term count arrays.
    row_ind = _make_int_array()
    col_ind = _make_int_array()
    feature_values = _make_int_array()
    term_counts = Counter()

    # term counts across entire corpus (count each term maximum once per
    # document)
    document_counts = Counter()

    analyze = self.build_analyzer()

    for n_doc, doc in enumerate(raw_documents):
        term_count_current = Counter(analyze(doc))
        term_counts.update(term_count_current)

        if not fixed_vocab:
            for term in six.iterkeys(term_count_current):
                if term not in vocab:
                    vocab[term] = vocab_max_ind
                    vocab_max_ind += 1

        document_counts.update(six.iterkeys(term_count_current))

        for term, count in six.iteritems(term_count_current):
            if term in vocab:
                row_ind.append(n_doc)
                col_ind.append(vocab[term])
                feature_values.append(count)
    n_doc += 1

    if fixed_vocab:
        # XXX max_df, min_df and max_features have no effect
        # with a fixed vocabulary.
        i_indices = row_ind
        j_indices = col_ind
        values = feature_values
    else:
        max_features = self.max_features
        max_df = self.max_df
        min_df = self.min_df

        max_doc_count = (max_df
                         if isinstance(max_df, numbers.Integral)
                         else max_df * n_doc)
        min_doc_count = (min_df
                         if isinstance(min_df, numbers.Integral)
                         else min_df * n_doc)

        # filter out stop words: terms that occur in almost all documents
        if max_doc_count < n_doc or min_doc_count > 1:
            stop_words = set(t for t, dc in six.iteritems(document_counts)
                             if not min_doc_count <= dc <= max_doc_count)
        else:
            stop_words = set()

        # list the terms that should be part of the vocabulary
        if max_features is None:
            terms = set(term_counts) - stop_words
        else:
            # extract the most frequent terms for the vocabulary
            terms = set()
            for t, tc in term_counts.most_common():
                if t not in stop_words:
                    terms.add(t)
                if len(terms) >= max_features:
                    break

        # store the learned stop words to make it easier to debug the value
        # of max_df
        self.stop_words_ = stop_words

        # free memory
        term_counts.clear()
        document_counts.clear()

        # store map from term name to feature integer index: we sort the
        # terms to have reproducible outcome for the vocabulary structure:
        # otherwise the mapping from feature name to indices might depend
        # on the memory layout of the machine. Furthermore sorted terms
        # might make it possible to perform binary search in the feature
        # names array.
        terms = sorted(terms)

        # reorder term indices
        reorder_indices = dict((vocab[term], i)
                               for i, term in enumerate(terms))
        self.vocabulary_ = dict(((t, i) for i, t in enumerate(terms)))

        # create term count arrays with new vocabulary structure
        i_indices = _make_int_array()
        j_indices = _make_int_array()
        values = _make_int_array()

        for i, col in enumerate(col_ind):
            if col in reorder_indices:
                i_indices.append(row_ind[i])
                j_indices.append(reorder_indices[col_ind[i]])
                values.append(feature_values[i])

        # free memory
        del reorder_indices
        del row_ind
        del col_ind
        del feature_values

    if not vocab:
        msg = "Empty vocabulary; "
        if fixed_vocab:
            msg += "%r passed to constructor." % vocab
        else:
            msg += "perhaps your documents contain stop words only?"
        raise ValueError(msg)

    # the term_counts and document_counts might be useful statistics, are
    # we really sure we want to drop them? They take some memory but
    # can be useful for corpus introspection
    return self._term_counts_to_matrix(n_doc, i_indices, j_indices, values)
def fit_transform(self, raw_documents, y=None):
    """Learn the vocabulary dictionary and return the count vectors.

    This is more efficient than calling fit followed by transform.

    Parameters
    ----------
    raw_documents : iterable
        An iterable which yields either str, unicode or file objects.

    Returns
    -------
    vectors : array, [n_samples, n_features]
    """
    if self.fixed_vocabulary:
        # No need to fit anything, directly perform the transformation.
        # We intentionally don't call the transform method to make
        # fit_transform overridable without unwanted side effects in
        # TfidfVectorizer.
        analyze = self.build_analyzer()
        term_counts_per_doc = (Counter(analyze(doc))
                               for doc in raw_documents)
        return self._term_count_dicts_to_matrix(term_counts_per_doc)

    self.vocabulary_ = {}

    # result of document conversion to term count dicts
    term_counts_per_doc = []
    term_counts = Counter()

    # term counts across entire corpus (count each term maximum once per
    # document)
    document_counts = Counter()

    analyze = self.build_analyzer()

    # TODO: parallelize the following loop with joblib?
    # (see XXX up ahead)
    for doc in raw_documents:
        term_count_current = Counter(analyze(doc))
        term_counts.update(term_count_current)
        document_counts.update(six.iterkeys(term_count_current))
        term_counts_per_doc.append(term_count_current)

    n_doc = len(term_counts_per_doc)
    max_features = self.max_features
    max_df = self.max_df
    min_df = self.min_df

    max_doc_count = (max_df
                     if isinstance(max_df, numbers.Integral)
                     else max_df * n_doc)
    min_doc_count = (min_df
                     if isinstance(min_df, numbers.Integral)
                     else min_df * n_doc)

    # filter out stop words: terms that occur in almost all documents
    if max_doc_count < n_doc or min_doc_count > 1:
        stop_words = set(t for t, dc in six.iteritems(document_counts)
                         if dc > max_doc_count or dc < min_doc_count)
    else:
        stop_words = set()

    # list the terms that should be part of the vocabulary
    if max_features is None:
        terms = set(term_counts) - stop_words
    else:
        # extract the most frequent terms for the vocabulary
        terms = set()
        for t, tc in term_counts.most_common():
            if t not in stop_words:
                terms.add(t)
            if len(terms) >= max_features:
                break

    # store the learned stop words to make it easier to debug the value of
    # max_df
    self.stop_words_ = stop_words

    # store map from term name to feature integer index: we sort the terms
    # to have reproducible outcome for the vocabulary structure: otherwise
    # the mapping from feature name to indices might depend on the memory
    # layout of the machine. Furthermore sorted terms might make it
    # possible to perform binary search in the feature names array.
    vocab = dict(((t, i) for i, t in enumerate(sorted(terms))))
    if not vocab:
        raise ValueError("empty vocabulary; training set may have"
                         " contained only stop words or min_df (resp. "
                         "max_df) may be too high (resp. too low).")
    self.vocabulary_ = vocab

    # the term_counts and document_counts might be useful statistics, are
    # we really sure we want to drop them? They take some memory but
    # can be useful for corpus introspection
    return self._term_count_dicts_to_matrix(term_counts_per_doc)
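
# Hedged, self-contained sketch of the core steps both fit_transform variants
# above implement: count terms per document, build a sorted vocabulary, and
# assemble a sparse document-term matrix. The whitespace `analyze` below is a
# deliberately trivial stand-in for the analyzer built by build_analyzer();
# `fit_transform_counts` is a hypothetical name for this example only.
from collections import Counter
import scipy.sparse as sp

def fit_transform_counts(raw_documents):
    analyze = lambda doc: doc.lower().split()
    counts_per_doc = [Counter(analyze(doc)) for doc in raw_documents]

    # sorted terms give a reproducible term -> column mapping
    terms = sorted({t for counts in counts_per_doc for t in counts})
    vocabulary = {t: i for i, t in enumerate(terms)}

    rows, cols, values = [], [], []
    for row, counts in enumerate(counts_per_doc):
        for term, count in counts.items():
            rows.append(row)
            cols.append(vocabulary[term])
            values.append(count)

    X = sp.coo_matrix((values, (rows, cols)),
                      shape=(len(counts_per_doc), len(vocabulary))).tocsr()
    return vocabulary, X

vocab, X = fit_transform_counts(["the cat sat", "the cat and the dog"])
print(sorted(vocab))      # ['and', 'cat', 'dog', 'sat', 'the']
print(X.toarray())        # per-document term counts
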