def pca_kpca(train_data, labels):
    # estimators = [('linear_pca', PCA()), ('kernel_pca', KernelPCA())]
    # Note: wrapping the result of make_union() in FeatureUnion again passes a
    # FeatureUnion where a list of (name, transformer) tuples is expected, so
    # build the tuple list directly.
    estimators = [('linear_pca', PCA()),
                  ('truncated_svd', TruncatedSVD()),
                  ('kernel_pca', KernelPCA())]
    combined = FeatureUnion(estimators)
    combined.fit(train_data, labels)
    # combined.fit_transform(train_data, labels)
    return combined
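# Usage sketch for pca_kpca above (added illustration, not from the original
# source); assumes numpy and the sklearn classes used by the function are
# importable, and exercises it on made-up data.
import numpy as np
from sklearn.decomposition import PCA, TruncatedSVD, KernelPCA
from sklearn.pipeline import FeatureUnion

rng = np.random.RandomState(0)
X_demo = rng.rand(20, 5)            # 20 samples, 5 features (synthetic)
y_demo = rng.randint(0, 2, 20)      # synthetic binary labels

union = pca_kpca(X_demo, y_demo)    # returns the fitted FeatureUnion
X_stacked = union.transform(X_demo)  # PCA, SVD and KernelPCA features side by side
print(X_stacked.shape)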
def rbf_kernels(env, n_samples=100000, gamma=[0.01, 0.1], n_components=100):
    """Represent observation samples using RBF-kernels.

    EXAMPLE
    -------
    >>> env = gym.make('MountainCar-v0')
    >>> rbf = rbf_kernels(env, n_components=100)
    >>> sample = env.observation_space.sample().reshape((1, env.observation_space.shape[0]))
    >>> rbf(sample).shape
    (1, 100)
    """
    observation_examples = np.array(
        [env.observation_space.sample() for _ in range(n_samples)])

    # Fit feature scaler
    scaler = sklearn.preprocessing.StandardScaler()
    scaler.fit(observation_examples)

    # Fit feature extractor
    features = []
    for g in gamma:
        features.append(('gamma={}'.format(g),
                         RBFSampler(n_components=n_components // len(gamma),
                                    gamma=g)))
    features = FeatureUnion(features)
    features.fit(scaler.transform(observation_examples))

    def _rbf_kernels(observation):
        return features.transform(scaler.transform(observation))

    return _rbf_kernels
def test_set_feature_union_step_none():
    mult2 = Mult(2)
    mult2.get_feature_names = lambda: ['x2']
    mult3 = Mult(3)
    mult3.get_feature_names = lambda: ['x3']
    X = np.asarray([[1]])

    ft = FeatureUnion([('m2', mult2), ('m3', mult3)])
    assert_array_equal([[2, 3]], ft.fit(X).transform(X))
    assert_array_equal([[2, 3]], ft.fit_transform(X))
    assert_equal(['m2__x2', 'm3__x3'], ft.get_feature_names())

    ft.set_params(m2=None)
    assert_array_equal([[3]], ft.fit(X).transform(X))
    assert_array_equal([[3]], ft.fit_transform(X))
    assert_equal(['m3__x3'], ft.get_feature_names())

    ft.set_params(m3=None)
    assert_array_equal([[]], ft.fit(X).transform(X))
    assert_array_equal([[]], ft.fit_transform(X))
    assert_equal([], ft.get_feature_names())

    # check we can change back
    ft.set_params(m3=mult3)
    assert_array_equal([[3]], ft.fit(X).transform(X))
def test_feature_union(self):
    """Tests that combining multiple featurizers works as expected"""
    modules = ["bag-of-words", "entities"]
    modules_list, _ = modules_to_dictionary(modules)
    feature_union = FeatureUnion(modules_list)
    feature_union.fit(texts_entities, outcomes)
    feature_union.transform(["unknown"])
def test_feature_union_weights():
    # test feature union with transformer weights
    iris = load_iris()
    X = iris.data
    y = iris.target
    pca = RandomizedPCA(n_components=2, random_state=0)
    select = SelectKBest(k=1)
    # test using fit followed by transform
    fs = FeatureUnion([("pca", pca), ("select", select)],
                      transformer_weights={"pca": 10})
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    # test using fit_transform
    fs = FeatureUnion([("pca", pca), ("select", select)],
                      transformer_weights={"pca": 10})
    X_fit_transformed = fs.fit_transform(X, y)
    # test it works with transformers missing fit_transform
    fs = FeatureUnion([("mock", TransfT()), ("pca", pca), ("select", select)],
                      transformer_weights={"mock": 10})
    X_fit_transformed_wo_method = fs.fit_transform(X, y)
    # check against expected result
    # We use a different pca object to control the random_state stream
    assert_array_almost_equal(X_transformed[:, :-1], 10 * pca.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())
    assert_array_almost_equal(X_fit_transformed[:, :-1],
                              10 * pca.fit_transform(X))
    assert_array_equal(X_fit_transformed[:, -1],
                       select.fit_transform(X, y).ravel())
    assert_equal(X_fit_transformed_wo_method.shape, (X.shape[0], 7))
def test_feature_stacker():
    # basic sanity check for feature stacker
    iris = load_iris()
    X = iris.data
    X -= X.mean(axis=0)
    y = iris.target
    pca = RandomizedPCA(n_components=2)
    select = SelectKBest(k=1)
    fs = FeatureUnion([("pca", pca), ("select", select)])
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    assert_equal(X_transformed.shape, (X.shape[0], 3))
    # check if it does the expected thing
    assert_array_almost_equal(X_transformed[:, :-1], pca.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())
    # test if it also works for sparse input
    X_sp = sparse.csr_matrix(X)
    X_sp_transformed = fs.fit_transform(X_sp, y)
    assert_array_almost_equal(X_transformed, X_sp_transformed.toarray())
    # test setting parameters
    fs.set_params(select__k=2)
    assert_equal(fs.fit_transform(X, y).shape, (X.shape[0], 4))
def test_feature_union():
    # basic sanity check for feature union
    iris = load_iris()
    X = iris.data
    X -= X.mean(axis=0)
    y = iris.target
    svd = TruncatedSVD(n_components=2, random_state=0)
    select = SelectKBest(k=1)
    fs = FeatureUnion([("svd", svd), ("select", select)])
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    assert_equal(X_transformed.shape, (X.shape[0], 3))
    # check if it does the expected thing
    assert_array_almost_equal(X_transformed[:, :-1], svd.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())
    # test if it also works for sparse input
    # We use a different svd object to control the random_state stream
    fs = FeatureUnion([("svd", svd), ("select", select)])
    X_sp = sparse.csr_matrix(X)
    X_sp_transformed = fs.fit_transform(X_sp, y)
    assert_array_almost_equal(X_transformed, X_sp_transformed.toarray())
    # test setting parameters
    fs.set_params(select__k=2)
    assert_equal(fs.fit_transform(X, y).shape, (X.shape[0], 4))
    # test it works with transformers missing fit_transform
    fs = FeatureUnion([("mock", TransfT()), ("svd", svd), ("select", select)])
    X_transformed = fs.fit_transform(X, y)
    assert_equal(X_transformed.shape, (X.shape[0], 8))
def test_feature_union_parallel():
    # test that n_jobs works for FeatureUnion
    X = JUNK_FOOD_DOCS
    fs = FeatureUnion([("words", CountVectorizer(analyzer="word")),
                       ("chars", CountVectorizer(analyzer="char"))])
    fs_parallel = FeatureUnion(
        [("words", CountVectorizer(analyzer="word")),
         ("chars", CountVectorizer(analyzer="char"))],
        n_jobs=2)
    fs_parallel2 = FeatureUnion(
        [("words", CountVectorizer(analyzer="word")),
         ("chars", CountVectorizer(analyzer="char"))],
        n_jobs=2)

    fs.fit(X)
    X_transformed = fs.transform(X)
    assert_equal(X_transformed.shape[0], len(X))

    fs_parallel.fit(X)
    X_transformed_parallel = fs_parallel.transform(X)
    assert_equal(X_transformed.shape, X_transformed_parallel.shape)
    assert_array_equal(X_transformed.toarray(),
                       X_transformed_parallel.toarray())

    # fit_transform should behave the same
    X_transformed_parallel2 = fs_parallel2.fit_transform(X)
    assert_array_equal(X_transformed.toarray(),
                       X_transformed_parallel2.toarray())

    # transformers should stay fit after fit_transform
    X_transformed_parallel2 = fs_parallel2.transform(X)
    assert_array_equal(X_transformed.toarray(),
                       X_transformed_parallel2.toarray())
def test_feature_union_feature_names():
    word_vect = CountVectorizer(analyzer="word")
    char_vect = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
    ft = FeatureUnion([("chars", char_vect), ("words", word_vect)])
    ft.fit(JUNK_FOOD_DOCS)
    feature_names = ft.get_feature_names()
    for feat in feature_names:
        assert_true("chars__" in feat or "words__" in feat)
    assert_equal(len(feature_names), 35)
def pca(x, y, test_x, n_features=-1):
    if n_features == -1:
        n_features = int(np.ceil(np.sqrt(x.shape[1])))
    pca = PCA(n_components=n_features)
    # integer division so SelectKBest gets an int, not a float
    selection = SelectKBest(k=n_features // 2)
    combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])
    combined_features.fit(x, y)
    return combined_features.transform(x), combined_features.transform(test_x)
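# Minimal sketch (an added illustration, not in the source) of calling pca()
# above on a synthetic train/test split.
import numpy as np

rng = np.random.RandomState(42)
x_tr, y_tr = rng.rand(30, 9), rng.randint(0, 3, 30)
x_te = rng.rand(10, 9)

# n_features defaults to ceil(sqrt(9)) = 3, so the union yields
# 3 PCA components + 3 // 2 = 1 SelectKBest column = 4 features.
x_tr_f, x_te_f = pca(x_tr, y_tr, x_te)
print(x_tr_f.shape, x_te_f.shape)  # (30, 4) (10, 4)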
def taskA(strTweets, stances, X_train, X_test, y_train, y_test):
    le = preprocessing.LabelEncoder()
    count_word = TfidfVectorizer(ngram_range=(1, 3))
    count_char = TfidfVectorizer(analyzer='char', ngram_range=(4, 4))
    vectorizer = FeatureUnion([('word', count_word), ('char', count_char)])
    vectorizer.fit(strTweets)
    y_train = le.fit_transform(y_train)
    # transform (not fit_transform) so test labels reuse the train encoding
    y_test = le.transform(y_test)
    X_test = vectorizer.transform(X_test)
    X_train = vectorizer.transform(X_train)
    print("Task A w/o gender: {}".format(
        run_tests(X_train, X_test, y_train, y_test)))
def test_feature_union_feature_names():
    word_vect = CountVectorizer(analyzer="word")
    char_vect = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
    ft = FeatureUnion([("chars", char_vect), ("words", word_vect)])
    ft.fit(JUNK_FOOD_DOCS)
    feature_names = ft.get_feature_names()
    for feat in feature_names:
        assert_true("chars__" in feat or "words__" in feat)
    assert_equal(len(feature_names), 35)

    ft = FeatureUnion([("tr1", Transf())]).fit([[1]])
    assert_raise_message(
        AttributeError, 'Transformer tr1 (type Transf) does not provide '
        'get_feature_names', ft.get_feature_names)
def test_feature_union_feature_names():
    word_vect = CountVectorizer(analyzer="word")
    char_vect = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
    ft = FeatureUnion([("chars", char_vect), ("words", word_vect)])
    ft.fit(JUNK_FOOD_DOCS)
    feature_names = ft.get_feature_names()
    for feat in feature_names:
        assert "chars__" in feat or "words__" in feat
    assert len(feature_names) == 35

    ft = FeatureUnion([("tr1", Transf())]).fit([[1]])
    assert_raise_message(
        AttributeError, 'Transformer tr1 (type Transf) does not provide '
        'get_feature_names', ft.get_feature_names)
def test_feature_union_warns_unknown_transformer_weight():
    # Warn user when transformer_weights contains a key not present in
    # transformer_list
    X = [[1, 2], [3, 4], [5, 6]]
    y = [0, 1, 2]

    transformer_list = [("transf", Transf())]
    # Transformer weights dictionary with incorrect name
    weights = {"transformer": 1}
    expected_msg = ('Attempting to weight transformer "transformer", '
                    "but it is not present in transformer_list.")
    union = FeatureUnion(transformer_list, transformer_weights=weights)
    with pytest.raises(ValueError, match=expected_msg):
        union.fit(X, y)
def data_vectorize(df):
    russian_stop = set(stopwords.words("russian"))
    tfidf_para = {
        "stop_words": russian_stop,
        "analyzer": "word",
        "token_pattern": r"\w{1,}",
        "sublinear_tf": True,
        "dtype": np.float32,
        "norm": "l2",
        # "min_df": 5,
        # "max_df": .9,
        "smooth_idf": False
    }

    def get_col(col_name):
        return lambda x: x[col_name]

    vectorizer = FeatureUnion([
        ("description", TfidfVectorizer(
            ngram_range=(1, 2),
            max_features=36000,
            **tfidf_para,
            preprocessor=get_col("description"))),
        ("title_description", TfidfVectorizer(
            ngram_range=(1, 2),
            max_features=200000,
            **tfidf_para,
            preprocessor=get_col("title_description"))),
        ("text_feature", CountVectorizer(
            ngram_range=(1, 2),
            preprocessor=get_col("text_feature"))),
        ("title", TfidfVectorizer(
            ngram_range=(1, 2),
            **tfidf_para,
            preprocessor=get_col("title"))),
    ])

    vectorizer.fit(df.to_dict("records"))
    ready_full_df = vectorizer.transform(df.to_dict("records"))
    tfvocab = vectorizer.get_feature_names()
    df.drop(["text_feature", "text_feature_2", "description", "title",
             "title_description"], axis=1, inplace=True)
    df.fillna(-1, inplace=True)
    return df, ready_full_df, tfvocab
def test_feature_stacker_weights():
    # test feature stacker with transformer weights
    iris = load_iris()
    X = iris.data
    y = iris.target
    pca = RandomizedPCA(n_components=2)
    select = SelectKBest(k=1)
    fs = FeatureUnion([("pca", pca), ("select", select)],
                      transformer_weights={"pca": 10})
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    # check against expected result
    assert_array_almost_equal(X_transformed[:, :-1], 10 * pca.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())
def test_feature_union_feature_names():
    word_vect = CountVectorizer(analyzer="word")
    char_vect = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
    ft = FeatureUnion([("chars", char_vect), ("words", word_vect)])
    ft.fit(JUNK_FOOD_DOCS)
    feature_names = ft.get_feature_names()
    for feat in feature_names:
        assert "chars__" in feat or "words__" in feat
    assert len(feature_names) == 35

    ft = FeatureUnion([("tr1", Transf())]).fit([[1]])
    msg = re.escape("Transformer tr1 (type Transf) does not provide "
                    "get_feature_names")
    with pytest.raises(AttributeError, match=msg):
        ft.get_feature_names()
def __init__(self, env, n_components=500):
    observation_samples = np.array([env.observation_space.sample()
                                    for x in range(10000)])
    scaler = StandardScaler()
    scaler.fit(observation_samples.astype('float'))

    # convert a state to a feature representation
    featurizer = FeatureUnion([
        ("rbf1", RBFSampler(gamma=3.0, n_components=n_components)),
        # ("rbf2", RBFSampler(gamma=.5, n_components=n_components)),
    ])
    featurizer.fit(scaler.transform(observation_samples.astype('float')))

    self.scaler = scaler
    self.featurizer = featurizer
class RBFAgent(BaseAgent):
    def __init__(self):
        super().__init__("MountainCar-v0")
        self.name = "RBFAgent"
        self.env._max_episode_steps = 300
        self.max_epochs = 1000
        self.alpha = 0.1
        self.gamma = 1.0
        self.epsilon = 0.8
        gammas = [0.5, 1.0, 2.0, 3.5, 5.0]
        self.n_actions = self.env.action_space.n
        self.n_components = 30
        features = [(f'rbf{i}', RBFSampler(gamma=g,
                                           n_components=self.n_components,
                                           random_state=1))
                    for i, g in enumerate(gammas)]
        samples = np.array(
            [self.env.observation_space.sample() for _ in range(10000)])
        self.scaler = StandardScaler()
        self.scaler.fit(samples)
        self.featurizer = FeatureUnion(features)
        self.featurizer.fit(self.scaler.transform(samples))
        self.w = np.zeros((self.n_actions, self.n_components * len(features)))

    def featurize(self, state):
        return self.featurizer.transform(self.scaler.transform([state]))

    def Q(self, state, action):
        return state.dot(self.w[action])

    def policy(self, state, epsilon=0):
        A = np.ones(self.n_actions, dtype=float) * epsilon / self.n_actions
        a = np.argmax([self.Q(state, a) for a in range(self.n_actions)])
        A[a] += (1.0 - epsilon)
        return np.random.choice(self.n_actions, p=A)

    def perform_step(self, state, action):
        next_state, reward, done, _ = self.env.step(action)
        next_state = self.featurize(next_state)
        next_action = self.policy(next_state)
        current_q = self.Q(state, action)
        next_q = self.Q(next_state, next_action)
        self.w[action] += self.alpha * (reward + self.gamma * next_q -
                                        current_q).dot(state)
        return next_state, done
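# Hedged driver sketch for RBFAgent above (assumed usage, not from the source):
# runs one epsilon-greedy episode with the methods the class defines, assuming
# BaseAgent exposes the wrapped gym env as self.env.
agent = RBFAgent()
state = agent.featurize(agent.env.reset())
done = False
while not done:
    action = agent.policy(state, agent.epsilon)       # epsilon-greedy action
    state, done = agent.perform_step(state, action)   # SARSA-style update
agent.env.close()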
def test_feature_union():
    # basic sanity check for feature union
    iris = load_iris()
    X = iris.data
    X -= X.mean(axis=0)
    y = iris.target
    svd = TruncatedSVD(n_components=2, random_state=0)
    select = SelectKBest(k=1)
    fs = FeatureUnion([("svd", svd), ("select", select)])
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    assert_equal(X_transformed.shape, (X.shape[0], 3))
    # check if it does the expected thing
    assert_array_almost_equal(X_transformed[:, :-1], svd.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())
    # test if it also works for sparse input
    # We use a different svd object to control the random_state stream
    fs = FeatureUnion([("svd", svd), ("select", select)])
    X_sp = sparse.csr_matrix(X)
    X_sp_transformed = fs.fit_transform(X_sp, y)
    assert_array_almost_equal(X_transformed, X_sp_transformed.toarray())
    # Test clone
    fs2 = assert_no_warnings(clone, fs)
    assert_false(fs.transformer_list[0][1] is fs2.transformer_list[0][1])
    # test setting parameters
    fs.set_params(select__k=2)
    assert_equal(fs.fit_transform(X, y).shape, (X.shape[0], 4))
    # test it works with transformers missing fit_transform
    fs = FeatureUnion([("mock", Transf()), ("svd", svd), ("select", select)])
    X_transformed = fs.fit_transform(X, y)
    assert_equal(X_transformed.shape, (X.shape[0], 8))
    # test error if some elements do not support transform
    assert_raises_regex(TypeError,
                        'All estimators should implement fit and '
                        'transform.*\\bNoTrans\\b',
                        FeatureUnion,
                        [("transform", Transf()), ("no_transform", NoTrans())])
    # test that init accepts tuples
    fs = FeatureUnion((("svd", svd), ("select", select)))
    fs.fit(X, y)
def fit(self, X, y=None):
    trans2 = Q2Transformer()
    trans3 = Q3Transformer()
    trans4 = Q4Transformer()
    combined_features = FeatureUnion([("Q2", trans2), ("Q3", trans3),
                                      ("Q4", trans4)])
    # store the fitted union on its own attribute; assigning it to self.fit
    # would shadow this method
    self.combined_features_ = combined_features.fit(X)
    return self
class Featurizer(BaseEstimator, TransformerMixin):
    """Constructs a feature union of text and numeric features for each
    video.
    """

    def __init__(self, *args, **kwargs):
        self.featurizer = FeatureUnion(
            transformer_list=[
                ('text_title', Pipeline([
                    ('selector', ItemSelector(key='title')),
                    ('count_vectorizer', CountVectorizer(*args, **kwargs)),
                ])),
                ('text_channel_title', Pipeline([
                    ('selector', ItemSelector(key='channel_title')),
                    ('count_vectorizer', CountVectorizer(*args, **kwargs)),
                ])),
                ('numeric', NumericFeatures()),
            ],
            # weight components in FeatureUnion
            transformer_weights={
                'text_title': 1.0,
                'text_channel_title': 1.0,
                'numeric': 1.0,
            },
        )

    def fit(self, X, y=None):
        # fit the inner union but return self, per the sklearn convention
        self.featurizer.fit(X)
        return self

    def transform(self, X):
        return self.featurizer.transform(X).todense()
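# Hedged usage sketch for Featurizer above; the DataFrame columns 'title' and
# 'channel_title' follow the ItemSelector keys, while the numeric column is an
# assumption (ItemSelector and NumericFeatures are user-defined elsewhere, so
# this is illustrative rather than self-contained).
import pandas as pd

videos = pd.DataFrame({
    'title': ['cat video', 'dog video'],
    'channel_title': ['cats inc', 'dogs inc'],
    'views': [100, 200],
})
feat = Featurizer()
X_dense = feat.fit(videos).transform(videos)  # dense text + numeric matrix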
def concat_feature_extractors(train_data, labels):
    # This dataset is way too high-dimensional. Better do PCA:
    pca = PCA(n_components=2)
    # Maybe some original features were good, too?
    selection = SelectKBest(k=1)
    # Build estimator from PCA and univariate selection:
    combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])
    # Use combined features to transform dataset:
    X_features = combined_features.fit(train_data, labels).transform(train_data)
    # Classify:
    svm = SVC(kernel="linear")
    svm.fit(X_features, labels)
    # Do grid search over k, n_components and C:
    pipeline = Pipeline([("features", combined_features), ("svm", svm)])
    param_grid = dict(features__pca__n_components=[1, 2, 3],
                      features__univ_select__k=[1, 2],
                      svm__C=[0.1, 1, 10])
    grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=10)
    grid_search.fit(train_data, labels)
    print(grid_search.best_estimator_)
def testSVC(lbda=1.0, n_components=20, kbest=4):
    otto = load_otto()
    X = otto.data
    y = otto.target
    # X = otto.data[:10000, :10]
    # y = otto.target[:10000]
    scaler = StandardScaler().fit(X)
    X = scaler.transform(X)
    pca = PCA(n_components=n_components)
    selection = SelectKBest(k=kbest)
    combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])
    X_features = combined_features.fit(X, y).transform(X)
    svc = SVC(C=1.0 / lbda, kernel='rbf', cache_size=400, probability=True)
    pipe = Pipeline(steps=[('features', combined_features), ('svc', svc)])
    trainData = X
    trainTarget = y
    pipe.fit(trainData, trainTarget)
    test_otto = load_testotto()
    testData = test_otto.data
    testData = scaler.transform(testData)
    # save the prediction
    prediction = pipe.predict(testData)  # was a duplicate predict_proba call
    proba = pipe.predict_proba(testData)
    save_submission(lbda, proba, prediction)
def testSVC(lbda=1.0, n_components=20, kbest=4):
    otto = load_otto()
    X = otto.data
    y = otto.target
    # X = otto.data[:10000, :10]
    # y = otto.target[:10000]
    scaler = StandardScaler().fit(X)
    X = scaler.transform(X)
    pca = PCA(n_components=n_components)
    selection = SelectKBest(k=kbest)
    combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])
    X_features = combined_features.fit(X, y).transform(X)
    svc = SVC(C=1.0 / lbda, kernel='rbf', cache_size=400, probability=True)
    pipe = Pipeline(steps=[('features', combined_features), ('svc', svc)])
    trainData = X
    trainTarget = y
    pipe.fit(trainData, trainTarget)
    test_otto = load_testotto()
    testData = test_otto.data
    testData = scaler.transform(testData)
    # save the prediction
    prediction = pipe.predict(testData)  # was a duplicate predict_proba call
    proba = pipe.predict_proba(testData)
    save_submission(lbda, proba, prediction)
def best_estimator(self, X, y):
    try:
        pca = PCA(n_components=2)
        selection = SelectKBest(k=2)
        combined_features = FeatureUnion([("pca", pca),
                                          ("univ_select", selection)])
        X_features = combined_features.fit(X, y).transform(X)
        regr = linear_model.LassoCV()
        pipeline = Pipeline([("features", combined_features),
                             ("regression", regr)])
        # note: both branches currently use the same grid
        if 'batter' in self.player:
            param_grid = dict(features__pca__n_components=[1, 2, 3],
                              features__univ_select__k=[1, 2])
        else:
            param_grid = dict(features__pca__n_components=[1, 2, 3],
                              features__univ_select__k=[1, 2])
        grid_search = GridSearchCV(pipeline, param_grid=param_grid,
                                   verbose=100)
        grid_search.fit(X, y)
        self.modelled = True
        regr = grid_search
        return regr
    except ValueError as e:  # Python 3 exception syntax
        print(e)
        self.modelled = False
        return None
def featureVect(X_train, y, components, feature_para):
    bigram_vectorizer = CountVectorizer(ngram_range=(1, 25),
                                        stop_words="english")
    X_2 = bigram_vectorizer.fit_transform(X_train).toarray()
    vectorizer = TfidfVectorizer(ngram_range=(1, 25), stop_words="english")
    X_2_DFIDF = vectorizer.fit_transform(X_train).toarray()
    X = np.multiply(X_2, X_2_DFIDF)

    # This dataset is way too high-dimensional. Better do PCA:
    # pca = PCA(n_components=400)
    pca = SparsePCA(n_components=components[0])
    # Build estimator from PCA and univariate selection:
    # ,("dfr", selection_fdr), ("fwe", selection_fwe),
    # ("fpr", selection_fpr), ("univ_select", selection)
    feature_list = [("pca", pca)]
    feature_list += feature_para
    combined_features = FeatureUnion(feature_list)
    # Use combined features to transform dataset:
    X_features = combined_features.fit(X, y).transform(X)

    select_chi = chi2(X_2, y)
    ind = np.argpartition(select_chi[0], -components[1])[-components[1]:]
    selection_chi2 = X_2[:, ind]
    X_features = np.concatenate((X_features, selection_chi2), axis=1)
    return [X_features, combined_features, bigram_vectorizer, vectorizer, ind]
def build_cat_data(category, model, best_params, train_data, dev_data,
                   train_labels, dev_labels, train_categories, dev_categories):
    if model not in ['mlp', 'knn']:
        # reduce the number of dimensions
        pca = PCA(n_components=best_params['features__pca__n_components'])
        # select high-value original features
        selection = SelectKBest(k=best_params['features__univ_select__k'])
        combined_features = FeatureUnion([("pca", pca),
                                          ("univ_select", selection)])
        # Use combined features to transform dataset:
        sub_features = combined_features.fit(train_data, train_labels)
        train_reduced = sub_features.transform(train_data)
        dev_reduced = sub_features.transform(dev_data)
    else:
        train_reduced = train_data
        dev_reduced = dev_data

    # now, subset out the correct set of data based on category
    idx = list(np.array(np.where(train_categories == category))[0])
    idx_dev = list(np.array(np.where(dev_categories == category))[0])
    cat_train_data = train_reduced.take(idx, axis=0)
    cat_train_labels = train_labels.take(idx, axis=0)
    cat_dev_data = dev_reduced.take(idx_dev, axis=0)
    cat_dev_labels = dev_labels.take(idx_dev, axis=0)
    return cat_train_data, cat_train_labels, cat_dev_data, cat_dev_labels
def testLogistic(lbda=1.0, n_components=20, kbest=4):
    # X = otto.data[:1000, :20]
    # y = otto.target[:1000]
    otto = load_otto()
    X = otto.data[:, :]
    y = otto.target[:]
    scalar = StandardScaler().fit(X)
    X = scalar.transform(X)
    pca = PCA(n_components=n_components)
    selection = SelectKBest(k=kbest)
    combined_features = FeatureUnion([("pca", pca), ('univ_select', selection)])
    X_features = combined_features.fit(X, y).transform(X)
    logistic = LogisticRegression(C=1.0 / lbda)
    pipe = Pipeline(steps=[('features', combined_features),
                           ('logistic', logistic)])
    trainData = X
    trainTarget = y
    pipe.fit(trainData, trainTarget)
    test_otto = load_testotto()
    testData = test_otto.data
    testData = scalar.transform(testData)
    # logging.debug('lambda=%.3f: score is %.3f' % (lbda, pipe.score()))
    # save the prediction
    prediction = pipe.predict(testData)  # was a duplicate predict_proba call
    proba = pipe.predict_proba(testData)
    save_submission(lbda, proba, prediction)
def concat_feature_extractors(train_data, labels):
    # This dataset is way too high-dimensional. Better do PCA:
    pca = PCA(n_components=2)
    # Maybe some original features were good, too?
    selection = SelectKBest(k=1)
    # Build estimator from PCA and univariate selection:
    combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])
    # Use combined features to transform dataset:
    X_features = combined_features.fit(train_data, labels).transform(train_data)
    # Classify:
    svm = SVC(kernel="linear")
    svm.fit(X_features, labels)
    # Do grid search over k, n_components and C:
    pipeline = Pipeline([("features", combined_features), ("svm", svm)])
    param_grid = dict(features__pca__n_components=[1, 2, 3],
                      features__univ_select__k=[1, 2],
                      svm__C=[0.1, 1, 10])
    grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=10)
    grid_search.fit(train_data, labels)
    print(grid_search.best_estimator_)
def trainItalianSexClassifier(self):
    # get correct labels from dictionary in trainY and testY
    trainX = self.italianTrainData[0]
    trainY = self.getYlabels(self.italianTrainData[1], 'sex')

    combined_features = FeatureUnion([
        ("tfidf", TfidfVectorizer()),
        ("ngrams", TfidfVectorizer(ngram_range=(3, 3), analyzer="char")),
        ("counts", CountVectorizer()),
        ("latin", Latin()),
    ], transformer_weights={
        'latin': 1,
        'tfidf': 2,
        'ngrams': 2,
        'counts': 1,
    })

    X_features = combined_features.fit(trainX, trainY).transform(trainX)
    classifier = svm.LinearSVC()
    pipeline = Pipeline([("features", combined_features),
                         ("classifier", classifier)])
    pipeline.fit(trainX, trainY)
    return pipeline
def best_estimator(self, X, y):
    try:
        pca = PCA(n_components=2)
        selection = SelectKBest(k=2)
        combined_features = FeatureUnion([("pca", pca),
                                          ("univ_select", selection)])
        X_features = combined_features.fit(X, y).transform(X)
        regr = linear_model.LassoCV()
        pipeline = Pipeline([("features", combined_features),
                             ("regression", regr)])
        # note: both branches currently use the same grid
        if 'batter' in self.player:
            param_grid = dict(features__pca__n_components=[1, 2, 3],
                              features__univ_select__k=[1, 2])
        else:
            param_grid = dict(features__pca__n_components=[1, 2, 3],
                              features__univ_select__k=[1, 2])
        grid_search = GridSearchCV(pipeline, param_grid=param_grid,
                                   verbose=100)
        grid_search.fit(X, y)
        self.modelled = True
        regr = grid_search
        return regr
    except ValueError as e:  # Python 3 exception syntax
        print(e)
        self.modelled = False
        return None
def example():
    import numpy as np
    from sklearn.impute import SimpleImputer

    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    imp.fit([[1, 2], [np.nan, 3], [7, 6]])
    X = [[np.nan, 2], [6, np.nan], [7, 6]]
    print(imp.transform(X))

    ######################################
    from sklearn.datasets import load_iris
    from sklearn.impute import MissingIndicator
    from sklearn.model_selection import train_test_split
    from sklearn.pipeline import FeatureUnion, make_pipeline
    from sklearn.tree import DecisionTreeClassifier

    X, y = load_iris(return_X_y=True)
    mask = np.random.randint(0, 2, size=X.shape).astype(bool)  # np.bool is deprecated
    X[mask] = np.nan
    X_train, X_test, y_train, _ = train_test_split(X, y, test_size=100,
                                                   random_state=0)

    transformer = FeatureUnion(
        transformer_list=[('features', SimpleImputer(strategy='mean')),
                          ('indicators', MissingIndicator())])
    transformer = transformer.fit(X_train, y_train)
    results = transformer.transform(X_test)
    print(results.shape)

    clf = make_pipeline(transformer, DecisionTreeClassifier())
    clf = clf.fit(X_train, y_train)
    results = clf.predict(X_test)
    print(results.shape)
def testLogistic(lbda=1.0, n_components=20, kbest=4):
    # X = otto.data[:1000, :20]
    # y = otto.target[:1000]
    otto = load_otto()
    X = otto.data[:, :]
    y = otto.target[:]
    scalar = StandardScaler().fit(X)
    X = scalar.transform(X)
    pca = PCA(n_components=n_components)
    selection = SelectKBest(k=kbest)
    combined_features = FeatureUnion([("pca", pca), ('univ_select', selection)])
    X_features = combined_features.fit(X, y).transform(X)
    logistic = LogisticRegression(C=1.0 / lbda)
    pipe = Pipeline(steps=[('features', combined_features),
                           ('logistic', logistic)])
    trainData = X
    trainTarget = y
    pipe.fit(trainData, trainTarget)
    test_otto = load_testotto()
    testData = test_otto.data
    testData = scalar.transform(testData)
    # logging.debug('lambda=%.3f: score is %.3f' % (lbda, pipe.score()))
    # save the prediction
    prediction = pipe.predict(testData)  # was a duplicate predict_proba call
    proba = pipe.predict_proba(testData)
    save_submission(lbda, proba, prediction)
def best_estimator(self, X, y):
    try:
        pca = PCA(n_components=2)
        selection = SelectKBest(k=2)
        combined_features = FeatureUnion([("pca", pca),
                                          ("univ_select", selection)])
        X_features = combined_features.fit(X, y).transform(X)
        regr = linear_model.LassoCV()
        pipeline = Pipeline([("features", combined_features),
                             ("regression", regr)])
        if 'batter' in self.player:
            param_grid = dict(features__pca__n_components=[1],
                              features__univ_select__k=[1])
        else:
            param_grid = dict(features__pca__n_components=[1, 2, 3, 4],
                              features__univ_select__k=[1, 2, 3, 4])
        grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=0)
        grid_search.fit(X, y)
        self.modelled = True
        regr = grid_search
        # Ian: should do R2 on predicted points vs. points on a given day
        self.R2 = r2_score(self.target_matrix,
                           regr.predict(self.feature_matrix))
        return regr
    except ValueError as e:  # Python 3 exception syntax
        print(e)
        self.modelled = False
        return None
def __init__(self, env, batch, n_components=500):
    self.actions = [0, 1, 2]
    # self.scaler = StandardScaler().fit(batch)
    # self.features_extractor = FeatureUnion([
    #     ("rbf1", RBFSampler(gamma=0.05, n_components=1000)),
    #     ("rbf2", RBFSampler(gamma=1.0, n_components=1000)),
    #     ("rbf3", RBFSampler(gamma=0.5, n_components=1000)),
    #     ("rbf4", RBFSampler(gamma=0.1, n_components=1000))
    # ])
    # self.features_extractor.fit(self.scaler.transform(batch))
    observation_examples = np.array(
        [env.observation_space.sample() for x in range(10000)])
    self.scaler = StandardScaler()
    self.scaler.fit(observation_examples)

    # Used to convert a state to a featurized representation.
    # We use RBF kernels with different variances to cover different
    # parts of the space.
    featurizer = FeatureUnion([
        ("rbf1", RBFSampler(gamma=5.0, n_components=n_components)),
        ("rbf2", RBFSampler(gamma=2.0, n_components=n_components)),
        ("rbf3", RBFSampler(gamma=1.0, n_components=n_components)),
        ("rbf4", RBFSampler(gamma=0.5, n_components=n_components))
    ])
    self.features_extractor = featurizer.fit(
        self.scaler.transform(observation_examples))

    D = len(
        self.features_extractor.transform(self.scaler.transform(batch))[0])
    self.w = np.array(
        [np.random.randn(D) / np.sqrt(D) for a in self.actions])
    self.e = np.zeros((len(self.actions), D))
def __init__(self, env: TimeLimit):
    observation_examples = np.array(
        [env.observation_space.sample() for _ in range(10000)])
    scaler = StandardScaler()
    scaler.fit(observation_examples)

    featurizer = FeatureUnion([
        ('rbf1', RBFSampler(gamma=5.0, n_components=500)),
        ('rbf2', RBFSampler(gamma=2.0, n_components=500)),
        ('rbf3', RBFSampler(gamma=1.0, n_components=500)),
        ('rbf4', RBFSampler(gamma=0.5, n_components=500)),
    ])
    featurizer.fit(scaler.transform(observation_examples))

    self.scaler = scaler
    self.featurizer = featurizer
def best_estimator(self, X, y):
    try:
        pca = PCA(n_components=2)
        selection = SelectKBest(k=2)
        combined_features = FeatureUnion([("pca", pca),
                                          ("univ_select", selection)])
        X_features = combined_features.fit(X, y).transform(X)
        regr = linear_model.LassoCV()
        pipeline = Pipeline([("features", combined_features),
                             ("regression", regr)])
        if 'batter' in self.player:
            param_grid = dict(features__pca__n_components=[1],
                              features__univ_select__k=[1])
        else:
            param_grid = dict(features__pca__n_components=[1, 2, 3, 4],
                              features__univ_select__k=[1, 2, 3, 4])
        grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=0)
        grid_search.fit(X, y)
        self.modelled = True
        regr = grid_search
        # Ian: should do R2 on predicted points vs. points on a given day
        self.R2 = r2_score(self.target_matrix,
                           regr.predict(self.feature_matrix))
        return regr
    except ValueError as e:  # Python 3 exception syntax
        print(e)
        self.modelled = False
        return None
def fit(self, **kwargs):
    self.feature_list = kwargs.get('feature_list', None)
    k_single = kwargs.get('k_single', 0)
    k_pca = kwargs.get('k_pca', 1)
    self.train_x, self.train_y = self.tf_sample(self.train_x, self.train_y)

    # Standardize the data
    scaler = preprocessing.StandardScaler()
    self.scalar_ = scaler.fit(self.train_x)

    # PCA
    selection = SelectKBest(k=k_single)
    n_components = int(len(self.feature_names) * k_pca)
    pca = PCA(n_components=n_components)
    combined_features = FeatureUnion([("pca", pca),
                                      ("univ_select", selection)])
    self.pca = combined_features.fit(self.train_x, self.train_y)
    # note: the plain PCA below immediately replaces the FeatureUnion
    # fitted above
    self.pca = PCA(n_components=n_components).fit(self.train_x)

    self.model = SVC(kernel=self.kernel)
    fit_data = self.train_x.copy()
    fit_data = self.scalar_.transform(fit_data)
    fit_data = self.pca.transform(fit_data)
    self.model.fit(fit_data, self.train_y)

    # Evaluate performance on the training set
    self.train_y_pred = self.predict(self.train_x)
    self.train_y = np.array(self.train_y)
    self.train_y_pred = np.array(self.train_y_pred)
    self.train_ev = self.evaluation.evaluate(y_true=self.train_y,
                                             y_pred=self.train_y_pred,
                                             threshold=0.5)
    return self
class ImputerIndicatorPrim(primitive):
    def __init__(self, random_state=0):
        super(ImputerIndicatorPrim, self).__init__(name='imputerIndicator')
        self.id = 3
        self.hyperparams = []
        self.type = 'data preprocess'
        self.description = ("All features will be imputed using SimpleImputer, "
                            "in order to enable classifiers to work with this "
                            "data. Additionally, it adds the indicator "
                            "variables from MissingIndicator.")
        self.hyperparams_run = {'default': True}
        self.random_state = random_state
        self.imp = FeatureUnion(
            transformer_list=[('features', SimpleImputer()),
                              ('indicators', MissingIndicator())])
        self.num_cols = None
        self.imp_cols = None
        self.accept_type = 'b'

    def can_accept(self, data):
        return self.can_accept_b(data)

    def is_needed(self, data):
        # data = handle_data(data)
        if data['X'].isnull().any().any():
            return True
        return False

    def fit(self, data):
        data = handle_data(data)
        self.num_cols = data['X']._get_numeric_data().columns
        self.imp.fit(data['X'][self.num_cols])
        self.imp_cols = data['X'][self.num_cols].columns[
            data['X'][self.num_cols].isnull().any()].tolist()

    def produce(self, data):
        output = handle_data(data)
        cols = self.num_cols.tolist()
        reg_cols = list(set(cols) - set(self.imp_cols))
        # new_cols = ["{}_imp_mean".format(v) for v in list(imp_cols)]
        for i in range(len(cols)):
            if cols[i] in reg_cols:
                continue
            elif cols[i] in self.imp_cols:
                cols[i] = "{}_imp_mean".format(cols[i])
        result = self.imp.transform(output['X'][self.num_cols])
        # extra_cols = list(range(result.shape[1] - len(cols)))
        extra_cols = ["{}_miss_indicator".format(v) for v in self.imp_cols]
        output['X'] = pd.DataFrame(result, columns=cols + extra_cols) \
            .reset_index(drop=True).infer_objects()
        # .ix was removed from pandas; use .loc for this selection
        output['X'] = output['X'].loc[:, ~output['X'].columns.duplicated()]
        final_output = {0: output}
        return final_output
def test_same_result(self):
    X, Z = self.make_text_rdd(2)
    loc_char = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
    dist_char = SparkCountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
    loc_word = CountVectorizer(analyzer="word")
    dist_word = SparkCountVectorizer(analyzer="word")

    loc_union = FeatureUnion([
        ("chars", loc_char),
        ("words", loc_word)
    ])
    dist_union = SparkFeatureUnion([
        ("chars", dist_char),
        ("words", dist_word)
    ])

    # test same feature names
    loc_union.fit(X)
    dist_union.fit(Z)
    assert_equal(
        loc_union.get_feature_names(),
        dist_union.get_feature_names()
    )

    # test same results
    X_transformed = loc_union.transform(X)
    Z_transformed = sp.vstack(dist_union.transform(Z).collect())
    assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())

    # test same results with fit_transform
    X_transformed = loc_union.fit_transform(X)
    Z_transformed = sp.vstack(dist_union.fit_transform(Z).collect())
    assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())

    # test same results in parallel
    loc_union_par = FeatureUnion([
        ("chars", loc_char),
        ("words", loc_word)
    ], n_jobs=2)
    dist_union_par = SparkFeatureUnion([
        ("chars", dist_char),
        ("words", dist_word)
    ], n_jobs=2)

    loc_union_par.fit(X)
    dist_union_par.fit(Z)
    X_transformed = loc_union_par.transform(X)
    Z_transformed = sp.vstack(dist_union_par.transform(Z).collect())
    assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())
def test_feature_union_feature_names():
    JUNK_FOOD_DOCS = (
        "the pizza pizza beer copyright",
        "the pizza burger beer copyright",
        "the the pizza beer beer copyright",
        "the burger beer beer copyright",
        "the coke burger coke copyright",
        "the coke burger burger",
    )
    word_vect = CountVectorizer(analyzer="word")
    char_vect = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
    ft = FeatureUnion([("chars", char_vect), ("words", word_vect)])
    ft.fit(JUNK_FOOD_DOCS)
    feature_names = ft.get_feature_names()
    for feat in feature_names:
        assert_true("chars__" in feat or "words__" in feat)
    assert_equal(len(feature_names), 35)
def __init__(self, env):
    observation_examples = np.array([env.observation_space.sample()
                                     for x in range(10000)])
    scaler = StandardScaler()  # mean 0, variance 1
    scaler.fit(observation_examples)

    # used to convert a state to a featurized representation;
    # we use RBF kernels with different variances
    featurizer = FeatureUnion([
        # n_components refers to the number of exemplars
        ("rbf1", RBFSampler(gamma=5.0, n_components=500)),
        ("rbf2", RBFSampler(gamma=2.0, n_components=500)),
        ("rbf3", RBFSampler(gamma=1.0, n_components=500)),
        ("rbf4", RBFSampler(gamma=0.5, n_components=500)),
    ])
    featurizer.fit(scaler.transform(observation_examples))

    self.scaler = scaler
    self.featurizer = featurizer
def __init__(self, env, n_components=500):
    examples = np.array(
        [env.observation_space.sample() for x in range(10000)],
        dtype=np.float64)
    scaler = StandardScaler()
    scaler.fit(examples)

    featurizer = FeatureUnion([
        ("rbf1", RBFSampler(gamma=5.0, n_components=n_components)),
        ("rbf2", RBFSampler(gamma=2.0, n_components=n_components)),
        ("rbf3", RBFSampler(gamma=1.0, n_components=n_components)),
        ("rbf4", RBFSampler(gamma=0.5, n_components=n_components))
    ])
    # example_features = featurizer.fit_transform(scaler.transform(examples))
    featurizer.fit(scaler.transform(examples))

    self.scaler = scaler
    self.featurizer = featurizer
def _fit_features_matrix_target_array(self, X: pd.DataFrame):
    """Get features matrix and target array.

    TODO - more description helpful."""
    features = self._get_features_matrix_transformer()
    target = self._get_target_array_transformer()
    feat_tar = FeatureUnion(transformer_list=[("features", features),
                                              ("target", target)])
    self.fitted_features_and_target_ = feat_tar.fit(X)
def __init__(self, env):
    observation_examples = np.array(
        [env.observation_space.sample() for x in range(10000)])
    scaler = StandardScaler()
    scaler.fit(observation_examples)

    # used to concatenate feature vectors since RBF uses a scale parameter
    featurizer = FeatureUnion([
        ("rbf1", RBFSampler(gamma=5.0, n_components=500)),
        ("rbf2", RBFSampler(gamma=2.0, n_components=500)),
        ("rbf3", RBFSampler(gamma=1.0, n_components=500)),
        ("rbf4", RBFSampler(gamma=0.5, n_components=500)),
    ])
    featurizer.fit(scaler.transform(observation_examples))

    self.scaler = scaler
    self.featurizer = featurizer
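# Illustrative sketch (not from the source) of how the scaler/featurizer pair
# built in the constructors above is typically consumed; FeatureTransformer is
# a hypothetical name for the enclosing class, and gym is assumed available.
import gym

env = gym.make('MountainCar-v0')
ft = FeatureTransformer(env)
obs = env.reset()
phi = ft.featurizer.transform(ft.scaler.transform([obs]))
print(phi.shape)  # (1, 2000): four RBFSamplers x 500 components each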
def make_tfidf(train, test):
    russian_stop = set(stopwords.words('russian'))
    tfidf_para = {
        "stop_words": russian_stop,
        "analyzer": 'word',
        "token_pattern": r'\w{1,}',
        "sublinear_tf": True,
        "dtype": np.float32,
        "norm": 'l2',
        # "min_df": 5,
        # "max_df": .9,
        "smooth_idf": False
    }

    def get_col(col_name):
        return lambda x: x[col_name]

    vectorizer = FeatureUnion([
        ('description', TfidfVectorizer(
            ngram_range=(1, 2),
            max_features=100,
            **tfidf_para,
            preprocessor=get_col('description'))),
        # ('text_feat', CountVectorizer(
        #     ngram_range=(1, 2),
        #     # max_features=7000,
        #     preprocessor=get_col('text_feat'))),
        ('title', TfidfVectorizer(
            ngram_range=(1, 2),
            **tfidf_para,
            max_features=70,
            preprocessor=get_col('title')))
    ])
    vectorizer.fit(train)
    ret_df = vectorizer.transform(train)
    feature_names = vectorizer.get_feature_names()
    return ret_df, feature_names

    # vectorizer.fit(df.loc[traindex, :].to_dict('records'))
    # ready_df = vectorizer.transform(df.to_dict('records'))
    # tfvocab = vectorizer.get_feature_names()
    #
    # # get char count
    # length_of_words = len(df["len"])
def test_feature_stacker_feature_names():
    JUNK_FOOD_DOCS = (
        "the pizza pizza beer copyright",
        "the pizza burger beer copyright",
        "the the pizza beer beer copyright",
        "the burger beer beer copyright",
        "the coke burger coke copyright",
        "the coke burger burger",
    )
    word_vect = CountVectorizer(analyzer="word")
    char_vect = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
    ft = FeatureUnion([("chars", char_vect), ("words", word_vect)])
    ft.fit(JUNK_FOOD_DOCS)
    feature_names = ft.get_feature_names()
    for feat in feature_names:
        assert_true("chars__" in feat or "words__" in feat)
    assert_equal(len(feature_names), 35)
def test_feature_union_weights():
    # test feature union with transformer weights
    iris = load_iris()
    X = iris.data
    y = iris.target
    pca = RandomizedPCA(n_components=2, random_state=0)
    select = SelectKBest(k=1)
    fs = FeatureUnion([("pca", pca), ("select", select)],
                      transformer_weights={"pca": 10})
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    # check against expected result
    # We use a different pca object to control the random_state stream
    assert_array_almost_equal(X_transformed[:, :-1], 10 * pca.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())
def get_pca_transformer(train_x, train_y, n_components=-1):
    if n_components == -1:
        n_components = int(np.ceil(np.sqrt(train_x.shape[1])))
    pca = PCA(n_components=n_components)
    # integer division so SelectKBest gets an int, not a float
    selection = SelectKBest(k=n_components // 2)
    combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])
    return combined_features.fit(train_x, train_y)
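# Hypothetical usage (added illustration, not in the source): fit the combined
# transformer on the training split, then reuse it on held-out data.
import numpy as np

rng = np.random.RandomState(0)
train_x, train_y = rng.rand(50, 16), rng.randint(0, 2, 50)
test_x = rng.rand(10, 16)

# ceil(sqrt(16)) = 4 PCA components + 4 // 2 = 2 selected columns = 6 features
transformer = get_pca_transformer(train_x, train_y)
print(transformer.transform(train_x).shape)  # (50, 6)
print(transformer.transform(test_x).shape)   # (10, 6)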
def test_same_result_weight(self):
    X, Z = self.make_text_rdd(2)
    loc_char = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
    dist_char = SparkCountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
    loc_word = CountVectorizer(analyzer="word")
    dist_word = SparkCountVectorizer(analyzer="word")

    loc_union = FeatureUnion([
        ("chars", loc_char),
        ("words", loc_word)
    ], transformer_weights={"words": 10})
    dist_union = SparkFeatureUnion([
        ("chars", dist_char),
        ("words", dist_word)
    ], transformer_weights={"words": 10})

    loc_union.fit(X)
    dist_union.fit(Z)

    X_transformed = loc_union.transform(X)
    Z_transformed = sp.vstack(dist_union.transform(Z).collect())
    assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())
def ageClassifier(doc, age):
    """A function that trains an age classifier"""
    xTrain = doc
    yTrain = age

    unionOfFeatures = FeatureUnion([
        ('normaltfidf', TfidfVectorizer(preprocessor=identity,
                                        tokenizer=identity)),
        ('bigrams', TfidfVectorizer(preprocessor=identity,
                                    tokenizer=identity,
                                    ngram_range=(3, 3), analyzer='char')),
        ('counts', CountVectorizer(preprocessor=identity,
                                   tokenizer=identity))
    ])

    featureFit = unionOfFeatures.fit(xTrain, yTrain).transform(xTrain)
    classifier = Pipeline([('featureunion', unionOfFeatures),
                           ('cls', svm.SVC(kernel='linear', C=1.5))])
    classifier.fit(xTrain, yTrain)
    return classifier
def old():
    # note: this legacy function assumes module-level pca, X, y, scaler and
    # predicted are defined elsewhere
    plt.figure(1, figsize=(4, 3))
    plt.clf()
    plt.plot(pca.explained_variance_ratio_, linewidth=2)
    plt.axis('tight')
    plt.xlabel('n_components')
    plt.ylabel('explained variance')
    plt.show()

    # initialize SelectKBest to pick the best of the features that occur
    # naturally; the feature union does not check whether there is overlap
    # between the estimators, so we need to seriously watch out for this...
    selection = SelectKBest(k=1)

    # build a dict with these for pipeline purposes
    combined_features = FeatureUnion([('pca', pca), ('univ_select', selection)])

    # use the combined features to transform the dataset
    X_features = combined_features.fit(X, y).transform(X)

    # initialize the svm
    svm = SVR(kernel="linear")

    # I think I put the scaler into the first step of the pipeline...
    pipeline = Pipeline([('scaler', scaler), ('features', combined_features),
                         ('svm', svm)])
    param_grid = dict(features__pca__n_components=[2, 5, 10],
                      features__univ_select__k=[1, 2],
                      svm__C=[0.1, 1, 10])

    # scoring: precision, accuracy, recall
    grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=10)
    grid_search.fit(X, y)
    print(grid_search.best_estimator_)
    scores = grid_search.grid_scores_

    fig, ax = plt.subplots()
    ax.scatter(y, predicted)
    ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
    ax.set_xlabel('Measured')
    ax.set_ylabel('Predicted')
    plt.show()
def optimize_clf(clf, dataset, feature_list, params):
    data = featureFormat(dataset, feature_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)
    scores_arr = []

    pca = PCA()
    selection = SelectKBest(k=1)
    combined_features = FeatureUnion([("pca", pca),
                                      ("univ_select", selection)])
    X_features = combined_features.fit(features, labels).transform(features)
    pipeline = Pipeline([("features", combined_features), ("clf", clf)])

    pca_range = list(range(1, len(feature_list)))
    params['features__pca__n_components'] = pca_range
    # range objects have no append in Python 3, so build a list first
    k_range = list(range(1, len(feature_list)))
    k_range.append('all')
    params['features__univ_select__k'] = k_range

    grid_search = GridSearchCV(pipeline, param_grid=params, scoring="f1")
    grid_search.fit(features, labels)
    return grid_search.best_estimator_
def testLogistic(otto, lbda=1.0, n_components=20, kbest=4):
    # X = otto.data[:1000, :20]
    # y = otto.target[:1000]
    X = otto.data[:, :]
    y = otto.target[:]
    scalar = StandardScaler().fit(X)
    X = scalar.transform(X)
    pca = PCA(n_components=n_components)
    selection = SelectKBest(k=kbest)
    combined_features = FeatureUnion([("pca", pca), ('univ_select', selection)])
    X_features = combined_features.fit(X, y).transform(X)
    logistic = LogisticRegression(C=1.0 / lbda)
    pipes = [Pipeline(steps=[('features', combined_features),
                             ('logistic', logistic)])
             for i in range(5)]

    cv = KFold(n=X.shape[0], n_folds=5, shuffle=True)
    threadList = []
    for i, (trainIndex, testIndex) in enumerate(cv):
        pipe = pipes[i]
        trainData = X[trainIndex]
        trainTarget = y[trainIndex]
        testData = X[testIndex]
        testTarget = y[testIndex]
        # pipe.fit(trainData, trainTarget)
        t = otto_thread(
            name=str(i + 1), args=(pipe,),
            kwargs={'algo': 'Logistic',
                    'train': (trainData, trainTarget),
                    'test': (testData, testTarget)})
        t.start()
        threadList.append(t)
    for t in threadList:
        t.join()
def analysis():
    genotype = pandas.read_excel('test.xlsx', 'data')
    print(genotype.describe())

    # Author: Andreas Mueller <*****@*****.**>
    #
    # License: BSD 3 clause
    from sklearn.pipeline import Pipeline, FeatureUnion
    from sklearn.grid_search import GridSearchCV
    from sklearn.svm import SVC
    from sklearn.datasets import load_iris
    from sklearn.decomposition import PCA
    from sklearn.feature_selection import SelectKBest

    iris = load_iris()
    X, y = iris.data, iris.target

    # This dataset is way too high-dimensional. Better do PCA:
    pca = PCA(n_components=2)
    # Maybe some original features were good, too?
    selection = SelectKBest(k=1)
    # Build estimator from PCA and univariate selection:
    combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])
    # Use combined features to transform dataset:
    X_features = combined_features.fit(X, y).transform(X)

    svm = SVC(kernel="linear")
    # Do grid search over k, n_components and C:
    pipeline = Pipeline([("features", combined_features), ("svm", svm)])
    param_grid = dict(features__pca__n_components=[1, 2, 3],
                      features__univ_select__k=[1, 2],
                      svm__C=[0.1, 1, 10])
    grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=10)
    grid_search.fit(X, y)
    print(grid_search.best_estimator_)