def test_feature_stacker():
    # basic sanity check for feature stacker
    iris = load_iris()
    X = iris.data
    X -= X.mean(axis=0)
    y = iris.target
    pca = RandomizedPCA(n_components=2)
    select = SelectKBest(k=1)
    fs = FeatureUnion([("pca", pca), ("select", select)])
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    assert_equal(X_transformed.shape, (X.shape[0], 3))

    # check if it does the expected thing
    assert_array_almost_equal(X_transformed[:, :-1], pca.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())

    # test if it also works for sparse input
    X_sp = sparse.csr_matrix(X)
    X_sp_transformed = fs.fit_transform(X_sp, y)
    assert_array_almost_equal(X_transformed, X_sp_transformed.toarray())

    # test setting parameters
    fs.set_params(select__k=2)
    assert_equal(fs.fit_transform(X, y).shape, (X.shape[0], 4))
def test_feature_union():
    # basic sanity check for feature union
    iris = load_iris()
    X = iris.data
    X -= X.mean(axis=0)
    y = iris.target
    pca = RandomizedPCA(n_components=2, random_state=0)
    select = SelectKBest(k=1)
    fs = FeatureUnion([("pca", pca), ("select", select)])
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    assert_equal(X_transformed.shape, (X.shape[0], 3))

    # check if it does the expected thing
    assert_array_almost_equal(X_transformed[:, :-1], pca.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())

    # test if it also works for sparse input
    # We use a different pca object to control the random_state stream
    fs = FeatureUnion([("pca", pca), ("select", select)])
    X_sp = sparse.csr_matrix(X)
    X_sp_transformed = fs.fit_transform(X_sp, y)
    assert_array_almost_equal(X_transformed, X_sp_transformed.toarray())

    # test setting parameters
    fs.set_params(select__k=2)
    assert_equal(fs.fit_transform(X, y).shape, (X.shape[0], 4))

    # test it works with transformers missing fit_transform
    fs = FeatureUnion([("mock", TransfT()), ("pca", pca), ("select", select)])
    X_transformed = fs.fit_transform(X, y)
    assert_equal(X_transformed.shape, (X.shape[0], 8))
def test_feature_union_weights():
    # test feature union with transformer weights
    iris = load_iris()
    X = iris.data
    y = iris.target
    pca = RandomizedPCA(n_components=2, random_state=0)
    select = SelectKBest(k=1)
    # test using fit followed by transform
    fs = FeatureUnion([("pca", pca), ("select", select)],
                      transformer_weights={"pca": 10})
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    # test using fit_transform
    fs = FeatureUnion([("pca", pca), ("select", select)],
                      transformer_weights={"pca": 10})
    X_fit_transformed = fs.fit_transform(X, y)
    # test it works with transformers missing fit_transform
    fs = FeatureUnion([("mock", TransfT()), ("pca", pca), ("select", select)],
                      transformer_weights={"mock": 10})
    X_fit_transformed_wo_method = fs.fit_transform(X, y)

    # check against expected result
    # We use a different pca object to control the random_state stream
    assert_array_almost_equal(X_transformed[:, :-1],
                              10 * pca.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())
    assert_array_almost_equal(X_fit_transformed[:, :-1],
                              10 * pca.fit_transform(X))
    assert_array_equal(X_fit_transformed[:, -1],
                       select.fit_transform(X, y).ravel())
    assert_equal(X_fit_transformed_wo_method.shape, (X.shape[0], 7))
def test_feature_stacker_weights():
    # test feature stacker with transformer weights
    iris = load_iris()
    X = iris.data
    y = iris.target
    pca = RandomizedPCA(n_components=2)
    select = SelectKBest(k=1)
    fs = FeatureUnion([("pca", pca), ("select", select)],
                      transformer_weights={"pca": 10})
    fs.fit(X, y)
    X_transformed = fs.transform(X)

    # check against expected result
    assert_array_almost_equal(X_transformed[:, :-1],
                              10 * pca.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())
def test_pipeline_methods_preprocessing_svm():
    """Test the various methods of the pipeline (preprocessing + svm)."""
    iris = load_iris()
    X = iris.data
    y = iris.target
    n_samples = X.shape[0]
    n_classes = len(np.unique(y))
    scaler = StandardScaler()
    pca = RandomizedPCA(n_components=2, whiten=True)
    clf = SVC(probability=True)

    for preprocessing in [scaler, pca]:
        pipe = Pipeline([('preprocess', preprocessing), ('svc', clf)])
        pipe.fit(X, y)

        # check shapes of various prediction functions
        predict = pipe.predict(X)
        assert_equal(predict.shape, (n_samples,))

        proba = pipe.predict_proba(X)
        assert_equal(proba.shape, (n_samples, n_classes))

        log_proba = pipe.predict_log_proba(X)
        assert_equal(log_proba.shape, (n_samples, n_classes))

        decision_function = pipe.decision_function(X)
        assert_equal(decision_function.shape, (n_samples, n_classes))

        pipe.score(X, y)
def test_feature_union_weights():
    # test feature union with transformer weights
    iris = load_iris()
    X = iris.data
    y = iris.target
    pca = RandomizedPCA(n_components=2, random_state=0)
    select = SelectKBest(k=1)
    fs = FeatureUnion([("pca", pca), ("select", select)],
                      transformer_weights={"pca": 10})
    fs.fit(X, y)
    X_transformed = fs.transform(X)

    # check against expected result
    # We use a different pca object to control the random_state stream
    assert_array_almost_equal(X_transformed[:, :-1],
                              10 * pca.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())
def main(fpath, hex):
    song_matrix = read_sparse(fpath)
    pca = RandomizedPCA(n_components=2)
    pcas = pca.fit(song_matrix).transform(song_matrix)
    print(pca.explained_variance_ratio_)
    if hex:
        # log-scaled hexbin density plot of the two principal components
        plt.hexbin(pcas[:, 0], pcas[:, 1], cmap=cm.get_cmap('bone_r', 100),
                   bins='log', gridsize=100, mincnt=2)
        plt.colorbar()
    else:
        plt.scatter(pcas[:, 0], pcas[:, 1])
    plt.ylabel('Second Principal Component')
    plt.xlabel('First Principal Component')
    plt.title('PCA of the LastFM dataset')
    plt.show()
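# A hypothetical command-line entry point for main() above: the original
# snippet does not show how it is invoked, so the argument names and help
# strings here are assumptions, not part of the source.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="2-D PCA plot of a sparse song matrix")
    parser.add_argument("fpath", help="path to the sparse matrix file")
    parser.add_argument("--hex", action="store_true",
                        help="draw a log-scaled hexbin density instead of a scatter plot")
    args = parser.parse_args()
    main(args.fpath, args.hex)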
def test_pipeline_methods_randomized_pca_svm():
    """Test the various methods of the pipeline (randomized pca + svm)."""
    iris = load_iris()
    X = iris.data
    y = iris.target
    # Test with PCA + SVC
    clf = SVC(probability=True)
    pca = RandomizedPCA(n_components=2, whiten=True)
    pipe = Pipeline([('pca', pca), ('svc', clf)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.metrics import accuracy_score, f1_score
from sklearn.cross_validation import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import RandomizedPCA
from sklearn.feature_selection import SelectKBest, f_classif

features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)

# clf = svm.SVC(kernel='rbf', C=10000.0)  # alternative classifier to try
clf = GaussianNB()
pca = RandomizedPCA(n_components=4)
kbest = SelectKBest(f_classif, k=4)

### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

# Example starting point. Try investigating other evaluation techniques!
pipe_clf = make_pipeline(pca, kbest, clf)
pipe_clf.fit(features_train, labels_train)
print(pipe_clf)
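# A minimal tuning sketch for Task 5, assuming the `features`/`labels` arrays
# from the project file and the old-style sklearn modules used above; the grid
# values are illustrative, not the project's required settings.
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline

tune_pipe = Pipeline([('pca', RandomizedPCA(random_state=42)),
                      ('kbest', SelectKBest(f_classif)),
                      ('clf', GaussianNB())])
# keep k <= n_components so every grid combination is valid
param_grid = {'pca__n_components': [4, 6],
              'kbest__k': [2, 'all']}
cv = StratifiedShuffleSplit(labels, n_iter=100, test_size=0.3, random_state=42)
search = GridSearchCV(tune_pipe, param_grid, scoring='f1', cv=cv)
search.fit(features, labels)
print(search.best_params_)
clf = search.best_estimator_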
'PLSRegression': PLSRegression(),
'PLSSVD': PLSSVD(),
'PassiveAggressiveClassifier': PassiveAggressiveClassifier(),
'PassiveAggressiveRegressor': PassiveAggressiveRegressor(),
'Perceptron': Perceptron(),
'ProjectedGradientNMF': ProjectedGradientNMF(),
'QuadraticDiscriminantAnalysis': QuadraticDiscriminantAnalysis(),
'RANSACRegressor': RANSACRegressor(),
'RBFSampler': RBFSampler(),
'RadiusNeighborsClassifier': RadiusNeighborsClassifier(),
'RadiusNeighborsRegressor': RadiusNeighborsRegressor(),
'RandomForestClassifier': RandomForestClassifier(),
'RandomForestRegressor': RandomForestRegressor(),
'RandomizedLasso': RandomizedLasso(),
'RandomizedLogisticRegression': RandomizedLogisticRegression(),
'RandomizedPCA': RandomizedPCA(),
'Ridge': Ridge(),
'RidgeCV': RidgeCV(),
'RidgeClassifier': RidgeClassifier(),
'RidgeClassifierCV': RidgeClassifierCV(),
'RobustScaler': RobustScaler(),
'SGDClassifier': SGDClassifier(),
'SGDRegressor': SGDRegressor(),
'SVC': SVC(),
'SVR': SVR(),
'SelectFdr': SelectFdr(),
'SelectFpr': SelectFpr(),
'SelectFwe': SelectFwe(),
'SelectKBest': SelectKBest(),
'SelectPercentile': SelectPercentile(),
'ShrunkCovariance': ShrunkCovariance(),
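# A hedged sketch of how a name -> estimator registry like the fragment above
# might be exercised: fit each default-constructed estimator on toy data and
# report failures. The dict name `estimators` and the toy data are
# assumptions, since the surrounding code is not shown.
import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(50, 4)
y = rng.randint(0, 2, 50)
for name in sorted(estimators):
    try:
        estimators[name].fit(X, y)
        print(name, 'fit OK')
    except Exception as exc:
        print(name, 'skipped:', exc)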
minMaxScaler = MinMaxScaler(feature_range=(0.0, 1.0))
#normalizer = skprep.Normalizer()
columnDeleter = fs.FeatureDeleter()

# FEATURE SELECTION
varianceThresholdSelector = VarianceThreshold(threshold=(0))
percentileSelector = SelectPercentile(score_func=f_classif, percentile=20)
kBestSelector = SelectKBest(f_classif, 1000)

# FEATURE EXTRACTION
#rbmPipe = skpipe.Pipeline(steps=[('scaling', minMaxScaler), ('rbm', rbm)])
nmf = NMF(n_components=150)
pca = PCA(n_components=80)
sparse_pca = SparsePCA(n_components=700, max_iter=3, verbose=2)
kernel_pca = KernelPCA(n_components=150)  # Costs huge amounts of ram
randomized_pca = RandomizedPCA(n_components=500)

# REGRESSORS
random_forest_regressor = RandomForestRegressor(n_estimators=256)
gradient_boosting_regressor = GradientBoostingRegressor(n_estimators=60)
support_vector_regressor = svm.SVR()

# CLASSIFIERS
support_vector_classifier = svm.SVC(probability=True, verbose=True)
linear_support_vector_classifier = svm.LinearSVC(dual=False)
nearest_neighbor_classifier = KNeighborsClassifier()
extra_trees_classifier = ExtraTreesClassifier(n_estimators=256)
bagging_classifier = BaggingClassifier(
    base_estimator=GradientBoostingClassifier(n_estimators=200, max_features=4),
    max_features=0.5,
class rPCA:
    """
    Randomized Principal Component Analysis.

    Parameters
    ----------
    data_set: array_like
        Input matrix
    M: integer
        Target dimensionality

    Methods
    -------
    x_mean
        Mean value of the data
    pca_expvar
        Explained variance
    pca_eigvec
        Eigenvectors of the Covariance matrix
    pca_coeffs
        Coefficients of the data in the PC space
    pca_transform
        Projection of the data in the M dimensional space
    """

    def __init__(self, data_set, M):
        self.data_set = data_set
        self.dimensionality = M
        self.rPCA = RandomizedPCA(n_components=M)
        self.rPCA.fit(data_set)

    def pca_eigvec(self):
        """
        Principal components

        Returns
        -------
        components: numpy array
            M Principal components
        """
        return self.rPCA.components_

    def pca_expvar(self):
        """
        Percentage of variance explained by each component

        Returns
        -------
        expvar: numpy array
            Explained variance
        """
        return self.rPCA.explained_variance_ratio_

    def pca_transform(self):
        """
        Project the data in the M dimensional space

        Returns
        -------
        transform: numpy array
            Transformed data
        """
        return self.rPCA.transform(self.data_set)

    def x_mean(self):
        """
        Mean vector of the observed data

        Returns
        -------
        x_bar: numpy array
            The mean vector of the observed data
        """
        N, D = np.shape(self.data_set)
        x_bar = np.array([])
        for oo in np.arange(0, D):
            x_bar = np.append(x_bar, np.mean(self.data_set[:, oo]))
        return x_bar

    def pca_coeffs(self, x_vector):
        """
        Transformation of an observation vector in the M principal components

        Parameters
        ----------
        x_vector: array_like
            Input vector of the same dimension as the data_set

        Returns
        -------
        x_tilde: numpy array
            Transformed vector
        """
        # Vectors as column vectors
        x = np.matrix(x_vector).T
        x_bar = np.matrix(self.x_mean()).T
        x_tilde = np.array([])
        u_vec = self.pca_eigvec()
        # project the centered vector onto each principal axis:
        # x_tilde[oo] = (x - x_bar) . u_oo
        for oo in np.arange(0, self.dimensionality):
            u_tmp = np.matrix(u_vec[oo, :]).T
            tmp = x.T * u_tmp - x_bar.T * u_tmp
            x_tilde = np.append(x_tilde, tmp)
        return x_tilde
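# A short usage sketch for the rPCA wrapper above; the random data and the
# choice of M=2 are illustrative assumptions, not part of the original class.
import numpy as np

rng = np.random.RandomState(0)
data = rng.randn(100, 5)             # 100 observations, 5 features
model = rPCA(data, M=2)

print(model.pca_expvar())            # explained variance ratio of the 2 PCs
print(model.pca_eigvec().shape)      # (2, 5): one eigenvector per component
print(model.pca_transform().shape)   # (100, 2): data projected onto the PCs
print(model.pca_coeffs(data[0]))     # coefficients of a single observation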