def test_pca_on_uncentered_data():
    # Both solvers should produce the same explained-variance ratios,
    # even when X has not been mean-centered.
    pca1 = PCA(solver='svd')
    pca1.fit(X)

    pca2 = PCA(solver='eigen')
    pca2.fit(X)
    assert_almost_equal(pca1.e_vals_normalized_, pca2.e_vals_normalized_)

def test_evals():
    pca = PCA(n_components=2, solver='eigen')
    pca.fit(X_std)
    assert_almost_equal(pca.e_vals_, [2.9, 0.9, 0.2, 0.02], decimal=1)

    pca = PCA(n_components=2, solver='svd')
    pca.fit(X_std)
    assert_almost_equal(pca.e_vals_, [2.9, 0.9, 0.2, 0.02], decimal=1)

def test_whitening():
    # Without whitening, the projected components keep the eigenvalue
    # variances, so the covariance trace is ~2.9 + ~0.9 = 3.9.
    pca = PCA(n_components=2)
    res = pca.fit(X_std).transform(X_std)
    diagonals_sum = np.sum(np.diagonal(np.cov(res.T)))
    assert round(diagonals_sum, 1) == 3.9, diagonals_sum

    # With whitening, each component is scaled to unit variance,
    # so the trace equals n_components.
    pca = PCA(n_components=2, whitening=True)
    res = pca.fit(X_std).transform(X_std)
    diagonals_sum = np.sum(np.diagonal(np.cov(res.T)))
    assert round(diagonals_sum, 1) == 2.0, diagonals_sum
Example #4
def test_evals():
    pca = PCA(n_components=2, solver='eigen')
    pca.fit(X_std)

    expected = [2.93035378, 0.92740362, 0.14834223, 0.02074601]
    assert_almost_equal(pca.e_vals_, expected, decimal=5)

    pca = PCA(n_components=2, solver='svd')
    pca.fit(X_std)
    assert_almost_equal(pca.e_vals_, expected, decimal=5)

def test_loadings():
    expect = np.array([[0.9, -0.4, -0.3, 0.], [-0.5, -0.9, 0.1, -0.],
                       [1., -0., 0.1, -0.1], [1., -0.1, 0.2, 0.1]])

    pca = PCA(solver='eigen')
    pca.fit(X_std)
    assert_almost_equal(pca.loadings_, expect, decimal=1)

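    # The SVD solver may flip the sign of individual components, so the
    # expected loadings below differ from the eigen ones only in sign.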
    expect = np.array([[-0.9, -0.4, 0.3, 0.], [0.4, -0.9, -0.1, -0.],
                       [-1., -0., -0.1, -0.1], [-1., -0.1, -0.2, 0.1]])

    pca = PCA(solver='svd')
    pca.fit(X_std)
    assert_almost_equal(pca.loadings_, expect, decimal=1)

def extract_features(extraction_type, n_components):
    if extraction_type == 'pca':
        ext = PCA(n_components=n_components)
        return ext
    elif extraction_type == 'lda':
        ext = LDA(n_discriminants=n_components)
        return ext
    else:
        print("Enter a valid method ('pca' or 'lda')\n")
Example #7
def extract_features(tipo, n):
    if tipo == 'pca':
        ext = PCA(n_components=n)
        return ext
    elif tipo == 'lda':
        ext = LDA(n_discriminants=n)
        return ext
    else:
        print("Enter a valid method (pca or lda)\n")
Example #8
def test_fail_array_transform():
    pca = PCA(n_components=2)
    pca.fit(X)
    assert_raises(ValueError,
                  'X must be a 2D array. Try X[:, numpy.newaxis]',
                  pca.transform,
                  X[1])

def test_variance_explained_ratio():
    pca = PCA()
    pca.fit(X_std)
    # The normalized eigenvalues are variance ratios:
    # non-negative and summing to 1.
    assert math.isclose(np.sum(pca.e_vals_normalized_), 1.)
    assert not np.any(pca.e_vals_normalized_ < 0.)

def test_default_components():
    pca = PCA()
    res = pca.fit(X_std).transform(X_std)
    assert res.shape[1] == 4

def test_evals():
    pca = PCA(n_components=2, solver='eigen')
    res = pca.fit(X).transform(X)
    assert_almost_equal(pca.e_vals_, [2.93, 0.93, 0.15, 0.02], decimal=2)
Example #12
def test_fail_array_fit():
    pca = PCA(n_components=2)
    assert_raises(ValueError,
                  'X must be a 2D array. Try X[:, numpy.newaxis]',
                  pca.fit,
                  X[1])

def test_default_2components():
    pca = PCA(n_components=2)
    res = pca.fit(X).transform(X)
    assert res.shape[1] == 2

def test_variance_explained_ratio():
    pca = PCA()
    pca.fit(X_std)
    assert_almost_equal(np.sum(pca.e_vals_normalized_), 1.)
    assert np.sum(pca.e_vals_normalized_ < 0.) == 0
Example #15
                        bootstrap_features=False,
                        oob_score=False,
                        warm_start=False,
                        n_jobs=1,
                        random_state=0,
                        verbose=1)
scores = cross_val_score(bag, X, y, scoring='f1_micro', cv=5, verbose=5)
print(scores.mean())
'''
trees   estims  f1
300     10      33.4
300     20
150     50
'''

pca = PCA(n_components=1000)
X_pca = pca.fit(X).transform(X)

et = ExtraTreesClassifier(n_estimators=300,
                          max_depth=None,
                          random_state=0,
                          verbose=1)
bag = BaggingClassifier(base_estimator=et,
                        n_estimators=20,
                        max_samples=1.0,
                        max_features=1.0,
                        bootstrap=True,
                        bootstrap_features=False,
                        oob_score=False,
                        warm_start=False,
                        n_jobs=1,
Example #16
# ExtraTrees

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import cross_val_score
et = ExtraTreesClassifier(n_estimators=300, max_depth=None, random_state=0, verbose=5)
scores = cross_val_score(et, X, y, scoring='f1_micro', cv=5, verbose=5)
print(scores.mean())
# ~32% with max_depth=None, n_estimators=300



# linear PCA
from mlxtend.feature_extraction import PrincipalComponentAnalysis as PCA
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import cross_val_score
pca = PCA(n_components=700)
X_pca = pca.fit(X).transform(X)
et = ExtraTreesClassifier(n_estimators=500, max_depth=None, random_state=0, verbose=5)
scores = cross_val_score(et, X_pca, y, scoring='f1_micro', cv=5, verbose=5)
print(scores.mean())



# kernel PCA
from mlxtend.feature_extraction import RBFKernelPCA as KPCA

kpca = KPCA(gamma=1.0, n_components=700)
X_kpca = kpca.fit(X).transform(X)
et = ExtraTreesClassifier(n_estimators=500, max_depth=None, random_state=0, verbose=5)
scores = cross_val_score(et, X_kpca, y, scoring='f1_micro', cv=5, verbose=5)
print(scores.mean())
Example #17
            color='blue',
            marker='^',     #triangle marker
            alpha=0.5,
            )
plt.title('BMI vs glucose by sex')
plt.ylabel('Serum glucose concentration')
plt.xlabel('BMI')
plt.legend([sex1, sex2], ['Sex 1', 'Sex 2'])

#plt.show()
plt.savefig('../../figs/bivariate/subsetkpca1_2')
plt.close()
"""

# PCA only accepts a 2D array; 1D input must be reshaped first (see the sketch below).
pca = PCA(n_components=2)  #2-component linear PCA
X_pca = pca.fit(X).transform(X)

#print(X_pca)
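
# A minimal sketch of the reshape workaround mentioned above (assumes numpy
# is imported as np in the truncated header; 'col' is an illustrative name,
# not from the original script). A single feature column is 1D, so an extra
# axis is added before fitting:
col = X[:, 0][:, np.newaxis]   # shape (n_samples, 1) -- valid 2D PCA input
PCA(n_components=1).fit(col)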
'''
#Graph after pca
#generate graph from matrix
for i in range(len(X)):
    if sex[i] == 1:
        #Sex 1 glucose v BMI
        sex1 = plt.scatter(X_pca[i][0], #bmi
            X_pca[i][1],    #glucose
            color ='red',
            marker='o',     #circle marker
            alpha=0.5,
            )
Example #18
#Blue half moon
plt.scatter(X[y==1, 0], X[y==1, 1], # Start and peak/trough of each 'moon'.
            color='blue', marker='^', alpha=0.5)

plt.xlabel('x coordinate')
plt.ylabel('y coordinate')

#plt.show()
plt.savefig('../figs/tutorial/mlxtendex1_1.png')
plt.close()
# The moons are linearly inseparable, so standard linear PCA will fail to
# accurately represent the data in 1D space (a kernel PCA sketch follows this snippet).

#Use PCA for dimensionality reduction

#specify number of components in PCA
pca = PCA(n_components=2)
#Transform X in accordance with 2-component PCA
X_pca = pca.fit(X).transform(X)

# Red half moon
plt.scatter(X_pca[y==0, 0], X_pca[y==0, 1], # Start and peak/trough of each 'moon'.
            color='red', marker='o', alpha=0.5)

#Blue half moon
plt.scatter(X_pca[y==1, 0], X_pca[y==1, 1], # Start and peak/trough of each 'moon'.
            color='blue', marker='^', alpha=0.5)

plt.xlabel('PC1')
plt.ylabel('PC2')

#plt.show()
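
# A minimal sketch of the kernel alternative mentioned above, using mlxtend's
# RBFKernelPCA (imported as in a later example); gamma=15.0 is an assumed
# value for half-moon data, not taken from this snippet.
from mlxtend.feature_extraction import RBFKernelPCA as KPCA

kpca = KPCA(gamma=15.0, n_components=2)
X_kpca = kpca.fit(X).transform(X)

# After the kernel projection the two moons separate along PC1.
plt.scatter(X_kpca[y==0, 0], X_kpca[y==0, 1], color='red', marker='o', alpha=0.5)
plt.scatter(X_kpca[y==1, 0], X_kpca[y==1, 1], color='blue', marker='^', alpha=0.5)
plt.xlabel('PC1')
plt.ylabel('PC2')
#plt.show()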
Example #19
def test_variance_explained_ratio():
    pca = PCA()
    pca.fit(X_std)
    assert_almost_equal(np.sum(pca.e_vals_normalized_), 1.)
    assert np.sum(pca.e_vals_normalized_ < 0.) == 0

def test_eigen_vs_svd():
    pca = PCA(n_components=2, solver='eigen')
    eigen_res = pca.fit(X).transform(X)
    pca = PCA(n_components=2, solver='svd')
    svd_res = pca.fit(X).transform(X)
    # Component signs are arbitrary, so compare absolute projections.
    assert_allclose(np.absolute(eigen_res), np.absolute(svd_res), atol=0.0001)
Example #21
from mlxtend.feature_extraction import PrincipalComponentAnalysis as PCA
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from mlxtend.classifier import StackingClassifier

X = np.genfromtxt('../../contest_data/xtrain_linear_imputed.csv',
                  delimiter=',')
print('loaded X')
y = np.genfromtxt('../../contest_data/train.csv', delimiter=',')[1:, -1]
print('loaded y')

pca = PCA(n_components=300)
X_pca = pca.fit(X).transform(X)

et = ExtraTreesClassifier(n_estimators=1000,
                          max_depth=None,
                          random_state=0,
                          verbose=0)
svc = SVC(C=1, gamma='auto', verbose=0)
#dt = DecisionTreeClassifier(min_samples_leaf=5,random_state=0)
rf = RandomForestClassifier(n_estimators=1000,
                            max_depth=None,
                            random_state=0,
                            verbose=0)
lr = LogisticRegression()
sclf = StackingClassifier(classifiers=[et, svc, rf],
                          use_probas=True,

def test_fail_array_dimension():
    pca = PCA(n_components=2)
    assert_raises(ValueError, 'X must be a 2D array. Try X[:, numpy.newaxis]',
                  pca.transform, X[1])
Example #23
def test_default_components():
    pca = PCA(n_components=0)
    res = pca.fit(X).transform(X)