def test_kernel_pca_consistent_transform():
    # X_fit_ needs to retain the old, unmodified copy of X
    state = np.random.RandomState(0)
    X = state.rand(10, 10)
    kpca = KernelPCA(random_state=state).fit(X)
    transformed1 = kpca.transform(X)

    X_copy = X.copy()
    X[:, 0] = 666
    transformed2 = kpca.transform(X_copy)
    assert_array_almost_equal(transformed1, transformed2)
Example 2
def main():

	#set the timer
	start = time.time()

	#load the data
	trainX = np.load('trainX.npy')
	testX = np.load('testX.npy')
	trainY = np.load('trainY.npy')
	testY = np.load('testY.npy')
	print('\n!!! Data Loading Completed !!!\n')

	#get the 1st digit zero and plot it
	zero = trainX[14].reshape(28, 28)
	plt.imshow(zero, cmap=cm.Greys_r)
	plt.savefig("original"+str(trainY[14])+".png")
	#plt.show()

	#apply kpca
	kpca = KernelPCA(kernel='rbf', gamma=1, fit_inverse_transform=True)
	kpca.fit(trainX[0:3000])
	trainX_kpca = kpca.transform(trainX)
	testX_kpca = kpca.transform(testX)

	#do inverse transform and plot the result
	orig = kpca.inverse_transform(trainX_kpca)
	img = orig[14].reshape(28, 28)
	plt.imshow(img, cmap=cm.Greys_r)
	plt.savefig("reconstructed"+str(trainY[14])+".png")
	#plt.show()

	selector = SelectPercentile(f_classif, percentile=5)
	selector.fit(trainX_kpca, trainY)
	trainX = selector.transform(trainX_kpca)
	testX = selector.transform(testX_kpca)

	#fit a classifier
	parameters = {'n_neighbors' : list(np.arange(15)+1)}
	clf = GridSearchCV(KNeighborsClassifier(weights='distance', n_jobs=-1), parameters)
	clf.fit(trainX, trainY)

	pred = clf.predict(testX)
	print(accuracy_score(testY, pred))
	print(confusion_matrix(testY, pred))
	#print(clf.best_params_)
	print('total : %d, correct : %d, incorrect : %d\n' %(len(pred), np.sum(pred == testY), np.sum(pred != testY)))

	print('Test Time : %f Minutes\n' %((time.time()-start)/60))
def test_kernel_pca():
    rng = np.random.RandomState(0)
    X_fit = rng.random_sample((5, 4))
    X_pred = rng.random_sample((2, 4))

    def histogram(x, y, **kwargs):
        # Histogram kernel implemented as a callable.
        assert_equal(kwargs, {})  # no kernel_params that we didn't ask for
        return np.minimum(x, y).sum()

    for eigen_solver in ("auto", "dense", "arpack"):
        for kernel in ("linear", "rbf", "poly", histogram):
            # histogram kernel produces singular matrix inside linalg.solve
            # XXX use a least-squares approximation?
            inv = not callable(kernel)

            # transform fit data
            kpca = KernelPCA(4, kernel=kernel, eigen_solver=eigen_solver, fit_inverse_transform=inv)
            X_fit_transformed = kpca.fit_transform(X_fit)
            X_fit_transformed2 = kpca.fit(X_fit).transform(X_fit)
            assert_array_almost_equal(np.abs(X_fit_transformed), np.abs(X_fit_transformed2))

            # non-regression test: previously, gamma would be 0 by default,
            # forcing all eigenvalues to 0 under the poly kernel
            assert_not_equal(X_fit_transformed.size, 0)

            # transform new data
            X_pred_transformed = kpca.transform(X_pred)
            assert_equal(X_pred_transformed.shape[1], X_fit_transformed.shape[1])

            # inverse transform
            if inv:
                X_pred2 = kpca.inverse_transform(X_pred_transformed)
                assert_equal(X_pred2.shape, X_pred.shape)
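The callable histogram kernel in the test above is evaluated pairwise by KernelPCA. A minimal sketch (not part of the original test; the helper histogram_gram is introduced here for illustration) of the equivalent route through a precomputed Gram matrix:

import numpy as np
from sklearn.decomposition import KernelPCA

rng = np.random.RandomState(0)
X_fit = rng.random_sample((5, 4))
X_pred = rng.random_sample((2, 4))

def histogram_gram(A, B):
    # Gram matrix of the histogram (min) kernel between the rows of A and B
    return np.array([[np.minimum(a, b).sum() for b in B] for a in A])

kpca = KernelPCA(n_components=4, kernel="precomputed")
X_fit_transformed = kpca.fit_transform(histogram_gram(X_fit, X_fit))
X_pred_transformed = kpca.transform(histogram_gram(X_pred, X_fit))
assert X_pred_transformed.shape[1] == X_fit_transformed.shape[1]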
class RegionSplitter_PCA_KMean():
    def __init__(self, data, label):

        data_dim_num = len(data[0])
        label_dim_num = len(label[0])

        self.n_comp = max(1, data_dim_num)

        self.pca = PCA(n_components=self.n_comp)

        data = self.pca.fit_transform(data)
        data_zipped = list(zip(*data))

        # k-mean cluster for the dimension
        self.clusterer = KMeans(n_clusters=2, init='k-means++')

        self.clusterer.fit(list(zip(*data_zipped)))


    def classify(self, data):
        if not isinstance(data, tuple):
            raise TypeError("data must be a tuple")

        # sklearn expects a 2D array, so reshape the single sample
        transformed = self.pca.transform(np.asarray(data).reshape(1, -1))
        group = self.clusterer.predict(transformed)[0]

        return group == 0
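A minimal usage sketch for the class above (the calling convention, lists of tuples for data and label, is inferred from the classify signature rather than documented in the source):

import numpy as np

rng = np.random.RandomState(0)
data = [tuple(row) for row in rng.rand(30, 3)]
label = [tuple(row) for row in rng.rand(30, 2)]

splitter = RegionSplitter_PCA_KMean(data, label)
print(splitter.classify(data[0]))  # True if the sample lands in cluster 0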
Example 5
def test_kernel_pca():
    rng = np.random.RandomState(0)
    X_fit = rng.random_sample((5, 4))
    X_pred = rng.random_sample((2, 4))

    for eigen_solver in ("auto", "dense", "arpack"):
        for kernel in ("linear", "rbf", "poly"):
            # transform fit data
            kpca = KernelPCA(4, kernel=kernel, eigen_solver=eigen_solver,
                             fit_inverse_transform=True)
            X_fit_transformed = kpca.fit_transform(X_fit)
            X_fit_transformed2 = kpca.fit(X_fit).transform(X_fit)
            assert_array_almost_equal(np.abs(X_fit_transformed),
                                      np.abs(X_fit_transformed2))

            # non-regression test: previously, gamma would be 0 by default,
            # forcing all eigenvalues to 0 under the poly kernel
            assert_not_equal(X_fit_transformed, [])

            # transform new data
            X_pred_transformed = kpca.transform(X_pred)
            assert_equal(X_pred_transformed.shape[1],
                         X_fit_transformed.shape[1])

            # inverse transform
            X_pred2 = kpca.inverse_transform(X_pred_transformed)
            assert_equal(X_pred2.shape, X_pred.shape)
    def test_compare_clinical_kernel(self):
        x_full, y, _, _ = load_arff_file(WHAS500_FILE, ['fstat', 'lenfol'], '1',
                                         standardize_numeric=False, to_numeric=False)

        trans = ClinicalKernelTransform()
        trans.fit(x_full)

        x = encode_categorical(standardize(x_full))

        kpca = KernelPCA(kernel=trans.pairwise_kernel)
        xt = kpca.fit_transform(x)

        nrsvm = FastSurvivalSVM(optimizer='rbtree', tol=1e-8, max_iter=1000, random_state=0)
        nrsvm.fit(xt, y)

        rsvm = FastKernelSurvivalSVM(optimizer='rbtree', kernel=trans.pairwise_kernel,
                                     tol=1e-8, max_iter=1000, random_state=0)
        rsvm.fit(x, y)

        pred_nrsvm = nrsvm.predict(kpca.transform(x))
        pred_rsvm = rsvm.predict(x)

        self.assertEqual(len(pred_nrsvm), len(pred_rsvm))

        c1 = concordance_index_censored(y['fstat'], y['lenfol'], pred_nrsvm)
        c2 = concordance_index_censored(y['fstat'], y['lenfol'], pred_rsvm)

        self.assertAlmostEqual(c1[0], c2[0])
        self.assertTupleEqual(c1[1:], c2[1:])
Example 7
def main():

	#set the timer
	start = time.time()

	#load the data
	mnist = fetch_mldata('MNIST original')
	mnist.target = mnist.target.astype(np.int32)

	seed = np.random.randint(1,30000)
	rand = np.random.RandomState(seed)
	items = len(mnist.target)
	indices = rand.randint(items, size = 70000)
	trindex = indices[0:30000]
	tsindex = indices[30000:]

	#scale down features to the range [0, 1]
	mnist.data = mnist.data/255.0
	mnist.data = mnist.data.astype(np.float32)

	trainX = mnist.data[trindex]
	testX = mnist.data[tsindex]
	trainY = mnist.target[trindex]
	testY = mnist.target[tsindex]

	#extract the features using KPCA
	kpca = KernelPCA(kernel='precomputed')
	kpca_train = arc_cosine(trainX[0:1000], trainX[0:1000])
	#Fit the model from data in X
	kpca.fit(kpca_train)

	kernel_train = arc_cosine(trainX, trainX[0:1000])
	kernel_test = arc_cosine(testX, trainX[0:1000])

	trainX_kpca = kpca.transform(kernel_train)
	testX_kpca = kpca.transform(kernel_test)
	print(testX_kpca.shape)

	#fit the svm model and compute accuaracy measure
	clf = svm.SVC(kernel=arc_cosine)
	clf.fit(trainX_kpca, trainY)

	pred = clf.predict(testX_kpca)
	print(accuracy_score(testY, pred))
	print('total : %d, correct : %d, incorrect : %d\n' %(len(pred), np.sum(pred == testY), np.sum(pred != testY)))

	print('Test Time : %f Minutes\n' %((time.time()-start)/60))
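The block above fits KernelPCA on a precomputed Gram matrix of a training subset and then transforms kernel matrices computed against that same subset. A minimal sketch of the same pattern, with sklearn's rbf_kernel standing in for arc_cosine (which is not defined in this excerpt):

import numpy as np
from sklearn.decomposition import KernelPCA
from sklearn.metrics.pairwise import rbf_kernel

rng = np.random.RandomState(0)
train = rng.rand(200, 30)
test = rng.rand(50, 30)
anchors = train[:100]                     # subset the kernel is computed against

kpca = KernelPCA(kernel='precomputed')
kpca.fit(rbf_kernel(anchors, anchors))    # fit on K(anchors, anchors)

train_kpca = kpca.transform(rbf_kernel(train, anchors))  # K(train, anchors)
test_kpca = kpca.transform(rbf_kernel(test, anchors))    # K(test, anchors)
print(test_kpca.shape)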
Example 8
def train_kmeans_on_pca(train_data, train_labels, test_data, test_labels, n_components=2):
    from sklearn.decomposition import KernelPCA
    from sklearn.cluster import KMeans

    pca = KernelPCA(n_components=n_components).fit(train_data)
    
    transformed_train_data = pca.transform(train_data)
    transformed_test_data = pca.transform(test_data)

    kmeans = KMeans(n_clusters=2, random_state=0).fit(transformed_train_data)
    
    kmeans_pred = kmeans.predict(transformed_test_data)    
    kmeans_pred[kmeans_pred == 0] = -1
    
    acc = accuracy_score(test_labels, kmeans_pred)
    
    return pca, kmeans, max(acc, 1-acc)
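A minimal usage sketch for train_kmeans_on_pca (it assumes accuracy_score is imported in the same module, as the function body requires; binary labels in {-1, 1} are an assumption based on the relabeling of cluster 0 to -1):

import numpy as np
from sklearn.metrics import accuracy_score  # needed by the function body

rng = np.random.RandomState(0)
train_data, test_data = rng.rand(100, 5), rng.rand(40, 5)
train_labels = rng.choice([-1, 1], size=100)
test_labels = rng.choice([-1, 1], size=40)

pca, kmeans, acc = train_kmeans_on_pca(train_data, train_labels, test_data, test_labels)
print(acc)  # max(acc, 1 - acc) compensates for the arbitrary cluster labelling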
Example 9
def doKernelPCA(q, components=40):
	global data

	# load test query
	loadFile('test', q)
	
	# fit model
	kpca = KernelPCA(components, kernel="rbf")
	kpca.fit(data)

	# transform and print test query
	data = kpca.transform(data)
	printFile('test{}'.format(q))

	for kind in ['train', 'vali']:
		loadFile(kind)
		data = kpca.transform(data)
		printFile(kind + str(q))
Example 10
def generate_kpca_compression(X, n_components=16):
    """
    Compresses the data using sklearn KernelPCA implementation.

    :param X: Data (n_samples, n_features)
    :param n_components: Number of dimensions for PCA to keep

    :return: X_prime (the compressed representation), pca
    """

    kpca = KernelPCA(n_components=n_components, kernel='rbf', eigen_solver='arpack', fit_inverse_transform=False)
    kpca.fit(X)

    return kpca.transform(X), kpca
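A minimal usage sketch for the compression helper above; the data shape is illustrative only:

import numpy as np

X = np.random.RandomState(0).rand(200, 64)
X_prime, kpca = generate_kpca_compression(X, n_components=16)
print(X_prime.shape)  # (200, 16)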
def test_kernel_pca_sparse():
    rng = np.random.RandomState(0)
    X_fit = sp.csr_matrix(rng.random_sample((5, 4)))
    X_pred = sp.csr_matrix(rng.random_sample((2, 4)))

    for eigen_solver in ("auto", "arpack"):
        for kernel in ("linear", "rbf", "poly"):
            # transform fit data
            kpca = KernelPCA(4, kernel=kernel, eigen_solver=eigen_solver, fit_inverse_transform=False)
            X_fit_transformed = kpca.fit_transform(X_fit)
            X_fit_transformed2 = kpca.fit(X_fit).transform(X_fit)
            assert_array_almost_equal(np.abs(X_fit_transformed), np.abs(X_fit_transformed2))

            # transform new data
            X_pred_transformed = kpca.transform(X_pred)
            assert_equal(X_pred_transformed.shape[1], X_fit_transformed.shape[1])
Example 12
def reduction(data, params):

    # parse parameters (exec() cannot bind local variables inside a function in
    # Python 3, so read the expected keys from the dict directly)
    n_components = params.get('n_components')
    kernel = params.get('kernel', 'linear')

    # apply PCA

    kpca = KernelPCA(n_components=n_components, kernel=kernel)
    kpca.fit(data)
    X = kpca.transform(data)

    return X
def kpca(data, n_components, train, test, kernel='linear', gamma=None, degree=3, coef0=1, alpha=0.1, evaluation=False):
    # Kernel PCA
    
    kpca = KernelPCA(n_components, fit_inverse_transform=True, kernel=kernel, gamma=gamma, degree=degree, 
                     coef0=coef0, alpha=alpha).fit(data[train])
    
    data_reduced = kpca.transform(data)
    
    if evaluation:
        data_rec = kpca.inverse_transform(data_reduced)
        loss = mean_squared_error(data[test], data_rec[test])
        return loss
    
    #name = 'Kernel PCA ('+kernel+')'
    name = 'Kernel PCA'

    return data_reduced, name, kpca.inverse_transform
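A minimal usage sketch for the kpca helper above (it relies on mean_squared_error being imported in its module; the train/test index arrays and hyperparameters below are illustrative):

import numpy as np
from sklearn.metrics import mean_squared_error  # needed for evaluation=True

rng = np.random.RandomState(0)
data = rng.rand(100, 10)
train, test = np.arange(80), np.arange(80, 100)

# evaluation mode: reconstruction error on the held-out rows
loss = kpca(data, 5, train, test, kernel='rbf', gamma=0.1, evaluation=True)

# reduction mode: reduced data plus a handle on the inverse transform
data_reduced, name, inverse_transform = kpca(data, 5, train, test, kernel='rbf', gamma=0.1)
print(name, data_reduced.shape, loss)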
Example 14
class KernelPCAReduction(AbstractReduction):
    """
    Use kernel PCA to reduce dimensionality

    http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.KernelPCA.html
    """

    def __init__(self, n_components, **kwargs):
        self.pca = KernelPCA(n_components=n_components, **kwargs)
        self.n_components = n_components

    def get_n_components(self):
        # renamed accessor: a method called n_components would be shadowed by the attribute above
        return self.n_components

    def fit(self, X, Y=None):
        self.pca.fit(X)

    def transform(self, X):
        return self.pca.transform(X)
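A minimal usage sketch for KernelPCAReduction (AbstractReduction is assumed to be defined elsewhere in the project; keyword arguments are forwarded to sklearn's KernelPCA):

import numpy as np

X = np.random.RandomState(0).rand(50, 10)
reducer = KernelPCAReduction(n_components=3, kernel='rbf', gamma=0.5)
reducer.fit(X)
print(reducer.transform(X).shape)  # (50, 3)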
Example 15
	def __init__(self,master):
		super (Linear_PCA_vs_SPY ,self).__init__(master)

		fig = Figure(figsize=(10,10), dpi = 100)
		ax= fig.add_subplot(1,1,1, axisbg='#cccccc')
		demo_port = ['SPY','BA', 'WFC', 'PEP', 'AMGN', 'BAX', 'BK', 'FB', 'COST', 'DIS', 'LOW', 'FDX',  'TWX', 'AIG', 'MSFT', 'IBM', 'SBUX', 'FCX', 'PG', 'BMY', 'MDT', 'SPG', 'VZ', 'OXY', 'CL', 'GILD', 'CVS', 'AMZN', 'GE', 'ABT', 'JNJ', 'UTX', 'WMT', 'ALL', 'PFE', 'FOXA', 'MO', 'MCD', 'MMM', 'SO', 'MON', 'APC', 'NOV', 'APA', 'CMCSA', 'DVN', 'ACN', 'CAT', 'EXC', 'TXN', 'UNP', 'HPQ', 'V', 'LMT', 'RTN', 'CSCO', 'DOW', 'LLY', 'NSC', 'JPM', 'C', 'HAL', 'INTC', 'ABBV', 'UNH', 'MA', 'GM', 'XOM', 'KO', 'EBAY', 'MET', 'GS', 'CVX', 'HON', 'MRK', 'AXP', 'USB', 'EMC', 'DD', 'HD', 'AAPL', 'PM', 'F', 'T', 'UPS', 'SLB', 'AEP', 'EMR', 'COF', 'MDLZ', 'GOOG', 'NKE', 'COP', 'QCOM', 'TGT', 'ORCL', 'GD', 'MS', 'BAC']
	
	
		data = pd.DataFrame()
		for symbol in demo_port:
			data[symbol] = web.DataReader(symbol, data_source='yahoo')['Close']
		data = data.dropna()
	
		spy = pd.DataFrame(data.pop('SPY'))	
	
		#normalize data
		scale_func = lambda x: ( x-x.mean())/x.std()
		#apply PCA
		pca = KernelPCA().fit(data.apply(scale_func))

		get_we = lambda x: x/x.sum()
		#print (get_we(pca.lambdas_)[:20])

		pca_one = KernelPCA(n_components = 1).fit(data.apply(scale_func))
		spy['PCA_1'] = pca_one.transform(data)
		
		
		# Plotting
		spy.apply(scale_func)
		ax= fig.add_subplot(1,1,1, axisbg='#cccccc')
		lin_reg = np.polyval(np.polyfit(spy['PCA_1'],spy['SPY'],1) , spy['PCA_1'])
		
		ax.scatter(spy['PCA_1'], spy['SPY'], c = data.index)
		ax.plot(spy['PCA_1'], lin_reg, 'r', lw = 2)
		ax.set_xlabel('PCA_1')
		ax.set_ylabel('SPY')
		canvas = FigureCanvasTkAgg(fig, self)
		canvas.show()
		canvas.get_tk_widget().pack(side = tk.TOP,fill= tk.BOTH,expand = True)

		toolbar = NavigationToolbar2TkAgg(canvas, self)
		toolbar.update()
		canvas._tkcanvas.pack(side = tk.TOP, fill = tk.BOTH, expand = True)
Example 16
def getKPCAcomp(dict_read):
    A = np.arange(10000)
    for key in dict_read.keys():
        if key<=1000:
            [sample_rate,X] = dict_read.get(key)
            # if a song doesn't have 10000 features, pad with 0s at the end (this usually isn't the case)
            if (len(X)<10000):
                dif = 10000 - len(X)
                for i in range(dif):
                    X = np.hstack((X,0.0))
            A = np.vstack((A,X[:10000]))
        else:
            break
    A = np.delete(A, 0, 0)
    A = A.astype(float)
    kpca = KernelPCA(n_components=100, kernel="rbf")
    kpca.fit(A)
    A = kpca.transform(A)
    return A
def perform_kpca(input_data):
    '''
    Apply kernel PCA to the outlier-removed data,
    using the scikit-learn KernelPCA module.
    '''
    from sklearn.decomposition import KernelPCA

    # specify the kernel function used in the kernel PCA
    KERNEL = input('Enter the kernel for KernelPCA (options are: cosine, rbf, linear, sigmoid): ')
    kpca = KernelPCA(n_components=len(input_data.T), kernel=KERNEL)
    # scale the input dataset
    from sklearn.preprocessing import scale
    scld_input_data = scale(input_data, axis=0, with_mean=True, with_std=True, copy=True)
    kpca.fit(scld_input_data)
    # transform the dataset onto the computed PCs
    kpca_input_data = kpca.transform(scld_input_data)
    # percentage of variance represented by each eigenvalue
    Kpca_percent = np.array([kpca.lambdas_[y] / sum(kpca.lambdas_) for y in range(len(kpca.lambdas_))])
    Var_explained = np.c_[Kpca_percent.reshape(len(Kpca_percent), 1)]
    print('\nVariance explained by eigenvalues of KPCA')
    print(['Kpca'])
    print(Var_explained)
    return kpca_input_data
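A minimal usage sketch for perform_kpca; it reads the kernel name from standard input, and it relies on an older scikit-learn where KernelPCA still exposes lambdas_:

import numpy as np

X = np.random.RandomState(0).rand(60, 8)
X_kpca = perform_kpca(X)   # type e.g. "rbf" at the prompt
print(X_kpca.shape)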
    def test_compare_rbf(self):
        x, y, _, _ = load_arff_file(WHAS500_FILE, ['fstat', 'lenfol'], '1')

        kpca = KernelPCA(kernel="rbf")
        xt = kpca.fit_transform(x)

        nrsvm = FastSurvivalSVM(optimizer='rbtree', tol=1e-8, max_iter=1000, random_state=0)
        nrsvm.fit(xt, y)

        rsvm = FastKernelSurvivalSVM(optimizer='rbtree', kernel="rbf",
                                     tol=1e-8, max_iter=1000, random_state=0)
        rsvm.fit(x, y)

        pred_nrsvm = nrsvm.predict(kpca.transform(x))
        pred_rsvm = rsvm.predict(x)

        self.assertEqual(len(pred_nrsvm), len(pred_rsvm))

        c1 = concordance_index_censored(y['fstat'], y['lenfol'], pred_nrsvm)
        c2 = concordance_index_censored(y['fstat'], y['lenfol'], pred_rsvm)

        self.assertAlmostEqual(c1[0], c2[0])
        self.assertTupleEqual(c1[1:], c2[1:])
Example 19
File: pca.py Project: DaMSL/ddc
class PCAKernel(PCAnalyzer):
  """ Non-linear PCA as wrapper over SciKitLearn Kernels """
  def __init__(self, components, ktype='poly'):
    PCAnalyzer.__init__(self)
    if isinstance(components, int):
      self.n_components = components
    self.pca = KernelPCA(kernel=ktype, n_components=components)
    self.type = 'kernel'

  def solve(self, X):
    self.dim = np.prod(X.shape[1:])
    self.pca.fit(X.reshape(len(X), self.dim))
    self.trainsize = len(X)

  def project(self, X):
    if isinstance(X, list):
      X = np.array(X)
    dimX = np.prod(X.shape[1:])
    if dimX != self.dim:
      logging.error('Projection Error in KPCA: Cannot reshape/project %s size data using PC Vects of size, %s', str(X.shape), str(self.dim))
      return None
    projection = self.pca.transform(X.reshape(len(X), dimX))
    return projection
Example 20
print('LDA transform_support vector machines_training score: ',
      svm.score(X_train_lda, y_train))
print('LDA transform_support vector machines_testing score: ',
      svm.score(X_test_lda, y_test))

#kPCA

gamma_space = np.logspace(-2, 0, 10)
lr_train = []
lr_test = []
svm_train = []
svm_test = []
for gamma in gamma_space:
    kPCA = KernelPCA(n_components=2, kernel='rbf', gamma=gamma)
    X_train_kpca = kPCA.fit_transform(X_train_std, y_train)
    X_test_kpca = kPCA.transform(X_test_std)
    lr = LogisticRegression()
    lr = lr.fit(X_train_kpca, y_train)
    lr_train.append(lr.score(X_train_kpca, y_train))
    lr_test.append(lr.score(X_test_kpca, y_test))
    svm = SVC(kernel='linear', C=1.0, random_state=1)
    svm.fit(X_train_kpca, y_train)
    svm_train.append(svm.score(X_train_kpca, y_train))
    svm_test.append(svm.score(X_test_kpca, y_test))

print("gamma  lr_train  lr_test  svm_train  svm_test")
for i in range(10):
    print('%.3f,   %.3f,   %.3f,    %.3f,    %.3f' % (
        gamma_space[i],
        lr_train[i],
        lr_test[i],
Example 21
class RegionSplitter_PCA_oudeyer():

    def __init__(self, data, label):

        self.cut_dim = 0
        self.cut_val = 0
        num_candidates = 50

        data_dim_num = len(data[0])
        label_dim_num = len(label[0])

        self.n_comp = max(1, data_dim_num)

        self.pca = PCA(n_components=self.n_comp)  # sklearn's PCA takes no 'kernel' argument
        # self.ica = ICA(n_components=self.n_comp)

        data = self.pca.fit_transform(data)
        #data = self.ica.fit_transform(data)

        data_zipped = list(zip(*data))

        data_dim_num = len(data[0])
        label_dim_num = len(label[0])


        # sort in each dimension
        dim_min = float("inf")
        for i in range(data_dim_num):

            for k in range(num_candidates):
                # pick a random value
                max_val = max(data_zipped[i])
                min_val = min(data_zipped[i])
                cut_val = random.choice(np.linspace(min_val, max_val, num=500))

                groups = [[label[j] for j in range(len(data_zipped[i])) if data_zipped[i][j] <= cut_val],
                          [label[j] for j in range(len(data_zipped[i])) if data_zipped[i][j] > cut_val]]

                # check if any of the group is 0
                if len(groups[0]) == 0 or len(groups[1]) == 0:
                    continue

                weighted_avg_variance = []
                for group in groups:
                    num_sample = len(group)
                    group = zip(*group)

                    variance = []
                    for group_k in group:
                        mean = math.fsum(group_k)/len(group_k)
                        norm = max(math.fsum([x**2 for x in group_k])/len(group_k), 1)
                        variance.append(math.fsum([((x - mean)**2)/norm for x in group_k]))
                    weighted_avg_variance.append(math.fsum(variance)/len(variance)*num_sample)

                in_group_variance = math.fsum(weighted_avg_variance)

                if in_group_variance < dim_min:

                    dim_min = in_group_variance
                    self.cut_dim = i
                    self.cut_val = cut_val


        # just cut in half
        #self.cut_val = exemplars[int(sample_num/2)][0][self.cut_dim]

    def classify(self, data):
        if not isinstance(data, tuple):
            raise TypeError("data must be a tuple")

        # sklearn expects a 2D array, so reshape the single sample
        data = tuple(self.pca.transform(np.asarray(data).reshape(1, -1))[0])
        # data = tuple(self.ica.transform(data)[0])
        group = data[self.cut_dim] <= self.cut_val

        return group == 0
Example 22
if (0):
    #%% K-PCA
    # Calculate accumulated variance
    kpca = KernelPCA(kernel="rbf",gamma=gamma)
    kpca.fit_transform(Xtrain)
    eigenvals = kpca.lambdas_[0:220]

    
    # Calculate classifiation scores for each component
    nComponents = np.linspace(1, 500, 100, endpoint=True).astype(int)
    kpcaScores = np.zeros((5, len(nComponents)))
    
    kpca = KernelPCA(n_components = Ntrain,kernel="rbf",gamma=gamma)
    kpca.fit(Xtrain)
    XtrainT = kpca.transform(Xtrain)
    XtestT = kpca.transform(Xtest)
    

    for i in range(len(nComponents)):   
        kpcaScores[:,i] = util.classify(XtrainT[:,:nComponents[i]],XtestT[:,:nComponents[i]],labelsTrain,labelsTest)

    #%% Plot accuracies for kPCA
    plt.figure()
    for i in range (5):
        plt.plot(nComponents,kpcaScores[i,:],lw=3)

    plt.xlim(1,np.amax(nComponents))
    plt.title('kPCA accuracy')
    plt.xlabel('Number of components')
    plt.ylabel('accuracy')
Example 23
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

# Kernel PCA
from sklearn.decomposition import KernelPCA
Kpca = KernelPCA(n_components=2, kernel='rbf')
X_train = Kpca.fit_transform(X_train)
X_test = Kpca.transform(X_test)

# Fitting Logistic Regression to the Training Set
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Visualising the Training set results
from matplotlib.colors import ListedColormap
Example 24
class DimensionalityReductionPool:

    # common param
    random_state = None
    n_jobs = None
    runtime = None

    # PCA param
    pca_obj = None

    # LDA param
    lda_obj = None

    # Kernel PCA param
    gamma_kernel_pca = None
    kernel_pca_obj = None

    def __init__(self, random_state, n_jobs, gamme_kernel_pca):
        self.random_state = random_state
        self.n_jobs = n_jobs
        self.runtime = numpy.zeros(3)
        # PCA object
        self.pca_obj = PCA(n_components='mle',
                           svd_solver='full',
                           random_state=self.random_state)
        # LDA object
        self.lda_obj = LinearDiscriminantAnalysis(n_components=None)
        # Kernel PCA object
        self.gamma_kernel_pca = gamme_kernel_pca
        self.kernel_pca_obj = KernelPCA(n_components=None,
                                        kernel='rbf',
                                        gamma=self.gamma_kernel_pca,
                                        random_state=self.random_state,
                                        n_jobs=self.n_jobs)

    def PCA_fit(self, X):
        # Record fitting runtime
        start_time = time.time()
        print("Fitting PCA on X")
        self.pca_obj.fit(X=X)
        self.runtime[0] = time.time() - start_time
        print("Fitting ended in " +
              "{0:.2f}".format(round(self.runtime[0], 2)) + " seconds")

    def LDA_fit(self, X, Y):
        # Record fitting runtime
        start_time = time.time()
        print("Fitting LDA on X and Y")
        self.lda_obj.fit(X=X, y=Y)
        self.runtime[1] = time.time() - start_time
        print("Fitting ended in " +
              "{0:.2f}".format(round(self.runtime[1], 2)) + " seconds")

    def KernelPCA_fit(self, X):
        # Record fitting runtime
        start_time = time.time()
        print("Fitting Kernel PCA on X")
        self.kernel_pca_obj.fit(X=X)
        self.runtime[2] = time.time() - start_time
        print("Fitting ended in " +
              "{0:.2f}".format(round(self.runtime[2], 2)) + " seconds")

    def Fit_all(self, X, Y):
        self.PCA_fit(X)
        self.LDA_fit(X, Y)
        self.KernelPCA_fit(X)

    def Transform_all(self, X):

        transformed_X_dict = {
            'PCA': self.pca_obj.transform(X),
            'LDA': self.lda_obj.transform(X),
            'KernelPCA': self.kernel_pca_obj.transform(X)
        }

        return transformed_X_dict

    def GetDimRedObjects(self):

        DimRedObj = {
            'PCA': self.pca_obj,
            'LDA': self.lda_obj,
            'KernelPCA': self.kernel_pca_obj
        }

        return DimRedObj
symbols = [
    'AAPL', 'AXP', 'BA', 'CAT', 'CSCO', 'CVX', 'DD', 'DIS', 'GE', 'GS', 'HD',
    'IBM', 'INTC', 'JNJ', 'JPM', 'KO', 'MCD', 'MMM', 'MRK', 'MSFT', 'NKE',
    'PFE', 'PG', 'TRV', 'UNH', 'UTX', 'V', 'VZ', 'WMT', 'XOM', '^DJI'
]

data = pd.DataFrame()
for sym in symbols:
    data[sym] = web.DataReader(sym, data_source='yahoo')['Adj Close']
data = data.dropna()

dji = pd.DataFrame(data.pop('^DJI'))

scale_function = lambda x: (x - x.mean()) / x.std()

pca = KernelPCA().fit(data.apply(scale_function))

pca.lambdas_[:10].round()

get_we = lambda x: x / x.sum()

get_we(pca.lambdas_)[:10]
get_we(pca.lambdas_)[:5].sum()

pca = KernelPCA(n_components=1).fit(data.apply(scale_function))
dji['PCA_1'] = pca.transform(-data)

import matplotlib.pyplot as plt
dji.apply(scale_function).plot(figsize=(20, 10))
plt.show()

sc, x_train, x_cv = feature_Scaling(x_train, x_cv)

# ### Applying Kernal PCA and fit the logistic regression model into training

# In[12]:

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# In[13]:

#Applying KPCA
pca = KPCA(n_components=2, kernel='rbf')
x_train = pca.fit_transform(x_train)
x_cv = pca.transform(x_cv)
# explained_varience = pca.explained_variance_ratio_

# In[14]:

# fitting logistinc regression to the training set
classifier = LogisticRegression(random_state=0)
classifier = classifier.fit(x_train, y_train)

# In[15]:

# predict y data
y_pred = classifier.predict(x_cv)

# In[16]:
Example 27
data = pd.read_csv('data/Social_Network_Ads.csv')

X = data.iloc[:, [2, 3]].values
Y = data.iloc[:, -1].values
# pre processing
# it's important to scaling
sc = StandardScaler()
X = sc.fit_transform(X)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25)

# using  PCA for dimension reduction

k_pca = KernelPCA(n_components=2, kernel='rbf')
X_train_transformed = k_pca.fit_transform(X_train)
X_test_transformed = k_pca.transform(X_test)
classifier = LogisticRegression()
classifier.fit(X_train_transformed, Y_train)
y_pre = classifier.predict(X_test_transformed)

cm = confusion_matrix(Y_test, y_pre)
accuracy = accuracy_score(Y_test, y_pre)
# visualising data

X_set, Y_set = X_train_transformed, Y_train
X1, X2 = np.meshgrid(
    np.arange(start=X_set[:, 0].min() - 1,
              stop=X_set[:, 0].max() + 1,
              step=0.01),
    np.arange(start=X_set[:, 1].min() - 1,
              stop=X_set[:, 1].max() + 1,
pca.fit(X_train_noisy)
_ = kernel_pca.fit(X_train_noisy)

# %%
# Reconstruct and denoise test images
# -----------------------------------
#
# Now, we can transform and reconstruct the noisy test set. Since we used fewer
# components than the number of original features, we will get an approximation
# of the original set. Indeed, by dropping the components that explain the least
# variance in PCA, we hope to remove noise. Similar thinking applies to kernel
# PCA; however, we expect a better reconstruction because we use a non-linear
# kernel to learn the PCA basis and a kernel ridge to learn the mapping
# function.
X_reconstructed_kernel_pca = kernel_pca.inverse_transform(
    kernel_pca.transform(X_test_noisy))
X_reconstructed_pca = pca.inverse_transform(pca.transform(X_test_noisy))

# %%
plot_digits(X_test, "Uncorrupted test images")
plot_digits(
    X_reconstructed_pca,
    f"PCA reconstruction\nMSE: {np.mean((X_test - X_reconstructed_pca) ** 2):.2f}",
)
plot_digits(
    X_reconstructed_kernel_pca,
    "Kernel PCA reconstruction\n"
    f"MSE: {np.mean((X_test - X_reconstructed_kernel_pca) ** 2):.2f}",
)

# %%
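The excerpt above begins after pca and kernel_pca were already constructed. A minimal sketch of a setup consistent with how they are used here; the specific hyperparameter values are assumptions, not taken from this excerpt:

from sklearn.decomposition import PCA, KernelPCA

# assumed construction; n_components, gamma and alpha are illustrative values
pca = PCA(n_components=32)
kernel_pca = KernelPCA(
    n_components=400,
    kernel="rbf",
    gamma=1e-3,
    fit_inverse_transform=True,  # learns the pre-image map used by inverse_transform
    alpha=5e-3,
)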
data_matrix = np.zeros((m, n))

print "There are %d hurricanes for cross validation. So I need to perform %d comparisons." % (
    m, m * n)
count = 0

for i in range(len(cv_hurricane_list)):
    for j in range(n):
        data_matrix[i][j] = M2(cv_ts_list[i], ts_list[j], delta, eps)

        count = count + 1
        if count % 1000 == 0:
            print(count)

cv_feature_coords = kpca.transform((data_matrix**2) * -0.5)
print(cv_feature_coords)
[cv_mean, cv_mfx] = reg.fit(cv_feature_coords)
print(cv_mean)

cv_predicted = np.zeros(m)
cv_high_prob = np.zeros(m) + 0.5
num_high_prob = 0
thresh = 0.05

for i in range(m):
    if cv_mean[i] >= 0.5:
        cv_predicted[i] = 1

    if cv_mean[i] <= thresh:
        num_high_prob = num_high_prob + 1
Example 30
plt.figure()
plt.subplot(2, 2, 1, aspect="equal")
plt.title("Original space")
reds = y == 0
blues = y == 1

plt.scatter(X[reds, 0], X[reds, 1], c="red", s=20, edgecolor="k")
plt.scatter(X[blues, 0], X[blues, 1], c="blue", s=20, edgecolor="k")
plt.xlabel("$x_1$")
plt.ylabel("$x_2$")

X1, X2 = np.meshgrid(np.linspace(-1.5, 1.5, 50), np.linspace(-1.5, 1.5, 50))
X_grid = np.array([np.ravel(X1), np.ravel(X2)]).T
# projection on the first principal component (in the phi space)
Z_grid = kpca.transform(X_grid)[:, 0].reshape(X1.shape)
plt.contour(X1, X2, Z_grid, colors="grey", linewidths=1, origin="lower")

plt.subplot(2, 2, 3, aspect="equal")
plt.scatter(X_kpca[reds, 0], X_kpca[reds, 1], c="red", s=20, edgecolor="k")
plt.scatter(X_kpca[blues, 0], X_kpca[blues, 1], c="blue", s=20, edgecolor="k")
plt.title("Projection by KPCA")
plt.xlabel(r"1st principal component in space induced by $\phi$")
plt.ylabel("2nd component")

plt.subplot(2, 2, 4, aspect="equal")
from graspologic.embed import ClassicalMDS

plt.tight_layout()
plt.show()
Example 31
def kernel_pincipal_component(train_x, test_x, n_com, kernel_type = 'rbf'):
    pca = KernelPCA(n_components=n_com, kernel= kernel_type)
    train_x = pca.fit_transform(train_x)
    test_x = pca.transform(test_x)
    return train_x, test_x
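A minimal usage sketch for the helper above; shapes and the component count are illustrative:

import numpy as np

rng = np.random.RandomState(0)
train_x, test_x = rng.rand(80, 12), rng.rand(20, 12)
train_x, test_x = kernel_pincipal_component(train_x, test_x, n_com=5)
print(train_x.shape, test_x.shape)  # (80, 5) (20, 5)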
def pid_pca(args):
    #  import modular data processing toolkit
    import mdp
    # load data file
    tblfilename = "bf_optimize_mavlink.h5"
    h5file = tb.open_file(tblfilename, mode = "a")
    # table = h5file.root.v1.evaluations
    #  get tabke handle
    table = h5file.root.v2.evaluations

    # sort rows
    if not table.cols.mse.is_indexed:
        table.cols.mse.createCSIndex()
        
    if not args.sorted:
        pids = [ [x["alt_p"], x["alt_i"], x["alt_d"], x["vel_p"], x["vel_i"], x["vel_d"]]
             for x in table.iterrows() ]
        mses = [ [x["mse"]] for x in table.iterrows() ]
    else:
        pids = [ [x["alt_p"], x["alt_i"], x["alt_d"], x["vel_p"], x["vel_i"], x["vel_d"]] for x in table.itersorted("mse")]
        mses = [ [x["mse"]] for x in table.itersorted("mse")]
    print "best two", pids
    mses_a = np.log(np.clip(np.array(mses), 0, 200000.))
    mses_a /= np.max(mses_a)
    # FIXME: try kernel pca on this
    from sklearn.decomposition import PCA, KernelPCA, SparsePCA
    kpca = KernelPCA(n_components = None,
                     kernel="rbf", degree=6, fit_inverse_transform=True,
                     gamma=1/6., alpha=1.)
    # kpca = SparsePCA(alpha=2., ridge_alpha=0.1)
    X_kpca = kpca.fit_transform(np.asarray(pids).astype(float))
    # X_back = kpca.inverse_transform(X_kpca)

    Z_kpca = kpca.transform(np.asarray(pids).astype(float))

    print(Z_kpca.shape, X_kpca.shape)
    print("|Z_kpca|", np.linalg.norm(Z_kpca, 2, axis=1))
    # for i in range(8):
    #     pl.subplot(8,1,i+1)
    #     pl.plot(Z_kpca[:,i])
    #     pl.legend()
    # pl.show()

    
    # fast PCA
    # pid_p = mdp.pca(np.array(pids).astype(float))
    pid_array = np.array(pids).astype(float)
    print "pid_array.shape", pid_array.shape
    pcanode = mdp.nodes.PCANode(output_dim = 6)
    # pcanode.desired_variance = 0.75
    pcanode.train(np.array(pids).astype(float))
    pcanode.stop_training()
    print "out dim", pcanode.output_dim

    pid_p = pcanode.execute(np.array(pids).astype(float))

    # pid_array_mse = np.hstack((np.array(pids).astype(float), mses_a))
    pid_ica = mdp.fastica(np.array(pids).astype(float))
    print "ica.shape", pid_ica.shape
    # pid_p = np.asarray(pids)[:,[0, 3]]
    # pid_p = pids[:,0:2]
    # [:,0:2]
    sl_start = 0
    sl_end = 100
    sl = slice(sl_start, sl_end)

    print "expl var", pcanode.explained_variance
    pl.subplot(111)
    colors = np.zeros((100, 3))
    # colors = np.hstack((colors, 1-(0.5*mses_a)))
    colors = np.hstack((colors, 1-(0.8*mses_a)))
    # print colors.shape
    # pl.scatter(pid_p[sl,0], pid_p[sl,1], color=colors)

    # ica spektrum
    pid_ica_sum = np.sum(np.square(pid_ica), axis=0)
    # pid_ica_sum_sort = np.sort(pid_ica_sum)
    pid_ica_sum_0 = np.argmax(pid_ica_sum)
    pid_ica_sum[pid_ica_sum_0] = 0
    pid_ica_sum_1 = np.argmax(pid_ica_sum)
    
    # pl.scatter(pid_p[sl,0], pid_p[sl,1], color=colors)
    pl.scatter(pid_ica[sl,pid_ica_sum_0], pid_ica[sl,pid_ica_sum_1], color=colors)
    # pl.scatter(X_kpca[:,0], X_kpca[:,1], color=colors)
    pl.gca().set_aspect(1)
    # pl.scatter(pid_p[:,0], pid_p[:,1], alpha=1.)
    # pl.show()

    # plot raw pid values     
    pl.subplot(411)
    pl.plot(pid_array[sl,[0,3]], "o")
    pl.xlim((sl_start - 0.2, sl_end + 0.2))
    pl.subplot(412)
    pl.plot(pid_array[sl,[1,4]], "o")
    pl.xlim((sl_start - 0.2, sl_end + 0.2))
    pl.subplot(413)
    pl.plot(pid_array[sl,[2,5]], "o")

    # plot compressed pid values: pca, ica, ...
    # pl.subplot(211)
    # pl.plot(pid_p, ".")
    # pl.plot(pid_p[sl], "o")
    # pl.plot(pid_ica[sl] + np.random.uniform(-0.01, 0.01, size=pid_ica[sl].shape), "o")
    pl.xlim((sl_start - 0.2, sl_end + 0.2))
    # pl.plot(Z_kpca[:,:], "-o", label="kpca")
    # pl.plot(Z_kpca[:,:], ".", label="kpca")
    # pl.legend()
        
    # pl.subplot(212)
    pl.subplot(414)
    pl.plot(mses_a[sl], "ko")
    # pl.gca().set_yscale("log")
    pl.xlim((sl_start - 0.2, sl_end + 0.2))
    pl.show()

    # gp fit
    x = mses_a[sl]
    x_sup = np.atleast_2d(np.arange(0, x.shape[0])).T
    x_ones = x != 1.
    x_ones[0:20] = False
    print(x, x_sup, x_ones, x_ones.shape)
    print("x[x_ones]", x[x_ones].shape)
    print("x_sup[x_ones]", x_sup[x_ones].shape)

    from sklearn.gaussian_process import GaussianProcess
    # gp = GaussianProcess(regr='constant', corr='absolute_exponential',
    #                  theta0=[1e-4] * 1, thetaL=[1e-12] * 1,
    #                  thetaU=[1e-2] * 1, nugget=1e-2, optimizer='Welch')
    gp = GaussianProcess(corr="squared_exponential",
                         theta0=1e-2, thetaL=1e-4, thetaU=1e-1,
                         nugget=1e-1/x[x_ones])
    gp.fit(x_sup[x_ones,np.newaxis], x[x_ones,np.newaxis])
    x_pred, sigma2_pred = gp.predict(x_sup, eval_MSE=True)
    print(x_pred, sigma2_pred)

    from sklearn import linear_model
    clf = linear_model.Ridge (alpha = .5)
    clf.fit(x_sup[x_ones,np.newaxis], x[x_ones,np.newaxis])
    x_pred = clf.predict(x_sup[20:100])
        
    pl.subplot(111)
    pl.plot(mses_a[sl], "ko")
    x_mean = np.mean(x[0:20])
    pl.plot(np.arange(0, 20), np.ones((20, )) * x_mean, "k-", alpha=0.5)
    pl.plot(np.arange(20, 100), x_pred, "k-", alpha=0.5)
    pl.axhspan(0.5, 1.1, 0, 0.19, facecolor="0.5", alpha=0.25)
    # pl.plot(x_pred + sigma2_pred, "k-", alpha=0.5)
    # pl.plot(x_pred - sigma2_pred, "k-", alpha=0.5)
    # pl.gca().set_yscale("log")
    pl.xlim((sl_start - 0.2, sl_end + 0.2))
    pl.ylim((0.5, 1.1))
    pl.text(5, 0.6, "Random\ninitialization")
    pl.text(40, 0.6, "Optimizer\nsuggestions")
    pl.xlabel("Episode #")
    pl.ylabel("MSE")
    if args.plotsave:
        pl.gcf().set_size_inches((10, 3))
        pl.gcf().savefig("%s-mse.pdf" % (sys.argv[0][:-3]), dpi=300,
                        bbox_inches="tight")
    pl.show()
Example 33
    x_red[0] = pca.fit_transform(x[2])
    x_red[1] = pca.transform(x[3])
    x_red[2] = pca.transform(x[4])
    print("Dimenionality Reduction methd used: ", pca)

if args.dimensionality_reduction_method == "LDA":
    lda = LDA(n_components=k)
    x_red[0] = lda.fit_transform(x[2], y[2])
    x_red[1] = lda.transform(x[3])
    x_red[2] = lda.transform(x[4])
    print("Dimenionality Reduction methd used: ", lda)

if args.dimensionality_reduction_method == "KPCA":
    kpca = KPCA(n_components=k, kernel=args.kernel_pca)
    x_red[0] = kpca.fit_transform(x[2])
    x_red[1] = kpca.transform(x[3])
    x_red[2] = kpca.transform(x[4])
    print("Dimenionality Reduction methd used: ", kpca)

# training the model
if args.C == None:
    C = [0.5, 5, 10, 20]
else:
    C = [args.C]

if args.gamma == None:
    gam = [0.01, 0.05, 0.1, 0.5, 1]
else:
    gam = [args.gamma]

if args.training_model == "LR":
Example 34
pca.fit(dat)

kpca = KernelPCA(kernel="rbf", fit_inverse_transform=True, gamma=10)
kpca.fit(dat)

## project data into PC space

# 0,1 denote PC1 and PC2; change values for other PCs
xvector = pca.components_[0]  # see 'prcomp(my_data)$rotation' in R
yvector = pca.components_[1]

xs = pca.transform(dat)[:, 0]  # see 'prcomp(my_data)$x' in R
ys = pca.transform(dat)[:, 1]

kxs = kpca.transform(dat)[:, 0]  # see 'prcomp(my_data)$x' in R
kys = kpca.transform(dat)[:, 1]

## visualize projections

## Note: scale values for arrows and text are a bit inelegant as of now,
##       so feel free to play around with them

for i in range(len(xvector[:n])):
    # arrows project features (ie columns from csv) as vectors onto PC axes
    plt.arrow(0,
              0,
              xvector[i] * max(xs),
              yvector[i] * max(ys),
              color='r',
              width=0.0005,
from sklearn.model_selection import train_test_split  # sklearn.cross_validation has been removed
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=0)

from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

#Applying kernel PCA
from sklearn.decomposition import KernelPCA
#n_components - No. of extracted features that you need that will explain most variance
kpca = KernelPCA(n_components = 2, kernel = 'rbf')
#Fitting PCA to training set
X_train = kpca.fit_transform(X_train)
X_test = kpca.transform(X_test)


#Fitting Logistic Regression to the Training Data
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train,Y_train)
y_pred = classifier.predict(X_test)

#Create Confusion Matrix
#Class has capitals while functions have small letters
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y_test, y_pred)

#Visualizing Training set results
from matplotlib.colors import ListedColormap
Example 36
class PCAUmap:
    def __init__(
        self,
        n_neighbors=15,
        use_pca=1,
        kernel='linear',
        min_dist=0.1,
        n_components=2,
        random_state=None,
        transform_seed=None,
        scaler=True,
        metric="euclidean",
        augment_size=3,
        impute_rate=0.1,
    ):
        if kernel == 'linear':
            self.pca = PCA()
        else:
            self.pca = KernelPCA(kernel=kernel, fit_inverse_transform=True)
        self.umap = UMAP(
            random_state=random_state,
            transform_seed=transform_seed,
            n_neighbors=n_neighbors,
            min_dist=min_dist,
            n_components=n_components,
            metric=metric,
        )
        self.use_pca = use_pca
        self.random_state = random_state
        self.scaler = StandardScaler()
        self.data = None
        self.pca_features = None
        self.embedding = None
        self.imputer = KNNImputer()
        self.augment_size = augment_size
        self.impute_rate = impute_rate

    def fit(self, data):
        self.data = pd.DataFrame(data)
        augmented_data = self.augumentation(self.augment_size,
                                            self.impute_rate)

        if self.scaler is None:
            if self.use_pca is None:
                self.umap.fit(augmented_data)
                self.embedding = self.umap.transform(data)
            else:
                self.umap.fit(self.pca.fit_transform(augmented_data))
                self.pca_features = self.pca.transform(data)
                self.embedding = self.umap.transform(self.pca_features)
        else:
            if self.use_pca is None:
                self.umap.fit(self.scaler.fit_transform(augmented_data))
                self.embedding = self.umap.transform(
                    self.scaler.transform(data))
            else:
                self.umap.fit(
                    self.pca.fit_transform(
                        self.scaler.fit_transform(augmented_data)))
                self.pca_features = self.pca.transform(
                    self.scaler.transform(data))
                self.embedding = self.umap.transform(self.pca_features)
        return self

    def transform(self, data):
        self.data = pd.DataFrame(data)
        if self.scaler is None:
            if self.pca is None:
                self.embedding = self.umap.transform(data)
                return self.embedding
            else:
                self.pca_features = self.pca.transform(data)
                self.embedding = self.umap.transform(self.pca_features)
                return self.embedding
        else:
            if self.pca is None:
                self.embedding = self.umap.transform(
                    self.scaler.transform(data))
                return self.embedding
            else:
                self.pca_features = self.pca.transform(
                    self.scaler.transform(data))
                self.embedding = self.umap.transform(self.pca_features)
                return self.embedding

    def fit_transform(self, data):
        self.fit(data)
        return self.transform(data)

    def inverse_transform(self, embedded):
        if self.scaler is None:
            if self.pca is None:
                return self.umap.inverse_transform(embedded)
            else:
                return self.pca.inverse_transform(
                    self.umap.inverse_transform(embedded))
        else:
            if self.pca is None:
                return self.scaler.inverse_transform(
                    self.umap.inverse_transform(embedded))
            else:
                return self.scaler.inverse_transform(
                    self.pca.inverse_transform(
                        self.umap.inverse_transform(embedded)))

    def pca_summary(self, c=None):
        plt.figure(figsize=(6, 6))
        if c is None:
            plt.scatter(self.pca_features[:, 0],
                        self.pca_features[:, 1],
                        alpha=0.5)
        else:
            plt.scatter(self.pca_features[:, 0],
                        self.pca_features[:, 1],
                        alpha=0.5,
                        c=c)
        plt.xlabel("PC1 ({}%)".format(
            int(self.pca.explained_variance_ratio_[0] * 100)))
        plt.ylabel("PC2 ({}%)".format(
            int(self.pca.explained_variance_ratio_[1] * 100)))
        plt.grid()
        plt.show()
        plt.figure(figsize=(6, 6))
        plt.scatter(self.pca.components_[0],
                    self.pca.components_[1],
                    alpha=0.5)
        plt.xlabel("loading 1")
        plt.ylabel("loading 2")
        plt.grid()
        plt.show()
        plt.figure(figsize=(6, 6))
        plt.plot([0] + list(np.cumsum(self.pca.explained_variance_ratio_)),
                 "-o")
        plt.xlabel("Number of principal components")
        plt.ylabel("Cumulative contribution ratio")
        plt.grid()
        plt.show()

    def map_predicted_values(
            self,
            model,
            c=None,
            alpha=0.5,
            edgecolors="k",
            figsize=(8, 6),
            h=0.2,
            cm=plt.cm.jet,
    ):

        x_min = self.embedding[:, 0].min() - 0.5
        x_max = self.embedding[:, 0].max() + 0.5
        y_min = self.embedding[:, 1].min() - 0.5
        y_max = self.embedding[:, 1].max() + 0.5
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                             np.arange(y_min, y_max, h))

        plt.figure(figsize=figsize)
        if hasattr(model, "predict_proba"):
            Z = model.predict_proba(
                self.inverse_transform(np.c_[xx.ravel(),
                                             yy.ravel()]))[:, 1]
        elif hasattr(model, "decision_function"):
            Z = model.decision_function(
                self.inverse_transform(np.c_[xx.ravel(),
                                             yy.ravel()]))
        else:
            Z = model.predict(
                self.inverse_transform(np.c_[xx.ravel(),
                                             yy.ravel()]))

        Z = Z.reshape(xx.shape)
        plt.contourf(xx, yy, Z, alpha=alpha, cmap=cm)
        plt.colorbar()
        if c is None:
            plt.scatter(
                self.embedding[:, 0],
                self.embedding[:, 1],
                alpha=alpha,
                edgecolors=edgecolors,
            )
        else:
            plt.scatter(
                self.embedding[:, 0],
                self.embedding[:, 1],
                alpha=alpha,
                c=c,
                edgecolors=edgecolors,
            )
        plt.grid()
        plt.show()

    def augumentation(self, augment_size, rate):
        augmented_data = pd.concat([self.data] * augment_size).values
        augmented_data = fill_randomly(augmented_data, np.nan, rate)
        augmented_data = pd.DataFrame(
            self.imputer.fit_transform(augmented_data))
        augmented_data = pd.concat([self.data, augmented_data])
        return augmented_data
Example 37
    np.arange(number_of_molecules_COMP))
random_indices = random_indices[:number_of_data_COMP]

frames_benchmark, Y_benchmark, mol_indices_comp = load_COMP(random_indices)

X_benchmark = compute_soap_matrix(frames_benchmark)
weights = ridge_regression(Y, X, np.ones_like(X[0, :].T))
full_errorMAE, full_errorMSE = compute_loss(X_benchmark, weights, Y_benchmark)

# In[5]:

from sklearn.decomposition import KernelPCA

pca = KernelPCA(n_components=4000, kernel='precomputed')
pca.fit(np.dot(X.T, X))
XPCA = pca.transform(X)
X_benchmarkPCA = pca.transform(X_benchmark)
XPCA.shape

# In[25]:

methods = ['F', 'FPS', 'PCA']
indices = {
    'F': ind_F_test,
    'FPS': ind_FPS,
}
numbers_steps = 2 * np.logspace(0, 3, 7).astype(int)
vecMAE_AL = np.zeros([len(numbers_steps), len(methods)])
vecMSE_AL = np.zeros([len(numbers_steps), len(methods)])

for i, number_of_feature in enumerate(numbers_steps):
Example 38
svm.fit(X_train_lda, y_train)
svm_pred_test_lda = svm.predict(X_test_lda)
svm_pred_train_lda = svm.predict(X_train_lda)
print(accuracy_score(svm_pred_train_lda, y_train))
print(accuracy_score(svm_pred_test_lda, y_test))
########################################kpca_lr
gamma_space = np.arange(0.01, 5, 0.05)
acc_lp_kpca_train = np.empty(len(gamma_space))
acc_lp_kpca_test = np.empty(len(gamma_space))

for j, i in enumerate(gamma_space):

    scikit_kpca = KernelPCA(n_components=2, kernel='rbf', gamma=i)
    X_train_kpca = scikit_kpca.fit_transform(X_train_std)
    X_test_kpca = scikit_kpca.transform(X_test_std)
    lr_kpca = lr.fit(X_train_kpca, y_train)
    lr_pred_train = lr.predict(X_train_kpca)
    lr_pred_test = lr.predict(X_test_kpca)
    acc_lp_kpca_train[j] = accuracy_score(lr_pred_train, y_train)
    acc_lp_kpca_test[j] = accuracy_score(lr_pred_test, y_test)

plt.title('lr accuracy varies according to gamma')
plt.plot(gamma_space, acc_lp_kpca_train, label='training accuracy')
plt.plot(gamma_space, acc_lp_kpca_test, label='testing accuracy')
_ = plt.xlabel('gamma')
_ = plt.ylabel('accuracy')
plt.show()

print(max(acc_lp_kpca_train))
print(max(acc_lp_kpca_test))
Example 39
#Splitting the data
d = data.values
x_train, x_test, y_train, y_test = train_test_split(d[:,0:12], d[:,12:], test_size = 0.25, random_state = 0)

#Feature Scaling
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)



#Applying PCA
pca = KernelPCA(n_components = 8, kernel='rbf')
X_train = pca.fit_transform(x_train)
X_test = pca.transform(x_test)
#explained_variance = pca.explained_variance_ratio_

#Model building

#Fitting model to KNN
knn_classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
knn_classifier.fit(x_train, y_train)

#Fitting model to kernel SVM
svm_classifier = SVC(kernel = 'rbf', random_state = 0)
svm_classifier.fit(x_train, y_train)

#Fitting model to naive bayes
nb_classifier = GaussianNB()
nb_classifier.fit(x_train, y_train)
    #For PCA we need to normalize our data with some function
    dt_features = StandardScaler().fit_transform(dt_features)

    X_train, X_test, y_train, y_test = train_test_split(dt_features,
                                                        dt_target,
                                                        test_size=0.3,
                                                        random_state=42)
    #train_test_split holds out 30% of the data as a test set; fixing random_state makes the split reproducible

    kpca = KernelPCA(
        n_components=4, kernel='poly'
    )  #n_components (optional) keeps the 4 components that carry the most information
    kpca.fit(X_train)

    dt_train = kpca.transform(X_train)
    dt_test = kpca.transform(X_test)

    logistic = LogisticRegression(solver='lbfgs')
    logistic.fit(dt_train, y_train)
    print("SCORE KPCA: ", logistic.score(dt_test, y_test))

    print(X_train.shape)  #Table shape
    print(
        y_train.shape
    )  #target data (0/1): presence or absence of heart disease
    #n_components = min(n_samples, n_features)

    pca = PCA(n_components=3)
    pca.fit(X_train)
Example 41
    distortion.append(sum(numpy.min(cdist(delta_noname, kmeans.cluster_centers_, 'euclidean'), axis=1)) / delta_noname.shape[0])
    
plt.plot(K, distortion, 'bx-')
plt.title('The Elbow Method showing the optimal k')
plt.show()    


# In[277]:


#PCA with RBF
from sklearn.decomposition import PCA, KernelPCA

kpca = KernelPCA(kernel="rbf", fit_inverse_transform=True, gamma=10,n_components=4)
kpca.fit(delta_noname)
delta_noname_components_rbf = kpca.transform(delta_noname)


# In[278]:


#to get the variance being explained by the components
import numpy
explained_variance = numpy.var(delta_noname_components_rbf, axis=0)
explained_variance_ratio = explained_variance / numpy.sum(explained_variance)
print(explained_variance_ratio)


# In[279]:

Example 42
#split dataset to training and test datasets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train,  y_test = train_test_split(x, y, test_size = 0.25, random_state = 0)


#feature scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

#apply kernelPCA
from sklearn.decomposition import KernelPCA
kpca = KernelPCA(n_components = 2, kernel ='rbf')
x_train = kpca.fit_transform(x_train)
x_test = kpca.transform(x_test)

#logistic regression
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)

#making the confusion matrix
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
ac = accuracy_score(y_test, y_pred)


#model selection
"kfold cross validation"
Example 43
class RegionSplitter_PCA_oudeyer_modified():

    def __init__(self, data, label):

        self.cut_dim = 0
        self.cut_val = 0
        num_candidates = 500
        min_group_size = 20

        data_dim_num = len(data[0])

        self.n_comp =  max(1, data_dim_num)

        self.pca = PCA(n_components=self.n_comp)  # sklearn's PCA takes no 'kernel' argument
        #self.pca = ICA(n_components=self.n_comp)

        data = self.pca.fit_transform(data)

        data_dim_num = len(data[0])
        label_dim_num = len(label[0])

        data_zipped = list(zip(*data))

        # model used to evaluate the data
        model = linear_model.LinearRegression()

        # the error of whole partition
        n_fold = 2
        kf = KFold(len(data), n_folds=n_fold)
        rms_error_whole = 0
        for train_index, test_index in kf:
            data_train, data_test = np.array(data)[train_index], np.array(data)[test_index]
            label_train, label_test = np.array(label)[train_index], np.array(label)[test_index]

            model = linear_model.LinearRegression()
            model.fit(data_train, label_train)
            label_predict = model.predict(data_test)

            rms_error_whole += metrics.mean_squared_error(label_test, label_predict)

        rms_error_whole /= n_fold

        # sort in each dimension
        dim_min = float("inf")
        for i in range(data_dim_num):

            for k in range(num_candidates):
                # pick a random value
                max_val = max(data_zipped[i])
                min_val = min(data_zipped[i])
                cut_val = random.uniform(min_val, max_val)

                groups = [[[data[j], label[j]] for j in range(len(data_zipped[i])) if data_zipped[i][j] <= cut_val],
                          [[data[j], label[j]] for j in range(len(data_zipped[i])) if data_zipped[i][j] > cut_val]]

                # check if any of the group is 0 or 1

                if len(groups[0]) < min_group_size or len(groups[1]) < min_group_size:
                    continue

                avg_error = []
                weighted_avg_variance = []


                for group in groups:

                    # calculate error with a linear model
                    data_k = list(zip(*group))[0]
                    label_k = list(zip(*group))[1]

                    # the split groups error
                    n_fold = 2
                    kf = KFold(len(data_k), n_folds=n_fold)
                    rms_error_split = 0
                    for train_index, test_index in kf:
                        data_train, data_test = np.array(data_k)[train_index], np.array(data_k)[test_index]
                        label_train, label_test = np.array(label_k)[train_index], np.array(label_k)[test_index]

                        model.fit(data_train, label_train)
                        label_predict = model.predict(data_test)

                        rms_error_split += metrics.mean_squared_error(label_test, label_predict)

                    rms_error_split /= n_fold

                    avg_error.append(rms_error_split)

                    num_sample = len(group)
                    group = zip(*group[0])

                    # calculate variance of data points
                    variance = []
                    for group_k in group:
                        mean = math.fsum(group_k)/len(group_k)
                        norm = max(math.fsum([x**2 for x in group_k])/len(group_k), 1)
                        variance.append(math.fsum([((x - mean)**2)/norm for x in group_k]))
                    weighted_avg_variance.append(math.fsum(variance)/len(variance)*num_sample)



                error_diff = (avg_error[0] - avg_error[1])**2
                smallest_error = min(avg_error)
                biggest_error_reduction = max(rms_error_whole - avg_error[0], rms_error_whole-avg_error[1])
                in_group_variance = math.fsum(weighted_avg_variance)
                #print('cut_dim=%d cut_val=%f avg_err=%f var=%f'%(i, cut_val, smallest_error, in_group_variance))

                try:
                    score = ((in_group_variance+1)*(smallest_error+1)) / (error_diff*(biggest_error_reduction**0.5))
                except ZeroDivisionError:
                    score = float("inf")

                if score < dim_min:

                    dim_min = score
                    self.cut_dim = i
                    self.cut_val = cut_val


        # just cut in half
        #self.cut_val = exemplars[int(sample_num/2)][0][self.cut_dim]

    def classify(self, data):
        if not isinstance(data, tuple):
            raise TypeError("data must be a tuple")

        data = tuple(self.pca.transform([data])[0])  # transform expects a 2-D array, so wrap the single sample
        group = data[self.cut_dim] <= self.cut_val

        return group == 0
    def apply_Kenel_PCA(self, X_training, variance, kernel):
        # Fit a kernel PCA on the training set and return its projection.
        # Note: KernelPCA's first positional argument is n_components (an integer),
        # so `variance` is used here as the number of components to keep.
        pca = KernelPCA(variance, kernel=kernel, degree=4)
        pca.fit(X_training)
        return pca.transform(X_training)
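
# A minimal stand-alone usage sketch (not from the original source) for the helper
# above. The arrays below are synthetic placeholders; it also shows that the fitted
# KernelPCA object must be kept if unseen data should be projected later.
import numpy as np
from sklearn.decomposition import KernelPCA

X_train_demo = np.random.RandomState(0).rand(20, 5)   # hypothetical training data
X_new_demo = np.random.RandomState(1).rand(3, 5)      # hypothetical unseen data

kpca_demo = KernelPCA(2, kernel='poly', degree=4)      # 2 components, poly kernel
X_train_proj = kpca_demo.fit_transform(X_train_demo)   # same as fit() followed by transform()
X_new_proj = kpca_demo.transform(X_new_demo)           # reuse the fitted model on new data
print(X_train_proj.shape, X_new_proj.shape)            # (20, 2) (3, 2)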
Example no. 45
    data=data.dropna() 
dax=pd.DataFrame(data.pop('DJIA'))
data[data.columns[:6]].head()
scale_function=lambda x:(x-x.mean())/x.std()
#pca considering multiple components
pca=KernelPCA().fit(data.apply(scale_function))
len(pca.lambdas_)
pca.lambdas_[:10].round() 
#normalization
get_we=lambda x:x/x.sum()
get_we(pca.lambdas_)[:10]
get_we(pca.lambdas_)[:5].sum()
#construct the pca index
#pca index containing only the first component
pca=KernelPCA(n_components=1).fit(data.apply(scale_function))
dax['PCA_1']=pca.transform(-data)
dax.apply(scale_function).plot(figsize=(8,4))
#compute the weighted average of the individual component scores
pca=KernelPCA(n_components=5).fit(data.apply(scale_function))
pca_components=pca.transform(-data)
weights=get_we(pca.lambdas_)
dax['PCA_5']=np.dot(pca_components,weights)
dax.apply(scale_function).plot(figsize=(8,4))
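
# A hedged, self-contained sketch (synthetic data, not the DJIA frame above) of the
# weighting behind the PCA_5 index: each retained component is weighted by its share
# of the eigenvalue mass, so np.dot(components, weights) is a per-row weighted average.
import numpy as np
import pandas as pd
from sklearn.decomposition import KernelPCA

rng = np.random.RandomState(0)
demo = pd.DataFrame(rng.rand(50, 8))                        # hypothetical index members
demo_scaled = demo.apply(lambda x: (x - x.mean()) / x.std())

pca_demo = KernelPCA(n_components=5).fit(demo_scaled)
comps = pca_demo.transform(demo_scaled)                     # shape (50, 5)
w = pca_demo.lambdas_ / pca_demo.lambdas_.sum()             # relative eigenvalue weights
index_demo = np.dot(comps, w)                               # one weighted score per row
print(index_demo.shape)                                     # (50,)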
#scatter plot

mpl_dates=mpl.dates.date2num([n for n in pd.to_datetime(data.index)])
#mpl_dates=mpl.dates.date2num(data.index)
mpl_dates
plt.figure(figsize=(8,4))
plt.scatter(dax['PCA_5'],dax['DJIA'],c=mpl_dates)
lin_reg=np.polyval(np.polyfit(dax['PCA_5'],dax['DJIA'],1),dax['PCA_5'])
#PCA dimensionality reduction
#from sklearn.decomposition import PCA
#pca = PCA(n_components=3)
# np_data_3d = pca.fit(np_data)
# #return the percentage of variance explained by each of the retained n components
# print(pca.explained_variance_ratio_)
# print(pca.explained_variance_)
# Kernel PCA

from sklearn.decomposition import KernelPCA

pca = KernelPCA(n_components=6, kernel='rbf', gamma=15)
pca.fit(np_data)

data_new_3d = pca.transform(np_data)

#print the shape of the transformed data
print(data_new_3d.shape)

#various clustering algorithms and evaluation metrics
#k-means
from sklearn.cluster import KMeans
import sklearn.metrics as metrics
from sklearn.cluster import DBSCAN

## k-means++
y_pred = KMeans(n_clusters=3, random_state=9).fit_predict(data_new_3d)

# y_pred = DBSCAN(eps=0.4,  # neighborhood radius
# min_samples=5,    # minimum number of samples, MinPts
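
# A hedged sketch of the "evaluation metrics" mentioned above: scoring the k-means
# labels with internal cluster-validity indices. Synthetic data stands in for
# data_new_3d, which comes from the unshown preprocessing earlier in this script.
import numpy as np
from sklearn.cluster import KMeans
from sklearn import metrics

demo_X = np.random.RandomState(0).rand(200, 6)               # stand-in for data_new_3d
demo_labels = KMeans(n_clusters=3, random_state=9).fit_predict(demo_X)
print(metrics.silhouette_score(demo_X, demo_labels))         # in [-1, 1], higher is better
print(metrics.calinski_harabasz_score(demo_X, demo_labels))  # another internal index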
Example no. 47
    def pca_transform(self,
                      nb_PC=4,
                      remove_mean0=False,
                      remove_mean1=False,
                      standard=False,
                      sklearn=False,
                      sklearn_kernel=False,
                      cov=True):
        """
        Perform the Principal component analysis with SKlearn
        using singular value fft
        The dataframe is standardize
        
        
        
        parameters:
            standard: default = True, standardize the dataframe
            nb_PC: default = 4, number of principal components to be used
            sklearn: if True (default=False) use svd by sklearn
            cov: if true (by default) sue the correlation matrix to perform the PCA analysis
        
        Stock in the object
            Dataframe with:
                eigenvalues
                eigenvectors
                scores
            list of vectors:
                eigenpairs
        
        NOTE:
            By default sklearn remove the mean from the dataset. So I cant use it to perform the downscalling
        
        References:
            http://sebastianraschka.com/Articles/2015_pca_in_3_steps.html#projection-onto-the-new-feature-space
        """
        df = self.df
        self.nb_PC = nb_PC

        if remove_mean0:
            print('remove_mean0')
            df = df.subtract(df.mean(axis=0), axis='columns')

        if remove_mean1:
            print('remove_mean1')
            df = df.subtract(df.mean(axis=1), axis='index')
            print(df)

        if standard:
            # standardize
            #             df_std = StandardScaler().fit_transform(df)
            self.standard = True
            df = (df - df.mean(axis=0)) / df.std(
                axis=0)  # another way to standardise

        #=======================================================================
        # Sklearn
        #=======================================================================
        if sklearn:
            print("o" * 80)
            print("SVD sklearn used")
            print("o" * 80)

            if sklearn_kernel:
                print('sklearn_kernel')
                pca = KernelPCA(nb_PC,
                                kernel="rbf",
                                fit_inverse_transform=True,
                                gamma=10)

            #Create a PCA model with nb_PC principal components
            else:
                pca = PCA(nb_PC)
            # fit data
            pca.fit(df)

            #Get the components from transforming the original data.
            scores = pca.transform(df)  #  or PCs
            eigenvalues = pca.explained_variance_  # note: only available on PCA, not KernelPCA
            eigenvectors = pca.components_  # or loadings

            # Make a list of (eigenvalue, eigenvector) tuples
            self.eigpairs = [(np.abs(eigenvalues[i]),
                              eigenvectors[i, :])
                             for i in range(len(eigenvalues))]

        #=======================================================================
        # Covariance Matrix
        #=======================================================================
        if cov:
            print("o" * 80)
            print("Covariance used")
            print("o" * 80)

            X = df.values
            cov_mat = np.cov(X.T)
            eigenvalues, eigenvectors = np.linalg.eig(cov_mat)

            scores = X.dot(eigenvectors)
            scores = pd.DataFrame(scores,
                                  columns=np.arange(1,
                                                    len(df.columns) + 1),
                                  index=df.index)
            eigenvalues = pd.Series(eigenvalues,
                                    index=np.arange(1,
                                                    len(df.columns) + 1))
            eigenvectors = pd.DataFrame(eigenvectors.T,
                                        columns=df.columns,
                                        index=np.arange(
                                            1,
                                            len(df.columns) + 1))

        self.scores = scores.iloc[:, 0:nb_PC]
        self.eigenvalues = eigenvalues  #[0:nb_PC]
        self.eigenvectors = eigenvectors[0:nb_PC]

        tot = sum(eigenvalues)
        self.var_exp = [(i / tot) * 100
                        for i in sorted(eigenvalues, reverse=True)]
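
# A small stand-alone check (synthetic data, not part of the original class) that the
# two code paths above agree: PCA via sklearn's SVD and PCA via an explicit covariance
# eigen-decomposition produce the same scores up to the sign of each component.
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
df_demo = pd.DataFrame(rng.rand(30, 4))
df_demo = df_demo - df_demo.mean(axis=0)             # remove the column means first

scores_svd = PCA(n_components=2).fit_transform(df_demo)

cov_mat = np.cov(df_demo.values.T)
eigvals, eigvecs = np.linalg.eigh(cov_mat)           # eigh: the covariance matrix is symmetric
order = np.argsort(eigvals)[::-1]                    # sort eigenvalues in descending order
scores_cov = df_demo.values.dot(eigvecs[:, order[:2]])

print(np.allclose(np.abs(scores_svd), np.abs(scores_cov)))   # True (equal up to sign)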
Example no. 48
    coeff_slices = []
    for (i, x) in enumerate(X):
        norm = normalize(x[1][:, np.newaxis], axis=0).ravel()
        coeffs = pywt.wavedec(norm, 'sym13', level=2)
        arr, coeff_slice = pywt.coeffs_to_array(coeffs)
        arrays.append(arr)
        coeff_slices.append(coeff_slice)

    plt.plot(arrays[1])
    plt.savefig('ja_dwt')
    plt.clf()
    plt.plot(arrays[0])
    plt.savefig('tymo_dwt')
    plt.clf()
    pca = KernelPCA(kernel='sigmoid').fit(arrays)
    transformed_X = pca.transform(arrays)
    plt.plot(transformed_X[1])
    plt.savefig('ja_pca')
    plt.clf()
    plt.plot(transformed_X[0])
    plt.savefig('tymo_pca')
    plt.clf()
    plt.scatter(transformed_X[1:21][:, 0],
                transformed_X[1:21][:, 1],
                c=y,
                cmap=matplotlib.colors.ListedColormap(["red", "blue"]))
    # plt.scatter(transformed_X[0][:, 0], transformed_X[0][:, 1])
    plt.title("2D")
    plt.savefig('2d.png')

    clf = SVC()
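
# The snippet stops after constructing the classifier. A hedged continuation might fit
# the SVC on the first few kernel-PCA components; synthetic stand-ins are used here
# because `arrays` and `y` come from the wavelet preprocessing above.
import numpy as np
from sklearn.decomposition import KernelPCA
from sklearn.svm import SVC

demo_arrays = np.random.RandomState(0).rand(40, 64)       # stand-in for the DWT coefficients
demo_y = np.random.RandomState(1).randint(0, 2, size=40)  # stand-in binary labels

demo_pca = KernelPCA(n_components=2, kernel='sigmoid').fit(demo_arrays)
demo_X2d = demo_pca.transform(demo_arrays)

demo_clf = SVC().fit(demo_X2d, demo_y)
print(demo_clf.score(demo_X2d, demo_y))                   # training accuracy only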
Example no. 49
X = dataset.iloc[:,2:4].values
Y = dataset.iloc[:, 4].values

# Split of Data into training and testing 
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state = 0)

# feature scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test) 

# Applying Kernel PCA
from sklearn.decomposition import KernelPCA
kernalpca = KernelPCA(n_components = 2, kernel = 'rbf')
X_train = kernalpca.fit_transform(X_train)
X_test = kernalpca.transform(X_test)


# Logistic regression
from sklearn.linear_model import LogisticRegression
regression = LogisticRegression(random_state = 0)
regression.fit(X_train, Y_train)

Y_pred = regression.predict(X_test)

# Confusion metrics
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y_test, Y_pred)
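
# A short follow-on sketch (not in the original snippet): the overall accuracy can be
# read off the confusion matrix as trace / total, which matches sklearn's accuracy_score
# on the same predictions. It reuses cm, Y_test and Y_pred from the lines above.
from sklearn.metrics import accuracy_score
print(cm.trace() / cm.sum())
print(accuracy_score(Y_test, Y_pred))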
Example no. 50
plt.figure(figsize=(10, 10))
plt.plot(np.cumsum(pca.explained_variance_ratio_),
         marker='o',
         markerfacecolor='blue',
         markersize=12,
         color='blue',
         linewidth=4)
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance')
plt.show()

#Applying Kernel PCA #Please Turn Off when applying PCA
from sklearn.decomposition import KernelPCA
kpca = KernelPCA(n_components=32, kernel='rbf')
X_train = kpca.fit_transform(X_train)
X_test = kpca.transform(X_test)

#Applying LDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
lda = LDA(n_components=35)  # note: LDA keeps at most n_classes - 1 components
X_train = lda.fit_transform(X_train, Y_train)
X_test = lda.transform(X_test)  # transform only; never refit on the test set

#Fitting SVM to the Training Set
from sklearn.svm import SVC
classifier = SVC(
    kernel='rbf',
    random_state=0)  #kernel can be changed to linear for linear SVM
classifier.fit(X_train, Y_train)

#Fitting Decision Tree to the Training Set
class RGNN_PCA():
    """
    (Kernel) PCA module with fit and transform functions
    
    embedding_model = RGNN-based model to compute embeddings {'deep', 'arma', 'pool'}
    T = depth of each RGNN stack
    K = number of parallel stacks
    in_scaling = scaling of the input weights in the RGNNs in the first layer
    hid_scaling = scaling of the input weights in the RGNNs in all the other layers
    return_last = use as representation only the states from the last layer in the stack
    aggregation = global pooling method to obtain a graph embedding {'sum', 'average'}
    kwargs = dict specifying RGNN hyperparams
    """
    def __init__(self,
                 embedding_model=None,
                 T=None,
                 K=None,
                 in_scaling=None,
                 hid_scaling=None,
                 return_last=None,
                 aggregation=None,
                 **kwargs):

        if embedding_model == 'deep':
            model = deep
        elif embedding_model == 'arma':
            model = ARMA
        elif embedding_model == 'pool':
            model = pool
        else:
            raise NotImplementedError('unsupported model type')

        # Reservoir-based model
        self.embedding_model = model(K=K,
                                     T=T,
                                     in_scaling=in_scaling,
                                     hid_scaling=hid_scaling,
                                     return_last=return_last,
                                     aggregation=aggregation,
                                     **kwargs)

        self.pca = None
        self.embeddings_tr = None

    def fit(self, *args):
        print('Fitting model')

        # Generate embeddings
        embeddings = []
        for elem in tqdm.tqdm(zip(*args)):
            emb = self.embedding_model.get_embeddings(*elem)
            embeddings.append(emb)
        embeddings = np.vstack(embeddings)

        self.embeddings_tr = embeddings

        # Compute empirical covariance matrix (linear kernel) - Train vs Train
        K_tr = np.dot(self.embeddings_tr, self.embeddings_tr.T)
        self.pca = KernelPCA(n_components=2, kernel='precomputed')
        embeddings_pca = self.pca.fit_transform(K_tr)

        # self.pca = umap.UMAP()
        # embeddings = StandardScaler().fit_transform(embeddings)
        # embeddings_pca = self.pca.fit_transform(embeddings)

        return embeddings_pca

    def transform(self, *args):
        print('Evaluating model')

        # Generate embeddings
        embeddings = []
        for elem in tqdm.tqdm(zip(*args)):
            emb = self.embedding_model.get_embeddings(*elem)
            embeddings.append(emb)
        embeddings = np.vstack(embeddings)

        # Compute empirical covariance matrix (linear kernel) - Test vs Train
        K_te = np.dot(embeddings, self.embeddings_tr.T)

        embeddings_pca = self.pca.transform(K_te)

        return embeddings_pca
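
# A stand-alone sketch of the precomputed-kernel pattern used by fit()/transform()
# above: KernelPCA(kernel='precomputed') is fit on the train-vs-train Gram matrix and
# then applied to a test-vs-train Gram matrix. Plain random vectors stand in for the
# RGNN-based embeddings here.
import numpy as np
from sklearn.decomposition import KernelPCA

rng = np.random.RandomState(0)
emb_tr = rng.rand(30, 16)                 # hypothetical training embeddings
emb_te = rng.rand(5, 16)                  # hypothetical test embeddings

K_tr = np.dot(emb_tr, emb_tr.T)           # (30, 30) linear kernel, train vs train
K_te = np.dot(emb_te, emb_tr.T)           # (5, 30) linear kernel, test vs train

pca = KernelPCA(n_components=2, kernel='precomputed')
tr_2d = pca.fit_transform(K_tr)           # (30, 2)
te_2d = pca.transform(K_te)               # (5, 2)
print(tr_2d.shape, te_2d.shape)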
Example no. 52
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import PCA, KernelPCA
from sklearn import cross_validation
from sklearn.cross_validation import StratifiedKFold
import matplotlib.cm as cm

# Import test data and labels
import_test = sio.loadmat(file_loc + 'Test.mat')
import_train = sio.loadmat(file_loc + 'Train.mat')
X_train = import_train['Xtrain']
X_testing = import_test['Xtest']
Y_train = import_train['Ytrain']
pca = KernelPCA(kernel="rbf", degree=5, gamma=10)
X_train = pca.fit_transform(X_train)
#print(pca.explained_variance_ratio_)  # note: not available on KernelPCA
#k_fold = cross_validation.KFold(len(X_train), 5)
Y_kf = Y_train.ravel()
k_fold = StratifiedKFold(Y_kf, n_folds=5)
print(k_fold)
#X, X_test, Y, Y_test = cross_validation.train_test_split(X_train, Y_train, test_size=0.2, random_state=0)
#y = Y.ravel()

#X_test = X[401:,:]
#X = X[:400,:]
#X = X[:, :2]
#Y_test = Y[401:,:]
#Y = Y[:400,:]

'''
eventsTrain = import_train['eventsTrain']
Example no. 53
def classifyPHC():
    data = readFile()
    #data = equalizeClasses(data)
    features, labels = splitData(data)

    #determine the testing size as a fraction of the data (1 = 100%)
    validation_size = 0.2

    #here we are splitting our data based on the validation_size into training and testing data
    features_train, features_validation, labels_train, labels_validation = model_selection.train_test_split(
        features, labels, test_size=validation_size)

    #normalize data in the range [-1,1]
    scaler = MinMaxScaler(feature_range=(-1, 1))
    #fit only the training data in order to find the scaling range, then transform the test data without refitting
    scaler.fit(features_train)

    features_train_scalar = scaler.transform(features_train)

    #transform the validation features without fitting them
    features_validation_scalar = scaler.transform(features_validation)

    #determine the pca, and determine the dimension you want to end up
    pca = KernelPCA(n_components=5, kernel='rbf', fit_inverse_transform=True)

    #fit only the features train
    pca.fit(features_train_scalar)

    #dimensionality reduction of features train
    features_train_pca = pca.transform(features_train_scalar)

    #dimensionality reduction of features validation
    features_validation_pca = pca.transform(features_validation_scalar)

    #reconstruct data training error
    reconstruct_data = pca.inverse_transform(features_train_pca)

    #error matrix between the original and reconstructed samples
    #(assumed here to be the squared difference, which matches the comments below)
    error_matrix = (features_train_scalar - reconstruct_data) ** 2

    error_percentage = (
        sum(sum(error_matrix)) /
        (len(features_train_scalar) * len(features_train_scalar[0]))) * 100

    #len(features_train_scalar) = len(reconstruct_data) = 89
    #len(features_train_scalar[0]) = len(reconstruct_data[0]) = 13

    #len(error_matrix) = 89, which means for all the samples
    #len(error_matrix[0]) = 13, for every feature of every sample
    #we take the sum and we end up with an array which has the sum for every feature (error)
    #so we take the sum again and we divide it with the 89 samples * 13 features
    print 'Information loss of KernelPCA:', error_percentage, '% \n'

    lda = LinearDiscriminantAnalysis()

    lda.fit(features_train_pca, labels_train)

    features_train_pca = lda.transform(features_train_pca)

    features_validation_pca = lda.transform(features_validation_pca)

    #we can see the shapes of the array just to check
    print 'feature training array: ', features_train_pca.shape, 'and label training array: ', labels_train.shape
    print 'feature testing array: ', features_validation_pca.shape, 'and label testing array: ', labels_validation.shape, '\n'

    #take the best couple of parameters from the procedure of greedy search
    #paramTuning(features_train, labels_train, 5)

    #we initialize our model
    #svm = SVC(kernel='rbf',C=10,gamma=0.0001,decision_function_shape='ovo')
    svm = KNeighborsClassifier(n_neighbors=3)

    #train our model with the data that we previously processed
    svm.fit(features_train_pca, labels_train)

    #now test our model with the test data
    predicted_labels = svm.predict(features_validation_pca)
    accuracy = accuracy_score(labels_validation, predicted_labels)
    print 'Classification accuracy: ', accuracy * 100, '\n'

    #see the accuracy in training procedure
    predicted_labels_train = svm.predict(features_train_pca)
    accuracy_train = accuracy_score(labels_train, predicted_labels_train)
    print 'Training accuracy: ', accuracy_train * 100, '\n'

    #confusion matrix to illustrate the faulty classification of each class
    conf_matrix = confusion_matrix(labels_validation, predicted_labels)
    print 'Confusion matrix: \n', conf_matrix, '\n'
    print 'Support    class 0   class 1    class 2:'
    #calculate the support of each class
    print '          ', conf_matrix[0][0] + conf_matrix[0][1] + conf_matrix[0][
        2], '       ', conf_matrix[1][0] + conf_matrix[1][1] + conf_matrix[1][
            2], '        ', conf_matrix[2][0] + conf_matrix[2][
                1] + conf_matrix[2][2], '\n'

    #calculate the accuracy of each class
    edema = (conf_matrix[0][0] /
             (conf_matrix[0][0] + conf_matrix[0][1] + conf_matrix[0][2])) * 100
    paralysis = (
        conf_matrix[1][1] /
        (conf_matrix[1][0] + conf_matrix[1][1] + conf_matrix[1][2])) * 100
    normal = (
        conf_matrix[2][2] /
        (conf_matrix[2][0] + conf_matrix[2][1] + conf_matrix[2][2])) * 100

    #see the inside details of the classification
    print 'For class 0 edema cases:', conf_matrix[0][
        0], 'classified correctly and', conf_matrix[0][1] + conf_matrix[0][
            2], 'misclassified,', edema, 'accuracy \n'
    print 'For class 1 paralysis cases:', conf_matrix[1][
        1], 'classified correctly and', conf_matrix[1][0] + conf_matrix[1][
            2], 'misclassified,', paralysis, 'accuracy\n'
    print 'For class 2 normal cases:', conf_matrix[2][
        2], 'classified correctly and', conf_matrix[2][0] + conf_matrix[2][
            1], 'misclassified,', normal, 'accuracy \n'

    #try 5-fold cross validation
    scores = cross_val_score(svm, features_train_pca, labels_train, cv=5)
    print 'cross validation scores for 5-fold', scores, '\n'
    print 'parameters of the model: \n', svm.get_params(), '\n'
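
    #a hedged sketch of what the commented-out paramTuning() step presumably does:
    #a grid search over C and gamma for the RBF SVC mentioned above; the exact grid
    #values are assumptions, only the GridSearchCV usage itself is standard sklearn
    from sklearn.model_selection import GridSearchCV
    from sklearn.svm import SVC

    param_grid = {'C': [1, 10, 100], 'gamma': [0.0001, 0.001, 0.01]}
    search = GridSearchCV(SVC(kernel='rbf', decision_function_shape='ovo'),
                          param_grid, cv=5)
    search.fit(features_train_pca, labels_train)
    print 'best parameters:', search.best_params_, 'best score:', search.best_score_, '\n'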
Example no. 54
#Perform PCA
pca=KernelPCA().fit(data.apply(scale_function).fillna(0))

#Review eigenvalues (only look at first ten)
pca.lambdas_[:10].round()

#Get relative weight
get_we=lambda x: x/x.sum()
get_we(pca.lambdas_)[:10]

#First component explains ~65% of variability

#construct pca index with just the first component
pca=KernelPCA(n_components=1).fit(data.apply(scale_function).fillna(0))

dax['PCA_1']=pca.transform(-data.fillna(0))

dax.apply(scale_function).plot(figsize=(8,4))

#Add in more components
pca=KernelPCA(n_components=5).fit(data.apply(scale_function).fillna(0))

pca_components=pca.transform(data.fillna(0))

weights=get_we(pca.lambdas_)

dax['PCA_5']=np.dot(pca_components,weights)

dax.apply(scale_function).plot(figsize=(8,4))

#############################
Example no. 55
pl.figure()
pl.subplot(2, 2, 1, aspect='equal')
pl.title("Original space")
reds = y == 0
blues = y == 1

pl.plot(X[reds, 0], X[reds, 1], "ro")
pl.plot(X[blues, 0], X[blues, 1], "bo")
pl.xlabel("$x_1$")
pl.ylabel("$x_2$")

X1, X2 = np.meshgrid(np.linspace(-1.5, 1.5, 50), np.linspace(-1.5, 1.5, 50))
X_grid = np.array([np.ravel(X1), np.ravel(X2)]).T
# projection on the first principal component (in the phi space)
Z_grid = kpca.transform(X_grid)[:, 0].reshape(X1.shape)
pl.contour(X1, X2, Z_grid, colors='grey', linewidths=1, origin='lower')

pl.subplot(2, 2, 2, aspect='equal')
pl.plot(X_pca[reds, 0], X_pca[reds, 1], "ro")
pl.plot(X_pca[blues, 0], X_pca[blues, 1], "bo")
pl.title("Projection by PCA")
pl.xlabel("1st principal component")
pl.ylabel("2nd component")

pl.subplot(2, 2, 3, aspect='equal')
pl.plot(X_kpca[reds, 0], X_kpca[reds, 1], "ro")
pl.plot(X_kpca[blues, 0], X_kpca[blues, 1], "bo")
pl.title("Projection by KPCA")
pl.xlabel("1st principal component in space induced by $\phi$")
pl.ylabel("2nd component")
Example no. 56
kpca_data = kpca_0.fit_transform(data)  # fit_transform fits and projects in one step
fig = plt.figure(4)
if kpca_num == 2:
    for i in range(len(labels_predict)):
        plt.scatter(kpca_data[i,0],kpca_data[i,1],c=color[labels_predict[i]]
                    ,marker='o')
else:
    ax = fig.add_subplot(111,projection='3d')
    for i in range(len(labels_predict)):
        ax.scatter(kpca_data[i,0],kpca_data[i,1],kpca_data[i,2]
                   ,c=color[labels_predict[i]],marker='o')
       
kpca_1 = KernelPCA(n_components=kpca_num,kernel='rbf',gamma=gamma,degree=degree)
kpca_1.fit(data_fs1)
kpca_data_fs1 = kpca_1.transform(data_fs1)
fig = plt.figure(5)
if kpca_num == 2:
    for i in range(len(labels_fs1_predict)):
        plt.scatter(kpca_data_fs1[i,0],kpca_data_fs1[i,1]
                    ,c=color[labels_fs1_predict[i]],marker='o')
else:
    ax = fig.add_subplot(111,projection='3d')
    for i in range(len(labels_fs1_predict)):
        ax.scatter(kpca_data_fs1[i,0],kpca_data_fs1[i,1],kpca_data_fs1[i,2]
                   ,c=color[labels_fs1_predict[i]],marker='o') 

kpca_2 = KernelPCA(n_components=kpca_num,kernel='rbf',gamma=gamma,degree=degree)
kpca_2.fit(data_fs2)
kpca_data_fs2 = kpca_2.transform(data_fs2)
fig = plt.figure(6)
Example no. 57
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    sparse_corpus_tfidf_transpose,
    df.ix[:, 1],
    test_size=0.2,
    random_state=seed)

from sklearn.decomposition import KernelPCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt

# reduce dimensions
print('Starting dimensionality reduction')
reducer = KernelPCA(n_components=1500, kernel="cosine", random_state=seed)
corpus_train_tfidf_kpca = reducer.fit_transform(X_train)
corpus_test_tfidf_kpca = reducer.transform(X_test)

print('Finished dimensionality reduction')

#Initialize Logistic Regression
log_reg = LogisticRegression(C=1.0)
log_reg.fit(corpus_train_tfidf_kpca, y_train)

a = log_reg.score(corpus_test_tfidf_kpca, y_test)

print('Starting logistic regression 2')
log_reg.fit(X_train, y_train)

b = log_reg.score(X_test, y_test)
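
# learning_curve is imported above but never used; a hedged sketch of how it could be
# applied to the KernelPCA-reduced features (mean train / cross-validation score
# against the number of training examples):
import numpy as np
train_sizes, train_scores, test_scores = learning_curve(
    LogisticRegression(C=1.0), corpus_train_tfidf_kpca, y_train, cv=5)
plt.plot(train_sizes, np.mean(train_scores, axis=1), label='train')
plt.plot(train_sizes, np.mean(test_scores, axis=1), label='cross-validation')
plt.xlabel('training examples')
plt.ylabel('score')
plt.legend()
plt.show()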
Example no. 58
svm_lda = clf.best_estimator_
svm_lda.fit(X_train_lda, y_train)

y_train_pred = svm_lda.predict(X_train_lda)
print('Accuracy of LDA SVM model (training set):')
print(metrics.accuracy_score(y_train, y_train_pred))

y_test_pred = svm_lda.predict(X_test_lda)
print('Accuracy of LDA SVM model (testing set):')
print(metrics.accuracy_score(y_test, y_test_pred))
print("")

#kpca
kpca = KernelPCA(n_components = 10, kernel = 'rbf')
X_train_kpca = kpca.fit_transform(X_train_std, y_train)
X_test_kpca = kpca.transform(X_test_std)

# LR after kpca
tuned_parameters = [{'C':np.arange(0.01,1.0,0.01).tolist(),
                     'multi_class':['ovr']}]
clf=GridSearchCV(LogisticRegression(),tuned_parameters,scoring='accuracy',cv=5) 

clf.fit(X_train_kpca,y_train)


lr_kpca = clf.best_estimator_
lr_kpca.fit(X_train_kpca, y_train)

y_train_pred = lr_kpca.predict(X_train_kpca)
print('Accuracy of kPCA Logistic Regression model (training set):')
print(metrics.accuracy_score(y_train, y_train_pred))
Example no. 59
        plt.clf()  # Clear any existing figure
        axes = scatter_matrix(pca_sample_df, diagonal=d, **scatter_kwds)
        plt.savefig(scatter_matrix_fp_fmt.format('pca_' + d))

    # Kernel PCA feature reduction
    kernels = [
        'linear',
        'poly',
        'rbf',
        # 'sigmoid',
        'cosine',
    ]
    # Kernel PCAs are compute and memory intensive so fit on a random sample
    X_sample = X.sample(n=1000)
    print('Kernel PCA sample shape: {}'.format(X_sample.shape))
    for kernel in kernels:
        kpca = KernelPCA(n_components=3, kernel=kernel, n_jobs=4)
        start_time = time.perf_counter()
        kpca.fit(X_sample)
        end_time = time.perf_counter()
        print('Time to fit: {:.1f}s'.format(end_time - start_time))
        kpca_df = pd.DataFrame(data=kpca.transform(X_sample),
                               index=X_sample.index)
        kpca_df[data.DEPENDENT] = y
        for d in diagonals:
            plt.clf()  # Clear any existing figure
            axes = scatter_matrix(kpca_df, diagonal=d, **scatter_kwds)
            plt.savefig(scatter_matrix_fp_fmt.format(kernel + '_pca_' + d))
        # kernel_pcas[kernel] = pca
        print(kernel, dt.datetime.now())
Example no. 60
components = 0.99  # parameter: target fraction of explained variance for PCA
if PCAflag == 1:
    pca = PCA(n_components=components, svd_solver='full')
    pca.fit(train)
    train_new = pca.transform(train)
    sim_new = pca.transform(sim)
    print('pca.explained_variance_ratio_', pca.explained_variance_ratio_)
    print('sum(pca.explained_variance_ratio_)',
          sum(pca.explained_variance_ratio_))
    print(pca.singular_values_)
else:
    # Note: KernelPCA requires an integer n_components, unlike PCA, which also
    # accepts a variance fraction such as 0.99.
    kpca = KernelPCA(n_components=components,
                     kernel="rbf",
                     fit_inverse_transform=True)
    kpca.fit(train)
    train_new = kpca.transform(train)
    sim_new = kpca.transform(sim)
    # KernelPCA does not expose explained_variance_ratio_ or singular_values_,
    # so the diagnostics printed in the PCA branch above are not repeated here.
print('train.shape', train.shape)
print('train_new.shape', train_new.shape)

if plotflag == 1:
    plt.figure(figsize=(10, 8))
    for i in range(0, category):
        plt.subplot(1, 2, 1)
        plt.scatter(train_new[i * traindataset:(i + 1) * traindataset, 0],
                    train_new[i * traindataset:(i + 1) * traindataset, 1],
                    marker='o',