from sklearn.decomposition import KernelPCA


def generate_kpca_compression(X, n_components=16):
    """
    Compresses the data using the sklearn KernelPCA implementation.

    :param X: Data (n_samples, n_features)
    :param n_components: Number of dimensions for PCA to keep
    :return: X_prime (the compressed representation), pca
    """
    kpca = KernelPCA(n_components=n_components, kernel='rbf',
                     eigen_solver='arpack', fit_inverse_transform=False)
    kpca.fit(X)
    return kpca.transform(X), kpca
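# Usage sketch (illustrative, not from the original source): the random data
# and component count below are made up; assumes the function above is in scope.
import numpy as np

X_demo = np.random.RandomState(0).rand(100, 64)
X_prime, kpca_model = generate_kpca_compression(X_demo, n_components=16)
print(X_prime.shape)  # -> (100, 16)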
import time

import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
from sklearn.decomposition import KernelPCA
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix


def main():
    # set the timer
    start = time.time()

    # load the data
    trainX = np.load('trainX.npy')
    testX = np.load('testX.npy')
    trainY = np.load('trainY.npy')
    testY = np.load('testY.npy')
    print('\n!!! Data Loading Completed !!!\n')

    # get the digit at index 14 and plot it
    zero = trainX[14].reshape(28, 28)
    plt.imshow(zero, cmap=cm.Greys_r)
    plt.savefig("original" + str(trainY[14]) + ".png")
    # plt.show()

    # apply kPCA
    kpca = KernelPCA(kernel='rbf', gamma=1, fit_inverse_transform=True)
    kpca.fit(trainX[0:3000])
    trainX_kpca = kpca.transform(trainX)
    testX_kpca = kpca.transform(testX)

    # do the inverse transform and plot the result
    orig = kpca.inverse_transform(trainX_kpca)
    img = orig[14].reshape(28, 28)
    plt.imshow(img, cmap=cm.Greys_r)
    plt.savefig("reconstructed" + str(trainY[14]) + ".png")
    # plt.show()

    selector = SelectPercentile(f_classif, percentile=5)
    selector.fit(trainX_kpca, trainY)
    trainX = selector.transform(trainX_kpca)
    testX = selector.transform(testX_kpca)

    # fit a classifier
    parameters = {'n_neighbors': list(np.arange(15) + 1)}
    clf = GridSearchCV(KNeighborsClassifier(weights='distance', n_jobs=-1),
                       parameters)
    clf.fit(trainX, trainY)

    pred = clf.predict(testX)
    print(accuracy_score(testY, pred))
    print(confusion_matrix(testY, pred))
    # print(clf.best_params_)
    print('total : %d, correct : %d, incorrect : %d\n'
          % (len(pred), np.sum(pred == testY), np.sum(pred != testY)))
    print('Test Time : %f Minutes\n' % ((time.time() - start) / 60))
import gc
import pickle

from numpy import vstack
from sklearn.decomposition import KernelPCA

# load_features, get_skip_interval and kpca_preprocess_features are
# project-local helpers defined elsewhere.


def gogo_kpca(fxpath, mpath):
    kpca_params = {'n_components': 256, 'kernel': 'rbf', 'gamma': None,
                   'degree': 3, 'coef0': 1, 'kernel_params': None,
                   'alpha': 1.0, 'fit_inverse_transform': False,
                   'eigen_solver': 'auto', 'tol': 0, 'max_iter': None,
                   'remove_zero_eig': True}
    kpca_fname = '%s/kpca_rbf_{0}_{1}.pkl' % mpath

    for i in range(7):
        if i < 5:
            nbreed = 1
            sbreed = 'dog'
            nsubject = i + 1
        else:
            nbreed = 2
            sbreed = 'human'
            nsubject = 1 + abs(5 - i)
        print('breed%d.subject%d..' % (nbreed, nsubject))

        X_ictal = load_features(fxpath, nbreed, nsubject, 1)
        X_inter = load_features(fxpath, nbreed, nsubject, 2)
        X = vstack((X_inter, X_ictal))
        del X_inter, X_ictal; gc.collect()
        X_test = load_features(fxpath, nbreed, nsubject, 3)
        X = vstack((X, X_test))
        del X_test; gc.collect()

        kpca = KernelPCA(**kpca_params)
        skip_interval = get_skip_interval(X)
        X = kpca_preprocess_features(X)
        kpca.fit(X[::skip_interval])
        with open(kpca_fname.format(sbreed, nsubject), 'wb') as f:
            pickle.dump(kpca, f)
        del X, kpca; gc.collect()
def test_kernel_conditioning():
    """Check that ``_check_psd_eigenvalues`` is correctly called in kPCA

    Non-regression test for issue #12140 (PR #12145).
    """
    # create a pathological X leading to small non-zero eigenvalue
    X = [[5, 1],
         [5 + 1e-8, 1e-8],
         [5 + 1e-8, 0]]
    kpca = KernelPCA(kernel="linear", n_components=2,
                     fit_inverse_transform=True)
    kpca.fit(X)

    # check that the small non-zero eigenvalue was correctly set to zero
    assert kpca.lambdas_.min() == 0
    assert np.all(kpca.lambdas_ == _check_psd_eigenvalues(kpca.lambdas_))
def test_kernel_pca():
    rng = np.random.RandomState(0)
    X_fit = rng.random_sample((5, 4))
    X_pred = rng.random_sample((2, 4))

    for eigen_solver in ("auto", "dense", "arpack"):
        for kernel in ("linear", "rbf", "poly"):
            # transform fit data
            kpca = KernelPCA(4, kernel=kernel, eigen_solver=eigen_solver,
                             fit_inverse_transform=True)
            X_fit_transformed = kpca.fit_transform(X_fit)
            X_fit_transformed2 = kpca.fit(X_fit).transform(X_fit)
            assert_array_almost_equal(np.abs(X_fit_transformed),
                                      np.abs(X_fit_transformed2))

            # non-regression test: previously, gamma would be 0 by default,
            # forcing all eigenvalues to 0 under the poly kernel
            assert_not_equal(X_fit_transformed.size, 0)

            # transform new data
            X_pred_transformed = kpca.transform(X_pred)
            assert_equal(X_pred_transformed.shape[1],
                         X_fit_transformed.shape[1])

            # inverse transform
            X_pred2 = kpca.inverse_transform(X_pred_transformed)
            assert_equal(X_pred2.shape, X_pred.shape)
def test_kernel_pca():
    rng = np.random.RandomState(0)
    X_fit = rng.random_sample((5, 4))
    X_pred = rng.random_sample((2, 4))

    def histogram(x, y, **kwargs):
        # Histogram kernel implemented as a callable.
        assert_equal(kwargs, {})  # no kernel_params that we didn't ask for
        return np.minimum(x, y).sum()

    for eigen_solver in ("auto", "dense", "arpack"):
        for kernel in ("linear", "rbf", "poly", histogram):
            # histogram kernel produces singular matrix inside linalg.solve
            # XXX use a least-squares approximation?
            inv = not callable(kernel)

            # transform fit data
            kpca = KernelPCA(4, kernel=kernel, eigen_solver=eigen_solver,
                             fit_inverse_transform=inv)
            X_fit_transformed = kpca.fit_transform(X_fit)
            X_fit_transformed2 = kpca.fit(X_fit).transform(X_fit)
            assert_array_almost_equal(np.abs(X_fit_transformed),
                                      np.abs(X_fit_transformed2))

            # non-regression test: previously, gamma would be 0 by default,
            # forcing all eigenvalues to 0 under the poly kernel
            assert_not_equal(X_fit_transformed.size, 0)

            # transform new data
            X_pred_transformed = kpca.transform(X_pred)
            assert_equal(X_pred_transformed.shape[1],
                         X_fit_transformed.shape[1])

            # inverse transform
            if inv:
                X_pred2 = kpca.inverse_transform(X_pred_transformed)
                assert_equal(X_pred2.shape, X_pred.shape)
def test_kernel_pca_sparse():
    """Test that kPCA works on a sparse data input.

    Same test as ``test_kernel_pca`` except for ``inverse_transform``,
    since it's not implemented for sparse matrices.
    """
    rng = np.random.RandomState(0)
    X_fit = sp.csr_matrix(rng.random_sample((5, 4)))
    X_pred = sp.csr_matrix(rng.random_sample((2, 4)))

    for eigen_solver in ("auto", "arpack", "randomized"):
        for kernel in ("linear", "rbf", "poly"):
            # transform fit data
            kpca = KernelPCA(
                4,
                kernel=kernel,
                eigen_solver=eigen_solver,
                fit_inverse_transform=False,
                random_state=0,
            )
            X_fit_transformed = kpca.fit_transform(X_fit)
            X_fit_transformed2 = kpca.fit(X_fit).transform(X_fit)
            assert_array_almost_equal(np.abs(X_fit_transformed),
                                      np.abs(X_fit_transformed2))

            # transform new data
            X_pred_transformed = kpca.transform(X_pred)
            assert X_pred_transformed.shape[1] == X_fit_transformed.shape[1]

            # inverse transform: not available for sparse matrices
            # XXX: should we raise another exception type here? For instance:
            # NotImplementedError.
            with pytest.raises(NotFittedError):
                kpca.inverse_transform(X_pred_transformed)
def test_kernel_pca():
    rng = np.random.RandomState(0)
    X_fit = rng.random_sample((5, 4))
    X_pred = rng.random_sample((2, 4))

    for eigen_solver in ("auto", "dense", "arpack"):
        for kernel in ("linear", "rbf", "poly"):
            # transform fit data
            kpca = KernelPCA(4, kernel=kernel, eigen_solver=eigen_solver,
                             fit_inverse_transform=True)
            X_fit_transformed = kpca.fit_transform(X_fit)
            X_fit_transformed2 = kpca.fit(X_fit).transform(X_fit)
            assert_array_almost_equal(np.abs(X_fit_transformed),
                                      np.abs(X_fit_transformed2))

            # transform new data
            X_pred_transformed = kpca.transform(X_pred)
            assert_equal(X_pred_transformed.shape[1],
                         X_fit_transformed.shape[1])

            # inverse transform
            X_pred2 = kpca.inverse_transform(X_pred_transformed)
            assert_equal(X_pred2.shape, X_pred.shape)
def test_leave_zero_eig():
    """Non-regression test for issue #12141 (PR #12143)

    This test checks that fit().transform() returns the same result as
    fit_transform() in case of non-removed zero eigenvalue.
    """
    X_fit = np.array([[1, 1], [0, 0]])

    # Assert that even with all np warnings on, there is no div by zero warning
    with pytest.warns(None) as record:
        with np.errstate(all="warn"):
            k = KernelPCA(n_components=2, remove_zero_eig=False,
                          eigen_solver="dense")
            # Fit, then transform
            A = k.fit(X_fit).transform(X_fit)
            # Do both at once
            B = k.fit_transform(X_fit)
            # Compare
            assert_array_almost_equal(np.abs(A), np.abs(B))

    for w in record:
        # There might be warnings about the kernel being badly conditioned,
        # but there should not be warnings about division by zero.
        # (Numpy division by zero warning can have many message variants, but
        # at least we know that it is a RuntimeWarning so lets check only this)
        assert not issubclass(w.category, RuntimeWarning)
# arc_cosine is a project-local kernel helper defined elsewhere.


def main():
    # set the timer
    start = time.time()

    # load the data (fetch_mldata is the legacy loader; modern scikit-learn
    # replaced it with fetch_openml)
    mnist = fetch_mldata('MNIST original')
    mnist.target = mnist.target.astype(np.int32)

    seed = np.random.randint(1, 30000)
    rand = np.random.RandomState(seed)
    items = len(mnist.target)
    indices = rand.randint(items, size=70000)
    trindex = indices[0:30000]
    tsindex = indices[30000:]

    # scale down features to the range [0, 1]
    mnist.data = mnist.data / 255.0
    mnist.data = mnist.data.astype(np.float32)

    trainX = mnist.data[trindex]
    testX = mnist.data[tsindex]
    trainY = mnist.target[trindex]
    testY = mnist.target[tsindex]

    # extract the features using KPCA with a precomputed arc-cosine kernel
    kpca = KernelPCA(kernel='precomputed')
    kpca_train = arc_cosine(trainX[0:1000], trainX[0:1000])
    # fit the model from data in X
    kpca.fit(kpca_train)

    kernel_train = arc_cosine(trainX, trainX[0:1000])
    kernel_test = arc_cosine(testX, trainX[0:1000])
    trainX_kpca = kpca.transform(kernel_train)
    testX_kpca = kpca.transform(kernel_test)
    print(testX_kpca.shape)

    # fit the svm model and compute the accuracy measure
    clf = svm.SVC(kernel=arc_cosine)
    clf.fit(trainX_kpca, trainY)

    pred = clf.predict(testX_kpca)
    print(accuracy_score(testY, pred))
    print('total : %d, correct : %d, incorrect : %d\n'
          % (len(pred), np.sum(pred == testY), np.sum(pred != testY)))
    print('Test Time : %f Minutes\n' % ((time.time() - start) / 60))
import numpy as np
from numpy.linalg import norm
from sklearn.decomposition import PCA, KernelPCA


def PCA_rotate_data(feature_vector, n_points=256, nonlinear=False):
    """
    Rotate the data to align with the principal components of our
    acceleration data. The feature vector is in the form
    [a_x_0, a_y_0, a_z_0, a_x_1, ...].

    The component with the largest eigenvalue becomes the z-axis; the
    second largest becomes the y-axis.
    """
    accXYZ = feature_vector.reshape(n_points, -1)
    if nonlinear:
        # scikit-learn names the polynomial kernel 'poly'
        pca = KernelPCA(n_components=3, kernel='poly')
    else:
        pca = PCA(n_components=3)
    pca.fit(accXYZ)

    # pca.explained_variance_: importance of the data on each axis;
    # these are the eigenvalues, and the vectors give the directions
    if nonlinear:
        eigVals = pca.lambdas_
        # note: alphas_ has shape (n_samples, n_components), i.e. rows are
        # samples rather than component directions
        eigVects = pca.alphas_
    else:
        eigVals = pca.explained_variance_
        eigVects = pca.components_

    x_index = np.argmin(eigVals)
    z_index = np.argmax(eigVals)
    y_index = list(set([0, 1, 2]) - set([x_index, z_index]))[0]

    new_x_hat = eigVects[x_index] / norm(eigVects[x_index])
    new_y_hat = eigVects[y_index] / norm(eigVects[y_index])
    new_z_hat = eigVects[z_index] / norm(eigVects[z_index])

    rotPCAData = []
    for i in range(len(accXYZ)):
        v = accXYZ[i]
        new_x = np.dot(new_x_hat, v)
        new_y = np.dot(new_y_hat, v)
        new_z = np.dot(new_z_hat, v)
        rotPCAData += [new_x, new_y, new_z]
    return rotPCAData, [new_x_hat, new_y_hat, new_z_hat]
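# Usage sketch (illustrative, not from the original source): synthetic
# accelerometer readings whose third axis dominates the variance; the noise
# scales are made up, and the default linear branch is used.
import numpy as np

rng = np.random.RandomState(0)
acc = rng.randn(256, 3) * np.array([0.1, 1.0, 5.0])
rotated, axes = PCA_rotate_data(acc.ravel(), n_points=256)
print(len(rotated), [a.shape for a in axes])  # -> 768 [(3,), (3,), (3,)]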
def __pca_test(dem, X_tr, X_te, y_train, y_test):
    reg = KernelPCA(kernel='linear', n_components=dem, random_state=1)
    re = reg.fit(np.vstack((X_tr, X_te)))
    X_train = re.transform(X_tr)
    X_test = re.transform(X_te)
    reg, score = __mlp_test(X_train, X_test, y_train, y_test)
    print(score)
    return reg, score
from sklearn.decomposition import KernelPCA


def reduction(data, params):
    # parse parameters directly from the dict (exec-based assignment does
    # not bind local variables under Python 3)
    n_components = params['n_components']
    kernel = params['kernel']

    # apply kernel PCA
    kpca = KernelPCA(n_components=n_components, kernel=kernel)
    kpca.fit(data)
    X = kpca.transform(data)
    return X
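# Usage sketch (illustrative, not from the original source): the params dict
# carries the two keys the function reads; the data is random.
import numpy as np

X_reduced = reduction(np.random.RandomState(0).rand(50, 10),
                      {'n_components': 3, 'kernel': 'rbf'})
print(X_reduced.shape)  # -> (50, 3)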
def decom_kernel_pca_n20():
    filename = test_bench_path
    data = loadcsv(filename, 1, 4)
    print("data shape : ", data.shape)
    my_pca = KernelPCA(n_components=20, kernel='rbf')
    my_pca.fit(data)
    reduced_data = my_pca.transform(data)
    print("reduced data shape : ", reduced_data.shape)
    '''
    fig = plt.figure()
    ax = Axes3D(fig)
    ax.scatter(reduced_data[:, 0], reduced_data[:, 1], reduced_data[:, 2])
    plt.show()
    '''
    return my_pca
import numpy as np
from sklearn.decomposition import KernelPCA


def kernel_pca_fit(n_components, train, test, shape, kernel="linear"):
    # Available kernels:
    # "linear", "poly", "rbf", "sigmoid", "cosine", "precomputed"

    # Set and fit KernelPCA
    kpca = KernelPCA(n_components=n_components, kernel=kernel,
                     fit_inverse_transform=True)
    kpca.fit(train)

    # Reduce dimension
    test_reduced = kpca.transform(test)

    # Recover data from the lower dimension
    test_recovered = kpca.inverse_transform(test_reduced)

    # Calculate the MSE
    mse = np.mean((test_recovered - test) ** 2)

    # Reshape into a matrix
    test_recovered = test_recovered.reshape(shape)
    return kpca, test_recovered, mse
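# Usage sketch (illustrative, not from the original source): the array sizes
# and the (20, 6, 6) target shape are arbitrary; they only need to agree with
# the test set size.
import numpy as np

rng = np.random.RandomState(0)
train, test = rng.rand(80, 36), rng.rand(20, 36)
kpca, recovered, mse = kernel_pca_fit(10, train, test, (20, 6, 6), kernel='rbf')
print(recovered.shape, mse)  # -> (20, 6, 6) and a small positive float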
def main(args):
    # pandas removed pd.load; read_pickle is the equivalent for a pickled
    # DataFrame
    df = pd.read_pickle(args.df)
    y = integer_labels(df)

    pca = KernelPCA(None, kernel=args.kernel)
    pca.fit(df)
    X = pca.transform(df)
    nonzero_components = X.shape[1]

    seed = int(time.time() * 1000)
    gmm = GMM(4, n_init=10, random_state=seed)
    gmm.fit(X)
    c = gmm.predict(X)
    score, _ = compare_clusters(c, y)
    best = score
    with open(args.out, 'w') as fh:
        fh.write('{} {} {} {}\n'.format(args.kernel, nonzero_components,
                                        seed, best))

    # list() is needed under Python 3, where range() is lazy
    n_comps = list(range(2, 16)) + [int(i) for i in
                                    np.linspace(16, nonzero_components, 20)]
    for n in n_comps:
        pca = KernelPCA(n, kernel=args.kernel)
        pca.fit(df)
        X = pca.transform(df)
        for i in range(128):
            seed = int(time.time() * 1000)
            gmm = GMM(4, random_state=seed)
            # fit and predict on the reduced X, not on the raw frame
            gmm.fit(X)
            c = gmm.predict(X)
            score, _ = compare_clusters(c, y)
            if score > best:
                best = score
                with open(args.out, 'a') as fh:
                    fh.write('{} {} {} {}\n'.format(args.kernel, n,
                                                    seed, best))
def doKernelPCA(q, components=40):
    global data
    # load test query
    loadFile('test', q)
    # fit model
    kpca = KernelPCA(components, kernel="rbf")
    kpca.fit(data)
    # transform and print test query
    data = kpca.transform(data)
    printFile('test{}'.format(q))
    for kind in ['train', 'vali']:
        loadFile(kind)
        data = kpca.transform(data)
        printFile(kind + str(q))
def plot_kernel_pca_variance():
    for dataset in datasets:
        X_train, X_test, y_train, y_test, target_names = dataset.get_data(
            model='KMeans')
        pca = KernelPCA(n_components=2, kernel='rbf', gamma=15)
        pca.fit(X_train)
        X_skernpca = pca.transform(X_train)

        plt.scatter(X_skernpca[y_train == 0, 0], X_skernpca[y_train == 0, 1],
                    color='red', marker='^', alpha=0.5)
        plt.scatter(X_skernpca[y_train == 1, 0], X_skernpca[y_train == 1, 1],
                    color='blue', marker='o', alpha=0.5)
        plt.show()
class KernelPCAReduction(AbstractReduction):
    """
    Use kernel PCA to reduce dimensionality.

    http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.KernelPCA.html
    """

    def __init__(self, n_components, **kwargs):
        self.pca = KernelPCA(n_components=n_components, **kwargs)
        # store under a private name so the attribute does not shadow the
        # accessor method below
        self._n_components = n_components

    def n_components(self):
        return self._n_components

    def fit(self, X, Y=None):
        self.pca.fit(X)

    def transform(self, X):
        return self.pca.transform(X)
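# Usage sketch (illustrative, not from the original source): assumes
# AbstractReduction imposes nothing beyond the fit/transform methods above;
# the data and sizes are made up.
import numpy as np

red = KernelPCAReduction(n_components=2, kernel='rbf')
red.fit(np.random.RandomState(0).rand(30, 8))
print(red.transform(np.random.RandomState(1).rand(5, 8)).shape)  # -> (5, 2)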
def testKernel(n_components, kernel, degree):
    print(n_components, kernel)
    kpca = KernelPCA(n_components, kernel=kernel, degree=degree)
    kpca_data = kpca.fit(data).transform(data)
    plt.scatter(kpca_data[:, 0], kpca_data[:, 1], c=labels,
                cmap='nipy_spectral')
    plt.show()
def decom_kernel_pca_n3(stage):
    data = []
    if stage == 'assembly':
        data = loadcsv("data/assembly_training.csv", 1, 2)
    else:
        data = loadcsv("data/test_bench_training.csv", 1, 4)
    my_pca = KernelPCA(n_components=3, kernel='rbf')
    my_pca.fit(data)
    reduced_data = my_pca.transform(data)
    print("reduced data shape : ", reduced_data.shape)
    '''
    fig = plt.figure()
    ax = Axes3D(fig)
    ax.scatter(reduced_data[:, 0], reduced_data[:, 1], reduced_data[:, 2])
    plt.show()
    '''
    return my_pca
def KPCA10Fold(X, y):
    acc = []
    # current scikit-learn KFold takes n_splits and yields indices from
    # split(); the pre-0.18 constructor took the sample count directly
    kf = KFold(n_splits=10, shuffle=True)
    i = 0
    for train_index, test_index in kf.split(X):
        yTest = y[test_index]
        yTrain = y[train_index]
        clf = KernelPCA(kernel="rbf", fit_inverse_transform=True, gamma=10)
        clf.fit(X[train_index])
        newRepTrain = clf.transform(X[train_index])
        newRepTest = clf.transform(X[test_index])
        nclf = neighbors.KNeighborsClassifier(n_neighbors=2)
        nclf.fit(newRepTrain, yTrain)
        XPred = nclf.predict(newRepTest)
        acc.append(np.sum(XPred == yTest) * 1.0 / yTest.shape[0])
        # print(i, ":", acc[i])
        i += 1
    return np.mean(acc), np.std(acc)
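# An equivalent 10-fold evaluation can be written with scikit-learn's
# Pipeline and cross_val_score; a sketch under the same rbf-kPCA + 2-NN
# assumptions as the function above.
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import KernelPCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score


def kpca_knn_cv(X, y):
    pipe = make_pipeline(KernelPCA(kernel='rbf', gamma=10),
                         KNeighborsClassifier(n_neighbors=2))
    scores = cross_val_score(pipe, X, y, cv=10)
    return scores.mean(), scores.std()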
def rf_assemble():
    assemble_pos, assemble_neg, assemble_X, assemble_Y = read_label_data()
    k_pca = KernelPCA(n_components=3, kernel='rbf')
    k_pca.fit(assemble_X)
    assemble_X_reduced = k_pca.transform(assemble_X)

    rfr = RandomForestClassifier(n_estimators=200, max_depth=10,
                                 class_weight={0: 80})
    rfr.fit(assemble_X_reduced, assemble_Y)

    validate_pos, validate_neg, validate_X, validate_Y = read_label_validation()
    validate_X_reduced = k_pca.transform(validate_X)
    rfr_res = rfr.predict(validate_X_reduced)
    # precision_score expects (y_true, y_pred)
    print('precision : ', precision_score(validate_Y, rfr_res))

    validate_neg_reduced = k_pca.transform(validate_neg)
    print('miss classified : ',
          np.sum(np.abs(rfr.predict(validate_neg_reduced))))
def getKPCAcomp(dict_read):
    A = np.arange(10000)
    for key in dict_read.keys():
        if key <= 1000:
            [sample_rate, X] = dict_read.get(key)
            # if a song doesn't have 10000 features, pad it with zeros at
            # the end (this usually isn't the case)
            if len(X) < 10000:
                dif = 10000 - len(X)
                for i in range(dif):
                    X = np.hstack((X, 0.0))
            A = np.vstack((A, X[:10000]))
        else:
            break
    A = np.delete(A, 0, 0)  # drop the np.arange(10000) header row
    A = A.astype(float)
    kpca = KernelPCA(n_components=100, kernel="rbf")
    kpca.fit(A)
    A = kpca.transform(A)
    return A
import datetime as dt
import pickle

import numpy as np
from sklearn.decomposition import KernelPCA


def calc_kpca(xyz, kerneltype=None, title=None, n_comp=None):
    n_dim = np.prod(xyz.shape[1:])
    result = []
    if kerneltype is None:
        klist = ['linear', 'poly', 'rbf', 'sigmoid', 'cosine']
    else:
        klist = [kerneltype]
    for ktype in klist:
        kpca = KernelPCA(kernel=ktype, n_components=n_comp)
        st = dt.datetime.now()
        kpca.fit(xyz.reshape(len(xyz), n_dim))
        if title is not None:
            with open('kpca_%s_%s.dat' % (title, ktype), 'wb') as out:
                out.write(pickle.dumps(kpca))
        result.append(kpca)
    if kerneltype is None:
        return result
    else:
        return result[0]
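# Usage sketch (illustrative, not from the original source): xyz mimics a
# small trajectory array of shape (n_frames, n_atoms, 3); all sizes are made
# up.
import numpy as np

xyz_demo = np.random.RandomState(1).rand(40, 5, 3)
kpca_rbf = calc_kpca(xyz_demo, kerneltype='rbf', n_comp=4)
print(kpca_rbf.transform(xyz_demo.reshape(40, 15)).shape)  # -> (40, 4)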
def runwSkPCA(train_x, train_y, test_x, test_y):
    dim_list = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    score_list = []
    for dim in dim_list:
        pca = KernelPCA(n_components=dim, kernel="sigmoid")
        pca.fit(train_x)
        train_x_r = pca.transform(train_x)
        test_x_r = pca.transform(test_x)

        model = Sequential()
        model.add(Dense(500, input_shape=(train_x_r.shape[1],),
                        activation="relu"))  # 28*28=784
        model.add(Dropout(0.5))
        model.add(Dense(500, activation="relu"))
        model.add(Dropout(0.5))
        model.add(Dense(10))
        model.add(Activation("softmax"))

        sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
        model.compile(loss='categorical_crossentropy', optimizer=sgd,
                      metrics=['accuracy'])
        model.fit(train_x_r, train_y, batch_size=200, epochs=400,
                  shuffle=True, verbose=0, validation_split=0.1)

        print("start test")
        scores = model.evaluate(test_x_r, test_y, batch_size=200, verbose=1)
        print("The test loss is: " + str(scores[0]))
        print('Test accuracy:', str(scores[1]))
        score_list.append(scores[1])
        print("NN with " + str(dim) + "-dim sigmoid kPCA score: " +
              str(scores[1]))

    plt.plot(dim_list, score_list, color='red')
    plt.title("NN Score with sigmoid kernel PCA")
    plt.xlabel("Dimension")
    plt.ylabel("Score")
    plt.show()
import numpy
import pandas
from sklearn.decomposition import KernelPCA


class KPCA:
    def __init__(self, rfe_cv, *args, **kwargs):
        self.rfe = None
        self.rfe_cv = rfe_cv
        self.model = KernelPCA(*args, **kwargs)

    def fit(self, X, y):
        Z = numpy.concatenate([X, y.reshape(-1, 1)], axis=1)
        Z = numpy.array(Z, dtype=numpy.float32)
        Z[Z == numpy.inf] = numpy.nan
        Z[Z == -numpy.inf] = numpy.nan
        X_, y_ = (X[~pandas.isna(Z).any(axis=1), :],
                  y[~pandas.isna(Z).any(axis=1)])
        if X_.shape[0] != X.shape[0]:
            print('FIT: the sample contains NaNs, they were dropped\t'
                  'N of dropped NaNs: {0}'.format(X.shape[0] - X_.shape[0]))
        if self.rfe_cv:
            raise Exception("PCA could not be processed with RFE_CV")
        else:
            self.model.fit(X_)

    def predict(self, X):
        Z = numpy.concatenate([X], axis=1)
        Z = numpy.array(Z, dtype=numpy.float32)
        Z[Z == numpy.inf] = numpy.nan
        Z[Z == -numpy.inf] = numpy.nan
        nan_mask = ~pandas.isna(Z).any(axis=1)
        X_ = X[nan_mask, :]
        if X_.shape[0] != X.shape[0]:
            print('PREDICT: the sample contains NaNs, they were dropped\t'
                  'N of dropped NaNs: {0}'.format(X.shape[0] - X_.shape[0]))
        if self.rfe_cv:
            raise Exception("PCA could not be processed with RFE_CV")
        else:
            predicted = self.model.transform(X_)
            # re-insert NaN rows so the output aligns with the input rows
            Z = numpy.full(shape=(X.shape[0], predicted.shape[1]),
                           fill_value=numpy.nan, dtype=numpy.float64)
            Z[nan_mask, :] = predicted
            return Z
def getProjectionMatrixKPCA(dim=50):
    """Kernel PCA: see paper for a detailed description."""
    # Create an X for the hierarchy
    X = np.zeros((len(labelDict), len(labelDict)))
    for item in labelDict:
        pars = getPathToRoot(item)
        for par in pars:
            X[labelIndex[item]][labelIndex[par]] = 1
    kpca = KernelPCA(n_components=dim, fit_inverse_transform=True)
    # fit() returns the estimator itself, so X_kpca is just the fitted model
    X_kpca = kpca.fit(X)
    return kpca, kpca.alphas_
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import KernelPCA
from sklearn.utils import check_array


class KPCA(BaseEstimator, TransformerMixin):
    def __init__(self, kernel='linear', is_on=1):
        self.is_on = is_on
        self.kernel = kernel
        self.model = KernelPCA(kernel=self.kernel)

    def fit(self, X, y=None):
        if self.is_on == 1:
            X = check_array(X)
            self.model.fit(X)
            print("PCA fitted")
        return self

    def transform(self, X, y=None):
        if self.is_on == 1:
            X_new = self.model.transform(X)
            print("PCA transformed")
            return X_new
        else:
            return X
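# Usage sketch (illustrative, not from the original source): because the
# transformer above follows the fit/transform protocol, it drops into a
# scikit-learn Pipeline; the data and the LogisticRegression step are made up.
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

X_demo = np.random.RandomState(0).rand(60, 5)
y_demo = (X_demo[:, 0] > 0.5).astype(int)
pipe = Pipeline([('kpca', KPCA(kernel='rbf', is_on=1)),
                 ('clf', LogisticRegression())])
pipe.fit(X_demo, y_demo)
print(pipe.score(X_demo, y_demo))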
def runPCA(X_train, X_test, y_train, y_test, comp_range, Kernel):
    C = SVMmodel.getBestParam(Kernel)
    scores = []
    for n_comp in comp_range:
        print("\nn_comp=%d\n" % (n_comp))
        transformer = KernelPCA(n_components=n_comp, kernel=Kernel,
                                copy_X=True, n_jobs=8)
        transformer.fit(X_train)
        X_train_proj = transformer.transform(X_train)
        X_test_proj = transformer.transform(X_test)
        if n_comp == 2:
            np.save('X_train_proj_2d_' + Kernel, X_train_proj)
            np.save('X_test_proj_2d_' + Kernel, X_test_proj)
        score = SVMmodel.runSVM(X_train_proj, X_test_proj, y_train, y_test,
                                C, Kernel)
        scores.append(score.mean())
    print(scores)
    return scores
def DoKPCA(kernel, pcaData, varN=None):
    # do pca
    print('Task KPCA : START TIME:' +
          time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
    kpca = KernelPCA(varN, kernel=kernel, fit_inverse_transform=True)
    X_r = kpca.fit(pcaData).transform(pcaData)
    # print('explained variance ratio (first two components): %s'
    #       % str(kpca.explained_variance_ratio_))
    print(np.shape(X_r))
    print('Task KPCA : END TIME:' +
          time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
    return X_r
def test_kernel_pca_n_components():
    rng = np.random.RandomState(0)
    X_fit = rng.random_sample((5, 4))
    X_pred = rng.random_sample((2, 4))

    for eigen_solver in ("dense", "arpack"):
        for c in [1, 2, 4]:
            kpca = KernelPCA(n_components=c, eigen_solver=eigen_solver)
            shape = kpca.fit(X_fit).transform(X_pred).shape
            assert shape == (2, c)
def test_kernel_pca_n_components():
    rng = np.random.RandomState(0)
    X_fit = rng.random_sample((5, 4))
    X_pred = rng.random_sample((2, 4))

    for eigen_solver in ("dense", "arpack"):
        for c in [1, 2, 4]:
            kpca = KernelPCA(n_components=c, eigen_solver=eigen_solver)
            shape = kpca.fit(X_fit).transform(X_pred).shape
            assert_equal(shape, (2, c))
def create_model(params):
    kpca = KernelPCA(kernel=params['kernel']['ktype'],
                     n_components=params['n_components'])
    print('---------------------------------------')
    print('Kernel: {}'.format(params['kernel']['ktype']))
    ensemble_kernel.append(params['kernel']['ktype'])
    print('N comp: {}'.format(params['n_components']))
    ensemble_comp.append(params['n_components'])

    kpca.fit(x_train)
    train_img = kpca.transform(x_train)
    X_kpca = kpca.transform(x_valid)

    # Run Random Forest Classifier and plot result if n < 4
    validation_acc = run_rf(train_img, y_train, X_kpca, y_valid, "KPCA-RF ")
    ensemble_acc.append(validation_acc)  # save for later
    print('Val. Acc: {}'.format(validation_acc))
    return {'loss': -validation_acc, 'status': STATUS_OK, 'model': kpca}
def plot_state_space_3d(self, nmax=2000, kernal=None, interpol_n=10):
    from scipy.interpolate import interp1d
    import matplotlib.pyplot as plt
    from mpl_toolkits.mplot3d import Axes3D

    fig = plt.figure(figsize=(6, 6))
    ax = fig.add_subplot(111, projection='3d')
    if self.nx > 3:
        if kernal is None:
            from sklearn.decomposition import KernelPCA
            kernal = KernelPCA(n_components=3, fit_inverse_transform=True)
            kernal.fit(self.x)
        F = kernal.transform(self.x[:nmax])
    else:
        F = self.x[:nmax]
    f = interp1d(np.arange(F.shape[0]), F.T, kind='quadratic')
    out = f(np.arange(0, F.shape[0] - 1, 1 / interpol_n))
    ax.plot(out[0], out[1], out[2])
    # plt.show()
    return kernal
def Compute_var_ratio(HE_MI_train_test, kernel, invTran, degree):
    MyDataSet = HE_MI_train_test
    my_HEtraining = MyDataSet[0]
    my_MItraining = MyDataSet[1]
    my_HEtest = MyDataSet[2]
    my_MItest = MyDataSet[3]

    kpca = KernelPCA(kernel=kernel, fit_inverse_transform=invTran,
                     degree=degree)
    # fit() returns the same estimator every time, so the eigenvalues must
    # be read off immediately after each fit
    HE_train_kpca = kpca.fit(my_HEtraining)
    HE_train_var = HE_train_kpca.lambdas_
    MI_train_kpca = kpca.fit(my_MItraining)
    MI_train_var = MI_train_kpca.lambdas_
    HE_test_kpca = kpca.fit(my_HEtest)
    HE_test_var = HE_test_kpca.lambdas_
    MI_test_kpca = kpca.fit(my_MItest)
    MI_test_var = MI_test_kpca.lambdas_
    return [HE_train_var, MI_train_var, HE_test_var, MI_test_var]
def kernel_kpca():
    start_time = time.time()
    data_array_all, sample_number_list = read_csv(data_dir)
    shape = data_array_all.shape
    feature_num = shape[1]
    data_num = shape[0]
    # integer division: n_components must be an int under Python 3
    kpca = KernelPCA(n_components=feature_num // 8, kernel="rbf",
                     fit_inverse_transform=True, gamma=10)
    kpca.fit(data_array_all)
    dimension_reduction = kpca.transform(data_array_all)
    save_dir = './dr_results/{1}-kpca-{0}.csv'.format(feature_num // 8, target)
    save_csv_data(dir=save_dir, data=dimension_reduction,
                  sample_number_list=sample_number_list, name='kpca')
    # return dimension_reduction
    print("---kpca for {0} {1} seconds ---".format(target,
                                                   time.time() - start_time))
def kpca_run(kernel='linear'):
    pca = KernelPCA(n_components=2, kernel=kernel)
    pca_data = pca.fit(data).transform(data)
    fig, axs = plt.subplots(1, 1)
    axs.scatter(pca_data[:, 0], pca_data[:, 1], c=labels, cmap='rainbow')
    axs.set_xlabel('PC1')
    axs.set_ylabel('PC2')
    axs.set_title(kernel)
    plt.show()
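# Illustrative kernel comparison (not from the original source): on the
# two-moons toy data an rbf kernel tends to untangle the classes where the
# linear kernel cannot; the gamma value here is arbitrary.
import numpy as np
from sklearn.datasets import make_moons
from sklearn.decomposition import KernelPCA

X_moons, y_moons = make_moons(n_samples=200, noise=0.05, random_state=0)
for k in ('linear', 'rbf'):
    Z = KernelPCA(n_components=2, kernel=k, gamma=15).fit_transform(X_moons)
    print(k, Z.shape)  # -> (200, 2) for both; the embeddings differ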
class PCAKernel(PCAnalyzer):
    """Non-linear PCA as a wrapper over the SciKitLearn kernels."""

    def __init__(self, components, ktype='poly'):
        PCAnalyzer.__init__(self)
        if isinstance(components, int):
            self.n_components = components
        self.pca = KernelPCA(kernel=ktype, n_components=components)
        self.type = 'kernel'

    def solve(self, X):
        self.dim = np.prod(X.shape[1:])
        self.pca.fit(X.reshape(len(X), self.dim))
        self.trainsize = len(X)

    def project(self, X):
        if isinstance(X, list):
            X = np.array(X)
        dimX = np.prod(X.shape[1:])
        if dimX != self.dim:
            logging.error('Projection Error in KPCA: Cannot reshape/project'
                          ' %s size data using PC Vects of size, %s',
                          str(X.shape), str(self.dim))
            return None
        projection = self.pca.transform(X.reshape(len(X), dimX))
        return projection
def perform_kpca(input_data):
    '''
    Apply kernel PCA to the outlier-removed data, using the scikit-learn
    implementation.
    '''
    from sklearn.decomposition import KernelPCA

    # Specify the kernel function used in kernel PCA
    KERNEL = input('Enter the kernel of KernelPCA '
                   '(options are: cosine, rbf, linear, sigmoid): ')
    kpca = KernelPCA(n_components=len(input_data.T), kernel=KERNEL)

    # Scale the input dataset
    from sklearn.preprocessing import scale
    scld_input_data = scale(input_data, axis=0, with_mean=True,
                            with_std=True, copy=True)
    kpca.fit(scld_input_data)

    # Transform the dataset onto the given PCs
    kpca_input_data = kpca.transform(scld_input_data)

    # Percentage of variance represented by each eigenvalue
    Kpca_percent = np.array([kpca.lambdas_[y] / sum(kpca.lambdas_)
                             for y in range(len(kpca.lambdas_))])
    Var_explained = np.c_[Kpca_percent.reshape(len(Kpca_percent), 1)]
    print('\nVariance explained by eigenvalues of KPCA')
    print(['Kpca'])
    print(Var_explained)
    return kpca_input_data
def test_kernel_pca_sparse():
    rng = np.random.RandomState(0)
    X_fit = sp.csr_matrix(rng.random_sample((5, 4)))
    X_pred = sp.csr_matrix(rng.random_sample((2, 4)))

    for eigen_solver in ("auto", "arpack"):
        for kernel in ("linear", "rbf", "poly"):
            # transform fit data
            kpca = KernelPCA(4, kernel=kernel, eigen_solver=eigen_solver,
                             fit_inverse_transform=False)
            X_fit_transformed = kpca.fit_transform(X_fit)
            X_fit_transformed2 = kpca.fit(X_fit).transform(X_fit)
            assert_array_almost_equal(np.abs(X_fit_transformed),
                                      np.abs(X_fit_transformed2))

            # transform new data
            X_pred_transformed = kpca.transform(X_pred)
            assert_equal(X_pred_transformed.shape[1],
                         X_fit_transformed.shape[1])
def test_leave_zero_eig():
    """This test checks that fit().transform() returns the same result as
    fit_transform() in case of non-removed zero eigenvalue.

    Non-regression test for issue #12141 (PR #12143).
    """
    X_fit = np.array([[1, 1], [0, 0]])

    # Assert that even with all np warnings on, there is no div by zero warning
    with pytest.warns(None) as record:
        with np.errstate(all='warn'):
            k = KernelPCA(n_components=2, remove_zero_eig=False,
                          eigen_solver="dense")
            # Fit, then transform
            A = k.fit(X_fit).transform(X_fit)
            # Do both at once
            B = k.fit_transform(X_fit)
            # Compare
            assert_array_almost_equal(np.abs(A), np.abs(B))

    for w in record:
        # There might be warnings about the kernel being badly conditioned,
        # but there should not be warnings about division by zero.
        # (Numpy division by zero warning can have many message variants, but
        # at least we know that it is a RuntimeWarning so lets check only this)
        assert not issubclass(w.category, RuntimeWarning)
def kpca(Y, k, params):
    """KPCA driver.

    Runs KPCA on the input data matrix and UPDATES the KPCA parameters
    given by the user.

    See
    [1] B. Schoelkopf, A. Smola, and K. R. Muller. "Nonlinear component
        analysis as a kernel eigenvalue problem", Neural Computation,
        vol. 10, pp. 1299-1319, 1998
    for technical details on KPCA.

    Parameters:
    -----------
    Y : numpy array, shape = (N, D)
        Input matrix of D N-dimensional signals.

    k : int
        Compute k KPCA components.

    params : KPCAParam instance
        KPCA parameters. Upon completion, the params is updated. The
        following fields are set:

        _data : numpy.array, shape = (N, D) - Original data
        _A    : numpy.array, shape = (N, k) - KPCA weight matrix
        _l    : numpy.array, shape = (k,)   - Eigenvalues of kernel matrix

        The following fields need to be set already:

        _kPar : Kernel parameters (depends on the kernel)
        _kFun : Kernel function (depends on the kernel)

        Since the kernel will be called internally, the kernel parameters
        will also be updated (see kernel documentation).

    Returns:
    --------
    Xhat : numpy array, shape (k, D)
        NLDS state parameters.
    """
    if not isinstance(params, KPCAParam):
        raise ErrorDS('wrong KPCA parameters!')
    if params._kPar is None or params._kFun is None:
        raise ErrorDS('KPCA not properly configured!')

    # save data
    params._data = Y

    # calls kernel fun
    params._kFun(Y, Y, params._kPar)

    kpcaObj = KernelPCA(kernel="precomputed")
    kpcaObj.fit(params._kPar._kMat)

    params._A = kpcaObj.alphas_[:, 0:k]
    params._l = kpcaObj.lambdas_[0:k]

    if np.any(np.where(kpcaObj.lambdas_ <= 0)[0]):
        dsinfo.warn("some unselected eigenvalues are negative!")
    if np.any(np.where(params._l < 0)[0]):
        dsinfo.warn("some eigenvalues are negative!")

    # normalize KPCA weight vectors
    normalize(params._A, params._l)

    # project onto the weight vectors: a matrix product (elementwise `*`
    # only broadcasts for np.matrix inputs)
    return np.dot(params._A.T, params._kPar._kMat)
# nmi_0 = nmi_revised(gold, labels_predict)
# nmi_1 = nmi_revised(gold, labels_fs1_predict)
# nmi_2 = nmi_revised(gold, labels_fs2_predict)
print(nmi_0, nmi_1, nmi_2)

# PCA
gamma = 1.0 / (2 * sigma_rbf**2)
degree = 3
color = ['b', 'r', 'g', 'm', 'y', 'k']

kpca_0 = KernelPCA(n_components=kpca_num, kernel='rbf', gamma=gamma,
                   degree=degree)
kpca_0.fit(data)
kpca_data = kpca_0.fit_transform(data)
fig = plt.figure(4)
if kpca_num == 2:
    for i in range(len(labels_predict)):
        plt.scatter(kpca_data[i, 0], kpca_data[i, 1],
                    c=color[labels_predict[i]], marker='o')
else:
    ax = fig.add_subplot(111, projection='3d')
    for i in range(len(labels_predict)):
        ax.scatter(kpca_data[i, 0], kpca_data[i, 1], kpca_data[i, 2],
                   c=color[labels_predict[i]], marker='o')

kpca_1 = KernelPCA(n_components=kpca_num, kernel='rbf', gamma=gamma,
                   degree=degree)
kpca_1.fit(data_fs1)
kpca_data_fs1 = kpca_1.transform(data_fs1)
rbf_svc = svm.SVC(kernel='rbf', gamma=0.00005, C=50).fit(X, y)
# poly_svc = svm.SVC(kernel='poly', degree=3, C=C).fit(X, y)
lin_svc = svm.LinearSVC(C=C).fit(X, y)

for i, clf in enumerate((rbf_svc, lin_svc)):
    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    z = clf.score(X_test, Y_test)
    if clf == rbf_svc:
        print("RBF", z)
    else:
        print("Linear", z)

svm.SVC(kernel='rbf', gamma=0.00005, C=50).fit(X_train, Y_train.ravel())

pca = PCA(n_components=20)
pca.fit(X_testing)
print(pca.explained_variance_ratio_)
X_testing = pca.transform(X_testing)

Y_test = clf.predict(X_testing)
Y_testing = np.zeros((len(Y_test), 3))
for i in range(0, len(Y_test)):
    if Y_test[i] == 0:
        Y_testing[i, 0] = 1
    elif Y_test[i] == 1:
        Y_testing[i, 1] = 1
    elif Y_test[i] == 3:
        Y_testing[i, 2] = 1
Y_testing
gamma = 1 / (2 * sigma**2)

if (0):
    #%% K-PCA
    # Calculate accumulated variance
    kpca = KernelPCA(kernel="rbf", gamma=gamma)
    kpca.fit_transform(Xtrain)
    eigenvals = kpca.lambdas_[0:220]

    # Calculate classification scores for each component count
    nComponents = np.linspace(1, 500, 100, endpoint=True)
    kpcaScores = np.zeros((5, len(nComponents)))
    kpca = KernelPCA(n_components=Ntrain, kernel="rbf", gamma=gamma)
    kpca.fit(Xtrain)
    XtrainT = kpca.transform(Xtrain)
    XtestT = kpca.transform(Xtest)
    for i in range(len(nComponents)):
        # linspace yields floats; slice bounds must be ints
        n = int(nComponents[i])
        kpcaScores[:, i] = util.classify(XtrainT[:, :n], XtestT[:, :n],
                                         labelsTrain, labelsTest)

    #%% Plot accuracies for kPCA
    plt.figure()
    for i in range(5):
        plt.plot(nComponents, kpcaScores[i, :], lw=3)
    plt.xlim(1, np.amax(nComponents))
    plt.title('kPCA accuracy')
    plt.xlabel('Number of components')
# mtr_l[j,i] = mtr_l[i,j]
# eig_val, eig_vec = np.linalg.eig(mtr_l)

clf = SpectralClustering(n_clusters=K, affinity='precomputed')
clf.fit(affinity)
labels_predict = clf.labels_
draw_similarity_matrix(affinity, labels_predict, K)

# PCA
gamma = 1.0 / (2 * sigma_rbf**2)
degree = 3
color = ['b', 'r', 'g', 'm', 'y', 'k']

kpca_2 = KernelPCA(n_components=2, kernel='rbf', gamma=gamma, degree=degree)
kpca_2.fit(data_use)
kpca_2_data = kpca_2.fit_transform(data_use)
fig = plt.figure(1)
for i in range(len(labels_predict)):
    plt.scatter(kpca_2_data[i, 0], kpca_2_data[i, 1],
                c=color[labels_predict[i]], marker='o')

kpca_3 = KernelPCA(n_components=3, kernel='rbf', gamma=gamma, degree=degree)
kpca_3.fit(data_use)
kpca_3_data = kpca_3.fit_transform(data_use)
fig = plt.figure(2)
ax = fig.add_subplot(111, projection='3d')
for i in range(len(labels_predict)):
    ax.scatter(kpca_3_data[i, 0], kpca_3_data[i, 1], kpca_3_data[i, 2],
               c=color[labels_predict[i]], marker='o')
class FeatureSet(object):
    def __init__(self, index_dir,
                 allowed_terms=None,             # list of allowed terms which will be used (needed for testing)
                 disallowed_terms=None,          # list of disallowed terms which will be ignored
                 ft_number_of_words=False,       # use number of regular words as feature
                 ft_number_of_hash_tags=False,   # use number of hash-tags as feature
                 ft_number_of_user_names=False,  # use number of twitter user names as feature
                 ft_number_of_bad_words=False,   # use number of bad words as feature
                 ft_number_of_links=False,       # use number of links as feature
                 ft_number_of_nes=False,         # use number of named entities as feature
                 ft_number_of_punct=False,       # use number of punctuation tokens as feature
                 ft_emoticons=False,             # use emoticon counts as feature
                 ft_total_hate_score=False,      # use total hate score as feature
                 ft_terms_binary=False,          # use vector space model with binary function as feature
                 ft_terms_tf=False,              # use vector space model with frequency function as feature
                 ft_terms_tfidf=False,           # use vector space model with tfidf function as feature
                 ft_scale=False,
                 terms_max_df=0.5,               # max document frequency in feature selection (normalized)
                 terms_min_df=50,                # min document frequency in feature selection
                 tfidf_model=None,
                 pca=False,                      # apply pca to output vector
                 pca_model=None,
                 data_n7_dir=N7_DATA_DIR,
                 dtype=np.float32,
                 verbose=False):
        logging.info("GENERATING MODEL")
        self.tweet_id_index_map = dict()
        self.index_tweet_id_map = dict()
        self.index = TextIndex(index_dir)
        self.full_index = TextIndex(index_dir)
        self.full_index.load_terms(0, 1.0)
        self.searcher = Searcher(self.index, terms_min_df, terms_max_df)
        self.verbose = verbose
        self.allowed_terms = allowed_terms
        self.disallowed_terms = disallowed_terms
        self.ft_number_of_words = ft_number_of_words
        self.ft_number_of_hash_tags = ft_number_of_hash_tags
        self.ft_number_of_user_names = ft_number_of_user_names
        self.ft_number_of_bad_words = ft_number_of_bad_words
        self.ft_number_of_nes = ft_number_of_nes
        self.ft_number_of_links = ft_number_of_links
        self.ft_total_hate_score = ft_total_hate_score
        self.ft_terms_binary = ft_terms_binary
        self.ft_terms_tf = ft_terms_tf
        self.ft_terms_tfidf = ft_terms_tfidf
        self.ft_scale = ft_scale
        self.ft_number_of_punct = ft_number_of_punct
        self.ft_emoticons = ft_emoticons
        self.terms_max_df = terms_max_df
        self.terms_min_df = terms_min_df
        self.data_n7_dir = data_n7_dir
        self.tfidf_model = tfidf_model
        self.pca = pca
        self.pca_model = pca_model
        self.dtype = dtype
        self.twitter = TwitterTextUtil()

        if self.allowed_terms:
            allowed_terms = dict()
            for term in self.allowed_terms:
                if term in self.index.term_id_map:
                    allowed_terms[self.index.term_id_map[term]] = term
            self.allowed_terms = allowed_terms
            if self.verbose:
                logging.info("ALLOWED TERMS: %r" % self.allowed_terms)

        # create <term id> :-> <vector index> map
        if ft_terms_binary or ft_terms_tf or ft_terms_tfidf:
            for term_id in self.index.id_term_map.iterkeys():
                if self.allowed_terms is not None:
                    if term_id not in self.allowed_terms:
                        continue
                if self.disallowed_terms is not None:
                    term = self.index.term_id_map.get(term_id)
                    if term in self.disallowed_terms:
                        continue
                new_index_value = len(self.tweet_id_index_map)
                self.index_tweet_id_map[new_index_value] = term_id
                self.tweet_id_index_map[term_id] = new_index_value
                if self.verbose:
                    print "ADDED: %d as %d" % (term_id, new_index_value)
            if self.verbose:
                print self.tweet_id_index_map
            print "\tMODEL: %d terms" % len(self.tweet_id_index_map)

        loader = Loader(data_n7_dir)
        if self.ft_number_of_bad_words:
            self.bad_words = loader.bad_words(add_hashtags=False)
            print "\tMODEL: %d bad words" % len(self.bad_words)

    def text_to_vector(self, text, allow_pca=True):
        tokens = self.index.tokenize(text)
        if self.verbose:
            print tokens
        return self.terms_to_vector(text, tokens, allow_pca=allow_pca)

    def terms_to_vector(self, text, terms, allow_pca=True):
        term_ids = []
        outputs = []
        # PREPROCESSING
        for term in terms:
            term_id = self.index.term_id_map.get(term)
            if term_id is not None:
                term_ids.append(term_id)
        if self.verbose:
            print term_ids
        if self.allowed_terms:
            term_ids = filter(lambda term_id: term_id in self.allowed_terms,
                              term_ids)
            if self.verbose:
                print term_ids
        # COMPUTING FEATURES
        if self.ft_terms_binary:
            bin_vector = self.__ft_bin_vector__(term_ids, terms,
                                                scale=self.ft_scale)
            if self.verbose:
                print "bin_vector", bin_vector
            outputs.append(bin_vector)
        if self.ft_terms_tf:
            tf_vector = self.__ft_tf_vector__(term_ids, terms,
                                              scale=self.ft_scale)
            if self.verbose:
                print "tf_vector", tf_vector
            outputs.append(tf_vector)
        if self.ft_terms_tfidf:
            tfifd_vector = self.__ft_tfidf_vector__(term_ids, terms,
                                                    scale=self.ft_scale)
            if self.verbose:
                print "tfifd_vector", tfifd_vector
            outputs.append(tfifd_vector)
        if self.ft_number_of_words:
            number_of_words = self.__ft_number_of_words__(term_ids, terms,
                                                          scale=self.ft_scale)
            if self.verbose:
                print "number_of_words", number_of_words
            outputs.append(number_of_words)
        if self.ft_number_of_hash_tags:
            number_of_hash_tags = self.__ft_number_of_hash_tags__(
                term_ids, terms, scale=self.ft_scale)
            if self.verbose:
                print "number_of_hash_tags", number_of_hash_tags
            outputs.append(number_of_hash_tags)
        if self.ft_number_of_user_names:
            number_of_user_names = self.__ft_number_of_user_names__(
                term_ids, terms, scale=self.ft_scale)
            if self.verbose:
                print "number_of_user_names", number_of_user_names
            outputs.append(number_of_user_names)
        if self.ft_number_of_links:
            number_of_links = self.__ft_number_of_links__(term_ids, terms,
                                                          scale=self.ft_scale)
            if self.verbose:
                print "number_of_links", number_of_links
            outputs.append(number_of_links)
        if self.ft_number_of_bad_words:
            number_of_bad_words = self.__ft_number_of_bad_words__(
                term_ids, terms, scale=self.ft_scale)
            if self.verbose:
                print "number_of_bad_words", number_of_bad_words
            outputs.append(number_of_bad_words)
        if self.ft_number_of_punct:
            number_of_punct = self.__ft_number_of_punct__(term_ids, terms,
                                                          scale=self.ft_scale)
            if self.verbose:
                print "number_of_punct", number_of_punct
            outputs.append(number_of_punct)
        if self.ft_emoticons:
            emoticons_vector = self.__ft_emoticons_vector__(
                term_ids, terms, scale=self.ft_scale)
            if self.verbose:
                print "emoticons_vector", emoticons_vector
            outputs.append(emoticons_vector)
        outputs = np.concatenate(outputs)
        if allow_pca and self.pca:
            print "PCA IS ALLOWED"
            outputs = np.asarray(self.pca_model.transform(outputs)).reshape(-1)
        if self.verbose:
            print outputs
        return outputs

    def __scale_array__(self, array):
        return 1 - 1 / (array + 1)

    def __ft_emoticons_vector__(self, term_ids, terms, scale=False):
        vector = np.zeros(2, dtype=self.dtype)
        for term in terms:
            if Sad_RE.match(term):
                vector[0] += 1
            if Happy_RE.match(term):
                # happy emoticons are counted in the second slot
                vector[1] += 1
        return vector

    def __ft_bin_vector__(self, term_ids, terms, scale=False):
        vector = np.zeros(len(self.tweet_id_index_map), dtype=self.dtype)
        for term_id in term_ids:
            term_index = self.tweet_id_index_map.get(term_id)
            if term_index is not None:
                vector[term_index] = 1
        return vector

    def __ft_tf_vector__(self, term_ids, terms, scale=False):
        vector = np.zeros(len(self.tweet_id_index_map), dtype=self.dtype)
        for term_id in term_ids:
            term_index = self.tweet_id_index_map.get(term_id)
            if term_index is not None:
                vector[term_index] += 1
        return vector

    def __ft_tfidf_vector__(self, term_ids, terms, scale=False):
        vector = np.zeros(len(self.tweet_id_index_map), dtype=self.dtype)
        for term_id in term_ids:
            term_index = self.tweet_id_index_map.get(term_id)
            if term_index is not None:
                vector[term_index] += 1
        vector = self.tfidf_model.transform(vector).toarray()[0]
        return vector

    def __ft_number_of_words__(self, term_ids, terms, scale=False):
        nw = np.zeros(1, dtype=self.dtype)
        for term in terms:
            is_word = True
            if self.twitter.is_hashtag(term):
                is_word = False
            if self.twitter.is_link(term):
                is_word = False
            if self.twitter.is_username(term):
                is_word = False
            if self.twitter.is_punct(term):
                is_word = False
            if self.verbose:
                if is_word:
                    print "%s is a word" % term
                else:
                    print "%s is not a word" % term
            if is_word:
                nw[0] += 1
        return self.__scale_array__(nw) if scale else nw

    def __ft_number_of_hash_tags__(self, term_ids, terms, scale=False):
        nw = np.zeros(1, dtype=self.dtype)
        for term in terms:
            if self.twitter.is_hashtag(term):
                nw[0] += 1
        return self.__scale_array__(nw) if scale else nw

    def __ft_number_of_user_names__(self, term_ids, terms, scale=False):
        nw = np.zeros(1, dtype=self.dtype)
        for term in terms:
            if self.twitter.is_username(term):
                nw[0] += 1
        return self.__scale_array__(nw) if scale else nw

    def __ft_number_of_links__(self, term_ids, terms, scale=False):
        nw = np.zeros(1, dtype=self.dtype)
        for term in terms:
            if self.twitter.is_link(term):
                if self.verbose:
                    print "%s is a link" % term
                nw[0] += 1
        return self.__scale_array__(nw) if scale else nw

    def __ft_number_of_punct__(self, term_ids, terms, scale=False):
        nw = np.zeros(1, dtype=self.dtype)
        for term in terms:
            if self.twitter.is_punct(term):
                nw[0] += 1
        return self.__scale_array__(nw) if scale else nw

    def __ft_number_of_bad_words__(self, term_ids, terms, scale=False):
        nw = np.zeros(1, dtype=self.dtype)
        for term in terms:
            for bad_w in self.bad_words:
                if len(bad_w) > 3:
                    if bad_w in term:
                        if self.verbose:
                            print "%s is bad word" % term
                        nw[0] += 1
                        break
                else:
                    if bad_w == term:
                        if self.verbose:
                            print "%s is bad word" % term
                        nw[0] += 1
                        break
        return self.__scale_array__(nw) if scale else nw

    def fit_pca(self, X, n_components=64, kernel="sigmoid"):
        self.pca_model = KernelPCA(n_components=n_components, kernel=kernel)
        logging.info("FITTING PCA(%s-%d) MODEL FROM %d EXAMPLES"
                     % (kernel, n_components, X.shape[0]))
        self.pca_model.fit(X)
        logging.info("FITTING DONE")

    def fit_pca_from_index(self, training_examples=10, n_components=64,
                           kernel="sigmoid"):
        X = self.fm_from_index(training_examples)
        self.fit_pca(X, n_components, kernel)

    def fit_tfidf(self, X):
        self.tfidf_model = FeatureSet.do_fit_tfidf(X)

    @staticmethod
    def do_fit_tfidf(X):
        tfidf_model = TfidfTransformer()
        logging.info("FITTING TFIDF MODEL FROM %d EXAMPLES" % X.shape[0])
        tfidf_model.fit(X)
        logging.info("FITTING DONE")
        return tfidf_model

    def fit_tfidf_from_index(self, training_examples=10):
        X = self.fm_from_index(training_examples)
        self.fit_tfidf(X)

    def save_pca_model(self, file_path=None):
        if file_path is None:
            file_path = "%s/models/model_tfidf.pkl" % self.data_n7_dir
        else:
            file_path = "%s/models/%s" % (self.data_n7_dir, file_path)
        joblib.dump(self.pca_model, file_path, compress=9)

    def load_pca_model(self, file_path=None):
        if file_path is None:
            file_path = "%s/models/model_tfidf.pkl" % self.data_n7_dir
        else:
            file_path = "%s/models/%s" % (self.data_n7_dir, file_path)
        self.pca_model = joblib.load(file_path)
        logging.info("LOADED PCA MODEL %r" % self.pca_model)

    def save_tfidf_model(self, file_path=None):
        if file_path is None:
            file_path = "%s/models/model_tfidf.pkl" % self.data_n7_dir
        else:
            file_path = "%s/models/%s" % (self.data_n7_dir, file_path)
        joblib.dump(self.tfidf_model, file_path, compress=9)

    def load_tfidf_model(self, file_path=None):
        if file_path is None:
            file_path = "%s/models/model_tfidf.pkl" % self.data_n7_dir
        else:
            file_path = "%s/models/%s" % (self.data_n7_dir, file_path)
        self.tfidf_model = joblib.load(file_path)
        logging.info("LOADED TFIDF MODEL %r" % self.tfidf_model)

    def fm_from_tokens(self, i_tokens, training_examples=10):
        v_size = len(self.text_to_vector("", allow_pca=False))
        logging.info("INITIALIZING %dx%d MATRIX"
                     % (training_examples, v_size))
        X = np.zeros((training_examples, v_size), dtype=self.dtype)
        # X = matrix((training_examples, v_size), dtype=self.dtype)
        i = 0
        for tokens in i_tokens:
            f_vect = self.terms_to_vector(None, tokens, allow_pca=False)
            print "EXTRACTED %d/%d" % (i, training_examples)
            X[i, :] = f_vect
            i += 1
            if i >= training_examples:
                break
        if self.pca:
            print "APPLYING PCA %r" % self.pca_model
            X = self.pca_model.transform(X)
        if self.verbose:
            print X
        return X

    def fm_from_index(self, training_examples=10):
        i_verctors = imap(lambda x: x[1], self.searcher.iterate())
        i_tokens = imap(lambda v: [self.full_index.id_term_map[tid]
                                   for tid in v], i_verctors)
        return self.fm_from_tokens(i_tokens,
                                   training_examples=training_examples)

    @staticmethod
    def save_fm(X, file_path=None, sparse=False):
        if file_path is None:
            file_path = "%s/models/X.pkl" % N7_DATA_DIR
        else:
            file_path = "%s/models/%s" % (N7_DATA_DIR, file_path)
        if sparse:
            logging.info("CONVERTING TO SPARSE REPRESENTATION")
            X = sparse_matrix(X, dtype=X.dtype)
        logging.info("SAVING FEATURE MATRIX %r -> %s" % (X.shape, file_path))
        joblib.dump(X, file_path, compress=9)

    @staticmethod
    def load_fm(file_path=None):
        if file_path is None:
            file_path = "%s/models/X.pkl" % N7_DATA_DIR
        else:
            file_path = "%s/models/%s" % (N7_DATA_DIR, file_path)
        X = joblib.load(file_path)
        return X

    def load_from_csv(self, file_paths, labeled=True):
        Y = []
        texts = []
        i = 1
        for fl_path in file_paths:
            reader = csv.reader(open(fl_path, "rb"))
            for row in reader:
                text = row[-1]
                texts.append(text)
                if labeled:
                    cl = row[0]
                    if cl == "?":
                        Y.append(0)
                    else:
                        if int(cl) > 0:
                            Y.append(1)
                        else:
                            Y.append(0)
                i += 1
        i_tokens = imap(self.index.tokenize, texts)
        X = self.fm_from_tokens(i_tokens, 500)
        if labeled:
            return X, Y
        return X

    def info(self):
        pass
pca_X_new = pca.fit_transform(X)
print('pca explained', pca.explained_variance_ratio_)
print('pca explained sum', sum(pca.explained_variance_ratio_))
joblib.dump(pca_model, 'pca_model.pkl')
joblib.dump(pca_X_new, 'pca_X_new.pkl')
print(pca_model)

sparse_pca = SparsePCA(n_components=50)
# fit and transform with the SparsePCA estimator itself
sparse_pca_model = sparse_pca.fit(sparse_pca_data)
sparse_pca_X_new = sparse_pca.fit_transform(X)
joblib.dump(sparse_pca_model, 'sparse_pca_model.pkl')
joblib.dump(sparse_pca_X_new, 'sparse_pca_X_new.pkl')
print(sparse_pca_model)

kernel_pca = KernelPCA(n_components=50)
kernel_pca_model = kernel_pca.fit(kernel_pca_data)
kernel_X_new = kernel_pca.fit_transform(X)
joblib.dump(kernel_pca_model, 'kernel_pca_model.pkl')
joblib.dump(kernel_X_new, 'kernel_X_new.pkl')

fast_ica = FastICA(n_components=None)
fast_ica_start = time.time()
fast_ica_model = fast_ica.fit(fast_ica_data)
fast_ica_end = time.time()
print('fast_ica fit time', fast_ica_end - fast_ica_start)
fast_ica_X_new = fast_ica.transform(X)
joblib.dump(fast_ica_model, 'fast_ica_model.pkl')
joblib.dump(fast_ica_X_new, 'fast_ica_X_new.pkl')
print(fast_ica_model)
'''
# SYNOPSIS:
#   python cmd_model_index.py <n training examples for kPCA> \
#                             <input matrix name> <output model name>

import sys
import logging

from n7.model import FSetLoader
from sklearn.decomposition import KernelPCA

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    kpca_size = int(sys.argv[1]) if len(sys.argv) > 1 else 15000
    input_matrix_name = sys.argv[2] if len(sys.argv) > 2 else "X_tfidf.pkl"
    output_model_name = sys.argv[3] if len(sys.argv) > 3 else "model_kpca.pkl"

    loader = FSetLoader()
    X = loader.load_model(input_matrix_name)[0:kpca_size, :]

    model = KernelPCA(n_components=128, kernel="sigmoid")
    logging.info("FITTING PCA on %dx%d examples" % (X.shape[0], X.shape[1]))
    model.fit(X.toarray())
    logging.info("FITTING DONE: %r" % model)

    loader.save_model(model, output_model_name)
    loader.save_model(model.lambdas_, output_model_name + ".ev")
predictions = np.concatenate(predictions, axis=0)
acc = sum([prd == sv for (prd, sv) in zip(predictions, titanic["Survived"])]) / float(len(predictions))
print("[Adaboost: Perceptron-RF-GB-LinearSVM-KNN] {0:.2f}%".format(100 * acc))

titanic_test = read_dataset(TEST_PATH)
predictions = adaboost_predict(beta, algs, titanic_test)
submission = pd.DataFrame({"PassengerId": titanic_test["PassengerId"],
                           "Survived": predictions})
submission.to_csv(SUBMISSION_PATH, index=False)

# KernelPCA
titanic_test = read_dataset(TEST_PATH)
p = ["Pclass", "Sex", "Age", "Fare", "FamilySize"]
# pairwise_plot(titanic[p], titanic["Survived"])
kpca = KernelPCA(kernel="poly", tol=1e-3, gamma=100)
T_kpca = kpca.fit(titanic_test[p])
X_kpca = kpca.transform(titanic[p])
print("Found {0} columns".format(len(X_kpca[0])))

N = len(X_kpca)
fN = 10
X_kpca = pd.DataFrame({'kpca' + str(j + 1):
                       pd.Series(stats.zscore([X_kpca[i][j] for i in range(N)]),
                                 index=range(N))
                       for j in range(fN)})
# pairwise_plot(X_kpca, titanic["Survived"])

alg = RandomForestClassifier(random_state=1, n_estimators=50,
                             min_samples_split=2, min_samples_leaf=2)
alg.fit(X_kpca, titanic["Survived"])
pred = alg.predict(X_kpca)
acc = sum([prd == sv for (prd, sv) in zip(pred, titanic["Survived"])]) / float(len(pred))
print("[KernelPCA] {0:.2f}%".format(100 * acc))
scores = cross_validation.cross_val_score(alg, X_kpca, titanic["Survived"], cv=10)
print("[KernelPCA-CV] {0:.2f}%".format(100 * scores.mean()))
print('... loading FOLD %d' % fold_id)
fold = pickle.load(open(DATADIR + '/pkl/fold%d_normed.pkl' % (fold_id), "rb"))

X_train, y_train, id_train = load_X_from_fold_to_3dtensor(fold, 'train', NUM_OUTPUT)
X_test, y_test, id_test = load_X_from_fold_to_3dtensor(fold, 'test', NUM_OUTPUT)

X_concat_train = np.reshape(X_train, (X_train.shape[0] * X_train.shape[1], X_train.shape[2]), order='C')
X_concat_test = np.reshape(X_test, (X_test.shape[0] * X_test.shape[1], X_test.shape[2]), order='C')

np.random.seed(321)
perm = np.random.permutation(X_concat_train.shape[0])
subset_ind = perm[0:max_nb_samples]
X_concat_train_SUBSET = X_concat_train[subset_ind]

start_time = time.time()
pca_model = pca.fit(X_concat_train_SUBSET)
print("--- kPCA fitting: %.2f seconds ---" % (time.time() - start_time))

start_time = time.time()
pca_X_concat_train = pca_model.transform(X_concat_train)
print("--- kPCA transforming TRAIN: %.2f seconds ---" % (time.time() - start_time))

start_time = time.time()
pca_X_concat_test = pca_model.transform(X_concat_test)
print("--- kPCA transforming TEST: %.2f seconds ---" % (time.time() - start_time))

print('dims: ', pca_X_concat_train.shape, pca_X_concat_test.shape)
new_dim = pca_X_concat_train.shape[1]

X_train = np.reshape(pca_X_concat_train, (X_train.shape[0], X_train.shape[1], new_dim), order='C')
X_test = np.reshape(pca_X_concat_test, (X_test.shape[0], X_test.shape[1], new_dim), order='C')
# standardize test data
melody_concat_test = np.reshape(melody_test, (melody_test.shape[0] * melody_test.shape[1], melody_test.shape[2]), order='C')
melody_concat_test_normed, _ = standardize(melody_concat_test, scaler)
# print(concat_test_normed.shape)
melody_test_normed = np.reshape(melody_concat_test_normed, (melody_test.shape[0], melody_test.shape[1], melody_test.shape[2]), order='C')
del melody_concat_test, melody_concat_test_normed

# concat with the other features
X_train = np.concatenate((X_train, melody_train_normed), axis=2)
X_test = np.concatenate((X_test, melody_test_normed), axis=2)

if usePCA:
    X_concat_train = np.reshape(X_train, (X_train.shape[0] * X_train.shape[1], X_train.shape[2]), order='C')
    X_concat_test = np.reshape(X_test, (X_test.shape[0] * X_test.shape[1], X_test.shape[2]), order='C')

    pca_model = pca.fit(X_concat_train)
    pca_X_concat_train = pca_model.transform(X_concat_train)
    pca_X_concat_test = pca_model.transform(X_concat_test)
    print('dims: ', pca_X_concat_train.shape, pca_X_concat_test.shape)

    reduced_dim = pca_X_concat_train.shape[1]
    X_train = np.reshape(pca_X_concat_train, (X_train.shape[0], X_train.shape[1], reduced_dim), order='C')
    X_test = np.reshape(pca_X_concat_test, (X_test.shape[0], X_test.shape[1], reduced_dim), order='C')

# print(id_test.shape)
# X_train = X_train[0:100,:,:]
# y_train = y_train[0:100,:,:]
# X_train = X_train[:,[10,12,13,17,19,82,83,84,85,89,90,91,103,140,142,146,148,212,214,218,220]]
# X_test = X_test[:,[10,12,13,17,19,82,83,84,85,89,90,91,103,140,142,146,148,212,214,218,220]]
# X_train = X_train[:,[13,85,103,142,214]]
# X_test = X_test[:,[13,85,103,142,214]]
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from plot import plotGraph, plotGraph3D
from sklearn.decomposition import PCA, KernelPCA

train_data = pd.read_csv('datasets/train.csv')
train_pts = train_data.drop('Activity', axis=1)
train_labels = train_data['Activity']

# test_data = pd.read_csv('datasets/test.csv')
# test_pts = test_data.drop('Activity', axis=1)
# test_labels = test_data['Activity']

pca = KernelPCA(n_components=100)
train_pca = pca.fit(train_pts, train_labels)

y = train_pca.lambdas_
x = range(1, 101)
plt.plot(x, y)
plt.xlabel("No. of components")
plt.ylabel("Eigen values")
plt.title("Data preserved w.r.t. no. of components")
plt.show()

# comp = []
# for i in range(0, 100):
#     comp.append('comp' + str(i))
# pca = KernelPCA(n_components=100, kernel='rbf', gamma=0.1)
# train_pca = pca.fit_transform(train_pts, y=train_labels)
# train_pca = train_pca.tolist()
# print(type(train_pca))