def run_pca_clustering(self):
    """Run PCA on the background-free CARS difference data and display the
    first component in the cluster image/spectrum views.

    Reads the component count and whiten flag from the UI, fits PCA on
    (s2 - s1) of the flattened 4-D data, stores the scores (W) reshaped to
    image dimensions and the loadings (H), then resets the component chooser.
    """
    from sklearn.decomposition import PCA as pca

    comp_no_required = int(self.pca_cluster_comp_no.text())
    bkfreecars_2d = anscombe.tran4d_2d(self.bkfreecars)

    # BUG FIX: the original tested `self.pca_cluster_whiten.checkState is True`
    # (and `... is False`), comparing the bound *method object* to a bool.
    # Neither comparison can ever be True, so no branch ran and `pca_model`
    # was undefined (NameError).  Query the checkbox state properly.
    # NOTE(review): assumes the widget is a Qt checkbox — confirm.
    pca_model = pca(n_components=comp_no_required,
                    whiten=self.pca_cluster_whiten.isChecked())

    self.pca_clusterW = pca_model.fit_transform(bkfreecars_2d.s2 - bkfreecars_2d.s1)
    self.pca_clusterH = pca_model.components_

    # Reshape the score vectors back into (rows, cols, components) so each
    # component can be shown as an image.
    yi, xi, zi, si = self.bkfreecars.shape
    self.pca_clusterW = np.reshape(self.pca_clusterW, (yi, xi, comp_no_required))

    # Reset the component selector and show component 1.
    self.pca_cluster_choose.setValue(1)
    self.pca_cluster_choose.setMinimum(1)
    self.pca_cluster_choose.setMaximum(comp_no_required)
    self.pca_cluster_image_win.setImage(self.pca_clusterW[:, :, 0])
    self.pca_cluster_spectrum_win.plot(self.pca_clusterH[:, 0])
def test_pca(self):
    """Fetch the full feature tensor for the Bellbird_TMI database and time
    a 50-component PCA over its raw data."""
    django.setup()
    from koe.models import Feature, Aggregation, FullTensorData, Database
    from koe.ts_utils import bytes_to_ndarray, get_rawdata_from_binary

    database = Database.objects.get(name='Bellbird_TMI')

    # The hashes identify which feature/aggregation set the tensor was
    # built from (ordered id lists joined with '-').
    feature_ids = Feature.objects.all().order_by('id').values_list('id', flat=True)
    aggregation_ids = Aggregation.objects.all().order_by('id').values_list('id', flat=True)
    features_hash = '-'.join(str(fid) for fid in feature_ids)
    aggregations_hash = '-'.join(str(aid) for aid in aggregation_ids)

    full_tensor = FullTensorData.objects.filter(
        database=database,
        features_hash=features_hash,
        aggregations_hash=aggregations_hash).first()
    if full_tensor is None:
        raise Exception('Tensor not found')

    # Load the ids and the raw tensor bytes from disk.
    sids = bytes_to_ndarray(full_tensor.get_sids_path(), np.int32)
    full_data = get_rawdata_from_binary(full_tensor.get_bytes_path(), len(sids))

    with tictoc('PCA'):
        dim_reduce_func = pca(n_components=50)
        dim_reduce_func.fit_transform(full_data)
def select(self, n, n_components=20):
    """Select the ``n`` tickers with the largest absolute PCA loadings.

    Drops unlisted stocks (columns with NaNs), fits a PCA with
    ``n_components`` components, weights each component by its explained
    variance ratio, and ranks stocks by the magnitude of the resulting
    overall loading.

    Returns a list of ticker names.
    """
    # Drop stocks that are not listed over the whole sample.
    self.df = self.df.dropna(axis=1, how='any')
    ticker_name = self.df.columns

    # Fit the PCA on the raw values.
    sample = self.df.values
    model = pca(n_components=n_components)
    model.fit(sample)

    # Overall loading per stock = variance-ratio-weighted combination of the
    # component loadings: (k,) @ (k, p) -> (p,).
    load_arr = np.asarray(
        np.matrix(model.explained_variance_ratio_) * np.matrix(model.components_)
    ).reshape(-1)

    # BUG FIX: the original built a {loading-value: column-index} dict keyed
    # on float values, which silently collapses stocks whose loadings are
    # exactly equal.  Ranking indices directly via argsort on |loading|
    # yields the same ordering without that collision.
    top_idx = np.argsort(-np.abs(load_arr))[:n]
    return [ticker_name[i] for i in top_idx]
def cars_pca(cars, num_comp, gauss_std=0):
    """Denoise CARS data by PCA truncation in Anscombe-transformed space.

    Applies the Anscombe transform, keeps ``num_comp`` principal components
    of each signal (s1 and s2), reconstructs, inverts the Anscombe
    transform, and returns the reshaped 4-D result.
    """
    cars_ansc = cars_anscombe(cars, gauss_std)
    flat = tran4d_2d(cars_ansc)
    shape4d = (flat.a, flat.b, flat.c, flat.d)

    model = pca(n_components=int(num_comp))

    # Project each signal onto the leading components, then reconstruct.
    truncated = EmptyClass()
    truncated.s1 = np.reshape(
        model.inverse_transform(model.fit_transform(flat.s1)), shape4d)
    truncated.s2 = np.reshape(
        model.inverse_transform(model.fit_transform(flat.s2)), shape4d)

    # Undo the variance-stabilizing transform and restore the 4-D layout.
    inv = cars_invanscombe(truncated, gauss_std)
    pcacars = EmptyClass()
    pcacars.s1 = np.reshape(inv.s1, shape4d)
    pcacars.s2 = np.reshape(inv.s2, shape4d)
    return pcacars
def plot_data(self, samples, fig_name):
    """Scatter-plot the samples in 2-D PCA space, coloured by the model's
    inlier/outlier predictions, and save the figure under a timestamped
    name."""
    fig = plt.figure()

    normalized_samples = data_prep.normalize_data(samples, self.col_names)
    scaled = data_prep.get_scaled_data(samples, self.col_names, self.scale_method)
    y_pred_samples = self.get_predictions(scaled)

    # Reduce the normalized feature space to two dimensions for plotting.
    projected = pandas.DataFrame(
        pca(n_components=2).fit_transform(normalized_samples))

    outliers = projected[y_pred_samples == -1]
    inliers = projected[y_pred_samples == 1]
    plt.scatter(outliers[0], outliers[1], label='Outlier', c='red')
    plt.scatter(inliers[0], inliers[1], label='Inlier', c='blue')

    ts = dt.now().strftime('%Y%m%d-%H%M%S')  # Use timestamp as file id
    plt.legend(loc=2)
    plt.title(
        'PCA Plot of %d Samples\n kernel = RBF nu = %s gamma = %s\nDimension reduction from %d to 2'
        % (len(y_pred_samples), str(self.nu_parameter),
           str(self.gamma_parameter), len(self.col_names)))
    plt.savefig('../data/diagrams/%s_%s.png' % (fig_name, ts),
                bbox_inches='tight')
def PCA(*args, n_comp=None):
    """Concatenate the given arrays along axis 0 and apply PCA.

    Accepts either a single sequence of arrays (the documented usage,
    ``PCA((a, b))``) or the arrays as separate positional arguments
    (``PCA(a, b)``).  Returns the concatenated data projected onto
    ``n_comp`` components (all components when ``n_comp`` is None).
    """
    from sklearn.decomposition import PCA as pca

    # BUG FIX: the original called np.concatenate(*args, axis=0).  That only
    # works when a single tuple of arrays is passed; with two or more array
    # arguments the second array was unpacked into concatenate's positional
    # `axis` slot.  Normalize both call styles to one sequence of arrays.
    if len(args) == 1 and isinstance(args[0], (list, tuple)):
        arrays = args[0]
    else:
        arrays = args
    d = np.concatenate(arrays, axis=0)

    pc = pca(n_components=n_comp)
    return pc.fit_transform(d)
def mypca(filename1, number):
    """Fit a PCA with ``number`` components on the data in ``filename1`` and
    return the explained-variance ratios as a JSON array string."""
    number = int(number)
    x, y = data_format(filename1)  # y (labels) is not needed for PCA

    model = pca(n_components=number, whiten=False, random_state=7)
    model.fit(x)

    # FIX: the local result list was previously named `list`, shadowing the
    # builtin; renamed for clarity and safety.
    ratios = model.explained_variance_ratio_.tolist()
    return json.dumps(ratios)
def plot_cluster_pca(
        Xmat,
        Xcluster_label=None,
        metric="euclidean",
        dimpca=2,
        whiten=True,
        isprecompute=False,
        savefile="",
        doreturn=1,
):
    """Project ``Xmat`` onto its first ``dimpca`` principal components and
    scatter-plot the first two, coloured by ``Xcluster_label``.

    :param Xmat: data matrix (or a precomputed distance matrix when
        ``isprecompute`` is True).
    :param Xcluster_label: per-sample labels used for colouring; zeros when
        None.
    :return: the PCA-projected data when ``doreturn`` is truthy.
    """
    # BUG FIX: `from sklearn.decomposition import pca` binds the (private /
    # removed) *module*, not the estimator class, so `pca(...)` raised a
    # TypeError.  Import the PCA class under the same local alias.
    from sklearn.decomposition import PCA as pca

    if isprecompute:
        Xmat_dist = Xmat
    else:
        # NOTE(review): Xmat_dist is computed but never used below — kept to
        # preserve behavior; confirm whether it was meant to feed the PCA.
        Xmat_dist = sci.spatial.distance.squareform(
            sci.spatial.distance.pdist(Xmat, metric=metric, p=dimpca,
                                       w=None, V=None, VI=None))

    model = pca(n_components=dimpca, whiten=whiten)
    X_pca = model.fit_transform(Xmat)

    # Plot the first two principal components.
    xx, yy = X_pca[:, 0], X_pca[:, 1]
    if Xcluster_label is None:
        Yclass = np.zeros(X_pca.shape[0])
    else:
        Yclass = Xcluster_label
    plot_XY(xx, yy, zcolor=Yclass, labels=Yclass, color_dot="plasma",
            savefile=savefile)

    if doreturn:
        return X_pca
def find_pca_comp(cars, gauss_std=0):
    """Plot the per-component explained-variance ratio of the CARS
    difference data, as an aid for choosing how many components to keep."""
    ansc = cars_anscombe(cars, gauss_std)
    flat = tran4d_2d(ansc)
    diff = flat.s2 - flat.s1

    # Full PCA (all components) on the difference data; the variance
    # spectrum guides the choice of component count.
    model = pca()
    model.fit_transform(diff)
    variance = model.explained_variance_ratio_

    plt.figure()
    plt.title('Variance contribution of principle components')
    plt.xlabel('Principle component no.')
    plt.ylabel('Variance contribution (between 0-1)')
    plt.plot(variance)
    plt.draw()
    plt.show(block=False)
def analysis(df):
    """Run PCA on a standardized copy of ``df``, show a scree plot and a
    PC1-vs-PC2 scatter with annotated samples, and print PC1's loadings."""
    scaled_df = preprocessing.scale(df)

    pc = pca()
    # BUG FIX: the original wrapped pc.fit in `try: ... except: pc.fit(...)`
    # — a bare except that merely retried the identical call, hiding any
    # real error.  A single fit is equivalent and fails loudly.
    pc.fit(scaled_df)
    pca_data = pc.transform(scaled_df)

    per_var = np.round(pc.explained_variance_ratio_ * 100, decimals=1)
    labels = ['PC' + str(x) for x in range(1, len(per_var) + 1)]

    # Scree plot: percentage of variance per component.
    plt.bar(x=range(1, len(per_var) + 1), height=per_var, tick_label=labels)
    plt.ylabel('Percentage of Explained Variance')
    plt.xlabel('Principal Component')
    plt.title('Scree Plot')
    plt.show()

    # Scatter of the first two components, annotated by sample index.
    pca_df = pd.DataFrame(pca_data, columns=labels)
    plt.scatter(pca_df.PC1, pca_df.PC2)
    plt.title('My PCA Graph')
    plt.xlabel('PC1 - {0}%'.format(per_var[0]))
    plt.ylabel('PC2 - {0}%'.format(per_var[1]))
    for sample in pca_df.index:
        plt.annotate(sample, (pca_df.PC1.loc[sample], pca_df.PC2.loc[sample]))
    plt.show()

    print(pc.components_[0])
def _get_values(self, pop):
    """Project each individual's allele-count encoding onto its first two
    principal components.

    Returns a dict mapping (subpop, individual-index) -> 2-D PCA
    coordinates.
    """
    nsp = pop.numSubPop()

    # Pass 1: collect the set of alleles seen at each marker across the
    # whole population.  genotype() stores the two homologous copies as the
    # first and second halves of one flat vector.
    all_alleles = []
    for subpop in range(nsp):
        for ind in pop.individuals(subPop=subpop):
            geno = ind.genotype()
            n_markers = len(geno) // 2
            while len(all_alleles) < n_markers:
                all_alleles.append(set())
            for mi in range(n_markers):
                all_alleles[mi].add(geno[mi])
                all_alleles[mi].add(geno[mi + n_markers])
    for i, alleles in enumerate(all_alleles):
        all_alleles[i] = sorted(list(alleles))

    # Pass 2: encode every individual as the count (0/1/2) of each observed
    # allele at each marker, concatenated across markers.
    inds = defaultdict(list)
    for mi in range(n_markers):
        for subpop in range(nsp):
            for i, ind in enumerate(pop.individuals(subPop=subpop)):
                geno = ind.genotype()
                pair = [geno[mi], geno[mi + n_markers]]
                for a in all_alleles[mi]:
                    inds[(subpop, i)].append(pair.count(a))

    # Fix a deterministic individual order and build the data matrix.
    ind_order = sorted(inds.keys())
    X = np.array([inds[ind] for ind in ind_order])

    my_pca = pca(n_components=2)
    my_pca.fit(X)
    X_r = my_pca.transform(X)

    return {ind: X_r[i] for i, ind in enumerate(ind_order)}
Method = input(
    "Choose Method: Enter PCA for Principal Component Analysis, EM for Probabilistic PCA and KERNEL for Kernel Method: "
)
Dimension = int(input("Give Dimensionality of Projection:"))
M = Dimension

# Dispatch to the hand-written implementation chosen by the user.
if Method == 'PCA':
    PCA(X, y, M)
elif Method == 'EM':
    PPCA(X, y, M)
else:
    KERNEL(X, y, M)

# --- PCA with Built-in python class
from sklearn.decomposition import PCA as pca

n_components = Dimension
my_pca = pca(n_components)
# X is (features, samples); sklearn expects (samples, features), hence the
# transposes around fit_transform.
Projected_Data = my_pca.fit_transform(X.T).T

# 2-D projections get the full plot suite; higher dimensions a simpler one.
if Dimension == 2:
    Practical_Plots(Projected_Data)
else:
    D_Plots(Projected_Data)

print("\n\n*****____End of Process____*****\n\n")
Time = time.process_time()
print(Time)
}]  # closes the `newrows` list of dicts begun before this chunk (not visible here)

# Append the new rows to the merged frame and sanity-check the shape.
hospital_data = merged_data.append(newrows, ignore_index=True, sort=False)
hospital_data.shape

#Using numerical columns, conduct PCA and obtain the eigenvalues
##Define numerical columns:
hospital_data.dtypes
hospital_data.describe(include=['number'])

##PCA
# NOTE(review): the columns are used unscaled, so the PCA will be dominated
# by the largest-magnitude columns — confirm whether standardization was
# intended before the fit.
hospital_reduct_pca = hospital_data[[
    'HospitalID', 'FullTimeCount', 'NetPatientRevenue', 'InpatientOperExp',
    'OutpatientOperExp', 'Operating_Revenue', 'Operating_Income', 'AvlBeds',
    'Compensation', 'MaxTerm'
]]
pca_result = pca(n_components=10).fit(hospital_reduct_pca)

#Obtain eigenvalues
pca_result.explained_variance_

#Components from the PCA
pca_result.components_

#Scree plot
plt.figure(figsize=(7, 5))
plt.plot([1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
         pca_result.explained_variance_ratio_, '-o')
plt.ylabel('Proportion of Variance Explained')
plt.xlabel('Principal Component')
plt.xticks([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
def __init__(self, m, **kwargs):
    """Wrap an sklearn PCA reducer with ``m`` components.

    Extra keyword arguments are forwarded unchanged to the PCA constructor.
    """
    self.model = pca(n_components=m, **kwargs)
# (previously commented out: data.get_dataframe();
#  corr_dat = pd.concat([corr_dat, data.data_frame]))
print('file %i' % file)

n_components = 2
for taste in range(4):
    # Unroll every neuron's (trials x time) firing side by side into one
    # (trials x neurons*time) matrix for the 80:160 bin window.
    this_off = data.normal_off_firing[taste]
    this_off = this_off[:, :, 80:160]
    total_off = this_off[0, :, :]
    for nrn in range(1, this_off.shape[0]):
        total_off = np.concatenate(
            (total_off, this_off[int(nrn), :, :]), axis=1)

    # Reduce to 15 PCs, then cluster trials with a Gaussian mixture.
    reduced_off_pca = pca(n_components=15).fit(total_off)
    print(sum(reduced_off_pca.explained_variance_ratio_))
    reduced_off = reduced_off_pca.transform(total_off)

    gmm = GaussianMixture(n_components=n_components,
                          covariance_type='full',
                          n_init=200).fit(reduced_off)
    print(gmm.predict(reduced_off))
    this_groups = gmm.predict(reduced_off)
    trial_order = np.argsort(this_groups)

    # Pull out and cluster distance matrices: reorder by cluster membership.
    this_dist = off_stim_dists[taste]
    clust_dist = this_dist[trial_order, :]
    clust_dist = clust_dist[:, trial_order]
""" Genetic Algorithm """ from __future__ import division import numpy as np from sklearn.metrics import silhouette_samples as score from feng.Genetic.mutation import mutation from feng.Genetic.selection import selection from feng.Genetic.crossover import crossover from feng.Genetic import utils from scipy.sparse import hstack from sklearn.decomposition import TruncatedSVD as pca import pickle trn_term_doc = pickle.load(open('../data2/tfidf.txt', 'rb')) p=pca(n_components=5000,random_state=42) y=pickle.load(open('../data2/label.txt', 'rb')) trn_term_doc=p.fit_transform(trn_term_doc,y) pickle.dump(trn_term_doc,open('../data2/pca.txt', 'wb')) size=trn_term_doc.shape[1] def aimFunction(list): r=[] for index,l in enumerate(list): if l=="1": r.append(trn_term_doc.getcol(index)) temp=hstack(r) s=score(temp,y) print(s) return s
# Quick structural checks on the reduced dataset.
rows, columns = reduce_data.shape
reduce_data.columns
reduce_data.dtypes
reduce_data.head()

# Treat the qualitative hospital descriptors as categoricals.
reduce_data['Teaching'] = reduce_data['Teaching'].astype('category')
reduce_data['TypeControl'] = reduce_data['TypeControl'].astype('category')
reduce_data['DonorType'] = reduce_data['DonorType'].astype('category')

# Numeric columns used for the PCA.
reduce_data_pca = reduce_data[[
    'NoFTE', 'NetPatRev', 'InOperExp', 'OutOperExp', 'OperRev', 'OperInc',
    'AvlBeds', 'Compensation', 'MaxTerm'
]]

# Standardize the features, then fit a full 9-component PCA.
sc = StandardScaler()
reduce_data_std = sc.fit_transform(reduce_data_pca)
pca_result = pca(n_components=9).fit(reduce_data_std)

# Eigenvalues and variance-scaled loadings.
pca_result.explained_variance_
pca_result.components_.T * np.sqrt(pca_result.explained_variance_)

# Scree plot.
plt.figure(figsize=(7, 5))
plt.plot([1, 2, 3, 4, 5, 6, 7, 8, 9],
         pca_result.explained_variance_ratio_, '-o')
plt.ylabel('Proportion of Variance Explained')
plt.xlabel('Principal Component')
plt.xlim(0.25, 4.25)
plt.ylim(0, 1.05)
plt.xticks([1, 2, 3, 4, 5, 6, 7, 8, 9])

###Factor Analysis###
# Column selection for the factor analysis — this statement continues past
# the end of this chunk.
reduce_data_fac = reduce_data[[
""" @ Filename: PCA_TEST.py @ Author: Danc1elion @ Create Date: 2019-06-03 @ Update Date: 2019-06-03 @ Description: Implement PCA_TEST """ from DimensionReduction import PCA import numpy as np from sklearn.decomposition import PCA as pca import time data = np.array([[2.5, 2.4], [0.5, 0.7], [2.2, 2.9], [1.9, 2.2], [3.1, 3.0], [2.3, 2.7], [2, 1.6], [1, 1.1], [1.5, 1.6], [1.1, 0.9]]) time_start1 = time.time() clf1 = PCA() clf1.train(data) print(clf1.transformData(data)) time_end1 = time.time() print("Runtime of PCA:", time_end1 - time_start1) time_start2 = time.time() clf1 = pca(1) x = clf1.fit_transform(data) print(x) time_end2 = time.time() print("Runtime of sklearn PCA:", time_end2 - time_start2)
n_classes = target_names.shape[0]

print("Total dataset size:")
print("n_samples: %d" % n_samples)
print("n_features: %d" % n_features)
print("n_classes: %d" % n_classes)

# #############################################################################
# Compute a PCA (eigenmris) on the mri dataset (treated as unlabeled
# dataset): unsupervised feature extraction / dimensionality reduction
n_components = 25

print("Extracting the top %d eigenmris from %d mris" %
      (n_components, X_train.shape[0]))
t0 = time()
# Randomized solver for speed; whitening equalizes component variances.
PCA = pca(n_components=n_components, svd_solver='randomized',
          whiten=True).fit(X_train)
print("done in %0.3fs" % (time() - t0))

# Reshape each component back into image space (h*16 x w*16 pixels).
eigenmris = PCA.components_.reshape((n_components, h*16, w*16))

print("Projecting the input data on the eigenmris orthonormal basis")
t0 = time()
X_train_pca = PCA.transform(X_train)
print("done in %0.3fs" % (time() - t0))

# RBF-kernel SVM over the PCA features — this call continues past the end
# of this chunk (argument list is cut off here).
clf = SVC(C=50, cache_size=200, class_weight='balanced', coef0=0.0,
          decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
          max_iter=-1, probability=False, random_state=None, shrinking=True,
print(Eigen_Data.shape) # a=Original_Data.values # b=Eigen_Data.values c = np.dot(Original_Data.values, Eigen_Data.values) Reduced_Data = pd.DataFrame(c) return Reduced_Data Dimension_Reduction_PCA = Reduced_Data(Train.iloc[:, 0:1024], Eigen_PCA) Dimension_Reduction_PCA = pd.concat([Dimension_Reduction_PCA, Train['Label']], axis=1) # In[33]: from sklearn.decomposition import PCA as pca lda1 = pca(n_components=2) inbuilt_pca = lda1.fit_transform(Train) # In[35]: inbuilt_pca = pd.DataFrame(inbuilt_pca) inbuilt_pca = pd.concat([inbuilt_pca, Train['Label']], axis=1) # In[39]: import matplotlib.pyplot as plt c = [ 'Red', "blue", 'green', 'orange', 'yellow', 'purple', 'black', 'magenta', 'navy', 'skyblue', 'pink', 'brown' ] for i in range(11):
from sklearn.cluster import KMeans

# Cluster the HOG feature vectors into 32 groups.
k = 32
kmeans = KMeans(n_clusters=k)
kmeans.fit(X)

df = df[:len(vector_list)]
df['clusters'] = kmeans.labels_
centroids = pd.DataFrame(kmeans.cluster_centers_)

# FIX: RandomizedPCA was removed from sklearn (0.20+); PCA with
# svd_solver='randomized' is the documented drop-in replacement.  Likewise
# DataFrame.as_matrix() was removed in pandas 1.0 in favor of to_numpy().
from sklearn.decomposition import PCA

pca = PCA(n_components=2, svd_solver='randomized')
Y = centroids.to_numpy()
pca.fit(Y)
subspace = pd.DataFrame(pca.transform(Y), columns=["x", "y"])

# Distance of every sample to its assigned cluster centroid.
euclidean_distance = []
hog_points = pd.DataFrame(X)
for i in range(len(df)):
    tmp = hog_points.loc[i].to_numpy()
    cluster_integer = int(df.clusters.loc[i])
    euclidean_distance_i = np.linalg.norm(
        tmp - centroids.loc[cluster_integer].to_numpy())
    euclidean_distance.append(euclidean_distance_i)
pd.set_option('precision', 19)
print(temp)
# calcualte_tf('1.csv',task0_output_dir)

ndarr = df.T.to_numpy()[0]
np.savetxt("foo.csv", ndarr, delimiter=",")


def pca(feature_matrix, k):
    """Print the explained-variance ratios and PC1 word scores of a
    k-component PCA over ``feature_matrix``."""
    model = PCA(n_components=k)
    principalComponents = model.fit_transform(feature_matrix)
    print(model.explained_variance_ratio_)
    principalDf = pd.DataFrame(data=principalComponents)
    # BUG FIX: the original scaled and labelled using the module-level `df`
    # instead of the `feature_matrix` argument, so the function only worked
    # for the specific call pca(df, k).  Use the argument consistently.
    data_scaled = pd.DataFrame(preprocessing.scale(feature_matrix),
                               columns=feature_matrix.columns)
    word_score_df = pd.DataFrame(model.components_, columns=data_scaled.columns)
    print(word_score_df.iloc[0].sort_values(ascending=False))
    # print(principalDf)
    return 0


def svd(feature_matrix, k):
    """Print the explained-variance ratios of a k-component truncated SVD
    over ``feature_matrix``."""
    model = TruncatedSVD(n_components=k)
    components = model.fit_transform(feature_matrix)
    print(model.explained_variance_ratio_)
    principalDf = pd.DataFrame(data=components)
    # print(principalDf)
    return 0


pca(df, 5)
# svd(df,5)
def __init__(self, A):
    """Standardize ``A`` and fit a full PCA on it, keeping loadings and
    scores."""
    scaled = preprocessing.StandardScaler().fit(A).transform(A)
    fitted = pca().fit(scaled)
    # Loadings: components laid out as columns.
    self.U = fitted.components_.transpose()
    # Scores: the standardized data projected onto the components.
    self.P = fitted.transform(scaled)
# Analysis windows centred on the identity- and palatability-related epochs
# after stimulus delivery (all in the same time units as `dt`).
stimulus = 2000
identity_center = stimulus + 500
palatability_center = stimulus + 1000
window_radius = 125

iden_firing = dat.all_normalized_firing[
    ..., (identity_center - window_radius) // dt:
         (identity_center + window_radius) // dt]
pal_firing = dat.all_normalized_firing[
    ..., (palatability_center - window_radius) // dt:
         (palatability_center + window_radius) // dt]


def imshow(array):
    """Shortcut for a non-interpolated, auto-aspect image plot."""
    plt.imshow(array, interpolation='nearest', aspect='auto')


# Flatten everything but the first axis and fit a 3-component PCA per epoch.
iden_firing_long = np.reshape(iden_firing, (iden_firing.shape[0], -1))
pal_firing_long = np.reshape(pal_firing, (pal_firing.shape[0], -1))

n_components = 3
red_iden_obj = pca(n_components=n_components).fit(iden_firing_long.T)
red_pal_obj = pca(n_components=n_components).fit(pal_firing_long.T)

# Plot the eigenvectors of both epochs side by side.
fig, ax = plt.subplots(2)
ax[0].imshow(red_iden_obj.components_)
ax[1].imshow(red_pal_obj.components_)
plt.show()

# The product of the two component matrices holds the dot products of every
# pair of eigenvectors — a measure of how aligned the two subspaces are.
orthogonal_distance = np.matmul(red_iden_obj.components_,
                                red_pal_obj.components_.T)

#fig,ax=plt.subplots(3)
#plt.sca(ax[0])
testData = np.array(
    pd.read_table(os.path.join(DAT, 'test.txt'),
                  header=None,
                  encoding='gb2312',
                  delim_whitespace=True))

# The last column is the label; everything else is a feature.
train_y = trainData[:, -1]
train_x = np.delete(trainData, -1, axis=1)
test_y = testData[:, -1]
test_x = np.delete(testData, -1, axis=1)

# Hand-written PCA + logistic regression.
time_start1 = time.time()
clf1 = PCA()
clf1.train(train_x)
train_x = clf1.transformData(train_x)
test_x = clf1.transformData(test_x)
clf = LogisticRegression(solver='liblinear', multi_class='ovr')
clf.fit(train_x, train_y)
print("Accuracy of PCA:", clf.score(test_x, test_y))
time_end1 = time.time()
print("Runtime of PCA:", time_end1 - time_start1)

# sklearn PCA + logistic regression.
time_start2 = time.time()
clf2 = pca(n_components=1)
train_x = clf2.fit_transform(train_x)
# BUG FIX: the test set was previously projected with fit_transform, which
# re-fits the PCA on the test data (leaking it and projecting onto a
# different basis than the training set).  Use the fitted transform instead.
test_x = clf2.transform(test_x)
clf = LogisticRegression(solver='liblinear', multi_class='ovr')
clf.fit(train_x, train_y)
print("Accuracy of sklearn PCA:", clf.score(test_x, test_y))
time_end2 = time.time()
print("Runtime of sklearn PCA:", time_end2 - time_start2)
def pca_analysis(dataset, dropcols=None, imputenans=True, scale=True,
                 rem_outliers=True, out_thresh=10, n_components=5,
                 existing_model=False, model_file='Optional'):
    """Performs a primary component analysis on an input dataset

    Parameters
    ----------
    dataset : pandas.core.frame.DataFrame, shape (n, p)
        Input dataset with n samples and p features
    dropcols : list
        Columns to exclude from pca analysis. At a minimum, user must
        exclude non-numeric columns.  Defaults to no exclusions.
    imputenans : bool
        If True, impute NaN values as column means.
    scale : bool
        If True, columns will be scaled to a mean of zero and a standard
        deviation of 1.
    rem_outliers : bool
        If True, iteratively NaN-out values more than out_thresh standard
        deviations from their column mean before the analysis.
    out_thresh : int
        Outlier threshold, in standard deviations.
    n_components : int
        Desired number of components in principle component analysis.
    existing_model : bool
        If True, reuse the scaler and PCA model stored on model_file rather
        than fitting new ones.
    model_file : object
        A previous pca_analysis result providing .scaler and .pcamodel
        attributes, used when existing_model is True.

    Returns
    -------
    pcadataset : diff_classifier.pca.Bunch
        Contains outputs of PCA analysis, including:
        scaled : numpy.ndarray, shape (n, p)
            Scaled dataset with n samples and p features
        pcavals : pandas.core.frame.DataFrame, shape (n, n_components)
            Output array of n_component features of each original sample
        final : pandas.core.frame.DataFrame, shape (n, p+n_components)
            Output array with principle components append to original array.
        prcomps : pandas.core.frame.DataFrame, shape (5, n_components)
            Output array displaying the top 5 features contributing to each
            principle component.
        prvals : dict of list of str
            Output dictionary of the pca scores for the top 5 features
            contributing to each principle component.
        components : pandas.core.frame.DataFrame, shape (p, n_components)
            Raw pca scores.
    """
    pd.options.mode.chained_assignment = None  # default='warn'

    # BUG FIX: dropcols previously used a mutable default argument ([]),
    # which is shared across calls.
    if dropcols is None:
        dropcols = []

    dataset_num = dataset.drop(dropcols, axis=1)
    dataset_num = dataset_num.replace([np.inf, -np.inf], np.nan)

    # Iteratively blank out extreme values (in both the working copy and
    # the caller's frame) so outliers don't dominate scaling/PCA.
    if rem_outliers:
        for i in range(10):
            for col in dataset_num.columns:
                xmean = np.mean(dataset_num[col])
                xstd = np.std(dataset_num[col])

                counter = 0
                for x in dataset_num[col]:
                    if x > xmean + out_thresh * xstd:
                        dataset[col][counter] = np.nan
                        dataset_num[col][counter] = np.nan
                    if x < xmean - out_thresh * xstd:
                        dataset[col][counter] = np.nan
                        dataset_num[col][counter] = np.nan
                    counter = counter + 1

    dataset_raw = dataset_num.values

    # Fill in NaN values
    if imputenans:
        imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
        imp.fit(dataset_raw)
        dataset_clean = imp.transform(dataset_raw)
    else:
        dataset_clean = dataset_raw

    # Scale inputs.
    # BUG FIX: `scaler` was previously undefined when scale=False, raising a
    # NameError at the `pcadataset.scaler = scaler` assignment below.
    scaler = None
    if scale:
        if existing_model:
            scaler = model_file.scaler
            dataset_scaled = model_file.scaler.transform(dataset_clean)
        else:
            scaler = stscale()
            scaler.fit(dataset_clean)
            dataset_scaled = scaler.transform(dataset_clean)
    else:
        dataset_scaled = dataset_clean

    pcadataset = Bunch(scaled=dataset_scaled)

    if existing_model:
        pca1 = model_file.pcamodel
    else:
        pca1 = pca(n_components=n_components)
        pca1.fit(dataset_scaled)

    if not existing_model:
        # Cumulative explained variance ratio
        cum_var = 0
        explained_v = pca1.explained_variance_ratio_
        print('Cumulative explained variance:')
        for i in range(0, n_components):
            cum_var = cum_var + explained_v[i]
            print('{} component: {}'.format(i, cum_var))

    prim_comps = {}
    pcadataset.prvals = {}
    comps = pca1.components_
    pcadataset.components = pd.DataFrame(comps.transpose())

    # Top-5 contributing features (by |loading|) for each component.
    for num in range(0, n_components):
        highest = np.abs(
            pcadataset.components[num]).values.argsort()[-5:][::-1]
        pels = []
        pcadataset.prvals[num] = pcadataset.components[num].values[highest]
        for col in highest:
            pels.append(dataset_num.columns[col])
        prim_comps[num] = pels

    # Main contributors to each primary component
    pcadataset.prcomps = pd.DataFrame.from_dict(prim_comps)
    pcadataset.pcavals = pd.DataFrame(pca1.transform(dataset_scaled))
    pcadataset.final = pd.concat([dataset, pcadataset.pcavals], axis=1)
    pcadataset.pcamodel = pca1
    pcadataset.scaler = scaler

    return pcadataset
# Align day-3 data to day-1's neuron set and time window, then pool days.
day3_data = np.asarray(
    data3.normal_off_firing)[:, day3_nrns, :, 0:269].swapaxes(-1, -2)
all_data = np.concatenate((day1_data, day3_data[:, :, :, :32]), axis=0)

# =============================================================================
# Take means of data and generate coordinate transformation based on that
# =============================================================================
day1_mean = np.mean(day1_data, axis=-1)
day3_mean = np.mean(day3_data, axis=-1)
all_data_mean = np.concatenate((day1_mean, day3_mean), axis=0)

# Lay the per-taste means side by side into one long matrix.
all_data_long = all_data_mean[0, :, :]
for taste in range(1, all_data_mean.shape[0]):
    all_data_long = np.concatenate(
        (all_data_long, all_data_mean[taste, :, :]), axis=-1)

# A 3-component PCA fit on the pooled means defines the shared coordinates.
all_mean_red_pca = pca(n_components=3).fit(all_data_long.T)
all_mean_red = all_mean_red_pca.transform(all_data_long.T)

# Convert mean data back to array form (one slab per taste).
all_mean_red_array = np.zeros((all_data_mean.shape[0],
                               all_mean_red.shape[1],
                               all_data_mean.shape[2]))
all_mean_red_list = np.split(all_mean_red, 6)
for taste in range(len(all_mean_red_list)):
    all_mean_red_array[taste, :, :] = all_mean_red_list[taste].T

# Smooth the reduced mean trajectories with a gaussian filter (sigma = 1).
smooth_mean_dat = np.zeros(all_mean_red_array.shape)
for taste in range(smooth_mean_dat.shape[0]):
    for dim in range(smooth_mean_dat.shape[1]):
        smooth_mean_dat[taste, dim, :] = scipy.ndimage.filters.gaussian_filter(
            all_mean_red_array[taste, dim, :], 1)

# Use same transformation to reduce single trials
total_dat = np.asarray(total_dat)

# Stack the per-component matrices vertically into one long matrix.
total_dat_long = total_dat[0, :, :]
for comp in range(1, n_components):
    total_dat_long = np.concatenate(
        (total_dat_long, total_dat[comp, :, :]), axis=0)

# =============================================================================
# Random projection — compare a 2-D PCA embedding against a 2-D sparse
# random projection of the same data.
# =============================================================================
pca_transformer = pca(n_components=2)
pca_data = pca_transformer.fit_transform(total_dat_long)
plt.subplot(211)
plt.scatter(pca_data[:, 0], pca_data[:, 1])

transformer = sparse_random(n_components=2)
X_new = transformer.fit_transform(total_dat_long)
plt.subplot(212)
plt.scatter(X_new[:, 0], X_new[:, 1])
data.get_data()
data.get_firing_rates()

# =============================================================================
# Visualize change in representation because of Opto
# =============================================================================
# Condition labels: 60 trials of each condition, sorted so the first 60 are
# condition 1 and the next 60 condition 2.
opto = np.asarray(np.sort([1, 2] * 60))

# Concatenate laser-off and laser-on firing along the trial axis and keep
# the 80:200 bin window.
all_firing = np.concatenate(
    [data.all_normal_off_firing, data.all_normal_on_firing], axis=1)
all_firing = all_firing[:, :, 80:200]

# Unroll neurons side by side: (trials x neurons*time).
all_firing_long = all_firing[0, :, :]
for nrn in range(1, all_firing.shape[0]):
    all_firing_long = np.concatenate(
        (all_firing_long, all_firing[int(nrn), :, :]), axis=1)

all_reduced_pca = pca(n_components=15).fit(all_firing_long)
all_reduced = all_reduced_pca.transform(all_firing_long)

# =============================================================================
# (exploratory plots previously kept here as commented-out code:
#  scree plot of explained variance and a PCA1-vs-PCA2 scatter coloured by
#  the opto condition)
# Quick structural checks.
pg_data.dtypes
pg_data.columns
pg_data.head()

#################################################
#==========Principal Component Analysis=========#
# Perform a PCA for PU, PEOU, and Intention     #
#################################################
# Survey items: perceived usefulness (peruse, 6 items), perceived ease of
# use (pereou, 6 items) and intention (3 items).
reduc_data_pca = reduc_data[[
    'peruse01', 'peruse02', 'peruse03', 'peruse04', 'peruse05', 'peruse06',
    'pereou01', 'pereou02', 'pereou03', 'pereou04', 'pereou05', 'pereou06',
    'intent01', 'intent02', 'intent03'
]]

pca_result = pca(n_components=15).fit(reduc_data_pca)

# Obtain eigenvalues
pca_result.explained_variance_
# Components from the PCA
pca_result.components_

# Scree plot over all fifteen components.
component_axis = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
plt.figure(figsize=(7, 5))
plt.plot(component_axis, pca_result.explained_variance_ratio_, '-o')
plt.ylabel('Proportion of Variance Explained')
plt.xlabel('Principal Component')
plt.xlim(0.75, 4.25)
plt.ylim(0, 1.05)
plt.xticks(component_axis)
off_firing = data.normal_off_firing
print('file %i' % file)

n_components = 2
for taste in range(4):
    # Unroll neurons side by side for this taste's stimulus window:
    # (trials x neurons*time).
    stim_off = data.normal_off_firing[taste]
    stim_off = stim_off[:, :, stimulus_inds]
    total_stim_off = stim_off[0, :, :]
    for nrn in range(1, stim_off.shape[0]):
        total_stim_off = np.concatenate(
            (total_stim_off, stim_off[int(nrn), :, :]), axis=1)

    # 10-component PCA, then a 2-cluster Gaussian mixture over trials.
    reduced_stim_pca = pca(n_components=10).fit(total_stim_off)
    #print(sum(reduced_stim_pca.explained_variance_ratio_))
    reduced_stim = reduced_stim_pca.transform(total_stim_off)

    gmm = GaussianMixture(n_components=n_components,
                          covariance_type='full',
                          n_init=500).fit(reduced_stim)
    #print(gmm.predict(reduced_stim))
    groups = gmm.predict(reduced_stim)
    all_groups.append(sum(groups))
    trial_order = np.argsort(groups)

    # Train LDA classifier on firing from both clusters
    repeats = 500
def pca_cal(dataset, labels, attNames, **kwargs):
    """Project ``dataset`` onto its first two principal components and plot
    the embedding with the given labels and attribute names.

    Extra keyword arguments are forwarded to ``plot``.
    """
    reducer = pca(n_components=2)
    embedding = reducer.fit_transform(dataset)
    plot(embedding, labels, attNames, **kwargs)