def fit(self, hyperparameters, X, y):
    # PCA only works if n_components is less than the min of the
    # number of samples and the number of columns
    if hyperparameters['n_components'] < np.min(X.shape):
        self.pca = sklearnPCA(**hyperparameters).fit(X)
    else:
        # Select the fewest number of features in all other cases
        self.pca = sklearnPCA(**self.grid[0]).fit(X)
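# A minimal, self-contained sketch (synthetic shapes, not from the snippet
# above) illustrating why the guard in fit() is needed: scikit-learn's PCA
# requires n_components <= min(n_samples, n_features).
import numpy as np
from sklearn.decomposition import PCA as sklearnPCA

X = np.random.rand(10, 5)                   # 10 samples, 5 features
pca_ok = sklearnPCA(n_components=5).fit(X)  # fine: 5 <= min(10, 5)
try:
    sklearnPCA(n_components=6).fit(X)       # raises: 6 > min(10, 5)
except ValueError as err:
    print(err)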
def get_pca():
    pca_matrix = []
    error = 0
    filepath = session['filepath']
    filetype = session['filetype']
    labeltype = session['label']
    request_data = json.loads(request.data)
    ordering = request_data["order"]
    distance_type = request_data["distance_type"]
    dimension_factor = request_data["dimension_factor"]
    pca_dimension_count = request_data["pca_dimension_count"]
    order = ex.parse_order(ordering)
    order = [int(k) for k in order]
    alpha = ex.readFromFile(filepath, filetype, labeltype,
                            distance_type, dimension_factor)
    dataset = np.array(alpha)
    for orderno in order:
        pca_matrix.append(dataset[orderno])
    data_columns = len(pca_matrix[0])
    if data_columns >= pca_dimension_count:
        sklearn_pca = sklearnPCA(n_components=pca_dimension_count)
        Y_sklearn = sklearn_pca.fit_transform(pca_matrix)
        final_pca_values = [[float(f) for f in d]
                            for d in np.transpose(Y_sklearn)]
    elif data_columns == 2:
        # fall back to a 2-D projection when there are fewer columns
        # than requested components
        sklearn_pca = sklearnPCA(n_components=2)
        Y_sklearn = sklearn_pca.fit_transform(pca_matrix)
        final_pca_values = [[float(f) for f in d]
                            for d in np.transpose(Y_sklearn)]
    else:
        error = 1
        final_pca_values = []
    file_path = os.path.join(app.config['UPLOAD_FOLDER'], "results.txt")
    with open(file_path, "a") as f:
        f.write("PCA matrix for the ordering " + str(order) + " for " +
                str(len(final_pca_values)) + " principal components:\n")
        f.write(str(np.transpose(final_pca_values)) + "\n\n")
    response_data = {}
    response_data["pca_values"] = final_pca_values
    response_data["error_value"] = error
    return json.dumps(response_data)
def pca_step_na(trans_std, promo_std):
    from sklearn.decomposition import PCA as sklearnPCA
    # transaction PCA
    trans_pca = sklearnPCA(n_components=8)
    trans_new = trans_pca.fit_transform(trans_std)
    # promo PCA
    promo_pca = sklearnPCA(n_components=12)
    promo_new = promo_pca.fit_transform(promo_std)
    pca_dict = {"trans": trans_pca, "promo": promo_pca}
    return trans_new, promo_new, pca_dict
def pca_step(trans_std, food_std, promo_std):
    from sklearn.decomposition import PCA as sklearnPCA
    # transaction PCA
    trans_pca = sklearnPCA(n_components=9)
    trans_new = trans_pca.fit_transform(trans_std)
    # food PCA
    food_pca = sklearnPCA(n_components=24)
    food_new = food_pca.fit_transform(food_std)
    # promo PCA
    promo_pca = sklearnPCA(n_components=13)
    promo_new = promo_pca.fit_transform(promo_std)
    pca_dict = {"trans": trans_pca, "food": food_pca, "promo": promo_pca}
    return trans_new, food_new, promo_new, pca_dict
def Linear_PCA(HE_MI_train_test, numdim=2):
    '''
    Overview
        - PCA: linearly transform the data along the eigenvectors,
          sorted by descending eigenvalue
    INPUT
        - numdim : target dimension
    OUTPUT
        - 1. sklearn_HE_train_fit : numdim * numHEtrain
        - 2. sklearn_MI_train_fit : numdim * numMItrain
        - 3. sklearn_HE_test_fit : numdim * numHEtest
        - 4. sklearn_MI_test_fit : numdim * numMItest
    '''
    import numpy as np
    from sklearn.decomposition import PCA as sklearnPCA

    MyDataSet = HE_MI_train_test
    my_HEtraining = MyDataSet[0]
    my_MItraining = MyDataSet[1]
    my_HEtest = MyDataSet[2]
    my_MItest = MyDataSet[3]

    sklearn_pca = sklearnPCA(n_components=numdim)
    # fit once on the combined training data, then project every set into
    # the same space (the original refit the PCA on each set separately,
    # which makes the four projections incomparable)
    sklearn_pca.fit(np.concatenate((my_HEtraining, my_MItraining)))
    sklearn_HE_train_fit = sklearn_pca.transform(my_HEtraining)
    sklearn_MI_train_fit = sklearn_pca.transform(my_MItraining)
    sklearn_HE_test_fit = sklearn_pca.transform(my_HEtest)
    sklearn_MI_test_fit = sklearn_pca.transform(my_MItest)
    return [sklearn_HE_train_fit, sklearn_MI_train_fit,
            sklearn_HE_test_fit, sklearn_MI_test_fit]
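# Hedged usage sketch (synthetic arrays; all names here are illustrative
# only): the fit-on-train / transform-on-test pattern used in the fixed
# Linear_PCA above.
import numpy as np
from sklearn.decomposition import PCA as sklearnPCA

rng = np.random.RandomState(0)
train, test = rng.rand(100, 6), rng.rand(20, 6)
pca = sklearnPCA(n_components=2).fit(train)  # learn the axes on training data only
train_2d = pca.transform(train)
test_2d = pca.transform(test)                # project the test set into the same space
print(train_2d.shape, test_2d.shape)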
def plotPCA(labels, data, inputFile, outputFile, store=False):
    sklearn_pca = sklearnPCA(n_components=2)
    sklearn_pca.fit(data)
    newData = sklearn_pca.transform(data)
    xval = newData[:, 0]
    yval = newData[:, 1]
    lbls = set(labels)
    fig1 = plt.figure(1)
    for lbl in lbls:
        cond = [i for i, x in enumerate(labels) if x == lbl]
        plt.plot(xval[cond], yval[cond], linestyle='none',
                 marker='o', label=lbl, markersize=3)
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.legend(numpoints=1, loc=0, fontsize='x-small')
    plt.subplots_adjust(bottom=.20, left=.20)
    plt.grid()
    fig1.suptitle("PCA plot for DBSCAN in " + inputFile.split("/")[-1],
                  fontsize=20)
    if store:
        fig1.savefig("_".join(
            [outputFile, inputFile.split("/")[-1].split(".")[0]]) + ".png")
    else:
        plt.show()
def create_pca(self, input_tsv_file_for_pca, output_html):
    df = pd.read_csv(filepath_or_buffer=input_tsv_file_for_pca,
                     header=0, sep='\t')
    # note: column 23 is covered both by X (iloc 1:24) and by y
    X = df.iloc[:, 1:24].values
    y = df.iloc[:, 23].values
    standardised_X = StandardScaler().fit_transform(X)
    sklearn_pca = sklearnPCA(n_components=2)
    Y_sklearn = sklearn_pca.fit_transform(standardised_X)
    traces = []
    factor_group = df['Group'].unique()
    print("The factors (groups) found: ", factor_group)
    for name in factor_group:
        trace = Scatter(x=Y_sklearn[y == name, 0],
                        y=Y_sklearn[y == name, 1],
                        mode='markers',
                        name=name,
                        marker=Marker(size=12,
                                      line=Line(color='rgba(217, 217, 217, 0.14)',
                                                width=0.5),
                                      opacity=0.8))
        traces.append(trace)
    data = Data(traces)
    layout = Layout(xaxis=XAxis(title='PC1', showline=False),
                    yaxis=YAxis(title='PC2', showline=False))
    fig = Figure(data=data, layout=layout)
    plot(fig, show_link=False, filename=output_html, auto_open=False)
def reduceDataset(self, nr=3, method='PCA'):
    '''Reduces the dimensionality of a given dataset using different
    techniques provided by the sklearn library.
    Methods available:
        'PCA'
        'FactorAnalysis'
        'KPCArbf', 'KPCApoly'
        'KPCAcosine', 'KPCAsigmoid'
        'IPCA'
        'FastICADeflation'
        'FastICAParallel'
        'Isomap'
        'LLE'
        'LLEmodified'
        'LLEltsa'
    '''
    dataset = self.ModelInputs['Dataset']
    # PCA
    if method == 'PCA':
        sklearn_pca = sklearnPCA(n_components=nr)
        reduced = sklearn_pca.fit_transform(dataset)
    # Factor Analysis
    elif method == 'FactorAnalysis':
        fa = FactorAnalysis(n_components=nr)
        reduced = fa.fit_transform(dataset)
    # kernel PCA with rbf kernel
    elif method == 'KPCArbf':
        kpca = KernelPCA(nr, kernel='rbf')
        reduced = kpca.fit_transform(dataset)
    # kernel PCA with poly kernel
    elif method == 'KPCApoly':
        kpca = KernelPCA(nr, kernel='poly')
        reduced = kpca.fit_transform(dataset)
    # kernel PCA with cosine kernel
    elif method == 'KPCAcosine':
        kpca = KernelPCA(nr, kernel='cosine')
        reduced = kpca.fit_transform(dataset)
    # kernel PCA with sigmoid kernel
    elif method == 'KPCAsigmoid':
        kpca = KernelPCA(nr, kernel='sigmoid')
        reduced = kpca.fit_transform(dataset)
    # Incremental PCA
    elif method == 'IPCA':
        ipca = IncrementalPCA(nr)
        reduced = ipca.fit_transform(dataset)
    # Fast ICA
    elif method == 'FastICAParallel':
        fip = FastICA(nr, algorithm='parallel')
        reduced = fip.fit_transform(dataset)
    elif method == 'FastICADeflation':
        fid = FastICA(nr, algorithm='deflation')
        reduced = fid.fit_transform(dataset)
    elif method == 'All':
        self.dimensionalityReduction(nr=nr)
        return self
    self.ModelInputs.update({method: reduced})
    self.datasetsAvailable.append(method)
    return self
def plotPCA(labels, data, inputFile, outputFile, store=False):
    # apply PCA
    sklearn_pca = sklearnPCA(n_components=2)
    newData = sklearn_pca.fit_transform(data)
    # get x and y values
    xval = newData[:, 0]
    yval = newData[:, 1]
    lbls = set(labels)
    fig1 = plt.figure(1)
    # plot for each label
    for lbl in lbls:
        cond = [i for i, x in enumerate(labels) if x == lbl]
        plt.plot(xval[cond], yval[cond], linestyle='none',
                 marker='o', label=lbl)
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.legend(numpoints=1, loc=0)
    plt.subplots_adjust(bottom=.20, left=.20)
    fig1.suptitle("PCA plot for centroids in " + inputFile.split("/")[-1],
                  fontsize=20)
    # if the PCA output parameter is given, store the plot; else display it
    if store:
        fig1.savefig("_".join(
            [outputFile, inputFile.split("/")[-1].split(".")[0]]) + ".png")
    else:
        plt.show()
def vPca(filepath):
    df = pd.read_csv(filepath_or_buffer=filepath, sep=',')
    x = df.iloc[:, :].values
    x = x.transpose()
    sklearn_pca = sklearnPCA(n_components=2)
    x2 = sklearn_pca.fit_transform(x)
    plots(x2)
def PCA(X, labels):
    sklearn_pca = sklearnPCA(n_components=2)
    Y_sklearn = sklearn_pca.fit_transform(X)
    fig = plt.figure()
    ax1 = fig.add_subplot(211)
    ax2 = fig.add_subplot(212)
    # cluster the 2-D scores, then colour both the scores and the raw
    # series by cluster membership
    km = KMeans(n_clusters=2)
    km.fit(Y_sklearn)
    colors = ['blue', 'red']
    for idx, row in enumerate(Y_sklearn):
        ax2.scatter(row[0], row[1], color=colors[km.labels_[idx]], alpha=0.5)
        ax1.plot(X[idx, :], color=colors[km.labels_[idx]], alpha=0.2)
    labels.iloc[km.labels_ == 1].to_csv(
        '/Users/ryszardcetnarski/Desktop/Nencki/Badanie_NFB/Dane/miesniowcy_pca.csv')
    return km.labels_, labels.iloc[km.labels_ == 1]
def PCA(x, x_test):
    # keep the fitted scaler so the test set gets the same scaling the
    # PCA was fitted on (the original projected the raw test set)
    scaler = StandardScaler().fit(x)
    X_std = scaler.transform(x)
    # 'mle' lets PCA choose the number of components automatically
    sklearn_pca = sklearnPCA(n_components='mle', svd_solver='full')
    x = sklearn_pca.fit_transform(X_std)
    x_test = sklearn_pca.transform(scaler.transform(x_test))
    return x, x_test
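# Illustrative standalone sketch (synthetic data, not from the snippet
# above): n_components='mle' uses Minka's MLE to pick the dimensionality
# automatically and, in scikit-learn, requires svd_solver='full' and
# n_samples >= n_features.
import numpy as np
from sklearn.decomposition import PCA as sklearnPCA
from sklearn.preprocessing import StandardScaler

X = np.random.RandomState(1).rand(200, 10)
pca = sklearnPCA(n_components='mle', svd_solver='full')
pca.fit(StandardScaler().fit_transform(X))
print(pca.n_components_)  # dimensionality chosen by the MLE criterion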
def ABC_summaryStatistics_PCA(Surveys):
    """ Heavily inspired by
    https://plot.ly/ipython-notebooks/principal-component-analysis/ """
    [A, B] = Surveys
    newdist = lambda x: dbc.measurand(x.Dproj_pix() / x.GCl.R200(), 'Dproj',
                                      label=r'$D_\mathrm{proj,rel}$',
                                      un='$R_{200}$')
    plotmeasures = [lambda x: x.LLS, lambda x: x.P_rest,
                    lambda x: x.Mach, newdist]
    X1 = A.fetch_pandas(plotmeasures, surname=False).dropna().to_numpy()
    X2 = B.fetch_pandas(plotmeasures, surname=False).dropna().to_numpy()
    X1_std = StandardScaler().fit_transform(X1)
    X2_std = StandardScaler().fit_transform(X2)
    sklearn_pca = sklearnPCA(n_components=2)
    Y_sklearn = sklearn_pca.fit_transform(X1_std)
    # This gives a proxy for the average summed square error of the
    # 2-D dimensionality reduction via PCA
    distance = np.sum(Y_sklearn**2) / len(Y_sklearn[0])
    return distance
def Q1(self):
    # part one: sample the two classes
    class1 = np.random.multivariate_normal(self.m1, self.cov, 1000).T
    class2 = np.random.multivariate_normal(self.m2, self.cov, 1000).T
    plt.plot(class1[0, :], class1[1, :], 'x')
    plt.plot(class2[0, :], class2[1, :], 'x')

    # part two: calculate PCA with matplotlib's mlab
    samples = np.concatenate((class1, class2), axis=1)
    mlab_pca = mlabPCA(samples.T)
    plt.figure(2)
    plt.plot(mlab_pca.Y[0:1000, 0], 'o', markersize=7, color='blue',
             alpha=0.5, label='class1')
    plt.plot(mlab_pca.Y[1000:2000, 0], '^', markersize=7, color='yellow',
             alpha=0.5, label='class2')

    # part three: sklearn PCA, project to 1-D and reconstruct
    plt.figure(1)
    sklearn_pca = sklearnPCA(n_components=1)
    sklearn_transf = sklearn_pca.fit_transform(samples.T)
    p = sklearn_pca.inverse_transform(sklearn_transf)
    plt.plot(p[0:1000, 0], p[0:1000, 1], 'x')
    plt.plot(p[1000:2000, 0], p[1000:2000, 1], 'x')
    error = ((p - samples.T) ** 2).mean()
    print(error)
    print(error ** 0.5)
    plt.show()
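# Small standalone sketch (assumed 2-D Gaussian data) of the projection /
# reconstruction round trip used in Q1 above: inverse_transform maps the
# 1-D scores back into the original space, and the MSE measures what the
# projection discarded.
import numpy as np
from sklearn.decomposition import PCA as sklearnPCA

rng = np.random.RandomState(0)
samples = rng.multivariate_normal([0, 0], [[2, 1], [1, 2]], 500)
pca = sklearnPCA(n_components=1)
scores = pca.fit_transform(samples)
reconstructed = pca.inverse_transform(scores)
print(((reconstructed - samples) ** 2).mean())  # reconstruction MSE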
def pcaTransform(context, mesh, features, K=5):
    X_std = StandardScaler().fit_transform(features)
    sklearn_pca = sklearnPCA(n_components=K)
    Y_sklearn = sklearn_pca.fit_transform(X_std)
    mu = sklearn_pca.mean_
    mu.shape = (mu.shape[0], 1)
    D = sklearn_pca.explained_variance_
    D_ratio = sklearn_pca.explained_variance_ratio_
    V = sklearn_pca.components_
    print('*' * 40)
    print('DATA ENTRIES SHAPE ::: ', features.shape)
    print('MEAN MATRIX SHAPE ::: ', mu.shape)
    print('EIGEN VALUES SHAPE ::: ', D.shape)
    print('EIGEN VECTORS SHAPE ::: ', V.shape)
    print('TRANSFORMED SHAPE ::: ', Y_sklearn.shape)
    sio.savemat(
        bpy.path.abspath('%s/%s.mat' % (mesh.signatures_dir, mesh.name)), {
            'eigenvectors': V.T,
            'eigenvalues': D,
            'mu': mu,
            'X': X_std,
            'XMinusMu': (X_std.T - mu),
            'transformed': Y_sklearn
        })
    print('FINISHED SAVING ::: %s/%s.mat' % (mesh.signatures_dir, mesh.name))
    return mu, Y_sklearn
def f(train, threshold, test):
    hi = h(train)
    h_score = pd.DataFrame(hi, index=np.array(range(1, 21149)))
    # select the qualified genes
    gene_ls = h_score.index[h_score.iloc[:, 0] > 1].tolist()
    candidate_genes = ['V{0}'.format(element) for element in gene_ls]
    stdsc = preprocessing.StandardScaler()
    np_scaled_train = stdsc.fit_transform(train.loc[:, candidate_genes])
    np_scaled_test = stdsc.transform(test.loc[:, candidate_genes])
    pca = sklearnPCA(n_components=1)
    X_train_pca = pca.fit_transform(np_scaled_train)
    X_test_pca = pca.transform(np_scaled_test)
    eigen_val = pca.explained_variance_  # the eigenvalue is the explained variance
    # assign the PCA score to the test dataset
    test = test.assign(w=pd.Series(np.ones(len(test.patient_id))))
    test['w'] = X_test_pca
    testset_surv = test[['event_free_survival_time_days', 'death', 'w']]
    # do Cox regression using the Cox proportional hazards model
    cph = CoxPHFitter()
    cph.fit(testset_surv, 'event_free_survival_time_days', event_col='death')
    return cph.print_summary()
def format_data(self):
    kdd_train_data = np.concatenate([
        self.train_kdd_numeric, self.train_kdd_binary, self.train_kdd_nominal
    ], axis=1)
    kdd_test_data = np.concatenate([
        self.test_kdd_numeric, self.test_kdd_binary, self.test_kdd_nominal
    ], axis=1)
    kdd_train_data = np.concatenate(
        [kdd_train_data, self.train_kdd_label_2classes], axis=1)
    kdd_test_data = np.concatenate(
        [kdd_test_data, self.test_kdd_label_2classes], axis=1)
    self.X_train = kdd_train_data[:, :-1]
    self.X_test = kdd_test_data[:, :-1]
    y_train = kdd_train_data[:, -1]
    y_test = kdd_test_data[:, -1]
    # fit PCA on the training set only, then project both sets
    data_pca = sklearnPCA(n_components=15)
    data_pca = data_pca.fit(self.X_train)
    self.X_train = data_pca.transform(self.X_train)
    self.X_test = data_pca.transform(self.X_test)
    self.y_train = np.array(list(map(int, y_train)))
    self.y_test = np.array(list(map(np.int64, y_test)))
def run_pca(expression):
    # Load expression data
    df = pd.read_table(expression, header=0, index_col=0)
    run_ids = list(df.columns.values)
    dataMatrix = np.transpose(np.array(df))
    run_ids = [s.replace('.htseq', '') for s in run_ids]
    # Run PCA
    sklearn_pca = sklearnPCA(n_components=2)
    sklearn_transf = sklearn_pca.fit_transform(
        preprocessing.maxabs_scale(dataMatrix, axis=0))
    with sns.axes_style("whitegrid"):
        for run, pca_data in zip(run_ids, sklearn_transf):
            plt.plot(pca_data[0], pca_data[1], 'o', markersize=7,
                     alpha=0.5, color='gray')
            plt.text(pca_data[0], pca_data[1], run)
        plt.xlabel('PC 1 (%0.2f %%)' %
                   (sklearn_pca.explained_variance_ratio_[0] * 100))
        plt.ylabel('PC 2 (%0.2f %%)' %
                   (sklearn_pca.explained_variance_ratio_[1] * 100))
        plt.show()
def PCA(df, class_name):
    # Cannot do PCA with fewer than two columns; keep the return arity
    # consistent with the normal path (the original returned four values
    # here but three below)
    if len(list(df)) < 2:
        return df, [], []
    # Figure out which columns can be considered
    # (only float columns, and never the class column)
    cols = []
    for item in list(df):
        if 'float' in df[item].dtypes.name:
            if item != class_name:
                cols.append(df.columns.get_loc(item))
    # Get new dataframe
    df_new = df[df.columns[cols]]
    # Set this as the data to analyze
    X = df_new.values
    # Standardize the data
    X_std = StandardScaler().fit_transform(X)
    # Do PCA
    pca = sklearnPCA(n_components=len(list(df_new)))
    Y = pca.fit_transform(X_std)
    # Get variance contributions
    var_exp = pca.explained_variance_ratio_
    cum_var_exp = pca.explained_variance_ratio_.cumsum()
    return Y, var_exp, cum_var_exp
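# Hedged sketch (synthetic data): picking the number of components from the
# cumulative explained-variance curve that PCA() above returns.
import numpy as np
from sklearn.decomposition import PCA as sklearnPCA
from sklearn.preprocessing import StandardScaler

X = np.random.RandomState(2).rand(100, 8)
pca = sklearnPCA().fit(StandardScaler().fit_transform(X))
cum_var = pca.explained_variance_ratio_.cumsum()
n_keep = int(np.searchsorted(cum_var, 0.90) + 1)  # smallest n reaching 90%
print(n_keep, cum_var)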
def pca(self, winSize):
    data = np.zeros((len(self.data), len(self.data['FUNC'][winSize])))
    i = 0
    for dEl in sorted(self.data):
        self.data[dEl][winSize] = normalizeMaxMin(self.data[dEl][winSize])
        data[i] = self.data[dEl][winSize]
        i += 1
    X_std = StandardScaler().fit_transform(np.transpose(data))
    sklearn_pca = sklearnPCA(n_components=2)
    Y_sklearn = sklearn_pca.fit_transform(X_std)
    traces = []
    trace = go.Scatter(
        x=Y_sklearn[:, 0],
        y=Y_sklearn[:, 1],
        mode='markers',
        marker=go.Marker(size=12,
                         line=go.Line(color='rgba(217, 217, 217, 0.14)',
                                      width=0.5),
                         opacity=0.8))
    traces.append(trace)
    data = go.Data(traces)
    layout = go.Layout(xaxis=go.XAxis(title='PC1', showline=False),
                       yaxis=go.YAxis(title='PC2', showline=False))
    fig = go.Figure(data=data, layout=layout)
    if self.outputType == 'file':
        print(py.plot(fig, filename='pca.html'))
    else:
        return py.plot(fig, output_type='div')
def dataframe_components(df2, lon, columns):
    import numpy as np
    import pandas as pd
    from sklearn.decomposition import PCA as sklearnPCA
    from sklearn.preprocessing import StandardScaler

    X = df2.values
    X_std = StandardScaler().fit_transform(X)
    pca = sklearnPCA(n_components=lon).fit_transform(X_std)
    # Build a dataframe with the principal components
    list_comp_pca = []
    for i in range(0, lon):
        v = "Componente" + str(i)
        list_comp_pca.append(v)
    dd1 = pd.DataFrame(X_std, columns=columns)
    dd2 = pd.DataFrame(pca, columns=list_comp_pca)
    df3 = pd.concat([dd1, dd2], axis=1)
    return df3
def pca(self):
    # remove WHERE when table cleaned up to remove header rows
    statement = ("""SELECT transcript_id, TPM, sample_id FROM %s
                    where transcript_id != 'Transcript'
                 """ % self.table)
    # fetch data
    df = self.getDataFrame(statement)
    # pivot the dataframe so rows = genes, cols = samples, cells contain TPM
    pivot_df = df.pivot("transcript_id", "sample_id")["TPM"]
    # filter the dataframe to get rid of genes where TPM == 0 across samples
    filtered_df = pivot_df[pivot_df.sum(axis=1) > 0]
    # add a small pseudocount and log transform the data
    logdf = np.log(filtered_df + 0.1)
    # scale the dataframe so variance = 1 across rows
    logscaled = sklearn_scale(logdf, axis=1)
    # turn the array back into a df and add transcript id back to the index
    logscaled_df = pd.DataFrame(logscaled)
    logscaled_df.index = list(logdf.index)
    # now do the PCA - can change n_components
    sklearn_pca = sklearnPCA(n_components=self.n_components)
    sklearn_pca.fit(logscaled_df)
    index = logdf.columns
    return sklearn_pca, index
def pca_analysis(indexname, dataframe):
    df = dataframe
    column_count = len(df.columns)
    X = df.iloc[:, 1:column_count].values
    zip_codes = df.iloc[:, 0].values
    # Standardize data
    X_std = StandardScaler().fit_transform(X)
    # Generate PCA components
    sklearn_pca = sklearnPCA(n_components=1)
    Y_sklearn = sklearn_pca.fit_transform(X_std)
    explained_ratio = sklearn_pca.explained_variance_ratio_
    covariance_array = sklearn_pca.get_covariance()
    df_final = pd.DataFrame({'zip5': zip_codes, indexname: Y_sklearn[:, 0]})
    return df_final
def __init__(self):
    """ c'tor """
    self.pca = sklearnPCA(n_components=2)
    self.fit_executed = False
    self.X_tr = None
    self.Xn_tr = None
def testPCA(self, dist):
    sklearn_pca = sklearnPCA(n_components=2)
    # flip the sign so the library result matches our convention
    # (PCA components are only defined up to sign)
    sklearn_transf = -1 * sklearn_pca.fit_transform(dist)
    print('MYPCA')
    print(self.PCA(dist, 2)[0])
    print('PCALibrary')
    print(sklearn_transf)
def read_dataset(Normalize=1):
    data = pd.read_csv('../../Dataset/Iris/Iris_2_classes.csv')  # read dataset
    data['Species'] = data['Species'].replace(
        ["Iris-setosa", "Iris-versicolor"], (0, 1))  # encode label
    y = data['Species']  # label
    x = data.drop(['Species', 'Id'], axis=1)  # drop label and id
    x = np.asarray(x)
    y = np.asarray(y)
    x = np.nan_to_num(x)  # convert any NaN to 0
    train_x, test_x, train_y, test_y = train_test_split(
        x, y, test_size=0.2, random_state=50)  # split 80% train / 20% test
    if Normalize == 1:  # normalize dataset
        scaler = MinMaxScaler()
        train_x = scaler.fit_transform(train_x)
        test_x = scaler.transform(test_x)
    # draw data
    pca = sklearnPCA(n_components=2)  # 2-dimensional PCA
    transX = pd.DataFrame(pca.fit_transform(x))
    plt.scatter(transX[y == 0][0], transX[y == 0][1], label='Class 1', c='red')
    plt.scatter(transX[y == 1][0], transX[y == 1][1], label='Class 2', c='blue')
    plt.legend()
    plt.show()
    return x, y, train_x, train_y, test_x, test_y
def pcaWiki(self, file):
    self.formDataPCA(file)
    X_std = StandardScaler().fit_transform(self.X)
    sklearn_pca = sklearnPCA(n_components=2)
    Y_sklearn = sklearn_pca.fit_transform(X_std)
    traces = []
    for name in self.names:
        trace = go.Scatter(
            x=Y_sklearn[self.y == name, 0],
            y=Y_sklearn[self.y == name, 1],
            mode='markers',
            name=name,
            marker=go.Marker(size=12,
                             line=go.Line(color='rgba(217, 217, 217, 0.14)',
                                          width=0.5),
                             opacity=0.8))
        traces.append(trace)
    data = go.Data(traces)
    layout = go.Layout(xaxis=go.XAxis(title='PC1', showline=False),
                       yaxis=go.YAxis(title='PC2', showline=False))
    fig = go.Figure(data=data, layout=layout)
    if self.outputType == 'file':
        print(py.plot(fig, filename='pca'))
    else:
        return py.plot(fig, include_plotlyjs='False', output_type='div')
def main():
    with open('dataset.pkl', 'rb') as f:
        xr, y, features = pickle.load(f)
    Xw = xr
    Xw = Xw.fillna(method='ffill')
    Xw = Xw.fillna(method='bfill')
    # note: this unpacking relies on value_counts() returning the classes
    # in exactly this frequency order
    Wk, Tr, Bs, Cr, Nt, Bk = y.value_counts()
    print('Number of Bike: ', Bk)
    print('Number of Bus: ', Bs)
    print('Number of Car: ', Cr)
    print('Number of Nothing: ', Nt)
    print('Number of Train: ', Tr)
    print('Number of Walk: ', Wk)
    pca = sklearnPCA(n_components=2)  # 2-dimensional PCA
    transformed = pd.DataFrame(pca.fit_transform(Xw))
    plt.scatter(transformed[y == 'Walk'][0], transformed[y == 'Walk'][1],
                label='Walk', c='darkgreen')
    plt.scatter(transformed[y == 'Bike'][0], transformed[y == 'Bike'][1],
                label='Bike', c='red')
    plt.scatter(transformed[y == 'Train'][0], transformed[y == 'Train'][1],
                label='Train', c='yellow')
    plt.scatter(transformed[y == 'Bus'][0], transformed[y == 'Bus'][1],
                label='Bus', c='blue')
    plt.scatter(transformed[y == 'Car'][0], transformed[y == 'Car'][1],
                label='Car', c='lightgreen')
    plt.scatter(transformed[y == 'Nothing'][0], transformed[y == 'Nothing'][1],
                label='Nothing', c='black')
    plt.legend()
    plt.show()
def pca(X, y, labels, pic_file, PCA=True):
    from mpl_toolkits.mplot3d import Axes3D
    # fit the data for PCA and plotting;
    # when PCA=False, plot the original data in 3D
    pca = sklearnPCA(n_components=3)
    if PCA:
        X_pca = pd.DataFrame(pca.fit_transform(X))
    else:
        X_pca = X
    # colors and markers for each class
    colors = {0: "b", 1: "r", 2: "g", 3: "c", 4: "m", 5: "k"}
    markers = {0: "o", 1: "^", 2: "D", 3: "*", 4: "x", 5: "p"}
    num_labels = len(labels)
    # Set a figure object
    ans = input("Save 3D plot of samples as a picture? (y/n): ")
    save = ans in ("y", "Y")
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(111, projection="3d")
    for i in range(num_labels):
        X_in_cls = X_pca[y == labels[i]]
        ax.scatter(X_in_cls[0], X_in_cls[1], X_in_cls[2],
                   c=colors[i], marker=markers[i], label=labels[i])
    ax.legend()
    plt.title(pic_file.split("_")[0])
    plt.show()
    if save:
        fig.savefig("%s%s_pca.png" % (VAR.out_path, pic_file),
                    bbox_inches="tight")
def cluster_model(users_data, num_cluster=3):
    array_users = users_data.values
    X = array_users[:, 1:17]
    X_std = StandardScaler().fit_transform(X)
    sklearn_pca = sklearnPCA(n_components=3)
    Y_sklearn = sklearn_pca.fit_transform(X_std)
    eigenValues = sklearn_pca.explained_variance_ratio_
    loadings = sklearn_pca.components_
    mu = np.mean(X, axis=0)
    # reconstruct the data from the first two components
    # (note: as in the original, the PCA was fitted on X_std but is
    # applied here to the raw X)
    nComp = 2
    Xhat = np.dot(sklearn_pca.transform(X)[:, :nComp],
                  sklearn_pca.components_[:nComp, :])
    Xhat = mu + Xhat
    Xhat = pd.DataFrame(Xhat)
    # X = first two columns of the PCA-reconstructed data
    X = Xhat.iloc[:, 0:2]
    k = num_cluster  # number of clusters in which to partition the data
    kmeans = KMeans(n_clusters=k)  # run the k-means algorithm
    kmeans.fit(X)
    centroids = kmeans.cluster_centers_  # centroid coordinates per cluster
    labels = kmeans.labels_  # labels assigned to each data point
    final_labels = users_data[['user_id']].copy()
    final_labels['labels'] = pd.DataFrame(labels)
    return final_labels
def run_pca(table, n_components):
    '''
    Runs PCA on a given table for a given number of components.

    Params:
        table (2d array): array of traces
        n_components (int): number of components to keep

    Returns:
        covar_matrix: the fitted PCA object
        variance (array): explained variance ratios
        components (array): the principal components
    '''
    # calculate variance explained and cumulative variance explained
    # (the original hard-coded n_components=10 and ignored the parameter)
    covar_matrix = sklearnPCA(n_components=n_components)
    covar_matrix.fit(table)
    variance = covar_matrix.explained_variance_ratio_
    var = np.cumsum(np.round(variance, decimals=3) * 100)
    # plot the variance explained with [n] features
    plt.ylabel('% variance explained')
    plt.xlabel('# of principal components')
    plt.title('Variance Explained')
    plt.style.context('seaborn-whitegrid')
    plt.plot(variance[:n_components])
    components = covar_matrix.components_
    # the original stacked three return statements, so only the first ever
    # ran; return everything as a tuple instead
    return covar_matrix, variance, components
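# Usage sketch for the repaired run_pca above (the table is synthetic, and
# this assumes run_pca's numpy/matplotlib imports are in scope; the original
# only ever returned covar_matrix because of the stacked returns).
import numpy as np

table = np.random.RandomState(3).rand(50, 12)
covar_matrix, variance, components = run_pca(table, n_components=5)
print(variance.sum(), components.shape)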
def pca_built(self, all_samples):
    from sklearn.decomposition import PCA as sklearnPCA
    sklearn_pca = sklearnPCA(n_components=2)
    sklearn_transf = sklearn_pca.fit_transform(all_samples.T)
    # flip the sign of the scores (PCA components are defined up to sign)
    sklearn_transf = sklearn_transf * (-1)
    plt.plot(sklearn_transf[0:20, 0], sklearn_transf[0:20, 1], 'o',
             markersize=7, color='yellow', alpha=0.5, label='class1')
    plt.plot(sklearn_transf[20:40, 0], sklearn_transf[20:40, 1], '^',
             markersize=7, color='black', alpha=0.5, label='class2')
    plt.xlabel('x_values')
    plt.ylabel('y_values')
    plt.xlim([-4, 4])
    plt.ylim([-4, 4])
    plt.legend()
    plt.title('Transformed samples with class labels from built-in PCA')
    plt.draw()
    plt.show()
def implement_pca_between_two_frames(image1, image2):
    # read images
    pic1 = cv2.imread(image1)
    pic2 = cv2.imread(image2)
    # convert BGR to grayscale
    prvs = cv2.cvtColor(pic1, cv2.COLOR_BGR2GRAY)
    nxt = cv2.cvtColor(pic2, cv2.COLOR_BGR2GRAY)
    # calculate dense optical flow
    flow = cv2.calcOpticalFlowFarneback(prvs, nxt, None,
                                        0.5, 3, 15, 3, 5, 1.2, 0)
    # obtain the angle matrix: _ is the magnitude; angle_matrix is
    # measured in degrees
    _, angle_matrix = cv2.cartToPolar(flow[..., 0], flow[..., 1],
                                      angleInDegrees=True)
    # run a plain PCA on the coarse foreground
    sklearn_pca = sklearnPCA()
    angle_std = StandardScaler().fit_transform(angle_matrix)
    sklearn_pca.fit_transform(angle_std)
    # convert to uint8
    pca_implement = angle_std.astype(np.uint8)
    # write image
    cv2.imwrite('pca_fore_ground_matrix_' + str(image1) + '.png',
                pca_implement)
    cv2.destroyAllWindows()
def kmeans():
    yeast_t = 7
    yeast_k = 6
    yeastData = np.empty([614, 7], dtype=float)
    with open('YeastGene.csv', newline='') as yeastdata:
        yeastreader = csv.reader(yeastdata, delimiter=',')
        for i, row in enumerate(yeastreader):
            yeastData[i] = row
    yeastCentroid = np.empty([yeast_k, 7], dtype=float)
    with open('YeastGene_Initial_Centroids.csv', newline='') as yeastdata:
        yeastreader = csv.reader(yeastdata, delimiter=',')
        for i, row in enumerate(yeastreader):
            yeastCentroid[i] = row
    for t in range(0, yeast_t):
        yeast_c = [[] for i in range(0, yeast_k)]
        minCentroid = []
        # assign each point to its nearest centroid
        for arr in yeastData:
            for cen in yeastCentroid:
                minCentroid.append(np.linalg.norm(arr - cen))
            yeast_c[minCentroid.index(min(minCentroid))].append(arr)
            minCentroid = []
        # recompute each centroid as the mean of its cluster
        for k in range(0, yeast_k):
            yeastCentroid[k] = [float(sum(l)) / len(l)
                                for l in zip(*yeast_c[k])]
    # print the cluster sizes
    print(len(yeast_c[0]), len(yeast_c[1]), len(yeast_c[2]),
          len(yeast_c[3]), len(yeast_c[4]), len(yeast_c[5]))
    clusters = np.zeros([614, 7], dtype=float)
    prev_len = 0
    for i in range(0, 6):
        for j in range(0, len(yeast_c[i])):
            clusters[prev_len] = yeast_c[i][j]
            prev_len += 1
    sklearn_pca = sklearnPCA(n_components=2)
    transf = sklearn_pca.fit_transform(clusters)
    plt.plot(transf[0:140, 0], transf[0:140, 1], '*', markersize=7,
             color='blue', alpha=0.5, label='cluster 1')
    plt.plot(transf[140:191, 0], transf[140:191, 1], '*', markersize=7,
             color='red', alpha=0.5, label='cluster 2')
    plt.plot(transf[191:355, 0], transf[191:355, 1], '*', markersize=7,
             color='green', alpha=0.5, label='cluster 3')
    plt.plot(transf[355:376, 0], transf[355:376, 1], '*', markersize=7,
             color='indigo', alpha=0.5, label='cluster 4')
    plt.plot(transf[376:538, 0], transf[376:538, 1], '*', markersize=7,
             color='yellow', alpha=0.5, label='cluster 5')
    plt.plot(transf[538:614, 0], transf[538:614, 1], '*', markersize=7,
             color='black', alpha=0.5, label='cluster 6')
    plt.xlim([-10, 10])
    plt.ylim([-10, 10])
    plt.legend()
    plt.title("Kmeans")
    plt.show()
def dimensionalityReduction(self, nr=5):
    '''Applies all the dimensionality reduction techniques available
    in this class:
        'PCA'
        'FactorAnalysis'
        'KPCArbf', 'KPCApoly'
        'KPCAcosine', 'KPCAsigmoid'
        'IPCA'
        'FastICADeflation'
        'FastICAParallel'
        'Isomap'
        'LLE'
        'LLEmodified'
        'LLEltsa'
    '''
    dataset = self.ModelInputs['Dataset']
    sklearn_pca = sklearnPCA(n_components=nr)
    p_components = sklearn_pca.fit_transform(dataset)
    fa = FactorAnalysis(n_components=nr)
    factors = fa.fit_transform(dataset)
    kpca = KernelPCA(nr, kernel='rbf')
    rbf = kpca.fit_transform(dataset)
    kpca = KernelPCA(nr, kernel='poly')
    poly = kpca.fit_transform(dataset)
    kpca = KernelPCA(nr, kernel='cosine')
    cosine = kpca.fit_transform(dataset)
    kpca = KernelPCA(nr, kernel='sigmoid')
    sigmoid = kpca.fit_transform(dataset)
    ipca = IncrementalPCA(nr)
    i_components = ipca.fit_transform(dataset)
    fip = FastICA(nr, algorithm='parallel')
    fid = FastICA(nr, algorithm='deflation')
    # the original swapped these two assignments, so the 'FastICADeflation'
    # key actually held the parallel result (and vice versa)
    ficaP = fip.fit_transform(dataset)
    ficaD = fid.fit_transform(dataset)
    # the Isomap / LLE variants remain disabled, as in the original:
    '''isomap = Isomap(n_components=nr).fit_transform(dataset)
    try:
        lle1 = LocallyLinearEmbedding(n_components=nr).fit_transform(dataset)
    except ValueError:
        lle1 = LocallyLinearEmbedding(n_components=nr,
                                      eigen_solver='dense').fit_transform(dataset)
    try:
        lle2 = LocallyLinearEmbedding(n_components=nr,
                                      method='modified').fit_transform(dataset)
    except ValueError:
        lle2 = LocallyLinearEmbedding(n_components=nr, method='modified',
                                      eigen_solver='dense').fit_transform(dataset)
    try:
        lle3 = LocallyLinearEmbedding(n_components=nr,
                                      method='ltsa').fit_transform(dataset)
    except ValueError:
        lle3 = LocallyLinearEmbedding(n_components=nr, method='ltsa',
                                      eigen_solver='dense').fit_transform(dataset)'''
    values = [p_components, factors, rbf, poly, cosine, sigmoid,
              i_components, ficaD, ficaP]  # , isomap, lle1, lle2, lle3]
    keys = ['PCA', 'FactorAnalysis', 'KPCArbf', 'KPCApoly', 'KPCAcosine',
            'KPCAsigmoid', 'IPCA', 'FastICADeflation',
            'FastICAParallel']  # , 'Isomap', 'LLE', 'LLEmodified', 'LLEltsa']
    self.ModelInputs.update(dict(zip(keys, values)))
    [self.datasetsAvailable.append(key) for key in keys]
    return self
def apply_pca(data):
    from sklearn.preprocessing import StandardScaler
    from sklearn.decomposition import PCA as sklearnPCA

    X_std = StandardScaler().fit_transform(data)
    sklearn_pca = sklearnPCA(n_components=2)
    Y_sklearn = sklearn_pca.fit_transform(X_std)
    return Y_sklearn
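# Minimal usage sketch for apply_pca above, on synthetic data.
import numpy as np

data = np.random.RandomState(4).rand(30, 6)
Y = apply_pca(data)
print(Y.shape)  # (30, 2): samples projected onto the first two components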
def pcaDecomp(data, normalize=True):
    if normalize:
        data = StandardScaler().fit_transform(data)
    pca = sklearnPCA(n_components=2)
    decomp = pca.fit_transform(data)
    histo2d(decomp, ranged=False)
def pca(self, samples):
    '''Apply PCA from sklearn.'''
    sklearn_pca = sklearnPCA(n_components=2)
    # Fit the model with samples
    fit = sklearn_pca.fit(samples)
    # Apply the dimensionality reduction to samples
    pca = fit.transform(samples)
    return pca
def pca_json(df, n_components=4, exp_var_min=.05):
    sklearn_pca = sklearnPCA(n_components=n_components)
    pca_points = sklearn_pca.fit_transform(df.T)
    exp_var, num_pc = pc_to_keep(sklearn_pca.explained_variance_ratio_,
                                 exp_var_min)
    pca_points_df = trim_pc(pca_points, num_pc)
    pca_points_df['sample'] = df.columns.values
    pca_points_df = append_exp_var(pc_df=pca_points_df, exp_var_list=exp_var,
                                   num_pc=num_pc)
    return pca_points_df
def sklearn_practice():
    from sklearn.decomposition import PCA as sklearnPCA
    import numpy as np

    class1_sample, class2_sample = random_gen()
    all_samples = np.concatenate((class1_sample, class2_sample), axis=1)
    sklearn_pca = sklearnPCA(n_components=2)
    sklearn_transf = sklearn_pca.fit_transform(all_samples.T)
    return sklearn_transf
def plotGraph(samples, n_samples, tags, dimensions):
    colours = ['blue', 'red', 'green', 'yellow', 'black']
    n_tags = len(tags)
    if dimensions == '2D':
        sklearn_pca = sklearnPCA(n_components=2)
        sklearn_transf = sklearn_pca.fit_transform(samples)
        for i in range(n_tags):
            plt.plot(sklearn_transf[i * n_samples:(i + 1) * n_samples, 0],
                     sklearn_transf[i * n_samples:(i + 1) * n_samples, 1],
                     'o', markersize=7, color=colours[i], alpha=0.5,
                     label=tags[i])
        plt.xlabel('Feature 1')
        plt.ylabel('Feature 2')
        plt.legend()
        plt.title('PCA')
    elif dimensions == '3D':
        sklearn_pca = sklearnPCA(n_components=3)
        sklearn_transf = sklearn_pca.fit_transform(samples)
        fig = plt.figure(figsize=(8, 8))
        ax = fig.add_subplot(111, projection='3d')
        plt.rcParams['legend.fontsize'] = 10
        for i in range(n_tags):
            ax.plot(sklearn_transf[i * n_samples:(i + 1) * n_samples, 0],
                    sklearn_transf[i * n_samples:(i + 1) * n_samples, 1],
                    sklearn_transf[i * n_samples:(i + 1) * n_samples, 2],
                    'o', markersize=8, color=colours[i], alpha=0.5,
                    label=tags[i])
        plt.title('PCA')
        ax.legend(loc='upper right')
    plt.show()
    return True
def plotGraph(samples, word, dimensions):
    if dimensions == '2D':
        sklearn_pca = sklearnPCA(n_components=2)
        sklearn_transf = sklearn_pca.fit_transform(samples)
        plt.plot(sklearn_transf[:, 0], sklearn_transf[:, 1],
                 'o', markersize=7, color='blue', alpha=0.5, label='')
        plt.xlabel('Feature 1')
        plt.ylabel('Feature 2')
        plt.ylim([-.8, .8])
        plt.legend()
        plt.title('Word embeddings PCA')
        print(sklearn_transf)
    elif dimensions == '3D':
        sklearn_pca = sklearnPCA(n_components=3)
        sklearn_transf = sklearn_pca.fit_transform(samples)
        fig = plt.figure(figsize=(8, 8))
        ax = fig.add_subplot(111, projection='3d')
        plt.rcParams['legend.fontsize'] = 10
        ax.plot(sklearn_transf[:, 0], sklearn_transf[:, 1],
                sklearn_transf[:, 2], 'o', markersize=8, color='blue',
                alpha=0.5, label='')
        plt.title('Word embeddings PCA')
        ax.legend(loc='upper right')
        print(sklearn_transf)
    plt.savefig("%s-%s.png" % (word, dimensions), bbox_inches='tight',
                dpi=200)
    plt.close()
    return True
def Seleccion_Ratios(df):
    import numpy as np
    import pandas as pd
    from sklearn.decomposition import PCA as sklearnPCA
    from sklearn.preprocessing import StandardScaler
    from sklearn.ensemble import RandomForestClassifier

    # Before computing the PCAs, drop the target and id columns.
    df.columns = [x.lower() for x in df.columns]
    objetivo = [col for col in df.columns if 'target' in col]
    objetivo = ''.join(objetivo)
    borrar = ['id', objetivo]
    dfaux = df.drop(borrar, axis=1)
    ListaColumnas = dfaux.columns
    tamDf = len(dfaux.columns)
    X_std = StandardScaler().fit_transform(dfaux.values)
    pca = sklearnPCA(n_components=tamDf).fit_transform(X_std)
    # name one column per principal component (the original iterated over
    # pca.shape[0], the number of rows, instead of pca.shape[1])
    columnas_pca = []
    for i in range(0, pca.shape[1]):
        v = "VAR_PCA_" + str(i)
        columnas_pca.append(v)
    df1 = pd.DataFrame(X_std, columns=ListaColumnas)
    df2 = pd.DataFrame(pca, columns=columnas_pca)
    df_PCA = pd.concat([df1, df2], axis=1)
    y = df[objetivo]
    forest = RandomForestClassifier(n_estimators=250, random_state=0)
    forest.fit(df_PCA, y)
    importances = forest.feature_importances_
    std = np.std([tree.feature_importances_ for tree in forest.estimators_],
                 axis=0)
    indices = np.argsort(importances)[::-1]
    # Print the ranking of the best 30
    print("TOP 30:")
    for f in range(30):
        print("%d. Ratio %s (%f)" % (f + 1, df_PCA.columns[indices[f]],
                                     importances[indices[f]]))
def plot(self):
    self.train()
    # this will get the data frame in self.mllib.X_train
    X = self.mllib.X_train.iloc[:, :-1]
    Y = self.mllib.X_train.iloc[:, -1]
    # project the data onto a 3D axis
    scaler = sklearnPCA(n_components=3).fit(X)
    X = scaler.transform(X)
    # Series.reshape was removed; go through the underlying numpy array
    Y = Y.to_numpy().reshape(Y.shape[0], 1)
    X = np.append(X, Y, 1)
    self.mllib.plot(X)
def dim_reduction_PCA(X, n_dim):
    """
    Reduce the dimension by PCA.
    :param X: matrix data (n*k); n is the number of samples,
              k is the dimension of each sample
    :param n_dim: number of dimensions to reduce to
    :return reduced_X: matrix data (n*n_dim)
    """
    try:
        reduced_X = sklearnPCA(n_components=n_dim).fit_transform(X)
    except ValueError:
        print("Dimension Error")
        reduced_X = []
    return reduced_X
def best_dimension(X, n_com=0.8):
    """
    Get the number of dimensions needed to explain a fraction n_com
    of the variance.
    :param X: matrix data (n*k); n is the number of samples,
              k is the dimension of each sample
    :param n_com: fraction of variance to explain, in (0, 1)
    :return: the chosen number of components (0 on error)
    """
    # the original returned from a finally block, which silently overrode
    # the 'return 0' in the except branch
    pca = sklearnPCA(n_components=n_com)
    try:
        pca.fit_transform(X)
    except ValueError:
        print("Dimension Error")
        return 0
    return pca.n_components_
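# Standalone sketch (synthetic data) of the behaviour best_dimension relies
# on: a float n_components in (0, 1) makes PCA keep the smallest number of
# components whose cumulative explained variance reaches that fraction.
import numpy as np
from sklearn.decomposition import PCA as sklearnPCA

X = np.random.RandomState(5).rand(100, 10)
pca = sklearnPCA(n_components=0.8)
pca.fit(X)
print(pca.n_components_, pca.explained_variance_ratio_.sum())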
def deaPCA(df, allres=False, normalise=False, plot=True):
    """
    Extract principal components from a pandas dataframe and shift the
    distribution so that all values are strictly positive, as required
    for DEA.

    Takes:
        df: A dataframe of series to run the PCA on.
        allres: Boolean. Set True to get the PCA object returned instead
            of the transformed data. This can be useful if you wish to use
            the entire results of the PCA. The object is a fit_transformed
            sklearn.decomposition.PCA object.
        normalise: Boolean. Set True to normalise the series to a z-score
            before transforming.
        plot: Should the function display a plot of the variance explained?
    """
    from sklearn.decomposition import PCA as sklearnPCA
    if normalise:
        df = normalise_df(df)
    indat_pca = sklearnPCA()
    indat_transf = pd.DataFrame(indat_pca.fit_transform(df.values),
                                index=df.index)
    pca_colnames = ["PCA" + str(i) for i in indat_transf.columns]
    indat_transf.columns = pca_colnames
    indat_transf_pos = _all_positive(indat_transf)
    if plot:
        _, ax1 = plt.subplots()
        ax1.plot(np.array(indat_pca.explained_variance_ratio_).cumsum())
        ax1.bar(np.arange(0.1, len(indat_pca.explained_variance_ratio_), 1),
                np.array(indat_pca.explained_variance_ratio_))
        ax1.legend(['Cumulative variance explained',
                    'Variance explained by component'], loc='center right')
        ax1.set_ylabel('Proportion of variance explained')
        ax1.set_title('Variance explained by each principal component')
        ax1.set_xlim(right=len(indat_pca.explained_variance_ratio_))
        ax1.set_ylim(top=1)
    if allres:
        return indat_pca
    else:
        return indat_transf_pos
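# Hedged sketch of the strictly-positive shift deaPCA depends on (the real
# helper is the repo's own _all_positive; shift_positive here is a
# hypothetical stand-in that just illustrates the idea).
import pandas as pd

def shift_positive(df, eps=1e-6):
    # shift each column so its minimum sits just above zero, as DEA requires
    return df - df.min() + eps

print(shift_positive(pd.DataFrame({"a": [-2.0, 0.0, 3.0]})))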
def MakeBlocksArray(band):
    path = '/Users/ryszardcetnarski/Desktop/PcaResults/'
    plt.style.use('seaborn-bright')
    db = LoadDatabase()
    all_normed = []
    for name, subject in db.groupby(db.index):
        blocks = ExtractBlocks(subject, 'training', band)
        blocks_normed = zscore(blocks, axis=None)
        all_normed.append(pd.DataFrame(blocks_normed, index=subject.index))
    all_normed = pd.concat(all_normed)
    all_normed['condition'] = db['condition']
    label_dict = {'plus': 0, 'minus': 1, 'control': 2, 'sham': 3}
    color = ['r', 'b', 'grey', 'g']
    X = all_normed.iloc[:, 0:10].values
    y = all_normed.iloc[:, 10].values
    X_std = StandardScaler().fit_transform(X)
    sklearn_pca = sklearnPCA(n_components=2)
    # fit on the standardized data (the original computed X_std but then
    # ran the PCA on the raw X)
    Y_sklearn = sklearn_pca.fit_transform(X_std)
    fig = plt.figure()
    fig.suptitle(band, fontweight='bold')
    ax = fig.add_subplot(111)
    for idx, row in enumerate(Y_sklearn):
        ax.scatter(row[0], row[1],
                   color=color[label_dict[all_normed.iloc[idx]['condition']]],
                   alpha=0.5)
    ax.set_xlabel('Principal Component 1')
    ax.set_ylabel('Principal Component 2')
    ax.legend(labels=('plus', 'minus', 'control'))
    legend = ax.get_legend()
    legend.legendHandles[0].set_color('red')
    legend.legendHandles[1].set_color('blue')
    legend.legendHandles[2].set_color('green')
def PcaFilter(X, name, PLOT_ON, method):
    """Individual subjects have outlier blocks filtered out based on the
    Mahalanobis distance of their PCA 1st and 2nd components (each subject
    is projected into its own variance space). Returns a mask array
    (of 0's and 1's, length of X) indexing bad and good blocks."""
    # The convention for mask arrays is 0 - inlier, 1 - outlier.
    # PCA decomposition into the first two components
    sklearn_pca = sklearnPCA(n_components=2)
    pcs = sklearn_pca.fit_transform(X)
    if method == 'outlier':
        # This index corresponds to the original index in the array of time
        # series; the last argument is the threshold: how many standard
        # deviations away a point must be to count as an outlier
        outlier_idx = MD_removeOutliers(pcs[:, 0], pcs[:, 1], 2)
        # this will be used as a boolean array for filtering
        mask_array = np.zeros(len(pcs))
        if len(outlier_idx) > 0:
            mask_array[outlier_idx] = 1
    if method == 'cluster':
        mask_array = Cluster(pcs)
    if PLOT_ON:
        colors = ['r', 'b']
        fig = plt.figure()
        fig.suptitle(name)
        ax1 = fig.add_subplot(211)
        ax2 = fig.add_subplot(212)
        # Plot PCA scores and mark outliers
        ax2.scatter(pcs[:, 0], pcs[:, 1], c=mask_array, cmap='jet',
                    s=60, marker='o')
        # Print the variance ratio
        ax2.annotate(sklearn_pca.explained_variance_ratio_, xy=(1, 1),
                     xycoords='axes fraction', horizontalalignment='right',
                     verticalalignment='top')
        # Plot the original signals and mark the PCA-identified outliers
        for idx, row in enumerate(X):
            ax1.plot(row, color=colors[int(mask_array[idx])], alpha=0.2)
    return mask_array
def combine_lda_pca(X, y):
    # supervised LDA projection first, then PCA on the LDA scores
    sklearn_lda = LDA(n_components=2)
    X_lda_sklearn = sklearn_lda.fit_transform(X, y)
    sklearn_pca = sklearnPCA(n_components=2)
    X_ldapca_sklearn = sklearn_pca.fit_transform(X_lda_sklearn)
    plot_scikit_lda(X_ldapca_sklearn, title='LDA+PCA via scikit-learn',
                    mirror=(-1))
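# Self-contained sketch of the LDA -> PCA chain above, on the iris data
# (plot_scikit_lda is the repo's own plotting helper, so it is omitted here).
from sklearn.datasets import load_iris
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.decomposition import PCA as sklearnPCA

X, y = load_iris(return_X_y=True)
X_lda = LDA(n_components=2).fit_transform(X, y)             # supervised projection
X_ldapca = sklearnPCA(n_components=2).fit_transform(X_lda)  # decorrelate the axes
print(X_ldapca.shape)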
# Male
values = []
# Fill the list
for i in maleIndex:
    values.append(distances[column][i])
ci = meanConfidenceInterval(values)
row = pd.Series(["M", str(column), ci[0], ci[1], ci[2]])
ciDf = ciDf.append(row, ignore_index=True)

# Female
values = []
# Fill the list
for i in femaleIndex:
    values.append(distances[column][i])
ci = meanConfidenceInterval(values)
row = pd.Series(["F", str(column), ci[0], ci[1], ci[2]])
ciDf = ciDf.append(row, ignore_index=True)

# Set the dataframe columns
ciDf.columns = ("gender", "distance", "mean", "lowerBound", "upperBound")
# Save the dataframe
ciDf.to_csv("confidence_intervals", index=False)

# PCA
if plotPCA:
    pca = sklearnPCA(n_components=3).fit_transform(difference)
    x = [p[0] for p in pca]
    y = [p[1] for p in pca]
    z = [p[2] for p in pca]
    ax = plt.axes(projection="3d")
    ax.scatter3D(x, y, z)
    plt.show()
def cluster_snapshot():
    # Get the attributes selected in the form (if any)
    request_att = []
    if request.method == "POST":
        request_att = request.form.getlist('att')
        request_att = [str(x) for x in request_att]
    n_times = getData_N_Min_cluster(5)
    if n_times.count() == 0:
        print("No Data")
    data = []
    t = 0
    # Target machines, swap space only ('compute-2-28' removed)
    machine = ['compute-2-29', 'compute-9-30', 'compute-9-36', 'compute-9-35', 'compute-9-34', 'compute-2-23', 'compute-2-25', 'compute-2-24', 'compute-2-27', 'compute-2-26', 'compute-6-29', 'compute-6-28', 'compute-6-25', 'compute-6-24', 'compute-6-27', 'compute-6-26', 'compute-6-23', 'compute-9-33', 'compute-9-32', 'compute-22-17', 'compute-22-16', 'compute-22-15', 'compute-22-14', 'compute-22-13', 'compute-22-12', 'compute-22-11', 'compute-22-18', 'compute-7-39', 'compute-7-38', 'compute-21-29', 'compute-21-28', 'compute-21-27', 'compute-21-26', 'compute-21-25', 'compute-21-24', 'compute-21-23', 'compute-5-1', 'compute-14-1', 'compute-14-2', 'compute-14-3', 'compute-14-4', 'compute-14-6', 'compute-14-7', 'compute-14-8', 'compute-13-6', 'compute-13-5', 'compute-13-4', 'compute-7-40', 'compute-13-2', 'compute-13-1', 'compute-14-30', 'compute-5-8', 'compute-14-32', 'compute-14-33', 'compute-14-34', 'compute-14-35', 'compute-18-18', 'compute-14-37', 'compute-14-38', 'compute-18-17', 'compute-5-3', 'compute-18-15', 'compute-5-5', 'compute-18-13', 'compute-5-7', 'compute-5-6', 'compute-6-2', 'compute-3-41', 'compute-6-1', 'compute-6-6', 'compute-6-7', 'compute-6-4', 'compute-6-5', 'compute-6-8', 'compute-13-28', 'compute-13-29', 'compute-13-26', 'compute-13-27', 'compute-13-24', 'compute-13-25', 'compute-13-23', 'compute-2-10', 'compute-2-11', 'compute-2-12', 'compute-2-14', 'compute-2-15', 'compute-2-16', 'compute-2-17', 'compute-2-18', 'compute-14-40', 'compute-2-8', 'compute-2-9', 'compute-2-7', 'compute-20-40', 'compute-1-9', 'compute-1-8', 'compute-6-11', 'compute-8-40', 'compute-6-14', 'compute-6-15', 'compute-6-16', 'compute-6-17', 'compute-6-10', 'compute-6-12', 'compute-6-13', 'compute-6-18', 'compute-4-29', 'compute-4-28', 'compute-23-38', 'compute-22-2', 'compute-23-36', 'compute-23-37', 'compute-23-34', 'compute-23-35', 'compute-4-27', 'compute-23-33', 'compute-4-25', 'compute-11-18', 'compute-8-38', 'compute-8-39', 'compute-11-17', 'compute-11-16', 'compute-22-40', 'compute-1-11', 'compute-1-10', 'compute-1-13', 'compute-1-12', 'compute-1-15', 'compute-1-14', 'compute-1-17', 'compute-1-16', 'compute-5-15', 'compute-5-14', 'compute-12-8', 'compute-5-16', 'compute-5-11', 'compute-5-10', 'compute-5-13', 'compute-5-12', 'compute-12-2', 'compute-12-3', 'compute-12-1', 'compute-12-6', 'compute-12-7', 'compute-12-4', 'compute-12-5', 'compute-12-27', 'compute-12-26', 'compute-12-18', 'compute-19-37', 'compute-19-36', 'compute-12-10', 'compute-12-11', 'compute-12-12', 'compute-12-13', 'compute-12-14', 'compute-12-15', 'compute-12-16', 'compute-12-17', 'compute-20-37', 'compute-20-36', 'compute-20-35', 'compute-20-39', 'compute-20-38', 'compute-23-39', 'compute-23-32', 'compute-4-26', 'compute-23-30', 'compute-12-23', 'compute-12-22', 'compute-12-25', 'compute-12-24', 'compute-19-39', 'compute-19-38', 'compute-12-29', 'compute-12-28', 'compute-19-35', 'compute-9-27', 'compute-21-40', 'compute-9-28',
               'compute-9-29', 'compute-11-40', 'compute-21-38', 'compute-21-39', 'compute-21-30', 'compute-21-33', 'compute-21-34', 'compute-21-35', 'compute-21-36', 'compute-21-37', 'compute-5-28', 'compute-5-29', 'compute-5-24', 'compute-5-25', 'compute-5-26', 'compute-5-27', 'compute-5-23', 'compute-13-18', 'compute-13-13', 'compute-13-12', 'compute-13-11', 'compute-13-10', 'compute-13-17', 'compute-13-16', 'compute-13-15', 'compute-13-14', 'compute-14-15', 'compute-22-39', 'compute-22-38', 'compute-22-30', 'compute-22-33', 'compute-22-35', 'compute-22-34', 'compute-22-37', 'compute-22-36', 'compute-7-17', 'compute-7-16', 'compute-7-18', 'compute-5-17', 'compute-14-18', 'compute-14-12', 'compute-14-13', 'compute-14-10', 'compute-14-11', 'compute-14-16', 'compute-14-17', 'compute-14-14', 'compute-5-18', 'compute-21-8', 'compute-21-1', 'compute-21-2', 'compute-21-3', 'compute-21-4', 'compute-21-5', 'compute-21-6', 'compute-21-7', 'compute-4-30', 'compute-18-14', 'compute-23-27', 'compute-23-26', 'compute-12-37', 'compute-12-38', 'compute-21-11', 'compute-5-39', 'compute-5-38', 'compute-5-37', 'compute-5-36', 'compute-5-35', 'compute-5-34', 'compute-5-33', 'compute-5-32', 'compute-5-30', 'compute-22-3', 'compute-18-35', 'compute-22-1', 'compute-18-37', 'compute-22-7', 'compute-22-6', 'compute-22-5', 'compute-22-4', 'compute-22-8', 'compute-18-38', 'compute-18-39', 'compute-20-15', 'compute-20-17', 'compute-20-16', 'compute-20-18', 'compute-9-25', 'compute-2-38', 'compute-2-39', 'compute-2-36', 'compute-2-37', 'compute-2-34', 'compute-2-35', 'compute-2-32', 'compute-2-33', 'compute-2-30', 'compute-6-32', 'compute-6-33', 'compute-6-30', 'compute-6-36', 'compute-6-37', 'compute-6-34', 'compute-6-35', 'compute-6-38', 'compute-6-39', 'compute-9-26', 'compute-21-12', 'compute-21-13', 'compute-23-16', 'compute-23-17', 'compute-21-16', 'compute-21-17', 'compute-21-14', 'compute-21-15', 'compute-21-18', 'compute-23-18', 'compute-8-18', 'compute-8-16', 'compute-8-17', 'compute-11-39', 'compute-11-38', 'compute-22-28', 'compute-22-29', 'compute-22-23', 'compute-22-26', 'compute-22-27', 'compute-22-24', 'compute-22-25', 'compute-13-8', 'compute-13-7', 'compute-19-13', 'compute-19-15', 'compute-19-14', 'compute-19-17', 'compute-19-16', 'compute-14-27', 'compute-14-26', 'compute-14-25', 'compute-14-24', 'compute-14-23', 'compute-14-36', 'compute-14-29', 'compute-14-28', 'compute-18-16', 'compute-14-39', 'compute-3-39', 'compute-3-38', 'compute-5-2', 'compute-13-39', 'compute-13-38', 'compute-13-35', 'compute-13-34', 'compute-13-37', 'compute-13-36', 'compute-13-30', 'compute-13-33', 'compute-13-32', 'compute-3-40', 'compute-6-3', 'compute-13-40', 'compute-18-36', 'compute-23-29', 'compute-23-28', 'compute-23-25', 'compute-4-32', 'compute-4-33', 'compute-4-34', 'compute-4-35', 'compute-19-40', 'compute-18-40']
    for t1 in n_times:
        devices = t1['data'].keys()
        for d in machine:
            lst = [d]
            for x in range(0, 39):
                lst.append(t1['data'][d][x][1])
            data.append(lst)
        t = t + 1
    res = ['Device', 'G_Swap_Total', 'G_Swap_Free', 'G_Swap_Used',
           'G_Proc_Run', 'G_Cpu_User', 'G_Cpu_Wio', 'G_Load_One', 'G_Load',
           'G_Five', 'G_Load_Fifteen', 'G_Mem_Cached', 'G_Mem_Total',
           'T_State', 'T_Slots', 'T_SlotsUsed', 'T_AvailMem(MB)',
           'T_TotalMem(MB)/Swap', 'T_Time_Last_Rec', 'T_LoadAve',
           'T_NetLoad(MB)', 'N_Status', 'N_Swap_Service', 'N_Swap_State',
           'N_Swap_Info', 'N_IPMI_Service', 'N_IPMI_State', 'N_IPMI_Info',
           'N_FreeSpace_Service', 'N_FreeSpace_State', 'N_FreeSpace_Info',
           'N_CVMFS-OSG_Service', 'N_CVMFS-OSG_State', 'N_CVMFS-OSG_Info',
           'N_CVMFS-CERN_Service', 'N_CVMFS-CERN_State', 'N_CVMFS-CERN_Info',
           'N_CVMFS-CONDB_Service', 'N_CVMFS-CONDB_State',
           'N_CVMFS-CONDB_Info']
    att = ['G_Swap_Used', 'G_Cpu_User', 'G_Cpu_Wio', 'G_Load_One', 'G_Load',
           'G_Five', 'G_Load_Fifteen', 'G_Mem_Cached', 'T_AvailMem(MB)',
           'T_LoadAve', 'T_NetLoad(MB)']
    new_att = ['Device', 'G_Swap_Used', 'G_Proc_Run', 'G_Cpu_User',
               'G_Cpu_Wio', 'G_Load_One', 'G_Load', 'G_Five',
               'G_Load_Fifteen', 'G_Mem_Cached', 'T_State', 'T_Slots',
               'T_SlotsUsed', 'T_AvailMem(MB)', 'T_Time_Last_Rec',
               'T_LoadAve', 'N_Status', 'N_Swap_State', 'N_IPMI_State',
               'N_IPMI_Info', 'N_FreeSpace_State', 'N_CVMFS-OSG_State',
               'N_CVMFS-CERN_State', 'N_CVMFS-CONDB_State']
    if request_att != []:
        new_att = request_att
        print(request_att)
    new_index = []
    full_index = []
    for i in new_att:
        full_index.append(res.index(i))
    for a in att:
        new_index.append(res.index(a))
    new_data = []
    for d in data:
        core_count = int(d[14])
        if core_count != 0:
            # normalize the per-core attributes by the core count
            for i in new_index:
                d[i] = round(float(d[i]) / core_count, 2)
                d[i] = str(d[i])
        tmp = []
        for i in full_index:
            if i == res.index('N_IPMI_Info'):
                # keep only the numeric error code from the IPMI info string
                code_in_IPMI = re.findall(r'\d+', str(d[i]))
                if code_in_IPMI == []:
                    d[i] = '0'
                else:
                    d[i] = code_in_IPMI[0]
            tmp.append(d[i])
        new_data.append(tmp)
    df = pd.DataFrame(new_data)
    df.columns = new_att
    X = df.iloc[:, 1:len(df.columns)].values
    y = df.iloc[:, 0].values
    from sklearn.preprocessing import StandardScaler
    X_std = StandardScaler().fit_transform(X)
    from sklearn.decomposition import PCA as sklearnPCA
    sklearn_pca = sklearnPCA(n_components=2)
    Y_sklearn = sklearn_pca.fit_transform(X_std)
    x_corr = []
    y_corr = []
    label = []
    x_l = []
    y_l = []
    new_dim_data = dict()
    for lab in machine:
        x_fact = Y_sklearn[y == lab, 0].tolist()
        y_fact = Y_sklearn[y == lab, 1].tolist()
        new_dim_data[lab] = [x_fact, y_fact]
        x_l.append(x_fact)
        y_l.append(y_fact)
    # Store the new dimensions in the database
    post = {"date": datetime.datetime.utcnow(), "data": new_dim_data}
    d_var = db_new_dim.data
    post_id = d_var.insert_one(post).inserted_id
    for x in x_l:
        for x1 in x:
            x_corr.append(x1)
    for y in y_l:
        for y1 in y:
            y_corr.append(y1)
    l = len(x_l[0])
    for lab in machine:
        for i in [lab for x in range(0, l)]:
            label.append(i)
    new_arr = np.array(list(zip(x_corr, y_corr)))
    k_means = KMeans(n_clusters=4)
    k_means.fit(new_arr)
    centroid = k_means.cluster_centers_
    labels = k_means.labels_
    colors = ["green", "red", "cyan", "yellow", "blue"]
    color_src = []
    for i in range(len(x_corr)):
        color_src.append(colors[labels[i]])
    source = ColumnDataSource(data=dict(
        x=x_corr,
        y=y_corr,
        desc=label,
    ))
    hover = HoverTool(tooltips="""
        <div>
            <div>
                <span style="font-size: 17px; font-weight: bold;">@desc</span>
                <span style="font-size: 15px; color: #966;">[$index]</span>
            </div>
            <div>
                <span style="font-size: 15px;">Location</span>
                <span style="font-size: 10px; color: #696;">($x, $y)</span>
            </div>
        </div>
    """)
    TOOLS = ["pan,wheel_zoom,box_zoom,reset,resize", hover]
    p = figure(plot_width=600, plot_height=600, tools=TOOLS,
               title="Mouse over the dots")
    p.circle('x', 'y', size=30, source=source, fill_color=color_src)
    p.scatter(centroid[:, 0], centroid[:, 1], color='black')
    js_resources = INLINE.render_js()
    css_resources = INLINE.render_css()
    # For more details see:
    # http://bokeh.pydata.org/en/latest/docs/user_guide/embedding.html#components
    script, div = components(p, INLINE)
    html = flask.render_template(
        'cluster_snapshot.html',
        plot_script=script,
        plot_div=div,
        js_resources=js_resources,
        css_resources=css_resources,
    )
    return encode_utf8(html)
from sklearn.preprocessing import StandardScaler
X_std = StandardScaler().fit_transform(X)
print(X_std)
from sklearn.decomposition import PCA as sklearnPCA
sklearn_pca = sklearnPCA(n_components=2)
Y_sklearn = sklearn_pca.fit_transform(X_std)
x_corr = []
y_corr = []
label = []
x_l = []
y_l = []
# collect the 2-D scores per machine
for lab in machine:
    x_l.append(Y_sklearn[y == lab, 0].tolist())
    y_l.append(Y_sklearn[y == lab, 1].tolist())
for x in x_l:
import numpy as np
# feature_dict = {0: 'G_Swap_Total', 1: 'G_Swap_Free', 2: 'G_Swap_Used',
#                 3: 'G_Proc_Run', 4: 'G_Cpu_User', 5: 'G_Cpu_Wio',
#                 6: 'G_Load_One', 7: 'G_Load', 8: 'G_Five',
#                 9: 'G_Load_Fifteen', 10: 'G_Mem_Cached', 11: 'G_Mem_Total',
#                 12: 'T_State', 13: 'T_Slots', 14: 'T_SlotsUsed',
#                 15: 'T_AvailMem(MB)', 16: 'T_TotalMem(MB)/Swap',
#                 17: 'T_Time_Last_Rec', 18: 'T_LoadAve', 19: 'T_NetLoad(MB)'}
from sklearn.preprocessing import StandardScaler
X_std = StandardScaler().fit_transform(X)
CLUSTER_SIZE = 4
from sklearn.decomposition import PCA as sklearnPCA
sklearn_pca = sklearnPCA(n_components=CLUSTER_SIZE)
Y_sklearn = sklearn_pca.fit_transform(X_std)
x_corr = []
y_corr = []
label = []
x_l = []
y_l = []
for lab in machine:
    x_l.append(Y_sklearn[y == lab, 0].tolist())
    y_l.append(Y_sklearn[y == lab, 1].tolist())
for x in x_l: