def plot_graph(train_set, idm='1'):
    plt.clf()
    print('Plotting graph of training examples')
    colors = ['red', 'blue', 'cyan', 'yellow', 'black', 'magenta', 'orange',
              'brown', 'purple', 'olive', 'pink']
    cl = {}
    itr = 0
    for i in class_values:
        cl[i] = colors[itr]
        itr += 1
    markers = ['o', 'x']
    mk = {}
    itr = 0
    for i in class_values:
        mk[i] = markers[0]
        itr += 1
    labels = ['Class 0', 'Class 1', 'Class 2', 'Class 3', 'Class 4',
              'Class 5', 'Class 6', 'Class 7', 'Class 8', 'Class 9']
    lab = {}
    itr = 0
    for i in class_values:
        lab[i] = labels[itr]
        itr += 1
    fig = plt.figure()
    # ax1 = fig.add_subplot(211, projection='3d')
    ax2 = fig.add_subplot(111)
    for i in train_set:
        # Gather the feature columns of class i
        newx = {}
        for m in range(len(train_set[i][0])):
            newx[m] = []
        for s in train_set[i]:
            for k in range(len(s)):
                newx[k].append(s[k])
        # ax1.scatter(newx, newy, newz, color=cl[i], marker=mk[i], label=lab[i])
        # print(newx)
        data_mat = np.column_stack((newx[0], newx[1], newx[2]))
        # print(data_mat)
        # Standardize, then project onto the two leading eigenvectors of the covariance matrix
        data_mat_std = StandardScaler().fit_transform(data_mat)
        features = data_mat_std.T
        covariance_mat = np.cov(features)
        eig_vals, eig_vecs = np.linalg.eig(covariance_mat)
        proj_x = data_mat_std.dot(eig_vecs.T[np.argmax(eig_vals)])
        eig_vals[np.argmax(eig_vals)] = 0
        proj_y = data_mat_std.dot(eig_vecs.T[np.argmax(eig_vals)])
        ax2.scatter(proj_x, proj_y, color=cl[i], label=lab[i], s=5)
    plt.legend()
    # plt.xticks(np.linspace(proj_x[np.argmin(proj_x)], proj_x[np.argmax(proj_x)], 5))
    # plt.yticks(np.linspace(proj_y[np.argmin(proj_y)], proj_y[np.argmax(proj_y)], 15))
    plt.savefig('Training_Set' + idm + '.png')
    plt.clf()
def myPCA():
    iris = load_iris()
    # x = feature matrix, y = class labels
    x, y = iris.data, iris.target
    # print(x)
    # Standardize
    x = StandardScaler().fit_transform(x)
    # Covariance matrix
    cov = np.cov(x.T)
    # Eigenvalues and eigenvectors
    val, vec = eig(cov)
    # Pair each eigenvalue with its eigenvector so they can be sorted together
    pairs = [(np.abs(val[i]), vec[:, i]) for i in range(len(val))]
    pairs.sort(key=lambda pair: pair[0], reverse=True)
    # Pick the first two (the ones that "help" the most) and build the new projection matrix
    newMatrix = np.hstack((pairs[0][1].reshape(4, 1), pairs[1][1].reshape(4, 1)))
    newSamples = x.dot(newMatrix)
    print("\n", newSamples)
def pcaClassica(x):
    X = x
    # Standardize the values to mean 0 and standard deviation 1
    entrada = StandardScaler().fit_transform(X)
    # Mean vector of the standardized data (despite its name, this is not the covariance matrix)
    mat_covariancia = np.mean(entrada, axis=0)
    # Covariance matrix; a correlation matrix could be used instead and would give the same result here
    cov_mat = (entrada - mat_covariancia).T.dot(
        (entrada - mat_covariancia)) / (entrada.shape[0] - 1)
    cov_mat = np.cov(entrada.T)
    # Eigenvalues and eigenvectors
    eig_vals, eig_vecs = np.linalg.eig(cov_mat)
    u, s, v = np.linalg.svd(entrada.T)
    # Every eigenvector should have unit length
    for ev in eig_vecs:
        np.testing.assert_array_almost_equal(1.0, np.linalg.norm(ev))
    eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:, i])
                 for i in range(len(eig_vals))]
    # Sort the (eigenvalue, eigenvector) pairs from largest to smallest eigenvalue
    eig_pairs.sort(key=lambda x: x[0], reverse=True)
    matrix_w = np.hstack(
        (eig_pairs[0][1].reshape(4, 1), eig_pairs[1][1].reshape(4, 1)))
    Y = entrada.dot(matrix_w)
    return Y
def principal_component_analysis(self):
    logging.info('Principal component analysis')
    X = self.data[self.all_metrics]
    X_scaled = StandardScaler().fit_transform(X)
    features = X_scaled.T
    cov_matrix = np.cov(features)
    values, vectors = np.linalg.eig(cov_matrix)
    importance = {}
    explained_variances = []
    for i in range(len(values)):
        val = values[i] / np.sum(values)
        explained_variances.append(val)
        importance[val] = self.all_metrics[i]
    logging.info('Explained variances sum {} and list {}'.format(
        np.sum(explained_variances), explained_variances))
    dict_keys = list(importance.keys())
    dict_keys.sort(reverse=True)
    all_in_order = ""
    for k in dict_keys:
        all_in_order += importance[k] + " "
    logging.info(
        'Variables in order of importance {} \n their variances {}'.format(
            all_in_order, dict_keys))
    projected_1 = X_scaled.dot(vectors.T[0])
    projected_2 = X_scaled.dot(vectors.T[1])
    res = pd.DataFrame(projected_1, columns=['PC1'])
    res['PC2'] = projected_2
    self.projected_res = res
def encodeVideo(self, fileName):
    y = None
    # if False:
    if os.path.exists(fileName + '.npa'):
        print('loading {0}'.format(fileName + '.npa'))
        # the file is a pickled ndarray (written with ndarray.dump below)
        y = np.load(fileName + '.npa', allow_pickle=True)
    else:
        flows = self.calculateFlow(fileName)
        if flows.shape[0] == 0:
            return None
        x_std = StandardScaler().fit_transform(flows)
        # s holds the singular values and the rows of v are the eigenvectors (right singular vectors)
        u, s, v = np.linalg.svd(x_std)
        # keep the 40 leading eigenvectors as columns of the projection matrix
        eig_vecs = v[0:40].T
        # transform the data by matrix multiplication with the eigenvectors
        y = x_std.dot(eig_vecs)
        y.dump(fileName + '.npa')
    i = 0
    cflows = []
    for flow in y:
        # i % 1 == 0 keeps every frame; increase the modulus to subsample
        if i % 1 == 0:
            cflows.append(flow)
        i += 1
    return cflows
def pca(X, ndims=3):
    """Runs PCA on the provided data, X, and returns the projection onto ndims
    principal components. This function assumes X has data series in columns.
    This function also returns the covariance matrix of the data (scaled to
    zero mean and unit variance), as well as the eigenvectors and eigenvalues
    of that matrix.

    Input:
        X     : ndarray with data series in columns (e.g. one neuron's calcium
                trace (or DF/F) per column)
        ndims : the number of dimensions to project down to. Default is 3 for
                fancy 3d scatter plots.

    Output:
        Y         : Projected, scaled data.
        cov_mat   : Covariance matrix of the scaled data
        eig_pairs : a list of tuples. Each tuple is of the form
                    (eigenvalue, eigenvector), and they are sorted high to low"""
    original_dims = X.shape[1]
    if ndims > original_dims:
        ndims = original_dims
    # TODO Check what this scaler is actually doing; it might be scaling columns independently
    X_std = StandardScaler().fit_transform(X)
    # covariance of the scaled data, as promised in the docstring
    cov_mat = np.cov(X_std.T)
    eig_vals, eig_vecs = np.linalg.eig(cov_mat)
    eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:, i])
                 for i in range(len(eig_vals))]
    eig_pairs.sort(key=lambda x: x[0], reverse=True)
    # np.hstack needs a sequence, not a generator, so build the projection matrix from a list
    W = np.hstack([eig_pairs[i][1].reshape(original_dims, 1) for i in range(ndims)])
    Y = X_std.dot(W)
    return Y, cov_mat, eig_pairs
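# Usage sketch: a minimal, hypothetical call to the pca() helper above on
# synthetic data; it assumes numpy is imported as np and that the function's
# own dependency (StandardScaler) is already imported in the module.
import numpy as np

rng = np.random.default_rng(0)
X_demo = rng.normal(size=(200, 5))                 # 200 samples, 5 data series in columns
Y_demo, cov_demo, pairs_demo = pca(X_demo, ndims=3)
print(Y_demo.shape)                                # (200, 3): data projected onto 3 PCs
print([round(float(val), 3) for val, _ in pairs_demo])  # eigenvalues, sorted high to low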
def PCA(data, n_components):
    data = StandardScaler().fit_transform(data)
    new_data = calculate_covariance(subtract_by_mean(data))
    eigens = compute_eigen(new_data)
    top_eigs = get_the_top_eigenvectors(eigens, n_components)
    pca = data.dot(top_eigs)
    return pca, top_eigs, np.mean(data, axis=1)
def PCA(raw_data, principal_components):
    # Standardizing
    x_std = StandardScaler().fit_transform(raw_data)  # Produces a 7027x64 matrix (7027 companies, 64 independent variables)

    # CALCULATE CORRELATION MATRIX AND ASSOCIATED EIGENVALUES/EIGENVECTORS
    cor_mat = np.corrcoef(x_std.T)
    eig_vals, eig_vecs = np.linalg.eig(cor_mat)

    # Make a list of (eigenvalue, eigenvector) tuples
    eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:, i]) for i in range(len(eig_vals))]
    # Add df.columns[i] after eig_vecs[:, i] as a third element to include the variable name

    # SORT (by eigenvalue, high to low)
    eig_pairs.sort(key=lambda pair: pair[0], reverse=True)

    # CONSTRUCT PROJECTION MATRIX WITH 7 PRINCIPAL COMPONENTS (~60% INFORMATION RETAINED)
    # pc = principal_components
    matrix_w = np.hstack([eig_pairs[i][1].reshape(64, 1)
                          for i in range(principal_components)])  # Produces a 64 x 'Principal Components' (e.g. 64x7) matrix

    ''' UN-HIDE CODE TO EXPORT REDUCED DATASET AS A CSV '''
    # matrix_w_df = pd.DataFrame(matrix_w)
    # matrix_w_df.to_csv("matrix_w.csv")

    y = x_std.dot(matrix_w)
    y_df = pd.DataFrame(y)
    return y_df
def PCA():
    dataset = readDataset()
    # Step 1: center the data around 0. If the features have different units of
    # measurement, the result should also be divided by the standard deviation.
    scaled = StandardScaler().fit_transform(dataset.X.astype(float))
    # Step 2: compute the correlation matrix of the data, which indicates the degree
    # of numerical interdependence between pairs of variables
    covMatrix = np.corrcoef(scaled.astype(float).T)
    # Step 3: compute the eigenvalues and eigenvectors of that matrix
    w, v = np.linalg.eig(covMatrix)
    # Check how much information can be attributed to each component
    percentage = (w / sum(w)) * 100
    print('Information attributed to each component: ', percentage)
    eig_pairs = [(np.abs(w[i]), v[:, i]) for i in range(len(w))]
    # Stack the eigenvectors horizontally to form the projection matrix.
    matrix_w = np.hstack(
        (eig_pairs[0][1].reshape(4, 1), eig_pairs[1][1].reshape(4, 1),
         eig_pairs[2][1].reshape(4, 1), eig_pairs[3][1].reshape(4, 1)))
    X = scaled.dot(matrix_w)
    df = pd.DataFrame(data=X, columns=[
        'Principal component 1', 'Principal component 2',
        'Principal component 3', 'Principal component 4'
    ])
    df['target'] = dataset.Y
    sns.pairplot(data=df, hue='target')
    plt.show()
class PCA:
    def __init__(self, X):
        self.__X = X
        self.__X_std = 0

    def __covariance_matrix(self, X):
        cov_mat = np.cov(self.__X_std.T)
        return cov_mat

    def __calculate_eigens(self):
        self.__X_std = StandardScaler().fit_transform(self.__X)
        cov_mat = self.__covariance_matrix(self.__X_std)
        eig_vals, eig_vec = np.linalg.eig(cov_mat)
        return eig_vals, eig_vec

    def get_components(self, num):
        eig_vals, eig_vecs = self.__calculate_eigens()
        eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:, i])
                     for i in range(len(eig_vals))]
        # Sort the (eigenvalue, eigenvector) pairs from high to low eigenvalue
        eig_pairs.sort(key=lambda pair: pair[0], reverse=True)
        w = [eig_pairs[i][1] for i in range(num)]
        matrix_w = np.array(w).T
        new_X = self.__X_std.dot(matrix_w)
        return new_X
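# Usage sketch: a minimal, hypothetical example of the PCA class above on
# synthetic data (assumes numpy as np and scikit-learn's StandardScaler are
# imported in the module, as the class itself requires).
import numpy as np

X_demo = np.random.RandomState(0).normal(size=(100, 6))
reduced = PCA(X_demo).get_components(2)
print(reduced.shape)  # (100, 2): each sample projected onto the two leading components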
def pca(array, fcount):
    X = np.array(array)
    X_std = StandardScaler().fit_transform(X)
    mean_vec = np.mean(X_std, axis=0)
    cov_mat = np.cov(X_std.T)
    eig_vals, eig_vecs = np.linalg.eig(cov_mat)
    eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:, i])
                 for i in range(len(eig_vals))]
    eig_pairs.sort(key=lambda x: x[0], reverse=True)
    # Individual and cumulative explained variance (in percent)
    tot = sum(eig_vals)
    var_exp = [(i / tot) * 100 for i in sorted(eig_vals, reverse=True)]
    cum_var_exp = np.cumsum(var_exp)
    # Projection matrix built from the fcount leading eigenvectors
    arr = []
    for i in range(fcount):
        arr.append(eig_pairs[i][1].reshape(len(eig_vals), 1))
    matrix_w = np.hstack(arr)
    result = X_std.dot(matrix_w)
    # Assign each sample to the index of its largest projected coordinate
    cluster_nums = []
    for val in result:
        maxVal = np.amax(val)
        cluster_nums.append(np.nonzero(val == maxVal)[0][0])
    output = []
    for idx, val in enumerate(cluster_nums):
        output.append([val] + array[idx])
    return output
def get_final_score(UCF, CF, POP, weights=None):
    score = np.array(
        (list(UCF.values()), list(CF.values()), list(POP.values()))).T
    score = StandardScaler().fit_transform(score)
    if weights:
        return score.dot(np.array(weights).reshape(3, 1))
    else:
        return score
def findPCA(trainData):
    X_std = StandardScaler().fit_transform(trainData)
    covMatTdata = np.cov(X_std.T)
    eigenVal, eigenVect = np.linalg.eig(covMatTdata)
    eigenPairs = [(np.abs(eigenVal[i]), eigenVect[:, i])
                  for i in range(len(eigenVal))]
    eigenPairs.sort(key=lambda x: x[0], reverse=True)
    # Keep the 50 leading eigenvectors and project the standardized data onto them
    eigVector50 = [eigenPairs[i][1] for i in range(0, 50)]
    npEigVect50 = (np.asarray(eigVector50)).T
    reducedTdata = X_std.dot(npEigVect50)
    return reducedTdata
def prepare(df, model):
    features = df.shape[1] - 1
    # Keep only the samples labelled 3 or 5
    df1 = df[df.label == 3]
    df2 = df[df.label == 5]
    df3 = pd.concat([df2, df1])
    m = np.asarray(df3)
    label = m[:, features]
    mat = StandardScaler().fit_transform(df3.iloc[:, :(len(df3.columns) - 1)])
    matrix = mat.dot(model)
    matrix = np.real(matrix)
    return matrix, label
def pca_test(df, model, name):
    df = df.transpose()
    label = list(df.iloc[:, len(df.columns) - 1])
    matrix = StandardScaler().fit_transform(df.iloc[:, :(len(df.columns) - 1)])
    final = matrix.dot(model)
    final = np.real(final)
    # Build the output frame from the 150 projected columns, then append the labels
    df_out = pd.DataFrame(final[:, :150], columns=name[:150])
    df_out['label'] = label
    return df_out
def simulate(n_samples, w0, b0=None):
    n_features = w0.shape[0]
    # Gaussian features with a Toeplitz covariance structure, then standardized
    cov = toeplitz(0.5 ** np.arange(0, n_features))
    X = multivariate_normal(np.zeros(n_features), cov, size=n_samples)
    X = StandardScaler().fit_transform(X)
    logits = X.dot(w0)
    if b0 is not None:
        logits += b0
    p = sigmoid(logits)
    y = np.random.binomial(1, p, size=n_samples).astype("float64")
    # Map labels from {0, 1} to {-1, +1}
    y[:] = 2 * y - 1
    y = y.astype("float64")
    return X, y
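# Usage sketch: a hypothetical call to simulate() above (assumes scipy.linalg.toeplitz,
# numpy.random.multivariate_normal, a sigmoid() helper and StandardScaler are imported
# in the enclosing module, as the function body requires).
import numpy as np

w_true = np.array([1.0, -2.0, 0.5])
X_sim, y_sim = simulate(n_samples=500, w0=w_true, b0=0.1)
print(X_sim.shape, np.unique(y_sim))  # (500, 3) and labels [-1.  1.]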
def perform_pca(file_path):
    # read the dataset from csv file
    X = np.genfromtxt(file_path, delimiter=",")
    # delete the first row
    X = np.delete(X, (0), axis=0)
    X_std = StandardScaler().fit_transform(X)
    # Step 1: calculate mean center for all the columns
    mean_vec = np.mean(X_std, axis=0)
    # Step 2: calculate cov(x)
    cov_mat = (X_std - mean_vec).T.dot((X_std - mean_vec)) / (X_std.shape[0] - 1)
    print('Covariance matrix \n%s' % cov_mat)
    # Step 3: calculate eigenvalues and eigenvectors of cov(x)
    eigen_values, eigen_vectors = np.linalg.eig(cov_mat)
    print('Eigenvectors \n%s' % eigen_vectors)
    print('\nEigenvalues \n%s' % eigen_values)
    # Make a list of (eigenvalue, eigenvector) tuples
    eig_pairs = [(np.abs(eigen_values[i]), eigen_vectors[:, i])
                 for i in range(len(eigen_values))]
    # Sort the (eigenvalue, eigenvector) tuples from high to low
    eig_pairs.sort(key=lambda pair: pair[0], reverse=True)
    # Step 4: Projection onto the first two principal components
    matrix_w = np.hstack((eig_pairs[0][1].reshape(X_std.shape[1], 1),
                          eig_pairs[1][1].reshape(X_std.shape[1], 1)))
    print('Matrix W:\n', matrix_w)
    Y = X_std.dot(matrix_w)
    print('Matrix Y:\n', Y)
    # Step 5: Plot the projections onto the first and second principal components
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.scatter(Y[:, 0], Y[:, 1])
    # set the x-spine (see below for more info on `set_position`)
    ax.spines['left'].set_position('zero')
    # turn off the right spine/ticks
    ax.spines['right'].set_color('none')
    ax.yaxis.tick_left()
    # set the y-spine
    ax.spines['bottom'].set_position('zero')
    # turn off the top spine/ticks
    ax.spines['top'].set_color('none')
    ax.xaxis.tick_bottom()
    fig.show()
def PCA(data, n_components):
    if n_components >= data.shape[1]:
        return "Number of components has to be less than the number of columns, or {}".format(
            data.shape[1])
    data = StandardScaler().fit_transform(data)
    Sigma_if_it_were = calculate_covariance(subtract_by_mean(data))
    U, S, V = np.linalg.svd(Sigma_if_it_were)
    min_list = []
    # this is just to get an accurate k
    # there is a PCA reconstruction in the other file, which is another method for choosing k
    for i in range(data.shape[1] - 1):
        # fraction of variance lost when keeping only the first i components
        SS = 1 - (np.sum(S[:i]) / np.sum(S))
        min_list.append(SS)
    print(min_list)
    # columns of U are the eigenvectors of Sigma, so project onto the first n_components columns
    return data.dot(U[:, :n_components])
def pca(X, n_pcs=2):
    """\
    Implementing PCA from scratch, using the covariance matrix

    Parameters:
    -----------
    X: gene expression matrix, of the shape (n_cells, n_features)
    n_pcs: number of reduced dimensions

    Returns:
    -----------
    X_pca: pca representation of X, of the shape (n_cells, n_pcs).
    """
    # Data normalization
    X = StandardScaler().fit_transform(X)
    # Covariance matrix of the standardized data and its eigendecomposition
    cov_matrix = np.cov(X.T)
    values, vectors = np.linalg.eig(cov_matrix)
    # Projection matrix built from the first n_pcs eigenvectors
    projection_matrix = vectors.T[:n_pcs].T
    return X.dot(projection_matrix)
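# Usage sketch: a hypothetical call to the from-scratch pca() above on synthetic
# count data (assumes numpy as np and StandardScaler are available in the module).
import numpy as np

X_cells = np.random.RandomState(1).poisson(lam=3.0, size=(50, 10)).astype(float)
X_pca = pca(X_cells, n_pcs=2)
print(X_pca.shape)  # (50, 2)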
def pca(df):
    df = df.transpose()
    label = list(df.iloc[:, len(df.columns) - 1])
    matrix = StandardScaler().fit_transform(df.iloc[:, :(len(df.columns) - 1)])
    covariance = np.cov(matrix.T)
    eigen_val, eigen_vec = np.linalg.eig(covariance)
    eigens = list()
    for i in range(len(eigen_val)):
        eigens.append([np.abs(eigen_val[i]), eigen_vec[:, i]])
    # Sort the (eigenvalue, eigenvector) pairs from high to low eigenvalue
    eigens.sort(key=lambda pair: pair[0], reverse=True)
    # Cumulative fraction of the total variance explained
    eigen_total = sum(eigen_val)
    lam = []
    cum_sum = 0
    for value in eigen_val:
        cum_sum += value
        lam.append(cum_sum / eigen_total)
    plt.plot(lam, marker='o')
    plt.xlabel("# of Features")
    plt.ylabel("Cumulative sum of eigenvalues / eigenvalue total")
    plt.show()
    # Keep the 150 leading eigenvectors as the projection matrix
    last = []
    name = []
    for i in range(150):
        last.append(eigens[i][1].reshape(784, 1))
        name.append(str(i))
    name.append("label")
    reduced = np.hstack(last)
    print(matrix.shape)
    print(reduced.shape)
    final = matrix.dot(reduced)
    final = np.real(final)
    # Build the output frame from the projected columns, then append the labels
    df_out = pd.DataFrame(final, columns=name[:150])
    df_out['label'] = label
    return df_out, reduced, name
def __predict__(self, predictors, prediction_parameters, *args, **kwargs):
    # Compute prediction (first remove df from the end of the params vector)
    pred_params = prediction_parameters[:-1, :]
    intercept = 0
    if self._svr_intercept == self.PredictionIntercept:
        intercept = pred_params[0, :]
        pred_params = pred_params[1:, :]

    # Scale predictors to match the scaling used in fitting
    try:
        scaled_predictors = self._predictors_scaler.transform(predictors)
    except AttributeError:
        # Assume that the data used to predict has similar statistics to the data used
        # in learning, and therefore the scaling can be learned from the data to be
        # predicted
        scaled_predictors = StandardScaler().fit_transform(predictors)

    # Return prediction
    return scaled_predictors.dot(pred_params) + intercept
def one_dim_feature_map(data, num_nodes):
    # Project the data onto its first two principal components
    centered_data = StandardScaler().fit_transform(data)
    covariance_mat = np.cov(np.transpose(centered_data))
    eigenval, eigenvec = np.linalg.eig(covariance_mat)
    eigs = [(eigenval[i], eigenvec[:, i]) for i in range(4)]
    sorted_eigs = sorted(eigs, key=lambda e: e[0], reverse=True)
    pcs = [sorted_eigs[i][1] for i in range(2)]
    conversion = np.array(pcs)
    projection = centered_data.dot(np.transpose(conversion))
    data = np.transpose(projection)

    # Train a one-dimensional self-organizing feature map on the projected data
    network_dimensions = np.array([num_nodes, 1])
    n_iters = 10000
    init_learning_rate = 0.01
    m = data.shape[0]
    n = data.shape[1]
    nodes = np.random.random((network_dimensions[0], network_dimensions[1], m))
    init_radius = max(network_dimensions[0], network_dimensions[1]) / 2
    time_constant = n_iters / np.log(init_radius)
    for i in range(n_iters):
        # Pick a random sample and find its best matching unit (BMU)
        t = data[:, np.random.randint(0, n)].reshape(np.array([m, 1]))
        bmu, bmu_index = find_bmu(t, nodes, m)
        # Decay the neighbourhood radius and the learning rate
        r = init_radius * np.exp(-i / time_constant)
        l = init_learning_rate * np.exp(-i / n_iters)
        for x in range(nodes.shape[0]):
            for y in range(nodes.shape[1]):
                w = nodes[x, y, :].reshape(m, 1)
                w_dist = np.sum((np.array([x, y]) - bmu_index) ** 2)
                if w_dist <= r ** 2:
                    # Move nodes inside the neighbourhood towards the sample
                    neighborhood = np.exp(-w_dist / (2 * (r ** 2)))
                    new_w = w + (l * neighborhood * (t - w))
                    nodes[x, y, :] = new_w.reshape(1, 2)
    final_nodes = np.squeeze(nodes, axis=1)
    plt.scatter(data[0], data[1])
    plt.plot(final_nodes[:, 0], final_nodes[:, 1], 'ok')
    plt.plot(final_nodes[:, 0], final_nodes[:, 1], 'k')
    plt.title("SOFM with {} Nodes".format(num_nodes))
    plt.show()
def PCA(X, y, eps=1):
    (m, n) = X.shape
    X_std = StandardScaler().fit_transform(X)
    mean_vec = np.mean(X_std, axis=0)
    cov_mat = (X_std - mean_vec).T.dot((X_std - mean_vec)) / (X_std.shape[0] - 1)
    print('Covariance matrix \n%s' % cov_mat)
    cov_mat = np.cov(X_std.T)
    eig_vals, eig_vecs = np.linalg.eig(cov_mat)
    print('Eigenvectors \n%s' % eig_vecs)
    print('Eigenvalues \n%s' % eig_vals)
    u, s, v = np.linalg.svd(X_std.T)
    eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:, i])
                 for i in range(len(eig_vals))]
    eig_pairs.sort(key=lambda x: x[0], reverse=True)
    eig_vals_sorted = []
    print('Eigenvalues in descending order:')
    for i in eig_pairs:
        print(i[0])
        eig_vals_sorted = np.append(eig_vals_sorted, i[0])
    tot = sum(eig_vals)
    var_exp = [(eig_vals_sorted[i] / tot) * 100 for i in range(n)]
    cum_var_exp = np.cumsum(var_exp)
    var_i = np.array([np.sum(eig_vals_sorted[:i + 1]) / tot * 100.0
                      for i in range(n)])
    print("% of information retained for different component counts", var_i)
    k = 2
    print('%.2f %% variance retained in %d dimensions' % (var_i[k - 1], k))
    matrix_w = np.zeros((n, k))
    for i in range(k):
        ar = np.asarray(eig_pairs[i][1])
        for j in range(n):
            matrix_w[j][i] = ar[j]
    print('Matrix W:\n', matrix_w)
    Y = X_std.dot(matrix_w)
    return Y
def pca(df):
    # Get values of dataframe
    X = df.values
    # Standardise the data values
    X_std = StandardScaler().fit_transform(X)
    # Get the mean vector of the data
    mean_vec = np.mean(X_std, axis=0)
    # Subtract mean from data
    X_std_mean = (X_std - mean_vec)
    # Get transpose of data to multiply it by the untransposed data to get the covariance matrix
    X_std_mean_transpose = X_std_mean.T
    # Calculate the covariance matrix
    cov_mat = X_std_mean_transpose.dot(X_std_mean) / (X_std.shape[0] - 1)
    # Compute the eigenvalues and eigenvectors
    eig_vals, eig_vecs = np.linalg.eig(cov_mat)
    # Make a list of (eigenvalue, eigenvector) tuples
    eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:, i])
                 for i in range(len(eig_vals))]
    # Sort the (eigenvalue, eigenvector) tuples from high to low
    eig_pairs.sort(key=lambda x: x[0], reverse=True)
    # Compute the projection matrix based on the eigenvectors
    num_features = X.shape[1]
    proj_mat = eig_pairs[0][1].reshape(num_features, 1)
    for eig_vec_idx in range(1, X_std.shape[1]):
        proj_mat = np.hstack((proj_mat, eig_pairs[eig_vec_idx][1].reshape(num_features, 1)))
    # Project the data
    pca_data = X_std.dot(proj_mat)
    # Return the projection matrix, the pca data and the eigenvectors
    return proj_mat, pca_data, eig_vecs
class PCA():
    _eigenvalues = None
    _eigenvectors = None
    _cov_mat = None

    def __init__(self, data, k):
        self._data = StandardScaler().fit_transform(data.astype(float))
        self._k = k

    def calc_cov_matrix(self):
        self._cov_mat = np.cov(self._data.T)

    def get_cov_matrix(self):
        return self._cov_mat

    def calc_eigenstuff(self):
        self._eigenvalues, self._eigenvectors = np.linalg.eig(self._cov_mat)

    def get_eigenvalues(self):
        return self._eigenvalues

    def get_eigenvectors(self):
        return self._eigenvectors

    def choose_best(self):
        # Pair eigenvalues with eigenvectors and sort from largest to smallest eigenvalue
        eigenpairs = [(np.abs(self.get_eigenvalues()[i]), self.get_eigenvectors()[:, i])
                      for i in range(len(self.get_eigenvalues()))]
        eigenpairs.sort(key=lambda pair: pair[0], reverse=True)
        return eigenpairs

    def run(self):
        self.calc_cov_matrix()
        self.calc_eigenstuff()
        pairs = self.choose_best()
        # Projection matrix from the k leading eigenvectors
        projection_matrix = [pairs[i][1] for i in range(self._k)]
        projection_matrix = np.array(projection_matrix).T
        final = self._data.dot(projection_matrix)
        return final
def transform_embedding(embedding, distance_matrix):
    config = Config()
    distances = distance_matrix[:, :, 0]
    signs = distance_matrix[:, :, 1]
    # performing the usual operations
    config.logger.info("Performing L1 normalization...")
    normalized = Normalizer('l1').transform(distances.T).T
    config.logger.info("Performing sign correction...")
    sign_corrected = normalized * signs
    config.logger.info("Performing standard scaling...")
    scaled = StandardScaler(copy=True, with_mean=True,
                            with_std=True).fit_transform(embedding)
    transformed_space = scaled.dot(sign_corrected)
    config.logger.info("Transformed space calculated!")
    return transformed_space
def reduce_dims(data, labels, non_setosa_labels, plot=True):
    # Project the iris data onto its first two principal components
    centered_data = StandardScaler().fit_transform(data)
    covariance_mat = np.cov(np.transpose(centered_data))
    eigenval, eigenvec = np.linalg.eig(covariance_mat)
    eigs = [(eigenval[i], eigenvec[:, i]) for i in range(4)]
    sorted_eigs = sorted(eigs, key=lambda e: e[0], reverse=True)
    pcs = [sorted_eigs[i][1] for i in range(2)]
    conversion = np.array(pcs)
    projection = centered_data.dot(np.transpose(conversion))
    # Split the projected points by class
    setosa = []
    versicolor = []
    virginica = []
    non_set = []
    for i in range(150):
        if labels[i] == -1:
            setosa.append(projection[i])
        else:
            non_set.append(projection[i])
    for i in range(100):
        if non_setosa_labels[i] == -1:
            versicolor.append(non_set[i])
        else:
            virginica.append(non_set[i])
    setosa = np.array(setosa)
    virginica = np.array(virginica)
    versicolor = np.array(versicolor)
    if plot:
        plt.figure()
        setosa_points = plt.scatter(setosa[:, 0], setosa[:, 1], c='b')
        versicolor_points = plt.scatter(versicolor[:, 0], versicolor[:, 1], c='g')
        virginica_points = plt.scatter(virginica[:, 0], virginica[:, 1], c='m')
        plt.title("Iris Dataset projected onto first two PCs")
        plt.legend((setosa_points, versicolor_points, virginica_points),
                   ("Setosa", "Versicolor", "Virginica"))
        plt.show()
    return projection
def find_pca(self, feature_list):
    feature_transformed = StandardScaler().fit_transform(feature_list)
    plt.figure()
    pca = PCA().fit(feature_transformed)
    # pca.fit_transform(feature_transformed)
    eigen_vectors = pca.components_
    eigen_vectors = eigen_vectors.T
    # eigen_values = pca.explained_variance_
    plt.plot(np.cumsum(pca.explained_variance_ratio_))
    plt.title('PCA of person')
    plt.grid(True)
    plt.show()
    pca = PCA(n_components=5).fit(feature_transformed)
    print(pca.explained_variance_ratio_)
    # eigen_values = pca.explained_variance_
    eigen_vectors = pca.components_
    eigen_vectors = eigen_vectors.T
    final_features = feature_transformed.dot(eigen_vectors)
    plt.figure()
    plt.plot(final_features[0], c='g', label='Feature 1')
    plt.plot(final_features[1], c='y', label='Feature 2')
    plt.plot(final_features[2], c='r', label='Feature 3')
    plt.plot(final_features[3], c='c', label='Feature 4')
    plt.plot(final_features[4], c='black', label='Feature 5')
    plt.title('Features of PCA')
    plt.legend()
    imp_features = []
    # For each component, record the (1-based) index of the feature with the largest loading
    for i in range(pca.n_components):
        index = np.where(pca.components_[i] == pca.components_[i].max())
        imp_features.append(index[0][0] + 1)
        print(index[0][0] + 1)
def pca_preprocessing(df):
    X = df.iloc[:, 0:4].values
    y = df.iloc[:, 4].values
    X_std = StandardScaler().fit_transform(X)
    mean_vec = np.mean(X_std, axis=0)
    cov_mat = (X_std - mean_vec).T.dot(
        (X_std - mean_vec)) / (X_std.shape[0] - 1)
    # cov_mat = np.cov(X_std.T)
    eig_vals, eig_vecs = np.linalg.eig(cov_mat)
    # make a list of (eigenvalue, eigenvector) tuples
    eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:, i])
                 for i in range(len(eig_vals))]
    # sort the (eigenvalue, eigenvector) tuples from high to low
    eig_pairs.sort(key=lambda pair: pair[0], reverse=True)
    # choose the top 3 eigenvectors to form the transform weights
    matrix_w = np.hstack(
        (eig_pairs[0][1].reshape(4, 1), eig_pairs[1][1].reshape(4, 1),
         eig_pairs[2][1].reshape(4, 1)))
    x_reduced = X_std.dot(matrix_w)
    y_encoded = y_encode(y)
    return x_reduced, y_encoded
df.dropna(how="all", inplace=True) df.tail() X = df.iloc[:, 0:4].values print(X) Y = df.iloc[:, 4].values print(Y) from sklearn.preprocessing import StandardScaler X = StandardScaler().fit_transform(X) X_cov = np.transpose(X).dot(X) print(X_cov) eig_vals, eig_vecs = np.linalg.eig(X_cov) print('Eigenvectors \n%s' % eig_vecs) print('\nEigenvalues \n%s' % eig_vals) sum_of_ev = 0 for i in eig_vals: sum_of_ev = sum_of_ev + i var1 = eig_vals[0] / sum_of_ev var2 = eig_vals[1] / sum_of_ev var3 = eig_vals[2] / sum_of_ev var4 = eig_vals[3] / sum_of_ev print('Due to PC1 : %s ' % (var1 * 100)) print('Due to PC2 : %s ' % (var2 * 100)) print('Due to PC3 : %s ' % (var3 * 100)) print('Due to PC4 : %s ' % (var4 * 100)) W = np.transpose([eig_vecs[:, 0], eig_vecs[:, 1]]) print(np.matrix(W)) T = X.dot(W) print(np.matrix(T))
plt.figure(figsize=(6, 4))
plt.bar(range(4), var_exp, alpha=0.5, align='center',
        label='Individual explained variance (%)')
plt.step(range(4), cum_var_exp, where='mid',
         label='Cumulative explained variance (%)')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')
plt.legend(loc='best')
plt.tight_layout()

# use the two largest principal components
matrix_w = np.hstack((eig_pairs[0][1].reshape(22, 1),
                      eig_pairs[1][1].reshape(22, 1)))

# generate the projection
Y_proy = X_std.dot(matrix_w)

# part (d)
data_2d = pd.DataFrame(Y_proy)
data_2d.index = data.index
data_2d.columns = ['PC1', 'PC2']

# part (e)
row_means = data.mean(axis=1)
row_trends = data.diff(axis=1).mean(axis=1)

# Scatter plot with a sequential colour map
data_2d.plot(kind='scatter', x='PC1', y='PC2', figsize=(16, 8),
             c=row_means, cmap='Blues')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')

# Scatter plot with a diverging colour map
E_vecs.dot(Sigma.dot(E_vecs.T))

# Analyse feature reduction / variance trade-off:
sum_evals = sum(E_vals)
retained_variance = [(i / sum_evals) * 100 for i in sorted(E_vals, reverse=True)]
cum_retained_variance = np.cumsum(retained_variance)
# print(cum_retained_variance[1000], cum_retained_variance[5250], cum_retained_variance[7000], cum_retained_variance[10000])

# =============Prepare data for XGBoost==============================================================#
# Choose 5250 features giving 80% retained variance
i = 5250
sorted_reduced_evecs = E_vecs[np.argsort(E_vals)[-i:]]

# Determine reduced projection matrix for both (normalised) test and train
Xp = X.dot(sorted_reduced_evecs.T)
X_test_p = X_test.dot(sorted_reduced_evecs.T)
Xp_df = pd.DataFrame(Xp)
X_test_p_df = pd.DataFrame(X_test_p)

# Assemble Train, Test, y
X_train_cols = (training_join_df['App'], Xp_df)
X_test_cols = (test_data_df['App'], X_test_p_df)
y_train_cols = (training_join_df['Label'])
# training_join_df.loc['Desc']
X_train_df = pd.concat(X_train_cols, axis=1)
X_test_df = pd.concat(X_test_cols, axis=1)

# Convert to Array
train_X = X_train_df.values
data = Data([trace1, trace2])
layout = Layout(
    yaxis=YAxis(title='Explained variance in percent'),
    title='Explained variance by different principal components')
fig = Figure(data=data, layout=layout)
py.iplot(fig)

#%%
matrix_w = np.hstack((eig_pairs[0][1].reshape(13, 1),
                      eig_pairs[1][1].reshape(13, 1)))
print('Matrix W:\n', matrix_w)

#%%
Y = X_std.dot(matrix_w)

traces = []
for name in (1, 2, 3):
    trace = Scatter(
        x=Y[y == name, 0],
        y=Y[y == name, 1],
        mode='markers',
        name=name,
        marker=Marker(
            size=12,
            line=Line(
                color='rgba(217, 217, 217, 0.14)',
                width=0.5),
            opacity=0.8))
# explained-variance plot
plt.figure(figsize=(6, 4))
plt.bar(range(4), var_exp, alpha=0.5, align='center',
        label='Individual explained variance (%)')
plt.step(range(4), cum_var_exp, where='mid',
         label='Cumulative explained variance (%)')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')
plt.legend(loc='best')
plt.tight_layout()

# use the first two PCs, project the data and generate the 2-D sample
matrix_w = np.hstack((eig_pairs[0][1].reshape(18, 1),
                      eig_pairs[1][1].reshape(18, 1)))

# projection onto the two PCs
Y_sklearn = X_std.dot(matrix_w)
data_2d = pd.DataFrame(Y_sklearn)
data_2d.index = data.index
data_2d.columns = ['PC1', 'PC2']

# part (e): row means and trends of the data, used to colour the 2-D scatter
row_means = data.mean(axis=1)
row_trends = data.diff(axis=1).mean(axis=1)

data_2d.plot(kind='scatter', x='PC1', y='PC2', figsize=(16, 8),
             c=row_means, cmap='Blues')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')

# scatter plot with a diverging colour map
data_2d.plot(kind='scatter', x='PC1', y='PC2', figsize=(16, 8),
             c=row_means, cmap='seismic')
plt.xlabel('Principal Component 1')
def df_pca(df_in, keep=None, expvar=False, rmoutliers=True, show=True,
           colorcol=None):
    """
    Run a simple PCA on the df features of keep.
    If expvar is True, a plot of explained variance is also shown.
    Heavily inspired by http://sebastianraschka.com/Articles/2015_pca_in_3_steps.html
    """
    from sklearn.preprocessing import StandardScaler
    if keep is None:
        keep = ['maxV', 'maxDerivV', 'maxDerivdV', 'minDerivV', 'minDerivdV',
                'preMinV', 'postMinV', 'preMaxCurveK', 'postMaxCurveK',
                'postMaxCurveV', 'preMaxCurveV', 'height', 'repolarizationV',
                'intervals']
    # Clean the data frame
    df = df_in.copy()
    for col in df.columns:
        if col not in keep:
            df = df.drop(columns=col)
        else:
            if col != colorcol:
                df[col] = outlier(df[col].values)
    df = df.dropna()
    if colorcol is not None:
        colors = df[colorcol].values
        df = df.drop(columns=colorcol)
    # Make into np.array
    data = []
    for col in df.columns:
        temp_ = df[col]
        data.append(temp_)
    data = np.array(data).T  # Make as array and transpose
    data = StandardScaler().fit_transform(data)  # Standardize data
    # run pca (svd); the rows of eigvecs (V^T) are the principal directions
    u, eigvals, eigvecs = np.linalg.svd(data, full_matrices=False)
    eigpairs = [(np.abs(eigvals[i]), eigvecs[i, :])
                for i in range(len(eigvals))]
    eigpairs.sort(key=lambda pair: pair[0], reverse=True)
    mat_w = np.hstack((eigpairs[0][1].reshape(eigvals.shape[0], 1),
                       eigpairs[1][1].reshape(eigvals.shape[0], 1)))
    Y = data.dot(mat_w)  # Re-transform by matrix
    # Plot these data
    if show:
        contcols = ['lightskyblue', 'brown', 'orange', 'springgreen',
                    'fuchsia', 'tomato', 'gold', 'indigo', 'darkslateblue',
                    'black', 'darkgreen', 'aqua', 'darkorchid', 'grey',
                    'salmon', 'plum', 'coral', 'sienna', 'darkkhaki',
                    'yellowgreen', 'deeppink', 'ivory', 'orchid',
                    'lightsteelblue']
        plt.figure()
        if colorcol is not None:
            try:
                colors = [contcols[list(set(colors)).index(u)] for u in colors]
            except:
                colors = 'blue'
        else:
            colors = 'blue'
        plt.scatter(Y[:, 0], Y[:, 1], color=colors, edgecolor='none', alpha=0.7)
        plt.xlabel('Principal Component 1')
        plt.ylabel('Principal Component 2')
        plt.tight_layout()
    # Explained variance
    if expvar:
        # eigvals come pre-sorted
        var_exp = [i / sum(eigvals) * 100. for i in eigvals]
        cum_var_exp = np.cumsum(var_exp)
        # with plt.style.context('seaborn-whitegrid'):
        plt.figure()
        plt.bar(range(len(var_exp)), var_exp, alpha=0.5, align='center',
                label='individual explained variance')
        plt.step(range(len(cum_var_exp)), cum_var_exp, where='mid',
                 label='cumulative explained variance')
        plt.xlabel('Principal components')
        plt.ylabel('Explained variance (%)')
        plt.legend(loc='best')
        plt.tight_layout()
    plt.show()  # Show the plots
    return Y