import pandas as pd
from sklearn.decomposition import TruncatedSVD


def svd(feature_matrix, k):
    """Reduce feature_matrix to k components with truncated SVD."""
    svd = TruncatedSVD(n_components=k)
    components = svd.fit_transform(feature_matrix)
    print(svd.explained_variance_ratio_)
    principalDf = pd.DataFrame(data=components)
    # print(principalDf)
    return principalDf
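# Hedged usage sketch (not part of the snippet above): calling the helper on a
# small, made-up feature matrix; the data and k=2 are assumptions for illustration.
import numpy as np

example_features = np.array([[1.0, 0.0, 2.0, 0.0],
                             [0.0, 3.0, 0.0, 1.0],
                             [2.0, 1.0, 0.0, 0.0]])
reduced = svd(example_features, 2)
print(reduced.shape)  # (3, 2): one row per sample, one column per component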
def svd(self, factors=5, top_questions=10):
    '''
    Input: instance variable, integer, integer
    Output: dataframe, dataframe, list, integer

    This function performs SVD, calls the describe function (which reports the
    results) and returns the decomposed matrices together with the explained
    variance, so that other functions can work with them during EDA.
    '''
    svd = TruncatedSVD(n_components=factors)
    W = svd.fit_transform(self.answers_clean)
    H = svd.components_
    var_ratio = svd.explained_variance_ratio_
    var = svd.explained_variance_
    self.describe_svd_results(W, H, top_questions)
    return W, H, var_ratio, var
def sk_lsi_bi(self, num_vec):
    """
    Performs LSA/LSI (i.e. truncated SVD) on the term-frequency matrix using sklearn.

    :param num_vec: number of components (singular vectors) to keep
    :return: None; the rank-reduced index is stored in self.index_bi
    """
    index_arr = self.ini_index_bi.to_numpy()[:, :-1]
    index_arr_spa = csr_matrix(index_arr)
    svd = TruncatedSVD(n_components=num_vec, n_iter=50)
    B1 = svd.fit_transform(index_arr_spa)
    B = svd.inverse_transform(B1)
    b_df = pd.DataFrame(data=B,
                        index=list(self.ini_index_bi.index),
                        columns=self.docIDs)
    # b_df['n_i'] = self.ini_index['n_i']
    b_df['idf'] = self.ini_index_bi['idf']
    self.index_bi = b_df
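# Hedged toy illustration (not from the class above): fit_transform followed by
# inverse_transform yields a rank-k approximation of the original matrix, which
# is what sk_lsi_bi stores as the smoothed index. The random matrix and k=2 are
# assumptions made for this example.
import numpy as np
from sklearn.decomposition import TruncatedSVD

X = np.random.RandomState(0).rand(6, 4)
svd_toy = TruncatedSVD(n_components=2, n_iter=50)
X_approx = svd_toy.inverse_transform(svd_toy.fit_transform(X))
print(np.linalg.matrix_rank(X_approx))  # 2: the reconstruction has rank k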
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA, TruncatedSVD

# Load the data
iris = load_iris()

# Store the data
iris_X = iris.data
y = iris.target

# 1 Run truncated SVD ------------------------------------------------------------------------------

# Create the instance
svd = TruncatedSVD()
vars(svd)

# Fit and transform
iris_svd = svd.fit_transform(iris_X)
iris_svd[:5]
vars(svd)

# Explained variance ratio
svd.explained_variance_ratio_

# 2 Compare with PCA --------------------------------------------------------------------------------

# Create the instance
pca = PCA(n_components=2)
vars(pca)

# Fit and transform
iris_pca = pca.fit_transform(iris_X)
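# Hedged aside (not in the original script): TruncatedSVD does not center the
# data, while PCA does, so the two only agree once the features are mean-centered.
# The check below is added purely to illustrate that point.
iris_Xc = iris_X - iris_X.mean(axis=0)  # mean-center each feature
svd_centered = TruncatedSVD(n_components=2).fit(iris_Xc)
print(svd_centered.explained_variance_ratio_)
print(pca.explained_variance_ratio_)  # should match the line above up to numerical noise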
import numpy as np
from scipy.linalg import svd
from sklearn.decomposition import TruncatedSVD

d = np.array([[4, 2, 1],
              [10, 8, 3],
              [3, 9, 4]])

# Full SVD, then keep only the first r singular triplets
u, s, v = svd(d)
r = 1
new_s = s[0:r]
new_u = u[:, 0:r]
new_v = v[0:r, :]
print(new_v)
print(new_u.dot(np.diag(new_s)))

# The same rank-r projection with sklearn
svd = TruncatedSVD(n_components=r, random_state=42)
p = svd.fit_transform(d)
print(p)
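# Hedged check appended to the snippet above (not in the original): the matrix
# returned by TruncatedSVD.fit_transform should equal U[:, :r] * s[:r] from the
# full SVD, up to a sign flip per component.
print(np.allclose(np.abs(p), np.abs(new_u.dot(np.diag(new_s)))))  # expected: True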
def tSNE(fileName):
    cnx = sqlite3.connect('data/10Feature.db')
    dforigtrain = pd.read_csv(fileName)
    print(dforigtrain.shape)
    print(dforigtrain.head())
    dforigtrain.rename(
        columns=lambda x: '_'.join([x.strip() for x in x.lower().split()]),
        inplace=True)
    df = dforigtrain[[
        c for c in dforigtrain.columns.values.tolist() if c != 'orig_set'
    ]]
    print(df.shape)

    # Write to DB to allow easier loading later
    df.to_sql('df_clean', cnx, if_exists='replace', index=None)
    df = pd.read_sql('select * from df_clean', cnx)
    print(df.shape)

    scaler = StandardScaler().fit(df.iloc[:, 2:])
    dfs = pd.DataFrame(scaler.transform(df.iloc[:, 2:]),
                       index=df.index,
                       columns=df.columns[2:])

    # The commented block below helps in choosing the SVD rank
    '''u, s, vt = svd(dfs)
    ax = pd.Series(s).plot(figsize=(10,3), logy=True)
    print('{} SVs are NaN'.format(np.isnan(s).sum()))
    print('{} SVs less than 1e-12'.format(len(s[s < 1e-12])))
    plt.show()'''

    # Truncated SVD from here. This is mostly helpful for image data sets,
    # where reducing dimensions is usually possible. In our case every feature
    # was contributing, hence no truncation is possible.
    ncomps = 19
    svd = TruncatedSVD(algorithm='randomized', n_components=ncomps)
    svd_fit = svd.fit(dfs)
    Y = svd.fit_transform(dfs)
    ax = pd.Series(svd_fit.explained_variance_ratio_.cumsum()).plot(
        kind='line', figsize=(10, 3))
    print('Variance preserved by first ' + str(ncomps) +
          ' components == {:.2%}'.format(
              svd_fit.explained_variance_ratio_.cumsum()[-1]))
    plt.show()

    dfsvd = pd.DataFrame(Y,
                         columns=['c{}'.format(c) for c in range(ncomps)],
                         index=df.index)
    dfsvd.to_sql('df_svd', cnx, if_exists='replace', index=None)
    dfsvd = pd.read_sql('select * from df_svd', cnx)
    print(dfsvd.shape)

    svdcols = [c for c in dfsvd.columns if c[0] == 'c']
    df = pd.read_sql('select * from df_clean', cnx)
    print(dfsvd.shape)
    print(dfsvd.head())

    plotdims = 8
    ploteorows = 1
    dfsvdplot = dfsvd[svdcols].iloc[:, :plotdims]
    dfsvdplot['class'] = df['class']
    # interactive(plot_3d_scatter, A=fixed(dfsvd), elevation=30, azimuth=120)
    ax = sns.pairplot(dfsvdplot.iloc[::ploteorows, :], hue='class', size=1.8)
    plt.show()

    # rowsubset = [10, 20, 40, 80, 160, 320, 640, 1280, 1900]
    tsne = TSNE(n_components=2, random_state=0)
    '''runs = np.empty((len(rowsubset),1))
    for i, rows in enumerate(rowsubset):
        t0 = time()
        Z = tsne.fit_transform(dfsvd.iloc[:rows,:][svdcols])
        runs[i] = time() - t0
    ax = pd.DataFrame(runs, index=rowsubset).plot(kind='bar', logy=False, figsize=(10,4))
    plt.show()
    '''
    Z = tsne.fit_transform(dfsvd[svdcols])
    dftsne = pd.DataFrame(Z, columns=['x', 'y'], index=dfsvd.index)

    ax = sns.lmplot('x', 'y', dftsne, fit_reg=False, size=8,
                    scatter_kws={'alpha': 0.7, 's': 60})
    ax.axes.flat[0].set_title(
        'Scatterplot of a 50D dataset reduced to 2D - Unsupervised')
    # plt.show()

    dftsne['class'] = df['class']
    g = sns.lmplot('x', 'y', dftsne, hue='class', fit_reg=False, size=8,
                   scatter_kws={'alpha': 0.7, 's': 60})
    g.axes.flat[0].set_title(
        'Scatterplot of a 50D dataset reduced to 2D - Supervised')
    plt.show()
from sklearn.decomposition import TruncatedSVD


def svd(data, dim=2):
    """Project data onto dim components with truncated SVD."""
    svd = TruncatedSVD(n_components=dim, n_iter=7, random_state=42)
    return svd.fit_transform(data)
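# Hedged usage sketch (not part of the original function): applying the helper
# above to a small tf-idf document-term matrix; the toy corpus below is an
# assumption made up for illustration.
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["the cat sat on the mat",
        "the dog sat on the log",
        "cats and dogs make good pets"]
doc_term_matrix = TfidfVectorizer().fit_transform(docs)
print(svd(doc_term_matrix, dim=2).shape)  # (3, 2)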
def main():
    millis_start = int(round(time.time() * 1000))
    eta = 0.8
    c = 0.5
    epochs = 1
    figure_count = 0
    print("Start", millis_start)

    # Load data
    # file_path = "/home/fahad/code/ML/MNIST_digit_data.mat"
    file_path = "MNIST_digit_data.mat"
    # file_path = input("Enter MNIST file path: ")
    if not Path(file_path).exists():
        print("File does not exist in current directory")
        file_path = input("Enter MNIST file path: ")
        # return
    matdata = scio.loadmat(file_path)
    labels_test = np.array(matdata['labels_test'])
    labels_train = np.array(matdata['labels_train'])
    images_test = np.array(matdata['images_test'])
    images_train = np.array(matdata['images_train'])

    # These are the data sets to be used for everything
    shuff_train_images_1000, shuff_train_labels_1000 = shuffle_in_unison(
        images_train, labels_train, 1000)

    # 3. Train SVM
    svm = LinearSVC()
    svm_model_wts = svm.fit(shuff_train_images_1000, shuff_train_labels_1000)
    # weights = svm.coef_
    predicted = svm.predict(images_test)
    print("predicted", predicted.shape)
    # print("actual labels", labels_test)
    conf_matrix = np.zeros((10, 10))
    accuracy = 0
    for i in range(predicted.shape[0]):
        conf_matrix[labels_test[i][0]][predicted[i]] += 1
    # cnf_matrix = confusion_matrix(labels_test[0:4], predicted)
    for j in range(10):
        accuracy += conf_matrix[j][j]
    print(np.sum(conf_matrix), " Accuracy:", accuracy / (np.sum(conf_matrix)))

    # 4. Access the weight values
    # print("weight", svm.coef_.shape)
    confusion_matrix = np.zeros((10, 10))
    # print("c matrix", confusion_matrix.shape)
    idx = 0
    count_wts = 0
    misclassify = 0
    misclassify_records = []
    for row in images_test:
        # print("looping test", idx)
        count_wts = 0
        flag = True
        # weights_missclassified = []
        prev_wt = 0
        pred_idx = 0
        for weights in svm.coef_:
            # print("looping", count_wts)
            # if labels_test[idx] == count_wts:
            if np.dot(weights, row) > prev_wt:
                prev_wt = np.dot(weights, row)
                pred_idx = count_wts
            # weights_missclassified.append(np.dot(weights, row))
            if np.dot(weights, row) >= 1 and flag:
                # print("labels", labels_test[idx][0])
                confusion_matrix[labels_test[idx][0]][count_wts] += 1
                flag = False
            # else:
            #     confusion_matrix[labels_test[idx][0]][count_wts] += 1
            if flag and count_wts == 9:
                # record is misclassified by all
                misclassify += 1
                confusion_matrix[labels_test[idx][0]][pred_idx] += 1
                # misclassify_records.append(row)
            count_wts += 1
        idx += 1
    # print(confusion_matrix)
    # print("total sum", sum(sum(confusion_matrix)))
    accuracy = 0
    for j in range(10):
        accuracy += confusion_matrix[j][j]
    # print("Accuracy:", accuracy / labels_test.shape[0])

    # 5. PCA on training data
    pca = PCA(n_components=50)
    pca.fit(images_train)
    images_train_reduced = pca.transform(images_train)
    # print("shape of PCA", images_train_reduced.shape)

    # Manually reduce the dimension using SVD
    mean = np.mean(images_train, axis=1)
    print("mean shape", mean.shape)
    mean_vec = mean.reshape((mean.shape[0], 1))
    # N = images_train.shape[0]
    norm_images_train = images_train - mean_vec
    mean_test = np.mean(images_test, axis=1)
    mean_vec_test = mean_test.reshape((mean_test.shape[0], 1))
    norm_images_test = images_test - mean_vec_test
    print("shape of norm data", norm_images_train.shape)
    svd = skd.TruncatedSVD(n_components=50)
    reduced_images_train = svd.fit_transform(norm_images_train)
    # reduced_images_train = svd.transform(norm_images_train)
    print("reduced", reduced_images_train.shape)
    retrans_images_train = svd.inverse_transform(reduced_images_train)
    print("unreduced", retrans_images_train.shape)
    # Now compare to original data -- how?
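    # Hedged sketch (not in the original assignment code): one simple way to
    # compare the SVD reconstruction against the original data is the mean
    # squared error per pixel over the training images.
    reconstruction_mse = np.mean((norm_images_train - retrans_images_train) ** 2)
    print("Reconstruction MSE with 50 components:", reconstruction_mse)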
    # Pick random indexes for 20 images and plot originals alongside reconstructions
    indexes = random.sample(range(0, norm_images_train.shape[0]), 20)
    indexes = sorted(indexes)
    # print("indexes", indexes)
    fig_num = 1
    figure_count += 1
    plt.figure(figure_count)
    for i in range(0, 2):
        # figure_count += 1
        for idxs in indexes:
            plt.subplot(8, 5, fig_num)
            plt.axis('off')
            if i == 0:
                # print("which idx", idxs)
                imag = norm_images_train[idxs, :]
            else:
                # print("trans", idxs)
                imag = retrans_images_train[idxs, :]
            rot_img = np.fliplr(np.array(imag).reshape((28, 28)))
            final_rot_img = scipy.ndimage.rotate(rot_img, 90)
            plt.imshow(final_rot_img, cmap='gray', interpolation='nearest')
            # plt.tight_layout()
            fig_num += 1
    # plt.text(0, 0, actual_string, fontsize=7)
    plt.savefig("comparison-20.png")

    vecs_eig, list_mse, list_dim = project_data(norm_images_train, 500)

    # Plot MSEs
    figure_count += 1
    plt.figure(figure_count)
    plt.ylabel("Mean Square Errors")
    plt.xlabel("Dimensions")
    plt.title("MSE across reduced dimensions")
    plt.hold(True)
    plt.grid(True)
    # plt.ylim((0, 1))
    plt.plot(list_dim, list_mse, 'xb-')
    plt.savefig("hw4-5-mse.png")

    # Display eigenvectors
    fig_num = 1
    figure_count += 1
    plt.figure(figure_count)
    for i in range(vecs_eig.shape[0]):
        row = vecs_eig[i, :]
        plt.subplot(2, 5, fig_num)
        plt.axis('off')
        rot_img = np.fliplr(vecs_eig[i, :].reshape((28, 28)))
        final_rot_img = scipy.ndimage.rotate(rot_img, 90)
        plt.imshow(final_rot_img, cmap='gray', interpolation='nearest')
        fig_num += 1
    plt.savefig("hw4-5-eigs.png")
    # return

    # 6. Project and train - should this use the normalized data?
    dimension_list = [
        2, 5, 10, 20, 30, 50, 70, 100, 150, 200, 250, 300, 400, 500, 748
    ]
    # dimension_list = [400, 500, 748]
    # print("dim length", len(dimension_list))
    dim_acc_svm = np.empty((len(dimension_list), 2))
    dim_acc_mlp = np.empty((len(dimension_list), 2))
    counter = 0
    for dims in dimension_list:
        accu = train_svm(norm_images_train[:10000], labels_train[:10000],
                         norm_images_test[:3000], labels_test[:3000], dims)
        accu_mlp = train_mlp(norm_images_train[:10000], labels_train[:10000],
                             norm_images_test[:3000], labels_test[:3000], dims)
        dim_acc_svm[counter, 0] = dims
        dim_acc_svm[counter, 1] = accu
        dim_acc_mlp[counter, 0] = dims
        dim_acc_mlp[counter, 1] = accu_mlp  # was 'accu' in the original, which discarded the MLP result
        counter += 1
    # print("dimension", dim_acc_svm)
    # print("dim mlp", dim_acc_mlp)

    figure_count += 1
    plt.figure(figure_count)
    plt.ylabel("Accuracy")
    plt.xlabel("Dimensions")
    plt.title("Distribution of accuracy across dimensions")
    plt.hold(True)
    plt.grid(True)
    plt.ylim((0, 1))
    plt.plot(dim_acc_svm[:, 0], dim_acc_svm[:, 1], 'xb-')
    plt.savefig("hw4-7.png")
    # return

    # 8. Implementation of neural network
    figure_count += 1
    plt.figure(figure_count)
    plt.ylabel("Accuracy")
    plt.xlabel("Dimensions")
    plt.title("Distribution of accuracy across dimensions")
    plt.hold(True)
    plt.grid(True)
    plt.ylim((0, 1))
    plt.plot(dim_acc_mlp[:, 0], dim_acc_mlp[:, 1], 'xb-')
    plt.savefig("hw4-8.png")