def svd_dimensionality_reduction_TruncatedSVD(A, dimension=2):
    """Reduce ``A`` to ``dimension`` components with scikit-learn's TruncatedSVD.

    A fresh ``TruncatedSVD`` estimator is fitted on ``A`` and the same data
    is then projected through it.

    Args:
        A: Data accepted by ``TruncatedSVD.fit`` / ``TruncatedSVD.transform``.
        dimension: Number of components to keep (``n_components``).

    Returns:
        The output of ``TruncatedSVD.transform(A)``.
    """
    print("********** scikit-learn provides a TruncatedSVD class *************")
    # Imported at call time so that merely importing this module does not
    # require scikit-learn.
    from sklearn.decomposition import TruncatedSVD

    # ``fit`` returns the fitted estimator, so fitting and projecting chain.
    return TruncatedSVD(n_components=dimension).fit(A).transform(A)
def tSNE(fileName, db_path='data/10Feature.db', ncomps=19):
    """Reduce a CSV data set with TruncatedSVD, then embed it in 2-D with t-SNE.

    Steps performed:

    1. Load ``fileName``, normalise the column names (lower-case, whitespace
       runs replaced by ``_``) and drop the ``orig_set`` column if present.
    2. Round-trip the cleaned frame through the SQLite table ``df_clean``.
    3. Standardise every column from the third one onwards and reduce the
       result to ``ncomps`` TruncatedSVD components (stored in the SQLite
       table ``df_svd``).
    4. Show the cumulative explained variance, a pair plot of the first
       eight components, and two t-SNE scatter plots (plain and coloured by
       ``class``).

    Args:
        fileName: Path of the CSV file to load.  After column-name
            normalisation it must contain a ``class`` column, which is used
            to colour the plots.
        db_path: SQLite database file used to cache the intermediate frames.
            Defaults to the path that used to be hard-coded.
        ncomps: Number of SVD components to keep.  Defaults to the value
            that used to be hard-coded.

    Returns:
        None.  Results are printed, plotted and written to the database.
    """
    cnx = sqlite3.connect(db_path)
    try:
        dforigtrain = pd.read_csv(fileName)
        print(dforigtrain.shape)
        print(dforigtrain.head())

        # "Some Column " -> "some_column"
        dforigtrain.rename(
            columns=lambda name: '_'.join(name.lower().split()),
            inplace=True)
        df = dforigtrain[[c for c in dforigtrain.columns if c != 'orig_set']]
        print(df.shape)

        # Write to DB to allow easier loading later
        df.to_sql('df_clean', cnx, if_exists='replace', index=None)
        df = pd.read_sql('select * from df_clean', cnx)
        print(df.shape)

        # The first two columns are excluded from scaling.
        # NOTE(review): presumably they hold an identifier and the label --
        # confirm against the CSV layout.
        features = df.iloc[:, 2:]
        scaler = StandardScaler().fit(features)
        dfs = pd.DataFrame(scaler.transform(features),
                           index=df.index,
                           columns=features.columns)

        # Kept for reference: inspect the full singular-value spectrum.
        # u, s, vt = svd(dfs)
        # ax = pd.Series(s).plot(figsize=(10, 3), logy=True)
        # print('{} SVs are NaN'.format(np.isnan(s).sum()))
        # print('{} SVs less than 1e-12'.format(len(s[s < 1e-12])))
        # plt.show()

        # Truncated SVD is mostly helpful on image data sets, where reducing
        # the dimensionality is usually possible.  In our case every feature
        # was contributing, hence no real truncation is possible.
        svd = TruncatedSVD(algorithm='randomized', n_components=ncomps)
        # A single fit_transform both fits the model and projects the data;
        # the separate fit() that used to precede it was redundant.
        Y = svd.fit_transform(dfs)
        cum_var = svd.explained_variance_ratio_.cumsum()
        pd.Series(cum_var).plot(kind='line', figsize=(10, 3))
        print('Variance preserved by first {} components == {:.2%}'.format(
            ncomps, cum_var[-1]))
        plt.show()

        dfsvd = pd.DataFrame(Y,
                             columns=['c{}'.format(c) for c in range(ncomps)],
                             index=df.index)
        dfsvd.to_sql('df_svd', cnx, if_exists='replace', index=None)
        dfsvd = pd.read_sql('select * from df_svd', cnx)
        print(dfsvd.shape)
        svdcols = [c for c in dfsvd.columns if c[0] == 'c']
        # The shape is printed a second time to keep the console output
        # identical to earlier versions of this function.
        print(dfsvd.shape)
        print(dfsvd.head())

        plotdims = 8    # number of leading components shown in the pair plot
        ploteorows = 1  # plot every n-th row
        # .copy() so that adding the label column never writes into a view
        # of dfsvd.
        dfsvdplot = dfsvd[svdcols].iloc[:, :plotdims].copy()
        dfsvdplot['class'] = df['class']
        # interactive(plot_3d_scatter, A=fixed(dfsvd), elevation=30, azimuth=120)
        # NOTE(review): recent seaborn releases call the `size` argument
        # `height`; it is left unchanged here to match the installed version.
        sns.pairplot(dfsvdplot.iloc[::ploteorows, :], hue='class', size=1.8)
        plt.show()

        tsne = TSNE(n_components=2, random_state=0)
        # Kept for reference: timing of t-SNE on growing row subsets.
        # rowsubset = [10, 20, 40, 80, 160, 320, 640, 1280, 1900]
        # runs = np.empty((len(rowsubset), 1))
        # for i, rows in enumerate(rowsubset):
        #     t0 = time()
        #     Z = tsne.fit_transform(dfsvd.iloc[:rows, :][svdcols])
        #     runs[i] = time() - t0
        # ax = pd.DataFrame(runs, index=rowsubset).plot(
        #     kind='bar', logy=False, figsize=(10, 4))
        # plt.show()

        Z = tsne.fit_transform(dfsvd[svdcols])
        dftsne = pd.DataFrame(Z, columns=['x', 'y'], index=dfsvd.index)
        scatter_style = {'alpha': 0.7, 's': 60}

        # x / y / data are passed by keyword so the call works regardless of
        # the positional order a given seaborn release expects.
        ax = sns.lmplot(x='x', y='y', data=dftsne, fit_reg=False, size=8,
                        scatter_kws=scatter_style)
        ax.axes.flat[0].set_title(
            'Scatterplot of a 50D dataset reduced to 2D- Unsupervised')
        # plt.show()

        dftsne['class'] = df['class']
        g = sns.lmplot(x='x', y='y', data=dftsne, hue='class', fit_reg=False,
                       size=8, scatter_kws=scatter_style)
        g.axes.flat[0].set_title(
            'Scatterplot of a 50D dataset reduced to 2D -Supervised')
        plt.show()
    finally:
        # Release the database handle even if loading or plotting fails.
        cnx.close()
[21, 22, 23, 24, 25, 26, 27, 28, 29, 30]]) # SVD U, s, VT = svd(A) #Calculate how many singular values we have to take into account -> 80-90% sumv = np.sum(s) * 0.85 cumsumv = np.cumsum(s) def find_nearest(array, value): array = np.asarray(array) idx = (np.abs(array - value)).argmin() return array[idx] nearests = find_nearest(cumsumv, sumv) itemindexs = np.where(cumsumv == nearests) print(itemindexs) ## n_elements = valueof itemindexs svd = TruncatedSVD(n_components=3) svd.fit(A) result = svd.transform(A) np.asarray(result) print(result)
# transform
# U, Sigma, VT and A used in the next four statements are defined before
# this chunk.
T = U.dot(Sigma)
print(T)
T = A.dot(VT.T)
print(T)
from numpy import array
from sklearn.decomposition import TruncatedSVD

# define array
A = array([
    [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    [11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
    [21, 22, 23, 24, 25, 26, 27, 28, 29, 30],
])
print(A)

# svd: fit a two-component model on A and project A through it
# (``fit`` returns the fitted estimator itself)
svd = TruncatedSVD(n_components=2).fit(A)
result = svd.transform(A)
print(result)
# map the reduced rows back into the original 10-column space
B = svd.inverse_transform(result)

# second array, projected with the model that was fitted on A
C = array([
    [31, 32, 53, 44, 45, 66, 77, 48, 29, 210],
    [11, 12, 13, 134, 15, 16, 17, 18, 19, 20],
    [21, 22, 23, 24, 25, 126, 27, 28, 29, 30],
])
print(C)
result = svd.transform(C)
print(result)
B = svd.inverse_transform(result)
from scipy.linalg import svd
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Pipeline: load labelled titles -> TF-IDF -> TruncatedSVD (200 components)
# -> MLP training -> accuracy on the test split.

# train = "../../test/03-train-input.txt"
train = "../../data/titles-en-train.labeled"
test = "../../data/titles-en-test.labeled"

train_X, train_y, train_v = load_data(train)
test_X, test_y, test_v = load_data(test)

# cv = CountVectorizer()
# cv.fit(list(map(lambda x: " ".join(x), train_X)))
# train_X, test_X = get_cntvec(cv, train_X), get_cntvec(cv, train_X)

# Each training sample is a sequence of tokens; the vectorizer is fitted on
# the space-joined text of the training split only.
v = TfidfVectorizer(max_df=0.8)
v.fit([" ".join(tokens) for tokens in train_X])
train_X = get_tdidf(v, train_X)
test_X = get_tdidf(v, test_X)

# NOTE: this rebinds `svd`, hiding the scipy.linalg function imported above.
svd = TruncatedSVD(n_components=200, random_state=3939)
svd.fit(train_X)
train_X = get_reduced(svd, train_X)
test_X = get_reduced(svd, test_X)

# Features as float32, labels as int32.
train_X = np.array(train_X).astype(np.float32)
train_y = np.array(train_y).astype(np.int32)
test_X = np.array(test_X).astype(np.float32)
test_y = np.array(test_y).astype(np.int32)
# print(train_X, train_y)

# Hidden layer is half as wide as the input; single output unit.
n_features = len(train_X[0])
model = MLP(n_features, n_features // 2, 1)
# optimizer = SGD(lr=0.01)
optimizer = Adam(lr=0.01)
batch_size = 32
max_epoch = 50
train_model(model, optimizer, train_X, train_y, batch_size, max_epoch)

# Squash the raw outputs with tanh before `activate` turns them into labels.
pred_y = np.tanh(model.predict(test_X))
print(accuracy_score(test_y, activate(pred_y)))