Esempio n. 1
0
# Debug dump: print every element of final_stars_FINAL_READY so the layout
# of the data can be checked by eye before it is turned into a matrix.
# NOTE: collect() pulls the whole dataset back to the driver.
print(
    "# do I have a 2D matrix now --> FULLY PREDICTED ????????????????????????")
for item in final_stars_FINAL_READY.collect():
    print(item)
print(
    "# do I have a 2D matrix now --> FULLY PREDICTED ??????????????????????? ==> NOW WE KNOw ........."
)

# Each element is read as (vector, index); IndexedRow wants (index, vector),
# hence the swap.
indexed_rows = final_stars_FINAL_READY.map(
    lambda pair: IndexedRow(pair[1], pair[0]))
iris_irm = IndexedRowMatrix(indexed_rows)

# ------------------------------------------
# Reference:
# https://blog.paperspace.com/dimension-reduction-with-principal-component-analysis/
# Truncated SVD: keep only the leading singular values and also compute U.
num_of_top_sing_values = 2
SVD = iris_irm.computeSVD(num_of_top_sing_values, computeU=True)

S = SVD.s.toArray()
U = SVD.U

# Turn the retained singular values into eigenvalues (s**2 / (n - 1)),
# order them from largest to smallest, and accumulate their share.
n = final_stars_FINAL_READY.count()
eigvals = np.sort(S**2 / (n - 1))[::-1]
cumsum = np.cumsum(eigvals)
# NOTE(review): the denominator sums only the retained eigenvalues, so the
# last entry of this ratio is always 1.0 -- it is the share among the kept
# components, not of the full spectrum.
total_variance_explained = cumsum / np.sum(eigvals)
print(
    "".join((
        "total_variance_explained, given num reviews: ",
        str(n),
        " and num_of_top_sing_values. ",
        str(num_of_top_sing_values),
        "=======================================> ",
    )),
    total_variance_explained,
)
# on 1000 with 2 PCs --> total_variance_explained =======================================>  [0.61812207 1.        ]
Esempio n. 2
0
from pyspark.mllib.linalg import Vectors, DenseMatrix, Matrix
from sklearn import datasets
# Standardize the dataset: the scaler subtracts the per-column mean
# (withMean=True) and leaves the scale untouched (withStd=False).

X_rdd = sc.parallelize(X).map(lambda x: Vectors.dense(x))
# Fit on the same RDD that is transformed below so the centering uses this
# data's own column means.
scaler = StandardScaler(withMean=True, withStd=False).fit(X_rdd)

X_sc = scaler.transform(X_rdd)


# Create the IndexedRowMatrix from (row_index, vector) pairs; zipWithIndex
# yields (vector, index), hence the swap.
X_rm = IndexedRowMatrix(X_sc.zipWithIndex().map(lambda x: (x[1], x[0])))

# Compute the SVD factorization of the matrix. First argument: how many
# singular values to keep (here one per column, i.e. all of them); second:
# whether to compute U or not.
svd_o = X_rm.computeSVD(X_rm.numCols(), True)

# svd_o.V is of shape n * k, not k * n (as in sklearn).

P_comps = svd_o.V.toArray().copy()
num_rows = X_rm.numRows()

S = svd_o.s.toArray()
eig_vals = S**2
# change the ncomp to 3 for this tutorial
# (alternative: smallest count whose cumulative eigenvalue share exceeds 95%)
#n_comp  = np.argmax(np.cumsum(eig_vals)/eig_vals.sum() > 0.95)+1
n_comp = 3
# U is whitened and projected onto principal components subspace: an RDD of
# (row_index, first n_comp entries of sqrt(num_rows - 1) * U_row).
U = svd_o.U.rows.map(lambda x:(x.index, (np.sqrt(num_rows-1)*x.vector).tolist()[0:n_comp]))
# K is our transformation matrix to obtain projection on PC's subspace.
# It is built from the right singular vectors (P_comps, a local NumPy array),
# each column divided by its singular value; U above is a distributed RDD and
# cannot be divided by a NumPy array.
# NOTE(review): rows projected with K match U above only up to the
# sqrt(num_rows - 1) factor -- confirm which scaling downstream code expects.
K = (P_comps / S).T[:n_comp]