def MatrixTranspose(mat):
    '''
    Transpose a distributed matrix, returning an IndexedRowMatrix whose rows
    are sparse vectors (to save space/memory when the input is sparse).

    :param mat: input RowMatrix or IndexedRowMatrix
    :return: the transposed matrix as an IndexedRowMatrix, rows ordered by index

    Known pitfalls this implementation works around (see
    https://stackoverflow.com/questions/34451253 and
    https://stackoverflow.com/questions/47102378):
      - converting a CoordinateMatrix to a RowMatrix does NOT preserve row
        order, so we go through toIndexedRowMatrix() and orderBy("index");
      - for an IndexedRowMatrix input we use its own row indices directly
        instead of calling toRowMatrix() + zipWithIndex(), which could
        silently renumber rows.
    NOTE(review): enumerating every vector element densifies sparse rows;
    avoid this function in hot paths if the input is very sparse.
    '''
    if isinstance(mat, IndexedRowMatrix):
        # Use the caller-supplied row indices as-is; toRowMatrix() would
        # discard them and may reorder rows.
        entries = mat.rows.flatMap(
            lambda r: [MatrixEntry(r.index, j, v) for j, v in enumerate(r.vector)])
    else:
        entries = mat.rows.zipWithIndex().flatMap(
            lambda x: [MatrixEntry(x[1], j, v) for j, v in enumerate(x[0])])
    transposed_mat = CoordinateMatrix(entries)
    # orderBy("index") restores deterministic row order after the transpose.
    transposed_df = transposed_mat.transpose().toIndexedRowMatrix().rows.toDF(
    ).orderBy("index")
    # Re-sparsify each row before rebuilding the IndexedRowMatrix.
    sparse_rows = transposed_df.rdd.map(lambda row: IndexedRow(
        row["index"],
        MLLibVectors.sparse(
            row["vector"].size,
            np.nonzero(row["vector"].values)[0],
            row["vector"].values[np.nonzero(row["vector"].values)])))
    return IndexedRowMatrix(sparse_rows)
def transpose(rm):
    '''
    Transpose a RowMatrix by expanding it into CoordinateMatrix entries.

    :param rm: the RowMatrix to transpose
    :return: the transposed matrix as a RowMatrix

    NOTE(review): toRowMatrix() on a CoordinateMatrix does not guarantee row
    ordering — confirm downstream code tolerates this before relying on it.
    '''
    entries = rm.rows.zipWithIndex().flatMap(
        lambda pair: [MatrixEntry(pair[1], col, val)
                      for col, val in enumerate(pair[0])])
    coord = CoordinateMatrix(entries)
    return coord.transpose().toRowMatrix()
# Run against a local Spark instance on this machine.
# findspark.init()
# from pyspark.sql import SparkSession

local_file_location = 'file:///wasp/pdb1HYS.mtx.mtx'

# Parse the MatrixMarket-style text file into (i, j, value) entries.
rdd = spark.sparkContext.textFile(local_file_location)
rdd = rdd.map(lambda line: line.split(" "))
rdd = rdd.map(
    lambda fields: MatrixEntry(int(fields[0]), int(fields[1]), float(fields[2])))

mat = CoordinateMatrix(rdd)
M = mat.toRowMatrix()
A = mat.toBlockMatrix()
At = mat.transpose().toBlockMatrix()

print("SVD")
print(M.numRows(), M.numCols())

# Time the truncated SVD, averaged over several runs.
start_svd = time.time()
NUM_TIMES = 10  # repetitions used to compute the mean wall-clock time
for _ in range(NUM_TIMES):
    svd = M.computeSVD(5, computeU=True)
end_svd = time.time()
# Mean wall-clock seconds per computeSVD call (floating point).
print("Time elapsed: ", (end_svd - start_svd) / NUM_TIMES)

print("multiply Matrix")
debug = Debugger() debug.TIMESTAMP(1) def to_matrix_entry(s): ss = s.split() entry = MatrixEntry(float(ss[0]), float(ss[1]), float(ss[2])) return entry data = sc.textFile('hdfs://node1:9000/input/sqr.txt') mat = data.map(to_matrix_entry) rdd = sc.parallelize(mat.collect()) coord_mat = CoordinateMatrix(rdd) coord_mat = coord_mat.transpose() row_mat = coord_mat.toRowMatrix() sim = row_mat.columnSimilarities() print(sim.entries.take(10)) debug.TIMESTAMP(2) ''' data = data.map(lambda _ : np.array(_.strip().split()).astype(float)) unitMatrix = data.map(lambda _ : _/np.linalg.norm(_)) #unitMatrix = sc.parallelize(np.array([[1,2,3,5,7], [6,2,1,-1,3], [7,0,1,2,-4]]).T) mat = RowMatrix(unitMatrix) S = mat.columnSimilarities() sims = S.entries.collect() print(len(sims))