Example 1
import numpy as np
from pyspark.mllib.linalg import Vectors as MLLibVectors
from pyspark.mllib.linalg.distributed import (CoordinateMatrix, IndexedRow,
                                              IndexedRowMatrix, MatrixEntry)


def MatrixTranspose(mat):
    # Known issues:
    # 1. Fails on some data for unclear reasons; reducing the number of rows
    #    can help.
    # 2. The transpose sometimes returns a wrong result, which seems to be a
    #    partitioning issue -- repartition(1) sometimes fixes it. PySpark also
    #    changes the order of rows when a transposed CoordinateMatrix is
    #    converted to a RowMatrix; see
    #    https://stackoverflow.com/questions/34451253/converting-coordinatematrix-to-rowmatrix-doesnt-preserve-row-order
    #    Using an indexed matrix and reordering partially works around this,
    #    but it is awkward.
    '''
    transpose a row matrix -- to save space/memory, use a sparse vector when
    the input is a sparse vector
    :param mat: the input row matrix
    :return: a transposed row matrix
    ref: https://stackoverflow.com/questions/47102378/transpose-a-rowmatrix-in-pyspark
    '''
    if isinstance(mat, IndexedRowMatrix):
        mat = mat.toRowMatrix()
    # This step materializes every entry as a dense MatrixEntry; avoid this
    # function where efficiency matters.
    transposed_mat = CoordinateMatrix(mat.rows.zipWithIndex().flatMap(
        lambda x: [MatrixEntry(x[1], j, v) for j, v in enumerate(x[0])]))
    # Transpose, then sort by row index to restore row order.
    transposed_mat = transposed_mat.transpose().toIndexedRowMatrix().rows.toDF(
    ).orderBy("index")
    # Convert back to sparse vectors first, then wrap in an IndexedRowMatrix.
    transposed_mat = transposed_mat.rdd.map(lambda row: IndexedRow(
        row["index"],
        MLLibVectors.sparse(
            row["vector"].size,
            np.nonzero(row["vector"].values)[0],
            row["vector"].values[np.nonzero(row["vector"].values)])))
    return IndexedRowMatrix(transposed_mat)
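For reference, a minimal usage sketch (hypothetical, not part of the original example; it assumes an active SparkSession named `spark` and the imports above):

rows = spark.sparkContext.parallelize([
    IndexedRow(0, [1.0, 2.0, 3.0]),
    IndexedRow(1, [4.0, 5.0, 6.0]),
])
mat = IndexedRowMatrix(rows)             # 2 x 3 matrix
mat_t = MatrixTranspose(mat)             # 3 x 2 matrix
print(mat_t.numRows(), mat_t.numCols())  # expected: 3 2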
Example 2
from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry


def transpose(rm):
    # Convert the RowMatrix to a CoordinateMatrix, transpose, convert back.
    cm = CoordinateMatrix(rm.rows.zipWithIndex().flatMap(
        lambda x: [MatrixEntry(x[1], j, v) for j, v in enumerate(x[0])]))
    return cm.transpose().toRowMatrix()
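As Example 1 notes, toRowMatrix() after transpose() does not guarantee row order. A minimal usage sketch (hypothetical; assumes an active SparkSession named `spark`):

from pyspark.mllib.linalg.distributed import RowMatrix

rm = RowMatrix(spark.sparkContext.parallelize([[1.0, 2.0], [3.0, 4.0]]))
rm_t = transpose(rm)
print(rm_t.numRows(), rm_t.numCols())  # expected: 2 2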
Example 3
import time

from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry

# Use a local Spark installation on this machine; assumes an existing
# SparkSession named `spark`.
# findspark.init()
# from pyspark.sql import SparkSession

local_file_location = 'file:///wasp/pdb1HYS.mtx.mtx'

rdd = spark.sparkContext.textFile(local_file_location)
# Note: this parsing assumes the Matrix Market header ('%' comment lines and
# the size line) has already been stripped from the file.
rdd = rdd.map(lambda line: line.split(" "))
rdd = rdd.map(
    lambda line: MatrixEntry(int(line[0]), int(line[1]), float(line[2])))

mat = CoordinateMatrix(rdd)
M = mat.toRowMatrix()
A = mat.toBlockMatrix()
At = mat.transpose().toBlockMatrix()

print("SVD")
print(M.numRows(), M.numCols())
start_svd = time.time()

NUM_TIMES = 10
# Run the decomposition NUM_TIMES times and report the mean.
for i in range(NUM_TIMES):
    svd = M.computeSVD(5, computeU=True)

end_svd = time.time()
print("Time elapsed: ",
      (end_svd - start_svd) / NUM_TIMES)  # mean wall-clock seconds per run
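The SVD object returned by computeSVD exposes the three factors; a short sketch of how the result above (k = 5) can be inspected:

U = svd.U   # distributed RowMatrix, numRows(M) x 5
s = svd.s   # DenseVector of the 5 largest singular values
V = svd.V   # local dense matrix, numCols(M) x 5

print(s)                          # singular values, in descending order
print(U.numRows(), U.numCols())   # RowMatrix dimensions are methods
print(V.numRows, V.numCols)       # local Matrix dimensions are attributes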

print("multiply Matrix")
Example 4
from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry

# Debugger is a project-specific timing helper; its definition is not shown
# in this snippet.
debug = Debugger()
debug.TIMESTAMP(1)


def to_matrix_entry(s):
    # Each input line holds "row col value"; row/col indices are integers.
    ss = s.split()
    return MatrixEntry(int(ss[0]), int(ss[1]), float(ss[2]))


data = sc.textFile('hdfs://node1:9000/input/sqr.txt')
mat = data.map(to_matrix_entry)
# mat is already an RDD of MatrixEntry; collecting it to the driver and
# re-parallelizing (sc.parallelize(mat.collect())) is unnecessary and would
# pull the whole dataset onto a single machine.

coord_mat = CoordinateMatrix(mat)
coord_mat = coord_mat.transpose()
row_mat = coord_mat.toRowMatrix()
sim = row_mat.columnSimilarities()
print(sim.entries.take(10))

debug.TIMESTAMP(2)
'''
Alternative approach: normalize each row to unit length, then compute the
column similarities directly on a RowMatrix.

data = data.map(lambda _: np.array(_.strip().split()).astype(float))
unitMatrix = data.map(lambda _: _ / np.linalg.norm(_))

# unitMatrix = sc.parallelize(np.array([[1,2,3,5,7], [6,2,1,-1,3], [7,0,1,2,-4]]).T)
mat = RowMatrix(unitMatrix)
S = mat.columnSimilarities()

sims = S.entries.collect()
print(len(sims))
'''
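columnSimilarities() with no arguments computes exact cosine similarities between all column pairs. RowMatrix also accepts a threshold argument that switches to the approximate DIMSUM sampling scheme, which can be much cheaper on wide matrices. A minimal sketch (the threshold value is illustrative only):

# Similarities below the threshold may be missed, in exchange for less
# shuffling; tune the value for your data.
sim_approx = row_mat.columnSimilarities(threshold=0.1)
print(sim_approx.entries.take(10))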