from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.mllib.linalg.distributed import CoordinateMatrix

# SIMILARITY_FILE_SORTED is assumed to be defined elsewhere in the module.


def calculate_similarity(self):
    train = self.train_data

    # Mean rating per user, used to mean-centre each rating.
    train_user_mean = train.groupBy("userId").agg(F.mean('rating'))
    train_user_mean = train_user_mean.withColumnRenamed("avg(rating)", "user_mean")
    train_rating_avg = train.join(train_user_mean, 'userId', how='left_outer')
    train_rating_avg = train_rating_avg.select(
        '*',
        (train_rating_avg.rating - train_rating_avg.user_mean).alias('rating_norm'))

    # Rows are movies and columns are users, so columnSimilarities() yields
    # cosine similarities between users.
    rdd = (train_rating_avg.select('movieId', 'userId', 'rating_norm')
           .rdd.map(tuple))
    coord = CoordinateMatrix(rdd)
    mat = coord.toRowMatrix()
    similarities = mat.columnSimilarities()
    similarities_df = similarities.entries.toDF()

    # Keep only the 100 most similar entries per left index.
    window = (Window.partitionBy(similarities_df['i'])
              .orderBy(similarities_df['value'].desc()))
    similarities_df_ranked = (
        similarities_df
        .select('*', F.row_number().over(window).alias('row_number'))
        .filter(F.col('row_number') <= 100))
    similarities_df_ranked.write.parquet(SIMILARITY_FILE_SORTED, mode='overwrite')
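# A minimal usage sketch for the file written above (assumptions: a SparkSession
# named `spark`, the same SIMILARITY_FILE_SORTED path, and a hypothetical id 42):
# read the ranked similarities back and list the closest neighbours of one column.
top_neighbours = (spark.read.parquet(SIMILARITY_FILE_SORTED)
                  .filter(F.col('i') == 42)
                  .orderBy(F.col('value').desc())
                  .select('j', 'value'))
top_neighbours.show(10)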
from pyspark.mllib.linalg.distributed import CoordinateMatrix


def process(sparkContext, sqlContext):
    # Define database connection parameters.
    MYSQL_USERNAME = "******"
    MYSQL_PASSWORD = "******"
    MYSQL_CONNECTION_URL = ("jdbc:mysql://qcis4:3306/dblp?user=" + MYSQL_USERNAME
                            + "&password=" + MYSQL_PASSWORD)

    df = (
        sqlContext.read.format("jdbc")
        .options(url=MYSQL_CONNECTION_URL,
                 driver="com.mysql.jdbc.Driver",
                 dbtable="dblp.author_sample")
        .load()
    )

    # Map each distinct hash to a dense integer index.
    rows = df.select("name_hash").distinct().rdd.map(lambda r: r.name_hash).collect()
    columns = df.select("paper_hash").distinct().rdd.map(lambda r: r.paper_hash).collect()
    rawData = df.rdd.map(lambda p: (rows.index(p.name_hash),
                                    columns.index(p.paper_hash),
                                    1.0)).cache()

    # Create a CoordinateMatrix from an RDD of (i, j, value) entries.
    mat = CoordinateMatrix(rawData)
    rowMat = mat.toRowMatrix()
    print(mat.numRows())  # 3
    print(rowMat.numCols())
    # transpose = rowMat.rows().zipWithIndex().map(lambda rvect, i : rvect.zipWithIndex().map( lambda ax, j : (j,(i,ax))))
    for r in rowMat.rows.collect():
        print(r)
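# An alternative sketch under the same `df` (assumption: the imports above):
# zipWithIndex() assigns the dense indices with joins instead of the O(n)
# list.index() lookups that run inside the closure for every input row.
name_ids = df.select("name_hash").distinct().rdd.map(lambda r: r.name_hash).zipWithIndex()
paper_ids = df.select("paper_hash").distinct().rdd.map(lambda r: r.paper_hash).zipWithIndex()
entries = (df.rdd.map(lambda p: (p.name_hash, p.paper_hash))
           .join(name_ids)                        # (name_hash, (paper_hash, i))
           .map(lambda kv: (kv[1][0], kv[1][1]))  # (paper_hash, i)
           .join(paper_ids)                       # (paper_hash, (i, j))
           .map(lambda kv: (kv[1][0], kv[1][1], 1.0)))
scalable_mat = CoordinateMatrix(entries)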
import pandas as pd
from pyspark.sql import SQLContext
from pyspark.mllib.linalg.distributed import CoordinateMatrix

# `sc` (SparkContext) and `train_lines` (RDD of "user,item,rating" lines) are
# assumed to be defined earlier in the script.
global train_rdd
train_rdd = train_lines.map(lambda line: line.split(',')).map(
    lambda tokens: (int(tokens[0]), int(tokens[1]), float(tokens[2])))

# Build train data dict with format [(user, item)] = rating, to later check
# whether a similar movie has been rated.
global train_dict
train_dict = {}
for x, y, z in train_rdd.collect():
    train_dict[(x, y)] = z

# ------------------------------ Build simPdsDF ------------------------------
# Form the utility matrix to get the similarity matrix later.
sqlCon = SQLContext(sc)
utilityMatrix = CoordinateMatrix(train_rdd)

# Similarity between items.
simMat = utilityMatrix.toRowMatrix().columnSimilarities()

# Convert simMat to pandas format.
global simPdsDF
sparkDF = simMat.entries.map(lambda x: str(x.i) + "," + str(x.j) + "," + str(
    x.value)).map(lambda w: w.split(',')).toDF()
simPdsDF = sparkDF.toPandas()
# Rename columns.
simPdsDF.columns = ['ItemID_1', 'ItemID_2', 'Similarity']
# Cast the string columns to numeric types.
simPdsDF['ItemID_1'] = simPdsDF['ItemID_1'].astype(int)
simPdsDF['ItemID_2'] = simPdsDF['ItemID_2'].astype(int)
simPdsDF['Similarity'] = simPdsDF['Similarity'].astype(float)

# --------------------- Used for RDD to calculate bias -----------------------
global train_pdsDF
train_pdsDF = pd.read_csv('train.dat', sep=",")
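# A minimal alternative sketch (same `simMat` as above): the string round-trip
# can be skipped by mapping the entries to typed tuples and naming the columns
# in toDF(), so the pandas dtypes come out right without astype().
directDF = simMat.entries.map(lambda e: (int(e.i), int(e.j), float(e.value))) \
                 .toDF(['ItemID_1', 'ItemID_2', 'Similarity'])
simPdsDF_direct = directDF.toPandas()  # dtypes are already int/int/float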
import pyspark
from pyspark import SparkContext
from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry
from operator import add
from pyspark.sql import SparkSession

sc = SparkContext()

# Parse lines like "A, i j value" into (i, j, value) integer tuples.
r = sc.textFile("part-00000")
m = (r.flatMap(lambda x: x.split('\n'))
      .filter(lambda x: "A" in x)
      .map(lambda x: (x.strip("A, ")).split(' '))
      .map(lambda x: tuple(list(map(int, x)))))
# n = m.map(lambda x : MatrixEntry(tuple(x)))

# A SparkSession is needed for toDF().
spark = SparkSession(sc)
# m.toDF().show()
print(hasattr(m, "toDF"))

cmat = CoordinateMatrix(m)
# mat = CoordinateMatrix(n)
# o = mat.take(5)
print(cmat.numRows())  # 3
print(cmat.numCols())

rowmat = cmat.toRowMatrix()
print(rowmat.numRows())  # 3
print(rowmat.numCols())
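# A minimal sketch fixing the commented-out MatrixEntry line above: MatrixEntry
# takes (i, j, value) as separate arguments, not a single tuple.
n = m.map(lambda x: MatrixEntry(x[0], x[1], x[2]))
cmat_from_entries = CoordinateMatrix(n)
print(cmat_from_entries.numRows(), cmat_from_entries.numCols())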
import time

from pyspark.sql import SparkSession
from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry

# conf = SparkConf().setAppName('linalgtest')
# sc = SparkContext(conf=conf).getOrCreate()
# use local spark on computer
# findspark.init()

spark = SparkSession.builder.getOrCreate()

# Assumes the Matrix Market header lines have already been stripped, so every
# line is "row col value".
local_file_location = 'file:///wasp/pdb1HYS.mtx.mtx'
rdd = spark.sparkContext.textFile(local_file_location)
rdd = rdd.map(lambda line: line.split(" "))
rdd = rdd.map(lambda line: MatrixEntry(int(line[0]), int(line[1]), float(line[2])))

mat = CoordinateMatrix(rdd)
M = mat.toRowMatrix()
A = mat.toBlockMatrix()
At = mat.transpose().toBlockMatrix()

print("SVD")
print(M.numRows(), M.numCols())

start_svd = time.time()
NUM_TIMES = 10  # run it 10 times to get a mean
for i in range(NUM_TIMES):
    svd = M.computeSVD(5, computeU=True)
end_svd = time.time()
print("Time elapsed: ", (end_svd - start_svd) / NUM_TIMES)  # mean wall-clock seconds per run
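# A minimal follow-up sketch (same `svd` as above): inspect the three factors of
# the rank-5 decomposition.
print("Singular values:", svd.s)                    # local DenseVector, length 5
print("U:", svd.U.numRows(), "x", svd.U.numCols())  # distributed RowMatrix
print("V:", svd.V.numRows, "x", svd.V.numCols)      # local DenseMatrix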
import numpy as np
from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry, RowMatrix

# `debug` and `sc` are assumed to be provided by the surrounding script.
debug.TIMESTAMP(1)


def to_matrix_entry(s):
    ss = s.split()
    return MatrixEntry(int(ss[0]), int(ss[1]), float(ss[2]))


data = sc.textFile('hdfs://node1:9000/input/sqr.txt')
entries = data.map(to_matrix_entry)

# No need to collect() and re-parallelize; the entries RDD can be used directly.
coord_mat = CoordinateMatrix(entries)
coord_mat = coord_mat.transpose()
row_mat = coord_mat.toRowMatrix()

sim = row_mat.columnSimilarities()
print(sim.entries.take(10))

debug.TIMESTAMP(2)

'''
data = data.map(lambda _: np.array(_.strip().split()).astype(float))
unitMatrix = data.map(lambda _: _ / np.linalg.norm(_))
# unitMatrix = sc.parallelize(np.array([[1,2,3,5,7], [6,2,1,-1,3], [7,0,1,2,-4]]).T)
mat = RowMatrix(unitMatrix)
S = mat.columnSimilarities()
sims = S.entries.collect()
print(len(sims))
print(sims)
'''
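# A minimal variant sketch (same `row_mat` as above): columnSimilarities() accepts
# an optional threshold that trades exactness for speed via DIMSUM sampling.
approx_sim = row_mat.columnSimilarities(0.1)
print(approx_sim.entries.take(10))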
import timeit
import psutil
from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry, RowMatrix

# diag_entries_1..3, coo_matrix_input_all, t0, args, sc, mb and Natoms are
# assumed to be defined earlier in the script.
diag_entries = diag_entries_1.union(diag_entries_2).union(diag_entries_3)
coo_matrix_input_all = coo_matrix_input_all.union(diag_entries)
coo_matrix_entries = coo_matrix_input_all.map(lambda e: MatrixEntry(e[0], e[1], e[2]))
coo_matrix = CoordinateMatrix(coo_matrix_entries)

# Save the assembled Laplacian to a file.
coo_matrix_input_all.repartition(1).saveAsTextFile("./Laplacian_4v7o_4cores_1")

t2 = timeit.default_timer()
print("Elapsed time for construction: {:} s".format(t2 - t0))

# Singular value decomposition.
dataRows = coo_matrix.toRowMatrix().rows
k = int(args.k)  # number of singular values
svd = RowMatrix(dataRows.persist()).computeSVD(k, computeU=True)
U = svd.U  # the U factor is a RowMatrix
s = svd.s  # the singular values are stored in a local dense vector
V = svd.V  # the V factor is a local dense matrix

sc.parallelize(V.toArray()).repartition(1).saveAsTextFile("EigenVectors_4v7o_4cores")
sc.parallelize(s.toArray()).repartition(1).saveAsTextFile("EigenValues_4v7o_4cores")

t4 = timeit.default_timer()
print("Elapsed time for SVD: {:} s".format(t4 - t2))
print("Total memory = {:}, used memory = {:}, free memory = {:}".format(
    psutil.virtual_memory().total / mb,
    (psutil.virtual_memory().total - psutil.virtual_memory().free) / mb,
    psutil.virtual_memory().free / mb))
print("System size = {:} atoms".format(Natoms.sum()))
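# A minimal read-back sketch (assumptions: the same `sc` and output path): each
# saved line is the text form of one singular value, so float() recovers it.
eigvals = sc.textFile("EigenValues_4v7o_4cores").map(float).collect()
print(sorted(eigvals, reverse=True)[:k])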