def calculate_cosine():
    def transform(self, f):
        return f(self)

    def cosineSimilarity(v1, v2):
        x = v1.toArray()
        y = v2.toArray()
        values1 = v1.values
        values2 = v2.values
        dotProduct = x.dot(y)
        ratingNorm = math.sqrt(sum(values1 ** 2))
        rating2Norm = math.sqrt(sum(values2 ** 2))
        return float(dotProduct / (ratingNorm * rating2Norm))

    ratings = spark.read.csv("/home/your_path/ratings.csv", header=True, inferSchema=True)
    df_3 = ratings.select(["movieId", "userId", "rating"])
    cmat_3 = CoordinateMatrix(df_3.rdd.map(tuple))
    i_3 = cmat_3.toIndexedRowMatrix()
    i_df_3 = i_3.rows.toDF(["id", "features"])

    DataFrame.transform = transform
    distance_cosine = udf(cosineSimilarity, FloatType())

    possibleMatches = i_df_3.transform(
        lambda df: df.alias("left").join(df.alias("right")))
    result = possibleMatches.filter((col("left.id") != col("right.id"))
                                    & (col("left.id") < col("right.id"))) \
        .withColumn("cosine", distance_cosine("left.features", "right.features")) \
        .select("left.id", "right.id", "cosine")
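# A minimal, self-contained sanity check for the cosineSimilarity formula used
# above (hedged sketch: the two sparse vectors here are made up for
# illustration and are not taken from the original code).
import math
import numpy as np
from pyspark.mllib.linalg import Vectors

v1 = Vectors.sparse(4, {0: 1.0, 2: 3.0})
v2 = Vectors.sparse(4, {0: 2.0, 2: 1.0, 3: 5.0})

x, y = v1.toArray(), v2.toArray()
cosine = float(x.dot(y) / (math.sqrt(sum(v1.values ** 2)) * math.sqrt(sum(v2.values ** 2))))
expected = float(x.dot(y) / (np.linalg.norm(x) * np.linalg.norm(y)))
assert abs(cosine - expected) < 1e-9  # the UDF's norm-of-nonzeros formula matches the NumPy norms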
def process(sparkContext, sqlContext):
    # Define database connection parameters
    MYSQL_USERNAME = "******"
    MYSQL_PASSWORD = "******"
    MYSQL_CONNECTION_URL = ("jdbc:mysql://qcis4:3306/dblp?user=" + MYSQL_USERNAME
                            + "&password=" + MYSQL_PASSWORD)
    df = (sqlContext.read.format("jdbc")
          .options(url=MYSQL_CONNECTION_URL,
                   driver="com.mysql.jdbc.Driver",
                   dbtable="dblp.author_sample")
          .load())

    rows = df.select("name_hash").distinct().rdd.map(lambda r: r.name_hash).collect()
    colums = df.select("paper_hash").distinct().rdd.map(lambda r: r.paper_hash).collect()
    rawData = df.rdd.map(lambda p: (rows.index(p.name_hash),
                                    colums.index(p.paper_hash),
                                    1.0)).cache()

    # Create a CoordinateMatrix from an RDD of matrix entries.
    mat = CoordinateMatrix(rawData)
    rowMat = mat.toRowMatrix()
    print(mat.numRows())
    print(rowMat.numCols())
    # transpose = rowMat.rows.zipWithIndex().map(lambda rvect, i: rvect.zipWithIndex().map(
    #     lambda ax, j: (j, (i, ax))))
    for r in rowMat.rows.collect():
        print(r)
def calculate_jaccard():
    def jaccardSimilarity(v1, v2):
        indices1 = set(v1.indices)
        indices2 = set(v2.indices)
        intersection = set.intersection(indices1, indices2)
        union = indices1.union(indices2)
        return float(len(intersection)) / float(len(union))

    ratings = spark.read.csv("/home/your_path/ratings.csv", header=True, inferSchema=True)
    df_3 = ratings.select(["movieId", "userId"]).withColumn("rating", lit(1.0))
    cmat_3 = CoordinateMatrix(df_3.rdd.map(tuple))
    i_3 = cmat_3.toIndexedRowMatrix()
    i_df_3 = i_3.rows.toDF(["id", "features"])

    def transform(self, f):
        return f(self)

    DataFrame.transform = transform
    jaccard_udf = udf(jaccardSimilarity, FloatType())

    possibleMatches = i_df_3.transform(
        lambda df: df.alias("left").join(df.alias("right")))
    result = possibleMatches.filter((col("left.id") != col("right.id"))
                                    & (col("left.id") < col("right.id"))) \
        .withColumn("jaccard", jaccard_udf("left.features", "right.features")) \
        .select("left.id", "right.id", "jaccard")
def calculate_similarity(self):
    train = self.train_data
    train_user_mean = train.groupBy("userId").agg(F.mean('rating'))
    train_user_mean = train_user_mean.withColumnRenamed("avg(rating)", "user_mean")
    train_rating_avg = train.join(train_user_mean, 'userId', how='left_outer')
    train_rating_avg = train_rating_avg.select(
        '*',
        (train_rating_avg.rating - train_rating_avg.user_mean).alias('rating_norm'))
    rdd = (train_rating_avg.select('movieId', 'userId', 'rating_norm')
           .rdd.map(tuple))
    coord = CoordinateMatrix(rdd)
    mat = coord.toRowMatrix()
    similarities = mat.columnSimilarities()
    similarities_df = similarities.entries.toDF()
    window = (Window.partitionBy(similarities_df['i'])
              .orderBy(similarities_df['value'].desc()))
    similarities_df_ranked = (
        similarities_df
        .select('*', F.row_number().over(window).alias('row_number'))
        .filter(F.col('row_number') <= 100))
    similarities_df_ranked.write.parquet(SIMILARITY_FILE_SORTED, mode='overwrite')
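# Hedged sketch of the top-k ranking step used above, run on a tiny, made-up
# similarity DataFrame with columns (i, j, value); here only the 2 best
# matches per item i are kept instead of 100.
from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
sims = spark.createDataFrame(
    [(0, 1, 0.9), (0, 2, 0.4), (0, 3, 0.7), (1, 2, 0.8), (1, 3, 0.1)],
    ["i", "j", "value"])

window = Window.partitionBy("i").orderBy(F.col("value").desc())
top_k = (sims
         .select("*", F.row_number().over(window).alias("row_number"))
         .filter(F.col("row_number") <= 2))
top_k.show()  # keeps (0,1,0.9), (0,3,0.7), (1,2,0.8), (1,3,0.1)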
def MatrixTranspose(mat):
    # Known issues:
    # 1. Can fail on some data for unclear reasons; reducing the number of rows may help.
    # 2. The transpose sometimes returns a wrong result, which seems to be a partition
    #    issue -- repartition(1) sometimes fixes it. PySpark also changes the order of
    #    rows after a transposed coordinate matrix is converted to a row matrix; see
    #    https://stackoverflow.com/questions/34451253/converting-coordinatematrix-to-rowmatrix-doesnt-preserve-row-order
    #    Using an indexed matrix and reordering partially fixes this, but it is awkward.
    '''
    Transpose a row matrix -- to save space/memory, use sparse vectors when the input
    is sparse.
    :param mat: the input row matrix
    :return: a transposed row matrix
    ref: https://stackoverflow.com/questions/47102378/transpose-a-rowmatrix-in-pyspark
    '''
    if isinstance(mat, IndexedRowMatrix):
        mat = mat.toRowMatrix()
    # This flatMap turns everything into dense matrix entries; avoid this function
    # if efficiency matters.
    transposed_mat = CoordinateMatrix(mat.rows.zipWithIndex().flatMap(
        lambda x: [MatrixEntry(x[1], j, v) for j, v in enumerate(x[0])]))
    transposed_mat = transposed_mat.transpose().toIndexedRowMatrix().rows.toDF(
    ).orderBy("index")
    # Back to sparse first, then convert to an IndexedRowMatrix.
    transposed_mat = transposed_mat.rdd.map(lambda row: IndexedRow(
        row["index"],
        MLLibVectors.sparse(
            row["vector"].size,
            np.nonzero(row["vector"].values)[0],
            row["vector"].values[np.nonzero(row["vector"].values)])))
    return IndexedRowMatrix(transposed_mat)
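# Assumed usage sketch for MatrixTranspose above on a tiny 2 x 3
# IndexedRowMatrix; an active SparkContext `sc`, a SparkSession, and the
# imports the function relies on (np, MLLibVectors, CoordinateMatrix,
# MatrixEntry) are taken as given. Given the caveats listed in the comments,
# treat the output as something to verify, not to trust.
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix

rows = sc.parallelize([IndexedRow(0, [1.0, 0.0, 2.0]),
                       IndexedRow(1, [0.0, 3.0, 0.0])])
transposed = MatrixTranspose(IndexedRowMatrix(rows))
print(transposed.numRows(), transposed.numCols())  # expected: 3 2
for row in transposed.rows.collect():
    print(row)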
def test_naive_multiplication_coord_matrix(self):
    mat_a = CoordinateMatrix(self.rdd_X, *self.X_shape)
    mat_b = CoordinateMatrix(self.rdd_y, *self.y_shape)

    computed_result = lp_matrix_multiply.naive_multiplication_rdd(
        mat_a=mat_a, mat_b=mat_b, is_triangle=True).collect()
    actual_result = self.product

    for element in computed_result:
        self.assertEqual(actual_result[element.i, element.j], element.value,
                         msg='i {}, j {}, computed {} - actual_value: {}'.format(
                             element.i, element.j, element.value,
                             actual_result[element.i, element.j]))
def build_matrix(svo_path: str,
                 cat1_instances: set,
                 cat2_instances: set) -> CoordinateMatrix:
    raw_df = spark.read.csv(svo_path, sep='\t')
    pairs_df = (raw_df.filter(
        (f.col('_c0').isin(cat1_instances) & f.col('_c2').isin(cat2_instances))
        | (f.col('_c0').isin(cat2_instances) & f.col('_c2').isin(cat1_instances)))
        .rdd.map(lambda x: (tuple(sorted((x['_c0'], x['_c2']))), x['_c1'], int(x['_c3'])))
        .toDF(['pair', 'verb', 'n']))
    named_coords = (pairs_df.selectExpr('pair', 'verb as left_verb', 'n')
                    .join(pairs_df.selectExpr('pair', 'verb as right_verb'), 'pair')
                    .filter('left_verb < right_verb')
                    .groupby(['left_verb', 'right_verb'])
                    .count())
    verb_to_id = (pairs_df.select('verb').distinct().rdd.zipWithIndex()
                  .map(lambda r: [r[0].verb, r[1]])
                  .toDF(['verb', 'id']))
    coords = (named_coords
              .join(verb_to_id, named_coords.left_verb == verb_to_id.verb)
              .selectExpr('right_verb', 'id as left_verb_id', 'count')
              .join(verb_to_id, named_coords.right_verb == verb_to_id.verb)
              .selectExpr('left_verb_id', 'id as right_verb_id', 'count'))
    matrix = CoordinateMatrix(coords.rdd.map(lambda c: MatrixEntry(*c)))
    return matrix
def expand_mat(df, power, blockstyle=True):
    '''
    Calculate the nth power of a matrix A, i.e. A^n.
    df: DataFrame of the coordinate matrix A
    power: Integer n. Exponent to which the matrix should be raised
    blockstyle: Boolean. Calculate matrix multiplication block style or by simple rdd joins
    returns: DataFrame of the A^n matrix with source, destination, and weight columns
    '''
    # Convert into CoordinateMatrix
    cols = df.columns
    cdf = CoordinateMatrix(df.rdd.map(tuple))
    rdf = cdf

    # Calculate A^n block style or rdd-join style
    if blockstyle:
        for i in range(power - 1):
            rdf = matrix_multiply_mod(rdf, cdf)
    else:
        for i in range(power - 1):
            rdf = matrix_multiply(rdf, cdf)

    # Convert back to a dataframe and return
    rdf_rdd = rdf.entries.map(lambda x: (x.i, x.j, x.value))
    result_df = rdf_rdd.toDF()
    result_df = (result_df.withColumnRenamed('_1', cols[0])
                 .withColumnRenamed('_2', cols[1])
                 .withColumnRenamed('_3', cols[2]))
    return result_df
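# Assumed usage of expand_mat above: square a tiny weighted adjacency matrix.
# blockstyle=False is passed so that only the rdd-join multiplier
# (matrix_multiply, shown later in this collection) is needed; an active
# SparkSession and the CoordinateMatrix/MatrixEntry imports are taken as given.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
edges = spark.createDataFrame(
    [(0, 1, 1.0), (1, 2, 1.0), (0, 2, 0.5)],
    ["src", "dst", "weight"])

squared = expand_mat(edges, power=2, blockstyle=False)
squared.show()  # entries of A^2, here only the path 0 -> 1 -> 2 with weight 1.0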
def readRatings(spark, f_name, ratio=[0.8, 0.2], seed=0):
    """ Read the ratings of users for movies.
        Return the utility matrices. """
    df = spark.read.csv(f_name, header=True)
    #df = normalize(spark, df)
    rdd = df.rdd
    (training, test) = df.randomSplit(ratio, seed=seed)
    training_utility = CoordinateMatrix(
        training.rdd.map(lambda row: MatrixEntry(row['userId'], row['movieId'], row['rating'])),
        users_total, movies_total)
    test_utility = CoordinateMatrix(
        test.rdd.map(lambda row: MatrixEntry(row['userId'], row['movieId'], row['rating'])),
        users_total, movies_total)
    return (training_utility, test_utility)
def get_vectors_df(playcounts_df):
    """ Each row of playcounts_df has the following columns: recording_id, spark_user_id
        and a play count denoting how many times a user has played that recording.
        However, the correlation matrix requires a dataframe having a column of user
        vectors. Spark has various representations built-in for storing sparse matrices.
        Of these, two are Coordinate Matrix and Indexed Row Matrix. A coordinate matrix
        stores the matrix as tuples of (i, j, x) where matrix[i, j] = x. An Indexed Row
        Matrix stores it as tuples of row index and vectors.

        Our playcounts_df is similar in structure to a coordinate matrix. We begin with
        mapping each row of the playcounts_df to a MatrixEntry and then create a matrix
        of these entries. The recording_ids are rows, user_ids are columns and the
        playcounts are the values in the matrix. We convert the coordinate matrix to
        indexed row matrix form. Spark ML and MLlib have different representations of
        vectors, hence we need to manually convert between the two. Finally, we take
        the rows and create a dataframe from them.
    """
    tuple_mapped_rdd = playcounts_df.rdd.map(
        lambda x: MatrixEntry(x["recording_id"], x["spark_user_id"], x["count"]))
    coordinate_matrix = CoordinateMatrix(tuple_mapped_rdd)
    indexed_row_matrix = coordinate_matrix.toIndexedRowMatrix()
    vectors_mapped_rdd = indexed_row_matrix.rows.map(
        lambda r: (r.index, r.vector.asML()))
    return listenbrainz_spark.session.createDataFrame(vectors_mapped_rdd,
                                                      ['index', 'vector'])
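# A generic, hedged illustration of the same conversion path on made-up
# (recording_id, spark_user_id, count) triples, using a plain SparkSession
# instead of the listenbrainz_spark session assumed above.
from pyspark.sql import SparkSession
from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry

spark = SparkSession.builder.getOrCreate()
playcounts = spark.createDataFrame(
    [(0, 0, 5), (0, 1, 2), (2, 1, 7)],
    ["recording_id", "spark_user_id", "count"])

entries = playcounts.rdd.map(
    lambda x: MatrixEntry(x["recording_id"], x["spark_user_id"], x["count"]))
indexed = CoordinateMatrix(entries).toIndexedRowMatrix()

# Each row index is a recording_id; the vector holds per-user playcounts,
# converted from the MLlib vector type to the ML one before building a DataFrame.
vectors = indexed.rows.map(lambda r: (r.index, r.vector.asML()))
vectors_df = spark.createDataFrame(vectors, ["index", "vector"])
vectors_df.show(truncate=False)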
def unfolding(self, mode=None):
    def mapFuncI(entry):
        return MatrixEntry(entry.i, entry.k + self.numDimK * entry.j, entry.val)

    def mapFuncJ(entry):
        return MatrixEntry(entry.j, entry.i + self.numDimI * entry.k, entry.val)

    def mapFuncK(entry):
        return MatrixEntry(entry.k, entry.j + self.numDimJ * entry.i, entry.val)

    if mode == 1:
        matrix = CoordinateMatrix(self.entries.map(mapFuncI))
    elif mode == 2:
        matrix = CoordinateMatrix(self.entries.map(mapFuncJ))
    elif mode == 3:
        matrix = CoordinateMatrix(self.entries.map(mapFuncK))
    else:
        raise ValueError("The dimension index is out of the space!")
    return matrix
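# Worked check of the mode-1 mapping above on a single, made-up tensor entry:
# an entry at (i, j, k) of an I x J x K tensor lands at row i and column
# k + K * j of the mode-1 unfolding (the field names .i/.j/.k/.val mirror the
# entry objects used above; the numbers here are illustrative).
numDimK = 4                # assumed K dimension of the tensor
i, j, k, val = 2, 3, 1, 5.0

row = i
col = k + numDimK * j      # 1 + 4 * 3 = 13
print(row, col, val)       # the entry becomes (2, 13, 5.0) in the unfolded matrix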
def newW(R, W, H):
    # Multiplicative update: W = np.multiply((X.dot(H.T)) / (W.dot(H).dot(H.T)), W)
    a = R.multiply(H.transpose()).toCoordinateMatrix() \
        .entries.map(lambda entry: ((entry.i, entry.j), (0, entry.value)))
    b = W.multiply(H).multiply(H.transpose()).toCoordinateMatrix() \
        .entries.map(lambda entry: ((entry.i, entry.j), (1, entry.value)))
    # Identify the right order of dividing: the numerator entries carry tag 0.
    c = a.union(b).reduceByKey(
        lambda x, y: (2, x[1] / y[1]) if x[0] == 0 else (2, y[1] / x[1]))
    c = c.map(lambda x: (x[0], x[1][1]))
    # Multiply the ratio elementwise by the current W.
    d = c.join(W.toCoordinateMatrix()
                .entries.map(lambda entry: ((entry.i, entry.j), entry.value))) \
        .mapValues(lambda v: v[0] * v[1])
    return CoordinateMatrix(
        d.map(lambda x: MatrixEntry(x[0][0], x[0][1], x[1]))).toBlockMatrix()
def matrix_multiply(A, B):
    '''
    This function returns the matrix product of two matrices represented in
    coordinate matrix format. It is implemented with simple joins, following the
    Scala implementation described in
    https://medium.com/balabit-unsupervised/scalable-sparse-matrix-multiplication-in-apache-spark-c79e9ffc0703
    A: CoordinateMatrix
    B: CoordinateMatrix
    returns: CoordinateMatrix of the product of A and B
    '''
    # Convert to an rdd of (column, (row, value))
    A_rdd = A.entries.map(lambda x: (x.j, (x.i, x.value)))
    # Convert to an rdd of (row, (column, value))
    B_rdd = B.entries.map(lambda x: (x.i, (x.j, x.value)))
    # Join the two rdds and convert to ((row, column), value)
    interm_rdd = A_rdd.join(B_rdd).map(
        lambda x: ((x[1][0][0], x[1][1][0]), x[1][0][1] * x[1][1][1]))
    # Sum the products for each (row, column) pair and convert each record into a
    # MatrixEntry of (row, column, value)
    C_rdd = interm_rdd.reduceByKey(add).map(
        lambda x: MatrixEntry(x[0][0], x[0][1], x[1]))
    return CoordinateMatrix(C_rdd)
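# Hedged sanity check for matrix_multiply above: multiply two small coordinate
# matrices and compare against NumPy. An active SparkContext `sc` is assumed,
# and matrix_multiply (with its operator.add import) is taken from the
# function above.
import numpy as np
from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry

A = CoordinateMatrix(sc.parallelize(
    [MatrixEntry(0, 0, 1.0), MatrixEntry(0, 1, 2.0), MatrixEntry(1, 1, 3.0)]))
B = CoordinateMatrix(sc.parallelize(
    [MatrixEntry(0, 0, 4.0), MatrixEntry(1, 0, 5.0), MatrixEntry(1, 1, 6.0)]))

C = matrix_multiply(A, B)
dense = np.zeros((2, 2))
for e in C.entries.collect():
    dense[int(e.i), int(e.j)] = e.value

expected = np.array([[1.0, 2.0], [0.0, 3.0]]) @ np.array([[4.0, 0.0], [5.0, 6.0]])
assert np.allclose(dense, expected)  # [[14., 12.], [15., 18.]]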
def aggregate_and_pivot_into_vector(ddf, id_column, pivot_column, aggs,
                                    vector_column_name='features',
                                    return_indexer=False):
    """
    1. apply aggs to DataFrame (group by [id_column, pivot_column]),
    2. pivot (one-hot encode) by pivot_column (values are indexed by StringIndexer)
    3. save results into vector_column_name as Vector (if multiple aggregations are
       provided, assemble the result into one vector using pyspark.ml.feature.VectorAssembler)

    Example:
        aggs = get_ddf_aggs(grpby_columns=['customer_id', 'category'],
                            agg_columns=['productsize', 'purchasequantity'],
                            agg_funcs={'total': F.sum}, prefix='agg_', cast_to='double')
        print aggs
        #[Column<cast((sum(productsize),mode=Complete,isDistinct=false) as double) AS agg_customer_id_category_productsize_total#127>,
        # Column<cast((sum(purchasequantity),mode=Complete,isDistinct=false) as double) AS agg_customer_id_category_purchasequantity_total#128>]
        ddf_trans_pivot = aggregate_and_pivot_into_vector(ddf_trans, id_column='customer_id',
                                                          pivot_column='category', aggs=aggs)
        ddf_trans_pivot.first()
        #Row(customer_id=98468631, features=SparseVector(1666, {0: 1.0, 1: 1.0, 5: 1.0, 8: 2.0, 13: 1.0, }))

    :param ddf: DataFrame
    :param id_column: row id column
    :param pivot_column: column to one-hot encode
    :param aggs:
    :param vector_column_name:
    :param return_indexer: add indexer object (StringIndexer) to result. Indexer holds
        the encoder which was used to encode pivot_column values
    :return:
    """
    from pyspark.mllib.linalg.distributed import CoordinateMatrix, IndexedRowMatrix
    from pyspark.ml.feature import VectorAssembler

    index_col_suffix = '_idx'
    grpby_columns = [id_column, pivot_column]
    aggregated = ddf.groupBy(grpby_columns).agg(*aggs)
    pivot_indexed_column = pivot_column + index_col_suffix
    agg_column_names = list(set(aggregated.columns)
                            - set([id_column, pivot_column, pivot_indexed_column]))
    indexed, indexers = index_columns(ddf=aggregated, index_columns=[pivot_column],
                                      index_col_suffix=index_col_suffix,
                                      return_indexers=True)
    res = None
    agg_columns_vectors = [c + '_vector' for c in agg_column_names]
    for agg_column, agg_column_vector in zip(agg_column_names, agg_columns_vectors):
        # bind agg_column per iteration to avoid late binding in the lambda
        cm = CoordinateMatrix(
            indexed.rdd.map(lambda r, c=agg_column: (int(r[id_column]),
                                                     int(r[pivot_indexed_column]),
                                                     r[c])))
        irm = cm.toIndexedRowMatrix()
        ddf_irm = irm.rows.toDF()
        ddf_irm = (ddf_irm.withColumnRenamed('index', id_column)
                   .withColumnRenamed('vector', agg_column_vector))
        if res:
            res = res.join(ddf_irm, on=id_column, how='inner')
        else:
            res = ddf_irm

    if len(agg_columns_vectors) > 1:
        assembler = VectorAssembler(inputCols=agg_columns_vectors,
                                    outputCol=vector_column_name)
        res = assembler.transform(res)
    else:
        res = res.withColumnRenamed(agg_columns_vectors[0], vector_column_name)
    res = drop_columns(res, columns=agg_columns_vectors)

    if return_indexer and len(indexers) > 0:
        return res, indexers.pop()
    else:
        return res
# create the context
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()

small_data = sc.textFile('graph-small.txt')
full_data = sc.textFile('graph-full.txt')

LAMBDA = 1
NU = 1

source_dest_pair = full_data.map(
    lambda x: (int(x.split('\t')[0]) - 1, int(x.split('\t')[1]) - 1)).distinct()
edges = source_dest_pair.map(lambda x: (x[0], x[1], 1))
edges_transpose = source_dest_pair.map(lambda x: (x[1], x[0], 1))

L = CoordinateMatrix(edges).toBlockMatrix()
L_transpose = CoordinateMatrix(edges_transpose).toBlockMatrix()

h_init = []
for i in range(1000):
    h_init.append((i, 0, 1))
h = CoordinateMatrix(sc.parallelize(h_init)).toBlockMatrix()

a = None
for i in range(40):
    a_new = L_transpose.multiply(h)
    a_new_max = np.max(np.array(a_new.toLocalMatrix().toArray()))
def process(sparkContext, sqlContext):
    print("Building Graph...")
    G_apa = buildGraphAPA()
    print("Meta Path...")
    paths = metaPathAPA(G_apa)
    print("Training...")

    authorIndex = []
    authorDegree = []
    authors = paths[0]
    pathNumber = paths[2]
    pathNumberAarry = []
    for pn in pathNumber.keys():
        pathNumberAarry.append(str(pn) + ":" + str(pathNumber.get(pn)))
    index = 0
    for author in authors:
        authorDegree.append(str(author) + ":" + str(len(G_apa[author])))
        authorIndex.append(str(author) + ":" + str(index))
        index = index + 1.0

    # unique_authors = authors
    authorsRDD = sparkContext.parallelize(authors)
    authorIndex = sparkContext.parallelize(authorIndex)
    pathNumber = sparkContext.parallelize(pathNumberAarry)
    authorDegree = sparkContext.parallelize(authorDegree)

    authors = authorsRDD.collect()

    ai = authorIndex.collect()
    authorIndex = dict()
    for a in ai:
        p = a.split(":")
        authorIndex[p[0]] = p[1]

    ad = authorDegree.collect()
    authorDegree = dict()
    for a in ad:
        p = a.split(":")
        authorDegree[p[0]] = p[1]

    pn = pathNumber.collect()
    pathNumber = dict()
    for a in pn:
        p = a.split(":")
        pathNumber[p[0]] = p[1]

    def matEntry(author, authors):
        row = []
        for a in authors:
            if author == a:
                row.append((int(float(authorIndex[author])), int(float(authorIndex[a])), 1.0))
            else:
                key = str(author) + str(a)
                if key in pathNumber:
                    row.append((int(float(authorIndex[author])), int(float(authorIndex[a])),
                                2.0 * float(pathNumber.get(key))
                                / (float(authorDegree[author]) + float(authorDegree[a]))))
                else:
                    row.append((int(float(authorIndex[author])), int(float(authorIndex[a])), 0.0))
        return row

    def matEntryNoArgs():
        row = []
        for author in authors:
            for a in authors:
                if author == a:
                    row.append((int(float(authorIndex[author])), int(float(authorIndex[a])), 1.0))
                else:
                    key = str(author) + str(a)
                    if key in pathNumber:
                        row.append((int(float(authorIndex[author])), int(float(authorIndex[a])),
                                    2.0 * float(pathNumber.get(key))
                                    / (float(authorDegree[author]) + float(authorDegree[a]))))
                    else:
                        row.append((int(float(authorIndex[author])), int(float(authorIndex[a])), 0.0))
        return row

    # Build the similarity entries for every author pair in parallel.
    me = authorsRDD.flatMap(lambda author: matEntry(author, authors)).collect()
    # me = matEntry()
    # me = matEntryNoArgs()
    entries = sparkContext.parallelize(me)

    # Create a CoordinateMatrix from an RDD of matrix entries.
    mat = CoordinateMatrix(entries)
    # mat.saveAsTextFile("/home/xuepeng/uts/metapath.txt")

    # Get its size.
    print(mat.numRows())
    print(mat.numCols())
import time

import numpy as np
from pyspark import SparkConf, SparkContext, sql
from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry

if __name__ == "__main__":
    # set up spark context and configuration
    conf = SparkConf().setAppName("PythonPCAOnRowMatrixExample")
    sc = SparkContext(conf=conf)
    print(sc.getConf().getAll())
    sqlContext = sql.SQLContext(sc)

    # load data
    data = sc.textFile("gs://dataproc-ae279739-4c78-478e-9024-8b7ea842f82e-us/heart1.txt")
    entries = data.map(lambda l: l.split(' ')).map(
        lambda l: MatrixEntry(int(l[0]), int(l[1]), float(l[2])))

    # create the distributed matrix
    premat = CoordinateMatrix(entries)
    mat = premat.toIndexedRowMatrix()
    print(mat.numCols())
    print(mat.numRows())

    # gramian
    start_time = time.time()
    decomp = mat.computeGramianMatrix()
    elapsedtime = time.time() - start_time
    print(elapsedtime)

    # svd
    start_time = time.time()
    decomp = mat.computeSVD(1000)
    elapsedtime = time.time() - start_time
def sparse_dot_product_cross_join(
    spark: SQLContext,
    output_col: str,
    primary_row_number_col: str,
    primary_vector_col: str,
    primary_df: DataFrame,
    secondary_row_number_col: str,
    secondary_vector_col: str,
    secondary_df: DataFrame,
):
    """Calculate the dot product for every pair of items between a column of
    SparseVectors in the primary dataframe and a column of SparseVectors in the
    secondary dataframe.

    The input dataframes must have a row number attached. This will correspond to the
    row number in the resulting row matrix. It does not matter if the row numbers are
    sequential as long as they are unique within their dataframes respectively.

    NOTE: if you are using this function in order to generate cosine similarity scores
    then remember to normalize your input vectors first. This way the resulting
    coordinate matrix will represent the similarity scores."""

    def primary_row_to_coords(row):
        """Convert a sparse vector to a list of coords in the format of
        (row_num, col_num, value)"""
        row_num = row.__getitem__(primary_row_number_col)
        vec = row.__getitem__(primary_vector_col)
        return [(row_num, i, j) for i, j in zip(vec.indices, vec.values)]

    primary_rdd = primary_df.select(
        F.col(primary_row_number_col), F.col(primary_vector_col)
    ).rdd.flatMap(lambda row: primary_row_to_coords(row))

    if primary_rdd.isEmpty():
        raise ValueError(
            "Primary RDD is empty. Cannot perform matrix multiplication")

    primary_rdd.persist(StorageLevel.MEMORY_AND_DISK_SER)

    def secondary_row_to_coords(row):
        """Convert a sparse vector to a list of coords in the format of
        (row_num, col_num, value)"""
        row_num = row.__getitem__(secondary_row_number_col)
        vec = row.__getitem__(secondary_vector_col)
        # IMPORTANT - note that we are actually creating
        # the transpose of the secondary matrix hence
        # why the coordinates are back to front
        return [(i, row_num, j) for i, j in zip(vec.indices, vec.values)]

    secondary_rdd = secondary_df.select(
        F.col(secondary_row_number_col), F.col(secondary_vector_col)
    ).rdd.flatMap(lambda row: secondary_row_to_coords(row))

    secondary_rdd.persist(StorageLevel.MEMORY_AND_DISK_SER)

    if secondary_rdd.isEmpty():
        raise ValueError(
            "Secondary RDD is empty. Cannot perform matrix multiplication")

    # create the primary coordinate matrix from the coords
    primary_matrix = CoordinateMatrix(primary_rdd)
    log.info(
        "finished creating primary coordinate matrix",
        rows=primary_matrix.numRows(),
        cols=primary_matrix.numCols(),
    )

    # create the secondary coordinate matrix from the coords
    secondary_matrix = CoordinateMatrix(secondary_rdd)
    log.info(
        "finished creating secondary coordinate matrix transpose",
        rows=secondary_matrix.numRows(),
        cols=secondary_matrix.numCols(),
    )

    coords_matrix = multiply_coordinate_matrices(primary_matrix, secondary_matrix)

    res = coord_matrix_to_dataframe(
        spark,
        primary_row_number_col,
        secondary_row_number_col,
        output_col,
        coords_matrix,
    )

    primary_rdd.unpersist()
    secondary_rdd.unpersist()

    return res
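# Hedged sketch of the normalization note above: L2-normalize the vector
# column with pyspark.ml.feature.Normalizer before calling
# sparse_dot_product_cross_join, so the resulting dot products are cosine
# similarities. The column names and data here are illustrative only.
from pyspark.ml.feature import Normalizer
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(0, Vectors.sparse(3, {0: 3.0, 2: 4.0})),
     (1, Vectors.sparse(3, {1: 2.0}))],
    ["row_number", "features"])

normalizer = Normalizer(inputCol="features", outputCol="norm_features", p=2.0)
normalized_df = normalizer.transform(df)
normalized_df.show(truncate=False)  # the first vector becomes (0.6, 0.0, 0.8)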
# create the session
conf = SparkConf().set("spark.ui.port", "4050")

# create the context
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()

small_data = sc.textFile('graph-small.txt')
full_data = sc.textFile('graph-full.txt')

BETA = 0.8

source_dest_pair = full_data.map(
    lambda x: (int(x.split('\t')[0]) - 1, int(x.split('\t')[1]) - 1)).distinct()
edges = source_dest_pair.map(lambda x: (x[1], x[0], 1))
degrees = (source_dest_pair.map(lambda x: (x[0], 1))
           .reduceByKey(lambda x, y: x + y)
           .map(lambda x: (x[0], x[0], 1 / x[1])))

edge_matrix = CoordinateMatrix(edges).toBlockMatrix()
degree_inverse_matrix = CoordinateMatrix(degrees).toBlockMatrix()
M = edge_matrix.multiply(degree_inverse_matrix)

r_init = []
beta_init = []
teleport_init = []
for i in range(1000):
    r_init.append((i, 0, 1 / 1000))
    beta_init.append((i, i, BETA))
    teleport_init.append((i, 0, (1 - BETA) / 1000))

r = CoordinateMatrix(sc.parallelize(r_init)).toBlockMatrix()
beta = CoordinateMatrix(sc.parallelize(beta_init)).toBlockMatrix()
teleport = CoordinateMatrix(sc.parallelize(teleport_init)).toBlockMatrix()
# Diagonalize RDD: entries are (row, col, value) tuples
diag_entries_1 = (coo_matrix_input_all
                  .filter(lambda e: e[1] % 3 == 0)
                  .map(lambda e: (e[0], e[2]))
                  .reduceByKey(lambda x, y: x + y)
                  .map(lambda rv: (rv[0], 3 * (rv[0] // 3), -rv[1])))
diag_entries_1.cache()
diag_entries_2 = (coo_matrix_input_all
                  .filter(lambda e: e[1] % 3 == 1)
                  .map(lambda e: (e[0], e[2]))
                  .reduceByKey(lambda x, y: x + y)
                  .map(lambda rv: (rv[0], 3 * (rv[0] // 3) + 1, -rv[1])))
diag_entries_2.cache()
diag_entries_3 = (coo_matrix_input_all
                  .filter(lambda e: e[1] % 3 == 2)
                  .map(lambda e: (e[0], e[2]))
                  .reduceByKey(lambda x, y: x + y)
                  .map(lambda rv: (rv[0], 3 * (rv[0] // 3) + 2, -rv[1])))
diag_entries_3.cache()

diag_entries = diag_entries_1.union(diag_entries_2).union(diag_entries_3)
coo_matrix_input_all = coo_matrix_input_all.union(diag_entries)
coo_matrix_entries = coo_matrix_input_all.map(lambda e: MatrixEntry(e[0], e[1], e[2]))
coo_matrix = CoordinateMatrix(coo_matrix_entries)

# SAVE TO A FILE
coo_matrix_input_all.repartition(1).saveAsTextFile("./Laplacian_4v7o_4cores_1")

t2 = timeit.default_timer()
print("Elapsed time for construction: {:} s".format(t2 - t0))

# Singular value decomposition
dataRows = coo_matrix.toRowMatrix().rows
k = int(args.k)  # N_singvalues
svd = RowMatrix(dataRows.persist()).computeSVD(k, computeU=True)
U = svd.U  # The U factor is a RowMatrix.
def multiply_coordinate_matrices(left: CoordinateMatrix, right: CoordinateMatrix):
    """Multiply 2 spark Coordinate Matrices without converting either of them into a
    DenseMatrix.

    NOTE: spark does not provide distributed matrix multiplication of sparse matrices,
    for this reason a custom approach has to be used which is discussed here
    https://medium.com/balabit-unsupervised/scalable-sparse-matrix-multiplication-in-apache-spark-c79e9ffc0703
    """

    def key_by_col(x):
        """Take a MatrixEntry of (row, col, val) and return a 2-tuple of
        (col, (row, val))"""
        return (x.j, (x.i, x.value))

    def key_by_row(x):
        """Take a MatrixEntry of (row, col, val) and return a 2-tuple of
        (row, (col, val))"""
        return (x.i, (x.j, x.value))

    left_by_col = left.entries.map(lambda x: key_by_col(x))
    right_by_row = right.entries.map(lambda x: key_by_row(x))

    # Next we perform a row by col matrix multiplication
    # where a shared "key" is used to group entries of the left matrix
    # with COLUMN j and entries of the right matrix with ROW j.
    # Note that entries with the same j will stick together.
    # This should be obvious if you recall that matrix multiplication
    # matches the index of the left column with the index of the right row.
    col_by_row = left_by_col.join(right_by_row)

    def row_by_col_multiplication(x):
        """The input is a key-pair tuple in the following format:
        (key, ((left_row, left_val), (right_col, right_val)))

        the output is a pair of tuples in the following format:
        ((left_row, right_col), (left_val * right_val))

        Note that having finished the grouping we no longer need the shared key
        anymore (i.e. we no longer need the original indices of the left_col or
        right_row). This is because summed values will go into the output matrix at
        the location (left_row, right_col), so we can regroup by these indices and sum.
        """
        return ((x[1][0][0], x[1][1][0]), (x[1][0][1] * x[1][1][1]))

    # multiply elements by the left matrix column and the right matrix row
    products = col_by_row.map(lambda x: row_by_col_multiplication(x))

    # Sum up all the products for a given left_row and right_col
    summed = products.reduceByKey(lambda accum, n: accum + n)

    # unnest the keys so we can convert back to a coordinate matrix
    flattened = summed.map(lambda x: (x[0][0], x[0][1], x[1]))

    res = CoordinateMatrix(flattened)
    log.info(
        "finished creating coord matrix from dot product",
        rows=res.numRows(),
        cols=res.numCols(),
    )
    return res
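# Minimal assumed usage of multiply_coordinate_matrices above, squaring a tiny
# permutation-like matrix; `sc`, the CoordinateMatrix/MatrixEntry imports and
# the module-level `log` used inside the function are all taken as given.
M = CoordinateMatrix(sc.parallelize(
    [MatrixEntry(0, 1, 1.0), MatrixEntry(1, 0, 2.0)]))

M_squared = multiply_coordinate_matrices(M, M)
print(sorted((e.i, e.j, e.value) for e in M_squared.entries.collect()))
# [(0, 0, 2.0), (1, 1, 2.0)]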
#conf = SparkConf().setAppName('linalgtest')
#sc = SparkContext(conf=conf).getOrCreate()

# use local spark on computer
# findspark.init()
#from pyspark.sql import SparkSession

local_file_location = 'file:///wasp/pdb1HYS.mtx.mtx'
rdd = spark.sparkContext.textFile(local_file_location)
rdd = rdd.map(lambda line: line.split(" "))
rdd = rdd.map(
    lambda line: MatrixEntry(int(line[0]), int(line[1]), float(line[2])))

mat = CoordinateMatrix(rdd)
M = mat.toRowMatrix()
A = mat.toBlockMatrix()
At = mat.transpose().toBlockMatrix()

print("SVD")
print(M.numRows(), M.numCols())

start_svd = time.time()
NUM_TIMES = 10  # do it 10 times to get the mean
for i in range(NUM_TIMES):
    svd = M.computeSVD(5, computeU=True)
end_svd = time.time()
print("Time elapsed: ", (end_svd - start_svd) / NUM_TIMES)
import pyspark
from pyspark import SparkContext
from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry
from operator import add
from pyspark.sql import SparkSession

sc = SparkContext()
r = sc.textFile("part-00000")
m = (r.flatMap(lambda x: x.split('\n'))
     .filter(lambda x: "A" in x)
     .map(lambda x: (x.strip("A, ")).split(' '))
     .map(lambda x: tuple(list(map(int, x)))))
#n = m.map(lambda x: MatrixEntry(tuple(x)))

spark = SparkSession(sc)
#m.toDF().show()
print(hasattr(m, "toDF"))

cmat = CoordinateMatrix(m)
#mat = CoordinateMatrix(n)
#o = mat.take(5)
print(cmat.numRows())
print(cmat.numCols())

rowmat = cmat.toRowMatrix()
print(rowmat.numRows())
print(rowmat.numCols())
    end = time.time()
    elapsed_seconds = float("%.4f" % (end - start))
    logging.info('%s: elapsed seconds: %s', name, elapsed_seconds)


logging.getLogger().setLevel(logging.INFO)


def to_matrix_entry(x):
    i, j, v = x.split(',')
    return MatrixEntry(i, j, v)


sc = pyspark.SparkContext(appName="Matrix Multiplication")

for i in range(1, 10):
    with time_usage("temps matrix multiplication"):
        matrix_a_raw = sc.textFile(sys.argv[1])
        matrix_b_raw = sc.textFile(sys.argv[2])

        spark = SparkSession(sc)

        entries_a = matrix_a_raw.map(to_matrix_entry)
        entries_b = matrix_b_raw.map(to_matrix_entry)

        mat_a = CoordinateMatrix(entries_a).toBlockMatrix()
        mat_b = CoordinateMatrix(entries_b).toBlockMatrix()

        product = mat_a.multiply(mat_b)
        product.toLocalMatrix()

        #for t in result:
        #    print('%s, %s, %s' % (t[0], t[1], t[2]))
debug = Debugger()
debug.TIMESTAMP(1)


def to_matrix_entry(s):
    ss = s.split()
    entry = MatrixEntry(float(ss[0]), float(ss[1]), float(ss[2]))
    return entry


data = sc.textFile('hdfs://node1:9000/input/sqr.txt')
mat = data.map(to_matrix_entry)
rdd = sc.parallelize(mat.collect())

coord_mat = CoordinateMatrix(rdd)
coord_mat = coord_mat.transpose()

row_mat = coord_mat.toRowMatrix()
sim = row_mat.columnSimilarities()
print(sim.entries.take(10))

debug.TIMESTAMP(2)

'''
data = data.map(lambda _: np.array(_.strip().split()).astype(float))
unitMatrix = data.map(lambda _: _ / np.linalg.norm(_))
#unitMatrix = sc.parallelize(np.array([[1,2,3,5,7], [6,2,1,-1,3], [7,0,1,2,-4]]).T)
mat = RowMatrix(unitMatrix)
S = mat.columnSimilarities()
sims = S.entries.collect()
spark = SparkSession.builder.appName(
    'imbalanced_binary_classification').getOrCreate()

#new_df = spark.read.option("delimiter", " ").csv('data/1138_bus/1138_bus_no_head.mtx', header=False, inferSchema=True)
#new_df.printSchema()

rdd = sc.textFile('data/1138_bus/1138_bus_no_head.mtx')
rdd = rdd.map(lambda line: line.split(" "))
rdd = rdd.map(lambda line: [float(x) for x in line])
print(rdd.take(2))

#ncol = len(rdd.map(lambda r: r.image).first())
nrows = rdd.count()
ncols = 3
#matrix = Matrices.dense(nrows, ncols, rdd)
print("ncol: %d, nrow %d" % (ncols, nrows))

coord_mat = CoordinateMatrix(rdd.map(tuple))
print("num rows in matrix %d" % coord_mat.numRows())
print("finished using pyspark")

#________________________________________________-
print("now use SparkSession")
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df_2 = spark.read.option("delimiter", " ").csv('./data/lpi_ceria3d_b.mtx',
                                               header=False, inferSchema=True)
df_2.printSchema()
train_lines = train_lines.filter(lambda line: line != header)

# Format Train Data (ItemID, UserID, Rating)
global train_rdd
train_rdd = train_lines.map(lambda line: line.split(',')).map(
    lambda tokens: (int(tokens[0]), int(tokens[1]), float(tokens[2])))

# Build Train Data Dict. with format [(user, item)] = rating, for checking later whether the similar movie is rated
global train_dict
train_dict = {}
for x, y, z in train_rdd.collect():
    train_dict[(x, y)] = z

# ----------------------------------------- Build simPdsDF -----------------------------------------------
# Form utilityMatrix to get simMat later
sqlCon = SQLContext(sc)
utilityMatrix = CoordinateMatrix(train_rdd)

# Similarity Btw. Items
simMat = utilityMatrix.toRowMatrix().columnSimilarities()

# Convert simMat to Pandas format
global simPdsDF
sparkDF = simMat.entries.map(
    lambda x: str(x.i) + "," + str(x.j) + "," + str(x.value)).map(
    lambda w: w.split(',')).toDF()
simPdsDF = sparkDF.toPandas()

# edit columns' names
simPdsDF.columns = ['ItemID_1', 'ItemID_2', 'Similarity']

# change data types
simPdsDF['ItemID_1'] = simPdsDF['ItemID_1'].astype(int)
simPdsDF['ItemID_2'] = simPdsDF['ItemID_2'].astype(int)
simPdsDF['Similarity'] = simPdsDF['Similarity'].astype(float)

# --------------------------------------- Used for RDD to calculate bias ---------------------------------------------
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix

indexed = sc.parallelize([
    IndexedRow(0, [1, 2, 3]),
    IndexedRow(1, [4, 5, 6]),
    IndexedRow(2, [7, 8, 9]),
    IndexedRow(3, [10, 11, 12])
])
mat = IndexedRowMatrix(indexed)
print(mat)

# convert to row matrix
rowMat = mat.toRowMatrix()
print(rowMat)

# A CoordinateMatrix is distributed and stored in an object called a coordinate list.
from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry

entries = sc.parallelize(
    [MatrixEntry(0, 0, 1.2), MatrixEntry(1, 0, 2.1), MatrixEntry(6, 1, 3.7)])
mat = CoordinateMatrix(entries)
m = mat.numRows()
n = mat.numCols()
print(m)
print(n)

# convert to indexed row matrix
rowMat = mat.toIndexedRowMatrix()
print(rowMat)
def transpose(rm):
    cm = CoordinateMatrix(rm.rows.zipWithIndex().flatMap(
        lambda x: [MatrixEntry(x[1], j, v) for j, v in enumerate(x[0])]))
    return cm.transpose().toRowMatrix()
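# Quick hedged check of the transpose helper above on a small dense RowMatrix;
# an active SparkContext `sc` is assumed. As noted elsewhere in this
# collection, the row order of the result is not guaranteed to be preserved.
from pyspark.mllib.linalg.distributed import RowMatrix

rm = RowMatrix(sc.parallelize([[1.0, 2.0, 3.0],
                               [4.0, 5.0, 6.0]]))
tm = transpose(rm)
print(tm.numRows(), tm.numCols())  # expected: 3 2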
txt = txt.sample(False, 0.001, 1)  # XXX: random sample for local testing
txt = txt.zipWithIndex().filter(lambda x: int(x[1]) >= 4).map(
    lambda x: x[0].split('\t'))

## Get graph Laplacian
N = txt.flatMap(lambda x: [int(xx) for xx in x]).max()

upper_entries = txt.map(
    lambda x: MatrixEntry(int(x[0]) - 1, int(x[1]) - 1, 1.0))
lower_entries = txt.map(
    lambda x: MatrixEntry(int(x[1]) - 1, int(x[0]) - 1, 1.0))
degrees = upper_entries.map(lambda entry: (entry.i, entry.value)).reduceByKey(
    lambda a, b: a + b)

W = CoordinateMatrix(upper_entries.union(lower_entries), numCols=N, numRows=N)

# XXX:
laplacian = sys.argv[1]
if laplacian == 'unnormalized':
    entries = degrees.map(lambda x: MatrixEntry(x[0], x[0], x[1]))
    D = CoordinateMatrix(entries, numCols=N, numRows=N)
    L = D.toBlockMatrix().subtract(W.toBlockMatrix()).toCoordinateMatrix()
elif laplacian == 'normalized':
    entries = degrees.map(lambda x: MatrixEntry(x[0], x[0], 1 / x[1]))
    D_inv = CoordinateMatrix(entries, numCols=N, numRows=N).toBlockMatrix()
    I = CoordinateMatrix(sc.range(N).map(lambda i: MatrixEntry(i, i, 1.0)),
                         numCols=N, numRows=N).toBlockMatrix()
    L = I.subtract(D_inv.multiply(W.toBlockMatrix())).toCoordinateMatrix()