def calculate_jaccard():
    """Score every unordered pair of movies by Jaccard similarity of their raters.

    Reads the ratings CSV, builds a binary movie-by-user sparse matrix
    (movieId = row, userId = column, 1.0 = "user rated this movie"), self-joins
    the row vectors and computes Jaccard similarity over the non-zero indices.

    :return: DataFrame with columns (left.id, right.id, jaccard).
    """
    def jaccard_similarity(v1, v2):
        # Sparse vectors: the sets of non-zero indices are the raters.
        indices1 = set(v1.indices)
        indices2 = set(v2.indices)
        union = indices1 | indices2
        if not union:
            # Both vectors empty: define similarity as 0.0 instead of
            # raising ZeroDivisionError (bug in the original).
            return 0.0
        return float(len(indices1 & indices2)) / float(len(union))

    ratings = spark.read.csv("/home/your_path/ratings.csv", header=True, inferSchema=True)
    # Binary ratings: only whether a user rated the movie matters for Jaccard.
    df_3 = ratings.select(["movieId", "userId"]).withColumn("rating", lit(1.0))
    cmat_3 = CoordinateMatrix(df_3.rdd.map(tuple))
    i_3 = cmat_3.toIndexedRowMatrix()
    i_df_3 = i_3.rows.toDF(["id", "features"])

    def transform(self, f):
        return f(self)

    # NOTE(review): monkey-patches DataFrame.transform; Spark >= 3.0 ships a
    # built-in DataFrame.transform with this exact signature — confirm version.
    DataFrame.transform = transform

    jaccard_udf = udf(jaccard_similarity, FloatType())
    possibleMatches = i_df_3.transform(
        lambda df: df.alias("left").join(df.alias("right")))
    # "left.id < right.id" already implies "left.id != right.id", so the
    # original's redundant inequality predicate is dropped.
    result = possibleMatches.filter(col("left.id") < col("right.id")) \
        .withColumn("jaccard", jaccard_udf("left.features", "right.features")) \
        .select("left.id", "right.id", "jaccard")
    # Bug fix: the original computed `result` but never returned it,
    # making the whole function a no-op for callers.
    return result
def calculate_cosine():
    """Score every unordered pair of movies by cosine similarity of rating vectors.

    Reads the ratings CSV, builds a movie-by-user sparse matrix
    (movieId = row, userId = column, value = rating), self-joins the row
    vectors and computes cosine similarity between each distinct pair.

    :return: DataFrame with columns (left.id, right.id, cosine).
    """
    def transform(self, f):
        return f(self)

    def cosine_similarity(v1, v2):
        # Dense dot product; norms from the sparse non-zero values
        # (equivalent, since zeros contribute nothing to the norm).
        dot_product = v1.toArray().dot(v2.toArray())
        norm1 = math.sqrt(sum(v1.values ** 2))
        norm2 = math.sqrt(sum(v2.values ** 2))
        denom = norm1 * norm2
        if denom == 0.0:
            # All-zero vector: define similarity as 0.0 instead of
            # raising ZeroDivisionError (bug in the original).
            return 0.0
        return float(dot_product / denom)

    ratings = spark.read.csv("/home/your_path/ratings.csv", header=True, inferSchema=True)
    df_3 = ratings.select(["movieId", "userId", "rating"])
    cmat_3 = CoordinateMatrix(df_3.rdd.map(tuple))
    i_3 = cmat_3.toIndexedRowMatrix()
    i_df_3 = i_3.rows.toDF(["id", "features"])

    # NOTE(review): monkey-patches DataFrame.transform; Spark >= 3.0 ships a
    # built-in DataFrame.transform with this exact signature — confirm version.
    DataFrame.transform = transform

    distance_cosine = udf(cosine_similarity, FloatType())
    possibleMatches = i_df_3.transform(
        lambda df: df.alias("left").join(df.alias("right")))
    # "left.id < right.id" already implies "left.id != right.id", so the
    # original's redundant inequality predicate is dropped.
    result = possibleMatches.filter(col("left.id") < col("right.id")) \
        .withColumn("cosine", distance_cosine("left.features", "right.features")) \
        .select("left.id", "right.id", "cosine")
    # Bug fix: the original computed `result` but never returned it,
    # making the whole function a no-op for callers.
    return result
def get_vectors_df(playcounts_df):
    """Turn per-(recording, user) play counts into per-recording user vectors.

    `playcounts_df` rows carry recording_id, spark_user_id and count (how many
    times that user played that recording). The correlation step downstream
    needs one vector per recording, so the triplets are loaded into a Spark
    CoordinateMatrix — a natural fit, since each row is already an
    (i, j, value) entry with recording_ids as rows, user_ids as columns and
    play counts as values — and then converted to an IndexedRowMatrix, whose
    rows are (row index, vector) pairs. Spark ML and MLlib use different
    vector classes, so each MLlib vector is converted with .asML() before the
    rows are collected into a dataframe of ['index', 'vector'].
    """
    entries = playcounts_df.rdd.map(
        lambda row: MatrixEntry(row["recording_id"], row["spark_user_id"], row["count"])
    )
    row_matrix = CoordinateMatrix(entries).toIndexedRowMatrix()
    index_vector_pairs = row_matrix.rows.map(lambda row: (row.index, row.vector.asML()))
    return listenbrainz_spark.session.createDataFrame(index_vector_pairs, ['index', 'vector'])
def aggregate_and_pivot_into_vector(ddf, id_column, pivot_column, aggs,
                                    vector_column_name='features', return_indexer=False):
    """
    1. apply aggs to DataFrame (group by [id_column, pivot_column]),
    2. pivot (one-hot encode) by pivot_column (values are indexed by StringIndexer)
    3. save results into vector_column_name as Vector (if multiple aggregations
       provided, assemble result into one vector using
       pyspark.ml.feature.VectorAssembler)

    Example:
        aggs = get_ddf_aggs(grpby_columns=['customer_id', 'category'],
                            agg_columns=['productsize', 'purchasequantity'],
                            agg_funcs={'total': F.sum}, prefix='agg_',
                            cast_to='double')
        ddf_trans_pivot = aggregate_and_pivot_into_vector(
            ddf_trans, id_column='customer_id', pivot_column='category', aggs=aggs)
        ddf_trans_pivot.first()
        # Row(customer_id=98468631, features=SparseVector(1666, {0: 1.0, ...}))

    :param ddf: DataFrame
    :param id_column: row id column
    :param pivot_column: column to one-hot encode
    :param aggs: aggregation Columns applied after the group-by
    :param vector_column_name: name of the output vector column
    :param return_indexer: also return the StringIndexer that encoded
        pivot_column values
    :return: DataFrame, or (DataFrame, indexer) when return_indexer is set
    """
    from pyspark.mllib.linalg.distributed import CoordinateMatrix, IndexedRowMatrix
    from pyspark.ml.feature import VectorAssembler

    index_col_suffix = '_idx'
    grpby_columns = [id_column, pivot_column]
    aggregated = ddf.groupBy(grpby_columns).agg(*aggs)
    pivot_indexed_column = pivot_column + index_col_suffix
    agg_column_names = list(set(aggregated.columns) -
                            set([id_column, pivot_column, pivot_indexed_column]))
    indexed, indexers = index_columns(ddf=aggregated, index_columns=[pivot_column],
                                      index_col_suffix=index_col_suffix,
                                      return_indexers=True)

    res = None
    # Bug fix: the original assigned a bare map() here. Under Python 3 that is
    # a one-shot iterator, so the zip() below exhausted it and the later
    # len(...) / [0] accesses failed. A list comprehension is correct in both.
    agg_columns_vectors = [c + '_vector' for c in agg_column_names]
    for agg_column, agg_column_vector in zip(agg_column_names, agg_columns_vectors):
        # One (id, pivot_index, value) matrix per aggregation column; its
        # IndexedRowMatrix rows become one sparse vector per id.
        # int() replaces Python 2's long(), removed in Python 3. The loop
        # variables are bound as lambda defaults so Spark's lazy evaluation
        # cannot pick up a later iteration's value (late-binding closure).
        cm = CoordinateMatrix(
            indexed.map(lambda r, col=agg_column: (int(r[id_column]),
                                                   int(r[pivot_indexed_column]),
                                                   r[col]))
        )
        irm = cm.toIndexedRowMatrix()
        ddf_irm = irm.rows.toDF()
        ddf_irm = (ddf_irm.withColumnRenamed('index', id_column)
                          .withColumnRenamed('vector', agg_column_vector))
        # Inner-join the per-aggregation vector frames on the id column.
        res = ddf_irm if res is None else res.join(ddf_irm, on=id_column, how='inner')

    if len(agg_columns_vectors) > 1:
        # Several aggregations: assemble their vectors into one feature vector.
        assembler = VectorAssembler(inputCols=agg_columns_vectors,
                                    outputCol=vector_column_name)
        res = assembler.transform(res)
    else:
        # Single aggregation: its vector column simply becomes the feature column.
        res = res.withColumnRenamed(agg_columns_vectors[0], vector_column_name)
    # Drop the intermediate per-aggregation vector columns (no-op for the
    # single-column branch, where the column was already renamed away).
    res = drop_columns(res, columns=agg_columns_vectors)

    if return_indexer and len(indexers) > 0:
        return res, indexers.pop()
    else:
        return res
if __name__ == "__main__":
    # set up spark context and configuration
    conf = SparkConf().setAppName("PythonPCAOnRowMatrixExample")
    sc = SparkContext(conf=conf)
    print(sc.getConf().getAll())
    sqlContext = sql.SQLContext(sc)

    # load data: whitespace-separated "row col value" triplets
    data = sc.textFile("gs://dataproc-ae279739-4c78-478e-9024-8b7ea842f82e-us/heart1.txt")
    # Bug fix: np.long and np.float were removed in NumPy 1.24 (deprecated
    # since 1.20); the builtin int()/float() are the correct replacements
    # and are what MatrixEntry expects anyway.
    entries = data.map(lambda l: l.split(' ')).map(
        lambda l: MatrixEntry(int(l[0]), int(l[1]), float(l[2])))

    # create an IndexedRowMatrix via a CoordinateMatrix
    premat = CoordinateMatrix(entries)
    mat = premat.toIndexedRowMatrix()
    print(mat.numCols())
    print(mat.numRows())

    # time the Gramian computation
    start_time = time.time()
    decomp = mat.computeGramianMatrix()
    elapsedtime = time.time() - start_time
    print(elapsedtime)

    # time the SVD (top 1000 singular values)
    start_time = time.time()
    decomp = mat.computeSVD(1000)
    elapsedtime = time.time() - start_time
    print(elapsedtime)