Example #1
# Assumed imports for this example; `spark` is an existing SparkSession.
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, lit, udf
from pyspark.sql.types import FloatType
from pyspark.mllib.linalg.distributed import CoordinateMatrix

def calculate_jaccard():
    # Jaccard similarity of two sparse vectors: |shared non-zero indices| / |union of non-zero indices|.
    def jaccardSimilarity(v1, v2):
        indices1 = set(v1.indices)
        indices2 = set(v2.indices)
        intersection = set.intersection(indices1, indices2)
        union = indices1.union(indices2)
        return (float(len(intersection)) / float(len(union)))

    ratings = spark.read.csv("/home/your_path/ratings.csv",
                             header=True,
                             inferSchema=True)
    # Treat every (movieId, userId) pair as an implicit rating of 1.0 so the matrix is binary.
    df_3 = ratings.select(["movieId", "userId"]).withColumn("rating", lit(1.0))

    # Build a CoordinateMatrix from (movieId, userId, rating) tuples ...
    cmat_3 = CoordinateMatrix(df_3.rdd.map(tuple))

    # ... then convert it to an IndexedRowMatrix: one sparse user-vector per movie.
    i_3 = cmat_3.toIndexedRowMatrix()
    i_df_3 = i_3.rows.toDF(["id", "features"])

    # DataFrame.transform is built in since Spark 3.0; this monkey-patch is only needed on older versions.
    def transform(self, f):
        return f(self)

    DataFrame.transform = transform

    jaccard_udf = udf(jaccardSimilarity, FloatType())

    # Self-join to pair every movie with every other movie, keeping each unordered pair once.
    possibleMatches = i_df_3.transform(
        lambda df: df.alias("left").join(df.alias("right")))

    result = possibleMatches.filter(col("left.id") < col("right.id")) \
            .withColumn("jaccard", jaccard_udf("left.features", "right.features")) \
            .select("left.id", "right.id", "jaccard")
    return result
Example #2
# Assumed imports for this example; `spark` is an existing SparkSession.
import math

from pyspark.sql import DataFrame
from pyspark.sql.functions import col, udf
from pyspark.sql.types import FloatType
from pyspark.mllib.linalg.distributed import CoordinateMatrix

def calculate_cosine():
    def transform(self, f):
        return f(self)

    # Cosine similarity: dot(v1, v2) / (||v1|| * ||v2||); the norms only need the non-zero values.
    def cosineSimilarity(v1, v2):
        x = v1.toArray()
        y = v2.toArray()
        values1 = v1.values
        values2 = v2.values
        dotProduct = x.dot(y)
        ratingNorm = math.sqrt(sum(values1**2))
        rating2Norm = math.sqrt(sum(values2**2))
        return float(dotProduct / (ratingNorm * rating2Norm))

    ratings = spark.read.csv("/home/your_path/ratings.csv",
                             header=True,
                             inferSchema=True)
    # Rows are movies, columns are users, values are the explicit ratings.
    df_3 = ratings.select(["movieId", "userId", "rating"])

    cmat_3 = CoordinateMatrix(df_3.rdd.map(tuple))
    i_3 = cmat_3.toIndexedRowMatrix()

    i_df_3 = i_3.rows.toDF(["id", "features"])

    # Monkey-patch only needed on Spark < 3.0, where DataFrame.transform does not exist yet.
    DataFrame.transform = transform

    distance_cosine = udf(cosineSimilarity, FloatType())

    # Self-join to pair every movie with every other movie, keeping each unordered pair once.
    possibleMatches = i_df_3.transform(
        lambda df: df.alias("left").join(df.alias("right")))

    result = possibleMatches.filter(col("left.id") < col("right.id")) \
            .withColumn("cosine", distance_cosine("left.features", "right.features")) \
            .select("left.id", "right.id", "cosine")
    return result
Example #3
# Assumed imports for this snippet; listenbrainz_spark.session is an initialized SparkSession.
import listenbrainz_spark
from pyspark.mllib.linalg.distributed import MatrixEntry, CoordinateMatrix

def get_vectors_df(playcounts_df):
    """
    Each row of playcounts_df has the following columns: recording_id, spark_user_id and a play count denoting how many times
    a user has played that recording. However, the correlation matrix requires a dataframe having a column of user
    vectors. Spark has various representations built-in for storing sparse matrices. Of these, two are Coordinate
    Matrix and Indexed Row Matrix. A coordinate matrix stores the matrix as tuples of (i, j, x) where matrix[i, j] = x.
    An Indexed Row Matrix stores it as tuples of row index and vectors.

    Our playcounts_df is similar in structure to a coordinate matrix. We begin by mapping each row of the
    playcounts_df to a MatrixEntry and then create a matrix of these entries. The recording_ids are rows, user_ids are
    columns and the playcounts are the values in the matrix. We convert the coordinate matrix to indexed row matrix
    form. Spark ML and MLlib have different representations of vectors, hence we need to manually convert between the
    two. Finally, we take the rows and create a dataframe from them.
    """
    tuple_mapped_rdd = playcounts_df.rdd.map(lambda x: MatrixEntry(x["recording_id"], x["spark_user_id"], x["count"]))
    coordinate_matrix = CoordinateMatrix(tuple_mapped_rdd)
    indexed_row_matrix = coordinate_matrix.toIndexedRowMatrix()
    vectors_mapped_rdd = indexed_row_matrix.rows.map(lambda r: (r.index, r.vector.asML()))
    return listenbrainz_spark.session.createDataFrame(vectors_mapped_rdd, ['index', 'vector'])
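
A minimal, self-contained sketch of how this helper could be exercised; the toy data and the use of a plain SparkSession in place of listenbrainz_spark.session are assumptions for illustration:

from pyspark.sql import SparkSession
from pyspark.mllib.linalg.distributed import MatrixEntry, CoordinateMatrix

spark = SparkSession.builder.getOrCreate()
playcounts_df = spark.createDataFrame(
    [(0, 0, 3), (0, 1, 1), (1, 1, 5)],
    ["recording_id", "spark_user_id", "count"],
)
entries = playcounts_df.rdd.map(lambda x: MatrixEntry(x["recording_id"], x["spark_user_id"], x["count"]))
vectors_df = (CoordinateMatrix(entries)
              .toIndexedRowMatrix()
              .rows.map(lambda r: (r.index, r.vector.asML()))
              .toDF(["index", "vector"]))
vectors_df.show(truncate=False)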
Example #4
def aggregate_and_pivot_into_vector(ddf, id_column, pivot_column, aggs, vector_column_name='features', return_indexer=False):
    """
    1. apply aggs to DataFrame (group by [id_column, pivot_column]),
    2. pivot (one-hot encode) by pivot_column (values are indexed by StringIndexer)
    3. save results into vector_column_name as Vector (if multiple aggregations provided, assemble result into one
    vector using pyspark.ml.feature.VectorAssembler)

    Example:
    aggs = get_ddf_aggs(grpby_columns=['customer_id', 'category'], agg_columns=['productsize','purchasequantity'],
                        agg_funcs={'total':F.sum}, prefix='agg_', columns_cast_to='double')
    print(aggs)
        #[Column<cast((sum(productsize),mode=Complete,isDistinct=false) as double) AS agg_customer_id_category_productsize_total#127>,
        #Column<cast((sum(purchasequantity),mode=Complete,isDistinct=false) as double) AS agg_customer_id_category_purchasequantity_total#128>]

    ddf_trans_pivot = aggregate_and_pivot_into_vector(ddf_trans, id_column='customer_id', pivot_column='category', aggs=aggs)
    ddf_trans_pivot.first()
        #Row(customer_id=98468631, features=SparseVector(1666, {0: 1.0, 1: 1.0, 5: 1.0, 8: 2.0, 13: 1.0, ...}))

    :param ddf: DataFrame
    :param id_column: row id column
    :param pivot_column: column to one-hot encode
    :param aggs:
    :param vector_column_name:
    :param return_indexer: add indexer object (StringIndexer) to result. Indexer holds encoder which was used to encode
    pivot_column values
    :return:
    """
    from pyspark.mllib.linalg.distributed import CoordinateMatrix, IndexedRowMatrix
    from pyspark.ml.feature import VectorAssembler
    # get_ddf_aggs, index_columns and drop_columns are helper functions defined elsewhere in the same module.

    index_col_suffix = '_idx'
    grpby_columns = [id_column, pivot_column]

    aggregated = ddf.groupBy(grpby_columns).agg(*aggs)

    pivot_indexed_column = pivot_column+index_col_suffix
    agg_column_names = list(set(aggregated.columns)-set([id_column, pivot_column, pivot_indexed_column]))

    indexed, indexers = index_columns(ddf=aggregated, index_columns=[pivot_column], index_col_suffix=index_col_suffix, return_indexers=True)

    res = None
    # Build the list explicitly (Python 3's map() is lazy) since it is indexed and len()-checked below.
    agg_columns_vectors = [c + '_vector' for c in agg_column_names]
    for agg_column, agg_column_vector in zip(agg_column_names, agg_columns_vectors):
        # One (row_id, pivot_index, value) CoordinateMatrix per aggregated column.
        cm = CoordinateMatrix(
            indexed.rdd.map(lambda r: (int(r[id_column]), int(r[pivot_indexed_column]), r[agg_column]))
        )
        irm = cm.toIndexedRowMatrix()
        ddf_irm = irm.rows.toDF()
        ddf_irm = ddf_irm.withColumnRenamed('index', id_column).withColumnRenamed('vector', agg_column_vector)

        if res:
            res = res.join(ddf_irm, on=id_column, how='inner')
        else:
            res = ddf_irm

    if len(agg_columns_vectors) > 1:
        assembler = VectorAssembler(inputCols=agg_columns_vectors, outputCol=vector_column_name)
        res = assembler.transform(res)
    else:
        res = res.withColumnRenamed(agg_columns_vectors[0], vector_column_name)

    res = drop_columns(res, columns=agg_columns_vectors)

    if return_indexer and len(indexers) > 0:
        return res, indexers.pop()
    else:
        return res
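
The index_columns and drop_columns helpers are not shown in this example. A possible minimal implementation, based on pyspark.ml.feature.StringIndexer and matching how they are called above (names and return shapes are assumptions), could look like:

from pyspark.ml.feature import StringIndexer

def index_columns(ddf, index_columns, index_col_suffix='_idx', return_indexers=False):
    # Fit one StringIndexer per requested column and append a '<col><suffix>' indexed column.
    indexers = []
    for column in index_columns:
        indexer = StringIndexer(inputCol=column, outputCol=column + index_col_suffix).fit(ddf)
        ddf = indexer.transform(ddf)
        indexers.append(indexer)
    return (ddf, indexers) if return_indexers else ddf

def drop_columns(ddf, columns):
    # DataFrame.drop accepts multiple column names.
    return ddf.drop(*columns)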

Example #5
# Assumed imports for this script.
import time

from pyspark import SparkConf, SparkContext, sql
from pyspark.mllib.linalg.distributed import MatrixEntry, CoordinateMatrix

if __name__ == "__main__":
    # set up spark context and configuration
    conf = SparkConf().setAppName("PythonPCAOnRowMatrixExample")
    sc = SparkContext(conf=conf)
    print(sc.getConf().getAll())    
    sqlContext = sql.SQLContext(sc)

    # load data
    data = sc.textFile("gs://dataproc-ae279739-4c78-478e-9024-8b7ea842f82e-us/heart1.txt")
    # plain int/float avoid the deprecated np.long/np.float NumPy aliases
    entries = data.map(lambda l: l.split(' ')).map(lambda l: MatrixEntry(int(l[0]), int(l[1]), float(l[2])))

    # create an IndexedRowMatrix from the coordinate entries
    premat = CoordinateMatrix(entries)
    mat = premat.toIndexedRowMatrix()

    print(mat.numCols())
    print(mat.numRows())

    # gramian
    start_time = time.time()
    decomp = mat.computeGramianMatrix()
    elapsedtime = time.time() - start_time
    print(elapsedtime)

    # svd
    start_time = time.time()
    decomp = mat.computeSVD(1000)
    elapsedtime = time.time() - start_time
    print(elapsedtime)
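
Only the timings are printed above. If the factors themselves are of interest, the SingularValueDecomposition object returned by computeSVD exposes them; a small hypothetical follow-up (smaller k and computeU=True chosen purely for illustration):

    # Inspect the SVD factors instead of just the elapsed time.
    svd = mat.computeSVD(5, computeU=True)
    print(svd.s)                             # DenseVector of the top singular values
    print(svd.V.numRows, svd.V.numCols)      # local DenseMatrix of right singular vectors
    print(svd.U.numRows(), svd.U.numCols())  # distributed IndexedRowMatrix of left singular vectors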