Example No. 1
def calculate_cosine():
    def transform(self, f):
        return f(self)

    def cosineSimilarity(v1, v2):
        x = v1.toArray()
        y = v2.toArray()
        values1 = v1.values
        values2 = v2.values
        dotProduct = x.dot(y)
        ratingNorm = math.sqrt(sum(values1**2))
        rating2Norm = math.sqrt(sum(values2**2))
        return (float(dotProduct / (ratingNorm * rating2Norm)))

    ratings = spark.read.csv("/home/your_path/ratings.csv",
                             header=True,
                             inferSchema=True)
    df_3 = ratings.select(["movieId", "userId", "rating"])

    cmat_3 = CoordinateMatrix(df_3.rdd.map(tuple))
    i_3 = cmat_3.toIndexedRowMatrix()

    i_df_3 = i_3.rows.toDF(["id", "features"])

    DataFrame.transform = transform

    distance_cosine = udf(cosineSimilarity, FloatType())

    possibleMatches = i_df_3.transform(
        lambda df: df.alias("left").join(df.alias("right")))

    result = possibleMatches.filter((col("left.id") != col("right.id")) & (col("left.id") < col("right.id")) ) \
            .withColumn("cosine", distance_cosine("left.features", "right.features")) \
            .select("left.id", "right.id", "cosine")
Example No. 2
def process(sparkContext, sqlContext):

    # Define database connection parameters
    MYSQL_USERNAME = "******"
    MYSQL_PASSWORD = "******"
    MYSQL_CONNECTION_URL = "jdbc:mysql://qcis4:3306/dblp?user=" + MYSQL_USERNAME + "&password=" + MYSQL_PASSWORD

    df = (
        sqlContext.read.format("jdbc")
        .options(url=MYSQL_CONNECTION_URL, driver="com.mysql.jdbc.Driver", dbtable="dblp.author_sample")
        .load()
    )

    rows = df.select("name_hash").distinct().map(lambda r: r.name_hash).collect()
    colums = df.select("paper_hash").distinct().map(lambda r: r.paper_hash).collect()

    rawData = df.map(lambda p: (long(rows.index(p.name_hash)), long(colums.index(p.paper_hash)), 1.0)).cache()

    #   Create an CoordinateMatrix from an RDD of MatrixEntries.
    mat = CoordinateMatrix(rawData)

    rowMat = mat.toRowMatrix()

    print mat.numRows()  # 3
    print rowMat.numCols()

    #     transpose = rowMat.rows().zipWithIndex().map(lambda rvect, i : rvect.zipWithIndex().map( lambda ax, j : (j,(i,ax))))
    for r in rowMat.rows().collect():
        print r
Example No. 3
def process(sparkContext,sqlContext):
    
    # Define database connection parameters
    MYSQL_USERNAME = '******'
    MYSQL_PASSWORD = '******'
    MYSQL_CONNECTION_URL = "jdbc:mysql://qcis4:3306/dblp?user=" + MYSQL_USERNAME + "&password=" + MYSQL_PASSWORD

    df = sqlContext.read.format("jdbc").options(url=MYSQL_CONNECTION_URL, driver="com.mysql.jdbc.Driver", dbtable="dblp.author_sample").load()
 
    rows = df.select("name_hash").distinct().map(lambda r: r.name_hash).collect()
    colums = df.select("paper_hash").distinct().map(lambda r: r.paper_hash).collect()

    rawData = df.map(lambda p: (long(rows.index(p.name_hash)),long(colums.index(p.paper_hash)),1.0)).cache()

#   Create an CoordinateMatrix from an RDD of MatrixEntries.
    mat = CoordinateMatrix(rawData)
    
    rowMat = mat.toRowMatrix()
    
    print mat.numRows()  # 3
    print rowMat.numCols()  
    
#     transpose = rowMat.rows().zipWithIndex().map(lambda rvect, i : rvect.zipWithIndex().map( lambda ax, j : (j,(i,ax))))
    for r in rowMat.rows().collect():
        print r
Example No. 4
def calculate_jaccard():
    def jaccardSimilarity(v1, v2):
        indices1 = set(v1.indices)
        indices2 = set(v2.indices)
        intersection = set.intersection(indices1, indices2)
        union = indices1.union(indices2)
        return (float(len(intersection)) / float(len(union)))

    ratings = spark.read.csv("/home/your_path/ratings.csv",
                             header=True,
                             inferSchema=True)
    df_3 = ratings.select(["movieId", "userId"]).withColumn("rating", lit(1.0))

    cmat_3 = CoordinateMatrix(df_3.rdd.map(tuple))

    i_3 = cmat_3.toIndexedRowMatrix()
    i_df_3 = i_3.rows.toDF(["id", "features"])

    def transform(self, f):
        return f(self)

    DataFrame.transform = transform

    jaccard_udf = udf(jaccardSimilarity, FloatType())

    possibleMatches = i_df_3.transform(
        lambda df: df.alias("left").join(df.alias("right")))

    result = possibleMatches.filter((col("left.id") != col("right.id")) & (col("left.id") < col("right.id")) ) \
            .withColumn("jaccard", jaccard_udf("left.features", "right.features")) \
            .select("left.id", "right.id", "jaccard")
Example No. 5
 def calculate_similarity(self):
     train = self.train_data
     train_user_mean = train.groupBy("userId").agg(F.mean('rating'))
     train_user_mean = train_user_mean.withColumnRenamed("avg(rating)",
                                                         "user_mean")
     train_rating_avg = train.join(train_user_mean, 'userId',
                                   how='left_outer')
     train_rating_avg = train_rating_avg.select(
         '*',
         (train_rating_avg.rating - train_rating_avg.user_mean)
         .alias('rating_norm'))
     rdd = (train_rating_avg.select('movieId', 'userId', 'rating_norm')
                            .rdd.map(tuple))
     coord = CoordinateMatrix(rdd)
     mat = coord.toRowMatrix()
     similarities = mat.columnSimilarities()
     similarities_df = similarities.entries.toDF()
     window = (Window.partitionBy(similarities_df['i'])
                     .orderBy(similarities_df['value'].desc()))
     similarities_df_ranked = (
         similarities_df
         .select('*', F.row_number().over(window).alias('row_number'))
         .filter(F.col('row_number') <= 100))
     similarities_df_ranked.write.parquet(SIMILARITY_FILE_SORTED,
                                          mode='overwrite')
Example No. 6
def MatrixTranspose(mat):
    # Known issues:
    # 1. Raises errors for some data; the exact cause is unclear, but reducing the number of rows can help.
    # 2. The transpose sometimes returns a wrong result, apparently due to a partitioning issue --
    #    repartition(1) sometimes fixes it. PySpark also changes the order of rows after the transposed
    #    coordinate matrix is converted to a row matrix.
    #    Bug reference: https://stackoverflow.com/questions/34451253/converting-coordinatematrix-to-rowmatrix-doesnt-preserve-row-order
    #    Using an indexed matrix and reordering partially fixes this, but it is awkward.
    '''
    Transpose a row matrix -- to save space/memory, use a sparse vector when the input is a sparse vector.
    :param mat: the input row matrix
    :return: a transposed row matrix
    ref: https://stackoverflow.com/questions/47102378/transpose-a-rowmatrix-in-pyspark
    '''
    if isinstance(mat, IndexedRowMatrix):
        mat = mat.toRowMatrix()
    # this will turn everything into dense matrix entries; avoid using this function where efficiency matters
    transposed_mat = CoordinateMatrix(mat.rows.zipWithIndex().flatMap(
        lambda x: [MatrixEntry(x[1], j, v) for j, v in enumerate(x[0])]))
    transposed_mat = transposed_mat.transpose().toIndexedRowMatrix().rows.toDF(
    ).orderBy("index")
    # back to sparse first then convert to indexedrowmatrix
    transposed_mat = transposed_mat.rdd.map(lambda row: IndexedRow(
        row["index"],
        MLLibVectors.sparse(
            row["vector"].size,
            np.nonzero(row["vector"].values)[0], row["vector"].values[
                np.nonzero(row["vector"].values)])))
    return IndexedRowMatrix(transposed_mat)
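
A brief usage sketch (illustrative, not from the original), assuming a live SparkContext `sc`, `import numpy as np`, and `MLLibVectors` bound to `pyspark.mllib.linalg.Vectors` as the function body implies:

from pyspark.mllib.linalg.distributed import RowMatrix

# 2x3 dense row matrix; its transpose should be 3x2
row_mat = RowMatrix(sc.parallelize([[1.0, 2.0, 3.0],
                                    [4.0, 5.0, 6.0]]))
transposed = MatrixTranspose(row_mat)
print(transposed.numRows(), transposed.numCols())  # expected: 3 2
for row in transposed.rows.collect():
    print(row)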
Example No. 7
 def test_naive_multiplication_coord_matrix(self):
     mat_a = CoordinateMatrix(self.rdd_X, *self.X_shape)
     mat_b = CoordinateMatrix(self.rdd_y, *self.y_shape)
     computed_result = lp_matrix_multiply.naive_multiplication_rdd(
         mat_a=mat_a, mat_b=mat_b, is_triangle=True).collect()
     actual_result = self.product
     for element in computed_result:
         self.assertEqual(actual_result[element.i, element.j], element.value,
                          msg='i {}, j {}, computed {} - actual_value: {}'.format(
                              element.i, element.j, element.value, actual_result[element.i, element.j]))
Example No. 8
def build_matrix(svo_path: str, cat1_instances: set,
                 cat2_instances: set) -> CoordinateMatrix:
    raw_df = spark.read.csv(svo_path, sep='\t')

    pairs_df = (raw_df.filter(
        (f.col('_c0').isin(cat1_instances)
         & f.col('_c2').isin(cat2_instances))
        | (f.col('_c0').isin(cat2_instances)
           & f.col('_c2').isin(cat1_instances))).rdd.map(lambda x: (tuple(
               sorted((x['_c0'], x['_c2']))), x['_c1'], int(x['_c3']))).toDF(
                   ['pair', 'verb', 'n']))

    named_coords = (pairs_df.selectExpr('pair', 'verb as left_verb', 'n').join(
        pairs_df.selectExpr('pair', 'verb as right_verb'),
        'pair').filter('left_verb < right_verb').groupby(
            ['left_verb', 'right_verb']).count())

    verb_to_id = (pairs_df.select('verb').distinct().rdd.zipWithIndex().map(
        lambda r: [r[0].verb, r[1]]).toDF(['verb', 'id']))

    coords = (named_coords.join(
        verb_to_id, named_coords.left_verb == verb_to_id.verb).selectExpr(
            'right_verb', 'id as left_verb_id', 'count').join(
                verb_to_id,
                named_coords.right_verb == verb_to_id.verb).selectExpr(
                    'left_verb_id', 'id as right_verb_id', 'count'))

    matrix = CoordinateMatrix(coords.rdd.map(lambda c: MatrixEntry(*c)))

    return matrix
Example No. 9
def expand_mat(df,power,blockstyle=True):

    '''

    Calculate nth power of a matrix A - A^n

    df: Dataframe of the coordinate matrix A
    power: Integer n. Exponent to which the matrix should be raised
    blockstyle: Boolean. Calculate matrix multiplication block style or by simple rdd joins
    returns: Dataframe of A^n matrix with source, destination, and weight columns

    '''

    # Convert into CoordinateMatrix
    cols = df.columns
    cdf =  CoordinateMatrix(df.rdd.map(tuple))
    rdf = cdf

    # Calculate A^n blockstyle or rdd join style
    if blockstyle:
        for i in range(power-1):
            rdf = matrix_multiply_mod(rdf,cdf)
    else:
        for i in range(power-1):
            rdf = matrix_multiply(rdf,cdf)

    # Convert back to dataframe and return
    rdf_rdd = rdf.entries.map(lambda x: (x.i,x.j,x.value))
    result_df = rdf_rdd.toDF()
    result_df = result_df.withColumnRenamed('_1',cols[0]).withColumnRenamed('_2',cols[1]).withColumnRenamed('_3',cols[2])
    return result_df
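
A hedged usage sketch (not part of the original), assuming a live `spark` session and the `matrix_multiply` helper from Example No. 14 below; the `blockstyle=True` path would additionally need the `matrix_multiply_mod` helper, which is not shown in this collection:

# toy directed graph as an edge list (src, dst, weight)
edges_df = spark.createDataFrame(
    [(0, 1, 1.0), (1, 2, 1.0), (0, 2, 1.0)], ['src', 'dst', 'weight'])

# A^2 via plain RDD joins; the only surviving entry is (0, 2) = 1.0, the single two-step path 0->1->2
squared_df = expand_mat(edges_df, power=2, blockstyle=False)
squared_df.show()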
Example No. 10
def readRatings(spark, f_name, ratio=[0.8, 0.2], seed=0):
    """Read the users' ratings for movies and
       return the training and test utility matrices."""
    df = spark.read.csv(f_name, header=True)
    #df = normalize(spark, df)
    rdd = df.rdd

    (training, test) = df.randomSplit(ratio, seed=seed)

    training_utility = CoordinateMatrix(
        training.rdd.map(lambda row: MatrixEntry(row['userId'], row[
            'movieId'], row['rating'])), users_total, movies_total)
    test_utility = CoordinateMatrix(
        test.rdd.map(lambda row: MatrixEntry(row['userId'], row[
            'movieId'], row['rating'])), users_total, movies_total)

    return (training_utility, test_utility)
Example No. 11
def get_vectors_df(playcounts_df):
    """
    Each row of playcounts_df has the following columns: recording_id, spark_user_id and a play count denoting how many times
    a user has played that recording. However, the correlation matrix requires a dataframe having a column of user
    vectors. Spark has various representations built-in for storing sparse matrices. Of these, two are Coordinate
    Matrix and Indexed Row Matrix. A coordinate matrix stores the matrix as tuples of (i, j, x) where matrix[i, j] = x.
    An Indexed Row Matrix stores it as tuples of row index and vectors.

    Our playcounts_df is similar in structure to a coordinate matrix. We begin with mapping each row of the
    playcounts_df to a MatrixEntry and then create a matrix of these entries. The recording_ids are rows, user_ids are
    columns and the playcounts are the values in the matrix. We convert the coordinate matrix to indexed row matrix
    form. Spark ML and MLlib have different representations of vectors, hence we need to manually convert between the
    two. Finally, we take the rows and create a dataframe from them.
    """
    tuple_mapped_rdd = playcounts_df.rdd.map(lambda x: MatrixEntry(x["recording_id"], x["spark_user_id"], x["count"]))
    coordinate_matrix = CoordinateMatrix(tuple_mapped_rdd)
    indexed_row_matrix = coordinate_matrix.toIndexedRowMatrix()
    vectors_mapped_rdd = indexed_row_matrix.rows.map(lambda r: (r.index, r.vector.asML()))
    return listenbrainz_spark.session.createDataFrame(vectors_mapped_rdd, ['index', 'vector'])
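
The same conversion chain on toy data, as a minimal sketch (assuming a plain `spark` session rather than the listenbrainz_spark one):

from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry

toy_entries = spark.sparkContext.parallelize(
    [MatrixEntry(0, 0, 3.0), MatrixEntry(0, 1, 1.0), MatrixEntry(2, 1, 5.0)])
toy_irm = CoordinateMatrix(toy_entries).toIndexedRowMatrix()
# r.vector is an MLlib vector; asML() converts it to the pyspark.ml vector type
toy_df = spark.createDataFrame(
    toy_irm.rows.map(lambda r: (r.index, r.vector.asML())), ['index', 'vector'])
toy_df.show()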
Example No. 12
    def unfolding(self, mode=None):
        def mapFuncI(entry):
            return MatrixEntry(entry.i, entry.k + self.numDimK * entry.j,
                               entry.val)

        def mapFuncJ(entry):
            return MatrixEntry(entry.j, entry.i + self.numDimI * entry.k,
                               entry.val)

        def mapFuncK(entry):
            return MatrixEntry(entry.k, entry.j + self.numDimJ * entry.i,
                               entry.val)

        if mode == 1:
            matrix = CoordinateMatrix(self.entries.map(mapFuncI))
        elif mode == 2:
            matrix = CoordinateMatrix(self.entries.map(mapFuncJ))
        elif mode == 3:
            matrix = CoordinateMatrix(self.entries.map(mapFuncK))
        else:
            raise ValueError("mode must be 1, 2, or 3")

        return matrix
Example No. 13
 def newW(R, W, H):
     # multiplicative NMF update: W = W * (R.dot(H.T) / (W.dot(H).dot(H.T)))
     a = R.multiply(H.transpose()).toCoordinateMatrix().entries\
      .map(lambda entry: ((entry.i, entry.j), (0, entry.value)))
     b = W.multiply(H).multiply(H.transpose()).toCoordinateMatrix().entries\
      .map(lambda entry: ((entry.i, entry.j), (1, entry.value)))
     # divide the numerator entry (flag 0) by the denominator entry (flag 1),
     # whichever order the two arrive in
     c = a.union(b).reduceByKey(
         lambda a, b: (2, a[1] / b[1]) if a[0] == 0 else (2, b[1] / a[1]))
     c = c.map(lambda x: ((x[0][0], x[0][1]), x[1][1]))
     d = c.join(W.toCoordinateMatrix().entries
                .map(lambda entry: ((entry.i, entry.j), entry.value)))\
      .mapValues(lambda v: v[0] * v[1])
     return CoordinateMatrix(
         d.map(lambda x: MatrixEntry(x[0][0], x[0][1], x[1]))).toBlockMatrix()
Example No. 14
def matrix_multiply(A, B):

    '''
    
    This function returns the matrix product of two matrices represented in coordinate (COO) format.
    It is implemented with simple RDD joins, following the Scala implementation described at
    https://medium.com/balabit-unsupervised/scalable-sparse-matrix-multiplication-in-apache-spark-c79e9ffc0703

    A: CoordinateMatrix
    B: CoordinateMatrix
    returns: CoordinateMatrix of the product A * B

    '''

    A_rdd = A.entries.map(lambda x: (x.j,(x.i,x.value))) # Convert dataframe to rdd of (column,(row, value))
    B_rdd = B.entries.map(lambda x: (x.i,(x.j,x.value))) # Convert dataframe to rdd of (row,(column, value))

    interm_rdd = A_rdd.join(B_rdd).map(lambda x: ((x[1][0][0],x[1][1][0]),(x[1][0][1]*x[1][1][1]))) # Join two rdds and convert to ((row,column),(value))
    C_rdd = interm_rdd.reduceByKey(add).map(lambda x: MatrixEntry(x[0][0],x[0][1],x[1])) # Add the product of same (row,column) pair and convert each row into a matrix entry of (row, column, value)
    return CoordinateMatrix(C_rdd)
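
A small sanity-check sketch (illustrative only), assuming a live SparkContext `sc` and `from operator import add` as the function requires:

from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry

# A = [[1, 2], [0, 3]], B = [[4, 0], [1, 1]] stored sparsely (zeros omitted)
A = CoordinateMatrix(sc.parallelize(
    [MatrixEntry(0, 0, 1.0), MatrixEntry(0, 1, 2.0), MatrixEntry(1, 1, 3.0)]))
B = CoordinateMatrix(sc.parallelize(
    [MatrixEntry(0, 0, 4.0), MatrixEntry(1, 0, 1.0), MatrixEntry(1, 1, 1.0)]))

AB = matrix_multiply(A, B)
# expected entries of A*B: (0,0,6.0), (0,1,2.0), (1,0,3.0), (1,1,3.0)
print(sorted((e.i, e.j, e.value) for e in AB.entries.collect()))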
Example No. 15
def aggregate_and_pivot_into_vector(ddf, id_column, pivot_column, aggs, vector_column_name='features', return_indexer=False):
    """
    1. apply aggs to DataFrame (group by [id_column, pivot_column]),
    2. pivot (one-hot encode) by pivot_column (values are indexed by StringIndexer)
    3. save results into vector_column_name as Vector (if multiple aggregations provided, assemble result into one
    vector using pyspark.ml.feature.VectorAssembler)

    Example:
    aggs = get_ddf_aggs(grpby_columns=['customer_id', 'category'], agg_columns=['productsize','purchasequantity'],
                        agg_funcs={'total': F.sum}, prefix='agg_', cast_to='double')
    print aggs
        #[Column<cast((sum(productsize),mode=Complete,isDistinct=false) as double) AS agg_customer_id_category_productsize_total#127>,
        #Column<cast((sum(purchasequantity),mode=Complete,isDistinct=false) as double) AS agg_customer_id_category_purchasequantity_total#128>]

    ddf_trans_pivot = aggregate_and_pivot_into_vector(ddf_trans, id_column='customer_id', pivot_column='category', aggs=aggs)
    ddf_trans_pivot.first()
        #Row(customer_id=98468631, features=SparseVector(1666, {0: 1.0, 1: 1.0, 5: 1.0, 8: 2.0, 13: 1.0, ))

    :param ddf: DataFrame
    :param id_column: row id column
    :param pivot_column: column to one-hot encode
    :param aggs:
    :param vector_column_name:
    :param return_indexer: add indexer object (StringIndexer) to result. Indexer holds encoder which was used to encode
    pivot_column values
    :return:
    """
    from pyspark.mllib.linalg.distributed import CoordinateMatrix, IndexedRowMatrix
    from pyspark.ml.feature import VectorAssembler

    index_col_suffix = '_idx'
    grpby_columns = [id_column, pivot_column]

    aggregated = ddf.groupBy(grpby_columns).agg(*aggs)

    pivot_indexed_column = pivot_column+index_col_suffix
    agg_column_names = list(set(aggregated.columns)-set([id_column, pivot_column, pivot_indexed_column]))

    indexed, indexers = index_columns(ddf=aggregated, index_columns=[pivot_column], index_col_suffix=index_col_suffix, return_indexers=True)

    res = None
    agg_columns_vectors = map(lambda c: c+'_vector',agg_column_names)
    for agg_column, agg_column_vector in zip(agg_column_names, agg_columns_vectors):
        cm = CoordinateMatrix(
            indexed.map(lambda r: (long(r[id_column]), long(r[pivot_indexed_column]), r[agg_column]))
        )
        irm = cm.toIndexedRowMatrix()
        ddf_irm = irm.rows.toDF()
        ddf_irm = ddf_irm.withColumnRenamed('index', id_column).withColumnRenamed('vector', agg_column_vector)

        if res:
            res = res.join(ddf_irm, on=id_column, how='inner')
        else:
            res = ddf_irm

    if len(agg_columns_vectors) > 1:
        assembler = VectorAssembler(inputCols=agg_columns_vectors, outputCol=vector_column_name)
        res = assembler.transform(res)
    else:
        res = res.withColumnRenamed(agg_columns_vectors[0], vector_column_name)

    res = drop_columns(res, columns=agg_columns_vectors)

    if return_indexer and len(indexers) > 0:
        return res, indexers.pop()
    else:
        return res
Example No. 16
# create the context
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()

small_data = sc.textFile('graph-small.txt')
full_data = sc.textFile('graph-full.txt')

LAMBDA = 1
NU = 1

source_dest_pair = full_data.map(lambda x: (int(x.split('\t')[0]) - 1, int(x.split('\t')[1]) - 1)).distinct()
edges = source_dest_pair.map(lambda x: (x[0], x[1], 1))
edges_transpose = source_dest_pair.map(lambda x: (x[1], x[0], 1))

L = CoordinateMatrix(edges).toBlockMatrix()
L_transpose = CoordinateMatrix(edges_transpose).toBlockMatrix()

h_init = []

for i in range(1000):
  h_init.append((i, 0, 1))

h = CoordinateMatrix(sc.parallelize(h_init)).toBlockMatrix()

a = None

for i in range(40):

  a_new = L_transpose.multiply(h)
  a_new_max = np.max(np.array(a_new.toLocalMatrix().toArray()))
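
  # assumed continuation (the original snippet is truncated at this point):
  # scale the new authority scores by their maximum, then update the hub scores the same way
  a = CoordinateMatrix(a_new.toCoordinateMatrix().entries.map(
      lambda e: (e.i, e.j, e.value / a_new_max))).toBlockMatrix()

  h_new = L.multiply(a)
  h_new_max = np.max(np.array(h_new.toLocalMatrix().toArray()))
  h = CoordinateMatrix(h_new.toCoordinateMatrix().entries.map(
      lambda e: (e.i, e.j, e.value / h_new_max))).toBlockMatrix()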
Example No. 17
def process(sparkContext,sqlContext):

    print("Building Graph...")
    G_apa = buildGraphAPA()
    
    print("Meta Path...")
    paths = metaPathAPA(G_apa)
     
    print("Training...")
    authorIndex = []
    authorDegree = []
   
    authors = paths[0]
    pathNumber = paths[2]
    pathNumberAarry = []
    for pn in pathNumber.keys():
        pathNumberAarry.append(str(pn)+":"+str(pathNumber.get(pn)))
    
    index = 0
    for author in authors:
        authorDegree.append(str(author)+":"+str(len(G_apa[author])))
        authorIndex.append(str(author)+":"+str(index))
        index = index+1.0

    
#     unique_authors = authors 
     
    authorsRDD = sparkContext.parallelize(authors)
    authorIndex = sparkContext.parallelize(authorIndex)
    pathNumber = sparkContext.parallelize(pathNumberAarry)
    authorDegree = sparkContext.parallelize(authorDegree)
     
    
    
    authors = authorsRDD.collect()

    ai = authorIndex.collect()
    authorIndex = dict()
    for a in ai:
        p = a.split(":")
        authorIndex[p[0]]=p[1]
#     print authorIndex
    
    ad = authorDegree.collect()
    authorDegree = dict()
    for a in ad:
        p = a.split(":")
        authorDegree[p[0]]=p[1]
#     print authorDegree
    
    pn = pathNumber.collect()
    pathNumber = dict()
    for a in pn:
        p = a.split(":")
        pathNumber[p[0]]=p[1]
#     print pathNumber
    
    
    def matEntry(author,authors):
        row = []
#         for author in authors: 
        for a in authors:
            if author == a:
                row.append((long(float(authorIndex[author])),long(float(authorIndex[a])),1.0))
            else:
                key = str(author)+str(a)
                if pathNumber.has_key(key):
                    row.append((long(float(authorIndex[author])),long(float(authorIndex[a])), 2.0*float(pathNumber.get(key))/(float(authorDegree[author])+float(authorDegree[a]))))
                else:
                    row.append((long(float(authorIndex[author])),long(float(authorIndex[a])), 0.0))
        
        return row
    
    def matEntryNoArgs():
        row = []
        for author in authors: 
            for a in authors:
                if author == a:
                    row.append((long(float(authorIndex[author])),long(float(authorIndex[a])),1.0))
                else:
                    key = str(author)+str(a)
                    if pathNumber.has_key(key):
                        row.append((long(float(authorIndex[author])),long(float(authorIndex[a])), 2.0*float(pathNumber.get(key))/(float(authorDegree[author])+float(authorDegree[a]))))
                    else:
                        row.append((long(float(authorIndex[author])),long(float(authorIndex[a])), 0.0))
        
        return row

#     print matEntry() 
    print "memememememememmmmmmemmmm"  
    
    me = authorsRDD.flatMap(lambda author: matEntry(author, authors)).collect()
#     me =  matEntry()
#     me =  matEntryNoArgs()
    print "memememememememmmmmmemmmmOoooooooooooooooo"  
    
    
    
    entries = sparkContext.parallelize(me)
    print "ssssssssssssssss"  
#     # Create an CoordinateMatrix from an RDD of MatrixEntries.
    mat = CoordinateMatrix(entries)
#      
    print mat
#     mat.saveAsTextFile("/home/xuepeng/uts/metapath.txt")
     
    # Get its size.
    print mat.numRows()  # 3
    print mat.numCols()  # 2
Example No. 18
from pyspark.mllib.linalg.distributed import MatrixEntry


if __name__ == "__main__":
    # set up spark context and configuration
    conf = SparkConf().setAppName("PythonPCAOnRowMatrixExample")
    sc = SparkContext(conf=conf)
    print(sc.getConf().getAll())    
    sqlContext = sql.SQLContext(sc)

    # load data
    data = sc.textFile("gs://dataproc-ae279739-4c78-478e-9024-8b7ea842f82e-us/heart1.txt")
    entries = data.map(lambda l: l.split(' ')).map(lambda l: MatrixEntry(np.long(l[0]), np.long(l[1]), np.float(l[2])))

    # create RowMatrix   
    premat = CoordinateMatrix(entries)
    mat = premat.toIndexedRowMatrix()

    print(mat.numCols())
    print(mat.numRows())

    # gramian
    start_time = time.time()
    decomp = mat.computeGramianMatrix()
    elapsedtime = time.time() - start_time
    print(elapsedtime)

    # svd
    start_time = time.time()
    decomp = mat.computeSVD(1000)
    elapsedtime = time.time() - start_time
    print(elapsedtime)
Example No. 19
def sparse_dot_product_cross_join(
    spark: SQLContext,
    output_col: str,
    primary_row_number_col: str,
    primary_vector_col: str,
    primary_df: DataFrame,
    secondary_row_number_col: str,
    secondary_vector_col: str,
    secondary_df: DataFrame,
):
    """Calculate the dot product for every pair of items between
    a column of SparseVectors in the primary dataframe and a
    column of SparseVectors in the secondary dataframe.

    The input dataframes must have a row number attached. This will
    correspond to the row number in the resulting row matrix.
    It does not matter if the row numbers are sequential as long
    as they are unique within their dataframes respectively.

    NOTE: if you are using this function in order to generate cosine similarity
    scores then remember to normalize your input vectors first. This way the
    resulting coordinate matrix will represent the similarity scores."""
    def primary_row_to_coords(row):
        """Convert a sparse vector to a list of coords
        in the format of (row_num, col_num, value)"""
        row_num = row.__getitem__(primary_row_number_col)
        vec = row.__getitem__(primary_vector_col)
        return [(row_num, i, j) for i, j in zip(vec.indices, vec.values)]

    primary_rdd = primary_df.select(F.col(primary_row_number_col),
                                    F.col(primary_vector_col)).rdd.flatMap(
                                        lambda row: primary_row_to_coords(row))

    if primary_rdd.isEmpty():
        raise ValueError(
            "Primary RDD is empty. Cannot perform matrix multiplication")

    primary_rdd.persist(StorageLevel.MEMORY_AND_DISK_SER)

    def secondary_row_to_coords(row):
        """Convert a sparse vector to a list of coords
        in the format of (row_num, col_num, value)"""
        row_num = row.__getitem__(secondary_row_number_col)
        vec = row.__getitem__(secondary_vector_col)
        # IMPORTANT - note that we are actually creating
        # the transpose of the secondary matrix hence
        # why the coordinates are back to front
        return [(i, row_num, j) for i, j in zip(vec.indices, vec.values)]

    secondary_rdd = secondary_df.select(
        F.col(secondary_row_number_col),
        F.col(secondary_vector_col)).rdd.flatMap(
            lambda row: secondary_row_to_coords(row))

    secondary_rdd.persist(StorageLevel.MEMORY_AND_DISK_SER)

    if secondary_rdd.isEmpty():
        raise ValueError(
            "Secondary RDD is empty. Cannot perform matrix multiplication")

    # create the primary coordinate matrix from the coords
    primary_matrix = CoordinateMatrix(primary_rdd)

    log.info(
        "finished creating primary coordinate matrix",
        rows=primary_matrix.numRows(),
        cols=primary_matrix.numCols(),
    )

    # create the secondary coordinate matrix from the coords
    secondary_matrix = CoordinateMatrix(secondary_rdd)

    log.info(
        "finished creating secondary coordinate matrix transpose",
        rows=secondary_matrix.numRows(),
        cols=secondary_matrix.numCols(),
    )
    coords_matrix = multiply_coordinate_matrices(primary_matrix,
                                                 secondary_matrix)

    res = coord_matrix_to_dataframe(
        spark,
        primary_row_number_col,
        secondary_row_number_col,
        output_col,
        coords_matrix,
    )

    primary_rdd.unpersist()
    secondary_rdd.unpersist()

    return res
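
As the docstring above notes, cosine similarity requires L2-normalized inputs; a minimal preprocessing sketch (the dataframes and column names here are hypothetical):

from pyspark.ml.feature import Normalizer

# primary_df / secondary_df are hypothetical dataframes with a row-number column and a vector column
normalizer = Normalizer(inputCol="features", outputCol="norm_features", p=2.0)
primary_normalized = normalizer.transform(primary_df)
secondary_normalized = normalizer.transform(secondary_df)

cosine_df = sparse_dot_product_cross_join(
    spark, "cosine_similarity",
    "primary_row", "norm_features", primary_normalized,
    "secondary_row", "norm_features", secondary_normalized)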
Example No. 20
# create the session
conf = SparkConf().set("spark.ui.port", "4050")

# create the context
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()

small_data = sc.textFile('graph-small.txt')
full_data = sc.textFile('graph-full.txt')
BETA = 0.8

source_dest_pair = full_data.map(lambda x: (int(x.split('\t')[0]) - 1, int(x.split('\t')[1]) - 1)).distinct()
edges = source_dest_pair.map(lambda x: (x[1], x[0], 1))
degrees = source_dest_pair.map(lambda x: (x[0], 1)).reduceByKey(lambda x, y: x + y).map(lambda x: (x[0], x[0], 1 / x[1]))

edge_matrix = CoordinateMatrix(edges).toBlockMatrix()
degree_inverse_matrix = CoordinateMatrix(degrees).toBlockMatrix()

M = edge_matrix.multiply(degree_inverse_matrix)

r_init = []
beta_init = []
teleport_init = []
for i in range(1000):
  r_init.append((i, 0, 1 / 1000))
  beta_init.append((i, i, BETA))
  teleport_init.append((i, 0, (1 - BETA) / 1000))

r = CoordinateMatrix(sc.parallelize(r_init)).toBlockMatrix()
beta = CoordinateMatrix(sc.parallelize(beta_init)).toBlockMatrix()
teleport = CoordinateMatrix(sc.parallelize(teleport_init)).toBlockMatrix()
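
The snippet stops before the iteration itself; a minimal sketch (an assumption, not from the original source) of the PageRank power-iteration step using the matrices built above:

# hypothetical continuation: iterate r <- beta * M * r + teleport
for i in range(40):
  r = beta.multiply(M).multiply(r).add(teleport)

print(r.toLocalMatrix().toArray()[:10])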
Example No. 21

	# Diagonalize RDD  

	diag_entries_1 = coo_matrix_input_all.filter(lambda (row, col, value): col%3 ==0).map(lambda (row, _, value): (row, value)).reduceByKey(lambda x, y: x + y).map(lambda (row,value): (row, 3*(row/3),-value ));
	diag_entries_1.cache()
	diag_entries_2 = coo_matrix_input_all.filter(lambda (row, col, value): col%3 ==1).map(lambda (row, _, value): (row, value)).reduceByKey(lambda x, y: x + y).map(lambda (row,value): (row, 3*(row/3)+1,-value ));
	diag_entries_2.cache()
	diag_entries_3 = coo_matrix_input_all.filter(lambda (row, col, value): col%3 ==2).map(lambda (row, _, value): (row, value)).reduceByKey(lambda x, y: x + y).map(lambda (row,value): (row, 3*(row/3)+2,-value ));
	diag_entries_3.cache()

	diag_entries = diag_entries_1.union(diag_entries_2).union(diag_entries_3);
	
	coo_matrix_input_all  = coo_matrix_input_all.union(diag_entries);
	coo_matrix_entries = coo_matrix_input_all.map(lambda e: MatrixEntry(e[0], e[1], e[2]));
	coo_matrix = CoordinateMatrix(coo_matrix_entries);


	#SAVE TO A FILE
	coo_matrix_input_all.repartition(1).saveAsTextFile("./Laplacian_4v7o_4cores_1")
	t2 = timeit.default_timer()
	print("Elapsed time for construction: {:} s".format(t2 - t0))


	#Singular value decomposition
	
	dataRows = coo_matrix.toRowMatrix().rows

	k = int(args.k) #N_singvalues
	svd = RowMatrix(dataRows.persist()).computeSVD(k, computeU=True)
	U = svd.U # The U factor is a RowMatrix.
Example No. 22
def multiply_coordinate_matrices(left: CoordinateMatrix,
                                 right: CoordinateMatrix):
    """Multiply 2 spark Coordindate Matrices
    without converting either of them into a DenseMatrix.

    NOTE: spark does not provide distributed matrix multiplication of sparse matrices
    for this reason a custom approach has to be used which is discussed here
    https://medium.com/balabit-unsupervised/scalable-sparse-matrix-multiplication-in-apache-spark-c79e9ffc0703
    """
    def key_by_col(x):
        """Take a MatrixEntry of (row, col, val) and
        return a 2-tuple of (col, (row, val))"""
        return (x.j, (x.i, x.value))

    def key_by_row(x):
        """Take a MatrixEntry of (row, col, val) and
        return a 2-tuple of (row, (col, val))"""
        return (x.i, (x.j, x.value))

    left_by_col = left.entries.map(lambda x: key_by_col(x))
    right_by_row = right.entries.map(lambda x: key_by_row(x))

    # Next we perform a row by col matrix multiplication
    # where a shared "key" is used to group entries of the left matrix
    # with COLUMN j and entries of the right matrix with ROW j.
    # Note that entries with the same j will stick together.
    # This should be obvious if you recall that matrix multiplication
    # matches the index of the left column with the index of the right row.
    col_by_row = left_by_col.join(right_by_row)

    def row_by_col_multiplication(x):
        """The input is a key-pair tuple in the following format:
        (key, ((left_row, left_val), (right_col, right_val)))

        the output is a pair of tuples in the following format:
        ((left_row, right_col), (left_val, right_val))

        Note that having finished the grouping we no longer need the shared key anymore,
        (i.e. we no longer need the original indices of the left_col or right_row).
        This is because summed values will go into the output matrix at the
        location (left_row, right_col) and thus we can  regroup by these indices and sum
        """
        return ((x[1][0][0], x[1][1][0]), (x[1][0][1] * x[1][1][1]))

    # multiply elements by the left matrix column and the right matrix row
    products = col_by_row.map(lambda x: row_by_col_multiplication(x))

    # Sum up all the products for a given left_row and right_col
    summed = products.reduceByKey(lambda accum, n: accum + n)

    # unnest the keys so we can convert back to a coordinate matrix
    flattened = summed.map(lambda x: (x[0][0], x[0][1], x[1]))

    res = CoordinateMatrix(flattened)

    log.info(
        "finished creating coord matrix from dot product",
        rows=res.numRows(),
        cols=res.numCols(),
    )
    return res
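
A tiny usage sketch (illustrative only), assuming a live SparkContext `sc` and the structlog-style `log` object used inside the function:

from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry

# A = [[1, 0], [2, 3]] stored sparsely; A * A^T = [[1, 2], [2, 13]]
A = CoordinateMatrix(sc.parallelize(
    [MatrixEntry(0, 0, 1.0), MatrixEntry(1, 0, 2.0), MatrixEntry(1, 1, 3.0)]))
gram = multiply_coordinate_matrices(A, A.transpose())
print(sorted((e.i, e.j, e.value) for e in gram.entries.collect()))
# expected: [(0, 0, 1.0), (0, 1, 2.0), (1, 0, 2.0), (1, 1, 13.0)]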
Example No. 23
#conf = SparkConf().setAppName('linalgtest')
#sc = SparkContext(conf=conf).getOrCreate()

#use local spark on computer
# findspark.init()
#from pyspark.sql import SparkSession

local_file_location = 'file:///wasp/pdb1HYS.mtx.mtx'

rdd = spark.sparkContext.textFile(local_file_location)
rdd = rdd.map(lambda line: line.split(" "))
rdd = rdd.map(
    lambda line: MatrixEntry(int(line[0]), int(line[1]), float(line[2])))

mat = CoordinateMatrix(rdd)
M = mat.toRowMatrix()
A = mat.toBlockMatrix()
At = mat.transpose().toBlockMatrix()

print("SVD")
print(M.numRows(), M.numCols())
start_svd = time.time()

NUM_TIMES = 10
#do it 10 times to get mean
for i in range(NUM_TIMES):
    svd = M.computeSVD(5, computeU=True)

end_svd = time.time()
print("Time elapsed: ", (end_svd - start_svd) /
Example No. 24
import pyspark
from pyspark import SparkContext
from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry
from operator import add
from pyspark.sql import SparkSession

sc = SparkContext()
r=sc.textFile("part-00000")
m=r.flatMap(lambda x: x.split('\n')).filter(lambda x : "A" in x).map(lambda x : (x.strip("A, ")).split(' ')).map(lambda x: tuple(list(map(int, x))))
#n=m.map(lambda x : MatrixEntry(tuple(x)))

spark = SparkSession(sc)
#m.toDF().show()
print(hasattr(m,"toDF"))

cmat=CoordinateMatrix(m)
#mat = CoordinateMatrix(n)
#o=mat.take(5)
print(cmat.numRows()) # 3
print(cmat.numCols())

rowmat = cmat.toRowMatrix()

print(rowmat.numRows()) # 3
print(rowmat.numCols())

Example No. 25
from contextlib import contextmanager

@contextmanager
def time_usage(name=""):
    start = time.time()
    yield
    end = time.time()
    elapsed_seconds = float("%.4f" % (end - start))
    logging.info('%s: elapsed seconds: %s', name, elapsed_seconds)


logging.getLogger().setLevel(logging.INFO)

def to_matrix_entry(x):
    i, j, v = x.split(',')
    return MatrixEntry(i, j, v)

sc = pyspark.SparkContext(appName="Matrix Multiplication")

for i in range(1, 10):
    with time_usage("temps matrix multiplication"):
        matrix_a_raw = sc.textFile(sys.argv[1])
        matrix_b_raw = sc.textFile(sys.argv[2])

        spark = SparkSession(sc)

        entries_a = matrix_a_raw.map(to_matrix_entry)
        entries_b = matrix_b_raw.map(to_matrix_entry)

        mat_a = CoordinateMatrix(entries_a).toBlockMatrix()
        mat_b = CoordinateMatrix(entries_b).toBlockMatrix()

        product = mat_a.multiply(mat_b)
        product.toLocalMatrix()

#for t in result:
    #print('%s, %s, %s' % (t[0], t[1], t[2]))
Example No. 26
debug = Debugger()
debug.TIMESTAMP(1)


def to_matrix_entry(s):
    ss = s.split()
    entry = MatrixEntry(float(ss[0]), float(ss[1]), float(ss[2]))
    return entry


data = sc.textFile('hdfs://node1:9000/input/sqr.txt')
mat = data.map(to_matrix_entry)
rdd = sc.parallelize(mat.collect())

coord_mat = CoordinateMatrix(rdd)
coord_mat = coord_mat.transpose()
row_mat = coord_mat.toRowMatrix()
sim = row_mat.columnSimilarities()
print(sim.entries.take(10))

debug.TIMESTAMP(2)
'''
data = data.map(lambda _ : np.array(_.strip().split()).astype(float))
unitMatrix = data.map(lambda _ : _/np.linalg.norm(_))

#unitMatrix = sc.parallelize(np.array([[1,2,3,5,7], [6,2,1,-1,3], [7,0,1,2,-4]]).T)
mat = RowMatrix(unitMatrix)
S = mat.columnSimilarities()

sims = S.entries.collect()
Example No. 27
spark = SparkSession.builder.appName(
    'imbalanced_binary_classification').getOrCreate()
#new_df = spark.read.option("delimiter", " ").csv('data/1138_bus/1138_bus_no_head.mtx', header=False, inferSchema=True)
#new_df.printSchema()

rdd = sc.textFile('data/1138_bus/1138_bus_no_head.mtx')
rdd = rdd.map(lambda line: line.split(" "))
rdd = rdd.map(lambda line: [float(x) for x in line])

print(rdd.take(2))
#ncol = len(rdd.map(lambda r: r.image).first())
nrows = rdd.count()
ncols = 3
#matrix = Matrices.dense(nrows, ncols, rdd)
print("ncol: %d, nrow %d" % (ncols, nrows))
coord_mat = CoordinateMatrix(rdd.map(tuple))
print("num rows in matrix %d" % coord_mat.numRows())

print("finished using pyspark")
#________________________________________________-

print("now use SparkSession")

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df_2 = spark.read.option("delimiter", " ").csv('./data/lpi_ceria3d_b.mtx',
                                               header=False,
                                               inferSchema=True)
df_2.printSchema()
Example No. 28
def process(sparkContext, sqlContext):

    print("Building Graph...")
    G_apa = buildGraphAPA()

    print("Meta Path...")
    paths = metaPathAPA(G_apa)

    print("Training...")
    authorIndex = []
    authorDegree = []

    authors = paths[0]
    pathNumber = paths[2]
    pathNumberAarry = []
    for pn in pathNumber.keys():
        pathNumberAarry.append(str(pn) + ":" + str(pathNumber.get(pn)))

    index = 0
    for author in authors:
        authorDegree.append(str(author) + ":" + str(len(G_apa[author])))
        authorIndex.append(str(author) + ":" + str(index))
        index = index + 1.0

#     unique_authors = authors

    authorsRDD = sparkContext.parallelize(authors)
    authorIndex = sparkContext.parallelize(authorIndex)
    pathNumber = sparkContext.parallelize(pathNumberAarry)
    authorDegree = sparkContext.parallelize(authorDegree)

    authors = authorsRDD.collect()

    ai = authorIndex.collect()
    authorIndex = dict()
    for a in ai:
        p = a.split(":")
        authorIndex[p[0]] = p[1]
#     print authorIndex

    ad = authorDegree.collect()
    authorDegree = dict()
    for a in ad:
        p = a.split(":")
        authorDegree[p[0]] = p[1]
#     print authorDegree

    pn = pathNumber.collect()
    pathNumber = dict()
    for a in pn:
        p = a.split(":")
        pathNumber[p[0]] = p[1]
#     print pathNumber

    def matEntry(author, authors):
        row = []
        #         for author in authors:
        for a in authors:
            if author == a:
                row.append((long(float(authorIndex[author])),
                            long(float(authorIndex[a])), 1.0))
            else:
                key = str(author) + str(a)
                if pathNumber.has_key(key):
                    row.append((long(float(authorIndex[author])),
                                long(float(authorIndex[a])),
                                2.0 * float(pathNumber.get(key)) /
                                (float(authorDegree[author]) +
                                 float(authorDegree[a]))))
                else:
                    row.append((long(float(authorIndex[author])),
                                long(float(authorIndex[a])), 0.0))

        return row

    def matEntryNoArgs():
        row = []
        for author in authors:
            for a in authors:
                if author == a:
                    row.append((long(float(authorIndex[author])),
                                long(float(authorIndex[a])), 1.0))
                else:
                    key = str(author) + str(a)
                    if pathNumber.has_key(key):
                        row.append((long(float(authorIndex[author])),
                                    long(float(authorIndex[a])),
                                    2.0 * float(pathNumber.get(key)) /
                                    (float(authorDegree[author]) +
                                     float(authorDegree[a]))))
                    else:
                        row.append((long(float(authorIndex[author])),
                                    long(float(authorIndex[a])), 0.0))

        return row

#     print matEntry()

    print "memememememememmmmmmemmmm"

    me = authorsRDD.flatMap(
        lambda author: matEntry(author, authors)).collect()
    #     me =  matEntry()
    #     me =  matEntryNoArgs()
    print "memememememememmmmmmemmmmOoooooooooooooooo"

    entries = sparkContext.parallelize(me)
    print "ssssssssssssssss"
    #     # Create an CoordinateMatrix from an RDD of MatrixEntries.
    mat = CoordinateMatrix(entries)
    #
    print mat
    #     mat.saveAsTextFile("/home/xuepeng/uts/metapath.txt")

    # Get its size.
    print mat.numRows()  # 3
    print mat.numCols()  # 2
Example No. 29
train_lines = train_lines.filter(lambda line: line != header)
# Format Train Data (ItemID, UserID, Rating)
global train_rdd
train_rdd = train_lines.map(lambda line: line.split(',')).map(
    lambda tokens: (int(tokens[0]), int(tokens[1]), float(tokens[2])))

# Build Train Data Dict. with Format [(user, item)] = rating, for later check if the similar movie is rated
global train_dict
train_dict = {}
for x, y, z in train_rdd.collect():
    train_dict[(x, y)] = z

# -----------------------------------------   Build simPdsDF   -----------------------------------------------
# Form utilityMatrix to get simMat later
sqlCon = SQLContext(sc)
utilityMatrix = CoordinateMatrix(train_rdd)
# Similarity Btw. Items
simMat = utilityMatrix.toRowMatrix().columnSimilarities()
# Convert simMat to Pandas format
global simPdsDF
sparkDF = simMat.entries.map(lambda x: str(x.i) + "," + str(x.j) + "," + str(
    x.value)).map(lambda w: w.split(',')).toDF()
simPdsDF = sparkDF.toPandas()
# edit columns' name
simPdsDF.columns = ['ItemID_1', 'ItemID_2', 'Similarity']
# change data type
simPdsDF['ItemID_1'] = simPdsDF['ItemID_1'].astype(int)
simPdsDF['ItemID_2'] = simPdsDF['ItemID_2'].astype(int)
simPdsDF['Similarity'] = simPdsDF['Similarity'].astype(float)

# --------------------------------------- Used for RDD to calculate bias ---------------------------------------------
Example No. 30
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix

indexed = sc.parallelize([
    IndexedRow(1, [4, 5, 6]),
    IndexedRow(2, [7, 8, 9]),
    IndexedRow(3, [10, 11, 12])
])
mat = IndexedRowMatrix(indexed)
print(mat)

# convert to row matrix
rowMat = mat.toRowMatrix()
print(rowMat)

# A CoordinateMatrix is a distributed matrix whose entries are stored in coordinate-list (COO) format.

from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry

entries = sc.parallelize(
    [MatrixEntry(0, 0, 1.2),
     MatrixEntry(1, 0, 2.1),
     MatrixEntry(6, 1, 3.7)])
mat = CoordinateMatrix(entries)

m = mat.numRows()
n = mat.numCols()

print(m)
print(n)

# convert to indexed row matrix
rowMat = mat.toIndexedRowMatrix()
print(rowMat)
Example No. 31
 def transpose(rm):
     cm = CoordinateMatrix(rm.rows.zipWithIndex().flatMap(
         lambda x: [MatrixEntry(x[1], j, v) for j, v in enumerate(x[0])]))
     return cm.transpose().toRowMatrix()
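
A brief usage sketch (illustrative, assuming a live SparkContext `sc` and that CoordinateMatrix / MatrixEntry are imported as in the other examples); the row-ordering caveat from Example No. 6 applies to this approach as well:

from pyspark.mllib.linalg.distributed import RowMatrix

rm = RowMatrix(sc.parallelize([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]))
rm_t = transpose(rm)
print(rm_t.numRows(), rm_t.numCols())  # expected: 2 3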
Example No. 32
txt = txt.sample(False, 0.001, 1)  # XXX: random sample for local testing
txt = txt.zipWithIndex().filter(lambda x: int(x[1]) >= 4).map(
    lambda x: x[0].split('\t'))

## Get graph Laplacian
N = txt.flatMap(lambda x: [int(xx) for xx in x]).max()

upper_entries = txt.map(
    lambda x: MatrixEntry(int(x[0]) - 1,
                          int(x[1]) - 1, 1.0))
lower_entries = txt.map(
    lambda x: MatrixEntry(int(x[1]) - 1,
                          int(x[0]) - 1, 1.0))
degrees = upper_entries.map(lambda entry: (entry.i, entry.value)).reduceByKey(
    lambda a, b: a + b)
W = CoordinateMatrix(upper_entries.union(lower_entries), numCols=N, numRows=N)

# XXX:
laplacian = sys.argv[1]

if laplacian == 'unnormalized':
    entries = degrees.map(lambda x: MatrixEntry(x[0], x[0], x[1]))
    D = CoordinateMatrix(entries, numCols=N, numRows=N)
    L = D.toBlockMatrix().subtract(W.toBlockMatrix()).toCoordinateMatrix()
elif laplacian == 'normalized':
    entries = degrees.map(lambda x: MatrixEntry(x[0], x[0], 1 / x[1]))
    D_inv = CoordinateMatrix(entries, numCols=N, numRows=N).toBlockMatrix()
    I = CoordinateMatrix(sc.range(N).map(lambda i: MatrixEntry(i, i, 1.0)),
                         numCols=N,
                         numRows=N).toBlockMatrix()
    L = I.subtract(D_inv.multiply(W.toBlockMatrix())).toCoordinateMatrix()