Example #1
    def edgesToSDM(self, truncated_vertices):
        self.edges.createOrReplaceTempView("edges")
        truncated_vertices.createOrReplaceTempView("vertices")
        real_edges = self.spark.sql(
            "select * from edges where edges.src in (select id from vertices) or edges.dst in (select id from vertices)"
        ).persist(StorageLevel.MEMORY_AND_DISK)
        real_edges.createOrReplaceTempView("real_edges")
        noedge_vertices = self.spark.sql(
            "select * from vertices where vertices.id not in (select real_edges.src from real_edges) and vertices.id not in(select real_edges.dst from real_edges)"
        )
        # cross join isolated vertices with every other vertex to create artificial edges
        arti_edges = noedge_vertices.withColumnRenamed("id", "src").crossJoin(
            truncated_vertices.select("id").withColumnRenamed("id", "dst"))
        arti_edges = arti_edges.filter(F.col('src') != F.col('dst'))

        # src to dst
        entries_1 = real_edges.rdd.map(
            lambda row: MatrixEntry(row.src, row.dst, 1))
        # dst to src
        entries_2 = real_edges.rdd.map(
            lambda row: MatrixEntry(row.dst, row.src, 1))
        # self transition
        entries_3 = truncated_vertices.select("id").rdd.map(
            lambda row: MatrixEntry(row.id, row.id, 1))
        # artificial edges (no self-loops): randomly redistribute the importance of
        # otherwise isolated vertices across the rest of the graph
        entries_4 = arti_edges.rdd.map(
            lambda row: MatrixEntry(row.src, row.dst, 1))

        entries = entries_1.union(entries_2.union(
            entries_3.union(entries_4))).persist(StorageLevel.MEMORY_AND_DISK)
        size = truncated_vertices.count()
        self.edges_sdm = sdm.SparseDistributedMatrix(entries, size, size)
        return self.edges_sdm
def do_cartesian(sc, df, id_col=None, feature_col=None, **kwargs):
    import functools

    sigma = kwargs.get('sigma', 0.42)
    tol = kwargs.get('tol', 10e-10)
    standardize = kwargs.get('standardize', True)

    if isinstance(feature_col, list):
        feature_col, scaled_df = _make_feature_vector(df=df, feature_col=feature_col)
    else:
        scaled_df = df

    if standardize:
        scaled_df = _scale_data_frame(scaled_df, vector=feature_col)

    if id_col:
        vector_dict = scaled_df.select(id_col, feature_col).rdd.collectAsMap()
    else:
        vector_dict = (scaled_df.select(feature_col)
            .rdd.zipWithIndex().map(lambda x: (x[1], x[0][feature_col]))
            .collectAsMap())
    bc_vec = sc.broadcast(vector_dict)

    index_rdd = df.rdd.map(lambda x: x[id_col]).cache()
    bfs = functools.partial(_compute_bfs)
    cartesian_demon = index_rdd.cartesian(index_rdd).filter(lambda x: x[0] >= x[1])
    cartesian_distance_demon = cartesian_demon.map(
        lambda x: MatrixEntry(x[0], x[1], bfs(
            vec_1=bc_vec.value.get(x[0]),
            vec_2=bc_vec.value.get(x[1]),
            sigma=sigma))
    )

    index_rdd.unpersist() # Memory cleanup!
    tol_cut = functools.partial(_tolerance_cut, tol=tol)
    return cartesian_distance_demon.filter(lambda x: tol_cut(x.value))
Example #3
def build_matrix(svo_path: str, cat1_instances: set,
                 cat2_instances: set) -> CoordinateMatrix:
    raw_df = spark.read.csv(svo_path, sep='\t')

    pairs_df = (raw_df.filter(
        (f.col('_c0').isin(cat1_instances)
         & f.col('_c2').isin(cat2_instances))
        | (f.col('_c0').isin(cat2_instances)
           & f.col('_c2').isin(cat1_instances))).rdd.map(lambda x: (tuple(
               sorted((x['_c0'], x['_c2']))), x['_c1'], int(x['_c3']))).toDF(
                   ['pair', 'verb', 'n']))

    named_coords = (pairs_df.selectExpr('pair', 'verb as left_verb', 'n').join(
        pairs_df.selectExpr('pair', 'verb as right_verb'),
        'pair').filter('left_verb < right_verb').groupby(
            ['left_verb', 'right_verb']).count())

    verb_to_id = (pairs_df.select('verb').distinct().rdd.zipWithIndex().map(
        lambda r: [r[0].verb, r[1]]).toDF(['verb', 'id']))

    coords = (named_coords.join(
        verb_to_id, named_coords.left_verb == verb_to_id.verb).selectExpr(
            'right_verb', 'id as left_verb_id', 'count').join(
                verb_to_id,
                named_coords.right_verb == verb_to_id.verb).selectExpr(
                    'left_verb_id', 'id as right_verb_id', 'count'))

    matrix = CoordinateMatrix(coords.rdd.map(lambda c: MatrixEntry(*c)))

    return matrix
Example #4
def MatrixTranspose(mat):
    # Known issues:
    # 1. Fails for some data for unclear reasons; reducing the number of rows can help.
    # 2. The transpose sometimes returns wrong results, apparently due to partitioning
    #    (repartition(1) sometimes fixes it). PySpark also changes the order of rows when a
    #    transposed CoordinateMatrix is converted to a RowMatrix, see:
    #    https://stackoverflow.com/questions/34451253/converting-coordinatematrix-to-rowmatrix-doesnt-preserve-row-order
    #    Using an indexed matrix and reordering by index partially works around this.
    '''
    Transpose a row matrix. To save space/memory the result uses sparse vectors.
    :param mat: the input row matrix
    :return: the transposed matrix as an IndexedRowMatrix
    ref: https://stackoverflow.com/questions/47102378/transpose-a-rowmatrix-in-pyspark
    '''
    if isinstance(mat, IndexedRowMatrix):
        mat = mat.toRowMatrix()
    # This turns every row into dense matrix entries; avoid this function where efficiency matters.
    transposed_mat = CoordinateMatrix(mat.rows.zipWithIndex().flatMap(
        lambda x: [MatrixEntry(x[1], j, v) for j, v in enumerate(x[0])]))
    transposed_mat = transposed_mat.transpose().toIndexedRowMatrix().rows.toDF(
    ).orderBy("index")
    # convert each row back to a sparse vector, then wrap in an IndexedRowMatrix
    def _to_sparse_row(row):
        dense = row["vector"].toArray()
        nz = np.nonzero(dense)[0]
        return IndexedRow(row["index"],
                          MLLibVectors.sparse(dense.size, nz, dense[nz]))

    transposed_mat = transposed_mat.rdd.map(_to_sparse_row)
    return IndexedRowMatrix(transposed_mat)
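# Usage sketch (not from the original source): assumes an active SparkContext `sc`, a
# SparkSession, and the same names used above (IndexedRowMatrix, IndexedRow, MLLibVectors,
# numpy as np). The tiny 2x3 matrix is made up for illustration.
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix

small = IndexedRowMatrix(sc.parallelize([
    IndexedRow(0, [1.0, 0.0, 2.0]),
    IndexedRow(1, [0.0, 3.0, 0.0]),
]))
small_t = MatrixTranspose(small)
print(small_t.numRows(), small_t.numCols())   # expected: 3 2
for row in small_t.rows.collect():
    print(row.index, row.vector)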
def readRatings(spark, f_name, ratio=[0.8, 0.2], seed=0):
    """ Read the rating of users for movies 
        Return the utility matrix"""
    df = spark.read.csv(f_name, header=True)
    #df = normalize(spark, df)
    rdd = df.rdd

    (training, test) = df.randomSplit(ratio, seed=seed)

    # users_total and movies_total are assumed to be defined at module scope
    training_utility = CoordinateMatrix(
        training.rdd.map(lambda row: MatrixEntry(row['userId'], row[
            'movieId'], row['rating'])), users_total, movies_total)
    test_utility = CoordinateMatrix(
        test.rdd.map(lambda row: MatrixEntry(row['userId'], row[
            'movieId'], row['rating'])), users_total, movies_total)

    return (training_utility, test_utility)
    def setUp(self):
        spark = SparkSession(sparkContext=self.sc)
        y = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1], [0.5, 0.5, 0.5], [0.5, 0.5, 0.5], [0.5, 0.5, 0.5]])
        X_triangle = np.array([[1, 0, 0, 0, 0, 0], [2, 1, 0, 0, 0, 0],
                               [3, 4, 1, 0, 0, 0], [5, 6, 7, 1, 0, 0],
                               [1, 4, 2, 1, 1, 0], [1, 1, 1, 1, 1, 1]])
        self.y_shape = y.shape
        self.longMessage = True
        self.X_shape = X_triangle.shape
        self.X_real = X_triangle+X_triangle.T-np.eye(6)
        self.product = self.X_real.dot(y)
        self.rdd_y = (self.sc.parallelize(y)
            .map(lambda x: x.tolist()).map(lambda x: list(enumerate(x)))
            .zipWithIndex()
            .flatMap(lambda x: [MatrixEntry(i=x[1], j=jdx, value=val) for jdx, val in x[0]])
            .filter(lambda x: x.value != 0.))

        self.rdd_X = (self.sc.parallelize(X_triangle)
            .map(lambda x: x.tolist()).map(lambda x: list(enumerate(x)))
            .zipWithIndex()
            .flatMap(lambda x: [MatrixEntry(i=x[1], j=jdx, value=val) for jdx, val in x[0]])
            .filter(lambda x: x.value != 0.))
Example #7
 def multiply(self, B):
     a,b = self._pre_arithmetic_op(self, B)
     c = a.union(b).groupByKey().map(
         lambda x : MatrixEntry(x[0][0],x[0][1], x[1].data[0] * x[1].data[1] if len(x[1].data) == 2 else 0) 
     )
     
     return SparseDistributedMatrix(c, self.numRows(), self.numCols())
 
 # def multiply(self, b:float):
 #     c = self.entries.map(
 #         lambda entry : MatrixEntry(entry.i, entry.j, entry.value * b) 
 #     )
     
 #     return SparseDistributedMatrix(self.sc, c, self.numRows(), self.numCols())
Example #8
 def _dot1(self, B):
     if self.numCols() != B.numRows():
         raise Exception(f"size mismatch {(self.numRows(), self.numCols())}, {(B.numRows(), B.numCols())}")
     a = self._pre_dot(self, 'row')
     b = self._pre_dot(B, 'col')
     
     
     c = a.cartesian(b).map(
         lambda x: MatrixEntry(x[0][0], x[1][0], x[0][1].dot(x[1][1]))
     ).filter(
         lambda entry: entry.value != 0.0
     )
         
     return SparseDistributedMatrix(c, self.numRows(), B.numCols())
Example #9
 def newW(R, W, H):
     # multiplicative update: W = W * (R.dot(H.T)) / (W.dot(H).dot(H.T))
     a = R.multiply(H.transpose()).toCoordinateMatrix()\
      .entries.map(lambda entry: ((entry.i, entry.j), (0, entry.value)))
     b = W.multiply(H).multiply(H.transpose()).toCoordinateMatrix()\
      .entries.map(lambda entry: ((entry.i, entry.j), (1, entry.value)))
     # divide the numerator (tag 0) by the denominator (tag 1), whichever order they arrive in
     c = a.union(b).reduceByKey(
         lambda a, b: (2, a[1] / b[1]) if a[0] == 0 else (2, b[1] / a[1]))
     c = c.map(lambda x: ((x[0][0], x[0][1]), x[1][1]))
     # multiply the ratio elementwise by the current W
     d = c.join(W.toCoordinateMatrix().entries.map(
         lambda entry: ((entry.i, entry.j), entry.value)))\
      .mapValues(lambda v: v[0] * v[1])
     return CoordinateMatrix(
         d.map(lambda x: MatrixEntry(x[0][0], x[0][1], x[1]))).toBlockMatrix()
def get_vectors_df(playcounts_df):
    """
    Each row of playcounts_df has the following columns: recording_id, spark_user_id and a play count denoting how many times
    a user has played that recording. However, the correlation matrix requires a dataframe having a column of user
    vectors. Spark has various representations built-in for storing sparse matrices. Of these, two are Coordinate
    Matrix and Indexed Row Matrix. A coordinate matrix stores the matrix as tuples of (i, j, x) where matrix[i, j] = x.
    An Indexed Row Matrix stores it as tuples of row index and vectors.

    Our playcounts_df is similar in structure to a coordinate matrix. We begin with mapping each row of the
    playcounts_df to a MatrixEntry and then create a matrix of these entries. The recording_ids are rows, user_ids are
    columns and the playcounts are the values in the matrix. We convert the coordinate matrix to indexed row matrix
    form. Spark ML and MLlib have different representations of vectors, hence we need to manually convert between the
    two. Finally, we take the rows and create a dataframe from them.
    """
    tuple_mapped_rdd = playcounts_df.rdd.map(lambda x: MatrixEntry(x["recording_id"], x["spark_user_id"], x["count"]))
    coordinate_matrix = CoordinateMatrix(tuple_mapped_rdd)
    indexed_row_matrix = coordinate_matrix.toIndexedRowMatrix()
    vectors_mapped_rdd = indexed_row_matrix.rows.map(lambda r: (r.index, r.vector.asML()))
    return listenbrainz_spark.session.createDataFrame(vectors_mapped_rdd, ['index', 'vector'])
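# Illustration only (not from the original source): the same CoordinateMatrix ->
# IndexedRowMatrix -> DataFrame-of-vectors path described in the docstring, using a plain
# SparkSession instead of listenbrainz_spark; the toy ids/counts below are made up.
from pyspark.sql import SparkSession
from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry

spark = SparkSession.builder.appName("vectors-demo").getOrCreate()
playcounts = spark.createDataFrame(
    [(0, 0, 3), (0, 1, 1), (2, 1, 5)],
    ["recording_id", "spark_user_id", "count"])
entries = playcounts.rdd.map(
    lambda x: MatrixEntry(x["recording_id"], x["spark_user_id"], x["count"]))
indexed_rows = CoordinateMatrix(entries).toIndexedRowMatrix().rows
# MLlib vectors must be converted to ML vectors before building the DataFrame
vectors_df = spark.createDataFrame(
    indexed_rows.map(lambda r: (r.index, r.vector.asML())), ["index", "vector"])
vectors_df.show(truncate=False)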
Example #11
def matrix_multiply(A, B):

    '''
    Returns the matrix product of two matrices represented in CoordinateMatrix format.
    It is implemented with simple joins, following the Scala implementation in the link below:
    https://medium.com/balabit-unsupervised/scalable-sparse-matrix-multiplication-in-apache-spark-c79e9ffc0703

    A: CoordinateMatrix
    B: CoordinateMatrix
    returns: CoordinateMatrix holding the product of A and B
    '''
    from operator import add  # used by reduceByKey below

    A_rdd = A.entries.map(lambda x: (x.j, (x.i, x.value)))  # entries of A keyed by column: (col, (row, value))
    B_rdd = B.entries.map(lambda x: (x.i, (x.j, x.value)))  # entries of B keyed by row: (row, (col, value))

    interm_rdd = A_rdd.join(B_rdd).map(lambda x: ((x[1][0][0],x[1][1][0]),(x[1][0][1]*x[1][1][1]))) # Join two rdds and convert to ((row,column),(value))
    C_rdd = interm_rdd.reduceByKey(add).map(lambda x: MatrixEntry(x[0][0],x[0][1],x[1])) # Add the product of same (row,column) pair and convert each row into a matrix entry of (row, column, value)
    return CoordinateMatrix(C_rdd)
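# Sanity check (not from the original source): multiply two small made-up matrices with
# matrix_multiply and compare against the dense product; assumes an active SparkContext `sc`.
import numpy as np
from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry

A = CoordinateMatrix(sc.parallelize(
    [MatrixEntry(0, 0, 1.0), MatrixEntry(0, 1, 2.0), MatrixEntry(1, 1, 3.0)]))
B = CoordinateMatrix(sc.parallelize(
    [MatrixEntry(0, 0, 4.0), MatrixEntry(1, 0, 5.0), MatrixEntry(1, 1, 6.0)]))
C = matrix_multiply(A, B)
print(sorted((e.i, e.j, e.value) for e in C.entries.collect()))
# expected nonzeros of [[1,2],[0,3]] x [[4,0],[5,6]] = [[14,12],[15,18]]
print(np.array([[1, 2], [0, 3]]).dot(np.array([[4, 0], [5, 6]])))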
Example #12
 def transpose(rm):
     cm = CoordinateMatrix(rm.rows.zipWithIndex().flatMap(
         lambda x: [MatrixEntry(x[1], j, v) for j, v in enumerate(x[0])]))
     return cm.transpose().toRowMatrix()
Example #13
 def mapFuncJ(entry):
     return MatrixEntry(entry.j, entry.i + self.numDimI * entry.k,
                        entry.val)
Example #14
 def mapFuncK(entry):
     return MatrixEntry(entry.k, entry.j + self.numDimJ * entry.i,
                        entry.val)
import time

from pyspark import SparkConf, SparkContext, sql
from pyspark.mllib.linalg.distributed import DenseMatrix
from pyspark.mllib.linalg.distributed import RowMatrix
from pyspark.mllib.linalg.distributed import CoordinateMatrix
from pyspark.mllib.linalg.distributed import MatrixEntry


if __name__ == "__main__":
    # set up spark context and configuration
    conf = SparkConf().setAppName("PythonPCAOnRowMatrixExample")
    sc = SparkContext(conf=conf)
    print(sc.getConf().getAll())    
    sqlContext = sql.SQLContext(sc)

    # load data
    data = sc.textFile("gs://dataproc-ae279739-4c78-478e-9024-8b7ea842f82e-us/heart1.txt")
    entries = data.map(lambda l: l.split(' ')).map(lambda l: MatrixEntry(int(l[0]), int(l[1]), float(l[2])))

    # create an IndexedRowMatrix via a CoordinateMatrix
    premat = CoordinateMatrix(entries)
    mat = premat.toIndexedRowMatrix()

    print(mat.numCols())
    print(mat.numRows())

    # gramian
    start_time = time.time()
    decomp = mat.computeGramianMatrix()
    elapsedtime = time.time() - start_time
    print(elapsedtime)

    # svd
Example #16
 def mapFuncI(entry):
     return MatrixEntry(entry.i, entry.k + self.numDimK * entry.j,
                        entry.val)
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix

# An IndexedRowMatrix is built from an RDD of IndexedRows.
indexed = sc.parallelize([
    IndexedRow(0, [1, 2, 3]),
    IndexedRow(1, [4, 5, 6]),
    IndexedRow(2, [7, 8, 9]),
    IndexedRow(3, [10, 11, 12])
])
mat = IndexedRowMatrix(indexed)
print(mat)

# convert to row matrix
rowMat = mat.toRowMatrix()
print(rowMat)

# A CoordinateMatrix is a distributed matrix stored in coordinate-list (COO) format:
# an RDD of MatrixEntry(i, j, value) tuples.

from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry

entries = sc.parallelize(
    [MatrixEntry(0, 0, 1.2),
     MatrixEntry(1, 0, 2.1),
     MatrixEntry(6, 1, 3.7)])
mat = CoordinateMatrix(entries)

m = mat.numRows()
n = mat.numCols()

print(m)
print(n)

# convert to indexed row matrix
rowMat = mat.toIndexedRowMatrix()
print(rowMat)
Example #18
def to_matrix_entry(x):
    i, j, v = x.split()
    return MatrixEntry(i, j, v)
Example #19
 def outer(self, v):
     c = self.rdd.cartesian(v.rdd).map(lambda x: MatrixEntry(
         x[0][0], x[1][0], float(x[0][1] * x[1][1]))).filter(
             lambda entry: entry.value != 0.0)
     return sdm.SparseDistributedMatrix(c, self.size, v.size)
Example #20
 def diag(vect):
     c = vect.rdd.map(
         lambda entry : MatrixEntry(entry[0], entry[0], entry[1])
     ) 
     return SparseDistributedMatrix(c, vect.size, vect.size)
Example #21
 def transpose(self):
     entries = self.entries.map(
         lambda entry: MatrixEntry(entry.j, entry.i, entry.value)
     )
     return SparseDistributedMatrix(entries, self.numCols(), self.numRows())
Example #22
	coo_matrix_input_all.cache()


	# Diagonalize RDD: for each row, sum the entries in each column class (col mod 3)
	# and add the negated sum at the corresponding diagonal-block position.
	diag_entries_1 = coo_matrix_input_all.filter(lambda e: e[1] % 3 == 0) \
		.map(lambda e: (e[0], e[2])).reduceByKey(lambda x, y: x + y) \
		.map(lambda rv: (rv[0], 3 * (rv[0] // 3), -rv[1]))
	diag_entries_1.cache()
	diag_entries_2 = coo_matrix_input_all.filter(lambda e: e[1] % 3 == 1) \
		.map(lambda e: (e[0], e[2])).reduceByKey(lambda x, y: x + y) \
		.map(lambda rv: (rv[0], 3 * (rv[0] // 3) + 1, -rv[1]))
	diag_entries_2.cache()
	diag_entries_3 = coo_matrix_input_all.filter(lambda e: e[1] % 3 == 2) \
		.map(lambda e: (e[0], e[2])).reduceByKey(lambda x, y: x + y) \
		.map(lambda rv: (rv[0], 3 * (rv[0] // 3) + 2, -rv[1]))
	diag_entries_3.cache()

	diag_entries = diag_entries_1.union(diag_entries_2).union(diag_entries_3)

	coo_matrix_input_all = coo_matrix_input_all.union(diag_entries)
	coo_matrix_entries = coo_matrix_input_all.map(lambda e: MatrixEntry(e[0], e[1], e[2]))
	coo_matrix = CoordinateMatrix(coo_matrix_entries)


	#SAVE TO A FILE
	coo_matrix_input_all.repartition(1).saveAsTextFile("./Laplacian_4v7o_4cores_1")
	t2 = timeit.default_timer()
	print("Elapsed time for construction: {:} s".format(t2 - t0))


	#Singular value decomposition
	
	dataRows = coo_matrix.toRowMatrix().rows

	k = int(args.k) #N_singvalues
	svd = RowMatrix(dataRows.persist()).computeSVD(k, computeU=True)
Example #23
def to_matrix_entry(s):
    ss = s.split()
    entry = MatrixEntry(float(ss[0]), float(ss[1]), float(ss[2]))
    return entry
Example #24
        .appName("linalgtest")\
        .getOrCreate()

#conf = SparkConf().setAppName('linalgtest')
#sc = SparkContext(conf=conf).getOrCreate()

#use local spark on computer
# findspark.init()
#from pyspark.sql import SparkSession

local_file_location = 'file:///wasp/pdb1HYS.mtx.mtx'

rdd = spark.sparkContext.textFile(local_file_location)
rdd = rdd.map(lambda line: line.split(" "))
rdd = rdd.map(
    lambda line: MatrixEntry(int(line[0]), int(line[1]), float(line[2])))

mat = CoordinateMatrix(rdd)
M = mat.toRowMatrix()
A = mat.toBlockMatrix()
At = mat.transpose().toBlockMatrix()

print("SVD")
print(M.numRows(), M.numCols())
start_svd = time.time()

NUM_TIMES = 10
#do it 10 times to get mean
for i in range(NUM_TIMES):
    svd = M.computeSVD(5, computeU=True)
Example #25
import sys

K = 5

## Read data.
txt = sc.textFile('./data/com-amazon.ungraph.txt')
txt = txt.sample(False, 0.001, 1)  # XXX: random sample for local testing
txt = txt.zipWithIndex().filter(lambda x: int(x[1]) >= 4).map(
    lambda x: x[0].split('\t'))

## Get graph Laplacian
N = txt.flatMap(lambda x: [int(xx) for xx in x]).max()

upper_entries = txt.map(
    lambda x: MatrixEntry(int(x[0]) - 1,
                          int(x[1]) - 1, 1.0))
lower_entries = txt.map(
    lambda x: MatrixEntry(int(x[1]) - 1,
                          int(x[0]) - 1, 1.0))
degrees = upper_entries.map(lambda entry: (entry.i, entry.value)).reduceByKey(
    lambda a, b: a + b)
W = CoordinateMatrix(upper_entries.union(lower_entries), numCols=N, numRows=N)

# XXX:
laplacian = sys.argv[1]

if laplacian == 'unnormalized':
    entries = degrees.map(lambda x: MatrixEntry(x[0], x[0], x[1]))
    D = CoordinateMatrix(entries, numCols=N, numRows=N)
    L = D.toBlockMatrix().subtract(W.toBlockMatrix()).toCoordinateMatrix()
elif laplacian == 'normalized':