Example #1
    def edgesToSDM(self, truncated_vertices):
        self.edges.createOrReplaceTempView("edges")
        truncated_vertices.createOrReplaceTempView("vertices")
        real_edges = self.spark.sql(
            "select * from edges where edges.src in (select id from vertices) or edges.dst in (select id from vertices)"
        ).persist(StorageLevel.MEMORY_AND_DISK)
        real_edges.createOrReplaceTempView("real_edges")
        noedge_vertices = self.spark.sql(
            "select * from vertices where vertices.id not in (select real_edges.src from real_edges) and vertices.id not in(select real_edges.dst from real_edges)"
        )
        # cross join isolated vertices with every other vertex to create artificial edges
        arti_edges = noedge_vertices.withColumnRenamed("id", "src").crossJoin(
            truncated_vertices.select("id").withColumnRenamed("id", "dst"))
        arti_edges = arti_edges.filter(F.col('src') != F.col('dst'))

        # src to dst
        entries_1 = real_edges.rdd.map(
            lambda row: MatrixEntry(row.src, row.dst, 1))
        # dst to src
        entries_2 = real_edges.rdd.map(
            lambda row: MatrixEntry(row.dst, row.src, 1))
        # self transition
        entries_3 = truncated_vertices.select("id").rdd.map(
            lambda row: MatrixEntry(row.id, row.id, 1))
        # artificial edges (no self-loops): randomly redistribute the importance of
        # otherwise isolated vertices across the rest of the graph
        entries_4 = arti_edges.rdd.map(
            lambda row: MatrixEntry(row.src, row.dst, 1))

        entries = entries_1.union(entries_2.union(
            entries_3.union(entries_4))).persist(StorageLevel.MEMORY_AND_DISK)
        size = truncated_vertices.count()
        self.edges_sdm = sdm.SparseDistributedMatrix(entries, size, size)
        return self.edges_sdm
def do_cartesian(sc, df, id_col=None, feature_col=None, **kwargs):
    import functools

    sigma = kwargs.get('sigma', 0.42)
    tol = kwargs.get('tol', 10e-10)
    standardize = kwargs.get('standardize', True)

    if isinstance(feature_col, list):
        feature_col, scaled_df = _make_feature_vector(df=df, feature_col=feature_col)
    else:
        scaled_df = df

    if standardize:
        scaled_df = _scale_data_frame(scaled_df, vector=feature_col)

    if id_col:
        vector_dict = scaled_df.select(id_col, feature_col).rdd.collectAsMap()
    else:
        vector_dict = (scaled_df.select(feature_col)
            .rdd.zipWithIndex().map(lambda x: (x[1], x[0][feature_col]))
            .collectAsMap())
    bc_vec = sc.broadcast(vector_dict)

    index_rdd = df.rdd.map(lambda x: x[id_col]).cache()
    bfs = functools.partial(_compute_bfs)
    cartesian_demon = index_rdd.cartesian(index_rdd).filter(lambda x: x[0] >= x[1])
    cartesian_distance_demon = cartesian_demon.map(
        lambda x: MatrixEntry(x[0], x[1], bfs(
            vec_1=bc_vec.value.get(x[0]),
            vec_2=bc_vec.value.get(x[1]),
            sigma=sigma))
    )

    index_rdd.unpersist() # Memory cleanup!
    tol_cut = functools.partial(_tolerance_cut, tol=tol)
    return cartesian_distance_demon.filter(lambda x: tol_cut(x.value))
Example #3
def build_matrix(svo_path: str, cat1_instances: set,
                 cat2_instances: set) -> CoordinateMatrix:
    raw_df = spark.read.csv(svo_path, sep='\t')

    pairs_df = (raw_df.filter(
        (f.col('_c0').isin(cat1_instances)
         & f.col('_c2').isin(cat2_instances))
        | (f.col('_c0').isin(cat2_instances)
           & f.col('_c2').isin(cat1_instances))).rdd.map(lambda x: (tuple(
               sorted((x['_c0'], x['_c2']))), x['_c1'], int(x['_c3']))).toDF(
                   ['pair', 'verb', 'n']))

    named_coords = (pairs_df.selectExpr('pair', 'verb as left_verb', 'n').join(
        pairs_df.selectExpr('pair', 'verb as right_verb'),
        'pair').filter('left_verb < right_verb').groupby(
            ['left_verb', 'right_verb']).count())

    verb_to_id = (pairs_df.select('verb').distinct().rdd.zipWithIndex().map(
        lambda r: [r[0].verb, r[1]]).toDF(['verb', 'id']))

    coords = (named_coords.join(
        verb_to_id, named_coords.left_verb == verb_to_id.verb).selectExpr(
            'right_verb', 'id as left_verb_id', 'count').join(
                verb_to_id,
                named_coords.right_verb == verb_to_id.verb).selectExpr(
                    'left_verb_id', 'id as right_verb_id', 'count'))

    matrix = CoordinateMatrix(coords.rdd.map(lambda c: MatrixEntry(*c)))

    return matrix
Example #4
def MatrixTranspose(mat):
    # Known issues:
    # 1. Fails for some data for unclear reasons; reducing the number of rows can help.
    # 2. The transpose sometimes returns wrong results, apparently due to partitioning
    #    (repartition(1) sometimes fixes it). PySpark also changes the order of rows when a
    #    transposed CoordinateMatrix is converted to a RowMatrix, see:
    #    https://stackoverflow.com/questions/34451253/converting-coordinatematrix-to-rowmatrix-doesnt-preserve-row-order
    #    Using an indexed matrix and reordering by index partially works around this.
    '''
    Transpose a row matrix. To save space/memory the result uses sparse vectors.
    :param mat: the input row matrix
    :return: the transposed matrix as an IndexedRowMatrix
    ref: https://stackoverflow.com/questions/47102378/transpose-a-rowmatrix-in-pyspark
    '''
    if isinstance(mat, IndexedRowMatrix):
        mat = mat.toRowMatrix()
    # This turns every row into dense matrix entries; avoid this function where efficiency matters.
    transposed_mat = CoordinateMatrix(mat.rows.zipWithIndex().flatMap(
        lambda x: [MatrixEntry(x[1], j, v) for j, v in enumerate(x[0])]))
    transposed_mat = transposed_mat.transpose().toIndexedRowMatrix().rows.toDF(
    ).orderBy("index")
    # convert each row back to a sparse vector, then wrap in an IndexedRowMatrix
    def _to_sparse_row(row):
        dense = row["vector"].toArray()
        nz = np.nonzero(dense)[0]
        return IndexedRow(row["index"],
                          MLLibVectors.sparse(dense.size, nz, dense[nz]))

    transposed_mat = transposed_mat.rdd.map(_to_sparse_row)
    return IndexedRowMatrix(transposed_mat)
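# Usage sketch (not from the original source): assumes an active SparkContext `sc`, a
# SparkSession, and the same names used above (IndexedRowMatrix, IndexedRow, MLLibVectors,
# numpy as np). The tiny 2x3 matrix is made up for illustration.
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix

small = IndexedRowMatrix(sc.parallelize([
    IndexedRow(0, [1.0, 0.0, 2.0]),
    IndexedRow(1, [0.0, 3.0, 0.0]),
]))
small_t = MatrixTranspose(small)
print(small_t.numRows(), small_t.numCols())   # expected: 3 2
for row in small_t.rows.collect():
    print(row.index, row.vector)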
def readRatings(spark, f_name, ratio=[0.8, 0.2], seed=0):
    """ Read the rating of users for movies 
        Return the utility matrix"""
    df = spark.read.csv(f_name, header=True)
    #df = normalize(spark, df)
    rdd = df.rdd

    (training, test) = df.randomSplit(ratio, seed=seed)

    # users_total and movies_total are assumed to be defined at module scope
    training_utility = CoordinateMatrix(
        training.rdd.map(lambda row: MatrixEntry(row['userId'], row[
            'movieId'], row['rating'])), users_total, movies_total)
    test_utility = CoordinateMatrix(
        test.rdd.map(lambda row: MatrixEntry(row['userId'], row[
            'movieId'], row['rating'])), users_total, movies_total)

    return (training_utility, test_utility)
    def setUp(self):
        spark = SparkSession(sparkContext=self.sc)
        y = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1], [0.5, 0.5, 0.5], [0.5, 0.5, 0.5], [0.5, 0.5, 0.5]])
        X_triangle = np.array([[1, 0, 0, 0, 0, 0], [2, 1, 0, 0, 0, 0],
                               [3, 4, 1, 0, 0, 0], [5, 6, 7, 1, 0, 0],
                               [1, 4, 2, 1, 1, 0], [1, 1, 1, 1, 1, 1]])
        self.y_shape = y.shape
        self.longMessage = True
        self.X_shape = X_triangle.shape
        self.X_real = X_triangle+X_triangle.T-np.eye(6)
        self.product = self.X_real.dot(y)
        self.rdd_y = (self.sc.parallelize(y)
            .map(lambda x: x.tolist()).map(lambda x: list(enumerate(x)))
            .zipWithIndex()
            .flatMap(lambda x: [MatrixEntry(i=x[1], j=jdx, value=val) for jdx, val in x[0]])
            .filter(lambda x: x.value != 0.))

        self.rdd_X = (self.sc.parallelize(X_triangle)
            .map(lambda x: x.tolist()).map(lambda x: list(enumerate(x)))
            .zipWithIndex()
            .flatMap(lambda x: [MatrixEntry(i=x[1], j=jdx, value=val) for jdx, val in x[0]])
            .filter(lambda x: x.value != 0.))
Example #7
 def multiply(self, B):
     a,b = self._pre_arithmetic_op(self, B)
     c = a.union(b).groupByKey().map(
         lambda x : MatrixEntry(x[0][0],x[0][1], x[1].data[0] * x[1].data[1] if len(x[1].data) == 2 else 0) 
     )
     
     return SparseDistributedMatrix(c, self.numRows(), self.numCols())
 
 # def multiply(self, b:float):
 #     c = self.entries.map(
 #         lambda entry : MatrixEntry(entry.i, entry.j, entry.value * b) 
 #     )
     
 #     return SparseDistributedMatrix(self.sc, c, self.numRows(), self.numCols())
Example #8
 def _dot1(self, B):
     if self.numCols() != B.numRows():
         raise Exception(f"size mismatch {(self.numRows(), self.numCols())}, {(B.numRows(), B.numCols())}")
     a = self._pre_dot(self, 'row')
     b = self._pre_dot(B, 'col')
     
     
     c = a.cartesian(b).map(
         lambda x: MatrixEntry(x[0][0], x[1][0], x[0][1].dot(x[1][1]))
     ).filter(
         lambda entry: entry.value != 0.0
     )
         
     return SparseDistributedMatrix(c, self.numRows(), B.numCols())
Example #9
 def newW(R, W, H):
     # multiplicative update: W = W * (R.dot(H.T)) / (W.dot(H).dot(H.T))
     a = R.multiply(H.transpose()).toCoordinateMatrix()\
      .entries.map(lambda entry: ((entry.i, entry.j), (0, entry.value)))
     b = W.multiply(H).multiply(H.transpose()).toCoordinateMatrix()\
      .entries.map(lambda entry: ((entry.i, entry.j), (1, entry.value)))
     # divide the numerator (tag 0) by the denominator (tag 1), whichever order they arrive in
     c = a.union(b).reduceByKey(
         lambda a, b: (2, a[1] / b[1]) if a[0] == 0 else (2, b[1] / a[1]))
     c = c.map(lambda x: ((x[0][0], x[0][1]), x[1][1]))
     # multiply the ratio elementwise by the current W
     d = c.join(W.toCoordinateMatrix().entries.map(
         lambda entry: ((entry.i, entry.j), entry.value)))\
      .mapValues(lambda v: v[0] * v[1])
     return CoordinateMatrix(
         d.map(lambda x: MatrixEntry(x[0][0], x[0][1], x[1]))).toBlockMatrix()
def get_vectors_df(playcounts_df):
    """
    Each row of playcounts_df has the following columns: recording_id, spark_user_id and a play count denoting how many times
    a user has played that recording. However, the correlation matrix requires a dataframe having a column of user
    vectors. Spark has various representations built-in for storing sparse matrices. Of these, two are Coordinate
    Matrix and Indexed Row Matrix. A coordinate matrix stores the matrix as tuples of (i, j, x) where matrix[i, j] = x.
    An Indexed Row Matrix stores it as tuples of row index and vectors.

    Our playcounts_df is similar in structure to a coordinate matrix. We begin with mapping each row of the
    playcounts_df to a MatrixEntry and then create a matrix of these entries. The recording_ids are rows, user_ids are
    columns and the playcounts are the values in the matrix. We convert the coordinate matrix to indexed row matrix
    form. Spark ML and MLlib have different representations of vectors, hence we need to manually convert between the
    two. Finally, we take the rows and create a dataframe from them.
    """
    tuple_mapped_rdd = playcounts_df.rdd.map(lambda x: MatrixEntry(x["recording_id"], x["spark_user_id"], x["count"]))
    coordinate_matrix = CoordinateMatrix(tuple_mapped_rdd)
    indexed_row_matrix = coordinate_matrix.toIndexedRowMatrix()
    vectors_mapped_rdd = indexed_row_matrix.rows.map(lambda r: (r.index, r.vector.asML()))
    return listenbrainz_spark.session.createDataFrame(vectors_mapped_rdd, ['index', 'vector'])
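# Illustration only (not from the original source): the same CoordinateMatrix ->
# IndexedRowMatrix -> DataFrame-of-vectors path described in the docstring, using a plain
# SparkSession instead of listenbrainz_spark; the toy ids/counts below are made up.
from pyspark.sql import SparkSession
from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry

spark = SparkSession.builder.appName("vectors-demo").getOrCreate()
playcounts = spark.createDataFrame(
    [(0, 0, 3), (0, 1, 1), (2, 1, 5)],
    ["recording_id", "spark_user_id", "count"])
entries = playcounts.rdd.map(
    lambda x: MatrixEntry(x["recording_id"], x["spark_user_id"], x["count"]))
indexed_rows = CoordinateMatrix(entries).toIndexedRowMatrix().rows
# MLlib vectors must be converted to ML vectors before building the DataFrame
vectors_df = spark.createDataFrame(
    indexed_rows.map(lambda r: (r.index, r.vector.asML())), ["index", "vector"])
vectors_df.show(truncate=False)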
Example #11
def matrix_multiply(A, B):

    '''
    Returns the matrix product of two matrices represented in CoordinateMatrix format.
    It is implemented with simple joins, following the Scala implementation in the link below:
    https://medium.com/balabit-unsupervised/scalable-sparse-matrix-multiplication-in-apache-spark-c79e9ffc0703

    A: CoordinateMatrix
    B: CoordinateMatrix
    returns: CoordinateMatrix holding the product of A and B
    '''
    from operator import add  # used by reduceByKey below

    A_rdd = A.entries.map(lambda x: (x.j, (x.i, x.value)))  # entries of A keyed by column: (col, (row, value))
    B_rdd = B.entries.map(lambda x: (x.i, (x.j, x.value)))  # entries of B keyed by row: (row, (col, value))

    interm_rdd = A_rdd.join(B_rdd).map(lambda x: ((x[1][0][0],x[1][1][0]),(x[1][0][1]*x[1][1][1]))) # Join two rdds and convert to ((row,column),(value))
    C_rdd = interm_rdd.reduceByKey(add).map(lambda x: MatrixEntry(x[0][0],x[0][1],x[1])) # Add the product of same (row,column) pair and convert each row into a matrix entry of (row, column, value)
    return CoordinateMatrix(C_rdd)
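# Sanity check (not from the original source): multiply two small made-up matrices with
# matrix_multiply and compare against the dense product; assumes an active SparkContext `sc`.
import numpy as np
from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry

A = CoordinateMatrix(sc.parallelize(
    [MatrixEntry(0, 0, 1.0), MatrixEntry(0, 1, 2.0), MatrixEntry(1, 1, 3.0)]))
B = CoordinateMatrix(sc.parallelize(
    [MatrixEntry(0, 0, 4.0), MatrixEntry(1, 0, 5.0), MatrixEntry(1, 1, 6.0)]))
C = matrix_multiply(A, B)
print(sorted((e.i, e.j, e.value) for e in C.entries.collect()))
# expected nonzeros of [[1,2],[0,3]] x [[4,0],[5,6]] = [[14,12],[15,18]]
print(np.array([[1, 2], [0, 3]]).dot(np.array([[4, 0], [5, 6]])))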
Example #12
 def transpose(rm):
     cm = CoordinateMatrix(rm.rows.zipWithIndex().flatMap(
         lambda x: [MatrixEntry(x[1], j, v) for j, v in enumerate(x[0])]))
     return cm.transpose().toRowMatrix()
Example #13
 def mapFuncJ(entry):
     return MatrixEntry(entry.j, entry.i + self.numDimI * entry.k,
                        entry.val)
Example #14
 def mapFuncK(entry):
     return MatrixEntry(entry.k, entry.j + self.numDimJ * entry.i,
                        entry.val)
import time

from pyspark import SparkConf, SparkContext, sql
from pyspark.mllib.linalg.distributed import DenseMatrix
from pyspark.mllib.linalg.distributed import RowMatrix
from pyspark.mllib.linalg.distributed import CoordinateMatrix
from pyspark.mllib.linalg.distributed import MatrixEntry


if __name__ == "__main__":
    # set up spark context and configuration
    conf = SparkConf().setAppName("PythonPCAOnRowMatrixExample")
    sc = SparkContext(conf=conf)
    print(sc.getConf().getAll())    
    sqlContext = sql.SQLContext(sc)

    # load data
    data = sc.textFile("gs://dataproc-ae279739-4c78-478e-9024-8b7ea842f82e-us/heart1.txt")
    entries = data.map(lambda l: l.split(' ')).map(lambda l: MatrixEntry(int(l[0]), int(l[1]), float(l[2])))

    # create an IndexedRowMatrix via a CoordinateMatrix
    premat = CoordinateMatrix(entries)
    mat = premat.toIndexedRowMatrix()

    print(mat.numCols())
    print(mat.numRows())

    # gramian
    start_time = time.time()
    decomp = mat.computeGramianMatrix()
    elapsedtime = time.time() - start_time
    print(elapsedtime)

    # svd
Example #16
 def mapFuncI(entry):
     return MatrixEntry(entry.i, entry.k + self.numDimK * entry.j,
                        entry.val)
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix

# An IndexedRowMatrix is built from an RDD of IndexedRows.
indexed = sc.parallelize([
    IndexedRow(0, [1, 2, 3]),
    IndexedRow(1, [4, 5, 6]),
    IndexedRow(2, [7, 8, 9]),
    IndexedRow(3, [10, 11, 12])
])
mat = IndexedRowMatrix(indexed)
print(mat)

# convert to row matrix
rowMat = mat.toRowMatrix()
print(rowMat)

# A CoordinateMatrix is a distributed matrix stored in coordinate-list (COO) format:
# an RDD of MatrixEntry(i, j, value) tuples.

from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry

entries = sc.parallelize(
    [MatrixEntry(0, 0, 1.2),
     MatrixEntry(1, 0, 2.1),
     MatrixEntry(6, 1, 3.7)])
mat = CoordinateMatrix(entries)

m = mat.numRows()
n = mat.numCols()

print(m)
print(n)

# convert to indexed row matrix
rowMat = mat.toIndexedRowMatrix()
print(rowMat)
Example #18
def to_matrix_entry(x):
    i, j, v = x.split()
    return MatrixEntry(i, j, v)
Example #19
 def outer(self, v):
     c = self.rdd.cartesian(v.rdd).map(lambda x: MatrixEntry(
         x[0][0], x[1][0], float(x[0][1] * x[1][1]))).filter(
             lambda entry: entry.value != 0.0)
     return sdm.SparseDistributedMatrix(c, self.size, v.size)
Example #20
 def diag(vect):
     c = vect.rdd.map(
         lambda entry : MatrixEntry(entry[0], entry[0], entry[1])
     ) 
     return SparseDistributedMatrix(c, vect.size, vect.size)
Example #21
 def transpose(self):
     entries = self.entries.map(
         lambda entry: MatrixEntry(entry.j, entry.i, entry.value)
     )
     return SparseDistributedMatrix(entries, self.numCols(), self.numRows())
Example #22
	coo_matrix_input_all.cache()


	# Diagonalize RDD: for each row, sum the entries in each column class (col mod 3)
	# and add the negated sum at the corresponding diagonal-block position.
	diag_entries_1 = coo_matrix_input_all.filter(lambda e: e[1] % 3 == 0) \
		.map(lambda e: (e[0], e[2])).reduceByKey(lambda x, y: x + y) \
		.map(lambda rv: (rv[0], 3 * (rv[0] // 3), -rv[1]))
	diag_entries_1.cache()
	diag_entries_2 = coo_matrix_input_all.filter(lambda e: e[1] % 3 == 1) \
		.map(lambda e: (e[0], e[2])).reduceByKey(lambda x, y: x + y) \
		.map(lambda rv: (rv[0], 3 * (rv[0] // 3) + 1, -rv[1]))
	diag_entries_2.cache()
	diag_entries_3 = coo_matrix_input_all.filter(lambda e: e[1] % 3 == 2) \
		.map(lambda e: (e[0], e[2])).reduceByKey(lambda x, y: x + y) \
		.map(lambda rv: (rv[0], 3 * (rv[0] // 3) + 2, -rv[1]))
	diag_entries_3.cache()

	diag_entries = diag_entries_1.union(diag_entries_2).union(diag_entries_3)

	coo_matrix_input_all = coo_matrix_input_all.union(diag_entries)
	coo_matrix_entries = coo_matrix_input_all.map(lambda e: MatrixEntry(e[0], e[1], e[2]))
	coo_matrix = CoordinateMatrix(coo_matrix_entries)


	#SAVE TO A FILE
	coo_matrix_input_all.repartition(1).saveAsTextFile("./Laplacian_4v7o_4cores_1")
	t2 = timeit.default_timer()
	print("Elapsed time for construction: {:} s".format(t2 - t0))


	#Singular value decomposition
	
	dataRows = coo_matrix.toRowMatrix().rows

	k = int(args.k) #N_singvalues
	svd = RowMatrix(dataRows.persist()).computeSVD(k, computeU=True)
Example #23
def to_matrix_entry(s):
    ss = s.split()
    entry = MatrixEntry(float(ss[0]), float(ss[1]), float(ss[2]))
    return entry
Example #24
        .appName("linalgtest")\
        .getOrCreate()

#conf = SparkConf().setAppName('linalgtest')
#sc = SparkContext(conf=conf).getOrCreate()

#use local spark on computer
# findspark.init()
#from pyspark.sql import SparkSession

local_file_location = 'file:///wasp/pdb1HYS.mtx.mtx'

rdd = spark.sparkContext.textFile(local_file_location)
rdd = rdd.map(lambda line: line.split(" "))
rdd = rdd.map(
    lambda line: MatrixEntry(int(line[0]), int(line[1]), float(line[2])))

mat = CoordinateMatrix(rdd)
M = mat.toRowMatrix()
A = mat.toBlockMatrix()
At = mat.transpose().toBlockMatrix()

print("SVD")
print(M.numRows(), M.numCols())
start_svd = time.time()

NUM_TIMES = 10
#do it 10 times to get mean
for i in range(NUM_TIMES):
    svd = M.computeSVD(5, computeU=True)
Example #25
import sys

K = 5

## Read data.
txt = sc.textFile('./data/com-amazon.ungraph.txt')
txt = txt.sample(False, 0.001, 1)  # XXX: random sample for local testing
txt = txt.zipWithIndex().filter(lambda x: int(x[1]) >= 4).map(
    lambda x: x[0].split('\t'))

## Get graph Laplacian
N = txt.flatMap(lambda x: [int(xx) for xx in x]).max()

upper_entries = txt.map(
    lambda x: MatrixEntry(int(x[0]) - 1,
                          int(x[1]) - 1, 1.0))
lower_entries = txt.map(
    lambda x: MatrixEntry(int(x[1]) - 1,
                          int(x[0]) - 1, 1.0))
degrees = upper_entries.map(lambda entry: (entry.i, entry.value)).reduceByKey(
    lambda a, b: a + b)
W = CoordinateMatrix(upper_entries.union(lower_entries), numCols=N, numRows=N)

# XXX:
laplacian = sys.argv[1]

if laplacian == 'unnormalized':
    entries = degrees.map(lambda x: MatrixEntry(x[0], x[0], x[1]))
    D = CoordinateMatrix(entries, numCols=N, numRows=N)
    L = D.toBlockMatrix().subtract(W.toBlockMatrix()).toCoordinateMatrix()
elif laplacian == 'normalized':