# HITS (hubs and authorities) on a 1000-node graph. Assumes `sc`
# (SparkContext) and the (row, col, value) edge RDDs `edges` and
# `edges_transpose` are built above.
import numpy as np
from pyspark.mllib.linalg.distributed import CoordinateMatrix

L = CoordinateMatrix(edges).toBlockMatrix()
L_transpose = CoordinateMatrix(edges_transpose).toBlockMatrix()

# Initial hub vector: all ones.
h_init = [(i, 0, 1) for i in range(1000)]
h = CoordinateMatrix(sc.parallelize(h_init)).toBlockMatrix()

a = None
for i in range(40):
    # Authority update: a = L^T h, scaled so the largest entry is 1.
    a_new = L_transpose.multiply(h)
    a_new_max = np.max(np.array(a_new.toLocalMatrix().toArray()))
    a_scale = [(j, j, 1 / a_new_max) for j in range(1000)]
    a_new_max_inverse = CoordinateMatrix(sc.parallelize(a_scale)).toBlockMatrix()
    a = a_new_max_inverse.multiply(a_new)

    # Hub update: h = L a, scaled the same way.
    h_new = L.multiply(a)
    h_new_max = np.max(np.array(h_new.toLocalMatrix().toArray()))
    h_scale = [(j, j, 1 / h_new_max) for j in range(1000)]
    h_new_max_inverse = CoordinateMatrix(sc.parallelize(h_scale)).toBlockMatrix()
    h = h_new_max_inverse.multiply(h_new)
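# As a sanity check outside Spark, the same max-normalized HITS iteration
# can be run on a dense NumPy link matrix. A minimal sketch, assuming
# L[i, j] = 1 when page i links to page j; the function name and the toy
# graph are illustrative, not from the source.
import numpy as np

def hits(L, iterations=40):
    # Max-normalized HITS, mirroring the Spark loop above.
    h = np.ones(L.shape[0])
    for _ in range(iterations):
        a = L.T @ h      # authority update
        a /= a.max()     # scale so the largest authority score is 1
        h = L @ a        # hub update
        h /= h.max()
    return h, a

# Toy graph: 0 -> 1, 0 -> 2, 1 -> 2.
L_small = np.array([[0, 1, 1],
                    [0, 0, 1],
                    [0, 0, 0]], dtype=float)
hubs, authorities = hits(L_small)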
import sys
import time
import logging
from contextlib import contextmanager

import pyspark
from pyspark.sql import SparkSession
from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry

@contextmanager
def time_usage(name=''):
    # Log the wall-clock time spent inside the `with` block.
    start = time.time()
    yield
    end = time.time()
    elapsed_seconds = float("%.4f" % (end - start))
    logging.info('%s: elapsed seconds: %s', name, elapsed_seconds)

logging.getLogger().setLevel(logging.INFO)

def to_matrix_entry(x):
    # Each input line is a comma-separated "i,j,value" triple.
    i, j, v = x.split(',')
    return MatrixEntry(int(i), int(j), float(v))

sc = pyspark.SparkContext(appName="Matrix Multiplication")
spark = SparkSession(sc)

# Multiply the two input matrices repeatedly and log each run's time.
for i in range(1, 10):
    with time_usage("matrix multiplication time"):
        matrix_a_raw = sc.textFile(sys.argv[1])
        matrix_b_raw = sc.textFile(sys.argv[2])
        entries_a = matrix_a_raw.map(to_matrix_entry)
        entries_b = matrix_b_raw.map(to_matrix_entry)
        mat_a = CoordinateMatrix(entries_a).toBlockMatrix()
        mat_b = CoordinateMatrix(entries_b).toBlockMatrix()
        product = mat_a.multiply(mat_b)
        # Collect to the driver to force evaluation of the product.
        product.toLocalMatrix()
        # for t in result:
        #     print('%s, %s, %s' % (t[0], t[1], t[2]))
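# The driver reads its two operands from the paths in sys.argv as plain
# text, one "i,j,value" entry per line. A hypothetical helper for
# producing such input files (the helper and file names are illustrative,
# not part of the source):
import numpy as np

def write_matrix(path, m):
    # Write a dense matrix in the "i,j,value" format to_matrix_entry() parses.
    with open(path, 'w') as f:
        for i in range(m.shape[0]):
            for j in range(m.shape[1]):
                f.write('%d,%d,%s\n' % (i, j, m[i, j]))

write_matrix('matrix_a.txt', np.arange(4.0).reshape(2, 2))
write_matrix('matrix_b.txt', np.eye(2))
# Pass matrix_a.txt and matrix_b.txt as the script's two arguments.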
# PageRank with BETA = 0.8 on a 1000-node graph. Assumes `conf`
# (SparkConf) is defined above.
import pyspark
from pyspark.sql import SparkSession
from pyspark.mllib.linalg.distributed import CoordinateMatrix

# Create the context and session.
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()

small_data = sc.textFile('graph-small.txt')
full_data = sc.textFile('graph-full.txt')

BETA = 0.8

# Parse tab-separated "source dest" pairs, shifting to 0-based node ids.
source_dest_pair = full_data.map(
    lambda x: (int(x.split('\t')[0]) - 1, int(x.split('\t')[1]) - 1)).distinct()

# Adjacency entries (dest, source, 1) for each edge source -> dest.
edges = source_dest_pair.map(lambda x: (x[1], x[0], 1))

# Diagonal entries 1/out-degree, keyed by source node.
degrees = (source_dest_pair
           .map(lambda x: (x[0], 1))
           .reduceByKey(lambda x, y: x + y)
           .map(lambda x: (x[0], x[0], 1 / x[1])))

edge_matrix = CoordinateMatrix(edges).toBlockMatrix()
degree_inverse_matrix = CoordinateMatrix(degrees).toBlockMatrix()

# Column-stochastic transition matrix M = A * D^-1.
M = edge_matrix.multiply(degree_inverse_matrix)

# r starts uniform; `beta` is a diagonal damping matrix; `teleport` is
# the (1 - BETA)/N random-jump vector, with N = 1000 nodes.
r_init = [(i, 0, 1 / 1000) for i in range(1000)]
beta_init = [(i, i, BETA) for i in range(1000)]
teleport_init = [(i, 0, (1 - BETA) / 1000) for i in range(1000)]

r = CoordinateMatrix(sc.parallelize(r_init)).toBlockMatrix()
beta = CoordinateMatrix(sc.parallelize(beta_init)).toBlockMatrix()
teleport = CoordinateMatrix(sc.parallelize(teleport_init)).toBlockMatrix()

# 40 power-iteration steps: r = (1 - BETA)/N + BETA * M * r.
for i in range(40):
    r = teleport.add(beta.multiply(M).multiply(r))
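# The loop implements the standard damped update
# r = BETA * M * r + (1 - BETA)/N. A minimal dense NumPy equivalent,
# useful for checking results on the small graph (the function and toy
# edge list are illustrative, not from the source):
import numpy as np

def pagerank(edges, n, beta=0.8, iterations=40):
    # Build M with M[dest, src] = 1/out-degree(src), as in the Spark code.
    deg = np.zeros(n)
    for src, dst in edges:
        deg[src] += 1
    M = np.zeros((n, n))
    for src, dst in edges:
        M[dst, src] = 1.0 / deg[src]
    r = np.full(n, 1.0 / n)
    for _ in range(iterations):
        r = (1 - beta) / n + beta * (M @ r)
    return r

# 3-node cycle: the ranks should come out uniform.
print(pagerank([(0, 1), (1, 2), (2, 0)], n=3))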
# Spectral clustering. Assumes `sc`, `spark`, the symmetric weight
# entries `upper_entries`/`lower_entries`, the (node, degree) RDD
# `degrees`, and the constants N (nodes) and K (clusters) from above.
import sys
from math import sqrt

from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler

W = CoordinateMatrix(upper_entries.union(lower_entries), numCols=N, numRows=N)

# Pick the Laplacian variant from the command line.
laplacian = sys.argv[1]
if laplacian == 'unnormalized':
    # L = D - W
    entries = degrees.map(lambda x: MatrixEntry(x[0], x[0], x[1]))
    D = CoordinateMatrix(entries, numCols=N, numRows=N)
    L = D.toBlockMatrix().subtract(W.toBlockMatrix()).toCoordinateMatrix()
elif laplacian == 'normalized':
    # Random-walk Laplacian: L = I - D^-1 W
    entries = degrees.map(lambda x: MatrixEntry(x[0], x[0], 1 / x[1]))
    D_inv = CoordinateMatrix(entries, numCols=N, numRows=N).toBlockMatrix()
    I = CoordinateMatrix(sc.range(N).map(lambda i: MatrixEntry(i, i, 1.0)),
                         numCols=N, numRows=N).toBlockMatrix()
    L = I.subtract(D_inv.multiply(W.toBlockMatrix())).toCoordinateMatrix()
elif laplacian == 'symmetric':
    # Symmetric Laplacian: L = I - D^-1/2 W D^-1/2
    entries = degrees.map(lambda x: MatrixEntry(x[0], x[0], 1 / sqrt(x[1])))
    D_invsq = CoordinateMatrix(entries, numCols=N, numRows=N).toBlockMatrix()
    I = CoordinateMatrix(sc.range(N).map(lambda i: MatrixEntry(i, i, 1.0)),
                         numCols=N, numRows=N).toBlockMatrix()
    tmp = D_invsq.multiply(W.toBlockMatrix()).multiply(D_invsq)
    L = I.subtract(tmp).toCoordinateMatrix()
else:
    raise ValueError('Unknown type of Laplacian.')

# SVD of the Laplacian; embed each node using the top-K right singular
# vectors, then cluster the rows with k-means.
svd = L.toRowMatrix().computeSVD(k=K, computeU=False)
V = svd.V.toArray().tolist()
VV = spark.createDataFrame(V)

kmeans = KMeans().setK(K).setSeed(1)
vecAssembler = VectorAssembler(inputCols=VV.schema.names, outputCol='features')
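# The three branches correspond to the standard graph Laplacians
# L = D - W, L_rw = I - D^-1 W, and L_sym = I - D^-1/2 W D^-1/2.
# A minimal dense NumPy version for checking small cases, assuming a
# symmetric weight matrix with no isolated nodes (names are illustrative):
import numpy as np

def dense_laplacian(W, kind='unnormalized'):
    # Dense counterparts of the three Laplacians built above.
    d = W.sum(axis=1)
    if kind == 'unnormalized':   # L = D - W
        return np.diag(d) - W
    if kind == 'normalized':     # L_rw = I - D^-1 W
        return np.eye(len(d)) - np.diag(1 / d) @ W
    if kind == 'symmetric':      # L_sym = I - D^-1/2 W D^-1/2
        D_invsq = np.diag(1 / np.sqrt(d))
        return np.eye(len(d)) - D_invsq @ W @ D_invsq
    raise ValueError('Unknown type of Laplacian.')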