class MatrixUDTTests(MLlibTestCase):

    dm1 = DenseMatrix(3, 2, [0, 1, 4, 5, 9, 10])
    dm2 = DenseMatrix(3, 2, [0, 1, 4, 5, 9, 10], isTransposed=True)
    sm1 = SparseMatrix(1, 1, [0, 1], [0], [2.0])
    sm2 = SparseMatrix(2, 1, [0, 0, 1], [0], [5.0], isTransposed=True)
    udt = MatrixUDT()

    def test_json_schema(self):
        self.assertEqual(MatrixUDT.fromJson(self.udt.jsonValue()), self.udt)

    def test_serialization(self):
        for m in [self.dm1, self.dm2, self.sm1, self.sm2]:
            self.assertEqual(m, self.udt.deserialize(self.udt.serialize(m)))

    def test_infer_schema(self):
        rdd = self.sc.parallelize([("dense", self.dm1), ("sparse", self.sm1)])
        df = rdd.toDF()
        schema = df.schema
        # assertTrue(a, b) never compares its arguments; assertEqual does
        self.assertEqual(schema.fields[1].dataType, self.udt)
        matrices = df.rdd.map(lambda x: x._2).collect()
        self.assertEqual(len(matrices), 2)
        for m in matrices:
            if isinstance(m, DenseMatrix):
                self.assertEqual(m, self.dm1)
            elif isinstance(m, SparseMatrix):
                self.assertEqual(m, self.sm1)
            else:
                raise ValueError("Expected a matrix but got type %r" % type(m))

def _transform(self, data, X):
    logger.info("Transforming data")
    loadings = self.model.loadings[:self.n_components]
    loadings = DenseMatrix(X.numCols(), self.n_components, loadings.flatten())
    X = X.multiply(loadings)
    data = join(data, X, self.spark)
    del X
    return data

def fourier(X: RowMatrix, n_features, seed=23, gamma=1):
    p = X.numCols()
    random_state = numpy.random.RandomState(seed)
    w = numpy.sqrt(2 * gamma) * random_state.normal(size=(p, n_features))
    w = DenseMatrix(p, n_features, w.flatten(), isTransposed=True)
    b = random_state.uniform(0, 2 * numpy.pi, size=n_features)
    Y = fourier_transform(X, w, b)
    return Y, w, b

def _transform(self, data):
    logger.info("Transforming data")
    W = self.model.loadings[:, :self.n_components]
    W = DenseMatrix(numRows=W.shape[0], numCols=W.shape[1],
                    values=W.flatten(), isTransposed=True)
    X = self._row_matrix(data).multiply(W)
    data = join(data, X, self.spark)
    del X
    return data

def test_repr_dense_matrix(self):
    # assertTrue(a, b) does not compare its arguments; use assertEqual
    mat = DenseMatrix(3, 2, [0, 1, 4, 6, 8, 10])
    self.assertEqual(repr(mat), "DenseMatrix(3, 2, [0.0, 1.0, 4.0, 6.0, 8.0, 10.0], False)")

    mat = DenseMatrix(3, 2, [0, 1, 4, 6, 8, 10], True)
    self.assertEqual(repr(mat), "DenseMatrix(3, 2, [0.0, 1.0, 4.0, 6.0, 8.0, 10.0], True)")

    mat = DenseMatrix(6, 3, zeros(18))
    self.assertEqual(
        repr(mat),
        "DenseMatrix(6, 3, [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..., "
        "0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], False)",
    )

def main():
    datasetfile = sys.argv[1]
    beta = 0.8
    iterations = 40
    top_k = 5
    sparkcontext = SparkContext("local", "Page Rank")
    data = sparkcontext.textFile(datasetfile)

    source_dest = data.map(make_key_value_pair_1)
    source_dest_count = data.map(make_key_value_pair_2)
    groupbykey = source_dest.groupByKey()
    number_of_nodes = groupbykey.count()
    out_degree = groupbykey.map(calc_out_degree)
    pair_map = groupbykey.collectAsMap()

    # Build the column-stochastic transition matrix M
    matrix_m = np.zeros(shape=(number_of_nodes, number_of_nodes))
    for key, value in pair_map.items():
        for ind_value in value:
            matrix_m[ind_value - 1][key - 1] += 1 / len(list(value))
    matrix_m = sparkcontext.parallelize(matrix_m)
    matrix_m = RowMatrix(matrix_m)

    # Initialize the rank vector r with 1 / n
    vector_r_prev = np.empty([number_of_nodes, 1])
    vector_r_prev.fill(1 / number_of_nodes)
    vector_r_prev = DenseMatrix(number_of_nodes, 1, vector_r_prev)

    # Power iteration: r = beta * M * r + (1 - beta) / n
    index = 0
    while index < iterations:
        mul_val = matrix_m.multiply(vector_r_prev).rows.collect()
        mul_val = [i * beta for i in mul_val]
        mul_val = [i + (1 - beta) / number_of_nodes for i in mul_val]
        vector_r_prev = DenseMatrix(number_of_nodes, 1, mul_val)
        index += 1

    vector_r_prev = vector_r_prev.toArray()
    largest_values = heapq.nlargest(top_k, vector_r_prev)
    largest_indexes = heapq.nlargest(top_k, range(number_of_nodes), vector_r_prev.__getitem__)
    smallest_values = heapq.nsmallest(top_k, vector_r_prev)
    smallest_indexes = heapq.nsmallest(top_k, range(number_of_nodes), vector_r_prev.__getitem__)
    largest_indexes = [val + 1 for val in largest_indexes]
    smallest_indexes = [val + 1 for val in smallest_indexes]

    print("Value of largest n nodes\n", largest_values)
    print("Node numbers of largest n nodes\n", largest_indexes)
    print("Value of smallest n nodes\n", smallest_values)
    print("Node numbers of smallest n nodes\n", smallest_indexes)

    sparkcontext.stop()

def test_dense_matrix_is_transposed(self):
    mat1 = DenseMatrix(3, 2, [0, 4, 1, 6, 3, 9], isTransposed=True)
    mat = DenseMatrix(3, 2, [0, 1, 3, 4, 6, 9])
    self.assertEqual(mat1, mat)

    expected = [[0, 4], [1, 6], [3, 9]]
    for i in range(3):
        for j in range(2):
            self.assertEqual(mat1[i, j], expected[i][j])
    self.assertTrue(array_equal(mat1.toArray(), expected))

    sm = mat1.toSparse()
    self.assertTrue(array_equal(sm.rowIndices, [1, 2, 0, 1, 2]))
    self.assertTrue(array_equal(sm.colPtrs, [0, 2, 5]))
    self.assertTrue(array_equal(sm.values, [1, 3, 4, 6, 9]))

def _transform(self, data, X):
    logger.info("Transforming data")
    loadings = self.model.loadings.T
    L = DenseMatrix(numRows=loadings.shape[0], numCols=loadings.shape[1],
                    values=loadings.flatten(), isTransposed=True)
    data = join(data, X.multiply(L), self.spark)
    return data

def _fit(self, dataset):
    sc = SparkContext.getOrCreate()
    x = dataset.select(self.getFeaturesCol())
    rddv = x.rdd.map(list)

    # Calculate the distance matrix
    Aarr = self._dist_matrix(rddv, rddv, sc)
    np.fill_diagonal(Aarr, 0)
    D = list(map(lambda x: np.sum(x), Aarr))
    Darr = np.diag(np.sqrt(np.divide(1, D)))

    # Laplacian matrix: degree matrix minus affinity matrix
    Ln = np.diag(D) - Aarr
    # Normalize
    Ln = np.matmul(np.matmul(Darr, Ln), Darr)

    # Eigenvectors
    V, U = spark_eigen.eigen(Ln, sc, self.getTolerance())

    # K-rank reduction
    K = self.getK()
    U.rows.count()
    proj = U.rows.map(lambda x: [x[i] for i in range(0, K)])
    densep = DenseMatrix(proj.count(), K,
                         functools.reduce(operator.iconcat, proj.collect(), []),
                         True)
    return SpectralClusteringModel(featuresCol=self.getFeaturesCol(),
                                   predictionCol=self.getPredictionCol(),
                                   projection=densep,
                                   prevdata=rddv)

def NpToDense(arr):
    '''
    Convert a NumPy array to a PySpark DenseMatrix so that
    matrix multiplication can be done on it.

    :param arr: a NumPy array
    '''
    nrows, ncols = arr.shape
    # arr.flatten() yields row-major values, so mark the matrix as transposed
    return DenseMatrix(nrows, ncols, arr.flatten(), isTransposed=True)

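
# A minimal usage sketch (added here, not part of the original snippet). It
# assumes an active SparkContext named `sc` and that DenseMatrix above is
# pyspark.mllib.linalg.DenseMatrix, so the result can be passed to
# RowMatrix.multiply as the local right-hand operand.
import numpy as np
from pyspark.mllib.linalg.distributed import RowMatrix

A = np.arange(6.0).reshape(3, 2)   # 3 x 2 distributed operand
B = np.arange(4.0).reshape(2, 2)   # 2 x 2 local operand
product = RowMatrix(sc.parallelize(A.tolist())).multiply(NpToDense(B))
result = np.array([row.toArray() for row in product.rows.collect()])
assert np.allclose(result, A.dot(B))  # distributed product matches NumPy
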
def test_serialize(self):
    self._test_serialize(DenseVector(range(10)))
    self._test_serialize(DenseVector(array([1., 2., 3., 4.])))
    self._test_serialize(DenseVector(pyarray.array('d', range(10))))
    self._test_serialize(SparseVector(4, {1: 1, 3: 2}))
    self._test_serialize(SparseVector(3, {}))
    self._test_serialize(DenseMatrix(2, 3, range(6)))

def to_dense(rowmatrix):
    densev = rowmatrix.rows.collect()
    el = lambda x: [a for a in x]
    M = list(map(el, densev))
    L = functools.reduce(operator.iconcat, M, [])
    return DenseMatrix(rowmatrix.numRows(), rowmatrix.numCols(), L, True)

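
# A hedged usage sketch (added, not from the original source): it assumes an
# active SparkContext `sc`. It collects a small RowMatrix to the driver via
# to_dense and checks that the local DenseMatrix holds the same values
# (the flattened list is row-major, hence the isTransposed=True flag above).
import numpy as np
from pyspark.mllib.linalg.distributed import RowMatrix

data = [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]
local = to_dense(RowMatrix(sc.parallelize(data)))
assert np.array_equal(local.toArray(), np.array(data))
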
def test_ml_mllib_matrix_conversion(self):
    # to ml
    # dense
    mllibDM = Matrices.dense(2, 2, [0, 1, 2, 3])
    mlDM1 = newlinalg.Matrices.dense(2, 2, [0, 1, 2, 3])
    mlDM2 = mllibDM.asML()
    self.assertEqual(mlDM2, mlDM1)
    # transposed
    mllibDMt = DenseMatrix(2, 2, [0, 1, 2, 3], True)
    mlDMt1 = newlinalg.DenseMatrix(2, 2, [0, 1, 2, 3], True)
    mlDMt2 = mllibDMt.asML()
    self.assertEqual(mlDMt2, mlDMt1)
    # sparse
    mllibSM = Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
    mlSM1 = newlinalg.Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
    mlSM2 = mllibSM.asML()
    self.assertEqual(mlSM2, mlSM1)
    # transposed
    mllibSMt = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
    mlSMt1 = newlinalg.SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
    mlSMt2 = mllibSMt.asML()
    self.assertEqual(mlSMt2, mlSMt1)

    # from ml
    # dense
    mllibDM1 = Matrices.dense(2, 2, [1, 2, 3, 4])
    mlDM = newlinalg.Matrices.dense(2, 2, [1, 2, 3, 4])
    mllibDM2 = Matrices.fromML(mlDM)
    self.assertEqual(mllibDM1, mllibDM2)
    # transposed
    mllibDMt1 = DenseMatrix(2, 2, [1, 2, 3, 4], True)
    mlDMt = newlinalg.DenseMatrix(2, 2, [1, 2, 3, 4], True)
    mllibDMt2 = Matrices.fromML(mlDMt)
    self.assertEqual(mllibDMt1, mllibDMt2)
    # sparse
    mllibSM1 = Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
    mlSM = newlinalg.Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
    mllibSM2 = Matrices.fromML(mlSM)
    self.assertEqual(mllibSM1, mllibSM2)
    # transposed
    mllibSMt1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
    mlSMt = newlinalg.SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
    mllibSMt2 = Matrices.fromML(mlSMt)
    self.assertEqual(mllibSMt1, mllibSMt2)

def _compute_w_row(self, Xw, w, W, idx):
    g, gd = self._exp(Xw.multiply(DenseMatrix(len(w), 1, w)))
    w_new = column_means(elementwise_product(Xw, g, self.spark))
    del g
    w_new = w_new - gd * w
    w_new = gs_decorrelate(w_new, W, idx)
    w_new /= scipy.sqrt((w_new ** 2).sum())
    return w_new

def test_serialize(self):
    self._test_serialize(DenseVector(range(10)))
    self._test_serialize(DenseVector(array([1.0, 2.0, 3.0, 4.0])))
    self._test_serialize(DenseVector(pyarray.array("d", range(10))))
    self._test_serialize(SparseVector(4, {1: 1, 3: 2}))
    self._test_serialize(SparseVector(3, {}))
    self._test_serialize(DenseMatrix(2, 3, range(6)))
    sm1 = SparseMatrix(3, 4, [0, 2, 2, 4, 4], [1, 2, 1, 2], [1.0, 2.0, 4.0, 5.0])
    self._test_serialize(sm1)

def decorator(row):
    for i in xrange(len(schema)):
        if type(schema[i][1]) == _Matrix:
            shape = row[i].shape
            # By default MLlib DenseMatrix constructs a column-major matrix,
            # so the ndarray is transposed to maintain consistency.
            arr = row[i].transpose().flatten()
            row[i] = DenseMatrix(shape[0], shape[1], arr)
    return row

def test_matrix_indexing(self):
    mat = DenseMatrix(3, 2, [0, 1, 4, 6, 8, 10])
    expected = [[0, 6], [1, 8], [4, 10]]
    for i in range(3):
        for j in range(2):
            self.assertEqual(mat[i, j], expected[i][j])
    for i, j in [(-1, 0), (4, 1), (3, 4)]:
        self.assertRaises(IndexError, mat.__getitem__, (i, j))

def to_local_matrix(self):
    """
    Converts the LD matrix to a local Spark matrix.

    .. caution::

        Only call this method when the LD matrix is small enough to fit
        in local memory on the driver.

    :return: Matrix of Pearson correlation values.
    :rtype: `Matrix <https://spark.apache.org/docs/2.1.0/api/python/pyspark.mllib.html#pyspark.mllib.linalg.Matrix>`__
    """
    j_local_mat = self._jldm.toLocalMatrix()
    return DenseMatrix(j_local_mat.numRows(), j_local_mat.numCols(),
                       list(j_local_mat.toArray()), j_local_mat.isTransposed())

def decorator(row):
    result = []
    from pyspark.mllib.linalg import DenseMatrix
    for i in xrange(len(schema)):
        if type(schema[i][1]) == dtypes._Matrix:
            shape = row[i].shape
            arr = row[i].flatten()
            # By default MLlib DenseMatrix constructs a column-major matrix.
            # Setting isTransposed=True constructs a row-major DenseMatrix instead.
            dm = DenseMatrix(shape[0], shape[1], arr, isTransposed=True)
            result.append(dm)
        else:
            result.append(row[i])
    return result

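
# A small local illustration (added, not from the original source) of the
# column-major vs. row-major point made in the comment above: the same ndarray
# can be wrapped either by passing column-major values, or by passing row-major
# values with isTransposed=True.
import numpy as np
from pyspark.mllib.linalg import DenseMatrix

a = np.array([[1.0, 2.0], [3.0, 4.0]])
col_major = DenseMatrix(2, 2, a.T.flatten())                    # values column by column
row_major = DenseMatrix(2, 2, a.flatten(), isTransposed=True)   # values row by row
assert np.array_equal(col_major.toArray(), a)
assert np.array_equal(row_major.toArray(), a)
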
def test_eq(self):
    v1 = DenseVector([0.0, 1.0, 0.0, 5.5])
    v2 = SparseVector(4, [(1, 1.0), (3, 5.5)])
    v3 = DenseVector([0.0, 1.0, 0.0, 5.5])
    v4 = SparseVector(6, [(1, 1.0), (3, 5.5)])
    v5 = DenseVector([0.0, 1.0, 0.0, 2.5])
    v6 = SparseVector(4, [(1, 1.0), (3, 2.5)])
    dm1 = DenseMatrix(2, 2, [2, 0, 0, 0])
    sm1 = SparseMatrix(2, 2, [0, 2, 3], [0], [2])
    self.assertEqual(v1, v2)
    self.assertEqual(v1, v3)
    self.assertFalse(v2 == v4)
    self.assertFalse(v1 == v5)
    self.assertFalse(v1 == v6)
    # this is done as Dense and Sparse matrices can be semantically
    # equal while still implementing a different __eq__ method
    self.assertEqual(dm1, sm1)
    self.assertEqual(sm1, dm1)

def to_local_matrix(self):
    """
    Converts the LD matrix to a local Spark matrix.

    .. caution::

        Only call this method when the LD matrix is small enough to fit
        in local memory on the driver.

    :return: Matrix of Pearson correlation values.
    :rtype: `Matrix <https://spark.apache.org/docs/2.1.0/api/python/pyspark.mllib.html#pyspark.mllib.linalg.Matrix>`__
    """
    from pyspark.mllib.linalg import DenseMatrix
    j_local_mat = self._jldm.toLocalMatrix()
    assert j_local_mat.majorStride() == j_local_mat.rows()
    assert j_local_mat.offset() == 0
    assert j_local_mat.isTranspose() == False
    return DenseMatrix(j_local_mat.rows(), j_local_mat.cols(),
                       list(j_local_mat.data()), False)

def multiply_matrices(A: np.ndarray, B: np.ndarray) -> np.ndarray:
    global counter
    print()
    print("No." + str(counter) + " matrix multiplication starts")
    start_time = time.time()
    print("matrix shape:", A.shape)

    rddA = sc.parallelize(A.tolist())
    matA = RowMatrix(rddA)
    matB = DenseMatrix(B.shape[0], B.shape[1], B.flatten().tolist(), isTransposed=True)
    matC = matA.multiply(matB)
    rows = matC.rows.collect()
    res = np.array([row.toArray() for row in rows])

    elapsed_time = time.time() - start_time
    print("No." + str(counter) + " matrix multiplication ends, takes time:", elapsed_time)
    counter = counter + 1
    return res

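
# A hedged usage sketch (added; not part of the original snippet). It assumes
# the module-level names used above -- `sc` (an active SparkContext), `counter`,
# `time`, and `np` -- are already defined, since multiply_matrices relies on them.
if __name__ == "__main__":
    A = np.random.rand(4, 3)
    B = np.random.rand(3, 2)
    C = multiply_matrices(A, B)
    # the distributed product should agree with the local NumPy result
    assert np.allclose(C, A.dot(B))
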
def _whiten(self, X):
    s, v, _ = svd(X, X.numCols())
    K = (v.T / s)[:, :self.n_components]
    S = K * scipy.sqrt(X.numRows())
    S = DenseMatrix(S.shape[0], S.shape[1], S.flatten(), True)
    return X.multiply(S), K

def test_matrix_indexing(self):
    mat = DenseMatrix(3, 2, [0, 1, 4, 6, 8, 10])
    expected = [[0, 6], [1, 8], [4, 10]]
    for i in range(3):
        for j in range(2):
            self.assertEqual(mat[i, j], expected[i][j])

def calculate_distance(self, sdf1, sdf2):
    """
    Calculate the distance between the vector-type columns of two Spark dataframes.

    :param sdf1: expected to have columns id1 (dtype int) and v1 (dtype Vector)
    :param sdf2: expected to have columns id2 (dtype int) and v2 (dtype Vector)
    :return: the cross join of sdf1 and sdf2 with a `diff` column of pairwise distances
    """
    cov = RowMatrix(
        sdf1.select(["v1"]).withColumnRenamed("v1", "v")
            .union(sdf2.select(["v2"]).withColumnRenamed("v2", "v"))
            .rdd.map(lambda row: Vectors.fromML(row.asDict()["v"]))
    ).computeCovariance().toArray()
    x, v = np.linalg.eigh(cov)
    indices = 1e-10 <= x

    # we are trying to enforce the data types to be only python types
    n = int(v.shape[0])
    m = int(indices.sum())
    v_vals = [float(val) for val in v[:, indices].reshape(-1, ).tolist()]
    v_spark = DenseMatrix(n, m, v_vals)
    x_vals = [float(val) for val in np.diag(x[indices] ** -0.5).reshape(-1, ).tolist()]
    x_spark = DenseMatrix(m, m, x_vals)

    # we get the index to maintain the order
    _sdf1 = sdf1.rdd.zipWithIndex() \
        .map(lambda val_key: Row(id1=val_key[0].id1, v1=val_key[0].v1, index=val_key[1])).toDF()
    _sdf1.persist()
    _sdf2 = sdf2.rdd.zipWithIndex() \
        .map(lambda val_key: Row(id2=val_key[0].id2, v2=val_key[0].v2, index=val_key[1])).toDF()
    _sdf2.persist()

    # we get our indexed row matrix
    _sdf1_mat = IndexedRowMatrix(
        _sdf1.rdd.map(lambda row: IndexedRow(index=row.asDict()["index"],
                                             vector=Vectors.fromML(row.asDict()["v1"]))))
    _sdf2_mat = IndexedRowMatrix(
        _sdf2.rdd.map(lambda row: IndexedRow(index=row.asDict()["index"],
                                             vector=Vectors.fromML(row.asDict()["v2"]))))

    # we apply our transformation and then set it as our new variable
    _sdf1 = _sdf1.drop("v1").join(
        _sdf1_mat.multiply(v_spark).multiply(x_spark).rows
            .map(lambda indexed_row: Row(index=indexed_row.index, v1=indexed_row.vector)).toDF(),
        "index")
    _sdf2 = _sdf2.drop("v2").join(
        _sdf2_mat.multiply(v_spark).multiply(x_spark).rows
            .map(lambda indexed_row: Row(index=indexed_row.index, v2=indexed_row.vector)).toDF(),
        "index")

    @F.udf(DoubleType())
    def tmp(vec):
        return float(vec[0].squared_distance(vec[1])) ** 0.5

    all_sdf = _sdf1.crossJoin(_sdf2)
    dist_sdf = all_sdf.select("*", tmp(F.array('v1', 'v2')).alias('diff'))
    dist_sdf.persist()
    return dist_sdf

model = als.fit(ratings)

# Embeddings
a = model.itemFactors
b = a.sort("id")
b.show()

# Creating a dense matrix from the embeddings for businesses
values = (b.rdd.map(lambda x: (x.id, x.features))
          .sortByKey()
          .flatMap(lambda kv: kv[1])
          .collect())
nrow = len(b.rdd.map(lambda x: x.features).first())
ncol = b.count()
dm = DenseMatrix(nrow, ncol, values)
dm.toArray().shape
z = dm.toArray().transpose()

# t-SNE
tsne = TSNE(n_components=2)
X_tsne = tsne.fit_transform(z)

# creating a data frame with the t-SNE results and business_id
e = sqlContext.createDataFrame(pd.DataFrame(X_tsne))
e_df = e.toPandas()
j = b.select("id")
j_df = j.toPandas()
result = pd.concat([e_df, j_df], axis=1, ignore_index=True)
result = pd.DataFrame(result)

tf_matrix2 = tf_matrix1.map(lambda i: [int(token) for token in i])
tf_matrix3 = tf_matrix2.map(lambda i: np.array(i))
tf_matrix4 = tf_matrix3.map(lambda i: (len(i), np.nonzero(i)[0], i[i != 0]))
tf_matrix5 = tf_matrix4.map(lambda i: Vectors.sparse(i[0], i[1], i[2]))
# 1min 49s, all of the above processing
tf_matrix5.cache()

row_tf_matrix = RowMatrix(tf_matrix5)
random_matrix = DenseMatrix(WORDS_COUNT, K + P, np.random.randn(WORDS_COUNT * (K + P)))  # local matrix
Y = row_tf_matrix.multiply(random_matrix)  # 38.5 s, Y is a RowMatrix (a distributed matrix)

# ============== QR
Y_rdd = Y.rows  # 17.5 ms
Y_rdd_with_index = Y_rdd.zipWithIndex()  # 1min 48s
Y_grouped = Y_rdd_with_index.groupBy(lambda i: i[1] / 500)  # 41.1 ms


def build_matris(tupl):
    iterables = tupl[1]
    result = list()
    for vec in iterables:
        ary = vec[0].toArray()
        result.append(ary)