Example 1
class MatrixUDTTests(MLlibTestCase):

    dm1 = DenseMatrix(3, 2, [0, 1, 4, 5, 9, 10])
    dm2 = DenseMatrix(3, 2, [0, 1, 4, 5, 9, 10], isTransposed=True)
    sm1 = SparseMatrix(1, 1, [0, 1], [0], [2.0])
    sm2 = SparseMatrix(2, 1, [0, 0, 1], [0], [5.0], isTransposed=True)
    udt = MatrixUDT()

    def test_json_schema(self):
        self.assertEqual(MatrixUDT.fromJson(self.udt.jsonValue()), self.udt)

    def test_serialization(self):
        for m in [self.dm1, self.dm2, self.sm1, self.sm2]:
            self.assertEqual(m, self.udt.deserialize(self.udt.serialize(m)))

    def test_infer_schema(self):
        rdd = self.sc.parallelize([("dense", self.dm1), ("sparse", self.sm1)])
        df = rdd.toDF()
        schema = df.schema
        self.assertEqual(schema.fields[1].dataType, self.udt)
        matrices = df.rdd.map(lambda x: x._2).collect()
        self.assertEqual(len(matrices), 2)
        for m in matrices:
            if isinstance(m, DenseMatrix):
                self.assertEqual(m, self.dm1)
            elif isinstance(m, SparseMatrix):
                self.assertEqual(m, self.sm1)
            else:
                raise ValueError("Expected a matrix but got type %r" % type(m))
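For context, MatrixUDT is the SQL user-defined type that lets DenseMatrix and SparseMatrix values be stored in DataFrame columns. Below is a minimal sketch of declaring such a column explicitly instead of relying on schema inference as the test does; it assumes an active SparkSession named spark (the test gets its context from MLlibTestCase).

from pyspark.sql.types import StructType, StructField, StringType
from pyspark.mllib.linalg import DenseMatrix, MatrixUDT

schema = StructType([
    StructField("name", StringType(), False),
    StructField("mat", MatrixUDT(), False),
])
df = spark.createDataFrame(
    [("dense", DenseMatrix(3, 2, [0, 1, 4, 5, 9, 10]))], schema)
df.printSchema()  # the "mat" column is typed by MatrixUDT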
Example 2
 def _transform(self, data, X):
     logger.info("Transforming data")
     loadings = self.model.loadings[:self.n_components]
     loadings = DenseMatrix(X.numCols(), self.n_components,
                            loadings.flatten())
     X = X.multiply(loadings)
     data = join(data, X, self.spark)
     del X
     return data
Example 3
def fourier(X: RowMatrix, n_features, seed=23, gamma=1):
    p = X.numCols()
    random_state = numpy.random.RandomState(seed)

    w = numpy.sqrt(2 * gamma) * random_state.normal(size=(p, n_features))
    w = DenseMatrix(p, n_features, w.flatten(), isTransposed=True)
    b = random_state.uniform(0, 2 * numpy.pi, size=n_features)

    Y = fourier_transform(X, w, b)
    return Y, w, b
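The fourier_transform helper used above is not shown. Here is a minimal sketch of what it plausibly computes, assuming the standard random Fourier feature map sqrt(2 / n_features) * cos(X w + b) applied row by row; the helper name comes from the snippet, but this body is an assumption rather than the original implementation.

import numpy
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg.distributed import RowMatrix


def fourier_transform(X: RowMatrix, w, b):
    # Project the rows onto the random directions: (n x p) times (p x n_features).
    projected = X.multiply(w)
    scale = numpy.sqrt(2.0 / w.numCols)
    # Apply the cosine non-linearity and the scaling to every projected row.
    rows = projected.rows.map(
        lambda row: Vectors.dense(scale * numpy.cos(row.toArray() + b)))
    return RowMatrix(rows)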
Example 4
 def _transform(self, data):
     logger.info("Transforming data")
     W = self.model.loadings[:, :self.n_components]
     W = DenseMatrix(numRows=W.shape[0],
                     numCols=W.shape[1],
                     isTransposed=True,
                     values=W.flatten())
     X = self._row_matrix(data).multiply(W)
     data = join(data, X, self.spark)
     del X
     return data
Example 5
    def test_repr_dense_matrix(self):
        mat = DenseMatrix(3, 2, [0, 1, 4, 6, 8, 10])
        self.assertEqual(repr(mat), "DenseMatrix(3, 2, [0.0, 1.0, 4.0, 6.0, 8.0, 10.0], False)")

        mat = DenseMatrix(3, 2, [0, 1, 4, 6, 8, 10], True)
        self.assertEqual(repr(mat), "DenseMatrix(3, 2, [0.0, 1.0, 4.0, 6.0, 8.0, 10.0], True)")

        mat = DenseMatrix(6, 3, zeros(18))
        self.assertEqual(
            repr(mat),
            "DenseMatrix(6, 3, [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..., "
            "0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], False)",
        )
Example 6
def main():
    datasetfile = sys.argv[1]
    beta = 0.8
    iterations = 40
    top_k = 5

    sparkcontext = SparkContext("local", "Page Rank")
    data = sparkcontext.textFile(datasetfile)
    source_dest = data.map(make_key_value_pair_1)
    source_dest_count = data.map(make_key_value_pair_2)
    groupbykey = source_dest.groupByKey()
    number_of_nodes = groupbykey.count()
    out_degree = groupbykey.map(calc_out_degree)
    pair_map = groupbykey.collectAsMap()

    matrix_m = np.zeros(shape=(number_of_nodes, number_of_nodes))
    for key, value in pair_map.items():
        for ind_value in value:
            matrix_m[ind_value - 1][key - 1] += 1 / len(list(value))

    matrix_m = sparkcontext.parallelize(matrix_m)
    matrix_m = RowMatrix(matrix_m)

    vector_r_prev = np.empty([number_of_nodes, 1])
    vector_r_prev.fill(1 / number_of_nodes)
    vector_r_prev = DenseMatrix(number_of_nodes, 1, vector_r_prev)

    index = 0
    while (index < iterations):
        mul_val = matrix_m.multiply(vector_r_prev).rows.collect()
        mul_val = [i * beta for i in mul_val]
        mul_val = [i + (1 - beta) / number_of_nodes for i in mul_val]
        vector_r_prev = DenseMatrix(number_of_nodes, 1, mul_val)
        index += 1

    vector_r_prev = vector_r_prev.toArray()
    largest_values = heapq.nlargest(top_k, vector_r_prev)
    largest_indexes = heapq.nlargest(top_k, range(number_of_nodes),
                                     vector_r_prev.__getitem__)
    smallest_values = heapq.nsmallest(top_k, vector_r_prev)
    smallest_indexes = heapq.nsmallest(top_k, range(number_of_nodes),
                                       vector_r_prev.__getitem__)

    largest_indexes = [val + 1 for val in largest_indexes]
    smallest_indexes = [val + 1 for val in smallest_indexes]

    print("Value of largest n nodes\n", largest_values)
    print("Node numbers of largest n nodes\n", largest_indexes)
    print("Value of smallest n nodes\n", smallest_values)
    print("Node numbers of smallest n nodes\n", smallest_indexes)
    sparkcontext.stop()
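For reference, the loop above is the damped PageRank power iteration r <- beta * M * r + (1 - beta) / n expressed with Spark matrices. The same update in plain NumPy, on a hypothetical 3-node graph chosen only for illustration, looks like this:

import numpy as np

beta, iterations = 0.8, 40
# Column-stochastic transition matrix M for a made-up graph:
# node 1 links to nodes 2 and 3, node 2 links to node 3, node 3 links to node 1.
M = np.array([[0.0, 0.0, 1.0],
              [0.5, 0.0, 0.0],
              [0.5, 1.0, 0.0]])
n = M.shape[0]
r = np.full(n, 1.0 / n)
for _ in range(iterations):
    r = beta * (M @ r) + (1.0 - beta) / n
print(r)  # stationary scores; larger means more central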
Example 7
    def test_dense_matrix_is_transposed(self):
        mat1 = DenseMatrix(3, 2, [0, 4, 1, 6, 3, 9], isTransposed=True)
        mat = DenseMatrix(3, 2, [0, 1, 3, 4, 6, 9])
        self.assertEqual(mat1, mat)

        expected = [[0, 4], [1, 6], [3, 9]]
        for i in range(3):
            for j in range(2):
                self.assertEqual(mat1[i, j], expected[i][j])
        self.assertTrue(array_equal(mat1.toArray(), expected))

        sm = mat1.toSparse()
        self.assertTrue(array_equal(sm.rowIndices, [1, 2, 0, 1, 2]))
        self.assertTrue(array_equal(sm.colPtrs, [0, 2, 5]))
        self.assertTrue(array_equal(sm.values, [1, 3, 4, 6, 9]))
Example 8
 def _transform(self, data, X):
     logger.info("Transforming data")
     loadings = self.model.loadings.T
     L = DenseMatrix(numRows=loadings.shape[0], numCols=loadings.shape[1],
                     values=loadings.flatten(), isTransposed=True)
     data = join(data, X.multiply(L), self.spark)
     return data
Example 9
    def _fit(self, dataset):
        sc = SparkContext.getOrCreate()

        x = dataset.select(self.getFeaturesCol())
        rddv = x.rdd.map(list)
        # calculate the distance matrix
        Aarr = self._dist_matrix(rddv, rddv, sc)
        np.fill_diagonal(Aarr, 0)
        D = list(map(lambda x: np.sum(x), Aarr))
        Darr = np.diag(np.sqrt(np.divide(1, D)))
        # Laplacian matrix: diag(D) - A, where D holds the row sums of the affinity matrix
        Ln = np.diag(D) - Aarr
        # normalize: D^(-1/2) * (diag(D) - A) * D^(-1/2)
        Ln = np.matmul(np.matmul(Darr, Ln), Darr)
        # eigenvectors
        V, U = spark_eigen.eigen(Ln, sc, self.getTolerance())
        # K-rank reduction
        K = self.getK()
        U.rows.count()
        proj = U.rows.map(lambda x: [x[i] for i in range(0, K)])
        densep = DenseMatrix(
            proj.count(), K,
            functools.reduce(operator.iconcat, proj.collect(), []), True)
        return SpectralClusteringModel(featuresCol=self.getFeaturesCol(),
                                       predictionCol=self.getPredictionCol(),
                                       projection=densep,
                                       prevdata=rddv)
Example 10
def NpToDense(arr):
    '''
    Convert a NumPy array to a PySpark DenseMatrix so that matrix
    multiplication can be done.
    :param arr: a 2-D NumPy array
    '''
    nrows, ncols = arr.shape
    # arr.flatten() is row-major, so mark the DenseMatrix as transposed (row-major).
    return DenseMatrix(nrows, ncols, arr.flatten(), isTransposed=True)
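A quick sanity check of the helper (a usage sketch, with an arbitrary 2x3 array): because arr.flatten() is row-major and the matrix is flagged as transposed, toArray() reproduces the original array.

import numpy as np

arr = np.array([[1.0, 2.0, 3.0],
                [4.0, 5.0, 6.0]])
dm = NpToDense(arr)
assert dm.numRows == 2 and dm.numCols == 3
assert np.array_equal(dm.toArray(), arr)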
Example 11
 def test_serialize(self):
     self._test_serialize(DenseVector(range(10)))
     self._test_serialize(DenseVector(array([1., 2., 3., 4.])))
     self._test_serialize(DenseVector(pyarray.array('d', range(10))))
     self._test_serialize(SparseVector(4, {1: 1, 3: 2}))
     self._test_serialize(SparseVector(3, {}))
     self._test_serialize(DenseMatrix(2, 3, range(6)))
Example 12
def to_dense(rowmatrix):

    densev = rowmatrix.rows.collect()
    el = lambda x: [a for a in x]
    M = list(map(el, densev))
    L = functools.reduce(operator.iconcat, M, [])
    return DenseMatrix(rowmatrix.numRows(), rowmatrix.numCols(), L, True)
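A minimal usage sketch, assuming an active SparkContext named sc: collect a tiny RowMatrix back into a local DenseMatrix and compare it with the original rows.

import numpy as np
from pyspark.mllib.linalg.distributed import RowMatrix

rows = [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]
dm = to_dense(RowMatrix(sc.parallelize(rows)))
# The flattened rows were passed with isTransposed=True (row-major),
# so toArray() matches the original row layout.
assert np.array_equal(dm.toArray(), np.array(rows))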
Example 13
 def test_ml_mllib_matrix_conversion(self):
     # to ml
     # dense
     mllibDM = Matrices.dense(2, 2, [0, 1, 2, 3])
     mlDM1 = newlinalg.Matrices.dense(2, 2, [0, 1, 2, 3])
     mlDM2 = mllibDM.asML()
     self.assertEqual(mlDM2, mlDM1)
     # transposed
     mllibDMt = DenseMatrix(2, 2, [0, 1, 2, 3], True)
     mlDMt1 = newlinalg.DenseMatrix(2, 2, [0, 1, 2, 3], True)
     mlDMt2 = mllibDMt.asML()
     self.assertEqual(mlDMt2, mlDMt1)
     # sparse
     mllibSM = Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
     mlSM1 = newlinalg.Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1],
                                       [2, 3, 4])
     mlSM2 = mllibSM.asML()
     self.assertEqual(mlSM2, mlSM1)
     # transposed
     mllibSMt = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
     mlSMt1 = newlinalg.SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4],
                                     True)
     mlSMt2 = mllibSMt.asML()
     self.assertEqual(mlSMt2, mlSMt1)
     # from ml
     # dense
     mllibDM1 = Matrices.dense(2, 2, [1, 2, 3, 4])
     mlDM = newlinalg.Matrices.dense(2, 2, [1, 2, 3, 4])
     mllibDM2 = Matrices.fromML(mlDM)
     self.assertEqual(mllibDM1, mllibDM2)
     # transposed
     mllibDMt1 = DenseMatrix(2, 2, [1, 2, 3, 4], True)
     mlDMt = newlinalg.DenseMatrix(2, 2, [1, 2, 3, 4], True)
     mllibDMt2 = Matrices.fromML(mlDMt)
     self.assertEqual(mllibDMt1, mllibDMt2)
     # sparse
     mllibSM1 = Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
     mlSM = newlinalg.Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
     mllibSM2 = Matrices.fromML(mlSM)
     self.assertEqual(mllibSM1, mllibSM2)
     # transposed
     mllibSMt1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
     mlSMt = newlinalg.SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4],
                                    True)
     mllibSMt2 = Matrices.fromML(mlSMt)
     self.assertEqual(mllibSMt1, mllibSMt2)
Example 14
 def _compute_w_row(self, Xw, w, W, idx):
     g, gd = self._exp(Xw.multiply(DenseMatrix(len(w), 1, w)))
     w_new = column_means(elementwise_product(Xw, g, self.spark))
     del g
     w_new = w_new - gd * w
     w_new = gs_decorrelate(w_new, W, idx)
     w_new /= scipy.sqrt((w_new**2).sum())
     return w_new
Example 15
 def test_serialize(self):
     self._test_serialize(DenseVector(range(10)))
     self._test_serialize(DenseVector(array([1.0, 2.0, 3.0, 4.0])))
     self._test_serialize(DenseVector(pyarray.array("d", range(10))))
     self._test_serialize(SparseVector(4, {1: 1, 3: 2}))
     self._test_serialize(SparseVector(3, {}))
     self._test_serialize(DenseMatrix(2, 3, range(6)))
     sm1 = SparseMatrix(3, 4, [0, 2, 2, 4, 4], [1, 2, 1, 2], [1.0, 2.0, 4.0, 5.0])
     self._test_serialize(sm1)
Example 16
 def decorator(row):
     for i in xrange(len(schema)):
         if type(schema[i][1]) == _Matrix:
             shape = row[i].shape
             # By default, MLlib's DenseMatrix is column-major,
             # so the ndarray is transposed before flattening to keep the layouts consistent.
             arr = row[i].transpose().flatten()
             row[i] = DenseMatrix(shape[0], shape[1], arr)
     return row
Example 17
    def test_matrix_indexing(self):
        mat = DenseMatrix(3, 2, [0, 1, 4, 6, 8, 10])
        expected = [[0, 6], [1, 8], [4, 10]]
        for i in range(3):
            for j in range(2):
                self.assertEqual(mat[i, j], expected[i][j])

        for i, j in [(-1, 0), (4, 1), (3, 4)]:
            self.assertRaises(IndexError, mat.__getitem__, (i, j))
Example 18
 def test_ml_mllib_matrix_conversion(self):
     # to ml
     # dense
     mllibDM = Matrices.dense(2, 2, [0, 1, 2, 3])
     mlDM1 = newlinalg.Matrices.dense(2, 2, [0, 1, 2, 3])
     mlDM2 = mllibDM.asML()
     self.assertEqual(mlDM2, mlDM1)
     # transposed
     mllibDMt = DenseMatrix(2, 2, [0, 1, 2, 3], True)
     mlDMt1 = newlinalg.DenseMatrix(2, 2, [0, 1, 2, 3], True)
     mlDMt2 = mllibDMt.asML()
     self.assertEqual(mlDMt2, mlDMt1)
     # sparse
     mllibSM = Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
     mlSM1 = newlinalg.Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
     mlSM2 = mllibSM.asML()
     self.assertEqual(mlSM2, mlSM1)
     # transposed
     mllibSMt = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
     mlSMt1 = newlinalg.SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
     mlSMt2 = mllibSMt.asML()
     self.assertEqual(mlSMt2, mlSMt1)
     # from ml
     # dense
     mllibDM1 = Matrices.dense(2, 2, [1, 2, 3, 4])
     mlDM = newlinalg.Matrices.dense(2, 2, [1, 2, 3, 4])
     mllibDM2 = Matrices.fromML(mlDM)
     self.assertEqual(mllibDM1, mllibDM2)
     # transposed
     mllibDMt1 = DenseMatrix(2, 2, [1, 2, 3, 4], True)
     mlDMt = newlinalg.DenseMatrix(2, 2, [1, 2, 3, 4], True)
     mllibDMt2 = Matrices.fromML(mlDMt)
     self.assertEqual(mllibDMt1, mllibDMt2)
     # sparse
     mllibSM1 = Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
     mlSM = newlinalg.Matrices.sparse(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4])
     mllibSM2 = Matrices.fromML(mlSM)
     self.assertEqual(mllibSM1, mllibSM2)
     # transposed
     mllibSMt1 = SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
     mlSMt = newlinalg.SparseMatrix(2, 2, [0, 2, 3], [0, 1, 1], [2, 3, 4], True)
     mllibSMt2 = Matrices.fromML(mlSMt)
     self.assertEqual(mllibSMt1, mllibSMt2)
Example 19
 def to_local_matrix(self):
     """
     Converts the LD matrix to a local Spark matrix.
     
     .. caution::
     
         Only call this method when the LD matrix is small enough to fit in local memory on the driver. 
     
     :return: Matrix of Pearson correlation values.
     :rtype: `Matrix <https://spark.apache.org/docs/2.1.0/api/python/pyspark.mllib.html#pyspark.mllib.linalg.Matrix>`__
     """
     j_local_mat = self._jldm.toLocalMatrix()
     return DenseMatrix(j_local_mat.numRows(), j_local_mat.numCols(),
                        list(j_local_mat.toArray()),
                        j_local_mat.isTransposed())
Example 20
 def decorator(row):
     result = []
     from pyspark.mllib.linalg import DenseMatrix
     for i in xrange(len(schema)):
         if type(schema[i][1]) == dtypes._Matrix:
             shape = row[i].shape
             arr = row[i].flatten()
             # By default, MLlib's DenseMatrix is column-major.
             # Setting isTransposed=True constructs a row-major DenseMatrix instead.
             dm = DenseMatrix(shape[0],
                              shape[1],
                              arr,
                              isTransposed=True)
             result.append(dm)
         else:
             result.append(row[i])
     return result
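The comments in the two decorator examples above describe the key layout detail: a DenseMatrix built without isTransposed reads its values column by column, while isTransposed=True reads them row by row. A small stand-alone illustration (the 2x3 array is arbitrary):

import numpy as np
from pyspark.mllib.linalg import DenseMatrix

arr = np.arange(6.0).reshape(2, 3)                # [[0, 1, 2], [3, 4, 5]]
col_major = DenseMatrix(2, 3, arr.T.flatten())    # transpose, then flatten
row_major = DenseMatrix(2, 3, arr.flatten(), isTransposed=True)
assert np.array_equal(col_major.toArray(), arr)
assert np.array_equal(row_major.toArray(), arr)
assert col_major == row_major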
Example 21
 def test_eq(self):
     v1 = DenseVector([0.0, 1.0, 0.0, 5.5])
     v2 = SparseVector(4, [(1, 1.0), (3, 5.5)])
     v3 = DenseVector([0.0, 1.0, 0.0, 5.5])
     v4 = SparseVector(6, [(1, 1.0), (3, 5.5)])
     v5 = DenseVector([0.0, 1.0, 0.0, 2.5])
     v6 = SparseVector(4, [(1, 1.0), (3, 2.5)])
     dm1 = DenseMatrix(2, 2, [2, 0, 0, 0])
     sm1 = SparseMatrix(2, 2, [0, 2, 3], [0], [2])
     self.assertEqual(v1, v2)
     self.assertEqual(v1, v3)
     self.assertFalse(v2 == v4)
     self.assertFalse(v1 == v5)
     self.assertFalse(v1 == v6)
     # this is done as Dense and Sparse matrices can be semantically
     # equal while still implementing a different __eq__ method
     self.assertEqual(dm1, sm1)
     self.assertEqual(sm1, dm1)
Example 22
    def to_local_matrix(self):
        """
        Converts the LD matrix to a local Spark matrix.
        
        .. caution::
        
            Only call this method when the LD matrix is small enough to fit in local memory on the driver. 
        
        :return: Matrix of Pearson correlation values.
        :rtype: `Matrix <https://spark.apache.org/docs/2.1.0/api/python/pyspark.mllib.html#pyspark.mllib.linalg.Matrix>`__
        """
        from pyspark.mllib.linalg import DenseMatrix

        j_local_mat = self._jldm.toLocalMatrix()
        assert j_local_mat.majorStride() == j_local_mat.rows()
        assert j_local_mat.offset() == 0
        assert j_local_mat.isTranspose() == False
        return DenseMatrix(j_local_mat.rows(), j_local_mat.cols(), list(j_local_mat.data()), False)
Example 23
def multiply_matrices(A: np.ndarray, B: np.ndarray) -> np.ndarray:
    global counter
    print()
    print("No." + str(counter) + " matrix multiplication starts")
    start_time = time.time()
    print("matrix shape:", A.shape)
    rddA = sc.parallelize(A.tolist())
    matA = RowMatrix(rddA)

    matB = DenseMatrix(B.shape[0],
                       B.shape[1],
                       B.flatten().tolist(),
                       isTransposed=True)

    matC = matA.multiply(matB)
    rows = matC.rows.collect()
    res = np.array([row.toArray() for row in rows])
    elapsed_time = time.time() - start_time
    print("No." + str(counter) + " matrix multiplication ends, takes time:",
          elapsed_time)
    counter = counter + 1
    return res
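A minimal check of the helper against plain NumPy, assuming the module-level sc (SparkContext) and counter variables the function relies on are already defined:

import numpy as np

A = np.random.rand(4, 3)
B = np.random.rand(3, 2)
C = multiply_matrices(A, B)
# The distributed product should agree with the local one up to rounding.
assert np.allclose(C, A @ B)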
Example 24
 def _whiten(self, X):
     s, v, _ = svd(X, X.numCols())
     K = (v.T / s)[:, :self.n_components]
     S = K * scipy.sqrt(X.numRows())
     S = DenseMatrix(S.shape[0], S.shape[1], S.flatten(), True)
     return X.multiply(S), K
Example 25
 def test_matrix_indexing(self):
     mat = DenseMatrix(3, 2, [0, 1, 4, 6, 8, 10])
     expected = [[0, 6], [1, 8], [4, 10]]
     for i in range(3):
         for j in range(2):
             self.assertEqual(mat[i, j], expected[i][j])
Example 26
    def calculate_distance(self, sdf1, sdf2):
        """
        This will calculate the distance between the vector-type columns of two spark dataframes

        :param sdf1: This is to have a columns id1 (dtype int) and v1 (dtype Vector)
        :param sdf2: This is to have a columns id2 (dtype int) and v2 (dtype Vector)
        :return:
        """

        cov = RowMatrix(
            sdf1.select(["v1"]).withColumnRenamed("v1", "v").union(
                sdf2.select(["v2"]).withColumnRenamed(
                    "v2", "v")).rdd.map(lambda row: Vectors.fromML(row.asDict(
                    )["v"]))).computeCovariance().toArray()

        x, v = np.linalg.eigh(cov)

        indices = 1e-10 <= x

        # we are trying to enforce that the data types are plain Python types
        n = int(v.shape[0])
        m = int(indices.sum())

        v_vals = [float(val) for val in v[:, indices].reshape(-1, ).tolist()]

        v_spark = DenseMatrix(n, m, v_vals)

        x_vals = [
            float(val)
            for val in np.diag(x[indices]**-0.5).reshape(-1, ).tolist()
        ]

        x_spark = DenseMatrix(m, m, x_vals)

        # we get the index to maintain the order
        _sdf1 = sdf1.rdd.zipWithIndex()\
            .map(lambda val_key: Row(id1=val_key[0].id1, v1=val_key[0].v1, index=val_key[1])).toDF()

        _sdf1.persist()

        _sdf2 = sdf2.rdd.zipWithIndex()\
            .map(lambda val_key: Row(id2=val_key[0].id2, v2=val_key[0].v2, index=val_key[1])).toDF()

        _sdf2.persist()

        # we get our indexed row matrix
        _sdf1_mat = IndexedRowMatrix(
            _sdf1.rdd.map(lambda row: IndexedRow(index=row.asDict()["index"],
                                                 vector=Vectors.fromML(
                                                     row.asDict()["v1"]))))

        _sdf2_mat = IndexedRowMatrix(
            _sdf2.rdd.map(lambda row: IndexedRow(index=row.asDict()["index"],
                                                 vector=Vectors.fromML(
                                                     row.asDict()["v2"]))))

        # we apply our transformation and then set it as our new variable
        _sdf1 = _sdf1.drop("v1").join(_sdf1_mat.multiply(v_spark).multiply(x_spark).rows\
                                      .map(lambda indexed_row: Row(index=indexed_row.index,
                                                                   v1=indexed_row.vector)).toDF(), "index")

        _sdf2 = _sdf2.drop("v2").join(_sdf2_mat.multiply(v_spark).multiply(x_spark).rows\
                                      .map(lambda indexed_row: Row(index=indexed_row.index,
                                                                   v2=indexed_row.vector)).toDF(), "index")

        @F.udf(DoubleType(), VectorUDT())
        def tmp(vec):
            return float(vec[0].squared_distance(vec[1]))**0.5

        all_sdf = _sdf1.crossJoin(_sdf2)

        dist_sdf = all_sdf.select("*", tmp(F.array('v1', 'v2')).alias('diff'))

        dist_sdf.persist()

        return dist_sdf
Example 27
model = als.fit(ratings)

#Embeddings

a = model.itemFactors
b = a.sort("id")
b.show()

# Creating a dense matrix from the business embeddings
values = (b.rdd.map(lambda x: (x.id, x.features)).sortByKey().flatMap(
    lambda (x, y): y).collect())

nrow = len(b.rdd.map(lambda x: x.features).first())
ncol = b.count()

dm = DenseMatrix(nrow, ncol, values)
dm.toArray().shape
z = dm.toArray().transpose()

#t-sne

tsne = TSNE(n_components=2)
X_tsne = tsne.fit_transform(z)

# creating data frame with t-sne results and business_id
e = sqlContext.createDataFrame(pd.DataFrame(X_tsne))
e_df = e.toPandas()
j = b.select("id")
j_df = j.toPandas()
result = pd.concat([e_df, j_df], axis=1, ignore_index=True)
result = pd.DataFrame(result)
Example 28
tf_matrix2 = tf_matrix1.map(lambda i: [int(token) for token in i])

tf_matrix3 = tf_matrix2.map(lambda i: np.array(i))

tf_matrix4 = tf_matrix3.map(lambda i: (len(i), np.nonzero(i)[0], i[i!=0]))

tf_matrix5 = tf_matrix4.map(lambda i: Vectors.sparse(i[0], i[1], i[2]))

# 1 min 49 s for all of the above steps

tf_matrix5.cache()

row_tf_matrix = RowMatrix(tf_matrix5)

random_matrix = DenseMatrix(WORDS_COUNT, K+P, np.random.randn(WORDS_COUNT * (K+P))) # local matrix

Y = row_tf_matrix.multiply(random_matrix)  # 38.5 s; Y is a RowMatrix (a distributed matrix)

#  ============== QR 

Y_rdd = Y.rows  # 17.5ms
Y_rdd_with_index = Y_rdd.zipWithIndex()  # 1min 48s
Y_grouped = Y_rdd_with_index.groupBy(lambda i: i[1]/500)  # 41.1ms

def build_matris(tupl):
	iterables = tupl[1]
	result = list()
	for vec in iterables:
		ary = vec[0].toArray()
		result.append(ary)