Example #1
def elementwise_product(X: RowMatrix, Y: RowMatrix, spark):
    X = as_df_with_idx(X, "idx", spark)
    Y = as_df_with_idx(Y, "idx", spark)
    Y = Y.withColumnRenamed("_1", "_2")
    X = X.join(Y, on="idx").drop("idx")
    X = X.rdd.map(lambda x: scipy.array(x[0]) * scipy.array(x[1]))
    return X
Example #2
    def __init__(self, filename):
        ratings = spark.read.option("inferSchema", "true") \
                            .option("header", "true").csv(filename)
        # Count the distinct users and items.
        self.num_users = ratings.select("userId").distinct().count()
        self.num_items = ratings.select("movieId").distinct().count()
        # Hold out roughly 30% of the items for testing.
        msk = set(np.random.choice(np.arange(1, self.num_items + 1),
                                   round(self.num_items * 0.3), replace=False))
        is_test = udf(lambda c: bool(c in msk), BooleanType())
        newdf = ratings.withColumn('test', is_test(col('movieId'))) \
                       .drop(col('timestamp'))
        self.trainDF = newdf.filter(col('test') == False)
        self.testDF = newdf.filter(col('test') == True)
        # One sparse row per rating, indexed by movieId.
        rdd_tr = self.trainDF.rdd.map(lambda r: Vectors.sparse(
            self.num_items, {r.movieId: r.rating}))
        rdd_te = self.testDF.rdd.map(lambda r: Vectors.sparse(
            self.num_items, {r.movieId: r.rating}))
        self.tr_mat = RowMatrix(rdd_tr)
        self.te_mat = RowMatrix(rdd_te)
Example #3
    def pca(self, df, k=1):
        cov = RowMatrix(
            df.rdd.map(lambda x: list(x))).computeCovariance().toArray()
        col = cov.shape[1]
        eigVals, eigVecs = np.linalg.eigh(cov)
        # Sort the eigenpairs in descending order of eigenvalue.
        inds = np.argsort(eigVals)
        eigVecs = eigVecs.T[inds[-1:-(col + 1):-1]]
        eigVals = eigVals[inds[-1:-(col + 1):-1]]
        components = RowMatrix(
            df.rdd.map(lambda x: list(x))).computePrincipalComponents(k)

        train_data = df.rdd.map(
            lambda x: Row(features=Vectors.dense(x))).toDF()

        pca = PCA(k=k, inputCol="features", outputCol="pcaFeatures")
        model = pca.fit(train_data)
        score = model.transform(train_data)

        res = {
            "components": components.toArray(),
            "score": np.array(
                score.select("pcaFeatures").rdd.map(
                    lambda x: list(x[0])).collect()),
            "eigVectors": eigVecs,
            "eigValues": eigVals
        }

        return res
Example #4
def compute_similarities(X, sc, threshold=0):
    """ Compute column similarities using Spark:
    Efficient dealing of sparsity with a threshold
    that makes sure that only relevant similarities are computed.
    
    Parameters
    ----------
    X: an array whose features are the rows
    sc: SparkContext
    threshold: the similarity threshold
    
    Return
    ---------
    Symetric similarity matrix shape (X.shape[1], X.shape[1])
    """
    n = X.shape[1]
    rows = sc.parallelize(X)
    mat = RowMatrix(rows)

    sims = mat.columnSimilarities(threshold)
    # Convert to scipy sparse matrix
    # Each element is a Matrix entry object (i, j, value)
    rows_index = np.array(
        sims.entries.map(lambda x: x.i).collect()).astype(int)
    cols_index = np.array(
        sims.entries.map(lambda x: x.j).collect()).astype(int)
    values = np.array(sims.entries.map(lambda x: x.value).collect())
    triang_sup = coo_matrix((values, (rows_index, cols_index)), shape=(n, n))
    triang_inf = coo_matrix((values, (cols_index, rows_index)), shape=(n, n))

    return ((triang_sup + triang_inf).tocsr())
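A minimal usage sketch for compute_similarities (not from the original project; it assumes a local SparkContext and a small toy matrix):

import numpy as np
from pyspark import SparkContext

sc = SparkContext("local", "similarity-demo")
X = np.array([[1.0, 0.0, 2.0],
              [0.0, 3.0, 1.0],
              [4.0, 0.0, 0.0]])
# Returns a scipy CSR matrix of shape (3, 3).
sims = compute_similarities(X, sc, threshold=0.1)
print(sims.toarray())
sc.stop()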
Example #5
    def cluster(self, df, session, repartition_num=8):
        n = df.count()
        # index rows
        df_index = df.select((row_number().over(
            Window.partitionBy(lit(0)).orderBy(self.featureCol)) -
                              1).alias('id'), "*")
        df_features = df_index.select('id', self.featureCol)

        # prep for joining
        df_features = df_features.repartitionByRange(repartition_num, 'id')

        left_df = df_features.select(
            df_features['id'].alias('left_id'),
            df_features[self.featureCol].alias('left_features'))
        right_df = df_features.select(
            df_features['id'].alias('right_id'),
            df_features[self.featureCol].alias('right_features'))

        # join on self where left_id does not equal right_id
        joined_df = left_df.join(right_df,
                                 left_df['left_id'] != right_df['right_id'])

        # compute cosine similarity between vectors
        joined_df = joined_df.select(
            'left_id', 'right_id',
            cosine_similarity_udf(
                array(joined_df['left_features'],
                      joined_df['right_features'])).alias('norm'))
        ranked = joined_df.select(
            'left_id', 'right_id',
            rank().over(
                Window.partitionBy('left_id').orderBy('norm')).alias('rank'))
        knn = ranked.where(ranked['rank'] <= 5)
        knn_grouped = knn.groupBy('left_id').agg(
            f.collect_list('right_id').alias('nn'))

        # generate laplacian
        laplacian = knn_grouped.select(
            'left_id',
            laplacian_vector_udf(knn_grouped['left_id'], knn_grouped['nn'],
                                 lit(n),
                                 lit(self.k_nearest)).alias('lap_vector'))

        laplacian_matrix = RowMatrix(
            laplacian.select('lap_vector').rdd.map(lambda x: x[0]))
        eigenvectors = laplacian_matrix.computePrincipalComponents(
            k=self.num_eigenvectors)

        eigenvectors = [
            (idx, Vectors.dense([float(item) for item in row]))
            for idx, row in enumerate(eigenvectors.toArray().tolist())
        ]

        eigen_df = session.createDataFrame(eigenvectors,
                                           ['id', self.featureCol])
        model = KMeans(featuresCol=self.featureCol,
                       predictionCol=self.predictionCol,
                       k=self.k).fit(eigen_df)
        predictions = model.transform(eigen_df).join(df_index, on='id')
        return predictions
Example #6
 def test_row_matrix_from_dataframe(self):
     from pyspark.sql.utils import IllegalArgumentException
     df = self.spark.createDataFrame([Row(Vectors.dense(1))])
     row_matrix = RowMatrix(df)
     self.assertEqual(row_matrix.numRows(), 1)
     self.assertEqual(row_matrix.numCols(), 1)
     with self.assertRaises(IllegalArgumentException):
         RowMatrix(df.selectExpr("'monkey'"))
Example #7
def within_group_scatter(data: pyspark.sql.DataFrame, features, response,
                         targets):
    p = len(features)
    sw = numpy.zeros((p, p))
    for target in targets:
        df_t = data.filter("{} == '{}'".format(response, target))
        X_t = RowMatrix(df_t.select(features).rdd.map(numpy.array))
        # Accumulate the within-class scatter: (n_t - 1) times the class covariance.
        sw += X_t.computeCovariance().toArray() * (df_t.count() - 1)
    return sw
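A small usage sketch for within_group_scatter (illustrative only; the DataFrame, column names, and labels below are made up):

import numpy
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(1.0, 2.0, 'a'), (2.0, 3.0, 'a'), (5.0, 1.0, 'b'), (6.0, 2.0, 'b')],
    ['x1', 'x2', 'label'])
# 2x2 within-class scatter matrix, summed over both classes.
sw = within_group_scatter(df, ['x1', 'x2'], 'label', ['a', 'b'])
print(sw)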
Example #8
def join(data: sql.DataFrame, X: RowMatrix, spark, on=FEATURES__):
    as_ml = udf(lambda v: v.asML() if v is not None else None, VectorUDT())

    X = spark.createDataFrame(X.rows.map(lambda x: (x,)))
    X = X.withColumnRenamed("_1", on)
    X = X.withColumn(on, as_ml(on))

    ri = "row_index"
    X = X.withColumn(ri, func.monotonically_increasing_id())
    data = data.withColumn(ri, func.monotonically_increasing_id())
    data = data.join(X[ri, on], on=[ri]).drop(ri)

    return data
Example #9
def main():
    datasetfile = sys.argv[1]
    beta = 0.8
    iterations = 40
    top_k = 5

    sparkcontext = SparkContext("local", "Page Rank")
    data = sparkcontext.textFile(datasetfile)
    source_dest = data.map(make_key_value_pair_1)
    source_dest_count = data.map(make_key_value_pair_2)
    groupbykey = source_dest.groupByKey()
    number_of_nodes = groupbykey.count()
    out_degree = groupbykey.map(calc_out_degree)
    pair_map = groupbykey.collectAsMap()

    matrix_m = np.zeros(shape=(number_of_nodes, number_of_nodes))
    for key, value in pair_map.items():
        for ind_value in value:
            matrix_m[ind_value - 1][key - 1] += 1 / len(list(value))

    matrix_m = sparkcontext.parallelize(matrix_m)
    matrix_m = RowMatrix(matrix_m)

    vector_r_prev = np.empty([number_of_nodes, 1])
    vector_r_prev.fill(1 / number_of_nodes)
    vector_r_prev = DenseMatrix(number_of_nodes, 1, vector_r_prev)

    # Damped power iteration: r <- beta * M * r + (1 - beta) / n
    index = 0
    while index < iterations:
        mul_val = matrix_m.multiply(vector_r_prev).rows.collect()
        mul_val = [i * beta for i in mul_val]
        mul_val = [i + (1 - beta) / number_of_nodes for i in mul_val]
        vector_r_prev = DenseMatrix(number_of_nodes, 1, mul_val)
        index += 1

    vector_r_prev = vector_r_prev.toArray()
    largest_values = heapq.nlargest(top_k, vector_r_prev)
    largest_indexes = heapq.nlargest(top_k, range(number_of_nodes),
                                     vector_r_prev.__getitem__)
    smallest_values = heapq.nsmallest(top_k, vector_r_prev)
    smallest_indexes = heapq.nsmallest(top_k, range(number_of_nodes),
                                       vector_r_prev.__getitem__)

    largest_indexes = [val + 1 for val in largest_indexes]
    smallest_indexes = [val + 1 for val in smallest_indexes]

    print("Value of largest n nodes\n", largest_values)
    print("Node numbers of largest n nodes\n", largest_indexes)
    print("Value of smallest n nodes\n", smallest_values)
    print("Node numbers of smallest n nodes\n", smallest_indexes)
    sparkcontext.stop()
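For reference, a plain NumPy sketch of the same damped power iteration (illustrative only, not part of the original script):

import numpy as np

def pagerank_local(M, beta=0.8, iterations=40):
    # M is the column-stochastic link matrix; r starts uniform.
    n = M.shape[0]
    r = np.full(n, 1.0 / n)
    for _ in range(iterations):
        r = beta * M.dot(r) + (1.0 - beta) / n
    return r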
Example #10
def get_svd_U(tfidf_rdd, n_topics=3):
    # distributed matrix
    matrix_rdd = RowMatrix(tfidf_rdd)

    svd = matrix_rdd.computeSVD(n_topics, computeU=True)

    # left singular vectors (a RowMatrix), m_documents x n_topics
    svd_u = svd.U

    # list of DenseVectors: [[topic_i, ...], ...]
    return svd_u.rows.collect()
Example #11
def distribution_data():
    vectors = data.map(lambda p: p.features)
    # Build a RowMatrix from the rows of the data.
    matrix = RowMatrix(vectors)
    matrixSummary = matrix.computeColumnSummaryStatistics()
    print("mean of each column:")
    print(matrixSummary.mean())
    print("min of each column:")
    print(matrixSummary.min())
    print("max of each column:")
    print(matrixSummary.max())
    print("variance of each column:")
    print(matrixSummary.variance())
Example #12
 def similarity_processing(self, tag_path):
     conf = SparkConf().setAppName("Test").setMaster("local")
     sc = SparkContext(conf=conf)
     spark = SparkSession.builder.config(conf=conf).getOrCreate()
     df = spark.read.format('com.databricks.spark.csv').options(
         header='true', inferschema='true').load(tag_path, header=True)
     df = df.drop("tagId")
     print(df.columns)
     rdd = df.rdd.map(list)
     mat = RowMatrix(rdd)
     print(mat.numCols(), mat.numRows())
     cs = mat.columnSimilarities()
     for x in cs.entries.collect():
         print(x)
     print(cs.numRows(), cs.numCols())
Example #13
def spark_abs(M1, sc):

    asarr = lambda x: x.toArray().tolist()
    M = M1.rows.collect()
    V1 = list(map(asarr, M))
    L = np.abs(V1).tolist()
    return RowMatrix(sc.parallelize(L))
Example #14
 def U(self):
     """
     Returns a RowMatrix whose columns are the left singular vectors of the SVD if computeU was set to be True.
     """
     u = self.call("U")
     if u is not None:
         return RowMatrix(u)
Example #15
def spark_sub(M1, M2, sc):

    V1 = M1.rows.collect()
    V2 = M2.rows.collect()
    lsub = lambda x1, x2: x1 - x2
    V3 = list(map(lsub, V1, V2))
    return RowMatrix(sc.parallelize(V3))
Example #16
 def _preprocess_data(self, data):
     X = self._feature_matrix(data)
     n = X.count()
     self.__means, var = column_statistics(X)
     var = var * (n - 1) / n
     X = RowMatrix(center(X, means=self.__means))
     return X, self.__means, var
Example #17
 def _preprocess_data(self, data):
     if isinstance(data, pyspark.sql.DataFrame):
         X = self._feature_matrix(data)
     else:
         X = data.rows
     X, self.__means, self.__vars = scale(X)
     return RowMatrix(X)
Example #18
    def test_pca(self):
        expected_pcs = array([[0.0, 1.0, 0.0],
                              [sqrt(2.0) / 2.0, 0.0,
                               sqrt(2.0) / 2.0],
                              [sqrt(2.0) / 2.0, 0.0, -sqrt(2.0) / 2.0]])
        n = 3
        denseMat = RowMatrix(self.sc.parallelize(self.denseData))
        sparseMat = RowMatrix(self.sc.parallelize(self.sparseData))
        for mat in [denseMat, sparseMat]:
            for k in range(1, 4):
                pcs = mat.computePrincipalComponents(k)
                self.assertEqual(pcs.numRows, n)
                self.assertEqual(pcs.numCols, k)

                # We can just test the updated principal component for equality.
                self.assertEqualUpToSign(pcs.toArray()[:, k - 1],
                                         expected_pcs[:, k - 1])
Example #19
def plot_pca(myrdd, title, color):
    mat = RowMatrix(myrdd)
    pc = mat.computePrincipalComponents(2)
    # Project the rows to the linear space spanned by the top 2 principal components.
    projected = mat.multiply(pc)
    a = projected.rows.collect()
    sum_pca1 = 0
    sum_pca2 = 0
    for i in a:
        sum_pca1 = sum_pca1 + i[0]
        sum_pca2 = sum_pca2 + i[1]
        plt.plot(i[0], i[1], 'o', color=color)
    ave_pca1 = sum_pca1 / len(a)
    ave_pca2 = sum_pca2 / len(a)
    plt.plot(ave_pca1, ave_pca2, '^', markersize=10, color='red')
    plt.title(title)
    plt.show()
Example #20
    def _transform(self, dataset):
        sc = SparkContext.getOrCreate()

        # Get the spectral clustering projection
        P = self.getProjection()
        # Get the new data
        x = dataset.select(self.getFeaturesCol())
        rdd2 = x.rdd.map(list)
        # Get the data that was used to compute the projection
        rdd = self.getPrevdata()
        # Compute distances between the new data and the "training" data
        Aarr = self._dist_matrix(rdd, rdd2, sc)
        Arm = RowMatrix(sc.parallelize(Aarr))
        # Project the new data
        result = Arm.multiply(P)
        df = result.rows.map(lambda x: Row(x.toArray().tolist())).toDF()
        return df.withColumnRenamed("_1", "projection")
Example #21
    def test_row_matrix_invalid_type(self):
        rows = self.sc.parallelize([[1, 2, 3], [4, 5, 6]])
        invalid_type = ""
        matrix = RowMatrix(rows)
        self.assertRaises(TypeError, matrix.multiply, invalid_type)

        irows = self.sc.parallelize([IndexedRow(0, [1, 2, 3]), IndexedRow(1, [4, 5, 6])])
        imatrix = IndexedRowMatrix(irows)
        self.assertRaises(TypeError, imatrix.multiply, invalid_type)
Example #22
def set_elem(rowMatrix, i, j, value):
    # Materialize the matrix locally, update one entry, then rebuild the RowMatrix.
    n = rowMatrix.numRows()
    rows = [row.toArray() for row in rowMatrix.rows.collect()]
    a = np.array(rows)
    np.put(a, i * n + j, value)

    return RowMatrix(sc.parallelize(a), n, n)
Example #23
def fourier(X: RowMatrix, n_features, seed=23, gamma=1):
    p = X.numCols()
    random_state = numpy.random.RandomState(seed)

    w = numpy.sqrt(2 * gamma) * random_state.normal(size=(p, n_features))
    w = DenseMatrix(p, n_features, w.flatten(), isTransposed=True)
    b = random_state.uniform(0, 2 * numpy.pi, size=n_features)

    Y = fourier_transform(X, w, b)
    return Y, w, b
Example #24
def svd(data: RowMatrix, n_components=None):
    """
    Computes a singular value decomposition on a data matrix and the variance
    that is explained by the first n_components.

    :param data: a data frame
    :param n_components: number of components to be returned
    :return: returns the estimated components of a SVD.
    :rtype: a triple of (s, V, var)
    """

    logger.info("Computing SVD")
    svd = data.computeSVD(data.numCols(), computeU=False)
    s = svd.s.toArray()
    V = svd.V.toArray().T
    var = scipy.dot(s, s)
    if n_components is not None:
        var = scipy.dot(s[n_components:], s[n_components:])
        s, V = s[:n_components], V[:n_components]
    return s, V, var
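A minimal usage sketch for the svd helper above (illustrative; it assumes an existing SparkContext named sc and random toy data):

import numpy as np
from pyspark.mllib.linalg.distributed import RowMatrix

mat = RowMatrix(sc.parallelize(np.random.rand(10, 4)))
s, V, var = svd(mat, n_components=2)
# s has length 2, V has shape (2, 4); var is the scalar variance term computed above.
print(s, V.shape, var)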
Example #25
File: lsa.py Project: ochik100/Hops
    def singular_value_decomposition(self, n_components):
        rdd = self.tfidf.select(
            'id', 'features').rdd.map(lambda row: row[1].toArray())
        rdd.persist(ps.StorageLevel.MEMORY_AND_DISK)
        # Transpose so that documents become columns before computing the SVD.
        mat = RowMatrix(self.rdd_transpose(rdd))

        svd = mat.computeSVD(n_components, computeU=True)
        print(svd.U.numCols(), svd.U.numRows())
        print(type(svd.V))
        self.vt = svd.V
        self.similarity_matrix = cosine_similarity(svd.V.toArray())

        # For each item, keep the indices of the six most similar items (itself first).
        self.five_most_similar_beers = self.sql_context.createDataFrame(
            [np.argsort(x)[::-1][:6].tolist() for x in self.similarity_matrix],
            ['id', 'first', 'second', 'third', 'fourth', 'fifth'])

        self.tfidf = self.tfidf.join(self.five_most_similar_beers, ['id'],
                                     'inner')

        self.token, self.db = database.connect_to_database()

        # use default arguments to avoid closure of the environment of the token and db variables
        def save_to_firebase(x, token=self.token, db=self.db):
            data = {
                'brewery_name': x.brewery_name,
                'beer_name': x.beer_name,
                'state': x.state,
                'beer_style': x.beer_style,
                'first': x.first,
                'second': x.second,
                'third': x.third,
                'fourth': x.fourth,
                'fifth': x.fifth,
                'top1': x.top1,
                'top2': x.top2,
                'top3': x.top3,
                'top4': x.top4,
                'top5': x.top5,
                'top6': x.top6,
                'top7': x.top7
            }
            # name = {'brewery_name': x.brewery_name, 'beer_name': x.beer_name}
            db.child('beers').child(x.id).set(data, token)
            # db.child('beer_names').child(x.id).set(name, token)
            # sleep.(0.1)

        self.tfidf.rdd.foreach(lambda x: save_to_firebase(x))
Example #26
    def add(self, vector):
        if count_nonzero(vector) == 0:
            return

        # If the sketch is full, shrink it (via SVD + reduceRank) to free rows for new data.
        if self.emptyRows <= 0:
            self.svd = self.distributedSketchMatrix.computeSVD(self.rows, computeU=False)
            self.S = self.svd.s.array      # The singular values are stored in a local dense vector.
            self.S.flags.writeable = True
            self.V = self.svd.V            # The V factor is a local dense matrix.
            self.reduceRank()

        # Push the new vector into the next zero row and advance the index.
        self.localSketchMatrix[self.nextZeroRow, :] = vector
        del self.distributedSketchMatrix
        self.distributedSketchMatrix = RowMatrix(self.sc.parallelize(self.localSketchMatrix))
        self.nextZeroRow += 1
        self.emptyRows -= 1
Example #27
def similarity(feature_vecs, columnSimilarities_threshold):
    # transpose the feature matrix so that columnSimilarities compares the original rows
    def transpose(rm):
        cm = CoordinateMatrix(rm.rows.zipWithIndex().flatMap(
            lambda x: [MatrixEntry(x[1], j, v) for j, v in enumerate(x[0])]))
        return cm.transpose().toRowMatrix()

    rowmat = RowMatrix(feature_vecs)
    colmat = transpose(rowmat)
    sims = colmat.columnSimilarities(columnSimilarities_threshold)
    return sims
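A usage sketch for the similarity helper above (illustrative; it assumes a SparkContext named sc):

from pyspark.mllib.linalg import Vectors

feature_vecs = sc.parallelize([Vectors.dense(1.0, 0.0, 2.0),
                               Vectors.dense(0.0, 3.0, 1.0),
                               Vectors.dense(4.0, 0.0, 0.0)])
# Entries (i, j, value) give cosine similarities between the original rows.
sims = similarity(feature_vecs, 0.0)
print(sims.entries.collect())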
Example #28
def lu_factorization(A):
    # Doolittle-style LU factorization, updated one element at a time
    # through the get_elem/set_elem helpers (illustrative, not efficient).
    n = A.numRows()

    L = RowMatrix(sc.parallelize(np.eye(n)), n, n)
    U = RowMatrix(sc.parallelize(np.zeros((n, n))), n, n)

    for k in range(0, n):
        for i in range(k + 1, n):
            L = set_elem(L, i, k, get_elem(A, i, k) / get_elem(A, k, k))

        for j in range(k, n):
            U = set_elem(U, k, j, get_elem(A, k, j))

        for i in range(k + 1, n):
            for j in range(k + 1, n):
                A = set_elem(
                    A, i, j,
                    get_elem(A, i, j) - get_elem(L, i, k) * get_elem(U, k, j))

    return L, U
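A quick local check of the factorization (illustrative; it assumes a SparkContext sc, set_elem from Example #22, and a get_elem helper that is not shown here):

import numpy as np

A_local = np.array([[4.0, 3.0], [6.0, 3.0]])
A = RowMatrix(sc.parallelize(A_local), 2, 2)
L, U = lu_factorization(A)
L_local = np.array([r.toArray() for r in L.rows.collect()])
U_local = np.array([r.toArray() for r in U.rows.collect()])
print(np.allclose(L_local.dot(U_local), A_local))  # expect True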
Example #29
def main():
    if len(sys.argv) < 2:
        print('USAGE: lu_factorization.py <dim of matrix>')
        return

    n = int(sys.argv[1])
    rows = sc.parallelize(np.random.randint(n * n, size=(n, n)))
    mat = RowMatrix(rows, n, n)

    L, U = lu_factorization(mat)
    print('**************finish LU Factorization!')
Example #30
    def __init__(self, sc, rows, columns, op='fd'):
        """
        Matrix Sketching using Frequent Direction.
        Choose 'fd' for plain Frequent Direction, 'ssd' for Space Saving Direction,
        'cfd' for Compensative Frequent Direction, 'isvd' for iterative SVD, and a
        number between 0 and 1 for Parameterized Frequent Direction.
        """
        self.class_name = 'MatrixSketching'
        self.sc = sc
        self.op = op
        self.columns = columns
        self.rows = rows
        self.localSketchMatrix = zeros((self.rows, self.columns)) 
        self.distributedSketchMatrix = RowMatrix(self.sc.parallelize(self.localSketchMatrix))
        self.S = zeros(self.rows)
        self.U = []
        self.V = []
        self.step = 1
        self.nextZeroRow = 0
        self.emptyRows = self.rows
        

        # Parsing the operation parameter
        if self.op == 'fd':
            print("Matrix Sketching Using Frequent Direction")
            self.reduceRank = self.__FDOperate__
        elif self.op == 'ssd':
            print("Matrix Sketching Using Space Saving Direction")
            self.op = 2
            self.reduceRank = self.__SSDOperate__
        elif self.op == 'cfd':
            print("Matrix Sketching Using Compensative Frequent Direction")
            self.reduceRank = self.__CFDperate__
        elif self.op == 'isvd':
            print("Matrix Sketching Using iSVD")
            self.reduceRank = self.__iSVDOperate__
        elif not isinstance(self.op, str) and 0 < self.op < 1:
            print("Matrix Sketching Using Parameterized Frequent Direction")
            self.reduceRank = self.__PFDOperate__
            self.DELTA = 0
        else:
            raise ValueError("Type of Reduce Rank algorithm is not correct")
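A usage sketch for this sketching class (illustrative; it assumes the enclosing class is named MatrixSketching, as the class_name attribute suggests, and that a SparkContext sc exists):

import numpy as np

sketcher = MatrixSketching(sc, rows=8, columns=20, op='fd')
for _ in range(100):
    sketcher.add(np.random.rand(20))   # stream rows into the sketch
print(sketcher.distributedSketchMatrix.numRows(),
      sketcher.distributedSketchMatrix.numCols())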
Example #31
    def test_svd(self):
        denseMat = RowMatrix(self.sc.parallelize(self.denseData))
        sparseMat = RowMatrix(self.sc.parallelize(self.sparseData))
        m = 4
        n = 3
        for mat in [denseMat, sparseMat]:
            for k in range(1, 4):
                rm = mat.computeSVD(k, computeU=True)
                self.assertEqual(rm.s.size, k)
                self.assertEqual(rm.U.numRows(), m)
                self.assertEqual(rm.U.numCols(), k)
                self.assertEqual(rm.V.numRows, n)
                self.assertEqual(rm.V.numCols, k)

        # Test that U returned is None if computeU is set to False.
        self.assertEqual(mat.computeSVD(1).U, None)

        # Test that low rank matrices cannot have number of singular values
        # greater than a limit.
        rm = RowMatrix(self.sc.parallelize(tile([1, 2, 3], (3, 1))))
        self.assertEqual(rm.computeSVD(3, False, 1e-6).s.size, 1)
Example #32
def svd(mat, k=1000):
    matRow = RowMatrix(mat)
    matSVD = matRow.computeSVD(k=k, computeU=True)
    return matSVD
Example #33
# In[66]:

createVector(sampleItem)


# In[67]:

tfidfVector.persist(StorageLevel.MEMORY_AND_DISK)
tfidfVector.count()
docVect.unpersist()


# In[68]:

# Construct a row matrix from the terms and metadata of each video
mat = RowMatrix(tfidfVector.values())
m = mat.numRows()  # number of rows in the matrix
n = mat.numCols()  # number of columns in the matrix
# Compute the SVD of `mat` to obtain the factor matrices
# svd = mat.computeSVD(30, computeU=True)


# In[69]:

type(mat)


# In[70]:

# http://stackoverflow.com/questions/33428589/pyspark-and-pca-how-can-i-extract-the-eigenvectors-of-this-pca-how-can-i-calcu/33500704#33500704
from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper
Example #34
from pyspark import SparkContext
# $example on$
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg.distributed import RowMatrix
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="PythonSVDExample")

    # $example on$
    rows = sc.parallelize([
        Vectors.sparse(5, {1: 1.0, 3: 7.0}),
        Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
        Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)
    ])

    mat = RowMatrix(rows)

    # Compute the top 5 singular values and corresponding singular vectors.
    svd = mat.computeSVD(5, computeU=True)
    U = svd.U       # The U factor is a RowMatrix.
    s = svd.s       # The singular values are stored in a local dense vector.
    V = svd.V       # The V factor is a local dense matrix.
    # $example off$
    collected = U.rows.collect()
    print("U factor is:")
    for vector in collected:
        print(vector)
    print("Singular values are: %s" % s)
    print("V factor is:\n%s" % V)
    sc.stop()
Example #35
from pyspark import SparkContext
# $example on$
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg.distributed import RowMatrix
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="PythonPCAOnRowMatrixExample")

    # $example on$
    rows = sc.parallelize([
        Vectors.sparse(5, {1: 1.0, 3: 7.0}),
        Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
        Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)
    ])

    mat = RowMatrix(rows)
    # Compute the top 4 principal components.
    # Principal components are stored in a local dense matrix.
    pc = mat.computePrincipalComponents(4)

    # Project the rows to the linear space spanned by the top 4 principal components.
    projected = mat.multiply(pc)
    # $example off$
    collected = projected.rows.collect()
    print("Projected Row Matrix of principal component:")
    for vector in collected:
        print(vector)
    sc.stop()
Example #36
# -*- coding:utf-8 -*-
# author [email protected]
import os
import sys

from pyspark import SparkContext

local_path = os.path.dirname(__file__)
sys.path.append(local_path + "/../lib")
sys.path.append(local_path + "/../")

from pyspark.mllib.linalg import Vector
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg.distributed import RowMatrix

def main(sc, sqlContext, isHive = True):
    pass

if __name__ == "__main__":
    os.environ["SPARK_HOME"] = "C:\spark-1.6.1-bin-hadoop2.6"
    sc = SparkContext('local[1]')
    rddRows = sc.parallelize(["1 0 2 0 0 1", "0 0 4 2 0 0"])

    rddRows.map(lambda x: Vectors.dense([float(each) for each in str(x).split(" ")]))
    mat = RowMatrix(rddRows)

    simsPerfect = mat.columnSimilarities()