# Imports needed by this snippet (not shown in the original):
import numpy as np
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix

# Read the input vectors, parse each line into a float array and L2-normalize it
data = sc.textFile('hdfs://node1:9000/input/vectors_3000x500.txt')
data = data.map(lambda line: np.array(line.strip().split()).astype(float))
data = data.map(lambda row: row / np.linalg.norm(row))

# Attach row indices and build an IndexedRowMatrix U
U = data.zipWithIndex().map(lambda pair: IndexedRow(pair[1], pair[0]))
U = IndexedRowMatrix(U)



# Build U^T by going through a CoordinateMatrix, which supports transpose()
UT = U.toCoordinateMatrix()
UT = UT.transpose()



# Convert both matrices to BlockMatrix so they can be multiplied
U = U.toBlockMatrix()
UT = UT.toBlockMatrix()

# S = U * U^T; since the rows were L2-normalized, S holds the pairwise
# cosine similarities of the input vectors
S = U.multiply(UT)

# Pull the result back as coordinate entries and inspect the first 100
S_coord = S.toCoordinateMatrix()
sim = S_coord.entries
print(sim.take(100))
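
# Optional local sanity check (a sketch, not in the original snippet): for a small
# sample, dot products of the normalized rows should match the corresponding
# entries of S, since both are cosine similarities.
sample = np.array(data.take(5))   # collect 5 normalized rows to the driver
print(sample @ sample.T)          # 5x5 block of cosine similarities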





debug.TIMESTAMP(2)
Example #2
from operator import itemgetter
from pyspark import sql
from pyspark.mllib.linalg.distributed import IndexedRowMatrix

# Create an SQLContext so that DataFrame conversion (toDF) is available
sql.SQLContext(sc)
data = sc.textFile(dataset)

#data = (data.map(lambda s: (list(map(lambda x: float(x), s.split()))))).zipWithIndex().map(lambda x: ((x[1], 0), DenseMatrix(1, 1000, x[0])))

# Read the matrix into (row_number, vector) pairs
data = data.map(lambda s: [float(x) for x in s.split()]) \
           .zipWithIndex() \
           .map(lambda x: (x[1], x[0]))

# Create the transpose of the matrix: tag every value with its column index,
# emit (column, (row, value)) pairs and group them by column
tdata = sc.textFile(dataset).map(
    lambda s: [(j, float(v)) for j, v in enumerate(s.split())]
).zipWithIndex().flatMap(
    lambda x: [(j, (x[1], v)) for j, v in x[0]]
).groupByKey()

# Bring the transposed data to the same (row_number, vector) format as the
# normal matrix: sort each group by the original row index and keep the values
# (as a list, rather than a lazy map object, so IndexedRowMatrix can consume it)
tdata = tdata.map(lambda x: (
    x[0], [s[1] for s in sorted(x[1], key=itemgetter(0))]))
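
# Illustrative trace of the transpose construction (a sketch, not part of the
# original code): for the 2x3 input
#   1 2 3
#   4 5 6
# the enumerate/flatMap step emits (0, (0, 1.0)), (1, (0, 2.0)), (2, (0, 3.0)),
# (0, (1, 4.0)), (1, (1, 5.0)), (2, (1, 6.0)); grouping by the column index and
# sorting each group by row index yields the transpose rows
# (0, [1.0, 4.0]), (1, [2.0, 5.0]), (2, [3.0, 6.0]).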

# Create BlockMatrix for the normal matrix and its transpose
mat = IndexedRowMatrix(data)
mat = mat.toBlockMatrix()
matTranspose = IndexedRowMatrix(tdata).toBlockMatrix()

# Get the final result by multiplying mat * mat^T * mat
matAAT = mat.multiply(matTranspose)  # mat * mat^T
matRes = matAAT.multiply(mat)        # (mat * mat^T) * mat

print('Done')
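
# Optional (a sketch, not in the original example): materialize a few entries of
# the result by converting the BlockMatrix back to a CoordinateMatrix.
print(matRes.toCoordinateMatrix().entries.take(10))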
Example #3
    def get_Total_Related_Downloads(self, dfmain):
        #total downloads
        download_count = dfmain.groupby(['_id'])['_id'].agg(['count'])

        #build datasets vs ip similarity matrix
        group = pd.DataFrame({
            'download_count':
            dfmain.groupby(['_id', 'ip']).size()
        }).reset_index()
        person_u = list(group.ip.unique())
        dataset_u = list(group._id.unique())

        # Persist the list of dataset ids, one per line
        with open(self.DATA_LIST_FILE, "w") as outF:
            for line in dataset_u:
                outF.write(str(line))
                outF.write("\n")

        data = group['download_count'].tolist()
        # Map dataset ids and ips to integer category codes for the sparse matrix
        row = pd.Categorical(group._id, categories=dataset_u).codes
        cols = pd.Categorical(group.ip, categories=person_u).codes
        len_dataset = len(dataset_u)
        len_person = len(person_u)
        print("Datasets vs Ips :", str(len_dataset),
              str(len_person))  #(309235, 81566)
        sparsemat = sparse.csr_matrix((data, (row, cols)),
                                      dtype=np.int8,
                                      shape=(len_dataset, len_person))
        m, n = sparsemat.shape

        def f(x):
            # Convert a dense row (array) into a {column_index: value} dict so
            # it can be expanded into a Row via Row(**f(x))
            return {str(i): float(v) for i, v in enumerate(x)}

        # load PySpark using findSpark package

        #SparkContext.setSystemProperty('spark.executor.memory', '5g')
        #SparkContext.setSystemProperty('spark.driver.memory', '5g')
        #SparkContext.setSystemProperty('spark.executor.heartbeatInterval', '1000000000s')

        #conf = SparkConf().setAppName("simdownload")
        #conf = (conf.setMaster('local[*]').set('spark.executor.memory', '4G'))#.set('spark.executor.heartbeatInterval','1000000s')
        #sc = SparkContext(conf=conf)
        #sc = SparkContext("local", "simdownload")
        sc = SparkContext(appName="simdownload")
        sqlContext = SQLContext(sc)
        #print(sc._conf.getAll())
        sv_rdd = sc.parallelize(sparsemat.toarray())
        #populate the values from rdd to dataframe
        dfspark = sv_rdd.map(lambda x: Row(**f(x))).toDF()

        row_with_index = Row(*["id"] + dfspark.columns)

        def make_row(columns):
            def _make_row(row, uid):
                row_dict = row.asDict()
                return row_with_index(*[uid] +
                                      [row_dict.get(c) for c in columns])

            return _make_row

        print('parallelize-ok')

        to_row = make_row(dfspark.columns)
        # create a new dataframe with an explicit id column (row indexes)
        dfidx = (dfspark.rdd.zipWithIndex().map(lambda x: to_row(*x)).toDF(
            StructType([StructField("id", LongType(), False)] +
                       dfspark.schema.fields)))
        #compute cosine sim by rows
        pred = IndexedRowMatrix(
            dfidx.rdd.map(lambda row: IndexedRow(row.id, row[1:])))
        pred1 = pred.toBlockMatrix().transpose().toIndexedRowMatrix()
        pred_sims = pred1.columnSimilarities()
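        # Note: columnSimilarities() returns only the strictly upper-triangular
        # entries (i < j) of the similarity matrix, which is why the filter
        # further below checks both the "from" and the "to" column.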
        #convert coordinatematrix (pred_sims) into a dataframe
        columns = ['from', 'to', 'sim']
        vals = pred_sims.entries.map(lambda e: (e.i, e.j, e.value))
        dfsim = sqlContext.createDataFrame(vals, columns)

        print('Sim Done!')
        print('Time Sim Done: ' + time.strftime("%H:%M:%S"))

        json_data = {}
        for i in range(m):
            target_id = int(dataset_u[i])
            dftemp = dfsim.where((psf.col("from") == i)
                                 | (psf.col("to") == i)).sort(
                                     psf.desc("sim")).limit(
                                         self.num_top_dataset)
            df = dftemp.toPandas()
            # v = df.iloc[:, :-1].values
            # ii = np.arange(len(df))[:, None]
            # ji = np.argsort(v == i, axis=1)  # replace `1` with your ID
            # related_ids = (v[ii, ji][:, 0]).tolist()
            # related_datasets = [dataset_u[i] for i in related_ids]
            myarr = []
            # Iterating the small result frame is a bit faster here than the
            # numpy approach commented out above
            for index, rw in df.iterrows():
                from_id = rw['from']
                to_id = rw['to']
                if (from_id != i):
                    myarr.append(int(from_id))
                if (to_id != i):
                    myarr.append(int(to_id))
            related_datasets = [int(dataset_u[idx]) for idx in myarr]

            downloads = download_count.loc[target_id]['count']
            data = {}
            data['related_datasets'] = related_datasets
            data['total_downloads'] = int(downloads)
            json_data[target_id] = data

        print('Time JSONUSAGE_FILE 1: ' + time.strftime("%H:%M:%S"))
        with open(self.JSONUSAGE_FILE, 'w') as fp:
            json.dump(json_data, fp)

        print('Time JSONUSAGE_FILE 2: ' + time.strftime("%H:%M:%S"))
        sc.stop()
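
# Usage sketch (hypothetical, not part of the original class): the method expects
# a pandas DataFrame with '_id' and 'ip' columns and an object providing
# DATA_LIST_FILE, JSONUSAGE_FILE and num_top_dataset, e.g.
#   stats = DownloadStats()                      # hypothetical config-holding class
#   stats.get_Total_Related_Downloads(dfmain)    # dfmain: download log as a pandas DataFrame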
# Parse each line of A into a list of floats
A = A.map(lambda s: [float(x) for x in s.split()])

# Zip index values with cell values
A = A.zipWithIndex().map(lambda x: (x[1], x[0]))

# Print step 1 ready. With full set 1min.
print(" ")
print("Step 1 ready")
print(" ")

# Convert A to an IndexedRowMatrix
A = IndexedRowMatrix(A)

# Convert A to blockmatrices and set block size.
#A = A.toBlockMatrix(1000, 1000) # Works with sample set. Data to 1 block.
A = A.toBlockMatrix(100, 1000) # Testing with full dataset

# Cache A, because it is used multiple times
A.cache()

# Print step 2 ready. With full set 3mins.
print(" ")
print("Step 2 ready")
print(" ")

# Next the multiplications: we need to calculate A*AT*A.
# A is 1000000 x 1000, so A*AT would be 1000000 x 1000000,
# while AT*A is only 1000 x 1000. For better performance use the
# associativity rule (A*AT)*A = A*(AT*A).
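
# A minimal sketch of the cheaper ordering (assumes A is the cached BlockMatrix
# from above):
# AT = A.transpose()
# res = A.multiply(AT.multiply(A))   # A * (AT * A): the inner product is only 1000 x 1000
# print(res.toCoordinateMatrix().entries.take(10))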

# Calculate A transpose