Example #1
from pyspark.mllib.linalg.distributed import CoordinateMatrix


def process(sparkContext, sqlContext):
    
    # Define database connection parameters
    MYSQL_USERNAME = '******'
    MYSQL_PASSWORD = '******'
    MYSQL_CONNECTION_URL = "jdbc:mysql://qcis4:3306/dblp?user="******"&password="******

    df = sqlContext.read.format("jdbc").options(url=MYSQL_CONNECTION_URL, driver="com.mysql.jdbc.Driver", dbtable="dblp.author_sample").load()
 
    rows = df.select("name_hash").distinct().rdd.map(lambda r: r.name_hash).collect()
    columns = df.select("paper_hash").distinct().rdd.map(lambda r: r.paper_hash).collect()

    rawData = df.rdd.map(lambda p: (int(rows.index(p.name_hash)), int(columns.index(p.paper_hash)), 1.0)).cache()

    # Create a CoordinateMatrix from an RDD of (row, column, value) entries.
    mat = CoordinateMatrix(rawData)
    
    rowMat = mat.toRowMatrix()
    
    print(mat.numRows())  # 3
    print(rowMat.numCols())
    
#     transpose = rowMat.rows().zipWithIndex().map(lambda rvect, i : rvect.zipWithIndex().map( lambda ax, j : (j,(i,ax))))
    for r in rowMat.rows().collect():
        print(r)
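The commented-out manual transpose above is not needed in newer PySpark releases, which expose CoordinateMatrix.transpose() directly, so rows and columns can be swapped before converting to a RowMatrix. A minimal sketch, reusing the mat built above:

# Swap rows and columns on the coordinate representation, then
# convert to a RowMatrix for row-oriented operations.
transposedRowMat = mat.transpose().toRowMatrix()
print(transposedRowMat.numRows(), transposedRowMat.numCols())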
Example #2
def calculate_similarity(self):
    train = self.train_data
    train_user_mean = train.groupBy("userId").agg(F.mean('rating'))
    train_user_mean = train_user_mean.withColumnRenamed("avg(rating)", "user_mean")
    train_rating_avg = train.join(train_user_mean, 'userId', how='left_outer')
    train_rating_avg = train_rating_avg.select(
        '*',
        (train_rating_avg.rating - train_rating_avg.user_mean).alias('rating_norm'))
    rdd = (train_rating_avg.select('movieId', 'userId', 'rating_norm')
           .rdd.map(tuple))
    coord = CoordinateMatrix(rdd)
    mat = coord.toRowMatrix()
    similarities = mat.columnSimilarities()
    similarities_df = similarities.entries.toDF()
    window = (Window.partitionBy(similarities_df['i'])
              .orderBy(similarities_df['value'].desc()))
    similarities_df_ranked = (
        similarities_df
        .select('*', F.row_number().over(window).alias('row_number'))
        .filter(F.col('row_number') <= 100))
    similarities_df_ranked.write.parquet(SIMILARITY_FILE_SORTED, mode='overwrite')
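One possible way to consume the ranked output written above; a minimal sketch that assumes an active SparkSession named spark, the same SIMILARITY_FILE_SORTED constant, and the pyspark.sql.functions alias F used in the method (all from the surrounding class, not shown here), with column id 42 used purely as an illustration:

# Load the ranked similarities and keep the ten nearest neighbours of one column.
# 'i', 'j' and 'value' are the MatrixEntry fields produced by toDF() above.
sims = spark.read.parquet(SIMILARITY_FILE_SORTED)
top10 = sims.filter((F.col('i') == 42) & (F.col('row_number') <= 10))
top10.orderBy('row_number').show()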
Example #4
global train_rdd
train_rdd = train_lines.map(lambda line: line.split(',')).map(
    lambda tokens: (int(tokens[0]), int(tokens[1]), float(tokens[2])))

# Build a train data dict with format [(user, item)] = rating, to check later whether a similar movie was rated
global train_dict
train_dict = {}
for x, y, z in train_rdd.collect():
    train_dict[(x, y)] = z

# -----------------------------------------   Build simPdsDF   -----------------------------------------------
# Form utilityMatrix to get simMat later
sqlCon = SQLContext(sc)
utilityMatrix = CoordinateMatrix(train_rdd)
# Similarity Btw. Items
simMat = utilityMatrix.toRowMatrix().columnSimilarities()
# Convert simMat to Pandas format
global simPdsDF
sparkDF = (simMat.entries
           .map(lambda x: str(x.i) + "," + str(x.j) + "," + str(x.value))
           .map(lambda w: w.split(','))
           .toDF())
simPdsDF = sparkDF.toPandas()
# rename the columns
simPdsDF.columns = ['ItemID_1', 'ItemID_2', 'Similarity']
# change data type
simPdsDF['ItemID_1'] = simPdsDF['ItemID_1'].astype(int)
simPdsDF['ItemID_2'] = simPdsDF['ItemID_2'].astype(int)
simPdsDF['Similarity'] = simPdsDF['Similarity'].astype(float)

# --------------------------------------- Used for RDD to calculate bias ---------------------------------------------
global train_pdsDF
train_pdsDF = pd.read_csv('train.dat', sep=",")
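With simPdsDF built, a per-item lookup is a plain pandas filter; a minimal sketch, with item id 1 used only as an illustration:

# columnSimilarities() emits only the upper triangle (ItemID_1 < ItemID_2),
# so an item can appear in either column.
item = 1
neighbours = simPdsDF[(simPdsDF['ItemID_1'] == item) | (simPdsDF['ItemID_2'] == item)]
print(neighbours.sort_values('Similarity', ascending=False).head(5))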
Example #5
import pyspark
from pyspark import SparkContext
from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry
from operator import add
from pyspark.sql import SparkSession

sc = SparkContext()
r = sc.textFile("part-00000")
m = (r.flatMap(lambda x: x.split('\n'))
      .filter(lambda x: "A" in x)
      .map(lambda x: x.strip("A, ").split(' '))
      .map(lambda x: tuple(map(int, x))))
#n=m.map(lambda x : MatrixEntry(tuple(x)))

spark = SparkSession(sc)
#m.toDF().show()
print(hasattr(m,"toDF"))

cmat=CoordinateMatrix(m)
#mat = CoordinateMatrix(n)
#o=mat.take(5)
print(cmat.numRows()) # 3
print(cmat.numCols())

rowmat = cmat.toRowMatrix()

print(rowmat.numRows()) # 3
print(rowmat.numCols())
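Because a SparkSession is already attached to sc, the entries of cmat can also be inspected as a DataFrame, in the same way Example #2 converts its similarity entries; a minimal sketch:

# Each MatrixEntry carries (i, j, value); toDF() turns the entries RDD
# into a DataFrame for ad-hoc inspection.
cmat.entries.toDF().show(5)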

Example #6
#conf = SparkConf().setAppName('linalgtest')
#sc = SparkContext(conf=conf).getOrCreate()

#use local spark on computer
# findspark.init()
from pyspark.sql import SparkSession
from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry
import time

# Reuse an existing local Spark session or create one.
spark = SparkSession.builder.appName('linalgtest').getOrCreate()

local_file_location = 'file:///wasp/pdb1HYS.mtx.mtx'

rdd = spark.sparkContext.textFile(local_file_location)
rdd = rdd.map(lambda line: line.split(" "))
rdd = rdd.map(
    lambda line: MatrixEntry(int(line[0]), int(line[1]), float(line[2])))

mat = CoordinateMatrix(rdd)
M = mat.toRowMatrix()
A = mat.toBlockMatrix()
At = mat.transpose().toBlockMatrix()

print("SVD")
print(M.numRows(), M.numCols())
start_svd = time.time()

NUM_TIMES = 10
#do it 10 times to get mean
for i in range(NUM_TIMES):
    svd = M.computeSVD(5, computeU=True)

end_svd = time.time()
print("Time elapsed: ", (end_svd - start_svd) /
      NUM_TIMES)  # CPU seconds elapsed (floating point)
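The factors returned by computeSVD can be checked once the timing loop finishes; a minimal sketch using the last svd result:

# s is a local DenseVector of singular values, U a distributed RowMatrix,
# and V a local dense matrix whose columns are the right singular vectors.
print(svd.s)
print(svd.U.numRows(), svd.U.numCols())
print(svd.V.numRows, svd.V.numCols)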
Example #7
debug.TIMESTAMP(1)


def to_matrix_entry(s):
    # Row and column positions are indices, so parse them as integers.
    ss = s.split()
    entry = MatrixEntry(int(ss[0]), int(ss[1]), float(ss[2]))
    return entry


data = sc.textFile('hdfs://node1:9000/input/sqr.txt')
entries = data.map(to_matrix_entry)

# Build the distributed matrix directly from the entry RDD
# (no need to collect to the driver and re-parallelize).
coord_mat = CoordinateMatrix(entries)
coord_mat = coord_mat.transpose()
row_mat = coord_mat.toRowMatrix()
sim = row_mat.columnSimilarities()
print(sim.entries.take(10))

debug.TIMESTAMP(2)
'''
data = data.map(lambda _ : np.array(_.strip().split()).astype(float))
unitMatrix = data.map(lambda _ : _/np.linalg.norm(_))

#unitMatrix = sc.parallelize(np.array([[1,2,3,5,7], [6,2,1,-1,3], [7,0,1,2,-4]]).T)
mat = RowMatrix(unitMatrix)
S = mat.columnSimilarities()

sims = S.entries.collect()
print(len(sims))
print(sims)
'''
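For larger inputs, columnSimilarities() also accepts an optional threshold argument that switches to the approximate DIMSUM sampling scheme; a minimal sketch reusing row_mat from the live code above:

# With a positive threshold, pairs expected to fall below it may be dropped,
# trading exactness for less shuffling.
sim_approx = row_mat.columnSimilarities(threshold=0.1)
print(sim_approx.entries.take(10))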
Example #8
	diag_entries = diag_entries_1.union(diag_entries_2).union(diag_entries_3)

	coo_matrix_input_all = coo_matrix_input_all.union(diag_entries)
	coo_matrix_entries = coo_matrix_input_all.map(lambda e: MatrixEntry(e[0], e[1], e[2]))
	coo_matrix = CoordinateMatrix(coo_matrix_entries)


	#SAVE TO A FILE
	coo_matrix_input_all.repartition(1).saveAsTextFile("./Laplacian_4v7o_4cores_1")
	t2 = timeit.default_timer()
	print("Elapsed time for construction: {:} s".format(t2 - t0))


	#Singular value decomposition
	
	dataRows = coo_matrix.toRowMatrix().rows

	k = int(args.k)  # number of singular values to compute
	svd = RowMatrix(dataRows.persist()).computeSVD(k, computeU=True)
	U = svd.U # The U factor is a RowMatrix.
	s = svd.s # The singular values are stored in a local dense vector.
	V = svd.V  # The V factor is a local dense matrix.


	sc.parallelize(V.toArray()).repartition(1).saveAsTextFile("EigenVectors_4v7o_4cores")
	sc.parallelize(s.toArray()).repartition(1).saveAsTextFile("EigenValues_4v7o_4cores")

	t4 = timeit.default_timer()
	print("Elapsed time for SVD: {:} s".format(t4 - t2))
	print("Total memory = {:}, used memory = {:}, free memory = {:}".format(psutil.virtual_memory().total/mb, (psutil.virtual_memory().total - psutil.virtual_memory().free) / mb, psutil.virtual_memory().free/mb));
	print("System size = {:} atoms".format(Natoms.sum()))