users.distinct().count()
#We don't have to extract data to its own RDD
#This command counts the distinct movies
#There are 1,682 movies
clean_data.map(lambda y: int(y[1])).distinct().count()

#Need to import functions / objects from the MLlib
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

#We'll need to map the MovieLens data to a Rating object
#A Rating object is made up of (user, item, rating)
mls = movielens.map(lambda l: l.split('\t'))
ratings = mls.map(lambda x: Rating(int(x[0]), int(x[1]), float(x[2])))

#Need a training and test set
train, test = ratings.randomSplit([0.7, 0.3], 7856)
train.count()  #70,005
test.count()

rank = 5             # Latent factors to be made
numIterations = 100  # Times to repeat process

#Create the model on the training data with ALS (matrix factorization),
#using `rank` latent factors
model = ALS.train(train, rank, numIterations)

# Evaluate the model on testdata
# dropping the ratings on the tests data
testdata = test.map(lambda p: (p[0], p[1]))
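# The snippet above stops after stripping the ratings off the test pairs; a minimal
# evaluation sketch in the same style (model, test and testdata are the names defined above):
predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
ratesAndPreds = test.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1]) ** 2).mean()
print("Mean Squared Error = " + str(MSE))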
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

sc.stop()
conf = SparkConf().setMaster("local").setAppName("Question2")
sc = SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()

# Load and parse the data
data = sc.textFile("dbfs:/FileStore/tables/ratings.dat") \
    .map(lambda line: line.split("::")) \
    .map(lambda x: Rating(int(x[0]), int(x[1]), int(x[2])))

# 60/40 train/test split with a fixed seed
splits = data.randomSplit([6, 4], 24)
trainData = splits[0]
testData = splits[1]

rank = 10
iterations = 20
model = ALS.train(trainData, rank, iterations)

# Keep the true ratings keyed by (user, item), then drop them for prediction
testLabel = testData.map(lambda p: ((p[0], p[1]), p[2]))
testData = testData.map(lambda p: (p[0], p[1]))
predictions = model.predictAll(testData).map(lambda r: ((r[0], r[1]), r[2]))
combined_result = predictions.join(testLabel)
MSE = combined_result.map(lambda r: (r[1][0] - r[1][1]) ** 2).mean()
print("Mean Squared Error = " + str(MSE))
def Rating_info(sc, file_path):
    # Cast the tab-separated fields so ALS receives (int user, int item, float rating)
    rating = sc.textFile(file_path).map(lambda x: x.split('\t')).map(
        lambda x: Rating(int(x[0]), int(x[1]), float(x[2])))
    return rating
d = 'NMF_tests_10iter/'
n_iter = 10
k_range = np.arange(200, 201, 10)
split = False
#alpha_range = [0.0001,0.001,0.01,0.05,1.0,5,10]
#reg_range = [1e-8,1e-04,0.01,0.1,0.5,1.0]

raw_data = sc.textFile("mf_format.txt").map(
    lambda row: [int(val) for val in row.strip().split(',')])

if split:
    # randomSplit returns a plain list of RDDs, so persist each half rather than the list
    rand_a, rand_b = raw_data.randomSplit(weights=[0.5, 0.5], seed=99)
    ratings_a = rand_a.map(lambda row: Rating(row[0], row[1], row[2])).persist()
    ratings_b = rand_b.map(lambda row: Rating(row[0], row[1], row[2])).persist()
else:
    ratings = raw_data.map(lambda row: Rating(row[0], row[1], row[2])).persist()

base_model_name = d + 'model_'
#with open(d+'log_rmse','a') as fout:
for k in k_range:
    start = time.time()
    if split:
        model_a = ALS.trainImplicit(ratings_a, rank=k, iterations=n_iter, alpha=0.01, nonnegative=True)
        # train the second model on the second split (ratings_b, not ratings_a)
        model_b = ALS.trainImplicit(ratings_b, rank=k, iterations=n_iter, alpha=0.01, nonnegative=True)
        model_a.save(sc, 'model_rand_a_' + str(k))
        model_b.save(sc, 'model_rand_b_' + str(k))
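    else:
        # The non-split branch of the loop is cut off above; a minimal sketch mirroring
        # the split branch (saving under base_model_name is an assumption):
        model = ALS.trainImplicit(ratings, rank=k, iterations=n_iter, alpha=0.01, nonnegative=True)
        model.save(sc, base_model_name + str(k))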
import re

import pyspark
sc = pyspark.SparkContext(master="local[3]", appName="ML project 2")
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
from helpers import row_col_spark, load_data, create_csv_submission

r = re.compile(r'r(\d+)_c(\d+)')

# Load and parse the data
data2 = sc.textFile("data_train.csv")
header = data2.first()  # extract header
data2 = data2.filter(lambda row: row != header)
ratings = data2.map(lambda l: l.split(','))
ratings = ratings.map(lambda l: Rating(*row_col_spark(l[0], r), float(l[1])))

train2, test2 = ratings.randomSplit([0.9, 0.1], seed=4242)
# Need to cache the data to speed up training
train2.cache()
test2.cache()
sc.setCheckpointDir('checkpoint/')

l_s = 0.09   # regularisation (lambda)
r_s = 100    # rank
print("""
============================
lambda = {}
rank = {}
============================
""".format(l_s, r_s))
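# Training itself is cut off above; a minimal sketch using the lambda/rank just printed
# (the iteration count of 10 is an assumption):
model = ALS.train(train2, rank=r_s, iterations=10, lambda_=l_s)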
def parseLine(line, sep=','):
    '''Parse a "user,item,rating" line into a Rating() object for the ALS model.'''
    fields = line.split(sep)
    return Rating(user=int(fields[0]),
                  product=int(fields[1]),
                  rating=float(fields[2]))
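# Typical usage of the parser above (a sketch; sc and the "ratings.csv" path are assumptions):
ratings = sc.textFile("ratings.csv").map(parseLine)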
import sys
import json
from crux.util import loadPickle
from operator import add
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
from pyspark import SQLContext, SparkContext, SparkConf

reload(sys)
sys.setdefaultencoding('utf-8')

conf = SparkConf().setAppName("chencheng's task").setMaster(
    "spark://anti-spam-spark-001.yz.momo.com:8081,anti-spam-spark-002.yz.momo.com:8081")
sc = SparkContext(conf=conf)

user_artist_data = sc.textFile(
    "hdfs://antispam/user/hadoop/output/chencheng/crux/data/bobby/book/")
#tfidf = loadPickle('/home/hadoop/chen.cheng/moa/book_tfidf.pkl')
#b = sc.broadcast(tfidf)

# Each JSON line is (user, [items]); flatten to (user, item) pairs and treat each
# interaction as an implicit rating of 1
ratings = user_artist_data.map(lambda x: json.loads(x)) \
    .flatMap(lambda x: [[x[0], item] for item in x[1]]) \
    .filter(lambda x: x[0] and x[1]) \
    .map(lambda x: Rating(int(x[0]), int(x[1]), 1))
ratings.cache()

rank = 10
numIterations = 20
model = ALS.trainImplicit(ratings, rank, numIterations, alpha=10.0)
model.save(
    sc, "hdfs://antispam/user/hadoop/output/chencheng/model/als_book_alpha=10")
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
import itertools

test_ratings = []
ratings = sc.textFile("/FileStore/tables/ratings.dat")
ratings = ratings.map(lambda x: x.split("::")).map(
    lambda row: [int(row[0]), int(row[1]), float(row[2])])

training, test = ratings.randomSplit([0.6, 0.4])
test_data = test.map(lambda r: (r[0], r[1]))
training = training.map(lambda row: Rating(row[0], row[1], row[2]))

rank = 50
numIterations = 20
model = ALS.train(training, rank, numIterations, 0.01)

predictions = model.predictAll(test_data).map(lambda r: ((r[0], r[1]), r[2]))
ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)

# Count a prediction as correct if it rounds to the true rating
pred = ratesAndPreds.map(lambda r: (r[1][0], round(r[1][1])))
accuracy = 100 * (pred.filter(lambda pl: pl[0] == pl[1]).count()) / test.count()
print('Model Accuracy: {}'.format(accuracy))
def parseLine(line):
    fields = line.split("|")
    # Shift the 0-5 rating scale so it is centred on zero
    return Rating(int(fields[0]), int(fields[1]), float(fields[2]) - 2.5)
        1]], [float(a[2])])).reduceByKey(lambda a, b: a + b).map(
            lambda a: (a[0], sum(a[1]) / len(a[1]))).collect()

for id in missing_businessid:
    business_average[id[0]] = id[1]
#print(business_average)

validation_RDD = sc.textFile(validation_file_path).map(lambda a: a.split(","))
# saving the header
# header_1 = validation_RDD.first()
# filtering based on header
# validation_data = validation_RDD.filter(lambda a: a != header_1)
#print(validation_data.count())

# Model Based Collaborative Filtering #
if case == 1:
    ratings = training_data.map(lambda a: Rating(
        int(user_dict[a[0]]), int(business_dict[a[1]]), float(a[2])))
    rank = 8
    numIterations = 10
    model = ALS.train(ratings, rank, numIterations, 0.2)

    # checking the existence of ids from validation set in training set #
    def check_id(id):
        if id[0] not in user_dict:
            user_id = -1
        else:
            user_id = user_dict[id[0]]
        if id[1] not in business_dict:
            business_id = -1
        else:
            business_id = business_dict[id[1]]
        return (user_id, business_id)
def create_rating(rating_record):
    tokens = rating_record.split(',')
    userID = int(tokens[0])
    productID = int(tokens[1])
    rating = float(tokens[2])
    return Rating(userID, productID, rating)
user_data = sys.argv[2]
output = sys.argv[3]
userId = 112132212

movie = Row("id", "movieName")
movie_table = sc.textFile(movie_data + str("/movies.dat"))
rating_table = sc.textFile(movie_data + str("/ratings.dat"))
user_data_table = sc.textFile(movie_data + str("/users.dat"))
new_user = sc.textFile(user_data)

movieRDD = movie_table.map(lambda movie: movie.split("::"))
ratingDF = (rating_table.map(lambda rating: rating.split("::"))
            .map(lambda rate: (int(rate[0]), int(rate[1]), float(rate[2])))
            .map(lambda (uid, mid, rate): Rating(uid, mid, rate))).toDF()

newUserRDD = new_user.map(lambda movie: movie.split(" ", 1))

# Match each of the new user's (rating, title) lines to the closest catalogue title
# by Levenshtein distance
joinRDD = movieRDD.cartesian(newUserRDD)
joinRDD = (joinRDD.map(lambda (movie, umovie): (movie[0], movie[1], umovie[0], umovie[1]))
           .map(lambda (id, movie, urate, umovie): (umovie, (id, urate, levenshtein(movie, umovie))))
           .reduceByKey(lambda x1, x2: min(x1, x2, key=lambda x: x[-1])))

userMovie = (joinRDD.map(lambda (key, value): (userId, value[0], value[1]))
             .map(lambda (uid, mid, rate): Rating(uid, int(mid), float(rate)))).toDF()
trainDF = ratingDF.cache()
import sys
from pyspark import SparkConf, SparkContext
from pyspark.mllib.recommendation import ALS, Rating


def names():
    namedict = dict()
    file = open("./ml-100k/ml-100k/u.item")
    for record in file:
        fields = record.split("|")
        namedict[int(fields[0])] = fields[1].encode(encoding="ascii", errors="ignore")
    return namedict


conf = SparkConf().setMaster("local[*]").setAppName("ALS recommendation")
sc = SparkContext(conf=conf)
namedict = names()

lines = sc.textFile("file:///SparkCourse/ml-100k/ml-100k/u.data")
train_data = lines.map(lambda x: x.split()).map(
    lambda x: Rating(int(x[0]), int(x[1]), float(x[2]))).cache()

numiter = 10
rank = 10
print("Training model")
model = ALS.train(train_data, rank, numiter)

id = int(sys.argv[1])
movies = train_data.filter(lambda x: x[0] == id)
for i in movies.collect():
    print(namedict[i[1]])

#top 10 recommendation
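# The trailing comment above announces a top-10 recommendation step that is missing;
# a minimal sketch using MatrixFactorizationModel.recommendProducts with the id,
# model and namedict defined above:
print("Top 10 recommendations:")
for rec in model.recommendProducts(id, 10):
    print(namedict[rec.product], rec.rating)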
import csv
from annoy import AnnoyIndex
from pyspark import SparkConf, SparkContext, SparkFiles
from pyspark.mllib.recommendation import ALS, Rating


def main(data_source, users_source, output, number_recs):
    users_data = {}
    with open(users_source) as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            users_data[int(row[0]) - 1] = row[1]

    # This should be changed if running on cluster
    conf = SparkConf().setMaster("local[*]").setAppName("AptoideALS")
    sc = SparkContext(conf=conf)
    sc.setLogLevel("OFF")

    # Load and parse the data
    data = sc.textFile(data_source)
    ratings = data.map(lambda l: l.split(','))\
        .map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2]))).cache()

    # Build the recommendation model using Alternating Least Squares
    seed = 5L
    iterations = 10
    # Is a basic L2 Regularizer to reduce overfitting
    regularization_parameter = 0.1
    # Number of features used to describe items
    rank = 50
    # Is the confidence that we have that the user likes the item
    alpha = 100.0
    model = ALS.trainImplicit(ratings, rank, seed=seed, iterations=iterations,
                              lambda_=regularization_parameter, alpha=alpha)

    # Evaluate the model on training data
    testdata = ratings.map(lambda p: (p[0], p[1]))
    predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
    ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
    MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
    print("Mean Squared Error = " + str(MSE))

    # Use Spotify annoy to get items neighbors based on the feature vectors
    # n_trees -> a larger value will give more accurate results, but larger indexes
    # search_k -> a larger value will give more accurate results, but will take longer time to return
    index = AnnoyIndex(rank, 'angular')
    items = model.userFeatures().collect()
    for i, vector in items:
        # Annoy start at index 0, while Spark starts at index 1. We need to -1 every index
        index.add_item(i - 1, vector)
    # n_trees
    index.build(300)
    index.save("index.ann")
    sc.addPyFile("index.ann")

    # Broadcast: improve performance by sending once per node rather than a once per task
    names = sc.broadcast(users_data)

    # Major function to get recommendations based on features vectors
    # assumes items are numbered 0 ... n-1
    def find_neighbors(iter):
        t = AnnoyIndex(rank)
        t.load(SparkFiles.get("index.ann"))
        # search_k
        return ((x[0] - 1, t.get_nns_by_item(x[0] - 1, int(number_recs)))
                for x in iter)

    # Function to convert into the format required
    # Need to convert inside the RDD so it make us of spark's file writer
    def construct_string(x):
        array = []
        order = int(number_recs)
        for item in x[1]:
            if item != x[0]:
                array.append("(\"{}\",{})".format(names.value[item], str(order)))
            order -= 1
        result = "\"{}\",{}".format(names.value[x[0]], str(array)).replace(
            " ", "").replace("[", "").replace("]", "").replace("'", "")
        return result

    similarRDD = model.productFeatures().mapPartitions(find_neighbors)
    similarRDD.map(construct_string).saveAsTextFile(output)
playerprob = sc.textFile("finalcluster2")
playerprob = playerprob.map(lambda x: x.split(','))
playerprobfinal = playerprob.map(lambda x: compute(x))

# Columns: the two players plus the probability of each outcome (0,1,2,3,4,6 runs or a wicket);
# the last runs column is "6", matching the indexer exclusions and ratingssix below
player_schema = StructType() \
    .add("Bat", "string") \
    .add("Bowl", "string") \
    .add("0", "float") \
    .add("1", "float") \
    .add("2", "float") \
    .add("3", "float") \
    .add("4", "float") \
    .add("6", "float") \
    .add("w", "float")
dataframe = sqlContext.createDataFrame(playerprobfinal, player_schema)

# Index the string columns (the player names) so they can be used as ALS user/product ids
rddtodf = [StringIndexer(inputCol=j, outputCol=j + "_i")
           for j in set(dataframe.columns) - {'0', '1', '2', '3', '4', '6', 'w'}]
dataframe1 = Pipeline(stages=rddtodf)
indexed = dataframe1.fit(dataframe).transform(dataframe)
playerprob = indexed.rdd.map(tuple)

rank = 10
numIterations = 10

# One Rating RDD (and one ALS model) per outcome, keyed by the two indexed player ids
ratingszero = playerprob.map(lambda x: Rating(int(x[9]), int(x[10]), float(x[2])))
ratingsone = playerprob.map(lambda x: Rating(int(x[9]), int(x[10]), float(x[3])))
ratingstwo = playerprob.map(lambda x: Rating(int(x[9]), int(x[10]), float(x[4])))
ratingsthree = playerprob.map(lambda x: Rating(int(x[9]), int(x[10]), float(x[5])))
ratingsfour = playerprob.map(lambda x: Rating(int(x[9]), int(x[10]), float(x[6])))
ratingssix = playerprob.map(lambda x: Rating(int(x[9]), int(x[10]), float(x[7])))
ratingswickets = playerprob.map(lambda x: Rating(int(x[9]), int(x[10]), float(x[8])))

modelzero = ALS.train(ratingszero, rank, numIterations)
modelone = ALS.train(ratingsone, rank, numIterations)
modeltwo = ALS.train(ratingstwo, rank, numIterations)
modelthree = ALS.train(ratingsthree, rank, numIterations)
modelfour = ALS.train(ratingsfour, rank, numIterations)
modelsix = ALS.train(ratingssix, rank, numIterations)
modelwickets = ALS.train(ratingswickets, rank, numIterations)

models = [modelzero, modelone, modeltwo, modelthree, modelfour, modelsix, modelwickets]
from pyspark import SparkConf, SparkContext
from pyspark.mllib.recommendation import ALS, Rating
from pyspark.sql import SQLContext
import pandas as pd
import pyspark.sql.functions as f
from time import time

#initialize spark
conf = SparkConf().setAppName('test')
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")
sqlContext = SQLContext(sc)

#read train file
data = sqlContext.read.format('com.databricks.spark.csv').options(
    header='true').load('train_2.csv')
ratings = data.rdd.map(
    lambda l: Rating(int(l.userID), int(l.movieID), float(l.rating)))
print ratings.take(5)

################ Create model using train set ######################
t0 = time()
rank = 10
numIterations = 10
model = ALS.train(ratings, rank, numIterations)
tt = time() - t0
print "Model trained in %s seconds" % round(tt, 3)

################ Apply model on test set ######################
#read test file
test_data = sqlContext.read.format('com.databricks.spark.csv').options(
    header='true').load('test_2.csv')
test_all = test_data.rdd.map(
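    # The snippet is cut off mid-expression above; a likely completion that keeps only the
    # (user, movie) pairs so the trained model can score them (column names assumed to match
    # the train file):
    lambda l: (int(l.userID), int(l.movieID)))
predictions = model.predictAll(test_all).map(lambda r: ((r[0], r[1]), r[2]))
print predictions.take(5)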
sc = SparkContext("local", "collaborative_filtering") #initializing sc sqlContext = SQLContext(sc) df = sqlContext.read.load("./tables/ratings") #movieId, rating, timestamp, userid num_ratings = df.select("rating").count() num_movies = df.select("movieId").distinct().count() num_users = df.select("userId").distinct().count() ratings = "./ratings.csv" #The path to ratings file. change this according to file location #Loading the data using SparkContext data = sc.textFile(ratings) ratings_data = data.map(lambda l: l.split(',')) ratings = ratings_data.map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2]))) #Building the recommendation model using Alternating Least Squares rank = 10 numIterations = 5 model = ALS.train(ratings, rank, numIterations) #Evaluate the model on training data testdata = ratings.map(lambda p: (p[0], p[1])) predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2])) ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions) MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean() print("Mean Squared Error = " + str(MSE)) #Lets save the model for future use print("Model Computed. Saving the model...")
@author: BuleSky
'''
from pyspark import SparkConf, SparkContext

# Configure the Spark environment
sparkConf = SparkConf().setAppName("WordCounts").setMaster("local")
sc = SparkContext(conf=sparkConf)

from pyspark.mllib.recommendation import ALS, Rating

#file_path = "hdfs://blue:9000/data/md/u.data"
file_path = "file:///E:/spark/learn/ml-100k/"
rawRatings = sc.textFile(file_path).map(lambda line: line.split("\t")[0:3])
ratings = rawRatings.map(
    lambda fields: Rating(int(fields[0]), int(fields[1]), int(fields[2])))
print ratings.first()

model = ALS.train(ratings, 50, 10, 0.01)
userFeaturesCount = model.userFeatures().count()
productFeaturesCount = model.productFeatures().count()
print "userCount {0} productCount {1}".format(userFeaturesCount, productFeaturesCount)

predicted = model.predict(789, 123)
print predicted

userId = 789
k = 10
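# userId and k are set above but unused before the snippet ends; a minimal sketch of the
# likely next step, the top-k recommendations for that user via recommendProducts:
topK = model.recommendProducts(userId, k)
for rec in topK:
    print rec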
"""" Program:als.py Description:sparl内置als算法调用 Author: zhenglei - [email protected] Date: 2016-01-14 12:56:53 Last modified: 2016-01-14 14:45:34 Python release: 2.7 """ from pyspark import SparkContext from pyspark.mllib.recommendation import ALS, Rating if __name__ == '__main__': sc = SparkContext() data = sc.textFile("alsTest.data") ratings = data.map(lambda l: l.split(',')).map( lambda l: Rating(int(l[0]), int(l[1]), float(l[2]))) print ratings.collect() rank = 10 numIterations = 10 # 训练模型, rank是隐含影响特征,一般是初始为5-10,然后递增查看训练效果,直到效果不再改变,确定rank的值 model = ALS.train(ratings, rank, numIterations) testdata = ratings.map(lambda p: (p[0], p[1])) # 对输入的数据进行预测 predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2])) print predictions.collect() # 获取所有预测及测试数据 ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join( predictions) print ratesAndPreds.collect() # 计算误差
def train():
    conf = SparkConf() \
        .setAppName("project") \
        .setMaster("local[*]") \
        .set("spark.driver.memory", "4g")
    sc = SparkContext(conf=conf)

    # check model dir
    if not os.path.exists(model_dir):
        os.mkdir(model_dir)

    # rename
    raw_data = sc.textFile(train_file).map(json.loads).persist(StorageLevel.MEMORY_AND_DISK)
    u_table1 = raw_data.map(lambda x: x['user_id']).distinct().collect()
    u_set1 = set(u_table1)
    b_table1 = raw_data.map(lambda x: x['business_id']).distinct().collect()
    b_set1 = set(b_table1)
    user_avg = support.getAvg(user_avg_file)
    business_avg = support.getAvg(business_avg_file)
    u_set2 = set(user_avg.keys())
    b_set2 = set(business_avg.keys())
    b_table3 = sc.textFile(business_json).map(json.loads).map(lambda x: x['business_id']).collect()
    b_set3 = set(b_table3)
    u_table = list(u_set1.union(u_set2))
    b_table = list(b_set1.union(b_set2).union(b_set3))
    u_d = {u_table[i]: i for i in range(len(u_table))}
    b_d = {b_table[i]: i for i in range(len(b_table))}

    # augmentation
    business_avg = support.getAvg(business_avg_file)
    n_b_avg = {b_d[k]: business_avg[k] for k in business_avg}

    # get stopwords
    stopwords = sc.textFile(stopwords_file).collect()
    b_profile = sc.textFile(business_json) \
        .map(json.loads) \
        .map(lambda x: (x['business_id'], x['categories'])) \
        .map(lambda x: (b_d[x[0]], x[1])) \
        .mapValues(lambda v: processCategories(v, stopwords)) \
        .collectAsMap()
    b_list = list(sorted(b_profile.keys()))
    b_length = len(b_profile)
    jaccard_sim = sc.parallelize(b_list) \
        .flatMap(lambda x: getJS(x, b_profile, b_list)) \
        .reduceByKey(lambda x, y: x + y) \
        .mapValues(lambda vs: {k: v for k, v in vs}) \
        .collect()
    agm_data = raw_data.map(lambda r: (r['user_id'], r['business_id'], r['stars'])) \
        .map(lambda x: (u_d[x[0]], b_d[x[1]], x[2])) \
        .map(lambda x: (x[0], [(x[1], x[2])])) \
        .reduceByKey(lambda x, y: x + y) \
        .mapValues(lambda vs: processValues(vs, jaccard_sim, n_b_avg)) \
        .flatMap(lambda x: [(x[0], b, star) for b, star in x[1]]) \
        .persist(StorageLevel.MEMORY_AND_DISK)

    # ALS
    agm_train = agm_data.map(lambda x: ((u_table[x[0]], b_table[x[1]]), x[2])).collect()
    support.writeDownRenameTable(agm_train, agm_train_file)
    lonely_user = agm_data.map(lambda x: (x[0], 1)) \
        .reduceByKey(lambda x, y: x + y) \
        .filter(lambda x: x[1] < LONELY_USER_THRESHOLD) \
        .map(lambda x: x[0]) \
        .collect()
    lonely_business = agm_data.map(lambda x: (x[1], 1)) \
        .reduceByKey(lambda x, y: x + y) \
        .filter(lambda x: x[1] < LONELY_BUSINESS_THRESHOLD) \
        .map(lambda x: x[0]) \
        .collect()
    stars_data = agm_data.filter(lambda x: x[0] not in lonely_user and x[1] not in lonely_business) \
        .map(lambda x: Rating(x[0], x[1], x[2])).persist(StorageLevel.MEMORY_AND_DISK)

    sc.setCheckpointDir(checkpoint_file)
    ALS.checkpointInterval = 2
    modelRDD = ALS.train(ratings=stars_data, rank=1, iterations=70, lambda_=0.01, nonnegative=True)
    saveAlsModel(modelRDD, u_table, b_table, als_model_file)
def modelbased(inputfile, valfile, outputfile):
    sc = SparkContext(appName="Task2.1")
    start = time.time()

    data = sc.textFile(inputfile)
    data_header = data.first()
    input_data_final = data.filter(lambda rec: rec != data_header).map(
        lambda string_record: string_record.split(','))

    userdata = input_data_final.map(lambda x: x[0]).collect()
    businessdata = input_data_final.map(lambda x: x[1]).collect()

    usermap = {}
    businessmap = {}
    reverseusermap = {}
    reversebusinessmap = {}
    for idx, user in enumerate(userdata):
        usermap[user] = idx
        reverseusermap[idx] = user
        # idx+=1
    for idx, business in enumerate(businessdata):
        businessmap[business] = idx
        reversebusinessmap[idx] = business
        # idx+=1

    ratings = input_data_final.map(
        lambda x: Rating(int(usermap[x[0]]), int(businessmap[x[1]]), float(x[2])))
    rank = 2
    numIterations = 5
    model = ALS.train(ratings, rank, numIterations)

    test1 = sc.textFile(valfile)
    test_data_header = test1.first()
    testRDD = test1.filter(lambda rec: rec != test_data_header).map(
        lambda string_record: string_record.split(','))
    test_user = testRDD.map(lambda x: x[0]).collect()
    test_business = testRDD.map(lambda x: x[1]).collect()

    # Assign fresh integer ids to users/businesses that only appear in the validation set
    for newIdx, user in enumerate(test_user):
        if user not in usermap:
            while newIdx in usermap.values():
                newIdx += 1
            usermap[user] = newIdx
            reverseusermap[newIdx] = user
    for newIdx, business in enumerate(test_business):
        if business not in businessmap:
            while newIdx in businessmap.values():
                newIdx += 1
            businessmap[business] = newIdx
            reversebusinessmap[newIdx] = business

    testingRDD = testRDD.map(
        lambda x: Rating(int(usermap[x[0]]), int(businessmap[x[1]]), float(x[2])))
    testing_data = testingRDD.map(lambda x: (x[0], x[1]))
    testprediction = model.predictAll(testing_data).map(
        lambda x: ((x[0], x[1]), x[2])).cache()
    predictions = testprediction.map(lambda x: (x[0][0], x[0][1], x[1])).collect()

    file = open(outputfile, "w")
    file.write('user_id, business_id, prediction\n')
    for pred in predictions:
        file.write(str(reverseusermap[pred[0]]) + "," +
                   str(reversebusinessmap[pred[1]]) + "," + str(pred[2]) + "\n")
    file.close()

    end = time.time()
    print("Duration: ", end - start)

    ratesAndPreds = testingRDD.map(lambda r: ((r[0], r[1]), r[2])).join(testprediction)
    MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1]) ** 2).mean()
    # The printed value is the square root of the MSE, i.e. the RMSE
    print("Root Mean Squared Error = ", str(MSE ** 0.5))
def parseRating(line):
    # Parsing SteamID,AppID,Rating
    line = line.split(',')
    return Rating(int(line[0]), int(line[1]), float(line[2]))
def compute_model_cf(ftrain, ftest):
    test_data_1 = sc.textFile(ftest)
    data = test_data_1.map(lambda line: readtestdata(line))
    header_info = data.first()
    test_data = data.filter(lambda ratings: ratings != header_info).map(
        lambda rowData: (int(rowData[0]), int(rowData[1]))).persist()

    train_data_1 = sc.textFile(ftrain).map(lambda lines: readtraindata(lines))
    header_info_train = train_data_1.first()
    train_data = train_data_1.filter(
        lambda rating: rating != header_info_train).map(lambda rowData: (
            (int(rowData[0]), int(rowData[1])), float(rowData[2])))

    user_movies = train_data.map(lambda row: row[0])
    #print user_movies.collect()
    train_movies = user_movies.subtract(test_data).map(lambda r: (r, 0))
    #print train_movies.collect()
    training_movies = train_data.join(train_movies).map(lambda r: (r[0], r[1][0]))
    #print training_movies.collect()
    test_movies = test_data.map(lambda r: (r, 0))
    test_movie_rates = train_data.join(test_movies)
    test_movie_ratings = train_data.join(test_movie_rates).map(lambda r: (r[0], r[1][0]))

    ratings = training_movies.map(lambda r: Rating(r[0][0], r[0][1], r[1]))
    rank = 7
    iters = 10
    model = ALS.train(ratings, rank, iters)

    predictions = model.predictAll(test_data).map(lambda r: ((r[0], r[1]), r[2]))
    outliers = predictions.filter(lambda r: r[1] < 0.0 or r[1] > 5.0)
    true_Predictions = predictions.subtract(outliers)
    predicted_movies_rdd = predictions.map(lambda r: r[0])
    missing_movies_rdd = test_data.subtract(predicted_movies_rdd)
    missing_Outliers = outliers.map(lambda r: r[0]).union(missing_movies_rdd)

    # Replace missing / out-of-range predictions with the user's mean predicted rating
    userRatings = predictions.map(lambda r: (r[0][0], r[1]))
    mean_Ratings = userRatings.aggregateByKey(
        (0.0, 0.0),
        lambda U, s: (U[0] + s, U[1] + 1),
        lambda U, V: (U[0] + V[0], U[1] + V[1])).mapValues(
            lambda res: 1.0 * res[0] / res[1])
    user_movie_ratings = missing_Outliers.join(mean_Ratings).map(
        lambda r: ((r[0], r[1][0]), r[1][1]))
    predictions_union = true_Predictions.union(user_movie_ratings)

    ratesAndPreds = test_movie_ratings.join(predictions_union).sortByKey()
    MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
    RMSE = math.sqrt(MSE)

    difference = ratesAndPreds.map(lambda r: abs(r[1][0] - r[1][1]))
    bet_0_1 = difference.filter(lambda r: r >= 0 and r < 1.0).count()
    bet_1_2 = difference.filter(lambda r: r >= 1.0 and r < 2.0).count()
    bet_2_3 = difference.filter(lambda r: r >= 2.0 and r < 3.0).count()
    bet_3_4 = difference.filter(lambda r: r >= 3.0 and r < 4.0).count()
    bet_4_5 = difference.filter(lambda r: r >= 4.0).count()
    #squared_error = ratesAndPreds.map(lambda r: math.pow((r[1][0] - r[1][1]), 2)).mean()

    print()
    print(">=0 and <1: ", bet_0_1)
    print(">=1 and <2: ", bet_1_2)
    print(">=2 and <3: ", bet_2_3)
    print(">=3 and <4: ", bet_3_4)
    print(">=4: ", bet_4_5)
    print("RMSE = ", RMSE)

    headers = [["UserId", "MovieId", "Pred_rating"]]
    header = sc.parallelize(headers)
    resultRDD = ratesAndPreds.map(
        lambda r: (str(r[0][0]), str(r[0][1]), str(r[1][1]))).repartition(1)
    fileRDD = header.union(resultRDD).map(
        lambda r: r[0] + "," + r[1] + "," + r[2])
    fileRDD = fileRDD.repartition(1)
    fileRDD.saveAsTextFile("Prashanth_Manja_ModelBasedCF.txt")

    end_time = time.time()
    total_time = end_time - start_time
    print("Total_Execution_Time_Is-->", str(total_time))
def tup_to_rating(tup):
    user, subreddit, num = tup
    user = javahash(user)
    subreddit = javahash(subreddit)
    num = float(num)
    return Rating(user, subreddit, num)
inputRDD = sc.textFile(inputFile)
testRDD = sc.textFile(testFile)

header = inputRDD.first()  #extract header
inputRDD = inputRDD.filter(lambda row: row != header)
header2 = testRDD.first()  #extract header
testRDD = testRDD.filter(lambda row: row != header2)

inputRDD = inputRDD.map(lambda line: line.split(',')).map(
    lambda x: ((int(x[0]), int(x[1])), float(x[2])))
testRDD = testRDD.map(lambda line: line.split(',')).map(
    lambda x: ((int(x[0]), int(x[1])), 1))

# Train only on the pairs that are not held out in the test set
input1 = inputRDD.subtractByKey(testRDD)
input = input1.map(lambda x: Rating(x[0][0], x[0][1], x[1]))

sc.setCheckpointDir('/tmp')
rank = 8
numIterations = 10
lmbda = 0.1
numBlocks = 16
nonnegative = True
model = ALS.train(input, rank, numIterations, lmbda, nonnegative=True, seed=42)

testRDD = testRDD.map(lambda x: (x[0][0], x[0][1])).distinct()
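# Scoring the held-out pairs is cut off above; a minimal sketch of the usual next step:
predictions = model.predictAll(testRDD).map(lambda r: ((r[0], r[1]), r[2]))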
        user_map[user_set[i]] = user_id
        user_id += 1

business_id = 0
for i in range(0, len(business_set)):
    if business_set[i] not in business_map:
        business_map[business_set[i]] = business_id
        business_id += 1

Expected_rating = test_rdd.map(lambda x: (
    (user_map[x[0][0]], business_map[x[0][1]]), float(x[1])))
test_data = Expected_rating.map(lambda x: (x[0][0], x[0][1]))
training_data = train_rdd.map(lambda x: Rating(
    user_map[x[0][0]], business_map[x[0][1]], float(x[1])))

model = ALS.train(training_data, rank=2, iterations=20, lambda_=0.5)
predictions = model.predictAll(test_data).map(lambda x: ((x[0], x[1]), x[2]))
prediction_list = predictions.collect()

sum_predicted_ratings = predictions.values().sum()
total_count_ratings = predictions.count()
average_predicted_rating = float(sum_predicted_ratings) / float(total_count_ratings)
test_data_list = test_data.collect()
    numHits = 0.0
    for (i, p) in zip(range(k), predK):
        if actual.filter(lambda x: x == p).count() != 0:
            numHits += 1.0
            score += numHits / (i + 1.0)
    if actual.isEmpty():
        return 1
    else:
        return score / min(actual.count(), k)

# Convert to the Rating class
rawData = sc.textFile("../data/ml-100k/u.data")
rawRatings = rawData.map(lambda lines: lines.split('\t')[:3])
ratings = rawRatings.map(
    lambda fields: Rating(int(fields[0]), int(fields[1]), float(fields[2])))
implicit_ratings = ratings.map(
    lambda fields: Rating(fields[0], fields[1], int(to_implicit(fields[2]))))

"""
Model parameters:
rank: number of factors in the ALS model, i.e. the number of latent features in the
      low-rank approximation matrices; typically 10-200
iterations: number of iterations, around 10
lambda: controls the regularisation of the model to prevent overfitting; should be
        tuned via cross-validation

classmethod train(ratings, rank, iterations=5, lambda_=0.01, blocks=-1, nonnegative=False, seed=None)
classmethod trainImplicit(ratings, rank, iterations=5, lambda_=0.01, blocks=-1, alpha=0.01, nonnegative=False, seed=None)
"""
model = ALS.train(ratings, 50, 10, 0.01)
# The implicit model is trained with trainImplicit on the implicit ratings built above
model_implicit = ALS.trainImplicit(implicit_ratings, 50, 10)
"""
# Apache Spark Config Settings
conf = (SparkConf().setAll([("spark.driver.memory", "5G"),
                            ("spark.driver.maxResultSize", "2G"),
                            ("spark.executor.memory", "3G")]))
sc = SparkContext(conf=conf, appName="CollaborativeFiltering")

# Loading normalized train and test data from hadoop filesystem
train_norm_data = sc.textFile(
    'file:///users/jeevan4/challenge2/train_norm_data.txt')
test_norm_data = sc.textFile(
    'file:///users/jeevan4/challenge2/test_norm_data.txt')

# Converting the data into Ratings to supply the data into ALS algorithm
train_ratings = train_norm_data.map(lambda x: x.split(",")).map(
    lambda l: Rating(l[0], l[1], float(l[2]))).persist(
        StorageLevel.MEMORY_ONLY)
test_ratings = test_norm_data.map(lambda x: x.split(",")).map(
    lambda l: Rating(l[0], l[1], float(l[2]))).persist(
        StorageLevel.MEMORY_ONLY)

# ALS algorithm parameters and building recommendation model using Alternating Least Squares
rank = 10
numIterations = 10
model = ALS.train(train_ratings, rank, numIterations, 0.01)

#Evaluate the model on test data
testdata = test_ratings.map(lambda p: (p[0], p[1]))
predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
ratesAndPreds = test_ratings.map(lambda r: ((r[0], r[1]), r[2])).join(
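    # The join is cut off above; a likely completion following the same pattern used in the
    # other snippets here, joining on the predictions and reporting the MSE (an assumption):
    predictions)
MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1]) ** 2).mean()
print("Mean Squared Error = " + str(MSE))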
            ratings[k] = 0
        ratings[k] += meta2[2][k]
    return (users, items, ratings)


# perform datapass to compute stats
rating_stats = df.rdd.map(
    lambda row: metaMap(row, user_index, item_index, rating_index)).reduce(
        lambda x, y: metaReduce(x, y))
print("Total number of distinct items: " + str(len(rating_stats[1])))
print("Total number of distinct users: " + str(len(rating_stats[0])))
print("Rating frequencies:")
ratings_range = sorted(rating_stats[2].keys())
for rating in ratings_range:
    print("\t" + str(rating) + ": " + str(rating_stats[2][rating]))

ratings = df.map(lambda l: Rating(int(l[user_index]), int(l[item_index]), float(l[rating_index])))

# Build the recommendation model using Alternating Least Squares
model = ALS.train(ratings, rank, iterations, lmbda, blocks, seed=random_seed)

# Save and load model
model.save(sc, model_path)
saveDummy(sc, dummypath)
ascontext.setModelContentFromPath("dummy", dummypath)
#c.execute("SELECT * FROM ratings WHERE user_id = %s",(user,)) c.execute("SELECT user_id, movie_id, rating FROM ratings ") query = c.fetchall() #### MYSQL ######################################################################### conf = SparkConf().setMaster("local[*]").setAppName("MovieRecommendationsALS") sc = SparkContext(conf=conf) sc.setCheckpointDir('checkpoint') print("\nLoading movie names...") nameDict = loadMovieNames() data = sc.parallelize(query) # data = sc.textFile("file:/Users/alejandroaparicio/Documents/SparkCourse/ml-latest-small/ratings.csv") ratings = data.map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2]))).cache() # Build the recommendation model using Alternating Least Squares print("\nTraining recommendation model...") #rank = 10 # Lowered numIterations to ensure it works on lower-end systems #numIterations = 10 #model = ALS.train(ratings, rank, numIterations) seed = 5 iterations = 10 regularization_parameter = 0.1 rank = 8 model = ALS.train(ratings, rank, seed=seed,