def fit_final_model(train):
    # model params
    iterations = 20
    reg = 0.0875
    rank = 6
    model = ALS.train(train.rdd.map(lambda x: (x[0], x[1], x[2])),
                      rank=rank, nonnegative=True,
                      iterations=iterations, lambda_=reg)
    return model
def model_param_sweep(train, test):
    # model params
    iterations = 20
    regularization_param_list = np.linspace(0.05, 0.2, 5)
    # params used in keeping track of error between different ranks
    rank_list = [4, 6, 8]
    errors = np.zeros(len(regularization_param_list) * len(rank_list))
    err = 0
    min_error = float('inf')
    max_class_rate = 0
    best_rank = -1
    best_iteration = -1
    for rank in rank_list:
        for reg in regularization_param_list:
            model = ALS.train(train.rdd.map(lambda x: (x[0], x[1], x[2])),
                              rank=rank, nonnegative=True,
                              iterations=iterations, lambda_=reg)
            predictions = model.predictAll(test.rdd.map(lambda r: (r[0], r[1]))) \
                               .map(lambda x: ((int(x[0]), int(x[1])), float(x[2])))
            rates_and_preds = test.rdd.map(
                lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
            correct_count = rates_and_preds.filter(
                lambda r: (abs(r[1][0] - r[1][1]) < 1) or
                          (r[1][0] < 6 and r[1][1] < 6)).count()
            total_count = rates_and_preds.count()
            class_rate = correct_count * 1. / total_count
            error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
            errors[err] = error
            err += 1
            print('For rank=%s, regParam=%s the RMSE is %s with a correct '
                  'classification rate of %0.3f' % (rank, reg, error, class_rate))
            if class_rate > max_class_rate:
                max_class_rate = class_rate
                best_rank = (rank, reg)
    print('The best model was trained with (rank, regParam): %s and had class rate %0.3f'
          % (str(best_rank), max_class_rate))
def alq_spark(A, k, sc, **kwargs):
    """
    Args:
    - A: sign matrix (csr_matrix)
    - k: number of clusters
    - sc: the spark context
    - kwargs: parameters for ALS.train except for ratings
      https://spark.apache.org/docs/1.5.1/api/python/pyspark.mllib.html#pyspark.mllib.recommendation.ALS.train

    Return:
    X: np.ndarray (n x k)
    Y: np.ndarray (k x n)
    """
    edges = indexed_entries(A)
    edges_rdd = sc.parallelize(edges)
    model = ALS.train(edges_rdd, rank=k, **kwargs)
    u_ft = model.userFeatures()
    p_ft = model.productFeatures()

    X = u_ft.sortByKey(ascending=True).collect()
    Y = p_ft.sortByKey(ascending=True).collect()

    X = np.array(list(zip(*X))[1])
    Y = np.transpose(np.array(list(zip(*Y))[1]))
    return X, Y
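# indexed_entries is not defined in this snippet; a minimal sketch, assuming
# it yields the (row, col, value) triples of a scipy csr_matrix, which is the
# shape ALS.train expects for its ratings argument.
from scipy.sparse import csr_matrix

def indexed_entries(A):
    coo = A.tocoo()  # COO format exposes parallel row/col/data arrays
    return [(int(i), int(j), float(v))
            for i, j, v in zip(coo.row, coo.col, coo.data)]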
def train(self):
    "Train the model with new data and write to file"
    user_lookup, course_lookup = self.__prepare_data()

    # send list of (user_id, course_id, rating) triples to the ML algorithm
    log.info('Loading ratings data')
    ratings_RDD_raw = self.sc.parallelize(m.UserCourse.objects)
    self.ratings_RDD = (ratings_RDD_raw
        .filter(lambda ratings: ratings.course_review.interest is not None)
        .map(lambda ratings: (user_lookup[str(ratings.user_id)],
                              course_lookup[ratings.course_id],
                              float(ratings.course_review.interest)))
    ).cache()

    training_error, test_error = self._report_error(self.ratings_RDD)

    log.info('Training model')
    model = ALS.train(self.ratings_RDD, _PARAMS['rank'],
                      _PARAMS['num_iterations'], _PARAMS['reg_param'])
    log.info('Model trained!')

    model_path = os.path.join(os.path.dirname(__file__),
                              '%s/trained_model' % c.RECOMMENDATION_DIR)
    if os.path.isdir(model_path):
        rmtree(model_path)
    model.save(self.sc, model_path)

    self._report_metrics(num_courses=self.ratings_RDD.count(),
                         training_error=training_error,
                         test_error=test_error)
def main():
    training_data = sc.textFile(training_inputs)
    testing_data = sc.textFile(testing_inputs)

    training_ratings = training_data.map(get_tuple).cache()
    testing_ratings = testing_data.map(get_tuple).cache()
    # Python 3 lambdas cannot unpack tuples, so index the fields instead
    testing_all = testing_ratings.map(lambda t: (t[0], t[1])).cache()
    ratings = testing_ratings.map(to_Rating)

    ranks = [2, 4, 8, 16, 32, 64, 128, 256]
    reg_params = [0.1, 0.01]
    for i in range(len(reg_params)):
        RMSES = []
        for rank in ranks:
            model = ALS.train(training_ratings, rank=rank, lambda_=reg_params[i], seed=10)
            predictions = model.predictAll(testing_all).map(lambda r: ((r[0], r[1]), r[2]))
            ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
            MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
            RMSE = math.sqrt(MSE)
            RMSES.append(RMSE)
        plt.plot(range(len(ranks)), RMSES, label=str(reg_params[i]))
    plt.xticks(range(len(ranks)), ranks, size='small')
    plt.legend()
    plt.show()
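# get_tuple and to_Rating are not shown and the input file layout is unknown;
# a hypothetical sketch assuming comma-separated "user,movie,rating" lines
# and that Rating is imported from pyspark.mllib.recommendation.
def get_tuple(line):
    user, movie, rating = line.split(',')
    return (int(user), int(movie), float(rating))

def to_Rating(t):
    return Rating(t[0], t[1], t[2])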
def grid_search(train_df, test_df, X_test_df, y_test):
    ranks = [6]  # , 8, 12, 18]
    lambdas = list(np.arange(0.1, 0.5, 0.1))
    numIters = [20]
    bestModel = None
    bestValidationRmse = float("inf")
    bestRank = 0
    bestLambda = -1.0
    bestNumIter = -1
    for rank, lmbda, numIter in itertools.product(ranks, lambdas, numIters):
        model = ALS.train(train_df, rank, numIter, lmbda)
        validationRmse = computeRMSE(model, test_df, X_test_df, len(y_test))
        print("RMSE (validation) = %f for the model trained with "
              "rank = %d, lambda = %.1f, and numIter = %d."
              % (validationRmse, rank, lmbda, numIter))
        if validationRmse < bestValidationRmse:
            bestModel = model
            bestValidationRmse = validationRmse
            bestRank = rank
            bestLambda = lmbda
            bestNumIter = numIter
    # evaluate the best model on the test set
    testRmse = computeRMSE(bestModel, test_df, X_test_df, len(y_test))
    print("The best model was trained with rank = %d and lambda = %.1f, "
          "and numIter = %d, and its RMSE on the test set is %f."
          % (bestRank, bestLambda, bestNumIter, testRmse))
    return bestModel
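# computeRMSE is not defined in this snippet; a plausible sketch, assuming
# test_df holds (user, item, rating) triples, X_test_df holds the (user, item)
# pairs to score, n is the number of test ratings, and math is imported.
def computeRMSE(model, test_df, X_test_df, n):
    predictions = model.predictAll(X_test_df).map(lambda r: ((r[0], r[1]), r[2]))
    rates_and_preds = test_df.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
    return math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).sum() / float(n))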
def train(self, rank, iterations=10, lambda_=0.01, seed=0, **kwargs):
    """
    Train the model.

    Parameters
    ----------
    rank : int
        The number of factors in the underlying model. Generally, larger
        numbers of factors lead to better models, but increase the memory
        required. A rank in the range of 10 to 200 is usually reasonable.

    iterations : int, optional
        The number of iterations to perform. With each iteration, the
        model improves. ALS typically converges quickly, so a value of 10
        is recommended.

    lambda_ : float, optional
        This parameter controls regularization, which guards against
        overfitting. Higher values of lambda_ apply more regularization.
        The appropriate value depends on the problem, and needs to be tuned
        by train/test techniques, which measure overfitting.

    Returns
    -------
    out : model
        A RecommenderModel. This can be used to make predictions on how a
        user would rate an item.
    """
    ratings = self._prepare_ratings()
    model = ALS.train(ratings.to_rdd(), rank,
                      iterations=iterations,
                      lambda_=lambda_,
                      seed=seed,
                      **kwargs)

    return MatrixFactorizationModel(model, self.ratings, self.user_col,
                                    self.item_col, self.rating_col)
def build_ALS_model(ratings):
    # Build the recommendation model using Alternating Least Squares
    rank = 10
    numIterations = 20
    model = ALS.train(ratings, rank, numIterations)
    return model
def main():
    """
    Train and evaluate an ALS recommender.
    """
    # Set up environment
    sc = SparkContext("local[*]", "RecSys")

    # Load and parse the data
    data = sc.textFile("./data/ratings.dat")
    ratings = data.map(parse_rating)

    # Build the recommendation model using Alternating Least Squares
    rank = 10
    iterations = 20
    model = ALS.train(ratings, rank, iterations)

    movies = sc.textFile("./data/movies.dat")\
               .map(parse_movie)

    # Evaluate the model on training data
    testdata = ratings.map(lambda p: (p[0], p[1]))
    predictions = model.predictAll(testdata)\
                       .map(lambda r: ((r[0], r[1]), r[2]))
    rates_and_preds = ratings.map(lambda r: ((r[0], r[1]), r[2]))\
                             .join(predictions)
    MSE = rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
    print("Mean Squared Error = " + str(MSE))
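# parse_rating and parse_movie are not shown; a sketch assuming the
# MovieLens *.dat layout ("user::movie::rating::timestamp" and
# "movieId::title::genres"), which the file names above suggest.
def parse_rating(line):
    fields = line.split("::")
    return (int(fields[0]), int(fields[1]), float(fields[2]))

def parse_movie(line):
    fields = line.split("::")
    return (int(fields[0]), fields[1])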
def als(data):
    train, test = data.randomSplit(weights=[0.8, 0.2])
    X_train = train.map(lambda r: Rating(r[0], r[1], r[2]))
    y = test.map(lambda r: ((r[0], r[1]), r[2]))
    X_test = test.map(lambda r: (r[0], r[1]))
    rank = 7
    X_train.cache()
    X_test.cache()
    lambdas = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    numIterations = 10
    nonnegative = True
    bestModel = None
    error = float('Inf')
    errors = []
    # Use ALS to predict play time for test users and choose the best lambda
    for lmbda in lambdas:
        model = ALS.train(X_train, rank, numIterations, lmbda, nonnegative=nonnegative)
        y_hat = model.predictAll(X_test).map(lambda r: ((r[0], r[1]), r[2]))
        ratesAndPreds = y.join(y_hat)
        MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
        errors.append(MSE)
        if MSE < error:
            bestModel = model
            error = MSE
    # Plot mean squared error vs. lambda
    plt.plot(lambdas, errors, 'ro')
    plt.xlabel(r'$\lambda$')
    plt.ylabel(r'$MSE$')
    plt.title(r'MSE v.s. $\lambda$')
    plt.savefig('cross_validation_p.png')
    # Make predictions using the best model (the original called `model`,
    # which is whatever the last loop iteration produced, not the best one)
    y_hat = bestModel.predictAll(X_test).map(lambda r: (r[0], r[1], r[2]))
    y_hat.map(toCVSLine).saveAsTextFile('prediction')
    return bestModel, error
def find_best_model(data):
    global bestRank
    global bestLambda
    global bestNumIter
    bestRank = 0
    bestLambda = -1.0
    bestNumIter = -1
    ranks = [8, 12]
    lambdas = [0.1, 10.0]
    numIters = [10, 20]
    min_error = float('inf')
    training, validation, test = data.randomSplit([0.6, 0.2, 0.2], 6)
    for rank, lmbda, numIter in itertools.product(ranks, lambdas, numIters):
        ALS.checkpointInterval = 2
        training_data = training.map(lambda xs: [int(x) for x in xs])
        model = ALS.train(training_data, rank, numIter, lmbda)
        validation_data = validation.map(lambda p: (int(p[0]), int(p[1])))
        predictions = model.predictAll(validation_data).map(lambda r: ((r[0], r[1]), r[2]))
        ratings_and_predictions = validation.map(
            lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
        error = sqrt(ratings_and_predictions.map(lambda r: (r[1][0] - r[1][1])**2).mean())
        print('For rank %s the RMSE is %s' % (rank, error))
        if error < min_error:
            min_error = error
            bestRank = rank
            bestLambda = lmbda
            bestNumIter = numIter
    print('The best model was trained with rank %s' % bestRank)
def __train_model(self):
    """Train the ALS model with the current dataset
    """
    logger.info("Training the ALS model...")
    self.model = ALS.train(self.ratings_RDD, self.rank, seed=self.seed,
                           iterations=self.iterations,
                           lambda_=self.regularization_parameter)
    logger.info("ALS model built!")
def evaluate(sc, raw_user_movies, raw_hot_movies):
    movies_name = build_movies(raw_hot_movies)
    user_id_to_int = raw_user_movies.map(
        lambda line: line.split(',')[0]).distinct().zipWithUniqueId().collectAsMap()
    ratings = build_ratings(raw_user_movies, user_id_to_int)
    num_iterations = 10
    for rank in [10, 50]:
        for lam in [1.0, 0.01, 0.0001]:
            model = ALS.train(ratings, rank, num_iterations, lam)
            user_movies = ratings.map(lambda tokens: (tokens[0], tokens[1]))
            predictions = model.predictAll(user_movies).map(lambda r: ((r[0], r[1]), r[2]))
            print(predictions.take(3))
            rates_and_preds = ratings.map(
                lambda tokens: ((tokens[0], tokens[1]), tokens[2])).join(predictions)
            print(rates_and_preds.take(3))
            # taking the square root gives the *root* mean squared error
            rmse = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
            print("(rank: %d, lambda: %f) Root Mean Squared Error = %f" % (rank, lam, rmse))
    for rank in [10, 50]:
        for lam in [1.0, 0.01, 0.0001]:
            for alpha in [1.0, 40.0]:
                model = ALS.trainImplicit(ratings, rank, num_iterations, lam, alpha=alpha)
                user_movies = ratings.map(lambda tokens: (tokens[0], tokens[1]))
                predictions = model.predictAll(user_movies).map(lambda r: ((r[0], r[1]), r[2]))
                rates_and_preds = ratings.map(
                    lambda tokens: ((tokens[0], tokens[1]), tokens[2])).join(predictions)
                print(rates_and_preds.take(3))
                rmse = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
                print("(rank: %d, lambda: %f, alpha: %f, implicit) Root Mean Squared Error = %f"
                      % (rank, lam, alpha, rmse))
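# build_ratings (and build_movies) are defined elsewhere; a sketch of
# build_ratings, assuming each raw line is "userName,movieId,rating" and
# user names are remapped to integers via user_id_to_int, as the
# zipWithUniqueId() mapping above implies.
def build_ratings(raw_user_movies, user_id_to_int):
    return raw_user_movies.map(lambda line: line.split(',')).map(
        lambda fields: Rating(user_id_to_int[fields[0]],
                              int(fields[1]),
                              float(fields[2])))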
def __train_model(self, ratings_RDD):
    """Train the ALS model with the current dataset
    """
    model = ALS.train(ratings_RDD, self.rank, seed=self.seed,
                      iterations=self.iterations,
                      lambda_=self.regularization_parameter)
    return model
def _recommend(self, train_ratings, users):
    from pyspark.mllib.recommendation import ALS, Rating

    # Preparing the user/item mapping as integers, since Spark's ALS
    # implementation only works with integer values
    train_ratings['user'] = train_ratings['user'].astype('category')
    train_ratings['item'] = train_ratings['item'].astype('category')
    user_cat, item_cat = train_ratings['user'].cat, train_ratings['item'].cat
    self.user_cat = user_cat
    self.item_cat = item_cat
    self.train_ratings = train_ratings

    # Training the model
    self.ratings = self.sc.parallelize(
        Rating(u, i, rating) for u, i, rating in
        zip(user_cat.codes, item_cat.codes, train_ratings.rating))
    if self.implicit:
        model = ALS.trainImplicit(self.ratings, **self.spark_args)
    else:
        model = ALS.train(self.ratings, **self.spark_args)

    # Getting predictions from the model
    self.ratings_to_predict = self.sc.parallelize(
        (user, item) for user in users for item in item_cat.codes.unique())
    self.predictions = model.predictAll(self.ratings_to_predict).collect()

    # Presenting the recommendations as a DataFrame
    self.predictions = [(user_cat.categories[p.user],
                         item_cat.categories[p.product],
                         p.rating) for p in self.predictions]
    self.predictions_df = pd.DataFrame(self.predictions,
                                       columns=['user', 'item', 'rating'])
    return self.predictions_df
def train_and_predict():
    training = sc.textFile('member_item_file').map(parseRating).cache()
    # now train the model using ALS
    rank = 10
    number_of_iterations = 10
    model = ALS.train(training, rank, number_of_iterations)
    print(model)
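# parseRating is not shown and the member_item_file layout is unknown; a
# hypothetical sketch assuming comma-separated "member,item,rating" lines.
def parseRating(line):
    member, item, rating = line.split(',')
    return (int(member), int(item), float(rating))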
def train_als(data):
    # map ratings into Rating objects comprised of [user, movie, rating]
    data = data.map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))
    rank = 10
    numIterations = 10
    model = ALS.train(data, rank, numIterations)
    return model, data
def compare_recommand_result_byMSE(rank, inter, lambda_value):
    rating_data = row_data_rdd.map(lambda x: x.split("\t")).map(
        lambda raw_rating_data: Rating(int(raw_rating_data[0]),
                                       int(raw_rating_data[1]),
                                       float(raw_rating_data[2])))
    rating_model = ALS.train(rating_data, rank, inter, lambda_value)
    user_product = rating_data.map(lambda r: (r[0], r[1]))
    predictions = rating_model.predictAll(user_product).map(lambda r: ((r[0], r[1]), r[2]))
    ratesAndPreds = rating_data.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
    MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
    print("Mean Squared Error = " + str(MSE))
def _report_error(self, data_set):
    training_set, test_set = data_set.randomSplit([7, 3], seed=0)
    model = ALS.train(training_set, _PARAMS['rank'],
                      _PARAMS['num_iterations'], _PARAMS['reg_param'])
    training_error = self._get_rsme(model, training_set)
    test_error = self._get_rsme(model, test_set)
    return training_error, test_error
def run():
    log.info("train model start...")
    sql_user_rating = "select uid,pid,rating from user_rating"
    cur.execute(sql_user_rating)
    rdd_user_rating = sc.parallelize(cur.fetchall())
    ratings = rdd_user_rating.map(lambda x: Rating(int(x[0]), int(x[1]), float(x[2])))
    mod = ALS.train(ratings, 50)
    return mod
def train(self, rank=3, iterations=20, lambda_=0.01, alpha=None, blocks=-1):
    """ train a mf model against the given parameters """
    if alpha:
        model = ALS.trainImplicit(self.train_data, rank, iterations,
                                  lambda_, blocks, alpha)
    else:
        model = ALS.train(self.train_data, rank, iterations, lambda_)
    return model
def model(sc, raw_user_movies, raw_hot_movies):
    movies_name = build_movies(raw_hot_movies)
    user_id_to_int = raw_user_movies.map(
        lambda line: line.split(',')[0]).distinct().zipWithUniqueId().collectAsMap()
    print(user_id_to_int['adamwzw'])
    user_int_to_id = {v: k for k, v in user_id_to_int.items()}
    rating_data = build_ratings(raw_user_movies, user_id_to_int)
    model = ALS.train(rating_data, 50, 10, 0.0001)
    print(model.userFeatures().collect()[:2])
    for user_int in range(1, 30):
        check_recommend_result(user_int, raw_user_movies, movies_name,
                               user_int_to_id, model)
def recommand_movies(user_id):
    rating_data = load_rating_data()
    # rating_model = ALS.train(rating_data, 50, 10, 0.1)
    rating_model = ALS.train(rating_data, 100, 10, 0.1)
    # predict a specific user's rating for a specific movie
    moveid = 123
    predict_result = rating_model.predict(user_id, moveid)
    print("Predicted rating of user %s for movie %s: %s"
          % (user_id, moveid, predict_result))
    # recommend 10 items for the user (e.g. user 789)
    recommand_result = rating_model.recommendProducts(user_id, 10)
    return recommand_result
def trainModel(limit, data, rank, num_iterations):
    save_file = "models/" + str(limit) + "rank" + str(rank) + "iterations" + str(num_iterations)
    if isdir(save_file):
        print("Rank " + str(rank) + " and Iterations " + str(num_iterations)
              + " Model already exists, loading...")
        model = MatrixFactorizationModel.load(sc, save_file)
    else:
        print("Model does not exist, training ALS with rank " + str(rank)
              + " and " + str(num_iterations) + " iterations")
        model = ALS.train(data, rank, num_iterations)
        print("Saving new model")
        model.save(sc, save_file)
    return model
def __train_model(self, rank, seed, iterations, reg):
    logger.info("Training Movie Rec Engine (ALS)...")
    model = ALS.train(self.ratings_RDD, rank, seed=seed,
                      iterations=iterations, lambda_=reg)
    logger.info("Movie Rec Engine Trained!")
    self.rank = rank
    self.seed = seed
    self.iterations = iterations
    self.reg = reg
    return model
def train_model(self):
    """
    rank: the number of factors in the ALS model, i.e. the number of latent
        features in the low-rank approximation. More factors are generally
        better, but they directly increase the memory needed to train and
        save the model, especially when there are many users and items, so
        in practice this parameter trades training quality against system
        cost. Reasonable values are usually 10 to 200.
    iterations: the number of iterations to run. ALS lowers the
        reconstruction error of the rating matrix at every iteration, and it
        usually converges to a reasonably good model within a few
        iterations, so more than about 10 is rarely necessary.
    lambda: controls regularization, and therefore overfitting. The higher
        the value, the stronger the regularization. The right value depends
        on the size, characteristics and sparsity of the data; as with other
        machine learning models, it should be tuned by cross-validation on
        held-out data.
    After trying several settings, rank=50, iterations=10 and lambda=0.01
    worked slightly better:
    """
    self.model = ALS.train(self.ratings, 50, 10, 0.01)
    user_features = self.model.userFeatures()
    item_features = self.model.productFeatures()
    self.user_num = user_features.count()  # number of users
    # number of movies (the original counted user_features here, a bug)
    self.item_num = item_features.count()
def test_rmse():
    # TODO: revised so that it will take user's inputs instead of hardcoded values
    movies_schema = None
    ratings_schema = None

    # load the schemas
    with open("movielens_20m_movies_schema.json", "r") as json_schema_file:
        movies_schema = StructType.fromJson(json.load(json_schema_file))
    with open("movielens_20m_ratings_schema.json", "r") as json_schema_file:
        ratings_schema = StructType.fromJson(json.load(json_schema_file))

    # create a hdfs directory
    os.system("hdfs dfs -mkdir datasets")
    # load the json file into the hdfs directory
    os.system("hdfs dfs -put movielens_10m_ratings.json.gz datasets/movielens_10m_ratings.json.gz")

    # create a DataFrame based on the content of the json file
    ratingsDF = scsingleton.sqlCtx.read.json(
        "hdfs://localhost:9000/datasets/movielens_10m_ratings.json.gz",
        schema=ratings_schema)
    # explicitly repartition RDD after loading so that more tasks can run on it in parallel
    # by default, defaultMinPartitions == defaultParallelism == estimated # of cores
    # across all of the machines in your cluster
    ratingsDF = ratingsDF.repartition(scsingleton.sc.defaultParallelism * 3)

    # parse ratings DataFrame into an RDD of [(userId, itemId, rating)]
    ratingsRDD = ratingsDF.map(lambda row: (row.user_id, row.movie_id, row.rating))
    ratingsRDD.cache()

    # split data into train (60%), test (40%)
    # TODO: add validation in the future? train (60%), validation (20%), test(20%)?
    trainingRDD, testRDD = ratingsRDD.randomSplit([0.6, 0.4])
    trainingRDD.cache()
    testRDD.cache()

    # run training algorithm to build the model, without validation
    with Timer() as t:
        model = ALS.train(trainingRDD, rank=3)
    print("ALS.train(trainingRDD, rank=3): %s seconds" % t.secs)

    # make a prediction
    with Timer() as t:
        testPredRDD = model.predictAll(testRDD.map(lambda x: (x[0], x[1]))).cache()
    print("testPredRDD: %s seconds" % t.secs)

    # calculate RMSE
    with Timer() as t:
        testRmse = pm.calculate_rmse_using_rdd(testRDD, testPredRDD)
    print("testRmse: %s seconds" % t.secs)
    print("testRmse", testRmse)
    return
def __train_model(self):
    """Train the ALS model with the current dataset
    """
    try:
        logger.info("Loading the ALS model...")
        self.model = MatrixFactorizationModel.load(self.sc, "als_model.data")
    except Exception:
        logger.info("Training the ALS model...")
        self.model = ALS.train(self.ratings_RDD, self.rank, seed=self.seed,
                               iterations=self.iterations,
                               lambda_=self.regularization_parameter)
        self.model.save(self.sc, "als_model.data")
    logger.info("ALS model built!")
def recommend(training_data_file, meta_data, user_plays, numPartitions, rank,
              iterations, _lambda):
    my_plays = ((0, 2858, user_plays[0]),
                (0, 480, user_plays[1]),
                (0, 589, user_plays[2]),
                (0, 2571, user_plays[3]),
                (0, 1270, user_plays[4]))

    tracks_listened = set([_play[1] for _play in my_plays])

    with spark_manager() as context:
        training = context.textFile(training_data_file) \
                          .filter(lambda x: x and len(x.split('::')) == 4) \
                          .map(parse_play) \
                          .values() \
                          .repartition(numPartitions) \
                          .cache()

        model = ALS.train(training, rank, iterations, _lambda)

        songs_rdd = context.textFile(training_data_file) \
                           .filter(lambda x: x and len(x.split('::')) == 4) \
                           .map(parse_play)

        # candidate tracks: everything the user has not listened to (the
        # original filtered on an undefined `films_seen`)
        songs = songs_rdd.values() \
                         .map(lambda r: (r[1], 1)) \
                         .reduceByKey(add) \
                         .map(lambda r: r[0]) \
                         .filter(lambda r: r not in tracks_listened) \
                         .collect()

        # the original parallelized an undefined `tracks` and paired each
        # candidate as (track, 1); presumably the intent is (user 0, track),
        # since my_plays above uses user id 0
        candidates = context.parallelize(songs) \
                            .map(lambda x: (0, x)) \
                            .repartition(numPartitions) \
                            .cache()

        predictions = model.predictAll(candidates).collect()

    # getting the top 10 recommendations
    recommendations = sorted(predictions, key=lambda x: x[2], reverse=True)[:10]

    tracks = {}
    with open(meta_data, 'r') as open_file:
        tracks = {int(line.split('::')[0]): line.split('::')[1]
                  for line in open_file if len(line.split('::')) == 3}

    for _, track_id, _ in recommendations:
        print(tracks[track_id] if track_id in tracks else track_id)
def trainModel(self):
    train_ratings = self._loadRatings(self.ctx, self.trainFile)
    ratings_valid = train_ratings.sample(False, 0.1, 12345)
    ratings_train = train_ratings.subtract(ratings_valid)
    print(20*'-', 'TRAINING STARTED', 20*'-')
    ranks = [8]
    lambdas = [1.0, 10.0, 5.0]
    numIters = [10]
    bestModel = None
    bestValidationMSE = float("inf")
    bestRank = 0
    bestLambda = -1.0
    bestNumIter = -1
    for rank, lmbda, numIter in itertools.product(ranks, lambdas, numIters):
        print(rank, lmbda, numIter)
        model = ALS.train(ratings_train, rank, numIter, lmbda)
        testdata = ratings_valid.map(lambda p: (p[0], p[1]))
        predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
        ratesAndPreds = ratings_valid.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
        MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
        if MSE < bestValidationMSE:
            bestModel = model
            bestValidationMSE = MSE
            bestRank = rank
            bestLambda = lmbda
            bestNumIter = numIter
    # evaluate the best model on the test set
    # model = ALS.train(ratings, rank, numIterations)
    print(20*'-', 'TRAINING FINISHED', 20*'-')

    # Evaluate the model on testing data
    print(20*'-', 'TESTING STARTED', 20*'-')
    # TODO: stop gap for evaluation. using trainFile itself as test file
    test_ratings = self._loadRatings(self.ctx, self.trainFile)
    testdata = test_ratings.map(lambda p: (p[0], p[1]))
    predictions = bestModel.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
    ratesAndPreds = test_ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
    MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
    MAE = ratesAndPreds.map(lambda r: abs(abs(r[1][0]) - abs(r[1][1]))).mean()
    print("Mean Squared Error = " + str(MSE))
    print("Mean Absolute Error = " + str(MAE))
    print("Root Mean Square Error = ", str(MSE**.5))
    print(20*'-', 'TESTING FINISHED', 20*'-')
ra = 0
lam = 0
min_error = float('inf')
best_lambda = -1
best_lambda_index = -1
best_model = None
best_rank = -1
best_rank_index = -1
# Loop over all possible values of lambda and rank to find the parameters
# that minimize the RMSE of our model
for rank in ranks:
    lam = 0
    for regParam in regularization_parameter:
        model = ALS.train(training_RDD, rank, seed=seed,
                          iterations=iterations, lambda_=regParam)
        predictions = model.predictAll(validation_for_predict_RDD).map(
            lambda r: ((r[0], r[1]), r[2]))
        rates_and_preds = validation_RDD.map(lambda r: (
            (int(r[0]), int(r[1])), float(r[2]))).join(predictions)
        error = np.sqrt(
            rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
        errors[ra][lam] = error
        print('For lambda %s and rank %s the RMSE is %s' % (regParam, rank, error))
        if error < min_error:
            min_error = error
            best_lambda = regParam
            best_rank = rank  # the original never updated best_rank
            best_model = model
        lam += 1  # the original never advanced the error-grid indices
    ra += 1
seed = 5
iterations = 5
regularizationParameter = 0.1
ranks = [4, 8, 12]
errors = [0, 0, 0]
err = 0
tolerance = 0.03
minError = float('inf')
bestRank = -1
bestIteration = -1
for rank in ranks:
    model = ALS.train(trainingRDD, rank, seed=seed, iterations=iterations,
                      lambda_=regularizationParameter)
    predictedRatingsRDD = model.predictAll(validationForPredictRDD)
    error = computeError(predictedRatingsRDD, validationRDD)
    errors[err] = error
    err += 1
    print('For rank %s the RMSE is %s' % (rank, error))
    if error < minError:
        minError = error
        bestRank = rank

print('The best model was trained with rank %s' % bestRank)

# the original is truncated mid-call here; presumably the final model reuses
# the largest rank with the same settings as the sweep above
myModel = ALS.train(trainingRDD, ranks[2], seed=seed, iterations=iterations,
                    lambda_=regularizationParameter)
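# computeError is not defined in this snippet; a sketch consistent with its
# call sites above: RMSE between a predicted-ratings RDD and an
# actual-ratings RDD of (user, movie, rating) triples (math is assumed imported).
import math

def computeError(predictedRDD, actualRDD):
    predicted = predictedRDD.map(lambda r: ((r[0], r[1]), r[2]))
    actual = actualRDD.map(lambda r: ((r[0], r[1]), r[2]))
    squared_errors = predicted.join(actual).map(lambda kv: (kv[1][0] - kv[1][1])**2)
    return math.sqrt(squared_errors.mean())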
# check if spark context is defined
print(sc.version)

# importing the MF libraries
from pyspark.mllib.recommendation import ALS, \
    MatrixFactorizationModel, Rating

# reading the movielens data
df_rdd = sc.textFile('C:/Users/Vusal/Desktop/DDA new/ml-1m/ratings.dat')\
           .map(lambda x: x.split("::"))
ratings = df_rdd.map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))

# Splitting the data into train and test sets.
X_train, X_test = ratings.randomSplit([0.8, 0.2])

# Training the model
rank = 10
numIterations = 10
model = ALS.train(X_train, rank, numIterations)

# Evaluate the model on test data
# dropping the ratings on the test data
testdata = X_test.map(lambda p: (p[0], p[1]))
predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
# joining the predictions with the original test dataset
ratesAndPreds = X_test.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
# calculating error
MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
print("Mean Squared Error = " + str(MSE))
print("Training: %d, validation: %d, test: %d" % (numTraining, numValidation, numTest)) # train models and evaluate them on the validation set ranks = [8, 12] lambdas = [0.1, 10.0] numIters = [10, 20] bestModel = None bestValidationRmse = float("inf") bestRank = 0 bestLambda = -1.0 bestNumIter = -1 for rank, lmbda, numIter in itertools.product(ranks, lambdas, numIters): model = ALS.train(training, rank, numIter, lmbda) validationRmse = computeRmse(model, validation, numValidation) print("RMSE (validation) = %f for the model trained with " % validationRmse + \ "rank = %d, lambda = %.1f, and numIter = %d." % (rank, lmbda, numIter)) if (validationRmse < bestValidationRmse): bestModel = model bestValidationRmse = validationRmse bestRank = rank bestLambda = lmbda bestNumIter = numIter testRmse = computeRmse(bestModel, test, numTest) # evaluate the best model on the test set print("The best model was trained with rank = %d and lambda = %.1f, " % (bestRank, bestLambda) \ + "and numIter = %d, and its RMSE on the test set is %f." % (bestNumIter, testRmse))
    db='baidutraffice', charset='utf8')
cursor = connect.cursor()
table = 'hospitalUser'
DROP = "drop table if exists %s" % table
create = "create table %s (userId int,hospitalId int,recommend float)" % table
insert = "insert into " + table + " values(%s,%s,%s)"
spark = SparkSession.builder.master("local").appName(
    "hospitalALS").getOrCreate()
df = spark.read.text('file:///export/servers/pycharm_project/hospital.txt')
attrs_rdd1 = df.rdd.map(lambda x: x[0].split(',')).map(
    lambda x: Row(userId=1, hospitalId=x[0], recommend1=x[8]))
attrs_rdd2 = df.rdd.map(lambda x: x[0].split(',')).map(
    lambda x: Row(userId=2, hospitalId=x[0], recommend2=x[9]))
rdd1 = attrs_rdd1.map(lambda x: (x[2], x[0], x[1]))
rdd2 = attrs_rdd2.map(lambda x: (x[2], x[0], x[1]))
model1 = ALS.train(rdd1, 10, 10, 0.01)
model2 = ALS.train(rdd2, 10, 10, 0.01)
cursor.execute(DROP)
cursor.execute(create)
attrs1 = model1.recommendProducts(1, 500)
attrs2 = model2.recommendProducts(2, 500)
for a1 in attrs1:
    cursor.execute(insert, (a1[0], a1[1], a1[2]))
connect.commit()
for a2 in attrs2:
    cursor.execute(insert, (a2[0], a2[1], a2[2]))
connect.commit()
# REPL transcript: outputs are shown as comments
training.count()
# 705486
test.count()
# 176359

# 1. Build the recommendation model using Alternating Least Squares
# Call the ALS.train method to train the model
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
rank = 10
numIterations = 10
model = ALS.train(training, rank, numIterations)
model
# <pyspark.mllib.recommendation.MatrixFactorizationModel object at 0x7f7afa38e850>

# Evaluate the model on training data
testdata = test.map(lambda r: (r[0], r[1]))
type(testdata)
# <class 'pyspark.rdd.PipelinedRDD'>
testdata.take(5)
# [(2866, 90950), (2866, 83908), (3997, 137454), (4101, 376365), (4101, 320620)]

# Making predictions
from pyspark.mllib.recommendation import Rating
from pyspark.mllib.recommendation import ALS

user_id = int(sys.argv[1])

csv = sc.textFile("rank_data.csv")
# print(csv.collect())
data = csv.map(lambda line: line.split(","))
header = data.first()
rank_data = data.filter(lambda x: x != header)
# rank_data.take(5)

model = ALS.train(rank_data, 10, 10, 0.01)
# print(model)

recommend = model.recommendProducts(user_id, 3)
# for p in recommend:
#     print(f"[user_id: {p[0]}, food_id: {p[1]}, rank: {p[2]}]")
print(recommend)

result = []
for rmd in recommend:
    result.append({
        'user_id': rmd[0],
        'food_id': rmd[1],
        'rank': round(rmd[2], 2)
    })

with open('recommend.json', 'w') as f:
seed = 5
iterations = 12
regularization_parameter = 0.1
rank = 4
# errors = [0, 0, 0]
# err = 0
# tolerance = 0.02
training_RDD, test_RDD = ratings_RDD.randomSplit([8, 2], 0)
training_1 = training_RDD.map(lambda l: (l[0], l[1] // 5, l[2]))
training_2 = training_RDD.map(lambda l: (l[0], (l[1] + 3) // 5, l[2]))
training_3 = training_RDD.map(lambda l: (l[0], (l[1] - 3) // 5, l[2]))
model_1 = ALS.train(training_1, rank, seed=None, iterations=iterations,
                    lambda_=regularization_parameter, nonnegative=True)
model_2 = ALS.train(training_2, rank, seed=None, iterations=iterations,
                    lambda_=regularization_parameter, nonnegative=True)
model_3 = ALS.train(training_3, rank, seed=None, iterations=iterations,
                    lambda_=regularization_parameter, nonnegative=True)
test_for_predict_RDD = test_RDD.map(
    lambda x: (x[0], x[1], x[1] // 5, (x[1] + 3) // 5, (x[1] - 3) // 5))
# note: calling model.predict inside a worker-side map like this does not
# work in practice, since the trained model cannot be shipped to executors
preds = test_for_predict_RDD.map(lambda x: (x[0], x[1],
                                            model_1.predict(x[0], x[2]),
                                            model_2.predict(x[0], x[3]),
                                            model_3.predict(x[0], x[4])))
preds = preds.map(lambda x: ((x[0], x[1]), (x[2][2] + x[3][2] + x[4][2]) / 3))
print(preds.take(3))  # take() already returns a list; .collect() on it was a bug
rates_and_preds = test_RDD.map(lambda r: ((int(r[0]), int(r[1])), float(r[2])))
# rates_and_preds = rates_and_preds.join(preds)
'''
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

jdbcDriver = 'com.mysql.jdbc.Driver'
jdbcUrl = 'jdbc:mysql://%s:3306/%s?user=%s&password=%s' % (
    CLOUDSQL_INSTANCE_IP, CLOUDSQL_DB_NAME, CLOUDSQL_USER, CLOUDSQL_PWD)

# checkpointing helps prevent stack overflow errors
sc.setCheckpointDir('checkpoint/')

# Read the ratings and accommodations data from Cloud SQL
dfRates = sqlContext.read.format('jdbc').options(
    driver=jdbcDriver, url=jdbcUrl, dbtable='Rating', useSSL='false').load()
dfAccos = sqlContext.read.format('jdbc').options(
    driver=jdbcDriver, url=jdbcUrl, dbtable='Accommodation', useSSL='false').load()
print("read ...")

# train the model
model = ALS.train(dfRates.rdd, 20, 20)  # you could tune these numbers, but these are reasonable choices
print("trained ...")

# use this model to predict what the user would rate accommodations that she has not rated
allPredictions = None
for USER_ID in range(0, 100):
    dfUserRatings = dfRates.filter(dfRates.userId == USER_ID).rdd.map(
        lambda r: r.accoId).collect()
    rddPotential = dfAccos.rdd.filter(lambda x: x[0] not in dfUserRatings)
    pairsPotential = rddPotential.map(lambda x: (USER_ID, x[0]))
    predictions = model.predictAll(pairsPotential).map(
        lambda p: (str(p[0]), str(p[1]), float(p[2])))
    predictions = predictions.takeOrdered(5, key=lambda x: -x[2])  # top 5
    print("predicted for user={0}".format(USER_ID))
    if allPredictions is None:
        allPredictions = predictions
    else:
        allPredictions.extend(predictions)
'''
Created on 2015/12/08

@author: charles
'''
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
from pyspark.context import SparkContext

sc = SparkContext("local")

# Load and parse the data
data = sc.textFile("data/mllib/als/test.data")
ratings = data.map(lambda l: l.split(',')).map(
    lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))

# Build the recommendation model using Alternating Least Squares
rank = 10
numIterations = 10
model = ALS.train(ratings, rank, numIterations)

# Evaluate the model on training data
testdata = ratings.map(lambda p: (p[0], p[1]))
predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
print("Mean Squared Error = " + str(MSE))

# Save and load model
model.save(sc, "myModelPath")
sameModel = MatrixFactorizationModel.load(sc, "myModelPath")
cursor.close()
cnxn.close()
if song_index != 'm3':
    continue
else:
    print(new_user_ratings)
    new_user_ratings_RDD = sc.parallelize(new_user_ratings)
    print('New user ratings: %s' % new_user_ratings_RDD.take(3))
    # merge new data into old data
    complete_data_with_new_ratings_RDD = complete_ratings_data.union(new_user_ratings_RDD)
    # train model again with new data
    from time import time
    t0 = time()
    new_ratings_model = ALS.train(complete_data_with_new_ratings_RDD, best_rank,
                                  seed=seed, iterations=iterations,
                                  lambda_=regularization_parameter)
    tt = time() - t0
    print("New model trained in %s seconds" % round(tt, 3))
    # get just the music IDs, as a list so the membership test below works in Python 3
    new_user_ratings_ids = [x[1] for x in new_user_ratings]
    # keep just those not on the ID list
    new_user_unrated_music_RDD = (complete_music_data
                                  .filter(lambda x: x[0] not in new_user_ratings_ids)
                                  .map(lambda x: (new_user_ID, x[0])))
    # Use the input RDD, new_user_unrated_music_RDD, with
    # new_ratings_model.predictAll() to predict new ratings for the musics
    new_user_recommendations_RDD = new_ratings_model.predictAll(new_user_unrated_music_RDD)
    # get every prediction result for the new user
    new_user_recommendations_rating_RDD = new_user_recommendations_RDD.map(
        lambda x: (x.product, x.rating))
    # merge data with music info
    new_user_recommendations_rating_title_and_count_RDD = \
        new_user_recommendations_rating_RDD.join(complete_music_titles).join(music_rating_counts_RDD)
    new_user_recommendations_rating_title_and_count_RDD.take(3)
    # transfer data format
from pyspark import SparkContext
from pyspark.mllib.recommendation import ALS

sc = SparkContext()

completeRDD = sc.textFile('datasets/ml-latest-small/ratings.csv')
header = completeRDD.first()
completeRDD = completeRDD.filter(lambda line: line != header)\
                         .map(lambda line: line.split(","))\
                         .map(lambda line: (line[0], line[1], line[2]))

trainingRDD, testRDD = completeRDD.randomSplit([0.7, 0.3])
model = ALS.train(trainingRDD, rank=10, iterations=10)
model.save(sc, "target/model")
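# The saved model can be loaded back later with MatrixFactorizationModel.load;
# a minimal sketch of reusing the model written above (the IDs are examples).
from pyspark.mllib.recommendation import MatrixFactorizationModel

sameModel = MatrixFactorizationModel.load(sc, "target/model")
print(sameModel.predict(1, 1))  # predicted rating of item 1 for user 1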
    lambda x: (x[0], sum(x[1]) / len(x[1]))).collectAsMap()

# (business, {user}) -> reduceByKey(business) -> (business, {user1, user2, ...})
businessUserDict = trainData.map(lambda x: (x[0][1], {x[0][0]})).reduceByKey(
    lambda old, new: old.union(new)).collectAsMap()

"""
MODEL-BASED CF:
spark.mllib currently supports model-based collaborative filtering, in which
users and products are described by a small set of latent factors that can be
used to predict missing entries. spark.mllib uses the alternating least
squares (ALS) algorithm to learn these latent factors.
"""
if caseID == 1:
    # parse the training data ((userIdx, businessIdx), star) into
    # Rating(userIdx, businessIdx, rating)
    # using persist() we can use various storage levels
    ratings = trainData.map(lambda x: Rating(x[0][0], x[0][1], x[1]))

    # Build the recommendation model using Alternating Least Squares
    # default method, which assumes ratings are explicit
    # lambda_=0.01, blocks=6
    model = ALS.train(ratings, RANK, numIterations, nonnegative=True, seed=SEED)

    # validation data with Rating(userIdx, businessIdx, rating) and
    # without rating: (userIdx, businessIdx)
    valDataWithRating = valData.map(lambda x: Rating(x[0][0], x[0][1], x[1]))
    valDataWORating = valDataWithRating.map(lambda x: (x[0], x[1]))
    # userDataWORating = valDataWithRating.map(lambda x: x[0])
    # businessDataWORating = valDataWithRating.map(lambda x: x[1])

    # Evaluate the model on validation data
    predictions = model.predictAll(valDataWORating).map(
        lambda x: ((x[0], x[1]), x[2]))
seed = 5
iterations = 10
regularization_parameter = 0.01
ranks = range(2, 12)
errors = [0] * len(ranks)
err = 0
tolerance = 0.02
min_error = float('inf')
best_rank = -1
best_iteration = -1
for rank in ranks:
    model = ALS.train(training_RDD, rank, seed=seed, iterations=iterations,
                      lambda_=regularization_parameter)
    predictions = model.predictAll(validation_for_predict_RDD).map(
        lambda r: ((r[0], r[1]), r[2]))
    rates_and_preds = validation_RDD.map(
        lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
    error = math.sqrt(
        rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
    errors[err] = error
    err += 1
    print('For rank %s the RMSE is %s' % (rank, error))
    if error < min_error:
        min_error = error
        best_rank = rank
# train model
# training_RDD, validation_RDD = ratings_RDD.randomSplit([8, 2], 0)
# validation_for_predict_RDD = validation_RDD.map(lambda x: (x[0], x[1]))
# print(training_RDD.collect().take(3))
seed = 5
iterations = 12
regularization_parameter = 0.1
rank = 4
# errors = [0, 0, 0]
# err = 0
# tolerance = 0.02
training_RDD, test_RDD = ratings_RDD.randomSplit([8, 2], 0)
complete_model = ALS.train(training_RDD, rank, seed=None, iterations=iterations,
                           lambda_=regularization_parameter, nonnegative=True)
test_for_predict_RDD = test_RDD.map(lambda x: (x[0], x[1]))
predictions = complete_model.predictAll(test_for_predict_RDD).map(
    lambda r: ((r[0], r[1]), r[2]))
rates_and_preds = test_RDD.map(
    lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
mae = rates_and_preds.map(lambda r: abs(r[1][0] - r[1][1])).mean()
rmse = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
logs = rates_and_preds.map(lambda r: (math.log(r[1][1] + 1) - math.log(r[1][0] + 1)))
rmsle = math.sqrt(logs.map(lambda x: x**2).mean())
print("The MAE is {:G}".format(mae))
print("The RMSE is {:G}".format(rmse))
def test(rdd):
    if len(rdd.collect()) == 0:
        pass
    else:
        new_user_ratings = []
        msgValue = rdd.map(lambda x: json.loads(x[1])).collect()
        print(msgValue)
        print(type(msgValue[0]))
        new_user_ID = msgValue[0]['user_id']
        song_id = msgValue[0]['shoes_id']
        rating = msgValue[0]['rating']
        print(new_user_ID, song_id, rating)
        data = (new_user_ID, song_id, rating)
        new_user_ratings.append(data)
        print(new_user_ratings)
        new_user_ratings_RDD = sc.parallelize(new_user_ratings)
        print('New user ratings: %s' % new_user_ratings_RDD.take(3))
        # merge new data into old data
        complete_data_with_new_ratings_RDD = complete_ratings_data.union(new_user_ratings_RDD)
        # train model again with new data
        from time import time
        t0 = time()
        new_ratings_model = ALS.train(complete_data_with_new_ratings_RDD, best_rank,
                                      seed=seed, iterations=iterations,
                                      lambda_=regularization_parameter)
        tt = time() - t0
        print("New model trained in %s seconds" % round(tt, 3))
        # get just the music IDs; a list comprehension (not a lazy map) so the
        # membership test below works, which also removes the need for the
        # original try/except
        new_user_ratings_ids = [x[1] for x in new_user_ratings]
        # keep just those not on the ID list
        new_user_unrated_music_RDD = (complete_music_data
                                      .filter(lambda x: x[0] not in new_user_ratings_ids)
                                      .map(lambda x: (new_user_ID, x[0])))
        # Use the input RDD, new_user_unrated_music_RDD, with
        # new_ratings_model.predictAll() to predict new ratings for the musics
        new_user_recommendations_RDD = new_ratings_model.predictAll(new_user_unrated_music_RDD)
        # get every prediction result for the new user
        new_user_recommendations_rating_RDD = new_user_recommendations_RDD.map(
            lambda x: (x.product, x.rating))
        # merge data with music info
        new_user_recommendations_rating_title_and_count_RDD = \
            new_user_recommendations_rating_RDD.join(complete_music_titles).join(music_rating_counts_RDD)
        new_user_recommendations_rating_title_and_count_RDD.take(3)
        # transfer data format
        new_user_recommendations_rating_title_and_count_RDD = \
            new_user_recommendations_rating_title_and_count_RDD.map(
                lambda r: (r[1][0][1], r[1][0][0], r[1][1]))
        # sort data by rating score and list the first 25
        top_musics = new_user_recommendations_rating_title_and_count_RDD.filter(
            lambda r: r[2] >= 25).takeOrdered(25, key=lambda x: -x[1])
        # print('TOP recommended musics (with more than 25 reviews):\n%s' % '\n'.join(map(str, top_musics)))
        # result_r = r.hset('shoes', new_user_ID, str(top_musics))
        # j = {'user': new_user_ID, 'music': top_musics}
        # result_m = shoe_recommend.insert_one(j)
        new_user_ratings = []
        return sc.parallelize(top_musics)
    return movie_names

conf = SparkConf().setMaster("local[*]").setAppName("MovieSimilarities")
sc = SparkContext(conf=conf)

print("Loading Movie Names")
movie_names = load_movie_names()

data = sc.textFile("file:///SparkCourse/ml-100k/u.data")
ratings = data.map(lambda line: line.split()).map(
    lambda l: Rating(int(l[0]), int(l[1]), float(l[2]))).cache()

rank = 10
num_iters = 15
model = ALS.train(ratings, rank, num_iters)

user_ratings = ratings.filter(lambda x: x[0] == user_id).collect()
print("User {} ratings:".format(user_id))
for rating in user_ratings:
    print("{}, score: {}".format(movie_names[int(rating[1])], rating[2]))

recs = model.recommendProducts(user_id, 10)
print("Top 10 recommendations:")
for rec in recs:
    print("{}, score: {}".format(movie_names[int(rec[1])], rec[2]))
def main():
    conf = SparkConf().setAppName("YeJoo_Park_task2_ModelBasedCF")\
        .setMaster("local")
    sc = SparkContext.getOrCreate(conf)
    sc.setLogLevel("ERROR")

    ratingsFilePath = sys.argv[1]
    testFilePath = sys.argv[2]

    data = sc.textFile(testFilePath)
    dataHeader = data.first()
    testingSet = set(data
                     .filter(lambda row: row != dataHeader)
                     .map(lambda r: r.split(","))
                     .map(lambda r: (int(r[USER_INDEX]), int(r[MOVIE_INDEX])))
                     .collect())

    # Load and parse the data
    data = sc.textFile(ratingsFilePath)
    dataHeader = data.first()
    trainRatings = data\
        .filter(lambda row: row != dataHeader)\
        .map(lambda r: r.split(","))\
        .map(lambda r: Rating(int(r[USER_INDEX]), int(r[MOVIE_INDEX]), float(r[RATING_INDEX])))

    print("ratings.count() before filter=" + str(trainRatings.count()))

    testRatings = trainRatings.filter(
        lambda rating: (rating.user, rating.product) in testingSet)
    trainRatings = trainRatings.filter(
        lambda rating: (rating.user, rating.product) not in testingSet)

    print("testingSetRatings.count()=" + str(testRatings.count()))
    print("ratings.count() after filter=" + str(trainRatings.count()))

    rank = 10
    numIterations = 12
    lamb = 0.1
    model = ALS.train(trainRatings, rank, numIterations, lamb)
    print("Training complete")

    userProducts = testRatings.map(lambda rating: (rating.user, rating.product))
    predictions = model.predictAll(userProducts).map(lambda r: ((r[0], r[1]), r[2]))
    ratesAndPreds = testRatings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)

    absDiffBuckets = ratesAndPreds.map(lambda r: int(abs(r[1][0] - r[1][1]))) \
        .map(lambda d: min(d, 4)).cache()
    # the original omitted the square root, printing the MSE under the name RMSE
    RMSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean() ** 0.5

    # Write predictions to file
    outputFileName = "YeJoo_Park_ModelBasedCF.txt"
    printWriter = open(outputFileName, "a")
    outputPreds = ratesAndPreds.map(lambda r: (r[0][0], r[0][1], r[1][1])).collect()
    outputPreds.sort()
    for pred in outputPreds:
        printWriter.write(str(pred[0]) + ", " + str(pred[1]) + ", " + str(pred[2]))
        printWriter.write("\n")
    printWriter.close()

    print(">=0 and <1: " + str(absDiffBuckets.filter(lambda d: d == 0).count()))
    print(">=1 and <2: " + str(absDiffBuckets.filter(lambda d: d == 1).count()))
    print(">=2 and <3: " + str(absDiffBuckets.filter(lambda d: d == 2).count()))
    print(">=3 and <4: " + str(absDiffBuckets.filter(lambda d: d == 3).count()))
    print(">=4: " + str(absDiffBuckets.filter(lambda d: d == 4).count()))
    print("RMSE=" + str(RMSE))
# http://spark.apache.org/docs/latest/mllib-collaborative-filtering.html
# This is one model but we need more combinations
# Best results are not commented
ranks = [5, 10, 15, 20]
reguls = [0.1, 1, 10]
iters = [5, 10, 20]

finalModel = None
finalRank = 0
finalRegul = float(0)
finalIter = -1
finalDist = float(100)

for cRank, cRegul, cIter in itertools.product(ranks, reguls, iters):
    model = ALS.train(rddTraining, cRank, cIter, float(cRegul))
    dist = howFarAreWe(model, rddValidating, nbValidating)
    print(str(dist))
    if dist < finalDist:
        finalModel = model
        finalRank = cRank
        finalRegul = cRegul
        finalIter = cIter
        finalDist = dist

print("Rank " + str(finalRank))    # best is 20
print("Regul " + str(finalRegul))  # best is 1
print("Iter " + str(finalIter))    # best is 20
print("Dist " + str(finalDist))    # best is 2.45935601578 (It is bad!!!)
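# howFarAreWe is not defined in this snippet; a sketch consistent with the
# usage above: RMSE of the model's predictions against sizeAgainst held-out
# (user, product, rating) triples (the math import is an assumption).
import math

def howFarAreWe(model, against, sizeAgainst):
    # keep only the (user, product) pairs to score
    againstNoRatings = against.map(lambda x: (int(x[0]), int(x[1])))
    # keep (user, product) -> rating for the join
    againstWithRatings = against.map(lambda x: ((int(x[0]), int(x[1])), float(x[2])))
    predictions = model.predictAll(againstNoRatings).map(lambda p: ((p[0], p[1]), p[2]))
    joined = predictions.join(againstWithRatings)
    return math.sqrt(joined.map(lambda j: (j[1][0] - j[1][1])**2).mean())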
train_rate = train.map(
    lambda row: Rating(uid_to_index[row[0]], asin_to_index[row[1]], row[2]))
test_rate = test.map(
    lambda row: Rating(uid_to_index[row[0]], asin_to_index[row[1]], row[2]))

# Rating rdd:
rating_rdd = rdd.map(
    lambda row: Rating(uid_to_index[row[0]], asin_to_index[row[1]], row[2]))

# Best model for param.
ranks = [16]
numIterations = 2
best_model, min_mse = None, None
for rank in ranks:
    print("Rank: ", rank)
    model = ALS.train(train_rate, rank, numIterations)
    testdat = test_rate.map(lambda x: (x[0], x[1]))
    predictions = model.predictAll(testdat).map(lambda r: ((r[0], r[1]), r[2]))
    ratesAndPreds = test_rate.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
    MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
    if min_mse is None or MSE < min_mse:
        min_mse = MSE
        best_model = model

# Best model on the entire dataset
best_model_total, min_mse = None, None
for rank in ranks:
    print("Rank: ", rank)
    model = ALS.train(rating_rdd, rank, numIterations)
    testdat = rating_rdd.map(lambda x: (x[0], x[1]))
iterations = 10
regularization_parameter = 0.1
rank_list = []
rmse_list = []
ranks = [16, 20, 24, 28, 32, 36]
errors = [0] * len(ranks)
err = 0
tolerance = 0.02
min_error = float('inf')
best_rank = -1
best_iteration = -1
for rank in ranks:
    model = ALS.train(train, rank, seed=seed, iterations=iterations,
                      lambda_=regularization_parameter)
    predictions = model.predictAll(test1).map(lambda r: ((r[0], r[1]), r[2]))
    rates_and_preds = test.map(
        lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
    error = math.sqrt(
        rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
    errors[err] = error
    err += 1
    rank_list.append(rank)
    rmse_list.append(error)
    print('For rank %s the RMSE is %s' % (rank, error))
    if error < min_error:
        min_error = error
        best_rank = rank
# change the data to rdd
train_rdd_temp = inventory_rdd.map(F_Tuple).flatMapValues(lambda x: x)
print(train_rdd_temp.take(10))

def F_Flat(x):
    (index, (appid, ValuE)) = x
    return (index, appid, ValuE)

train_rdd = train_rdd_temp.map(F_Flat)
print(train_rdd.take(10))

# Build an ALS model and train the data
model = ALS.train(train_rdd, 5)

recom_dict = {}
for index in list(user_id_full_list.keys()):
    try:
        recom_list = [i.product for i in model.recommendProducts(index, 10)]
        user_id = user_id_full_list.get(index)
        recom_dict.update({user_id: recom_list})
    except Exception:
        pass

# change it to a sql table
df_recom = pd.DataFrame.from_dict(recom_dict, 'index')
df_recom.index.name = 'stem_user_id'
df_recom.reset_index(inplace=True)
engine = create_engine(
    'mysql+pymysql://root:[email protected]/game_re?charset=utf8mb4')
# Define spark session
spark = SparkSession \
    .builder \
    .appName("Spark Application") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()
sc = spark.sparkContext

rawData = sc.textFile("../resource/ml-100k/u.data")
print(rawData.first())
rawRatings = rawData.map(lambda s: s.split("\t")[0:3])
print(rawRatings.first())
ratings = rawRatings.map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))
print(ratings.first())

model = ALS.train(ratings, 50, 10)
# model.userFeatures().collect()
parsedData = model.productFeatures().map(lambda tuple: tuple[1])
print(parsedData.take(2))

# Build the model (cluster the data)
clusters = KMeans.train(parsedData, 5, maxIterations=10,
                        initializationMode="random")

# Within Set Sum of Squared Errors
WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))

# Save and load model using KMeansModel of MLlib
# clusters.save(sc, "../target/KMeansModel")
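# The error helper used for WSSSE is not shown; a sketch matching the
# standard Spark KMeans example: distance from a point to its closest
# cluster center (numpy-backed feature vectors are assumed).
from math import sqrt

def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))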
if __name__ == "__main__": sc = SparkContext(appName="Ranking Metrics Example") # Several of the methods available in scala are currently missing from pyspark # $example on$ # Read in the ratings data lines = sc.textFile("data/mllib/sample_movielens_data.txt") def parseLine(line): fields = line.split("::") return Rating(int(fields[0]), int(fields[1]), float(fields[2]) - 2.5) ratings = lines.map(lambda r: parseLine(r)) # Train a model on to predict user-product ratings model = ALS.train(ratings, 10, 10, 0.01) # Get predicted ratings on all existing user-product pairs testData = ratings.map(lambda p: (p.user, p.product)) predictions = model.predictAll(testData).map( lambda r: ((r.user, r.product), r.rating)) ratingsTuple = ratings.map(lambda r: ((r.user, r.product), r.rating)) scoreAndLabels = predictions.join(ratingsTuple).map(lambda tup: tup[1]) # Instantiate regression metrics to compare predicted and actual ratings metrics = RegressionMetrics(scoreAndLabels) # Root mean squared error print("RMSE = %s" % metrics.rootMeanSquaredError)
# parallelize the normalized data
print(" ...parallelizing the data")
sys.stdout.flush()
mu = fulldf.stars.mean()
data = sc.parallelize(fulldf[[
    'user_id', 'question_id', 'stars', 'user_mean', 'item_mean'
]].as_matrix())
normed_ratings = data.map(lambda row: Rating(
    int(row[0]), int(row[1]), row[2] - row[3] - row[4] + mu))

# build the LF model
print(" ...building the model")
sys.stdout.flush()
rank = 40
numIterations = 30
model = ALS.train(normed_ratings, rank, numIterations)

# extract the user and product features into numpy arrays
print(" ...extracting the factors")
sys.stdout.flush()
uf = sorted(model.userFeatures().collect())
pf = sorted(model.productFeatures().collect())
U = np.vstack([a[1] for a in uf])
V = np.hstack([np.array(a[1]).reshape(rank, 1) for a in pf]).T

# record the user/product_id <-> index correspondences
uids = [str(a[0]) for a in uf]
mids = [str(a[0]) for a in pf]
user_idx = pd.Series(range(len(uids)), index=uids)
item_idx = pd.Series(range(len(mids)), index=mids)
# Create prediction sets without ratings
predict_validation_RDD = validation_RDD.map(lambda x: (x[0], x[1]))
predict_test_RDD = test_RDD.map(lambda x: (x[0], x[1]))

from pyspark.mllib.recommendation import ALS
import math

seed = 5
iterations = 10
regularization = 0.1
trial_ranks = [4, 8, 12]
lowest_error = float('inf')

for k in trial_ranks:
    model = ALS.train(training_RDD, k, seed=seed, iterations=iterations,
                      lambda_=regularization)
    # Coercing ((u, p), r) tuple format to accommodate the join
    predictions_RDD = model.predictAll(predict_validation_RDD).map(
        lambda r: ((r[0], r[1]), r[2]))
    ratings_and_preds_RDD = validation_RDD.map(
        lambda r: ((r[0], r[1]), r[2])).join(predictions_RDD)
    error = math.sqrt(
        ratings_and_preds_RDD.map(lambda r: (r[1][0] - r[1][1])**2).mean())
    print('For k=', k, 'the RMSE is', error)
    if error < lowest_error:
        best_k = k
        lowest_error = error

print('The best rank is size', best_k)

# Redo the last phase with the best rank size, using the test dataset this time
model = ALS.train(training_RDD, best_k, seed=seed, iterations=iterations,
                  lambda_=regularization)
predictions_RDD = model.predictAll(predict_test_RDD).map(lambda r: ((r[0], r[1]), r[2]))
ratings_and_preds_RDD = test_RDD.map(lambda r: ((r[0], r[1]), r[2])).join(predictions_RDD)
.option("driver", jdbcDriver) \ .load() dfRates.registerTempTable('Ratings') sqlContext.cacheTable('Ratings') tt = time() - t0 print "Data is loaded in %s seconds" % round(tt,3) rank = 8 seed = 5L iterations = 10 regularization_parameter = 0.1 t0 = time() print "Training the ALS model..." model = ALS.train(dfRates.rdd.map(lambda r: (int(r[0]), int(r[1]), r[2])).cache(), rank=rank, seed=seed, iterations=iterations, lambda_=regularization_parameter) tt = time() - t0 print "ALS model built!" print "New model trained in %s seconds" % round(tt,3) predictions = model.recommendProductsForUsers(10) \ .flatMap(lambda pair: pair[1]) \ .map(lambda rating: (rating.user, rating.product, rating.rating)) schema = StructType([StructField("userId", StringType(), True), StructField("movieId", StringType(), True), StructField("prediction", FloatType(), True)]) dfToSave = sqlContext.createDataFrame(predictions, schema) #dfToSave.write.jdbc(url=jdbcUrl, table=TABLE_RECOMMENDATIONS) t0 = time() dfToSave.write.option('driver', 'org.postgresql.Driver').jdbc(jdbcUrl, TABLE_RECOMMENDATIONS, mode='overwrite')
seed = 5
iterations = 5
regularizationParameter = 0.1
ranks = [4, 8, 12]
errors = [0, 0, 0]
err = 0
tolerance = 0.02
minError = float('inf')
bestRank = -1
bestIteration = -1
for rank in ranks:
    model = ALS.train(trainingRDD, rank, seed=seed, iterations=iterations,
                      lambda_=regularizationParameter)
    predictedRatingsRDD = model.predictAll(validationForPredictRDD)
    error = computeError(predictedRatingsRDD, validationRDD)
    errors[err] = error
    err += 1
    print('For rank %s the RMSE is %s' % (rank, error))
    if error < minError:
        minError = error
        bestRank = rank

print('The best model was trained with rank %s' % bestRank)
def SaveModel(sc, model):
    '''
    Persist the trained model.
    :param sc:
    :param model:
    :return:
    '''
    try:
        model.save(sc, Path + u'data/ALSmodel')
        print('>>>>>>>> Model saved to ALSmodel <<<<<<<<')
    except Exception:
        print('>>>>>>>> Model already exists; delete it before saving again. <<<<<<<<')


if __name__ == '__main__':
    sc = CreateSparkContext()
    print('=============== data preparation ===============')
    ratingsRDD = PrepareData(sc)
    print('=============== training ===============')
    print('====---->>>> starting ALS training with rank=5, iterations=10, lambda=0.01 <<<<----====')
    # in testing, changing iterations to 20 raised an error and the model
    # could not be saved to file
    model = ALS.train(ratingsRDD, 5, 10, 0.01)
    print('=============== saving the model ===============')
    SaveModel(sc, model)
# filter out header
header = data.first()  # extract header
data = data.filter(lambda row: row != header)

# convert into a sequence of Rating objects
ratings = data.map(lambda l: l.split(',')).map(
    lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))

# split into train and test
train, test = ratings.randomSplit([0.8, 0.2])

# train the model
K = 10
epochs = 10
model = ALS.train(train, K, epochs)

# evaluate the model

# train
x = train.map(lambda p: (p[0], p[1]))
p = model.predictAll(x).map(lambda r: ((r[0], r[1]), r[2]))
ratesAndPreds = train.map(lambda r: ((r[0], r[1]), r[2])).join(p)
# joins on first item: (user_id, movie_id)
# each row of result is: ((user_id, movie_id), (rating, prediction))
mse = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
print("train mse: %s" % mse)

# test
x = test.map(lambda p: (p[0], p[1]))
p = model.predictAll(x).map(lambda r: ((r[0], r[1]), r[2]))
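# the snippet cuts off here; presumably the test-set evaluation mirrors the
# train-set block above:
ratesAndPreds = test.map(lambda r: ((r[0], r[1]), r[2])).join(p)
mse = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
print("test mse: %s" % mse)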