def main(sc):
    """Train an implicit ALS model and rank the pairs of the new data set.

    The model is fitted on the union of ``train1.csv`` and ``new_set.csv``
    and then scores every (user, product) pair found in ``new_set.csv``.

    Args:
        sc: SparkContext used to read the two CSV files.

    Returns:
        list: The two best-scored ``((user, product), score)`` tuples,
        highest score first.
    """
    seed = 5
    iterations = 10
    regularization_parameter = 0.1
    rank = 4

    def load_ratings(path):
        # Every row is parsed as "user,product,rating".
        return (sc.textFile(path)
                .map(lambda l: l.split(','))
                .map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))
                .cache())

    ratings = load_ratings("file:///Expedia/data/train1.csv")
    new_ratings = load_ratings("file:///Expedia/data/new_set.csv")
    new_ratings_for_predict_RDD = new_ratings.map(
        lambda x: (x[0], x[1])).cache()

    complete_data = ratings.union(new_ratings).cache()
    new_ratings_model = ALS.trainImplicit(
        complete_data, rank, seed=seed, iterations=iterations,
        lambda_=regularization_parameter)

    predictions = (new_ratings_model
                   .predictAll(new_ratings_for_predict_RDD)
                   .map(lambda r: ((r[0], r[1]), r[2]))
                   .collect())

    # Each collected element is ((user, product), score), so the score lives
    # at index 1; index 2 does not exist.  The result is a plain list, which
    # has no RDD-style take(), so it is returned directly.
    recommendations = sorted(predictions, key=lambda x: x[1],
                             reverse=True)[:2]
    return recommendations
def main(sc):
    """Fit implicit ALS on train + new ratings and rank the new pairs.

    Args:
        sc: SparkContext used to read ``train1.csv`` and ``new_set.csv``.

    Returns:
        list: The two highest-scored ``((user, product), score)`` tuples,
        in descending score order.
    """
    seed = 5
    iterations = 10
    regularization_parameter = 0.1
    rank = 4

    def to_rating(fields):
        # Columns are read positionally: user, product, rating.
        return Rating(int(fields[0]), int(fields[1]), float(fields[2]))

    data = sc.textFile("file:///Expedia/data/train1.csv")
    ratings = data.map(lambda l: l.split(',')).map(to_rating).cache()

    new_data = sc.textFile("file:///Expedia/data/new_set.csv")
    new_ratings = new_data.map(lambda l: l.split(',')).map(to_rating).cache()
    new_ratings_for_predict_RDD = new_ratings.map(
        lambda x: (x[0], x[1])).cache()

    complete_data = ratings.union(new_ratings).cache()
    new_ratings_model = ALS.trainImplicit(
        complete_data, rank, seed=seed, iterations=iterations,
        lambda_=regularization_parameter)

    predictions = new_ratings_model.predictAll(
        new_ratings_for_predict_RDD).map(
            lambda r: ((r[0], r[1]), r[2])).collect()

    # Elements are ((user, product), score): sort on index 1 (index 2 would
    # raise IndexError).  A list has no take(); return the slice instead.
    recommendations = sorted(predictions, key=lambda x: x[1],
                             reverse=True)[:2]
    return recommendations
def build_model(train_data):
    """Fit an implicit-feedback ALS model on ``train_data``.

    The hyper-parameters are read from the names FACTOR, ITERS, LAMBDA and
    ALPHA, which must be defined in the enclosing scope.

    Returns:
        The model returned by ``ALS.trainImplicit``.
    """
    hyper_params = dict(rank=FACTOR, iterations=ITERS,
                        lambda_=LAMBDA, alpha=ALPHA)
    return ALS.trainImplicit(train_data, **hyper_params)
def evaluate(sc, raw_user_movies, raw_hot_movies):
    """Grid-search explicit and implicit ALS and print the training-set RMSE.

    For every hyper-parameter combination a model is trained on ``ratings``
    and evaluated on the same (user, movie) pairs it was trained on.

    Args:
        sc: SparkContext (not used directly by this function).
        raw_user_movies: RDD of comma-separated lines whose first field is
            the user id.
        raw_hot_movies: Raw movie data handed to ``build_movies``.
    """
    # Result is not used below; the call is kept in case the helper has
    # side effects.
    build_movies(raw_hot_movies)
    user_id_to_int = (raw_user_movies
                      .map(lambda line: line.split(',')[0])
                      .distinct()
                      .zipWithUniqueId()
                      .collectAsMap())
    ratings = build_ratings(raw_user_movies, user_id_to_int)
    num_iterations = 10

    # Loop-invariant RDDs, built once instead of once per model.
    user_movies = ratings.map(lambda tokens: (tokens[0], tokens[1]))
    actual = ratings.map(lambda tokens: ((tokens[0], tokens[1]), tokens[2]))

    def _rmse(model, show_predictions):
        """Return the RMSE of ``model`` on the training pairs."""
        predictions = model.predictAll(user_movies).map(
            lambda r: ((r[0], r[1]), r[2]))
        if show_predictions:
            print(predictions.take(3))
        rates_and_preds = actual.join(predictions)
        print(rates_and_preds.take(3))
        return math.sqrt(
            rates_and_preds.map(lambda r: (r[1][0] - r[1][1]) ** 2).mean())

    # The printed value is the square root of the mean squared error, so it
    # is labelled as RMSE rather than MSE.
    for rank in [10, 50]:
        for lam in [1.0, 0.01, 0.0001]:
            model = ALS.train(ratings, rank, num_iterations, lam)
            rmse = _rmse(model, True)
            print("(rank:%d, lambda: %f,) Root Mean Squared Error = %f"
                  % (rank, lam, rmse))

    for rank in [10, 50]:
        for lam in [1.0, 0.01, 0.0001]:
            for alpha in [1.0, 40.0]:
                model = ALS.trainImplicit(ratings, rank, num_iterations, lam,
                                          alpha=alpha)
                rmse = _rmse(model, False)
                print("(rank:%d, lambda: %f, alpha: %f, implicit ) "
                      "Root Mean Squared Error = %f"
                      % (rank, lam, alpha, rmse))
def als_training(ratings, rank=10, num_iteration=12, lambda_=0.01, alpha=0.01):
    """Train an implicit-feedback ALS model and return it.

    Args:
        ratings: Training ratings passed straight to ``ALS.trainImplicit``.
        rank: Number of latent factors.
        num_iteration: Number of ALS iterations.
        lambda_: Regularization parameter.
        alpha: Confidence constant of the implicit formulation.
    """
    return ALS.trainImplicit(ratings, rank, iterations=num_iteration,
                             lambda_=lambda_, alpha=alpha)
def train(training_data, rank, iteration, lmbda, alpha, model_id):
    """Train an implicit ALS model, logging JVM-side failures.

    Args:
        training_data (rdd): Used for training.
        rank (int): Number of factors in ALS model.
        iteration (int): Number of iterations to run.
        lmbda (float): Controls regularization.
        alpha (float): Constant for computing confidence.
        model_id (str): Model identification string (used in the log line).

    Returns:
        model: Trained model.

    Raises:
        Py4JJavaError: Re-raised after being logged.
    """
    try:
        return ALS.trainImplicit(training_data, rank, iterations=iteration,
                                 lambda_=lmbda, alpha=alpha)
    except Py4JJavaError as err:
        message = 'Unable to train model "{}"\n{}'.format(
            model_id, str(err.java_exception))
        current_app.logger.error(message, exc_info=True)
        raise
def als(train, opts):
    """Factorise a sparse user-item matrix with Spark MLlib ALS.

    Args:
        train: Sparse matrix indexed as ``train[u, i]`` whose rows expose
            ``.indices`` (presumably a SciPy CSR matrix -- confirm with
            the caller).
        opts: Options object providing ``implicit``, ``D`` (rank),
            ``iters``, ``lbda`` and, for implicit training, ``alpha``.

    Returns:
        tuple: ``(U, V)``.  ``U`` holds the user factor vectors ordered by
        user id.  ``V`` holds one vector per item id from 0 up to and
        including the largest item id that received factors; ids without
        factors get a zero vector.
    """
    vals = []
    for u in xrange(train.shape[0]):
        for i in train[u].indices:
            vals.append(Rating(int(u), int(i), float(train[u, i])))

    sc = pyspark.SparkContext("local")
    try:
        sc.setCheckpointDir("/tmp/" + str(random.random()))
        quiet_logs(sc)
        ratings = sc.parallelize(vals)
        if opts.implicit:
            model = ALS.trainImplicit(ratings, opts.D, opts.iters, opts.lbda,
                                      alpha=opts.alpha)
        else:
            model = ALS.train(ratings, opts.D, opts.iters, opts.lbda)

        # NOTE(review): users without any factors are simply absent here,
        # so U is not padded the way V is -- confirm this is intended.
        U = [ut[1] for ut in
             model.userFeatures().sortBy(lambda a: a[0]).collect()]

        rddItems = model.productFeatures()
        maxItem = rddItems.map(lambda a: int(a[0])).max()
        items = dict(rddItems.sortBy(lambda a: int(a[0])).collect())
    finally:
        # Everything needed has been collected to the driver; release the
        # context so a later call can create a new one.
        sc.stop()

    # +1 so the item with the largest id is included (the exclusive upper
    # bound used to drop it).
    V = [items.get(i, np.zeros(opts.D)) for i in xrange(maxItem + 1)]
    return (U, V)
def train(training_data, validation_data, num_validation, ranks, lambdas,
          iterations):
    """Grid-search implicit ALS and keep the lowest-validation-RMSE model.

    Args:
        training_data: Ratings used to fit each candidate model.
        validation_data: Data handed to ``compute_rmse``.
        num_validation: Size argument forwarded to ``compute_rmse``.
        ranks, lambdas, iterations: Iterables whose cartesian product
            defines the grid.

    Returns:
        Model: Record holding the best model and its hyper-parameters.
    """
    alpha = 3.0  # controls baseline confidence growth
    best_model = None

    grid = itertools.product(ranks, lambdas, iterations)
    for rank, lmbda, iteration in grid:
        print('Training model with rank = %.2f, lambda = %.2f, iterations = %d...'
              % (rank, lmbda, iteration))
        candidate = ALS.trainImplicit(training_data, rank,
                                      iterations=iteration, lambda_=lmbda,
                                      alpha=alpha)
        validation_rmse = compute_rmse(candidate, validation_data,
                                       num_validation)
        print(" RMSE (validation) = %f for the model trained with " % validation_rmse
              + "rank = %d, lambda = %.1f, and numIter = %d." % (rank, lmbda, iteration))

        is_better = (best_model is None
                     or validation_rmse < best_model.error)
        if is_better:
            best_model = Model(model=candidate, error=validation_rmse,
                               rank=rank, lmbda=lmbda, iteration=iteration)

    print('Best model has error = %.2f, rank = %.2f, lambda = %.2f, iteration=%d'
          % (best_model.error, best_model.rank, best_model.lmbda,
             best_model.iteration))
    return best_model
def model(sc, rawUserArtistData, rawArtistData, rawArtistAlias):
    """Train implicit ALS on the artist data and print sample output.

    Prints the first user's factor vector, five recommendations for a
    fixed user, the artists that user already has, and the recommended
    artists.

    Args:
        sc: SparkContext used to broadcast the alias map.
        rawUserArtistData: RDD of space-separated lines; fields 0 and 1 are
            parsed as integer user and artist ids.
        rawArtistData: Raw artist data handed to ``buildArtistByID``.
        rawArtistAlias: Raw alias data handed to ``buildArtistAlias``.
    """
    bArtistAlias = sc.broadcast(buildArtistAlias(rawArtistAlias))
    trainData = buildRatings(rawUserArtistData, bArtistAlias).cache()
    # Named als_model so the local does not shadow this function's name.
    als_model = ALS.trainImplicit(ratings=trainData, rank=10, iterations=5,
                                  lambda_=0.01, alpha=1.0)
    trainData.unpersist()

    print(als_model.userFeatures().mapValues(
        lambda v: ", ".join(str(x) for x in v)).first())

    userID = 2093760
    recommendations = als_model.recommendProducts(userID, 5)
    for val in recommendations:
        print(val)

    # A set, not map(): on Python 3 map() is a one-shot iterator that the
    # first membership test would exhaust.  It also gives O(1) lookups.
    recommendedProductIDs = set(rec.product for rec in recommendations)

    # get specific user data
    rawArtistsForUser = rawUserArtistData \
        .map(lambda x: x.split(' ')) \
        .filter(lambda x: int(x[0]) == userID)

    # map artist id to int
    existingProducts = set(
        rawArtistsForUser.map(lambda x: int(x[1])).collect())

    artistByID = buildArtistByID(rawArtistData)

    existingArtists = artistByID.filter(
        lambda artist: artist[0] in existingProducts).collect()
    for val in existingArtists:
        print(val)

    recommendedArtists = artistByID.filter(
        lambda artist: artist[0] in recommendedProductIDs).collect()
    for val in recommendedArtists:
        print(val)

    unpersist(als_model)
def _recommend(self, train_ratings, users):
    """Train ALS on ``train_ratings`` and score every item for ``users``.

    The 'user' and 'item' columns of ``train_ratings`` are converted to
    categoricals in place, because the model is fed their integer codes.

    Args:
        train_ratings: DataFrame with 'user', 'item' and 'rating' columns.
        users: Iterable of user codes to predict for.

    Returns:
        DataFrame with columns 'user', 'item', 'rating' holding the
        predictions mapped back to the original labels.
    """
    from pyspark.mllib.recommendation import ALS, Rating

    # Integer codes are used as ids for the model.
    for column in ('user', 'item'):
        train_ratings[column] = train_ratings[column].astype('category')
    user_cat = train_ratings['user'].cat
    item_cat = train_ratings['item'].cat

    self.user_cat = user_cat
    self.item_cat = item_cat
    self.train_ratings = train_ratings

    # Training the model
    triples = zip(user_cat.codes, item_cat.codes, train_ratings.rating)
    self.ratings = self.sc.parallelize(
        Rating(u, i, rating) for u, i, rating in triples)
    trainer = ALS.trainImplicit if self.implicit else ALS.train
    model = trainer(self.ratings, **self.spark_args)

    # Every requested user is paired with every distinct item code.
    self.ratings_to_predict = self.sc.parallelize(
        (user, item) for user in users for item in item_cat.codes.unique())
    self.predictions = model.predictAll(self.ratings_to_predict).collect()

    # Map the integer codes back to the original labels.
    self.predictions = [
        (user_cat.categories[p.user], item_cat.categories[p.product], p.rating)
        for p in self.predictions
    ]
    self.predictions_df = pd.DataFrame(
        self.predictions, columns=['user', 'item', 'rating'])
    return self.predictions_df
def main(argv):
    """Build hash lookups and an implicit ALS model from the parquet dump.

    User and product hash lookups are appended to Cassandra tables, then
    the model is trained on the hashed ids and saved to HDFS.

    Args:
        argv: Command-line arguments (not read by this function).
    """
    Conf = SparkConf().setAppName("recommendation")
    sc = SparkContext(conf=Conf)
    try:
        sqlContext = SQLContext(sc)
        dirPath = "hdfs://ec2-52-71-113-80.compute-1.amazonaws.com:9000/reddit/recommend/data/sr_userCount.parquet"
        rawDF = sqlContext.read.parquet(dirPath).persist(
            StorageLevel.MEMORY_AND_DISK_SER)
        # Go through .rdd: DataFrame.map is not available on Spark 2+.
        # Rows are indexed by position instead of using tuple-parameter
        # lambdas, which Python 3 does not support.
        rows = rawDF.rdd

        # User Hash Lookup stored into cassandra
        user_hash = rows.map(lambda row: (row[0], hashFunction(row[0])))
        distinctUser = user_hash.distinct()
        userHashDF = sqlContext.createDataFrame(distinctUser,
                                                ["user", "hash"])
        userHashDF.write.format("org.apache.spark.sql.cassandra").options(
            table="userhash", keyspace=keyspace).save(mode="append")

        # Product Hash Lookup stored into cassandra
        product_hash = rows.map(lambda row: (row[1], hashFunction(row[1])))
        distinctProduct = product_hash.distinct()
        productHashDF = sqlContext.createDataFrame(distinctProduct,
                                                   ["product", "hash"])
        productHashDF.write.format("org.apache.spark.sql.cassandra").options(
            table="producthash", keyspace=keyspace).save(mode="append")

        # Ratings for training: ids are hashed to integers before being
        # wrapped in Rating objects for the algorithm to consume.
        ratings = rows.map(lambda row: Rating(hashFunction(row[0]),
                                              hashFunction(row[1]),
                                              float(row[2])))
        model = ALS.trainImplicit(ratings, 10, 10, alpha=0.01, seed=5)
        model.save(sc, "hdfs://ec2-52-71-113-80.compute-1.amazonaws.com:9000/reddit/recommend/model")
    finally:
        # Release the context even when a step above fails.
        sc.stop()
def train(self, rank, numIter, lmbda, istest=False, isimp=False, alpha=0.1):
    """Load or train an ALS model and run the post-training hooks.

    Args:
        rank: Number of latent factors.
        numIter: Number of ALS iterations.
        lmbda: Regularization parameter.
        istest: When true, ``self.mkTest`` is run on the validation split.
        isimp: When true, use implicit-feedback ALS.
        alpha: Confidence constant (implicit training only).

    Returns:
        tuple: ``(model, val)`` -- the model and the validation split
        returned by ``self.prepareTraining``.
    """
    training, val = self.prepareTraining()
    # `in` replaces dict.has_key(), which no longer exists on Python 3.
    if "model" in self.conf:
        model = self.loadModel()
    else:
        shared = dict(iterations=numIter, lambda_=lmbda,
                      blocks=self.conf["parallelize"], seed=0,
                      nonnegative=False)
        if not isimp:
            print("use ALS.train")
            model = ALS.train(training, rank, **shared)
        else:
            print("use ALS.trainImplicit")
            model = ALS.trainImplicit(training, rank, alpha=alpha, **shared)
        # NOTE(review): the save step is placed with the freshly trained
        # model; confirm it was not meant to run for loaded models too.
        os.system("rm -rf cachem")
        model.save(self.sc, "cachem")
        print("saved model")
    if istest:
        self.mkTest(model, val)
    self.afterTrain(model)
    return model, val
def create_model(RDD_ratings_data):
    '''Fit the final implicit ALS model on the full set of ratings.

    Uses fixed hyper-parameters: rank 50, 20 iterations, lambda 10.0 and
    alpha 20.0.  Returns the trained model.
    '''
    final_params = {
        'rank': 50,
        'iterations': 20,
        'lambda_': 10.0,
        'alpha': 20.0,
    }
    return ALS.trainImplicit(RDD_ratings_data, **final_params)
def mf_als_rec(is_test):
    """Run the MF-ALS recommender and evaluate it or write a submission.

    Args:
        is_test: When true, the predictions are scored with ``ev.map5`` and
            the score is returned; otherwise they are written to a CSV.

    Returns:
        The value of ``ev.map5`` when ``is_test`` is true, else ``0``.
    """
    print('*** Test MF-ALS Recommender ***')

    conf = SparkConf().setAppName("MF-ALS Rec").setMaster("local")
    sc = SparkContext(conf=conf)
    try:
        b = Builder()
        ev = Evaluator(is_test=is_test)
        ev.split()
        # Result is unused; the call is kept in case it has side effects.
        b.get_UCM(ev.get_URM_train())

        target_playlists = ev.get_target_playlists()
        urm_train_indices = ev.get_URM_train().nonzero()
        rows, cols = urm_train_indices[0], urm_train_indices[1]

        print('Creating RDD of tuples')
        # Every non-zero cell becomes an implicit rating of 1.
        ratings_list = [Rating(user, item, 1)
                        for user, item in tqdm(zip(rows, cols),
                                               total=rows.size)]

        ratings = sc.parallelize(ratings_list)
        model = ALS.trainImplicit(ratings, rank=10, iterations=5, alpha=0.01)

        print('Predicting...', flush=True)
        all_predictions = model.recommendProductsForUsers(10) \
            .filter(lambda r: r[0] in target_playlists) \
            .collect()
    finally:
        # All results are on the driver now; always release the context.
        sc.stop()

    dataframe_list = [[u[0], [rec.product for rec in u[1]]]
                      for u in tqdm(all_predictions)]
    dataframe_list.sort(key=lambda e: e[0])

    train_df = pd.DataFrame(dataframe_list,
                            columns=['playlist_id', 'track_ids'])
    if is_test:
        map5 = ev.map5(train_df)
        print('Hybrid MAP@10:', map5)
        return map5

    print('Prediction saved!')
    # The slice strips the last 19 characters of this file's directory.
    train_df.to_csv(
        os.path.dirname(os.path.realpath(__file__))[:-19] + "/all/sub.csv",
        sep=',', index=False)
    return 0
def cross_validation(training, validation, test, candidates, id_title_map,
                     ranks, lambdas, numIters, alphas):
    """Train one ALS model per grid point and collect the results.

    RMSE computation is currently stubbed out (fixed at 0.0), so the first
    model trained is the one kept as "best".

    Returns:
        tuple: ``(bestModel, result_dict)``; ``result_dict`` maps a label
        per grid point to its validation RMSE, plus the test RMSE and the
        gain over a mean-rating baseline.
    """
    result_template = "rank:%d iters:%d lambda: %f"
    result_dict = {}

    bestModel, bestValidationRmse = None, float("inf")
    bestRank, bestLambda, bestNumIter = 0, -1.0, -1

    numTraining = training.count()
    numValidation = validation.count()
    numTest = test.count()

    if not IMPLICIT:
        alphas = [1.0]

    grid = itertools.product(ranks, lambdas, numIters, alphas)
    for rank, lmbda, numIter, alpha in grid:
        shared = dict(iterations=numIter, lambda_=lmbda, nonnegative=True)
        if IMPLICIT:
            model = ALS.trainImplicit(training, rank, alpha=alpha, **shared)
        else:
            model = ALS.train(training, rank, **shared)

        validationRmse = 0.0  # computeRmse(model, validation, numValidation)
        print("RMSE (validation) = %f for the model trained with " % validationRmse
              + "rank = %d, lambda = %.4f, and numIter = %d and alpha=%f." % (rank, lmbda, numIter, alpha))

        qe_results = qualitative_evaluation(model, candidates, id_title_map)

        if validationRmse < bestValidationRmse:
            bestModel, bestValidationRmse = model, validationRmse
            bestRank, bestLambda, bestNumIter = rank, lmbda, numIter
        result_dict[result_template % (rank, numIter, lmbda)] = validationRmse

    testRmse = 0.0  # computeRmse(bestModel, test, numTest)

    # evaluate the best model on the test set
    print("The best model was trained with rank = %d and lambda = %.1f, " % (bestRank, bestLambda)
          + "and numIter = %d, and its RMSE on the test set is %f." % (bestNumIter, testRmse))
    best_key = 'BEST Model on Test:' + result_template % (
        bestRank, bestNumIter, bestLambda)
    result_dict[best_key] = testRmse

    # compare the best model with a naive baseline that always returns the
    # mean rating
    meanRating = training.union(validation).map(lambda x: x[2]).mean()
    squared_error = test.map(lambda x: (meanRating - x[2]) ** 2).reduce(add)
    baselineRmse = sqrt(squared_error / numTest)
    improvement = (baselineRmse - testRmse) / baselineRmse * 100
    print("The best model improves the baseline by %.2f" % (improvement) + "%.")
    result_dict['BEST gain over baseline'] = improvement
    return bestModel, result_dict
def train(training_rdd):
    """Train implicit ALS and return the item (product) factor RDD.

    Only the first two fields of every record are used; each pair is given
    a constant rating of 1.
    """
    unit_ratings = training_rdd.map(lambda rr: (rr[0], rr[1], 1))
    fitted = ALS.trainImplicit(unit_ratings, rank=16, iterations=10,
                               lambda_=0.1, alpha=80.0)
    return fitted.productFeatures()
def train_model(self):
    """Fit the implicit ALS model on ``self.trainData``.

    The hyper-parameters are read from the instance and printed first; the
    result is stored in ``self.model``.  Background on choosing them:
    stats.stackexchange.com/questions/133565/how-to-set-preferences-for-als-implicit-feedback-in-collaborative-filtering
    """
    print(self.rank, self.seed, self.iterations, self.reg_parameter, self.alpha)
    hyper_params = dict(
        rank=self.rank,
        iterations=self.iterations,
        lambda_=self.reg_parameter,
        blocks=-1,
        alpha=self.alpha,
        nonnegative=False,
        seed=self.seed,
    )
    self.model = ALS.trainImplicit(self.trainData, **hyper_params)
def trainALS(self):
    """Load the saved "als_final" model, training and saving it on failure.

    The result is stored in ``self.model``.
    """
    try:
        self.model = MatrixFactorizationModel.load(self.sc, "als_final")
    except (RuntimeError, TypeError, NameError):
        # NOTE(review): a missing model path may surface as a different
        # exception type than the ones listed here -- confirm this
        # fallback is actually reached.
        self.model = ALS.trainImplicit(self.implicit_ratings, rank=4,
                                       iterations=20)
        self.model.save(self.sc, "als_final")
def main(sc):
    """Grid-search implicit ALS and report the best (rank, lambda) pair.

    Every combination of rank and regularization value is trained on the
    training RDD and scored by RMSE on the validation RDD.

    Args:
        sc: SparkContext used to read the input files.
    """
    def load_ratings(path):
        # NOTE(review): lines are split on whitespace although the file is
        # named *.csv -- confirm the separator.
        return (sc.textFile(path)
                .map(lambda l: l.split())
                .map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))
                .cache())

    # load files
    training_RDD = load_ratings("file:///Expedia/data/train_1.csv")
    # load folds files
    # NOTE(review): the validation set is read from the same file as the
    # training set -- confirm the intended path.
    validation_RDD = load_ratings("file:///Expedia/data/train_1.csv")
    validation_for_predict_RDD = validation_RDD.map(
        lambda x: (x[0], x[1])).cache()
    actual = validation_RDD.map(
        lambda r: ((int(r[0]), int(r[1])), float(r[2])))

    seed = 5
    iterations = 10
    regularization_parameters = [0.1, 0.5, 1.0]
    ranks = [4, 8]
    errors = []

    min_error = float('inf')
    best_rank = -1
    best_lambda = -1.0
    for rank, lam in itertools.product(ranks, regularization_parameters):
        # train implicit model in train set
        model = ALS.trainImplicit(training_RDD, rank, seed=seed,
                                  iterations=iterations, lambda_=lam)
        # Predict model in validation set
        predictions = model.predictAll(validation_for_predict_RDD).map(
            lambda r: ((r[0], r[1]), r[2]))
        rates_and_preds = actual.join(predictions)
        # compute root mean square error in prediction validation set
        rmse = math.sqrt(
            rates_and_preds.map(lambda r: (r[1][0] - r[1][1]) ** 2).mean())
        errors.append(rmse)
        print('For rank %s the RMSE is %s' % (rank, rmse))
        if rmse < min_error:
            min_error = rmse
            best_rank = rank
            # Remember the winning lambda too; the loop variable would
            # otherwise hold whatever value was tried last.
            best_lambda = lam

    print("The best model was trained with rank = %d and lambda = %.1f, "
          % (best_rank, best_lambda)
          + "and numIter = %d, and its RMSE on the validation set is %f."
          % (iterations, min_error))
def __train_model(self):
    """Fit the implicit ALS model on ``self.taste_RDD``.

    Rank, seed, iteration count and regularization are read from the
    instance; the trained model is stored in ``self.model``.
    """
    logger.info("Training the ALS model...")
    als_options = dict(
        seed=self.seed,
        iterations=self.iterations,
        lambda_=self.regularization_parameter,
    )
    self.model = ALS.trainImplicit(self.taste_RDD, self.rank, **als_options)
    logger.info("ALS model built!")
def train(self, rank=3, iterations=20, lambda_=0.01, alpha=None, blocks=-1):
    """Train a matrix-factorization model on ``self.train_data``.

    Args:
        rank: Number of latent factors.
        iterations: Number of ALS iterations.
        lambda_: Regularization parameter.
        alpha: Confidence constant.  A truthy value selects implicit ALS;
            ``None`` (or ``0``) selects explicit ALS.
        blocks: Number of blocks used to parallelize the computation;
            honoured by both training modes.

    Returns:
        The trained model.
    """
    if alpha:
        return ALS.trainImplicit(self.train_data, rank,
                                 iterations=iterations, lambda_=lambda_,
                                 blocks=blocks, alpha=alpha)
    # `blocks` is forwarded here as well; it used to be dropped on the
    # explicit path.
    return ALS.train(self.train_data, rank, iterations=iterations,
                     lambda_=lambda_, blocks=blocks)
def training_models(self, rank=5, seed=32, iterations=20, alpha=0.01, reg=0.01):
    '''Train implicit ALS on an 80/20 split and print the ranking metric.

    Parameters:
    rank - Number of latent factors.
    seed - Seed for the train/validation split and for ALS.
    iterations - Number of iterations of ALS.
    alpha - Constant used in computing confidence.
    reg - Regularization parameter (passed as lambda_).
    '''
    print (self.training.take(5), self.test.take(5))

    # split training to training and validation sets
    split_weights = [.8, .2]
    trainData_RDD, valData_RDD = self.training.randomSplit(split_weights, seed)
    trainData_RDD.cache()
    valData_RDD.cache()
    print (trainData_RDD.count(), valData_RDD.count())

    X_val_RDD = valData_RDD.map(lambda x: (x.user, x.product)).cache()
    sum_ratings_val = valData_RDD.map(lambda x: x.rating).sum()

    # Number of validation products per user.
    per_user_counts = X_val_RDD.map(lambda x: (x[0], 1)).reduceByKey(add)
    product_nums_for_users = per_user_counts.map(lambda x: x[1]).collect()
    print ('num of users', per_user_counts.count())

    rank_lists = Rank_list(product_nums_for_users)
    print (rank_lists)

    model = ALS.trainImplicit(trainData_RDD, rank, iterations=iterations,
                              lambda_=reg, blocks=-1, alpha=alpha,
                              nonnegative=False, seed=seed)

    # Join the validation ratings with the model's predictions.
    predictions_RDD = model.predictAll(X_val_RDD).map(
        lambda x: ((x[0], x[1]), x[2]))
    keyed_validation = valData_RDD.map(lambda x: ((x[0], x[1]), x[2]))
    ratings_and_preds_RDD = keyed_validation.join(predictions_RDD)

    print()
    print('model training is convergenent')
    print()

    MPR = self.percentage_ranking(ratings_and_preds_RDD, rank_lists,
                                  sum_ratings_val)
    print ('Rank %s, reg %s, alpha %s, AvgRank = %s' % (rank, reg, alpha, MPR))
def main(sc):
    """Pick the best implicit-ALS (rank, lambda) pair by validation RMSE.

    Args:
        sc: SparkContext used to read the input files.
    """
    def to_rating(fields):
        return Rating(int(fields[0]), int(fields[1]), float(fields[2]))

    # load files
    # NOTE(review): lines are split on whitespace although the file is
    # named *.csv -- confirm the separator.
    train_1 = sc.textFile("file:///Expedia/data/train_1.csv")
    training_RDD = train_1.map(lambda l: l.split()).map(to_rating).cache()

    # load folds files
    # NOTE(review): this reads the same file as the training set -- confirm
    # the intended validation path.
    train_2 = sc.textFile("file:///Expedia/data/train_1.csv")
    validation_RDD = train_2.map(lambda l: l.split()).map(to_rating).cache()
    validation_for_predict_RDD = validation_RDD.map(
        lambda x: (x[0], x[1])).cache()

    # Train model in train set and cross validation set and choose the
    # model with the best RMSE in the cross validation set
    seed = 5
    iterations = 10
    lambda_grid = [0.1, 0.5, 1.0]
    ranks = [4, 8]
    errors = []

    min_error = float('inf')
    best_rank = -1
    best_lambda = -1.0
    for rank, lam in itertools.product(ranks, lambda_grid):
        # train implicit model in train set
        model = ALS.trainImplicit(training_RDD, rank, seed=seed,
                                  iterations=iterations, lambda_=lam)
        # Predict model in validation set
        predictions = model.predictAll(validation_for_predict_RDD).map(
            lambda r: ((r[0], r[1]), r[2]))
        rates_and_preds = validation_RDD.map(
            lambda r: ((int(r[0]), int(r[1])), float(r[2]))
        ).join(predictions)
        # compute root mean square error in prediction validation set
        rmse = math.sqrt(
            rates_and_preds.map(lambda r: (r[1][0] - r[1][1]) ** 2).mean())
        errors.append(rmse)
        print('For rank %s the RMSE is %s' % (rank, rmse))
        if rmse < min_error:
            min_error = rmse
            best_rank = rank
            # Track the winning lambda; reporting the loop variable after
            # the loop would show the last value tried, not the best.
            best_lambda = lam

    print("The best model was trained with rank = %d and lambda = %.1f, "
          % (best_rank, best_lambda)
          + "and numIter = %d, and its RMSE on the validation set is %f."
          % (iterations, min_error))
def label(self, rank=50, numIterations=10, alpha=0.01):
    """
    Label each document with its strongest ALS factor.

    INPUT:
    - rank: number of latent factors (topics)
    - numIterations: number of iterations for matrix factorization
    - alpha: confidence constant of the implicit-feedback ALS
    OUTPUT:
    - nothing is returned; ``self.train_data`` is set to an RDD of
      LabeledPoint(label, feature)
    """
    # alpha must be passed by keyword: the fourth positional parameter of
    # trainImplicit is lambda_ (regularization), not alpha.
    als_model = ALS.trainImplicit(self.tfidf_rating, rank,
                                  iterations=numIterations, alpha=alpha)
    # The label is the index of the largest component of the user factor.
    index_label = als_model.userFeatures().map(
        lambda x: (x[0], np.argmax(x[1])))
    index_feature = self.tfidf.zipWithIndex().map(lambda x: (x[1], x[0]))
    index_label_feature = index_label.join(index_feature)
    label_feature = index_label_feature.map(lambda x: x[1])
    self.train_data = label_feature.map(lambda x: LabeledPoint(x[0], x[1]))
def train_als(self):
    """Train implicit ALS on (user_id, repo_id) pairs and save the model.

    Every pair gets a rating of 1.0.  The mean squared error on the
    training pairs is printed and the model is saved as "ALS_model".
    """
    # Go through .rdd: DataFrame.map is not available on Spark 2+.
    self.ratings = self.df.select("user_id", "repo_id") \
        .rdd.map(lambda x: Rating(x[0], x[1], 1.0))

    rank = 10
    numIterations = 20
    model = ALS.trainImplicit(self.ratings, rank, numIterations, alpha=0.01)

    # Score the same pairs the model was trained on.
    testdata = self.ratings.map(lambda p: (p[0], p[1]))
    predictions = model.predictAll(testdata).map(
        lambda r: ((r[0], r[1]), r[2]))
    ratesAndPreds = self.ratings.map(
        lambda r: ((r[0], r[1]), r[2])).join(predictions)
    MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1]) ** 2).mean()
    print("Mean Squared Error = " + str(MSE))

    model.save(self.sc, "ALS_model")
def prepare_model(sc, filename, user_id, ratings_train):
    """Return a recommendation model, loading a saved one when possible.

    The saved model at ``config.MSD_MODEL`` is loaded only when
    ``filename`` is None and that path exists; otherwise a new implicit ALS
    model is trained with the best parameters stored on disk.
    """
    can_reuse = filename is None and os.path.exists(config.MSD_MODEL)
    if can_reuse:
        # load the trained model
        print("\n\nLoading existing recommendation model from %s\n\n" %
              config.MSD_MODEL)
        return MatrixFactorizationModel.load(sc, config.MSD_MODEL)

    # train a new model
    print("\n\nRetraining recommendation model for User %s\n\n" % user_id)
    best_params = evaluate.load_best_params(config.MSD_BEST_PARAMS_FILE)
    rank, lambda_val = best_params
    rank = int(rank)
    lambda_val = float(lambda_val)
    return ALS.trainImplicit(ratings_train, rank, evaluate.ITERATIONS,
                             lambda_val, nonnegative=True)
def fit_and_save_model(self, train_num=0.8, test_num=0.2, seed_num=2711,
                       rank=5, iterations=5):
    """Split the user-item data, train implicit ALS and save the model.

    Args:
        train_num, test_num: Weights of the random train/test split.
        seed_num: Seed of the split.
        rank: Number of latent factors.
        iterations: Number of ALS iterations.

    The model is written to 'data/firstmodel'; nothing is returned.
    """
    user_item_rdd = self.spark.createDataFrame(self.user_item_df).rdd
    weights = [train_num, test_num]
    train, test = user_item_rdd.randomSplit(weights, seed=seed_num)
    # Built but not used further in this method.
    testdata = test.map(lambda p: (p[0], p[1]))

    fitted = ALS.trainImplicit(train, rank=rank, iterations=iterations,
                               nonnegative=True)
    fitted.save(self.sc, 'data/firstmodel')
def main(cores, prefs):
    """
    Recommend subreddits for a user-supplied list of preferences.

    args:
        cores (int) : number of cores for spark job
        prefs (list[str]) : list of strings containing subreddit names -
            capital letters are non-trivial

    returns:
        list : up to 30 (score, subreddit name) pairs, highest score first

    raises:
        Exception : any failure is re-raised after the context is stopped
    """
    scfg = SparkConf()
    scfg.set("spark.cores.max", cores)
    sc = SparkContext(master="spark://final-gateway:7077",
                      appName="reddit-cf", conf=scfg)
    try:
        # prep data
        raw_counts = sc.textFile(
            "hdfs://final-gateway/w251_cf-user-site-total")
        # SECURITY NOTE(review): eval() runs every input line as Python
        # code.  If this file can contain untrusted data, switch to a safe
        # parser such as ast.literal_eval.
        parsed_counts = raw_counts.map(lambda st: eval(st))
        all_ratings = parsed_counts.map(tup_to_rating)

        # assign user-identified preferred subreddits
        raw_prefs = [(999, x, 100) for x in prefs]
        my_prefs = sc.parallelize(raw_prefs).map(tup_to_rating)

        # train model
        model_input = all_ratings.union(my_prefs)
        model = ALS.trainImplicit(model_input, 10, 10, alpha=.01)

        # candidate prefs for prediction
        # Records are indexed by position instead of using tuple-parameter
        # lambdas, which Python 3 does not support.
        my_prefs_ids = set(javahash(x) for x in prefs)
        all_subreddit_ids = parsed_counts.map(
            lambda rec: (javahash(rec[1]), rec[1])).distinct().cache()
        candidates = all_subreddit_ids.map(lambda pair: pair[0]).filter(
            lambda r: r not in my_prefs_ids)
        predictions = model.predictAll(
            candidates.map(lambda x: (999, x))).cache()

        # (product, score) joined with (product, name) -> (score, name)
        final = predictions.map(lambda p: (p[1], p[2])) \
            .join(all_subreddit_ids) \
            .map(lambda kv: (kv[1][0], kv[1][1])) \
            .sortByKey(False)
        return list(final.take(30))
    except Exception:
        print("App failed. Stopping gracefully")
        # Bare raise keeps the original exception type and traceback.
        raise
    finally:
        sc.stop()
def cross_validation(training, validation, test, candidates, id_title_map,
                     ranks, lambdas, numIters, alphas):
    """Sweep the hyper-parameter grid and summarise the trained models.

    RMSE evaluation is stubbed out (always 0.0), so the first trained
    model is the one returned as best.

    Returns:
        tuple: ``(bestModel, result_dict)`` where ``result_dict`` holds one
        entry per grid point, the test RMSE of the best model and its gain
        over a mean-rating baseline.
    """
    result_dict = {}
    result_template = "rank:%d iters:%d lambda: %f"

    bestModel = None
    bestValidationRmse = float("inf")
    bestRank, bestLambda, bestNumIter = 0, -1.0, -1

    numTraining = training.count()
    numValidation = validation.count()
    numTest = test.count()

    if not IMPLICIT:
        alphas = [1.0]

    for rank, lmbda, numIter, alpha in itertools.product(
            ranks, lambdas, numIters, alphas):
        if IMPLICIT:
            model = ALS.trainImplicit(training, rank, iterations=numIter,
                                      lambda_=lmbda, alpha=alpha,
                                      nonnegative=True)
        else:
            model = ALS.train(training, rank, iterations=numIter,
                              lambda_=lmbda, nonnegative=True)

        validationRmse = 0.0  # computeRmse(model, validation, numValidation)
        summary = ("RMSE (validation) = %f for the model trained with " % validationRmse
                   + "rank = %d, lambda = %.4f, and numIter = %d and alpha=%f." % (rank, lmbda, numIter, alpha))
        print(summary)

        qe_results = qualitative_evaluation(model, candidates, id_title_map)

        if validationRmse < bestValidationRmse:
            bestModel = model
            bestValidationRmse = validationRmse
            bestRank, bestLambda, bestNumIter = rank, lmbda, numIter
        result_dict[result_template % (rank, numIter, lmbda)] = validationRmse

    testRmse = 0.0  # computeRmse(bestModel, test, numTest)

    # evaluate the best model on the test set
    best_summary = ("The best model was trained with rank = %d and lambda = %.1f, " % (bestRank, bestLambda)
                    + "and numIter = %d, and its RMSE on the test set is %f." % (bestNumIter, testRmse))
    print(best_summary)
    result_dict['BEST Model on Test:' + result_template %
                (bestRank, bestNumIter, bestLambda)] = testRmse

    # compare the best model with a naive baseline that always returns the
    # mean rating
    meanRating = training.union(validation).map(lambda x: x[2]).mean()
    baselineRmse = sqrt(
        test.map(lambda x: (meanRating - x[2]) ** 2).reduce(add) / numTest)
    improvement = (baselineRmse - testRmse) / baselineRmse * 100
    print("The best model improves the baseline by %.2f" % (improvement) + "%.")
    result_dict['BEST gain over baseline'] = improvement
    return bestModel, result_dict
def main(cores, prefs):
    """
    ALS Algorithm to Recommend Subreddits to User based on User-defined
    preferences

    args:
        cores (int) : number of cores for spark job
        prefs (list[str]) : list of strings containing subreddit names -
            capital letters are non-trivial

    returns:
        list : up to 30 (score, subreddit name) pairs, highest score first

    raises:
        Exception : any failure is re-raised after the context is stopped
    """
    scfg = SparkConf()
    scfg.set("spark.cores.max", cores)
    sc = SparkContext(master="spark://final-gateway:7077",
                      appName="reddit-cf", conf=scfg)
    try:
        # prep data
        raw_counts = sc.textFile(
            "hdfs://final-gateway/w251_cf-user-site-total")
        # SECURITY NOTE(review): eval() runs every input line as Python
        # code.  If this file can contain untrusted data, switch to a safe
        # parser such as ast.literal_eval.
        parsed_counts = raw_counts.map(lambda st: eval(st))
        all_ratings = parsed_counts.map(tup_to_rating)

        # assign user-identified preferred subreddits
        raw_prefs = [(999, x, 100) for x in prefs]
        my_prefs = sc.parallelize(raw_prefs).map(tup_to_rating)

        # train model
        model_input = all_ratings.union(my_prefs)
        model = ALS.trainImplicit(model_input, 10, 10, alpha=.01, seed=5)

        # candidate prefs for prediction
        # Positional indexing replaces tuple-parameter lambdas, which
        # Python 3 does not support.
        my_prefs_ids = set(javahash(x) for x in prefs)
        all_subreddit_ids = parsed_counts.map(
            lambda rec: (javahash(rec[1]), rec[1])).distinct().cache()
        candidates = all_subreddit_ids.map(lambda pair: pair[0]).filter(
            lambda r: r not in my_prefs_ids)
        predictions = model.predictAll(
            candidates.map(lambda x: (999, x))).cache()

        # (product, score) joined with (product, name) -> (score, name)
        scored = predictions.map(lambda p: (p[1], p[2]))
        final = scored.join(all_subreddit_ids) \
            .map(lambda kv: (kv[1][0], kv[1][1])) \
            .sortByKey(False)
        output = list(final.take(30))
        return output
    except Exception:
        print("App failed. Stopping gracefully")
        # Bare raise keeps the original exception type and traceback.
        raise
    finally:
        sc.stop()
def fit_als(self):
    """Train the implicit ALS model on 'data_set/train_set_als.csv'.

    Raw user and item ids (columns 0 and 1) are remapped to consecutive
    integers in order of first appearance.  The forward maps are stored in
    ``self.users_dict`` / ``self.items_dict``, the reverse maps in
    ``self.users_als`` / ``self.items_als`` and the model in
    ``self.model_ALS``.
    """
    sc = self.__set_context_als()
    data = sc.textFile('data_set/train_set_als.csv')
    header = data.first()
    data = data.filter(lambda row: row != header)

    users_dict = dict()  # {CUST_ID : Num}
    items_dict = dict()
    # Stream the file once and close it when done.  setdefault with the
    # current size hands out 0, 1, 2, ... in order of first appearance.
    with open('data_set/train_set_als.csv', 'r') as trans:
        next(trans, None)  # skip the header row
        for line in trans:
            parts = line.split(',')
            users_dict.setdefault(int(parts[0]), len(users_dict))
            items_dict.setdefault(int(parts[1]), len(items_dict))

    self.users_als = {v: k for k, v in users_dict.items()}  # {Num : CUST_ID}
    self.items_als = {v: k for k, v in items_dict.items()}
    self.users_dict = users_dict
    self.items_dict = items_dict

    ratings = data.map(lambda l: l.split(',')).map(
        lambda l: Rating(users_dict[int(l[0])],
                         items_dict[int(l[1])],
                         float(l[2])))
    self.model_ALS = ALS.trainImplicit(ratings=ratings, rank=40,
                                       iterations=30, lambda_=0.001,
                                       blocks=-1, alpha=10.0)
def execute_recommendation():
    """Train implicit ALS on the training file and score the test file.

    Predictions for the (user, product) pairs of the test file are saved
    as text to the path returned by ``get_rdd_output``.  The SparkContext
    is stopped on every exit path.
    """
    sc = SparkContext(appName="PythonCollaborativeFilteringExample")
    try:
        def load_ratings(path):
            # Rows are parsed as "user,product,rating".
            return sc.textFile(path) \
                .map(lambda l: l.split(',')) \
                .map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))

        # Load train data and train
        ratings = load_ratings(get_training_file_name())
        rank = 10
        number_iteration = 10
        model = ALS.trainImplicit(ratings, rank, number_iteration)

        # load test data and do prediction
        test_ranking = load_ratings(get_testing_file_name())
        testdata = test_ranking.map(lambda p: (p[0], p[1]))
        count_rdd = testdata.count()
        if count_rdd > 0:
            predictions = model.predictAll(testdata).map(
                lambda r: ((r[0], r[1]), r[2]))
            result_file = get_rdd_output()
            predictions.saveAsTextFile(result_file)
            count_rdd = predictions.count()
            print("after prediction: count_rdd=",count_rdd)
        else:
            print("Error: empty testdata")
    finally:
        # Release the context even when reading or training fails.
        sc.stop()
def train_als(ratings, explicit, rank, rp, iteration, non_negative):
    """Train an explicit or implicit ALS model.

    Args:
        ratings: Training ratings.
        explicit: ``True`` selects explicit ALS, anything else implicit.
        rank: Number of latent factors.
        rp: Regularization parameter (passed as ``lambda_``).
        iteration: Number of ALS iterations.
        non_negative: Passed as the ``nonnegative`` flag.

    Returns:
        The trained model.
    """
    # To avoid stackoverflow issue.
    sc.setCheckpointDir("checkpoint/")
    ALS.checkpointInterval = 2

    trainer = ALS.train if explicit == True else ALS.trainImplicit
    # Both branches use the `rank` argument; the implicit branch used to
    # reference an undefined name `r`.
    return trainer(ratings, rank=rank, iterations=iteration, lambda_=rp,
                   nonnegative=non_negative)
def train(training_data, validation_data, num_validation, ranks, lambdas,
          iterations):
    """Grid-search implicit ALS, timing training and validation per model.

    Returns:
        tuple: ``(best_model, model_metadata, best_model_metadata)`` --
        the best ``Model`` record (lowest validation RMSE), one metadata
        tuple per trained model, and a dict describing the best one.
    """
    alpha = 3.0
    best_model = None
    best_model_metadata = {}
    model_metadata = []

    def minutes_since(start):
        # Elapsed wall-clock time in minutes, two decimals, as a string.
        return '{:.2f}'.format((time() - start) / 60)

    for rank, lmbda, iteration in itertools.product(ranks, lambdas,
                                                    iterations):
        started = time()
        model = ALS.trainImplicit(training_data, rank, iterations=iteration,
                                  lambda_=lmbda, alpha=alpha)
        mt = minutes_since(started)

        model_id = 'listenbrainz-recommendation-model-{}'.format(uuid.uuid4())

        started = time()
        validation_rmse = compute_rmse(model, validation_data,
                                       num_validation)
        vt = minutes_since(started)

        model_metadata.append((
            model_id, mt, rank, '{:.1f}'.format(lmbda), iteration,
            "%.2f" % (validation_rmse), vt,
        ))

        if best_model is not None and not (validation_rmse < best_model.error):
            continue

        best_model = Model(model=model, error=validation_rmse, rank=rank,
                           lmbda=lmbda, iteration=iteration,
                           model_id=model_id, training_time=mt,
                           rmse_time=vt)
        best_model_metadata = {
            'error': '{:.2f}'.format(best_model.error),
            'rank': best_model.rank,
            'lmbda': best_model.lmbda,
            'iteration': best_model.iteration,
            'model_id': best_model.model_id,
            'training_time': best_model.training_time,
            'rmse_time': best_model.rmse_time
        }

    return best_model, model_metadata, best_model_metadata
def main(sc):
    """Recommend products for the 1000 most active users and store them in MySQL.

    Reads the ``testUserBehavior`` HBase table, converts rows with
    ``change_format`` (dropping ``None`` results), keeps only records of the
    1000 users with the most records, trains a rank-1 implicit ALS model and
    writes three recommendations per user to ``CSDN.recommendation``.
    """
    hbase_rdd = sc.newAPIHadoopRDD(
        "org.apache.hadoop.hbase.mapreduce.TableInputFormat",
        "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
        "org.apache.hadoop.hbase.client.Result",
        keyConverter="org.apache.spark.examples.pythonconverters.ImmutableBytesWritableToStringConverter",
        valueConverter="org.apache.spark.examples.pythonconverters.HBaseResultToStringConverter",
        conf={
            "hbase.zookeeper.quorum": 'cluster3',
            "hbase.mapreduce.inputtable": 'testUserBehavior'
        })

    data = hbase_rdd.map(change_format).filter(lambda x: x is not None).cache()

    # Count records per user, then order by count (descending) to pick the top 1000.
    records_per_user = data.map(lambda x: (x[0], 1)).reduceByKey(lambda x, y: x + y)
    by_count = records_per_user.map(lambda x: (x[1], x[0])).sortByKey(ascending=False)
    users = set(by_count.map(lambda x: x[1]).take(1000))

    data = data.filter(lambda x: x[0] in users)

    model = ALS.trainImplicit(data, 1, seed=10)
    recommended = model.recommendProductsForUsers(3).map(output_format)
    results = recommended.filter(lambda x: x is not None)

    sqlContext = SQLContext(sc)
    schema = StructType([
        StructField("Uid", StringType(), True),
        StructField("results", StringType(), True)
    ])
    r = sqlContext.createDataFrame(results, schema)

    r.write.jdbc("jdbc:mysql://cluster2", "CSDN.recommendation",
                 mode='overwrite',
                 properties={"user": "******", "password": "******"})
def main(argv):
    """Build user/product hash lookups and train an implicit ALS model.

    Reads a parquet file whose rows have three columns, used here as
    ``(user, product, count)``.  Distinct ``(name, hashFunction(name))`` pairs
    for users and products are appended to the Cassandra tables ``userhash``
    and ``producthash`` (keyspace taken from the module-level ``keyspace``),
    then an ALS model is trained on the hashed ids and saved to HDFS.

    NOTE(review): ``argv`` is accepted but never read; the input path is
    hard-coded below.
    """
    Conf = (SparkConf().setAppName("recommendation"))
    sc = SparkContext(conf=Conf)
    try:
        sqlContext = SQLContext(sc)
        dirPath = "hdfs://ec2-52-71-113-80.compute-1.amazonaws.com:9000/reddit/recommend/data/sr_userCount.parquet"
        rawDF = sqlContext.read.parquet(dirPath).persist(
            StorageLevel.MEMORY_AND_DISK_SER)

        # Go through ``.rdd`` explicitly: ``DataFrame.map`` no longer exists in
        # Spark 2+.  Rows are indexed positionally because tuple-parameter
        # lambdas are a syntax error on Python 3.
        rows = rawDF.rdd

        # User Hash Lookup stored into cassandra
        user_hash = rows.map(lambda row: (row[0], hashFunction(row[0])))
        distinctUser = user_hash.distinct()
        userHashDF = sqlContext.createDataFrame(distinctUser, ["user", "hash"])
        userHashDF.write.format("org.apache.spark.sql.cassandra").options(
            table="userhash", keyspace=keyspace).save(mode="append")

        # Product Hash Lookup stored into cassandra
        product_hash = rows.map(lambda row: (row[1], hashFunction(row[1])))
        distinctProduct = product_hash.distinct()
        productHashDF = sqlContext.createDataFrame(distinctProduct,
                                                   ["product", "hash"])
        productHashDF.write.format("org.apache.spark.sql.cassandra").options(
            table="producthash", keyspace=keyspace).save(mode="append")

        # Ratings for training.
        # ALS needs integer ids, so both names are hashed before being wrapped
        # in a Rating object for the algorithm to consume.
        ratings = rows.map(
            lambda row: Rating(hashFunction(row[0]), hashFunction(row[1]),
                               float(row[2])))
        model = ALS.trainImplicit(ratings, 10, 10, alpha=0.01, seed=5)
        model.save(
            sc,
            "hdfs://ec2-52-71-113-80.compute-1.amazonaws.com:9000/reddit/recommend/model"
        )
    finally:
        sc.stop()
def runModel(Sql, TableName, Rank = RANK, No_iterations = NO_ITERATIONS, Alpha = APLHA):
    """Prepare the rating data and train an implicit ALS model.

    Args:
        Sql: query forwarded to ``saveToTempTable``.
        TableName: temp-table name forwarded to ``saveToTempTable``.
        Rank: number of latent factors.
        No_iterations: number of ALS iterations.
        Alpha: confidence parameter of the implicit-feedback model.

    Returns:
        An ``ALSModel`` whose ``joined_rdd`` holds every (user, product) index
        pair and whose ``model`` holds the trained ALS model.
    """
    ### ALS ###
    print("start runModel")
    dfNonVariety_ALS = saveToTempTable(Sql = Sql, TableName = TableName)
    print(dfNonVariety_ALS.count())

    start = time.time()
    indexed_user = indexedUser(dfNonVariety_ALS)
    stop = time.time()
    print("done -- indexed_user " + str(stop-start))

    start = time.time()
    indexed_product = indexedProduct(indexed_user)
    stop = time.time()
    print("done -- indexed_product " + str(stop-start))

    start = time.time()
    ratings_NonVariety = indexed_product.rdd.map(
        lambda r: Rating(r.UserIdNew, r.PidNew, r.Visit))
    stop = time.time()
    print("done -- rating " + str(stop-start))

    alsModel = ALSModel()
    print("done -- create instance")

    start = time.time()
    distinct_users = indexed_product.select('UserIdNew').dropDuplicates()
    distinct_products = indexed_product.select('PidNew').dropDuplicates()
    alsModel.joined_rdd = distinct_users.crossJoin(distinct_products) \
        .rdd.map(lambda x: (x[0], x[1]))
    stop = time.time()
    print("done -- cross join " + str(stop-start))

    start = time.time()
    saveToTempTable(DFObject = indexed_product, TableName='tbData')
    stop = time.time()
    print("done -- dump df_ALS to temp table " + str(stop-start))

    # ALS Implicit Model
    start = time.time()
    # ``alpha`` must be passed by keyword: the fourth positional parameter of
    # ALS.trainImplicit is ``lambda_``, so the previous positional call set the
    # regularization to Alpha and left alpha at its default.
    alsModel.model = ALS.trainImplicit(ratings_NonVariety, Rank, No_iterations,
                                       alpha=Alpha)
    stop = time.time()
    print("done -- model " + str(stop-start))
    return alsModel
def Recommendation(filename, foods):
    """Return the top-5 recommended foods for every user in a ratings file.

    Args:
        filename: path of a comma-separated file whose first three fields are
            parsed as ``int`` user, ``int`` product and ``float`` rating.
        foods: mapping (or sequence) indexed by product id, giving the value
            stored for each recommendation.

    Returns:
        dict mapping each user id to the list of ``foods`` entries for the
        five products recommended by a rank-1 implicit ALS model.
    """
    print ('successful')
    sc = SparkContext('local', 'Simple App')
    try:
        ratings = sc.textFile(filename)

        def parse_rating(line):
            # Split once instead of once per field.
            fields = line.split(",")
            return (int(fields[0]), int(fields[1]), float(fields[2]))

        processedRatings = ratings.map(parse_rating)
        users = ratings.map(lambda rating: int(rating.split(",")[0])) \
            .distinct().collect()

        # train model
        model = ALS.trainImplicit(processedRatings, 1, seed=10)

        recommenddict = {}
        for user in users:
            # The loop variable is no longer called ``Rating``, which shadowed
            # the pyspark class of the same name.
            recommenddict[user] = [foods[rec.product]
                                   for rec in model.recommendProducts(user, 5)]
        print (recommenddict)
    finally:
        # Release the context even when parsing or training fails.
        sc.stop()
    return recommenddict
def calc_cf_mllib(y_training_data, num_partitions = 20):
    """
    Utilizes the ALS collaborative filtering algorithm in MLLib to determine the predicted ratings

    Args:
        y_training_data: the data used to train the RecSys algorithm in the format of an RDD of [ (userId, itemId, actualRating) ]
        num_partitions: number of partitions the user x item cartesian product is coalesced to

    Returns:
        predicted: predicted ratings in the format of a RDD of [ (userId, itemId, predictedRating) ],
            each prediction passed through rechelp.squish_preds with the observed min/max rating.

    """
    # Records are indexed positionally: tuple-parameter lambdas
    # (``lambda (u, i, r): ...``) are a syntax error on Python 3.

    # Predicted values can be anywhere - because we are normalizing the content
    # based algorithms we should likely normalize here
    rating_values = y_training_data.map(lambda row: row[2])
    max_rating = rating_values.max()
    min_rating = rating_values.min()

    if max_rating == min_rating:
        min_rating = 0

    # MLLib has two methods, train and trainImplicit().  Data whose ratings span
    # exactly 0..1 is treated as implicit feedback.
    if min_rating == 0 and max_rating == 1:
        model = ALS.trainImplicit(y_training_data, rank=10, iterations=5)
    else:
        model = ALS.train(y_training_data, rank=10, iterations=5)

    # predict all user, item pairs
    item_ids = y_training_data.map(lambda row: row[1]).distinct()
    user_ids = y_training_data.map(lambda row: row[0]).distinct()
    user_item_combo = user_ids.cartesian(item_ids).coalesce(num_partitions)

    predicted = model.predictAll(user_item_combo.map(lambda x: (x[0], x[1])))

    norm_predictions = predicted.map(
        lambda p: (p[0], p[1],
                   rechelp.squish_preds(p[2], min_rating, max_rating)))

    return norm_predictions
def train_implicit(self, rank, seed=0, iterations=50, lambda_=0.01, **kwargs):
    """
    Train the model using implicit ratings.

    Parameters
    ----------
    rank : int
        The number of factors in the underlying model.  Generally, larger
        numbers of factors lead to better models, but increase the memory
        required.  A rank in the range of 10 to 200 is usually reasonable.

    seed : int, optional
        Random seed forwarded to the ALS trainer.

    iterations : int, optional
        The number of iterations to perform.

    lambda_ : float, optional
        Regularization parameter.  Higher values apply more regularization;
        the appropriate value depends on the problem and needs to be tuned
        by train/test techniques.

    **kwargs
        Any further keyword arguments are passed unchanged to
        ``ALS.trainImplicit``.

    Returns
    -------
    out : MatrixFactorizationModel
        Wraps the trained ALS model together with this object's ratings
        and its user, item and rating column names.
    """
    prepared = self._prepare_ratings()
    ratings_rdd = prepared.to_rdd()
    trained = ALS.trainImplicit(ratings_rdd, rank,
                                iterations=iterations,
                                lambda_=lambda_,
                                seed=seed,
                                **kwargs)
    return MatrixFactorizationModel(trained, self.ratings, self.user_col,
                                    self.item_col, self.rating_col)
def home(request):
    """Score subreddits for a synthetic user who likes a fixed set of subreddits.

    A pseudo user (id 999) with a rating of 100 for each entry in ``prefs`` is
    added to the stored ratings, an implicit ALS model is trained, and every
    subreddit not already in ``prefs`` is scored for that user.  The 30
    highest-scored ``(score, subreddit)`` pairs end up in ``output``.

    NOTE(review): ``output`` is computed but not returned or rendered, and
    ``request`` is unused -- confirm what this view is expected to return.
    """
    prefs = ["IAmA","funny","nfl"]

    scfg = SparkConf()
    scfg.set("spark.cores.max", 64)
    sc = SparkContext(master="spark://final-gateway:7077",
                      appName="reddit-cf", conf=scfg)

    try:
        # prep data
        raw_counts = sc.textFile("hdfs://final-gateway/w251_cf-user-site-total")
        # SECURITY: eval() executes whatever the file contains.  Only safe
        # while the HDFS file is fully trusted; consider ast.literal_eval.
        parsed_counts = raw_counts.map(lambda st: eval(st))
        all_ratings = parsed_counts.map(tup_to_rating)

        # assign user-identified preferred subreddits
        raw_prefs = [(999, x, 100) for x in prefs]
        my_prefs = sc.parallelize(raw_prefs).map(tup_to_rating)

        # train model
        model_input = all_ratings.union(my_prefs)
        model = ALS.trainImplicit(model_input, 10, 10, alpha=.01)

        # candidate prefs for prediction
        # (records are indexed positionally: tuple-parameter lambdas are a
        # syntax error on Python 3)
        my_prefs_ids = set([javahash(x) for x in prefs])
        all_subreddit_ids = parsed_counts \
            .map(lambda rec: (javahash(rec[1]), rec[1])) \
            .distinct().cache()
        candidates = all_subreddit_ids \
            .map(lambda pair: pair[0]) \
            .filter(lambda r: r not in my_prefs_ids)

        predictions = model.predictAll(candidates.map(lambda x: (999, x))).cache()

        # (product, score) joined with (product, name) -> (score, name),
        # highest score first.
        final = predictions \
            .map(lambda p: (p[1], p[2])) \
            .join(all_subreddit_ids) \
            .map(lambda kv: (kv[1][0], kv[1][1])) \
            .sortByKey(False)

        output = list(final.take(30))
    except Exception:
        print("App failed. Stopping gracefully")
        # Bare re-raise keeps the original exception type and traceback.
        raise
    finally:
        # Single shutdown point for both the success and the failure path.
        sc.stop()
def calc_cf_mllib(y_training_data, num_partitions=20):
    """
    Utilizes the ALS collaborative filtering algorithm in MLLib to determine the predicted ratings

    Args:
        y_training_data: the data used to train the RecSys algorithm in the format of an RDD of [ (userId, itemId, actualRating) ]
        num_partitions: number of partitions the user x item cartesian product is coalesced to

    Returns:
        predicted: predicted ratings in the format of a RDD of [ (userId, itemId, predictedRating) ],
            each prediction passed through rechelp.squish_preds with the observed min/max rating.

    """
    # Records are indexed positionally: tuple-parameter lambdas
    # (``lambda (u, i, r): ...``) are a syntax error on Python 3.

    # Predicted values can be anywhere - because we are normalizing the content
    # based algorithms we should likely normalize here
    rating_values = y_training_data.map(lambda row: row[2])
    max_rating = rating_values.max()
    min_rating = rating_values.min()

    if max_rating == min_rating:
        min_rating = 0

    # MLLib has two methods, train and trainImplicit().  Data whose ratings span
    # exactly 0..1 is treated as implicit feedback.
    if min_rating == 0 and max_rating == 1:
        model = ALS.trainImplicit(y_training_data, rank=10, iterations=5)
    else:
        model = ALS.train(y_training_data, rank=10, iterations=5)

    # predict all user, item pairs
    item_ids = y_training_data.map(lambda row: row[1]).distinct()
    user_ids = y_training_data.map(lambda row: row[0]).distinct()
    user_item_combo = user_ids.cartesian(item_ids).coalesce(num_partitions)

    predicted = model.predictAll(user_item_combo.map(lambda x: (x[0], x[1])))

    norm_predictions = predicted.map(
        lambda p: (p[0], p[1],
                   rechelp.squish_preds(p[2], min_rating, max_rating)))

    return norm_predictions
def recommend(sc, rawUserArtistData, rawArtistData, rawArtistAlias):
    """Train an implicit ALS model and print sample recommendations.

    Prints the artist records recommended to user 2093760, then one line of
    five recommended product ids for each of up to 100 users taken from the
    training data.
    """
    bArtistAlias = sc.broadcast(buildArtistAlias(rawArtistAlias))
    allData = buildRatings(rawUserArtistData, bArtistAlias).cache()
    model = ALS.trainImplicit(ratings=allData, rank=50, iterations=10,
                              lambda_=1.0, alpha=40.0)
    allData.unpersist()

    userID = 2093760
    recommendations = model.recommendProducts(userID, 5)
    # Materialized as a set: on Python 3 ``map`` returns a one-shot iterator,
    # which cannot be reused safely inside the Spark closure below.
    recommendedProductIDs = set(rec.product for rec in recommendations)

    artistByID = buildArtistByID(rawArtistData)
    recommendedArtists = artistByID.filter(
        lambda artist: artist[0] in recommendedProductIDs).collect()
    for val in recommendedArtists:
        print(val)

    someUsers = allData.map(lambda item: item.user).distinct().take(100)
    # Lists (not lazy maps) so every recommendation is computed before any
    # line is printed, as on Python 2.
    someRecommendations = [model.recommendProducts(userId, 5)
                           for userId in someUsers]
    formattedRecommendations = [
        str(recs[0].user) + " -> " + ", ".join(str(x.product) for x in recs)
        for recs in someRecommendations
    ]
    for val in formattedRecommendations:
        print(val)

    unpersist(model)
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
from pyspark.sql import SparkSession

# NOTE: despite its name, ``sc`` is a SparkSession; the SparkContext is
# reached through ``sc.sparkContext``.
sc = (SparkSession.builder
      .appName("ALSmodel")
      .getOrCreate())

# Read the comma-separated transactions and turn each line into a Rating.
data = sc.sparkContext.textFile("/tmp/transactions_andre_als")
ratings = (data
           .map(lambda l: l.split(','))
           .map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2]))))

# Fit the implicit-feedback ALS recommender.
rank = 15
numIterations = 5
model = ALS.trainImplicit(ratings, rank, iterations=numIterations, alpha=0.01)

# Score the model against the very data it was trained on.
testdata = ratings.map(lambda p: (p[0], p[1]))
predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
keyed_ratings = ratings.map(lambda r: ((r[0], r[1]), r[2]))
ratesAndPreds = keyed_ratings.join(predictions)
MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
print("Mean Squared Error = " + str(MSE))

# Persist the trained model.
model.save(sc.sparkContext, "/tmp/amolenaar/model")
longToShortLocations[str(longLocation)] = shortLocation outStr = str(shortUserID) + "," + str(shortLocation) + "," + numVisits + "\n" fpout.write(outStr) pickle.dump( longToShortLocations, open( "shortenLocations.p", "wb" )) fp.close() fpout.close() # Load and parse the data data = sc.textFile("file:///home/hadoop/RealVisitsDataShort.csv") ratings = data.map(lambda l: l.split(',')).map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2]))) # Build the recommendation model using Alternating Least Squares rank = 10 numIterations = 10 model = ALS.trainImplicit(ratings, rank, numIterations) # Evaluate the model on training data testdata = ratings.map(lambda p: (p[0], p[1])) predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2])) ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions) MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean() print("Mean Squared Error = " + str(MSE)) #Save and load model #commented out the save for now because the model already exists on hdfs #uncomment this when you are ready to train a new model! #model.save(sc, "target/tmp/myCollaborativeFilter") sameModel = MatrixFactorizationModel.load(sc, "target/tmp/myCollaborativeFilter") #parse the AskForRecsFor.csv file f = open('AskForRecsForShort.csv')
userArtistDataFile = filePath + 'user_artist_data.txt'
rawUserArtistData = sc.textFile(userArtistDataFile)

# parse Artist data file
artistDataFile = filePath + 'artist_data.txt'
rawArtistData = sc.textFile(artistDataFile)
# Pairs are indexed positionally: ``lambda (k, v): ...`` is a syntax error
# on Python 3.  A key of -1 is filtered out.
artistById = rawArtistData.map(parseArtistByIdData).filter(lambda kv: kv[0] != -1)

# parse artist alias file
artistAliasDataFile = filePath + 'artist_alias.txt'
rawArtistAliasData = sc.textFile(artistAliasDataFile)
artistAlias = rawArtistAliasData.map(parseArtistAliasData) \
    .filter(lambda kv: kv[0] != -1) \
    .collectAsMap()

# broadcast variable
bArtistAlias = sc.broadcast(artistAlias)


def processTrainData(line):
    """Parse one ``user artist count`` line into a Rating.

    The artist id is replaced by its entry in the broadcast alias map when
    one exists.
    """
    (userId, artistId, count) = map(int, line.split(' '))
    artistAliasId = bArtistAlias.value.get(artistId)
    if artistAliasId is None:
        artistAliasId = artistId
    return Rating(userId, artistAliasId, count)


trainData = rawUserArtistData.map(processTrainData).cache()
model = ALS.trainImplicit(trainData, 10)
print(model.productFeatures())
def findBestModel(data):
    """Build the recommendation model using Alternating Least Squares.

    Currently trains a single implicit ALS model with rank 1 and the
    library's default number of iterations; no search is performed.

    NOTE(review): the previous body declared ``rank = 10`` and
    ``numIterations = 20`` but never used them.  They were removed as dead
    code; behaviour is unchanged.  Confirm whether those values were meant
    to be passed to the trainer.
    """
    # TODO need to test the best configuration of model
    return ALS.trainImplicit(data, 1)
# Hyper-parameter grid.
ranks = [8, 12]
lambdas = [0.1, 1.0]
numIters = [10, 20]

# Best candidate seen so far (sentinel values until a model is accepted).
bestModel = None
bestValidationRmse = float('inf')
bestRank, bestLambda, bestNumIter = 0, -1.0, -1

# Train one model per grid point and keep the one with the lowest
# validation RMSE; that model is then scored on the test set.
for rank, lmbda, numIter in itertools.product(ranks, lambdas, numIters):
    model = ALS.trainImplicit(rec_list, rank, iterations=numIter,
                              lambda_=lmbda, alpha=0.01)
    validationRmse = computeRmse(model, validation, numValidation)
    if not (validationRmse < bestValidationRmse):
        continue
    bestModel, bestValidationRmse = model, validationRmse
    bestRank, bestLambda, bestNumIter = rank, lmbda, numIter

# RMSE of the selected model on the test set, for display.
testRmse = computeRmse(bestModel, test, numTest)
def trainModel(self):
    """Grid-search an implicit ALS model, report its errors and save the best one.

    The ratings loaded from ``self.dataFile`` are split into a training
    sample (fraction 0.8, seed 1) and the remainder for validation.  One
    model is trained per (rank, lambda, iterations, alpha) combination and
    the one with the lowest validation RMSE is kept.  That model is then
    evaluated on the complete data file and saved to ``self.modelPath``.
    """
    def emit(*parts):
        # Space-joined single string: prints the same text under Python 2
        # and Python 3 (the old mix of print statements and print calls
        # produced tuples on Python 2 and errors on Python 3).
        print(' '.join(str(part) for part in parts))

    rule = 20 * '-'

    data_triplets = self.loadRatings(self.sc, self.dataFile)

    # Split the data into 80% training data and 20% validation data
    train_triplet = data_triplets.sample(False, 0.8, seed=1).cache()
    validation_triplets = data_triplets.subtract(train_triplet).cache()

    emit(rule, 'Started Training the ALS model', rule)

    # TODO set different ranks and lambdas
    ranks = [8, 10, 12]
    lambdas = [1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0]
    numIters = [20]
    alp = [1.0, 10.0, 50.0, 100.0]

    bestModel = None
    bestValidationRMSE = float("inf")
    bestRank = 0
    bestLambda = -1.0
    bestNumIter = -1
    bestalpha = -1.0

    # Loop-invariant views of the validation set, defined once:
    # (userId, songId) pairs to predict, and the actual ratings keyed by pair.
    testdata = validation_triplets.map(lambda r: (r[0], r[1]))
    validation_actuals = validation_triplets.map(lambda r: ((r[0], r[1]), r[2]))

    for rank, lmbda, numIter, a in itertools.product(ranks, lambdas, numIters, alp):
        print("\nTraining ALS with rank = {}, Regularization parameter = {}, \n"
              "number of iterations = {}, alpha = {}".format(rank, lmbda, numIter, a))

        model = ALS.trainImplicit(train_triplet, rank, lambda_=lmbda,
                                  iterations=numIter, alpha=a)

        # predictions are keyed by (userId, songId) with the predicted rating
        predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))

        # Join the predicted ratings with actual ratings to compute RMSE
        actualsAndPredictions = validation_actuals.join(predictions)
        RMSE = math.sqrt(
            actualsAndPredictions.map(lambda r: (r[1][0] - r[1][1]) ** 2).mean())

        # Format inside the call: ``print (...).format(...)`` only worked as
        # a Python 2 print statement.
        print('\n RMSE = {} '.format(RMSE))

        if RMSE < bestValidationRMSE:
            bestModel = model
            bestValidationRMSE = RMSE
            bestRank = rank
            bestLambda = lmbda
            bestNumIter = numIter
            bestalpha = a

    print("\nThe best model was trained with Rank = {}, Regularization parameter ={} and"
          "\nNumber of Iterations = {}\n RMSE = {}, best alpha = {}"
          .format(bestRank, bestLambda, bestNumIter, bestValidationRMSE, bestalpha))

    emit(rule, 'Finished Training the ALS model', rule)

    # Evaluate the best model on the test set. Use the entire data file as test set.
    emit(rule, 'Testing on the given data file itself', rule)
    test_ratings = self.loadRatings(self.sc, self.dataFile)
    testdata = test_ratings.map(lambda p: (p[0], p[1]))
    predictions = bestModel.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
    ratesAndPreds = test_ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
    MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1]) ** 2).mean()
    # NOTE(review): this is |(|actual| - |predicted|)|, not |actual - predicted|;
    # the two differ when the values have opposite signs.  Left unchanged.
    MAE = ratesAndPreds.map(lambda r: (abs(abs(r[1][0]) - abs(r[1][1])))).mean()
    print("Mean Squared Error = " + str(MSE))
    print("Mean Absolute Error = " + str(MAE))
    emit("Root Mean Square Error = ", str(MSE ** .5))
    emit(rule, 'Testing Finished', rule)

    # Save the best model
    bestModel.save(self.sc, self.modelPath)
raw_artist_data = sc.textFile("s3://ehec2/audio_data/artist_data.txt", 5)
artist_by_id = raw_artist_data.map(lambda line: line.split('\t'))\
    .map(lambda token: tokenize(token, 1))

raw_artist_alias = sc.textFile("s3://ehec2/audio_data/artist_alias.txt")
artists_alias = raw_artist_alias.map(lambda line: line.split("\t"))\
    .map(lambda token: tokenize(token))

raw_user_artist_data = sc.textFile("s3://ehec2/audio_data/user_artist_data.txt")
user_artist_data = raw_user_artist_data.map(lambda line: line.split())\
    .map(lambda token: tokenize(token))

# Broadcast global variable for use in getting the right artist id
brdc_artists_alias = sc.broadcast(artists_alias.collectAsMap())

trainRDD = user_artist_data.map(lambda row: train_tokenize(row)).cache()

# Keyword arguments on purpose.  PySpark's positional order is
# (ratings, rank, iterations, lambda_, blocks, alpha, ...), so the former
# positional call ``trainImplicit(trainRDD, 10, 5, 0.01, 1)`` set blocks=1
# and left alpha at its default.
# NOTE(review): alpha=1.0 assumes the fifth argument was meant as alpha, as
# in the Scala signature this call mirrors -- confirm.
model = ALS.trainImplicit(trainRDD, rank=10, iterations=5, lambda_=0.01,
                          alpha=1.0)

# Spot Checking Recommendations
# Extract the IDs of artists that this user has listened to and print their
# names: search the input for this user's artist IDs, then filter the set of
# artists by these IDs so the names can be collected.

# Testing the model by looking at recommendations for user 2093760.
# To check recommendations for other users, change the id here.
test_user_id = 2093760

# Getting all artists user test_user_id has listened to
artists_for_user = user_artist_data.filter(lambda l: l[0] == test_user_id)
existing_products = set(artists_for_user.map(lambda l: l[1]).collect())

# Getting the names of artists the user has listened to
artists_for_user = artist_by_id.filter(lambda art: art[0] in existing_products)\
    .map(lambda art: art[1]).collect()
# Interactive session.  Values echoed by the shell are kept as ``# ->``
# comments so the fragment is valid Python.
(training, test) = CatRating.randomSplit([0.8, 0.2])
training.count()
# -> 1202345
test.count()
# -> 300611

# Call the ALS.trainImplicit method to train the model
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

rank = 10
numIterations = 10
# (was ``model = model = ...``; the duplicated assignment target is removed)
model = ALS.trainImplicit(training, rank, numIterations, alpha=0.01)
model
# -> <pyspark.mllib.recommendation.MatrixFactorizationModel object at 0x7fb12319b810>

# Build the (user, product) pairs of the held-out test split for evaluation
testdata = test.map(lambda r: (r[0], r[1]))
type(testdata)
# -> <class 'pyspark.rdd.PipelinedRDD'>
testdata.take(5)
# -> [(278716, 1558), (1387683, 324), (639095, 1192), (240681, 646), (31895, 969)]
#rawRatings3.take(10)


# In[15]:

#ALS.extractParamMap()


# In[16]:

from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

rank = 200
numIterations = 20

# Earlier experiments, kept for reference:
#model = ALS.train(rawRatings3, rank, numIterations, 0.01)
#model = ALS.trainImplicit(rawRatings3, rank, numIterations, 0.01)

# The fourth positional argument of trainImplicit is the regularization
# parameter; it is spelled out by name here.
model = ALS.trainImplicit(rawRatings3, rank,
                          iterations=numIterations,
                          lambda_=0.03)


# In[17]:

model.recommendProducts(100000, 5)


# In[18]:

#type(model)


# In[19]:

#type(ALS)
if split:
    # randomSplit returns a plain Python list of RDDs, which has no
    # .persist(); persist each half individually.
    rand_a, rand_b = [part.persist()
                      for part in raw_data.randomSplit(weights=[0.5, 0.5], seed=99)]
    ratings_a = rand_a.map(lambda row: Rating(row[0], row[1], row[2])).persist()
    ratings_b = rand_b.map(lambda row: Rating(row[0], row[1], row[2])).persist()
else:
    ratings = raw_data.map(lambda row: Rating(row[0], row[1], row[2])).persist()

base_model_name = d+'model_'
#with open(d+'log_rmse','a') as fout:
for k in k_range:
    start = time.time()
    if split:
        model_a = ALS.trainImplicit(ratings_a, rank=k, iterations=n_iter,
                                    alpha=0.01, nonnegative=True)
        # Second half of the split (was trained on ratings_a by mistake,
        # which made both models see the same data).
        model_b = ALS.trainImplicit(ratings_b, rank=k, iterations=n_iter,
                                    alpha=0.01, nonnegative=True)
        # NOTE(review): unlike the non-split branch these two paths are not
        # prefixed with ``d`` -- confirm that is intended.
        model_a.save(sc, 'model_rand_a_'+str(k))
        model_b.save(sc, 'model_rand_b_'+str(k))

        # np.save needs the array as second argument; it was missing, so
        # both calls raised TypeError.
        artist_features_a = np.array(
            model_a.productFeatures().sortByKey().map(lambda row: row[1]).collect())
        np.save(d+"features_rand_a_{}".format(k), artist_features_a)
        artist_features_b = np.array(
            model_b.productFeatures().sortByKey().map(lambda row: row[1]).collect())
        np.save(d+"features_rand_b_{}".format(k), artist_features_b)
    else:
        model = ALS.trainImplicit(ratings, rank=k, iterations=n_iter,
                                  alpha=0.01, nonnegative=True)
        model.save(sc, d+'model_'+str(k))
        artist_features = np.array(
            model.productFeatures().sortByKey().map(lambda row: row[1]).collect())
        np.save(d+"features_{}".format(k), artist_features)
        user_features = np.array(
            model.userFeatures().sortByKey().map(lambda row: row[1]).collect())
        np.save(d+"user_features_{}".format(k), user_features)
if __name__ == "__main__":
    # Usage: <script> <ratings file> <output file>
    # Trains a rank-1 implicit ALS model on comma-separated
    # ``user,product,rating`` lines and writes the top-20 recommendations of
    # every user to the output file as one JSON array.
    if len(sys.argv) != 3:
        print(""" first argument is ratings file and second is output file """)
        # sys.exit rather than the site-provided ``exit`` helper, which is
        # not guaranteed to exist outside interactive sessions.
        sys.exit(-1)

    txtFile = sys.argv[1]
    outputFile = sys.argv[2]

    def parse_rating(line):
        # Split once instead of once per field.
        fields = line.split(",")
        return (int(fields[0]), int(fields[1]), float(fields[2]))

    sc = SparkContext(appName="ALSExample")
    try:
        ratings = sc.textFile(txtFile)
        processedRatings = ratings.map(parse_rating)
        users = ratings.map(lambda rating: int(rating.split(",")[0])) \
            .distinct().collect()

        # train model
        model = ALS.trainImplicit(processedRatings, 1, seed=10)

        outArray = [model.recommendProducts(user, 20) for user in users]
    finally:
        # Stop Spark even when loading or training fails.
        sc.stop()

    # ``with`` guarantees the file is closed; it is opened only once there
    # is something to write.
    with open(outputFile, 'w') as f:
        f.write(json.dumps(outArray))
# * k-folds (inner loop) iter_cnt = 0 for rv in rankVal: for train_index, test_index in kf: iter_cnt += 1 results = {} pair_train, pair_test = pairs[train_index], pairs[test_index] bid_train = sc.parallelize(pair_train).flatMap(lambda x: ((x[0],x[3]), (x[2],x[1]))) train = uni.union(bid_train)\ .union(wish)\ .map(lambda l: Rating(int(l[0]), int(l[1]), float(1.0))) model = ALS.trainImplicit(train, rv, numIterations, lambda_=_lambda, alpha=_alpha) # Reconstruction error testdata = train.map(lambda p: (p[0], p[1])) print "Test data len: ", testdata.count() predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2])) ratesAndPreds = train.map(lambda r: ((r[0], r[1]), r[2])).join(predictions) MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean() print "-"*50, " RANK: ", rv, "ITER: ", iter_cnt users_in_train = train.map(lambda x: x[0] ).distinct().collect() items_in_train = train.map(lambda x: x[1] ).distinct().collect() recom = [] for i,p in enumerate(pair_test):
from pyspark import SparkContext
from pyspark.mllib.recommendation import ALS
import terragon

# Train a tiny implicit ALS model on three ratings and dump it, base64
# encoded through terragon, to /tmp/test.sparkle.
sc = SparkContext()

# (user, product, rating) triples
r1, r2, r3 = (1, 1, 1.0), (1, 2, 2.0), (2, 1, 2.0)
ratings = sc.parallelize([r1, r2, r3])

model = ALS.trainImplicit(ratings, 1, seed=10)
# The prediction result is not used; the call only exercises the model.
model.predict(2, 2)

stringified_spark_model = terragon.dumps_spark_to_base64(sc, model)

with open("/tmp/test.sparkle", "w") as f:
    f.write(stringified_spark_model)
top_repos = starpairs\ .groupBy(lambda t: t[1])\ .sortBy(lambda t: len(t[1]), False)\ .map(lambda t: t[0])\ .take(sample) top_repos_rdd = sc.parallelize(top_repos) top_repos_rdd.cache() top_repos_bc = sc.broadcast(top_repos) pprint(top_repos[:5]) starpairs_filtered = starpairs.filter(lambda t: t[1] in top_repos_bc.value) starpairs_filtered.cache() # train recommendation model using alternating least squares stars_with_rating = starpairs_filtered.map(lambda t: array([t[0], t[1], 1])) model = ALS.trainImplicit(stars_with_rating, rank=1) # get all user->repo pairs without stars users_repos = users.cartesian(top_repos_rdd).groupByKey() stars_grouped = starpairs_filtered.groupByKey() unstarred = users_repos.join(stars_grouped)\ .map(lambda i: (i[0], set(i[1][0]) - set(i[1][1]) ))\ .flatMap(lambda i: [ (i[0], repo) for repo in i[1] ] ) # predict unstarred user-repo pairs. predictions = model.predictAll(unstarred) # for each user, associate the 5 repos with the highest predicted rating. top = predictions\ .map(lambda t: (t[0], (t[1],t[2])))\ .groupByKey()\
and 0 indicates that the user didn't listen to the song. ''' # calls format_line to convert strings to integers based off of the hash table. Gets all songs(ints) for a user as an array data = values.map(lambda x: format_line(x, UID_INDEX, int(indexOfField), value_hash, id_hash)).distinct().groupByKey() # or cache() # consider all songs that a user hasn't listened to and store in a numpy array user_arrays = data.map(lambda x: dichotomize(x)) # create (user.id, np.array) format ====> (user.id, product.id, viewed) double lambda FTW ratings = user_arrays.flatMap(lambda user: map(lambda(i, x): array([float(user[0]), float(i), float(x)]), enumerate(user[1]))) # sample without replacement training, test = make_training_and_test(data) # train the ALS model w/ 1 latent variables, and with 10 iterations (should be enough to converge ALS) model = ALS.trainImplicit(ratings, 1, 10) ''' ALS Prediction for Model: * For a given user ID, get the top n-recommendations * Due to the sparsity of the matrix, many of the recommendations have very low confidence scores ''' prompt = "\n Enter the user ID for the user you want to get recommendations for \n" print prompt id_to_recommend = float(raw_input) try : id_to_num = id_hash[id_to_recommend] except : print "ID %s is currently not stored in the predictive model :(" % id_to_recommend break