from pyspark.ml.recommendation import ALS


def train_model(df, target_col, parameters, path):
    '''
    Train an ALS model for a specific target column.

    Parameters
    ----------
    df : Spark DataFrame
        DataFrame to use for training.
    target_col : str
        Name of the target column to use as the rating.
    parameters : dict
        Parameters to use for the ALS model (maxIter, regParam, rank).
    path : str
        Path on HDFS to save the model to.

    Returns
    -------
    None
        The fitted model is saved under `path`.
    '''
    maxIter = parameters["maxIter"]
    regParam = parameters["regParam"]
    rank = parameters["rank"]

    model = ALS(maxIter=maxIter,
                regParam=regParam,
                rank=rank,
                userCol="user",
                itemCol="tweet",
                ratingCol=target_col,
                coldStartStrategy="nan",
                implicitPrefs=True).fit(df)

    model.save(path + target_col + "_als_model")
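# Illustrative call of train_model; the input parquet path, the parameter
# values, and the output path below are assumptions, not part of the original code.
params = {"maxIter": 10, "regParam": 0.1, "rank": 16}
interactions = spark.read.parquet("hdfs:///data/interactions.parquet")  # assumed input data
train_model(interactions, "like", params, "hdfs:///models/")
# The saved model could later be reloaded with:
#   from pyspark.ml.recommendation import ALSModel
#   ALSModel.load("hdfs:///models/like_als_model")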
def main(spark, rank, regParam, path, fraction):
    train = spark.read.parquet(
        "{}/data/processed/train_{}.parquet".format(path, fraction))

    als = ALS(rank=rank,
              maxIter=5,
              seed=42,
              regParam=regParam,
              userCol='user_id',
              itemCol='book_id',
              ratingCol='rating',
              coldStartStrategy="drop")

    print("Training ALS model with rank {} and regularization {} on {} of the data..."
          .format(rank, regParam, fraction))
    model = als.fit(train)

    temp_path = "/ALS_{}_{}_{}".format(rank, regParam, fraction)

    als_path = temp_path + "/als"
    print("Saving model...")
    als.save(path + "/models" + als_path)

    model_path = temp_path + "/als_model"
    model.save(path + "/models" + model_path)
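# Illustrative driver for main(); the app name, base path, data fraction, and
# hyperparameter values are assumptions, not part of the original script.
if __name__ == "__main__":
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.appName("als_training").getOrCreate()
    main(spark, rank=10, regParam=0.1, path="hdfs:///user/project", fraction="0.25")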
from pyspark.sql.functions import col, explode

# Run the model on the test data
test_predictions = best_model.transform(test)
RMSE = evaluator.evaluate(test_predictions)
print("RMSE value is", "%.15f" % RMSE)
test_predictions.show()

# Generate the top-10 recommendations for every user
n_recommendations = best_model.recommendForAllUsers(10)
n_recommendations.limit(10).show()

# Flatten the recommendations array into one row per (user, package, rating)
n_recommendations = n_recommendations \
    .withColumn("rec_exp", explode("recommendations")) \
    .select('userId', col("rec_exp.packageId"), col("rec_exp.rating"))
n_recommendations.limit(10).show()

# Inspect the recommendations for a single user
n_recommendations_550031373 = n_recommendations.join(
    packages, on='packageId').filter('userId = 550031373')

print("category of recommended items for user 550031373")
n_recommendations_550031373.join(ratings_orig, on='packageId').filter(
    ratings_orig.userId == '550031373').limit(100).show()

print("category of originally scored items before recommendation for user 550031373")
ratings_orig.join(packages, on='packageId').filter(
    ratings_orig.userId == '550031373').limit(100).show()

# Save the estimator and the fitted model to the cluster as files
als.save('../models/FlexiGYMImplicitRecommender_ALS')
model.save('../models/FlexiGYMImplicitRecommender_ALSModel')
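# Sketch of reloading the fitted model saved above and scoring a subset of
# users with recommendForUserSubset; the user DataFrame is an assumption,
# not part of the original script.
from pyspark.ml.recommendation import ALSModel

reloaded_model = ALSModel.load('../models/FlexiGYMImplicitRecommender_ALSModel')
some_users = test.select('userId').distinct().limit(5)  # assumed user subset
reloaded_model.recommendForUserSubset(some_users, 10).show(truncate=False)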
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

(training, test) = ratings.randomSplit([0.8, 0.2])

als = ALS(maxIter=5, regParam=0.01,
          userCol="uid", itemCol="urlid", ratingCol="label")
model = als.fit(training)

# Evaluate the model by computing the RMSE on the test data.
# Note: without coldStartStrategy="drop", users/items unseen in training
# produce NaN predictions and the RMSE itself can be NaN.
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse",
                                labelCol="label",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

predictions.head(5)
predictions.write.csv(
    "gs://dataproc-0e3e0110-db09-4037-98cc-dc355651aba0-asia-southeast1/tensorflow/data/picfeed/predictions_test"
)
print("finish predictions")

# Save and load the ALS estimator
als_path = "gs://dataproc-0e3e0110-db09-4037-98cc-dc355651aba0-asia-southeast1/tensorflow/data/picfeed/als_model_test"
als.save(als_path)
als2 = ALS.load(als_path)
print("finish load")
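# The snippet above round-trips only the ALS estimator; the fitted ALSModel
# can be persisted and reloaded the same way. The save path below is an
# assumption, not part of the original script.
from pyspark.ml.recommendation import ALSModel

model_path = als_path + "_fitted"  # assumed location for the fitted model
model.save(model_path)
model2 = ALSModel.load(model_path)
predictions2 = model2.transform(test)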