def lr_tune(train_data, validation_data, val_true_list, regParam, netParam):
    """Grid-search a LinearRegression over regParam x elasticNetParam.

    Every (reg, net) pair is fitted on ``train_data`` and scored on
    ``validation_data`` two ways: RMSE over all predictions (negative
    predictions are clipped to 0 first), and ``MAP.getMAP`` over each user's
    highest-ranked predictions. One progress line is printed per pair and a
    two-line summary is printed at the end.

    Args:
        train_data: DataFrame passed to ``LinearRegression.fit``; the estimator
            is configured to read the 'idf_features' and 'rating' columns.
        validation_data: DataFrame passed to ``model.transform``; the scoring
            code additionally reads its 'rating' and 'user_id' columns.
        val_true_list: Ground truth forwarded unchanged to ``MAP.getMAP``.
        regParam: Iterable of ``regParam`` values to try.
        netParam: Iterable of ``elasticNetParam`` values to try.

    Returns:
        Tuple ``(best_model_rmse, best_model_map)``: the fitted model with the
        lowest validation RMSE and the fitted model with the highest
        validation MAP. Either entry is ``None`` when no configuration was
        evaluated (empty grid) or no score compared as valid.
    """
    # initial
    min_error = float('inf')
    best_reg1 = None
    best_net1 = None
    best_model_rmse = None
    max_map = 0.0
    best_reg2 = None
    best_net2 = None
    best_model_map = None

    for reg in regParam:
        for net in netParam:
            lr = LinearRegression(featuresCol='idf_features', labelCol='rating',
                                  regParam=reg, elasticNetParam=net, maxIter=200)
            model = lr.fit(train_data)
            predictions = model.transform(validation_data)
            # Clip negative predictions to 0 before scoring.
            predictions = predictions.withColumn(
                'prediction',
                when(predictions['prediction'] < 0, 0)
                .otherwise(predictions['prediction']))

            # rmse
            evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating',
                                            predictionCol='prediction')
            rmse = evaluator.evaluate(predictions)
            if rmse < min_error:
                min_error = rmse
                best_reg1 = reg
                best_net1 = net
                best_model_rmse = model

            # MAP top 25
            # NOTE(review): rank() gives tied predictions the same position, so
            # a user can keep more than 25 rows when scores tie at the cutoff.
            window = Window.partitionBy(predictions['user_id']) \
                .orderBy(predictions['prediction'].desc())
            top_predictions = predictions \
                .select('*', rank().over(window).alias('row_num')) \
                .filter(col('row_num') <= 25)
            current_map = MAP.getMAP(top_predictions, val_true_list)

            # A strict '>' against the initial 0.0 would leave best_model_map
            # as None when every configuration scores MAP == 0. Accept the
            # first configuration that ties the starting value; NaN scores
            # still fail both comparisons and are never selected.
            ties_initial = best_model_map is None and current_map >= max_map
            if ties_initial or current_map > max_map:
                max_map = current_map
                best_reg2 = reg
                best_net2 = net
                best_model_map = model

            print('regParam = {} with elasticNetParam = {}: validation RMSE is {} validation MAP is {}'.format(reg, net, rmse, current_map))

    print('The best model select by RMSE has regParam = {} with elasticNetParam = {}: RMSE = {}'.format(best_reg1, best_net1, min_error))
    print('The best model select by MAP has regParam = {} with elasticNetParam = {}: MAP = {}'.format(best_reg2, best_net2, max_map))
    return best_model_rmse, best_model_map
# Grid handed to tuning.tune_ALS_NLP. The remaining arguments (spark, train,
# val, val_true_list, num_iters, review_val_predictions) are not defined in
# this fragment and are expected to exist earlier in the file.
reg_params = [0.01, 0.05, 0.1, 0.2, 0.5]
ranks = [10, 20]
best_model_rmse, best_model_map = tuning.tune_ALS_NLP(
    spark, train, val, val_true_list,
    num_iters, reg_params, ranks,
    review_val_predictions,
)

# test performance
# NOTE(review): best_model_rmse_lr is not assigned anywhere in this fragment --
# presumably the RMSE-selected model returned by lr_tune; confirm it is defined
# before this point.
test_predictions = best_model_rmse_lr.transform(test_review_feature)
# Clip negative review-based predictions to 0, then give the column its own
# name so it can sit next to the ALS prediction after the join.
review_test_predictions = (
    test_predictions
    .withColumn(
        'prediction',
        when(test_predictions['prediction'] < 0, 0)
        .otherwise(test_predictions['prediction']),
    )
    .withColumnRenamed('prediction', 'review_prediction')
)

test_predictions = best_model_rmse.transform(test)
als_test_predictions = test_predictions.withColumnRenamed('prediction', 'als_prediction')

# Outer join keeps rows that only one of the two models scored; the review
# prediction wins whenever it is present, otherwise the ALS one is used.
total_predictions = als_test_predictions.join(
    review_test_predictions, ['user_id', 'book_id', 'rating'], 'outer'
)
total_predictions = total_predictions.withColumn(
    'total_prediction',
    when(
        total_predictions['review_prediction'].isNotNull(),
        total_predictions['review_prediction'],
    ).otherwise(total_predictions['als_prediction']),
)

# Rank every user's combined predictions (highest first) and keep ranks <= 500.
window = Window.partitionBy(total_predictions['user_id']).orderBy(
    total_predictions['total_prediction'].desc()
)
top_predictions = total_predictions.select(
    '*', rank().over(window).alias('row_num')
).filter(col('row_num') <= 500)

# RMSE is computed on the retained top-ranked rows only.
evaluator = RegressionEvaluator(
    metricName='rmse', labelCol='rating', predictionCol='total_prediction'
)
rmse_test = evaluator.evaluate(top_predictions)

# Ground truth for MAP: each user's test rows ranked by their actual rating.
window = Window.partitionBy(test['user_id']).orderBy(test['rating'].desc())
test_true_list = test.select('*', rank().over(window).alias('true_row'))
map_score = MAP.getMAP(top_predictions, test_true_list)

print('Test set RMSE = {}, Test set MAP = {}'.format(rmse_test, map_score))
def tune_ALS_NLP(spark, train_data, validation_data, val_true_list, maxIter,
                 regParams, ranks, review_val_predictions,
                 eval_log_path='train05_review_eval.csv'):
    """Grid-search ALS and score it blended with review-based predictions.

    For every (maxIter, rank, regParam) combination an ALS model is fitted on
    ``train_data`` and applied to ``validation_data``. Its predictions are
    outer-joined with ``review_val_predictions`` on
    ('user_id', 'book_id', 'rating'); the review prediction is used where it
    is present and the ALS prediction otherwise. Each user's rows are ranked
    by that combined score, rows with rank <= 500 are kept, and RMSE and
    ``MAP.getMAP`` are computed on them. One progress line is printed and one
    CSV row is appended per combination.

    Args:
        spark: Not referenced inside this function; kept for the existing
            call signature.
        train_data: DataFrame passed to ``ALS.fit``; ALS is configured to read
            the 'user_id', 'book_id' and 'rating' columns.
        validation_data: DataFrame passed to ``als_model.transform``.
        val_true_list: Ground truth forwarded unchanged to ``MAP.getMAP``.
        maxIter: Iterable of ALS ``maxIter`` values to try.
        regParams: Iterable of ALS ``regParam`` values to try.
        ranks: Iterable of ALS ``rank`` values to try.
        review_val_predictions: DataFrame whose 'prediction' column is renamed
            to 'review_prediction' before the join.
        eval_log_path: File that receives one appended CSV row
            (maxIter, rank, regParam, rmse, map) per combination. Defaults to
            the previously hard-coded 'train05_review_eval.csv'.

    Returns:
        Tuple ``(best_model_rmse, best_model_map)``: the ALS model with the
        lowest validation RMSE and the ALS model with the highest validation
        MAP. Either entry is ``None`` when no combination was evaluated
        (empty grid) or no score compared as valid.
    """
    # initial
    min_error = float('inf')
    best_iter1 = -1
    best_rank1 = -1
    best_regularization1 = 0
    best_model_rmse = None
    max_map = 0.0
    best_iter2 = -1
    best_rank2 = -1
    best_regularization2 = 0
    best_model_map = None

    for iteration in maxIter:
        for current_rank in ranks:
            for reg in regParams:
                als = ALS(maxIter=iteration, regParam=reg, rank=current_rank,
                          userCol='user_id', itemCol='book_id', ratingCol='rating',
                          coldStartStrategy="drop", nonnegative=True)
                als_model = als.fit(train_data)
                predictions = als_model.transform(validation_data)

                # Give both prediction columns distinct names so they can
                # coexist after the join.
                review_predictions = review_val_predictions.withColumnRenamed(
                    'prediction', 'review_prediction')
                als_predictions = predictions.withColumnRenamed(
                    'prediction', 'als_prediction')
                total_predictions = als_predictions.join(
                    review_predictions, ['user_id', 'book_id', 'rating'], 'outer')
                # Prefer the review prediction; fall back to ALS when absent.
                total_predictions = total_predictions.withColumn(
                    'total_prediction',
                    when(total_predictions['review_prediction'].isNotNull(),
                         total_predictions['review_prediction'])
                    .otherwise(total_predictions['als_prediction']))

                window = Window.partitionBy(total_predictions['user_id']) \
                    .orderBy(total_predictions['total_prediction'].desc())
                top_predictions = total_predictions \
                    .select('*', rank().over(window).alias('row_num')) \
                    .filter(col('row_num') <= 500)

                # rmse
                evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating',
                                                predictionCol='total_prediction')
                rmse = evaluator.evaluate(top_predictions)
                if rmse < min_error:
                    min_error = rmse
                    best_rank1 = current_rank
                    best_regularization1 = reg
                    best_iter1 = iteration
                    best_model_rmse = als_model

                # MAP
                current_map = MAP.getMAP(top_predictions, val_true_list)
                # A strict '>' against the initial 0.0 would leave
                # best_model_map as None when every combination scores
                # MAP == 0. Accept the first combination that ties the
                # starting value; NaN scores still fail both comparisons.
                ties_initial = best_model_map is None and current_map >= max_map
                if ties_initial or current_map > max_map:
                    max_map = current_map
                    best_rank2 = current_rank
                    best_regularization2 = reg
                    best_iter2 = iteration
                    best_model_map = als_model

                # The two metric fragments are separated by a space so the
                # RMSE value does not run into the "validation MAP" label.
                print('{} latent factors and regularization = {} with maxIter {}: '
                      'validation RMSE is {} '
                      'validation MAP is {}'
                      .format(current_rank, reg, iteration, rmse, current_map))

                # Append this combination's scores so partial results survive
                # an interrupted run.
                with open(eval_log_path, 'ab') as f:
                    np.savetxt(f, [np.array([iteration, current_rank, reg, rmse, current_map])],
                               delimiter=",")

    print('\nThe best model select by RMSE has {} latent factors and '
          'regularization = {} with maxIter = {}: RMSE = {}'
          .format(best_rank1, best_regularization1, best_iter1, min_error))
    print('\nThe best model select by MAP has {} latent factors and '
          'regularization = {} with maxIter = {}: MAP = {}'
          .format(best_rank2, best_regularization2, best_iter2, max_map))
    return best_model_rmse, best_model_map