Example 1
def Tester(spark, model, df_test, rank, regParam, alpha, K=500):
    #df_test = spark.read.parquet(formatted_test_address)
    targetUsers = df_test.select("user_id_numeric").distinct()
    userRecs = model.recommendForUserSubset(targetUsers, K)
    userRecs = userRecs.select("user_id_numeric",
                               "recommendations.track_id_numeric",
                               "recommendations.rating")

    # need to get ordered list of track_id based on counts groupby individual users.
    # reference:https://stackoverflow.com/questions/46580253/collect-list-by-preserving-order-based-on-another-variable
    w = Window.partitionBy("user_id_numeric").orderBy(df_test['count'].desc())
    labels = df_test.withColumn('ActualRanking',
                                F.collect_list("track_id_numeric").over(w))
    labels = labels.select(['user_id_numeric', 'ActualRanking'
                            ]).dropDuplicates(['user_id_numeric'])

    # Get the metrics
    # predictionsAndlabels should be an RDD of (predicted ranking, ground truth set) pairs.
    # reference: https://spark.apache.org/docs/2.2.0/api/python/pyspark.mllib.html#pyspark.mllib.evaluation.RankingMetrics
    predictionsAndlabels = userRecs.join(
        labels, [labels.user_id_numeric == userRecs.user_id_numeric],
        'left').select('track_id_numeric', 'ActualRanking')
    metricsRank = RankingMetrics(predictionsAndlabels.rdd)

    print("------------------------------------------")
    print("Params: Rank %f | regParam %f | alpha = %f" %
          (rank, regParam, alpha))
    print("p(15)   %.8f" % metricsRank.precisionAt(15))
    print("p(500)   %.8f" % metricsRank.precisionAt(500))
    print("MAP  %.8f" % metricsRank.meanAveragePrecision)
    print("nDCG %.8f" % metricsRank.ndcgAt(K))
    return
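
As the reference above notes, RankingMetrics consumes an RDD of (predicted ranking, ground-truth list) pairs. A minimal, self-contained sketch of that input shape (hypothetical item ids, not taken from the function above):

from pyspark.sql import SparkSession
from pyspark.mllib.evaluation import RankingMetrics

spark = SparkSession.builder.getOrCreate()
# one pair per user: (items we recommended, items the user actually interacted with)
pairs = spark.sparkContext.parallelize([
    ([1, 2, 3, 4, 5], [1, 3, 9]),   # hypothetical user A
    ([7, 8, 9], [8]),               # hypothetical user B
])
metrics = RankingMetrics(pairs)
print(metrics.precisionAt(3), metrics.meanAveragePrecision, metrics.ndcgAt(3))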
def annoy(als_model, user_truth, test_user, sc, n_trees=10, search_k=-1):
    print('creating annoy baseline with n_trees: ' + str(n_trees), 'search_k: ' + str(search_k))
    sc = SparkContext.getOrCreate()
    factors = als_model.userFactors
    size = factors.limit(1).select(F.size('features').alias('calculation')).collect()[0].calculation
    time_start = time()
    annoy_list = AnnoyIndex(size)
    for row in factors.collect():
        annoy_list.add_item(row.id, row.features)
    annoy_list.build(n_trees)
    annoy_list.save('./home/hj1325/final-project-final-project/annoy_list' + str(n_trees) + '_k_' + str(search_k) +
                    '.ann')
    recommend_list = [(user.user_label, annoy_list.get_nns_by_item(int(user.user_label), 500)) for user in
                      test_user.collect()]
    temp = sc.parallelize(recommend_list)
    print('recommendations has been created')
    recommend = spark.createDataFrame(temp, ['user_label', 'recommendation'])
    predictions = recommend.join(user_truth, recommend.user_label == user_truth.user_label, 'inner')

    score = predictions.select('recommendation', 'truth').rdd.map(tuple)
    metrics = RankingMetrics(score)
    precision = metrics.precisionAt(500)
    mean_average_precision = metrics.meanAveragePrecision
    print('time taken: ' + str(time() - time_start))
    print('precision at 500: ' + str(precision))
    print('mean average precision: ' + str(mean_average_precision))
    annoy_list.unload()
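
The saved index can be reloaded later without rebuilding it; a small sketch, assuming the vector size matches the ALS factor dimension and the index was built with Annoy's default angular metric (path and user id are illustrative):

from annoy import AnnoyIndex

size = 10  # must match the factor dimension used when the index was built
index = AnnoyIndex(size, 'angular')  # 'angular' assumed; adjust if built differently
index.load('./home/hj1325/final-project-final-project/annoy_list10_k_-1.ann')
neighbours = index.get_nns_by_item(42, 500)  # top-500 approximate neighbours of user 42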
Example 3
def main(spark, model_file):

    ###
    train = spark.read.parquet('./train.parquet')
    #validation = spark.read.parquet('./validation.parquet')
    test = spark.read.parquet('./test.parquet')

    train_model = ALSModel.load(model_file)
    users = test.select('convert_user_id').distinct()
    user_recs = train_model.recommendForUserSubset(users, 500)
    prediction_df = user_recs.select('convert_user_id',
                                     'recommendations.convert_track_id')
    true_df = test.groupBy('convert_user_id').agg(
        expr('collect_list(convert_track_id) as true_items'))

    prediction_df.write.parquet('./recommendation_count.parquet')
    true_df.write.parquet('./true_count.parquet')

    prediction_rdd = prediction_df.join(true_df, 'convert_user_id') \
    .rdd \
    .map(lambda row: (row[1], row[2]))

    rankingMetrics = RankingMetrics(prediction_rdd)
    print(rankingMetrics.meanAveragePrecision)
    print(rankingMetrics.precisionAt(500))
Example 4
def main(spark, test_file, index_file, model_file):
    # Load the dataframe
    test = spark.read.parquet(test_file)
    indexer = PipelineModel.load(index_file)
    #transform user and track ids
    test = indexer.transform(test)
    #select distinct users for recommendations
    #testUsers = test.select("userNew").distinct().alias("userCol")
    #establish "ground truth"
    groundTruth = test.groupby("userNew").agg(
        F.collect_list("trackNew").alias("truth"))
    print("created ground truth df")
    alsmodel = ALSModel.load(model_file)
    rec = alsmodel.recommendForAllUsers(500)
    print("created recs")
    predictions = rec.join(groundTruth, rec.userNew == groundTruth.userNew,
                           'inner')

    scoreAndLabels = predictions.select('recommendations.trackNew',
                                        'truth').rdd.map(tuple)
    metrics = RankingMetrics(scoreAndLabels)
    precision = metrics.precisionAt(500)
    map_out = metrics.meanAveragePrecision
    print(f"precision at 500: {precision}")
    print(f"map : {map_out}")
Example 5
def main(spark, model_file, data_file):
    '''Main routine for supervised evaluation

    Parameters
    ----------
    spark : SparkSession object

    model_file : string, path to store the serialized model file

    data_file : string, path to the parquet file to load
    '''

    ###
    # TODO: YOUR CODE GOES HERE
    df = spark.read.parquet(data_file)
    model = PipelineModel.load(model_file)
    predictions = model.transform(df)
    #predictions_sorted = predictions.orderBy(desc('count')).limit(500).collect()
    print("smile")

    scoreAndLabels = predictions.select('prediction','count')
    print("smile again")
    scoreAndLabels.show(5)
    scoreAndLabels = scoreAndLabels.rdd 
    print("I am smiling")
    metrics = RankingMetrics(scoreAndLabels)

    precision = metrics.precisionAt(500)
    print(precision)
Example 6
def evaluateTopk(model,data,top_k=500):
    '''
    Input:
    data: RDD
        - user, product (book_id), rating
    '''
    truth=spark.createDataFrame(data).groupby("user").agg(F.collect_set("product"))
    print("Getting Predictions...")
    tmp1=model.recommendProductsForUsers(top_k).map(lambda r: [r[0],[k.product for k in r[1]]])
    predictions=spark.createDataFrame(tmp1,["user","predictions"])


    print("Predictions and Labels...")
    k=predictions.join(truth,truth.user==predictions.user)
    final=k.rdd.map(lambda r: [r[1],r[3]])
    metrics=RankingMetrics(final)

    print("\nCalculate NDCG at {}...".format(top_k))
    res1=metrics.ndcgAt(top_k)
    print("NDCG at {}: {}".format(top_k,res1))

    print("\nCalculate MAP...")
    res2=metrics.meanAveragePrecision
    print("MAP: {}".format(res2))

    print("\nCalculate Precision at {}...".format(top_k))
    res3=metrics.precisionAt(top_k)
    print("Precision at {}: {}".format(top_k,res1))

    return res1,res2,res3
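
evaluateTopk expects an MLlib matrix-factorization model (it calls recommendProductsForUsers) plus an RDD with user/product fields; a hedged usage sketch with tiny hypothetical data, assuming sc and spark are already in scope as in the snippet above:

from pyspark.mllib.recommendation import ALS, Rating

ratings = sc.parallelize([
    Rating(1, 10, 4.0), Rating(1, 20, 2.0),
    Rating(2, 10, 5.0), Rating(2, 30, 3.0),
])
mf_model = ALS.train(ratings, rank=5, iterations=5)
ndcg, mean_ap, prec = evaluateTopk(mf_model, ratings, top_k=2)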
Example 7
def main(spark, txt):
    model = ALSModel.load('hdfs:/user/jm7955/' + args.model)
    distinct_users = spark.read.parquet('hdfs:/user/jm7955/%s.parquet' %
                                        args.distinct)

    print("distinct_users")
    print('finished writing in %d seconds' % int(timer() - start))
    #distinct_users.show()
    labels = spark.read.parquet('hdfs:/user/jm7955/%s.parquet' % args.labels)
    print("labels")
    #labels.show()
    print('finished writing in %d seconds' % int(timer() - start))

    predictions = model.recommendForUserSubset(distinct_users, 500)\
        .select('user', F.col('recommendations.item').alias('item'))
    print("predictions")
    #predictions.show()
    print('finished writing in %d seconds' % int(timer() - start))
    predictionsAndLabels = predictions.join(
        labels, ["user"], "inner").rdd.map(lambda tup: (tup[1], tup[2]))
    print("predictionsAndLabels")
    print('finished writing in %d seconds' % int(timer() - start))

    metrics = RankingMetrics(predictionsAndLabels)
    print('finished writing in %d seconds' % int(timer() - start))

    file = open(txt, 'w')

    file.write('metrics.meanAveragePrecision: %s\n' %
               metrics.meanAveragePrecision)
    file.write('metrics.precisionAt(500) %s\n' % metrics.precisionAt(500))
    file.write('metrics.ndcgAt(500) %s\n' % metrics.ndcgAt(500))
    file.close()
def main(spark, model_file, test_file):
    '''
    Parameters
    ----------
    spark : SparkSession object
    data_file : string, path to the parquet file to load
    model_file : string, path to store the serialized model file
    '''

    # Load the parquet file
    test = spark.read.parquet(test_file)
    test = test.sort('user', ascending=False)
    test.createOrReplaceTempView('test_table')
    model = ALSModel.load(model_file)

    user_subset = test.select("user").distinct()
    user_subset = model.recommendForUserSubset(user_subset, 500)

    user_subset = user_subset.select("user",
                                     col("recommendations.item").alias("item"))
    user_subset = user_subset.sort('user', ascending=False)
    print("sort user")
    predictionAndLabels = user_subset.join(
        test, ["user"], "inner").rdd.map(lambda tup: (tup[1], tup[2]))
    print("joined predictions and counts")

    metrics = RankingMetrics(predictionAndLabels)
    print("made metrics")
    MAP = metrics.meanAveragePrecision
    precision = metrics.precisionAt(500)
    ndcg = metrics.ndcgAt(500)

    print('MAP: %f' % MAP)
    print('Precision: %f' % precision)
    print('NDCG: %f' % ndcg)
Example 9
def annoy(alsmodel, groundTruth, testUsers, sc, n_trees=10, search_k=-1):
    print(f"annoy index version with n_trees: {n_trees}, search_k: {search_k}")
    sc = SparkContext.getOrCreate()
    userfactors = alsmodel.userFactors
    size = userfactors.limit(1).select(
        F.size("features").alias("calc_size")).collect()[0].calc_size
    start_time = time()
    a = AnnoyIndex(size)
    for row in userfactors.collect():
        a.add_item(row.id, row.features)
    a.build(n_trees)
    a.save("./anns/annoy_t" + str(n_trees) + "_k_" + str(search_k) + ".ann")
    rec_list = [(u.userNew, a.get_nns_by_item(int(u.userNew), 500))
                for u in testUsers.collect()]
    temp = sc.parallelize(rec_list)
    print("created recs")
    rec = spark.createDataFrame(temp, ["userNew", "recs"])
    predictions = rec.join(groundTruth, rec.userNew == groundTruth.userNew,
                           'inner')

    scoreAndLabels = predictions.select('recs', 'truth').rdd.map(tuple)
    metrics = RankingMetrics(scoreAndLabels)
    precision = metrics.precisionAt(500)
    MAP = metrics.meanAveragePrecision
    print(f"time elapsed: {time()-start_time}s")
    print(f"precision at 500: {precision}")
    print(f"MAP: {MAP}")
    a.unload()
Example 10
def mAPandprecisionatK(spark, model, k, labels, user_ids):
    '''
    Function to print the metric meanAveragePrecision and Precisionatk

    Parameters
    ----------
    spark : spark session object
    model: type-MLlib model: developed model
    k: type-int: Top-k predictions for every user
    labels: type-RDD: actual labels (ground truth) used for evaluation
    user_ids: user_ids to recommend products for

    return
    ----------
    None
    '''
    recs = []
    for uid in user_ids:
        # recommend k products for each user
        temp_recs = model.recommendProducts(uid.user_id, k)
        # collect only the book_ids from the recommendations
        recs.append([temp_rec.product for temp_rec in temp_recs])

    l = labels.map(lambda tup: float(tup[1])).collect()
    rdd = spark.sparkContext.parallelize([(recs, l)])
    m = RankingMetrics(rdd)
    print("meanAveragePrecision {}".format(m.meanAveragePrecision))
    print("Precision at K for K ={} is {}" .format(k, m.precisionAt(k)))
Example 11
def evaluation(df, model, ks):
	'''
	Evaluate the model.
	ks: a list of parameter k used in precision at k and NDCG at k.
	'''

	print(' Make predictions...')
	predictions = model.recommendForUserSubset(df, 500)

	print(' Prepare ground truth set and predicted set...')
	labels = df.groupBy('user').agg(F.collect_set('item')).collect()
	user_pred = predictions.select('user','recommendations.item').rdd.flatMap(lambda x:[x]).collect()
	labels = sorted(labels, key = lambda x: x.user)
	user_pred = sorted(user_pred, key = lambda x: x.user)
	print(' Combine ground truth set and predicted set...')
	predictionAndLabels = []
	for i in range(len(user_pred)):
		predictionAndLabels.append((user_pred[i].item, labels[i][1]))
	print(' Parallelize...')
	predictionAndLabels = sc.parallelize(predictionAndLabels, numSlices=2000)
	print(' Calculate metrics...')
	metrics = RankingMetrics(predictionAndLabels)
	eval_results = []
	eval_results.append(metrics.meanAveragePrecision)
	for k in ks:
		eval_results.append(metrics.precisionAt(k))
		eval_results.append(metrics.ndcgAt(k))

	return eval_results
Example 12
def get_rankMetrics(spark, df, trained_model, approx=False, k=500):
    """
    This function evaluates the performance of a given model on a given dataset using Ranking Metrics,
    and returns the final performance metrics.

    Parameters
    ----------
    df: DataFrame to evaluate on
    trained_model: trained model to evaluate
    approx: boolean; use ANN(approximate nearest neighbors) when True
    k: number of recommendation 
    ----------
    """
    import datetime
    import nmslib_recommend2
    import pyspark.sql.functions as F
    from pyspark.mllib.evaluation import RankingMetrics

    # change column names
    df = df.select(['user_id', 'book_id',
                    'rating']).toDF('user', 'item', 'rating')

    # mark an item as relevant if its rating is >= 3
    fn = F.udf(lambda x: 1.0 if x >= 3 else 0.0, 'double')
    df = df.withColumn('rating', fn(df.rating))
    relevant = df[df.rating == 1.0].groupBy('user').agg(F.collect_list('item'))

    # recommend k items for each user
    print("recommendation time comparison start: ",
          datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    if approx:
        recommend = nmslib_recommend2.nmslib_recommend(spark, df,
                                                       trained_model, k)
        recommend = spark.createDataFrame(recommend, ["user", "recommend"])
        joined = recommend.join(relevant, on='user')
        rec_and_rel = []
        for user, rec, rel in joined.collect():
            rec_and_rel.append((rec, rel))
    else:
        userSubset = relevant.select('user')
        recommend = trained_model.recommendForUserSubset(userSubset, k)
        joined = recommend.join(relevant, on='user')
        rec_and_rel = []
        for user, rec, rel in joined.collect():
            predict_items = [i.item for i in rec]
            rec_and_rel.append((predict_items, rel))
    print("recommendation time comparison end: ",
          datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

    # Compute metrics
    rec_and_rel_rdd = spark.sparkContext.parallelize(rec_and_rel)
    metric_class = RankingMetrics(rec_and_rel_rdd)

    ndcg = metric_class.ndcgAt(k)
    map_ = metric_class.meanAveragePrecision
    pk = metric_class.precisionAt(k)

    return print("NDCG:", ndcg, "\nMAP:", map_, "\nPrecision:", pk)
Example 13
    def precision_at_k(self, k):
        """
        Calculate precision at k for the predicted rankings

        :param k: int, calculate precision at k
        :return : float, precision at k
        """
        rank = self.pred_rankings.rdd.map(lambda tup: (tup[2], tup[1]))
        metrics = RankingMetrics(rank)
        return metrics.precisionAt(k)
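
precision_at_k assumes self.pred_rankings has the ground-truth list in column 1 and the predicted list in column 2, so tup[2] (predicted) comes first and tup[1] (truth) second, matching RankingMetrics' expected order. A minimal hedged sketch of such a frame, with illustrative column names and assuming a spark session is available:

pred_rankings = spark.createDataFrame(
    [(1, [10, 20], [20, 30, 10]),   # user, true items, predicted items
     (2, [40], [40, 50])],
    ['user', 'true_items', 'predicted_items'],
)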
Example 14
def basic_rec_val(spark, dirname, rank, regParam, k, random_seed):

    val_set = spark.read.parquet(f'{dirname}/val.parquet')

    print(
        f'Validating on model with rank = {rank} and regParam = {regParam} trained using {dirname} data ...'
    )

    # load corresponding trained model
    model = ALSModel.load(f'{dirname}/{rank}_{regParam}_model')

    # computing RMSE on validation set
    predictions = model.transform(val_set)
    evaluator = RegressionEvaluator(metricName='rmse',
                                    labelCol='rating',
                                    predictionCol='prediction')
    rmse = evaluator.evaluate(predictions)

    print(f'rmse: {rmse}')

    print(f'Constructing top {k} books recommended to per user ...')
    val_users = val_set.select('user_id').distinct()

    start_time = time.time()

    perUserPredictedTopKItemsDF = model.recommendForUserSubset(val_users, k)

    myudf = udf(extract_item, ArrayType(IntegerType()))
    perUserPredictedTopKItemsDF = perUserPredictedTopKItemsDF.withColumn(
        'predictions',
        myudf(perUserPredictedTopKItemsDF['recommendations'])).drop(
            'recommendations')

    print('Constructing actual books per user ...')
    perUserActualItemsDF = val_set.filter(
        column('rating') >= 3.0).groupBy('user_id').agg(
            expr('collect_list(book_id) as book_ids'))

    print('Constructing Ranking Metrics ...')
    perUserItemsRDD = perUserPredictedTopKItemsDF.join(
        perUserActualItemsDF, 'user_id').rdd.map(lambda row: (row[1], row[2]))

    rankingMetrics = RankingMetrics(perUserItemsRDD)

    precisionAtK = rankingMetrics.precisionAt(k)
    mAP = rankingMetrics.meanAveragePrecision

    end_time = time.time()
    time_delta = str(datetime.timedelta(seconds=end_time - start_time))

    print(f'p@{k}: {precisionAtK}')
    print(f'mAP: {mAP}')
    print(f'run time: {time_delta}')
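
basic_rec_val relies on an extract_item helper (wrapped in a udf above) that is not shown; a minimal sketch of what it is assumed to do, namely pull the item ids out of the recommendations structs returned by recommendForUserSubset (the field name book_id is an assumption based on the rest of the example):

def extract_item(recommendations):
    # each element is a Row like (book_id, rating); keep only the ids
    return [rec.book_id for rec in recommendations]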
Example 15
def top_k_rankingmetrics(dataset=None,
                         k=10,
                         ranking_metrics="precisionAt",
                         user="******",
                         item="book_id",
                         rating="rating",
                         prediction="prediction"):
    '''
	This function is to compute the ranking metrics from predictions.
	Input:
	1. k: only evaluate the performance of the top k items
	2. ranking_metrics: precisionAt, meanAveragePrecision, ndcgAt
	3. user, item, prediction: column names; string type

	refer to https://vinta.ws/code/spark-ml-cookbook-pyspark.html
	'''
    if dataset is None:
        print("Error! Please specify a dataset.")
        return
    # prediction table
    windowSpec = Window.partitionBy(user).orderBy(col(prediction).desc())
    perUserPredictedItemsDF = dataset \
     .select(user, item, prediction, F.rank().over(windowSpec).alias('rank')) \
     .where('rank <= {}'.format(k)) \
     .groupBy(user) \
     .agg(expr('collect_list({}) as items'.format(item)))
    # actual target table
    windowSpec = Window.partitionBy(user).orderBy(col(rating).desc())
    perUserActualItemsDF = dataset \
     .select(user, item, rating, F.rank().over(windowSpec).alias('rank')) \
     .where('rank <= {}'.format(k)) \
     .groupBy(user) \
     .agg(expr('collect_list({}) as items'.format(item)))
    # join
    perUserItemsRDD = perUserPredictedItemsDF \
     .join(F.broadcast(perUserActualItemsDF), user, 'inner') \
     .rdd \
     .map(lambda row: (row[1], row[2]))
    ranking_metrics_evaluator = RankingMetrics(perUserItemsRDD)
    # get the result of the metric
    if ranking_metrics == "precisionAt":
        precision_at_k = ranking_metrics_evaluator.precisionAt(k)
        #print("precisionAt: {}".format(round(precision_at_k, 4)))
        return precision_at_k
    elif ranking_metrics == "meanAveragePrecision":
        mean_avg_precision = ranking_metrics_evaluator.meanAveragePrecision
        #print("meanAveragePrecision: {}".format(round(mean_avg_precision, 4)))
        return mean_avg_precision
    elif ranking_metrics == "ndcgAt":
        ndcg_at_k = ranking_metrics_evaluator.ndcgAt(k)
        #print("meanAveragePrecision: {}".format(round(ndcg_at_k, 4)))
        return ndcg_at_k
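
A hedged usage sketch of top_k_rankingmetrics, assuming predictions_df is the output of model.transform with the columns named below (all names are illustrative):

p_at_10 = top_k_rankingmetrics(dataset=predictions_df, k=10,
                               ranking_metrics="precisionAt",
                               user="user_id", item="book_id",
                               rating="rating", prediction="prediction")
ndcg_at_10 = top_k_rankingmetrics(dataset=predictions_df, k=10,
                                  ranking_metrics="ndcgAt",
                                  user="user_id", item="book_id")
print(p_at_10, ndcg_at_10)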
def main(spark, test_file, train_file, model_path):

    # Read data from parquet
    print('Reading parquet file ...')
    test = spark.read.parquet(test_file)
    test.createOrReplaceTempView('test')
    train = spark.read.parquet(train_file)
    train.createOrReplaceTempView('train')

    # Load the best model from training
    print('Loading model ...')
    best_model = ALSModel.load(model_path)

    # get recommendations for users in test set
    print('Evaluating model on test set ...')
    test_users = test.select("user_id").distinct()
    rec_test = best_model.recommendForUserSubset(test_users, 700)
    pred_test_700 = rec_test.select(
        rec_test.user_id,
        rec_test.recommendations.book_id.alias('rec_book_id'))

    sub_train_test = spark.sql('SELECT user_id, book_id \
                                FROM train \
                                WHERE user_id IN (SELECT DISTINCT user_id FROM test)'
                               )

    df_train_book_test = sub_train_test.groupby('user_id').agg(
        F.collect_set('book_id').alias('train_book_id'))

    df_join_test = pred_test_700.join(df_train_book_test, 'user_id')
    diff = F.udf(book_diff, ArrayType(IntegerType()))
    df_join_pred_test = df_join_test.withColumn(
        'predictions',
        diff(df_join_test.rec_book_id, df_join_test.train_book_id))
    pred_test = df_join_pred_test.select(df_join_pred_test.user_id,
                                         df_join_pred_test.predictions)

    # get true preferences of users in validation set
    label_test = test.filter(test.rating >= 3).groupby("user_id").agg(
        F.collect_list("book_id"))
    predAndLabel_test = pred_test.join(
        label_test, 'user_id').rdd.map(lambda row: (row[1], row[2]))

    # Use Mean Average Precision as evaluation metric
    metrics_test = RankingMetrics(predAndLabel_test)
    MAP_test = metrics_test.meanAveragePrecision
    pak_100_test = metrics_test.precisionAt(100)
    pak_500_test = metrics_test.precisionAt(500)
    print('\n')
    print(
        'Ranking scores of the best model on test data: MAP = {}, Precision@100 = {}, Precision@500 = {}'
        .format(MAP_test, pak_100_test, pak_500_test))
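
The routine above depends on a book_diff helper (wrapped in a udf) that is not included in the snippet; a minimal sketch, assuming it removes books the user already saw in training from the 700 candidates while preserving the recommendation order:

def book_diff(rec_book_id, train_book_id):
    # drop already-seen books, keep the ranked order of the remaining candidates
    seen = set(train_book_id)
    return [b for b in rec_book_id if b not in seen]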
def dummy_run(spark):

    from pyspark.ml.recommendation import ALS
    from pyspark.mllib.evaluation import RankingMetrics
    import pyspark.sql.functions as F
    from pyspark.sql.functions import expr

    train=spark.createDataFrame(
    [
        (82, 124, 5.0),
        (64, 123, 4.0),
        (27, 122, 3.0),
        (25, 122, 1.0),
        (12, 124, 2.0)
    ],
    ['user_id', 'book_id', 'rating'] 
    )

    val=spark.createDataFrame(
    [
        (82, 123, 5.0),
        (64, 122, 4.0),
        (27, 124, 3.0),
        (64, 123, 2.0),
        (12, 122, 4.0)
    ],
    ['user_id', 'book_id', 'rating'] 
    )

    user_id = val.select('user_id').distinct()
    true_label = val.select('user_id', 'book_id')\
                .groupBy('user_id')\
                .agg(expr('collect_list(book_id) as true_item'))

    als = ALS(rank = 3 , regParam=0.1, 
                userCol="user_id", itemCol="book_id", ratingCol='rating', 
                implicitPrefs=False, coldStartStrategy="drop")
    model = als.fit(train)

    recs = model.recommendForUserSubset(user_id, 2)
    pred_labels = recs.select('user_id','recommendations.book_id')
    pred_true_rdd = pred_labels.join(F.broadcast(true_label), 'user_id', 'inner') \
                .rdd \
                .map(lambda row: (row[1], row[2]))
    
    metrics = RankingMetrics(pred_true_rdd)
    mean_ap = metrics.meanAveragePrecision
    ndcg_at_k = metrics.ndcgAt(2)
    p_at_k= metrics.precisionAt(2)
    print('MAP: ', mean_ap , 'NDCG: ', ndcg_at_k, 'Precision at k: ', p_at_k)
    return 
Example 18
def hyperparameter_tuning(spark, train_file, val_file):

    # load in the data
    train = spark.read.parquet(train_file)
    train.createOrReplaceTempView('train')
    val = spark.read.parquet(val_file)
    val.createOrReplaceTempView('val')
    user_idxer = StringIndexer(inputCol = 'user_id', outputCol = 'user', handleInvalid = "skip")
    item_idxer = StringIndexer(inputCol = 'book_id', outputCol = 'item', handleInvalid = "skip")
    pipeline = Pipeline(stages = [user_idxer, item_idxer])
    indexers = pipeline.fit(train)
    train = indexers.transform(train)
    val = indexers.transform(val)
    val = val.withColumn('item', val['item'].cast('int'))
    val = val.withColumn('user', val['user'].cast('int'))
    val_users = val.select('user').distinct()
    val_groundtruth = val.groupby('user').agg(F.collect_list('item').alias('truth')).cache()

    # ranks to test
    # ranks = [1,2,5,10,50]
    ranks = [10]

    # regParams to test
    # lambdas = [0.01, 0.1, 1, 2, 10]
    lambdas = [0.01]

    # Set up list for results
    p = []
    iters = len(ranks) * len(lambdas)
    count = 0

    for r in ranks:
        for lam in lambdas:
            print('regParam: {}, Rank: {}'.format(lam, r))
            als = ALS(regParam = lam, rank = r,
                userCol='user', itemCol='item', seed=2020, ratingCol='rating',
                nonnegative=True, coldStartStrategy='drop',
                intermediateStorageLevel='MEMORY_AND_DISK', finalStorageLevel='MEMORY_AND_DISK')
            model = als.fit(train)
            rec = model.recommendForAllUsers(500)
            predictions = rec.join(val_groundtruth, rec.user == val_groundtruth.user, 'inner')
            predictions = predictions.select('recommendations.item', 'truth')
            predictionAndLabels = predictions.rdd.map(tuple).repartition(1000)
            metrics = RankingMetrics(predictionAndLabels)
            precision = metrics.precisionAt(500)
            MAP = metrics.meanAveragePrecision

            p.append([lam, r, MAP, precision, model, als])
            count += 1
            print('precision: {}, MAP: {}'.format(precision, MAP))
            print('done with iter {} out of {}'.format(count, iters))
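
hyperparameter_tuning accumulates [lam, r, MAP, precision, model, als] rows in p but never selects a winner; a short sketch of picking the best configuration by MAP, assuming p is returned from (or otherwise accessible outside) the function:

# choose the configuration with the highest MAP (row[2]) from the results list p
best_lam, best_rank, best_map, best_prec, best_model, best_als = max(p, key=lambda row: row[2])
print('best regParam: {}, best rank: {}, MAP: {}'.format(best_lam, best_rank, best_map))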
def baseline(als_model, user_truth, test_user):
    print('creating baseline model')
    time_start = time()
    recommend = als_model.recommendForUserSubset(test_user, 500)
    print('recommendation has been created.')
    predictions = recommend.join(user_truth, recommend.user_label == user_truth.user_label, 'inner')

    score = predictions.select('recommendations.book_label', 'truth').rdd.map(tuple)
    metrics = RankingMetrics(score)
    precision = metrics.precisionAt(500)
    mean_average_precision = metrics.meanAveragePrecision
    print('time taken: ' + str(time() - time_start))
    print('precision at 500: ' + str(precision))
    print('mean average precision: ' + str(mean_average_precision))
def main(spark, train_file, test_file, output_file):

    sys.stdout = open(output_file, 'w')

    # Read data from parquet
    print('Reading parquet files ...')
    train = spark.read.parquet(train_file)
    train.createOrReplaceTempView('train')
    test = spark.read.parquet(test_file)
    test.createOrReplaceTempView('test')

    print(
        'Recommending most popular 500 books in terms of average rating counted in training set ...'
    )

    # get recommendations for users in test set
    test_users = test.select("user_id").distinct()
    book_top500_train = spark.sql('SELECT book_id, AVG(rating) \
                                   FROM train \
                                   WHERE book_id IN(SELECT DISTINCT book_id \
                                                    FROM train \
                                                    GROUP BY book_id \
                                                    HAVING COUNT(*) >= 20) \
                                   GROUP BY book_id \
                                   ORDER BY AVG(rating) DESC \
                                   LIMIT 500')

    rec_list = book_top500_train.select(book_top500_train.book_id).agg(
        F.collect_list('book_id'))
    rec = test_users.rdd.cartesian(
        rec_list.rdd).map(lambda row: (row[0][0], row[1][0])).toDF()
    pred = rec.select(rec._1.alias('user_id'), rec._2.alias('pred'))

    print('Collecting true labels for each test user')
    # get true preferences of users in test set
    # ground truth in test set
    sub_test = spark.sql('SELECT user_id, book_id FROM test WHERE rating >= 3')
    label = sub_test.groupby('user_id').agg(
        F.collect_set('book_id').alias('label'))

    predAndLabel = pred.join(label,
                             'user_id').rdd.map(lambda row: (row[1], row[2]))

    # Use Mean Average Precision as evaluation metric
    metrics = RankingMetrics(predAndLabel)
    MAP = metrics.meanAveragePrecision
    pat500 = metrics.precisionAt(500)

    print('Scores on test set: MAP = {} and Precision at 500 = {}'.format(
        MAP, pat500))
def annoy_model(als_model,
                sc,
                groundTruth_test,
                test_users,
                n_trees=10,
                search_k=-1):
    print(f"annoy model with n_trees: {n_trees}, search_k: {search_k}")

    sc = SparkContext.getOrCreate()

    user_factors = als_model.userFactors
    size = user_factors.limit(1).select(
        F.size("features").alias("calc_size")).collect()[0].calc_size
    start_time = time()
    index_size = AnnoyIndex(size)

    for row in user_factors.collect():
        index_size.add_item(row.id, row.features)

    index_size.build(n_trees)
    index_size.save("./annoy_result/annoy_t" + str(n_trees) + "_k_" +
                    str(search_k) + ".ann")

    rec_list = [(user.user_id,
                 index_size.get_nns_by_item(int(user.user_id), 500))
                for user in test_users.collect()]

    temp = sc.parallelize(rec_list)

    print("Annoy-Recommendations (500) created for test users")

    rec = spark.createDataFrame(temp, ["user_id", "recommendations"])

    pred_test = rec.join(groundTruth_test,
                         rec.user_id == groundTruth_test.user_id, 'inner')

    predAndLabels_test_annoy = pred_test.select('recommendations',
                                                'test_truth').rdd.map(tuple)

    metrics_test_annoy = RankingMetrics(predAndLabels_test_annoy)
    precision_test_annoy = metrics_test_annoy.precisionAt(500)
    map_test_annoy = metrics_test_annoy.meanAveragePrecision

    print(f"Time taken: {time() - start_time}s")
    print(f"Precision at 500: {precision_test_annoy}")
    print(f"Mean Average Precision: {map_test_annoy}")

    index_size.unload()
Example 22
def baseline(alsmodel, groundTruth, testUsers):
    print("baseline version")
    start_time = time()
    rec = alsmodel.recommendForUserSubset(testUsers, 500)
    print("created recs")
    predictions = rec.join(groundTruth, rec.userNew == groundTruth.userNew,
                           'inner')

    scoreAndLabels = predictions.select('recommendations.trackNew',
                                        'truth').rdd.map(tuple)
    metrics = RankingMetrics(scoreAndLabels)
    precision = metrics.precisionAt(500)
    MAP = metrics.meanAveragePrecision
    print(f"time elapsed: {time()-start_time}s")
    print(f"precision at 500: {precision}")
    print(f"MAP: {MAP}")
Example 23
    def __evaluate_ranking(self, rnk_inf: SparkDF):
        test_ground_truth = self.__test.groupBy("user_id").agg(collect_list("business_id").alias("business_gt"))

        pred_with_labels = rnk_inf.join(test_ground_truth, on="user_id").drop("user_id")

        metrics = RankingMetrics(pred_with_labels.rdd)

        results = {}

        for m in self.ranking_metrics:
            metric_name = "{}@{}".format(m, self.top_k)
            if "ndcg" in m:
                results[metric_name] = metrics.ndcgAt(self.top_k)
            elif m == "precision":
                results[metric_name] = metrics.precisionAt(self.top_k)

        return results
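
__evaluate_ranking expects rnk_inf to pair each user_id with a ranked list of predicted business_ids so that, after the join and drop, each row is a (predicted list, ground-truth list) pair. A hedged sketch of producing such a frame from an ALS model (als_model and top_k are assumptions, and business_id must be the model's item column):

recs = als_model.recommendForAllUsers(top_k)
# keep the user id plus the ranked predicted business ids
rnk_inf = recs.select("user_id",
                      recs.recommendations.business_id.alias("business_pred"))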
Example 24
def recsys(spark):
    # Load data from parquet
    val = spark.read.parquet("val_set.parquet")
    test = spark.read.parquet("test_set.parquet")
    cols_to_drop = ['is_read', 'is_reviewed']
    test = test.drop(*cols_to_drop)
    val = val.drop(*cols_to_drop)

    # Load model from path
    model_path = "hdfs:/user/ago265/best_model"
    best_model = ALSModel.load(model_path)

    # Compile a list of all the books each user read
    val_users = val.select("user_id").distinct()

    val_books = val.select("user_id", "book_id")\
                                .groupBy("user_id")\
                                .agg(expr('collect_list(book_id) as books'))

    test_users = test.select("user_id").distinct()
    test_books = test.select("user_id", "book_id").groupBy("user_id").agg(expr('collect_list(book_id) as books'))


    # # Recommender System for all users at k=500
    # k = 500
    # print('Making top 500 recommendations for all users')
    # rec = best_model.recommendForAllUsers(k)

    # Recommender System for subset of users at k=10
    k = 10
    print('Making top {} recommendations for a subset of users'.format(k))
    rec = best_model.recommendForUserSubset(test_users, k)
    pred_label = rec.select('user_id','recommendations.book_id')

    # Create an RDD to evaluate with Ranking Metrics
    final_df = pred_label.join(test_books,['user_id'],'inner').select('book_id','books')
    final_rdd = final_df.rdd.map(lambda x: (x.book_id, x.books))
    
    metrics = RankingMetrics(final_rdd)
    result1 = metrics.meanAveragePrecision
    result2 = metrics.precisionAt(k)
    result3 = metrics.ndcgAt(k)
    print("MAP = ", result1)
    print("Precision at k = ", result2)
    print("NDCG at k = ", result3)
def brute_force(als_model, groundTruth_test, test_users):
    print("Normal Recommender system-Brute force")
    start_time = time()
    rec = als_model.recommendForUserSubset(test_users, 500)
    print("Normal-500 recommendations for test users generated")

    predictions_test = rec.join(groundTruth_test,
                                rec.user_id == groundTruth_test.user_id,
                                'inner')

    predAndLabels_test = predictions_test.select('recommendations.book_id',
                                                 'test_truth').rdd.map(tuple)
    metrics_test = RankingMetrics(predAndLabels_test)
    precision_test = metrics_test.precisionAt(500)
    map_test = metrics_test.meanAveragePrecision
    print(f"Time taken: {time() - start_time}s")
    print(f"Precision at 500: {precision_test}")
    print(f"Mean Average Precision: {map_test}")
Example 26
def get_val_metrics(model, val):
    preds = model.transform(val)
    recs = model.recommendForUserSubset(val, 500)
    
    top_items = recs.selectExpr('user as user', 'recommendations.item as top_items')
    true_items = val.where(val.rating >= 3).groupby('user').agg(collect_list('item').alias('true_item_list'))
    predictions_and_labels_rankings = top_items.join(true_items, how = 'inner', on = 'user')\
        .select('true_item_list', 'top_items')
    
    predictions_and_labels_rankings.write.json('val_recs.json')
    
    ranking_metrics = RankingMetrics(predictions_and_labels_rankings.cache().rdd)
    prec_at = ranking_metrics.precisionAt(500)
    mean_avg_prec = ranking_metrics.meanAveragePrecision
    ndcg = ranking_metrics.ndcgAt(500)
    
    evaluator = RegressionEvaluator(predictionCol = 'prediction', labelCol = 'rating', metricName = 'rmse')
    rmse = evaluator.evaluate(preds)
    return rmse, prec_at, mean_avg_prec, ndcg
Example 27
def main(spark):
    val_df = spark.read.parquet(
        'hdfs:/user/jm7955/test_full_indexed.parquet').drop('count')
    labels = spark.read.parquet('hdfs:/user/jm7955/%s.parquet' % args.labels)

    predictions = val_df.groupBy("item").count().orderBy(
        "count", ascending=False).limit(500).collect()
    predictions = [row.item for row in predictions]
    print("predictions")
    #predictions.show()
    print('finished writing in %d seconds' % int(timer() - start))
    predictionsAndLabels = labels.rdd.map(lambda tup: (predictions, tup[1]))
    print("predictionsAndLabels")
    print('finished writing in %d seconds' % int(timer() - start))

    metrics = RankingMetrics(predictionsAndLabels)
    print('finished writing in %d seconds' % int(timer() - start))

    print('metrics.meanAveragePrecision: %s\n' % metrics.meanAveragePrecision)
    print('metrics.precisionAt(500) %s\n' % metrics.precisionAt(500))
    print('metrics.ndcgAt(500) %s\n' % metrics.ndcgAt(500))
Example 28
def main(spark, model_file, data_file, K):
    '''Main routine for Collaborative Filtering Model testing

        Parameters
        ----------
        spark: SparkSession object

        model_file: string, path to store the model

        data_file: string, path to the parquet file to load

        K: int, evaluations are based on predictions of the top K items for each user
        '''
    testIdx = spark.read.parquet(data_file)
    model = ALSModel.load(model_file)

    users_val = testIdx.select("user_idx").distinct()

    perUserPredictedItemsDF = model.recommendForUserSubset(users_val, K)
    perUserPredictedItemsDF = perUserPredictedItemsDF.select(
        "user_idx", "recommendations.track_idx").withColumnRenamed(
            'user_idx', 'user').withColumnRenamed('track_idx', 'items')

    w2 = Window.partitionBy('user_idx').orderBy(col('count').desc())
    perUserActualItemsDF = testIdx.select(
        'user_idx', 'track_idx', 'count',
        F.rank().over(w2).alias('rank')).where(
            'rank <= {0}'.format(K)).groupBy('user_idx').agg(
                expr('collect_list(track_idx) as items')).withColumnRenamed(
                    'user_idx', 'user')

    perUserItemsRDD = perUserPredictedItemsDF.join(
        perUserActualItemsDF, 'user').rdd.map(lambda row: (row[1], row[2]))
    rankingMetrics = RankingMetrics(perUserItemsRDD)

    print("============================================")
    print("meanAveragePrecision = %.8f" % rankingMetrics.meanAveragePrecision)
    print("precisionAt(K) = %.8f" % rankingMetrics.precisionAt(K))
    print("ndcgAt(K) = %.8f" % rankingMetrics.ndcgAt(K))
Example 29
def main(spark, data_file, val_file, model_file):
    # Load the dataframe
    df = spark.read.parquet(data_file)
    df = df.sample(True, 0.0001)
    val_df = spark.read.parquet(val_file)
    val_df = val_df.sample(True, 0.01)
    
    user_indexer  = StringIndexer(inputCol = "user_id", outputCol = "userNew", handleInvalid = "skip")
    track_indexer = StringIndexer(inputCol = "track_id", outputCol = "trackNew", handleInvalid = "skip")
    
    RegParam = [0.001, 0.01] # 0.1, 1, 10]
    Alpha = [0.1, 1]#5,10, 100]
    Rank = [5,10] #50,100,1000]
    sc = spark.sparkContext
    PRECISIONS = {}
    count = 0
    for i in RegParam:
        for j in Alpha:
            for k in Rank:
                print(f"i: {i}, j: {j}, k: {k}")
                als = ALS(maxIter=5, regParam = i, alpha = j, rank = k, \
                          userCol="userNew", itemCol="trackNew", ratingCol="count",\
                          coldStartStrategy="drop")
                pipeline = Pipeline(stages = [user_indexer, track_indexer, als]) 
                model = pipeline.fit(df)
                alsmodel = model.stages[-1]
                rec = alsmodel.recommendForAllUsers(500)
                rec.show(10)
                # build per-user ground truth from the validation set using the fitted indexers
                val_trans = model.stages[1].transform(model.stages[0].transform(val_df))
                val_trans = val_trans.withColumn('trackNew', val_trans['trackNew'].cast('int'))
                truth = val_trans.groupBy('userNew').agg(F.collect_list('trackNew').alias('truth'))
                scoreAndLabels = rec.join(truth, 'userNew') \
                    .select('recommendations.trackNew', 'truth').rdd.map(tuple)
                metrics = RankingMetrics(scoreAndLabels)
                precision = metrics.precisionAt(500)
                PRECISIONS[precision] = model
                count += 1
                print(count)
                print(precision)
def main(spark, model_file, test_file):
    test_data = spark.read.parquet(test_file)
    als_model_tuned = ALSModel.load(model_file)

    print("Imported trained model and test data sets")

    #generating true values of book_id for each user_id
    groundTruth_test = test_data.groupby("user_id").agg(
        F.collect_list("book_id").alias("test_truth"))
    print("Created ground truth df for test set")

    # user_test_list=spark.sql('select distinct user_id from groundTruth_val where user_id=14')
    # rec = als_model_normal.recommendForUserSubset(user_test_list,500)

    #generating recs
    rec = als_model_tuned.recommendForAllUsers(500)
    print("500 recommendations for all users generated")

    #creating dataframe to have both true values and predicted values
    predictions_test = rec.join(groundTruth_test,
                                rec.user_id == groundTruth_test.user_id,
                                'inner')

    #coverting to rdd for RankingMetrics()
    predAndLabels_test = predictions_test.select('recommendations.book_id',
                                                 'test_truth').rdd.map(tuple)

    print("starting ranking metrics for test data")
    metrics_test = RankingMetrics(predAndLabels_test)

    #calculating metrics
    precision_test = metrics_test.precisionAt(500)
    map_test = metrics_test.meanAveragePrecision
    ndcg_test = metrics_test.ndcgAt(500)

    print('Test set , Precision at 500: {}'.format(precision_test))
    print('Test set , Mean Average Precision : {}'.format(map_test))
    print('Test set, ndcgAt500 : {}'.format(ndcg_test))
Example 31
    
ratings_train = train.map(lambda r: parseLine(r))
ratings_test = test.map(lambda r: parseLine(r))

sample_test = ratings_test.sample(False,0.1) # take a sample of the test users
sample_test.count() # number of test users used in this example
test_users = sample_test.map(lambda x: x.user).collect()

model = ALS.trainImplicit(ratings_train, 10, 10)

recs={}
for u in test_users:
    rec = model.recommendProducts(u,10)
    recs[u] = [r.product for r in rec]  # keep only the recommended item ids
   
groundTruth = {}
userItemTestRDD = sample_test.map(lambda x: (x.user,x.product)) 
trueRec = userItemTestRDD.groupByKey().collect()
for x in trueRec:
    groundTruth[x[0]]=list(x[1]) 
    
predictionsAndLabels = []
for u in test_users:
    predictionsAndLabels.append((recs[u],groundTruth[u]))

predictionsAndLabelsRDD = sc.parallelize(predictionsAndLabels)

metrics = RankingMetrics(predictionsAndLabelsRDD)

metrics.precisionAt(5)
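
The script above assumes a parseLine helper defined elsewhere; a minimal sketch, assuming comma-separated user,item,count fields (the real file format may differ):

from pyspark.mllib.recommendation import Rating

def parseLine(line):
    # hypothetical format: 'user,item,count' -> Rating(user, product, rating)
    user, item, count = line.split(',')
    return Rating(int(user), int(item), float(count))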