# Example 1: evaluate a saved ALS recommender with ranking metrics
def main(spark, model_file, data_file, user_file, track_file):
    """Evaluate a saved ALS model on held-out data with ranking metrics.

    Loads the trained ALS model and the saved user/track string indexers,
    indexes the test set, generates the top-500 recommendations per user,
    and prints mean average precision (MAP) to the console.

    Args:
        spark: active SparkSession.
        model_file: path to the saved ALSModel.
        data_file: path to the test interactions (parquet).
        user_file: path to the saved user StringIndexer.
        track_file: path to the saved track StringIndexer.
    """
    # Restore the trained artifacts from disk.
    recommender = ALSModel.load(model_file)
    indexer_user = StringIndexer.load(user_file)
    indexer_track = StringIndexer.load(track_file)

    # Load held-out interactions and map raw ids to numeric indices.
    # NOTE(review): the indexer pipeline is re-fit on the test set here —
    # confirm that is intended rather than reusing the training-time mapping.
    test_data = spark.read.parquet(data_file)
    index_pipeline = Pipeline(stages=[indexer_user, indexer_track])
    test_data = index_pipeline.fit(test_data).transform(test_data)

    ########### PERFORM RANKING METRICS ###########

    # Ground truth: the list of tracks each user actually interacted with.
    truth_by_user = test_data.groupBy('user_idx').agg(
        F.collect_list('track_idx').alias('track_idx'))

    # Predictions: top-500 recommended tracks for every user in the test set.
    distinct_users = test_data.select('user_idx').distinct()
    top_recs = recommender.recommendForUserSubset(distinct_users, 500)
    top_recs = top_recs.select(
        'user_idx',
        F.col('recommendations.track_idx').alias('track_idx'))

    # Pair (predicted, actual) track lists per user. Positional access relies
    # on the join's column order: user_idx, predicted tracks, actual tracks.
    pred_and_truth = top_recs\
        .join(truth_by_user, on='user_idx').rdd\
        .map(lambda row: (row[1], row[2]))

    metrics = RankingMetrics(pred_and_truth)

    # Report the result on stdout.
    print("Ranking Metrics MAP: ", metrics.meanAveragePrecision)