Example #1
def main(spark, data_file, val_file, model_file):
    # Load the dataframe
    df = spark.read.parquet(data_file)
    df = df.sample(True, 0.0001)
    val_df = spark.read.parquet(val_file)
    val_df = val_df.sample(True, 0.01)
    
    user_indexer  = StringIndexer(inputCol = "user_id", outputCol = "userNew", handleInvalid = "skip")
    track_indexer = StringIndexer(inputCol = "track_id", outputCol = "trackNew", handleInvalid = "skip")
    
    RegParam = [0.001, 0.01] # 0.1, 1, 10]
    Alpha = [0.1, 1]#5,10, 100]
    Rank = [5,10] #50,100,1000]
    sc = spark.sparkContext
    PRECISIONS = {}
    count = 0
    for i in RegParam:
        for j in Alpha:
            for k in Rank:
                print(f"i: {i}, j: {j}, k: {k}")
                als = ALS(maxIter=5, regParam = i, alpha = j, rank = k, \
                          userCol="userNew", itemCol="trackNew", ratingCol="count",\
                          coldStartStrategy="drop")
                pipeline = Pipeline(stages = [user_indexer, track_indexer, als]) 
                model = pipeline.fit(df)
                alsmodel = model.stages[-1]
                rec = alsmodel.recommendForAllUsers(500)
                rec.show(10)
                # build (recommended list, ground-truth list) pairs per user;
                # the fitted pipeline applies the same indexers to the validation data
                val_indexed = model.transform(val_df)
                truth = val_indexed.groupBy('userNew').agg(expr('collect_list(trackNew) as truth'))
                pred = rec.select('userNew', 'recommendations.trackNew')
                scoreAndLabels = pred.join(truth, on='userNew').select('trackNew', 'truth').rdd
                metrics = RankingMetrics(scoreAndLabels)
                precision = metrics.precisionAt(500)
                PRECISIONS[precision] = model
                count += 1
                print(count)
                print(precision)
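The snippets in this listing are shown without their module-level imports; a minimal header covering the DataFrame-based examples (assuming the F / expr / Window aliases used throughout) would look roughly like this:

from pyspark.sql import functions as F
from pyspark.sql.functions import expr, col, explode, collect_list, collect_set
from pyspark.sql.window import Window
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import StringIndexer
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.mllib.evaluation import RankingMetrics, RegressionMetrics
# the RDD-based snippets (ALS.train / predictAll) additionally use:
# from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating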
Example #2
    def _calculate_metrics(self):
        """Calculate ranking metrics."""
        self._items_for_user_pred = self.rating_pred

        self._items_for_user_true = (self.rating_true.groupBy(
            self.col_user).agg(
                expr("collect_list(" + self.col_item +
                     ") as ground_truth")).select(self.col_user,
                                                  "ground_truth"))

        self._items_for_user_all = self._items_for_user_pred.join(
            self._items_for_user_true, on=self.col_user).drop(self.col_user)

        return RankingMetrics(self._items_for_user_all.rdd)
def main(spark, data_file_train, data_file_val):

    start = time.time()

    # reading training and validation files
    df_train = spark.read.parquet(data_file_train)
    df_val = spark.read.parquet(data_file_val)

    window_user_ordered = Window.partitionBy('user_id').orderBy('rating')
    window_user = Window.partitionBy('user_id')

    actual_df_val = df_val.withColumn(
        'actual_books',
        F.collect_list('book_id').over(window_user_ordered)).groupBy(
            'user_id').agg(F.max('actual_books').alias('actual_books'))

    print("Datasets loaded | Time taken: {}".format(time.time() - start))

    start = time.time()

    als = ALS(maxIter=10,
              regParam=0.001,
              userCol="user_id",
              itemCol="book_id",
              ratingCol="rating",
              rank=100)
    model = als.fit(df_train)

    print("Done with model fitting | Time taken: {}".format(time.time() -
                                                            start))
    start = time.time()

    # predictions = model.transform(df_val)
    # evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
    # rmse = evaluator.evaluate(predictions)

    # print("RMSE: {}".format(rmse))

    recommendations = model.recommendForUserSubset(df_val, 500)
    userPredictions = recommendations.select(
        'user_id', F.explode('recommendations.book_id')).withColumn(
            'pred_books',
            F.collect_list('col').over(window_user)).groupBy('user_id').agg(
                F.max('pred_books').alias('pred_books'))
    predAndLabels = userPredictions.join(actual_df_val, on='user_id').select(
        'pred_books', 'actual_books')
    metrics = RankingMetrics(predAndLabels.rdd)
    score = metrics.meanAveragePrecision
    print('MAP for test data: {}'.format(score))
    print('Time taken: {}'.format(time.time() - start))
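The evaluation pattern above recurs throughout this listing: collect the ground-truth item list per user, take the top-k recommendations, join on the user column, and hand the paired lists to RankingMetrics. A minimal self-contained sketch of that core step (the function name and the user_id/book_id columns are assumptions, with model a fitted pyspark.ml ALSModel):

def ranking_map_at_k(val_df, model, k=500):
    # ground truth: one list of relevant book_ids per user
    truth = val_df.groupBy('user_id').agg(F.collect_list('book_id').alias('truth'))
    # predictions: top-k recommended book_ids per user
    users = val_df.select('user_id').distinct()
    recs = model.recommendForUserSubset(users, k)
    preds = recs.select('user_id', F.col('recommendations.book_id').alias('preds'))
    # pair (predicted list, ground-truth list) and evaluate
    pairs = preds.join(truth, on='user_id').select('preds', 'truth').rdd
    metrics = RankingMetrics(pairs)
    return metrics.meanAveragePrecision, metrics.precisionAt(k)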
def main(spark, rank, regParam, path, fraction):
    TEMP_PATH = "/models/ALS_{}_{}_{}".format(rank, regParam, fraction)
    ALS_PATH = TEMP_PATH + "/als"
    MODEL_PATH = TEMP_PATH + "/als_model"
    print("Loading model...")
    als = ALS.load(path + ALS_PATH)
    model = ALSModel.load(path + MODEL_PATH)
    print("Loading data...")
    testing = spark.read.parquet("{}/data/processed/testing_{}.parquet".format(
        path, fraction))
    testing.createOrReplaceTempView("testing")

    # RMSE
    predictions = model.transform(testing)
    evaluator = RegressionEvaluator(metricName="rmse",
                                    labelCol="rating",
                                    predictionCol="prediction")
    rmse = evaluator.evaluate(predictions)
    print("RSME:", rmse)
    predictions = model.recommendForAllUsers(500)
    predictions.createOrReplaceTempView("predictions")
    groundtruth = testing.groupby("user_id").agg(
        F.collect_set("book_id").alias('groundtruth'))
    groundtruth.createOrReplaceTempView("groundtruth")
    total = spark.sql(
        "SELECT g.user_id, g.groundtruth AS groundtruth, p.recommendations AS predictions FROM groundtruth g JOIN predictions p ON g.user_id = p.user_id"
    )
    total.createOrReplaceTempView("total")

    data = total.selectExpr("predictions.book_id", "groundtruth")
    print("df to rdd...")
    rdd = data.rdd.map(tuple)
    print("creating metrics...")
    metrics = RankingMetrics(rdd)
    print("meanAveragePrecision:", metrics.meanAveragePrecision)
    print("precision at 500:", metrics.precisionAt(500))
    print("ndcgAt 500:", metrics.ndcgAt(500))
Example #5
def Ranking_evaluator(spark, model, val, metric_type):
    
    val.createOrReplaceTempView('val')                        
    val_user = spark.sql('SELECT DISTINCT user_id FROM val')  
    #val_user = val.select('user_id').distinct()
    val_rec = model.recommendForUserSubset(val_user,500)
    #val_rec.printSchema()
    
    val_rec = val_rec.select('user_id','recommendations',f.posexplode('recommendations')).drop('pos').drop('recommendations')
    val_rec = val_rec.select('user_id',f.expr('col.book_id'),f.expr('col.rating'))
    
    w= Window.partitionBy('user_id')
    val_recrank=val_rec.select('user_id',f.collect_list('book_id').over(w).alias('rec_rank')).sort('user_id').distinct()
   
    val = val.sort(f.desc('rating'))
    val_truerank=val.select('user_id', f.collect_list('book_id').over(w).alias('true_rank')).sort('user_id').distinct()
    
    scoreAndLabels = val_recrank.join(val_truerank,on=['user_id'],how='inner')
    
    rankLists=scoreAndLabels.select("rec_rank", "true_rank").rdd.map(lambda x: tuple([x[0],x[1]])).collect()
    ranks = spark.sparkContext.parallelize(rankLists)
    
    metrics = RankingMetrics(ranks)
    
    MAP = metrics.meanAveragePrecision
    Precision = metrics.precisionAt(500)
    NDCG = metrics.ndcgAt(500)
    
    if metric_type == 'Precision':
        return Precision, {'MAP': MAP,'NDCG': NDCG}
    elif metric_type == 'MAP':
        return MAP, {'Precision': Precision,'NDCG': NDCG}
    elif metric_type == 'NDCG':
        return NDCG, {'MAP': MAP, 'Precision': Precision}
    else:
        return None
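Incidentally, the collect()/parallelize round trip in Ranking_evaluator is not needed; the joined DataFrame can feed RankingMetrics directly, e.g. (a sketch using the same scoreAndLabels variable inside the function):

    ranks = scoreAndLabels.select('rec_rank', 'true_rank').rdd.map(tuple)
    metrics = RankingMetrics(ranks)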
Example #6
def main(spark, test_file, index_file, model_file):
    # load test data and create dataframe
    test_df = spark.read.parquet(test_file)
    model_indexer = PipelineModel.load(index_file)
    # transform user and track ids for test data
    test_df = model_indexer.transform(test_df)
    # store ground truth for user
    user_truth = test_df.groupby('user_label').agg(
        F.collect_list('book_label').alias('truth'))
    print('created ground truth df')
    als_model = ALSModel.load(model_file)

    # predict based on the top 500 item of each user
    recommend = als_model.recommendForAllUsers(500)
    print('recommendation has been created.')
    # RMSE
    predict = als_model.transform(test_df)
    evaluator = RegressionEvaluator(metricName='rmse',
                                    labelCol='rating',
                                    predictionCol='prediction')
    rmse = evaluator.evaluate(predict)
    print('Root mean square error is ' + str(rmse))

    # prediction = spark.sql('SELECT * FROM recommend INNER JOIN user_truth WHERE recommend.user_label=user_truth.user_label')
    # after running panda udf is faster than using sparksql
    prediction = recommend.join(user_truth,
                                recommend.user_label == user_truth.user_label,
                                'inner')

    score = prediction.select('recommendations.book_label',
                              'truth').rdd.map(tuple)
    rank_metric = RankingMetrics(score)
    precision = rank_metric.precisionAt(500)
    mean_precision = rank_metric.meanAveragePrecision
    print('precision at 500: ' + str(precision) +
          ', mean average precision: ' + str(mean_precision))
def main(spark):
    '''

    Parameters
    ----------
    spark : SparkSession object
    '''


    # Reading train and transforming with StringIndexer
    train_file = 'hdfs:/user/dev241/train_sample.parquet'
    val_file = 'hdfs:/user/bm106/pub/project/cf_validation.parquet'

    train_sample = spark.read.parquet(train_file)
    val = spark.read.parquet(val_file)

    idx_pipe = PipelineModel.load('hdfs:/user/dev241/DieterStringIndexer')

    train_idx = idx_pipe.transform(train_sample)
    val_idx = idx_pipe.transform(val)

    val_idx = val_idx.select('user_idx','track_idx','count')
    val_users = val_idx.select('user_idx').distinct()
    val_comb = val_idx.groupBy('user_idx').agg(F.collect_set('track_idx').alias('val_labels'))

    # Hyperparameter values
    results = []
    i = 0

    # Looping through the hyperparameter values - low alpha
    for i in range(50):
        rank = np.random.randint(100)
        alpha = np.random.uniform(0.1,15)
        reg = np.random.uniform(0.1,1)
        als = ALS(rank = rank, alpha = alpha, regParam = reg, userCol="user_idx", itemCol="track_idx", ratingCol="count", coldStartStrategy="drop", implicitPrefs = True)
        model = als.fit(train_idx)
        model.save('model_random_search'+str(rank)+'_'+str(alpha)+'_'+str(reg))
        track_number = 500
        rec_val = model.recommendForUserSubset(val_users, track_number)
        join = val_comb.join(rec_val,val_comb.user_idx == rec_val.user_idx)
        predictionAndLabels = join.rdd.map(lambda r: ([track.track_idx for track in r.recommendations], r.val_labels))
        metrics = RankingMetrics(predictionAndLabels)
        mavgp = metrics.meanAveragePrecision
        results.append((rank,alpha,reg,mavgp))
        print("Rank : ",rank,"Alpha : ",alpha,"Reg : ",reg,"MAP : ",mavgp)
    print('First Validation completed.')
    spark.sparkContext.parallelize(results).saveAsTextFile("MAP_random_search_high.txt")
    print('MAP_random_search_high.txt saved')
Example #8
def main(spark):
    '''

    Parameters
    ----------
    spark : SparkSession object
    '''

    # File names
    test_file = 'hdfs:/user/bm106/pub/project/cf_test.parquet'
    train_sample_file = 'hdfs:/user/ah3243/extension1_count_greater_1.parquet'

    # Reading the parquet files
    test = spark.read.parquet(test_file)
    train_sample = spark.read.parquet(train_sample_file)

    # StringIndexer 
    print("String Indexer entered")
    StringIndexer = PipelineModel.load('hdfs:/user/dev241/DieterStringIndexer')
    test_idx = StringIndexer.transform(test)
    
    test_idx = test_idx.sample(.3)
    
    train_idx = StringIndexer.transform(train_sample)
    print("String Indexer done")
	
    #change to best
    rank = 78
    alpha = 14.287069059772636
    reg = 0.41772043857578584

    #model
    als = ALS(rank = rank, alpha = alpha, regParam = reg, userCol="user_idx", itemCol="track_idx", ratingCol="count", coldStartStrategy="drop", implicitPrefs = True)
    model = als.fit(train_idx)
    print("Model fit for test done")

    #test ranking metrics
    test_idx = test_idx.select('user_idx','track_idx','count')
    test_users = test_idx.select('user_idx').distinct()
    test_comb = test_idx.groupBy('user_idx').agg(F.collect_set('track_idx').alias('test_labels'))
    track_number = 500
    rec_test = model.recommendForUserSubset(test_users, track_number)
    join = test_comb.join(rec_test,test_comb.user_idx == rec_test.user_idx)
    predictionAndLabels = join.rdd.map(lambda r: ([track.track_idx for track in r.recommendations], r.test_labels))
    metrics = RankingMetrics(predictionAndLabels)
    mavgp = metrics.meanAveragePrecision
    print("Test mean Average Precision : ",mavgp)
    pass
Example #9
def main(spark, model_file, test_file):
    model = MatrixFactorizationModel.load(spark.sparkContext, model_file)
    
    test_df = spark.read.parquet(test_file)
    test_df = test_df.select('user_label', 'track_label', 'count')
    
    #predictions = model.recommendProductsForUsers(500)
    predictions = model.recommendProductsForUsers(2)
    prediction_flat = predictions.flatMap(lambda p: p[1])
    prediction_df = prediction_flat.toDF()
    intersections = prediction_df.join(test_df, (prediction_df.product == test_df.track_label)&
                                      (prediction_df.user == test_df.user_label), how = 'inner')
    predLabel = intersections.select('rating', 'count')
    predLabel_rdd = predLabel.rdd.map(lambda x: Row(x[0], x[1]))
    metrics = RankingMetrics(predLabel_rdd)
    print(metrics.meanAveragePrecision)
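Note that RankingMetrics expects an RDD of (predicted item list, ground-truth item list) pairs per user rather than scalar (rating, count) pairs, so the join above will not produce a meaningful MAP. A sketch of reshaping the recommendProductsForUsers output into that form, reusing the same test_df and predictions:

    truth_rdd = test_df.rdd.map(lambda r: (r.user_label, r.track_label)) \
                           .groupByKey().mapValues(list)
    pred_rdd = predictions.mapValues(lambda recs: [r.product for r in recs])
    rank_metrics = RankingMetrics(pred_rdd.join(truth_rdd).values())
    print(rank_metrics.meanAveragePrecision)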
Example #10
def bestmodel(traindata, validatedata):
    bestValidationRmse = float("inf")
    #map validate data to userId, movieId
    validation = validatedata.map(lambda r: (r[0], r[1]))
    #get actual rating data for pairs of userId, movieId
    ratingTuple = validatedata.map(lambda r:
                                   ((int(r[0]), int(r[1])), float(r[2])))
    for rank in ranks:
        #create model by train data
        model = ALS.train(traindata, rank, numIterations, lambda_=regulz_para)
        #predict ratings for validation data
        predictions = model.predictAll(validation).map(lambda r:
                                                       ((r[0], r[1]), r[2]))
        #create predict and actual ratings
        scoreAndLabels = predictions.join(ratingTuple).map(lambda tup: tup[1])

        regMetrics = RegressionMetrics(scoreAndLabels)
        RMSE = regMetrics.rootMeanSquaredError
        MSE = regMetrics.meanSquaredError

        print("For rank %s:" % rank)
        print("RMSE = %s" % RMSE)
        print("MSE = %s" % MSE)

        if RMSE < bestValidationRmse:
            bestValidationRmse = RMSE
            best_rank = rank

    print('The best model was trained with rank %s' % best_rank)

    #MAP:
    #actual top 10 movie sequence for users by rating
    model = ALS.train(traindata, best_rank, numIterations, lambda_=regulz_para)
    actual_user_movie = validatedata.map(lambda x:
                                         (x[0], (x[1], x[2]))).groupByKey()
    actual_user_movie1 = actual_user_movie.map(order_movies)
    predict_user_movie = model.predictAll(validation).map(
        lambda r: (r[0], (r[1], r[2]))).groupByKey()
    predict_user_movie1 = predict_user_movie.map(order_movies)
    movie_seq = predict_user_movie1.join(actual_user_movie1).map(
        lambda x: x[1])
    movie_seq = movie_seq.map(movie_index)
    rankMetrics = RankingMetrics(movie_seq)
    MAP = rankMetrics.meanAveragePrecision
    print("MAP = %s" % MAP)
Example #11
def main(sc):
    ratings_info = sc.textFile("input/ratings.csv")
    ratings_data = ratings_info.map(split).map(parse).filter(
        lambda line: line != None)

    fold1, fold2, fold3, fold4, fold5 = ratings_data.randomSplit(
        [0.2, 0.2, 0.2, 0.2, 0.2])
    folds = [fold1, fold2, fold3, fold4, fold5]

    rank = 12
    itr = 25
    mse = 0
    rmse = 0
    map = 0
    for i in range(5):
        test_data = folds[i]
        train_data = sc.emptyRDD()
        for j in range(5):
            if i == j:
                continue
            else:
                train_data = train_data.union(folds[j])

        model = ALS.train(train_data, rank, iterations=itr, lambda_=0.1)
        testdata = test_data.map(lambda p: (p[0], p[1]))
        predictions = model.predictAll(testdata).map(lambda r:
                                                     ((r[0], r[1]), r[2]))
        rates = test_data.map(lambda r: ((r[0], r[1]), r[2]))
        predsAndlabels = predictions.join(rates).map(lambda tup: tup[1])
        actual_rating = predsAndlabels.map(lambda r: r[1]).collect()
        predicted_rating = predsAndlabels.map(lambda r: r[0]).collect()
        predAndReal = sc.parallelize([(predicted_rating, actual_rating)])

        metrics = RegressionMetrics(predsAndlabels)
        metric = RankingMetrics(predAndReal)
        mse += metrics.meanSquaredError
        rmse += metrics.rootMeanSquaredError
        map += metric.meanAveragePrecision

    k_mse = mse / 5.0
    k_rmse = rmse / 5.0
    k_map = map / 5.0
    print("MSE = %s" % k_mse)
    print("RMSE = %s" % k_rmse)
    print("MAP = %s" % k_map)
def compute_MAP(model, users, df):
    predictions = model.recommendForUserSubset(users, 500)
    print("Generated predictions")
    userRec = (predictions.select(
        "userIndex",
        F.explode("recommendations").alias("recommendation")).select(
            "userIndex", "recommendation.*"))
    rankings = userRec.groupby('userIndex').agg(
        F.collect_list('trackIndex').alias('ranked_tracks'))
    print("Generated rankings")
    truth = df.groupby('userIndex').agg(
        F.collect_list('trackIndex').alias('ground_truth'))
    print("Generated ground truth")
    final = rankings.join(truth, rankings.userIndex == truth.userIndex).select(
        'ranked_tracks', 'ground_truth')
    metrics = RankingMetrics(final.rdd)
    map_val = metrics.meanAveragePrecision
    return map_val
def main(spark, train_file, val_file, model_file):

    train_df = spark.read.parquet(train_file)
    val_df = spark.read.parquet(val_file)
    print('Finish reading data')

    train_df = train_df.withColumn('inc_count', train_df['count'] + 1)
    train_df = train_df.withColumn('log_count', F.log(train_df['inc_count']))

    print('finish transforming data')
    print(train_df.first())
    print(val_df.first())

    train_df = train_df.select('user_label', 'track_label', 'log_count')
    val_df = val_df.select('user_label', 'track_label', 'count')
    val_grouped = val_df.groupBy('user_label').agg(
        F.collect_list(F.col('track_label')).alias('track_label'))
    print('finish preparing data')

    val_grouped.cache()
    train_df.cache()
    print('start fitting')
    # ALS for implicit feedback
    als = ALS(maxIter = 5, regParam = 0.01, alpha = 0.1, rank =10, implicitPrefs = True, \
          userCol = 'user_label', itemCol = 'track_label', ratingCol = 'log_count')

    als_model = als.fit(train_df)
    print('Model fitted')
    als_model.save(model_file)
    print('Model Saved')
    predictions = als_model.recommendForAllUsers(100)
    prediction_df = predictions.rdd.map(
        lambda r: (r.user_label, [i[0] for i in r.recommendations])).toDF()
    prediction_df = prediction_df.selectExpr('_1 as user_label',
                                             '_2 as recommendations')

    # Join table
    val_pred = val_grouped.join(prediction_df, 'user_label', 'inner')
    rdd = val_pred.select('recommendations', 'track_label').rdd
    ranking_metrics = RankingMetrics(rdd)
    print(
        'Log: Current log job alpha is : 0.1, current rank is 10, reg is 0.01')
    print('Single model, MAP = %s' % ranking_metrics.meanAveragePrecision)
Example #14
def main(spark, model_file, data_file,count):
    df = spark.read.parquet(data_file).repartition(2000,['userIndex'])
    if count =='log':
        df = df.withColumn("count",log(col("count")+1))
    elif count =='drop1':
        df = df.filter('count>1')
    elif count =='drop2':
        df = df.filter('count>2')
    model = ALSModel.load(model_file)
    test_user = df.select('userIndex').distinct()
    predictions = model.transform(df)
    actual = predictions.groupBy("userIndex").agg(expr("collect_set(trackIndex) as tracks"))
    rec = model.recommendForUserSubset(test_user,500)
    a= rec.select('userIndex','recommendations.trackIndex')
    b=a.join(actual,['userIndex']).select('trackIndex','tracks').rdd
    metrics = RankingMetrics(b)
    result = metrics.meanAveragePrecision
    print(result)
    np.savetxt('drop2.txt',np.array([result]))
    pass
def getMAP(top_predictions, truth):
    true = truth.select('user_id', 'book_id', 'true_row')
    w = Window.partitionBy('user_id').orderBy('true_row')
    true = true.withColumn(
        'true',
        F.collect_list('book_id').over(w)).groupBy('user_id').agg(
            F.max('true').alias('true'))

    pred = top_predictions.select('user_id', 'book_id', 'row_num')
    w = Window.partitionBy('user_id').orderBy('row_num')
    pred = pred.withColumn(
        'pred',
        F.collect_list('book_id').over(w)).groupBy('user_id').agg(
            F.max('pred').alias('pred'))

    pred_true = pred.join(true, 'user_id').select('pred', 'true').rdd

    metrics = RankingMetrics(pred_true)
    score = metrics.meanAveragePrecision
    return score
Example #16
def main(spark, train_file, val_file, model_file):
    df_train = spark.read.parquet(train_file)
    df_val = spark.read.parquet(val_file)
    print(df_train.count())
    print(df_val.count())
    als = ALS(implicitPrefs=True,
              userCol="userIndex",
              itemCol="trackIndex",
              ratingCol="count",
              coldStartStrategy="drop")
    ranks = [10, 20, 40]
    reg_params = [0.001, 0.01, 0.1]
    alphas = [1, 20, 40]
    max_result = 0.0
    best_rank = 0
    best_alpha = 0
    best_regparam = 0
    k = 500
    val_user = df_val.select('userIndex').distinct()
    for rank, reg_param, alpha in itertools.product(ranks, reg_params, alphas):
        als.setRank(rank).setRegParam(reg_param).setAlpha(alpha)
        model = als.fit(df_train)
        rec = model.recommendForUserSubset(val_user, 500)
        predictions = model.transform(df_val)
        actual = df_val.groupBy("userIndex").agg(
            expr("collect_set(trackIndex) as tracks"))
        pred = rec.select('userIndex', 'recommendations.trackIndex')
        a = pred.join(actual, ['userIndex']).select('trackIndex', 'tracks')
        metrics = RankingMetrics(a.rdd)
        result = metrics.meanAveragePrecision
        print('For rank %s, for alpha %s, for reg_param %s, the MAP is %s' %
              (rank, alpha, reg_param, result))
        if result > max_result:
            max_result = result
            best_rank = rank
            best_alpha = alpha
            best_regparam = reg_param
    best = als.setRank(best_rank).setAlpha(best_alpha).setRegParam(
        best_regparam)
    best_model_ = best.fit(df_train)
    best_model_.save(model_file)
Example #17
def main(spark, val_file, model_file):
    model = ALSModel.load(model_file)
    print('finish loading models')
    val_df = spark.read.parquet(val_file)
    val_df = val_df.select('user_label', 'track_label')
    val_grouped = val_df.groupBy('user_label').agg(F.collect_list(F.col('track_label')).alias('track_label'))
    print('Finish preparing test data')
    val_grouped.cache()

    predictions = model.recommendForAllUsers(500)
    print('finish making predictions')
    prediction_df = predictions.rdd.map(lambda r: (r.user_label, [i[0] for i in r.recommendations])).toDF()
    prediction_df = prediction_df.selectExpr("_1 as user_label", "_2 as recommendations")

    # Join table
    val_pred = val_grouped.join(prediction_df, "user_label", "inner")
    print('finish joining data')
    # Instantiate regression metrics to compare predicted and actual ratings
    rdd = val_pred.select('recommendations', 'track_label').rdd
    print('final steps')
    ranking_metrics = RankingMetrics(rdd)

    # MAP
    print("MAP = %s" % ranking_metrics.meanAveragePrecision)
Example #18
def main(spark, data_file, model_file,truth_file):
    '''Main routine for supervised training
    Parameters
    ----------
    spark : SparkSession object
    data_file : string, path to the parquet file to load: test set user id.
    model_file : string, path to store the serialized model file
    truth_file: ground truth interaction list for each user in the test set.
    '''

    topk =  500
    #nmslib indexing parameters
    Mlist = [50]
    efclist = [3000]
    efslist = [800]

    #Prepare data
    test_users = spark.read.parquet(data_file) #distinct_test_users.parquet
    truth = spark.read.parquet(truth_file)
    model = ALSModel.load(model_file) #recsys_model_search
    usefactor = model.userFactors
    queryuser = test_users.join(usefactor,test_users.user_num_id == usefactor.id,how = 'left')
    userid = [row.id for row in queryuser.select('id').collect()]
    user = np.array([row.features for row in queryuser.select('features').collect()])
    itmfactor = model.itemFactors
    item = np.array([row.features for row in itmfactor.select('features').collect()])
    
    itemidx = np.array([row.id for row in itmfactor.select('id').collect()])
    trs_user = np.append(user,np.zeros((user.shape[0],1)),axis = 1)
    norms = np.linalg.norm(item,axis = 1)
    maxnorm = norms.max()
    extra_item_dim = np.sqrt(maxnorm ** 2 - norms ** 2)
    trs_item = np.append(item, extra_item_dim.reshape(norms.shape[0], 1), axis=1)
    print('Finish Preparing the data')
    print('Start brute force search')
    #only try brute force once.
    #time2 = time.time()
    #brutea,bruteb = bruteforce(user,item,itemidx,topk)
    #brute_time = time.time() - time2
    #print('Time to brute force search top{} items is {}, {} seconds per query'.format(topk,brute_time,brute_time/len(user)))
    
    #Get MAP
    R = Row('id', 'recs')
    #rec_brute = spark.createDataFrame([R(x, y) for i, (x,y) in enumerate(zip(userid,brutea))])
    #pred_brute = truth.join(rec_brute, truth.user_id == rec_brute.id, how='left').select('recs', 'label')
    #predictionAndLabels_b = pred_brute.rdd.map(lambda lp: (lp.recs, lp.label)).repartition(100)
    #metrics_b = RankingMetrics(predictionAndLabels_b)
    #meanAP_b = metrics_b.meanAveragePrecision



   #Multiple accelerated search with different parameter settings
    for M,efc,efs in itertools.product(Mlist,efclist,efslist):
        indexParams = {'M': M, 'indexThreadQty': 4, 'efConstruction': efc, 'post' : 0}
    
        #Get time
        print("__________________Start a new indexer______________________")
        index = nmslib.init(method = 'hnsw',space = 'cosinesimil')
        index.addDataPointBatch(trs_item,ids = itemidx)
        time1 = time.time()
        index.createIndex(indexParams)
        nmslib_buildtime = time.time() - time1
        print('indexParams for nmslib is {}'.format(indexParams),'queryParams for nmslib is efs =  {}'.format(efs))
        print('Time to build index for nmslib is {}'.format(nmslib_buildtime))
       
        time3 = time.time()
        nms_a,nms_b = nmslib_search(index, trs_user,trs_item,itemidx,topk,efs)
        nms_time = time.time() - time3
        print('Time to nmslib search top{} items is {}, {} seconds per query'.format(topk,nms_time,nms_time/len(user)))

        #Get MAP
        rec_nms = spark.createDataFrame([R(x,y) for i ,(x,y) in enumerate(zip(userid,nms_a))])
        pred_nms = truth.join(rec_nms,truth.user_id == rec_nms.id,how = 'left').select('recs','label')
        predictionAndLabels_n = pred_nms.rdd.map(lambda lp: (lp.recs, lp.label)).repartition(100) 
        metrics_n = RankingMetrics(predictionAndLabels_n)
        meanAP_n = metrics_n.meanAveragePrecision

        print(' MAP for nmslib is {},MAP for bruteforce is 0.04126000613271917'.format(meanAP_n))
        print('____________Finish an indexer__________________')
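The nmslib_search helper used above is not shown; a minimal sketch of what it might look like with the nmslib Python bindings (the signature and return shape here are assumptions inferred from the call site):

def nmslib_search(index, query_vectors, item_vectors, item_ids, topk, efs):
    # set query-time parameters, then batch-query the HNSW index
    index.setQueryTimeParams({'efSearch': efs})
    neighbours = index.knnQueryBatch(query_vectors, k=topk, num_threads=4)
    # each entry is (ids, distances); split into two parallel lists of lists
    rec_ids = [ids.tolist() for ids, _ in neighbours]
    rec_dists = [dists.tolist() for _, dists in neighbours]
    return rec_ids, rec_dists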
Example #19
evaluator = RegressionEvaluator(metricName="rmse",
                                labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions_test)

print("Root-mean-square error = " + str(rmse))

test.createOrReplaceTempView('test')
test_true = spark.sql(
    'select user, book from test where rating > 2 sort by rating desc')
labels = test_true.groupby('user').agg(collect_list('book'))

test_recommendations = model.recommendForUserSubset(labels.select('user'), 500)
preds = test_recommendations.withColumn(
    'recommendations', explode('recommendations')).select(
        'user',
        'recommendations.item').groupBy('user').agg(collect_list('item'))

preds_and_labels = preds.join(labels, on='user')

metrics = RankingMetrics(
    preds_and_labels.select('collect_list(item)', 'collect_list(book)').rdd)
map_metric = metrics.meanAveragePrecision
pA = metrics.precisionAt(500)
ndcgA = metrics.ndcgAt(500)

results.append((rank, reg, rmse, map_metric, pA, ndcgA))

print('MAP = ', map_metric, ' pA = ', pA, ' ndcgA = ', ndcgA, '\n')

res_rdd = spark.sparkContext.parallelize(results)
res_df = spark.createDataFrame(res_rdd).repartition(1)
res_df.write.csv('test_results.csv')
prediction_val = best_model.transform(df_validation)
print(" Predictions for validation dataset: ------------------------------")
prediction_val.show()
prediction_val.write.csv('hdfs:/user/pg1910/pub/goodreads/prediction_val.csv')

prediction_test = best_model.transform(df_test)
print(" Predictions for test dataset: ------------------------------")
prediction_test.show()
prediction_test.write.csv(
    'hdfs:/user/pg1910/pub/goodreads/prediction_test.csv')

actual_val = df_validation.groupBy("user_id").agg(
    expr("collect_set(book_id) as books"))
pred_val = user_recs.select('user_id', 'recommendations.book_id')
output_val = pred_val.join(actual_val, ['user_id']).select('book_id', 'books')
metrics_val = RankingMetrics(output_val.rdd)
result_val = metrics_val.meanAveragePrecision

print("Mean average precision for validation dataset: " + str(result_val))

rmse_val = evaluator.evaluate(prediction_val)
print("RMSE for validation dataset=" + str(rmse_val))

actual_test = df_test.groupBy("user_id").agg(
    expr("collect_set(book_id) as books"))
pred_test = user_recs.select('user_id', 'recommendations.book_id')
output_test = pred_test.join(actual_test,
                             ['user_id']).select('book_id', 'books')
metrics_test = RankingMetrics(output_test.rdd)
result_test = metrics_test.meanAveragePrecision
Example #21
    
ratings_train = train.map(lambda r: parseLine(r))
ratings_test = test.map(lambda r: parseLine(r))

sample_test = ratings_test.sample(False, 0.1)  # take a sample of the test users
sample_test.count()  # number of test users used in this example
test_users = sample_test.map(lambda x: x.user).collect()

model = ALS.trainImplicit(ratings_train, 10, 10)

recs={}
for u in test_users:
    rec = model.recommendProducts(u,10)
    recs[u] = [r.product for r in rec]
   
groundTruth = {}
userItemTestRDD = sample_test.map(lambda x: (x.user,x.product)) 
trueRec = userItemTestRDD.groupByKey().collect()
for x in trueRec:
    groundTruth[x[0]]=list(x[1]) 
    
predictionsAndLabels = []
for u in test_users:
    predictionsAndLabels.append((recs[u],groundTruth[u]))

predictionsAndLabelsRDD = sc.parallelize(predictionsAndLabels)

metrics = RankingMetrics(predictionsAndLabelsRDD)

metrics.precisionAt(5)
          userCol="u_id",
          itemCol="t_id",
          ratingCol="count",
          coldStartStrategy="drop")
model = als.fit(new_data)

labels = new_data.groupby('u_id').agg(
    F.collect_set('t_id').alias('ranked_labels'))
testusers = new_data.select('u_id').distinct()

userSubsetRecs = model.recommendForUserSubset(testusers, 10)
recommendationsDF = (userSubsetRecs.select(
    "u_id",
    explode("recommendations").alias("recommendation")).select(
        "u_id", "recommendation.*"))
preds = recommendationsDF.groupby('u_id').agg(
    F.collect_set('t_id').alias('ranked_preds'))
joined_table = labels.join(preds, labels.u_id == preds.u_id)
reqdPredsLabels = joined_table.select(labels.ranked_labels, preds.ranked_preds)
metrics = RankingMetrics(reqdPredsLabels.rdd)
print('Precision at 500 : {0}'.format(metrics.precisionAt(500)))
print('Mean Average Precision: {0}'.format(metrics.meanAveragePrecision))
print(datetime.now())

# new_data is a DataFrame and model is a pyspark.ml ALSModel, so MSE is computed
# with the DataFrame API rather than the RDD predictAll/join pattern
predictions = model.transform(new_data)
mse_evaluator = RegressionEvaluator(metricName='mse', labelCol='count', predictionCol='prediction')
MSE = mse_evaluator.evaluate(predictions)
print("Mean Squared Error = " + str(MSE))
print(datetime.now())
Example #23
                      lambda_=lamda,
                      seed=seed)

    #predicting on test dataset
    preds = model.predictAll(CVTestData.map(lambda p: (p[0], p[1]))).map(
        lambda r: ((r[0], r[1]), r[2]))

    ratesAndPreds = CVTestData.map(lambda r: ((r[0], r[1]), r[2])).join(preds)

    #evaluating predictions with actual ratings/rankings
    metrics = RegressionMetrics(ratesAndPreds.map(lambda r: r[1]))

    ratingPairs = ratesAndPreds.map(lambda r: (r[0][0], (r[1][0], r[1][
        1]))).reduceByKey(lambda x, y: x + y).map(lambda x: list(x[1]))
    rankAndPreds = ratingPairs.map(map1)
    rmetrics = RankingMetrics(sc.parallelize(rankAndPreds.collect(
    )))  #rankAndPreds is a PipelinedRDD and has to be converted into RDD

    MSE = metrics.meanSquaredError
    total_mse += MSE

    RMSE = metrics.rootMeanSquaredError
    total_rmse += RMSE

    MAP = rmetrics.meanAveragePrecision
    total_map += MAP

print("Average Mean Squared Error = " + str(total_mse / folds))
print("Average Root Mean Squared Error = " + str(total_rmse / folds))
print("Average Mean Average Precision = " + str(total_map / folds))

#This code is executed only after getting best parameters from Cross Validation
def main(spark, train_file, val_file, model_file):

    train_df = spark.read.parquet(train_file)
    val_df = spark.read.parquet(val_file)
    train_df = train_df.select('user_label', 'track_label', 'count')
    val_df = val_df.select('user_label', 'track_label', 'count')
    val_grouped = val_df.groupBy('user_label').agg(
        F.collect_list(F.col('track_label')).alias('track_label'))

    # ALS for implicit feedback
    als = ALS(maxIter=5,
              regParam=0,
              implicitPrefs=True,
              alpha=0.4,
              rank=20,
              userCol='user_label',
              itemCol='track_label',
              ratingCol='count')

    als_model = als.fit(train_df)
    predictions = als_model.recommendForAllUsers(100)
    prediction_df = predictions.rdd.map(
        lambda r: (r.user_label, [i[0] for i in r.recommendations])).toDF()
    prediction_df = prediction_df.selectExpr('_1 as user_label',
                                             '_2 as recommendations')

    # Join table
    val_pred = val_grouped.join(prediction_df, 'user_label', 'inner')
    rdd = val_pred.select('recommendations', 'track_label').rdd
    ranking_metrics = RankingMetrics(rdd)
    print('Before tuning, MAP = %s' % ranking_metrics.meanAveragePrecision)

    # hyperparameter tuning
    ranks = [10, 20, 40, 60]
    reg_params = [0, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5]
    alphas = [0.10, 0.20, 0.40]
    best_rank = None
    best_reg_param = None
    best_alpha = None
    best_model = None
    best_map = 0

    for rank_i, alpha_i, reg_param_i in itertools.product(
            ranks, alphas, reg_params):

        print('Running on rank:', rank_i)
        print('Running on alpha:', alpha_i)
        print('Running on reg:', reg_param_i)

        als = ALS(maxIter=5,
                  regParam=reg_param_i,
                  implicitPrefs=True,
                  alpha=alpha_i,
                  rank=rank_i,
                  userCol='user_label',
                  itemCol='track_label',
                  ratingCol='count')

        als_model = als.fit(train_df)
        predictions = als_model.recommendForAllUsers(100)
        prediction_df = predictions.rdd.map(lambda r: (
            r.user_label, [i[0] for i in r.recommendations])).toDF()
        prediction_df = prediction_df.selectExpr('_1 as user_label',
                                                 '_2 as recommendations')

        # Join table
        val_pred = val_grouped.join(prediction_df, 'user_label', 'inner')
        rdd = val_pred.select('recommendations', 'track_label').rdd
        ranking_metrics = RankingMetrics(rdd)
        map_ = ranking_metrics.meanAveragePrecision

        print('MAP:', map_)

        if map_ > best_map:
            best_rank = rank_i
            best_reg_param = reg_param_i
            best_alpha = alpha_i
            best_model = als_model
            best_map = map_

    print('Best rank:', best_rank)
    print('Best regParam:', best_reg_param)
    print('Best alpha:', best_alpha)
    print('Best map:', best_map)

    # save the best model
    best_model.save(model_file)
Example #25
File: lr.py Project: yuweitu/Kaggle
#Especially in the case of an unbalanced dataset
train = sqlContext.read.parquet('train_transformed')
train = train.withColumn("label", train["label"].cast(DoubleType()))
test = sqlContext.read.parquet('test_transformed')
test = test.withColumn("label", test["label"].cast(DoubleType()))
lr = LogisticRegression()

#Building the grid
#pyspark.ml's LogisticRegression has no regType param; L1/L2 regularization is
#selected via elasticNetParam (0 = L2, 1 = L1), which must lie in [0, 1]
grid = ParamGridBuilder() \
 .addGrid(lr.maxIter, [0, 1, 5, 10, 15, 20, 25, 30]) \
 .addGrid(lr.regParam, [0.001, 0.01, 0.1, 1, 10, 100, 1000]) \
 .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
 .build()

#RankingMetrics requires an RDD of (predicted, actual) ranking pairs, so it cannot be
#instantiated empty or used as a CrossValidator evaluator; a classification evaluator
#is substituted here to keep the pipeline runnable
evaluator = BinaryClassificationEvaluator()
cv = CrossValidator(estimator=lr,
                    estimatorParamMaps=grid,
                    evaluator=evaluator,
                    numFolds=5)
cvModel = cv.fit(train)

#Testing overfitting/underfitting
train_score = evaluator.evaluate(cvModel.transform(train))
test_score = evaluator.evaluate(cvModel.transform(test))

#Saving the model
bestModel = cvModel.bestModel
coefficients = DenseVector(bestModel.coefficients.toArray())
os.system('mkdir lr')
Example #26
def main(spark, train_data, val_data, downsample=True, extension=None):
    '''Main routine for supervised training
    Parameters
    ----------
    spark : SparkSession object
    train_data : string, path to the training parquet file to load
    val_data: string, path to the validation parquet file to load
    downsample: True or False, indicates whether to down-sample the training data
    extension: None, "log" (log-compression of counts) or "drop" (drop low counts)
    '''

    ### read in the files
    train = spark.read.parquet(train_data)
    val = spark.read.parquet(val_data)

    ### if down-sample: down-sample train data to a small random fraction (0.001%)
    if downsample:
        train = train.sample(False, 0.00001, seed=0)
        #val = val.sample(False, 0.00001, seed = 0)

    if extension != None:
        if extension == "log":  # log-compression
            train = train.withColumn("log_count",
                                     log("count"))  # apply log-compression

        elif extension == "drop":  # drop low counts
            lower_bound = train.approxQuantile("count", [
                0.1
            ], 0.25)  # treat the 0.1 quantile of count data as the lower bound
            train = train.filter(
                train["count"] > int(lower_bound[0])
            )  # filter out count data rows lower than the lower bound

    ### transform dataframe columns: user_id, track_id from string to float and put them in the pipeline
    user_indexer = StringIndexer(inputCol="user_id",
                                 outputCol="user_id_indexed",
                                 handleInvalid='skip')
    track_indexer = StringIndexer(inputCol="track_id",
                                  outputCol="track_id_indexed",
                                  handleInvalid='skip')
    pipeline = Pipeline(stages=[user_indexer, track_indexer])
    indexing_model = pipeline.fit(train)  #learn (return: pipeline model)

    ### transform the datasets and create the view
    train = indexing_model.transform(
        train)  # return a dataframe with new columns
    train.createOrReplaceTempView("train")
    val = indexing_model.transform(val)  # return a dataframe with new columns
    val.createOrReplaceTempView("val")

    # group by user_id, aggregate track_id_indexed for train and val
    val_groupby = spark.sql(
        "select user_id_indexed, collect_list(track_id_indexed) track_id_indexed_collections from val group by user_id_indexed"
    )
    val_groupby.createOrReplaceTempView("val_groupby")

    # Build the recommendation model using ALS on the training data
    rank = np.arange(4, 10, 2)
    regParam = np.linspace(0.01, 0.2, 3)
    alpha = np.linspace(0.5, 2, 3)
    paramGrid = list(itertools.product(rank, regParam, alpha))
    MAP_lst = []  # store MAP results
    precision_at_500_lst = []  # store precision at 500 results

    for combo in paramGrid:
        rank_, regParam_, alpha_ = combo

        # Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
        if extension == "log":
            ratingCol = "log_count"
        else:
            ratingCol = "count"
        als = ALS(rank=rank_,
                  regParam=regParam_,
                  alpha=alpha_,
                  implicitPrefs=True,
                  userCol="user_id_indexed",
                  itemCol="track_id_indexed",
                  ratingCol=ratingCol,
                  coldStartStrategy="drop")

        # Save the model
        model = als.fit(train)  # fit the pipeline onto training data

        # get top 500 recommendations
        userRecs = model.recommendForAllUsers(
            500
        )  # return: dataframe (columns: user_id_indexed, recommendations)
        # [('user_id_indexed', 'int'), ('recommendations', 'array<struct<track_id_indexed:int,rating:float>>')]
        userRecs = userRecs.select(
            userRecs.user_id_indexed,
            userRecs.recommendations.track_id_indexed.alias(
                "pred_list"))  # with track_id_indexed only, no track_id
        userRecs.createOrReplaceTempView("userRecs")  # create temporary view

        combined_df = spark.sql(
            '''select val_groupby.user_id_indexed user_id_indexed, userRecs.pred_list pred_list, 
        val_groupby.track_id_indexed_collections track_id_indexed_collections from userRecs inner join val_groupby on val_groupby.user_id_indexed = userRecs.user_id_indexed'''
        )  # combine dfs wrg to user_id_indexed

        # use ranking metrics for evaluations
        predLabelsTuple = combined_df.rdd.map(
            lambda r:
            (r.pred_list, r.track_id_indexed_collections))  # result: tuple
        metrics = RankingMetrics(predLabelsTuple)
        MAP = metrics.meanAveragePrecision
        precision_at_500 = metrics.precisionAt(500)
        MAP_lst.append(MAP)  # store MAP for each config
        precision_at_500_lst.append(
            precision_at_500)  # store precision at 500 for each config
        # print out validation evaluation result
        print("---------------------------------------")
        print("configs: \n")
        print("rank = " + str(rank_) + " , regParam = " + str(regParam_) +
              " , alpha = " + str(alpha_))
        print("\n")
        print("MAP = " + str(MAP))
        print("Precision at 500 = " + str(precision_at_500))

    best_index = MAP_lst.index(np.max(MAP_lst))  # index of the highest MAP
    rank_opt, regParam_opt, alpha_opt = paramGrid[best_index]
    print("---------------------------------------")
    print("optimal configs: \n")
    print("rank = " + str(rank_opt) + " , regParam = " + str(regParam_opt) +
          " , alpha = " + str(alpha_opt))
    print("\n")
    print("MAP = " + str(np.max(MAP_lst)))
    print("Precision at 500 =" + str(precision_at_500_lst[min_index]))
def tune_ALS_map(train_read, val_read, val_true_list, iteration, regParams,
                 ranks):
    """
    grid search function to select the best models based on RMSE and MAP of
    validation data
    Parameters
    ----------
    train_read: spark DF with columns ['user_id', 'book_id', 'rating'] (read interactions)
    
    val_read: spark DF with columns ['user_id', 'book_id', 'rating'] (read interactions)
    
    val_true_list: spark DF of ground-truth book lists per user
    
    iteration: int, max number of learning iterations
    
    regParams: list of float, one dimension of hyper-param tuning grid
    
    ranks: list of int, one dimension of hyper-param tuning grid
    
    Return
    ------
    The best fitted ALS models (selected by RMSE and by MAP) on validation data
    """
    # initial
    min_error = float('inf')
    best_iter1 = -1
    best_rank1 = -1
    best_regularization1 = 0
    best_model_rmse = None
    max_map = 0.0
    best_iter2 = -1
    best_rank2 = -1
    best_regularization2 = 0
    best_model_map = None
    for current_rank in ranks:
        for reg in regParams:
            # get ALS model
            #als = ALS().setMaxIter(iteration).setRank(rank).setRegParam(reg)
            als = ALS(maxIter=iteration,
                      regParam=reg,
                      rank=current_rank,
                      userCol='user_id',
                      itemCol='book_id',
                      ratingCol='rating',
                      coldStartStrategy="drop",
                      nonnegative=True)
            # train ALS model
            train_read.checkpoint()
            model_read = als.fit(train_read)
            # evaluate the model by computing the RMSE on the validation read data
            predictions_read = model_read.transform(val_read)
            # combine predictions on read and unread data
            # (predictions_unread is assumed to come from a separate model fit on unread interactions)
            predictions_all = predictions_read.union(predictions_unread)
            # select top 500 books for each use to evaluate
            window = Window.partitionBy(predictions_all['user_id']).orderBy(
                predictions_all['prediction'].desc())
            val_pred_order = predictions_all.select(
                '*',
                rank().over(window).alias('rank')).filter(col('rank') <= 500)
            evaluator = RegressionEvaluator(metricName="rmse",
                                            labelCol="rating",
                                            predictionCol="prediction")
            rmse = evaluator.evaluate(val_pred_order)

            if rmse < min_error:
                min_error = rmse
                best_rank1 = current_rank
                best_regularization1 = reg
                best_iter1 = iteration
                best_model_rmse = model_read

                # evaluate the model by computing the MAP on the validation data

            val_pred_list = val_pred_order.select(
                'user_id', 'book_id').groupBy('user_id').agg(
                    expr('collect_list(book_id) as books'))
            val_RDD = val_pred_list.join(
                val_true_list, 'user_id').rdd.map(lambda row: (row[1], row[2]))
            val_RDD.checkpoint()
            rankingMetrics = RankingMetrics(val_RDD)
            current_map = rankingMetrics.meanAveragePrecision

            if current_map > max_map:
                max_map = current_map
                best_rank2 = current_rank
                best_regularization2 = reg
                best_iter2 = iteration
                best_model_map = model_read

            print('{} latent factors and regularization = {} with maxIter {}: '
                  'validation RMSE is {}, '
                  'validation MAP is {}'.format(current_rank, reg, iteration,
                                                rmse, current_map))
            with open('train01_read_eval.csv', 'ab') as f:
                np.savetxt(f, [
                    np.array([iteration, current_rank, reg, rmse, current_map])
                ],
                           delimiter=",")

    print('\nThe best model selected by RMSE has {} latent factors and '
          'regularization = {} '
          'with maxIter = {}'.format(best_rank1, best_regularization1,
                                     best_iter1))
    print('\nThe best model selected by MAP has {} latent factors and '
          'regularization = {} '
          'with maxIter = {}'.format(best_rank2, best_regularization2,
                                     best_iter2))

    return best_model_rmse, best_model_map
test_pred_order = predictions.select(
    '*',
    rank().over(window).alias('rank')).filter(col('rank') <= 500)

evaluator = RegressionEvaluator(metricName="rmse",
                                labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(test_pred_order)

# evaluate the model by computing the MAP on the validation data
test_pred_list = test_pred_order.select(
    'user_id',
    'book_id').groupBy('user_id').agg(expr('collect_list(book_id) as books'))
test_RDD = test_pred_list.join(test_true_list,
                               'user_id').rdd.map(lambda row: (row[1], row[2]))
rankingMetrics = RankingMetrics(test_RDD)
current_map = rankingMetrics.meanAveragePrecision

print(
    '\nThe best baseline model select by RMSE = {} has {} latent factors and '
    'regularization = {}  with maxIter = {} MAP = {}'.format(
        rmse, current_rank, reg, iteration, current_map))
"""
# evaluate read model

train_new = train.withColumn('rating',when(train.is_read == 0,float('nan')).otherwise(train.rating))
train_read = train_new.na.drop()
train_unread = train.subtract(train_read)

test_new = test.withColumn('rating',when(test.is_read == 0,float('nan')).otherwise(test.rating))
test_read = test_new.na.drop()
Example #29

prediction_val = best_model.transform(df_validation)
print(" Predictions for validation dataset: ------------------------------")
prediction_val.show()


prediction_test = best_model.transform(df_test)
print(" Predictions for test dataset: ------------------------------")
prediction_test.show()


actual_val = df_validation.groupBy("user_id").agg(expr("collect_set(book_id) as books"))
pred_val = user_recs.select('user_id','recommendations.book_id')
output_val =pred_val.join(actual_val,['user_id']).select('book_id','books')
metrics_val = RankingMetrics(output_val.rdd)
result_val = metrics_val.meanAveragePrecision
result_val2 = metrics_val.precisionAt(20)

print("Mean average precision for validation dataset: " + str(result_val))
print("Precision @ 20 for validation dataset: " + str(result_val2))
rmse_val = evaluator.evaluate(prediction_val)
print("RMSE for validation dataset=" + str(rmse_val))


actual_test = df_test.groupBy("user_id").agg(expr("collect_set(book_id) as books"))
pred_test = user_recs.select('user_id','recommendations.book_id')
output_test =pred_test.join(actual_test,['user_id']).select('book_id','books')
metrics_test = RankingMetrics(output_test.rdd)
result_test = metrics_test.meanAveragePrecision
result_test2 = metrics_test.precisionAt(20)
Example #30
        predictedList.append(rankingDict[predictedRankings[i]])

    return (predictedList, rankingList)


# In[8]:

test = groundTruthRankedRatings.join(
    predictedRankedRatings)  # joins the rdds on the movie user id

#user id (tuple ( list of actual rankings, other list of predicted rankings)
rankingsRDD = test.map(convertToRankings)

x = rankingsRDD.map(lambda t:
                    (t[1][0], t[1][1]))  # this combines the two rankings
metrics = RankingMetrics(rankingsRDD)
metrics.meanAveragePrecision

# In[6]:

num_folds = 5
fold1, fold2, fold3, fold4, fold5, = ratings.randomSplit([.2, .2, .2, .2, .2],
                                                         seed=9999)
dataList = [fold1, fold2, fold3, fold4, fold5]

for rank in [5, 10, 15, 20]:
    for numIterations in [5, 10, 15, 20]:

        print('rank is ' + str(rank) + '  numIterations is ' +
              str(numIterations))
        total_RMSE = 0
Example #31
def main(spark, log_comp=False, drop_low=False, drop_thr=0):
    '''

    Parameters
    ----------
    spark : SparkSession object

    log_comp : bool, whether to apply log-compression to the interaction counts

    drop_low : bool, whether to drop interactions with low counts

    drop_thr : int, count threshold used when drop_low is True
    '''
    ## Load in datasets
    train_path = 'hdfs:/user/bm106/pub/project/cf_train.parquet'
    val_path = 'hdfs:/user/bm106/pub/project/cf_validation.parquet'
    test_path = 'hdfs:/user/bm106/pub/project/cf_test.parquet'

    train = spark.read.parquet(train_path)
    val = spark.read.parquet(val_path)
    test = spark.read.parquet(test_path)

    ## Downsample the data
    # Pick out user list in training set
    user_train = set(row['user_id']
                     for row in train.select('user_id').distinct().collect())
    # Pick out user list in validation set
    user_val = set(row['user_id']
                   for row in val.select('user_id').distinct().collect())
    # Get the previous 1M users
    user_prev = list(user_train - user_val)
    # Random sampling to get 20%
    k = int(0.2 * len(user_prev))
    user_prev_filtered = random.sample(user_prev, k)
    train = train.where(train.user_id.isin(user_prev_filtered +
                                           list(user_val)))

    ## Create StringIndexer
    indexer_user = StringIndexer(inputCol="user_id",
                                 outputCol="user_id_indexed",
                                 handleInvalid='skip')
    indexer_user_model = indexer_user.fit(train)
    indexer_track = StringIndexer(inputCol="track_id",
                                  outputCol="track_id_indexed",
                                  handleInvalid='skip')
    indexer_track_model = indexer_track.fit(train)

    train = indexer_user_model.transform(train)
    train = indexer_track_model.transform(train)

    val = indexer_user_model.transform(val)
    val = indexer_track_model.transform(val)

    test = indexer_user_model.transform(test)
    test = indexer_track_model.transform(test)

    ## ALS model
    rank_ = [5, 10, 20]
    regParam_ = [0.1, 1, 10]
    alpha_ = [1, 5, 10]
    param_grid = it.product(rank_, regParam_, alpha_)

    ## Pick out users from validation set
    user_id = val.select('user_id_indexed').distinct()
    true_label = val.select('user_id_indexed', 'track_id_indexed')\
                    .groupBy('user_id_indexed')\
                    .agg(expr('collect_list(track_id_indexed) as true_item'))

    ## Log-Compression
    ## count -> log(1+count)
    if log_comp == True:
        train = train.select('*', F.log1p('count').alias('count_log1p'))
        val = val.select('*', F.log1p('count').alias('count_log1p'))
        rateCol = "count_log1p"
    else:
        rateCol = "count"

    ## Drop interactions with counts at or below the specified threshold
    if drop_low == True:
        train = train.filter(train['count'] > drop_thr)
        val = val.filter(val['count'] > drop_thr)

    for i in param_grid:
        print('Start Training for {}'.format(i))
        als = ALS(rank = i[0], maxIter=10, regParam=i[1], userCol="user_id_indexed", itemCol="track_id_indexed", ratingCol=rateCol, implicitPrefs=True, \
            alpha=i[2], nonnegative=True, coldStartStrategy="drop")
        model = als.fit(train)
        print('Finish Training for {}'.format(i))

        # Make top 500 recommendations for users in validation test
        res = model.recommendForUserSubset(user_id, 500)
        pred_label = res.select('user_id_indexed',
                                'recommendations.track_id_indexed')

        pred_true_rdd = pred_label.join(F.broadcast(true_label), 'user_id_indexed', 'inner') \
                    .rdd \
                    .map(lambda row: (row[1], row[2]))

        print('Start Evaluating for {}'.format(i))
        metrics = RankingMetrics(pred_true_rdd)
        map_ = metrics.meanAveragePrecision
        ndcg = metrics.ndcgAt(500)
        mpa = metrics.precisionAt(500)
        print(i, 'map score: ', map_, 'ndcg score: ', ndcg, 'precision at 500: ', mpa)

    pass