Example #1
0
def test_rmse():
    """Train ALS on the MovieLens 10M ratings and print the test-set RMSE.

    Steps:
        (1) load the ratings schema from ``movielens_20m_ratings_schema.json``
        (2) upload ``movielens_10m_ratings.json.gz`` to HDFS and read it back
            as a DataFrame
        (3) split the ratings into train (60%) and test (40%)
        (4) train ALS with rank 3, predict the test pairs, and compute RMSE

    The training, prediction, and RMSE stages are each wrapped in ``Timer``
    and the elapsed seconds are printed. File names, the HDFS location, the
    split weights, and the rank are all hardcoded.
    """
    # TODO: revised so that it will take user's inputs instead of hardcoded values

    ratingsFile = "movielens_10m_ratings.json.gz"
    hdfsDir = "hdfs://localhost:9000/datasets"
    hdfsRatingsPath = hdfsDir + "/" + ratingsFile

    # load the ratings schema
    with open("movielens_20m_ratings_schema.json", "r") as json_schema_file:
        ratings_schema = StructType.fromJson(json.load(json_schema_file))

    # Upload to the very same absolute HDFS location that is read below.
    # A relative HDFS path resolves against the user's home directory
    # (/user/<name>), not the filesystem root, so it would not match the
    # absolute URI used for reading.
    # Both commands are best-effort: "-put" returns non-zero when the file is
    # already present from an earlier run, which is harmless here.
    os.system("hdfs dfs -mkdir -p %s" % hdfsDir)
    os.system("hdfs dfs -put %s %s" % (ratingsFile, hdfsRatingsPath))

    # create a DataFrame based on the content of the json file
    ratingsDF = scsingleton.sqlCtx.read.json(hdfsRatingsPath, schema=ratings_schema)
    # explicitly repartition after loading so that more tasks can run on it in parallel
    # by default, defaultMinPartitions == defaultParallelism == estimated # of cores across all of the machines in your cluster
    ratingsDF = ratingsDF.repartition(scsingleton.sc.defaultParallelism * 3)

    # parse ratings DataFrame into an RDD of [(userId, itemId, rating)]
    # go through .rdd: DataFrame.map() no longer exists in PySpark 2.x,
    # while .rdd.map() works on both 1.x and 2.x
    ratingsRDD = ratingsDF.rdd.map(lambda row: (row.user_id, row.movie_id, row.rating))
    ratingsRDD.cache()

    # split data into train (60%), test (40%)
    # TODO: add validation in the future? train (60%), validation (20%), test(20%)?
    trainingRDD, testRDD = ratingsRDD.randomSplit([0.6, 0.4])
    trainingRDD.cache()
    testRDD.cache()

    # run training algorithm to build the model
    # without validation
    with Timer() as t:
        model = ALS.train(trainingRDD, rank=3)
    print("ALS.train(trainingRDD, rank=3): %s seconds" % t.secs)

    # make a prediction
    # NOTE(review): predictAll presumably returns a lazily evaluated RDD, so
    # this timer likely covers only job setup and the real cost is paid inside
    # the RMSE step below -- confirm before comparing these timings.
    with Timer() as t:
        testPredRDD = model.predictAll(testRDD.map(lambda x: (x[0], x[1]))).cache()
    print("testPredRDD: %s seconds" % t.secs)

    # calculate RMSE
    with Timer() as t:
        testRmse = pm.calculate_rmse_using_rdd(testRDD, testPredRDD)
    print("testRmse: %s seconds" % t.secs)
    print("testRmse %s" % (testRmse,))
Example #2
0
def test_simple_rmse():
    """ Test RMSE as follows:
        (1) train the ALS model with a subset of 15 values
        (2) predict a subset of 15 values using the trained model
        (3) calculate RMSE, i.e. how accurate the prediction is
            in comparison to the known values

    Values used to train the ALS model are based on a fictitious world where
    5 users rate 4 items whether they like or dislike an item. If the user liked
    the item, he will provide a rating of 1; otherwise, if the user disliked the
    item, he will provide a rating of -1. No rating means that the user has not
    rated the item. This data will be formatted in an RDD of [(userId, itemId, rating)].
    The split of these 15 values into training, validation, and test datasets
    is hardcoded below.

                     0   1   2   3  = itemID
    userId =    0    1  -1   1   1
                1        1  -1  -1
                2    1   1  -1
                3   -1       1
                4    1   1      -1

    0:  (0, 0, 1)
    1:  (0, 1, -1)
    2:  (0, 2, 1)
    3:  (0, 3, 1)
    4:  (1, 1, 1)
    5:  (1, 2, -1)
    6:  (1, 3, -1)
    7:  (2, 0, 1)
    8:  (2, 1, 1)
    9:  (2, 2, -1)
    10: (3, 0, -1)
    11: (3, 2, 1)
    12: (4, 0, 1)
    13: (4, 1, 1)
    14: (4, 3, -1)

    The model is trained once per candidate rank; the model with the lowest
    validation RMSE is then evaluated on the test set and its RMSE is printed.

    Raises:
        RuntimeError: if no candidate rank yields a validation RMSE that
            compares below infinity (e.g. every RMSE is NaN).
    """

    # load the data, an RDD of [(userId, itemId, rating)]
    # split data into train (60%), validation (20%), test(20%)
    # training (9): data to train the model
    # validation (3):  best performing approach using the validation data
    # test (3): estimate accuracy of the selected approach
    # TODO: possible split using sklearn's train_test_split?

    # (2, 2, -1) is user 2's rating of item 2 from the matrix above; every
    # (userId, itemId) pair appears exactly once across the three arrays
    trainingArray = [(4, 3, -1), (1, 1, 1), (3, 0, -1),
                     (4, 0, 1), (1, 2, -1), (0, 0, 1),
                     (2, 2, -1), (0, 2, 1), (1, 3, -1)]
    validationArray = [(4, 1, 1), (3, 2, 1), (2, 1, 1)]
    testArray = [(2, 0, 1), (0, 1, -1), (0, 3, 1)]

    trainingRDD = scsingleton.sc.parallelize(trainingArray)
    validationRDD = scsingleton.sc.parallelize(validationArray)
    testRDD = scsingleton.sc.parallelize(testArray)

    # run training algorithm to build the model
    # only the rank is searched; every other ALS hyperparameter is left at
    # the library default
    isExplicit = True
    ranks = [3, 5, 7]

    # the (userId, itemId) pairs to predict are the same for every rank
    validationPairsRDD = validationRDD.map(lambda x: (x[0], x[1]))

    bestModel = None
    bestValidationRmse = float("inf")

    # with validation: keep the model with the lowest validation RMSE
    for rank in ranks:
        if isExplicit:
            model = ALS.train(trainingRDD, rank)
        else:
            # TODO: figure out why trainImplicit crash
            model = ALS.trainImplicit(trainingRDD, rank, iterations=5, alpha=0.01)
        validationPredRDD = model.predictAll(validationPairsRDD)
        validationRmse = pm.calculate_rmse_using_rdd(validationRDD, validationPredRDD)
        if validationRmse < bestValidationRmse:
            bestModel = model
            bestValidationRmse = validationRmse

    # a NaN RMSE never compares below infinity, which would leave bestModel
    # unset; fail with a clear message instead of an AttributeError on None
    if bestModel is None:
        raise RuntimeError(
            "no ALS model produced a usable validation RMSE for ranks %s" % (ranks,))

    # make a prediction
    testPredRDD = bestModel.predictAll(testRDD.map(lambda x: (x[0], x[1]))).cache()

    # calculate RMSE
    testRmse = pm.calculate_rmse_using_rdd(testRDD, testPredRDD)
    print("testRmse using RDD =  %s" % (testRmse,))