def evaluate(sc, raw_user_movies, raw_hot_movies):
    movies_name = build_movies(raw_hot_movies)
    user_id_to_int = raw_user_movies.map(lambda line: line.split(',')[0]).distinct().zipWithUniqueId().collectAsMap()
    ratings = build_ratings(raw_user_movies, user_id_to_int)
    num_iterations = 10
    for rank in [10, 50]:
        for lam in [1.0, 0.01, 0.0001]:
            model = ALS.train(ratings, rank, num_iterations, lam)
            user_movies = ratings.map(lambda tokens: (tokens[0], tokens[1]))
            predictions = model.predictAll(user_movies).map(lambda r: ((r[0], r[1]), r[2]))
            print predictions.take(3)
            rates_and_preds = ratings.map(lambda tokens: ((tokens[0], tokens[1]), tokens[2])).join(predictions)
            print rates_and_preds.take(3)
            rmse = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
            print "(rank: %d, lambda: %f) Root Mean Squared Error = %f" % (rank, lam, rmse)
    for rank in [10, 50]:
        for lam in [1.0, 0.01, 0.0001]:
            for alpha in [1.0, 40.0]:
                model = ALS.trainImplicit(ratings, rank, num_iterations, lam, alpha=alpha)
                user_movies = ratings.map(lambda tokens: (tokens[0], tokens[1]))
                predictions = model.predictAll(user_movies).map(lambda r: ((r[0], r[1]), r[2]))
                rates_and_preds = ratings.map(lambda tokens: ((tokens[0], tokens[1]), tokens[2])).join(predictions)
                print rates_and_preds.take(3)
                rmse = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
                print "(rank: %d, lambda: %f, alpha: %f, implicit) Root Mean Squared Error = %f" % (rank, lam, alpha, rmse)
Example #2
    def _recommend(self, train_ratings, users):
        from pyspark.mllib.recommendation import ALS, Rating

        # Preparing the user/item mapping as integers, since Spark's ALS implementation only works with integer values
        train_ratings['user'] = train_ratings['user'].astype('category')
        train_ratings['item'] = train_ratings['item'].astype('category')
        user_cat, item_cat = train_ratings['user'].cat, train_ratings['item'].cat
        self.user_cat = user_cat
        self.item_cat = item_cat
        self.train_ratings = train_ratings

        # Training the model
        self.ratings = self.sc.parallelize(Rating(u, i, rating) for u, i, rating in zip(user_cat.codes, item_cat.codes, train_ratings.rating))
        if self.implicit:
            model = ALS.trainImplicit(self.ratings, **self.spark_args)
        else:
            model = ALS.train(self.ratings, **self.spark_args)

        # Getting predictions from the model
        self.ratings_to_predict = self.sc.parallelize((user, item) for user in users for item in item_cat.codes.unique())
        self.predictions = model.predictAll(self.ratings_to_predict).collect()
        # Presenting the recommendations as a DataFrame
        self.predictions = [(user_cat.categories[p.user], item_cat.categories[p.product], p.rating) for p in self.predictions]
        self.predictions_df = pd.DataFrame(self.predictions, columns=['user', 'item', 'rating'])
        return self.predictions_df
Example #3
    def train(self, rank=3, iterations=20, lambda_=0.01, alpha=None, blocks=-1):
        """
        train a mf model against the given parameters
        """
        if alpha:
            model = ALS.trainImplicit(self.train_data, rank, iterations, lambda_, blocks, alpha)
        else:
            model = ALS.train(self.train_data, rank, iterations, lambda_)

        return model
def main(sc):

    seed = 5L
    iterations = 10
    regularization_parameter = 0.1
    rank = 4


    data = sc.textFile("file:///Expedia/data/train1.csv")
    
    ratings = data.map(lambda l: l.split(',')).map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2]))).cache()
    
    new_data = sc.textFile("file:///Expedia/data/new_set.csv")
    
    new_ratings = new_data.map(lambda l: l.split(',')).map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2]))).cache()
    new_ratings_for_predict_RDD = new_ratings.map(lambda x: (x[0], x[1])).cache()
    
    complete_data = ratings.union(new_ratings).cache()
    
    new_ratings_model = ALS.trainImplicit(complete_data, rank, seed=seed, 
                              iterations=iterations, lambda_=regularization_parameter)
                              
    
    # This call does not work as written and needs more investigation:
    # predictAll expects an RDD of (user, product) pairs, not two scalar arguments
    #predictions = new_ratings_model.predictAll(0,'83').collect()
    predictions = new_ratings_model.predictAll(new_ratings_for_predict_RDD).map(lambda r: ((r[0], r[1]), r[2])).collect()
    # predictions is a Python list of ((user, item), rating) tuples, so sort by the rating at index 1
    recommendations = sorted(predictions, key=lambda x: x[1], reverse=True)[:2]

    print recommendations
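
    # For a single (user, item) pair, MatrixFactorizationModel exposes predict(user, product)
    # rather than predictAll. A minimal sketch of what the commented-out call above was likely
    # aiming for; the IDs 0 and 83 are illustrative assumptions, not values taken from the data:
    single_score = new_ratings_model.predict(0, 83)
    print "Predicted score for (0, 83): %s" % single_score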
Example #5
    def train(self):
        "Train the model with new data and write to file"
        user_lookup, course_lookup = self.__prepare_data()

        # send list of (user_id, course_id, rating) triples to the ML algorithm
        log.info('Loading ratings data')
        ratings_RDD_raw = self.sc.parallelize(m.UserCourse.objects)
        self.ratings_RDD = (ratings_RDD_raw
                            .filter(lambda ratings:
                                    ratings.course_review.interest is not None)
                            .map(lambda ratings:
                                 (user_lookup[str(ratings.user_id)],
                                  course_lookup[ratings.course_id],
                                  float(ratings.course_review.interest)))
                            ).cache()
        training_error, test_error = self._report_error(self.ratings_RDD)

        log.info('Training model')
        model = ALS.train(self.ratings_RDD,
                          _PARAMS['rank'],
                          _PARAMS['num_iterations'],
                          _PARAMS['reg_param'])
        log.info('Model trained!')
        model_path = os.path.join(os.path.dirname(__file__),
                                  '%s/trained_model' % c.RECOMMENDATION_DIR)
        if os.path.isdir(model_path):
            rmtree(model_path)
        model.save(self.sc, model_path)

        self._report_metrics(num_courses=self.ratings_RDD.count(),
                             training_error=training_error,
                             test_error=test_error)
def main(argv):

    Conf = (SparkConf().setAppName("recommendation"))
    sc = SparkContext(conf=Conf)
    sqlContext = SQLContext(sc)

    dirPath = "hdfs://ec2-52-71-113-80.compute-1.amazonaws.com:9000/reddit/recommend/data/sr_userCount.parquet"
    rawDF = sqlContext.read.parquet(dirPath).persist(StorageLevel.MEMORY_AND_DISK_SER)
    # argv[1] is the dump of training data in hdfs
    # argv[2] is the user preferences

    # User Hash Lookup stored into cassandra
    user_hash = rawDF.map(lambda (a,b,c): (a,hashFunction(a)))
    distinctUser = user_hash.distinct()
    userHashDF = sqlContext.createDataFrame(distinctUser,["user","hash"])
    userHashDF.write.format("org.apache.spark.sql.cassandra").options(table ="userhash", keyspace =  keyspace).save(mode="append")
    

    # Product Hash Lookup stored into cassandra
    product_hash = rawDF.map(lambda (a,b,c): (b, hashFunction(b)))
    distinctProduct = product_hash.distinct()
    productHashDF = sqlContext.createDataFrame(distinctProduct,["product","hash"])
    productHashDF.write.format("org.apache.spark.sql.cassandra").options(table ="producthash", keyspace =  keyspace).save(mode="append")

    # Ratings for training
    # ALS requires integer IDs, so the string user/product IDs are run through a Java-style
    # hash (hashFunction) and stored as Rating objects for the algorithm to consume
    ratings = rawDF.map(lambda (a,b,c) : Rating(hashFunction(a),hashFunction(b),float(c)))

    
    model = ALS.trainImplicit(ratings,10,10,alpha=0.01,seed=5)
    model.save(sc, "hdfs://ec2-52-71-113-80.compute-1.amazonaws.com:9000/reddit/recommend/model")

    sc.stop()
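
# hashFunction is defined elsewhere in the original project; a minimal sketch of one
# possible implementation, assuming the intent described above (a Java String.hashCode-style
# 32-bit hash so string IDs become integers that ALS can consume):
#
#   def hashFunction(s):
#       h = 0
#       for ch in s:
#           h = (31 * h + ord(ch)) & 0xFFFFFFFF
#       return h - 0x100000000 if h >= 0x80000000 else h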
Example #7
def alq_spark(A, k, sc, **kwargs):
    """
    Args:
    
    - A: sign matrix (csr_matrix)
    - k: number of clusters
    - sc: the spark context
    - kwargs: parameters for ALS.train except for ratings
        https://spark.apache.org/docs/1.5.1/api/python/pyspark.mllib.html#pyspark.mllib.recommendation.ALS.train

    Return:

    X: np.ndarray (n x k)
    Y: np.ndarray (k x n)
    """
    edges = indexed_entries(A)
    
    edges_rdd = sc.parallelize(edges)
    model = ALS.train(edges_rdd, rank=k, **kwargs)

    u_ft = model.userFeatures()
    p_ft = model.productFeatures()

    X = u_ft.sortByKey(ascending=True).collect()
    Y = p_ft.sortByKey(ascending=True).collect()

    X = np.array(list(zip(*X))[1])
    Y = np.transpose(np.array(list(zip(*Y))[1]))

    return X, Y
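
# A minimal usage sketch (not part of the original example), assuming an existing
# SparkContext `sc` and that indexed_entries(A) yields (row, col, value) triples:
#
#   from scipy.sparse import csr_matrix
#   A = csr_matrix(np.array([[1, -1, 0],
#                            [-1, 1, 1],
#                            [0, 1, -1]]))
#   X, Y = alq_spark(A, k=2, sc=sc, iterations=10, lambda_=0.01, seed=0)
#   print X.shape, Y.shape  # expected: (3, 2) and (2, 3)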
Example #8
def grid_search(train_df, test_df, X_test_df, y_test):
    ranks = [6]  # , 8, 12, 18]
    lambdas = list(np.arange(0.1, 0.5, 0.1))
    numIters = [20]
    bestModel = None
    bestValidationRmse = float("inf")
    bestRank = 0
    bestLambda = -1.0
    bestNumIter = -1

    for rank, lmbda, numIter in itertools.product(ranks, lambdas, numIters):
        model = ALS.train(train_df, rank, numIter, lmbda)
        validationRmse = computeRMSE(model, test_df, X_test_df, len(y_test))
        print "RMSE (validation) = %f for the model trained with " % validationRmse + "rank = %d, lambda = %.1f, and numIter = %d." % (
            rank,
            lmbda,
            numIter,
        )

        if validationRmse < bestValidationRmse:
            bestModel = model
            bestValidationRmse = validationRmse
            bestRank = rank
            bestLambda = lmbda
            bestNumIter = numIter

    testRmse = computeRMSE(bestModel, test_df, X_test_df, len(y_test))

    # evaluate the best model on the test set
    print "The best model was trained with rank = %d and lambda = %.1f, " % (
        bestRank,
        bestLambda,
    ) + "and numIter = %d, and its RMSE on the test set is %f." % (bestNumIter, testRmse)

    return bestModel
Example #9
def main():
    training_data = sc.textFile(training_inputs)
    testing_data = sc.textFile(testing_inputs)

    training_ratings = training_data.map(get_tuple).cache()
    testing_ratings = testing_data.map(get_tuple).cache()
    testing_all = testing_ratings.map(lambda (uid, mid, rating): (uid, mid)).cache()
    ratings = testing_ratings.map(to_Rating)


    ranks = [2, 4, 8, 16, 32, 64, 128, 256]
    reg_params = [0.1, 0.01]

    for i in range(len(reg_params)):
        RMSES = []
        for rank in ranks:
            model = ALS.train(training_ratings, rank=rank, lambda_=reg_params[i], seed=10)
            predictions = model.predictAll(testing_all).map(lambda r: ((r[0], r[1]), r[2]))
            ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
            MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
            RMSE = math.sqrt(MSE)
            RMSES.append(RMSE)
        plt.plot(range(len(ranks)), RMSES, label=str(reg_params[i]))

    plt.xticks(range(len(ranks)), ranks, size='small')
    plt.legend()
    plt.show()
Example #10
 def __train_model(self, ratings_RDD):
     """Train the ALS model with the current dataset
     """
     model = ALS.train(ratings_RDD, self.rank, seed=self.seed,
                            iterations=self.iterations, lambda_=self.regularization_parameter)
     
     return model
Example #11
    def train(self, rank, iterations=10, lambda_=0.01, seed=0, **kwargs):
        """
        Train the model.

        Parameters
        ----------
        rank : int
            The number of factors in the underlying model.  Generally, larger numbers of factors
            lead to better models, but increase the memory required.  A rank in the range of 10 to 200
            is usually reasonable.

        iterations : int, optional
            The number of iterations to perform.  With each iteration, the model improves.  ALS
            typically converges quickly, so a value of 10 is recommended.

        lambda_ : float, optional
            This parameter controls regularization, which in turn controls overfitting.  Higher values of
            lambda apply more regularization.  The appropriate value depends on the problem and needs
            to be tuned by train/test techniques, which measure overfitting.

        Returns
        -------
        out : model
            A RecommenderModel.  This can be used to make predictions on how a user would rate an item.
        """

        ratings = self._prepare_ratings()
        model = ALS.train(ratings.to_rdd(),
                          rank, 
                          iterations=iterations, 
                          lambda_=lambda_, 
                          seed=seed, 
                          **kwargs)
        return MatrixFactorizationModel(model, self.ratings, self.user_col, self.item_col, self.rating_col)
def als(data):
    train, test = data.randomSplit(weights=[0.8, 0.2])
    X_train = train.map(lambda r : Rating(r[0], r[1], r[2]))
    y = test.map(lambda r : ((r[0], r[1]), r[2]))
    X_test = test.map(lambda r : (r[0], r[1]))
    rank = 7
    X_train.cache()
    X_test.cache()
    lambdas = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]
    numIterations = 10
    nonnegative=True
    bestModel = None
    error = float('Inf')
    errors = []
    #Use ALS to predict play time for test users and choose the best parameter for lambda
    for lmbda in lambdas:
        model = ALS.train(X_train, rank, numIterations, lmbda, nonnegative=nonnegative)
        y_hat = model.predictAll(X_test).map(lambda r : ((r[0], r[1]), r[2]))
        ratesAndPreds = y.join(y_hat)
        MSE = ratesAndPreds.map(lambda r : ((r[1][0]) - (r[1][1]))**2).mean()
        errors.append(MSE)
        if MSE < error:
            bestModel = model
            error = MSE
    #Plot mean square error v.s. lambda
    plt.plot(lambdas, errors, 'ro')
    plt.xlabel(r'$\lambda$')
    plt.ylabel(r'$MSE$')
    plt.title(r'MSE v.s. $\lambda$')
    plt.savefig('cross_validation_p.png')
    #Make Prediction by using the best model
    y_hat = bestModel.predictAll(X_test).map(lambda r: (r[0], r[1], r[2]))
    y_hat.map(toCVSLine).saveAsTextFile('prediction')
    return bestModel, error
def main():
    """ Train and evaluate an ALS recommender.
    """
    # Set up environment
    sc = SparkContext("local[*]", "RecSys")

    # Load and parse the data
    data = sc.textFile("./data/ratings.dat")
    ratings = data.map(parse_rating)

    # Build the recommendation model using Alternating Least Squares
    rank = 10
    iterations = 20
    model = ALS.train(ratings, rank, iterations)

    movies = sc.textFile("./data/movies.dat")\
               .map(parse_movie)
    # Evaluate the model on training data
    testdata = ratings.map(lambda p: (p[0], p[1]))
    predictions = model.predictAll(testdata)\
                       .map(lambda r: ((r[0], r[1]), r[2]))
    rates_and_preds = ratings.map(lambda r: ((r[0], r[1]), r[2]))\
                             .join(predictions)
    MSE = rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
    print("Mean Squared Error = " + str(MSE))
def find_best_model(data):
    global bestRank
    global bestLambda
    global bestNumIter
    bestRank = 0
    bestLambda = -1.0
    bestNumIter = -1
    ranks = [8, 12]
    lambdas = [0.1, 10.0]
    numIters = [10, 20]
    min_error = float('inf')
    training, validation, test = data.randomSplit([0.6, 0.2, 0.2], 6)
    for rank, lmbda, numIter in itertools.product(ranks, lambdas, numIters):
        ALS.checkpointInterval = 2
        training_data = training.map(lambda xs: [int(x) for x in xs])
        model = ALS.train(training_data, rank, numIter, lmbda)
        validation_data = validation.map(lambda p: (int(p[0]), int(p[1])))
        predictions = model.predictAll(validation_data).map(lambda r: ((r[0], r[1]), r[2]))
        ratings_and_predictions = validation.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
        error = sqrt(ratings_and_predictions.map(lambda r: (r[1][0] - r[1][1])**2).mean())
        print 'For rank %s the RMSE is %s' % (rank, error)
        if error < min_error:
            min_error = error
            bestRank = rank
            bestLambda = lmbda
            bestNumIter = numIter
    print 'The best model was trained with rank %s' % bestRank
Example #15
 def __train_model(self):
     """Train the ALS model with the current dataset
     """
     logger.info("Training the ALS model...")
     self.model = ALS.train(self.ratings_RDD, self.rank, seed=self.seed,
                            iterations=self.iterations, lambda_=self.regularization_parameter)
     logger.info("ALS model built!")
Example #16
def fit_final_model(train):
    #model params
    iterations = 20
    reg = 0.0875
    rank = 6
    model = ALS.train(train.rdd.map(lambda x: (x[0], x[1], x[2])), rank=rank, nonnegative=True, iterations=iterations, lambda_=reg)
    return model
Example #17
def model_param_sweep(train, test):
    #model params
    iterations = 20
    regularization_param_list = np.linspace(0.05, 0.2, 5)

    #params used in keeping track of error between different ranks
    rank_list = [4, 6, 8]
    errors = np.zeros(len(regularization_param_list)*len(rank_list))
    err = 0
    min_error = float('inf')
    max_class_rate = 0
    best_rank = -1
    best_iteration = -1

    for rank in rank_list:
        for reg in regularization_param_list:
            model = ALS.train(train.rdd.map(lambda x: (x[0], x[1], x[2])), rank=rank, nonnegative=True, iterations=iterations, lambda_=reg)
            predictions =  model.predictAll(test.rdd.map(lambda r: (r[0], r[1]) )).map(lambda x: ((int(x[0]), int(x[1])), float(x[2])) )
            rates_and_preds = test.rdd.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
            correct_count = rates_and_preds.filter(lambda r:( abs(r[1][0] - r[1][1]) < 1) or (r[1][0] < 6 and r[1][1] < 6) ).count()
            total_count = rates_and_preds.count()
            class_rate = correct_count*1./total_count
            error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
            errors[err] = error
            err += 1
            print 'For rank=%s, regParam=%s the RMSE is %s with a correct classification rate of %0.3f' % (rank, reg, error, class_rate)
            if class_rate > max_class_rate:
                max_class_rate = class_rate
                best_rank = (rank, reg)
    print 'The best model was trained with (rank, regParam): %s and had class rate %0.3f' %(str(best_rank), max_class_rate)
def build_ALS_model(ratings):
    # Build the recommendation model using Alternating Least Squares
    rank = 10
    numIterations = 20
    model = ALS.train(ratings, rank, numIterations)

    return model
def train_and_predict():

    training = sc.textFile('member_item_file').map(parseRating).cache()
    #now train the model using ALS
    rank=10
    number_of_iterations = 10
    model = ALS.train(training, rank, number_of_iterations)
    print model
Example #20
def train_als(data):
    # map ratings into Ratings object comprised of [user, movie, rating]
    data = data.map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))

    rank = 10
    numIterations = 10
    model = ALS.train(data, rank, numIterations)
    return model, data
Example #21
def cross_validation(training, validation, test, candidates, id_title_map, ranks, lambdas, numIters, alphas):
    # train models and evaluate them on the validation set

    result_dict = {}
    result_template = "rank:%d  iters:%d  lambda: %f"
    bestModel = None
    bestValidationRmse = float("inf")
    bestRank = 0
    bestLambda = -1.0
    bestNumIter = -1
    numTraining = training.count()
    numValidation = validation.count()
    numTest = test.count()
    if  not IMPLICIT:
        alphas = [1.0]
    for rank, lmbda, numIter, alpha in itertools.product(ranks, lambdas, numIters, alphas):
        if IMPLICIT:
            model = ALS.trainImplicit(training, rank, iterations=numIter, lambda_=lmbda, alpha=alpha, nonnegative=True)
        else:
            model = ALS.train(training, rank, iterations=numIter, lambda_=lmbda, nonnegative=True)
        validationRmse = 0.0 #computeRmse(model, validation, numValidation)
        print "RMSE (validation) = %f for the model trained with " % validationRmse + \
              "rank = %d, lambda = %.4f, and numIter = %d and alpha=%f." % (rank, lmbda, numIter, alpha)

        qe_results = qualitative_evaluation(model, candidates, id_title_map)

        if (validationRmse < bestValidationRmse):
            bestModel = model
            bestValidationRmse = validationRmse
            bestRank = rank
            bestLambda = lmbda
            bestNumIter = numIter
        result_dict[result_template % (rank, numIter, lmbda)] = validationRmse
    testRmse = 0.0 #computeRmse(bestModel, test, numTest)
    # evaluate the best model on the test set
    print "The best model was trained with rank = %d and lambda = %.1f, " % (bestRank, bestLambda) \
      + "and numIter = %d, and its RMSE on the test set is %f." % (bestNumIter, testRmse)
    result_dict['BEST Model on Test:' + result_template % (bestRank, bestNumIter, bestLambda)] = testRmse
    # compare the best model with a naive baseline that always returns the mean rating
    meanRating = training.union(validation).map(lambda x: x[2]).mean()
    baselineRmse = sqrt(test.map(lambda x: (meanRating - x[2]) ** 2).reduce(add) / numTest)
    improvement = (baselineRmse - testRmse) / baselineRmse * 100
    print "The best model improves the baseline by %.2f" % (improvement) + "%."
    result_dict['BEST gain over baseline'] = improvement
    return bestModel, result_dict
Example #22
def run():
    log.info("train model start...")
    sql_user_rating = "select uid,pid,rating from user_rating"
    cur.execute(sql_user_rating)
    rdd_user_rating = sc.parallelize(cur.fetchall())

    ratings = rdd_user_rating.map(lambda x:Rating(int(x[0]),int(x[1]),float(x[2])))
    mod = ALS.train(ratings,50)
    return mod
def compare_recommand_result_byMSE(rank,inter,lambda_value):
    rating_data=row_data_rdd.map(lambda x:x.split("\t")).\
        map(lambda raw_rating_data:Rating(int(raw_rating_data[0]),int(raw_rating_data[1]),float(raw_rating_data[2])))
    rating_model=ALS.train(rating_data,rank,inter,lambda_value)
    user_product=rating_data.map(lambda r:(r[0],r[1]))
    predictions =rating_model.predictAll(user_product).map(lambda r: ((r[0], r[1]), r[2]))
    ratesAndPreds=rating_data.map(lambda r:((r[0], r[1]), r[2])).join(predictions)
    MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
    print "Mean Squared Error = " + str(MSE)
Example #24
 def _report_error(self, data_set):
     training_set, test_set = data_set.randomSplit([7, 3], seed=0L)
     model = ALS.train(training_set,
                       _PARAMS['rank'],
                       _PARAMS['num_iterations'],
                       _PARAMS['reg_param'])
     training_error = self._get_rsme(model, training_set)
     test_error = self._get_rsme(model, test_set)
     return training_error, test_error
def model(sc, raw_user_movies, raw_hot_movies):
    movies_name = build_movies(raw_hot_movies)
    user_id_to_int = raw_user_movies.map(lambda line: line.split(',')[0]).distinct().zipWithUniqueId().collectAsMap()
    print user_id_to_int['adamwzw']
    user_int_to_id = {v: k for k, v in user_id_to_int.iteritems()}
    rating_data = build_ratings(raw_user_movies, user_id_to_int)
    model = ALS.train(rating_data, 50, 10, 0.0001)
    print model.userFeatures().collect()[:2]
    for user_int in xrange(1, 30):
        check_recommend_result(user_int, raw_user_movies, movies_name, user_int_to_id, model)
def trainModel(limit,data,rank,num_iterations):
	save_file = "models/"+str(limit)+"rank"+str(rank)+"iterations"+str(num_iterations)
	if isdir(save_file):
		print("Rank "+str(rank)+" and Iterations "+str(num_iterations)+" Model already exists, loading...")
		model = MatrixFactorizationModel.load(sc, save_file)
	else:
		print("Model does not exist, training ALS with rank "+str(rank)+" and "+str(num_iterations)+" iterations")
		model = ALS.train(data, rank, num_iterations)
		print("Saving new model")
		model.save(sc,save_file)
	return model
def recommand_movies(user_id):
    rating_data=load_rating_data()
    #rating_model=ALS.train(rating_data,50,10,0.1)
    rating_model=ALS.train(rating_data,100,10,0.1)
    # Predict a specific user's rating for a specific movie
    movie_id = 123
    predict_result = rating_model.predict(user_id, movie_id)
    print "Predicted rating for user %s on movie %s: %s" % (user_id, movie_id, predict_result)
    # Recommend 10 products for this user
    recommand_result = rating_model.recommendProducts(user_id, 10)
    return recommand_result
 def train_model(self):
     """
     rank: the number of factors in the ALS model, i.e. the number of latent features in the low-rank approximation. More factors are generally better, but they directly increase the memory needed to train and store the model, especially when there are many users and items, so in practice this parameter trades model quality against system cost. A reasonable value is usually between 10 and 200.
     iterations: the number of iterations to run. ALS reduces the reconstruction error of the rating matrix on every iteration and usually converges to a reasonably good model after only a few iterations, so in most cases many iterations are unnecessary (around 10 is generally fine).
     lambda: controls regularization and therefore overfitting. The higher the value, the stronger the regularization. The appropriate value depends on the size, features and sparsity of the data and, as with other machine learning models, should be tuned by cross-validation on held-out data.
     After trying a few settings, rank=50, iterations=10 and lambda=0.01 worked slightly better:
     """
     self.model = ALS.train(self.ratings, 50, 10, 0.01)
     user_features = self.model.userFeatures()
     item_features = self.model.productFeatures()
     self.user_num = user_features.count()   # number of users
     self.item_num = item_features.count()   # number of items
Example #29
def test_rmse():
    # TODO: revise so that it takes the user's inputs instead of hardcoded values

    movies_schema = None
    ratings_schema = None

    # load the schemas
    with open("movielens_20m_movies_schema.json", "r") as json_schema_file:
        movies_schema = StructType.fromJson(json.load(json_schema_file))

    with open("movielens_20m_ratings_schema.json", "r") as json_schema_file:
        ratings_schema = StructType.fromJson(json.load(json_schema_file))

    # create a hdfs directory
    os.system("hdfs dfs -mkdir datasets")

    # load the json file into the hdfs directory
    os.system("hdfs dfs -put movielens_10m_ratings.json.gz datasets/movielens_10m_ratings.json.gz")

    # create a DataFrame based on the content of the json file
    ratingsDF = scsingleton.sqlCtx.read.json("hdfs://localhost:9000/datasets/movielens_10m_ratings.json.gz", schema=ratings_schema)
    # explicitly repartition RDD after loading so that more tasks can run on it in parallel
    # by default, the parallelism follows sc.defaultParallelism, an estimate of the number of cores across all of the machines in your cluster
    ratingsDF = ratingsDF.repartition(scsingleton.sc.defaultParallelism * 3)    

    # parse ratings DataFrame into an RDD of [(userId, itemId, rating)]
    ratingsRDD = ratingsDF.map(lambda row: (row.user_id, row.movie_id, row.rating))
    ratingsRDD.cache()

    # split data into train (60%), test (40%)
    # TODO: add validation in the future? train (60%), validation (20%), test(20%)?
    trainingRDD, testRDD = ratingsRDD.randomSplit([0.6, 0.4])
    trainingRDD.cache()
    testRDD.cache()

    # run training algorithm to build the model
    # without validation
    with Timer() as t:
        model = ALS.train(trainingRDD, rank=3)
    print "ALS.train(trainingRDD, rank=3): %s seconds" % t.secs

    # make a prediction
    with Timer() as t:
        testPredRDD = model.predictAll( testRDD.map( lambda x: (x[0], x[1]) ) ).cache()
    print "testPredRDD: %s seconds" % t.secs

    # calculate RMSE
    with Timer() as t:
        testRmse = pm.calculate_rmse_using_rdd(testRDD, testPredRDD)
    print "testRmse: %s seconds" % t.secs
    print "testRmse", testRmse

    return
Example #30
    def __train_model(self, rank, seed, iterations, reg):
        logger.info("Training Movie Rec Engine (ALS)...")
        model = ALS.train(self.ratings_RDD, rank, seed=seed,
                               iterations=iterations, lambda_=reg)
        logger.info("Movie Rec Engine Trained!")

        self.rank = rank
        self.seed = seed
        self.iterations = iterations
        self.reg = reg

        return model
Example #31
seed = 5
iterations = 10
regularization_parameter = 0.1
ranks = [4]
errors = [0, 0, 0]
err = 0
tolerance = 0.02

min_error = float('inf')
best_rank = -1
best_iteration = -1
for rank in ranks:
    model = ALS.train(training_RDD,
                      rank,
                      seed=seed,
                      iterations=iterations,
                      lambda_=regularization_parameter)
    predictions = model.predictAll(validation_for_predict_RDD).map(
        lambda r: ((r[0], r[1]), r[2]))
    rates_and_preds = validation_RDD.map(
        lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
    error = math.sqrt(
        rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
    errors[err] = error
    err += 1
    print('For rank %s the RMSE is %s' % (rank, error))
    if error < min_error:
        min_error = error
        best_rank = rank
    print("Training: %d, validation: %d, test: %d" %
          (numTraining, numValidation, numTest))

    # train models and evaluate them on the validation set

    ranks = [8, 12]
    lambdas = [0.1, 10.0]
    numIters = [10, 20]
    bestModel = None
    bestValidationRmse = float("inf")
    bestRank = 0
    bestLambda = -1.0
    bestNumIter = -1

    for rank, lmbda, numIter in itertools.product(ranks, lambdas, numIters):
        model = ALS.train(training, rank, numIter, lmbda)
        validationRmse = computeRmse(model, validation, numValidation)
        print("RMSE (validation) = %f for the model trained with " % validationRmse + \
        "rank = %d, lambda = %.1f, and numIter = %d." % (rank, lmbda, numIter))
        if (validationRmse < bestValidationRmse):
            bestModel = model
            bestValidationRmse = validationRmse
            bestRank = rank
            bestLambda = lmbda
            bestNumIter = numIter

    testRmse = computeRmse(bestModel, test, numTest)

    # evaluate the best model on the test set
    print("The best model was trained with rank = %d and lambda = %.1f, " % (bestRank, bestLambda) \
    + "and numIter = %d, and its RMSE on the test set is %f." % (bestNumIter, testRmse))
Example #33
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

jdbcDriver = 'com.mysql.jdbc.Driver'
jdbcUrl    = 'jdbc:mysql://%s:3306/%s?user=%s&password=%s' % (CLOUDSQL_INSTANCE_IP, CLOUDSQL_DB_NAME, CLOUDSQL_USER, CLOUDSQL_PWD)

# checkpointing helps prevent stack overflow errors
sc.setCheckpointDir('checkpoint/')

# Read the ratings and accommodations data from Cloud SQL
dfRates = sqlContext.read.format('jdbc').options(driver=jdbcDriver, url=jdbcUrl, dbtable='Rating', useSSL='false').load()
dfAccos = sqlContext.read.format('jdbc').options(driver=jdbcDriver, url=jdbcUrl, dbtable='Accommodation', useSSL='false').load()
print("read ...")

# train the model
model = ALS.train(dfRates.rdd, 20, 20) # you could tune these numbers, but these are reasonable choices
print("trained ...")

# use this model to predict what the user would rate accommodations that she has not rated
allPredictions = None
for USER_ID in range(0, 100):
  dfUserRatings = dfRates.filter(dfRates.userId == USER_ID).rdd.map(lambda r: r.accoId).collect()
  rddPotential  = dfAccos.rdd.filter(lambda x: x[0] not in dfUserRatings)
  pairsPotential = rddPotential.map(lambda x: (USER_ID, x[0]))
  predictions = model.predictAll(pairsPotential).map(lambda p: (str(p[0]), str(p[1]), float(p[2])))
  predictions = predictions.takeOrdered(5, key=lambda x: -x[2]) # top 5
  print("predicted for user={0}".format(USER_ID))
  if (allPredictions == None):
    allPredictions = predictions
  else:
    allPredictions.extend(predictions)
training.count()
705486

test.count()
176359

# 1. Build the recommendation model using Alternating Least Squares

# Call the ALS.train method to train the model

from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

rank = 10
numIterations = 10
model = ALS.train(training, rank, numIterations)

model
<pyspark.mllib.recommendation.MatrixFactorizationModel object at 0x7f7afa38e850>

# Evaluate the model on training data

testdata = test.map(lambda r: (r[0],r[1]))

type(testdata)
<class 'pyspark.rdd.PipelinedRDD'>

testdata.take(5)
[(2866, 90950), (2866, 83908), (3997, 137454), (4101, 376365), (4101, 320620)]
# Making predictions
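
# The prediction step is not shown in this snippet; a minimal continuation sketch following
# the same pattern as the other examples (predict on the held-out pairs, join with the true
# ratings, and compute the MSE):
predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
ratesAndPreds = test.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
print("Mean Squared Error = " + str(MSE))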
Example #35

# Define spark session
spark = SparkSession \
    .builder \
    .appName("Spark Application") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()
sc = spark.sparkContext
rawData = sc.textFile("../resource/ml-100k/u.data")
print(rawData.first())
rawRatings = rawData.map(lambda s: s.split("\t")[0:3])
print(rawRatings.first())
ratings = rawRatings.map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))
print(ratings.first())
model = ALS.train(ratings, 50, 10)
# model.userFeatures().collect()
parsedData = model.productFeatures().map(lambda tuple: tuple[1])
print(parsedData.take(2))
# Build the model (cluster the data)
clusters = KMeans.train(parsedData,
                        5,
                        maxIterations=10,
                        initializationMode="random")

# Within Set Sum of Squared Errors
WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))

# Save and load model using KmeansModel of MLlib
# clusters.save(sc, "../target/KMeansModel")
    ra = 0
    lam = 0

    min_error = float('inf')
    best_lambda = -1
    best_lambda_index = -1
    best_model = None
    best_rank = -1
    best_rank_index = -1

    # Loop over all candidate values for lambda and rank to find the parameters that minimize the RMSE
    for rank in ranks:
        for regParam in regularization_parameter:
            model = ALS.train(training_RDD,
                              rank,
                              seed=seed,
                              iterations=iterations,
                              lambda_=regParam)
            predictions = model.predictAll(validation_for_predict_RDD).map(
                lambda r: ((r[0], r[1]), r[2]))
            rates_and_preds = validation_RDD.map(lambda r: (
                (int(r[0]), int(r[1])), float(r[2]))).join(predictions)
            error = np.sqrt(
                rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
            errors[ra][lam] = error
            print 'For lambda %s and rank %s the RMSE is %s' % (regParam, rank,
                                                                error)
            if error < min_error:
                min_error = error
                best_lambda = regParam
                best_model = model
def test(rdd):

    if len(rdd.collect()) == 0:

        pass

    else:
        new_user_ratings = []
        msgValue = rdd.map(lambda x: json.loads(x[1])).collect()
        print(msgValue)
        print(type(msgValue[0]))
        new_user_ID = msgValue[0]['user_id']
        song_id = msgValue[0]['shoes_id']
        rating = msgValue[0]['rating']

        print(new_user_ID, song_id, rating)
        data = (new_user_ID, song_id, rating)

        new_user_ratings.append(data)



        print(new_user_ratings)

        new_user_ratings_RDD = sc.parallelize(new_user_ratings)


        print('New user ratings: %s' % new_user_ratings_RDD.take(3))


        # merge new data into old data

        complete_data_with_new_ratings_RDD = complete_ratings_data.union(new_user_ratings_RDD)

        # train model again with new data

        from time import time

        t0 = time()

        new_ratings_model = ALS.train(complete_data_with_new_ratings_RDD, best_rank, seed=seed, iterations=iterations, lambda_=regularization_parameter)

        tt = time() - t0

        print("New model trained in %s seconds" % round(tt,3))


        #print(new_user_ratings)
        #print(type(new_user_ratings))
        #print(map(lambda x: x[1], new_user_ratings))
        try:
            new_user_ratings_ids = map(lambda x: x[1], new_user_ratings) # get just music IDs
        except:
            print(new_user_ratings,new_user_ratings_ids)

        # keep just those not on the ID list

        new_user_unrated_music_RDD = (complete_music_data.filter(lambda x: x[0] not in new_user_ratings_ids).map(lambda x: (new_user_ID, x[0])))

        # Use the input RDD, new_user_unrated_music_RDD, with new_ratings_model.predictAll() to predict new ratings for the musics

        new_user_recommendations_RDD = new_ratings_model.predictAll(new_user_unrated_music_RDD)


        # get every prediction result for the new user

        new_user_recommendations_rating_RDD = new_user_recommendations_RDD.map(lambda x: (x.product, x.rating))

        # merge data with music info

        new_user_recommendations_rating_title_and_count_RDD = new_user_recommendations_rating_RDD.join(complete_music_titles).join(music_rating_counts_RDD)

        new_user_recommendations_rating_title_and_count_RDD.take(3)

        # reshape each record into (title, predicted rating, number of ratings)

        new_user_recommendations_rating_title_and_count_RDD = new_user_recommendations_rating_title_and_count_RDD.map(lambda r: (r[1][0][1], r[1][0][0], r[1][1]))

        # keep titles with at least 25 ratings and take the top 25 by predicted rating

        top_musics = new_user_recommendations_rating_title_and_count_RDD.filter(lambda r: r[2]>=25).takeOrdered(25, key=lambda x: -x[1])

        # print('TOP recommended musics (with more than 25 reviews):\n%s' % '\n'.join(map(str, top_musics)))

        # result_r = r.hset('shoes', new_user_ID, str(top_musics))


        # j = {'user': new_user_ID, 'music': top_musics}
        # result_m = shoe_recommend.insert_one(j)

        new_user_ratings = []

        return sc.parallelize(top_musics)
Example #38
from pyspark.sql import HiveContext
from pyspark.mllib.evaluation import RankingMetrics
from pyspark.mllib.recommendation import ALS, Rating
from pyspark import SparkContext
import pandas as pd
import numpy as np
sc = SparkContext()
train = sc.textFile("als_visible.txt")
test = sc.textFile("als_predict.txt")

train_ratings = train.map(lambda l: l.split(',')).map(
    lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))
test_ratings = test.map(lambda l: l.split(',')).map(
    lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))
rank = 30
numIterations = 10
model = ALS.trainImplicit(train_ratings, rank, numIterations)
model.save(sc, "target/tmprank30/myCollaborativeFilter")
#sameModel = MatrixFactorizationModel.load(sc, "target/tmp/myCollaborativeFilter")

test_users = test_ratings.map(lambda x: x.user).collect()
test_users = list(set(test_users))
test_users = test_users[0:10000]

recs = {}
i = 0
for u in test_users:
    i += 1
    rec = model.recommendProducts(u, 200)
    recs[u] = list(map(lambda r: r[1], rec))
    if i % 100 == 0:
        print(i)
if __name__ == "__main__":
    sc = SparkContext(appName="Ranking Metrics Example")

    # Several of the methods available in scala are currently missing from pyspark
    # $example on$
    # Read in the ratings data
    lines = sc.textFile("data/mllib/sample_movielens_data.txt")

    def parseLine(line):
        fields = line.split("::")
        return Rating(int(fields[0]), int(fields[1]), float(fields[2]) - 2.5)

    ratings = lines.map(lambda r: parseLine(r))

    # Train a model to predict user-product ratings
    model = ALS.train(ratings, 10, 10, 0.01)

    # Get predicted ratings on all existing user-product pairs
    testData = ratings.map(lambda p: (p.user, p.product))
    predictions = model.predictAll(testData).map(
        lambda r: ((r.user, r.product), r.rating))

    ratingsTuple = ratings.map(lambda r: ((r.user, r.product), r.rating))
    scoreAndLabels = predictions.join(ratingsTuple).map(lambda tup: tup[1])

    # Instantiate regression metrics to compare predicted and actual ratings
    metrics = RegressionMetrics(scoreAndLabels)

    # Root mean squared error
    print("RMSE = %s" % metrics.rootMeanSquaredError)
# train model
#training_RDD, validation_RDD = ratings_RDD.randomSplit([8, 2], 0)
#validation_for_predict_RDD = validation_RDD.map(lambda x: (x[0], x[1]))
#print(training_RDD.collect().take(3))

seed = 5
iterations = 12
regularization_parameter = 0.1
rank = 4
#errors = [0, 0, 0]
#err = 0
#tolerance = 0.02

training_RDD, test_RDD = ratings_RDD.randomSplit([8, 2], 0)
complete_model = ALS.train(training_RDD, rank, seed=None, iterations=iterations, lambda_=regularization_parameter,\
                    nonnegative = True)

test_for_predict_RDD = test_RDD.map(lambda x: (x[0], x[1]))

predictions = complete_model.predictAll(test_for_predict_RDD).map(
    lambda r: ((r[0], r[1]), r[2]))
rates_and_preds = test_RDD.map(
    lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
mae = rates_and_preds.map(lambda r: (abs(r[1][0] - r[1][1]))).mean()
rmse = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
logs = rates_and_preds.map(lambda r:
                           (math.log(r[1][1] + 1) - math.log(r[1][0] + 1)))
rmsle = math.sqrt(logs.map(lambda x: x**2).mean())

print("The MAE is {:G}".format(mae))
print("The RMSE is {:G}".format(rmse))
Example #41
def main(data_source, users_source, apps_source, output, number_recs):

    users_data = {}
    with open(users_source) as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            users_data[int(row[0])] = row[1]

    apps_data = {}
    with open(apps_source) as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            apps_data[int(row["id"])] = row["app"]

    # This should be changed if running on cluster
    conf = SparkConf().setMaster("local[*]").setAppName("AptoideALS")

    sc = SparkContext(conf=conf)
    sc.setLogLevel("OFF")
    # Load and parse the data
    data = sc.textFile(data_source)
    ratings = data.map(lambda l: l.split(','))\
        .map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2]))).cache()

    # Build the recommendation model using Alternating Least Squares
    seed = 5L
    iterations = 10
    # lambda_ is a basic L2 regularizer to reduce overfitting
    regularization_parameter = 0.1
    # Number of latent features used to describe items
    rank = 50
    # alpha is the confidence weight placed on observed interactions (how strongly we
    # believe the user likes an item they interacted with)
    alpha = 100.0

    model = ALS.trainImplicit(ratings,
                              rank,
                              seed=seed,
                              iterations=iterations,
                              lambda_=regularization_parameter,
                              alpha=alpha)

    # Evaluate the model on training data
    testdata = ratings.map(lambda p: (p[0], p[1]))
    predictions = model.predictAll(testdata).map(lambda r:
                                                 ((r[0], r[1]), r[2]))
    ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join(
        predictions)
    MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
    print("Mean Squared Error = " + str(MSE))

    users_names = sc.broadcast(users_data)
    apps_names = sc.broadcast(apps_data)

    def construct_string(x):
        item = "\"{}\"".format(str(apps_names.value[x[0]]))
        recs = [("\"{}\"".format(str(users_names.value[y[1]])), y[2])
                for y in x[1]]
        string = "{},{}".format(item, str(recs))
        return string.replace("[", "").replace("]",
                                               "").replace("\\", "").replace(
                                                   "'", "").replace(" ", "")

    recRDD = model.recommendUsersForProducts(
        int(number_recs)).map(construct_string)

    recRDD.saveAsTextFile(output)
Example #42
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
from pyspark import StorageLevel
import yaml

config = yaml.safe_load(open('../config.yaml'))

newratingsRDD = newratings.rdd
ratingsML = newratingsRDD.map(
    lambda l: Rating(int(l[6]), int(l[8]), float(l[3])))

rank = 100
numIterations = 20
model = ALS.train(ratingsML, rank, numIterations)

# Evaluate the model on training data

testdata = ratingsML.map(lambda p: (p[0], p[1]))
predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
ratesAndPreds = ratingsML.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
print("Mean Squared Error = " + str(MSE))

#0.327520844251 -- rank = 10, numIterations = 10
#0.0835593328909 -- rank = 50, numIterations = 10

# Save and load model

model.save(sc, config['S3_model'])

model = MatrixFactorizationModel.load(sc, config['S3_model'])
Example #43
RANK = 48


def cal_ndcg(model, test_data, k):
    test = test_data.map(lambda p: (p[0], p[1]))
    ret = model.predictAll(test) \
        .map(lambda r: (r.user, (r.product, r.rating))) \
        .groupByKey() \
        .mapValues(lambda l: sorted(l, key=lambda x: x[1], reverse=True)) \
        .mapValues(lambda l: [x[0] for x in l])
    true = test_data.filter(lambda p: p[2] == 1.0).map(lambda r:
                                                       (r[0], [r[1]]))
    predictionAndLabels = ret.join(true).map(lambda r:
                                             (r[1][0], list(r[1][1])))
    metrics = RankingMetrics(predictionAndLabels)
    return metrics.ndcgAt(k)


train_data = sc.textFile(train_file).map(lambda p: eval(p)).map(
    lambda p: (p[0], p[1], p[3]))
test_data = sc.textFile(test_file).map(lambda p: eval(p)).map(
    lambda p: (p[0], p[1], p[3]))
# train
model = ALS.trainImplicit(train_data,
                          RANK,
                          iterations=ITERS,
                          lambda_=LAMBDA,
                          alpha=ALPHA)
ndcg = cal_ndcg(model, test_data, NDCG_AT)
print('ndcg:', ndcg)
seed = 5
iterations = 12
regularization_parameter = 0.1
rank = 4
#errors = [0, 0, 0]
#err = 0
#tolerance = 0.02

training_RDD, test_RDD = ratings_RDD.randomSplit([8, 2], 0)

training_1 = training_RDD.map(lambda l: (l[0], l[1] // 5, l[2]))
training_2 = training_RDD.map(lambda l: (l[0], (l[1] + 3) // 5, l[2]))
training_3 = training_RDD.map(lambda l: (l[0], (l[1] - 3) // 5, l[2]))

model_1 = ALS.train(training_1, rank, seed=None, iterations=iterations, lambda_=regularization_parameter,\
                    nonnegative = True)
model_2 = ALS.train(training_2, rank, seed=None, iterations=iterations, lambda_=regularization_parameter,\
                    nonnegative = True)
model_3 = ALS.train(training_3, rank, seed=None, iterations=iterations, lambda_=regularization_parameter,\
                    nonnegative = True)

test_for_predict_RDD = test_RDD.map(
    lambda x: (x[0], x[1], x[1] // 5, (x[1] + 3) // 5, (x[1] - 3) // 5))
preds = test_for_predict_RDD.map(lambda x: (x[0], x[1], model_1.predict(x[0], x[2]), model_2.predict(x[0], x[3]),\
                                    model_3.predict(x[0], x[4])))

preds = preds.map(lambda x: ((x[0], x[1]), (x[2][2] + x[3][2] + x[4][2]) / 3))
print(preds.take(3))
rates_and_preds = test_RDD.map(lambda r: ((int(r[0]), int(r[1])), float(r[2])))
#rates_and_preds = rates_and_preds.join(preds)
Example #45
# check if spark context is defined
print(sc.version)


# importing the MF libraries
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

# reading the movielens data
df_rdd = sc.textFile('C:/Users/Vusal/Desktop/DDA new/ml-1m/ratings.dat')\
           .map(lambda x: x.split("::"))

ratings = df_rdd.map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))

# Splitting the data into train and test sets.
X_train, X_test= ratings.randomSplit([0.8, 0.2])

# Training the model
rank = 10
numIterations = 10
model = ALS.train(X_train, rank, numIterations)

# Evaluate the model on testdata
# dropping the ratings from the test data
testdata = X_test.map(lambda p: (p[0], p[1]))
predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
# joining the prediction with the original test dataset
ratesAndPreds = X_test.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)

# calculating error
MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
print("Mean Squared Error = " + str(MSE))
    .options(table="ratings", keyspace="movies")\
    .load()
tt = time() - t0
print "Data is loaded in %s seconds" % round(tt, 3)

rank = 8
seed = 5L
iterations = 10
regularization_parameter = 0.1

print "Training the ALS model..."

t0 = time()
model = ALS.train(
    dfRates.rdd.map(lambda r: (int(r[0]), int(r[1]), r[2])).cache(),
    rank=rank,
    seed=seed,
    iterations=iterations,
    lambda_=regularization_parameter)
tt = time() - t0

print "New model trained in %s seconds" % round(tt, 3)

predictions = model.recommendProductsForUsers(10) \
    .flatMap(lambda pair: pair[1]) \
    .map(lambda rating: (rating.user, rating.product, rating.rating))

dfToSave = predictions.toDF(["userid", "movieid", "prediction"])

t0 = time()
options = {
    "table": "recommendations",
validationForPredictRDD = validationRDD.map(lambda (UserID, MovieID, Rating): (UserID, MovieID))

seed = 5L
iterations = 5
regularizationParameter = 0.1
ranks = [4, 8, 12]
errors = [0, 0, 0]
err = 0
tolerance = 0.02

minError = float('inf')
bestRank = -1
bestIteration = -1
for rank in ranks:
    model = ALS.train(trainingRDD, rank, seed=seed, iterations=iterations,
                      lambda_=regularizationParameter)
    predictedRatingsRDD = model.predictAll(validationForPredictRDD)
    error = computeError(predictedRatingsRDD, validationRDD)
    errors[err] = error
    err += 1
    print 'For rank %s the RMSE is %s' % (rank, error)
    if error < minError:
        minError = error
        bestRank = rank

print 'The best model was trained with rank %s' % bestRank


# In[50]:

# TEST Using ALS.train (2c)
Example #48
import sys

from pyspark.mllib.recommendation import Rating
from pyspark.mllib.recommendation import ALS

user_id = int(sys.argv[1])

csv = sc.textFile("rank_data.csv")
# print(csv.collect())

data = csv.map(lambda line: line.split(","))

header = data.first()
rank_data = data.filter(lambda x: x != header)
# rank_data.take(5)

model = ALS.train(rank_data, 10, 10, 0.01)
# print(model)

recommend = model.recommendProducts(user_id, 3)
# for p in recommend:
#     print(f"[user_id: {p[0]}, food_id: {p[1]}, rank: {p[2]}]")
print(recommend)
result = []
for rmd in recommend:
    result.append({
        'user_id': rmd[0],
        'food_id': rmd[1],
        'rank': round(rmd[2], 2)
    })

with open('recommend.json', 'w') as f:
                          db='baidutraffice',
                          charset='utf8')
cursor = connect.cursor()
table = 'hospitalUser'
DROP = "drop table if exists %s" % table
create = "create table %s (userId int,hospitalId int,recommend float)" % table
insert = "insert into " + table + " values(%s,%s,%s)"
spark = SparkSession.builder.master("local").appName(
    "hospitalALS").getOrCreate()

df = spark.read.text('file:///export/servers/pycharm_project/hospital.txt')
attrs_rdd1 = df.rdd.map(lambda x: x[0].split(',')).map(
    lambda x: Row(userId=1, hospitalId=x[0], recommend1=x[8]))
attrs_rdd2 = df.rdd.map(lambda x: x[0].split(',')).map(
    lambda x: Row(userId=2, hospitalId=x[0], recommend2=x[9]))
rdd1 = attrs_rdd1.map(lambda x: (x[2], x[0], x[1]))
rdd2 = attrs_rdd2.map(lambda x: (x[2], x[0], x[1]))
model1 = ALS.train(rdd1, 10, 10, 0.01)
model2 = ALS.train(rdd2, 10, 10, 0.01)
cursor.execute(DROP)
cursor.execute(create)
attrs1 = model1.recommendProducts(1, 500)
attrs2 = model2.recommendProducts(2, 500)
for a1 in attrs1:
    cursor.execute(insert, (a1[0], a1[1], a1[2]))
connect.commit()

for a2 in attrs2:
    cursor.execute(insert, (a2[0], a2[1], a2[2]))
connect.commit()
Example #50
iterations = 10
regularization_parameter = 0.1
rank_list = []
rmse_list = []
ranks = [16, 20, 24, 28, 32, 36]
errors = [0] * len(ranks)
err = 0
tolerance = 0.02

min_error = float('inf')
best_rank = -1
best_iteration = -1
for rank in ranks:
    model = ALS.train(train,
                      rank,
                      seed=seed,
                      iterations=iterations,
                      lambda_=regularization_parameter)
    predictions = model.predictAll(test1).map(lambda r: ((r[0], r[1]), r[2]))
    rates_and_preds = test.map(
        lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
    error = math.sqrt(
        rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
    errors[err] = error
    err += 1
    rank_list.append(rank)
    rmse_list.append(error)
    print('For rank %s the RMSE is %s' % (rank, error))
    if error < min_error:
        min_error = error
        best_rank = rank
Example #51
#http://spark.apache.org/docs/latest/mllib-collaborative-filtering.html
# This is one model but we need more combination

# Best results are not commented
ranks = [5, 10, 15, 20]
reguls = [0.1, 1, 10]
iters = [5, 10, 20]

finalModel = None
finalRank = 0
finalRegul = float(0)
finalIter = -1
finalDist = float(100)

for cRank, cRegul, cIter in itertools.product(ranks, reguls, iters):

    model = ALS.train(rddTraining, cRank, cIter, float(cRegul))
    dist = howFarAreWe(model, rddValidating, nbValidating)
    print(str(dist))
    if dist < finalDist:
        finalModel = model
        finalRank = cRank
        finalRegul = cRegul
        finalIter = cIter
        finalDist = dist

print("Rank " + str(finalRank))  # best is 20
print("Regul " + str(finalRegul))  # best is 1
print("Iter " + str(finalIter))  # best is 20
print("Dist " + str(finalDist))  # best is 2.45935601578 (It is bad!!!)
Example #52
header = tvViewingData.first()
lines = tvViewingData.filter(lambda row: row != header).map(
    lambda x: x.split(','))
showUser = lines.map(lambda p: (p[0], int(p[1]), int(p[2])))
showUserCount = showUser.map(lambda p: p[1]).countByValue()

showUserRDD = lines.map(lambda p: Row(show=int(p[1]), user=int(p[2])))
showCount = showUserRDD.map(lambda p: p[0]).countByValue()
userCount = showUserRDD.map(lambda p: p[1]).countByValue()

showUser = spark.createDataFrame(showUserRDD)
# df = spark.createDataFrame([(0, 0, 4.0), (0, 1, 2.0), (1, 1, 3.0), (1, 2, 4.0), (2, 1, 1.0), (2, 2, 5.0)],["user", "item", "rating"])

(training, test) = showUser.randomSplit([0.8, 0.2])

# Build the recommendation model using ALS on the training data
als = ALS(maxIter=5,
          regParam=0.01,
          implicitPrefs=True,
          userCol="user",
          itemCol="show",
          ratingCol="")
model = als.fit(training)
predictions = sorted(model.transform(test).collect(), key=lambda r: r[0])

# Save and load model
# Note: the model above is the DataFrame-based ALSModel, which is saved/loaded by path
# (MatrixFactorizationModel is the RDD-based API and does not match this model)
from pyspark.ml.recommendation import ALSModel
model.save("target/tmp/myCollaborativeFilter")
sameModel = ALSModel.load("target/tmp/myCollaborativeFilter")
# $example off$
Example #53
def main():
    """ Train and evaluate an ALS recommender.
    """
    # Set up environment
    sc = SparkContext("local[*]", "RecSys")

    # Load and parse the data
    #data = sc.textFile("./data/ratings.dat") 
    ratingsRDD = sc.textFile("file:///Users/xicheng/Dropbox/Crackcode/BitTiger/0603movie/ml-10M100K/ratings.dat")
    
    moviesRDD = sc.textFile("file:///Users/xicheng/Dropbox/Crackcode/BitTiger/0603movie/ml-10M100K/movies.dat")\
                   .map(parse_movie)

    trainingRDD, validationRDD, testRDD = ratingsRDD.randomSplit([6, 2, 2], seed=0L)
    print 'Training: %s, validation: %s, test: %s\n' % (trainingRDD.count(),
                                                        validationRDD.count(),
                                                        testRDD.count())

    #ratings = data.map(lambda l: l.split()).map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2]))).cache()

    #data = sc.textFile("./ratings.dat")
    ratingsTrain = trainingRDD.map(parse_rating).cache()

    #movies = sc.textFile("./data/movies.dat")\
    #           .map(parse_movie)



    # Evaluate the model on training data
    predictdata = validationRDD.map(lambda p: (p[0], p[1]))

    #colNames = ["userID", "movieID"]
    #df = data.toDF(colNames)
    


    # Build the recommendation model using Alternating Least Squares
    rank = 10
    iterations = 10 #20

    theLambda = 0.01   #  use cross-validation to adjust

    bestMSE = 100
    bestRank = 100

    for i in range(10,20):
        rank = i
        #model = ALS.train(ratings, rank, iterations) 
        model = ALS.train(ratingsTrain, rank, iterations, theLambda) 

        predictions = model.predictAll(predictdata)\
                       .map(lambda r: ((r[0], r[1]), r[2]))

        rates_and_preds = ratingsTrain.map(lambda r: ((r[0], r[1]), r[2]))\
                             .join(predictions)

        MSE = rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean()

        print("Mean Squared Error = " + str(MSE))
        print(rank ,"mse = {:.3f}".format(MSE))
        if MSE < bestMSE :
            bestMSE = MSE
            bestRank = i

    print (bestMSE, bestRank)
Example #54
########### prepare data ############
data = sc.textFile("addrs_intid.csv")
# data = sc.textFile("test.csv")
data_amount = data.map(lambda l: l.split(","))\
    .map(lambda l: ((int(l[0]),int(l[1])), float(l[2])))\
    .reduceByKey(lambda x,y :x + y)

# data_amount: [((payer_addr, payee_addr), amount)]

##### ALS to extract features #####
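# trainImplicit treats the aggregated transfer amounts as implicit-feedback confidence
# weights; the learned rank-4 latent factors are reused below as per-address features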
ratings_amount = data_amount.map(
    lambda l: Rating(int(l[0][0]), int(l[0][1]), float(l[1])))
model_amount = ALS.trainImplicit(ratings_amount,
                                 rank=4,
                                 iterations=5,
                                 lambda_=0.01,
                                 alpha=1.0,
                                 seed=5)
feature_payer = model_amount.userFeatures().persist()
# feature_payer: [(payer_addr, array([d1, d2, d3, d4]))]
feature_payee = model_amount.productFeatures().persist()
# feature_payee: [(payee_addr, array([d1, d2, d3, d4]))]

n_cluster = 50000

##### KMEANS to cluster #####
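# Cluster the payer addresses by their latent-factor vectors; predict() on an RDD
# returns one cluster id per feature vector, in the same order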
clusters = KMeans.train(feature_payer.values(),
                        n_cluster,
                        maxIterations=10,
                        initializationMode="random")
labels = clusters.predict(feature_payer.values()).persist()
Example #55
train_rate = train.map(
    lambda row: Rating(uid_to_index[row[0]], asin_to_index[row[1]], row[2]))
test_rate = test.map(
    lambda row: Rating(uid_to_index[row[0]], asin_to_index[row[1]], row[2]))

#Rating rdd:
rating_rdd = rdd.map(
    lambda row: Rating(uid_to_index[row[0]], asin_to_index[row[1]], row[2]))

# Search for the best rank on the train/test split
ranks = [16]
numIterations = 2
best_model, min_mse = None, None
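# Train one model per candidate rank and keep the one with the lowest test-set MSE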
for rank in ranks:
    print("Rank: ", rank)
    model = ALS.train(train_rate, rank, numIterations)
    testdat = test_rate.map(lambda x: (x[0], x[1]))
    predictions = model.predictAll(testdat).map(lambda r: ((r[0], r[1]), r[2]))
    ratesAndPreds = test_rate.map(lambda r: ((r[0], r[1]), r[2])).join(
        predictions)
    MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
    if min_mse is None or MSE < min_mse:
        min_mse = MSE
        best_model = model

#Best model on the entire dataset
best_model_total, min_mse = None, None
for rank in ranks:
    print("Rank: ", rank)
    model = ALS.train(rating_rdd, rank, numIterations)
    testdat = rating_rdd.map(lambda x: (x[0], x[1]))
# Convert the inventory data into an RDD of (index, (appid, value)) pairs
train_rdd_temp = inventory_rdd.map(F_Tuple).flatMapValues(lambda x: x)
print(train_rdd_temp.take(10))


def F_Flat(x):
    # flatten (index, (appid, value)) into an (index, appid, value) triple for ALS
    (index, (appid, value)) = x
    return (index, appid, value)


train_rdd = train_rdd_temp.map(F_Flat)
print(train_rdd.take(10))

# Build an ALS model and train it on the data
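# (with only ratings and a rank supplied, ALS.train falls back to its defaults:
#  iterations=5, lambda_=0.01)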
model = ALS.train(train_rdd, 5)
recom_dict = {}
for index in list(user_id_full_list.keys()):
    try:
        recom_list = [i.product for i in model.recommendProducts(index, 10)]
        user_id = user_id_full_list.get(index)
        recom_dict.update({user_id: recom_list})
    except Exception:
        # recommendProducts raises an error for indices the model has not seen
        pass

# Collect the recommendations into a DataFrame for writing to MySQL
df_recom = pd.DataFrame.from_dict(recom_dict, 'index')
df_recom.index.name = 'stem_user_id'
df_recom.reset_index(inplace=True)
engine = create_engine(
    'mysql+pymysql://root:[email protected]/game_re?charset=utf8mb4')
Example #57
def main():
    conf = SparkConf().setAppName("YeJoo_Park_task2_ModelBasedCF")\
     .setMaster("local")

    sc = SparkContext.getOrCreate(conf)
    sc.setLogLevel("ERROR")

    ratingsFilePath = sys.argv[1]
    testFilePath = sys.argv[2]

    data = sc.textFile(testFilePath)
    dataHeader = data.first()

    testingSet = set(data\
     .filter(lambda row: row != dataHeader)\
     .map(lambda r: r.split(","))\
     .map(lambda r: (int(r[USER_INDEX]), int(r[MOVIE_INDEX])))\
     .collect())

    # Load and parse the data
    data = sc.textFile(ratingsFilePath)
    dataHeader = data.first()

    trainRatings = data\
     .filter(lambda row: row != dataHeader)\
     .map(lambda r: r.split(","))\
     .map(lambda r: Rating(int(r[USER_INDEX]), int(r[MOVIE_INDEX]), float(r[RATING_INDEX])))

    print "ratings.count() before filter=" + str(trainRatings.count())

    testRatings = trainRatings.filter(
        lambda rating: (rating.user, rating.product) in testingSet)
    trainRatings = trainRatings.filter(
        lambda rating: (rating.user, rating.product) not in testingSet)

    print "testingSetRatings.count()=" + str(testRatings.count())
    print "ratings.count() after filter=" + str(trainRatings.count())

    rank = 10
    numIterations = 12
    lamb = 0.1
    model = ALS.train(trainRatings, rank, numIterations, lamb)

    print "Training complete"

    userProducts = testRatings.map(lambda rating:
                                   (rating.user, rating.product))
    predictions = model.predictAll(userProducts).map(lambda r:
                                                     ((r[0], r[1]), r[2]))
    ratesAndPreds = testRatings.map(lambda r: ((r[0], r[1]), r[2])).join(
        predictions)
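    # Bucket each absolute prediction error into [0,1), [1,2), [2,3), [3,4) and >=4
    # for the error histogram printed at the end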
    absDiffBuckets = ratesAndPreds.map(lambda r: int(abs(r[1][0] - r[1][1]))) \
     .map(lambda d: min(d, 4)).cache()
    RMSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean() ** 0.5

    # Write predictions to file
    outputFileName = "YeJoo_Park_ModelBasedCF.txt"
    printWriter = open(outputFileName, "a")

    outputPreds = ratesAndPreds.map(lambda r:
                                    (r[0][0], r[0][1], r[1][1])).collect()
    outputPreds.sort()

    for pred in outputPreds:
        printWriter.write(
            str(pred[0]) + ", " + str(pred[1]) + ", " + str(pred[2]))
        printWriter.write("\n")

    printWriter.close()

    print ">=0 and <1: " + str(absDiffBuckets.filter(lambda d: d == 0).count())
    print ">=1 and <2: " + str(absDiffBuckets.filter(lambda d: d == 1).count())
    print ">=2 and <3: " + str(absDiffBuckets.filter(lambda d: d == 2).count())
    print ">=3 and <4: " + str(absDiffBuckets.filter(lambda d: d == 3).count())
    print ">=4: " + str(absDiffBuckets.filter(lambda d: d == 4).count())

    print "RMSE=" + str(RMSE)



# model
als = ALS(userCol="userid", itemCol="item", ratingCol="rating",
          coldStartStrategy="drop", nonnegative=False)
# evaluator
rmseevaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")

# parameter grid
paramGrid = ParamGridBuilder() \
    .addGrid(als.rank, [1, 5, 10, 50, 70]) \
    .addGrid(als.maxIter, [15]) \
    .addGrid(als.regParam, [0.05, 0.1, 0.5, 5]) \
    .build()

# train validation split
tvs = TrainValidationSplit(estimator=als,
                           estimatorParamMaps=paramGrid,
                           evaluator=rmseevaluator,
                           trainRatio=0.8)
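# TrainValidationSplit evaluates all 5 x 4 = 20 parameter combinations on a single
# 80/20 split and refits the best combination on all of data_train (assumed to be a
# DataFrame with userid, item and rating columns prepared earlier)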
# fit the model

tvsmodel = tvs.fit(data_train)
    return movie_names


conf = SparkConf().setMaster("local[*]").setAppName("MovieSimilarities")
sc = SparkContext(conf=conf)

print("Loading Movie Names")
movie_names = load_movie_names()

data = sc.textFile("file:///SparkCourse/ml-100k/u.data")

ratings = data.map(lambda line: line.split()).map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2]))).cache()

rank = 10
num_iters = 15
model = ALS.train(ratings, rank, num_iters)

# user_id is assumed to be defined earlier in the original script (e.g. parsed from sys.argv)
user_ratings = ratings.filter(lambda x: x[0] == user_id).collect()

print("User {} ratings:".format(user_id))
for rating in user_ratings:
    print("{}, score: {}".format(movie_names[int(rating[1])], rating[2]))

recs = model.recommendProducts(user_id, 10)
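# recommendProducts returns the top-10 Rating(user, product, score) objects,
# ordered by predicted score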

print("Top 10 recommendations:")
for rec in recs:
    print("{}, score: {}".format(movie_names[int(rec[1])], rec[2]))


Example #60
# filter out header
header = data.first()  #extract header
data = data.filter(lambda row: row != header)

# convert into a sequence of Rating objects
ratings = data.map(lambda l: l.split(',')).map(
    lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))

# split into train and test
train, test = ratings.randomSplit([0.8, 0.2])

# train the model
K = 10
epochs = 10
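# K is the ALS rank (number of latent factors); epochs is the number of ALS iterations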
model = ALS.train(train, K, epochs)

# evaluate the model

# train
x = train.map(lambda p: (p[0], p[1]))
p = model.predictAll(x).map(lambda r: ((r[0], r[1]), r[2]))
ratesAndPreds = train.map(lambda r: ((r[0], r[1]), r[2])).join(p)
# joins on first item: (user_id, movie_id)
# each row of result is: ((user_id, movie_id), (rating, prediction))
mse = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
print("train mse: %s" % mse)

# test
x = test.map(lambda p: (p[0], p[1]))
p = model.predictAll(x).map(lambda r: ((r[0], r[1]), r[2]))