def main(sc):

    seed = 5L
    iterations = 10
    regularization_parameter = 0.1
    rank = 4

    data = sc.textFile("file:///Expedia/data/train1.csv")

    ratings = data.map(lambda l: l.split(',')).map(
        lambda l: Rating(int(l[0]), int(l[1]), float(l[2]))).cache()

    new_data = sc.textFile("file:///Expedia/data/new_set.csv")

    new_ratings = new_data.map(lambda l: l.split(',')).map(
        lambda l: Rating(int(l[0]), int(l[1]), float(l[2]))).cache()
    new_ratings_for_predict_RDD = new_ratings.map(lambda x:
                                                  (x[0], x[1])).cache()

    complete_data = ratings.union(new_ratings).cache()

    new_ratings_model = ALS.trainImplicit(complete_data,
                                          rank,
                                          seed=seed,
                                          iterations=iterations,
                                          lambda_=regularization_parameter)

    # This did not work and needs more investigation:
    #predictions = new_ratings_model.predictAll(0,'83').collect()
    predictions = new_ratings_model.predictAll(
        new_ratings_for_predict_RDD).map(lambda r:
                                         ((r[0], r[1]), r[2])).collect()
    # predictions holds ((user, product), rating) tuples, so rank by the rating at index 1
    recommendations = sorted(predictions, key=lambda x: x[1], reverse=True)[:2]

    print recommendations
Example #3
def build_model(train_data):
    model = ALS.trainImplicit(train_data,
                              rank=FACTOR,
                              iterations=ITERS,
                              lambda_=LAMBDA,
                              alpha=ALPHA)
    return model
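
# Illustrative values for the module-level constants that build_model() expects;
# the original source defines FACTOR, ITERS, LAMBDA and ALPHA elsewhere, so the
# numbers below are placeholders only.
FACTOR = 10    # number of latent factors (rank)
ITERS = 10     # ALS iterations
LAMBDA = 0.01  # regularization strength
ALPHA = 40.0   # confidence scaling for implicit feedback
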
def evaluate(sc, raw_user_movies, raw_hot_movies):
    movies_name = build_movies(raw_hot_movies)
    user_id_to_int = raw_user_movies.map(lambda line: line.split(',')[0]).distinct().zipWithUniqueId().collectAsMap()
    ratings = build_ratings(raw_user_movies, user_id_to_int)
    num_iterations = 10
    for rank in [10, 50]:
        for lam in [1.0, 0.01, 0.0001]:
            model =  ALS.train(ratings, rank, num_iterations, lam)
            user_movies = ratings.map(lambda tokens: (tokens[0], tokens[1]))
            predictions = model.predictAll(user_movies).map(lambda r: ((r[0], r[1]), r[2]))
            print predictions.take(3)
            rates_and_preds = ratings.map(lambda tokens: ((tokens[0], tokens[1]), tokens[2])).join(predictions)
            print rates_and_preds.take(3)
            rmse = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
            print "(rank: %d, lambda: %f) Root Mean Squared Error = %f" % (rank, lam, rmse)
    for rank in [10, 50]:
        for lam in [1.0, 0.01, 0.0001]:
            for alpha in [1.0, 40.0]:
                model = ALS.trainImplicit(ratings, rank, num_iterations, lam, alpha=alpha)
                user_movies = ratings.map(lambda tokens: (tokens[0], tokens[1]))
                predictions = model.predictAll(user_movies).map(lambda r: ((r[0], r[1]), r[2]))
                rates_and_preds = ratings.map(lambda tokens: ((tokens[0], tokens[1]), tokens[2])).join(predictions)
                print rates_and_preds.take(3)
                rmse = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
                print "(rank: %d, lambda: %f, alpha: %f, implicit) Root Mean Squared Error = %f" % (rank, lam, alpha, rmse)
def als_training(ratings, rank=10, num_iteration=12, lambda_=0.01, alpha=0.01):
    model = ALS.trainImplicit(ratings,
                              rank,
                              num_iteration,
                              lambda_=lambda_,
                              alpha=alpha)
    return model
def train(training_data, rank, iteration, lmbda, alpha, model_id):
    """ Train model.

        Args:
            training_data (rdd): Used for training.
            rank (int): Number of factors in ALS model.
            iteration (int): Number of iterations to run.
            lmbda (float): Controls regularization.
            alpha (float): Constant for computing confidence.
            model_id (str): Model identification string.

        Returns:
            model: Trained model.

    """
    try:
        model = ALS.trainImplicit(training_data,
                                  rank,
                                  iterations=iteration,
                                  lambda_=lmbda,
                                  alpha=alpha)
        return model
    except Py4JJavaError as err:
        current_app.logger.error('Unable to train model "{}"\n{}'.format(
            model_id, str(err.java_exception)),
                                 exc_info=True)
        raise
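
# Hypothetical usage of train() above; it assumes an existing SparkContext `sc`,
# a CSV of (user_id, item_id, listen_count) triples at a placeholder path, and
# placeholder hyperparameter values.
from pyspark.mllib.recommendation import Rating

training_data = sc.textFile("data/listens.csv") \
    .map(lambda line: line.split(',')) \
    .map(lambda parts: Rating(int(parts[0]), int(parts[1]), float(parts[2]))) \
    .cache()
model = train(training_data, rank=8, iteration=10, lmbda=0.1, alpha=3.0,
              model_id="demo-model")
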
Example #7
def als(train, opts):
    vals = []
    for u in xrange(train.shape[0]):
        for i in train[u].indices:
            vals.append(Rating(int(u), int(i), float(train[u, i])))

    sc = pyspark.SparkContext("local")
    sc.setCheckpointDir("/tmp/" + str(random.random()))
    quiet_logs(sc)
    ratings = sc.parallelize(vals)

    if opts.implicit:
        model = ALS.trainImplicit(ratings,
                                  opts.D,
                                  opts.iters,
                                  opts.lbda,
                                  alpha=opts.alpha)
    else:
        model = ALS.train(ratings, opts.D, opts.iters, opts.lbda)

    U = []
    for ut in model.userFeatures().sortBy(lambda a: a[0]).collect():
        U.append(ut[1])

    rddItems = model.productFeatures()
    maxItem = rddItems.map(lambda a: int(a[0])).max()
    items = dict(rddItems.sortBy(lambda a: int(a[0])).collect())

    V = []
    for i in xrange(maxItem):
        item = items.get(i, np.zeros(opts.D))
        V.append(item)

    return (U, V)
Example #8
def train(training_data, validation_data, num_validation, ranks, lambdas,
          iterations):
    best_model = None
    alpha = 3.0  # controls baseline confidence growth

    for rank, lmbda, iteration in itertools.product(ranks, lambdas,
                                                    iterations):
        print(
            'Training model with rank = %.2f, lambda = %.2f, iterations = %d...'
            % (rank, lmbda, iteration))
        model = ALS.trainImplicit(training_data,
                                  rank,
                                  iterations=iteration,
                                  lambda_=lmbda,
                                  alpha=alpha)
        validation_rmse = compute_rmse(model, validation_data, num_validation)
        print("    RMSE (validation) = %f for the model trained with " % validation_rmse + \
              "rank = %d, lambda = %.1f, and numIter = %d." % (rank, lmbda, iteration))
        if best_model is None or validation_rmse < best_model.error:
            best_model = Model(model=model,
                               error=validation_rmse,
                               rank=rank,
                               lmbda=lmbda,
                               iteration=iteration)

    print(
        'Best model has error = %.2f, rank = %.2f, lambda = %.2f, iteration=%d'
        % (best_model.error, best_model.rank, best_model.lmbda,
           best_model.iteration))
    return best_model
def model(sc, rawUserArtistData, rawArtistData, rawArtistAlias):
    bArtistAlias = sc.broadcast(buildArtistAlias(rawArtistAlias))
    trainData = buildRatings(rawUserArtistData, bArtistAlias).cache()
    model = ALS.trainImplicit(ratings=trainData, rank=10, iterations=5, lambda_=0.01, alpha=1.0)

    trainData.unpersist()
    print(model.userFeatures().mapValues(lambda v: ", ".join( map(lambda x: str(x),v) )).first())

    userID = 2093760

    recommendations = model.recommendProducts(userID, 5)
    for val in recommendations:
        print(val)
    recommendedProductIDs = map(lambda rec: rec.product, recommendations)

    #get specific user data
    rawArtistsForUser = rawUserArtistData\
        .map(lambda x: x.split(' '))\
        .filter(lambda x: int(x[0]) == userID)

    #map artist id to int
    existingProducts = rawArtistsForUser.map(lambda x: int(x[1])).collect()

    artistByID = buildArtistByID(rawArtistData)

    existingArtists = artistByID.filter(lambda artist: artist[0] in existingProducts).collect()
    for val in existingArtists:
        print(val)

    recommendedArtists = artistByID.filter(lambda artist: artist[0] in recommendedProductIDs).collect()
    for val in recommendedArtists:
        print(val)

    unpersist(model)
Example #10
    def _recommend(self, train_ratings, users):
        from pyspark.mllib.recommendation import ALS, Rating

        # Preparing the user/item mapping as integers, since Spark's ALS implementation only works with integer values
        train_ratings['user'] = train_ratings['user'].astype('category')
        train_ratings['item'] = train_ratings['item'].astype('category')
        user_cat, item_cat = train_ratings['user'].cat, train_ratings['item'].cat
        self.user_cat = user_cat
        self.item_cat = item_cat
        self.train_ratings = train_ratings

        # Training the model
        self.ratings = self.sc.parallelize(Rating(u, i, rating) for u, i, rating in zip(user_cat.codes, item_cat.codes, train_ratings.rating))
        if self.implicit:
            model = ALS.trainImplicit(self.ratings, **self.spark_args)
        else:
            model = ALS.train(self.ratings, **self.spark_args)

        # Getting predictions from the model
        self.ratings_to_predict = self.sc.parallelize((user, item) for user in users for item in item_cat.codes.unique())
        self.predictions = model.predictAll(self.ratings_to_predict).collect()
        # Presenting the recommendations as a DataFrame
        self.predictions = [(user_cat.categories[p.user], item_cat.categories[p.product], p.rating) for p in self.predictions]
        self.predictions_df = pd.DataFrame(self.predictions, columns=['user', 'item', 'rating'])
        return self.predictions_df
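
# Standalone illustration (with made-up data) of the category <-> integer code
# round trip that _recommend() relies on, since Spark's mllib ALS only accepts
# integer user and item ids.
import pandas as pd

ratings = pd.DataFrame({'user': ['alice', 'bob', 'alice'],
                        'item': ['x', 'x', 'y'],
                        'rating': [1.0, 1.0, 1.0]})
ratings['user'] = ratings['user'].astype('category')
ratings['item'] = ratings['item'].astype('category')
user_codes = ratings['user'].cat.codes                              # integer ids for ALS
original_users = ratings['user'].cat.categories[user_codes.values]  # codes back to names
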
Example #12
 def train(self,
           rank,
           numIter,
           lmbda,
           istest=False,
           isimp=False,
           alpha=0.1):
     training, val = self.prepareTraining()
     if self.conf.has_key("model"):
         model = self.loadModel()
     else:
         if not isimp:
             print "use ALS.train"
             model = ALS.train(training, rank, \
                     iterations=numIter, lambda_=lmbda, \
                     blocks=self.conf["parallelize"], \
                     seed=0, nonnegative=False
                     )
         else:
             print "use ALS.trainImplicit"
             model = ALS.trainImplicit(training, rank, \
                     iterations=numIter, lambda_=lmbda, \
                     blocks=self.conf["parallelize"], seed=0, \
                     alpha=alpha, nonnegative=False \
                     )
         if True:
             os.system("rm -rf cachem")
             model.save(self.sc, "cachem")
             print "saved model"
     if istest:
         self.mkTest(model, val)
     self.afterTrain(model)
     return model, val
Example #13
def create_model(RDD_ratings_data):
    '''Create the final ALS model on the entire set of ratings (after training/testing work)'''
    model = ALS.trainImplicit(RDD_ratings_data,
                              rank=50,
                              iterations=20,
                              lambda_=10.0,
                              alpha=20.0)
    return model
def mf_als_rec(is_test):
    print('*** Test MF-ALS Recommender ***')

    conf = SparkConf().setAppName("MF-ALS Rec").setMaster("local")
    sc = SparkContext(conf=conf)

    b = Builder()
    ev = Evaluator(is_test=is_test)
    ev.split()

    UCM = b.get_UCM(ev.get_URM_train())

    target_playlists = ev.get_target_playlists()
    urm_train_indices = ev.get_URM_train().nonzero()
    ratings_list = []

    print('Creating RDD of tuples')
    for index in tqdm(range(0, urm_train_indices[0].size)):
        ratings_list.append(
            Rating(urm_train_indices[0][index], urm_train_indices[1][index],
                   1))

    ratings = sc.parallelize(ratings_list)

    model = ALS.trainImplicit(ratings, rank=10, iterations=5, alpha=0.01)

    dataframe_list = []

    print('Predicting...', flush=True)

    all_predictions = model.recommendProductsForUsers(10).filter(lambda r: r[0] in target_playlists)\
                                                         .collect()

    for u in tqdm(all_predictions):
        prediction = []
        for i in u[1]:
            prediction.append(i.product)
        dataframe_list.append([u[0], prediction])

    def get_id(e):
        return e[0]

    dataframe_list.sort(key=get_id)

    train_df = pd.DataFrame(dataframe_list,
                            columns=['playlist_id', 'track_ids'])

    if is_test:
        map5 = ev.map5(train_df)
        print('Hybrid MAP@10:', map5)
        return map5
    else:
        print('Prediction saved!')
        train_df.to_csv(os.path.dirname(os.path.realpath(__file__))[:-19] +
                        "/all/sub.csv",
                        sep=',',
                        index=False)
        return 0
Example #15
def cross_validation(training, validation, test, candidates, id_title_map,
                     ranks, lambdas, numIters, alphas):
    # train models and evaluate them on the validation set

    result_dict = {}
    result_template = "rank:%d  iters:%d  lambda: %f"
    bestModel = None
    bestValidationRmse = float("inf")
    bestRank = 0
    bestLambda = -1.0
    bestNumIter = -1
    numTraining = training.count()
    numValidation = validation.count()
    numTest = test.count()
    if not IMPLICIT:
        alphas = [1.0]
    for rank, lmbda, numIter, alpha in itertools.product(
            ranks, lambdas, numIters, alphas):
        if IMPLICIT:
            model = ALS.trainImplicit(training,
                                      rank,
                                      iterations=numIter,
                                      lambda_=lmbda,
                                      alpha=alpha,
                                      nonnegative=True)
        else:
            model = ALS.train(training,
                              rank,
                              iterations=numIter,
                              lambda_=lmbda,
                              nonnegative=True)
        validationRmse = 0.0  #computeRmse(model, validation, numValidation)
        print "RMSE (validation) = %f for the model trained with " % validationRmse + \
              "rank = %d, lambda = %.4f, and numIter = %d and alpha=%f." % (rank, lmbda, numIter, alpha)

        qe_results = qualitative_evaluation(model, candidates, id_title_map)

        if (validationRmse < bestValidationRmse):
            bestModel = model
            bestValidationRmse = validationRmse
            bestRank = rank
            bestLambda = lmbda
            bestNumIter = numIter
        result_dict[result_template % (rank, numIter, lmbda)] = validationRmse
    testRmse = 0.0  #computeRmse(bestModel, test, numTest)
    # evaluate the best model on the test set
    print "The best model was trained with rank = %d and lambda = %.1f, " % (bestRank, bestLambda) \
      + "and numIter = %d, and its RMSE on the test set is %f." % (bestNumIter, testRmse)
    result_dict['BEST Model on Test:' + result_template %
                (bestRank, bestNumIter, bestLambda)] = testRmse
    # compare the best model with a naive baseline that always returns the mean rating
    meanRating = training.union(validation).map(lambda x: x[2]).mean()
    baselineRmse = sqrt(
        test.map(lambda x: (meanRating - x[2])**2).reduce(add) / numTest)
    improvement = (baselineRmse - testRmse) / baselineRmse * 100
    print "The best model improves the baseline by %.2f" % (improvement) + "%."
    result_dict['BEST gain over baseline'] = improvement
    return bestModel, result_dict
Example #16
def train(training_rdd):
    model = ALS.trainImplicit( \
        training_rdd.map(lambda rr: (rr[0], rr[1], 1)),
        rank=16,
        iterations=10,
        lambda_=0.1,
        alpha=80.0
    )
    return model.productFeatures()
Example #17
 def train_model(self):
     """Train the implicit ALS model with the current dataset for A certain parameter:
     stats.stackexchange.com/questions/133565/how-to-set-preferences-for-als-implicit-feedback-in-collaborative-filtering
     """
     print(self.rank, self.seed, self.iterations, self.reg_parameter,
           self.alpha)
     self.model = ALS.trainImplicit(self.trainData, rank=self.rank, \
                 iterations=self.iterations, lambda_=self.reg_parameter, \
                 blocks=-1, alpha=self.alpha, nonnegative=False, seed=self.seed)
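
# For reference, implicit-feedback ALS treats each observed count r_ui as a
# confidence weight c_ui = 1 + alpha * r_ui (Hu, Koren & Volinsky 2008), which
# is why alpha is tuned separately from the regularization parameter above.
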
Example #18
 def trainALS(self):
     try:
         self.model = MatrixFactorizationModel.load(self.sc, "als_final")
     except (RuntimeError, TypeError, NameError) as e:
         rank = 4
         numIterations = 20
         self.model = ALS.trainImplicit(self.implicit_ratings, rank,
                                        numIterations)
         self.model.save(self.sc, "als_final")
def main(sc):

    #load files
    train_1 = sc.textFile("file:///Expedia/data/train_1.csv")
    training_RDD = train_1.map(lambda l: l.split()).map(
        lambda l: Rating(int(l[0]), int(l[1]), float(l[2]))).cache()

    #load validation fold file
    train_2 = sc.textFile("file:///Expedia/data/train_1.csv")
    validation_RDD = train_2.map(lambda l: l.split()).map(
        lambda l: Rating(int(l[0]), int(l[1]), float(l[2]))).cache()

    validation_for_predict_RDD = validation_RDD.map(lambda x:
                                                    (x[0], x[1])).cache()
    train_RDD = training_RDD.map(lambda x: (x[0], x[1])).cache()

    #Train models on the training set and choose the one with the best RMSE
    # on the cross-validation set
    seed = 5L
    iterations = 10
    regularization_parameter = [0.1, 0.5, 1.0]
    ranks = [4, 8]
    errors = []

    min_error = float('inf')
    best_rank = -1

    for rank, regularization_parameter in itertools.product(
            ranks, regularization_parameter):
        #train an implicit model on the training set
        model = ALS.trainImplicit(training_RDD,
                                  rank,
                                  seed=seed,
                                  iterations=iterations,
                                  lambda_=regularization_parameter)
        #Predict on the validation set
        predictions = model.predictAll(validation_for_predict_RDD).map(
            lambda r: ((r[0], r[1]), r[2]))
        rates_and_preds = validation_RDD.map(
            lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)

        #compute the root mean squared error of the validation-set predictions
        rmse = math.sqrt(
            rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
        errors.append(rmse)

        print 'For rank %s the RMSE is %s' % (rank, rmse)
        if rmse < min_error:
            min_error = rmse
            best_rank = rank


    print "The best model was trained with rank = %d and lambda = %.1f, " % (
    best_rank , regularization_parameter) \
    + "and numIter = %d, and its RMSE on the validation set is %f." % (iterations,
    min_error)
 def __train_model(self):
     """Train the ALS model with the current dataset
     """
     logger.info("Training the ALS model...")
     self.model = ALS.trainImplicit(self.taste_RDD,
                                    self.rank,
                                    seed=self.seed,
                                    iterations=self.iterations,
                                    lambda_=self.regularization_parameter)
     logger.info("ALS model built!")
Example #21
    def train(self, rank=3, iterations=20, lambda_=0.01, alpha=None, blocks=-1):
        """
        train a mf model against the given parameters
        """
        if alpha:
            model = ALS.trainImplicit(self.train_data, rank, iterations, lambda_, blocks, alpha)
        else:
            model = ALS.train(self.train_data, rank, iterations, lambda_)

        return model
Example #22
    def training_models(self, rank=5, seed=32, iterations=20, alpha=0.01, reg=0.01):
        '''ALS training parameters:
            rank - Number of latent factors.
            iterations - Number of iterations of ALS. (default: 5)
            lambda_ - Regularization parameter. (default: 0.01)
            alpha - constant used in computing confidence. (default: 0.01)
            seed - Random seed for initial matrix factorization model. (default: None)
        '''

        print (self.training.take(5), self.test.take(5))

        weights = [.8, .2]
        trainData_RDD, valData_RDD = self.training.randomSplit(weights, seed)  # split training to training and validation sets

        trainData_RDD.cache(), valData_RDD.cache()

        print (trainData_RDD.count(), valData_RDD.count())


        #X_val_RDD = valData_RDD.map(lambda x: (x.user, x.product)).filter(lambda x: x[0] in set({92396, 198196, 111182, 2350, 46158})).cache()
        X_val_RDD = valData_RDD.map(lambda x: (x.user, x.product)).cache()
   
        sum_ratings_val = valData_RDD.map(lambda x: x.rating).sum()

        product_nums_for_users = X_val_RDD.map(lambda x: (x[0], 1)).reduceByKey(add).map(lambda x: x[1]).collect()
        #print (X_val_RDD.collect())
        print ('num of users', X_val_RDD.map(lambda x: (x[0], 1)).reduceByKey(add).count())
        #print (product_num_for_users)
        rank_lists = Rank_list(product_nums_for_users)

        print (rank_lists)
        #print (rank_lists[4])

        #return

        model = ALS.trainImplicit(trainData_RDD, rank, iterations=iterations,\
                            lambda_=reg, blocks=-1, alpha=alpha, nonnegative=False, seed=seed)

        # predicted ratings for the validation set
        predictions_RDD = model.predictAll(X_val_RDD).map(lambda x: ((x[0], x[1]), x[2]))
        ratings_and_preds_RDD = valData_RDD.map(lambda x: ((x[0], x[1]), x[2])).join(predictions_RDD)

        print()
        print('model training converged')
        print()
        #return

        MPR = self.percentage_ranking(ratings_and_preds_RDD, rank_lists, sum_ratings_val)


        print ('Rank %s, reg %s, alpha %s, AvgRank = %s' % (rank, reg, alpha, MPR))
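
# For reference, the mean percentile rank reported above is commonly defined as
#     MPR = sum_{u,i} r_ui * rank_ui / sum_{u,i} r_ui
# where rank_ui is the percentile position of item i in user u's ranked
# recommendation list; lower is better and 0.5 corresponds to random guessing.
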
Example #24
 def label(self, rank=50, numIterations=10, alpha=0.01):
     """
     INPUT:
     - rank: number of topics
     - numIterations: number of iterations for matrix factorization
     - alpha: confidence scaling constant used by implicit ALS
     OUTPUT:
     - data for training naive bayes with label, feature tuples
     """
     als_model = ALS.trainImplicit(self.tfidf_rating, rank, numIterations, alpha=alpha)
     index_label = als_model.userFeatures().map(lambda x: (x[0], np.argmax(x[1])))
     index_feature = self.tfidf.zipWithIndex().map(lambda x: (x[1], x[0]))
     index_label_feature = index_label.join(index_feature)
     label_feature = index_label_feature.map(lambda x: x[1])
     self.train_data = label_feature.map(lambda x: LabeledPoint(x[0], x[1]))
    def train_als(self):
        self.ratings = self.df.select("user_id", "repo_id")\
            .map(lambda x: Rating(x[0], x[1], 1.0))
        
        rank = 10
        numIterations = 20
        model = ALS.trainImplicit(self.ratings, rank, numIterations, alpha=0.01)

        testdata = self.ratings.map(lambda p: (p[0], p[1]))
        predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
        ratesAndPreds = self.ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
        MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
        print("Mean Squared Error = " + str(MSE))

        model.save(self.sc, "ALS_model")
def prepare_model(sc, filename, user_id, ratings_train):
    if filename is None and os.path.exists(config.MSD_MODEL):
        # load the trained model
        print("\n\nLoading existing recommendation model from %s\n\n"
              % config.MSD_MODEL)
        model = MatrixFactorizationModel.load(sc, config.MSD_MODEL)
    else:
        # train a new model
        print("\n\nRetraining recommendation model for User %s\n\n" % user_id)
        rank, lambda_val = (
            evaluate.load_best_params(config.MSD_BEST_PARAMS_FILE))
        rank, lambda_val = int(rank), float(lambda_val)
        model = ALS.trainImplicit(ratings_train, rank, evaluate.ITERATIONS,
                                  lambda_val, nonnegative=True)

    return model
Example #27
 def fit_and_save_model(self,
                        train_num=0.8,
                        test_num=0.2,
                        seed_num=2711,
                        rank=5,
                        iterations=5):
     user_item_spark_df = self.spark.createDataFrame(self.user_item_df)
     user_item_rdd = user_item_spark_df.rdd
     train, test = user_item_rdd.randomSplit([train_num, test_num],
                                             seed=seed_num)
     testdata = test.map(lambda p: (p[0], p[1]))
     model = ALS.trainImplicit(train,
                               rank=rank,
                               iterations=iterations,
                               nonnegative=True)
     model.save(self.sc, 'data/firstmodel')
Example #28
def main(cores, prefs):
    """
    args:
    cores (int) : number of cores for the spark job
    prefs (list[str]) : list of subreddit names - capitalization is significant
    """

    scfg = SparkConf()
    scfg.set("spark.cores.max", cores)
    sc = SparkContext(master="spark://final-gateway:7077",
                      appName="reddit-cf",
                      conf=scfg)

    try:
        # prep data
        raw_counts = sc.textFile(
            "hdfs://final-gateway/w251_cf-user-site-total")
        parsed_counts = raw_counts.map(lambda st: eval(st))
        all_ratings = parsed_counts.map(tup_to_rating)
        # assign user-identified preferred subreddits
        raw_prefs = [(999, x, 100) for x in prefs]
        my_prefs = sc.parallelize(raw_prefs).map(tup_to_rating)

        # train model
        model_input = all_ratings.union(my_prefs)
        model = ALS.trainImplicit(model_input, 10, 10, alpha=.01)

        # candidate prefs for prediction
        my_prefs_ids = set([javahash(x) for x in prefs])
        all_subreddit_ids = parsed_counts.map(
            lambda (a, b, c): (javahash(b), b)).distinct().cache()
        candidates = all_subreddit_ids.map(lambda (a, b): a).filter(
            lambda r: r not in my_prefs_ids)

        predictions = model.predictAll(
            candidates.map(lambda x: (999, x))).cache()

        final = predictions.map(lambda (a, b, c): (b, c)).join(
            all_subreddit_ids).map(lambda (b, (c, d)): (c, d)).sortByKey(False)

        output = list(final.take(30))
        sc.stop()
        return output
    except Exception, e:
        print("App failed. Stopping gracefully")
        sc.stop()
        raise Exception(e)
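
# Hypothetical invocation; the core count and subreddit names are placeholders,
# the Spark master URL inside main() is specific to the original cluster, and
# tup_to_rating / javahash are helpers defined elsewhere in the original module.
if __name__ == "__main__":
    top_subreddits = main(64, ["MachineLearning", "AskScience", "nfl"])
    for entry in top_subreddits:
        print(entry)
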
def main(cores, prefs):

	"""
	ALS Algorithm to Recommend Subreddits to User based on User-defined preferences
	
	args:
	cores (int) : number of cores for spark job
	prefs (list[str]) : list of subreddit names - capitalization is significant
	"""

	scfg=SparkConf()
	scfg.set("spark.cores.max",cores)
	sc=SparkContext(master="spark://final-gateway:7077", appName="reddit-cf", conf=scfg)

	try:
		# prep data
		raw_counts = sc.textFile("hdfs://final-gateway/w251_cf-user-site-total")
		parsed_counts = raw_counts.map(lambda st: eval(st))
		all_ratings = parsed_counts.map( tup_to_rating )
		# assign user-identified preferred subreddits
		raw_prefs = [ (999, x, 100) for x in prefs ]
		my_prefs = sc.parallelize(raw_prefs).map(tup_to_rating)

		# train model
		model_input = all_ratings.union(my_prefs)
		model = ALS.trainImplicit(model_input, 10, 10, alpha=.01, seed=5)

		# candidate prefs for prediction
		my_prefs_ids = set([javahash(x) for x in prefs])
		all_subreddit_ids = parsed_counts.map( lambda (a,b,c): (javahash(b),b) ).distinct().cache()
		candidates = all_subreddit_ids.map(lambda (a,b): a ).filter( lambda r: r not in my_prefs_ids)

		predictions = model.predictAll(candidates.map( lambda x: (999, x))).cache()

		final = predictions.map(lambda (a,b,c): (b,c)).join(all_subreddit_ids).map(lambda (b,(c,d)): (c,d) ).sortByKey(False)

		output = list( final.take(30) )
		sc.stop()
		return output
	except Exception, e:
		print("App failed. Stopping gracefully")
		sc.stop()
		raise Exception(e)
Example #31
    def fit_als(self):
        sc = self.__set_context_als()

        data = sc.textFile('data_set/train_set_als.csv')
        header = data.first()
        data = data.filter(lambda row: row != header)

        trans = open('data_set/train_set_als.csv', 'r')
        lines = trans.readlines()

        users_dict = dict()  # {CUST_ID : Num}
        items_dict = dict()
        i = 0
        for line in lines[1:]:
            parts = line.split(',')
            user = int(parts[0])
            if user not in users_dict:
                users_dict[user] = i
                i += 1
        j = 0
        for line in lines[1:]:
            parts = line.split(',')
            item = int(parts[1])
            if item not in items_dict:
                items_dict[item] = j
                j += 1

        self.users_als = {v: k
                          for k, v in users_dict.items()}  #{Num : CUST_ID}
        self.items_als = {v: k for k, v in items_dict.items()}

        self.users_dict = users_dict
        self.items_dict = items_dict

        ratings = data.map(lambda l: l.split(',')).map(lambda l: Rating(
            users_dict[int(l[0])], items_dict[int(l[1])], float(l[2])))
        self.model_ALS = ALS.trainImplicit(ratings=ratings,
                                           rank=40,
                                           iterations=30,
                                           lambda_=0.001,
                                           blocks=-1,
                                           alpha=10.0)
Example #32
def execute_recommendation():

    sc = SparkContext(appName="PythonCollaborativeFilteringExample")
    #sc = SparkContext( 'local', 'pyspark')
    
    #Load train data and train
    train_file_name = get_training_file_name()
    train_data = sc.textFile(train_file_name)
    ratings = train_data.map(lambda l: l.split(','))\
        .map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))
    
    rank = 10
    number_iteration = 10
    model = ALS.trainImplicit(ratings, rank, number_iteration)
    
    #load test data and do prediction 
    test_file_name = get_testing_file_name()
    test_data = sc.textFile(test_file_name)
    test_ranking=test_data.map(lambda l: l.split(','))\
        .map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))
    
    
    testdata = test_ranking.map(lambda p: (p[0], p[1]))
    count_rdd=testdata.count()
    
    
    
    if count_rdd > 0:
        predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
        
        #predictions_lines = predictions.map(toCSVLine)
        result_file = get_rdd_output()
        predictions.saveAsTextFile(result_file)
    
        count_rdd = predictions.count()
        print("after prediction: count_rdd=",count_rdd)
    else:
        print("Error: empty testdata")
        

    sc.stop()
Example #33
def train_als(ratings, explicit, rank, rp, iteration, non_negative):

    # Set a checkpoint directory to avoid StackOverflowError when running many ALS iterations.

    sc.setCheckpointDir("checkpoint/")
    ALS.checkpointInterval = 2

    if explicit == True:
        model = ALS.train(ratings,
                          rank=rank,
                          iterations=iteration,
                          lambda_=rp,
                          nonnegative=non_negative)
    else:
        model = ALS.trainImplicit(ratings,
                                  rank=rank,
                                  iterations=iteration,
                                  lambda_=rp,
                                  nonnegative=non_negative)

    return model
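
# Hypothetical call, assuming `ratings` is an RDD of Rating objects and that the
# SparkContext `sc` referenced inside train_als() already exists.
implicit_model = train_als(ratings, explicit=False, rank=10, rp=0.01,
                           iteration=10, non_negative=True)
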
Example #34
def train(training_data, validation_data, num_validation, ranks, lambdas,
          iterations):
    best_model = None
    best_model_metadata = {}
    model_metadata = []
    alpha = 3.0
    for rank, lmbda, iteration in itertools.product(ranks, lambdas,
                                                    iterations):
        t0 = time()
        model = ALS.trainImplicit(training_data,
                                  rank,
                                  iterations=iteration,
                                  lambda_=lmbda,
                                  alpha=alpha)
        mt = '{:.2f}'.format((time() - t0) / 60)
        model_id = 'listenbrainz-recommendation-model-{}'.format(uuid.uuid4())
        t0 = time()
        validation_rmse = compute_rmse(model, validation_data, num_validation)
        vt = '{:.2f}'.format((time() - t0) / 60)
        model_metadata.append((model_id, mt, rank, '{:.1f}'.format(lmbda),
                               iteration, "%.2f" % (validation_rmse), vt))
        if best_model is None or validation_rmse < best_model.error:
            best_model = Model(model=model,
                               error=validation_rmse,
                               rank=rank,
                               lmbda=lmbda,
                               iteration=iteration,
                               model_id=model_id,
                               training_time=mt,
                               rmse_time=vt)
    best_model_metadata = {
        'error': '{:.2f}'.format(best_model.error),
        'rank': best_model.rank,
        'lmbda': best_model.lmbda,
        'iteration': best_model.iteration,
        'model_id': best_model.model_id,
        'training_time': best_model.training_time,
        'rmse_time': best_model.rmse_time
    }
    return best_model, model_metadata, best_model_metadata
def main(sc):
    hbaseconf = {
        "hbase.zookeeper.quorum": 'cluster3',
        "hbase.mapreduce.inputtable": 'testUserBehavior'
    }
    keyConv = "org.apache.spark.examples.pythonconverters.ImmutableBytesWritableToStringConverter"
    valueConv = "org.apache.spark.examples.pythonconverters.HBaseResultToStringConverter"
    hbase_rdd = sc.newAPIHadoopRDD(
        "org.apache.hadoop.hbase.mapreduce.TableInputFormat",
        "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
        "org.apache.hadoop.hbase.client.Result",
        keyConverter=keyConv,
        valueConverter=valueConv,
        conf=hbaseconf)

    data = hbase_rdd.map(change_format).filter(lambda x: x is not None).cache()

    users = set(data.map(lambda x: (x[0], 1)) \
                .reduceByKey(lambda x, y: x + y) \
                .map(lambda x: (x[1], x[0])) \
                .sortByKey(ascending = False) \
                .map(lambda x: x[1]).take(1000))

    data = data.filter(lambda x: x[0] in users)

    model = ALS.trainImplicit(data, 1, seed=10)
    results = model.recommendProductsForUsers(3).map(output_format).filter(
        lambda x: x is not None)

    sqlContext = SQLContext(sc)
    schema = StructType([
        StructField("Uid", StringType(), True),
        StructField("results", StringType(), True)
    ])
    r = sqlContext.createDataFrame(results, schema)

    url = "jdbc:mysql://cluster2"
    table = "CSDN.recommendation"
    properties = {"user": "******", "password": "******"}
    r.write.jdbc(url, table, 'overwrite', properties)
def main(argv):

    Conf = (SparkConf().setAppName("recommendation"))
    sc = SparkContext(conf=Conf)
    sqlContext = SQLContext(sc)

    dirPath = "hdfs://ec2-52-71-113-80.compute-1.amazonaws.com:9000/reddit/recommend/data/sr_userCount.parquet"
    rawDF = sqlContext.read.parquet(dirPath).persist(
        StorageLevel.MEMORY_AND_DISK_SER)
    # argv[1] is the dump of training data in hdfs
    # argv[2] is the user preferences

    # User Hash Lookup stored into cassandra
    user_hash = rawDF.map(lambda (a, b, c): (a, hashFunction(a)))
    distinctUser = user_hash.distinct()
    userHashDF = sqlContext.createDataFrame(distinctUser, ["user", "hash"])
    userHashDF.write.format("org.apache.spark.sql.cassandra").options(
        table="userhash", keyspace=keyspace).save(mode="append")

    # Product Hash Lookup stored into cassandra
    product_hash = rawDF.map(lambda (a, b, c): (b, hashFunction(b)))
    distinctProduct = product_hash.distinct()
    productHashDF = sqlContext.createDataFrame(distinctProduct,
                                               ["product", "hash"])
    productHashDF.write.format("org.apache.spark.sql.cassandra").options(
        table="producthash", keyspace=keyspace).save(mode="append")

    # Ratings for training
    # ALS requires a java hash of string. This function does that and stores it as Rating Object
    # for the algorithm to consume
    ratings = rawDF.map(
        lambda (a, b, c): Rating(hashFunction(a), hashFunction(b), float(c)))

    model = ALS.trainImplicit(ratings, 10, 10, alpha=0.01, seed=5)
    model.save(
        sc,
        "hdfs://ec2-52-71-113-80.compute-1.amazonaws.com:9000/reddit/recommend/model"
    )

    sc.stop()
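
# Rough sketch of a Java-compatible string hash that the hashFunction helper
# above could be implementing (Java's String.hashCode folded into a signed
# 32-bit integer); the actual helper is defined elsewhere in the original code.
def java_string_hashcode(s):
    h = 0
    for ch in s:
        h = (31 * h + ord(ch)) & 0xFFFFFFFF
    return h - 0x100000000 if h > 0x7FFFFFFF else h
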
def runModel(Sql, TableName, Rank = RANK, No_iterations = NO_ITERATIONS, Alpha = ALPHA):
    ### ALS ###
    print("start runModel")
    dfNonVariety_ALS = saveToTempTable(Sql = Sql, TableName = TableName)
    print(dfNonVariety_ALS.count())
    
    start = time.time()
    indexed_user = indexedUser(dfNonVariety_ALS)
    stop = time.time()
    print("done -- indexed_user " + str(stop-start))

    start = time.time()
    indexed_product = indexedProduct(indexed_user)
    stop = time.time()
    print("done -- indexed_product " + str(stop-start))

    start = time.time()
    ratings_NonVariety = indexed_product.rdd.map(lambda r: Rating(r.UserIdNew, r.PidNew, r.Visit))
    stop = time.time()
    print("done -- rating " + str(stop-start))

    alsModel = ALSModel()
    print("done -- create instance")
    
    start = time.time()
    alsModel.joined_rdd = indexed_product.select('UserIdNew').dropDuplicates().crossJoin(indexed_product.select('PidNew').dropDuplicates()).rdd.map(lambda x: (x[0], x[1]))
    stop = time.time()
    print("done -- cross join " + str(stop-start))

    start = time.time()
    saveToTempTable(DFObject = indexed_product, TableName='tbData')
    stop = time.time()
    print("done -- dump df_ALS to temp table " + str(stop-start))

    # ALS Implicit Model
    start = time.time()
    alsModel.model = ALS.trainImplicit(ratings_NonVariety, Rank, No_iterations, alpha=Alpha)
    stop = time.time()
    print("done -- model " + str(stop-start))
    return alsModel
Example #38
def Recommendation(filename, foods):
	print ('successful')
	sc = SparkContext('local', 'Simple App')

	ratings = sc.textFile(filename)
	processedRatings = ratings.map(lambda line: (int(line.split(",")[0]),int(line.split(",")[1]),float(line.split(",")[2])))
	users = ratings.map(lambda rating: int(rating.split(",")[0])).distinct().collect()

    #train model
	model = ALS.trainImplicit(processedRatings, 1,seed=10)

	rdict = {}
	recommenddict = {}
	for user in users:
		recommenddict.setdefault(user,[])
    
	for user in users: 
		rdict[user]= model.recommendProducts(user,5)
		for Rating in rdict[user]:
			recommenddict[user].append(foods[Rating.product])
		#recommenddict[user].append(Rating[1])       
	print (recommenddict)
	sc.stop()
	return recommenddict
    def train_implicit(self, rank, seed=0, iterations=50, lambda_=0.01, **kwargs):
        """
        Train the model using implicit ratings.

        Parameters
        ----------
        rank : int
            The number of factors in the underlying model.  Generally, larger numbers of factors
            lead to better models, but increase the memory required.  
            A rank in the range of 10 to 200 is usually reasonable.

        iterations : int, optional
            The number of iterations to perform.  With each iteration, the model improves.  ALS
            typically converges quickly, so a value of 10 is recommended.

        lambda : float, optional
            This parameter controls regularization, which counteracts overfitting.
            Higher values of lambda apply more regularization.
            The appropriate value depends on the problem and needs to be tuned
            by train/test techniques, which measure overfitting.

        Returns
        -------
        out : model
            A RecommenderModel.  This can be used to make predictions on how a
            user would rate an item.
        """

        ratings = self._prepare_ratings()
        model = ALS.trainImplicit(ratings.to_rdd(),
                          rank, 
                          iterations=iterations, 
                          lambda_=lambda_, 
                          seed=seed, 
                          **kwargs)
        return MatrixFactorizationModel(model, self.ratings, self.user_col, self.item_col, self.rating_col)
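
# Hypothetical call on an already-constructed recommender object; the rank,
# iteration count and regularization value simply follow the guidance in the
# docstring above.
model = recommender.train_implicit(rank=20, iterations=10, lambda_=0.01, seed=42)
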
Example #41
def home(request):

	prefs = ["IAmA","funny","nfl"]

	scfg=SparkConf()
	scfg.set("spark.cores.max",64)
	sc=SparkContext(master="spark://final-gateway:7077", appName="reddit-cf", conf=scfg)

	try:
		# prep data
		raw_counts = sc.textFile("hdfs://final-gateway/w251_cf-user-site-total")
		parsed_counts = raw_counts.map(lambda st: eval(st))
		all_ratings = parsed_counts.map( tup_to_rating )
		# assign user-identified preferred subreddits
		raw_prefs = [ (999, x, 100) for x in prefs ]
		my_prefs = sc.parallelize(raw_prefs).map(tup_to_rating)

		# train model
		model_input = all_ratings.union(my_prefs)
		model = ALS.trainImplicit(model_input, 10, 10, alpha=.01)

		# candidate prefs for prediction
		my_prefs_ids = set([javahash(x) for x in prefs])
		all_subreddit_ids = parsed_counts.map( lambda (a,b,c): (javahash(b),b) ).distinct().cache()
		candidates = all_subreddit_ids.map(lambda (a,b): a ).filter( lambda r: r not in my_prefs_ids)

		predictions = model.predictAll(candidates.map( lambda x: (999, x))).cache()

		final = predictions.map(lambda (a,b,c): (b,c)).join(all_subreddit_ids).map(lambda (b,(c,d)): (c,d) ).sortByKey(False)

		output = list( final.take(30) )
		sc.stop()
	except Exception, e:
		print("App failed. Stopping gracefully")
		sc.stop()
		raise Exception(e)
Example #42
def calc_cf_mllib(y_training_data, num_partitions=20):
    """
    Utilizes the ALS collaborative filtering algorithm in MLLib to determine the predicted ratings

    Args:
        y_training_data: the data used to train the RecSys algorithm in the format of an RDD of [ (userId, itemId, actualRating) ]

    Returns:
        predicted: predicted ratings in the format of a RDD of [ (userId, itemId, predictedRating) ].

    """

    #Predicted values can be anywhere - because we are normalizing the content based algorithms we should likely normalize here
    max_rating = y_training_data.map(lambda (user, item, rating): rating).max()
    min_rating = y_training_data.map(lambda (user, item, rating): rating).min()

    if max_rating == min_rating:
        min_rating = 0

    #MLLIb has two methods, train and trainImplicit().  Implicit data will go between zero and 1
    if min_rating == 0 and max_rating == 1:
        model = ALS.trainImplicit(y_training_data, rank=10, iterations=5)
    else:
        model = ALS.train(y_training_data, rank=10, iterations=5)

    #predict all user, item pairs
    item_ids = y_training_data.map(lambda (u, i, r): i).distinct()
    user_ids = y_training_data.map(lambda (u, i, r): u).distinct()
    user_item_combo = user_ids.cartesian(item_ids).coalesce(num_partitions)

    predicted = model.predictAll(user_item_combo.map(lambda x: (x[0], x[1])))

    norm_predictions = predicted.map(lambda (user, item, pred): (
        user, item, rechelp.squish_preds(pred, min_rating, max_rating)))

    return norm_predictions
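
# Hypothetical usage, assuming an existing SparkContext `sc` and the rechelp
# helper module imported by the original source; the toy triples below land in
# the implicit branch because the ratings span exactly 0 to 1.
ratings_rdd = sc.parallelize([
    (1, 101, 1.0), (1, 102, 0.0),
    (2, 101, 0.0), (2, 103, 1.0),
])
predictions = calc_cf_mllib(ratings_rdd, num_partitions=4)
print(predictions.take(5))
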
def recommend(sc, rawUserArtistData, rawArtistData, rawArtistAlias):
    bArtistAlias = sc.broadcast(buildArtistAlias(rawArtistAlias))
    allData = buildRatings(rawUserArtistData, bArtistAlias).cache()
    model = ALS.trainImplicit(ratings=allData, rank=50, iterations=10, lambda_=1.0, alpha=40.0)

    allData.unpersist()

    userID = 2093760
    recommendations = model.recommendProducts(userID, 5)
    recommendedProductIDs = map(lambda rec: rec.product, recommendations)

    artistByID = buildArtistByID(rawArtistData)

    recommendedArtists = artistByID.filter(lambda artist: artist[0] in recommendedProductIDs).collect()
    for val in recommendedArtists:
        print(val)

    someUsers = allData.map(lambda item: item.user).distinct().take(100)
    someRecommendations = map(lambda userId: model.recommendProducts(userId, 5),someUsers)
    formattedRecommendations = map(lambda recs: str(recs[0].user) + " -> " + ", ".join( map(lambda x: str(x.product), recs) ),someRecommendations)
    for val in formattedRecommendations:
        print(val)

    unpersist(model)
Example #44
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
from pyspark.sql import SparkSession

sc = SparkSession.builder \
      .appName("ALSmodel") \
      .getOrCreate()

# Load and parse the data
data = sc.sparkContext.textFile("/tmp/transactions_andre_als")

ratings = data.map(lambda l: l.split(','))\
    .map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))

# Build the recommendation model using Alternating Least Squares
rank = 15
numIterations = 5
model = ALS.trainImplicit(ratings, rank, numIterations, alpha=0.01)

# Evaluate the model on training data
testdata = ratings.map(lambda p: (p[0], p[1]))
predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
print("Mean Squared Error = " + str(MSE))

# Save and load model
model.save(sc.sparkContext, "/tmp/amolenaar/model")
        longToShortLocations[str(longLocation)] = shortLocation
    outStr = str(shortUserID) + "," + str(shortLocation) + "," + numVisits + "\n"
    fpout.write(outStr)
pickle.dump( longToShortLocations, open( "shortenLocations.p", "wb" ))
fp.close()
fpout.close()


# Load and parse the data
data = sc.textFile("file:///home/hadoop/RealVisitsDataShort.csv")
ratings = data.map(lambda l: l.split(',')).map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))

# Build the recommendation model using Alternating Least Squares
rank = 10
numIterations = 10
model = ALS.trainImplicit(ratings, rank, numIterations)

# Evaluate the model on training data
testdata = ratings.map(lambda p: (p[0], p[1]))
predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
print("Mean Squared Error = " + str(MSE))

#Save and load model
#commented out the save for now because the model already exists on hdfs
#uncomment this when you are ready to train a new model!
#model.save(sc, "target/tmp/myCollaborativeFilter")
sameModel = MatrixFactorizationModel.load(sc, "target/tmp/myCollaborativeFilter")
#parse the AskForRecsFor.csv file
f = open('AskForRecsForShort.csv')
userArtistDataFile = filePath + 'user_artist_data.txt'
rawUserArtistData = sc.textFile(userArtistDataFile)

# parse Artist data file
artistDataFile = filePath + 'artist_data.txt'
rawArtistData = sc.textFile(artistDataFile)
artistById = rawArtistData.map(parseArtistByIdData).filter(lambda (k, v) : k != -1)

# parse artist alias file
artistAliasDataFile = filePath + 'artist_alias.txt'
rawArtistAliasData = sc.textFile(artistAliasDataFile)
artistAlias = rawArtistAliasData.map(parseArtistAliasData).filter(lambda (k, v) : k != -1).collectAsMap()

# broadcast variable
bArtistAlias = sc.broadcast(artistAlias)


def processTrainData(line):
    (userId, artistId, count) = map(int, line.split(' '))
    
    artistAliasId = bArtistAlias.value.get(artistId)
    if artistAliasId == None: 
        artistAliasId = artistId
    return Rating(userId, artistAliasId, count)

trainData = rawUserArtistData.map(processTrainData).cache()

model = ALS.trainImplicit(trainData, 10)
print model.productFeatures()
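
# Hedged addition: productFeatures() returns an RDD of (artistId, latent-factor array);
# a quick sanity check is that the factor width matches the rank passed to trainImplicit.
sampleArtistId, sampleFactors = model.productFeatures().first()
print sampleArtistId, len(sampleFactors)   # expect the factor length to equal 10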

Beispiel #47
0
def findBestModel(data):
    # Build the recommendation model using Alternating Least Squares
    # TODO: tune the best model configuration; for now this early return trains a single
    # model and leaves the grid search below unreachable.
    rank = 10
    numIterations = 20
    return ALS.trainImplicit(data, rank, iterations=numIterations)
    ranks = [8,12]
    lambdas = [0.1, 1.0]
    numIters = [10,20]
    bestModel = None
    bestValidationRmse = float('inf')
    bestRank = 0
    bestLambda = -1.0
    bestNumIter = -1


    
    # Grid search: train with different hyperparameter values, keep the model with the
    # lowest validation RMSE, and use that model for predictions.
    for rank,lmbda,numIter in itertools.product(ranks,lambdas,numIters):
        
        model = ALS.trainImplicit(data, rank, numIter, lambda_=lmbda, alpha=0.01)
        
        validationRmse = computeRmse(model, validation, numValidation)
        
        #print "RMSE (validation) = %s for the model trained with " % str(validationRmse) + \
        #      "rank = %d, lambda = %f, and numIter = %d." % (rank,lmbda,numIter)
        if (validationRmse < bestValidationRmse):
            bestModel = model
            bestValidationRmse = validationRmse
            bestRank = rank
            bestLambda = lmbda
            bestNumIter = numIter
    

    #Computing Best MSE to display
    testRmse = computeRmse(bestModel, test, numTest)
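
# Hedged sketch of the computeRmse helper assumed above (it is not shown in this snippet):
# score the held-out (user, product) pairs, join predictions with the actual ratings,
# and return the root of the mean squared difference over n examples.
import math

def computeRmse(model, data, n):
    predictions = model.predictAll(data.map(lambda x: (x[0], x[1])))
    predictionsAndRatings = predictions.map(lambda x: ((x[0], x[1]), x[2])) \
        .join(data.map(lambda x: ((x[0], x[1]), x[2]))) \
        .values()
    return math.sqrt(predictionsAndRatings.map(lambda x: (x[0] - x[1]) ** 2).reduce(lambda a, b: a + b) / float(n))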
    def trainModel(self):
        data_triplets = self.loadRatings(self.sc, self.dataFile)

        #Split the data into 80% training data and 20% valdiation data
        train_triplet = data_triplets.sample(False,0.8, seed=1).cache()
        validation_triplets = data_triplets.subtract(train_triplet).cache()

        print 20 * '-', 'Started Training the ALS model', 20 * '-'
        #TODO set different ranks and lambdas
        ranks = [8, 10, 12]
        lambdas = [1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0]
        numIters = [20]
        alp = [1.0,10.0, 50.0, 100.0]
        bestModel = None
        bestValidationRMSE = float("inf")
        bestRank = 0
        bestLambda = -1.0
        bestNumIter = -1
        bestalpha = -1.0


        for rank, lmbda, numIter, a in itertools.product(ranks, lambdas, numIters, alp):

            print ("\nTraining ALS with rank = {}, Regularization parameter = {}, \n"
                   "number of iterations = {}, alpha = {}".format(rank, lmbda, numIter, a))

            model = ALS.trainImplicit(train_triplet, rank, lambda_=lmbda, iterations=numIter, alpha=a)

            #testdata contains only userId and songId
            testdata = validation_triplets.map(lambda r: (r[0], r[1]))

            #prediciton will contain userId, songId and predicted ratings
            predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))

            #Join the predicted ratings with actual rating to compute root mean square error
            actualsAndPredictions = validation_triplets.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
            RMSE = math.sqrt(actualsAndPredictions.map(lambda r: (r[1][0] - r[1][1]) ** 2).mean())
            print('\n RMSE = {}'.format(RMSE))

            if (RMSE < bestValidationRMSE):
                bestModel = model
                bestValidationRMSE = RMSE
                bestRank = rank
                bestLambda = lmbda
                bestNumIter = numIter
                bestalpha = a

        print ("\nThe best model was trained with Rank = {}, Regularization parameter ={} and"
               "\nNumber of Iterations = {}\n RMSE = {}, best alpha = {}"
               .format(bestRank, bestLambda, bestNumIter, bestValidationRMSE, bestalpha))

        print 20 * '-', 'Finished Training the ALS model', 20 * '-'


        # Evaluate the best model on the test set. Use the entire data file as test set.


        print(20 * '-', 'Testing on the given data file itself', 20 * '-')

        test_ratings = self.loadRatings(self.sc, self.dataFile)
        testdata = test_ratings.map(lambda p: (p[0], p[1]))
        predictions = bestModel.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
        ratesAndPreds = test_ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
        MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1]) ** 2).mean()
        MAE = ratesAndPreds.map(lambda r: abs(r[1][0] - r[1][1])).mean()

        print("Mean Squared Error = " + str(MSE))
        print("Mean Absolute Error = " + str(MAE))
        print("Root Mean Square Error = ", str(MSE ** .5))
        print(20 * '-', 'Testing Finished', 20 * '-')

        # Save the best model

        bestModel.save(self.sc, self.modelPath)
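
        # Hedged follow-up (addition): reload the persisted model to confirm the save
        # round-trips; assumes MatrixFactorizationModel is imported in this module.
        reloadedModel = MatrixFactorizationModel.load(self.sc, self.modelPath)
        print("Reloaded model from " + self.modelPath)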
Beispiel #50
0
    
raw_artist_data = sc.textFile("s3://ehec2/audio_data/artist_data.txt", 5)
artist_by_id = raw_artist_data.map(lambda line: line.split('\t'))\
                        .map(lambda token: tokenize(token, 1))
raw_artist_alias = sc.textFile("s3://ehec2/audio_data/artist_alias.txt")
artists_alias = raw_artist_alias.map(lambda line: line.split("\t"))\
                        .map(lambda token: tokenize(token))
raw_user_artist_data = sc.textFile("s3://ehec2/audio_data/user_artist_data.txt")
user_artist_data = raw_user_artist_data.map(lambda line: line.split())\
                        .map(lambda token: tokenize(token))

# Broadcast global variable for use in getting the right artist id
brdc_artists_alias = sc.broadcast(artists_alias.collectAsMap())

trainRDD = user_artist_data.map(lambda row: train_tokenize(row)).cache()
# Keyword arguments avoid a positional pitfall: the fifth positional parameter of
# trainImplicit is `blocks`, not alpha, so the trailing 1 was presumably meant as alpha.
model = ALS.trainImplicit(trainRDD, rank=10, iterations=5, lambda_=0.01, alpha=1.0)

# Spot-checking recommendations
# Extract the IDs of artists this user has listened to and print their names:
# search the input for this user's artist IDs, then filter the artist table by
# those IDs so the names can be collected and printed.

# To check recommendations for another user, change test_user_id below.
test_user_id = 2093760

# Getting all artists user test_user_id has listened to
artists_for_user = user_artist_data.filter(lambda l: l[0] == test_user_id)
existing_products = set(artists_for_user.map(lambda l: l[1]).collect())

# Getting the names and ids of artists user has listened to
artists_for_user = artist_by_id.filter(lambda art: art[0] in existing_products)\
                    .map(lambda art: art[1]).collect()
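
# Hedged continuation (addition): recommend 5 artists for test_user_id and map the
# returned product ids back to names with artist_by_id, mirroring the lookup above.
recommendations = model.recommendProducts(test_user_id, 5)
recommended_ids = set(rec.product for rec in recommendations)
recommended_names = artist_by_id.filter(lambda art: art[0] in recommended_ids)\
                    .map(lambda art: art[1]).collect()
print(recommended_names)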
Beispiel #51
0
(training,test) = CatRating.randomSplit([0.8,0.2])

training.count()
1202345

test.count()
300611

# Call the ALS.trainImplicit method to train the model

from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

rank = 10
numIterations = 10
model = ALS.trainImplicit(training, rank, numIterations, alpha=0.01)

model
<pyspark.mllib.recommendation.MatrixFactorizationModel object at 0x7fb12319b810>

# Evaluate the model on training data

testdata = test.map(lambda r: (r[0],r[1]))

type(testdata)
<class 'pyspark.rdd.PipelinedRDD'>

testdata.take(5)
[(278716, 1558), (1387683, 324), (639095, 1192), (240681, 646), (31895, 969)]
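
# Hedged continuation (not in the original transcript): score the held-out pairs and
# compute MSE on the test split, following the same pattern as the other examples.
predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
ratesAndPreds = test.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
print("Mean Squared Error = " + str(MSE))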

Beispiel #52
0
#rawRatings3.take(10)


# In[15]:

#ALS.extractParamMap()


# In[16]:

from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
rank = 200
numIterations = 20
#model = ALS.train(rawRatings3, rank, numIterations, 0.01)
#model = ALS.trainImplicit(rawRatings3, rank, numIterations, 0.01)
model = ALS.trainImplicit(rawRatings3, rank, numIterations, 0.03)


# In[17]:

model.recommendProducts(100000,5)


# In[18]:

#type(model)


# In[19]:

#type(ALS)
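

# In[20]:

# Hedged addition (this cell is not in the original notebook): recommendProducts returns
# Rating(user, product, rating) namedtuples sorted by descending score; keeping just the
# product ids is usually what downstream code needs.
top5 = model.recommendProducts(100000, 5)
top5_ids = [r.product for r in top5]
top5_ids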
Beispiel #53
0
if split:
    # randomSplit returns a list of RDDs, so unpack it directly and persist the mapped ratings below
    rand_a, rand_b = raw_data.randomSplit(weights=[0.5, 0.5], seed=99)
    ratings_a = rand_a.map(lambda row: Rating(row[0],row[1],row[2])).persist()
    ratings_b = rand_b.map(lambda row: Rating(row[0],row[1],row[2])).persist()
else:
    ratings = raw_data.map(lambda row: Rating(row[0],row[1],row[2])).persist()
    base_model_name = d+'model_'


#with open(d+'log_rmse','a') as fout:
for k in k_range:
    start = time.time()

    if split:
        model_a = ALS.trainImplicit(ratings_a, rank=k, iterations=n_iter, alpha=0.01, nonnegative=True)
        model_b = ALS.trainImplicit(ratings_b, rank=k, iterations=n_iter, alpha=0.01, nonnegative=True)
        model_a.save(sc, 'model_rand_a_' + str(k))
        model_b.save(sc, 'model_rand_b_' + str(k))
        artist_features_a = np.array(model_a.productFeatures().sortByKey().map(lambda row: row[1]).collect())
        np.save(d + "features_rand_a_{}".format(k), artist_features_a)
        artist_features_b = np.array(model_b.productFeatures().sortByKey().map(lambda row: row[1]).collect())
        np.save(d + "features_rand_b_{}".format(k), artist_features_b)
    else:
        model = ALS.trainImplicit(ratings,rank=k,iterations=n_iter,alpha=0.01,nonnegative=True)
        model.save(sc,d+'model_'+str(k))
        artist_features = np.array(model.productFeatures().sortByKey().map(lambda row: row[1]).collect())
        np.save(d+"features_{}".format(k),artist_features)
        user_features = np.array(model.userFeatures().sortByKey().map(lambda row: row[1]).collect())
        np.save(d+"user_features_{}".format(k),user_features)
Beispiel #54
0

if __name__ == "__main__":

    if len(sys.argv) != 3:
        print("""
        first argument is ratings file and second is output file
        """)
        exit(-1)

    txtFile = sys.argv[1]
    outputFile = sys.argv[2]

    sc = SparkContext(appName="ALSExample")
    ratings = sc.textFile(txtFile)
    processedRatings = ratings.map(lambda line: line.split(",")) \
        .map(lambda fields: (int(fields[0]), int(fields[1]), float(fields[2])))
    users = ratings.map(lambda rating: int(rating.split(",")[0])).distinct().collect()

    #train model
    model = ALS.trainImplicit(processedRatings, 1,seed=10)

    outArray=[]
    f=open(outputFile,'w')
    
    for user in users:        
        outArray.append(model.recommendProducts(user,20))

    f.write(json.dumps(outArray))

    sc.stop()
    f.close()
Beispiel #55
0
  #   * k-folds (inner loop)
  iter_cnt = 0
  for rv in rankVal:
    for train_index, test_index in kf:
      iter_cnt += 1

      results = {}
      pair_train, pair_test = pairs[train_index], pairs[test_index]

      bid_train = sc.parallelize(pair_train).flatMap(lambda x: ((x[0],x[3]), (x[2],x[1])))

      train = uni.union(bid_train)\
                 .union(wish)\
                 .map(lambda l: Rating(int(l[0]), int(l[1]), float(1.0)))

      model = ALS.trainImplicit(train, rv, numIterations, lambda_=_lambda, alpha=_alpha)

      # Reconstruction error
      testdata = train.map(lambda p: (p[0], p[1]))
      print "Test data len: ", testdata.count()
      predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
      ratesAndPreds = train.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
      MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()

      print "-"*50, " RANK: ", rv, "ITER: ", iter_cnt

      users_in_train = train.map(lambda x: x[0] ).distinct().collect()
      items_in_train = train.map(lambda x: x[1] ).distinct().collect()

      recom = []
      for i,p in enumerate(pair_test):
Beispiel #56
0
from pyspark import SparkContext
from pyspark.mllib.recommendation import ALS
import terragon

sc = SparkContext()

r1 = (1, 1, 1.0)
r2 = (1, 2, 2.0)
r3 = (2, 1, 2.0)
ratings = sc.parallelize([r1, r2, r3])
model = ALS.trainImplicit(ratings, 1, seed=10)
model.predict(2, 2)


stringified_spark_model = terragon.dumps_spark_to_base64(sc, model)

with open("/tmp/test.sparkle", "w") as f:
    f.write(stringified_spark_model)
Beispiel #57
0
    top_repos = starpairs\
        .groupBy(lambda t: t[1])\
        .sortBy(lambda t: len(t[1]), False)\
        .map(lambda t: t[0])\
        .take(sample)
    top_repos_rdd = sc.parallelize(top_repos)
    top_repos_rdd.cache()
    top_repos_bc = sc.broadcast(top_repos)
    pprint(top_repos[:5])

    starpairs_filtered = starpairs.filter(lambda t: t[1] in top_repos_bc.value)
    starpairs_filtered.cache()

    # train recommendation model using alternating least squares
    stars_with_rating = starpairs_filtered.map(lambda t: array([t[0], t[1], 1]))
    model = ALS.trainImplicit(stars_with_rating, rank=1)

    # get all user->repo pairs without stars
    users_repos = users.cartesian(top_repos_rdd).groupByKey()
    stars_grouped = starpairs_filtered.groupByKey()
    unstarred = users_repos.join(stars_grouped)\
        .map(lambda i: (i[0], set(i[1][0]) - set(i[1][1]) ))\
        .flatMap(lambda i: [ (i[0], repo) for repo in i[1] ] )

    # predict unstarred user-repo pairs.
    predictions = model.predictAll(unstarred)

    # for each user, associate the 5 repos with the highest predicted rating.
    top = predictions\
        .map(lambda t: (t[0], (t[1],t[2])))\
        .groupByKey()\
Beispiel #58
0
	and 0 indicates that the user didn't listen to the song. 
	'''	
	# calls format_line to convert strings to integers based off of the hash table. Gets all songs(ints) for a user as an array
	data = values.map(lambda x: format_line(x, UID_INDEX, int(indexOfField), value_hash, id_hash)).distinct().groupByKey() # or cache()
	
	# consider all songs that a user hasn't listened to and store in a numpy array
	user_arrays = data.map(lambda x: dichotomize(x))

	# create (user.id, np.array) format ====> (user.id, product.id, viewed) double lambda FTW
	ratings = user_arrays.flatMap(lambda user: map(lambda(i, x): array([float(user[0]), float(i), float(x)]), enumerate(user[1])))

	# sample without replacement
	training, test = make_training_and_test(data)

	# train the ALS model with 1 latent factor and 10 iterations (enough for ALS to converge);
	# note that training uses the full ratings RDD, not the training split created above
	model = ALS.trainImplicit(ratings, 1, 10)
	

	'''
	ALS Prediction for Model:
		* For a given user ID, get the top n-recommendations 
		* Due to the sparsity of the matrix, many of the recommendations have very low confidence scores
	'''
	prompt = "\n Enter the user ID for the user you want to get recommendations for \n"
	print prompt
	id_to_recommend = float(raw_input())
	try :
		id_to_num = id_hash[id_to_recommend]
	except :
		print "ID %s is currently not stored in the predictive model :(" % id_to_recommend
		break