Example 1
users.distinct().count() 

#We don't have to extract data to its own RDD
#This command counts the distinct movies
#There are 1,682 movies
clean_data.map(lambda y: int(y[1])).distinct().count()

#Need to import functions / objects from MLlib
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

#We'll need to map the MovieLens data to a Rating object 
#A Rating object is made up of (user, item, rating)
mls = movielens.map(lambda l: l.split('\t'))
ratings = mls.map(lambda x: Rating(int(x[0]),\
    int(x[1]), float(x[2])))
    
#Need a training and test set
train, test = ratings.randomSplit([0.7,0.3],7856)

train.count() #70,005
test.count()

rank = 5 # Latent Factors to be made
numIterations = 100 # Times to repeat process
#Create the ALS model on the training data
model = ALS.train(train, rank, numIterations)

# Evaluate the model on the test data
# by dropping the ratings from the test data
testdata = test.map(lambda p: (p[0], p[1]))
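# A minimal evaluation sketch (not part of the original snippet): score the held-out
# pairs with the trained ALS model and compute the MSE, following the same pattern
# used in the later examples.
predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
ratesAndPreds = test.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
print("Mean Squared Error = " + str(MSE))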
Example 2
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

sc.stop()
conf = SparkConf().setMaster("local").setAppName("Question2")
sc = SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()

# Load and parse the data
data = sc.textFile("dbfs:/FileStore/tables/ratings.dat").map(lambda line: line.split("::")).map(lambda x: Rating(int(x[0]), int(x[1]), int(x[2])))
splits = data.randomSplit([6, 4], 24)
trainData = splits[0]
testData = splits[1]
rank = 10
iterations = 20
model = ALS.train(trainData, rank, iterations)

testLabel = testData.map(lambda p: ((p[0], p[1]), p[2]))
testData = testData.map(lambda p: (p[0], p[1]))
predictions = model.predictAll(testData).map(lambda r: ((r[0], r[1]), r[2]))

combined_result = predictions.join(testLabel)
MSE = combined_result.map(lambda r: (r[1][0] - r[1][1])**2).mean()
print("Mean Squared Error = " + str(MSE))
Example 3
def Rating_info(sc, file_path):
    rating = sc.textFile(file_path).map(lambda x: x.split('\t')).map(
        lambda x: Rating(int(x[0]), int(x[1]), float(x[2])))
    return rating
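# Hedged usage sketch: Rating_info expects tab-separated user/item/rating lines, e.g.
# the MovieLens u.data file. The path below is hypothetical and an active SparkContext
# sc is assumed.
ratings = Rating_info(sc, "ml-100k/u.data")
ratings.take(3)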
Example 4

d = 'NMF_tests_10iter/'
n_iter = 10

k_range = np.arange(200,201,10)
split = False
#alpha_range = [0.0001,0.001,0.01,0.05,1.0,5,10]
#reg_range =[1e-8,1e-04,0.01,0.1,0.5,1.0]


raw_data = sc.textFile("mf_format.txt").map(lambda row: [int(val) for val in row.strip().split(',')])

if split:
    rand_a,rand_b = raw_data.randomSplit(weights=[0.5,0.5],seed=99)
    ratings_a = rand_a.map(lambda row: Rating(row[0],row[1],row[2])).persist()
    ratings_b = rand_b.map(lambda row: Rating(row[0],row[1],row[2])).persist()
else:
    ratings = raw_data.map(lambda row: Rating(row[0],row[1],row[2])).persist()
    base_model_name = d+'model_'


#with open(d+'log_rmse','a') as fout:
for k in k_range:
    start = time.time()

    if split:
        model_a = ALS.trainImplicit(ratings_a,rank=k,iterations=n_iter,alpha=0.01,nonnegative=True)
        model_b = ALS.trainImplicit(ratings_b,rank=k,iterations=n_iter,alpha=0.01,nonnegative=True)
        model_a.save(sc,'model_rand_a_'+str(k))
        model_b.save(sc,'model_rand_b_'+str(k))
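    else:
        # Hedged sketch (the original is truncated here): in the non-split branch a
        # single model could be trained on the full ratings RDD with the same
        # implicit-ALS settings and saved under base_model_name; the timing line
        # below uses the `start` value set above.
        model = ALS.trainImplicit(ratings,rank=k,iterations=n_iter,alpha=0.01,nonnegative=True)
        model.save(sc, base_model_name + str(k))
    print('k=%d finished in %.1fs' % (k, time.time() - start))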
Example 5
import pyspark
sc = pyspark.SparkContext(master="local[3]", appName="ML project 2")

from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

from helpers import row_col_spark, load_data, create_csv_submission

import re

r = re.compile(r'r(\d+)_c(\d+)')
# Load and parse the data
data2 = sc.textFile("data_train.csv")
header = data2.first() #extract header
data2 = data2.filter(lambda row: row != header) 

ratings = data2.map(lambda l: l.split(','))
ratings = ratings.map(lambda l: Rating(*row_col_spark(l[0], r), float(l[1])))
train2, test2 = ratings.randomSplit([0.9, 0.1], seed=4242)

#Need to cache the data to speed up training
train2.cache()
test2.cache()

sc.setCheckpointDir('checkpoint/')

l_s = 0.09
r_s = 100

print("""
============================
lambda = {}   rank = {}
============================
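# Hedged training sketch (the original is cut off after the banner above): fit ALS on
# the cached training split with the printed hyperparameters and report RMSE on the
# held-out split. The iteration count of 20 is an assumption.
model = ALS.train(train2, rank=r_s, iterations=20, lambda_=l_s)
test_pairs = test2.map(lambda x: (x[0], x[1]))
preds = model.predictAll(test_pairs).map(lambda p: ((p[0], p[1]), p[2]))
rmse = (test2.map(lambda x: ((x[0], x[1]), x[2])).join(preds)
        .map(lambda x: (x[1][0] - x[1][1]) ** 2).mean()) ** 0.5
print("RMSE = {}".format(rmse))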
Example 6
def parseLine(line, sep=','):
    '''Parse a line into a Rating object for the ALS model: user,item,rating'''
    fields = line.split(sep)
    return Rating(user=int(fields[0]),
                  product=int(fields[1]),
                  rating=float(fields[2]))
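# Hedged usage sketch: parseLine is meant to be mapped over a text file of
# "user,item,rating" lines; the path below is hypothetical and an active
# SparkContext sc is assumed.
ratings = sc.textFile("ratings.csv").map(parseLine)
ratings.take(3)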
Example 7
import json
from crux.util import loadPickle
from operator import add
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
from pyspark import SQLContext, SparkContext, SparkConf
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

conf = SparkConf().setAppName("chencheng's task").setMaster(
    "spark://anti-spam-spark-001.yz.momo.com:8081,anti-spam-spark-002.yz.momo.com:8081"
)
sc = SparkContext(conf=conf)

user_artist_data = sc.textFile(
    "hdfs://antispam/user/hadoop/output/chencheng/crux/data/bobby/book/")

#tfidf=loadPickle('/home/hadoop/chen.cheng/moa/book_tfidf.pkl')
#b = sc.broadcast(tfidf)

ratings = user_artist_data.map(lambda x: json.loads(x))\
        .flatMap(lambda x: [[x[0], item] for item in x[1]]) \
        .filter(lambda x: x[0] and x[1] )\
        .map(lambda x: Rating(int(x[0]), int(x[1]), 1))
ratings.cache()

rank = 10
numIterations = 20
model = ALS.trainImplicit(ratings, rank, numIterations, alpha=10.0)

model.save(
    sc, "hdfs://antispam/user/hadoop/output/chencheng/model/als_book_alpha=10")
Example 8
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
import itertools

test_ratings=[]
ratings = sc.textFile("/FileStore/tables/ratings.dat")
ratings=ratings.map(lambda x: x.split("::")).map(lambda row: [int(row[0]), int(row[1]),float(row[2])])
training, test = ratings.randomSplit([0.6, 0.4])
test_data=test.map(lambda r: (r[0],r[1]))
training=training.map(lambda row: Rating(row[0],row[1],row[2]))

rank = 50
numIterations =20
model = ALS.train(training, rank, numIterations,0.01)
predictions = model.predictAll(test_data).map(lambda r: ((r[0], r[1]), r[2]))
ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
pred=ratesAndPreds.map(lambda r:(r[1][0],round(r[1][1])))
accuracy = 100 *(pred.filter(lambda pl: pl[0] == pl[1]).count())/ test.count()
print('Model Accuracy: {}'.format(accuracy))
def parseLine(line):
    fields = line.split("|")
    return Rating(int(fields[0]), int(fields[1]), float(fields[2]) - 2.5)
    1]], [float(a[2])])).reduceByKey(lambda a, b: a + b).map(
        lambda a: (a[0], sum(a[1]) / len(a[1]))).collect()
for id in missing_businessid:
    business_average[id[0]] = id[1]
#print(business_average)

validation_RDD = sc.textFile(validation_file_path).map(lambda a: a.split(","))
# saving the header #
header_1 = validation_RDD.first()
# filtering based on header #
validation_data = validation_RDD.filter(lambda a: a != header_1)
#print(validation_data.count())

# Model Based Collaborative Filtering #
if case == 1:
    ratings = training_data.map(lambda a: Rating(
        int(user_dict[a[0]]), int(business_dict[a[1]]), float(a[2])))
    rank = 8
    numIterations = 10
    model = ALS.train(ratings, rank, numIterations, 0.2)

    #checking the existence of ids from validation set in training set #
    def check_id(id):
        if id[0] not in user_dict:
            user_id = -1
        elif id[0] in user_dict:
            user_id = user_dict[id[0]]
        if id[1] not in business_dict:
            business_id = -1
        elif id[1] in business_dict:
            business_id = business_dict[id[1]]
        return ((user_id, business_id))
def create_rating(rating_record):
    tokens = rating_record.split(',')
    userID = int(tokens[0])
    productID = int(tokens[1])
    rating = float(tokens[2])
    return Rating(userID, productID, rating)
user_data = sys.argv[2]
output = sys.argv[3]

userId = 112132212
movie = Row("id", "movieName")

movie_table = sc.textFile(movie_data + str("/movies.dat"))
rating_table = sc.textFile(movie_data + str("/ratings.dat"))
user_data_table = sc.textFile(movie_data + str("/users.dat"))
new_user = sc.textFile(user_data)

movieRDD = movie_table.map(lambda movie: movie.split("::"))

ratingDF = (rating_table.map(lambda rating: rating.split("::")).map(
    lambda rate: (int(rate[0]), int(rate[1]), float(rate[2]))).map(
        lambda (uid, mid, rate): Rating(uid, mid, rate))).toDF()

newUserRDD = new_user.map(lambda movie: movie.split(" ", 1))

joinRDD = movieRDD.cartesian(newUserRDD)
joinRDD = (joinRDD.map(lambda (movie, umovie): (movie[0], movie[1], umovie[
    0], umovie[1])).map(lambda (id, movie, urate, umovie): (umovie, (
        id, urate, levenshtein(movie, umovie)))).reduceByKey(
            lambda x1, x2: min(x1, x2, key=lambda x: x[-1])))

userMovie = (
    joinRDD.map(lambda (key, value): (userId, value[0], value[1])).map(
        lambda (uid, mid, rate): Rating(uid, int(mid), float(rate)))).toDF()

trainDF = ratingDF.cache()
Example 13
def names():
    namedict = dict()
    file = open("./ml-100k/ml-100k/u.item")
    for record in file:
        fields = record.split("|")
        namedict[(int(fields[0]))] = fields[1].encode(encoding="ascii",
                                                      errors="ignore")
    return namedict


conf = SparkConf().setMaster("local[*]").setAppName("ALS recommendation")
sc = SparkContext(conf=conf)

namedict = names()
lines = sc.textFile("file:///SparkCourse/ml-100k/ml-100k/u.data")
train_data = lines.map(lambda x: x.split()).map(
    lambda x: Rating(int(x[0]), int(x[1]), float(x[2]))).cache()

numiter = 10
rank = 10

print("Training model")
model = ALS.train(train_data, rank, numiter)

id = int(sys.argv[1])

movies = train_data.filter(lambda x: x[0] == id)

for i in movies.collect():
    print(namedict[i[1]])

#top 10 recommendation
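# Hedged sketch (the original is cut off after the comment above): top-10
# recommendations for the requested user id, resolved to titles via namedict.
for rec in model.recommendProducts(id, 10):
    print(namedict[rec.product])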
Example 14
def main(data_source, users_source, output, number_recs):

    users_data = {}
    with open(users_source) as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            users_data[int(row[0]) - 1] = row[1]

    # This should be changed if running on cluster
    conf = SparkConf().setMaster("local[*]").setAppName("AptoideALS")

    sc = SparkContext(conf=conf)
    sc.setLogLevel("OFF")
    # Load and parse the data
    data = sc.textFile(data_source)
    ratings = data.map(lambda l: l.split(','))\
        .map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2]))).cache()

    # Build the recommendation model using Alternating Least Squares
    seed = 5
    iterations = 10
    # A basic L2 regularizer to reduce overfitting
    regularization_parameter = 0.1
    # Number of features used to describe items
    rank = 50
    # The confidence weight on observed interactions (how strongly we trust that the user likes the item)
    alpha = 100.0

    model = ALS.trainImplicit(ratings,
                              rank,
                              seed=seed,
                              iterations=iterations,
                              lambda_=regularization_parameter,
                              alpha=alpha)

    # Evaluate the model on training data
    testdata = ratings.map(lambda p: (p[0], p[1]))
    predictions = model.predictAll(testdata).map(lambda r:
                                                 ((r[0], r[1]), r[2]))
    ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join(
        predictions)
    MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
    print("Mean Squared Error = " + str(MSE))

    # Use Spotify annoy to get items neighbors based on the feature vectors
    # n_trees -> a larger value will give more accurate results, but larger indexes
    # search_k -> a larger value will give more accurate results, but will take longer time to return
    index = AnnoyIndex(rank, 'angular')
    items = model.userFeatures().collect()
    for i, vector in items:
        # Annoy starts at index 0, while Spark starts at index 1, so we subtract 1 from every index
        index.add_item(i - 1, vector)
    # n_trees
    index.build(300)
    index.save("index.ann")
    sc.addPyFile("index.ann")

    # Broadcast: improve performance by sending the data once per node rather than once per task
    names = sc.broadcast(users_data)

    # Major function to get recommendations based on features vectors
    # assumes items are numbered 0 ... n-1
    def find_neighbors(iter):
        t = AnnoyIndex(rank, 'angular')
        t.load(SparkFiles.get("index.ann"))
        # search_k
        return ((x[0] - 1, t.get_nns_by_item(x[0] - 1, int(number_recs)))
                for x in iter)

    # Function to convert into the format required
    # Need to convert inside the RDD so it makes use of Spark's file writer
    def construct_string(x):
        array = []
        order = int(number_recs)
        for item in x[1]:
            if item != x[0]:
                array.append("(\"{}\",{})".format(names.value[item],
                                                  str(order)))
                order -= 1
        result = "\"{}\",{}".format(names.value[x[0]], str(array)).replace(
            " ", "").replace("[", "").replace("]", "").replace("'", "")
        return result

    similarRDD = model.productFeatures().mapPartitions(find_neighbors)
    similarRDD.map(construct_string).saveAsTextFile(output)
Example 15
    playerprob = sc.textFile("finalcluster2")
    playerprob=playerprob.map(lambda x:x.split(','))
    playerprobfinal=playerprob.map(lambda x:compute(x))
   
    player_schema = StructType() \
        .add("Bat", "string").add("Bowl", "string") \
        .add("0", "float").add("1", "float").add("2", "float") \
        .add("3", "float").add("4", "float").add("6", "float").add("w", "float")
    dataframe = sqlContext.createDataFrame(playerprobfinal, player_schema)
    rddtodf = [StringIndexer(inputCol=j, outputCol=j + "_i")
               for j in set(dataframe.columns) - {"0", "1", "2", "3", "4", "6", "w"}]
    dataframe1 = Pipeline(stages=rddtodf)
    indexed = dataframe1.fit(dataframe).transform(dataframe)
    playerprob = indexed.rdd.map(tuple)


    rank = 10
    numIterations = 10
    ratingszero = playerprob.map(lambda x: Rating(int(x[9]), int(x[10]), float(x[2])))
    ratingsone = playerprob.map(lambda x: Rating(int(x[9]), int(x[10]), float(x[3])))
    ratingstwo = playerprob.map(lambda x: Rating(int(x[9]), int(x[10]), float(x[4])))
    ratingsthree = playerprob.map(lambda x: Rating(int(x[9]), int(x[10]), float(x[5])))
    ratingsfour = playerprob.map(lambda x: Rating(int(x[9]), int(x[10]), float(x[6])))
    ratingssix = playerprob.map(lambda x: Rating(int(x[9]), int(x[10]), float(x[7])))
    ratingswickets = playerprob.map(lambda x: Rating(int(x[9]), int(x[10]), float(x[8])))

    modelzero = ALS.train(ratingszero, rank, numIterations)
    modelone = ALS.train(ratingsone, rank, numIterations)
    modeltwo = ALS.train(ratingstwo, rank, numIterations)
    modelthree = ALS.train(ratingsthree, rank, numIterations)
    modelfour = ALS.train(ratingsfour, rank, numIterations)
    modelsix = ALS.train(ratingssix, rank, numIterations)
    modelwickets = ALS.train(ratingswickets, rank, numIterations)
    models = [modelzero,modelone,modeltwo,modelthree,modelfour,modelsix,modelwickets]
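    # Hedged sketch (not in the original): each model predicts the probability of one
    # ball outcome for a given (batsman_index, bowler_index) pair; the indices below
    # are placeholders.
    outcome_probs = [m.predict(1, 2) for m in models]
    print(outcome_probs)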
from pyspark.sql import SQLContext
import pandas as pd
import pyspark.sql.functions as f
from time import time

#initialize spark
conf = SparkConf().setAppName('test')
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")
sqlContext = SQLContext(sc)

#read train file
data = sqlContext.read.format('com.databricks.spark.csv').options(
    header='true').load('train_2.csv')
ratings = data.rdd.map(
    lambda l: Rating(int(l.userID), int(l.movieID), float(l.rating)))
print ratings.take(5)

################ Create model using train set ######################
t0 = time()
rank = 10
numIterations = 10
model = ALS.train(ratings, rank, numIterations)
tt = time() - t0
print "Model trained in %s seconds" % round(tt, 3)

################ Apply model on test set ######################
#read test file
test_data = sqlContext.read.format('com.databricks.spark.csv').options(
    header='true').load('test_2.csv')
test_all = test_data.rdd.map(
sc = SparkContext("local", "collaborative_filtering")  #initializing sc
sqlContext = SQLContext(sc)

df = sqlContext.read.load("./tables/ratings")

#movieId, rating, timestamp, userid
num_ratings = df.select("rating").count()
num_movies = df.select("movieId").distinct().count()
num_users = df.select("userId").distinct().count()

ratings = "./ratings.csv"  #The path to ratings file. change this according to file location

#Loading the data using SparkContext
data = sc.textFile(ratings)
ratings_data = data.map(lambda l: l.split(','))
ratings = ratings_data.map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))

#Building the recommendation model using Alternating Least Squares
rank = 10
numIterations = 5
model = ALS.train(ratings, rank, numIterations)

#Evaluate the model on training data
testdata = ratings.map(lambda p: (p[0], p[1]))
predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
print("Mean Squared Error = " + str(MSE))

#Lets save the model for future use
print("Model Computed. Saving the model...")
Example 18
@author: BuleSky
'''

from pyspark import SparkConf, SparkContext
# Configure the Spark environment
sparkConf = SparkConf().setAppName("WordCounts").setMaster("local")
sc = SparkContext(conf=sparkConf)

from pyspark.mllib.recommendation import ALS, Rating
#file_path="hdfs://blue:9000/data/md/u.data"
file_path = "file:///E:/spark/learn/ml-100k/"

rawRatings = sc.textFile(file_path).map(lambda line: line.split("\t")[0:3])

ratings = rawRatings.map(
    lambda fields: Rating(int(fields[0]), int(fields[1]), int(fields[2])))

print ratings.first()

model = ALS.train(ratings, 50, 10, 0.01)
userFeaturesCount = model.userFeatures().count()
productFeaturesCount = model.productFeatures().count()

print "userCount {0} productCount {1}".format(userFeaturesCount,
                                              productFeaturesCount)

predicted = model.predict(789, 123)
print predicted

userId = 789
k = 10
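# Hedged sketch (the original stops here): top-k recommendations for userId.
topK = model.recommendProducts(userId, k)
for rec in topK:
    print rec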
Example 19
""""
Program:als.py
Description:sparl内置als算法调用
Author: zhenglei - [email protected]
Date: 2016-01-14 12:56:53
Last modified: 2016-01-14 14:45:34
Python release: 2.7
"""
from pyspark import SparkContext
from pyspark.mllib.recommendation import ALS, Rating

if __name__ == '__main__':
    sc = SparkContext()
    data = sc.textFile("alsTest.data")
    ratings = data.map(lambda l: l.split(',')).map(
        lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))
    print ratings.collect()
    rank = 10
    numIterations = 10
    # Train the model. rank is the number of latent factors; a common approach is to
    # start around 5-10 and increase it until the results stop improving.
    model = ALS.train(ratings, rank, numIterations)
    testdata = ratings.map(lambda p: (p[0], p[1]))
    # Make predictions on the input data
    predictions = model.predictAll(testdata).map(lambda r:
                                                 ((r[0], r[1]), r[2]))
    print predictions.collect()
    # Join all predictions with the test data
    ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join(
        predictions)
    print ratesAndPreds.collect()
    # Compute the error
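    # Hedged sketch (the original ends at the comment above): mean squared error over
    # the joined (actual, predicted) pairs, as in the other examples.
    MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1]) ** 2).mean()
    print 'Mean Squared Error = ' + str(MSE)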
Example 20
def train():
    conf = SparkConf() \
        .setAppName("project") \
        .setMaster("local[*]") \
        .set("spark.driver.memory","4g")
    sc = SparkContext(conf=conf)

    # check model dir
    if not os.path.exists(model_dir):
        os.mkdir(model_dir)

    # rename
    raw_data = sc.textFile(train_file).map(json.loads).persist(StorageLevel.MEMORY_AND_DISK)
    u_table1 = raw_data.map(lambda x: x['user_id']).distinct().collect()
    u_set1 = set(u_table1)
    b_table1 = raw_data.map(lambda x: x['business_id']).distinct().collect()
    b_set1 = set(b_table1)

    user_avg = support.getAvg(user_avg_file)
    business_avg = support.getAvg(business_avg_file)
    u_set2 = set(user_avg.keys())
    b_set2 = set(business_avg.keys())

    b_table3 = sc.textFile(business_json).map(json.loads).map(lambda x: x['business_id']).collect()
    b_set3 = set(b_table3)

    u_table = list(u_set1.union(u_set2))
    b_table = list(b_set1.union(b_set2).union(b_set3))
    u_d = {u_table[i]: i for i in range(len(u_table))}
    b_d = {b_table[i]: i for i in range(len(b_table))}

    # augmentation
    business_avg = support.getAvg(business_avg_file)
    n_b_avg = {b_d[k]: business_avg[k] for k in business_avg}

    # get stopwords
    stopwords = sc.textFile(stopwords_file).collect()

    b_profile = sc.textFile(business_json) \
        .map(json.loads) \
        .map(lambda x: (x['business_id'], x['categories'])) \
        .map(lambda x: (b_d[x[0]], x[1])) \
        .mapValues(lambda v: processCategories(v, stopwords)) \
        .collectAsMap()
    b_list = list(sorted(b_profile.keys()))
    b_length = len(b_profile)
    jaccard_sim = sc.parallelize(b_list) \
        .flatMap(lambda x: getJS(x, b_profile, b_list)) \
        .reduceByKey(lambda x, y: x + y) \
        .mapValues(lambda vs: {k: v for k, v in vs}) \
        .collect()

    agm_data = raw_data.map(lambda r: (r['user_id'], r['business_id'], r['stars'])) \
        .map(lambda x: (u_d[x[0]], b_d[x[1]], x[2])) \
        .map(lambda x: (x[0], [(x[1], x[2])])) \
        .reduceByKey(lambda x, y: x + y) \
        .mapValues(lambda vs: processValues(vs, jaccard_sim, n_b_avg)) \
        .flatMap(lambda x: [(x[0], b, star) for b, star in x[1]]) \
        .persist(StorageLevel.MEMORY_AND_DISK)

    # als
    agm_train = agm_data.map(lambda x: ((u_table[x[0]], b_table[x[1]]), x[2])).collect()
    support.writeDownRenameTable(agm_train, agm_train_file)

    lonely_user = agm_data.map(lambda x: (x[0], 1)) \
        .reduceByKey(lambda x, y: x + y) \
        .filter(lambda x: x[1] < LONELY_USER_THRESHOLD) \
        .map(lambda x: x[0]) \
        .collect()
    lonely_business = agm_data.map(lambda x: (x[1], 1)) \
        .reduceByKey(lambda x, y: x + y) \
        .filter(lambda x: x[1] < LONELY_BUSINESS_THRESHOLD) \
        .map(lambda x: x[0]) \
        .collect()

    stars_data = agm_data.filter(lambda x: x[0] not in lonely_user and x[1] not in lonely_business) \
        .map(lambda x: Rating(x[0], x[1], x[2])).persist(StorageLevel.MEMORY_AND_DISK)
    sc.setCheckpointDir(checkpoint_file)
    ALS.checkpointInterval = 2
    modelRDD = ALS.train(ratings=stars_data, rank=1, iterations=70, lambda_=0.01, nonnegative=True)
    saveAlsModel(modelRDD, u_table, b_table, als_model_file)
def modelbased(inputfile,valfile,outputfile):

    sc = SparkContext(appName="Task2.1")
    start = time.time()

    data = sc.textFile(inputfile)
    data_header = data.first()
    input_data_final = data.filter(lambda rec: rec != data_header).map(lambda string_record: (string_record.split(',')))
    userdata = input_data_final.map(lambda x: x[0]).collect()
    businessdata = input_data_final.map(lambda x: x[1]).collect()

    usermap = {}
    businessmap = {}

    reverseusermap = {}
    reversebusinessmap = {}

    for idx, user in enumerate(userdata):
        usermap[user] = idx
        reverseusermap[idx] = user
        # idx+=1

    for idx, business in enumerate(businessdata):
        businessmap[business] = idx
        reversebusinessmap[idx] = business
        # idx+=1

    ratings = input_data_final.map(lambda x: Rating(int(usermap[x[0]]), int(businessmap[x[1]]), float(x[2])))
    rank = 2
    numIterations = 5
    model = ALS.train(ratings, rank, numIterations)
    test1 = sc.textFile(valfile)
    test_data_header = test1.first()
    testRDD = test1.filter(lambda rec: rec != test_data_header).map(lambda string_record: (string_record.split(',')))
    test_user = testRDD.map(lambda x: x[0]).collect()
    test_business = testRDD.map(lambda x: x[1]).collect()

    for newIdx, user in enumerate(test_user):
        if user not in usermap:
            while newIdx in usermap.values():
                newIdx += 1
            usermap[user] = newIdx
            reverseusermap[newIdx] = user

    for newIdx, business in enumerate(test_business):
        if business not in businessmap:
            while newIdx in businessmap.values():
                newIdx += 1
            businessmap[business] = newIdx
            reversebusinessmap[newIdx] = business

    testingRDD = testRDD.map(lambda x: Rating(int(usermap[x[0]]), int(businessmap[x[1]]), float(x[2])))
    testing_data = testingRDD.map(lambda x: (x[0], x[1]))
    testprediction = model.predictAll(testing_data).map(lambda x: ((x[0], x[1]), x[2])).cache()
    predictions = testprediction.map(lambda x: (x[0][0], x[0][1], x[1])).collect()
    file = open(outputfile, "w")
    file.write('user_id, business_id, prediction\n')
    for pred in predictions:
        file.write(
            str(reverseusermap[pred[0]]) + "," + str(reversebusinessmap[pred[1]]) + "," + str(pred[2]) + "\n")

    file.close()
    end = time.time()
    print("Duration: ", end - start)
    ratesAndPreds = testingRDD.map(lambda r: ((r[0], r[1]), r[2])).join(testprediction)
    MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1]) ** 2).mean()
    print("Mean Squared Error = ", str(MSE ** 0.5))
def parseRating(line):
    # Parsing SteamID,AppID,Rating
    line = line.split(',')
    return Rating(int(line[0]), int(line[1]), float(line[2]))
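# Hedged usage sketch (the file path is hypothetical, and an active SparkContext sc
# is assumed): build the ratings RDD by mapping parseRating over a CSV of
# SteamID,AppID,Rating lines.
ratings = sc.textFile("steam_ratings.csv").map(parseRating)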
Example 23
def compute_model_cf(ftrain, ftest):
    test_data_1 = sc.textFile(ftest)
    data = test_data_1.map(lambda line: readtestdata(line))
    header_info = data.first()
    test_data = data.filter(lambda ratings: ratings != header_info).map(
        lambda rowData: (int(rowData[0]), int(rowData[1]))).persist()

    train_data_1 = sc.textFile(ftrain).map(lambda lines: readtraindata(lines))
    header_info_train = train_data_1.first()
    train_data = train_data_1.filter(
        lambda rating: rating != header_info_train).map(lambda rowData: (
            (int(rowData[0]), int(rowData[1])), float(rowData[2])))

    user_movies = train_data.map(lambda row: row[0])
    #print user_movies.collect()

    train_movies = user_movies.subtract(test_data).map(lambda r: (r, 0))
    #print train_movies.collect()

    training_movies = train_data.join(train_movies).map(lambda r:
                                                        (r[0], r[1][0]))

    #print training_movies.collect()

    test_movies = test_data.map(lambda r: (r, 0))

    test_movie_rates = train_data.join(test_movies)

    test_movie_ratings = train_data.join(test_movie_rates).map(lambda r:
                                                               (r[0], r[1][0]))

    ratings = training_movies.map(lambda r: Rating(r[0][0], r[0][1], r[1]))
    rank = 7
    iters = 10

    model = ALS.train(ratings, rank, iters)
    predictions = model.predictAll(test_data).map(lambda r:
                                                  ((r[0], r[1]), r[2]))

    outliers = predictions.filter(lambda r: r[1] < 0.0 or r[1] > 5.0)
    true_Predictions = predictions.subtract(outliers)

    predicted_movies_rdd = predictions.map(lambda r: r[0])
    missing_movies_rdd = test_data.subtract(predicted_movies_rdd)

    missing_Outliers = outliers.map(lambda r: r[0]).union(missing_movies_rdd)

    userRatings = predictions.map(lambda r: (r[0][0], r[1]))

    mean_Ratings = userRatings.aggregateByKey(
        (0.0, 0.0), lambda U, s: (U[0] + s, U[1] + 1), lambda U, V:
        (U[0] + V[0], U[1] + V[1])).mapValues(
            lambda res: 1.0 * res[0] / res[1])

    user_movie_ratings = missing_Outliers.join(mean_Ratings).map(
        lambda r: ((r[0], r[1][0]), r[1][1]))

    predictions_union = true_Predictions.union(user_movie_ratings)

    ratesAndPreds = test_movie_ratings.join(predictions_union).sortByKey()

    MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
    RMSE = math.sqrt(MSE)

    difference = ratesAndPreds.map(lambda r: abs(r[1][0] - r[1][1]))
    bet_0_1 = difference.filter(lambda r: r >= 0 and r < 1.0).count()
    bet_1_2 = difference.filter(lambda r: r >= 1.0 and r < 2.0).count()
    bet_2_3 = difference.filter(lambda r: r >= 2.0 and r < 3.0).count()
    bet_3_4 = difference.filter(lambda r: r >= 3.0 and r < 4.0).count()
    bet_4_5 = difference.filter(lambda r: r >= 4.0).count()
    #squared_error = ratesAndPreds.map(lambda r : math.pow((r[1][0] - r[1][1]),2)).mean()

    print()
    print(">=0 and <1: ", bet_0_1)
    print(">=1 and <2: ", bet_1_2)
    print(">=2 and <3: ", bet_2_3)
    print(">=3 and <4: ", bet_3_4)
    print(">=4: ", bet_4_5)
    print("RMSE = ", RMSE)

    headers = []
    headers = [["UserId", "MovieId", "Pred_rating"]]
    header = sc.parallelize(headers)

    resultRDD = ratesAndPreds.map(
        lambda r: (str(r[0][0]), str(r[0][1]), str(r[1][1]))).repartition(1)
    fileRDD = header.union(resultRDD).map(
        lambda r: r[0] + "," + r[1] + "," + r[2])
    fileRDD = fileRDD.repartition(1)
    fileRDD.saveAsTextFile("Prashanth_Manja_ModelBasedCF.txt")
    end_time = time.time()
    total_time = end_time - start_time
    print("Total_Execution_Time_Is-->", str(total_time))
Example 24
def tup_to_rating(tup):
    user, subreddit, num = tup
    user = javahash(user)
    subreddit = javahash(subreddit)
    num = float(num)
    return Rating(user, subreddit, num)
Example 25
    inputRDD = sc.textFile(inputFile)
    testRDD = sc.textFile(testFile)

    header = inputRDD.first()  #extract header
    inputRDD = inputRDD.filter(lambda row: row != header)

    header2 = testRDD.first()  #extract header
    testRDD = testRDD.filter(lambda row: row != header2)

    inputRDD = inputRDD.map(lambda line: line.split(',')).map(
        lambda x: ((int(x[0]), int(x[1])), float(x[2])))
    testRDD = testRDD.map(lambda line: line.split(',')).map(
        lambda x: ((int(x[0]), int(x[1])), 1))

    input1 = inputRDD.subtractByKey(testRDD)
    input = input1.map(lambda x: Rating(x[0][0], x[0][1], x[1]))
    sc.setCheckpointDir('/tmp')

    rank = 8
    numIterations = 10
    lmbda = 0.1
    numBlocks = 16
    nonnegative = True
    model = ALS.train(input,
                      rank,
                      numIterations,
                      lmbda,
                      nonnegative=True,
                      seed=42)

    testRDD = testRDD.map(lambda x: (x[0][0], x[0][1])).distinct()
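    # Hedged sketch (the original stops here): score the held-out (user, item) pairs
    # with the trained model.
    predictions = model.predictAll(testRDD).map(lambda r: ((r[0], r[1]), r[2]))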
Example 26
                user_map[user_set[i]] = user_id
                user_id += 1

        business_id = 0

        for i in range(0, len(business_set)):
            if business_set[i] not in business_map:
                business_map[business_set[i]] = business_id
                business_id += 1

        Expected_rating = test_rdd.map(lambda x: (
            (user_map[x[0][0]], business_map[x[0][1]]), float(x[1])))

        test_data = Expected_rating.map(lambda x: (x[0][0], x[0][1]))

        training_data = train_rdd.map(lambda x: Rating(
            user_map[x[0][0]], business_map[x[0][1]], float(x[1])))

        model = ALS.train(training_data, rank=2, iterations=20, lambda_=0.5)

        predictions = model.predictAll(test_data).map(lambda x:
                                                      ((x[0], x[1]), x[2]))

        prediction_list = predictions.collect()

        sum_predicted_ratings = predictions.values().sum()
        total_count_ratings = predictions.count()

        average_predicted_rating = float(sum_predicted_ratings) / float(
            total_count_ratings)

        test_data_list = test_data.collect()
Example 27
    numHits = 0.0
    for (i, p) in zip(range(k), predK):
        if actual.filter(lambda x: x == p).count() != 0:
            numHits += 1.0
            score += numHits / (i + 1.0)
    if actual.isEmpty():
        return 1
    else:
        return (score / min((actual.count(), k)))


# Convert the raw data to Rating objects
rawData = sc.textFile("../data/ml-100k/u.data")
rawRatings = rawData.map(lambda lines: lines.split('\t')[:3])
ratings = rawRatings.map(
    lambda fields: Rating(int(fields[0]), int(fields[1]), float(fields[2])))
implicit_ratings = ratings.map(
    lambda fields: Rating(fields[0], fields[1], int(to_implicit(fields[2]))))
"""
模型参数:
rank: ALS模型中的因子个数,即低阶近似矩阵中的隐含特征个数 10-200
iterations: 迭代次数 10左右
lambda: 控制模型的正则化过程,防止过拟合;需要通过交叉验证来进行标定
classmethod train(ratings, rank, iterations=5, lambda_=0.01, blocks=-1,
                                                  nonnegative=False, seed=None)
classmethod trainImplicit(ratings, rank, iterations=5, lambda_=0.01, blocks=-1,
                                      alpha=0.01, nonnegative=False, seed=None)
"""
model = ALS.train(ratings, 50, 10, 0.01)
model_implicit = ALS.trainImplicit(implicit_ratings, 50, 10)
"""
Example 28
# Apache Spark Config Settings
conf = (SparkConf().setAll([("spark.driver.memory", "5G"),
                            ("spark.driver.maxResultSize", "2G"),
                            ("spark.executor.memory", "3G")]))
sc = SparkContext(conf=conf, appName="CollaborativeFiltering")

# Loading normalized train and testdata from hadoop filesystem
train_norm_data = sc.textFile(
    'file:///users/jeevan4/challenge2/train_norm_data.txt')
test_norm_data = sc.textFile(
    'file:///users/jeevan4/challenge2/test_norm_data.txt')

# Converting the data into Ratings to supply to the ALS algorithm
train_ratings = train_norm_data.map(lambda x: x.split(",")).map(
    lambda l: Rating(int(l[0]), int(l[1]), float(l[2]))).persist(
        StorageLevel.MEMORY_ONLY)
test_ratings = test_norm_data.map(lambda x: x.split(",")).map(
    lambda l: Rating(int(l[0]), int(l[1]), float(l[2]))).persist(
        StorageLevel.MEMORY_ONLY)

# ALS algorithm parameters and building recommendation model using Alternating Least Squares
rank = 10
numIterations = 10
model = ALS.train(train_ratings, rank, numIterations, 0.01)

#Evaluate the model on test data
testdata = test_ratings.map(lambda p: (p[0], p[1]))
predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))

ratesAndPreds = test_ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
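# Hedged sketch (the original is cut off above): mean squared error over the joined
# (actual, predicted) pairs, matching the evaluation pattern of the other examples.
MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
print("Mean Squared Error = " + str(MSE))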
Example 29
            ratings[k] = 0
        ratings[k] += meta2[2][k]
    return (users,items,ratings)

# perform datapass to compute stats

rating_stats = df.rdd.map(lambda row: metaMap(row,user_index,item_index,rating_index)).reduce(lambda x,y:metaReduce(x,y))
print("Total number of distinct items: "+ str(len(rating_stats[1])))
print("Total number of distinct users: "+ str(len(rating_stats[0])))

print("Rating frequencies:")
ratings_range = sorted(rating_stats[2].keys())
for rating in ratings_range:
    print("\t"+str(rating)+": "+str(rating_stats[2][rating]))

ratings = df.rdd.map(lambda l: Rating(int(l[user_index]), int(l[item_index]), float(l[rating_index])))

# Build the recommendation model using Alternating Least Squares

model = ALS.train(ratings, rank, iterations, lmbda, blocks, seed=random_seed)

# Save and load model
model.save(sc, model_path)
saveDummy(sc,dummypath)
ascontext.setModelContentFromPath("dummy",dummypath)
Example 30
#c.execute("SELECT * FROM ratings WHERE user_id = %s",(user,))
c.execute("SELECT user_id, movie_id, rating FROM ratings ")
query = c.fetchall()

#### MYSQL #########################################################################
conf = SparkConf().setMaster("local[*]").setAppName("MovieRecommendationsALS")
sc = SparkContext(conf=conf)
sc.setCheckpointDir('checkpoint')

print("\nLoading movie names...")
nameDict = loadMovieNames()

data = sc.parallelize(query)
# data = sc.textFile("file:/Users/alejandroaparicio/Documents/SparkCourse/ml-latest-small/ratings.csv")

ratings = data.map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2]))).cache()

# Build the recommendation model using Alternating Least Squares
print("\nTraining recommendation model...")
#rank = 10
# Lowered numIterations to ensure it works on lower-end systems
#numIterations = 10
#model = ALS.train(ratings, rank, numIterations)
seed = 5
iterations = 10
regularization_parameter = 0.1
rank = 8

model = ALS.train(ratings,
                  rank,
                  seed=seed,