def collaborative(self, ratings, user_id):

        reader = Reader()
        #ratings.head()

        # work on a copy so the caller's dataframe is not mutated by the appends below
        temp_ratings = ratings.copy()



        data = Dataset.load_from_df(temp_ratings[['user_id', 'book_id', 'rating']], reader)

        ## Evaluating the algorithm ##
        # data.split()/evaluate() were removed in newer Surprise releases;
        # cross_validate() is the current replacement.
        from surprise.model_selection import cross_validate
        svd = SVD()
        cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=2)

        trainset = data.build_full_trainset()

        algo = SVD()
        algo.fit(trainset)

        #svd.train(trainset)
        ## Testing the data ##

        from collections import defaultdict
        testset = trainset.build_anti_testset()
        predictions = algo.test(testset)

        count = 0

        for uid, iid, true_r, est, _ in predictions:
            if uid == user_id:
                count = count + 1
                temp_ratings.loc[len(temp_ratings) + 1] = [uid, iid, est]

        #print("count\n")
        #print(count)
        #print("\n--------here-------\n")	
        #print(temp_ratings)

        cb = temp_ratings[(temp_ratings['user_id'] == user_id)][['book_id', 'rating']]
        #print("\n--------here-------\n")
        #print(cb)
        
        cb = temp_ratings[(temp_ratings['user_id'] == user_id)][['book_id', 'rating']]

        return(cb)
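# --- Sketch (not part of the original source): `self` is unused inside
# --- collaborative(), so for a quick test it can be called as a plain function.
# --- 'ratings.csv' is a hypothetical file with user_id, book_id and rating columns,
# --- and Reader/Dataset/SVD are assumed to be imported at module level as in the
# --- original project.
if __name__ == '__main__':
    import pandas as pd

    sample_ratings = pd.read_csv('ratings.csv')
    preds = collaborative(None, sample_ratings, user_id=1)
    print(preds.sort_values('rating', ascending=False).head(10))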
Example #2
from collections import defaultdict

import pandas as pd
from surprise import Dataset, Reader, SVD


def get_top_n(predictions, n=10):
    """Return the top-N recommendations for each user from a set of predictions."""
    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n


total_review_df = pd.read_csv("../data/total_review_df.csv")
total_df = pd.read_csv("../data/total_df.csv")
# Load the reviews dataframe into a Surprise Dataset
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(total_review_df[["user_name", "res_id", "rating"]],
                            reader)
trainset = data.build_full_trainset()
algo = SVD()
algo.fit(trainset)

# Then predict ratings for all pairs (u, i) that are NOT in the training set.
# testset = trainset.build_full_trainset()
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

top_n = get_top_n(predictions, n=10)

# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])
Example #3
# Requires NumPy and scikit-surprise installed
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

# Read data into an array of strings
with open('./ml-100k/u.data') as f:
    all_lines = f.readlines()

# Let's prepare data to be used in Surprise
reader = Reader(line_format='user item rating timestamp', sep='\t')
data = Dataset.load_from_file('./ml-100k/u.data', reader=reader)

# Choose the algorithm and evaluate it with 5-fold cross-validation
algo = SVD()  # our chosen algorithm

# We now train and test, reporting the RMSE and MAE scores
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5)

# Retrieve the full trainset and fit the model
trainset = data.build_full_trainset()
algo.fit(trainset)

# Predict a sample item
userid = str(196)
itemid = str(302)
actual_rating = 4

#Printing out our predictions
print(algo.predict(userid, itemid, actual_rating))
Example #4
    'Com menos rating:',
    df.groupby('book_id')['rating'].count().reset_index().sort_values(
        'rating', ascending=True)[:10])

from surprise.model_selection import cross_validate
from surprise import Reader, Dataset, NormalPredictor, KNNBasic, KNNWithMeans
from surprise import KNNWithZScore, KNNBaseline, SVD, BaselineOnly, SVDpp
from surprise import NMF, SlopeOne, CoClustering

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['user_id', 'book_id', 'rating']], reader)

benchmark = []
# Try every algorithm
for algoritimo in [
        SVD(),
        SVDpp(),
        SlopeOne(),
        NMF(),
        NormalPredictor(),
        KNNBaseline(),
        KNNBasic(),
        KNNWithMeans(),
        KNNWithZScore(),
        BaselineOnly(),
        CoClustering()
]:
    print('Inicio algoritimo', algoritimo)
    # Cross validation
    resultados = cross_validate(algoritimo,
                                data,
Example #5
# initialize a random seed so that the train/test split is the same across runs
seed = np.random.RandomState(42)

# split the data into 80% training, 20% validation
# Note: we pass the seed as random_state to ensure that the 'randomness' of the split
# is consistent every time the file is run
trainset, testset = train_test_split(data, test_size=.2, random_state=seed)

# Initialize the algorithm that we are going to use to train on the dataset
# Here we use standard SVD algorithm (matrix factorization with user and item biases)
# n_factors specifies the number of factors to be used, n_epochs specifies the number of iterations
# of stochastic gradient descent, and verbose=True gives us progress on the epochs
# Check Surprise documentation on SVD for full list of specifiable parameters
print("Training model...")
algo = SVD(n_factors=50, n_epochs=10, verbose=True)

# This call to fit() on the trainset actually performs the training of the model
algo.fit(trainset)

# The call to test() on the testset makes predictions on ratings of user-items in the testset
# according to the trained model above
predictions = algo.test(testset)

# This line gives us the accuracy in terms of RMSE of the predictions made above
accuracy.rmse(predictions)

# Test again with different params
# Note when we don't specify the number of epochs, the default is 20
print("Training model...")
algo = SVD(n_factors=50, verbose=True)
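# --- Sketch (not in the original snippet): presumably the second configuration is
# --- evaluated the same way as the first, by repeating the fit/test/RMSE steps above.
algo.fit(trainset)
predictions = algo.test(testset)
accuracy.rmse(predictions)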
Example #6
def get_top_n(predictions, n=10):

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n


# retrain on the whole set A
algo = SVD()
trainset = data.build_full_trainset()
algo.fit(trainset)

# Compute biased accuracy on A
testset = data.construct_testset(A_raw_ratings)  # testset is now the set A
predictions = algo.test(testset)
print('Biased accuracy on A,', end='   ')
accuracy.rmse(predictions)

# Predict ratings for all (user, item) pairs that are NOT in the training set;
# build_anti_testset() fills the unknown 'true' ratings with the mean of all
# training ratings.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)
Example #7
product_idx = dict(zip(products, np.arange(0, len(products))))

# Now transform it to the format expected by the Python
# recommendation package 'surprise'. The current data
# has columns 'product', 'ip', 'date_logged' and 'url'.
# Surprise requires columns corresponding to user id,
# item id and rating in that order.

grouped_series = local_df.groupby(['ip', 'product']).size()
ratings_dict = {
    "userId": [idx[0] for idx in grouped_series.index],
    "itemId": [idx[1] for idx in grouped_series.index],
    "rating": list(grouped_series)
}
surprise_df = pandas.DataFrame(ratings_dict,
                               columns=['userId', 'itemId', 'rating'])

# Load the dataframe into a surprise Dataset object
reader = Reader(rating_scale=(1, surprise_df['rating'].max()))
data = Dataset.load_from_df(surprise_df, reader)

# We'll use the famous SVD algorithm to train the
# recommender.
algo = SVD()
trainset = data.build_full_trainset()
algo.fit(trainset)

# Save the trained recommender to disk so we can deploy
# the predictor as a model.
joblib.dump(algo, 'recommender.pkl')
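# --- Sketch (not from the original source): the saved recommender can later be
# --- reloaded with joblib and queried; the user/item ids below are hypothetical.
loaded_algo = joblib.load('recommender.pkl')
print(loaded_algo.predict('203.0.113.7', 'some-product').est)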
Example #8
ordering = 'mu'  # rows correspond to movie_ids; cols correspond to user_ids
submit = True  # set to True to save a submission on qual
save_model = False  # set to True to save model parameters for future predictions

print('Loading data...')
df = pd.read_csv(os.path.join('data', 'mu_train.csv'))
# modify dataframe to reduce memory
del df['Unnamed: 0']
del df['Date Number']
df = df.astype('int32')

df_val = pd.read_csv(os.path.join('data', 'mu_val.csv'))

print('Solving SVD...')
reader = Reader(rating_scale=(1, 5))
model = SVD(n_epochs=20, verbose=True)

train_raw = Dataset.load_from_df(df[['User Number', 'Movie Number', 'Rating']],
                                 reader)
train = train_raw.build_full_trainset()

model.fit(train)
gc.collect()
'''
train_pred = model.test(train.build_testset())
val_raw = Dataset.load_from_df(df_val[['User Number', 'Movie Number', 'Rating']], reader)
val = val_raw.build_full_trainset()
val_pred = model.test(val.build_testset())

print('Train RMSE:', accuracy.rmse(train_pred))
print('Val RMSE:', accuracy.rmse(val_pred))
Example #9
data.to_csv("abc.txt",
            index=None,
            header=None,
            columns=["users", "items", "rates"])

reader = Reader(line_format='user item rating', rating_scale=(0, 10), sep=',')

data = Dataset.load_from_file("abc.txt", reader=reader)

data.split(n_folds=10)
# sim_options = {'name': 'cosine',
#                'user_based': False  # compute  similarities between items
#                }
# algo = KNNBasic(sim_options=sim_options)
# We'll use the famous SVD algorithm.
algo = SVD(verbose=True)
for _ in range(10):
    perf = evaluate(algo, data, measures=['RMSE', 'MAE'])
    print_perf(perf)

dump_obj = {'predictions': perf, 'algo': algo}
pickle.dump(dump_obj, open(result_path, 'wb'))
exit()

start_time = time.time()
for trainset, testset in data.folds():
    # train and test algorithm.
    algo.train(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
Example #10
class surpriseRecommender():
    def __init__(self, stories, reviews, users):

        #self.stories = stories
        #self.reviews = reviews
        #self.users = users

        self.storyLinkToIdDict = {}
        self.IdToStoryDict = {}

        #create a dict between stories and their ids
        self.lastStoryId = 0
        for story in stories:
            self.storyLinkToIdDict[story['storyLink']] = self.lastStoryId
            self.IdToStoryDict[self.lastStoryId] = story
            self.lastStoryId += 1

        self.userLinkToIdDict = {}
        self.IdToUserDict = {}

        self.lastUserId = 0
        for user in users:
            self.userLinkToIdDict[user['name']] = self.lastUserId
            self.IdToUserDict[self.lastUserId] = user
            self.lastUserId += 1

        for review in reviews:
            if (review['r'] not in self.userLinkToIdDict):
                self.userLinkToIdDict[review['r']] = self.lastUserId
                self.IdToUserDict[self.lastUserId] = review['r']
                self.lastUserId += 1

        self.reviewLinkToIdDict = {}
        self.IdToReviewDict = {}
        self.lastReviewId = 0
        for review in reviews:
            self.reviewLinkToIdDict[review['rO'] + '|' +
                                    review['r']] = self.lastReviewId
            self.IdToReviewDict[self.lastReviewId] = review
            self.lastReviewId += 1

        # ## make scores dict

        storyReviewDic = Counter({})
        storyScores = {}

        cnt = 0
        self.minScore = 0
        self.maxScore = 0
        for review in reviews:
            if (review['rO'] in self.storyLinkToIdDict):
                userId = self.userLinkToIdDict[review['r']]
                storyId = self.storyLinkToIdDict[review['rO']]
                score = review['sS']
                self.minScore = min(score, self.minScore)
                self.maxScore = max(score, self.maxScore)
                storyScores[(userId, storyId)] = {
                    "storyId": storyId,
                    "userId": userId,
                    "score": score
                }
                cnt += 1
        print(self.minScore, self.maxScore)
        # ### add in favorites data
        # bias favorites over reviews
        for user in users:
            userId = self.userLinkToIdDict[user['name']]

            for favorite in user['favorites']:
                if (favorite['S'] in self.storyLinkToIdDict):

                    storyId = self.storyLinkToIdDict[favorite['S']]
                    score = 10
                    if ((userId, storyId) not in storyScores):
                        storyScores[(userId, storyId)] = {
                            "storyId": storyId,
                            "userId": userId,
                            "score": 0
                        }
                    storyScores[(userId, storyId)]['score'] += score

        self.inputScores = []
        for score, body in storyScores.items():
            self.inputScores.append(body)

    def train(self):
        df = pd.DataFrame(self.inputScores)
        reader = Reader(rating_scale=(self.minScore, self.maxScore))
        data = Dataset.load_from_df(df[['userId', 'storyId', 'score']], reader)
        trainset = data.build_full_trainset()
        self.algo = SVD()
        self.algo.fit(trainset)

    def getTopPredictions(self, userId, stories):
        df = pd.DataFrame(self.inputScores)
        df_filtered = df.query('userId==' + str(userId))
        #print(df_filtered)
        test_items = []
        for story in stories:
            storyId = self.storyLinkToIdDict[story['storyLink']]
            test_items.append({
                "storyId": storyId,
                "userId": userId,
                "score": 0
            })
        df = pd.DataFrame(test_items)
        #remove values the user already knows
        mask = np.logical_not(df['storyId'].isin(set(df_filtered['storyId'])))
        df = df[mask]

        reader = Reader(rating_scale=(self.minScore, self.maxScore))
        data = Dataset.load_from_df(df[['userId', 'storyId', 'score']], reader)
        trainset = data.build_full_trainset()
        testset = trainset.build_testset()
        predictions = self.algo.test(testset)

        scores = {}
        for uid, iid, true_r, est, _ in predictions:
            scores[self.IdToStoryDict[iid]['storyLink']] = est

        return scores

    def predict(self, link, stories):
        link = link.replace('https://www.fanfiction.net', '')
        #print(link, link in self.userLinkToIdDict)
        #if we haven't seen this user before
        if (link not in self.userLinkToIdDict):
            '''self.userLinkToIdDict[user['name']] = self.lastUserId
            self.IdToUserDict[self.lastUserId] = user
            self.lastUserId += 1
            self.train()'''
            print('user not found')
            return {}
        return self.getTopPredictions(self.userLinkToIdDict[link], stories)
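# --- Sketch (not part of the original class): a tiny, self-contained smoke test with
# --- made-up stories/reviews/users that match the dict shapes the class expects.
# --- It assumes pandas, numpy, Counter and the surprise imports used by the class
# --- are available at module level, as in the original project.
if __name__ == '__main__':
    demo_stories = [{'storyLink': '/s/1/1/Alpha'}, {'storyLink': '/s/2/1/Beta'}]
    demo_users = [{'name': '/u/10/Reader', 'favorites': [{'S': '/s/1/1/Alpha'}]},
                  {'name': '/u/11/Critic', 'favorites': []}]
    demo_reviews = [{'rO': '/s/2/1/Beta', 'r': '/u/11/Critic', 'sS': 7}]

    rec = surpriseRecommender(demo_stories, demo_reviews, demo_users)
    rec.train()
    print(rec.predict('https://www.fanfiction.net/u/10/Reader', demo_stories))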
class Main():
    def __init__(self):
        # all items
        self.items = []
        # all users
        self.users = []
        # rating data
        self.ratings = []
        # test dataset
        self.test = []
        # per-user average ratings
        self.rating_aves = []
        # map from item id to index into self.items
        self.item_dic = {}
        # map from user id to index into self.users
        self.user_dic = {}

    def getData(self):

        # read the users from the training file
        with open(TRAIN_PATH, 'r') as f:
            user_no = 0
            item_no = 0
            while True:
                line = f.readline()

                if not line or line == '\n':
                    break
                id, item_num = line.split('|')
                item_num = int(item_num[:-1])
                user = User(id, item_num)
                for i in range(item_num):
                    line = f.readline()
                    item_id, score = line.split("  ")[:2]
                    score = int(score)
                    if score == 0:
                        score = 1
                    user.setItems([item_id, score])
                    self.ratings.append([id, item_id, score / 20])
                    if item_id not in self.item_dic:
                        self.item_dic[item_id] = item_no
                        item_no += 1
                        self.items.append(Item(item_id))
                self.user_dic[id] = user_no
                user_no += 1

                # print(id)
                self.users.append(user)
        self.user_num = len(self.users)
        self.item_num = len(self.items)
        self.rating_matrix = sparse.dok_matrix((self.user_num, self.item_num))
        # print(self.item_dic['507696'])
        for i in range(self.user_num):
            for j in range(self.users[i].item_num):
                self.rating_matrix[
                    self.user_dic[self.users[i].id],
                    self.item_dic[self.users[i].
                                  items[j][0]]] = self.users[i].items[j][1]

        for i in range(self.user_num):
            self.rating_aves.append(self.users[i].getAverage())

        # read the test data
        with open(TEST_PATH, 'r') as f:
            while True:
                line = f.readline()
                if not line or line == '\n':
                    break

                id, item_num = line.split('|')
                item_num = int(item_num[:-1])
                user = User(id, item_num)
                for i in range(item_num):
                    line = f.readline()
                    item_id = line[:-1]
                    user.setItems([item_id])
                self.test.append(user)

        self.test_num = len(self.test)
        # for i in self.test:
        #     print(i.id,i.items)
        print('finish getData')

    def mySVD(self):
        self.reader = Reader(rating_scale=(1, 5))
        self.data = Dataset.load_from_df(pd.DataFrame(self.ratings),
                                         self.reader)
        print(self.data)
        trainset, testset = train_test_split(self.data, test_size=.15)
        self.model = SVD(n_factors=SVD_PARAMETER)
        self.model.fit(trainset)
        a_user = "******"
        a_product = "507696"
        print(self.model.predict(a_user, a_product))

    def predict(self):
        for i in range(self.test_num):
            with open(RESULT_PATH, 'a') as f:
                f.write(self.test[i].id)
                f.write('\n')
                for j in range(len(self.test[i].items)):
                    self.test[i].items[j].append(
                        self.model.predict(self.test[i].id,
                                           self.test[i].items[j][0])[3] * 20)
                    f.write(self.test[i].items[j][0])
                    f.write(':')
                    f.write(str(self.test[i].items[j][1]))
                    f.write('\n')

    def mainMethod(self):
        self.getData()
        # time.clock() was removed in Python 3.8; use perf_counter() instead
        start = time.perf_counter()
        self.mySVD()
        self.predict()
        elapsed = time.perf_counter() - start
        print(elapsed)
Example #12
ratings_mean_count['rating'].hist(bins=50)
plt.show()
# pivot ratings into movie features
df_movie_features = ratings.pivot(index='movies',
                                  columns='users',
                                  values='rating').fillna(0)
mat_movie_features = csr_matrix(df_movie_features.values)
print(df_movie_features)
# A reader is still needed but only the rating_scale param is required.
# The Reader class is used to parse a file containing ratings.
reader = Reader(rating_scale=(0, 5.0))
# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(df[['users', 'movies', 'rating']], reader)

# Choose the algorithm
# (the old data.split(n_folds=5) API is superseded by cross_validate below)
algo = SVD()
# Train and test reporting the RMSE and MAE scores
# Run 5-fold cross-validation and print results
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
# Retrieve the trainset.
trainset = data.build_full_trainset()
algo.fit(trainset)

# Predict a certain item
users = str(414)
movies = str(410)
actual_rating = 5
print(algo.predict(users, movies, actual_rating))
df = pandas.DataFrame(trainset)
reader = Reader(line_format='user item rating', sep=',', skip_lines=1)
data = Dataset.load_from_df(df, reader)
hyper = {'n_factors': [5, 6, 7], 'reg_all': [0.1, 1, 10]}
clf = GridSearchCV(SVD, hyper, cv=5, measures=['mae', 'rmse'])  # rmse
clf.fit(data)
print(clf.best_params)
print(clf.best_score['mae'])


# In[2]:


data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()
algo = SVD(n_factors=7, reg_all=0.1)
algo.fit(trainset)
testset = trainset.build_testset()

predictions = algo.test(testset)
print('task 1')
mae = accuracy.mae(predictions)
print('accuracy: ',mae) #task 1


# In[5]:


def get_top_n(predictions, n=5):
    '''Return the top-N recommendation for each user from a set of predictions.
Example #14
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

#print(ratings.head())

movies_with_ratings = movies.join(ratings.set_index('movieId'), on='movieId')
movies_with_ratings.dropna(inplace=True)

dataset = pd.DataFrame({
    'uid': movies_with_ratings.userId,
    'iid': movies_with_ratings.title,
    'rating': movies_with_ratings.rating
})

#print(dataset.head(30))
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)
#print(data.df.head(30))

#print(list(zip(dataset.rating.head(100), data.df.rating.head(100))))

trainset, testset = train_test_split(data, test_size=.15, random_state=42)

algo = SVD(n_factors=20, n_epochs=20)
algo.fit(trainset)
test_pred = algo.test(testset)

print('rmse = ', accuracy.rmse(test_pred, verbose=True))
print('prediction = ', algo.predict(uid=5.0, iid='MortalKombat(1995)'))
then reloaded and can be used again for making predictions.
"""

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)
import os

from surprise import SVD
from surprise import Dataset
from surprise import dump


data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()

algo = SVD()
algo.fit(trainset)

# Compute predictions of the 'original' algorithm.
predictions = algo.test(trainset.build_testset())

# Dump algorithm and reload it.
file_name = os.path.expanduser('~/dump_file')
dump.dump(file_name, algo=algo)
_, loaded_algo = dump.load(file_name)

# We now ensure that the algo is still the same by checking the predictions.
predictions_loaded_algo = loaded_algo.test(trainset.build_testset())
assert predictions == predictions_loaded_algo
print('Predictions are the same')
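# --- Sketch (not part of the original example): dump() can also persist the
# --- predictions alongside the algorithm, and load() returns both.
dump.dump(file_name, predictions=predictions, algo=algo)
saved_predictions, saved_algo = dump.load(file_name)
assert saved_predictions == predictions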
Example #16
def iniciarfiesta(fiesta_id, nombre_fiesta):

    conn = sqlite3.connect('spotify.db')

    sql = '''SELECT * FROM CancionUsuario'''

    fecha = str(date.today())
    uri_usuarios = []

    c = conn.cursor()

    sqlselectusers = ''' SELECT uri_usuario FROM FiestaUsuario where fiesta_id = ? '''
    cur = conn.cursor()
    cur.execute(sqlselectusers, (str(fiesta_id), ))
    conn.commit()

    select_invitados = cur.fetchall()

    for uri_usuario in select_invitados:
        uri_usuarios.append(uri_usuario[0])

    conn.close()

    invitado_para_sacar_info = uri_usuarios[0][13:]

    scope = 'user-library-read,user-top-read,playlist-modify-public'

    token_info = util.prompt_for_user_token(username=invitado_para_sacar_info,
                                            scope=scope)

    playlistid = crearplaylist(token_info, invitado_para_sacar_info,
                               nombre_fiesta)
    playlisturi = 'spotify:playlist:{}'.format(playlistid)

    df = fn_database(sql)

    df = df.loc[df['uri_usuario'].isin(uri_usuarios)]
    df_mas_recientes = df.groupby(['uri_usuario'],
                                  as_index=False)['date'].max()

    df = df.merge(df_mas_recientes, on='uri_usuario', how='left')

    df = df.loc[df['date_x'] == df['date_y']]

    print(token_info)

    uri_canciones = []
    for index, row in df.iterrows():
        print(row['uri_cancion'])
        uri_canciones.append(row['uri_cancion'])

    ObtenerCaracteristicas(uri_canciones, token_info)

    sqlcaracteristicas = '''SELECT uri AS uri_cancion, danceability*energy*5 AS fiesticidad, duration_ms
        FROM CancionCaracteristicas'''

    df_caracteristicas = fn_database(sqlcaracteristicas)
    print(df_caracteristicas)

    df['rating'] = df.apply(lambda row: elrating(row), axis=1)

    dfmodelo = df.groupby(['uri_cancion', 'uri_usuario'],
                          as_index=False)['rating'].max()

    dfsimple = dfmodelo.groupby(['uri_cancion'],
                                as_index=False)['rating'].sum()

    dfrecommender = dfmodelo[['uri_usuario', 'uri_cancion', 'rating']].copy()

    reader = Reader(rating_scale=(0, 5))

    data = Dataset.load_from_df(
        dfrecommender[['uri_usuario', 'uri_cancion', 'rating']], reader)

    algo = SVD()

    cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

    param_grid = {
        'n_factors': [5, 10, 15, 20],
        'n_epochs': [5, 10, 20, 25, 30],
        'lr_all': [0.001, 0.005, 0.01],
        'reg_all': [0.02, 0.1, 0.2, 0.3]
    }
    #param_grid = {'n_factors':[1],'n_epochs':[5,10,20,25,30],  'lr_all':[0.001,0.005,0.007,0.01],'reg_all':[0.02,0.1,0.2,0.3]}
    gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
    gs.fit(data)
    params = gs.best_params['rmse']
    print(params)
    svdtuned = SVD(n_factors=params['n_factors'],
                   n_epochs=params['n_epochs'],
                   lr_all=params['lr_all'],
                   reg_all=params['reg_all'])
    #svdtuned = SVD(n_factors=20, n_epochs=params['n_epochs'],lr_all=params['lr_all'], reg_all=params['reg_all'])
    trainingSet = data.build_full_trainset()

    algo = svdtuned
    print(algo.n_epochs)

    #sp = spotipy.Spotify(auth=token_info['access_token'])
    sp = spotipy.Spotify(auth=token_info)

    #descripcion_anterior = sp.user_playlist(user=token[1],playlist_id=playlistid,fields='description')
    descripcion_anterior = sp.user_playlist(user=invitado_para_sacar_info,
                                            playlist_id=playlistid,
                                            fields='description')

    descripcion = descripcion_anterior['description'] + ' ' + str(params)

    sp.user_playlist_change_details(user=invitado_para_sacar_info,
                                    playlist_id=playlistid,
                                    description=descripcion)

    algo.fit(trainingSet)

    prediction = algo.predict('spotify:user:jorged_94',
                              'spotify:track:0aZ5EsW90SpCbsYfMQ7HRf',
                              r_ui=0.995,
                              verbose=True)

    folder_path = r'Matrix\{}'.format(nombre_fiesta)
    #os.makedirs(os.path.dirname(filename_inputSI), exist_ok=True)
    os.makedirs(folder_path, exist_ok=True)

    rm = np.dot(algo.pu, algo.qi.T)

    np.savetxt(r'{}\algo.pu.csv'.format(folder_path), algo.pu, delimiter=',')
    np.savetxt(r'{}\algo.qi.csv'.format(folder_path), algo.qi, delimiter=',')
    np.savetxt(r'{}\rm.csv'.format(folder_path), rm, delimiter=',')

    #pu: User factors
    #qi: Item factors
    #bu: User bias
    #bi: Item bias

    group_pu = algo.pu.mean(axis=0)
    latent_factors = np.dot(group_pu, algo.qi.T)

    np.savetxt('{}\latent_factors.csv'.format(folder_path),
               latent_factors,
               delimiter=',')

    #print(algo.bu)
    #print(algo.bu.mean())

    numero_de_canciones = algo.qi.shape[0]

    recomendacion_grupal = []

    for i_iid in range(numero_de_canciones):
        group_estimacion = latent_factors[i_iid] + algo.bi[
            i_iid] + dfrecommender['rating'].mean()
        cancion = trainingSet.to_raw_iid(i_iid)
        #print (cancion,group_estimacion)
        recomendacion_grupal.append([cancion, group_estimacion])

    def Sort(sub_li):

        # sort in descending order, using the second element of each
        # sublist (the estimated score) as the key
        sub_li.sort(key=lambda x: x[1], reverse=True)
        return sub_li

    # Driver Code
    recomendacion_grupal_ordenada = Sort(recomendacion_grupal)

    df_final = pd.DataFrame(recomendacion_grupal_ordenada,
                            columns=['uri_cancion', 'estimacion'])
    print(df_final)

    #df_final.join(other.set_index('key'), on='key')

    #df_final.merge(df_caracteristicas, left_on='uri',right_on='uri_cancion', how='left')
    df_final = df_final.merge(df_caracteristicas, on='uri_cancion', how='left')
    print(df_final)

    #df_final['puntaje_final']=(50*df_final['estimacion']+df_final['fiesticidad'])/3

    fiesticidad_threshold = 2

    df_final.loc[df_final['fiesticidad'] <= fiesticidad_threshold,
                 'puntaje_final'] = 0
    df_final.loc[df_final['fiesticidad'] > fiesticidad_threshold,
                 'puntaje_final'] = df_final['estimacion']

    #df_final['puntaje_final'] = df_final['set_of_numbers'].apply(lambda x: 'True' if x <= 4 else 'False')

    print(df_final)

    #df.loc[['viper', 'sidewinder'], ['shield']] = 50

    #df_final.loc[df_final['duration_ms'] > 420000, 'puntaje_final'] = 0

    df_final.loc[df_final.duration_ms > 420000, 'puntaje_final'] = 0

    df_final = df_final.sort_values(by='puntaje_final', ascending=False)
    print(df_final)

    df_final.to_csv(r'{}\estimacion_final.csv'.format(folder_path),
                    index=False,
                    header=True)

    dfsimple = dfsimple.sort_values(by='rating', ascending=False)

    dfsimple.to_csv(r'{}\estimacion_simple.csv'.format(folder_path),
                    index=False,
                    header=True)

    aleatorio = random.choice([1, 2])
    #aleatorio = 1
    if (aleatorio == 1):
        cancionesasonar = df_final
    elif (aleatorio == 2):
        cancionesasonar = dfsimple

    canciones = []

    for index, row in cancionesasonar.iterrows():
        #print(row['uri_cancion'])
        canciones.append(row['uri_cancion'])

    print(canciones)

    #for a,b in recomendacion_grupal_ordenada:

    #canciones.append(a)
    canciones_seccionado = split_list(canciones, 100)

    #canciones = canciones[:100]
    #print (canciones)
    sp = spotipy.Spotify(auth=token_info)
    for seccion in canciones_seccionado:
        snapshot = sp.user_playlist_add_tracks(user=invitado_para_sacar_info,
                                               playlist_id=playlistid,
                                               tracks=seccion)

    #sp.start_playback(devi)
    sp.shuffle(state=False)

    headers = {
        'Accept': 'application/json',
        'Content-Type': 'application/json',
        'Authorization': 'Bearer {}'.format(token_info),
    }

    data = '{{"context_uri":"{}","offset":{{"position":{}}},"position_ms":0}}'.format(
        playlisturi, 0)

    response = requests.put('https://api.spotify.com/v1/me/player/play',
                            headers=headers,
                            data=data)

    if response:
        print('Se reprodujo la playlist')
    else:
        print(response, response.text)

    conn = sqlite3.connect('spotify.db')
    while True:  # making a loop
        try:  # used try so that if user pressed other than the given key error will not be shown
            if keyboard.is_pressed('n'):  # if the 'n' key is pressed
                print('Next song!')

                headers = {
                    'Accept': 'application/json',
                    'Content-Type': 'application/json',
                    'Authorization': 'Bearer {}'.format(token_info),
                }

                responsecancion = requests.get(
                    'https://api.spotify.com/v1/me/player/currently-playing',
                    headers=headers)

                #print(responsecancion.json()['progress_ms'],responsecancion.json()['item']['duration_ms'])

                sql = ''' INSERT INTO Salto(uri,porcentaje)
                            VALUES(?,?) '''
                cur = conn.cursor()

                porce = (responsecancion.json()['progress_ms'] /
                         responsecancion.json()['item']['duration_ms'])

                t = (responsecancion.json()['item']['uri'], porce)
                cur.execute(sql, t)

                conn.commit()
                #conn.close()

                response = requests.post(
                    'https://api.spotify.com/v1/me/player/next',
                    headers=headers)

                #break  # finishing the loop
        except Exception as e:
            print(e)
            break  # if user pressed a key other than the given key the loop will break

    print('Bien hecho campeón')
def trainModel(data):
    trainset = data.build_full_trainset()
    model = SVD(n_epochs=20, n_factors=50, verbose=True)
    model.fit(trainset)
    return model
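# --- Sketch (not part of the original snippet): a hypothetical call to trainModel(),
# --- assuming a CSV with user, item and rating columns and that SVD is imported
# --- at module level as in the original project.
if __name__ == '__main__':
    import pandas as pd
    from surprise import Dataset, Reader

    ratings_df = pd.read_csv('ratings.csv')  # hypothetical file
    rating_data = Dataset.load_from_df(ratings_df[['user', 'item', 'rating']],
                                       Reader(rating_scale=(1, 5)))
    model = trainModel(rating_data)
    print(model.predict('some_user', 'some_item').est)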
Example #18
# Unzip ml-100k.zip
zipfile = zipfile.ZipFile('ml-100k.zip', 'r')
zipfile.extractall()
zipfile.close()

# Read data into an array of strings
with open('./ml-100k/u.data') as f:
    all_lines = f.readlines()

# Prepare the data to be used in Surprise
reader = Reader(line_format='user item rating timestamp', sep='\t')
data = Dataset.load_from_file('./ml-100k/u.data', reader=reader)

# Choose the algorithm and evaluate it with 5-fold cross-validation
# (data.split()/evaluate() were removed in newer Surprise releases)
from surprise.model_selection import cross_validate

algo = SVD()

# Train and test reporting the RMSE and MAE scores
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5)

# Retrieve the trainset.
trainset = data.build_full_trainset()
algo.fit(trainset)

# Predict a certain item
userid = str(196)
itemid = str(302)
actual_rating = 4
print(algo.predict(userid, itemid, actual_rating))
Example #19
print("searching for the best parameters for svd...")

param_grid = {
    'n_epochs': [14, 14],
    'lr_all': [0.005, 0.005],
    'n_factors': [10, 5]
}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

gs.fit(evaluationData)

print("Best RMSE score attained: ", gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print("Best parameters: ", gs.best_params['rmse'])

evaluator = Evaluator(evaluationData, rankings)

params = gs.best_params['rmse']
SVDtuned = SVD(n_epochs=params['n_epochs'],
               lr_all=params['lr_all'],
               n_factors=params['n_factors'])
evaluator.AddAlgorithm(SVDtuned, "SVD - Tuned")

SVDUntuned = SVD()
evaluator.AddAlgorithm(SVDUntuned, "SVD - Untuned")

evaluator.Evaluate(True)
evaluator.SampleTopNRecs(ml)
    f.close()
for train in train_file:
    line = train.strip()
    if line.find('|') != -1:
        user_id, user_item_count = line.split('|')
    else:
        if line == "":
            continue
        item_id, rate_str = line.split()
        write_file.write('%s,%s,%s\n' % (user_id, item_id, rate_str))
write_file.close()
print("reading......")
reader = Reader(line_format='user item rating', sep=',', rating_scale=(0, 100))
data = Dataset.load_from_file("train.csv", reader=reader)

algo = SVD(n_factors=10, n_epochs=10, lr_all=0.015, reg_all=0.01)
'''
bsl_options = {'method': 'als',
               'n_epochs': 5,
               'reg_u': 12,
               'reg_i': 5
               }
'''
#algo = BaselineOnly(bsl_options=bsl_options)
'''
kf = KFold(n_splits=3) 
print('------begin train user cf model------------')
for trainset, testset in kf.split(train_cf):
    # train and test the algorithm
    print("aaa")
    algo.fit(trainset)
Example #21
def Cal_Svd(filepath, user_id):
    # 1. raw dataset
    rating = pd.read_csv(filepath)
    rating['userId'].value_counts()
    rating['placeId'].value_counts()
    
    # visited vs. not visited
    tab = pd.crosstab(rating['userId'], rating['placeId'])
    #print(tab)

    # rating
    # group the remaining ratings by the two grouping variables (userId, placeId)
    rating_g = rating.groupby(['userId', 'placeId'])
    rating_g.sum()
    tab = rating_g.sum().unstack()  # reshape into a user x place matrix
    #print(tab)
    #print(tab.info())
    # places user 2 has not visited: 1, 15, 39, ...

    # 2. build the rating dataset
    reader = Reader(rating_scale=(1, 5))  # rating range
    data = Dataset.load_from_df(df=rating, reader=reader)
    # the rating dataframe is read with the reader's 1-5 rating scale
    #print(data)

    # 3. train/test set
    train = data.build_full_trainset()  # training set
    test = train.build_testset()  # test set
    
    # 4. build the model
    #help(SVD)
    model = SVD(n_factors=100, n_epochs=20, random_state=123)
    model.fit(train)  # fit the model


    # 5. user_id input
    #user_id = 1  # the user to recommend for
    item_ids = range(0, 2106)  # placeId range
    actual_rating = 0  # rating
    
    predict_result = []
    
    for item_id in item_ids :
        if not actual_rating in tab:
            actual_rating = 0
            predict_result.append(model.predict(user_id, item_id, actual_rating))
    ddff = pd.DataFrame(predict_result)
    #print(ddff)
   
    # top 5 recommended places for the user
    result = ddff.sort_values(by='est', ascending=False)[:5]
    #print(result)
    results.append(result)
    
    return result
# 
# if __name__ == '__main__':
#     Cal_Svd(filepath, user_id)
#     print(results[0])
    #print(type(results[0]))     #dataframe     
    #print(results[0]['iid'])        # placeId
Example #22
import pandas as pd

path = '../Datasets/BookCrossings'
os.chdir(path)
trans = pd.read_csv('BX-Book-Ratings.csv',
                    sep=';',
                    on_bad_lines='skip',  # error_bad_lines was removed in pandas 2.0
                    encoding="latin-1")
trans.columns = ['user', 'item', 'rating']
trans = trans[trans.rating != 0]

min_item_ratings = 10
popular_items = trans['item'].value_counts() >= min_item_ratings
popular_items = popular_items[popular_items].index.tolist()

min_user_ratings = 10
active_users = trans['user'].value_counts() >= min_user_ratings
active_users = active_users[active_users].index.tolist()

trans = trans[(trans['item'].isin(popular_items))
              & (trans['user'].isin(active_users))]
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(trans, reader)
trainset, testset = train_test_split(data, test_size=0.002)

algo = SVD(n_factors=50)

algo.fit(trainset)
preds = algo.test(testset)
accuracy.mae(preds)
Example #23
                        stop_words='english')
count_matrix = count.fit_transform(books['soup'])

cosine_sim = cosine_similarity(count_matrix, count_matrix)

indices = pd.Series(books.index, index=books['title'])
titles = books['title']

books.to_csv('ob.csv', columns=books.columns.tolist())
ratings.to_csv('or.csv', columns=ratings.columns.tolist())
book_tags.to_csv('obt.csv', columns=book_tags.columns.tolist())
tags.to_csv('ot.csv', columns=tags.columns.tolist())

print("beforeNY")

svd = SVD()

print("NY")
reader = Reader()
data = Dataset.load_from_df(new_ratings[['user_id', 'book_id', 'rating']],
                            reader)

cross_validate(svd, data, measures=['RMSE', 'MAE'])

trainset = data.build_full_trainset()
svd.fit(trainset)

# save the model to disk
filename = 'model.sav'
pickle.dump(svd, open(filename, 'wb'))
Example #24
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    qualified['wr'] = qualified.apply(weighted_rating, axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(10)
    return qualified


print(improved_recommendations('Chungking Express'))
# print(list(improved_recommendations('Se7en')))
print("end of metadata")
#Collaborative Filtering
reader = Reader()
ratings = pd.read_csv('ratings_small.csv')
print(ratings.head(5))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
# data.split()/evaluate() were removed in newer Surprise releases; use cross_validate
from surprise.model_selection import cross_validate
svd = SVD()
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5)
trainset = data.build_full_trainset()
svd.fit(trainset)
print(ratings[ratings['userId'] == 554])
print(svd.predict(554, 509, 4))
print(type(svd))


#end Collaborative Filtering
def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan
Example #25
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from surprise import Reader, Dataset, SVD, model_selection
from pprint import pprint

# Solely predicts based on other ratings
reader = Reader()
ratings = pd.read_csv('archive/ratings_small.csv')
ratings.head()

data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
svd = SVD()
pprint(
    model_selection.cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5))

trainset = data.build_full_trainset()
svd.fit(trainset)

ratings[ratings['userId'] == 1]

svd.predict(1, 302, 3)


def convert_int(x):
    try:
        return int(x)
    except:
# -*- coding: utf-8 -*-
"""
Created on Sun Nov 24 15:03:41 2019

@author: Jon
"""

from helper import *
from surprise import SVD, NormalPredictor, accuracy
from surprise.model_selection import train_test_split, GridSearchCV, KFold
import random
import numpy as np

np.random.seed(0)
random.seed(0)

data = GetBookData(density_filter=False)
trainset, testset = train_test_split(data, test_size=0.25)

##SVD Out of the Box
SVD_OOB = SVD()
SVD_OOB.fit(trainset)
oob_predictions = SVD_OOB.test(testset)
oob_rmse = accuracy.rmse(oob_predictions)
oob_mae = accuracy.mae(oob_predictions)

precisions, recalls = precision_recall_at_k(oob_predictions, k=10, threshold=4)
oob_avg_precision = sum(prec for prec in precisions.values()) / len(precisions)
oob_avg_recall = sum(rec for rec in recalls.values()) / len(recalls)
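# --- Sketch (not part of the original snippet): precision_recall_at_k() comes from
# --- the project's helper module imported above; the standard Surprise-FAQ version
# --- it presumably mirrors is shown here for reference.
from collections import defaultdict

def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Return per-user precision and recall at k for a list of Surprise predictions."""
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions, recalls = {}, {}
    for uid, user_ratings in user_est_true.items():
        # Rank this user's items by estimated rating.
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0
    return precisions, recalls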
Example #27
# evaluate()/print_perf() were removed in newer Surprise releases; use cross_validate
from surprise.model_selection import cross_validate
from surprise import SVD
from surprise import NMF
from surprise import KNNBasic
import os

# 3
file_path = os.path.expanduser('restaurant_ratings.txt')
reader = Reader(line_format='user item rating timestamp', sep='\t')
data = Dataset.load_from_file(file_path, reader=reader)

# 5
print('\n#{} SVD -------------------------------\n'.format(5))
algo = SVD()
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

# 6
print('\n#{} PMF-------------------------------\n'.format(6))

algo = SVD(biased=False)  #PMF
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

# 7
print('\n#{} NMF-------------------------------\n'.format(7))

algo = NMF()
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)
Example #28
train_averages['user'] = get_average_ratings(of_users=True)
train_averages['food'] = get_average_ratings(of_users=False)

## train the SVD model
from surprise import SVD, Reader, Dataset, accuracy
import surprise
from surprise.model_selection import train_test_split

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(
    rating_data_mf[['user_id', 'smallCategory_id', 'rating']], reader)

trainset = data.build_full_trainset()

svd = SVD(n_factors=100, biased=True, random_state=15, verbose=True)
svd.fit(trainset)

## fill in empty ratings with SVD predictions
rating_data_svd = rating_data.copy()
for user_id in rating_data.index:
    for smallCategory_id in rating_data.columns:
        if rating_data.loc[user_id][smallCategory_id] == 0:
            rating_data_svd.loc[user_id][smallCategory_id] = (svd.test([
                (user_id, smallCategory_id, 0)
            ]))[0].est

## user based filtering - rating_data
rating_data_svd_t = rating_data_svd.transpose()
user_rating_sim = rating_data_svd_t.corr(method='pearson')
def CollabFilteringModel(data, option=1, gridsearch=True):
    
    if option==1:
        sim_options = {
            "name":  "pearson_baseline",
            "min_support": 2,
            "user_based": False,
        }

        if gridsearch:
            sim_options = {
            "name": ["pearson_baseline"],
            "min_support": [2],
            "user_based": [False],
            }
            param_grid = {"sim_options": sim_options}

            gs = GridSearchCV(KNNWithMeans, param_grid, measures=["rmse", "mae"], cv=3)
            gs.fit(data)

            print(gs.best_score["rmse"])
            print(gs.best_params["rmse"])
            print(gs.best_score["mae"])
            print(gs.best_params["mae"])

            sim_options = {
                "name":  gs.best_params["rmse"]["sim_options"]["name"],
                "min_support": gs.best_params["rmse"]["sim_options"]["min_support"],
                "user_based": gs.best_params["rmse"]["sim_options"]["user_based"],
            }

        algo = KNNWithMeans(sim_options=sim_options)

        trainingSet = data.build_full_trainset()

        algo.fit(trainingSet)

    elif option==2:
        n_epochs = 200
        lr_all = .01
        reg_all = .05
        
        if gridsearch:
            param_grid = {
                "n_epochs": [10, 200],
                "lr_all": [0.002, 0.1],
                "reg_all": [0.05, 0.9]
            }

            gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=3)
            gs.fit(data)

            print(gs.best_score["rmse"])
            print(gs.best_params["rmse"])
            print(gs.best_score["mae"])
            print(gs.best_params["mae"])

            n_epochs = gs.best_params["mae"]["n_epochs"]
            lr_all = gs.best_params["mae"]["lr_all"]
            reg_all = gs.best_params["mae"]["reg_all"]
        

        
        algo = SVD(n_epochs=n_epochs, lr_all=lr_all, reg_all=reg_all)

        trainingSet = data.build_full_trainset()

        algo.fit(trainingSet)
    else:
        n_cltr_u  = 3
        n_cltr_i  = 3
        n_epochs = 200
        
        if gridsearch:
            param_grid = {
                "n_epochs": [10, 200],
                "n_cltr_u": [2,3,4,5,6],
                "n_cltr_i": [2,3,4,5,6]
            }

            gs = GridSearchCV(CoClustering, param_grid, measures=["rmse", "mae"], cv=3)
            gs.fit(data)

            print(gs.best_score["rmse"])
            print(gs.best_params["rmse"])
            print(gs.best_score["mae"])
            print(gs.best_params["mae"])

            n_epochs = gs.best_params["rmse"]["n_epochs"]
            n_cltr_u = gs.best_params["rmse"]["n_cltr_u"]
            n_cltr_i = gs.best_params["rmse"]["n_cltr_i"]
        

        
        algo = CoClustering(n_cltr_u =n_cltr_u , n_epochs=n_epochs, n_cltr_i=n_cltr_i)

        trainingSet = data.build_full_trainset()

        algo.fit(trainingSet)

    return algo
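# --- Sketch (not part of the original snippet): a hypothetical driver for
# --- CollabFilteringModel(); the CSV name and column names are assumptions, and the
# --- surprise imports it relies on (SVD, KNNWithMeans, CoClustering, GridSearchCV)
# --- are expected at module level as in the original project.
if __name__ == '__main__':
    import pandas as pd
    from surprise import Dataset, Reader

    ratings = pd.read_csv('ratings.csv')  # hypothetical file
    data = Dataset.load_from_df(ratings[['userID', 'itemID', 'rating']],
                                Reader(rating_scale=(1, 5)))
    algo = CollabFilteringModel(data, option=2, gridsearch=False)  # plain SVD branch
    print(algo.predict('some_user', 'some_item').est)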
Example #30
    def __init__(self, module_type, baseline_type, cf_type, similar, sim_type,
                 params):
        assert baseline_type in {"ALS", "SGD", "default"}
        assert cf_type in {None, "base_user", "base_item"}
        assert similar in {
            None, "COSINE", "cosine", "MSD", "msd", "PEARSON", "pearson",
            "PEARSON_BASELINE", "pearson_baseline", "JACCARD", "jaccard",
            "EUCLIDEAN", "euclidean"
        }
        assert sim_type in {None, "default"}
        self.module_type = module_type
        self.baseline_type = baseline_type
        self.cf_type = cf_type
        self.similar = similar
        self.sim_type = sim_type
        self.bu = None
        self.bi = None
        self.sim = None
        if self.baseline_type == "ALS":
            bsl_options = {
                'method': params["bsl_options"].get("method", 'als'),
                'n_epochs': params["bsl_options"].get("n_epochs", 10),
                'reg_u': params["bsl_options"].get("reg_u", 15),
                'reg_i': params["bsl_options"].get("reg_i", 10)
            }
        elif self.baseline_type == "SGD":
            bsl_options = {
                'method': params["bsl_options"].get("method", 'sgd'),
                'n_epochs': params["bsl_options"].get("n_epochs", 20),
                'reg': params["bsl_options"].get("reg", 0.02),
                'learning_rate':
                params["bsl_options"].get("learning_rate", 0.005)
            }
        else:  # defaults
            bsl_options = {}
        params["sim_options"] = {}

        if self.cf_type == "base_user":
            params["sim_options"]["user_based"] = True
        elif self.cf_type == "base_item":
            params["sim_options"]["item_based"] = False
        else:
            params["sim_options"]["user_based"] = True

        if self.similar == "COSINE" or self.similar == "cosine":
            params["sim_options"]["name"] = "cosine"
        elif self.similar == "MSD" or self.similar == "msd":
            params["sim_options"]["name"] = "msd"
        elif self.similar == "PEARSON" or self.similar == "pearson":
            params["sim_options"]["name"] = "pearson"
        elif self.similar == "PEARSON_BASELINE" or self.similar == "pearson_baseline":
            params["sim_options"]["name"] = "pearson_baseline"
        elif self.similar == "JACCARD" or self.similar == "jaccard":
            params["sim_options"]["name"] = "jaccard"
        elif self.similar == "EUCLIDEAN" or self.similar == "euclidean":
            params["sim_options"]["name"] = "euclidean"
        else:
            params["sim_options"]["name"] = "msd"

        if self.sim_type == "default":
            sim_options = {}
        else:
            sim_options = {
                "name": params["sim_options"].get("name", "MSD"),
                "user_based": params["sim_options"].get("user_based", True),
                "min_support": params["sim_options"].get("min_support", 5),
                "shrinkage": params["sim_options"].get("shrinkage", 100)
            }
            """
            'name':要使用的相似性名称,如similarities模块中所定义 。默认值为'MSD'。
            'user_based':将计算用户之间还是项目之间的相似性。这对预测算法的性能有巨大影响。默认值为True。
            'min_support':相似度不为零的最小公共项数('user_based' 为'True'时)或最小公共用户数('user_based'为 'False'时)。
            简单地说,如果 |Iuv|<min_support 然后 sim(u,v)=0。项目也是如此。
            'shrinkage':
            """
        if self.module_type == "KNNmeans":
            # builds on KNNBasic but takes the mean rating of each user (or item) into account
            self.model = KNNWithMeans(k=params.get("k", 40),
                                      min_k=params.get("min_k", 1),
                                      sim_options=sim_options,
                                      verbose=params.get("verbose", True))
        elif self.module_type == "KNNzscore":
            # additionally applies a per-user z-score normalization
            self.model = KNNWithZScore(k=params.get("k", 40),
                                       min_k=params.get("min_k", 1),
                                       sim_options=sim_options,
                                       verbose=params.get("verbose", True))
        elif self.module_type == "KNNbase":
            # differs from KNNWithMeans in that it uses a baseline (bias) rather than the mean
            self.model = KNNBaseline(
                k=params.get("k", 40),
                min_k=params.get("min_k", 1),  # 最少的邻居个数
                sim_options=sim_options,
                bsl_options=bsl_options,
                verbose=params.get("verbose", True))
        elif self.module_type == "KNNbasic":
            # the most basic KNN algorithm; can be user-based or item-based
            self.model = KNNBasic(k=params.get("k", 40),
                                  min_k=params.get("min_k", 1),
                                  sim_options=sim_options,
                                  verbose=params.get("verbose", True))
        elif self.module_type == "SVD":
            self.model = SVD(n_factors=params.get("n_factors", 100),
                             n_epochs=params.get("n_epochs", 20),
                             init_mean=params.get("init_mean", 0),
                             init_std_dev=params.get("init_std_dev", 0.1),
                             lr_all=params.get("lr_all", 0.005),
                             reg_all=params.get("reg_all", 0.02),
                             lr_bu=params.get("lr_bu", None),
                             lr_bi=params.get("lr_bi", None),
                             lr_pu=params.get("lr_pu", None),
                             lr_qi=params.get("lr_qi", None),
                             reg_bu=params.get("reg_bu", None),
                             reg_bi=params.get("reg_bi", None),
                             reg_pu=params.get("reg_pu", None),
                             reg_qi=params.get("reg_qi", None),
                             random_state=params.get("random_state", None),
                             verbose=params.get("verbose", False))
            """
            n_factors –因素数。默认值为100。
            n_epochs – SGD过程的迭代次数。默认值为 20。
            偏见(bool)–是否使用基线(或偏见)。请参阅上面的注释。默认值为True。
            init_mean –因子向量初始化的正态分布平均值。默认值为0。
            init_std_dev –因子向量初始化的正态分布的标准偏差。默认值为0.1。
            lr_all –所有参数的学习率。默认值为0.005。
            reg_all –所有参数的正则项。默认值为 0.02。
            lr_bu –的学习率bu。lr_all如果设置优先 。默认值为None。
            lr_bi –的学习率bi。lr_all如果设置优先 。默认值为None。
            lr_pu –的学习率pu。lr_all如果设置优先 。默认值为None。
            lr_qi –的学习率qi。lr_all如果设置优先 。默认值为None。
            reg_bu –的正则化术语bu。reg_all如果设置优先。默认值为None。
            reg_bi –的正则化术语bi。reg_all如果设置优先。默认值为None。
            reg_pu –的正则化术语pu。reg_all如果设置优先。默认值为None。
            reg_qi –的正则化术语qi。reg_all如果设置优先。默认值为None。
            random_state(int,numpy中的RandomState实例或None)–确定将用于初始化的RNG。
            如果为int,random_state则将用作新RNG的种子。通过多次调用进行相同的初始化非常有用 fit()。
            如果是RandomState实例,则将该实例用作RNG。如果为None,则使用numpy中的当前RNG。默认值为 None。
            详细 –如果True,则打印当前纪元。默认值为False。
            """
        elif self.module_type == "SVDpp":
            self.model = SVDpp(n_factors=params.get("n_factors", 100),
                               n_epochs=params.get("n_epochs", 20),
                               init_mean=params.get("init_mean", 0),
                               init_std_dev=params.get("init_std_dev", 0.1),
                               lr_all=params.get("lr_all", 0.005),
                               reg_all=params.get("reg_all", 0.02),
                               lr_bu=params.get("lr_bu", None),
                               lr_bi=params.get("lr_bi", None),
                               lr_pu=params.get("lr_pu", None),
                               lr_qi=params.get("lr_qi", None),
                               reg_bu=params.get("reg_bu", None),
                               reg_bi=params.get("reg_bi", None),
                               reg_pu=params.get("reg_pu", None),
                               reg_qi=params.get("reg_qi", None),
                               random_state=params.get("random_state", None),
                               verbose=params.get("verbose", False))
            """
            n_factors –因素数。默认值为20。
            n_epochs – SGD过程的迭代次数。默认值为
            20。
            init_mean –因子向量初始化的正态分布平均值。默认值为0。
            init_std_dev –因子向量初始化的正态分布的标准偏差。默认值为0
            .1。
            lr_all –所有参数的学习率。默认值为0
            .007。
            reg_all –所有参数的正则项。默认值为
            0.02。
            lr_bu –的学习率bu。lr_all如果设置优先 。默认值为None。
            lr_bi –的学习率bi。lr_all如果设置优先 。默认值为None。
            lr_pu –的学习率pu。lr_all如果设置优先 。默认值为None。
            lr_qi –的学习率qi。lr_all如果设置优先 。默认值为None。
            lr_yj –的学习率yj。lr_all如果设置优先 。默认值为None。
            reg_bu –的正则化术语bu。reg_all如果设置优先。默认值为None。
            reg_bi –的正则化术语bi。reg_all如果设置优先。默认值为None。
            reg_pu –的正则化术语pu。reg_all如果设置优先。默认值为None。
            reg_qi –的正则化术语qi。reg_all如果设置优先。默认值为None。
            reg_yj –的正则化术语yj。reg_all如果设置优先。默认值为None。
            random_state(int,numpy中的RandomState实例或None)–确定将用于初始化的RNG。如果为int,random_state则将用作新RNG的种子。通过多次调用进行相同的初始化非常有用
            fit()。如果是RandomState实例,则将该实例用作RNG。如果为None,则使用numpy中的当前RNG。默认值为
            None。
            详细 –如果True,则打印当前纪元。默认值为False。
            """
        elif self.module_type == "NMF":
            # non-negative matrix factorization: the p and q matrices are constrained to be non-negative
            self.model = NMF(n_factors=params.get("n_factors", 100),
                             n_epochs=params.get("n_epochs", 20),
                             init_mean=params.get("init_mean", 0),
                             init_std_dev=params.get("init_std_dev", 0.1),
                             lr_all=params.get("lr_all", 0.005),
                             reg_all=params.get("reg_all", 0.02),
                             lr_bu=params.get("lr_bu", None),
                             lr_bi=params.get("lr_bi", None),
                             lr_pu=params.get("lr_pu", None),
                             lr_qi=params.get("lr_qi", None),
                             reg_bu=params.get("reg_bu", None),
                             reg_bi=params.get("reg_bi", None),
                             reg_pu=params.get("reg_pu", None),
                             reg_qi=params.get("reg_qi", None),
                             random_state=params.get("random_state", None),
                             verbose=params.get("verbose", False))
            """
            n_factors –因素数。默认值为15。
            n_epochs – SGD过程的迭代次数。默认值为 50。
            偏见(bool)–是否使用基线(或偏见)。默认值为 False。
            reg_pu –用户的正则化术语λu。默认值为 0.06。
            reg_qi –项目的正规化术语λi。默认值为 0.06。
            reg_bu –的正则化术语bu。仅与偏置版本相关。默认值为0.02。
            reg_bi –的正则化术语bi。仅与偏置版本相关。默认值为0.02。
            lr_bu –的学习率bu。仅与偏置版本相关。默认值为0.005。
            lr_bi –的学习率bi。仅与偏置版本相关。默认值为0.005。
            init_low –因子的随机初始化的下限。必须大于0以确保非负因素。默认值为 0。
            init_high –因子的随机初始化的上限。默认值为1。
            random_state(int,numpy中的RandomState实例或None)–确定将用于初始化的RNG。
            如果为int,random_state则将用作新RNG的种子。通过多次调用进行相同的初始化非常有用 fit()。
            如果是RandomState实例,则将该实例用作RNG。如果为None,则使用numpy中的当前RNG。默认值为 None。
            详细 –如果True,则打印当前纪元。默认值为False。
            """
        elif self.module_type == "SlopeOne":
            self.model = SlopeOne(**params)

        elif self.module_type == "cc":
            # Co-clustering-based collaborative filtering
            self.model = CoClustering(n_cltr_u=params.get("n_cltr_u", 3),
                                      n_cltr_i=params.get("n_cltr_i", 3),
                                      n_epochs=params.get("n_epochs", 20),
                                      random_state=params.get(
                                          "random_state", None),
                                      verbose=params.get("verbose", False))
            """
            n_cltr_u(int)–用户集群的数量。默认值为3。
            n_cltr_i(int)–项目集群的数量。默认值为3。
            n_epochs(int)–优化循环的迭代次数。默认值为 20。
            random_state(int,numpy中的RandomState实例或None)–确定将用于初始化的RNG。
            如果为int,random_state则将用作新RNG的种子。通过多次调用进行相同的初始化非常有用 fit()。
            如果是RandomState实例,则将该实例用作RNG。如果为None,则使用numpy中的当前RNG。默认值为 None。
            详细(bool)–如果为True,则将打印当前纪元。默认值为 False。
            """

        elif self.module_type == "BaselineOnly":
            # Baseline estimates only: predicts from the global mean plus user/item biases, without modeling individual preferences
            self.model = BaselineOnly(bsl_options=bsl_options, verbose=True)

        elif self.module_type == "Np":
            # Random prediction algorithm: assumes ratings are normally distributed (estimated from the training set)
            # and predicts by drawing normally distributed random values
            self.model = NormalPredictor()
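
# A minimal, self-contained sketch of the params-dict pattern used in the builder
# above: every hyperparameter is read with params.get(key, default), so an empty
# dict falls back to the library defaults. The toy ratings frame and the chosen
# parameter values below are made up purely for illustration.
import pandas as pd
from surprise import SVD, Dataset, Reader

toy = pd.DataFrame({'user_id': [1, 1, 2, 2, 3],
                    'item_id': [10, 11, 10, 12, 11],
                    'rating':  [4, 3, 5, 2, 4]})
params = {"n_factors": 5, "n_epochs": 10, "random_state": 42}
algo = SVD(n_factors=params.get("n_factors", 100),
           n_epochs=params.get("n_epochs", 20),
           lr_all=params.get("lr_all", 0.005),
           reg_all=params.get("reg_all", 0.02),
           random_state=params.get("random_state", None))
trainset = Dataset.load_from_df(toy, Reader(rating_scale=(1, 5))).build_full_trainset()
algo.fit(trainset)
print(algo.predict(1, 12).est)  # estimated rating of user 1 for item 12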
import pandas as pd

from surprise import Dataset, Reader
from surprise import SVD
from surprise.model_selection import train_test_split
from collections import defaultdict

data = pd.read_csv('train_triplets.txt', sep="\t", header=None)
data.columns = ['user', 'song', 'plays']
data = data[:30000]

song_df = pd.read_csv('song_data.csv')
data_surprise = Dataset.load_from_df(
    data, Reader(rating_scale=(1, data['plays'].max())))
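# Note: the raw play counts are used directly as "ratings", so the rating scale
# runs from 1 up to the largest play count observed in this 30,000-row sample.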
# trainset, testset = train_test_split(data_surprise, test_size=.25)

trainset = data_surprise.build_full_trainset()
svd = SVD()
svd.fit(trainset)

testset = trainset.build_anti_testset()
predictions = svd.test(testset)


def get_top_n(user_id, n=10):
    '''Return the top-N recommendation for user from a set of predictions.

    Args:
        user_id: User ID
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
Beispiel #32
0
# We keep using the rating matrix from above for prediction.
# As before, we use SVD from the surprise library for matrix factorization.

# In[40]:


# Matrix factorization (SVD)

# Reader
reader = Reader(line_format='user item rating', sep=',')
# Load the data
raw_data = Dataset.load_from_df(user_item_rating, reader=reader)
# Split the dataset
kf = KFold(n_splits=5)
# Build the model
algo = SVD(n_factors=40, biased=True)
# Train on each fold and report the RMSE
for trainset, testset in kf.split(raw_data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    accuracy.rmse(predictions, verbose=True)
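
# An equivalent sketch using cross_validate, which runs the same folds and also
# reports the mean/std of each measure (raw_data and kf are defined above).
from surprise.model_selection import cross_validate

cross_validate(SVD(n_factors=40, biased=True), raw_data,
               measures=['RMSE', 'MAE'], cv=kf, verbose=True)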


# In[41]:


# Matrix factorization recommendation
def recommendation_basedonMF(userID, N=5):
    # Songs the user has already listened to
    used_items = user_songs[userID]
    
def hybrid(userId,train_rd):
    #get_ipython().magic('matplotlib inline')
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns
    from scipy import stats
    from ast import literal_eval
    from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
    from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
    from nltk.stem.snowball import SnowballStemmer
    from nltk.stem.wordnet import WordNetLemmatizer
    from nltk.corpus import wordnet
    from surprise import Reader, Dataset, SVD, evaluate

    import warnings; warnings.simplefilter('ignore')


    # In[2]:


    #Popularity#

    md = pd.read_csv('CustomData/FinalData.csv')

    fd = pd.read_csv('avg_ratings1.csv')



    # cast the non-null ratings to float
    fd.loc[fd['rating'].notnull(), 'rating'] = fd[fd['rating'].notnull()]['rating'].astype('float')
    vote_averages = fd[fd['rating'].notnull()]['rating']
    C = vote_averages.mean()


    fd1 = pd.read_csv('ratings_count.csv')


    # cast the non-null rating counts to float
    fd1.loc[fd1['rating'].notnull(), 'rating'] = fd1[fd1['rating'].notnull()]['rating'].astype('float')
    vote_counts = fd1[fd1['rating'].notnull()]['rating']


    # In[3]:


    m = vote_counts.quantile(0.75)



    # In[4]:


    md['ratings_count'] = fd1['rating']
    md['average_rating'] = fd['rating']


    # In[28]:


    #print(md.shape)
    qualified = md[(md['ratings_count'].notnull())][['book_id','title', 'authors', 'ratings_count', 'average_rating']]

    qualified['ratings_count'] = qualified['ratings_count'].astype('float')

    qualified['average_rating'] = qualified['average_rating'].astype('float')

    #qualified.shape


    # In[29]:


    def weighted_rating(x):
        v = x['ratings_count']
        R = x['average_rating']
        return (v/(v+m) * R) + (m/(m+v) * C)
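    # Worked example with made-up numbers: for m = 50 and C = 3.6, a book with
    # v = 200 ratings and average R = 4.2 scores
    #   (200/250) * 4.2 + (50/250) * 3.6 = 3.36 + 0.72 = 4.08,
    # i.e. the book's own average is shrunk towards the global mean C when v is small.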


    # In[30]:


    qualified['popularity_rating'] = qualified.apply(weighted_rating, axis=1)
    #qualified['wr']
    #qualified = qualified.sort_values('popularity_rating', ascending=False).head(250)
    pop = qualified[['book_id','popularity_rating']]
    #print(qualified.shape)
    #print(pop.shape)


    # In[11]:


    ### Collaborative ##

    reader = Reader()
    ratings=train_rd
    #ratings = pd.read_csv('ratings.csv')
    #ratings.head()

    temp_ratings = ratings[0:1000]

    #print(temp_ratings)
    data = Dataset.load_from_df(temp_ratings[['user_id', 'book_id', 'rating']], reader)
    data.split(n_folds=2)


    # In[12]:


    svd = SVD()
    evaluate(svd, data, measures=['RMSE', 'MAE'])


    # In[13]:


    trainset = data.build_full_trainset()
    #svd.train(trainset)
    algo = SVD()
    algo.fit(trainset)

    ## usefule = temp_rating[rating]


    # In[14]:


#print(len(temp_ratings[temp_ratings['user_id']==userId]))


    # In[ ]:


    def get_top_n(predictions, n=10):
        '''Return the top-N recommendation for each user from a set of predictions.

        Args:
            predictions(list of Prediction objects): The list of predictions, as
                returned by the test method of an algorithm.
            n(int): The number of recommendation to output for each user. Default
                is 10.

        Returns:
        A dict where keys are user (raw) ids and values are lists of tuples:
            [(raw item id, rating estimation), ...] of size n.
        '''

        # First map the predictions to each user.
        top_n = defaultdict(list)
        for uid, iid, true_r, est, _ in predictions:
            top_n[uid].append((iid, est))

        # Then sort the predictions for each user and retrieve the k highest ones.
        for uid, user_ratings in top_n.items():
            user_ratings.sort(key=lambda x: x[1], reverse=True)
            top_n[uid] = user_ratings[:n]

        return top_n


    # In[15]:


    from collections import defaultdict
    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)
    '''
    top_n = get_top_n(predictions, n=10000)

    #print(top_n)
    #result = pd.DataFrame(top_n)
    #print(result)
    for uid, user_ratings in top_n.items():
    
        #print(uid, [iid for (iid  , _) in user_ratings])
        for uid, iid, true_r, est, _ in predictions:
        
            temp_ratings.loc[uid]= [uid,iid,est]
        #temp_ratings[i]['cf'] = temp_ratings[(temp_ratings['user_id'] == uid)][['book_id']]
        
    '''
    count = 0
    for uid, iid, true_r, est, _ in predictions:
        
         if uid == userId:
            count = count+1
            temp_ratings.loc[len(temp_ratings)+1]= [uid,iid,est]
            #print('here')

            #print(uid)
            #temp_ratings.append([uid,iid,est],ignore_index=True)

    #print(count)
    #print(temp_ratings)



    # In[16]:


    #print(len(temp_ratings[temp_ratings['user_id']==2]))


    # In[ ]:





    # In[46]:


    ##### CONTENT ######

    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns
    from scipy import stats
    from ast import literal_eval
    from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
    from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
    from nltk.stem.snowball import SnowballStemmer
    from nltk.stem.wordnet import WordNetLemmatizer
    from nltk.corpus import wordnet
    from surprise import Reader, Dataset, SVD, evaluate
    import csv
    import warnings; warnings.simplefilter('ignore')


    # In[48]:



    md=pd.read_csv('CustomData/FinalData.csv')
    rd=train_rd
    #rd=pd.read_csv('ratings.csv')
    md['book_id'] = md['book_id'].astype('int')
    rd['book_id'] = rd['book_id'].astype('int')
    rd['user_id'] = rd['user_id'].astype('int')
    rd['rating'] = rd['rating'].astype('int')

    #print(md.head())


    md['authors'] = md['authors'].str.replace(' ','')
    md['authors'] = md['authors'].str.lower()
    md['authors'] = md['authors'].str.replace(',',' ')

    #print(md.head())

    md['authors'] = md['authors'].apply(lambda x: [x,x])
    #print(md['authors'])

    md['Genres']=md['Genres'].str.split(';')
    #print(md['Genres'])

    md['soup'] = md['authors'] + md['Genres']
    #print(md['soup'])

    md['soup'] = md['soup'].str.join(' ')

    #md['soup'].fillna({})
    #print(md['soup'])

    count = CountVectorizer(analyzer='word', ngram_range=(1, 1), min_df=1, stop_words='english')
    count_matrix = count.fit_transform(md['soup'])
    #print (count_matrix.shape)
    #print np.array(count.get_feature_names())
    #print(count_matrix.shape)

    cosine_sim = cosine_similarity(count_matrix, count_matrix)
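    # cosine_sim[i][j] is the cosine similarity between the token-count ("soup")
    # vectors of books i and j. Tiny illustration with hypothetical vectors:
    # a = [1, 1, 0], b = [1, 0, 1]  ->  cos = 1 / (sqrt(2) * sqrt(2)) = 0.5.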


    # In[91]:


    def build_user_profiles():
        user_profiles=np.zeros((53421,999))
        #print(rd.iloc[0]['user_id'])
	#len(rd['book_id'])
        for i in range(0,1000):
            u=rd.iloc[i]['user_id']
            b=rd.iloc[i]['book_id']
            #print(u,b)
            #print(i)
            #if b<999:
                #print("match at "+str(b))
            user_profiles[u][b-1]=rd.iloc[i]['rating']
        #print(user_profiles)
        return user_profiles

    user_profiles=build_user_profiles()
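    # user_profiles is a dense user x item matrix: row user_id, column book_id - 1,
    # filled with the known ratings from the first 1000 rows of rd; unrated entries
    # stay 0. The 53421 x 999 shape is hard-coded to this dataset's id ranges.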
    def _get_similar_items_to_user_profile(person_id):
            #Computes the cosine similarity between the user profile and all item profiles
            #print(user_profiles[person_id])
        #print("\n---------\n")
        #print(cosine_sim[0])
        user_ratings = np.zeros((999, 1))
        cnt = 0
        for i in range(0, 999):
            book_sim = cosine_sim[i]
            user_sim = user_profiles[person_id]
            user_ratings[i] = (book_sim.dot(user_sim)) / sum(cosine_sim[i])
        maxval = max(user_ratings)
    #print(maxval)

        for i in range(0, 999):
            user_ratings[i]=((user_ratings[i]*5.0)/(maxval))
            #print(user_ratings[i])
            if(user_ratings[i]>3):
                #print("MILA KUCCHHH")
                cnt+=1
        #print(max(user_ratings))
        #print (cnt)
       
            #print(cosine_similarities)
            
            #return similar_items
        return user_ratings
    content_ratings = _get_similar_items_to_user_profile(userId)



    # In[100]:


    num = md[['book_id']]
    #print(num)

    num1 = pd.DataFrame(data=content_ratings[0:,0:])


    frames = [num, num1]
    #result = pd.concat([df1, df4], axis=1, join_axes=[df1.index])

    mer = pd.concat(frames, axis=1).reindex(num.index)
    mer.columns=['book_id', 'content_rating']
    #print(mer.shape)
    #print('here')
    #print(mer)





    # In[102]:


    ## for user 2 #

#print(temp_ratings.shape)
    cb = temp_ratings[(temp_ratings['user_id'] == userId)][['book_id', 'rating']]
#   print(cb.shape)
#   print(pop.shape)
    hyb = md[['book_id']]
    hyb = hyb.merge(cb,on = 'book_id')
    hyb = hyb.merge(pop, on='book_id')
    hyb = hyb.merge(mer, on='book_id')
    #hyb.shape


    # In[106]:


    def weighted_rating(x):
        v = x['rating']
        R = x['popularity_rating']
        c = x['content_rating']
        return 0.4*v + 0.2*R + 0.4 * c
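    # The final score is a fixed blend: 40% collaborative estimate (v), 20%
    # popularity rating (R), 40% content score (c). Worked example with made-up
    # numbers: v = 4.5, R = 4.08, c = 3.2 -> 0.4*4.5 + 0.2*4.08 + 0.4*3.2 = 3.896.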


    # In[107]:


    print(hyb)
    hyb['final'] = hyb.apply(weighted_rating, axis=1)
    hyb = hyb.sort_values('final', ascending=False).head(999)
    #print(hyb['final'])

    print(hyb)
    return hyb
import os

from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise import accuracy
from surprise.model_selection import PredefinedKFold

# path to dataset folder
files_dir = os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/')

# This time, we'll use the built-in reader.
reader = Reader('ml-100k')

# folds_files is a list of tuples containing file paths:
# [(u1.base, u1.test), (u2.base, u2.test), ... (u5.base, u5.test)]
train_file = files_dir + 'u%d.base'
test_file = files_dir + 'u%d.test'
folds_files = [(train_file % i, test_file % i) for i in (1, 2, 3, 4, 5)]

data = Dataset.load_from_folds(folds_files, reader=reader)
pkf = PredefinedKFold()

algo = SVD()

for trainset, testset in pkf.split(data):

    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)
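
# A small extension sketch: collect the per-fold scores and report their mean over
# the five predefined splits (reuses data, pkf and algo from above; accuracy.rmse
# returns the value it computes).
import numpy as np

rmses = []
for trainset, testset in pkf.split(data):
    algo.fit(trainset)
    rmses.append(accuracy.rmse(algo.test(testset), verbose=False))
print('Mean RMSE over the 5 predefined folds: %.4f' % np.mean(rmses))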