def collaborative(self, ratings, user_id):

        reader = Reader()
        #ratings.head()

        temp_ratings = ratings



        data = Dataset.load_from_df(temp_ratings[['user_id', 'book_id', 'rating']], reader)
        data.split(n_folds=2)

        ## Training the data ##
        svd = SVD()
        evaluate(svd, data, measures=['RMSE', 'MAE'])

        trainset = data.build_full_trainset()

        algo = SVD()
        algo.fit(trainset)

        #svd.train(trainset)
        ## Testing the data ##

        from collections import defaultdict
        testset = trainset.build_anti_testset()
        predictions = algo.test(testset)

        count = 0

        for uid, iid, true_r, est, _ in predictions:
            if uid == user_id:
                count = count + 1
                temp_ratings.loc[len(temp_ratings) + 1] = [uid, iid, est]

        #print("count\n")
        #print(count)
        #print("\n--------here-------\n")	
        #print(temp_ratings)

        cb = temp_ratings[(temp_ratings['user_id'] == user_id)][['book_id', 'rating']]
        #print("\n--------here-------\n")
        #print(cb)

        return cb
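# Note: `evaluate()` and `data.split()` above come from older Surprise releases; newer
# versions (1.1 and later) removed them in favour of `surprise.model_selection`.
# A minimal, hypothetical sketch of the equivalent cross-validation, assuming a
# `ratings` DataFrame with 'user_id', 'book_id' and 'rating' columns:
#
#   from surprise import SVD, Dataset, Reader
#   from surprise.model_selection import cross_validate
#
#   reader = Reader()
#   data = Dataset.load_from_df(ratings[['user_id', 'book_id', 'rating']], reader)
#   cross_validate(SVD(), data, measures=['RMSE', 'MAE'], cv=2, verbose=True)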
Example No. 2
def test_performances():
    """Test the returned dict. Also do dumping."""

    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    reader = Reader(line_format='user item rating', sep=' ', skip_lines=3)
    data = Dataset.load_from_folds(folds_files=folds_files, reader=reader,
                                   rating_scale=(1, 5))

    algo = NormalPredictor()
    tmp_dir = tempfile.mkdtemp()  # create tmp dir
    with pytest.warns(UserWarning):
        performances = evaluate(algo, data, measures=['RmSe', 'Mae'],
                                with_dump=True, dump_dir=tmp_dir, verbose=2)
    shutil.rmtree(tmp_dir)  # remove tmp dir

    assert performances['RMSE'] is performances['rmse']
    assert performances['MaE'] is performances['mae']
Example No. 3
data = Dataset.load_builtin('ml-100k')
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('ml-100k/u.user',
                    sep='|',
                    names=u_cols,
                    encoding='latin-1')
#print users.head()
#data.split(n_folds=2)
algo1 = SVD()
algo2 = KNNBasic()
algo3 = KNNBaseline()
algo4 = KNNWithMeans()
algo5 = NormalPredictor()
#start_time1=time.time()
start = timeit.default_timer()
perf1 = evaluate(algo1, data, measures=['RMSE', 'MAE'])
stop = timeit.default_timer()
print("--- %s seconds ---" % (stop - start))
#start_time2=time.time()
start1 = timeit.default_timer()
perf2 = evaluate(algo2, data, measures=['RMSE', 'MAE'])
stop1 = timeit.default_timer()
print("...%s seconds..." % (stop1 - start1))
#start_time3=time.time()
perf3 = evaluate(algo3, data, measures=['RMSE', 'MAE'])
#print ("...%s seconds..."%(time.time()-start_time3))
#start_time4=time.time()
perf4 = evaluate(algo4, data, measures=['RMSE', 'MAE'])
#print ("...%s seconds..."%(time.time()-start_time4))
#start_time5=time.time()
perf5 = evaluate(algo5, data, measures=['RMSE', 'MAE'])
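# A small helper, assuming the same timeit/evaluate pattern used above, so that each
# algorithm is timed the same way (a sketch only; the name is illustrative):
def timed_evaluate(algo, data, measures=('RMSE', 'MAE')):
    start = timeit.default_timer()
    perf = evaluate(algo, data, measures=list(measures))
    print("--- %s seconds ---" % (timeit.default_timer() - start))
    return perf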
def hybrid(userId, train_rd):
    #get_ipython().magic('matplotlib inline')
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns
    from scipy import stats
    from ast import literal_eval
    from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
    from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
    from nltk.stem.snowball import SnowballStemmer
    from nltk.stem.wordnet import WordNetLemmatizer
    from nltk.corpus import wordnet
    from surprise import Reader, Dataset, SVD, evaluate

    import warnings; warnings.simplefilter('ignore')


    # In[2]:


    #Popularity#

    md = pd.read_csv('CustomData/FinalData.csv')

    fd = pd.read_csv('avg_ratings1.csv')



    # chained assignment on a slice is a no-op in pandas, so convert the column directly
    fd['rating'] = fd['rating'].astype('float')
    vote_averages = fd[fd['rating'].notnull()]['rating']
    C = vote_averages.mean()


    fd1 = pd.read_csv('ratings_count.csv')


    fd1['rating'] = fd1['rating'].astype('float')
    vote_counts = fd1[fd1['rating'].notnull()]['rating']


    # In[3]:


    m = vote_counts.quantile(0.75)



    # In[4]:


    md['ratings_count'] = fd1['rating']
    md['average_rating'] = fd['rating']


    # In[28]:


    #print(md.shape)
    qualified = md[(md['ratings_count'].notnull())][['book_id','title', 'authors', 'ratings_count', 'average_rating']]

    qualified['ratings_count'] = qualified['ratings_count'].astype('float')

    qualified['average_rating'] = qualified['average_rating'].astype('float')

    #qualified.shape


    # In[29]:


    def weighted_rating(x):
        v = x['ratings_count']
        R = x['average_rating']
        return (v/(v+m) * R) + (m/(m+v) * C)
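    # Quick sanity check on the formula above with illustrative numbers: if m = 100 and
    # C = 3.9, a book with v = 400 ratings averaging R = 4.5 scores
    # (400/500)*4.5 + (100/500)*3.9 = 3.6 + 0.78 = 4.38, i.e. the estimate is pulled
    # toward the global mean C as the rating count v shrinks.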


    # In[30]:


    qualified['popularity_rating'] = qualified.apply(weighted_rating, axis=1)
    #qualified['wr']
    #qualified = qualified.sort_values('popularity_rating', ascending=False).head(250)
    pop = qualified[['book_id','popularity_rating']]
    #print(qualified.shape)
    #print(pop.shape)


    # In[11]:


    ### Collaborative ##

    reader = Reader()
    ratings=train_rd
    #ratings = pd.read_csv('ratings.csv')
    #ratings.head()

    temp_ratings = ratings[0:1000]

    #print(temp_ratings)
    data = Dataset.load_from_df(temp_ratings[['user_id', 'book_id', 'rating']], reader)
    data.split(n_folds=2)


    # In[12]:


    svd = SVD()
    evaluate(svd, data, measures=['RMSE', 'MAE'])


    # In[13]:


    trainset = data.build_full_trainset()
    #svd.train(trainset)
    algo = SVD()
    algo.fit(trainset)

    ## usefule = temp_rating[rating]


    # In[14]:


#print(len(temp_ratings[temp_ratings['user_id']==userId]))


    # In[ ]:


    def get_top_n(predictions, n=10):
        '''Return the top-N recommendation for each user from a set of predictions.

        Args:
            predictions(list of Prediction objects): The list of predictions, as
                returned by the test method of an algorithm.
            n(int): The number of recommendation to output for each user. Default
                is 10.

        Returns:
        A dict where keys are user (raw) ids and values are lists of tuples:
            [(raw item id, rating estimation), ...] of size n.
        '''

        # First map the predictions to each user.
        top_n = defaultdict(list)
        for uid, iid, true_r, est, _ in predictions:
            top_n[uid].append((iid, est))

        # Then sort the predictions for each user and retrieve the k highest ones.
        for uid, user_ratings in top_n.items():
            #user_ratings.sort(key=lambda x: x[1], reverse=True)
            top_n[uid] = user_ratings[:n]

        return top_n
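    # Minimal usage sketch for get_top_n (kept as a comment so it does not change
    # this function's behaviour):
    #   top_n = get_top_n(predictions, n=10)
    #   for uid, user_ratings in top_n.items():
    #       print(uid, [iid for (iid, _) in user_ratings])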


    # In[15]:


    from collections import defaultdict
    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)
    '''
    top_n = get_top_n(predictions, n=10000)

    #print(top_n)
    #result = pd.DataFrame(top_n)
    #print(result)
    for uid, user_ratings in top_n.items():
    
        #print(uid, [iid for (iid  , _) in user_ratings])
        for uid, iid, true_r, est, _ in predictions:
        
            temp_ratings.loc[uid]= [uid,iid,est]
        #temp_ratings[i]['cf'] = temp_ratings[(temp_ratings['user_id'] == uid)][['book_id']]
        
    '''
    count = 0
    for uid, iid, true_r, est, _ in predictions:
        
         if uid == userId:
            count = count+1
            temp_ratings.loc[len(temp_ratings)+1]= [uid,iid,est]
            #print('here')

            #print(uid)
            #temp_ratings.append([uid,iid,est],ignore_index=True)

    #print(count)
    #print(temp_ratings)



    # In[16]:


    #print(len(temp_ratings[temp_ratings['user_id']==2]))


    # In[ ]:





    # In[46]:


    ##### CONTENT ######

    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns
    from scipy import stats
    from ast import literal_eval
    from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
    from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
    from nltk.stem.snowball import SnowballStemmer
    from nltk.stem.wordnet import WordNetLemmatizer
    from nltk.corpus import wordnet
    from surprise import Reader, Dataset, SVD, evaluate
    import csv
    import warnings; warnings.simplefilter('ignore')


    # In[48]:



    md=pd.read_csv('CustomData/FinalData.csv')
    rd=train_rd
    #rd=pd.read_csv('ratings.csv')
    md['book_id'] = md['book_id'].astype('int')
    rd['book_id'] = rd['book_id'].astype('int')
    rd['user_id'] = rd['user_id'].astype('int')
    rd['rating'] = rd['rating'].astype('int')

    #print(md.head())


    md['authors'] = md['authors'].str.replace(' ','')
    md['authors'] = md['authors'].str.lower()
    md['authors'] = md['authors'].str.replace(',',' ')

    #print(md.head())

    md['authors'] = md['authors'].apply(lambda x: [x,x])
    #print(md['authors'])

    md['Genres']=md['Genres'].str.split(';')
    #print(md['Genres'])

    md['soup'] = md['authors'] + md['Genres']
    #print(md['soup'])

    md['soup'] = md['soup'].str.join(' ')

    #md['soup'].fillna({})
    #print(md['soup'])

    count = CountVectorizer(analyzer='word',ngram_range=(1,1),min_df=0, stop_words='english')
    count_matrix = count.fit_transform(md['soup'])
    #print (count_matrix.shape)
    #print np.array(count.get_feature_names())
    #print(count_matrix.shape)

    cosine_sim = cosine_similarity(count_matrix, count_matrix)
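    # Illustrative only: given the count-based similarity matrix above, the ten rows of
    # md most similar to row 0 (a hypothetical index) could be read off with
    #   cosine_sim[0].argsort()[::-1][1:11]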


    # In[91]:


    def build_user_profiles():
        # hard-coded profile matrix: 53421 users x 999 books, filled from the first 1000 ratings
        user_profiles = np.zeros((53421, 999))
        #print(rd.iloc[0]['user_id'])
        #len(rd['book_id'])
        for i in range(0, 1000):
            u = rd.iloc[i]['user_id']
            b = rd.iloc[i]['book_id']
            #print(u, b)
            #if b < 999:
            #    print("match at " + str(b))
            user_profiles[u][b - 1] = rd.iloc[i]['rating']
        #print(user_profiles)
        return user_profiles

    user_profiles = build_user_profiles()

    def _get_similar_items_to_user_profile(person_id):
        # Computes the cosine similarity between the user profile and all item profiles
        #print(user_profiles[person_id])
        #print(cosine_sim[0])
        user_ratings = np.empty((999, 1))
        cnt = 0
        for i in range(0, 998):
            book_sim = cosine_sim[i]
            user_sim = user_profiles[person_id]
            user_ratings[i] = book_sim.dot(user_sim) / sum(cosine_sim[i])
        maxval = max(user_ratings)
        #print(maxval)

        # rescale to a 0-5 range and count how many books score above 3
        for i in range(0, 998):
            user_ratings[i] = (user_ratings[i] * 5.0) / maxval
            #print(user_ratings[i])
            if user_ratings[i] > 3:
                cnt += 1
        #print(max(user_ratings))
        #print(cnt)
        return user_ratings

    content_ratings = _get_similar_items_to_user_profile(userId)



    # In[100]:


    num = md[['book_id']]
    #print(num)

    num1 = pd.DataFrame(data=content_ratings[0:,0:])


    frames = [num, num1]
    #result = pd.concat([df1, df4], axis=1, join_axes=[df1.index])

    mer = pd.concat(frames, axis =1,join_axes=[num.index])
    mer.columns=['book_id', 'content_rating']
    #print(mer.shape)
    #print('here')
    #print(mer)





    # In[102]:


    ## for user 2 ##

    #print(temp_ratings.shape)
    cb = temp_ratings[(temp_ratings['user_id'] == userId)][['book_id', 'rating']]
    #print(cb.shape)
    #print(pop.shape)
    hyb = md[['book_id']]
    hyb = hyb.merge(cb,on = 'book_id')
    hyb = hyb.merge(pop, on='book_id')
    hyb = hyb.merge(mer, on='book_id')
    #hyb.shape


    # In[106]:


    def weighted_rating(x):
        # blend: 40% collaborative estimate, 20% popularity score, 40% content score
        v = x['rating']
        R = x['popularity_rating']
        c = x['content_rating']
        return 0.4*v + 0.2*R + 0.4*c


    # In[107]:


    print(hyb)
    hyb['final'] = hyb.apply(weighted_rating, axis=1)
    hyb = hyb.sort_values('final', ascending=False).head(999)
    #print(hyb['final'])

    print(hyb)
    return hyb
n_users = ratings.userId.unique().shape[0]
n_movies = ratings.movieId.unique().shape[0]
print('Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_movies))

# In[69]:

data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']],
                            reader=reader)
data.split(n_folds=5)

# In[70]:

svd = SVD()
perf1 = evaluate(svd, data, measures=['RMSE', 'MAE'])

# In[71]:

slp = SlopeOne()
perf2 = evaluate(slp, data, measures=['RMSE', 'MAE'])

# In[72]:

knn = KNNBaseline()
perf3 = evaluate(knn, data, measures=['RMSE', 'MAE'])

# In[73]:

trainset = data.build_full_trainset()
svd.train(trainset)
Example No. 6
file_path = os.path.expanduser('~/PycharmProjects/aashay/shuffled_ratings.csv')
reader = Reader(line_format='user item rating', sep=',')

data = surprise.Dataset.load_from_file(file_path, reader=reader)
# data.split(5)  # split data for 2-folds cross validation

# dataset = 'ml-1m'
# data = Dataset.load_builtin(dataset)
data.split(2)
kf = KFold(n_splits=2,random_state=0,shuffle=False)  # folds will be the same for all algorithms.

table = []
for klass in classes:
    start = time.time()
    #out = cross_validate(klass(), data, ['rmse', 'mae'], kf)
    out = surprise.evaluate(klass(), data, measures=['RMSE', 'MAE'],
                            with_dump=False)
    cv_time = str(datetime.timedelta(seconds=int(time.time() - start)))
    link = LINK[klass.__name__]
    # evaluate() returns per-fold scores keyed by measure name ('rmse', 'mae')
    mean_rmse = '{:.3f}'.format(np.mean(out['rmse']))
    mean_mae = '{:.3f}'.format(np.mean(out['mae']))

    new_line = [link, mean_rmse, mean_mae, cv_time]
    print(tabulate([new_line], tablefmt="pipe"))  # print current algo perf
    table.append(new_line)

header = ['Algorithm',
          'RMSE',
          'MAE',
          'Time'
          ]
print(tabulate(table, header, tablefmt="pipe"))
Example No. 7
    # ===============================  load data  ===================================
    # ml-latest-small
    # file_path = 'input/ml-latest-small/ratings.csv'
    # reader = env.Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
    # ------------------------------------------------------------------------------
    # ml-100k
    file_path = 'input/ml-100k/u.data'
    reader = env.Reader(line_format='user item rating timestamp', sep='\t', skip_lines=1)
    # ------------------------------------------------------------------------------
    # ml-20m
    # file_path = 'input/ml-20m/ratings.csv'
    # reader = env.Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
    # ==============================================================================

    # data = env.Dataset.load_from_file(file_path, reader=reader)
    # data.split(n_folds=5)

    # file_path = 'input/ml-100k/u.data'
    # reader = myDataset.Reader(line_format='user item rating timestamp', sep='\t', skip_lines=1, implicit=True,
    #                           threshold=4.5)
    data = myDataset.Dataset.load_from_file(file_path, reader=reader)
    data.split(n_folds=5)

    # define algorithm
    algo = SLIM3(l1_reg=0.001, l2_reg=0.01, max_iter=200, tol=1e-3)

    # evaluate
    env.evaluate(algo, data, measures=['rmse', 'mae', 'fcp'])
    # myEvaluate.evaluate(algo, data, measures=['fcp', 'hr', 'arhr'], topN=10, leave_out_num=1, verbose=2)
Example No. 8
        self.trainset = trainset

    def estimate(self, u, i):
        '''Return the estimated rating of user u for item i.'''

        # Return the dot product of p_u and q_i if both the user and the item are
        # known; otherwise fall back to the mean of all ratings.
        if self.trainset.knows_user(u) and self.trainset.knows_item(i):
            return numpi.dot(self.p[u], self.q[i])
        else:
            return self.trainset.global_mean


# Load the dataset; we use the MovieLens dataset (https://grouplens.org/datasets/movielens/100k/).
# Thanks to the Surprise library we can download it automatically.
data = surprise.Dataset.load_builtin('ml-100k')
data.split(2)  # split the data for 2-fold cross-validation

algo = RecAlgo(learning_rate=.01, n_epochs=10, n_factors=10)
surprise.evaluate(algo, data, measures=['RMSE'])
surprise.evaluate(algo, data, measures=['mae'])

# Use a neighborhood algorithm on the same data for comparison
algo = surprise.KNNBasic()
surprise.evaluate(algo, data, measures=['RMSE'])
surprise.evaluate(algo, data, measures=['mae'])

# Use a more sophisticated factorization method on the same data
algo = surprise.SVD()
surprise.evaluate(algo, data, measures=['RMSE'])
surprise.evaluate(algo, data, measures=['mae'])
Example No. 9
import time
import matplotlib.pyplot as plt
import psutil


timex = []
mem = []
m1 = psutil.virtual_memory().percent

start = time.time()
df1 = pd.read_csv('C:/Users/Mausamee Patel/Desktop/Project/A5/Ratings_1Million1.csv', dtype={'rating': float})
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df1[['user_id','book_id','rating']], reader)
data.split(2)
algo = surprise.KNNBasic()
result1 = surprise.evaluate(algo, data, measures=['RMSE'])
end = time.time()
print("Time1",end - start)
timex.append(end-start)
m2=psutil.virtual_memory().percent
#print(m2)
mem.append(m2)

start = time.time()
df2 = pd.read_csv('C:/Users/Mausamee Patel/Desktop/Project/A5/Ratings_1Million2.csv', dtype={'rating': float})
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df2[['user_id','book_id','rating']], reader)
data.split(2)
algo = surprise.KNNBasic()
result2 = surprise.evaluate(algo, data, measures=['RMSE'])
end = time.time()
Example No. 10
    gen_occ_mean = merged_df[['sex', 'rating', 'movie_id', 'occupation'
                              ]].pivot_table(values='rating',
                                             index='movie_id',
                                             columns=['occupation', 'sex'],
                                             aggfunc='mean')

    print(gen_occ_mean.head())

    print(score(cf_gen_occ))

    # Define a Reader object
    # The Reader object helps in parsing the file or dataframe containing ratings
    reader = Reader()

    # Create the dataset to be used for building the filter
    data = Dataset.load_from_df(ratings, reader)

    # Define the algorithm object; in this case kNN
    knn = KNNBasic()

    # Evaluate the performance in terms of RMSE
    evaluate(knn, data, measures=['RMSE'])

    # Import SVD
    from surprise import SVD

    # Define the SVD algorithm object
    svd = SVD()

    # Evaluate the performance in terms of RMSE
    evaluate(svd, data, measures=['RMSE'])
Example No. 11
def test_SVD_parameters():
    """Ensure that all parameters are taken into account."""

    # The baseline against which to compare.
    algo = SVD(n_factors=1, n_epochs=1)
    rmse_default = evaluate(algo, data, measures=['rmse'])['rmse']

    # n_factors
    algo = SVD(n_factors=2, n_epochs=1)
    rmse_factors = evaluate(algo, data, measures=['rmse'])['rmse']
    assert rmse_default != rmse_factors

    # n_epochs
    algo = SVD(n_factors=1, n_epochs=2)
    rmse_n_epochs = evaluate(algo, data, measures=['rmse'])['rmse']
    assert rmse_default != rmse_n_epochs

    # biased
    algo = SVD(n_factors=1, n_epochs=1, biased=False)
    rmse_biased = evaluate(algo, data, measures=['rmse'])['rmse']
    assert rmse_default != rmse_biased

    # lr_all
    algo = SVD(n_factors=1, n_epochs=1, lr_all=5)
    rmse_lr_all = evaluate(algo, data, measures=['rmse'])['rmse']
    assert rmse_default != rmse_lr_all

    # reg_all
    algo = SVD(n_factors=1, n_epochs=1, reg_all=5)
    rmse_reg_all = evaluate(algo, data, measures=['rmse'])['rmse']
    assert rmse_default != rmse_reg_all

    # lr_bu
    algo = SVD(n_factors=1, n_epochs=1, lr_bu=5)
    rmse_lr_bu = evaluate(algo, data, measures=['rmse'])['rmse']
    assert rmse_default != rmse_lr_bu

    # lr_bi
    algo = SVD(n_factors=1, n_epochs=1, lr_bi=5)
    rmse_lr_bi = evaluate(algo, data, measures=['rmse'])['rmse']
    assert rmse_default != rmse_lr_bi

    # lr_pu
    algo = SVD(n_factors=1, n_epochs=1, lr_pu=5)
    rmse_lr_pu = evaluate(algo, data, measures=['rmse'])['rmse']
    assert rmse_default != rmse_lr_pu

    # lr_qi
    algo = SVD(n_factors=1, n_epochs=1, lr_qi=5)
    rmse_lr_qi = evaluate(algo, data, measures=['rmse'])['rmse']
    assert rmse_default != rmse_lr_qi

    # reg_bu
    algo = SVD(n_factors=1, n_epochs=1, reg_bu=5)
    rmse_reg_bu = evaluate(algo, data, measures=['rmse'])['rmse']
    assert rmse_default != rmse_reg_bu

    # reg_bi
    algo = SVD(n_factors=1, n_epochs=1, reg_bi=5)
    rmse_reg_bi = evaluate(algo, data, measures=['rmse'])['rmse']
    assert rmse_default != rmse_reg_bi

    # reg_pu
    algo = SVD(n_factors=1, n_epochs=1, reg_pu=5)
    rmse_reg_pu = evaluate(algo, data, measures=['rmse'])['rmse']
    assert rmse_default != rmse_reg_pu

    # reg_qi
    algo = SVD(n_factors=1, n_epochs=1, reg_qi=5)
    rmse_reg_qi = evaluate(algo, data, measures=['rmse'])['rmse']
    assert rmse_default != rmse_reg_qi
Example No. 12
cust_benchmark = round(df_cust_summary['count'].quantile(0.7),0)
drop_cust_list = df_cust_summary[df_cust_summary['count'] < cust_benchmark].index

df = df[~df['Movie_id'].isin(drop_movie_list)]
df = df[~df['Cust_id'].isin(drop_cust_list)]

#Pivot data
df_p = pd.pivot_table(df, index="Cust_id", columns="Movie_id", values="Rating")

#See which algorithm gives the lowest RMSE value
reader = Reader()
data = Dataset.load_from_df(df[['Cust_id', 'Movie_id', 'Rating']][:100000], reader)
benchmark = []
for algo in [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), BaselineOnly(), CoClustering()]:
    data.split(n_folds=3)
    results = evaluate(algo, data, measures = ["RMSE"])
    
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algo).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)

print(pd.DataFrame(benchmark).set_index('Algorithm').sort_values('rmse'))

##Train and Test split
#reader = Reader()
#data = Dataset.load_from_df(df[['Cust_id', 'Movie_id', 'Rating']], reader)
#trainset, testset = train_test_split(data, test_size = 0.25)
#blo = BaselineOnly()
#blo.fit(trainset)
#predictions = blo.test(testset[:10000])
#accuracy.rmse(predictions)
from surprise import Reader, Dataset
from surprise import NMF, evaluate

# creating the format for the dataset when given the user, item, rating and timestamp
data_reader = Reader(line_format="user item rating timestamp", sep="\t")

# store the data in the specific format created above
# u.data is the file we want
data = Dataset.load_from_file("./ml-100k/u.data", reader=data_reader)

# will be splitting the data into 5 folds for cross validation
data.split(n_folds=5)

# for this project I will be using the NMF algorithm
algorithm = NMF()
evaluate(algorithm, data, measures=["RMSE", "MAE"])

# train the whole data set now
training_set = data.build_full_trainset()
algorithm.train(training_set)

# set the specific user and movie I want to predict
user_id = str(200)
item_id = str(222)
actual_rating = 5

# see how it works!
print(algorithm.predict(user_id, item_id, actual_rating))
Example No. 14
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n


reader = Reader(rating_scale=(0, 5))
dataset = Dataset.load_from_df(actions[["user_id", "context_product"]], reader)

algo = SVD()

evaluate(algo, dataset, measures=["RMSE", "MAE"])
trainset = dataset.build_full_trainset()

algo.fit(trainset)
algo.predict("53ff5739aebb450829000074", "affect-health-drinking-chocolate",
             15)
algo.predict("53ff5739aebb450829000074", "affect-health-drinking-chocolate", 0)

testset = trainset.build_anti_testset()
predictions = algo.test(testset)

top_n = get_top_n(predictions, n=10)

recommendations = {}
# Print the recommended items for each user
for uid, user_ratings in top_n.items():
data = users.assign(key=1).merge(hotels.assign(key=1), on='key', how='inner').drop('key', axis=1)
data = data.merge(activity_count, on=['user', 'hotel'], how='left')
data['browse'] = data.browse.fillna(0)
data = data[['user', 'hotel', 'browse']]


# tentatively CV test for some algorithms
reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(data, reader)

data_cv=data
data_cv.split(n_folds=5)

# SVD test
svd = SVD()
perf = evaluate(svd, data, measures=['RMSE'])
print_perf(perf)      # MSE 0.052

param_svd = {'n_factors': [50, 100], 'lr_all': [0.003, 0.005],
              'reg_all': [0.05, 0.1, 0.5]}
gs = GridSearch(SVD, param_svd, measures=['RMSE'])
gs.evaluate(data_cv) # RMSE 0.2272 ~ 0.2284, after many tests notice 0.2272 is a benchmark, 100, 0.003, 0.1

# Co-clustering test
coc=CoClustering()
perf = evaluate(coc, data, measures=['RMSE'])
print_perf(perf)     # MSE 0.053

param_svd = {'n_cltr_u': [3, 5, 7], 'n_cltr_i': [3, 5, 7],
              'n_epochs': [10, 20]}
gs = GridSearch(CoClustering, param_svd, measures=['RMSE'])
Example No. 16
from surprise import Dataset
from surprise import SVD
from surprise import evaluate, print_perf

# Load the movielens-100k dataset (download it if needed),
# and split it into 3 folds for cross-validation.
data = Dataset.load_builtin('ml-100k')
data.split(n_folds=3)

# We'll use the famous SVD algorithm.
algo = SVD()

# Evaluate performances of our algorithm on the dataset.
perf = evaluate(algo, data, measures=['RMSE', 'MAE'])

print_perf(perf)
Example No. 17
train[['id_usuario', 'id_restaurante', 'rating_ambiente',
       'fecha']].to_csv('surprise_format.csv', index=False)

file_path = 'surprise_format.csv'
reader = Reader(line_format='user item rating timestamp',
                sep=',',
                skip_lines=1)

data = Dataset.load_from_file(file_path, reader=reader)
data.split(n_folds=5)

# We'll use the famous SVD++ algorithm.
algo = SVDpp()

# Evaluate performances of our algorithm on the dataset.
perf = evaluate(algo, data, measures=['RMSE', 'MAE'])

print_perf(perf)

#grid search

SVDpp_ambiente = SVDpp()

param_grid = {
    'n_epochs': [50, 100, 150],
    'lr_all': [0.002, 0.003, 0.004],
    'reg_all': [0.2, 0.3],
    'n_factors': [10, 20, 30, 40]
}

grid_search = GridSearch(SVDpp, param_grid, measures=['RMSE', 'FCP'])  # GridSearch expects the algorithm class, not an instance
Example No. 18
    def eval(self):
        # Evaluate performances of our algorithm on the dataset.
        perf = evaluate(self.svd, self.data, measures=['RMSE'])
        print_perf(perf)
Example No. 19
import os
from surprise import Reader, Dataset

# specify the file path
file_path = os.path.expanduser('./popular_music_suprise_format.txt')
# specify the file format
reader = Reader(line_format='user item rating timestamp', sep=',')
# load the data from the file
music_data = Dataset.load_from_file(file_path, reader=reader)
# split into 5 folds
music_data.split(n_folds=5)

# use NormalPredictor
from surprise import NormalPredictor, evaluate
algo1 = NormalPredictor()
perf1 = evaluate(algo1, music_data, measures=['RMSE', 'MAE'])

# use BaselineOnly
from surprise import BaselineOnly, evaluate
algo2 = BaselineOnly()
perf2 = evaluate(algo2, music_data, measures=['RMSE', 'MAE'])

# use basic collaborative filtering (KNNBasic)
from surprise import KNNBasic, evaluate
algo3 = KNNBasic()
perf3 = evaluate(algo3, music_data, measures=['RMSE', 'MAE'])

# use mean-based collaborative filtering (KNNWithMeans)
from surprise import KNNWithMeans, evaluate
algo4 = KNNWithMeans()
perf4 = evaluate(algo4, music_data, measures=['RMSE', 'MAE'])
Example No. 20
    param_grid = { 'n_factors':range(10,30,2), 'n_epochs': [10,15,20], 'lr_all': [0.002, 0.005, 0.1],'reg_all': [0.4, 0.6, 0.8]}
    param_grid = { 'n_factors':range(2,22,2), 'n_epochs': [10], 'lr_all': [0.1],'reg_all': [0.4]}
    param_grid = { 'n_factors':[2], 'n_epochs':range(11), 'lr_all': [0.1],'reg_all': [0.4]}
    grid_search = GridSearch(SVDpp, param_grid, measures=['RMSE', 'MAE'])
    grid_search.evaluate(music_data)    
    print(grid_search.best_params['RMSE'])   
    print(grid_search.best_params['MAE'])
   
    # start training the model
    print('Training the model...')
    #algo = KNNBaseline()
    algo = SVDpp(n_factors=grid_search.best_params['RMSE']['n_factors'],n_epochs=grid_search.best_params['RMSE']['n_epochs'],lr_all=grid_search.best_params['RMSE']['lr_all'],reg_all=grid_search.best_params['RMSE']['reg_all'],verbose=2)
    algo=SVDpp()
    #algo=SVD()
    #algo=SVDpp()
    perf = evaluate(algo, music_data, measures=['RMSE', 'MAE'],verbose=1)
    
    print_perf(perf)
    
    #print()
    #print('Predicting for a playlist:')
    #current_playlist_name = list(name_id_dict.keys())[3]
    #print('playlist name', current_playlist_name)

    #playlist_rid = name_id_dict[current_playlist_name]
    #print('playlist rid', playlist_rid)

    #playlist_inner_id = algo.trainset.to_inner_uid(playlist_rid)
    #print('playlist inner id', playlist_inner_id)

    #algo.compute_similarities()
# Initialising the Reader, which is used to parse the file containing the ratings
reader = Reader()

# Building the dataset with the columns userId, movieId, rating;
# the column order is specific and must be followed
dataset = Dataset.load_from_df(ratings_dataset[['userId', 'movieId', 'rating']], reader)

# Using the split function to perform cross-validation
dataset.split(n_folds=6)

# Initialising the SVD model and specifying the number of latent features;
# this parameter can be tuned as required
svd = SVD(n_factors=25)

# Evaluating the model on root mean squared error and mean absolute error
evaluate(svd, dataset, measures=['rmse', 'mae'])

# Building the full trainset to train the model
train = dataset.build_full_trainset()
# Training the model
svd.train(train)



# Making a new series with two columns: movie name and movie id
movies_dataset = movies_dataset.reset_index()
titles = movies_dataset['movie_name']
indices = pd.Series(movies_dataset.index, index=movies_dataset['movie_name'])

# Function to make recommendations for the user
def recommendataion(user_id, movie):
Example No. 22
    def metrics(self, measures):
        return evaluate(self.algo, self.data, measures=measures)
Example No. 23
from surprise import Reader, Dataset, SVD, evaluate

sns.set_style("darkgrid")

df1 = pd.read_csv('../../Data/combined_data_1.txt',
                  header=None,
                  usecols=[0, 1],
                  names=['uid', 'rating'])
df1['rating'] = df1['rating'].astype(float).fillna(1.0)
df1['iid'] = pd.DataFrame(list(range(len(df1))))

df = df1.head(100000)
df = df[['uid', 'iid', 'rating']]

df_title = pd.read_csv('../../Data/movie_titles.csv',
                       encoding="ISO-8859-1",
                       header=None,
                       names=['Movie_Id', 'Year', 'Name'])

USERID = '822109'

reader = Reader()
data = Dataset.load_from_df(df, reader)
alg = SVD()
output = alg.fit(data.build_full_trainset())
evaluate(alg, data)

pickle.dump([alg, df, df_title], open('../../Evaluations/matrix-data.p', "wb"))

print(df[df['rating'] == 5]['uid'])
Example No. 24
def surprise_algorithms_print_perf():
    print('Surprise Algorithms (final results table)...')
    print('Which dataset do you want to use?')
    print('(1) Android')
    print('(2) WordPress')
    data_utilizar = input()

    # Set the default encoding so the file can be read without errors.
    reload(sys)
    sys.setdefaultencoding('utf8')

    if data_utilizar == 1:
        file_path = configuration.FILE_PATH_ANDROID
        reader = Reader(line_format='user item rating', sep='\t')
    else:
        file_path = configuration.FILE_PATH_WORDPRESS
        reader = Reader(line_format='user item rating', sep=',')

    # Dataset
    data = Dataset.load_from_file(file_path, reader=reader)
    data.split(n_folds=5)

    # NormalPredictor
    algo_normal_predictor = NormalPredictor()
    perf_normal_predictor = evaluate(algo_normal_predictor,
                                     data,
                                     measures=['RMSE', 'MAE'],
                                     verbose=False)

    # SVD
    algo_svd = SVD()
    perf_svd = evaluate(algo_svd,
                        data,
                        measures=['RMSE', 'MAE'],
                        verbose=False)

    # BaselineOnly
    algo_baseline_only = BaselineOnly()
    perf_baseline_only = evaluate(algo_baseline_only,
                                  data,
                                  measures=['RMSE', 'MAE'],
                                  verbose=False)

    # SVDpp
    algo_svdpp = SVDpp()
    perf_svdpp = evaluate(algo_svdpp,
                          data,
                          measures=['RMSE', 'MAE'],
                          verbose=False)

    # NMF
    algo_nmf = NMF()
    perf_nmf = evaluate(algo_nmf,
                        data,
                        measures=['RMSE', 'MAE'],
                        verbose=False)

    # SlopeOne
    algo_slope_one = SlopeOne()
    perf_slope_one = evaluate(algo_slope_one,
                              data,
                              measures=['RMSE', 'MAE'],
                              verbose=False)

    # CoClustering
    algo_coclustering = CoClustering()
    perf_coclustering = evaluate(algo_coclustering,
                                 data,
                                 measures=['RMSE', 'MAE'],
                                 verbose=False)
    """Segmento que utiliza KNN para el analisis:
        'k' Es el numero maximo de vecinos a tomar en cuenta para la agregacion
        'min_k' El numero minimo de vecinos a tomar en cuenta para la agregacion.
            Si no hay suficientes vecinos,la predicción se establece en la media global de todas las calificaciones
        'sim_options' son las opciones de similitud que utiliza el knn
        'bsl_options' configuracion de las estimaciones de base"""

    k = 40
    min_k = 1
    sim_options = {
        'name': 'pearson_baseline',
        'user_based': 0  # item-based similarities
    }
    bsl_options = {'method': 'als', 'n_epochs': 5, 'reg_u': 12, 'reg_i': 5}

    algo_knn_basic = KNNBasic(k=k, min_k=min_k, sim_options=sim_options)
    perf_knn_basic = evaluate(algo_knn_basic,
                              data,
                              measures=['RMSE', 'MAE'],
                              verbose=False)

    algo_knn_with_means = KNNWithMeans(k=k, min_k=min_k, sim_options=sim_options)
    perf_knn_with_means = evaluate(algo_knn_with_means,
                                   data,
                                   measures=['RMSE', 'MAE'],
                                   verbose=False)

    algo_knn_base_line = KNNBaseline(k=k,
                                     min_k=min_k,
                                     sim_options=sim_options,
                                     bsl_options=bsl_options)
    perf_knn_base_line = evaluate(algo_knn_base_line,
                                  data,
                                  measures=['RMSE', 'MAE'],
                                  verbose=False)
    """Imprimiendo resultados de los algoritmos"""
    print('')
    print('Printing results from algorithms...')
    print('- Normal predictor')
    print_perf(perf_normal_predictor)
    print('')
    print('- SVD')
    print_perf(perf_svd)
    print('')
    print('- Baseline Only')
    print_perf(perf_baseline_only)
    print('')
    print('- SVD++')
    print_perf(perf_svdpp)
    print('')
    print('- NMF')
    print_perf(perf_nmf)
    print('')
    print('- Slope One')
    print_perf(perf_slope_one)
    print('')
    print('- Co-Clustering')
    print_perf(perf_coclustering)
    print('')
    print('- KNN Basic')
    print_perf(perf_knn_basic)
    print('')
    print('- KNN With Means')
    print_perf(perf_knn_with_means)
    print('')
    print('- KNN Baseline')
    print_perf(perf_knn_base_line)
def run_svd(n_epochs, reg_all, init_mean):
    start_time = time.time()
    algo = SVD(n_epochs=n_epochs, reg_all=reg_all, init_mean=init_mean)
    evaluate(algo, data)
    running_time = time.time() - start_time
    print("SVD:", running_time, " s")
Example No. 26
from surprise import evaluate, print_perf
from surprise import GridSearch
import pandas as pd
import io

# Load the dataset: a movie-ratings dataset with structure uid, iid, score, time
data = Dataset.load_builtin('ml-100k')

# Split the dataset into 3 folds for cross-validation
data.split(n_folds=3)

'''Collaborative filtering'''
# Use a collaborative-filtering algorithm
algo = KNNBasic()

# Evaluate the algorithm, here with RMSE and MAE
perf = evaluate(algo, data, measures=['rmse', 'mae'])
print_perf(perf)

'''SVD factorization'''
# Specify the parameter grid
param_grid = {
    'n_epochs': [5, 10],
    'lr_all': [0.002, 0.005],
    'reg_all': [0.4, 0.6]
}
# Use Surprise's built-in GridSearch to find the best parameters
grid_search = GridSearch(SVD, param_grid, measures=['rmse', 'fcp'])
grid_search.evaluate(data)

# Best score and parameters
print(grid_search.best_score['rmse'])
print(grid_search.best_params['rmse'])

# In[ ]:


from surprise import SVD, evaluate
from surprise import NMF
from surprise import KNNBasic


# In[ ]:


# svd
algo = SVD()
evaluate(algo, data, measures=['RMSE'])


# In[ ]:


# nmf
algo = NMF()
evaluate(algo, data, measures=['RMSE'])


# In[ ]:


# knn
algo = KNNBasic()
Example No. 28
import surprise as env

path = "input/ml-latest-small/ratings.csv"
reader = env.Reader(line_format="user item rating timestamp",
                    sep=",",
                    skip_lines=1)

data = env.Dataset.load_from_file(path, reader=reader)
data.split(n_folds=3)

algo = env.SVD()

env.evaluate(algo, data)
Example No. 29
    for i in open('C:\\Users\\Dimple Shah\\Desktop\\mtech\\reco\\3l.csv',
                  'r').readlines()
]
ratings_df1 = pd.DataFrame(ratings_list1,
                           columns=['UserID', 'BookID', 'Rating'],
                           dtype=float)

ratings_df1.loc[:, 'Rating'] = sk.minmax_scale(ratings_df1.loc[:, 'Rating'])

data1 = Dataset.load_from_df(ratings_df1[['UserID', 'BookID', 'Rating']],
                             reader)

data1.split(2)  # split data for 2-folds cross validation
algo1 = MatrixFacto(learning_rate=.01, n_epochs=10, n_factors=10)  #print(algo)
#test_rms=
result1 = surprise.evaluate(algo1, data1, measures=['RMSE'])  #print(test_rms)
x.append(np.mean(result1['RMSE']))
end = time.time()
#print("Time1",end - start)
timex.append(end - start)
process = psutil.Process(os.getpid())
m2 = process.memory_full_info().uss
#m2=m2-m1
print(m2)
mem.append(m2)

#Checking RMSE with 500k data records

start = time.time()
ratings_list2 = [
    i.strip().split(",")
Example No. 30
        print('-' * 12)
        print('-' * 12)

    return hr, arhr


if __name__ == '__main__':
    # builtin dataset
    # data = env.Dataset.load_builtin('ml-100k')

    # ===============================  load data  ============================
    # ml-latest-small
    # file_path = 'input/ml-latest-small/ratings.csv'
    # reader = env.Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
    # ------------------------------------------------------------------------------
    # ml-100k
    file_path = 'input/ml-100k/u.data'
    reader = env.Reader(line_format='user item rating timestamp', sep='\t', skip_lines=1)
    # ------------------------------------------------------------------------------
    # ml-20m
    # file_path = 'input/ml-20m/ratings.csv'
    # reader = env.Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
    # ==============================================================================

    data = env.Dataset.load_from_file(file_path, reader=reader)
    data.split(n_folds=5)
    algo = env.SVDpp()

    # evaluate_topn(algo, data, top_n=100, threshold=3, verbose=1)
    env.evaluate(algo, data, measures=['rmse', 'mae', 'fcp'], verbose=1)
from surprise import SVD
from surprise import Reader
from surprise import Dataset
from surprise import evaluate
import pandas as pd
from pandas import plotting
import matplotlib.pyplot as plt
import warnings

warnings.simplefilter('ignore')
reader = Reader()
ratings = pd.read_csv('./tmdb-5000-movie-dataset/ratings.csv')
ratings.head()

data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
data.split(n_folds=5)

svd = SVD()
evaluate(svd, data, measures=['RMSE', 'MAE'])

trainset = data.build_full_trainset()
svd.fit(trainset)
X = input("Select a UserId")
Y = input("Select a MovieId")
print(svd.predict(int(X), int(Y), 3))

Example No. 32
    # ml-latest-small
    # file_path = 'input/ml-latest-small/ratings.csv'
    # reader = env.Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
    # ------------------------------------------------------------------------------
    # ml-100k
    file_path = 'input/ml-100k/u.data'
    reader = env.Reader(line_format='user item rating timestamp',
                        sep='\t',
                        skip_lines=1)
    # ------------------------------------------------------------------------------
    # ml-20m
    # file_path = 'input/ml-20m/ratings.csv'
    # reader = env.Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
    # ==============================================================================

    data = env.Dataset.load_from_file(file_path, reader=reader)
    data.split(n_folds=5)

    # define algorithm
    algo = WAPR(learning_rate=0.01,
                factor_num=20,
                epoch_num=1,
                batch_num=512,
                alpha=0.01,
                eps=1e-2,
                random=False)

    # evaluate
    # topn.evaluate_topn(algo, data, top_n=100, threshold=4.5)
    env.evaluate(algo, data, measures=['fcp'])
Example No. 33
        AlgoBase.train(self, trainset)

        # Compute baselines and similarities
        self.bu, self.bi = self.compute_baselines()
        self.sim = self.compute_similarities()

    def estimate(self, u, i):

        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible('User and/or item is unknown.')

        # Compute similarities between u and v, where v describes all other
        # users that have also rated item i.
        neighbors = [(v, self.sim[u, v]) for (v, r) in self.trainset.ir[i]]
        # Sort these neighbors by similarity
        neighbors = sorted(neighbors, key=lambda x: x[1], reverse=True)

        print('The 3 nearest neighbors of user', str(u), 'are:')
        for v, sim_uv in neighbors[:3]:
            print('user {0:} with sim {1:1.2f}'.format(v, sim_uv))

        # ... Aaaaand return the baseline estimate anyway ;)
        bsl = self.trainset.global_mean + self.bu[u] + self.bi[i]
        return bsl


data = Dataset.load_builtin('ml-100k')
algo = MyOwnAlgorithm()

evaluate(algo, data)
data = Dataset.load_from_file(file_path, reader=reader)
data.split(n_folds=5)

sim_options = {
    'name': 'pearson_baseline',
    'shrinkage': 0  # no shrinkage
}
knnbasic_ambiente = KNNBasic()

k_neig = np.array([40, 45, 50, 60])

for i in range(0, len(k_neig)):
    knnbasic_ambiente = KNNBasic(k=k_neig[i])
    perf = evaluate(knnbasic_ambiente,
                    data,
                    measures=['RMSE', 'MAE'],
                    verbose=0)
    print('K is', k_neig[i], 'mean RMSE', np.array(perf['rmse']).mean())

# the best k for 'ambiente' is 40

knnbasic_ambiente = KNNBasic(k=40)
# Retrieve the trainset.
trainset = data.build_full_trainset()

knnbasic_ambiente.train(trainset)

from sklearn.externals import joblib
joblib.dump(knnbasic_ambiente, 'knnbasic_ambiente.pkl')

#### KNN for 'comida' (food) ######