Example #1
0
def test_deprecated_way():
    """Test all Dataset constructors without passing rating_scale as a
    parameter. Make sure we revert back to the Reader object, with a warning
    message.

    Also, make sure ValueError is raised if reader has no rating_scale in this
    context.

    Not using dataset fixtures here for more control.
    """

    # test load_from_file
    toy_data_path = (os.path.dirname(os.path.realpath(__file__)) +
                     '/custom_dataset')
    with pytest.warns(UserWarning):
        reader = Reader(line_format='user item rating', sep=' ', skip_lines=3,
                        rating_scale=(1, 5))
        data = Dataset.load_from_file(file_path=toy_data_path,
                                      reader=reader)

    with pytest.raises(ValueError):
        reader = Reader(line_format='user item rating', sep=' ', skip_lines=3,
                        rating_scale=None)
        data = Dataset.load_from_file(file_path=toy_data_path,
                                      reader=reader)

    # test load_from_folds
    train_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train')
    test_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
    with pytest.warns(UserWarning):
        reader = Reader(line_format='user item rating timestamp', sep='\t',
                        rating_scale=(1, 5))
        data = Dataset.load_from_folds([(train_file, test_file)], reader=reader)
    with pytest.raises(ValueError):
        reader = Reader(line_format='user item rating timestamp', sep='\t',
                        rating_scale=None)
        data = Dataset.load_from_folds([(train_file, test_file)],
                                       reader=reader)
    # test load_from_df
    ratings_dict = {'itemID': [1, 1, 1, 2, 2],
                    'userID': [9, 32, 2, 45, '10000'],
                    'rating': [3, 2, 4, 3, 1]}
    df = pd.DataFrame(ratings_dict)

    with pytest.warns(UserWarning):
        reader = Reader(rating_scale=(1, 5))
        data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']],
                                    reader=reader)
    with pytest.raises(ValueError):
        reader = Reader(rating_scale=None)
        data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']],  # noqa
                                    reader=reader)
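# Note: a minimal sketch (not part of the original test) of the non-deprecated usage this
# test contrasts against -- rating_scale passed directly to Dataset.load_from_df, as in the
# later examples; in released Surprise versions rating_scale is still given to Reader instead.
import pandas as pd
from surprise import Dataset

sketch_df = pd.DataFrame({'userID': [9, 32, 2], 'itemID': [1, 1, 2], 'rating': [3, 2, 4]})
sketch_data = Dataset.load_from_df(sketch_df[['userID', 'itemID', 'rating']],
                                   rating_scale=(1, 5))
sketch_trainset = sketch_data.build_full_trainset()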
Example #2
0
def test_zero_rating_canary():

    ratings_dict = {'itemID': [0, 0, 0, 0, 1, 1],
                    'userID': [0, 1, 2, 3, 3, 4],
                    'rating': [-10, 10, 0, -5, 0, 5]}
    df = pd.DataFrame(ratings_dict)
    data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']],
                                rating_scale=(-10, 10))
    trainset = data.build_full_trainset()

    # test ur and ir fields. Kind of OK, but the purpose of the test is
    # precisely to test what would happen if we removed them...
    assert trainset.ir[0] == [(0, -10), (1, 10), (2, 0), (3, -5)]
    assert trainset.ir[1] == [(3, 0), (4, 5)]

    assert trainset.ur[0] == [(0, -10)]
    assert trainset.ur[1] == [(0, 10)]
    assert trainset.ur[2] == [(0, 0)]
    assert trainset.ur[3] == [(0, -5), (1, 0)]
    assert trainset.ur[4] == [(1, 5)]
    print(trainset.ur)

    # ... so also test all_ratings which should be more reliable.
    all_ratings = list(trainset.all_ratings())
    assert (0, 0, -10) in all_ratings
    assert (1, 0, 10) in all_ratings
    assert (2, 0, 0) in all_ratings
    assert (3, 0, -5) in all_ratings
    assert (3, 1, 0) in all_ratings
    assert (4, 1, 5) in all_ratings
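# Note: the ur/ir dicts and all_ratings() above are keyed by inner ids; a small sketch (not in
# the original test) of translating them back to the raw ids from the DataFrame, using the
# standard Trainset.to_raw_uid / to_raw_iid methods on the trainset built above.
for inner_uid, inner_iid, rating in trainset.all_ratings():
    print(trainset.to_raw_uid(inner_uid), trainset.to_raw_iid(inner_iid), rating)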
Example #3
0
def test_load_form_df():
    """Ensure reading dataset from pandas dataframe is OK."""

    # DF creation.
    ratings_dict = {'itemID': [1, 1, 1, 2, 2],
                    'userID': [9, 32, 2, 45, '10000'],
                    'rating': [3, 2, 4, 3, 1]}
    df = pd.DataFrame(ratings_dict)

    data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']],
                                rating_scale=(1, 5))

    # Assert split and folds can be used without problems
    with pytest.warns(UserWarning):
        data.split(2)
        assert sum(1 for _ in data.folds()) == 2

    # assert users and items are correctly mapped
    trainset = data.build_full_trainset()
    assert trainset.knows_user(trainset.to_inner_uid(9))
    assert trainset.knows_user(trainset.to_inner_uid('10000'))
    assert trainset.knows_item(trainset.to_inner_iid(2))

    # assert r(9, 1) = 3 and r(2, 1) = 4
    uid9 = trainset.to_inner_uid(9)
    uid2 = trainset.to_inner_uid(2)
    iid1 = trainset.to_inner_iid(1)
    assert trainset.ur[uid9] == [(iid1, 3)]
    assert trainset.ur[uid2] == [(iid1, 4)]

    # mess up the column ordering and assert that users are not correctly
    # mapped
    data = Dataset.load_from_df(df[['rating', 'itemID', 'userID']],
                                rating_scale=(1, 5))
    trainset = data.build_full_trainset()
    with pytest.raises(ValueError):
        trainset.to_inner_uid('10000')
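# Note: data.split() / data.folds() above only emit a UserWarning because they are deprecated;
# a minimal sketch (not in the original test) of the surprise.model_selection.KFold replacement
# for the same 2-fold iteration.
from surprise.model_selection import KFold

kf = KFold(n_splits=2)
for trainset_fold, testset_fold in kf.split(data):
    # each iteration yields one (trainset, testset) pair, like data.folds() did
    pass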
Example #4
0
    def collaborative(self, ratings, user_id):

        reader = Reader()
        #ratings.head()

        temp_ratings = ratings



        data = Dataset.load_from_df(temp_ratings[['user_id', 'book_id', 'rating']], reader)
        data.split(n_folds=2)

        ## Training the data ##
        svd = SVD()
        evaluate(svd, data, measures=['RMSE', 'MAE'])

        trainset = data.build_full_trainset()

        algo = SVD()
        algo.fit(trainset)

        #svd.train(trainset)
        ## Testing the data ##

        from collections import defaultdict
        testset = trainset.build_anti_testset()
        predictions = algo.test(testset)

        count = 0

        for uid, iid, true_r, est, _ in predictions:
            if uid == user_id:
                count = count + 1
                temp_ratings.loc[len(temp_ratings) + 1] = [uid, iid, est]

        #print("count\n")
        #print(count)
        #print("\n--------here-------\n")
        #print(temp_ratings)

        cb = temp_ratings[(temp_ratings['user_id'] == user_id)][['book_id', 'rating']]
        #print("\n--------here-------\n")
        #print(cb)

        return cb
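# Note: collaborative() above collects estimates for one user but does not rank them; the
# standalone sketch below (the standard Surprise FAQ pattern, not part of the original class)
# shows how a top-N list per user could be extracted from the same predictions.
from collections import defaultdict

def get_top_n(predictions, n=10):
    # group estimated ratings by user, then keep the n highest-rated items per user
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n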
Example #5
0
def test_build_anti_testset():
    ratings_dict = {'itemID': [1, 2, 3, 4, 5, 6, 7, 8, 9],
                    'userID': [1, 2, 3, 4, 5, 6, 7, 8, 9],
                    'rating': [1, 2, 3, 4, 5, 6, 7, 8, 9]}
    df = pd.DataFrame(ratings_dict)

    data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']],
                                rating_scale=(1, 5))
    with pytest.warns(UserWarning):
        data.split(2)
        trainset, __testset = next(data.folds())
    # fill with some specific value
    for fillvalue in (0, 42., -1):
        anti = trainset.build_anti_testset(fill=fillvalue)
        for (u, i, r) in anti:
            assert r == fillvalue
    # fill with global_mean
    anti = trainset.build_anti_testset(fill=None)
    for (u, i, r) in anti:
        assert r == trainset.global_mean
    expect = trainset.n_users * trainset.n_items
    assert trainset.n_ratings + len(anti) == expect
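# Note: the anti-testset entries above are plain (raw_uid, raw_iid, fill) triples, so they can
# be passed straight to an algorithm's test() method, as the later examples do; a minimal
# sketch (not in the original test), reusing the trainset built above.
from surprise import SVD

algo = SVD()
algo.fit(trainset)
predictions = algo.test(trainset.build_anti_testset())  # one prediction per unseen (user, item) pair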
Example #6
0
def Cal_Knn(user_id):
    
    user = Usert.objects.all()
    travel = Travel.objects.all()
    #print(type(travel))

    travels = travel.values('tourid', 'city', 'town', 'site', 'genre1', 'genre2','genre3')
    #print(travels,type(travels))
    '''
    qs = SomeModel.objects.select_related().filter(date__year=2012)
    q = qs.values('date', 'OtherField')
    df = pd.DataFrame.from_records(q)
    '''
    
    # 1. raw dataset
    rate  = Treview.objects.all()
    rates = rate.values('treview_no', 'user_no', 'placeid', 'rating', 'udate')
    #print(rate, type(rate))
    #rating = pd.DataFrame(data = rate, columns=['review_no', 'user_no', 'placeid', 'rating'])
    rating = pd.DataFrame.from_records(rates)
    rating.drop('treview_no', axis=1, inplace=True)
    rating.drop('udate', axis=1, inplace=True)
    #print(rating.head())   #   critic(user)   title(item)   rating
    
    #print(user_id)
    rating['user_no'].value_counts()
    rating['placeid'].value_counts()
    
    # visited vs. not-visited cross-tab
    tab = pd.crosstab(rating['user_no'], rating['placeid'])
    #print(tab)
    
    # rating
    # group the remaining ratings by the two grouping variables
    rating_g = rating.groupby(['user_no', 'placeid'])
    #print(rating_g.sum())
    tab = rating_g.sum().unstack()  # reshape into a user x place matrix
    #print(tab)
    # places user 2 has not visited: 1, 15, 39, ...
    #print(tab)

    
    
    # 2. build the rating dataset
    #reader = Reader(line_format='rating["user_no"] rating["placeid"] rating["rating"]', rating_scale=(0.5, 5))

    reader = Reader(rating_scale=(0.5, 5))  # rating range
    data = Dataset.load_from_df(df=rating, reader=reader)
    # the 'rating' dataframe is read with the reader's rating scale (0.5 to 5)
    #print(data)
    
    # 3. train/test set
    train = data.build_full_trainset()  # training set
    test = train.build_testset()  # test set


    # 4. build the model
    option = {'name': 'pearson'}
    model = surprise.KNNBaseline(sim_options=option)
    model.fit(train)  # fit the model

    # 5. target user_id
    #user_id = 1  # recommendation target
    item_ids = range(0, 2106)  # placeid range
    actual_rating = 0  # rating placeholder
    
    predict_result = []
    
    for item_id in item_ids:
        if actual_rating not in tab:
            actual_rating = 0
            a = model.predict(user_id, item_id, actual_rating)
            predict_result.append(a)
    
    ddff = pd.DataFrame(predict_result)
    #print(ddff)
    
    # top 5 recommended destinations for the user
    result = ddff.sort_values(by='est', ascending=False)[:5]
    
    # print('cal knn', result, type(result))
    


    return result
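# Note: an alternative sketch (not from the original code): instead of looping over a hard-coded
# item_id range, score only the unvisited places via build_anti_testset(), as the other examples
# in this file do. `train`, `model` and `user_id` refer to the objects built inside Cal_Knn.
def recommend_unvisited(train, model, user_id, n=5):
    anti_testset = train.build_anti_testset()
    predictions = model.test([t for t in anti_testset if t[0] == user_id])
    ddff = pd.DataFrame(predictions)
    return ddff.sort_values(by='est', ascending=False)[:n]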
Example #7
0
import pandas as pd

from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise import BaselineOnly
from surprise import Reader
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from surprise.model_selection import cross_validate

# Use movielens-100K
# data = Dataset.load_builtin('ml-100k')


df = pd.read_csv("tr_mini_1.csv")
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['user_id', 'business_id', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=.15)

pred = Dataset.load_from_df(pd.read_csv("tr_mini_2.csv")[['user_id', 'business_id']], reader)

# reader = Reader(line_format='user item rating', sep=',', rating_scale=(0, 5), skip_lines=1)
# data = Dataset.load_from_file('tr_mini_2.csv', reader=reader)

trainset, testset = train_test_split(data, test_size=.15)

print("About to start")
# ----- SVD ----- #

param_grid = {'n_factors' : [160, 200, 250], 'n_epochs' : [70, 90, 110], 'lr_all': [0.003, 0.005],
              'reg_all': [0.2]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3, joblib_verbose = 2, n_jobs=7)
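# Note: the snippet above stops after constructing the grid search; a minimal sketch (not part
# of the original script) of the usual next steps, matching the GridSearchCV pattern used in
# Example #9 below.
gs.fit(data)
print(gs.best_score['rmse'])   # best cross-validated RMSE
print(gs.best_params['rmse'])  # parameter combination that achieved it

# refit the best configuration on the train/test split built above
algo = SVD(**gs.best_params['rmse'])
algo.fit(trainset)
accuracy.rmse(algo.test(testset))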
Example #8
0
import os

import pandas as pd
from surprise import BaselineOnly, Dataset, Reader, accuracy
from surprise.model_selection import cross_validate, train_test_split

from steven.steven_baselines import MeanOfMeans

FILE_DIRECTORY = os.path.split(os.path.realpath(__file__))[0]
DATA_DIRECTORY = os.path.join(
    os.path.split(FILE_DIRECTORY)[0], 'data', 'movies')

if __name__ == "__main__":
    # Read data
    df = pd.read_csv(os.path.join(DATA_DIRECTORY, 'ratings.csv'))

    # Drop unneeded column 'timestamp'
    df.drop('timestamp', axis=1, inplace=True)

    # Load the data into the surprise format
    reader = Reader()
    data = Dataset.load_from_df(df, reader=reader)

    # Train ALS model
    print('Using ALS')
    bsl_options = {'method': 'als', 'n_epochs': 5, 'reg_u': 12, 'reg_i': 5}
    trainset, testset = train_test_split(data, test_size=0.25)
    algo = BaselineOnly(bsl_options=bsl_options)
    predictions = algo.fit(trainset).test(testset)

    # Get the RMSE of our predictions
    rmse = accuracy.rmse(predictions)

    # Get the cross-validated RMSE of our predictions
    cv_results = cross_validate(algo, data)
    cv_rmse = cv_results['test_rmse'].mean()
    print(f'CV RMSE: {cv_rmse}')
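    # Note: a small addendum (not in the original script): once fitted, the same BaselineOnly
    # model can also score an individual (user, item) pair; the raw ids below are placeholders.
    single_pred = algo.predict(uid=1, iid=31, r_ui=2.5, verbose=True)
    print(f'Predicted rating: {single_pred.est}')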
Example #9
0
def get_EDA_page():
    if not os.path.isfile("Data/NetflixRatings.csv"):
        startTime = datetime.now()
        data = open("Data/NetflixRatings.csv", mode="w")
        files = ['Data/combined_data_4.txt']
        for file in files:
            print("Reading from file: " + str(file) + "...")
            with open(file) as f:
                for line in f:
                    line = line.strip()
                    if line.endswith(":"):
                        movieID = line.replace(":", "")
                    else:
                        row = []
                        row = [
                            x for x in line.split(",")
                        ]  #custID, rating and date are separated by comma
                        row.insert(0, movieID)
                        data.write(",".join(row))
                        data.write("\n")
            print("Reading of file: " + str(file) + " is completed\n")
        data.close()
        print("Total time taken for execution of this code = " +
              str(datetime.now() - startTime))

    else:
        print("data is already loaded")

    # creating data frame from our output csv file.
    if not os.path.isfile("Data/NetflixData.pkl"):
        startTime = datetime.now()
        Final_Data = pd.read_csv(
            "Data/NetflixRatings.csv",
            sep=",",
            names=["MovieID", "CustID", "Ratings", "Date"])
        Final_Data["Date"] = pd.to_datetime(Final_Data["Date"])
        Final_Data.sort_values(by="Date", inplace=True)
        print("Time taken for execution of above code = " +
              str(datetime.now() - startTime))
        st.write("data frame created")
    else:
        print("data frame already present")

    # storing pandas dataframe as a picklefile for later use
    if not os.path.isfile("Data/NetflixData.pkl"):
        Final_Data.to_pickle("Data/NetflixData.pkl")
        st.write("pkl created")
    else:
        Final_Data = pd.read_pickle("Data/NetflixData.pkl")
        print("pkl already present")

    if st.checkbox("Show Final_Data"):
        st.write(Final_Data)
        if st.checkbox("Show all the column Names"):
            st.write(Final_Data.columns)

########
    if st.checkbox("Show size of dataset"):
        if st.checkbox("Show row size"):
            st.write(Final_Data.shape[0])
        if st.checkbox("Show column size"):
            st.write(Final_Data.shape[1])
        if st.checkbox("Show complete dataset size"):
            st.write(Final_Data.shape)
        if st.checkbox("Show desc of Ratings in final data"):
            Final_Data.describe()["Ratings"]

    st.write("**displaying final dataset header lines using area chart**")
    st.area_chart(Final_Data)

    print("Number of NaN values = " + str(Final_Data.isnull().sum()))

    duplicates = Final_Data.duplicated(["MovieID", "CustID", "Ratings"])
    print("Number of duplicate rows = " + str(duplicates.sum()))

    #####
    if st.checkbox("Show unique customer & movieId in Total Data:"):
        st.write("Total number of movie ratings = ", str(Final_Data.shape[0]))
        st.write("Number of unique users = ",
                 str(len(np.unique(Final_Data["CustID"]))))
        st.write("Number of unique movies = ",
                 str(len(np.unique(Final_Data["MovieID"]))))
######### creating pkl file
    if not os.path.isfile("Data/TrainData.pkl"):
        Final_Data.iloc[:int(Final_Data.shape[0] *
                             0.80)].to_pickle("Data/TrainData.pkl")
        Train_Data = pd.read_pickle("Data/TrainData.pkl")
        Train_Data.reset_index(drop=True, inplace=True)
    else:
        Train_Data = pd.read_pickle("Data/TrainData.pkl")
        Train_Data.reset_index(drop=True, inplace=True)

    if not os.path.isfile("Data/TestData.pkl"):
        Final_Data.iloc[int(Final_Data.shape[0] *
                            0.80):].to_pickle("Data/TestData.pkl")
        Test_Data = pd.read_pickle("Data/TestData.pkl")
        Test_Data.reset_index(drop=True, inplace=True)
    else:
        Test_Data = pd.read_pickle("Data/TestData.pkl")
        Test_Data.reset_index(drop=True, inplace=True)
#########

    if st.checkbox("Showing dataset of Train_Data & Test_Data"):
        st.area_chart(Train_Data)
        st.area_chart(Test_Data)

    if st.checkbox("Show unique customer & movieId in Train DataSet:"):
        st.write("Total number of movie ratings in train data = ",
                 str(Train_Data.shape[0]))
        st.write("Number of unique users in train data = ",
                 str(len(np.unique(Train_Data["CustID"]))))
        st.write("Number of unique movies in train data = ",
                 str(len(np.unique(Train_Data["MovieID"]))))
        st.write("Highest value of a User ID = ",
                 str(max(Train_Data["CustID"].values)))
        st.write("Highest value of a Movie ID =  ",
                 str(max(Train_Data["MovieID"].values)))

    if st.checkbox("Show unique customer & movieId in Test DataSet:"):
        st.write("Total number of movie ratings in Test data = ",
                 str(Test_Data.shape[0]))
        st.write("Number of unique users in Test data = ",
                 str(len(np.unique(Test_Data["CustID"]))))
        st.write("Number of unique movies in trTestain data = ",
                 str(len(np.unique(Test_Data["MovieID"]))))
        st.write("Highest value of a User ID = ",
                 str(max(Test_Data["CustID"].values)))
        st.write("Highest value of a Movie ID =  ",
                 str(max(Test_Data["MovieID"].values)))

    ##########

    def changingLabels(number):
        return str(number / 10**6) + "M"

    plt.figure(figsize=(12, 8))
    ax = sns.countplot(x="Ratings", data=Train_Data)

    ax.set_yticklabels([changingLabels(num) for num in ax.get_yticks()])

    plt.tick_params(labelsize=15)
    plt.title("Distribution of Ratings in train data", fontsize=20)
    plt.xlabel("Ratings", fontsize=20)
    plt.ylabel("Number of Ratings(Millions)", fontsize=20)
    st.pyplot()
    st.write(
        "This graph shows the **Distribution of Ratings** provided by the audience across the train data :smile: "
    )

    Train_Data["DayOfWeek"] = Train_Data.Date.dt.weekday_name
    plt.figure(figsize=(10, 8))
    ax = Train_Data.resample("M", on="Date")["Ratings"].count().plot()
    ax.set_yticklabels([changingLabels(num) for num in ax.get_yticks()])
    ax.set_title("Number of Ratings per Month", fontsize=20)
    ax.set_xlabel("Date", fontsize=20)
    ax.set_ylabel("Number of Ratings Per Month(Millions)", fontsize=20)
    plt.tick_params(labelsize=15)
    st.pyplot()
    st.write(
        "This graph represents the **Number of Ratings Per Month**, i.e. counts of ratings grouped by month :smile:"
    )

    st.write("**Analysis of Ratings given by user**")
    no_of_rated_movies_per_user = Train_Data.groupby(
        by="CustID")["Ratings"].count().sort_values(ascending=False)
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 7))
    sns.kdeplot(no_of_rated_movies_per_user.values, shade=True, ax=axes[0])
    axes[0].set_title("Fig1", fontsize=18)
    axes[0].set_xlabel("Number of Ratings by user", fontsize=18)
    axes[0].tick_params(labelsize=15)
    sns.kdeplot(no_of_rated_movies_per_user.values,
                shade=True,
                cumulative=True,
                ax=axes[1])
    axes[1].set_title("Fig2", fontsize=18)
    axes[1].set_xlabel("Number of Ratings by user", fontsize=18)
    axes[1].tick_params(labelsize=15)
    fig.subplots_adjust(wspace=2)
    plt.tight_layout()
    st.pyplot()

    ####
    st.write(
        "Fig1 above shows that almost all of the users give very few ratings; there are very **few users whose rating count is high**. Similarly, fig2 shows that **almost 99% of users give very few ratings**"
    )
    quantiles = no_of_rated_movies_per_user.quantile(np.arange(0, 1.01, 0.01))
    fig = plt.figure(figsize=(10, 6))
    axes = fig.add_axes([0.1, 0.1, 1, 1])
    axes.set_title("Quantile values of Ratings Per User", fontsize=20)
    axes.set_xlabel("Quantiles", fontsize=20)
    axes.set_ylabel("Ratings Per User", fontsize=20)
    axes.plot(quantiles)
    plt.scatter(x=quantiles.index[::5],
                y=quantiles.values[::5],
                c="blue",
                s=70,
                label="quantiles with 0.05 intervals")
    plt.scatter(x=quantiles.index[::25],
                y=quantiles.values[::25],
                c="red",
                s=70,
                label="quantiles with 0.25 intervals")
    plt.legend(loc='upper left', fontsize=20)
    for x, y in zip(quantiles.index[::25], quantiles.values[::25]):
        plt.annotate(s='({},{})'.format(x, y),
                     xy=(x, y),
                     fontweight='bold',
                     fontsize=16,
                     xytext=(x - 0.05, y + 180))
    axes.tick_params(labelsize=15)
    st.pyplot()

    st.write("this graph shows the Quantile values of Ratings Per User")
    st.write("**Analysis of Ratings Per Movie** :smile:")
    no_of_ratings_per_movie = Train_Data.groupby(
        by="MovieID")["Ratings"].count().sort_values(ascending=False)
    fig = plt.figure(figsize=(12, 6))
    axes = fig.add_axes([0.1, 0.1, 1, 1])
    plt.title("Number of Ratings Per Movie", fontsize=20)
    plt.xlabel("Movie", fontsize=20)
    plt.ylabel("Count of Ratings", fontsize=20)
    plt.plot(no_of_ratings_per_movie.values)
    plt.tick_params(labelsize=15)
    axes.set_xticklabels([])
    st.pyplot()

    st.write(
        "This graph shows the number of ratings each movie received from the audience, which clearly shows that some movies are very popular and were rated by many more users compared to other movies "
    )
    st.write("**Analysis of Movie Ratings on Day of Week** :smile:")
    fig = plt.figure(figsize=(12, 8))
    axes = sns.countplot(x="DayOfWeek", data=Train_Data)
    axes.set_title("Day of week VS Number of Ratings", fontsize=20)
    axes.set_xlabel("Day of Week", fontsize=20)
    axes.set_ylabel("Number of Ratings", fontsize=20)
    axes.set_yticklabels([changingLabels(num) for num in axes.get_yticks()])
    axes.tick_params(labelsize=15)
    st.pyplot()

    st.write(
        "This graph shows the analysis of movie ratings by day of week as a bar graph; it is clearly visible that on Saturday & Sunday users are least interested in providing ratings "
    )
    fig = plt.figure(figsize=(12, 8))
    axes = sns.boxplot(x="DayOfWeek", y="Ratings", data=Train_Data)
    axes.set_title("Day of week VS Number of Ratings", fontsize=20)
    axes.set_xlabel("Day of Week", fontsize=20)
    axes.set_ylabel("Number of Ratings", fontsize=20)
    axes.tick_params(labelsize=15)
    st.pyplot()

    st.write(
        "This graph shows the analysis of movie ratings by day of week as a box plot; it is clearly visible that on Saturday & Sunday users are least interested in providing ratings "
    )
    average_ratings_dayofweek = Train_Data.groupby(
        by="DayOfWeek")["Ratings"].mean()
    st.write("**Average Ratings on Day of Weeks**")
    st.write(average_ratings_dayofweek)
    st.write(
        "**The average ratings per day of week, represented in graphical format** "
    )
    st.area_chart(average_ratings_dayofweek)
    st.write(
        "This graph shows that the average rating mostly lies between 3 and 4."
    )
    st.write("**Distribution of Movie ratings amoung Users**")
    plt.scatter(Test_Data["CustID"], Test_Data["MovieID"])
    st.pyplot()

    ####################Creating USER-ITEM sparse matrix from data frame

    startTime = datetime.now()
    print("Creating USER_ITEM sparse matrix for train Data")
    if os.path.isfile("Data/TrainUISparseData.npz"):
        print(
            "Sparse Data is already present in your disk, no need to create further. Loading Sparse Matrix"
        )
        TrainUISparseData = sparse.load_npz("Data/TrainUISparseData.npz")
        print("Shape of Train Sparse matrix = " + str(TrainUISparseData.shape))

    else:
        print("We are creating sparse data")
        TrainUISparseData = sparse.csr_matrix(
            (Train_Data.Ratings, (Train_Data.CustID, Train_Data.MovieID)))
        print("Creation done. Shape of sparse matrix = " +
              str(TrainUISparseData.shape))
        print("Saving it into disk for furthur usage.")
        sparse.save_npz("Data/TrainUISparseData.npz", TrainUISparseData)
        print("Done\n")

    print(datetime.now() - startTime)

    ###############Creating USER-ITEM sparse matrix from data frame for test data

    startTime = datetime.now()
    print("Creating USER_ITEM sparse matrix for test Data")
    if os.path.isfile("Data/TestUISparseData.npz"):
        print(
            "Sparse Data is already present in your disk, no need to create further. Loading Sparse Matrix"
        )
        TestUISparseData = sparse.load_npz("Data/TestUISparseData.npz")
        print("Shape of Test Sparse Matrix = " + str(TestUISparseData.shape))
    else:
        print("We are creating sparse data")
        TestUISparseData = sparse.csr_matrix(
            (Test_Data.Ratings, (Test_Data.CustID, Test_Data.MovieID)))
        print("Creation done. Shape of sparse matrix = " +
              str(TestUISparseData.shape))
        print("Saving it into disk for furthur usage.")
        sparse.save_npz("Data/TestUISparseData.npz", TestUISparseData)
        print("Done\n")

    print(datetime.now() - startTime)

    rows, cols = TrainUISparseData.shape
    presentElements = TrainUISparseData.count_nonzero()

    print("Sparsity Of Train matrix : {}% ".format(
        (1 - (presentElements / (rows * cols))) * 100))

    rows, cols = TestUISparseData.shape
    presentElements = TestUISparseData.count_nonzero()

    print("Sparsity Of Test matrix : {}% ".format(
        (1 - (presentElements / (rows * cols))) * 100))

    #################Finding Global average of all movie ratings, Average rating per user, and Average rating per movie

    def getAverageRatings(sparseMatrix, if_user):
        ax = 1 if if_user else 0
        #axis = 1 means rows and axis = 0 means columns
        sumOfRatings = sparseMatrix.sum(
            axis=ax
        ).A1  #this will give an array of sum of all the ratings of user if axis = 1 else
        #sum of all the ratings of movies if axis = 0
        noOfRatings = (sparseMatrix != 0).sum(
            axis=ax
        ).A1  #this will give a boolean True or False array, and True means 1 and False
        #means 0, and further we are summing it to get the count of all the non-zero cells means length of non-zero cells
        rows, cols = sparseMatrix.shape
        averageRatings = {
            i: sumOfRatings[i] / noOfRatings[i]
            for i in range(rows if if_user else cols) if noOfRatings[i] != 0
        }
        return averageRatings

    Global_Average_Rating = TrainUISparseData.sum(
    ) / TrainUISparseData.count_nonzero()
    print("Global Average Rating {}".format(Global_Average_Rating))

    AvgRatingUser = getAverageRatings(TrainUISparseData, True)

    #############Machine Learning Models

    def get_sample_sparse_matrix(sparseMatrix, n_users, n_movies):
        startTime = datetime.now()
        users, movies, ratings = sparse.find(sparseMatrix)
        uniq_users = np.unique(users)
        uniq_movies = np.unique(movies)
        np.random.seed(
            15
        )  #fixed seed so the same random sample is drawn every time
        userS = np.random.choice(uniq_users, n_users, replace=True)
        movieS = np.random.choice(uniq_movies, n_movies, replace=True)
        mask = np.logical_and(np.isin(users, userS), np.isin(movies, movieS))
        sparse_sample = sparse.csr_matrix(
            (ratings[mask], (users[mask], movies[mask])),
            shape=(max(userS) + 1, max(movieS) + 1))
        print("Sparse Matrix creation done. Saving it for later use.")
        sparse.save_npz(path, sparse_sample)
        print("Done")
        print("Shape of Sparse Sampled Matrix = " + str(sparse_sample.shape))

        print(datetime.now() - startTime)
        return sparse_sample

    ####Creating Sample Sparse Matrix for Train Data

    path = "Data/TrainUISparseData_Sample.npz"
    if not os.path.isfile(path):
        print(
            "Sample sparse matrix is not present in the disk. We are creating it..."
        )
        train_sample_sparse = get_sample_sparse_matrix(TrainUISparseData, 4000,
                                                       400)
    else:
        print("File is already present in the disk. Loading the file...")
        train_sample_sparse = sparse.load_npz(path)
        print("File loading done.")
        print("Shape of Train Sample Sparse Matrix = " +
              str(train_sample_sparse.shape))

    ##########Creating Sample Sparse Matrix for Test Data

    path = "Data/TestUISparseData_Sample.npz"
    if not os.path.isfile(path):
        print(
            "Sample sparse matrix is not present in the disk. We are creating it..."
        )
        test_sample_sparse = get_sample_sparse_matrix(TestUISparseData, 2000,
                                                      200)
    else:
        print("File is already present in the disk. Loading the file...")
        test_sample_sparse = sparse.load_npz(path)
        print("File loading done.")
        print("Shape of Test Sample Sparse Matrix = " +
              str(test_sample_sparse.shape))
    #####print("Global average of all movies ratings in Train Sample Sparse is {}".format(np.round((train_sample_sparse.sum()/train_sample_sparse.count_nonzero()), 2)))
    globalAvgMovies = getAverageRatings(train_sample_sparse, False)
    globalAvgUsers = getAverageRatings(train_sample_sparse, True)

    #######   Featurizing data for regression problem
    ###### Featurizing Train Data

    sample_train_users, sample_train_movies, sample_train_ratings = sparse.find(
        train_sample_sparse)

    if os.path.isfile("Data/Train_Regression.csv"):
        print(
            "File is already present in your disk. You do not have to prepare it again."
        )
    else:
        startTime = datetime.now()
        print("Preparing Train csv file for {} rows".format(
            len(sample_train_ratings)))
        with open("Data/Train_Regression.csv", mode="w") as data:
            count = 0
            for user, movie, rating in zip(sample_train_users,
                                           sample_train_movies,
                                           sample_train_ratings):
                row = list()
                row.append(user)  #appending user ID
                row.append(movie)  #appending movie ID
                row.append(train_sample_sparse.sum() /
                           train_sample_sparse.count_nonzero()
                           )  #appending global average rating

                #----------------------------------Ratings given to "movie" by top 5 similar users with "user"--------------------#
                similar_users = cosine_similarity(train_sample_sparse[user],
                                                  train_sample_sparse).ravel()
                similar_users_indices = np.argsort(-similar_users)[1:]
                similar_users_ratings = train_sample_sparse[
                    similar_users_indices, movie].toarray().ravel()
                top_similar_user_ratings = list(
                    similar_users_ratings[similar_users_ratings != 0][:5])
                top_similar_user_ratings.extend(
                    [globalAvgMovies[movie]] *
                    (5 - len(top_similar_user_ratings)))
                #above line means that if top 5 ratings are not available then rest of the ratings will be filled by "movie" average
                #rating. Let say only 3 out of 5 ratings are available then rest 2 will be "movie" average rating.
                row.extend(top_similar_user_ratings)

                #----------------------------------Ratings given by "user" to top 5 similar movies with "movie"------------------#
                similar_movies = cosine_similarity(train_sample_sparse[:, movie].T,
                                                   train_sample_sparse.T).ravel()
                similar_movies_indices = np.argsort(-similar_movies)[1:]
                similar_movies_ratings = train_sample_sparse[
                    user, similar_movies_indices].toarray().ravel()
                top_similar_movie_ratings = list(
                    similar_movies_ratings[similar_movies_ratings != 0][:5])
                top_similar_movie_ratings.extend(
                    [globalAvgUsers[user]] *
                    (5 - len(top_similar_movie_ratings)))
                #above line means that if top 5 ratings are not available then rest of the ratings will be filled by "user" average
                #rating. Let say only 3 out of 5 ratings are available then rest 2 will be "user" average rating.
                row.extend(top_similar_movie_ratings)

                #----------------------------------Appending "user" average, "movie" average & rating of "user""movie"-----------#
                row.append(globalAvgUsers[user])
                row.append(globalAvgMovies[movie])
                row.append(rating)

                #-----------------------------------Converting rows and appending them as comma separated values to csv file------#
                data.write(",".join(map(str, row)))
                data.write("\n")
                count += 1
                if count % 2000 == 0:
                    print("Done for {}. Time elapsed: {}".format(
                        count, (datetime.now() - startTime)))

        print("Total Time for {} rows = {}".format(
            len(sample_train_ratings), (datetime.now() - startTime)))
################
    Train_Reg = pd.read_csv("Data/Train_Regression.csv",
                            names=[
                                "User_ID", "Movie_ID", "Global_Average",
                                "SUR1", "SUR2", "SUR3", "SUR4", "SUR5", "SMR1",
                                "SMR2", "SMR3", "SMR4", "SMR5", "User_Average",
                                "Movie_Average", "Rating"
                            ])
    #Train_Reg.head()
    ########    Featurizing Test Data    #####################

    sample_test_users, sample_test_movies, sample_test_ratings = sparse.find(
        test_sample_sparse)
    if os.path.isfile("Data/Test_Regression.csv"):
        print(
            "File is already present in your disk. You do not have to prepare it again."
        )
    else:
        startTime = datetime.now()
        print("Preparing Test csv file for {} rows".format(
            len(sample_test_ratings)))
        with open("Data/Test_Regression.csv", mode="w") as data:
            count = 0
            for user, movie, rating in zip(sample_test_users,
                                           sample_test_movies,
                                           sample_test_ratings):
                row = list()
                row.append(user)  #appending user ID
                row.append(movie)  #appending movie ID
                row.append(
                    train_sample_sparse.sum() /
                    train_sample_sparse.count_nonzero()
                )  #appending global average rating
                #-----------------------------Ratings given to "movie" by top 5 similar users with "user"-------------------------#
                try:
                    similar_users = cosine_similarity(
                        train_sample_sparse[user],
                        train_sample_sparse).ravel()
                    similar_users_indices = np.argsort(-similar_users)[1:]
                    similar_users_ratings = train_sample_sparse[
                        similar_users_indices, movie].toarray().ravel()
                    top_similar_user_ratings = list(
                        similar_users_ratings[similar_users_ratings != 0][:5])
                    top_similar_user_ratings.extend(
                        [globalAvgMovies[movie]] *
                        (5 - len(top_similar_user_ratings)))
                    #above line means that if top 5 ratings are not available then rest of the ratings will be filled by "movie"
                    #average rating. Let say only 3 out of 5 ratings are available then rest 2 will be "movie" average rating.
                    row.extend(top_similar_user_ratings)
                #########Cold Start Problem, for a new user or a new movie#########
                except (IndexError, KeyError):
                    global_average_train_rating = [
                        train_sample_sparse.sum() /
                        train_sample_sparse.count_nonzero()
                    ] * 5
                    row.extend(global_average_train_rating)
                except:
                    raise

#-----------------------------Ratings given by "user" to top 5 similar movies with "movie"-----------------------#
                try:
                    similar_movies = cosine_similarity(
                        train_sample_sparse[:, movie].T,
                        train_sample_sparse.T).ravel()
                    similar_movies_indices = np.argsort(-similar_movies)[1:]
                    similar_movies_ratings = train_sample_sparse[
                        user, similar_movies_indices].toarray().ravel()
                    top_similar_movie_ratings = list(similar_movies_ratings[
                        similar_movies_ratings != 0][:5])
                    top_similar_movie_ratings.extend(
                        [globalAvgUsers[user]] *
                        (5 - len(top_similar_movie_ratings)))
                    #above line means that if top 5 ratings are not available then rest of the ratings will be filled by "user"
                    #average rating. Let say only 3 out of 5 ratings are available then rest 2 will be "user" average rating.
                    row.extend(top_similar_movie_ratings)
                #########Cold Start Problem, for a new user or a new movie#########
                except (IndexError, KeyError):
                    global_average_train_rating = [
                        train_sample_sparse.sum() /
                        train_sample_sparse.count_nonzero()
                    ] * 5
                    row.extend(global_average_train_rating)
                except:
                    raise

#-----------------------------Appending "user" average, "movie" average & rating of "user""movie"----------------#
                try:
                    row.append(globalAvgUsers[user])
                except (KeyError):
                    global_average_train_rating = train_sample_sparse.sum(
                    ) / train_sample_sparse.count_nonzero()
                    row.append(global_average_train_rating)
                except:
                    raise

                try:
                    row.append(globalAvgMovies[movie])
                except (KeyError):
                    global_average_train_rating = train_sample_sparse.sum(
                    ) / train_sample_sparse.count_nonzero()
                    row.append(global_average_train_rating)
                except:
                    raise

                row.append(rating)

                #------------------------------Converting rows and appending them as comma separated values to csv file-----------#
                data.write(",".join(map(str, row)))
                data.write("\n")

                count += 1
                if count % 100 == 0:
                    print("Done for {}. Time elapsed: {}".format(
                        count, (datetime.now() - startTime)))

        print("Total Time for {} rows = {}".format(
            len(sample_test_ratings), (datetime.now() - startTime)))

    Test_Reg = pd.read_csv("Data/Test_Regression.csv",
                           names=[
                               "User_ID", "Movie_ID", "Global_Average", "SUR1",
                               "SUR2", "SUR3", "SUR4", "SUR5", "SMR1", "SMR2",
                               "SMR3", "SMR4", "SMR5", "User_Average",
                               "Movie_Average", "Rating"
                           ])
    #Test_Reg.head()

    ##
    ###### Transforming Data for Surprise Models
    Train_Reg[['User_ID', 'Movie_ID', 'Rating']].head(5)
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(Train_Reg[['User_ID', 'Movie_ID', 'Rating']],
                                reader)
    trainset = data.build_full_trainset()

    testset = list(
        zip(Test_Reg["User_ID"].values, Test_Reg["Movie_ID"].values,
            Test_Reg["Rating"].values))

    error_table = pd.DataFrame(columns=[
        "Model", "Train RMSE", "Train MAPE", "Test RMSE", "Test MAPE"
    ])
    model_train_evaluation = dict()
    model_test_evaluation = dict()

    def make_table(model_name, rmse_train, mape_train, rmse_test, mape_test):
        global error_table
        #All variable assignments in a function store the value in the local symbol table; whereas variable references first look
        #in the local symbol table, then in the global symbol table, and then in the table of built-in names. Thus, global variables
        #cannot be directly assigned a value within a function (unless named in a global statement),
        #although they may be referenced.
        error_table = error_table.append(
            pd.DataFrame(
                [[model_name, rmse_train, mape_train, rmse_test, mape_test]],
                columns=[
                    "Model", "Train RMSE", "Train MAPE", "Test RMSE",
                    "Test MAPE"
                ]))
        error_table.reset_index(drop=True, inplace=True)

    ###### Utility Functions for Regression Models
    def error_metrics(y_true, y_pred):
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        mape = np.mean(abs((y_true - y_pred) / y_true)) * 100
        return rmse, mape

    def train_test_xgboost(x_train, x_test, y_train, y_test, model_name):
        startTime = datetime.now()
        train_result = dict()
        test_result = dict()

        clf = xgb.XGBRegressor(n_estimators=100, silent=False, n_jobs=10)
        clf.fit(x_train, y_train)

        print("-" * 50)
        print("TRAIN DATA")
        y_pred_train = clf.predict(x_train)
        rmse_train, mape_train = error_metrics(y_train, y_pred_train)
        print("RMSE = {}".format(rmse_train))
        print("MAPE = {}".format(mape_train))
        print("-" * 50)
        train_result = {
            "RMSE": rmse_train,
            "MAPE": mape_train,
            "Prediction": y_pred_train
        }

        print("TEST DATA")
        y_pred_test = clf.predict(x_test)
        rmse_test, mape_test = error_metrics(y_test, y_pred_test)
        print("RMSE = {}".format(rmse_test))
        print("MAPE = {}".format(mape_test))
        print("-" * 50)
        test_result = {
            "RMSE": rmse_test,
            "MAPE": mape_test,
            "Prediction": y_pred_test
        }

        print("Time Taken = " + str(datetime.now() - startTime))

        plot_importance(xgb, clf)

        make_table(model_name, rmse_train, mape_train, rmse_test, mape_test)

        return train_result, test_result

    #######################
    def plot_importance(model, clf):
        fig = plt.figure(figsize=(4, 3))
        ax = fig.add_axes([0, 0, 1, 1])
        model.plot_importance(clf, ax=ax, height=0.3)
        ax.set_xlabel("F Score", fontsize=20)
        ax.set_ylabel("Features", fontsize=20)
        ax.set_title("Feature Importance", fontsize=20)
        #ax.set_tick_params(labelsize = 15)
        st.pyplot(fig=fig)
        #plt.show()

    #st.plotly_chart(fig,use_container_width=True)

    ###### Utility Functions for Surprise Models

    def get_ratings(predictions):
        actual = np.array([pred.r_ui for pred in predictions])
        predicted = np.array([pred.est for pred in predictions])
        return actual, predicted

    #in surprise, the prediction for every data point is returned as a Prediction object like this:
    #"user: 196        item: 302        r_ui = 4.00   est = 4.06   {'actual_k': 40, 'was_impossible': False}"
    #In this object, "r_ui" is the actual rating and "est" is the predicted rating
    def get_error(predictions):
        actual, predicted = get_ratings(predictions)
        rmse = np.sqrt(mean_squared_error(actual, predicted))
        mape = np.mean(abs((actual - predicted) / actual)) * 100
        return rmse, mape

    my_seed = 15
    random.seed(my_seed)
    np.random.seed(my_seed)

    def run_surprise(algo, trainset, testset, model_name):
        startTime = datetime.now()

        train = dict()
        test = dict()

        algo.fit(trainset)
        #You can check out above function at "https://surprise.readthedocs.io/en/stable/getting_started.html" in
        #"Train-test split and the fit() method" section

        #-----------------Evaluating Train Data------------------#
        print("-" * 50)
        print("TRAIN DATA")
        train_pred = algo.test(trainset.build_testset())
        #You can check out "algo.test()" function at "https://surprise.readthedocs.io/en/stable/getting_started.html" in
        #"Train-test split and the fit() method" section
        #You can check out "trainset.build_testset()" function at "https://surprise.readthedocs.io/en/stable/FAQ.html#can-i-use-my-own-dataset-with-surprise-and-can-it-be-a-pandas-dataframe" in
        #"How to get accuracy measures on the training set" section
        train_actual, train_predicted = get_ratings(train_pred)
        train_rmse, train_mape = get_error(train_pred)
        print("RMSE = {}".format(train_rmse))
        print("MAPE = {}".format(train_mape))
        print("-" * 50)
        train = {
            "RMSE": train_rmse,
            "MAPE": train_mape,
            "Prediction": train_predicted
        }

        #-----------------Evaluating Test Data------------------#
        print("TEST DATA")
        test_pred = algo.test(testset)
        #You can check out "algo.test()" function at "https://surprise.readthedocs.io/en/stable/getting_started.html" in
        #"Train-test split and the fit() method" section
        test_actual, test_predicted = get_ratings(test_pred)
        test_rmse, test_mape = get_error(test_pred)
        print("RMSE = {}".format(test_rmse))
        print("MAPE = {}".format(test_mape))
        print("-" * 50)
        test = {
            "RMSE": test_rmse,
            "MAPE": test_mape,
            "Prediction": test_predicted
        }

        print("Time Taken = " + str(datetime.now() - startTime))

        make_table(model_name, train_rmse, train_mape, test_rmse, test_mape)

        return train, test

    ##
    ################## XGBoost 13 Features###################
    x_train = Train_Reg.drop(["User_ID", "Movie_ID", "Rating"], axis=1)

    x_test = Test_Reg.drop(["User_ID", "Movie_ID", "Rating"], axis=1)

    y_train = Train_Reg["Rating"]

    y_test = Test_Reg["Rating"]

    train_result, test_result = train_test_xgboost(x_train, x_test, y_train,
                                                   y_test, "XGBoost_13")

    model_train_evaluation["XGBoost_13"] = train_result
    model_test_evaluation["XGBoost_13"] = test_result

    ####################################################
    ###################   2. Surprise BaselineOnly Model    #################################
    bsl_options = {"method": "sgd", "learning_rate": 0.01, "n_epochs": 25}

    algo = BaselineOnly(bsl_options=bsl_options)
    #You can check the docs of above used functions at:https://surprise.readthedocs.io/en/stable/prediction_algorithms.html#baseline-estimates-configuration
    #at section "Baselines estimates configuration".

    train_result, test_result = run_surprise(algo, trainset, testset,
                                             "BaselineOnly")

    model_train_evaluation["BaselineOnly"] = train_result
    model_test_evaluation["BaselineOnly"] = test_result

    ############# 3. XGBoost 13 Features + Surprise BaselineOnly Model  ####################
    Train_Reg["BaselineOnly"] = model_train_evaluation["BaselineOnly"][
        "Prediction"]
    Test_Reg["BaselineOnly"] = model_test_evaluation["BaselineOnly"][
        "Prediction"]

    x_train = Train_Reg.drop(["User_ID", "Movie_ID", "Rating"], axis=1)

    x_test = Test_Reg.drop(["User_ID", "Movie_ID", "Rating"], axis=1)

    y_train = Train_Reg["Rating"]

    y_test = Test_Reg["Rating"]

    train_result, test_result = train_test_xgboost(x_train, x_test, y_train,
                                                   y_test, "XGB_BSL")

    model_train_evaluation["XGB_BSL"] = train_result
    model_test_evaluation["XGB_BSL"] = test_result

    ################### 4. Surprise KNN-Baseline with User-User and Item-Item Similarity     #########
    param_grid = {
        'sim_options': {
            'name': ["pearson_baseline"],
            "user_based": [True],
            "min_support": [2],
            "shrinkage": [60, 80, 80, 140]
        },
        'k': [5, 20, 40, 80]
    }

    gs = GridSearchCV(KNNBaseline, param_grid, measures=['rmse', 'mae'], cv=3)

    gs.fit(data)

    # best RMSE score
    #print(gs.best_score['rmse'])

    # combination of parameters that gave the best RMSE score
    #print(gs.best_params['rmse'])

    #######   Applying KNNBaseline User-User with best parameters    ########
    sim_options = {
        'name': 'pearson_baseline',
        'user_based': True,
        'min_support': 2,
        'shrinkage': gs.best_params['rmse']['sim_options']['shrinkage']
    }

    bsl_options = {'method': 'sgd'}

    algo = KNNBaseline(k=gs.best_params['rmse']['k'],
                       sim_options=sim_options,
                       bsl_options=bsl_options)

    train_result, test_result = run_surprise(algo, trainset, testset,
                                             "KNNBaseline_User")

    model_train_evaluation["KNNBaseline_User"] = train_result
    model_test_evaluation["KNNBaseline_User"] = test_result

    ##########  4.2 Surprise KNN-Baseline with Item-Item    #############

    param_grid = {
        'sim_options': {
            'name': ["pearson_baseline"],
            "user_based": [False],
            "min_support": [2],
            "shrinkage": [60, 80, 80, 140]
        },
        'k': [5, 20, 40, 80]
    }

    gs = GridSearchCV(KNNBaseline, param_grid, measures=['rmse', 'mae'], cv=3)

    gs.fit(data)

    # best RMSE score
    #print(gs.best_score['rmse'])

    # combination of parameters that gave the best RMSE score
    #print(gs.best_params['rmse'])

    ###############  Applying KNNBaseline Item-Item with best parameters  ######
    sim_options = {
        'name': 'pearson_baseline',
        'user_based': False,
        'min_support': 2,
        'shrinkage': gs.best_params['rmse']['sim_options']['shrinkage']
    }

    bsl_options = {'method': 'sgd'}

    algo = KNNBaseline(k=gs.best_params['rmse']['k'],
                       sim_options=sim_options,
                       bsl_options=bsl_options)

    train_result, test_result = run_surprise(algo, trainset, testset,
                                             "KNNBaseline_Item")

    model_train_evaluation["KNNBaseline_Item"] = train_result
    model_test_evaluation["KNNBaseline_Item"] = test_result
    ###########   5. XGBoost 13 Features + Surprise BaselineOnly + Surprise KNN Baseline    ###############
    Train_Reg["KNNBaseline_User"] = model_train_evaluation["KNNBaseline_User"][
        "Prediction"]
    Train_Reg["KNNBaseline_Item"] = model_train_evaluation["KNNBaseline_Item"][
        "Prediction"]

    Test_Reg["KNNBaseline_User"] = model_test_evaluation["KNNBaseline_User"][
        "Prediction"]
    Test_Reg["KNNBaseline_Item"] = model_test_evaluation["KNNBaseline_Item"][
        "Prediction"]

    #st.write(Train_Reg.head())

    x_train = Train_Reg.drop(["User_ID", "Movie_ID", "Rating"], axis=1)

    x_test = Test_Reg.drop(["User_ID", "Movie_ID", "Rating"], axis=1)

    y_train = Train_Reg["Rating"]

    y_test = Test_Reg["Rating"]

    train_result, test_result = train_test_xgboost(x_train, x_test, y_train,
                                                   y_test, "XGB_BSL_KNN")

    model_train_evaluation["XGB_BSL_KNN"] = train_result
    model_test_evaluation["XGB_BSL_KNN"] = test_result
    ##
    #########################################################################################################
    #################   6. Matrix Factorization SVD    ################################

    param_grid = {
        'n_factors': [5, 7, 10, 15, 20, 25, 35, 50, 70, 90]
    }  #here, n_factors is equivalent to the dimension 'd' when the n*m matrix 'A'
    #is factorized into 'b' and 'c', of dimensions n*d and m*d respectively.

    gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

    gs.fit(data)

    # best RMSE score
    #print(gs.best_score['rmse'])

    # combination of parameters that gave the best RMSE score
    #print(gs.best_params['rmse'])

    #############   Applying SVD with best parameters   #################

    algo = SVD(n_factors=gs.best_params['rmse']['n_factors'],
               biased=True,
               verbose=True)

    train_result, test_result = run_surprise(algo, trainset, testset, "SVD")

    model_train_evaluation["SVD"] = train_result
    model_test_evaluation["SVD"] = test_result

    #############   7. Matrix Factorization SVDpp with implicit feedback    ############

    param_grid = {
        'n_factors': [10, 30, 50, 80, 100],
        'lr_all': [0.002, 0.006, 0.018, 0.054, 0.10]
    }

    gs = GridSearchCV(SVDpp, param_grid, measures=['rmse', 'mae'], cv=3)

    gs.fit(data)

    # best RMSE score
    #print(gs.best_score['rmse'])

    # combination of parameters that gave the best RMSE score
    #print(gs.best_params['rmse'])

    ##########
    algo = SVDpp(n_factors=gs.best_params['rmse']['n_factors'],
                 lr_all=gs.best_params['rmse']["lr_all"],
                 verbose=True)

    train_result, test_result = run_surprise(algo, trainset, testset, "SVDpp")

    model_train_evaluation["SVDpp"] = train_result
    model_test_evaluation["SVDpp"] = test_result

    ############## 8. XGBoost 13 Features + Surprise BaselineOnly + Surprise KNN Baseline + SVD + SVDpp

    Train_Reg["SVD"] = model_train_evaluation["SVD"]["Prediction"]
    Train_Reg["SVDpp"] = model_train_evaluation["SVDpp"]["Prediction"]

    Test_Reg["SVD"] = model_test_evaluation["SVD"]["Prediction"]
    Test_Reg["SVDpp"] = model_test_evaluation["SVDpp"]["Prediction"]

    #######
    x_train = Train_Reg.drop(["User_ID", "Movie_ID", "Rating"], axis=1)

    x_test = Test_Reg.drop(["User_ID", "Movie_ID", "Rating"], axis=1)

    y_train = Train_Reg["Rating"]

    y_test = Test_Reg["Rating"]

    train_result, test_result = train_test_xgboost(x_train, x_test, y_train,
                                                   y_test, "XGB_BSL_KNN_MF")

    model_train_evaluation["XGB_BSL_KNN_MF"] = train_result
    model_test_evaluation["XGB_BSL_KNN_MF"] = test_result

    ########## 9. Surprise KNN Baseline + SVD + SVDpp  ###################

    x_train = Train_Reg[[
        "KNNBaseline_User", "KNNBaseline_Item", "SVD", "SVDpp"
    ]]

    x_test = Test_Reg[["KNNBaseline_User", "KNNBaseline_Item", "SVD", "SVDpp"]]

    y_train = Train_Reg["Rating"]

    y_test = Test_Reg["Rating"]

    train_result, test_result = train_test_xgboost(x_train, x_test, y_train,
                                                   y_test, "XGB_KNN_MF")

    model_train_evaluation["XGB_KNN_MF"] = train_result
    model_test_evaluation["XGB_KNN_MF"] = test_result

    ###########################

    error_table2 = error_table.drop(["Train MAPE", "Test MAPE"], axis=1)

    error_table2.plot(x="Model",
                      kind="bar",
                      figsize=(14, 8),
                      grid=True,
                      fontsize=15)
    plt.title("Train and Test RMSE and MAPE of all Models", fontsize=20)
    plt.ylabel("Error Values", fontsize=20)
    plt.legend(bbox_to_anchor=(1, 1), fontsize=20)
    st.pyplot()
    #plt.show()

    #########
    error_table.drop(["Train MAPE", "Test MAPE"],
                     axis=1).style.highlight_min(axis=0)
Example #10
0
                                        group_cols=['userCode', 'project_id'])
train = train.merge(gp, on=['userCode', 'project_id'], how='left')
# print(name_col, train.head(), (1, train[name_col].max()))

#drop duplicate
train = train.drop_duplicates(['userCode', 'project_id'], keep='last')
print(len(train), len(test))

#to scale
scale = train[name_col].max()
train[name_col] = train[name_col].apply(lambda x: x / scale)
print('max: ', train[name_col].max())

reader = Reader(rating_scale=(0, 1))

trainset = Dataset.load_from_df(train[["userCode", "project_id", name_col]],
                                reader)
trainset = trainset.build_full_trainset()
algo = SVD()
algo.fit(trainset)

# Then predict ratings for all pairs (u, i) that are NOT in the training set.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

accuracy.rmse(predictions, verbose=True)


def get_top_n(predictions, n=10):
    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the n highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

cursor = conn.cursor()

query = "SELECT [UserId],[RecipeId],[Rating] FROM [Licenta].[dbo].[Favorites]"

cursor.execute("Delete from Recommendations")
conn.commit()

import pandas as pd
from surprise import Dataset
from surprise import Reader

db_data = pd.read_sql(query, conn)
reader = Reader(rating_scale=(1, 5))

ratings = Dataset.load_from_df(db_data[["UserId", "RecipeId", "Rating"]],
                               reader)

import math


def distance(u1, u2, d):
    ssum = 0
    for r in d[u1]:
        if r in d[u2]:
            ssum += pow(d[u1][r] - d[u2][r], 2)
    if ssum == 0:
        return 0
    return math.sqrt(ssum)


def most_near(u, d, n=10):
import pandas as pd

path = '../Datasets/BookCrossings'
os.chdir(path)
trans = pd.read_csv('BX-Book-Ratings.csv',
                    sep=';',
                    error_bad_lines=False,
                    encoding="latin-1")
trans.columns = ['user', 'item', 'rating']
trans = trans[trans.rating != 0]

min_item_ratings = 10
popular_items = trans['item'].value_counts() >= min_item_ratings
popular_items = popular_items[popular_items].index.tolist()

min_user_ratings = 10
active_users = trans['user'].value_counts() >= min_user_ratings
active_users = active_users[active_users].index.tolist()

trans = trans[(trans['item'].isin(popular_items))
              & (trans['user'].isin(active_users))]
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(trans, reader)
trainset, testset = train_test_split(data, test_size=0.002)

sim_options = {'name': 'pearson', 'user_based': False}

algo = KNNBasic(sim_options=sim_options)
algo.fit(trainset)
preds = algo.test(testset)
accuracy.mae(preds)
Example #13
0
from surprise import SVD, SVDpp, KNNBasic, KNNWithMeans, KNNWithZScore, SlopeOne, BaselineOnly, NormalPredictor
from surprise import KNNWithMeans

r = pd.read_csv('ratings.csv')
tr = pd.read_csv('to_read.csv')
b = pd.read_csv('books.csv')
t = pd.read_csv('tags.csv')
bt = pd.read_csv('book_tags.csv')

r.head()
b.head()

# create a reader that takes the rating scale as a parameter
reader = Reader(rating_scale=(1, 5))
# use the load_from_df function to load our book ratings dataframe
data = Dataset.load_from_df(r[['user_id', 'book_id', 'rating']], reader)

# split data into a training set and a test set with an 80/20 ratio
trainset, testset = train_test_split(data, test_size=0.2)
algo_svd = SVD()

algo_svd.fit(trainset)

predictions = algo_svd.test(trainset.build_anti_testset())

predictions_svd = algo_svd.test(testset)
pred_svd = pd.DataFrame(predictions_svd)

r.loc[(r['user_id'] == 27523) & (r['book_id'] == 2203)]

SVD().fit
def collaborative_filtering_using_surprise():
    """
    https://towardsdatascience.com/how-to-build-a-memory-based-recommendation-system-using-python-surprise-55f3257b2cf4
    Predict games for user with user_key = 93681
    """
    target_user_key = 93681

    # import reduced dataset:
    df = import_reduced_reviews()

    # check for duplicates:
    duplicates = len(df) - len(
        df.drop_duplicates(subset=['game_key', 'user_key']))

    # drop duplicates:
    df = df.drop_duplicates(subset=['game_key', 'user_key'])
    print('duplicates removed: ' + str(duplicates))

    # check out our user:
    df_target_user = df[df['user_key'] == target_user_key]

    # build utility matrix:
    # data_pivot = df.pivot(index='user_key', columns='game_key', values='rating')

    # calculate sparsity
    # sparsity = data_pivot.isnull().sum().sum() / data_pivot.size
    # print('Sparsity of utility matrix: ' + str(sparsity))

    ### Modelling part with Surprise:
    # get data in a format surprise can work with:
    reader = Reader(rating_scale=(1, 10))
    data = Dataset.load_from_df(df[['user_key', 'game_key', 'rating']], reader)

    # Split in trainset and testset
    trainset, testset = train_test_split(data, test_size=0.2)

    print('Number of users: ', trainset.n_users, '\n')
    print('Number of items: ', trainset.n_items, '\n')

    # When surprise creates a Trainset or Testset object, it takes the raw_id’s (the ones that you used in the file
    # you imported), and converts them to so-called inner_id’s (basically a series of integers, starting from 0). You
    # might need to trace back to the original names. Using the items as an example (you can do the same approach
    # with users, just swap iid's with uid's in the code), to get the list of inner_iids, you can use the all_items
    # method. To convert from raw to inner id you can use the to_inner_iid method, and the to_raw_iid to convert back.

    # An example on how to save a list of inner and raw item id’s:
    trainset_iids = list(trainset.all_items())
    iid_converter = lambda x: trainset.to_raw_iid(x)
    trainset_raw_iids = list(map(iid_converter, trainset_iids))
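    # A small illustrative sketch (not from the original code): a single id can be
    # converted back and forth with to_inner_iid / to_raw_iid; here we simply reuse
    # the first raw id from the list above.
    example_raw_iid = trainset_raw_iids[0]
    example_inner_iid = trainset.to_inner_iid(example_raw_iid)
    assert trainset.to_raw_iid(example_inner_iid) == example_raw_iid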

    ## Model parameters: of kNN:
    # Two hyperparameters we can tune:
    # 1. k parameter
    # 2. similarity option
    #   a) user-user vs item-item
    #   b) similarity function (cosine, pearson, msd)
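
    # A hedged sketch (not part of the original script): both hyperparameters could
    # be tuned with Surprise's GridSearchCV; the grid values below are illustrative.
    # Set run_grid_search = True to actually run the search.
    run_grid_search = False
    if run_grid_search:
        from surprise.model_selection import GridSearchCV
        knn_param_grid = {'k': [20, 40, 60],
                          'min_k': [1, 5],
                          'sim_options': {'name': ['pearson', 'cosine'],
                                          'user_based': [False]}}
        gs_knn = GridSearchCV(KNNWithMeans, knn_param_grid,
                              measures=['rmse'], cv=3)
        gs_knn.fit(data)
        print(gs_knn.best_params['rmse'])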

    sim_option = {'name': 'pearson', 'user_based': False}

    # 3 different KNN Models: KNNBasic, KNNWithMeans, KNNWithZScore
    k = 40
    min_k = 5

    algo = KNNWithMeans(k=k, min_k=min_k, sim_options=sim_option)

    algo.fit(trainset)

    ## Testing:
    predictions = algo.test(testset)

    accuracy.rmse(predictions)

    # Own similarity matrix:
    sim_matrix_imported = pd.read_csv(
        '../Data/Recommender/selfmade_item-item-similarity-matrix.csv',
        index_col=0)
    sim_matrix_imported.columns = sim_matrix_imported.columns.astype(int)
    sim_matrix_imported = sim_matrix_imported.to_numpy()

    algo.sim = sim_matrix_imported

    predictions = algo.test(testset)

    accuracy.rmse(predictions)

    # Cross validation:
    skip = True
    if not skip:
        results = cross_validate(algo=algo,
                                 data=data,
                                 measures=['RMSE'],
                                 cv=5,
                                 return_train_measures=True)
        results_mean = results['test_rmse'].mean()

    ## Predictions
    # Let's assume we are happy with the method and now want to apply it to the entire dataset.

    # Estimate for a specific user a specific item:
    single_item_single_user_prediction = algo.predict(uid=target_user_key,
                                                      iid=100010,
                                                      verbose=True)

    # Estimate all items for a specific user:
    list_of_all_items = trainset_raw_iids
    target_predictions = []

    for item in list_of_all_items:
        single_prediction = algo.predict(uid=target_user_key, iid=item)
        target_predictions.append(
            (single_prediction.uid, single_prediction.iid,
             single_prediction.est))

    # Then sort the predictions for each user and retrieve the k highest ones:
    target_predictions.sort(key=lambda x: x[2], reverse=True)
    n = 20
    top_n = target_predictions[:n]
    top_n = [row[1] for row in top_n]

    print('end')
def selfmade_approach():
    # import reduced dataset:
    df = import_reduced_reviews(
        'C:/Users/lukas/OneDrive/Desktop/Reviews_Reduced.csv')
    df = df[['user_key', 'game_key', 'rating']]

    # drop duplicates:
    df = df.drop_duplicates(subset=['game_key', 'user_key'])

    ### Modelling part with Surprise:
    # get data in a format surprise can work with:
    reader = Reader(rating_scale=(1, 10))
    data = Dataset.load_from_df(df[['user_key', 'game_key', 'rating']], reader)

    # Build trainset from the whole dataset:
    trainsetfull = data.build_full_trainset()
    print('Number of users: ', trainsetfull.n_users, '\n')
    print('Number of items: ', trainsetfull.n_items, '\n')

    # Parameters:
    sim_option = {'name': 'cosine', 'user_based': False}
    k = 10
    min_k = 5

    algo = KNNWithMeans(k=k, min_k=min_k, sim_options=sim_option)

    # Run fit:
    start_time = time.time()
    algo.fit(trainsetfull)
    print("--- %s seconds ---" % (time.time() - start_time))

    # 1st approach: Calculate for a single user contained in dataset:
    target_user_key = 286189
    target_user_info = df[df['user_key'] == target_user_key]

    # Estimate single game:
    target_game_key = 100098

    # data structures:
    # sim_matrix = ndarray(312,312)
    # xr = defaultdict: 312
    # yr = defaultdict 8787

    # later on replace these by self-written structures
    xr = algo.xr
    yr = algo.yr
    sim_matrix = algo.sim
    item_means = algo.means

    inner_target_uid = algo.trainset.to_inner_uid(target_user_key)
    inner_target_iid = algo.trainset.to_inner_iid(target_game_key)
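
    # A minimal sketch (an assumption, not the actual MyKnnWithMeans implementation)
    # of how an item-based KNN-with-means estimate can be computed from the
    # structures extracted above:
    #   est = mean(i) + sum_j sim(i, j) * (r_uj - mean(j)) / sum_j sim(i, j)
    # where j runs over the (at most k) most similar items the user has rated.
    def sketch_item_knn_with_means(inner_iid, rated_items, sim, means, k=10, min_k=5):
        # rated_items: list of (inner item id, rating) pairs, e.g. yr[inner_uid]
        neighbors = sorted(((sim[inner_iid, j], j, r) for j, r in rated_items),
                           reverse=True)[:k]
        num = sum(s * (r - means[j]) for s, j, r in neighbors if s > 0)
        denom = sum(s for s, j, r in neighbors if s > 0)
        if denom == 0 or sum(1 for s, _, _ in neighbors if s > 0) < min_k:
            return means[inner_iid]  # fall back to the item mean
        return means[inner_iid] + num / denom

    sketch_estimate = sketch_item_knn_with_means(inner_target_iid,
                                                 yr[inner_target_uid],
                                                 sim_matrix, item_means,
                                                 k=k, min_k=min_k)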

    # switch: uid and idd:
    x = inner_target_uid
    y = inner_target_iid

    # pred2:
    inner_2_raw_item_ids = algo.trainset._raw2inner_id_items
    # swap keys and values:
    inner_2_raw_item_ids = dict(
        (v, k) for k, v in inner_2_raw_item_ids.items())

    # similarity matrix with raw ids instead of inner surprise ids:
    sim_matrix_df = pd.DataFrame(sim_matrix)
    sim_matrix_df = sim_matrix_df.rename(
        columns=lambda x: inner_2_raw_item_ids[x])
    sim_matrix_df = sim_matrix_df.rename(
        index=lambda x: inner_2_raw_item_ids[x])

    target_user_ratings = yr[x]

    # convert from inner to raw:
    target_user_ratings2 = []
    for (inner_iid, rating) in target_user_ratings:
        target_user_ratings2.append((inner_2_raw_item_ids[inner_iid], rating))

    # convert item means from inner to raw:
    item_means2 = {}
    for i, mean in enumerate(item_means):
        item_means2[inner_2_raw_item_ids[i]] = mean

    myKNN = MyKnnWithMeans(sim_matrix=sim_matrix_df,
                           target_user_ratings=target_user_ratings2,
                           item_means=item_means2,
                           k=k,
                           min_k=min_k)
    pred = myKNN.predict_single_game(user_key=target_user_key,
                                     game_key=target_game_key)
    # predict() expects raw ids, not inner ids
    pred_surprise = algo.predict(uid=target_user_key, iid=target_game_key)

    estimate = pred
    print("Estimate for user %s for game %s is %s" %
          (target_user_key, target_game_key, estimate))

    # Estimate for user not contained in dataset:
    target_user_key = 123456789
    target_game_key = 100098

    user_ratings = [
        (100284, 7),
        (100311, 8),
        (105154, 2),
        (100020, 4),
        (100001, 9),
        (100277, 7),
    ]

    myKNN2 = MyKnnWithMeans(sim_matrix_df, user_ratings, item_means2, k, min_k)
    prediction = myKNN2.predict_single_game(target_user_key, target_game_key)

    # export similarity matrix:
    sim_matrix_df.to_csv(
        '../Data/Recommender/item-item-sim-matrix-surprise.csv')

    # export item means:
    export_path = '../Data/Recommender/item-means.json'
    with open(export_path, 'w') as fp:
        json.dump(item_means2, fp, sort_keys=False, indent=4)

    test = sim_matrix_df.loc[100516, 100284]

    pass
Example #16
0
 def __init__(self, dataframe=None):
     self.dataframe = dataframe
     self.reader = Reader(rating_scale=(1, 5))
     self.data = Dataset.load_from_df(
         self.dataframe[['user', 'trail_id', 'rating']], self.reader)
     self.fit_model = None
Example #17
0
# IMPORTS
import ccobra
import pandas as pd
from surprise import Dataset, Reader
from surprise import KNNWithMeans

# Ratings
rcols = ['userId', 'movieId', 'rating']
ml_ratings_training = pd.read_csv('../data/final_py_data_training.csv',
                                  usecols=rcols)

# Convert to Surprise Ratings
reader = Reader(rating_scale=(0.5, 5))
surprise_training = Dataset.load_from_df(ml_ratings_training,
                                         reader=reader).build_full_trainset()

# Train algorithm
i_min_k = 5
i_max_k = 100
sim_options_item = {'name': 'pearson', 'user_based': False}
algo_item = KNNWithMeans(k=i_max_k,
                         min_k=i_min_k,
                         sim_options=sim_options_item)
algo_item.fit(surprise_training)


class item_CF_model(ccobra.CCobraModel):
    def __init__(self, name='Item_CF'):
        super(item_CF_model, self).__init__(name, ["recommendation"],
                                            ["single-choice"])
def hybrid(userId,train_rd):
    #get_ipython().magic('matplotlib inline')
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns
    from scipy import stats
    from ast import literal_eval
    from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
    from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
    from nltk.stem.snowball import SnowballStemmer
    from nltk.stem.wordnet import WordNetLemmatizer
    from nltk.corpus import wordnet
    from surprise import Reader, Dataset, SVD, evaluate

    import warnings; warnings.simplefilter('ignore')


    # In[2]:


    #Popularity#

    md = pd.read_csv('CustomData/FinalData.csv')

    fd = pd.read_csv('avg_ratings1.csv')



    fd.loc[fd['rating'].notnull(), 'rating'] = fd.loc[fd['rating'].notnull(), 'rating'].astype('float')
    vote_averages = fd[fd['rating'].notnull()]['rating']
    C = vote_averages.mean()


    fd1 = pd.read_csv('ratings_count.csv')


    fd1.loc[fd1['rating'].notnull(), 'rating'] = fd1.loc[fd1['rating'].notnull(), 'rating'].astype('float')
    vote_counts = fd1[fd1['rating'].notnull()]['rating']


    # In[3]:


    m = vote_counts.quantile(0.75)



    # In[4]:


    md['ratings_count'] = fd1['rating']
    md['average_rating'] = fd['rating']


    # In[28]:


    #print(md.shape)
    qualified = md[(md['ratings_count'].notnull())][['book_id','title', 'authors', 'ratings_count', 'average_rating']]

    qualified['ratings_count'] = qualified['ratings_count'].astype('float')

    qualified['average_rating'] = qualified['average_rating'].astype('float')

    #qualified.shape


    # In[29]:


    def weighted_rating(x):
        v = x['ratings_count']
        R = x['average_rating']
        return (v/(v+m) * R) + (m/(m+v) * C)


    # In[30]:


    qualified['popularity_rating'] = qualified.apply(weighted_rating, axis=1)
    #qualified['wr']
    #qualified = qualified.sort_values('popularity_rating', ascending=False).head(250)
    pop = qualified[['book_id','popularity_rating']]
    #print(qualified.shape)
    #print(pop.shape)


    # In[11]:


    ### Collaborative ##

    reader = Reader()
    ratings=train_rd
    #ratings = pd.read_csv('ratings.csv')
    #ratings.head()

    temp_ratings = ratings[0:1000]

    #print(temp_ratings)
    data = Dataset.load_from_df(temp_ratings[['user_id', 'book_id', 'rating']], reader)
    data.split(n_folds=2)


    # In[12]:


    svd = SVD()
    evaluate(svd, data, measures=['RMSE', 'MAE'])


    # In[13]:


    trainset = data.build_full_trainset()
    #svd.train(trainset)
    algo = SVD()
    algo.fit(trainset)

    ## usefule = temp_rating[rating]


    # In[14]:


#print(len(temp_ratings[temp_ratings['user_id']==userId]))


    # In[ ]:


    def get_top_n(predictions, n=10):
        '''Return the top-N recommendations for each user from a set of predictions.

        Args:
            predictions(list of Prediction objects): The list of predictions, as
                returned by the test method of an algorithm.
            n(int): The number of recommendations to output for each user. Default
                is 10.

        Returns:
        A dict where keys are user (raw) ids and values are lists of tuples:
            [(raw item id, rating estimation), ...] of size n.
        '''

        # First map the predictions to each user.
        top_n = defaultdict(list)
        for uid, iid, true_r, est, _ in predictions:
            top_n[uid].append((iid, est))

        # Then sort the predictions for each user and retrieve the k highest ones.
        for uid, user_ratings in top_n.items():
            #user_ratings.sort(key=lambda x: x[1], reverse=True)
            top_n[uid] = user_ratings[:n]

        return top_n


    # In[15]:


    from collections import defaultdict
    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)
    '''
    top_n = get_top_n(predictions, n=10000)

    #print(top_n)
    #result = pd.DataFrame(top_n)
    #print(result)
    for uid, user_ratings in top_n.items():
    
        #print(uid, [iid for (iid  , _) in user_ratings])
        for uid, iid, true_r, est, _ in predictions:
        
            temp_ratings.loc[uid]= [uid,iid,est]
        #temp_ratings[i]['cf'] = temp_ratings[(temp_ratings['user_id'] == uid)][['book_id']]
        
    '''
    count = 0
    for uid, iid, true_r, est, _ in predictions:
        if uid == userId:
            count = count + 1
            temp_ratings.loc[len(temp_ratings) + 1] = [uid, iid, est]
            #print('here')

            #print(uid)
            #temp_ratings.append([uid,iid,est],ignore_index=True)

    #print(count)
    #print(temp_ratings)



    # In[16]:


    #print(len(temp_ratings[temp_ratings['user_id']==2]))


    # In[ ]:





    # In[46]:


    ##### CONTENT ######

    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns
    from scipy import stats
    from ast import literal_eval
    from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
    from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
    from nltk.stem.snowball import SnowballStemmer
    from nltk.stem.wordnet import WordNetLemmatizer
    from nltk.corpus import wordnet
    from surprise import Reader, Dataset, SVD, evaluate
    import csv
    import warnings; warnings.simplefilter('ignore')


    # In[48]:



    md=pd.read_csv('CustomData/FinalData.csv')
    rd=train_rd
    #rd=pd.read_csv('ratings.csv')
    md['book_id'] = md['book_id'].astype('int')
    rd['book_id'] = rd['book_id'].astype('int')
    rd['user_id'] = rd['user_id'].astype('int')
    rd['rating'] = rd['rating'].astype('int')

    #print(md.head())


    md['authors'] = md['authors'].str.replace(' ','')
    md['authors'] = md['authors'].str.lower()
    md['authors'] = md['authors'].str.replace(',',' ')

    #print(md.head())

    md['authors'] = md['authors'].apply(lambda x: [x,x])
    #print(md['authors'])

    md['Genres']=md['Genres'].str.split(';')
    #print(md['Genres'])

    md['soup'] = md['authors'] + md['Genres']
    #print(md['soup'])

    md['soup'] = md['soup'].str.join(' ')

    #md['soup'].fillna({})
    #print(md['soup'])

    count = CountVectorizer(analyzer='word',ngram_range=(1,1),min_df=0, stop_words='english')
    count_matrix = count.fit_transform(md['soup'])
    #print (count_matrix.shape)
    #print np.array(count.get_feature_names())
    #print(count_matrix.shape)

    cosine_sim = cosine_similarity(count_matrix, count_matrix)


    # In[91]:


    def build_user_profiles():
        user_profiles = np.zeros((53421, 999))
        #print(rd.iloc[0]['user_id'])
        #len(rd['book_id'])
        for i in range(0, 1000):
            u = rd.iloc[i]['user_id']
            b = rd.iloc[i]['book_id']
            #print(u,b)
            #print(i)
            #if b<999:
                #print("match at "+str(b))
            user_profiles[u][b - 1] = rd.iloc[i]['rating']
        #print(user_profiles)
        return user_profiles

    user_profiles = build_user_profiles()

    def _get_similar_items_to_user_profile(person_id):
        # Computes the cosine similarity between the user profile and all item profiles
        #print(user_profiles[person_id])
        #print("\n---------\n")
        #print(cosine_sim[0])
        user_ratings = np.empty((999, 1))
        cnt = 0
        for i in range(0, 998):
            book_sim = cosine_sim[i]
            user_sim = user_profiles[person_id]
            user_ratings[i] = (book_sim.dot(user_sim)) / sum(cosine_sim[i])
        maxval = max(user_ratings)
        #print(maxval)

        # rescale to a 0-5 range and count how many items score above 3
        for i in range(0, 998):
            user_ratings[i] = (user_ratings[i] * 5.0) / maxval
            #print(user_ratings[i])
            if user_ratings[i] > 3:
                #print("found something")
                cnt += 1
        #print(max(user_ratings))
        #print(cnt)
        #print(cosine_similarities)
        #return similar_items
        return user_ratings
    content_ratings = _get_similar_items_to_user_profile(userId)



    # In[100]:


    num = md[['book_id']]
    #print(num)

    num1 = pd.DataFrame(data=content_ratings[0:,0:])


    frames = [num, num1]
    #result = pd.concat([df1, df4], axis=1, join_axes=[df1.index])

    mer = pd.concat(frames, axis=1).reindex(num.index)
    mer.columns=['book_id', 'content_rating']
    #print(mer.shape)
    #print('here')
    #print(mer)





    # In[102]:


    ## for user 2 #

#print(temp_ratings.shape)
    cb = temp_ratings[(temp_ratings['user_id'] == userId)][['book_id', 'rating']]
#   print(cb.shape)
#   print(pop.shape)
    hyb = md[['book_id']]
    hyb = hyb.merge(cb,on = 'book_id')
    hyb = hyb.merge(pop, on='book_id')
    hyb = hyb.merge(mer, on='book_id')
    #hyb.shape


    # In[106]:


    def weighted_rating(x):
        v = x['rating']
        R = x['popularity_rating']
        c = x['content_rating']
        return 0.4*v + 0.2*R + 0.4 * c


    # In[107]:


    print(hyb)
    hyb['final'] = hyb.apply(weighted_rating, axis=1)
    hyb = hyb.sort_values('final', ascending=False).head(999)
    #print(hyb['final'])

    print(hyb)
    return hyb
Example #19
0
"""
This module describes how to load a dataset from a pandas dataframe.
"""

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

import pandas as pd

from surprise import NormalPredictor
from surprise import Dataset
from surprise.model_selection import cross_validate


# Creation of the dataframe. Column names are irrelevant.
ratings_dict = {'itemID': [1, 1, 1, 2, 2],
                'userID': [9, 32, 2, 45, 'user_foo'],
                'rating': [3, 2, 4, 3, 1]}
df = pd.DataFrame(ratings_dict)

# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']],
                            rating_scale=(1, 5))

# We can now use this dataset as we please, e.g. calling cross_validate
cross_validate(NormalPredictor(), data, cv=2)
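
# The cross_validate call returns a dict of per-fold measures; a small
# illustrative follow-up (a sketch, not part of the original example) showing
# how to inspect the mean scores:
results = cross_validate(NormalPredictor(), data, cv=2)
print(results['test_rmse'].mean(), results['test_mae'].mean())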
Example #20
0
from util import *


user, book, user_test, book_test, rate, user_all, book_all, user_dict, book_dict = read_data()

# Creation of the dataframe. Column names are irrelevant.
ratings_dict = {'itemID': book,
                'userID': user,
                'rating': rate}
df = pd.DataFrame(ratings_dict)

# A reader is still needed but only the rating_scale param is required.
reader = Reader(rating_scale=(1, 10))

# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)


# Models
algos = []
algos_name = []

algos_name.append('BS_ALS')
bsl_options = {'method': 'als',
               'n_epochs': 5,
               'reg_u': 1,
               'reg_i': 5
               }
algos.append(BaselineOnly(bsl_options=bsl_options))

algos_name.append('BS_SGD')
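# A hedged sketch (illustrative values, not from the original code) of how the
# SGD baseline named above could be configured and appended:
bsl_options = {'method': 'sgd',
               'learning_rate': 0.005,
               'n_epochs': 20}
algos.append(BaselineOnly(bsl_options=bsl_options))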
def benchmark_different_algorithms():
    # import reduced dataset:
    df = import_reduced_reviews(
        'C:/Users/lukas/OneDrive/Desktop/Reviews_Reduced.csv')

    # check for duplicates:
    duplicates = len(df) - len(
        df.drop_duplicates(subset=['game_key', 'user_key']))

    # drop duplicates:
    df = df.drop_duplicates(subset=['game_key', 'user_key'])
    print('duplicates removed: ' + str(duplicates))

    ## Surprise:
    reader = Reader(rating_scale=(1, 10))
    data = Dataset.load_from_df(df[['user_key', 'game_key', 'rating']], reader)

    results = []
    algorithms = [
        'SVD\t\t\t\t\t\t', 'SlopeOne\t\t\t\t', 'CoClustering\t\t\t',
        'NMF\t\t\t\t\t\t', 'KNN_Basic Item-Item\t\t',
        'KNN_WithMeans Item-Item\t', 'KNN_WithZScore Item-Item',
        'KNN_Basic User-User\t\t', 'KNN_WithMeans User-User\t',
        'KNN_WithZScore User-User'
    ]

    # 1) SVD
    algo = SVD()
    results.append(
        cross_validate(algo,
                       data,
                       measures=['RMSE'],
                       cv=3,
                       return_train_measures=True,
                       n_jobs=-3,
                       verbose=True))

    # 2) Slope One
    algo = SlopeOne()
    results.append(
        cross_validate(algo,
                       data,
                       measures=['RMSE'],
                       cv=3,
                       return_train_measures=True,
                       n_jobs=-3,
                       verbose=True))

    # 3) CoClustering
    algo = CoClustering()
    results.append(
        cross_validate(algo,
                       data,
                       measures=['RMSE'],
                       cv=3,
                       return_train_measures=True,
                       n_jobs=-3,
                       verbose=True))

    # 4) NMF
    algo = NMF()
    results.append(
        cross_validate(algo,
                       data,
                       measures=['RMSE'],
                       cv=3,
                       return_train_measures=True,
                       n_jobs=-3,
                       verbose=True))

    ## K-Nearest Neighbors - Item-Item
    sim_option = {'name': 'cosine', 'user_based': False}
    k = 40
    min_k = 5

    # 5) KNNBasic
    algo = KNNBasic(k=k, min_k=min_k, sim_options=sim_option)
    results.append(
        cross_validate(algo,
                       data,
                       measures=['RMSE'],
                       cv=3,
                       return_train_measures=True,
                       n_jobs=-3,
                       verbose=True))

    # 6) KNNWithMeans
    algo = KNNWithMeans(k=k, min_k=min_k, sim_options=sim_option)
    results.append(
        cross_validate(algo,
                       data,
                       measures=['RMSE'],
                       cv=3,
                       return_train_measures=True,
                       n_jobs=-3,
                       verbose=True))

    # 7) KNNWithZScore
    algo = KNNWithZScore(k=k, min_k=min_k, sim_options=sim_option)
    results.append(
        cross_validate(algo,
                       data,
                       measures=['RMSE'],
                       cv=3,
                       return_train_measures=True,
                       n_jobs=-3,
                       verbose=True))

    ## K-Nearest Neighbors - User - User
    sim_option = {'name': 'cosine', 'user_based': True}
    k = 100
    min_k = 2

    # 8) KNNBasic
    algo = KNNBasic(k=k, min_k=min_k, sim_options=sim_option)
    results.append(
        cross_validate(algo,
                       data,
                       measures=['RMSE'],
                       cv=3,
                       return_train_measures=True,
                       n_jobs=-3,
                       verbose=True))

    # 9) KNNWithMeans
    algo = KNNWithMeans(k=k, min_k=min_k, sim_options=sim_option)
    results.append(
        cross_validate(algo,
                       data,
                       measures=['RMSE'],
                       cv=3,
                       return_train_measures=True,
                       n_jobs=-3,
                       verbose=True))

    # 10) KNNWithZScore
    algo = KNNWithZScore(k=k, min_k=min_k, sim_options=sim_option)
    results.append(
        cross_validate(algo,
                       data,
                       measures=['RMSE'],
                       cv=3,
                       return_train_measures=True,
                       n_jobs=-3,
                       verbose=True))

    for algorithm, result in zip(algorithms, results):
        print(algorithm + '\t \t RMSE Score: \t' +
              str(result['test_rmse'].mean()) + '\t\t Fit-Time: ' +
              str(result['fit_time']) + '\t\t Train-Time: ' +
              str(result['test_time']))
Example #22
0
    count1[count1 >= 20].index)]
#print(ratings_explicit['UserID'].value_counts())
print(ratings_explicit.shape)
#
#### split the ratings table into taining and testing dataset

ratings_train, ratings_test = train_test_split(
    ratings_explicit,
    stratify=ratings_explicit['UserID'],
    test_size=0.30,
    random_state=0)
#

#
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_train[['UserID', 'ISBN', 'Rating']],
                            reader)

parameter_grid = {'n_factors': [50, 100, 150, 200, 250, 300]}

grid_search = GridSearch(SVD, parameter_grid, measures=['RMSE', 'MAE'])

grid_search.evaluate(data)

best_parameters = grid_search.best_params
print(best_parameters)

# best RMSE and MAE score
best_result = grid_search.best_score
print(best_result)

# In[ ]:
def create_similarity_matrix():
    start_time = time.time()

    # import reviews:
    import_path = '../Data/Joined/Results/Reviews_Reduced.csv'
    df = pd.read_csv(import_path)

    # keep only important columns:
    df = df[['game_key', 'user_key', 'rating']]

    # create surprise algorithm object
    sim_option = {'name': 'pearson', 'user_based': False}
    algo = KNNWithMeans(sim_options=sim_option)

    # get data in a format surprise can work with:
    reader = Reader(rating_scale=(1, 10))
    data = Dataset.load_from_df(df[['user_key', 'game_key', 'rating']], reader)

    # Build trainset from the whole dataset:
    trainset_full = data.build_full_trainset()
    print('Number of users: ', trainset_full.n_users, '\n')
    print('Number of items: ', trainset_full.n_items, '\n')

    # fit similarity matrix and calculate item means:
    algo.fit(trainset_full)
    print("--- %s seconds ---" % (time.time() - start_time))

    # save similarity matrix and means from algo object to variable
    sim_matrix = algo.sim
    item_means = algo.means

    # convert numpy array to pd df:
    sim_matrix = pd.DataFrame(sim_matrix)

    # replace inner ids with raw ids:
    raw_2_inner_ids = trainset_full._raw2inner_id_items
    # swap keys and values:
    inner_2_raw_item_ids = dict((v, k) for k, v in raw_2_inner_ids.items())

    # replace inner ids in sim_matrix index and columns by game_keys:
    sim_matrix = sim_matrix.rename(index=inner_2_raw_item_ids)
    sim_matrix = sim_matrix.rename(columns=inner_2_raw_item_ids)

    # export sim_matrix:
    sim_matrix.to_csv(
        '../Data/Recommender/item-item-sim-matrix-surprise-Reduced_dataset.csv'
    )

    # convert item means from inner to raw:
    item_means_raw_ids = {}
    for i, mean in enumerate(item_means):
        item_means_raw_ids[inner_2_raw_item_ids[i]] = mean

    # export item means:
    export_path = '../Data/Recommender/item-means-Reduced_dataset.json'
    with open(export_path, 'w') as fp:
        json.dump(item_means_raw_ids, fp, sort_keys=False, indent=4)

    ## create sim matrix in long format:
    # get index as column:
    column_names = list(sim_matrix.columns.values)
    sim_matrix.reset_index(level=0, inplace=True)

    # convert df from wide to long:
    sim_matrix_long = pd.melt(sim_matrix,
                              id_vars='index',
                              value_vars=column_names,
                              var_name='game_key_2')
    sim_matrix_long = sim_matrix_long.rename(columns={'index': 'game_key'})

    # export long sim matrix:
    sim_matrix_long.to_csv(
        '../Data/Recommender/item-item-sim-matrix-surprise-Reduced_dataset-LONG_FORMAT.csv'
    )

    print("--- %s seconds ---" % (time.time() - start_time))
    print('function end reached')
df_model = df_model[~df_model['userId'].isin(drop_user_list)]
print('After Trim Shape: {}'.format(df_model.shape))
print('-Data Examples-')
df_model.head(5)

"""COLLABORATIVE FILTERING"""

!pip install surprise

from surprise import Reader
from surprise import Dataset
from surprise import SVD
from surprise.model_selection import cross_validate

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df_model[['userId', 'movieId', 'rating']], reader)

"""# Matrix factorization CF using sklearn surprise SVD"""

svd = SVD()
cross_validate(svd, data, measures=['RMSE', 'MAE'])
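
# A brief illustrative sketch: after cross-validating we can fit on the full
# trainset and ask for a point prediction (the movieId used here is hypothetical).
trainset_full = data.build_full_trainset()
svd.fit(trainset_full)
print(svd.predict(596, 1, verbose=True))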

df1.set_index('movieId', inplace = True)
df1

data_596 = df_model[(df_model['userId'] == 596) & (df_model['rating'] == 5)]
data_596 = data_596.set_index('movieId')
data_596 = data_596.join(df1)['title']
print(data_596)

data_596 = df1.copy()
Example #25
0
def splitTrainSetTestSet(odatas,frac):
    reader = Reader(rating_scale=(0, 5))
    data = Dataset.load_from_df(odatas[['userId', 'movieId', 'rating']], reader)
    trainset, testset = train_test_split(data, test_size=frac)
    return trainset,testset
Example #26
0
"""
SVD 알고리즘 적용 - 추천시스템
"""

import pandas as pd # csv file 
from surprise import SVD, accuracy # model 생성/평가 
from surprise import Reader, Dataset # dataset 생성 

# 1. 데이터 가져오기 
ratings = pd.read_csv('C:/ITWILL/4_Python-II/data/movie_rating.csv')
print(ratings) #  평가자[critic]   영화[title]  평점[rating]

# 2. rating dataset 생성 
reader = Reader(rating_scale=(1, 5))
data = Dataset(reader)
dataset = data.load_from_df(ratings[['critic','title','rating']], reader)

# train/test 
train = dataset.build_full_trainset()
test = train.build_anti_testset()

svd = SVD()
model = svd.fit(train)

# 3. 전체 사용자 대상 예측치 
pred = model.test(test)
pred
# uid='Jack', iid='Just My', r_ui=3.225806451612903, est=3.046417620945913,
# uid : 사용자, iid : 영화, r_ui : 실제 평점, est : 예측치 평점 

# uid='Toby'
Example #27
0
    return samplingDF


samplingDF = NegativeSampling(remark)
#%%
remark = pd.read_pickle('recordsForSurprise.pkl')
samplingDF = pd.read_pickle('negativeSampling.pkl')
merge = pd.concat([remark, samplingDF])
merge.reset_index(inplace=True)
del merge['index'], remark, samplingDF
merge = merge[['User', 'Item', 'rate']]

#%%

reader = surprise.Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(merge, reader)
del merge

train, test = train_test_split(data, random_state=123, test_size=0.1)
#%% train the model (no hyperparameter tuning)
algo = SVDpp()  # instantiate the model
algo.biased = False

algo.fit(train)

predictions = algo.test(test)
accuracy.mae(predictions)
a = algo.predict('15cbc496d67626ad90514b4243e7c045', '2204590')
print(a)
dump.dump(file_name='SVDmodel.pkl', algo=algo)
#%%
import pandas as pd

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.kernel_ridge import KernelRidge

import numpy as np

testdf = pd.read_csv("../testset.csv")
df = pd.read_csv("../trainset.csv")
combined = pd.concat([testdf, df])

years = pd.read_csv("../release-year.csv")

reader = Reader(rating_scale=(1, 5))  #for pandas only
data = Dataset.load_from_df(combined[['user', 'item', 'rating']], reader)

algo = SVD(reg_all=0.02)

trainset = data.build_full_trainset()
algo.fit(trainset)

moviematrix = algo.qi
y = years.values

#KERNEL RIDGE regression for release year
#best mean test MSE: 214.434
#best test MSE for a single split: 116.92
parameters = {"gamma": [1e0, 0.1, 1e-2, 1e-3, 1e-4, 1e-6]}
kr = KernelRidge(kernel='rbf')
clf = GridSearchCV(kr, parameters, cv=5, scoring='neg_mean_squared_error')
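# A hedged follow-up sketch (not in the original snippet), assuming the rows of
# moviematrix line up with the movie order in the release-year file:
clf.fit(moviematrix, y.ravel())
print('best gamma:', clf.best_params_)
print('mean CV MSE:', -clf.best_score_)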
Example #29
0
list_reviews = read_datafile(data_file)

df = pd.DataFrame(list_reviews, columns=['UserId', 'ItemId', 'Playtime'])
#filter_dataset(df)
#normalize_playtime(df)

reader = Reader(rating_scale=(0, max(df.Playtime)))

sim_options = {
    "name": "cosine",
    "user_based": False,  # Compute  similarities between items
}
algo = KNNWithMeans(sim_options=sim_options)

if cross_validate:
    data = Dataset.load_from_df(df, reader)

    cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
else:
    train_df, test_df = train_test_split(df, test_size=0.2)
    train_data = Dataset.load_from_df(train_df, reader)
    training_set = train_data.build_full_trainset()
    algo.fit(training_set)

    for index, row in test_df.iterrows():
        user = row['UserId']
        item = row['ItemId']
        playtime = row['Playtime']
        prediction = algo.predict(user, item)
        print('{}:{} - {} / {}'.format(user, item, prediction, playtime))
Example #30
0
def make_prediction(test_data_imdb):
    train_data = pd.read_csv('../data/modeling/train/ratings_clean_std_0.csv',
                             sep=',').drop(columns={'Unnamed: 0'})
    omdb = pd.read_csv('../data/modeling/train/omdb_cleaned.csv')

    # build a reader, define the rating scale (minimum and maximum value)
    reader = Reader(rating_scale=(0.5, 5))
    # convert data to surprise format
    train_surprise = Dataset.load_from_df(train_data,
                                          reader).build_full_trainset()

    # Collaborative Filtering Models
    knn_collaborative = KNNWithMeans(k=115,
                                     min_k=5,
                                     sim_options={
                                         'name': 'msd',
                                         'user_based': False
                                     })
    knn_collaborative.fit(train_surprise)
    svd = SVD(lr_all=0.01, reg_all=0.05, n_epochs=23)
    svd.fit(train_surprise)
    preds = [[
        knn_collaborative.predict(test[1], test[3]).est
        for test in test_data_imdb.itertuples()
    ],
             [
                 svd.predict(test[1], test[3]).est
                 for test in test_data_imdb.itertuples()
             ]]

    # Content-Based Models
    # define features for content-based models
    params_features = {
        'threshold_actors': 0,
        'ts_languages': 0,
        'year': True,
        'runtime': True,
        'imdbvotes': True,
        'series': False,
        'awards': False,
        'genres': True,
        'imdb_rating': True,
        'roto_rating': True,
        'pg_rating': True,
        'threshold_newkeywords': 0,
        'threshold_plots': 0,
        'threshold_directors': 0
    }
    # load features
    features, names = preprocessing.features(**params_features)

    # add imdbID and set as index
    features = omdb[['imdbID'
                     ]].join(pd.DataFrame(features)).set_index('imdbID')

    # predict ratings
    pred_content = []
    no_of_ratings = []
    train_data = train_data[train_data['imdbID'] != 'tt0720339']
    for row in test_data_imdb.itertuples():
        # select user and movie

        imdbID = row.imdbID
        userID = row.user_id

        # select ratings of the user (done before the branch so that the rating
        # counter below is always defined, even for the excluded outlier movie)
        ratings_user = train_data.loc[train_data['user_id'] == userID]
        ratings_user.reset_index(inplace=True, drop=True)

        # compute predictions
        if imdbID == 'tt0720339':
            # exclude outlier movie without information
            pred_content.append(svd.predict(userID, imdbID).est)
        else:
            # select features of corresponding movies and convert to array
            features_user = np.array(features.loc[ratings_user['imdbID']])
            features_movie = np.array(features.loc[imdbID])

            pred_content.append(
                predict_movie_rating(ratings_user, features_user,
                                     features_movie))
        # store the number of ratings of the user:
        no_of_ratings.append(ratings_user.shape[0])

    # predictions of the models
    predictions = weighted_prediction(preds[0], preds[1], pred_content,
                                      no_of_ratings)
    test_data_with_rating = test_data_imdb.join(predictions)

    return test_data_with_rating[['user_id', 'movieID', 'rating']]
rating_train, rating_test = train_test_split(rating,
                                             train_size=0.1,
                                             test_size=0.01,
                                             random_state=12345)
print("================================================")
print("Training sample:")
print(rating_train.describe())
print("================================================")
print("Validation sample:")
print(rating_test.describe())

# A reader is still needed but only the rating_scale param is required.
reader = Reader(rating_scale=(0.5, 5))

# The columns must correspond to user id, item id and ratings (in that order).
rating_train2 = Dataset.load_from_df(
    rating_train[['userID', 'itemID', 'rating']], reader)
rating_test2 = Dataset.load_from_df(
    rating_test[['userID', 'itemID', 'rating']], reader)

trainset = rating_train2.build_full_trainset()
testset = rating_test2.build_full_trainset().build_testset()

#SlopeOne Model
count = 1

start = dt.datetime.today()
print("================================================")
algo = SlopeOne()

algo.fit(trainset)
#print("This is the #" + str(count) + " parameter combination")
Example #32
0
import surprise as sp
from surprise import Dataset
from surprise.model_selection import cross_validate
import NetflixDataLoad

#for 100000 rows for fast processing
reader = sp.Reader(rating_scale=(1, 5))  # Netflix ratings range from 1 to 5
data = Dataset.load_from_df(
    NetflixDataLoad.df_filterd[['Cust_Id', 'Movie_Id', 'Rating']][:100000],
    reader)

n_folds = 5

for algo in [sp.SVD(), sp.SVDpp(), sp.KNNBasic(), sp.KNNWithMeans()]:
    print(
        cross_validate(algo,
                       data,
                       measures=['RMSE', 'MAE'],
                       cv=n_folds,
                       verbose=True))

# Output Example
# Evaluating RMSE, MAE of algorithm SVD on 5 split(s).
#
#             Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std
# RMSE        0.9311  0.9370  0.9320  0.9317  0.9391  0.9342  0.0032
# MAE         0.7350  0.7375  0.7341  0.7342  0.7375  0.7357  0.0015
# Fit time    6.53    7.11    7.23    7.15    3.99    6.40    1.23
# Test time   0.26    0.26    0.25    0.15    0.13    0.21    0.06
Example #33
0
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    qualified['wr'] = qualified.apply(weighted_rating, axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(10)
    return qualified


# print("Improved Meta Data Recommender")
# print(improved_recommendations('The Dark Knight'))
# print("\n")
# print(improved_recommendations('Mean Girls'))

reader = Reader()
ratings = pd.read_csv('ratings_small.csv')
ratings.head()

data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
#data.split(n_folds=5)
svd = SVD()
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

trainset = data.build_full_trainset()
svd.fit(trainset)

ratings[ratings['userId'] == 1]

svd.predict(1, 302, 3)


def convert_int(x):
    try:
        return int(x)
amazon.describe().T["count"].sort_values(ascending = False)[:10]
amazond = amazon.drop('user_id', axis = 1)
amazond.head()
amazond.sum().sort_values(ascending = False).to_frame()[:20]
!pip install scikit-surprise
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import train_test_split
amazon.columns

melt_amazon = amazon.melt(id_vars = amazon.columns[0], value_vars = amazon.columns[1:], var_name="movie name", value_name="ratings")
melt_amazon

from surprise import Dataset
reader = Reader(rating_scale=(-1,10))
data = Dataset.load_from_df(melt_amazon.fillna(0), reader = reader)
trainset, testset = train_test_split(data, test_size = 0.25)
from surprise import SVD
algo = SVD()
algo.fit(trainset)

prediction = algo.test(testset)
accuracy.rmse(prediction)

user_id = 'A3R5OBKS7OM2IR'
movie_id = 'Movie1'
rating = 5.0
algo.predict(user_id, movie_id, r_ui=rating, verbose = True)
# The prediction shows the estimated rating alongside the actual rating; the
# estimate is not good here (and the RMSE above is not good either).
Example #35
0
import pandas as pd

from surprise import NormalPredictor
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate

# Creation of the dataframe. Column names are irrelevant.
ratings_dict = {
    'itemID': [1, 1, 1, 2, 2],
    'userID': [9, 32, 2, 45, 'user_foo'],
    'rating': [3, 2, 4, 3, 1]
}
df = pd.DataFrame(ratings_dict)

# A reader is still needed but only the rating_scale param is required.
reader = Reader(rating_scale=(1, 5))

# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)

trainset, testset = train_test_split(data, test_size=.25)

# We'll use the famous SVD algorithm.
algo = SVD()

# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.test(testset)

print(str(predictions))
Example #36
0
for i in movies['genres'].values.tolist():
    [genres.add(ii) for ii in i.strip().split('|')]
genres_length = len(genres)
genres = dict(zip(list(genres), [i for i in range(len(genres))]))
movies_genres = pd.DataFrame(movies['genres'].map(
    lambda x: trans_genres(x, genres_length, genres)).values.tolist())
movies_genres.columns = list(genres.keys())
movies['publish_years'] = movies['title'].map(lambda x: trans_publish_years(x))
movies = pd.concat([movies, movies_genres], axis=1,
                   ignore_index=False).drop(columns=['genres'])
users['age'] = users['age'].map(lambda x: 0 if x <= 6 else x)
ratings = ratings[['user_id', 'movie_id', 'rating']]
# ratings['rating'] = ratings['rating'].map(lambda x: 0 if x < 4 else 1)
if not os.path.exists('feature/svd_pp_fi.pkl'):
    reader = Reader()
    data = Dataset.load_from_df(ratings, reader=reader)
    train, test = surprise_train_test_split(data,
                                            train_size=0.9,
                                            test_size=0.1,
                                            shuffle=False)
    svd = SVDpp(n_factors=20, n_epochs=5, random_state=321)
    svd.fit(train)
    svd_fu = pd.concat([
        ratings['user_id'].drop_duplicates().reset_index(drop=True),
        pd.DataFrame(svd.pu.tolist())
    ],
                       axis=1)
    svd_fi = pd.concat([
        ratings['movie_id'].drop_duplicates().reset_index(drop=True),
        pd.DataFrame(svd.qi.tolist())
    ],
Example #37
0
    algo = SVD()
    algoran = SVD()

    test = df.sample(n=20000, random_state=1)
    print(test)

    trainact1 = pd.concat([test, trainact]).drop_duplicates(keep=False)
    trainact1 = trainact1.head(i)
    print(trainact1)

    train = pd.concat([df, test]).drop_duplicates(keep=False)
    train = train.sample(n=i)
    print(train)

    trainsetact = Dataset.load_from_df(trainact1[['user', 'item', 'rating']],
                                       reader).build_full_trainset()
    trainset = Dataset.load_from_df(train[['user', 'item', 'rating']],
                                    reader).build_full_trainset()
    testset = Dataset.load_from_df(
        test[['user', 'item',
              'rating']], reader).build_full_trainset().build_testset()

    algo.fit(trainsetact)
    predictions = algo.test(testset)

    rmse_al.append(accuracy.rmse(predictions, verbose=False))

    algoran.fit(trainset)
    predictionsran = algoran.test(testset)

    rmse_ran.append(accuracy.rmse(predictionsran, verbose=False))
Example #38
0
from surprise import Reader
import time
import psutil
import matplotlib.pyplot as plt

x = []
timex = []
mem = []
m1 = psutil.virtual_memory().percent
#print(m1)

start = time.time()
df1 = pd.read_csv('C:/Users/Foram/Desktop/Project/ratings_1million1.csv',
                  dtype={'rating': float})
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df1[['user_id', 'book_id', 'rating']], reader)
algo = SVD()
result1 = cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=True)
#print(result1)
x.append(np.mean(result1['test_rmse']))
end = time.time()
#print("Time1",end - start)
timex.append(end - start)
#process=psutil.Process(os.getpid())
m2 = psutil.virtual_memory().percent
#print(m2)
mem.append(m2)

start = time.time()
df2 = pd.read_csv('C:/Users/Foram/Desktop/Project/ratings_1million2.csv',
                  dtype={'rating': float})