Example No. 1
def temp1():
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    import sys
    import pickle
    from surprise import Dataset, Reader, SVD, accuracy
    from surprise.model_selection import train_test_split

    movies = pd.read_csv('../data/movies.csv')
    # genome_scores = pd.read_csv('../data/genome-scores.csv')
    # tags = pd.read_csv('../data/tags.csv')
    # genome_tags = pd.read_csv('../data/genome-tags.csv')
    # Use ratings data to downsample tags data to only movies with ratings
    ratings = pd.read_csv('../data/ratings.csv')
    # print(ratings)
    print("+++++++++++==")
    ratings = ratings.drop_duplicates('movieId')

    # print(ratings)
    # Add a user (the concat below is left commented out)
    temp_df = pd.DataFrame()
    rating_ser = [4.0, 5.0, 2.0]
    movie_ser = [1, 2, 3]
    user_id = ['138494', '138494', '138494']
    # userId,movieId,rating,timestamp
    # temp_df['userId'] = pd.Series(user_id)
    # temp_df['movieId'] = pd.Series(movie_ser)
    # temp_df['rating'] = pd.Series(rating_ser)

    # pd.concat([ratings, temp_df])

    # instantiate a reader and read in our rating data
    reader = Reader(rating_scale=(1, 5))
    ratings_f = ratings.groupby('userId').filter(lambda x: len(x) >= 55)
    movie_list_rating = ratings_f.movieId.unique().tolist()
    Mapping_file = dict(zip(movies.title.tolist(), movies.movieId.tolist()))
    data = Dataset.load_from_df(ratings_f[['userId', 'movieId', 'rating']],
                                reader)

    # train SVD on 75% of the known ratings
    trainset, testset = train_test_split(data, test_size=.25)
    algorithm = SVD()
    algorithm.fit(trainset)
    predictions = algorithm.test(testset)

    # check the accuracy using Root Mean Square Error
    accuracy.rmse(predictions)
    print("+++++++++++++1+++++++++++++")

    def pred_user_rating(ui):
        if ui in ratings_f.userId.unique():
            ui_list = ratings_f[ratings_f.userId == ui].movieId.tolist()
            d = {k: v for k, v in Mapping_file.items() if v not in ui_list}
            predictedL = []
            for i, j in d.items():
                predicted = algorithm.predict(ui, j)
                predictedL.append((i, predicted.est))
            pdf = pd.DataFrame(predictedL, columns=['movies', 'ratings'])
            pdf.sort_values('ratings', ascending=False, inplace=True)
            pdf.set_index('movies', inplace=True)
            return pdf.head(10)
        else:
            print("User Id does not exist in the list!")
            return None

    user_id = 1
    print(pred_user_rating(user_id))
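
    # Persisting the model is a natural next step (pickle is imported above but unused);
    # a minimal sketch with Surprise's dump helper, using an illustrative file name:
    from surprise import dump
    dump.dump('./svd_algo.dump', algo=algorithm, predictions=predictions)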
Example No. 2
# data = surprise.Dataset.load_builtin('ml-100k')
# print(data)
# #data.split(n_folds=2)  # split data for 2-folds cross validation

# algo = SVD_SGD(learning_rate=.01, n_epochs=10, n_factors=10)
# cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=True)
import pandas as pd
import numpy as np
import surprise # run 'pip install scikit-surprise' to install surprise
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import Reader
total_review_df = pd.read_csv("../data/total_review_df.csv")
# Load the dataset (download it if needed)
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(total_review_df[["user_name","res_id","rating"]],reader)

# Use the famous SVD algorithm
algo = SVD()
print("---------------------------SVD--------------------------------")
# Run 10-fold cross-validation and then print results
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=10, verbose=True)

print()

print("---------------------------테스트--------------------------------")

uid = 1
iid = "형석"
pred = algo.predict(uid, iid, r_ui=5)
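
# Sketch: cross_validate() above only fits on CV folds, so fitting on the full trainset
# before asking for a single prediction gives a better-grounded estimate (pred.est):
algo.fit(data.build_full_trainset())
pred = algo.predict(uid, iid, r_ui=5)
print("predicted rating:", pred.est)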
Example No. 3
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@author: lishuang
@description: neighborhood-based collaborative filtering on MovieLens,
              evaluated with K-fold cross-validation
"""
"""

from surprise import KNNWithZScore, Reader, Dataset
from surprise import accuracy
from surprise.model_selection import KFold

# Load the data
reader = Reader(line_format='user item rating timestamp',
                sep=',',
                skip_lines=1)
data = Dataset.load_from_file('data/ratings.csv', reader)

# Item-based CF: compute the scores
# When using the most similar neighbours, only the k most similar are taken
algo = KNNWithZScore(k=40,
                     sim_options={'user_based': False},
                     verbose=True)

kf = KFold(n_splits=3)

for train_set, test_set in kf.split(data):
    algo.fit(train_set)
    pred = algo.test(test_set)
    rmse = accuracy.rmse(pred, verbose=True)
import os
from surprise import Dataset, Reader

# Path to the data file
file_path = os.path.expanduser('ml-100k/u.data')
# Tell the reader what format the text file uses
reader = Reader(line_format='user item rating timestamp', sep='\t')
# Load the data
data2 = Dataset.load_from_file(file_path, reader=reader)
trainset = data2.build_full_trainset()  # convert to a Trainset to access dataset details
print('user count: ', trainset.n_users)  # number of users
print('item count', trainset.n_items)  # number of movies
Example No. 5
def svd_impute(order_with_na):
    """
    order_with_na: per-minute order data, with NaN over temporarily sold-out intervals
    train_set: per-minute sales while the assortment sold normally - used to fit the SVD
    test_set: missing entries where a temporary sold-out occurred - predicted by the SVD
    """

    random.seed(SEED)
    np.random.seed(SEED)

    """split all zero attributes"""
    all_zero_attributes = (order_with_na == 0).all(axis=1).loc[lambda x: x == True].index
    order_with_na_ = order_with_na.drop(all_zero_attributes)

    """svd require long shape"""
    sold_per_minute_long_reshape = (order_with_na_
                                    .reset_index()
                                    .melt(id_vars=ATTR_KEY,
                                          value_name=ORD_AMT_KEY,
                                          var_name=TIME_KEY))

    """check if acceptable"""
    predictable = sold_per_minute_long_reshape[ORD_AMT_KEY].isna().sum() >= 1
    if not predictable:  # return the input unchanged if there is nothing to impute
        return order_with_na

    train_set = sold_per_minute_long_reshape.dropna().copy()
    long_enough = train_set.shape[0] >= 5
    if not long_enough:
        return order_with_na

    """train svd"""
    sold_out_filter = sold_per_minute_long_reshape[ORD_AMT_KEY].isna()
    test_set = sold_per_minute_long_reshape[sold_out_filter].copy()
    # convert to surprise data type
    max_value = train_set[ORD_AMT_KEY].max()
    train_set_svd_object = Dataset.load_from_df(train_set, Reader(rating_scale=(0, max_value)))
    # find appropriate parameter grid search
    num_of_factor_candidates = 5
    num_of_attr = train_set[ATTR_KEY].unique().shape[0]
    n_factor_min = (num_of_attr // num_of_factor_candidates) + 1
    n_factor_max = num_of_attr - 1
    factor_candidates = np.linspace(start=n_factor_min, stop=n_factor_max, num=num_of_factor_candidates)
    n_factors_grid_search_pool = np.unique(np.floor(factor_candidates)).astype(int)  # latent factor counts to try as the SVD parameter

    grid_search_pool = {'reg_all': [0],
                        'lr_all': [0.003, 0.001],
                        'n_factors': n_factors_grid_search_pool,
                        'n_epochs': [15, 22, 30],
                        'biased': [False]}
    error_measure = 'mae'
    grid_searcher = GridSearchCV(SVD, grid_search_pool, measures=[error_measure], cv=5,
                                 n_jobs=N_CORE)  # n_jobs: parallel compute

    try:
        grid_searcher.fit(train_set_svd_object)
    except Exception:
        # fall back to 2-fold CV when the 5-fold search cannot run (e.g. too few ratings)
        grid_searcher = GridSearchCV(SVD, grid_search_pool, measures=[error_measure], cv=2,
                                     n_jobs=N_CORE)  # n_jobs: parallel compute
        grid_searcher.fit(train_set_svd_object)

    best_error = grid_searcher.best_score[error_measure]
    best_param = grid_searcher.best_params[error_measure]
    """svd predict"""
    svd = SVD(**best_param).fit(train_set_svd_object.build_full_trainset())

    def _predict(row):
        return svd.predict(row[ATTR_KEY], row[TIME_KEY]).est

    for predict_row_idx, each_blank in test_set.iterrows():
        predicted_value = _predict(each_blank)
        test_set.loc[predict_row_idx, ORD_AMT_KEY] = predicted_value

    filled_sold_per_minute_long_shape = pd.concat([train_set, test_set]).round(0)
    imputed = filled_sold_per_minute_long_shape.pivot(index=ATTR_KEY, columns=TIME_KEY, values=ORD_AMT_KEY)

    """merge all zeros"""
    for attribute in all_zero_attributes:
        imputed.loc[attribute, :] = 0

    return imputed
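
# Module-level setup assumed by svd_impute() but not shown in this excerpt; the constant
# values below are placeholders for illustration only, not the source's real settings.
import random

import numpy as np
import pandas as pd
from surprise import SVD, Dataset, Reader
from surprise.model_selection import GridSearchCV

SEED = 0                # placeholder seed
ATTR_KEY = 'attr'       # placeholder: column holding the item/attribute id
TIME_KEY = 'minute'     # placeholder: column holding the per-minute timestamp
ORD_AMT_KEY = 'amount'  # placeholder: column holding the order amount
N_CORE = 1              # placeholder: worker count passed to GridSearchCV(n_jobs=...)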
Example No. 6
    def train_collaborative_filtering(self, grid_search=False, gs_params=None):
        # expand the visited-page list into one row per visited page
        analytics_df_SVD = self.analytics_df.copy()
        analytics_df_SVD['ranking'] = analytics_df_SVD[[
            'totals.pageviews', 'totals.timeOnSite'
        ]].apply(lambda x: self.__generate_ranking(x), axis=1)
        analytics_df_SVD = analytics_df_SVD['pages_visited'].apply(lambda x: pd.Series(eval(x)))\
         .stack()\
         .reset_index(level=1,drop=True)\
         .to_frame('pageId')\
         .join(analytics_df_SVD[['visitId','ranking']], how='left')
        analytics_df_SVD = analytics_df_SVD.dropna()
        analytics_df_SVD = analytics_df_SVD[['visitId', 'ranking', 'pageId']]
        analytics_df_SVD['pageId'] = analytics_df_SVD['pageId'].apply(
            lambda x: int(x))

        # Saves Matrix for later use
        analytics_df_SVD.to_csv('state/visit_user_ranking.csv')

        # A reader is still needed but only the rating_scale param is required.
        reader = Reader(rating_scale=(1, 4))

        # The columns must correspond to user id, item id and ratings (in that order).
        data = Dataset.load_from_df(
            analytics_df_SVD[['visitId', 'pageId', 'ranking']], reader)

        trainset, testset = train_test_split(data, test_size=.1)

        # If user desires to use GridSearch to find best params and algo
        if grid_search:
            if (not gs_params):
                param_grid = {'n_factors': [110, 120, 140, 160], 'n_epochs': [90, 100, 110], 'lr_all': [0.001, 0.003, 0.005, 0.008],\
                            'reg_all': [0.08, 0.1, 0.15]}
            else:
                param_grid = gs_params
            gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
            gs.fit(data)
            algo = gs.best_estimator['rmse']
            print(gs.best_score['rmse'])
            print(gs.best_params['rmse'])

        ## Comment next lines if you are searching the best params
        # We can now use this dataset as we please, e.g. calling cross_validate
        else:
            algo = SVD(n_factors=110, n_epochs=110, lr_all=0.008, reg_all=0.15)

        cross_validate(algo,
                       data,
                       measures=['RMSE', 'MAE'],
                       cv=5,
                       verbose=True)

        algo.fit(trainset)
        test_pred = algo.test(testset)
        print("SVD : Test Set")
        accuracy.rmse(test_pred, verbose=True)

        # Dump algorithm
        print('Saving trained algo...', end=" ")
        algo_list = glob.glob('state/algo_*')
        file_name = 'state/algo_' + datetime.datetime.now().strftime(
            "%Y_%B_%d__%Ih%M%p")
        dump.dump(file_name, algo=algo)
        for file in algo_list:
            os.remove(file)
        print('Done.')
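
        # Sanity-check sketch: dump.load() returns (predictions, algo), so the file written
        # above can be reloaded and re-scored without retraining.
        loaded_predictions, loaded_algo = dump.load(file_name)
        print("Reloaded model RMSE:")
        accuracy.rmse(loaded_predictions, verbose=True)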
Example No. 7
            ((true_r >= threshold) and (est >= threshold))
            for (est, true_r) in user_ratings[:k]
        )

        # * Precision at K: Proportion of recommended items that are relevant
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1

        # * Recall at K: Proportion of relevant items that are recommended
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1

    return precisions, recalls


# * using reader to be able to deal with the imported CSV
reader = Reader(
    line_format="user item rating timestamp", sep=",", rating_scale=(1, 5), skip_lines=1
)
# * loading the csv
data = Dataset.load_from_file(
    file_path="../../ML_Dataset/ml-latest-small/ratings.csv", reader=reader
)
# * dividing in train and test sets
trainset, testset = train_test_split(data, test_size=0.25)

# * define a cross-validation iterator
kf = KFold(n_splits=5)

# * Choosing Slope One as algorithm
algo = SlopeOne()

# * Train the algorithm on the trainset, and predict ratings for the testset
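
# The snippet stops here; assuming the truncated helper above is the usual
# precision_recall_at_k(predictions, k, threshold) recipe from the Surprise FAQ, the
# cross-validation loop it builds toward (using the KFold iterator defined above) looks
# roughly like this:
for fold_trainset, fold_testset in kf.split(data):
    algo.fit(fold_trainset)
    fold_predictions = algo.test(fold_testset)
    precisions, recalls = precision_recall_at_k(fold_predictions, k=10, threshold=4)
    # Average the per-user metrics for this fold.
    print(sum(precisions.values()) / len(precisions),
          sum(recalls.values()) / len(recalls))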
Example No. 8
import pandas as pd
import surprise
from surprise import Reader
from surprise import Dataset
import time
import matplotlib.pyplot as plt
import psutil

timex = []
mem = []
m1 = psutil.virtual_memory().percent

#For 100 record dataset
start = time.time()
df1 = pd.read_csv('C:/Users/dell pc/Desktop/Project/ratings_1million1.csv',
                  dtype={'rating': float})
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df1[['user_id', 'book_id', 'rating']], reader)
data.split(2)
algo = surprise.KNNBasic()
result1 = surprise.evaluate(algo, data, measures=['RMSE'])
end = time.time()
print("Time1", end - start)
timex.append(end - start)
m2 = psutil.virtual_memory().percent
#print(m2)
mem.append(m2)
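
# data.split() and surprise.evaluate() come from older scikit-surprise releases and have
# since been removed; a rough equivalent of the evaluation above with the current API:
from surprise.model_selection import cross_validate
cross_validate(surprise.KNNBasic(), data, measures=['RMSE'], cv=2, verbose=True)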

#For 1000 record dataset
start = time.time()
df2 = pd.read_csv('C:/Users/dell pc/Desktop/Project/ratings_1million2.csv',
                  dtype={'rating': float})
"""

import numpy as np
import pprint as pp
import pandas as pd
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import KFold
from surprise.model_selection import cross_validate
from surprise import NormalPredictor, BaselineOnly, KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline, SVD, SVDpp, NMF, SlopeOne, CoClustering
from surprise.model_selection import RandomizedSearchCV
import datetime

start = datetime.datetime.now()  # We want to record the time required to select the estimators
reader = Reader(sep=',', skip_lines=1)
data = Dataset.load_from_file(
    './DMT_2020__HW_2/DMT_2020__HW_2/Part_1/dataset/ratings.csv',
    reader=reader)  #Import dataset
#KNNBASELINE TUNING
similarity_options = {
    'name': ['pearson_baseline'],
    'user_based': [True, False]
}  # We set the similarity options for the RandomizedSearch. We use pearson_baseline as suggested in the Surprise documentation
baseline_predictor_options = {
    'method': ['sgd'],
    'learning_rate': [0.002, 0.005, 0.01],
    'n_epochs': [50, 100, 150],
    'reg': [0.01, 0.02, 0.05]
}  # We set the baseline predictor options for the RandomizedSearch
grid_of_parameters = {
Example No. 10
    parser = argparse.ArgumentParser()
    parser.add_argument('--model',
                        '-m',
                        required=True,
                        choices=[
                            'NormalPredictor', 'BaselineOnly', 'KNNBasic',
                            'KNNWithMeans', 'KNNWithZScore', 'KNNBaseline',
                            'SVD', 'SVDpp', 'NMF', 'SlopeOne', 'CoClustering'
                        ])
    args = parser.parse_args()

    train_path = path + '/Data/train_format.txt'

    train_reader = Reader(line_format='user item rating timestamp',
                          sep=',',
                          rating_scale=(0, 5))
    trainset = Dataset.load_from_file(train_path, reader=train_reader)
    trainset = trainset.build_full_trainset()

    if args.model == 'NormalPredictor':
        model = surprise.NormalPredictor()
    elif args.model == 'BaselineOnly':
        model = surprise.BaselineOnly()
    elif args.model == 'KNNBasic':
        model = surprise.KNNBasic()
    elif args.model == 'KNNWithMeans':
        model = surprise.KNNWithMeans()
    elif args.model == 'KNNWithZScore':
        model = surprise.KNNWithZScore()
    elif args.model == 'KNNBaseline':
Example No. 11
def train_model():
    # Send request to Nodejs server for authentication.
    if not is_good_request(request):
        return abort(400)

    # Extract data from request.
    data = request.get_json()
    dataset, data_header, model_name, params, train_type, save_on_server, save_on_local = data.values()

    # if not is_header_valid(data_header):
    #     return jsonify({'message': '[ERROR] Incorrect dataset format.'})

    # Use the data uploaded or data on server.
    df = pd.DataFrame(dataset, columns=data_header) if dataset else pd.read_csv('./data/data-new.csv', header=0)
    try:
        train_set, test_set = build_train_test(df, Reader(), full=train_type == 'full')
    except ValueError:
        return jsonify({'error': 'Incorrect dataset format.'})

    if model_name == 'insvd':
        n_factors, n_epochs, lr_all, reg_all, random_state = params.values()
        ALLOWED_EXTENSIONS = {'txt', 'pdf', 'png', 'jpg', 'jpeg', 'gif'}
        # Parse data types.
        n_factors = int(n_factors)
        n_epochs = int(n_epochs)
        lr_all = float(lr_all)
        reg_all = float(reg_all)
        random_state = int(random_state)
        model = InSVD(n_factors=n_factors, n_epochs=n_epochs,
                      lr_all=lr_all, reg_all=reg_all, random_state=random_state)
    else:
        k, sim_options, random_state = params.values()
        model = KNNBasic(k=int(k), sim_options={'name': sim_options, 'user_based': False},
                         random_state=int(random_state))

    # Fitting and testing.
    model.fit(train_set)
    predictions = model.test(test_set)

    # Add suffix if not save on server.
    model_path = f'./model/{model_name}' if save_on_server else f'./model/{model_name}-temp'

    # Save.
    dump.dump(model_path, algo=model, predictions=predictions)
    model_info = {
        'rmse': rmse(predictions),
        'mae': mae(predictions),
    }

    # Zip the trained model.
    try:
        zip_obj = ZipFile(f'{model_path}.zip', 'w')
        zip_obj.write(model_path)
        zip_obj.close()
    except FileNotFoundError:
        return abort(404)

    @after_this_request
    def remove_dump_files(response):
        # If not save model on server, delete model dump file.
        if not save_on_server:
            os.remove(model_path)

        # Always delete the .zip file.
        os.remove(f'{model_path}.zip')

        return response

    if save_on_local:
        with open(f'{model_path}.zip', 'rb') as f:
            model_zip = f.readlines()

        resp = Response(model_zip)
        resp.headers['X-Model-Info'] = json.dumps(model_info)
        resp.headers['Content-Type'] = 'application/zip'
        resp.headers['Content-Disposition'] = 'attachment; filename=%s;' % 'model.zip'

        return resp
        # return Response(model_zip, headers={
        #     'X-Info': json.dumps(model_info),
        #     'Content-Type': 'application/zip',
        #     'Content-Disposition': 'attachment; filename=%s;' % 'model.zip',
        # })

        # return send_from_directory('./model', model_file), 200

    return jsonify(model_info)
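

# build_train_test() is not defined in this excerpt; a hypothetical sketch of what such a
# helper could do (the name, column slicing and split ratio are assumptions, not the
# project's actual code):
def build_train_test_sketch(df, reader, full=False):
    from surprise import Dataset
    from surprise.model_selection import train_test_split

    data = Dataset.load_from_df(df.iloc[:, :3], reader)
    if full:
        # Train on every rating and evaluate on the same known ratings.
        full_trainset = data.build_full_trainset()
        return full_trainset, full_trainset.build_testset()
    return train_test_split(data, test_size=0.2)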
Example No. 12
                  on=[itemID_column])
merged_data = result[[
    userID_column, itemID_column, itemName_column, ratings_column
]]
# -

len(ratings['user_id'].unique())

# # process data

# +
from surprise import Dataset
from surprise import Reader

data = merged_data[['user_id', 'book_id', 'rating']]
reader = Reader(rating_scale=(0.5, 4.5))
data = Dataset.load_from_df(data, reader)

popularity_rankings = merged_data['book_id'].value_counts()
rankings = pd.Series(range(1,
                           len(popularity_rankings) + 1, 1),
                     index=popularity_rankings.index)

processed_data = ProcessData(data, rankings)
# Train on leave-One-Out train set
trainSet = processed_data.GetLOOCVTrainSet()
testSet = processed_data.GetLOOCVTestSet()
# -
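
# ProcessData and its GetLOOCV* helpers are not shown in this excerpt; Surprise's own
# LeaveOneOut iterator gives a comparable leave-one-out split (random_state is arbitrary):
from surprise.model_selection import LeaveOneOut

loo = LeaveOneOut(n_splits=1, random_state=1)
for loo_train, loo_test in loo.split(data):
    pass  # loo_train / loo_test play the roles of trainSet / testSet above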

# # run test
Example No. 13
'''Testing renaming of train() into fit()'''
import os

import pytest

from surprise import Dataset
from surprise import Reader
from surprise import AlgoBase
from surprise.model_selection import KFold

data_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train')
data = Dataset.load_from_file(data_file, Reader('ml-100k'))
kf = KFold(n_splits=2)


def test_new_style_algo():
    '''Test that new algorithms (i.e. algorithms that only define fit()) can
    support both calls to fit() and to train()
    - algo.fit() is the new way of doing things
    - supporting algo.train() is needed for the (unlikely?) case where a user
    has defined custom tools that use algo.train().
    '''
    class CustomAlgoFit(AlgoBase):
        def __init__(self):
            AlgoBase.__init__(self)
            self.cnt = -1

        def fit(self, trainset):

            AlgoBase.fit(self, trainset)
            self.est = 3
Example No. 14
def main():
    #Here we have 4 datasets that can be used for the test: the basic one, and the ones restricted to users and movies that have more than 10, 20 and 50 ratings
    elem_to_import = 4
    for i in range(elem_to_import):
        if(i == 0):
            #Import the basic data
            data_import = np.genfromtxt(DATA_PATH, delimiter=",", skip_header=1, dtype=str)
            #Construct User, Movie and rating
            userId, movieId, rating = construct_data(data_import);
        elif(i == 1):
            # Import the data with users and movies that have more than 10 ratings
            data_import = pd.read_excel(DATA_PATH_10, index_col=0, header=0)
            userId = data_import["userId"]
            movieId = data_import["movieId"]
            rating = data_import["rating"]
        elif(i == 2):
            # Import the data with users and movies that have more than 20 ratings
            data_import = pd.read_excel(DATA_PATH_20, index_col=0, header=0)
            userId = data_import["userId"]
            movieId = data_import["movieId"]
            rating = data_import["rating"]
        else:
            # Import the data with users and movies that have more than 50 ratings
            data_import = pd.read_excel(DATA_PATH_50, index_col=0, header=0)
            userId = data_import["userId"]
            movieId = data_import["movieId"]
            rating = data_import["rating"]

        #We take the indices that we will shuffle
        indices_to_shuffle = np.array(range(len(userId)))

        test_ratio = 70
        #We create the train and test data (70% of the data goes to train); indices are shuffled in split_data
        X_train_userId, X_train_movieId, X_train_rating, X_test_userId, X_test_movieId, X_test_rating = split_data(
            indices_to_shuffle, userId, movieId, rating, test_ratio)

        ratings_dict_train = {'itemID': X_train_movieId,
                        'userID': X_train_userId,
                        'rating': X_train_rating}

        #Create the dataframe for the surprise train
        df_train = pd.DataFrame(ratings_dict_train)
        reader_train = Reader(rating_scale=(1, 5))
        data_train = Dataset.load_from_df(df_train[['userID', 'itemID', 'rating']], reader_train)

        #We have to split the data because the algorithm is evaluated against the split folds
        split = 3
        data_train.split(split)

        ratings_dict_train = {'itemID': X_test_movieId,
                            'userID': X_test_userId,
                            'rating': X_test_rating}

        # Create the dataframe for the surprise test
        df_test = pd.DataFrame(ratings_dict_train)
        reader_test = Reader(rating_scale=(1, 5))
        data_test = Dataset.load_from_df(df_test[['userID', 'itemID', 'rating']], reader_test)
        data_test.split(split)

        # KNN best param
        n_epochs = [5]
        reg_us = [5]
        reg_is = [5]

        '''
        #KNN (takes too long to test all of them, but you can check)
        n_epochs = [5,10,15]
        reg_us = [5,10,15,20]
        reg_is = [5,10,20]
        '''

        # Apply the grid search for KNN
        perf_knn_grid = grid_search_knn_surprise(data_train, n_epochs, reg_us, reg_is)

        if (i == 0):
            #Manual grid search so we can see the values (only the result is given with the GridSearch of surprise)
            grid_search_knn(data_train, data_test, n_epochs, reg_us, reg_is, 'surpise_manualGS_KNN.xlsx')
            # KNN with the best params from GridSearch surprise
            knn_surprise(data_train, perf_knn_grid["n_epochs"], perf_knn_grid["reg_u"], perf_knn_grid["reg_i"], "surprise_bestKNN.csv")

        elif(i == 1):
            # Manual grid search so we can see the values (only the result is given with the GridSearch of surprise)
            grid_search_knn(data_train, data_test, n_epochs, reg_us, reg_is, 'surpise_manualGS_KNN10.xlsx')
            # KNN with the best params from GridSearch surprise
            knn_surprise(data_train, perf_knn_grid["n_epochs"], perf_knn_grid["reg_u"], perf_knn_grid["reg_i"],"surprise_bestKNN10.csv")
        elif (i == 2):
            # Manual grid search so we can see the values (only the result is given with the GridSearch of surprise)
            grid_search_knn(data_train, data_test, n_epochs, reg_us, reg_is, 'surpise_manualGS_KNN20.xlsx')
            # KNN with the best params from GridSearch surprise
            knn_surprise(data_train, perf_knn_grid["n_epochs"], perf_knn_grid["reg_u"], perf_knn_grid["reg_i"],"surprise_bestKNN20.csv")
        else:
            # Manual grid search so we can see the values (only the result is given with the GridSearch of surprise)
            grid_search_knn(data_train, data_test, n_epochs, reg_us, reg_is, 'surpise_manualGS_KNN50.xlsx')
            # KNN with the best params from GridSearch surprise
            knn_surprise(data_train, perf_knn_grid["n_epochs"], perf_knn_grid["reg_u"], perf_knn_grid["reg_i"],"surprise_bestKNN50.csv")

        # SVD best param
        n_epochs = [10]
        lr_alls = [0.00147]
        reg_alls = [0.2]
        init_mean = [0.2]
        n_factors = [80]

        '''
        #SVD (takes too long to test all of them, but you can check)
        n_epochs = [5,10]
        lr_alls = [0.00145, 0.00146, 0.00147]
        reg_alls = [0.2,0.3]
        init_mean = [0, 0.2]
        n_factors = [80,100,120]
        '''

        # Apply the grid search for SVD
        perf_svd_grid = grid_search_svd_surprise(data_train, n_epochs, lr_alls, reg_alls, init_mean, n_factors)

        if (i == 0):
            # Manual grid search so we can see the values (only the result is given with the GridSearch of surprise)
            grid_search_svd(data_train, data_test, n_epochs, lr_alls, reg_alls, init_mean, n_factors,
                            'surpise_manualGS_SVD.xlsx')
            # SVD with the best params from GridSearch surprise
            svd_surprise(data_train, perf_svd_grid["reg_all"], perf_svd_grid["init_mean"], perf_svd_grid["n_epochs"],
                         perf_svd_grid["lr_all"], perf_svd_grid["n_factors"], "surprise_bestSVD.csv")

        elif(i == 1):
            # Manual grid search so we can see the values (only the result is given with the GridSearch of surprise)
            grid_search_svd(data_train, data_test, n_epochs, lr_alls, reg_alls, init_mean, n_factors,
                            'surpise_manualGS_SVD10.xlsx')
            # SVD with the best params from GridSearch surprise
            svd_surprise(data_train, perf_svd_grid["reg_all"], perf_svd_grid["init_mean"], perf_svd_grid["n_epochs"],
                         perf_svd_grid["lr_all"], perf_svd_grid["n_factors"], "surprise_bestSVD10.csv")

        elif (i == 2):
            # Manual grid search so we can see the values (only the result is given with the GridSearch of surprise)
            grid_search_svd(data_train, data_test, n_epochs, lr_alls, reg_alls, init_mean, n_factors,
                            'surpise_manualGS_SVD20.xlsx')
            # SVD with the best params from GridSearch surprise
            svd_surprise(data_train, perf_svd_grid["reg_all"], perf_svd_grid["init_mean"], perf_svd_grid["n_epochs"],
                         perf_svd_grid["lr_all"], perf_svd_grid["n_factors"], "surprise_bestSVD20.csv")

        else:
            # Manual grid search so we can see the values (only the result is given with the GridSearch of surprise)
            grid_search_svd(data_train, data_test, n_epochs, lr_alls, reg_alls, init_mean, n_factors,
                            'surpise_manualGS_SVD50.xlsx')
            # SVD with the best params from GridSearch surprise
            svd_surprise(data_train, perf_svd_grid["reg_all"], perf_svd_grid["init_mean"], perf_svd_grid["n_epochs"],
                         perf_svd_grid["lr_all"], perf_svd_grid["n_factors"], "surprise_bestSVD50.csv")
Example No. 15
    `top_n:` a dictionary of the top k recommendations for a given user

    `user:` internal user id (uid) used in the datasets like in ml-latest-parsed.csv

    Returns:
        a list containing the top k recommendations for the given user
    """
    matches = [[iid for (iid, _) in user_ratings]
               for uid, user_ratings in top_n.items() if uid == user]
    return matches


# Load the movielens-100k dataset (download it if needed),
reader = Reader(line_format='user item rating timestamp',
                sep=',',
                rating_scale=(0.5, 5),
                skip_lines=1)
data = Dataset.load_from_file('./ml-latest-parsed.csv', reader=reader)

trainset = data.build_full_trainset()

testset = trainset.build_anti_testset()
#testset = trainset.build_anti_testset()

#trainset, testset = train_test_split(data, test_size=.3)

# We'll use the famous SVD algorithm.
print("Creating Model")
sim_options = {'name': 'cosine', 'user_based': True, 'min_support': 2}
algo = KNNBasic(k=40, min_k=2, sim_options=sim_options)
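
# The fitting step and the construction of top_n are not shown in this excerpt; the usual
# pattern (following the Surprise FAQ's get_top_n recipe, with k=10 as an example) is:
from collections import defaultdict

algo.fit(trainset)
predictions = algo.test(testset)

top_n = defaultdict(list)
for uid, iid, true_r, est, _ in predictions:
    top_n[uid].append((iid, est))
for uid, user_ratings in top_n.items():
    user_ratings.sort(key=lambda x: x[1], reverse=True)
    top_n[uid] = user_ratings[:10]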
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Feb  4 00:08:44 2019

@author: abhijithneilabraham
"""
from surprise import KNNBasic
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import Reader
from surprise.model_selection import train_test_split
import pandas as pd
customer = pd.read_csv('names.csv')

reader = Reader(line_format='user item rating', rating_scale=(1, 5), sep=',')
fieldnames = ['id', 'male_or_female']

for i in range(25):
    fieldnames.insert(2, 'question' + str(i + 1))

    data = Dataset.load_from_df(customer[fieldnames], reader)
    del fieldnames[2]
    trainset = data.build_full_trainset()

    algo = KNNBasic()
    algo.fit(trainset)
Example No. 17
mtarix_toGO['Norm_Tot_Amnt'] = (mtarix_toGO['Mean_amount'] - min_amt) / max_amt
#lower_bound = min(mtarix_toGO['Log_Mean_Amount'])
#upper_bound = max(mtarix_toGO['Log_Mean_Amount'])
#print lower_bound
#print upper_bound
# Remove the outliers
dfx = mtarix_toGO[mtarix_toGO['Norm_Tot_Amnt'] <= 0.4]
lower_bound = min(dfx['Norm_Tot_Amnt'])
upper_bound = max(dfx['Norm_Tot_Amnt'])
print('Lower Bound normalized spending =', lower_bound)
print('Upper Bound normalized spending =', upper_bound)
print('Number of Transactions remaining after removing Outliers:',
      mtarix_toGO.shape[0])

#define the reader with upper and lower bounds; we are now predicting the normalized total amount column
reader_x = Reader(rating_scale=(lower_bound, upper_bound))
data = Dataset.load_from_df(
    df=dfx[['CustomerID', 'StockCode', 'Norm_Tot_Amnt']], reader=reader_x)

#for i in range(9):
#    print (data.raw_ratings[0][2] - data.df['Log_Mean_amount'][0])

print('difference in processed and pre-processed dataset = ',
      (data.raw_ratings[0][2] - data.df['Norm_Tot_Amnt'][0]))

import time
start_time = time.time()

param_grid = {
    'n_factors': [2, 5, 10, 50],
    'n_epochs': [10, 50, 100],
Example No. 18
from surprise import Reader, Dataset
from surprise import KNNBasic, evaluate
import csv

reader = Reader(line_format='user item rating', sep=';', rating_scale=(-1, 3))

print("Loading data...")
data = Dataset.load_from_file('./tost/u.data', reader=reader)
data.split(n_folds=3)
print(">OK\n")

#Build the trainset
trainset = data.build_full_trainset()

#Define similarities options
#sim_options = {
#    'name': 'cosine',
#    'user_based': True  # compute similarities between users
#}

print("Training data...")
#Build an algo, and train it
algo = KNNBasic()
algo.train(trainset)
print("> OK\n")

#Get predictions
print("Predictions :")
ratings = vars(data).get('raw_ratings')

with open('tost/u.results', "wb") as csv_file:
Example No. 19
# Imports assumed by this snippet (they are not part of the excerpt shown here)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from surprise import SVD, Dataset, Reader


def predict(user_id, topk):

    # Read the data
    data = pd.read_csv("../data/data.dat",
                       sep="\t",
                       names=['userid', 'itemid', 'rating', 'timestamp'])

    # Convert data types
    data['rating'] = data['rating'].astype(float)

    # Inspect the dataset size
    print('Dataset 1 shape: {}'.format(data.shape))

    # Rebuild the index
    data.index = np.arange(0, len(data))
    print('Full dataset shape: {}'.format(data.shape))

    # Count ratings per value
    p = data.groupby('rating')['rating'].agg(['count'])

    # Total number of movies
    movie_count = data.isnull().sum()[1]

    # Total number of users
    cust_count = data['userid'].nunique() - movie_count

    # Total number of ratings
    rating_count = data['userid'].count() - movie_count

    # Plot the overall rating distribution
    ax = p.plot(kind='barh', legend=False, figsize=(15, 10))
    plt.title(
        'Total pool: {:,} Movies, {:,} customers, {:,} ratings given'.format(
            movie_count, cust_count, rating_count),
        fontsize=20)
    plt.axis('off')

    for i in range(1, 6):
        ax.text(p.iloc[i - 1][0] / 4,
                i - 1,
                'Rating {}: {:.0f}%'.format(
                    i, p.iloc[i - 1][0] * 100 / p.sum()[0]),
                color='white',
                weight='bold')
    '''
    The movie IDs come from a messy import! Looping over the dataframe to add a movie-id
    column would run the kernel out of memory because it is far too inefficient, so first
    build a numpy array of the right length and then add the whole array as a column to
    the main dataframe.
    '''
    df_nan = pd.DataFrame(pd.isnull(data.rating))
    df_nan = df_nan[df_nan['rating'] == True]
    df_nan = df_nan.reset_index()

    movie_np = []
    movie_id = 1

    for i, j in zip(df_nan['index'][1:], df_nan['index'][:-1]):

        temp = np.full((1, i - j - 1), movie_id)
        movie_np = np.append(movie_np, temp)
        movie_id += 1
    '''
    The dataset is now very large, so it needs some preprocessing:
    drop movies with too few reviews (they are relatively unpopular) and
    drop customers with too few reviews (they are relatively inactive).
    In the rating matrix, unpopular movies and inactive customers take up as many
    cells as popular movies and active customers do.
    '''
    # Filter out movies with very few ratings
    f = ['count', 'mean']
    # Group by itemid and aggregate the ratings to get count and mean
    df_movie_summary = data.groupby('itemid')['rating'].agg(f)
    # Cast the summary index to int
    df_movie_summary.index = df_movie_summary.index.map(int)
    movie_benchmark = round(df_movie_summary['count'].quantile(0.8), 0)
    # Movies below the popularity benchmark will be dropped
    drop_movie_list = df_movie_summary[
        df_movie_summary['count'] < movie_benchmark].index
    print('Movie minimum times of review: {}'.format(movie_benchmark))

    # Filter out inactive users
    df_cust_summary = data.groupby('userid')['rating'].agg(f)
    df_cust_summary.index = df_cust_summary.index.map(int)
    cust_benchmark = round(df_cust_summary['count'].quantile(0.8), 0)
    drop_cust_list = df_cust_summary[
        df_cust_summary['count'] < cust_benchmark].index
    print('Customer minimum times of review: {}'.format(cust_benchmark))

    print('Original Shape: {}'.format(data.shape))
    df = data[~data['itemid'].isin(drop_movie_list)]
    df = df[~df['userid'].isin(drop_cust_list)]
    print('After Trim Shape: {}'.format(df.shape))
    df_p = pd.pivot_table(df,
                          values='rating',
                          index='userid',
                          columns='itemid')

    # Set up the reader for training
    reader = Reader()

    # Define the algorithm
    svd = SVD()

    # Select the chosen user's high ratings
    user_some = data[(data['userid'] == user_id) & (data['rating'] > 3)]
    user_some = user_some.set_index('itemid')
    user_some = user_some.reset_index()
    user_some = user_some[~user_some['itemid'].isin(drop_movie_list)]

    # Load the trimmed ratings into a Surprise dataset
    data = Dataset.load_from_df(df[['userid', 'itemid', 'rating']], reader)

    # Convert the dataset into a trainset and fit the model
    trainset = data.build_full_trainset()
    svd.fit(trainset)

    user_some['Estimate_Score'] = user_some['itemid'].apply(
        lambda x: svd.predict(user_id, x).est)
    user_some = user_some.sort_values('Estimate_Score', ascending=False)

    return user_some.head(topk)
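

# A hypothetical invocation; the user id and top-k value are illustrative only.
if __name__ == '__main__':
    print(predict(user_id=1, topk=10))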
Example No. 20
import os
import sys

from surprise import (SVD, SVDpp, NMF, SlopeOne, KNNBasic, KNNWithMeans,
                      KNNBaseline, CoClustering, BaselineOnly, NormalPredictor,
                      Dataset, Reader)
from surprise.model_selection import cross_validate

sys.path.insert(1, './')

from auto_surprise.engine import Engine

if __name__ == '__main__':
    print("Starting benchmark")
    # Surprise algorithms to evaluate
    algorithms = (SVD, SVDpp, NMF, SlopeOne, KNNBasic, KNNWithMeans, KNNBaseline, CoClustering, BaselineOnly, NormalPredictor)

    # Load dataset
    file_path = os.path.expanduser('../datasets/libimseti/ratings.dat')
    reader = Reader(line_format='user item rating timestamp', sep=',', rating_scale=(1, 10))
    data = Dataset.load_from_file(file_path, reader=reader)

    benchmark_results = {
        'Algorithm': [],
        'RMSE': [],
        'MAE': [],
        'Best params': [],
        'Time': []
    }

    # Evaluate Surprise Algorithms
    for algo in algorithms:
        algo_name = algo.__name__

        print("Running algorithm : %s" % algo_name)
Example No. 21
File: SVD++.py  Project: pqz793/Web
from surprise import SVDpp
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
import os

# Load the movielens-100k dataset (download it if needed).
#data = Dataset.load_builtin('ml-100k')
file_path = os.path.expanduser("../train1.csv")
#reader = Reader(line_format="user item rating timestamp", sep=',')
reader = Reader(line_format="user item rating timestamp",
                sep=',',
                rating_scale=(0, 5))

data = Dataset.load_from_file(file_path, reader=reader)
'''
file_path1 = os.path.expanduser("../test1.csv")

reader1 = Reader(line_format="user item rating", sep=',')

data = Dataset.load_from_file(file_path, reader=reader)
data1 = Dataset.load_from_file(file_path1, reader=reader1)
'''
trainset = data.build_full_trainset()
#testset = data1.build_full_trainset()
# Use the famous SVD algorithm.
algo = SVDpp()

# Run 5-fold cross-validation and print results.
#cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
algo.fit(trainset)
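
# test1.csv appears only in the commented-out block above; assuming it shares the training
# file's format, a held-out evaluation could look like this sketch:
from surprise import accuracy

held_out = Dataset.load_from_file(os.path.expanduser("../test1.csv"), reader=reader)
held_out_testset = held_out.build_full_trainset().build_testset()
accuracy.rmse(algo.test(held_out_testset), verbose=True)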
Example No. 22
#url = st.text_input('Enter the path for the data')
st.write('The data is loaded')
#data_load_state = st.text('Loading the data')
data = ds.get_data(option1)
st.write(data)

#data = ds.get_data(_file_path, 'data/data_subset.csv', 0.99)
#data = ds.get_data('/Users/lalitharahul/Desktop/AutoRecommender/RecServe/sample_us.tsv')
#data = ds.get_data(url)
#st.write(data)
#data_load_state.text('Data is preprocessed')
data_surprise = data[['customer_id', 'product_id', 'star_rating']]. \
rename(columns={'customer_id': 'userID', 'product_id': 'itemID', 'star_rating': 'rating'})

reader = Reader(rating_scale=(1.0, 5.0))
df_loaded = Dataset.load_from_df(data_surprise, reader)
#trainset = df_loaded.build_full_trainset()
results_list = []

# features
reviews = data.shape[0]
n_users = data.customer_id.nunique()
n_products = data.product_id.nunique()
mean_rating = data.star_rating.mean()
rating_std = data.star_rating.std()
sparsity = reviews * 100 / (n_users * n_products)
url = st.sidebar.text_input('Enter CustomerID')
#if st.button('Enter'):
# st.write('Entered customer id')
#st.write('The Entered Customer Id is', url)
"""

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)
import os

from surprise import BaselineOnly
from surprise import Dataset
from surprise import evaluate
from surprise import Reader

# path to dataset folder
files_dir = os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/')

# This time, we'll use the built-in reader.
reader = Reader('ml-100k')

# folds_files is a list of tuples containing file paths:
# [(u1.base, u1.test), (u2.base, u2.test), ... (u5.base, u5.test)]
train_file = files_dir + 'u%d.base'
test_file = files_dir + 'u%d.test'
folds_files = [(train_file % i, test_file % i) for i in (1, 2, 3, 4, 5)]

data = Dataset.load_from_folds(folds_files, reader=reader)

# We'll use an algorithm that predicts baseline estimates.
algo = BaselineOnly()

# Evaluate performances of our algorithm on the dataset.
evaluate(algo, data)
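
# evaluate() was removed in recent scikit-surprise releases; with data loaded via
# load_from_folds, the same fold-by-fold evaluation can be written with PredefinedKFold:
from surprise import accuracy
from surprise.model_selection import PredefinedKFold

for fold_trainset, fold_testset in PredefinedKFold().split(data):
    algo.fit(fold_trainset)
    accuracy.rmse(algo.test(fold_testset), verbose=True)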
Example No. 24
def main():
    # Load full training data
    with open('data/train_users_04.p', 'rb') as f:
        users = pickle.load(f)

    f = open('data/train_users.csv', 'w')

    for user in users:
        if users[user] != {}:
            # zscores = stats.zscore(list(users[user].values()))
            # items = [(a[0], zscores[i]) for i, a
            #  in enumerate(users[user].items())]
            for i in users[user].items():
                t = float(i[1])
                if t > 6:
                    t = 6
                elif t < 1:
                    t = 1
                f.write('%s\t%s\t%.03f\n' % (user, i[0], t))
    f.close()

    print("finished train data")

    # Load full test data
    with open('data/test_users_04.p', 'rb') as f:
        users = pickle.load(f)

    f = open('data/test_users.csv', 'w')

    for user in users:
        if users[user] != {}:
            # zscores = stats.zscore(list(users[user].values()))
            # items = [(a[0], zscores[i]) for i,
            #  a in enumerate(users[user].items())]
            for i in users[user].items():
                t = float(i[1])
                if t > 6:
                    t = 6
                elif t < 1:
                    t = 1
                f.write('%s\t%s\t%.03f\n' % (user, i[0], t))
    f.close()

    print("finished test data")

    ######### START OF TRAINING #########

    reader = Reader(line_format='user item rating',
                    sep='\t',
                    rating_scale=(1, 6))

    train_data = Dataset.load_from_file('data/train_users.csv', reader=reader)
    #                  .build_full_trainset()

    with open('data/test_users.csv', 'r') as f:
        s = list(map(lambda x: tuple(x.split('\t')), f.read().split('\n')))
        test_data = []
        for x in s:
            if len(x) > 2:
                test_data.append((x[0], x[1], float(x[2])))

    # print(data.ur)

    # algo = KNNBasic(sim_options={'name': 'cosine'})
    # algo = NMF(verbose=True)
    algo = SVD(verbose=True)
    # algo = NormalPredictor()
    algo.fit(train_data.build_full_trainset())

    # cross_validate(algo, train_data, verbose=True)

    # print(algo.predict('76561197960675902', '70', r_ui=63, verbose=True))
    # print(algo.predict('76561197960675902', '4540', r_ui=22, verbose=True))
    # print(algo.predict('76561197960675902', '550', r_ui=791, verbose=True))
    # print(algo.predict('76561197960675902', '10190', r_ui=1253, verbose=True))
    # print(algo.predict('76561197960675902', '10', r_ui=1037, verbose=True))

    predictions = algo.test(test_data)

    print(accuracy.rmse(predictions))
import pandas as pd
from surprise import Dataset
from surprise import BaselineOnly
from surprise import Reader
import sys

input_path = sys.argv[1] + "yelp_train.csv"
test_file_name = sys.argv[2]
result_file_name = sys.argv[3]

reader = Reader(rating_scale=(0, 5))
train_read = pd.read_csv(input_path)
test_read = pd.read_csv(test_file_name)

train_load = Dataset.load_from_df(train_read,
                                  reader=reader).build_full_trainset()
test_load = Dataset.load_from_df(test_read,
                                 reader=reader).build_full_trainset()

bsl_options = {'method': 'als', 'n_epochs': 9, 'reg_u': 7.6, 'reg_i': 3.5}

algorithm = BaselineOnly(bsl_options=bsl_options)
results = algorithm.fit(train_load).test(test_load.build_testset())

with open(result_file_name, "w") as fout:
    fout.write("user_id, business_id, prediction\n")
    for p in results:
        fout.write(str(p.uid) + "," + str(p.iid) + "," + str(p.est) + "\n")
Example No. 26
import os
import random

from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise import KNNBaseline
from surprise import evaluate
from surprise import GridSearch

# the test and train files are from the ml-100k dataset (10% of u1.base and
# 10 % of u1.test)
train_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train')
test_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
data = Dataset.load_from_folds([(train_file, test_file)], Reader('ml-100k'))

random.seed(0)


def test_grid_search_cv_results():
    """Ensure that the number of parameter combinations is correct."""
    param_grid = {
        'n_epochs': [1, 2],
        'lr_all': [0.002, 0.005],
        'reg_all': [0.4, 0.6],
        'n_factors': [1],
        'init_std_dev': [0]
    }
    grid_search = GridSearch(SVD, param_grid)
    grid_search.evaluate(data)
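

# GridSearch and evaluate() are gone from recent scikit-surprise releases; the same check
# with the current API would pair GridSearchCV with PredefinedKFold, since the data above
# comes from load_from_folds (a sketch under those assumptions):
def test_grid_search_cv_results_new_api():
    from surprise.model_selection import GridSearchCV, PredefinedKFold

    param_grid = {
        'n_epochs': [1, 2],
        'lr_all': [0.002, 0.005],
        'reg_all': [0.4, 0.6],
        'n_factors': [1],
        'init_std_dev': [0]
    }
    gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=PredefinedKFold())
    gs.fit(data)
    assert len(gs.cv_results['params']) == 8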
drop_movie_list = df_movie_summary[
    df_movie_summary['count'] < movie_benchmark].index

df_cust_summary = df.groupby('Cust_Id')['Rating'].agg(['count', 'mean'])
df_cust_summary.index = df_cust_summary.index.map(int)
cust_benchmark = round(df_cust_summary['count'].quantile(0.8), 0)
drop_cust_list = df_cust_summary[
    df_cust_summary['count'] < cust_benchmark].index

# pivot the data set to make a matrix form
df_pivot = pd.pivot_table(df,
                          values='Rating',
                          index='Cust_Id',
                          columns='Movie_Id')

# Now start collaborative filtering
reader = Reader()

algo = SVD()

# Take input for Customer ID
customerID = input("Customer ID Please: ")
customerID = int(customerID)
print("We are recommending Movies for Customer ID: ", customerID)

number = input("Please input the total number of movies to be recommended: ")
number = int(number)
print("Number of Movies Being Recommended: ", number)

sdata = df[(df['Cust_Id'] == customerID)]
print(sdata)
sdata = sdata.set_index('Movie_Id')
Example No. 28
def train(params):

    # Load the training data and create a df for training
    temp_df = read_dataframe(params.client_id, params.source_bucket, 'buy.csv')
    raw_df = pd.DataFrame(
        data={
            'entity_id': temp_df['entity_id'],
            'target_entity_id': temp_df['target_entity_id']
        })
    raw_df['rating'] = MAX_RATING

    # create the training set
    reader = Reader(rating_scale=(0, MAX_RATING))
    data = Dataset.load_from_df(
        raw_df[['entity_id', 'target_entity_id', 'rating']], reader)
    training_data = data.build_full_trainset()

    # Find optimal parameters
    print(' --> fitting the model')

    param_grid = {
        'n_epochs': [10, 30],
        'lr_all': [0.002, 0.005],
        'reg_all': [0.2, 0.6]
    }
    gs = GridSearchCV(SVDpp, param_grid, measures=['rmse', 'mae'], cv=3)
    gs.fit(data)

    print(gs.best_score['rmse'])
    print(gs.best_params['rmse'])

    # Build a model and train it
    print(' --> build the model')

    #model = SVD()
    model = gs.best_estimator['rmse']
    model.fit(training_data)

    # Batch predictions
    print(' --> batch predictions')

    unique_entity = np.unique(raw_df.entity_id.values)
    unique_target_entity = np.unique(raw_df.target_entity_id.values)

    px = pd.DataFrame(-1.0,
                      index=unique_entity,
                      columns=unique_target_entity,
                      dtype=np.float64)
    predx = training_data.build_anti_testset(fill=0)

    for p in predx:
        # predict() expects raw ids; build_anti_testset() already yields raw (uid, iid) pairs
        pred = model.predict(p[0], p[1])
        px.at[p[0], p[1]] = round(pred.est, PRECISION)

    # create the export
    print(' --> create export')
    ex = pd.DataFrame(index=unique_entity,
                      columns=['entity_type', 'target_entity_type', 'values'])

    for id in unique_entity:
        p1 = px.loc[id, :]
        p2 = p1.sort_values(ascending=False)
        p3a = p2[p2 > FILTER_THRESHOLD]
        p3 = p3a[p3a < 1.0].head(MAX_PREDICTION)
        t = zip(p3.index.tolist(), p3.values)
        tf = [item for sublist in t for item in sublist]

        ex.at[id, 'entity_type'] = 'user'
        ex.at[id, 'target_entity_type'] = 'item'
        ex.at[id, 'values'] = tf

    write_dataframe(params.job_id, params.job_dir, 'pred_user.csv',
                    ['entity_type', 'target_entity_type', 'values'],
                    'entity_id', ex)
Example No. 29
from surprise import NMF
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate

# Load the sushi preference dataset (converted to Surprise's plain-text format).
reader = Reader(line_format='user item rating', sep=' ')
dataset = Dataset.load_from_file(
    './sushi3-2016/sushi3b.5000.10.score_converted', reader=reader)
trainset = dataset.build_full_trainset()

# We'll use the famous NMF algorithm.
algo = NMF()
algo.fit(trainset)

# Run 5-fold cross-validation and print results
cross_validate(algo, dataset, measures=['RMSE', 'MAE'], cv=5, verbose=True)
Example No. 30
from surprise import KNNBaseline, Reader
from surprise import Dataset

import os
import pickle

# Rebuild the playlist-id -> playlist-name mapping dict
id_name_dic = pickle.load(open("popular_playlist.pk1", "rb"))
print("Finished loading the playlist-id -> playlist-name mapping dict....")
# Rebuild the playlist-name -> playlist-id mapping dict
name_id_dic = {}
for playlist_id in id_name_dic:
    name_id_dic[id_name_dic[playlist_id]] = playlist_id
print("Finished loading the playlist-name -> playlist-id mapping dict.....")

file_path = os.path.expanduser("./popular_music_suprise_format.txt")
# Specify the file format
reader = Reader(line_format="user item rating timestamp", sep=',')
# Read the data from the file
music_data = Dataset.load_from_file(file_path, reader=reader)
# Compute similarities between songs
print("Building the dataset......")
trainset = music_data.build_full_trainset()

# Template: find the nearest users (playlists)
print("Training the model.....")
algo = KNNBaseline()
algo.fit(trainset)

current_playlist = list(name_id_dic.keys())[39]
print("Playlist name:", current_playlist)

# Get the nearest neighbours
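# The excerpt stops here; the usual continuation (an assumption based on the standard
# KNNBaseline neighbour-lookup pattern) maps the playlist name to its inner id and asks
# the trained model for the nearest playlists:
playlist_id = name_id_dic[current_playlist]
playlist_inner_id = algo.trainset.to_inner_uid(playlist_id)
neighbour_inner_ids = algo.get_neighbors(playlist_inner_id, k=10)
neighbour_playlist_ids = [algo.trainset.to_raw_uid(inner_id)
                          for inner_id in neighbour_inner_ids]
print("The 10 playlists most similar to <", current_playlist, "> are:")
for raw_id in neighbour_playlist_ids:
    print(id_name_dic[raw_id])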