Code Example #1
from surprise import KNNBaseline
from surprise.model_selection import train_test_split, LeaveOneOut

class Appraisal:

    def __init__(self, data, popularityRankings):
        self.rankings = popularityRankings

        # Build a full training set for evaluating overall properties
        self.fullTrainSet = data.build_full_trainset()
        self.fullAntiTestSet = self.fullTrainSet.build_anti_testset()

        # Build a 75/25 train/test split for measuring accuracy
        self.trainSet, self.testSet = train_test_split(data, test_size=.25, random_state=1)

        # Build a "leave one out" train/test split for evaluating top-N recommenders
        # And build an anti-test-set for building predictions
        LOOCV = LeaveOneOut(n_splits=1, random_state=1)
        for train, test in LOOCV.split(data):
            self.LOOCVTrain = train
            self.LOOCVTest = test

        self.LOOCVAntiTestSet = self.LOOCVTrain.build_anti_testset()

        # Compute similarity matrix between items so we can measure diversity
        sim_options = {'name': 'cosine', 'user_based': False}
        self.simsAlgo = KNNBaseline(sim_options=sim_options)
        self.simsAlgo.fit(self.fullTrainSet)

    def EntireTraining(self):
        return self.fullTrainSet

    def EntireTesting(self):
        return self.fullAntiTestSet

    def OppositeTesting(self, USER):
        trainset = self.fullTrainSet
        fill = trainset.global_mean
        anti_testset = []
        u = trainset.to_inner_uid(str(USER))
        user_items = set([j for (j, _) in trainset.ur[u]])
        anti_testset += [(trainset.to_raw_uid(u), trainset.to_raw_iid(i), fill) for
                         i in trainset.all_items() if
                         i not in user_items]
        return anti_testset

    def Training(self):
        return self.trainSet

    def Testing(self):
        return self.testSet

    def LeaveOneOutTraining(self):
        return self.LOOCVTrain

    def LeaveOneOutTesting(self):
        return self.LOOCVTest

    def LeaveOneOutTestingOppositeTesting(self):
        return self.LOOCVAntiTestSet

    def Closeness(self):
        return self.simsAlgo

    def Celebrity(self):
        return self.rankings
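
A minimal driver for the class above, assuming the MovieLens helper module used in Code Example #2 (MovieLens, loadMovieLensLatestSmall and getPopularityRanks are project-local names, not part of Surprise):

# Hypothetical usage sketch for Appraisal.
from MovieLens import MovieLens

ml = MovieLens()
data = ml.loadMovieLensLatestSmall()
rankings = ml.getPopularityRanks()

ev = Appraisal(data, rankings)
print(ev.Training().n_ratings)  # number of ratings in the 75% training split
print(len(ev.Testing()))        # number of ratings in the 25% test split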
Code Example #2
from MovieLens import MovieLens  # project-local helper module (assumed import path)
from surprise import SVD, KNNBaseline
from surprise.model_selection import train_test_split, LeaveOneOut
from RecommenderMetrics import RecommenderMetrics

ml = MovieLens()

print("Loading movie ratings...")
data = ml.loadMovieLensLatestSmall()

print("\nComputing movie popularity ranks so we can measure novelty later...")
rankings = ml.getPopularityRanks()

print("\nComputing item similarities so we can measure diversity later...")
fullTrainSet = data.build_full_trainset()
sim_options = {'name': 'pearson_baseline', 'user_based': False}
simsAlgo = KNNBaseline(sim_options=sim_options)
simsAlgo.fit(fullTrainSet)

print("\nBuilding recommendation model...")
trainSet, testSet = train_test_split(data, test_size=.25, random_state=1)

algo = SVD(random_state=10)
algo.fit(trainSet)

print("\nComputing recommendations...")
predictions = algo.test(testSet)

print("\nEvaluating accuracy of model...")
print("RMSE: ", RecommenderMetrics.RMSE(predictions))
print("MAE: ", RecommenderMetrics.MAE(predictions))
Code Example #3
from surprise import SVD, KNNBasic, KNNWithMeans, KNNBaseline, NMF, SlopeOne, CoClustering, BaselineOnly, NormalPredictor
'''
    "SVD" -- https://en.wikipedia.org/wiki/Singular_value_decomposition
    "KNN" -- https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm
    "Centered KNN" -- KNN with mean user ratings considered 
    "KNN with Baseline" -- KNN with baseline considered 
    "NMF" -- https://en.wikipedia.org/wiki/Non-negative_matrix_factorization
    "SlopeOne" -- https://en.wikipedia.org/wiki/Slope_One
    "CoClustering" -- https://en.wikipedia.org/wiki/Biclustering
    "BaselineOnly" -- baseline predicted for specific user/item
    "NormalPredictor" -- predict random rating from normal distribution 

    https://surprise.readthedocs.io/en/stable/basic_algorithms.html#surprise.prediction_algorithms.baseline_only.BaselineOnly

'''
labels = [
    "SVD", "KNN", "Centered KNN", "KNN with Baseline", "NMF", "SlopeOne",
    "CoClustering", "BaselineOnly", "NormalPredictor"
]

algorithms = [
    SVD(),
    KNNBasic(),
    KNNWithMeans(),
    KNNBaseline(),
    NMF(),
    SlopeOne(),
    CoClustering(),
    BaselineOnly(),
    NormalPredictor()
]
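
Since the two lists are parallel, a simple benchmarking loop can zip them together. A minimal sketch, assuming a Surprise Dataset has already been loaded into data:

from surprise.model_selection import cross_validate

# Cross-validate every algorithm in the list and report its mean RMSE (3 folds).
for label, algorithm in zip(labels, algorithms):
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
    print('{:20s} RMSE: {:.4f}'.format(label, results['test_rmse'].mean()))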
Code Example #4
    def train_knn(self, df, userId, user_m_ids, movies_watched):
        """
            param: df - movies pandas DataFrame
                   userId - user ID to predict movies with
                   user_m_ids - List of movieIDs of movies to be recommended upon
                                (as seen in TMDB dataset)
                   movies_watched - List of movie titles watched 
                                    (as seen in TMDB dataset)

            return: pandas DataFrame of the recommended movies with attributes - 
                    title, id, vote_average, vote_count, popularity, release date

            Collaborative filtering is done using KNN-Baseline and prediction is 
            done using pearson_baseline. The technique used is item-item based.
		"""
        reader = Reader(rating_scale=(1, 5))
        movie_ids = self.get_movie_ids()
        rec_result = dict()

        sim_options = {"name": "pearson_baseline", "user_based": False}

        data = Dataset.load_from_df(df[RATING_ATTR], reader)
        if isfile(PATH_COLL_FILTERING_CACHE):
            model = joblib.load(PATH_COLL_FILTERING_CACHE)
        else:
            trainset = data.build_full_trainset()
            model = KNNBaseline(sim_options=sim_options)
            model.fit(trainset)
            joblib.dump(model, PATH_COLL_FILTERING_CACHE)

        inn_id = model.trainset.to_inner_iid(user_m_ids[0])
        # print(self.get_movie_title(self.get_tmdb_id(user_m_ids[0])))
        inn_id_neigh = model.get_neighbors(inn_id, k=10)
        # print(inn_id_neigh)

        df_pref = pd.DataFrame(columns=[
            "title",
            "id",
            "vote_average",
            "vote_count",
            "popularity",
            "release_date",
        ])
        index = 0

        for m_id in inn_id_neigh:
            title_df = self.get_movie_title(
                self.get_tmdb_id(model.trainset.to_raw_iid(m_id)))
            try:
                if title_df[0] not in movies_watched:
                    df_pref.loc[index] = array([
                        title_df[0],
                        title_df[1],
                        title_df[2],
                        title_df[3],
                        title_df[4],
                        title_df[5],
                    ])
                    index += 1
            except Exception:
                # skip movies whose metadata lookup fails
                pass

        return df_pref
Code Example #5
            line = line.split('|')
            rid_to_name[line[0]] = line[1]
            name_to_rid[line[1]] = line[0]

    return rid_to_name, name_to_rid


# First, train the algorithm to compute the similarities between items
#data = Dataset.load_builtin('ml-100k')
file_path = 'ratings_android.dat'
reader = Reader(line_format='user item rating', rating_scale=(1, 5), sep=' ')
data = Dataset.load_from_file(file_path, reader=reader)

trainset = data.build_full_trainset()
sim_options = {'name': 'pearson_baseline', 'user_based': False}  # item-item similarities
algo = KNNBaseline(sim_options=sim_options)
algo.fit(trainset)  # train() was removed from Surprise; fit() is the current API

# Read the mappings raw id <-> movie name
#rid_to_name, name_to_rid = read_item_names()

# Retrieve inner id of the movie Toy Story
#toy_story_raw_id = name_to_rid['Toy Story (1995)']
#toy_story_inner_id = algo.trainset.to_inner_iid(toy_story_raw_id)

# Retrieve inner ids of the nearest neighbors of Toy Story.
toy_story_neighbors = algo.get_neighbors(1, k=10)

# Convert inner ids of the neighbors into names.
#toy_story_neighbors = (algo.trainset.to_raw_iid(inner_id)
#                       for inner_id in toy_story_neighbors)
Code Example #6
def best_pred():
    review['새주소'] = review['장소'] + "*" + review['주소']
    review2 = review.drop([
        '장소', '주소', '위도', '경도', '분류', '대분류', '주소1', '주소2', '방문횟수', '년도', '월',
        '계절'
    ],
                          axis=1)
    review2 = review2[['이름', '새주소', '별점']]

    # Reduce the dimensionality of the dataset:
    # exclude places and users with too few ratings
    min_ratings = 50
    filter_review = review2['새주소'].value_counts() > min_ratings
    filter_review = filter_review[filter_review].index.tolist()

    min_user_ratings = 50
    filter_users = review2['이름'].value_counts() > min_user_ratings
    filter_users = filter_users[filter_users].index.tolist()

    review_new = review2[(review2['새주소'].isin(filter_review))
                         & (review2['이름'].isin(filter_users))]

    reader = Reader(rating_scale=(0, 5))
    data = Dataset.load_from_df(review_new[['이름', '새주소', '별점']], reader)

    benchmark = []
    # Iterate over all algorithms
    for algorithm in [
            SVD(),
            SVDpp(),
            SlopeOne(),
            NMF(),
            NormalPredictor(),
            KNNBaseline(),
            KNNBasic(),
            KNNWithMeans(), KNNWithZScore(),
            BaselineOnly(),
            CoClustering()
    ]:
        # Perform cross validation with this algorithm
        algo = algorithm
        results = cross_validate(algo,
                                 data,
                                 measures=['RMSE'],
                                 cv=3,
                                 verbose=False)
        trainset, testset = train_test_split(data, test_size=0.25)
        predictions = algo.fit(trainset).test(testset)
        # accuracy.rmse(predictions)

        # Get results & append algorithm name
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)
        tmp = pd.concat([
            tmp,
            pd.Series([str(algorithm).split(' ')[0].split('.')[-1]],
                      index=['Algorithm'])
        ])
        benchmark.append(tmp)

    surprise_results = pd.DataFrame(benchmark).set_index(
        'Algorithm').sort_values('test_rmse')

    # Train and Predict
    # The CoClustering algorithm showed the best RMSE, so CoClustering is
    # used for training and prediction, together with alternating least
    # squares (ALS)
    algo = CoClustering()
    cross_validate(algo, data, measures=['RMSE'], cv=3, verbose=False)

    # Use train_test_split() to sample a trainset and a testset,
    # with RMSE as the accuracy metric.
    # fit() trains the algorithm on the trainset, and test() returns the
    # predictions generated from the testset.
    trainset, testset = train_test_split(data, test_size=0.25)
    # algo = BaselineOnly(bsl_options=bsl_options)
    algo = CoClustering()
    predictions = algo.fit(trainset).test(testset)

    # dump.dump('./dump_file',predictions, algo)
    # predictions, algo = dump.load('./dump_file')

    trainset = algo.trainset

    # To inspect the predictions closely, build a DataFrame of all predictions

    def get_Iu(uid):
        try:
            return len(trainset.ur[trainset.to_inner_uid(uid)])
        except ValueError:  # user was not part of the trainset
            return 0

    def get_Ui(iid):
        try:
            return len(trainset.ir[trainset.to_inner_iid(iid)])
        except ValueError:
            return 0

    df = pd.DataFrame(predictions,
                      columns=['uid', 'iid', 'rui', 'est', 'details'])
    df['Iu'] = df.uid.apply(get_Iu)
    df['Ui'] = df.iid.apply(get_Ui)
    df['err'] = abs(df.est - df.rui)

    predictions = df.sort_values(by='err').drop_duplicates('iid')

    best_predictions = predictions[:100].copy()
    worst_predictions = predictions[-10:]

    best_predictions['iid'] = best_predictions.iid.str.split('*').str[0]

    sql = "insert into rec(rec_uid, rec_iid, rec_rui, rec_est) values(:rec_uid, :rec_iid, :rec_rui, :rec_est)"
    data = best_predictions[['uid', 'iid', 'rui', 'est']]
    data.columns = ['rec_uid', 'rec_iid', 'rec_rui', 'rec_est']
    cursor.close()
    conn.close()
    return data
Code Example #7
# sim_options = {'name': 'cosine', 'user_based': False}
# algo = KNNBaseline(k = 10, sim_options = sim_options)
# cross_validate(algo, data, cv = 5, verbose = True)

# use Gridsearch to find the best parameters 
param_grid_knnbl = {'k': list(range(10, 50, 10)),
                    'sim_options': {'name': ['cosine', 'pearson'],
                                    'user_based': [False]}}
gs_knnbl = GridSearchCV(KNNBaseline, param_grid_knnbl, measures = ['rmse','mae'], cv = 5)
gs_knnbl.fit(samp_dat)
algo_knnbl = gs_knnbl.best_estimator['rmse']
print(gs_knnbl.best_score['rmse'])
print(gs_knnbl.best_params['rmse'])

# Use the new parameters with the sampled training data
algo_knnbl = KNNBaseline(k = 20, sim_options = {'name': 'pearson', 'user_based': False})
fit_rmse(algo_knnbl, samp_dat)

output(algo_knnbl, "KNNBaseline_k20_pearson_ii.csv")

#%% 
# SVD
algo_svd = SVD()
fit_rmse(algo_svd, tr_dat)
output(algo_svd, "SVD.csv")

#%%
# SVDpp
algo_svdpp = SVDpp()
fit_rmse(algo_svdpp, tr_dat)    
output(algo_svdpp, "SVDPP.csv")
from surprise import KNNBasic
from surprise.model_selection import cross_validate  # evaluate() was removed from Surprise

algo = KNNBasic()
perf = cross_validate(algo, music_data, measures=['RMSE', 'MAE'])

### Collaborative filtering with means
from surprise import KNNWithMeans

algo = KNNWithMeans()
perf = cross_validate(algo, music_data, measures=['RMSE', 'MAE'])

### Collaborative filtering with baselines
from surprise import KNNBaseline

algo = KNNBaseline()
perf = cross_validate(algo, music_data, measures=['RMSE', 'MAE'])

### Using SVD
from surprise import SVD

algo = SVD()
perf = cross_validate(algo, music_data, measures=['RMSE', 'MAE'])

### Using SVD++
from surprise import SVDpp

algo = SVDpp()
perf = cross_validate(algo, music_data, measures=['RMSE', 'MAE'])

### Using NMF
Code Example #9
pred1 = algo1.predict(uid, iid, verbose=True)
#KNNWithMeans
algo2 = KNNWithMeans(k=30,
                     sim_options={
                         'name': 'cosine',
                         'user_based': False
                     },
                     verbose=True)
algo2.fit(trainset)
pred2 = algo2.predict(uid, iid, verbose=True)

#KNNWithZScore
algo3 = KNNWithZScore(k=30,
                      sim_options={
                          'name': 'msd',
                          'user_based': True
                      },
                      verbose=True)
algo3.fit(trainset)
pred3 = algo3.predict(uid, iid, verbose=True)
#KNNBaseline
algo4 = KNNBaseline(k=30,
                    sim_options={
                        'name': 'msd',
                        'user_based': True
                    },
                    verbose=True)
algo4.fit(trainset)
pred4 = algo4.predict(uid, iid, verbose=True)
Code Example #10
    rid_to_name = {}
    name_to_rid = {}
    with io.open(file_name, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            line = line.split('|')
            rid_to_name[line[0]] = line[1]
            name_to_rid[line[1]] = line[0]

    return rid_to_name, name_to_rid


# First, train the algorithm to compute the similarities between items
data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()
sim_options = {'name': 'pearson_baseline', 'user_based': False}
algo = KNNBaseline(sim_options=sim_options)
algo.fit(trainset)

# Read the mappings raw id <-> movie name
rid_to_name, name_to_rid = read_item_names()

# Retrieve inner id of the movie
movie_raw_id = name_to_rid['Clockwork Orange, A (1971)']
movie_inner_id = algo.trainset.to_inner_iid(movie_raw_id)
movie_neighbors = algo.get_neighbors(movie_inner_id, k=10)

# Convert inner ids of the neighbors into names.
movie_neighbors = (algo.trainset.to_raw_iid(inner_id)
                   for inner_id in movie_neighbors)
movie_neighbors = (rid_to_name[rid] for rid in movie_neighbors)
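
The two generator expressions above yield the neighbour names lazily; a short loop, mirroring the Surprise FAQ example this snippet follows, prints them:

print('The 10 nearest neighbors of "Clockwork Orange, A (1971)" are:')
for movie in movie_neighbors:
    print(movie)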
Code Example #11
class recsysBase:
    data = None
    trainset = None
    testset = None
    algorithm = ''
    algo = None
    predictions = None

    def __init__(self,
                 data,
                 algorithm='svd',
                 algo_options={},
                 testset_percent=0):
        if not data:
            return

        self.data = data
        self.algorithm = algorithm

        ##
        if testset_percent == 0:
            self.trainset = self.data.build_full_trainset()
            self.testset = self.trainset.build_anti_testset()
        else:
            self.trainset, self.testset = train_test_split(
                self.data, test_size=testset_percent)

        if self.algorithm == 'svd':
            self.algo = SVD()
        elif self.algorithm == 'knn_basic':
            self.algo = KNNBasic()
        elif self.algorithm == 'knn_baseline':
            if not algo_options:
                algo_options = {
                    'name': 'pearson_baseline',
                    'user_based': False
                }

            self.algo = KNNBaseline(sim_options=algo_options)

        self.algo.fit(self.trainset)

    def exec(self):
        self.step1()
        self.step2()
        self.step3()

    def step1(self):
        pass

    def step2(self):
        pass

    def step3(self):
        pass

    def compute_rmse(self):
        if not self.predictions:
            self.test()

        accuracy.rmse(self.predictions)

    def load_from_file(self, file_path='predictions.csv'):
        self.predictions = pd.read_csv(file_path)

    def save_to_file(self, file_path='predictions.csv'):
        pd.DataFrame(self.predictions).to_csv(file_path, index=False)

    def benchmark(self):
        cross_validate(self.algo,
                       self.data,
                       measures=['RMSE', 'MAE'],
                       cv=5,
                       verbose=True)

    def tune(self,
             opt_field='rmse',
             param_grid={
                 'n_epochs': [5, 10],
                 'lr_all': [0.002, 0.005],
                 'reg_all': [0.4, 0.6]
             },
             SHOW_RESULT=False):

        if self.algorithm == 'svd':
            gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

        ## Start tuning
        gs.fit(self.data)

        ## Save to self.algo
        self.algo = gs.best_estimator[opt_field]
        self.algo.fit(self.trainset)

        if SHOW_RESULT:
            # best RMSE score
            print(gs.best_score['rmse'])

            # combination of parameters that gave the best RMSE score
            print(gs.best_params['rmse'])

        return self

    def tune_and_test(self,
                      unbiased_percent=0.1,
                      opt_field='rmse',
                      param_grid={
                          'n_epochs': [5, 10],
                          'lr_all': [0.001, 0.01]
                      }):

        ## Get RAW
        raw_ratings = self.data.raw_ratings

        ## Shuffle ratings if you want
        random.shuffle(raw_ratings)

        ##
        threshold = int((1 - unbiased_percent) * len(raw_ratings))
        A_raw_ratings = raw_ratings[:threshold]
        B_raw_ratings = raw_ratings[threshold:]

        data = self.data
        data.raw_ratings = A_raw_ratings

        ## Select your best algo with grid search.
        grid_search = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
        grid_search.fit(data)

        self.algo = grid_search.best_estimator[opt_field]

        # retrain on the whole set A
        trainset = data.build_full_trainset()
        self.algo.fit(trainset)

        # Compute biased accuracy on A
        predictions = self.algo.test(trainset.build_testset())
        print('Biased accuracy on A,', end='   ')
        accuracy.rmse(predictions)

        # Compute unbiased accuracy on B
        testset = data.construct_testset(
            B_raw_ratings)  # testset is now the set B
        predictions = self.algo.test(testset)
        print('Unbiased accuracy on B,', end=' ')
        accuracy.rmse(predictions)

        return self

    def test(self):
        self.predictions = self.algo.test(self.testset)
        self.compute_rmse()

    def get_top_n(self, target_uid=None, n=10, SHOW_RESULT=False):
        '''Return the top-N recommendations for each user from this model's predictions.

        Args:
            target_uid: if given, only this user's top-N list is returned.
            n(int): The number of recommendations to output for each user. Default
                is 10.
            SHOW_RESULT(bool): if True, print the recommended items.

        Returns:
        A dict where keys are user (raw) ids and values are lists of tuples:
            [(raw item id, rating estimation), ...] of size n, or a single such
            list when target_uid is given.
        '''

        if target_uid:
            target_uid = str(target_uid)

        # Check if testset is valid
        if not self.predictions:
            self.predictions = self.algo.test(self.testset)

        # First map the predictions to each user.
        top_n = defaultdict(list)
        for uid, iid, true_r, est, _ in self.predictions:
            top_n[uid].append((iid, est))

        # Then sort the predictions for each user and retrieve the k highest ones.
        for uid, user_ratings in top_n.items():
            if target_uid and target_uid != uid:
                continue

            user_ratings.sort(key=lambda x: x[1], reverse=True)

            if target_uid:
                top_n = user_ratings[:n]
                break
            else:
                top_n[uid] = user_ratings[:n]

        # Print the recommended items for each user
        if SHOW_RESULT:
            try:
                for uid, user_ratings in top_n.items():
                    print(uid, [iid for (iid, _) in user_ratings])
            except AttributeError:
                # top_n is a plain list (not a dict) when target_uid was given
                print(top_n)

        return top_n

    def precision_recall_at_k(self,
                              target_uid=1,
                              threshold=3.5,
                              k=10,
                              num_of_testset=5,
                              SHOW_RESULT=True):
        ## target_uid:  User ID to get result
        ## threshold:   the lowerbound that the rating should be higher
        ## k:           to get number of relevant and recommended items in top k

        if target_uid:
            target_uid = str(target_uid)

        kf = KFold(n_splits=num_of_testset)

        final_precision = []
        final_recalls = []

        for trainset, testset in kf.split(self.data):
            self.algo.fit(trainset)
            predictions = self.algo.test(testset)
            '''Return precision and recall at k metrics for each user.'''
            # First map the predictions to each user.
            user_est_true = defaultdict(list)
            for uid, _, true_r, est, _ in predictions:
                user_est_true[uid].append((est, true_r))

            precisions = dict()
            recalls = dict()
            for uid, user_ratings in user_est_true.items():
                # Sort user ratings by estimated value
                user_ratings.sort(key=lambda x: x[0], reverse=True)

                # Number of relevant items
                n_rel = sum(
                    (true_r >= threshold) for (_, true_r) in user_ratings)

                # Number of recommended items in top k
                n_rec_k = sum(
                    (est >= threshold) for (est, _) in user_ratings[:k])

                # Number of relevant and recommended items in top k
                n_rel_and_rec_k = sum(
                    ((true_r >= threshold) and (est >= threshold))
                    for (est, true_r) in user_ratings[:k])

                # Precision@K: Proportion of recommended items that are relevant
                precisions[
                    uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1

                # Recall@K: Proportion of relevant items that are recommended
                recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1

            if SHOW_RESULT:
                print('Precision@K: ' + str(
                    sum(prec
                        for prec in precisions.values()) / len(precisions)))
                print('Recall@K: ' +
                      str(sum(rec for rec in recalls.values()) / len(recalls)))

            # the target user may be absent from this fold's testset
            final_precision.append(precisions.get(target_uid, 0))
            final_recalls.append(recalls.get(target_uid, 0))

        if SHOW_RESULT:
            print(final_precision, final_recalls)

        return final_precision, final_recalls

    def read_item_names(self,
                        file_name=get_dataset_dir() +
                        '/ml-100k/ml-100k/u.item'):
        """Read the u.item file from MovieLens 100-k dataset and return two
        mappings to convert raw ids into movie names and movie names into raw ids.
        """

        rid_to_name = {}
        name_to_rid = {}
        with io.open(file_name, 'r', encoding='ISO-8859-1') as f:
            for line in f:
                line = line.split('|')
                rid_to_name[line[0]] = line[1]
                name_to_rid[line[1]] = line[0]

        return rid_to_name, name_to_rid

    def get_k_neighbors(self, name='Toy Story (1995)', k=10, SHOW_RESULT=True):
        ###########################################
        ## You need to use algorithm='knn_baseline' at the beginning
        ###########################################
        if self.algorithm != 'knn_baseline':
            self.__init__(data=self.data,
                          algorithm='knn_baseline',
                          testset_percent=0)

        ###########################################
        ###########################################
        ## Read the mappings raw id <-> movie name
        rid_to_name, name_to_rid = self.read_item_names()

        ##
        input_raw_id = name_to_rid[name]
        input_inner_id = self.algo.trainset.to_inner_iid(input_raw_id)

        ## Retrieve inner ids of the nearest neighbors of Toy Story.
        input_neighbors = self.algo.get_neighbors(input_inner_id, k=k)

        ## Convert inner ids of the neighbors into names.
        input_neighbors = (self.algo.trainset.to_raw_iid(inner_id)
                           for inner_id in input_neighbors)
        input_neighbors = (rid_to_name[rid] for rid in input_neighbors)

        ## Show result
        if SHOW_RESULT:
            print('\nThe ' + str(k) + ' nearest neighbors of "' + name +
                  '" are:')

            for neighbor in input_neighbors:
                print(neighbor)

        return input_neighbors
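
A hypothetical driver for recsysBase, assuming a loaded Surprise Dataset in data (the user id 196 is a placeholder raw id from ml-100k):

# Hypothetical usage sketch for recsysBase.
rs = recsysBase(data, algorithm='svd', testset_percent=0.25)
rs.test()  # predicts on the held-out 25% and prints RMSE
top = rs.get_top_n(target_uid=196, n=5, SHOW_RESULT=True)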
Code Example #12
File: CF KNNBV01.py Project: Ge-Hub/week5_Stu_Ge-ZHU
print("KNNWithZScore Results:", perf)
print("-" * 118)

### Z-score collaborative filtering (user based)
from surprise import KNNWithZScore
algo = KNNWithZScore(k=50, sim_options={'user_based': True}, verbose=True)
perf = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=3)
print("KNNWithZScore Results:", perf)
print("-" * 118)

### Basic collaborative filtering
from surprise import KNNBasic
algo = KNNBasic(k=50, sim_options={'user_based': False}, verbose=True)
perf = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=3)
print("KNNBasic Results:", perf)
print("-" * 118)

### Collaborative filtering with means
from surprise import KNNWithMeans
algo = KNNWithMeans(k=50, sim_options={'user_based': False}, verbose=True)
perf = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=3)
print("KNNWithMeans Results:", perf)
print("-" * 118)

### Collaborative filtering with baselines
from surprise import KNNBaseline
algo = KNNBaseline(k=50, sim_options={'user_based': False}, verbose=True)
perf = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=3)
print("KNNBaseline Results:", perf)
print("-" * 118)
Code Example #13
''' We build the model using KNNBasic, a collaborative filtering based algorithm.
    We set the minimum number of neighbours (min_k) to 1 and the maximum number of neighbours (k) to 40.
    We train the model on the train set '''

algo2 = KNNBasic(sim_options=sim_options, k=40, min_k=1)
algo2.fit(trainset)

predictions2 = algo2.test(testset)
print("RMSE for KNNBasic:", accuracy.rmse(predictions2, verbose=True))

# In[ ]:
''' We build the model using KNNBaseline, a collaborative filtering based algorithm.
    We set the minimum number of neighbours (min_k) to 1 and the maximum number of neighbours (k) to 40.
    We train the model on the train set '''

algo3 = KNNBaseline(sim_options=sim_options, k=40, min_k=1)
algo3.fit(trainset)

predictions3 = algo3.test(testset)
print("RMSE for KNNBaseline:", accuracy.rmse(predictions3, verbose=True))

# In[ ]:
''' We build the model using KNNWithZScore, a collaborative filtering based algorithm.
    We set the minimum number of neighbours (min_k) to 1 and the maximum number of neighbours (k) to 40.
    We train the model on the train set '''

algo4 = KNNWithZScore(sim_options=sim_options, k=40, min_k=1)
algo4.fit(trainset)

predictions4 = algo4.test(testset)
print("RMSE for KNNBasic:", accuracy.rmse(predictions4, verbose=True))
Code Example #14
# 2. Cross-validation: data.split() was removed from Surprise; the folds are
#    handled by cross_validate below (cv=5)

# 3. Build the model object
bsl_options = {
    'method': 'als',  # solver to use; options: als and sgd
    'n_epochs': 10,  # number of iterations
    'reg_i': 20,  # regularization coefficient for items
    'reg_u': 10  # regularization coefficient for users
}
"""
k=40: 给定预测时候的邻居样本的数目
min_k=1:在产生预测值的时候,只要要求有多少个临近用户/物品
sim_options={} : 给定相似度矩阵的计算方式
"""
sim_options = {
    'name': 'pearson_baseline',  # similarity measure; options: pearson, msd, cosine, pearson_baseline
    'user_based': True  # user-based vs. item-based collaborative filtering
}
algo = KNNBaseline(k=40,
                   min_k=1,
                   sim_options=sim_options,
                   bsl_options=bsl_options)
# algo = KNNBasic(sim_options=sim_options)

# 4. Evaluate the model: root mean squared error (RMSE),
#    mean absolute error (MAE), fraction of concordant pairs (FCP)
from surprise.model_selection import cross_validate
cross_validate(algo=algo, data=data, measures=['rmse', 'mae', 'fcp'], cv=5, verbose=True)
Code Example #15
File: tavsiye.py Project: berkayytu/Ara-Proje
    def getRecommendations(self,
                           IDUser,
                           method=9,
                           similarityMeasure=1,
                           isUserBased="Yes"):
        conn = sqlite3.connect(DATABASE_NAME)
        df = pd.read_sql_query(
            "SELECT userID, glassID, relativeRating FROM ratings", conn)

        reader = Reader(rating_scale=(1, 5))
        data = Dataset.load_from_df(
            df[['userID', 'glassID', 'relativeRating']], reader)

        trainset = data.build_full_trainset()

        isUserBased = True if (isUserBased == "Yes") else False
        if similarityMeasure == 1:
            similarityMeasure = "cosine"
        elif similarityMeasure == 2:
            similarityMeasure = "pearson"
        else:
            similarityMeasure = "pearson_baseline"

        sim_options = {'name': similarityMeasure, 'user_based': isUserBased}

        if method == 1:
            algo = SVD()
        elif method == 2:
            algo = SlopeOne()
        elif method == 3:
            algo = NMF()
        elif method == 4:
            algo = NormalPredictor()
        elif method == 5:
            algo = KNNBaseline(sim_options=sim_options)
        elif method == 6:
            algo = KNNBasic(sim_options=sim_options)
        elif method == 7:
            algo = KNNWithMeans(sim_options=sim_options)
        elif method == 8:
            algo = KNNWithZScore(sim_options=sim_options)
        elif method == 9:
            algo = BaselineOnly()
        else:
            algo = CoClustering()

        algo.fit(trainset)

        predictions = pd.DataFrame(columns=['glassID', 'estimatedRating'])

        totalGlass = df['glassID'].max()

        glassPivot = df.pivot_table(index='glassID',
                                    columns='userID',
                                    values='relativeRating')

        for iid in range(1, totalGlass + 1):
            isNan = True

            try:
                isNan = pd.isna(glassPivot.loc[iid, IDUser])
            except KeyError:
                # this glassID has no row in the pivot table; skip it
                continue

            if isNan:
                prediction = algo.predict(IDUser, iid, verbose=False)
                predictions = pd.concat([
                    predictions,
                    pd.DataFrame([[iid, prediction[3]]],
                                 columns=predictions.columns)
                ], ignore_index=True)

        predictions = predictions.sort_values('estimatedRating',
                                              ascending=False)
        recommendationList = predictions[
            predictions['estimatedRating'] > 3]['glassID'].head(50).tolist()

        conn.close()

        return recommendationList
Code Example #16
min_mean, optimal_k = np.inf, None  # initialize search state (sim_options and data are assumed defined above)
for k in (10, 20, 30, 40, 50, 60, 70, 80, 90, 100):
    algo = KNNWithMeans(sim_options=sim_options, k=k)
    x = cross_validate(algo, data, verbose=True)
    cur_mean = np.mean(x['test_rmse'])
    if(cur_mean < min_mean):
        min_mean = cur_mean
        optimal_k = k
    print("current optimal K={} min mean={}".format(optimal_k, min_mean))

data = Dataset.load_from_file(file_path, reader=reader)




benchmark = []
for algorithm in [SVD(), KNNBaseline(k=60,sim_options = {'name': 'cosine','user_based': True }), KNNBasic(), KNNWithMeans()]:
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = pd.concat([tmp, pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm'])])
    benchmark.append(tmp)
    
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse') 

benchmark = []
for algorithm in [SVD(),KNNBaseline(sim_options = {'name': 'pearson','user_based': True }), KNNBasic(k=30,sim_options = {'name': 'pearson','user_based': True }), KNNWithMeans(k=60,sim_options = {'name': 'pearson','user_based': True })]:
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = pd.concat([tmp, pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm'])])
    benchmark.append(tmp)
    
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')   
Code Example #17
    if not args.train:
        args.train = 'datasets/' + args.dataset + '/train.dat'

    if not args.test:
        args.test = 'datasets/' + args.dataset + '/test.dat'

    if not args.validation:
        args.validation = 'datasets/' + args.dataset + '/val.dat'

    sim_options = {
        'name': 'cosine',
        'user_based': False  # compute  similarities between items
    }

    if rec == 'ItemKNN':
        algorithm = [KNNBaseline(sim_options=sim_options)]
        name = ['ItemKNN']

    elif rec == 'SVD':
        algorithm = [SVD()]
        name = ['SVD']

    elif rec == 'NMF':
        algorithm = [NMF()]
        name = ['NMF']

    else:
        algorithm = [KNNBaseline(sim_options=sim_options), SVD(), NMF()]
        name = ['ItemKNN', 'SVD', 'NMF']

    # initialize evaluator
Code Example #18
File: KNN_Netflix.py Project: MaJunhua/522Netflix
            if int(line[1]) in uid:
                uid[int(line[1])].update({int(line[0]):int(line[2])})
            else:
                uid[int(line[1])]={int(line[0]):int(line[2])}
#        print("done!")
    return uid


file_path=os.path.expanduser('~')+"/Downloads/CSC522/toy/sample.data"
reader=Reader(line_format='item user rating timestamp',sep=',')
data=Dataset.load_from_file(file_path,reader=reader)

trainset=data.build_full_trainset()
sim_options={'name':'pearson_baseline','user_based':True}
algo=KNNBaseline(sim_options=sim_options)
algo.fit(trainset)  # train() was removed from Surprise; fit() is the current API

user_id='911'
user_inner_id=algo.trainset.to_inner_uid(user_id)
user_neighbors=algo.get_neighbors(user_inner_id,k=22)
user_neighbors=(algo.trainset.to_raw_uid(inner_id) for inner_id in user_neighbors)

print()
print('The 22 nearest neighbors of the userid %s are:'%user_id)
for userid in user_neighbors:
    print(userid)

from surprise.model_selection import cross_validate  # evaluate() was removed from Surprise
perf=cross_validate(algo,data,measures=['RMSE','MAE'],cv=60,verbose=True)
print(perf)
Code Example #19
def collaborative_filtering(raw_uid):
    # =============== Data preprocessing ===========================
    # Read everything from the database and convert it to a file
    # dir_data = '/www/wwwroot/music_recommender/page/cf_recommendation/cf_data'
    dir_data = './collaborative_filtering/cf_data'
    file_path = '{}/dataset_user_5.txt'.format(dir_data)
    if not os.path.exists(dir_data):
        os.makedirs(dir_data)

    # Database operations
    # Open the database connection
    db = pymysql.connect("localhost",
                         "music_system",
                         "music_system",
                         "music_recommender",
                         charset='utf8')

    # Create a cursor object using the cursor() method
    cursor = db.cursor()

    sql = """SELECT uid, song_id, rating
              FROM user_rating
               WHERE 1"""
    cursor.execute(sql)
    results = cursor.fetchall()
    with open(file_path, "w+") as data_f:
        for result in results:
            uid, song_id, rating = result

            data_f.writelines("{}\t{}\t{}\n".format(uid, song_id, rating))

    if not os.path.exists(file_path):
        raise IOError("Dataset file is not exists!")

    # ===========  CF recommendation ==================
    # Load the data
    reader = Reader(line_format='user item rating', sep='\t')
    data = Dataset.load_from_file(file_path, reader=reader)

    # Build the training set from all the data
    trainset = data.build_full_trainset()

    # ================= BaselineOnly  ==================
    # start = time.clock()

    bsl_options = {
        'method': 'sgd',
        'learning_rate': 0.0005,
    }
    algo_BaselineOnly = BaselineOnly(bsl_options=bsl_options)
    algo_BaselineOnly.fit(trainset)

    # Get the recommendation results
    rset = user_build_anti_testset(trainset, raw_uid)
    predictions = algo_BaselineOnly.test(rset)
    top_n_baselineonly = get_top_n(predictions, n=5)

    # end = time.clock()
    # print("user-50NN --- BaselineOnly elapsed: %.2fs\n" % (end-start))
    # print("BaselineOnly recommendations: {}\n".format(top_n_baselineonly))

    # ================= KNNBasic  ==================
    sim_options = {'name': 'pearson', 'user_based': True}
    algo_KNNBasic = KNNBasic(sim_options=sim_options)
    algo_KNNBasic.fit(trainset)

    # Get recommendations --- considering only the knn users
    # start = time.clock()
    predictor = PredictionSet(algo_KNNBasic, trainset, raw_uid)
    knn_anti_set = predictor.user_build_anti_testset()
    predictions = algo_KNNBasic.test(knn_anti_set)
    top_n_knnbasic = get_top_n(predictions, n=5)

    # end = time.clock()
    # print("user-50NN --- KNNBasic elapsed: %.2fs\n" % (end-start))
    # print("KNNBasic recommendations: {}\n".format(top_n_knnbasic))

    # ================= KNNBaseline  ==================
    sim_options = {'name': 'pearson_baseline', 'user_based': True}
    algo_KNNBaseline = KNNBaseline(sim_options=sim_options)
    algo_KNNBaseline.fit(trainset)

    # Get recommendations --- considering only the knn users
    # start = time.clock()
    predictor = PredictionSet(algo_KNNBaseline, trainset, raw_uid)
    knn_anti_set = predictor.user_build_anti_testset()
    predictions = algo_KNNBaseline.test(knn_anti_set)
    top_n_knnbaseline = get_top_n(predictions, n=5)

    # end = time.clock()
    # print("user-50NN --- KNNBaseline elapsed: %.2fs\n" % (end-start))
    # print("KNNBaseline recommendations: {}\n".format(top_n_knnbaseline))

    # =============== Combine the recommendation results by vote ==================
    recommendset = set()
    for results in [top_n_baselineonly, top_n_knnbasic, top_n_knnbaseline]:
        for key in results.keys():
            for recommendations in results[key]:
                iid, rating = recommendations
                recommendset.add(iid)

    items_baselineonly = set()
    for key in top_n_baselineonly.keys():
        for recommendations in top_n_baselineonly[key]:
            iid, rating = recommendations
            items_baselineonly.add(iid)

    items_knnbasic = set()
    for key in top_n_knnbasic.keys():
        for recommendations in top_n_knnbasic[key]:
            iid, rating = recommendations
            items_knnbasic.add(iid)

    items_knnbaseline = set()
    for key in top_n_knnbaseline.keys():
        for recommendations in top_n_knnbaseline[key]:
            iid, rating = recommendations
            items_knnbaseline.add(iid)

    rank = dict()
    for recommendation in recommendset:
        if recommendation not in rank:
            rank[recommendation] = 0
        if recommendation in items_baselineonly:
            rank[recommendation] += 1
        if recommendation in items_knnbasic:
            rank[recommendation] += 1
        if recommendation in items_knnbaseline:
            rank[recommendation] += 1

    max_rank_iid = max(rank, key=lambda s: rank[s])
    if rank[max_rank_iid] == 1:
        # print(items_baselineonly)
        return items_baselineonly
    else:
        result = nlargest(5, rank, key=lambda s: rank[s])
        # print(result)
        return result
Code Example #20
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# File  : KNN_baseline_movie.py
# Author: WangYu
# Date  : 2020-04-12

from surprise import Dataset, Reader
from surprise import KNNBaseline
from surprise.model_selection import KFold
from surprise import accuracy

# Load the data
reader = Reader(line_format='user item rating timestamp',
                sep=',',
                skip_lines=1)
data = Dataset.load_from_file('./ratings.csv', reader=reader)

KF = KFold(n_splits=3)
algo = KNNBaseline(k=50, sim_options={'user_based': False}, verbose=True)

for train, test in KF.split(data):
    algo.fit(train)
    predictions = algo.test(test)
    accuracy.rmse(predictions, verbose=True)
    accuracy.mae(predictions, verbose=True)
Code Example #21
from surprise import KNNBaseline
from surprise.model_selection import train_test_split, LeaveOneOut

class EvaluationData:
    def __init__(self, data, popularityRankings, doTopN: bool):

        self.rankings = popularityRankings
        self.data = data
        # Build a full training set for evaluating overall properties
        self.fullTrainSet = data.build_full_trainset()
        # And build an anti-test-set for building predictions
        self.fullAntiTestSet = self.fullTrainSet.build_anti_testset()

        #Build a 75/25 train/test split for measuring accuracy
        self.trainSet, self.testSet = train_test_split(data,
                                                       test_size=.25,
                                                       random_state=1)

        if doTopN:
            # Build a "leave one out" train/test split for evaluating top-N recommenders
            LOOCV = LeaveOneOut(n_splits=1, random_state=1)
            for train, test in LOOCV.split(data):
                self.LOOCVTrain = train
                self.LOOCVTest = test

            self.LOOCVAntiTestSet = self.LOOCVTrain.build_anti_testset()

            # Compute similarity matrix between items so we can measure diversity
            sim_options = {'name': 'cosine', 'user_based': False}
            self.simsAlgo = KNNBaseline(sim_options=sim_options)
            self.simsAlgo.fit(self.fullTrainSet)

    def GetFullTrainSet(self):
        return self.fullTrainSet

    def GetFullAntiTestSet(self):
        return self.fullAntiTestSet

    def GetAntiTestSetForUser(self, testSubject):
        trainset = self.fullTrainSet
        fill = trainset.global_mean
        anti_testset = []
        u = trainset.to_inner_uid(str(testSubject))
        user_items = set([j for (j, _) in trainset.ur[u]])
        anti_testset += [(trainset.to_raw_uid(u), trainset.to_raw_iid(i), fill)
                         for i in trainset.all_items() if i not in user_items]
        return anti_testset

    def GetTrainSet(self):
        return self.trainSet

    def GetTestSet(self):
        return self.testSet

    def GetLOOCVTrainSet(self):
        return self.LOOCVTrain

    def GetLOOCVTestSet(self):
        return self.LOOCVTest

    def GetLOOCVAntiTestSet(self):
        return self.LOOCVAntiTestSet

    def GetSimilarities(self):
        return self.simsAlgo

    def GetPopularityRankings(self):
        return self.rankings
Code Example #22
class MovieRecommender:
    def __init__(self):
        self._knn = None
        self._nmf = None
        self._trainset = None
        self._predictions = None

        self.initialized = False

    def initialize(self, data_filepath):
        self._data = Dataset.load_from_file(data_filepath,
                                            reader=Reader('ml-100k'))
        self._trainset = self._data.build_full_trainset()

        sim_options = {'name': 'pearson_baseline', 'user_based': False}
        self._knn = KNNBaseline(sim_options=sim_options)
        self._nmf = NMF()

        start_new_thread(self._train, ())

    def get_similar_movies(self, movie_id, k=10):
        if not self.initialized:
            return []

        model = self._knn

        movie_inner_id = model.trainset.to_inner_iid(movie_id)
        similar_movie_inner_ids = model.get_neighbors(movie_inner_id, k=k)

        to_raw_iid = model.trainset.to_raw_iid
        similar_movie_ids = (to_raw_iid(inner_id)
                             for inner_id in similar_movie_inner_ids)

        movie_ids = [
            similar_movie_id.encode('ascii')
            for similar_movie_id in similar_movie_ids
        ]
        return movie_dataset.get_movies(movie_ids)

    def get_similar_movies_for_user(self, user_id, num_movies=10):
        if not self.initialized:
            return []

        user_id = str(user_id)
        user_predictions = [
            prediction for prediction in self._predictions
            if prediction[0] == user_id
        ]

        sorted_predictions = sorted(user_predictions,
                                    key=lambda x: x.est,
                                    reverse=True)
        top_n_predictions = sorted_predictions[:num_movies]

        similar_movie_ids = (prediction.iid
                             for prediction in top_n_predictions)

        movie_ids = [
            similar_movie_id.encode('ascii')
            for similar_movie_id in similar_movie_ids
        ]
        return movie_dataset.get_movies(movie_ids)

    def update_user_ratings(self, user_id, movie_id, rating):
        if not self.initialized:
            return

        rating = float(rating)

        has_previous_rating = False
        try:
            # trainset lookups use inner ids, so convert the raw ids first
            inner_uid = self._trainset.to_inner_uid(str(user_id))
            inner_iid = self._trainset.to_inner_iid(str(movie_id))
            has_previous_rating = inner_iid in dict(self._trainset.ur[inner_uid])
        except ValueError:
            # the user or the movie is not in the trainset yet
            pass

        user_id = str(user_id)
        movie_id = str(movie_id)
        new_rating = (user_id, movie_id, rating, time())
        if has_previous_rating:
            for i, rating in enumerate(self._data.raw_ratings):
                if rating[0] == user_id and rating[1] == movie_id:
                    self._data.raw_ratings[i] = new_rating
                    break
        else:
            self._data.raw_ratings.append(new_rating)

        self._trainset = self._data.build_full_trainset()
        self._train()

    def _train(self):
        self._nmf.fit(self._trainset)
        self._knn.fit(self._trainset)

        self._predictions = self._nmf.test(self._trainset.build_anti_testset())

        self.initialized = True
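
A hypothetical driver for MovieRecommender (the ratings path and the movie id are placeholders; movie_dataset is assumed to be provided elsewhere in the project):

# Hypothetical usage sketch for MovieRecommender.
recommender = MovieRecommender()
recommender.initialize('ml-100k/u.data')  # placeholder path
# ...once the background training thread has finished:
if recommender.initialized:
    print(recommender.get_similar_movies('1', k=5))  # '1' is a placeholder raw id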
Code Example #23
    file_name = df
    rid_to_name = {}
    name_to_rid = {}
    with io.open(file_name, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            line = line.split('|')
            rid_to_name[line[0]] = line[1]
            name_to_rid[line[1]] = line[0]

    return rid_to_name, name_to_rid


# First, train the algorithm to compute the similarities between items
trainset = df.build_full_trainset()
sim_options = {'name': 'pearson_baseline', 'user_based': False}
algo = KNNBaseline(sim_options=sim_options)
algo.fit(trainset)

# Read the mappings raw id <-> movie name
rid_to_name, name_to_rid = read_item_names()

# Retrieve inner id of the movie Toy Story
#*********************Movie Recommended for Movie**********
toy_story_raw_id = name_to_rid['Toy Story']
toy_story_inner_id = algo.trainset.to_inner_iid(toy_story_raw_id)

# Retrieve inner ids of the nearest neighbors of Toy Story.
toy_story_neighbors = algo.get_neighbors(toy_story_inner_id, k=10)

# Convert inner ids of the neighbors into names.
toy_story_neighbors = (algo.trainset.to_raw_iid(inner_id)
                       for inner_id in toy_story_neighbors)
toy_story_neighbors = (rid_to_name[rid] for rid in toy_story_neighbors)
Code Example #24
def get_top_n(predictions, n=5):
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, round(est, 3)))
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n


user_id = input('User ID: ')

data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()
sim_options = {'name': 'cosine', 'user_based': True, 'min_support': 5}
algo = KNNBaseline(k=4, sim_options=sim_options)
algo.fit(trainset)

testset = trainset.build_anti_testset()
testset = filter(lambda x: x[0] == user_id, testset)
predictions = algo.test(testset)

top_n = get_top_n(predictions)
rid_to_name = read_item_names()

print('User ' + user_id)
for movie_rid, rating in top_n[user_id]:
    print('{:4s} {:70s} {}'.format(movie_rid, str(rid_to_name[movie_rid]),
                                   rating))
Code Example #25
class KnnRecom:
    def __init__(self):
        pass

    def load_data(self):
        # Load the raw .csv data
        reader = Reader(line_format='user item rating timestamp',
                        sep=',',
                        skip_lines=1)
        data = Dataset.load_from_file(
            file_path='../' + Config().config['DATAPATH']['ratings_path'],
            reader=reader)

        # Build the training set
        self.trainset = data.build_full_trainset()
        logger.info('Dataset built successfully')

    def get_neighbors(self, movie_id_raw, n=10):
        movie_id_inner = self.trainset.to_inner_iid(movie_id_raw)
        movies_inner_id = self.algo.get_neighbors(movie_id_inner, k=n)
        movies_raw_id = [
            self.trainset.to_raw_iid(inner_id) for inner_id in movies_inner_id
        ]
        return movies_raw_id

    def fit(self):
        # Train the model
        sim_options = {'name': 'pearson_baseline', 'user_based': False}
        self.algo = KNNBaseline(sim_options=sim_options)
        self.algo.fit(trainset=self.trainset)

    def save_model(self, file_path):
        dump.dump(file_path, algo=self.algo)

    def tosql(self, n, database, tablename):
        '''
        table name: knn_predictions
        table columns: ['movieId', 'k_nearest_neighbors']
        :Args:
            n: number of nearest neighbors
            database: name of the database stored in mysql
            tablename: name of the table stored in mysql
        :return:
        '''

        data = []

        for iid in self.trainset.all_items():
            movie_rawid = self.trainset.to_raw_iid(iid)
            movies_inner_id = self.algo.get_neighbors(iid, k=n)
            data.append([
                movie_rawid, ','.join([(self.trainset.to_raw_iid(inner_id))
                                       for inner_id in movies_inner_id])
            ])

        df = pd.DataFrame(data=data)
        df.columns = ['movieId', 'k_nearest_neighbors']
        BASE_DIR = os.path.dirname(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
        knn_predictions = os.path.join(BASE_DIR, 'data/knn_predictions.csv')

        df.to_csv(knn_predictions, index=False)

        dftosql(DataFrame=df,
                database=database,
                table_name=tablename,
                if_exists='replace')

    def load_model(self, file_path):
        predictions, loaded_algo = dump.load(file_path)
        return predictions, loaded_algo
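
A hypothetical driver for KnnRecom (the raw movieId '1' is a placeholder; file paths come from the project's Config):

# Hypothetical usage sketch for KnnRecom.
rec = KnnRecom()
rec.load_data()
rec.fit()
print(rec.get_neighbors('1', n=10))  # raw movieId strings of the 10 nearest movies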
Code Example #26
    return precisions, recalls, f1scores


# load dataset
data_path = abspath("../../../resources/ml-100k/i.data")

# set rating range when loading in the dataset
reader = Reader(line_format='user item rating timestamp',
                sep='\t',
                rating_scale=(0, 1))

# load the dataset
data = Dataset.load_from_file(data_path, reader=reader)

# calculate RMSE and MAE
for algo in [KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore()]:
    # Run 5-fold cross-validation and print results.
    cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

for algo in [KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore()]:
    # Calculate precision and recall and f1score
    kf = KFold(n_splits=5)
    fold_count = 1
    for trainset, testset in kf.split(data):
        algo.fit(trainset)
        predictions = algo.test(testset)
        precisions, recalls, f1scores = precision_recall_at_k(predictions,
                                                              k=5,
                                                              threshold=4)

        # Precision and recall can then be averaged over all users
Code Example #27
    def fit(self):
        # Train the model
        sim_options = {'name': 'pearson_baseline', 'user_based': False}
        self.algo = KNNBaseline(sim_options=sim_options)
        self.algo.fit(trainset=self.trainset)
Code Example #28
File: accuracy.py Project: MANSI-MEHTA/Game
from surprise import Dataset, Reader, KNNBaseline
from surprise import accuracy
import os

reader = Reader(line_format='user item rating',
                sep=',',
                skip_lines=3,
                rating_scale=(1, 5))

custom_dataset_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                   'ratings.csv')
print("Using: " + custom_dataset_path)

data = Dataset.load_from_file(file_path=custom_dataset_path, reader=reader)
trainingSet = data.build_full_trainset()

sim_options = {
    'name': 'pearson_baseline',
    'shrinkage': 0  # no shrinkage
}

knn = KNNBaseline(sim_options=sim_options)

knn.fit(trainingSet)

testSet = trainingSet.build_testset()  # in-sample test set, so the accuracy below is optimistic

predictions = knn.test(testSet)

accuracy.rmse(predictions, verbose=True)
accuracy.mae(predictions, verbose=True)
Code Example #29
File: cf_util.py Project: 39239580/res_sys_tool-new-
    def __init__(self, module_type, baseline_type, cf_type, similar, sim_type, params):
        assert baseline_type in {"ALS", "SGD", "default"}
        assert cf_type in {None, "base_user", "base_item"}
        assert similar in {None, "COSINE", "cosine", "MSD", "msd", "PEARSON", "pearson",
                           "PEARSON_BASELINE", "pearson_baseline", "JACCARD", "jaccard",
                           "EUCLIDEAN", "euclidean"}
        assert sim_type in {None, "default"}
        self.module_type = module_type
        self.baseline_type = baseline_type
        self.cf_type = cf_type
        self.similar = similar
        self.sim_type = sim_type
        self.bu = None
        self.bi = None
        self.sim = None
        if self.baseline_type == "ALS":
            bsl_options = {'method': params["bsl_options"].get("method", 'als'),
                           'n_epochs': params["bsl_options"].get("n_epochs", 10),
                           'reg_u': params["bsl_options"].get("reg_u", 15),
                           'reg_i': params["bsl_options"].get("reg_i", 10)
                           }
        elif self.baseline_type == "SGD":
            bsl_options = {'method':  params["bsl_options"].get("method", 'sgd'),
                           'n_epochs': params["bsl_options"].get("n_epochs", 20),
                           'reg': params["bsl_options"].get("reg", 0.02),
                           'learning_rate': params["bsl_options"].get("learning_rate", 0.005)
                           }
        else:   # default values
            bsl_options = {}
        params["sim_options"] = {}

        if self.cf_type == "base_user":
            params["sim_options"]["user_based"] = True
        elif self.cf_type == "base_item":
            params["sim_options"]["item_based"] = False
        else:
            params["sim_options"]["user_based"] = True

        if self.similar == "COSINE" or self.similar == "cosine":
            params["sim_options"]["name"] = "cosine"
        elif self.similar == "MSD" or self.similar == "msd":
            params["sim_options"]["name"] = "msd"
        elif self.similar == "PEARSON" or self.similar == "pearson":
            params["sim_options"]["name"] = "pearson"
        elif self.similar == "PEARSON_BASELINE" or self.similar == "pearson_baseline":
            params["sim_options"]["name"] = "pearson_baseline"
        elif self.similar == "JACCARD" or self.similar == "jaccard":
            params["sim_options"]["name"] = "jaccard"
        elif self.similar == "EUCLIDEAN" or self.similar == "euclidean":
            params["sim_options"]["name"] = "euclidean"
        else:
            params["sim_options"]["name"] = "msd"

        if self.sim_type == "default":
            sim_options = {}
        else:
            sim_options = {"name": params["sim_options"].get("name", "MSD"),
                           "user_based": params["sim_options"].get("user_based", True),
                           "min_support": params["sim_options"].get("min_support", 5),
                           "shrinkage": params["sim_options"].get("shrinkage", 100)
                           }

            """
            'name':要使用的相似性名称,如similarities模块中所定义 。默认值为'MSD'。
            'user_based':将计算用户之间还是项目之间的相似性。这对预测算法的性能有巨大影响。默认值为True。
            'min_support':相似度不为零的最小公共项数('user_based' 为'True'时)或最小公共用户数('user_based'为 'False'时)。
            简单地说,如果 |Iuv|<min_support 然后 sim(u,v)=0。项目也是如此。
            'shrinkage':
            """
        if self.module_type == "KNNmeans":
            # 在KNNBasic算法的基础上,考虑用户均值或项目均值
            self.model = KNNWithMeans(k=params.get("k", 40),
                                      min_k=params.get("min_k", 1),
                                      sim_options=sim_options,
                                      verbose=params.get("verbose", True))
        elif self.module_type == "KNNzscore":
            # 引入Z - Score的思想
            self.model = KNNWithZScore(k=params.get("k", 40),
                                       min_k=params.get("min_k", 1),
                                       sim_options=sim_options,
                                       verbose=params.get("verbose", True))
        elif self.module_type == "KNNbase":
            # 和KNNWithMeans的区别在于,用的不是均值而是bias
            self.model = KNNBaseline(k=params.get("k", 40),
                                     min_k=params.get("min_k", 1),   # 最少的邻居个数
                                     sim_options=sim_options,
                                     bsl_options=bsl_options,
                                     verbose=params.get("verbose", True))
        elif self.module_type == "KNNbasic":
            # 最基础的KNN算法,可分为user - based KNN和item - based KNN
            self.model = KNNBasic(k=params.get("k", 40),
                                  min_k=params.get("min_k", 1),
                                  sim_options=sim_options,
                                  verbose=params.get("verbose", True))
        elif self.module_type == "SVD":
            self.model = SVD(n_factors=params.get("n_factors", 100),
                             n_epochs=params.get("n_epochs", 20),
                             init_mean=params.get("init_mean", 0),
                             init_std_dev=params.get("init_std_dev", 0.1),
                             lr_all=params.get("lr_all", 0.005),
                             reg_all=params.get("reg_all", 0.02),
                             lr_bu=params.get("lr_bu", None),
                             lr_bi=params.get("lr_bi", None),
                             lr_pu=params.get("lr_pu", None),
                             lr_qi=params.get("lr_qi", None),
                             reg_bu=params.get("reg_bu", None),
                             reg_bi=params.get("reg_bi", None),
                             reg_pu=params.get("reg_pu", None),
                             reg_qi=params.get("reg_qi", None),
                             random_state=params.get("random_state", None),
                             verbose=params.get("verbose", False)
                             )
            """
            n_factors –因素数。默认值为100。
            n_epochs – SGD过程的迭代次数。默认值为 20。
            偏见(bool)–是否使用基线(或偏见)。请参阅上面的注释。默认值为True。
            init_mean –因子向量初始化的正态分布平均值。默认值为0。
            init_std_dev –因子向量初始化的正态分布的标准偏差。默认值为0.1。
            lr_all –所有参数的学习率。默认值为0.005。
            reg_all –所有参数的正则项。默认值为 0.02。
            lr_bu –的学习率bu。lr_all如果设置优先 。默认值为None。
            lr_bi –的学习率bi。lr_all如果设置优先 。默认值为None。
            lr_pu –的学习率pu。lr_all如果设置优先 。默认值为None。
            lr_qi –的学习率qi。lr_all如果设置优先 。默认值为None。
            reg_bu –的正则化术语bu。reg_all如果设置优先。默认值为None。
            reg_bi –的正则化术语bi。reg_all如果设置优先。默认值为None。
            reg_pu –的正则化术语pu。reg_all如果设置优先。默认值为None。
            reg_qi –的正则化术语qi。reg_all如果设置优先。默认值为None。
            random_state(int,numpy中的RandomState实例或None)–确定将用于初始化的RNG。
            如果为int,random_state则将用作新RNG的种子。通过多次调用进行相同的初始化非常有用 fit()。
            如果是RandomState实例,则将该实例用作RNG。如果为None,则使用numpy中的当前RNG。默认值为 None。
            详细 –如果True,则打印当前纪元。默认值为False。
            """
        elif self.module_type == "SVDpp":
            self.model = SVDpp(n_factors=params.get("n_factors", 100),
                               n_epochs=params.get("n_epochs", 20),
                               init_mean=params.get("init_mean", 0),
                               init_std_dev=params.get("init_std_dev", 0.1),
                               lr_all=params.get("lr_all", 0.005),
                               reg_all=params.get("reg_all", 0.02),
                               lr_bu=params.get("lr_bu", None),
                               lr_bi=params.get("lr_bi", None),
                               lr_pu=params.get("lr_pu", None),
                               lr_qi=params.get("lr_qi", None),
                               reg_bu=params.get("reg_bu", None),
                               reg_bi=params.get("reg_bi", None),
                               reg_pu=params.get("reg_pu", None),
                               reg_qi=params.get("reg_qi", None),
                               random_state=params.get("random_state", None),
                               verbose=params.get("verbose", False))
            """
            n_factors –因素数。默认值为20。
            n_epochs – SGD过程的迭代次数。默认值为
            20。
            init_mean –因子向量初始化的正态分布平均值。默认值为0。
            init_std_dev –因子向量初始化的正态分布的标准偏差。默认值为0
            .1。
            lr_all –所有参数的学习率。默认值为0
            .007。
            reg_all –所有参数的正则项。默认值为
            0.02。
            lr_bu –的学习率bu。lr_all如果设置优先 。默认值为None。
            lr_bi –的学习率bi。lr_all如果设置优先 。默认值为None。
            lr_pu –的学习率pu。lr_all如果设置优先 。默认值为None。
            lr_qi –的学习率qi。lr_all如果设置优先 。默认值为None。
            lr_yj –的学习率yj。lr_all如果设置优先 。默认值为None。
            reg_bu –的正则化术语bu。reg_all如果设置优先。默认值为None。
            reg_bi –的正则化术语bi。reg_all如果设置优先。默认值为None。
            reg_pu –的正则化术语pu。reg_all如果设置优先。默认值为None。
            reg_qi –的正则化术语qi。reg_all如果设置优先。默认值为None。
            reg_yj –的正则化术语yj。reg_all如果设置优先。默认值为None。
            random_state(int,numpy中的RandomState实例或None)–确定将用于初始化的RNG。如果为int,random_state则将用作新RNG的种子。通过多次调用进行相同的初始化非常有用
            fit()。如果是RandomState实例,则将该实例用作RNG。如果为None,则使用numpy中的当前RNG。默认值为
            None。
            详细 –如果True,则打印当前纪元。默认值为False。
            """
        elif self.module_type == "NMF":
            # 非负矩阵分解,即要求p矩阵和q矩阵都是正的
            self.model = NMF(n_factors=params.get("n_factors", 100),
                             n_epochs=params.get("n_epochs", 20),
                             init_mean=params.get("init_mean", 0),
                             init_std_dev=params.get("init_std_dev", 0.1),
                             lr_all=params.get("lr_all", 0.005),
                             reg_all=params.get("reg_all", 0.02),
                             lr_bu=params.get("lr_bu", None),
                             lr_bi=params.get("lr_bi", None),
                             lr_pu=params.get("lr_pu", None),
                             lr_qi=params.get("lr_qi", None),
                             reg_bu=params.get("reg_bu", None),
                             reg_bi=params.get("reg_bi", None),
                             reg_pu=params.get("reg_pu", None),
                             reg_qi=params.get("reg_qi", None),
                             random_state=params.get("random_state", None),
                             verbose=params.get("verbose", False))

            """
            n_factors –因素数。默认值为15。
            n_epochs – SGD过程的迭代次数。默认值为 50。
            偏见(bool)–是否使用基线(或偏见)。默认值为 False。
            reg_pu –用户的正则化术语λu。默认值为 0.06。
            reg_qi –项目的正规化术语λi。默认值为 0.06。
            reg_bu –的正则化术语bu。仅与偏置版本相关。默认值为0.02。
            reg_bi –的正则化术语bi。仅与偏置版本相关。默认值为0.02。
            lr_bu –的学习率bu。仅与偏置版本相关。默认值为0.005。
            lr_bi –的学习率bi。仅与偏置版本相关。默认值为0.005。
            init_low –因子的随机初始化的下限。必须大于0以确保非负因素。默认值为 0。
            init_high –因子的随机初始化的上限。默认值为1。
            random_state(int,numpy中的RandomState实例或None)–确定将用于初始化的RNG。
            如果为int,random_state则将用作新RNG的种子。通过多次调用进行相同的初始化非常有用 fit()。
            如果是RandomState实例,则将该实例用作RNG。如果为None,则使用numpy中的当前RNG。默认值为 None。
            详细 –如果True,则打印当前纪元。默认值为False。
            """
        elif self.module_type == "SlopeOne":
            self.model = SlopeOne(**params)

        elif self.module_type == "cc":
            # Collaborative filtering based on co-clustering of users and items
            self.model = CoClustering(n_cltr_u=params.get("n_cltr_u", 3),
                                      n_cltr_i=params.get("n_cltr_i", 3),
                                      n_epochs=params.get("n_epochs", 20),
                                      random_state=params.get("random_state", None),
                                      verbose=params.get("verbose",False)
                                      )
            """
            n_cltr_u(int)–用户集群的数量。默认值为3。
            n_cltr_i(int)–项目集群的数量。默认值为3。
            n_epochs(int)–优化循环的迭代次数。默认值为 20。
            random_state(int,numpy中的RandomState实例或None)–确定将用于初始化的RNG。
            如果为int,random_state则将用作新RNG的种子。通过多次调用进行相同的初始化非常有用 fit()。
            如果是RandomState实例,则将该实例用作RNG。如果为None,则使用numpy中的当前RNG。默认值为 None。
            详细(bool)–如果为True,则将打印当前纪元。默认值为 False。
            """

        elif self.module_type == "BaselineOnly":
            # 不考虑用户的偏好
            self.model = BaselineOnly(bsl_options=bsl_options, verbose=True)

        elif self.module_type == "Np":
            # 该算法即随机预测算法,假设测试集的评分满足正态分布,然后生成正态分布的随机数进行预测,
            self.model = NormalPredictor()
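
A minimal usage sketch of the wrapper above. The enclosing class name is not shown in
this excerpt, so `SurpriseModel` below is a hypothetical stand-in; the sketch only
exercises __init__ and the fitted Surprise model it stores in self.model.

from surprise import Dataset
from surprise.model_selection import train_test_split

data = Dataset.load_builtin('ml-100k')
trainSet, testSet = train_test_split(data, test_size=.25, random_state=1)

# params must carry a "bsl_options" dict when baseline_type is "ALS" or "SGD"
params = {"bsl_options": {}, "k": 20}
wrapper = SurpriseModel(module_type="KNNbase", baseline_type="ALS", cf_type="base_item",
                        similar="pearson_baseline", sim_type=None, params=params)
wrapper.model.fit(trainSet)
predictions = wrapper.model.test(testSet)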
コード例 #30
0
ファイル: rec32.py プロジェクト: nikiforovaki/homework
        for j in result:
            if ('film' in j['description']):
                q.append((query, j['id']))
                break
    return q


k = 4
top_n = 5
user = input('Enter user ID:')
# load the built-in ml-100k dataset
data = Dataset.load_builtin('ml-100k')
# build the full training set
train = data.build_full_trainset()
# set up the algorithm used for prediction
algorithm = KNNBaseline(k, sim_options={'name': 'cosine', 'min_support': 5})
algorithm.fit(train)


# read the item file into a dictionary: raw id -> (title, release date)
def read():
    file_name = get_dataset_dir() + '/ml-100k/ml-100k/u.item'
    rid_name = {}
    with io.open(file_name, 'r', encoding='ISO-8859-1') as file:
        for line in file:
            line = line.split('|')
            rid_name[line[0]] = (line[1], line[2])
    return rid_name


# evaluation with the best parameters (test)
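
The excerpt ends with the dangling comment above, so the evaluation itself is missing.
A hedged sketch of what the missing tail might look like follows: it ranks the items the
entered user has not rated by predicted score and prints the top_n titles. The helper
top_n_for_user is illustrative, not part of the original rec32.py.

def top_n_for_user(algorithm, trainset, raw_uid, n):
    # rank the items this user has not rated by predicted rating
    inner_uid = trainset.to_inner_uid(raw_uid)
    seen = {item_inner for (item_inner, _) in trainset.ur[inner_uid]}
    candidates = [trainset.to_raw_iid(i) for i in trainset.all_items() if i not in seen]
    predictions = [algorithm.predict(raw_uid, iid) for iid in candidates]
    predictions.sort(key=lambda p: p.est, reverse=True)
    return predictions[:n]

names = read()
for prediction in top_n_for_user(algorithm, train, user, top_n):
    print(names[prediction.iid][0], round(prediction.est, 2))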
コード例 #31
0
import io

from surprise import Dataset, KNNBaseline, get_dataset_dir


def read_item_names():
    """Read the u.item file from the ml-100k dataset and return two
    mappings: raw id to movie name, and movie name to raw id.
    """

    file_name = get_dataset_dir() + '/ml-100k/ml-100k/u.item'
    rid_to_name = {}
    name_to_rid = {}
    with io.open(file_name, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            line = line.split('|')
            rid_to_name[line[0]] = line[1]
            name_to_rid[line[1]] = line[0]

    return rid_to_name, name_to_rid


# First, train the algorithm to compute the similarities between items
data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()
sim_options = {'name': 'pearson_baseline', 'user_based': False}
algo = KNNBaseline(sim_options=sim_options)
algo.fit(trainset)

# Read the mappings raw id <-> movie name
rid_to_name, name_to_rid = read_item_names()

# Retrieve inner id of the movie Toy Story
toy_story_raw_id = name_to_rid['Toy Story (1995)']
toy_story_inner_id = algo.trainset.to_inner_iid(toy_story_raw_id)

# Retrieve inner ids of the nearest neighbors of Toy Story.
toy_story_neighbors = algo.get_neighbors(toy_story_inner_id, k=10)

# Convert inner ids of the neighbors into names.
toy_story_neighbors = (algo.trainset.to_raw_iid(inner_id)
                       for inner_id in toy_story_neighbors)
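
The generator above is never consumed in this excerpt; a short closing loop (an
assumption, not shown in the original) would print the titles:

print('The 10 nearest neighbors of Toy Story are:')
for movie_name in toy_story_neighbors:
    print(movie_name)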