Example #1
def hook_user_based_recommend(df):
    df_shop = prep_shop_model(df)
    df_shop = transform_data(df_shop)
    data, train, test = prep_surprise_dataset(df_shop, 'shop_id')
    option = {'name': 'cosine'}  # cosine, msd, pearson, pearson_baseline
    algo = KNNBaseline(sim_options=option)
    algo.fit(train)

    return algo, df_shop
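A minimal usage sketch for the fitted model returned above (hedged: since `user_based` is not set in `option`, Surprise defaults to user-based similarities, and inner id 0 is just an arbitrary trainset user):

algo, df_shop = hook_user_based_recommend(df)
raw_uid = algo.trainset.to_raw_uid(0)            # pick any user in the trainset
neighbors = algo.get_neighbors(0, k=5)           # 5 nearest users to inner id 0
print(raw_uid, [algo.trainset.to_raw_uid(i) for i in neighbors])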
Example #2
def getSimModle():
    # Load the built-in MovieLens dataset by default
    data = Dataset.load_builtin('ml-100k')
    trainset = data.build_full_trainset()
    # Use pearson_baseline similarity; user_based=False computes item-item
    # similarity -- here, the similarity between movies
    sim_options = {'name': 'pearson_baseline', 'user_based': False}
    # Use the KNNBaseline algorithm
    algo = KNNBaseline(sim_options=sim_options)
    # Train the model
    algo.fit(trainset)
    return algo
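A usage sketch for the model above; get_neighbors works on inner ids, and in the built-in ml-100k dataset the raw item ids are strings (raw id '1' is Toy Story):

algo = getSimModle()
inner_iid = algo.trainset.to_inner_iid('1')      # raw MovieLens id -> inner id
neighbor_inner_ids = algo.get_neighbors(inner_iid, k=10)
print([algo.trainset.to_raw_iid(i) for i in neighbor_inner_ids])  # back to raw ids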
Example #3
def get_trained_model(dataset):
    # Use item-based MSD similarity
    sim_options = {
        "name": "msd",
        "user_based": False,  # compute similarities between items
    }
    }
    model = KNNBaseline(sim_options=sim_options)

    training_set = dataset.build_full_trainset()
    model.fit(training_set)
    return model
Example #4
class EvaluationData:
    def __init__(self, data, popularityRankings):
        self.rankings = popularityRankings
        self.fullTrainSet = data.build_full_trainset()
        self.fullAntiTestSet = self.fullTrainSet.build_anti_testset()
        self.trainSet, self.testSet = train_test_split(data,
                                                       test_size=.25,
                                                       random_state=1)

        LOOCV = LeaveOneOut(n_splits=1, random_state=1)
        for train, test in LOOCV.split(data):
            self.LOOCVTrain = train
            self.LOOCVTest = test

        self.LOOCVAntiTestSet = self.LOOCVTrain.build_anti_testset()

        sim_options = {'name': 'cosine', 'user_based': False}
        self.simsAlgo = KNNBaseline(sim_options=sim_options)
        self.simsAlgo.fit(self.fullTrainSet)

    def GetFullTrainSet(self):
        return self.fullTrainSet

    def GetFullAntiTestSet(self):
        return self.fullAntiTestSet

    def GetAntiTestsetForUser(self, testSubject):
        trainset = self.fullTrainSet
        fill = trainset.global_mean
        anti_testset = []
        u = trainset.to_inner_uid(str(testSubject))
        user_items = set([j for (j, _) in trainset.ur[u]])
        anti_testset += [(trainset.to_raw_uid(u), trainset.to_raw_iid(i), fill)
                         for i in trainset.all_items() if i not in user_items]
        return anti_testset

    def GetTrainSet(self):
        return self.trainSet

    def GetTestSet(self):
        return self.testSet

    def GetLOOCVTrainSet(self):
        return self.LOOCVTrain

    def GetLOOCVTestSet(self):
        return self.LOOCVTest

    def GetPopularityRankings(self):
        return self.rankings

    def GetSimilarities(self):
        return self.simsAlgo
Example #5
def get_similar_items(iid, n=10):
    trainset = data.build_full_trainset()
    algo = KNNBaseline(sim_options=item_based_sim_option)
    algo.fit(trainset)
    inner_id = algo.trainset.to_inner_iid(iid)
    # Use get_neighbors to find the n most similar movies
    neighbors = algo.get_neighbors(inner_id, k=n)
    neighbors_iid = (algo.trainset.to_raw_iid(x) for x in neighbors)
    recommendations = [item_dict[x] for x in neighbors_iid]
    # print('\nten movies most similar to the', item_dict[iid])
    # for i in recommendations:
    #     print(i)
    return recommendations
Example #6
def browse(uID):
    #dataset
    data = Dataset.load_builtin('ml-100k')
    trainset = data.build_full_trainset()

    #create and fit the KNN model
    algo = KNNBaseline()
    algo.fit(trainset)

    #movie lists
    #0: movie id. 1: our predicted rating. 2: title. 3: average rating
    top5_definite = get_best_recs_(uID, algo)
    top5_somewhat = get_med_recs_(uID, algo)

    #these are fixed length: range(5) because we want movies 0-4, and range(4) because each has attributes 0-3
    definite_like = [[0 for x in range(4)] for y in range(5)]
    somewhat_like = [[0 for x in range(4)] for y in range(5)]

    #filling the final 2d list of definite likes
    for i in range(len(definite_like)):
        #first value is the movie ID
        definite_like[i][0] = top5_definite[i][0]
        #second value is our predicted rating
        definite_like[i][1] = top5_definite[i][1]
        #third is the title
        definite_like[i][2] = id_to_title_(definite_like[i][0])
        #fourth is the average rating
        definite_like[i][3] = get_rating_(top5_definite[i][0])

    #same as last loop, but for the somewhat like list
    for i in range(len(somewhat_like)):
        somewhat_like[i][0] = top5_somewhat[i][0]
        somewhat_like[i][1] = top5_somewhat[i][1]
        somewhat_like[i][2] = id_to_title_(somewhat_like[i][0])
        somewhat_like[i][3] = get_rating_(top5_somewhat[i][0])

    print("\n\nDefinite like:\n")
    for item in definite_like:
        print("Movie ID: " + str(item[0]))
        print("Our rating prediction: " + str(item[1]))
        print("Title: " + str(item[2]))
        print("Average rating: " + str(item[3]))

    print("\n\nSomewhat like:\n")
    for item in somewhat_like:
        print("Movie ID: " + str(item[0]))
        print("Our rating prediction: " + str(item[1]))
        print("Title: " + str(item[2]))
        print("Average rating: " + str(item[3]))

    return definite_like, somewhat_like
Example #7
class KNNBaselineRS(AbstractRS):
    def __init__(self,
                 path,
                 sim_options={
                     'name': 'pearson_baseline',
                     'user_based': False
                 }):
        self.path = os.path.expanduser(path)
        self.algo = KNNBaseline(sim_options=sim_options)

    def train(self):
        trainset = self.data.build_full_trainset()
        self.algo.fit(trainset)
Example #8
class Recmodel(object):
    def __init__(self, algo='knn_baseline', filepath=None):
        if not os.path.exists(filepath):
            raise FileNotFoundError("{} not exist".format(filepath))
        self.filepath = filepath
        if algo == 'nmf':
            self.algo = NMF()
            self.model_name = 'nmf'
        else:
            self.algo = KNNBaseline()
            self.model_name = 'knn_baseline'

        self.convertor = DataConvertHelper()

    def buildDataSet(self):
        reader = Reader(line_format='user item rating timestamp', sep=',')
        music_data = Dataset.load_from_file(file_path=self.filepath,
                                            reader=reader)
        self.trainset = music_data.build_full_trainset()

    def train(self):
        print("begin training...")
        self.algo.fit(self.trainset)

    def evaluate(self, index):
        current_playlist_name = self.convertor.get_name_by_index(index)
        print('Current playlist: {}'.format(current_playlist_name))

        current_playlist_rid = self.convertor.get_rid_by_name(
            current_playlist_name)
        print("当前歌单rid: {}".format(current_playlist_rid))

        playlist_inner_id = self.algo.trainset.to_inner_uid(
            current_playlist_rid)
        print('Playlist inner id:', playlist_inner_id)

        playlist_neighbors_inner_ids = self.algo.get_neighbors(
            playlist_inner_id, k=10)
        playlist_neighbors_rids = (
            self.algo.trainset.to_raw_uid(inner_id)
            for inner_id in playlist_neighbors_inner_ids)
        playlist_neighbors_names = (self.convertor.get_name_by_rid(rid)
                                    for rid in playlist_neighbors_rids)

        print('The 10 playlists most similar to "', current_playlist_name, '" are:')
        for playlist_name in playlist_neighbors_names:
            print(
                playlist_name,
                self.algo.trainset.to_inner_uid(
                    self.convertor.get_rid_by_name(playlist_name)))
Example #9
def knn_baseline_movie(train, test, ids, Xtest, Xids):
    """
    Nearest-neighbour approach using the movie (item) baseline.
    Arguments: train, the trainset
               test, the testset
               ids, unknown ratings
               Xtest, predicted ratings for the testset, to be used for final blending
               Xids, predicted ratings for unknown ratings, to be used for final blending
    """

    print('kNN Baseline Movie')
    bsl_option = {'method': 'als', 'n_epochs': 100, 'reg_u': 15, 'reg_i': 0.01}

    sim_option = {
        'name': 'pearson_baseline',
        'min_support': 1,
        'user_based': False
    }

    algo = KNNBaseline(k=100,
                       bsl_options=bsl_option,
                       sim_options=sim_option,
                       verbose=False)

    #Train algorithm on training set
    algo.fit(train)

    #Predict on train and compute RMSE
    predictions = algo.test(train.build_testset())
    print('   Training RMSE: ', accuracy.rmse(predictions, verbose=False))

    #Predict on test and compute RMSE
    predictions = algo.test(test)
    rmse = accuracy.rmse(predictions, verbose=False)
    print('   Test RMSE: ', rmse)

    preds_test = np.zeros(len(predictions))
    for j, pred in enumerate(predictions):
        preds_test[j] = pred.est

    #Predict unknown ratings
    preds_ids = []
    for i in range(len(ids[0])):
        pred = algo.predict(str(ids[0][i]), str(ids[1][i]))
        preds_ids.append(pred.est)

    Xtest.append(preds_test)
    Xids.append(preds_ids)
    return rmse, Xtest, Xids, preds_test, preds_ids
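A sketch of how the accumulated feature lists might feed the final blending step mentioned in the docstring (the Ridge blender and the ground-truth vector y_test are assumptions, not part of this example):

import numpy as np
from sklearn.linear_model import Ridge

rmse, Xtest, Xids, _, _ = knn_baseline_movie(train, test, ids, Xtest, Xids)

features = np.array(Xtest).T        # one column per model's testset predictions
blender = Ridge(alpha=0.1).fit(features, y_test)
final_preds = blender.predict(np.array(Xids).T)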
Example #10
def knn():
    df = pd.read_sql("select userId,movieId,rating from rating", db.engine)
    print('finished load data')
    reader = Reader(rating_scale=(1, 5), line_format='user item rating')
    print(df.head())
    data = Dataset.load_from_df(df, reader)
    trainset = data.build_full_trainset()
    sim_options = {'name': 'pearson_baseline', 'user_based': True}

    algo = KNNBaseline(sim_options=sim_options)
    print('start fit')
    algo.fit(trainset)
    dump.dump(f'{RECOMMEND_MODEL_SAVED_PATH}/knn', algo=algo)
    print('saved to knn')
    cross_validate(algo, data, measures=['RMSE', 'MAE'], verbose=True)
Example #11
def build_recommender(data, model_meta):
    """ This function takes order, item, and rating data returns a
        KNN recommendation engine.

    Args:
        data: dataframe with columns order_id, product_id, and rating
        model_meta: original model_meta dict
        """

    from surprise import Dataset, Reader  # import libraries for EB
    logging.info('Setting up random state')
    random.seed(model_meta['train_recommender']['random_state'])

    np.random.seed(model_meta['train_recommender']['random_state'])
    logging.info('Setting up Surprise data reader')
    reader = Reader(rating_scale=(min(data.rating), max(data.rating)))

    logging.info('Calling load_from_df')
    data = Dataset.load_from_df(data, reader)  # reads and sets up data

    logging.info('Setting up recommender')
    k = model_meta['train_recommender']['neighbors']
    sim_options = model_meta['train_recommender']['sim_options']
    knn = KNNBaseline(k, sim_options=sim_options)  # collaborative filtering

    logging.info('Calling build_full_trainset')
    data = data.build_full_trainset()  # uses whole dataset to build model

    logging.info('Fit recommender')
    fit = knn.fit(data)  # fits model to data

    logging.info('Return recommender')
    return fit
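build_recommender reads exactly three keys under 'train_recommender'; a hedged sketch of the expected model_meta shape (the values are illustrative, not from the source):

model_meta = {
    'train_recommender': {
        'random_state': 42,     # seeds both random and numpy
        'neighbors': 40,        # k passed to KNNBaseline
        'sim_options': {'name': 'pearson_baseline', 'user_based': False},
    }
}
# recommender = build_recommender(ratings_df, model_meta)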
Example #12
class EvaluationData:
    def __init__(self,data,withSim=False):
        self.trainSet, self.testSet = train_test_split(data, test_size=0.25, random_state=0)

        LOOX = LeaveOneOut(n_splits=1, random_state=1)
        for xtrain, xtest in LOOX.split(data):
            self.LOOX_trainSet = xtrain
            self.LOOX_testSet = xtest
            del xtrain, xtest
        self.LOOX_antitestSet = self.LOOX_trainSet.build_anti_testset()

        self.full_trainSet = data.build_full_trainset()
        self.full_antitestSet = self.full_trainSet.build_anti_testset()
        if withSim:
            sim_options = {'name': 'cosine', 'user_based': False}
            self.simAlgo = KNNBaseline(sim_options=sim_options)
            self.simAlgo.fit(self.full_trainSet)
Example #13
def get_nearest_neighbors(user_id):
    #path to dataset file
    file_path = os.path.expanduser('outward.csv')

    # define a reader object for our dataset
    reader = Reader(sep=',')

    #load data from dataset
    data = Dataset.load_from_file(file_path, reader=reader)

    #Train algorithm on dataset
    trainset = data.build_full_trainset()
    sim_options = {'name': 'pearson_baseline', 'user_based': True}
    algo = KNNBaseline(sim_options=sim_options)
    algo.fit(trainset)

    #Retrieve inner id of the user in question
    user_inner_id = algo.trainset.to_inner_uid(str(user_id))

    #Retrieve inner ids of the nearest neighbors of user
    user_neighbors = algo.get_neighbors(user_inner_id, k=10)

    #Convert inner ids of the neighbors into raw user ids
    user_neighbors = (algo.trainset.to_raw_uid(inner_id)
                      for inner_id in user_neighbors)

    neighbors_lst = []
    print()
    print(f'The 10 nearest neighbors of {user_id} are:')
    for user in user_neighbors:
        print(user)
        neighbors_lst.append(user)

    return neighbors_lst

# pred = algo.predict(uid, bid, verbose=True)
# pred is <class 'surprise.prediction_algorithms.predictions.Prediction'>
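A minimal sketch of the hinted call, assuming a fitted algo is in scope (the '196'/'302' raw ids are illustrative):

pred = algo.predict('196', '302', verbose=True)
# Prediction is a namedtuple with fields uid, iid, r_ui, est and details
print(pred.est)                          # the estimated rating
print(pred.details['was_impossible'])    # True when no estimate could be made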
Example #14
def init_collaborative_filtering():

    # Step 1: Read data from database
    ratings = pps.get_all_ratings_as_df()
    ratings[RATING] = None
    ratings.loc[ratings[LIKED] == True, RATING] = 1
    ratings.loc[ratings[LIKED] == False, RATING] = 0

    # Step 2: Transform to training set
    reader = Reader(rating_scale=(0.0, 1.0))
    data = Dataset.load_from_df(ratings[[USER_ID, POI_ID, RATING]], reader)
    trainset = data.build_full_trainset()

    # Step 3: Apply training of collaborative filtering (CF) algorithm
    algo = KNNBaseline(k=50, sim_options={'name': 'pearson_baseline', 'user_based': True})
    algo.fit(trainset)

    return algo, ratings
Example #15
class EvaluationData:
    def __init__(self, data, popularityRankings):

        self.rankings = popularityRankings

        self.fullTrainSet = data.build_full_trainset()
        self.fullAntiTestSet = self.fullTrainSet.build_anti_testset()

        self.trainSet, self.testSet = train_test_split(data,
                                                       test_size=.25,
                                                       random_state=1)

        #Compute similarity matrix between items so we can measure diversity
        sim_options = {'name': 'cosine', 'user_based': False}
        self.simsAlgo = KNNBaseline(sim_options=sim_options)
        self.simsAlgo.fit(self.fullTrainSet)

    def GetFullTrainSet(self):
        return self.fullTrainSet

    def GetFullAntiTestSet(self):
        return self.fullAntiTestSet

    def GetAntiTestSetForUser(self, userId):
        trainset = self.fullTrainSet
        fill = trainset.global_mean
        anti_testset = []
        u = trainset.to_inner_uid(str(userId))
        user_items = set([j for (j, _) in trainset.ur[u]])
        anti_testset += [(trainset.to_raw_uid(u), trainset.to_raw_iid(i), fill)
                         for i in trainset.all_items() if i not in user_items]
        return anti_testset

    def GetTrainSet(self):
        return self.trainSet

    def GetTestSet(self):
        return self.testSet

    def GetSimilarities(self):
        return self.simsAlgo

    def GetPopularityRankings(self):
        return self.rankings
Example #16
def recommend_friends(request):
    queryset = Rate.objects.all()
    query, params = queryset.query.as_sql(
        compiler='django.db.backends.sqlite3.compiler.SQLCompiler',
        connection=connections['default'])
    df = pd.read_sql_query(query, con=connections['default'], params=params)
    print("load data")
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df[['user_id', 'item_id', 'rate']], reader)
    trainset = data.build_full_trainset()
    sim_options = {'name': 'pearson_baseline'}
    algo = KNNBaseline(sim_options=sim_options)
    algo.fit(trainset)
    for given_user_id in set(df['user_id']):
        print(given_user_id)
        given_user_id = int(given_user_id)
        _from = get_object_or_404(Profile, profile_id=given_user_id)
        inner_id = algo.trainset.to_inner_uid(given_user_id)
        #    to_inner_uid(), to_inner_iid(), to_raw_uid(), and to_raw_iid()
        neighbors = algo.get_neighbors(inner_id, k=5)
        results = [
            algo.trainset.to_raw_uid(inner_user_id)
            for inner_user_id in neighbors
        ]
        print('The 5 nearest neighbors of Given User Id:')

        for raw_user_id in results:
            _to = get_object_or_404(Profile, user_id=int(raw_user_id))
            # print(raw_user_id,Candidates2.objects.filter(user_from=user_from,user_to=user_to))
            if Candidates2.objects.filter(user_from=_from):
                if Candidates2.objects.filter(user_from=_from, user_to=_to):
                    print("user from , to 다 일치")
                    pass
                else:
                    cand = Candidates2.objects.get(user_from=_from)
                    cand.user_to.add(_to)
                    print("user from만 일치, to 추가")
            else:
                cand = Candidates2.objects.create()
                cand.user_from.add(_from)
                cand.user_to.add(_to)
            print("해당 유저 %s 에 대한 데이터 저장완료" % given_user_id)
    return render(request, "recommend_completed.html")
Example #17
def predict(rating_dic):

    df_clean = pd.read_csv("dataset_clean.csv")
    #######################
    # Fit surprise model
    #######################

    final_model = KNNBaseline(k=60, min_k=2, sim_options={'name': 'pearson_baseline', 'user_based': True})

    new_user_id = max(df_clean["userID"]) + 1
    ratings = np.array(list(rating_dic.values()))
    rated_mask = ratings != None
    ratings = ratings[rated_mask]
    items = np.array(list(rating_dic.keys()))[rated_mask]
    user = np.ones(len(items), dtype="int") * new_user_id
    new_user_df = pd.DataFrame({"userID": user, "itemID": items, "rating": ratings})

    total_df = pd.concat([df_clean, new_user_df])

    # A reader is still needed but only the rating_scale param is required.
    reader = Reader(rating_scale=(0, 10))

    # The columns must correspond to user id, item id and ratings (in that order).
    new_trainset = Dataset.load_from_df(total_df, reader).build_full_trainset()

    ## Fit the best model

    final_model.fit(new_trainset)

    predicted_ratings = []
    for nootropic in nootropics_list:
        predicted_ratings.append(final_model.predict(new_user_id, nootropic).est)

    item_baselines = final_model.default_prediction() + final_model.compute_baselines()[
        1]  # mean rating + item baseline ?

    result_df = pd.DataFrame(
        {"nootropic": nootropics_list, "predicted_rating": predicted_ratings, "baseline_rating": item_baselines})

    nootropics_without_ratings = [nootropic for nootropic in nootropics_list if (nootropic not in rating_dic.keys())]
    new_result_df = result_df[result_df["nootropic"].isin(nootropics_without_ratings)]
    return new_result_df.sort_values("predicted_rating", ascending=False, ignore_index=True)
Example #18
def eval(user_id):

    # Step 1: Define variables
    ratings = pps.get_all_ratings_as_df() # read ratings from database
    ratings[RATING] = None
    ratings.loc[ratings[LIKED] == True, RATING] = 1
    ratings.loc[ratings[LIKED] == False, RATING] = 0

    reader = Reader(rating_scale=(0.0, 1.0))

    all_items = ratings.poi_id.unique() # find all items
    user_rmse = pd.DataFrame(columns=['est', 'true']) # define resulting dataframe for storing the probabilities

    # Step 2: Iterating over all items and leave out the current iteration's item (x) for training
    for x in np.nditer(all_items):

        # Step 2a: Define test dataset -> rating of the current user and the current (left-out) item
        testset = ratings[(ratings.user_id == user_id)]
        testset = testset[(testset.poi_id == x)]

        # Step 2b: If the user has given no rating for this item, the prediction cannot be compared to a true value => skip
        if testset.rating.size == 0:
            continue

        # Step 2c: Define train dataset -> leave out the current item x
        trainset = ratings[~ratings.isin(testset).all(1)]
        trainset = Dataset.load_from_df(trainset[[USER_ID, POI_ID, RATING]], reader)
        trainset = trainset.build_full_trainset()

        # Step 2d: Apply the algorithm by training and predicting the left-out item x
        algo = KNNBaseline(k=50, sim_options={'name': 'pearson_baseline', 'user_based': True})
        algo.fit(trainset)

        pred = algo.predict(user_id, x.item(), r_ui=4, verbose=False) # execute the calculation

        # Step 2e: Store estimate and true value into output dataframe
        user_rmse.loc[len(user_rmse)] = [pred.est, testset.rating.item()]

    # Step 3: Calculate the mean squared error over all leave-one-out estimations
    confidence = np.mean((user_rmse.est - user_rmse.true)**2)

    return confidence
Example #19
	def get(self, item_id):
		# SQL query
		conn = mysql.connect()
		cursor = conn.cursor()
		# STEP 1 : KNN WITH MSD
		df = pd.read_sql_query("SELECT * FROM story_reviews", conn)

		# Data and Model
		reader = Reader(rating_scale=(1, 5))
		data = Dataset.load_from_df(df[['user_id', 'story_id', 'star']], reader)
		sim_options = {'name': 'pearson_baseline', 'user_based': False}

		model = KNNBaseline(sim_options=sim_options)
		
		# Training
		training_set = data.build_full_trainset()
		model.fit(training_set)

		item_inner_id = model.trainset.to_inner_iid(item_id)
		item_neighbors_inner = model.get_neighbors(item_inner_id, k=10)
		item_neighbors = [model.trainset.to_raw_iid(inner_id) for inner_id in item_neighbors_inner]

		# STEP 2 : CASCADE IT WITH TF-IDF
		df_stories = pd.read_sql_query("SELECT * FROM stories", conn)

		# Model
		tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')
		tf_idf_matrix = tf.fit_transform(df_stories['title'])
		cosine_similarities = cosine_similarity(tf_idf_matrix, tf_idf_matrix)

		# Retrieve similar items
		cosine_similarities_row = cosine_similarities[item_id-1]
		recommendations_list = []
		n = 10
		for i in range(n):
			recommendations_list.append((item_neighbors[i], cosine_similarities_row[item_neighbors[i]-1]))
				
		recommendations_list.sort(key=lambda x:x[1], reverse=True)
		formatted_recommendations_list = [item[0] for item in recommendations_list]

		# Return K Nearest Neighbors
		return jsonify(recommendations = formatted_recommendations_list)
Example #20
def compute_user_neighbors(id_name_dic, name_id_dic, trainset):
    algo = KNNBaseline()
    algo.fit(trainset)

    user_name = list(name_id_dic.keys())[1]
    print("user_name: ", user_name)
    user_id = name_id_dic[user_name]
    print("user_id: ", user_id)
    user_inner_id = algo.trainset.to_inner_uid(user_id)
    print("内部id: ", user_inner_id)

    user_neighbors = algo.get_neighbors(user_inner_id, k=10)

    user_neighbors = (algo.trainset.to_raw_uid(inner_id)
                      for inner_id in user_neighbors)
    user_neighbors = (id_name_dic[user_id] for user_id in user_neighbors)
    print()
    print("和user 《", user_name, "》 最接近的10个user为:")
    for user in user_neighbors:
        print(algo.trainset.to_inner_uid(name_id_dic[user]), user)
Example #21
def KNN(data, kwargs):
    # Set algorithm
    k_neighbor    = kwargs.get('n_neigbor')
    min_neighbor  = kwargs.get('min_neigbor')
    similarity    = kwargs.get('similarity')

    options = {'name': similarity}
    algo = KNNBaseline(k = k_neighbor,
                       min_k = min_neighbor,
                       sim_options = options)
    
    # Train the algorithm on the data, and predict ratings for the testset
    algo.fit(data)
    
    prediction = np.zeros([10000,1000])
    for row in range(10000):
        for col in range(1000):
            prediction[row,col] = algo.predict(str(row+1),str(col+1)).est
            
    return prediction
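The nested loop above issues ten million single predict calls; a sketch of an equivalent batched version using algo.test over the same 10000 x 1000 grid (same kwargs keys assumed):

import numpy as np

def KNN_bulk(data, kwargs):
    algo = KNNBaseline(k=kwargs.get('n_neigbor'),
                       min_k=kwargs.get('min_neigbor'),
                       sim_options={'name': kwargs.get('similarity')})
    algo.fit(data)

    # one (user, item, dummy rating) triple per grid cell
    testset = [(str(r + 1), str(c + 1), 0)
               for r in range(10000) for c in range(1000)]

    prediction = np.zeros([10000, 1000])
    for p in algo.test(testset):
        prediction[int(p.uid) - 1, int(p.iid) - 1] = p.est
    return prediction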
Example #22
    def knn_centered(self):
        print("calculating knn centered... File Rating: " + self.file_path)
        print("calculating knn centered... Item to Evaluate: " +
              self.item_evaluated)
        print("calculating knn centered... Number of recommendations: " +
              str(self.number_recommendations))

        # Reader
        reader = Reader(line_format='item user rating',
                        sep=self.delimiter,
                        skip_lines=1)

        # Dataset
        data = Dataset.load_from_file(self.file_path, reader=reader)

        trainset = data.build_full_trainset()
        sim_options = {'name': 'pearson_baseline', 'user_based': False}
        algo = KNNBaseline(sim_options=sim_options)
        algo.fit(trainset)

        item_inner_id = algo.trainset.to_inner_iid(self.item_evaluated)

        item_neighbors = algo.get_neighbors(item_inner_id,
                                            k=int(self.number_recommendations))

        item_neighbors = (algo.trainset.to_raw_iid(inner_id)
                          for inner_id in item_neighbors)

        dictionary_neighbors = {}
        print(
            "\nTransition Component Based Ratings >> Recommended items by KNN Centered:"
        )
        i = 0
        for item in item_neighbors:
            i += 1
            dictionary_neighbors[i] = item
            print("- " + item)

        return dictionary_neighbors
Example #23
def compute_movie_neighbors(id_name_dic, name_id_dic, trainset):
    sim_options = {'user_based': False}
    algo = KNNBaseline(sim_options=sim_options)
    algo.fit(trainset)

    #movie_name = name_id_dic.keys()[1]
    movie_name = "古墓迷途2"
    print("movie_name: ", movie_name)
    movie_id = name_id_dic[movie_name]
    print("movie_id: ", movie_id)
    movie_inner_id = algo.trainset.to_inner_iid(movie_id)
    print("内部id: ", movie_inner_id)

    movie_neighbors = algo.get_neighbors(movie_inner_id, k=10)

    movie_neighbors = (algo.trainset.to_raw_iid(inner_id)
                       for inner_id in movie_neighbors)
    movie_neighbors = (id_name_dic[movie_id] for movie_id in movie_neighbors)
    print()
    print("和movie 《", movie_name, "》 最接近的10个movie为:")
    for movie in movie_neighbors:
        print(algo.trainset.to_inner_iid(name_id_dic[movie]), movie)
Example #24
def get_top_n_for_user(target_user_id, recom_alg, recom_size):
    
    file_path = os.path.expanduser('static/CRdata.csv')
    reader = Reader(line_format='user item rating timestamp', sep=',', rating_scale=(0,100))
    data = Dataset.load_from_file(file_path,reader=reader)
    trainset = data.build_full_trainset()
    testset = trainset.build_anti_testset()
    
    
    if(recom_alg == 'KNNBaseline'):
    
        similarity = {'name': 'cosine',
            'user_based': True  # compute similarities between users
            }
        algo = KNNBaseline(sim_options=similarity)
    elif(recom_alg == 'CoClustering'):
        algo = CoClustering()
        
    else:
        algo = SVD()
        

    algo.fit(trainset)
    predictions  = algo.test(testset)

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:recom_size]

    return top_n[str(target_user_id)]
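A hypothetical call: the top five KNNBaseline recommendations for user 42.

recommendations = get_top_n_for_user(42, 'KNNBaseline', 5)
for iid, est in recommendations:
    print(iid, est)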
Example #25
def main():

	# Loads dataset
	rating_data_set = load_dataset(TRAINING_SET_PATH)

	# Clean data
	rating_data_set = remove_missing_values(rating_data_set)

	# Slice data
	drop_movie_list, rating_data_set = slice_data(rating_data_set)

	# Loads movie file
	movies = load_movies_file(drop_movie_list, MOVIES_FILE_PATH)


	reader = Reader()

	sim_options = {'name': 'cosine', 'min_support': 2, 'shrinkage': 100, 'user_based': True}  # note: shrinkage only affects pearson_baseline
	bsl_options = {'method': 'sgd'}
	data = Dataset.load_from_df(rating_data_set[['CustomerID', 'MovieID', 'Rating']][:1000], reader)

	kf = KFold(n_splits=5)
	#algo = SVD()
	algo = KNNBaseline(k=N, sim_options=sim_options, bsl_options=bsl_options)

	i = 0

	for trainset, testset in kf.split(data):
		print("Running fold: ", i)
		algo.fit(trainset)
		predictions = algo.test(testset)
		precisions, recalls = precision_recall(predictions, 20)

		# Precision and recall can then be averaged over all users
		print(sum(prec for prec in precisions.values()) / len(precisions))
		print(sum(rec for rec in recalls.values()) / len(recalls))

		i += 1
Example #26
def train(trainset: Trainset):
    """
    Train SVD model based on options using utility matrix,
    then dump prediction and algorithm for future usage.
    """
    global loaded_svd_algo, loaded_knn_algo

    svd_options = {
        'n_factors': 82,
        'n_epochs': 33,
        'lr_all': 0.115,
        'reg_all': 0.02
    }

    knn_options = {
        'sim_options': {
            'name': 'pearson',
            'min_support': 4,
            'user_based': False
        },
        'k': 35,
        'min_k': 1
    }

    # setup the algorithm
    svd_algo = SVD(**svd_options)
    knn_algo = KNNBaseline(**knn_options)

    # train and dump
    svd_algo.fit(trainset)
    loaded_svd_algo = svd_algo
    dump.dump(base_dir.joinpath('svd.dump'), algo=svd_algo)

    knn_algo.fit(trainset)
    loaded_knn_algo = knn_algo
    dump.dump(base_dir.joinpath('knn.dump'), algo=knn_algo)
    print('Training and dumping completed')
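Reloading the dumped models later; a sketch, noting that dump.load returns a (predictions, algo) tuple and the predictions slot is None here because only algo= was dumped:

from surprise import dump

_, svd_algo = dump.load(base_dir.joinpath('svd.dump'))
_, knn_algo = dump.load(base_dir.joinpath('knn.dump'))
pred = knn_algo.predict('196', '302')   # hypothetical raw ids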
Example #27
def trainModel():
    userID = []
    itemID = []
    rating = []
    # the DSN value should be the name of the entry in odbc.ini, not freetds.conf
    # change the UID and PWD to your own
    conn = pyodbc.connect('DSN=MYMSSQL;UID=SA;PWD=Easton888')
    crsr = conn.cursor()
    with crsr:
        crsr.execute("use DataMiningFull")
        rows = crsr.execute("select users_ind, movies_ind, rating from users_movies \
            where users_ind < 5000").fetchall()

    crsr.close()
    conn.close()

    # build a pandas DataFrame
    for i in rows:
        userID.append(i[0])    
        itemID.append(i[1])
        rating.append(i[2])

    rating_dict = {
        'userID': userID,
        'itemID': itemID,
        'rating': rating
    }
    df = pd.DataFrame(rating_dict)

    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)
    trainset = data.build_full_trainset()
    sim_options = {'name': 'pearson_baseline', 'user_based': False}
    algo = KNNBaseline(sim_options=sim_options)
    # train
    algo.fit(trainset)
    return algo
Example #28
def get_item_baseline():
    df_clean = pd.read_csv("dataset_clean.csv")
    #######################
    # Fit surprise model
    #######################

    final_model = KNNBaseline(k=60, min_k=2, sim_options={'name': 'pearson_baseline', 'user_based': True})

    total_df = df_clean

    # A reader is still needed but only the rating_scale param is required.
    reader = Reader(rating_scale=(0, 10))

    # The columns must correspond to user id, item id and ratings (in that order).
    new_trainset = Dataset.load_from_df(total_df, reader).build_full_trainset()

    ## Fit the best model

    final_model.fit(new_trainset)

    item_baselines = final_model.default_prediction() + final_model.compute_baselines()[
        1]  # mean rating + item baseline ?

    return pd.DataFrame({"nootropic": nootropics_list, "item_baselines":item_baselines})
Example #29
def train_baseon_playlist():
    # Data preprocessing
    data_preprocess()
    path = "./data/"
    file_path = os.path.expanduser(path + "popular_music_suprise_format.txt")
    # Specify the file format
    reader = Reader(line_format='user item rating timestamp', sep=',')
    # Load data from the file
    music_data = Dataset.load_from_file(file_path, reader=reader)

    # Compute playlist-to-playlist similarity
    print("Building dataset...")
    trainset = music_data.build_full_trainset()  # train on all of the data, no cross-validation

    print("Start training the model...")
    sim_options = {'user_based': True}  # playlist-based collaborative filtering
    algo = KNNBaseline(sim_options=sim_options)

    algo.fit(trainset)
    surprise.dump.dump(path + 'KNNBaseline_Playlist_Recommand.model',
                       algo=algo)
    # Ensure data consistency:
    # rebuild the playlist id -> playlist name mapping dict
    f1 = open(path + "playlist_id_name_dic.pkl", "rb")
    playlist_id_name_dic = pickle.load(f1)
    f1.close()
    f2 = open(path + "popular_music_suprise_format1.txt")
    context = f2.readlines()
    new_playlist_id_name_dic = {}
    for line in context:
        playlist_id, song_id, rating, time = line.split(',')
        new_playlist_id_name_dic[playlist_id] = playlist_id_name_dic[
            playlist_id]
    pickle.dump(new_playlist_id_name_dic,
                open(path + "playlist_id_name_dic.pkl", "wb"))
    f2.close()
Example #30
	def get(self, algorithm, item_id):
		# SQL query
		conn = mysql.connect()
		cursor = conn.cursor()
		df = pd.read_sql_query("SELECT * FROM story_reviews", conn)

		# Data and Model
		reader = Reader(rating_scale=(1, 5))
		data = Dataset.load_from_df(df[['user_id', 'story_id', 'star']], reader)

		if algorithm == 'pearson':
			sim_options = {'name': 'pearson', 'user_based': False}
		elif algorithm == 'cosine':
			sim_options = {'name': 'cosine', 'user_based': False}
		elif algorithm == 'pearsonbaseline':
			sim_options = {'name': 'pearson_baseline', 'user_based': False}
		elif algorithm == 'msd':
			sim_options = {'name': 'msd', 'user_based': False}
		else:
			sim_options = {'name': 'pearson_baseline', 'user_based': False}

		model = KNNBaseline(sim_options=sim_options)
		
		# Training
		training_set = data.build_full_trainset()
		model.fit(training_set)

		# TESTING
		# cross_validate(model, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

		item_inner_id = model.trainset.to_inner_iid(item_id)
		item_neighbors_inner = model.get_neighbors(item_inner_id, k=10)
		item_neighbors = [model.trainset.to_raw_iid(inner_id) for inner_id in item_neighbors_inner]

		# Return K Nearest Neighbors
		return jsonify(recommendations = item_neighbors)
Example #31
def read_item_names():
    """Read the u.item file from the ml-100k dataset and return two
    mappings to convert raw ids into movie names, and movie names into raw ids.
    """
    file_name = get_dataset_dir() + '/ml-100k/ml-100k/u.item'
    rid_to_name = {}
    name_to_rid = {}
    with io.open(file_name, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            line = line.split('|')
            rid_to_name[line[0]] = line[1]
            name_to_rid[line[1]] = line[0]

    return rid_to_name, name_to_rid


# First, train the algorithm to compute the similarities between items
data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()
sim_options = {'name': 'pearson_baseline', 'user_based': False}
algo = KNNBaseline(sim_options=sim_options)
algo.fit(trainset)

# Read the mappings raw id <-> movie name
rid_to_name, name_to_rid = read_item_names()

# Retrieve inner id of the movie Toy Story
toy_story_raw_id = name_to_rid['Toy Story (1995)']
toy_story_inner_id = algo.trainset.to_inner_iid(toy_story_raw_id)

# Retrieve inner ids of the nearest neighbors of Toy Story.
toy_story_neighbors = algo.get_neighbors(toy_story_inner_id, k=10)

# Convert inner ids of the neighbors into names.
toy_story_neighbors = (algo.trainset.to_raw_iid(inner_id)
                       for inner_id in toy_story_neighbors)
toy_story_neighbors = (rid_to_name[rid]
                       for rid in toy_story_neighbors)

print('The 10 nearest neighbors of Toy Story are:')
for movie in toy_story_neighbors:
    print(movie)