Example #1
def collaborative_filter(user_id, new_words):
    ratings_dict = calc_collaborative_param(new_words, user_id)

    df = pd.DataFrame(ratings_dict)

    # A reader is still needed but only the rating_scale param is required.
    reader = Reader(rating_scale=(0.0, 5.0))
    # The columns must correspond to user id, item id and ratings (in that order).
    data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)
    # define a cross-validation iterator
    kf = KFold(n_splits=3)

    algo = KNNBasic()

    for trainset, testset in kf.split(data):
        # train and test algorithm.
        algo.fit(trainset)
        kf_predictions = algo.test(testset)
        # Compute and print Root Mean Squared Error
        accuracy.rmse(kf_predictions, verbose=True)

    trainset = data.build_full_trainset()

    # refit on the full trainset before predicting unseen (user, item) pairs
    algo.fit(trainset)

    new_data = trainset.build_anti_testset()
    predictions = algo.test(new_data)

    top_n = get_top_n(predictions, n=3)

    with open('top_n.json', 'w') as fp:
        dump(top_n, fp, indent=4)  # json.dump, assumed imported as "dump"

    return top_n
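Several examples on this page call a get_top_n helper that is not shown. A minimal sketch along the lines of the Surprise docs FAQ (the n default is an assumption):

from collections import defaultdict

def get_top_n(predictions, n=10):
    # map each user to a list of (item id, estimated rating) pairs
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    # sort each user's predictions and keep the n highest estimates
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n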
Example #2
def knn_running_time(data):
    '''
        Calculates the running times for training and predictions for Basic KNN

        Args:
            data(Dataset): a list of datasets with different numbers of users

        Returns:
            elapsedtime_KnnBasictrain: running time for training
            elapsedtime_KnnBasictest: running time for predictions on testset
    '''
    elapsedtime_KnnBasictrain = []
    elapsedtime_KnnBasictest = []

    # tune the parameters on the entire data
    param_grid = {
        'k': [5, 10, 20],
        'sim_options': {
            'name': ['msd', 'cosine', 'pearson'],
            'min_support': [1, 5],
            'user_based': [False]
        }
    }
    grid_search = GridSearchCV(KNNBasic,
                               param_grid,
                               measures=['rmse'],
                               cv=3)
    grid_search.fit(data[3])
    param = grid_search.best_params['rmse']
    k = param['k']
    sim_options = param['sim_options']

    # using the tuned parameters, measure running times
    for i in range(len(data)):
        training = data[i].build_full_trainset()
        testing = training.build_anti_testset()

        # training running time
        training_start = time.time()
        knn = KNNBasic(k=k, sim_options=sim_options)
        knn.fit(training)
        elapsedtime_KnnBasictrain.append(time.time() - training_start)

        # prediction running time
        test_start = time.time()
        knn.test(testing)
        elapsedtime_KnnBasictest.append(time.time() - test_start)
    return elapsedtime_KnnBasictrain, elapsedtime_KnnBasictest
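A possible driver for knn_running_time; the file path and the user-count cutoffs below are illustrative assumptions:

import pandas as pd
from surprise import Dataset, Reader

# build nested subsets of ml-100k with a growing number of users (path assumed)
ratings = pd.read_csv('ml-100k/u.data', sep='\t',
                      names=['userID', 'itemID', 'rating', 'timestamp'])
reader = Reader(rating_scale=(1, 5))
datasets = []
for n_users in (100, 300, 600, 943):
    subset = ratings[ratings['userID'] <= n_users]
    datasets.append(
        Dataset.load_from_df(subset[['userID', 'itemID', 'rating']], reader))

train_times, test_times = knn_running_time(datasets)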
Example #3
def use_cosine_similarity():
    start = time.time()
    performance = []

    data = Dataset.load_builtin('ml-100k')
    trainset = data.build_full_trainset()

    print('Using cosine similarity')
    sim_options = {
        'name': 'cosine',
        'user_based': False  # compute similarities between items
    }
    algo_cosine = KNNBasic(sim_options=sim_options)
    algo_cosine.fit(trainset)

    testset = trainset.build_anti_testset()
    predictions_KNN = algo_cosine.test(testset)

    accuracy_rmse = accuracy.rmse(predictions_KNN)
    accuracy_mae = accuracy.mae(predictions_KNN)
    performance.append(accuracy_rmse)
    performance.append(accuracy_mae)

    end = time.time()
    performance.append(end - start)

    return performance
Example #4
def rodar_modelo(data, teste_tamanho, sim_opcoes, k):
    # run the model: split, fit KNNBasic, and report RMSE on the test set
    treina, testa = train_test_split(data, test_size=teste_tamanho)
    knn = KNNBasic(k=k, sim_options=sim_opcoes)
    knn.fit(treina)
    knn_predicoes = knn.test(testa)
    accuracy.rmse(knn_predicoes)
    return knn
Example #5
def use_pearson_baseline():
    start = time.time()
    performance = []

    data = Dataset.load_builtin('ml-100k')
    trainset = data.build_full_trainset()

    print('Using Pearson baseline')
    sim_options = {
        'name': 'pearson_baseline',
        'shrinkage': 0  # no shrinkage
    }
    algo_pearson = KNNBasic(sim_options=sim_options)
    algo_pearson.fit(trainset)

    testset = trainset.build_anti_testset()
    predictions_KNN = algo_pearson.test(testset)

    accuracy_rmse = accuracy.rmse(predictions_KNN)
    accuracy_mae = accuracy.mae(predictions_KNN)
    performance.append(accuracy_rmse)
    performance.append(accuracy_mae)

    end = time.time()
    performance.append(end - start)

    return performance
Example #6
def KNN_Tester(data, trainset, testset, algo):
    param_grid = {
        'k': [50, 100],
        'sim_options': {
            'name': ['msd', 'cosine', 'pearson']
        }
    }

    gs = GridSearchCV(algo, param_grid, measures=['rmse'], cv=5)
    gs.fit(data)
    params = gs.best_params['rmse']
    algo = KNNBasic(k=params['k'], sim_options=params['sim_options'])
    algo.fit(trainset)
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions)
    precisions, recalls = precision_recall_at_k(predictions, k=10, threshold=4)
    avg_precision = sum(prec for prec in precisions.values()) / len(precisions)
    avg_recall = sum(rec for rec in recalls.values()) / len(recalls)
    metrics = {
        'rmse': rmse,
        'avg_precision': avg_precision,
        'avg_recall': avg_recall,
        'best_parameters': params
    }
    return metrics
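Several examples (#6, #8, #11, #28) rely on a precision_recall_at_k helper that is not shown. A minimal sketch along the lines of the Surprise docs FAQ (the threshold default is an assumption):

from collections import defaultdict

def precision_recall_at_k(predictions, k=10, threshold=3.5):
    # map each user to a list of (estimated rating, true rating) pairs
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = {}
    recalls = {}
    for uid, user_ratings in user_est_true.items():
        # sort by estimated rating, highest first
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        # relevant items, recommended items, and their overlap in the top k
        n_rel = sum(true_r >= threshold for (_, true_r) in user_ratings)
        n_rec_k = sum(est >= threshold for (est, _) in user_ratings[:k])
        n_rel_and_rec_k = sum((true_r >= threshold) and (est >= threshold)
                              for (est, true_r) in user_ratings[:k])
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k else 0
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel else 0
    return precisions, recalls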
Example #7
    def content(self):
        # content based
        surprise_data = self.prepare_Data()
        if surprise_data == []:
            print("No data provided")
            return

        sim_options = {
            'name': 'cosine',
            'user_based': False  # compute similarities between items
        }
        algo = KNNBasic(sim_options=sim_options)
        trainset = surprise_data.build_full_trainset()
        algo.fit(trainset)
        testset = trainset.build_testset()
        predictions = algo.test(testset)
        recommendation = self.get_top_n(predictions)
        new_list = []
        # recommendation entries are (item id, estimated rating) pairs
        for k, (i, _) in enumerate(recommendation[self.user_id]):
            new_list.append({'id': k, 'business id': i})
        recommend = {item['id']: item for item in new_list}
        return recommend
Example #8
def run_KNN(x_train, x_test, k):
    reader = Reader(rating_scale=(1, 5))
    data_train_df = Dataset.load_from_df(
        x_train[['userId', 'movieId', 'rating']], reader)
    data_test_df = Dataset.load_from_df(
        x_test[['userId', 'movieId', 'rating']], reader)
    data_train = data_train_df.build_full_trainset()
    data_test = data_test_df.build_full_trainset()
    data_testset = data_test.build_testset()
    algo = KNNBasic()
    algo.fit(data_train)
    pr = algo.test(data_testset)
    rec = format_baselines(pr)
    seen = format_baselines_apk(pr, x_test)
    predicted, actual = format_baselines_third(pr, x_test)
    print(predicted)
    print(actual)
    print(f'Alternative Precision {recommender_precision(predicted, actual)}')
    print(f'Alternative Recall {recommender_recall(predicted, actual)}')
    print(f'APK: {yallah(seen, k)}')
    precisions, recalls = precision_recall_at_k(rec, k)
    print(
        f'|KNN : Precision| = {sum(prec for prec in precisions.values()) / len(precisions)}'
    )
    print(
        f'|KNN : Recall| = {sum(rec for rec in recalls.values()) / len(recalls)}'
    )
Example #9
class KNN_Basic(BaseSurpriseSTLEstimator):
    """
    Args:
        :attr:`k` (int):
            number of neighbors
        :attr:`sim_options` (optional):
            option from surprise for a similarity metric
    
    """
    def __init__(self, k, name='KNN_Basic', sim_options=None):
        super().__init__(name, 'non_feature_based')
        self.k = k
        if sim_options is not None:
            self.model = KNNBasic(k=self.k,
                                  verbose=False,
                                  sim_options=sim_options)
        else:
            self.model = KNNBasic(k=self.k, verbose=False)

    def _fit(self, x):
        self.model.fit(x)

    def _predict(self, x):
        return self.model.test(x)

    def get_hyper_params(self):
        hparams = {'k': {'type': 'integer', 'values': [2, 13]}}
        return hparams

    def set_hyper_params(self, **kwargs):
        self.k = kwargs['k']
        self.model.k = self.k  # keep the wrapped KNNBasic in sync

    def similarity_matrix(self):
        return self.model.compute_similarities()
Example #10
def results():
    names = ['userID', 'itemID', 'rating']
    df = pd.read_csv('~/.surprise_data/ratings.csv', names=names)

    names1 = ['itemID', 'Profession', 'City']
    df1 = pd.read_csv('~/.surprise_data/workers1.csv', names=names1)

    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)

    trainset = data.build_full_trainset()

    sim_options = {'name': 'cosine', 'user_based': False}
    algo = KNNBasic(k=40, min_k=1, sim_options=sim_options)
    algo.fit(trainset)
    testset = trainset.build_anti_testset()

    predictions = algo.test(testset)
    top_n = get_top_n(predictions, n=10)
    myArray = []
    for uid, user_ratings in top_n.items():
        myArray.append([uid, list(user_ratings)])

    print(myArray)

    return render_template('secondpage.html', returned={'data': myArray})
Example #11
def get_accuracy(df,
                 genre,
                 neighbors=30,
                 min_neighbors=5,
                 seed=12345,
                 kfolds=5,
                 k=5,
                 threshold=4):
    """ Gets the precision and accuracy of the model for each genre using cross validation
        
        Args:
            df (pandas.DataFrame): the dataset of actual ratings
            genre (str): the genre for the model
            neighbors (int): the number of neighbors to take into account when training the model
                             Default is 30.
            min_neighbors (int): the number of neighbors a user must have in order to get a prediction.
                                Default is 5.
            seed (int): setting the random state. Default is 12345.
            kfolds (int): the number of folds for cross validation. Default is 5.
            k (int): number of recommendations for each user. default is 5.
            threshold (int): the cutoff rating at which an item will be considered 'enjoyed.'
        Returns:
            prec (float): The average precision across the k-fold cross validation
            rec (float): The average recall across the k-fold cross validation
    """

    data = df[df['genre'] == genre]
    data = data[['user_id', 'book_id', 'rating']]
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(data[['user_id', 'book_id', 'rating']], reader)
    algo_KNNbasic = KNNBasic(k=neighbors,
                             min_k=min_neighbors,
                             random_state=seed)

    kf = KFold(n_splits=kfolds, random_state=seed)
    prec_list = []
    recalls_list = []
    for trainset, testset in kf.split(data):
        algo_KNNbasic.fit(trainset)
        predictions = algo_KNNbasic.test(testset)
        precisions, recalls = precision_recall_at_k(predictions,
                                                    k=k,
                                                    threshold=threshold)

        # Precision and recall can then be averaged over all users
        precision = sum(prec for prec in precisions.values()) / len(precisions)
        recall = sum(rec for rec in recalls.values()) / len(recalls)
        logger.info("Precision:")
        logger.info(precision)
        logger.info("Recall:")
        logger.info(recall)
        prec_list.append(precision)
        recalls_list.append(recall)

    prec = sum(prec_list) / len(prec_list)
    rec = sum(recalls_list) / len(recalls_list)
    return prec, rec
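A hypothetical smoke test for get_accuracy; the tiny DataFrame and logger setup are assumptions, and precision_recall_at_k is the sketch shown after Example #6:

import logging
import pandas as pd

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

ratings_df = pd.DataFrame({
    'user_id': [1, 1, 2, 2, 3, 3],
    'book_id': [10, 11, 10, 12, 11, 12],
    'rating':  [4, 5, 3, 4, 5, 2],
    'genre':   ['fantasy'] * 6,
})
prec, rec = get_accuracy(ratings_df, genre='fantasy', kfolds=2, k=2)
print(prec, rec)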
Example #12
def executeTraining(modelFileName, simOptions):
    knn = KNNBasic(sim_options=simOptions, k=3)
    knn.fit(trainingSet)  # trainingSet is assumed to be a module-level trainset
    testSet = trainingSet.build_anti_testset()
    predictions = knn.test(testSet)

    os.makedirs('./outputs', exist_ok=True)

    # persist the trained model
    joblib.dump(knn, os.path.join('./outputs', modelFileName))
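Loading the persisted model back is symmetric; the file name here is hypothetical:

import joblib

knn = joblib.load('./outputs/knn_model.pkl')  # hypothetical file name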
Example #13
def main():
    data = Dataset.load_builtin('ml-100k')
    trainset, testset = train_test_split(data, test_size=.25)
    algo = KNNBasic()
    # Train the algorithm on the trainset, and predict ratings for the testset
    algo.fit(trainset)
    predictions = algo.test(testset)
    # Then compute RMSE
    score = accuracy.rmse(predictions, verbose=False)
    print('rmse: ', score)
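The same evaluation can be done in one call with Surprise's cross_validate; a minimal sketch:

from surprise import Dataset, KNNBasic
from surprise.model_selection import cross_validate

data = Dataset.load_builtin('ml-100k')
cross_validate(KNNBasic(), data, measures=['RMSE', 'MAE'], cv=5, verbose=True)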
Example #14
def knn_basic_movie(train, test, ids, Xtest, Xids):
    """
    kNN basic approach on movies
    Argument : train, the trainset
               test, the testset
               ids, unknown ratings
               Xtest, predicted ratings for testset, to be used for final blending
               Xids, predicted ratings for unknown ratings, to be used for final blending
    """

    print('kNN Basic Movie')
    algo = KNNBasic(k=21,
                    sim_options={'name': 'msd',
                                 'min_support': 2,
                                 'user_based': False},
                    verbose=False)

    #Train algorithm on training set
    algo.fit(train)

    #Predict on train and compute RMSE
    predictions = algo.test(train.build_testset())
    print('   Training RMSE: ', accuracy.rmse(predictions, verbose=False))

    #Predict on test and compute RMSE
    predictions = algo.test(test)
    rmse = accuracy.rmse(predictions, verbose=False)
    print('   Test RMSE: ', rmse)

    preds_test = np.zeros(len(predictions))
    for j, pred in enumerate(predictions):
        preds_test[j] = pred.est

    #Predict unknown ratings
    preds_ids = []
    for i in range(len(ids[0])):
        pred = algo.predict(str(ids[0][i]), str(ids[1][i]))
        preds_ids.append(pred.est)

    Xtest.append(preds_test)
    Xids.append(preds_ids)
    return rmse, Xtest, Xids, preds_test, preds_ids
Example #15
def run_collaborative_filtering():
    global top_recommendations
    global knn
    data = Dataset.load_builtin("ml-100k")
    training_set = data.build_full_trainset()
    sim_options = {'name': 'pearson_baseline', 'user_based': True}
    knn = KNNBasic(sim_options=sim_options)
    knn.fit(training_set)
    test_set = training_set.build_anti_testset()
    predictions = knn.test(test_set)
    top_recommendations = get_top_recommendations(predictions)
    return 'OK'
Example #16
def do_knn(trainingSet, start_time, sim_options):
    knn = KNNBasic(sim_options=sim_options)
    # evaluate(knn, Dataset.load_builtin("ml-100k"), measures=['RMSE', 'MAE'])
    knn.fit(trainingSet)
    testSet = trainingSet.build_anti_testset()
    print("Training complete")
    predictions = knn.test(testSet)
    print("Predictions ready")
    LOGGER.info("0;Data prediction completed in '%s' minutes",
                str((time.time() - start_time) / 60))
    print("Rmse values for doing model based recomm on movielens data is " +
          str(accuracy.rmse(predictions)))
    return predictions
Example #17
def algoFunc(train_data, test_data):
    SVD_var = SVD()
    print("Singular Value Decomposition :\n")
    SVD_var.fit(train_data)
    predict_var = SVD_var.test(test_data)
    SVD_RMSE_var = accuracy.rmse(predict_var, verbose=True)
    SVD_MAE_var = accuracy.mae(predict_var, verbose=True)

    print("\nProbabilistic Matrix Factorization :\n")
    PMF_var = SVD(biased=False)
    PMF_var.fit(train_data)
    predict_var = PMF_var.test(test_data)
    PMF_RMSE_var = accuracy.rmse(predict_var, verbose=True)
    PMF_MAE_var = accuracy.mae(predict_var, verbose=True)

    print("\nNon-negative Matrix Factorization :\n")
    NMF_var = NMF()
    NMF_var.fit(train_data)
    predict_var = NMF_var.test(test_data)
    NMF_RMSE_var = accuracy.rmse(predict_var, verbose=True)
    NMF_MAE_var = accuracy.mae(predict_var, verbose=True)

    print("\nUser based Collaborative Filtering algorithm :\n")
    UB_var = KNNBasic(sim_options={'user_based': True})
    UB_var.fit(train_data)
    predict_var = UB_var.test(test_data)
    user_RMSE_var = accuracy.rmse(predict_var, verbose=True)
    user_MAE_var = accuracy.mae(predict_var, verbose=True)

    print("\nItem based Collaborative Filtering algorithm :\n")
    IB_var = KNNBasic(sim_options={'user_based': False})
    IB_var.fit(train_data)
    predict_var = IB_var.test(test_data)
    item_RMSE_var = accuracy.rmse(predict_var, verbose=True)
    item_MAE_var = accuracy.mae(predict_var, verbose=True)
    print("\n")

    return SVD_RMSE_var, SVD_MAE_var, PMF_RMSE_var, PMF_MAE_var, NMF_RMSE_var, NMF_MAE_var, user_RMSE_var, user_MAE_var, item_RMSE_var, item_MAE_var
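A possible driver for algoFunc, assuming the ml-100k split pattern used elsewhere on this page and the surprise imports the function body relies on:

from surprise import Dataset
from surprise.model_selection import train_test_split

data = Dataset.load_builtin('ml-100k')
train_data, test_data = train_test_split(data, test_size=0.25)
scores = algoFunc(train_data, test_data)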
Example #18
def user_based_cf(co_pe):
    # INITIALIZE REQUIRED PARAMETERS
    # path = 'ml-100k/u.user'
    prnt = "USER"
    sim_op = {'name': co_pe, 'user_based': True}
    algo = KNNBasic(sim_options=sim_op)

    reader = Reader(line_format="user item rating",
                    sep='\t',
                    rating_scale=(1, 5))
    df = Dataset.load_from_file('ml-100k/u.data', reader=reader)

    # START TRAINING
    trainset = df.build_full_trainset()

    # APPLYING ALGORITHM KNN Basic
    algo.fit(trainset)
    print("ALGORITHM USED", co_pe)

    # --------------------------------------------- MARKERS

    f = io.open("_AlgoHist_ub.txt", "wb")
    f.write(repr(co_pe))
    f.close()

    # --------------------------------------------- MARKERS END

    print "CF Type:", prnt, "BASED"

    # PEEKING PREDICTED VALUES
    search_key = input("Enter User ID:")
    item_id = input("Enter Item ID:")
    actual_rating = float(input("Enter actual Rating:"))

    print(algo.predict(str(search_key), item_id, actual_rating))

    testset = trainset.build_anti_testset()
    predictions = algo.test(testset=testset)

    top_n = get_top_n(predictions, 5)
    result_u = True

    k = int(input("Enter size of Neighborhood (Min:1, Max:40)"))

    inner_id = algo.trainset.to_inner_uid(search_key)
    neighbors = algo.get_neighbors(inner_id, k=k)
    print("Nearest Matching users are:")
    for i in neighbors:
        print("\t " * 6, i)
    return top_n, result_u
Example #19
    def CosineAlgorithmSurprise(self):
        sim_options = {'name': 'cosine', 'user_based': True, 'min_support': 1}
        model = KNNBasic(sim_options=sim_options)
        model.fit(self.Train)
        #testset = self.Train.build_anti_testset()
        #predictions = model.test(testset)
        predictions = model.test(self.Test)

        df = pd.DataFrame(predictions,
                          columns=[
                              'user_id', 'song_id', 'listen_count',
                              'prediction', 'details'
                          ])
        return model, df
Example #20
class MusicRecommend():
    def __init__(self):
        self.current = 0
        self.updateTimeStamp = [(self.current, time.time())]
        self.top_n = defaultdict(list)
        reader = Reader(line_format=READER_OPT["line_format"],
                        sep=READER_OPT["sep"],
                        rating_scale=READER_OPT["rating_scale"],
                        skip_lines=READER_OPT["skip_lines"])
        self.data = Dataset.load_from_file(RATE_PATH, reader=reader)
        if os.path.isfile(RECOMMEND_PATH):
            self.predictions, self.algo = dump.load(RECOMMEND_PATH)
        else:
            sim_opt = {
                "name": ALGO_OPT["similarity"],
                "user_based": ALGO_OPT["user_based"]
            }
            self.algo = KNNBasic(sim_options=sim_opt)
            self.predictions = []

    def __del__(self):
        # dump.dump(RECOMMEND_PATH, predictions=self.predictions, algo=self.algo, verbose=0)
        StdError.info(
            'The dump has been saved as file {}'.format(RECOMMEND_PATH))

    def calculate(self, n=100):
        trainset = self.data.build_full_trainset()
        self.algo.fit(trainset)
        testset = trainset.build_anti_testset()
        self.predictions = self.algo.test(testset)
        self.current += 1
        self.updateTimeStamp.append((self.current, time.time()))
        self.top_n = defaultdict(list)
        for uid, iid, t_rating, est, _ in self.predictions:
            self.top_n[uid].append((iid, est))
        for uid, user_ratings in self.top_n.items():
            user_ratings.sort(key=lambda x: x[1], reverse=True)
            self.top_n[uid] = user_ratings[:n]
        return (self.predictions, self.top_n)

    def get_top_n(self, uid, start=0, end=RECOMMEND_NUM):
        tmplist = self.top_n[str(uid)][start:end]
        return [iid for iid, _ in tmplist]

    def show(self):
        if self.current > 0:
            StdError.info("recommend current version={}".format(self.current))
            for uid, user_ratings in self.top_n.items():
                StdError.info(
                    str(uid) + ":" + str([iid for iid, _ in user_ratings]))
Example #21
def gen_pred_matrix_ibcf(co_pe):
    # ---------------------------------------------------- IBCF as is

    # INITIALIZE REQUIRED PARAMETERS
    path = '/home/mister-t/Projects/PycharmProjects/RecommendationSys/ml-100k/u.item'
    prnt = "ITEM"
    sim_op = {'name': co_pe, 'user_based': False}
    algo = KNNBasic(sim_options=sim_op)

    reader = Reader(line_format="user item rating",
                    sep='\t',
                    rating_scale=(1, 5))
    df = Dataset.load_from_file('ml-100k/u.data', reader=reader)

    # START TRAINING
    trainset = df.build_full_trainset()

    # APPLYING ALGORITHM KNN Basic
    algo.fit(trainset)
    print("\t\t >>> TRAINING DONE <<<\n")

    # Read the mappings raw id <-> movie name
    # rid_to_name, name_to_rid = read_item_names(path)
    print "CF Type:", prnt, "BASED"
    print "Please be Patient while 'pred_matrix-full_ibcf.csv' is being Generated"
    for i in range(5):
        print "."
        time.sleep(0.5)
    # --------------------------------------------------------- EXPERIMENTAL

    testset = trainset.build_anti_testset()
    predictions = algo.test(testset=testset)

    top_n = get_top_n(predictions, 5)

    # --------------------------------------------------------- EXPERIMENTAL

    # ---------------------------------------------------- IBCF as is

    csvfile = 'pred_matrix-full_ibcf.csv'
    with open(csvfile, "w") as output:
        writer = csv.writer(output, delimiter=',', lineterminator='\n')
        writer.writerow(['uid', 'iid', 'rat'])
        for uid, user_ratings in top_n.items():
            for (iid, r) in user_ratings:
                value = uid, iid, r
                writer.writerow(value)
    print "Done! You may now check the file in same Dir. as of Program"
Example #22
def loadTrainPredict():

    data = Dataset.load_builtin("ml-100k")
    trainingSet = data.build_full_trainset()

    sim_options = {'name': 'cosine', 'user_based': False}
    knn = KNNBasic(sim_options=sim_options)
    knn.fit(trainingSet)

    testSet = trainingSet.build_anti_testset()
    predictions = knn.test(testSet)

    return predictions
Example #23
def main():
    data = Dataset.load_builtin("ml-100k")
    trainingSet = data.build_full_trainset()
    sim_options = {
        'name': 'cosine',
        'user_based': True
    }

    knn = KNNBasic(sim_options=sim_options)
    knn.fit(trainingSet)
    testSet = trainingSet.build_anti_testset()
    predictions = knn.test(testSet)

    top3_recommendations = get_top3_recommendations(predictions)
    rid_to_name = read_item_names()
    for uid, user_ratings in top3_recommendations.items():
        print(uid, [rid_to_name[iid] for (iid, _) in user_ratings])
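read_item_names is not shown in this example; a sketch following the mapping loader in the Surprise examples (the path assumes the usual ~/.surprise_data layout, and the single return value matches the call above):

import io
from surprise import get_dataset_dir

def read_item_names():
    # map raw item ids to movie names using the ml-100k u.item file
    file_name = get_dataset_dir() + '/ml-100k/ml-100k/u.item'
    rid_to_name = {}
    with io.open(file_name, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            line = line.split('|')
            rid_to_name[line[0]] = line[1]
    return rid_to_name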
Example #24
def main():
    row_num = 5000
    #read the ratings file into a pandas DataFrame so surprise can use it
    ratings_data = pd.read_csv('datasets/song_dataset_ranking.txt', sep="\t", header=None, nrows=row_num)
    #define the document's columns
    ratings_data.columns = ['userId', 'songId', 'rating']
    #read the csv containing the songs metadata
    song_dict = {}
    with open('datasets/song_data.csv', 'rt') as song_data:
        c_reader = csv.reader(song_data, delimiter=',', quotechar='|')
        #build a hash mapping song id -> [title, artist], e.g.
        #keysonisonioiaofnai: ['Smoke on the water', 'Deep purple']
        for row in c_reader:
            song_dict[row[0]] = [row[1], row[3]]
    #surprise reader, define the rating scale to use
    reader = Reader(rating_scale=(1,100))
    #transform info to a surprise dataset
    data = Dataset.load_from_df(ratings_data, reader)
    #split data into training and testSet
    training_set, testSet = train_test_split(data, test_size=.25)
    #define the algorithm to use (item-based cosine similarity)
    knn = KNNBasic(sim_options={'name': 'cosine', 'user_based': False})
    #train the algorithm
    knn.fit(training_set)
    print("Done training")
    print("Test set length", len(testSet))
    print("testing")
    #make predictions
    predictions = knn.test(testSet)
    print("getting recommendations")
    #measure accuracy, Compute FCP (Fraction of Concordant Pairs).
    accuracy.fcp(predictions)
    #get top n predictions
    top_n = get_top_n(predictions,4)
    with open('predictions.txt', 'w') as out_file:
        for uid, user_ratings in top_n.items():
            out_file.write("prediction for " + str(uid) + ":\n")
            result_array = [find_song_info_in_data(iid, song_dict)
                            for (iid, _) in user_ratings]
            for item in result_array:
                out_file.write("\t")
                out_file.write('-'.join(item))
                out_file.write("\n")
Example #25
def kNNBasic(trainset, testset):
    # KNN basic
    print("\n" + "-" * 5 + " KNNBasic algorithm using surprise package " +
          "-" * 5)
    sim_options = {
        'name': 'MSD',  # MSD similarity measure gives the best result
        # 'user_based': True  -> similarities between users: MAE = 0.7744112391896695
        'user_based': False  # similarities between items: MAE = 0.7685376263051
    }
    algo = KNNBasic(sim_options=sim_options)
    # algo = KNNBasic()
    algo.fit(trainset)
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions)
    mae = accuracy.mae(predictions)
    return rmse, mae, predictions
Example #26
def KNN_top_n(data):
    # First train a KNNBasic algorithm on the movielens dataset.
    # data = Dataset.load_builtin('ml-100k')
    trainset = data.build_full_trainset()
    algo = KNNBasic()
    algo.fit(trainset)

    # Then predict ratings for all pairs (u, i) that are NOT in the training set.
    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)

    top_n = get_top_n(predictions, n=10)

    # Dump algorithm and reload it.
    file_name = os.path.expanduser('./KNNBasic_model_couchDB')
    dump.dump(file_name, algo=algo)
    print("file dumped")
Example #27
class RecommenderItemBased(Recommender):
    def __init__(self, recommendation_dataset: RecommendationDataSet, similarity='cosine'):
        super(RecommenderItemBased, self).__init__(recommendation_dataset.movies)
        self.recommendation_dataset = recommendation_dataset
        sim_options = {'name': similarity,
                       'user_based': False
                       }
        self.algorithm = KNNBasic(sim_options=sim_options)

    def get_recommendation(self, watched,  k=20, k_inner_item=100):
        similar_items = self.get_similar_movie_ids(watched, k=k, k_inner_item=k_inner_item)
        return self.movies.get_movie_by_movie_ids(similar_items)

    def fit(self, dataset):
        return self.algorithm.fit(dataset)

    def test(self, test_set):
        return self.algorithm.test(test_set)

    def get_similar_movie_ids(self, watched, k=20, k_inner_item=100):
        """
        Based on item similarity, find the movies nearest to those watched.
        """
        full_dataset = self.algorithm.trainset

        # watched movies
        watched = {full_dataset.to_inner_iid(key): value for key,value in watched.items()}

        # Get the most liked watched movies
        most_liked = heapq.nlargest(k_inner_item, watched, key=watched.get)
        
        # Sum similarity scores over each liked movie's row, weighted by rating
        candidates = defaultdict(float)
        for most_liked_inner_id in most_liked:
            rating = watched[most_liked_inner_id]
            similarity_row = self.algorithm.sim[most_liked_inner_id]

            for inner_id, score in enumerate(similarity_row):
                if inner_id != most_liked_inner_id:
                    candidates[inner_id] += score * (rating / 5.0)

        # return the top-n most similar movies
        similar_items = [full_dataset.to_raw_iid(i)
                         for i in heapq.nlargest(k, candidates, key=candidates.get)]
        return similar_items
Example #28
    def Basic_CF(self):
        kf = KFold(n_splits=5)
        sim_options = {'name': 'cosine', 'user_based': True}
        algo = KNNBasic(k=40, min_k=1, sim_options=sim_options)

        for trainset, testset in kf.split(self.data):
            algo.fit(trainset)
            predictions = algo.test(testset)

            precisions, recalls = self.precision_recall_at_k(predictions)

            P = sum(prec for prec in precisions.values()) / len(precisions)
            R = sum(rec for rec in recalls.values()) / len(recalls)
            F1 = 2 * P * R / (P + R) if (P + R) else 0

            print("Precision : ", P)
            print("Recall    : ", R)
            print("F1        : ", F1)
Example #29
def GetAccuracy():
    d = Data()
    data = d.loadData()

    trainSet = data.build_full_trainset()

    _, testSet = train_test_split(data, test_size=.25, random_state=1)

    # sim_options is assumed to be defined at module level in the source
    model = KNNBasic(sim_options=sim_options, verbose=False)
    model.fit(trainSet)
    predictions = model.test(testSet)

    mae = accuracy.mae(predictions, verbose=False)
    rmse = accuracy.rmse(predictions, verbose=False)

    return mae, rmse
Example #30
def recommendation_base_on_itemCF(train_data, user_item_matrix, user_ID, N):
    # reader for the (user, item, rating) DataFrame
    reader = Reader(rating_scale=(1, 5))
    # load the data
    raw_data = Dataset.load_from_df(user_item_matrix, reader=reader)

    # build the model with 5-fold cross validation
    kf = KFold(n_splits=5)
    knn_item = KNNBasic(k=40, sim_options={'user_based': False})
    # train on each fold and report the RMSE
    for train_set, test_set in kf.split(raw_data):
        knn_item.fit(train_set)
        predictions = knn_item.test(test_set)
        accuracy.rmse(predictions, verbose=True)

    # songs each user has listened to
    user_songs = {}
    for user, group in user_item_matrix.groupby('user'):
        user_songs[user] = group['item'].values.tolist()
    # the full set of songs
    songs = user_item_matrix['item'].unique().tolist()
    # mapping from song ID to song title
    songID_titles = {}
    for index in train_data.index:
        songID_titles[train_data.loc[index, 'song']] = train_data.loc[index,
                                                                      'title']

    # itemCF
    # songs the target user has already listened to
    user_items = user_songs[user_ID]

    # predicted ratings for songs the user has not yet heard
    item_rating = {}
    for item in songs:
        if item not in user_items:
            item_rating[item] = knn_item.predict(user_ID, item).est

    # pick the N songs with the highest predicted ratings
    song_id = dict(
        sorted(item_rating.items(), key=lambda x: x[1], reverse=True)[:N])
    song_topN = [songID_titles[s] for s in song_id.keys()]

    return song_topN
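A hypothetical smoke test; the tiny DataFrames only mirror the column names the function expects:

import pandas as pd

user_item_matrix = pd.DataFrame({
    'user': ['u1', 'u1', 'u2', 'u2', 'u3'],
    'item': ['s1', 's2', 's1', 's3', 's2'],
    'rating': [5, 3, 4, 2, 5],
})
train_data = pd.DataFrame({'song': ['s1', 's2', 's3'],
                           'title': ['Song A', 'Song B', 'Song C']})
print(recommendation_base_on_itemCF(train_data, user_item_matrix, 'u1', N=2))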