def trainer(self):
     # Set the random seed that numpy (used internally by Surprise) will use.
     my_seed = random.randint(0, 2**32 - 1)  # numpy seeds must be in [0, 2**32 - 1]
     random.seed(my_seed)
     numpy.random.seed(my_seed)
     # Reassurance that the script is actually running.
     self.printer(
         "\nNow training on the MovieLens latest small dataset. (8 folds used)"
     )
     self.printer("Please wait...\n")
     # Define the file's format (ratings.csv from ml-latest-small has a header line)
     reader = Reader(line_format='user item rating timestamp', sep=',',
                     skip_lines=1)
     # Load the data from the ratings.csv file
     data = Dataset.load_from_file('./ml-latest-small/ratings.csv',
                                   reader=reader)
     # Use the SVD algorithm for prediction
     method = SVD()
     start = time.time()
     # Use 8-fold cross validation and evaluate the results with RMSE and MAE
     measurements = cross_validate(method,
                                   data,
                                   measures=['RMSE', 'MAE'],
                                   cv=8,
                                   verbose=False,
                                   n_jobs=-2,
                                   return_train_measures=True)
     # Print the random seed used for fold assignments
     self.printer(
         "Random seed used for fold assignment: {}\n".format(my_seed))
     # Show the stats
     meanFitTime = numpy.mean(measurements["fit_time"])
     meanTestTime = numpy.mean(measurements["test_time"])
     meanTestMAE = numpy.mean(measurements["test_mae"])
     meanTestRMSE = numpy.mean(measurements["test_rmse"])
     meanTrainMAE = numpy.mean(measurements["train_mae"])
     meanTrainRMSE = numpy.mean(measurements["train_rmse"])
     self.printer(
         "Mean fit time per fold: {:0.5f} seconds".format(meanFitTime))
     self.printer(
         "Mean test time per fold: {:0.5f} seconds".format(meanTestTime))
     self.printer("Mean train MAE per fold: {:0.5f}".format(meanTrainMAE))
     self.printer("Mean train RMSE per fold: {:0.5f}".format(meanTrainRMSE))
     self.printer("Mean test MAE per fold: {:0.5f}".format(meanTestMAE))
     self.printer("Mean test RMSE per fold: {:0.5f}\n".format(meanTestRMSE))
     # Train with the dataset
     trainset = data.build_full_trainset()
     method.fit(trainset)
     end = time.time()
     spent = end - start
     self.printer(
         "Training and testing time: {:0.3f} seconds\n".format(spent))
     process = psutil.Process(os.getpid())
     self.printer("Memory used:")
     self.printer("{:0.5f}".format(process.memory_info().rss / 1048576.0) +
                  " MB Physical")
     self.printer("{:0.5f}".format(process.memory_info().vms / 1048576.0) +
                  " MB Virtual")
     return method, trainset
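A minimal usage sketch for the trained model returned above (the enclosing class and its printer helper are not shown, so the call below is an assumption; raw ids are strings because the ratings were loaded from a CSV file):

# Hypothetical usage, assuming `recommender` is an instance of the enclosing class.
model, trainset = recommender.trainer()
prediction = model.predict('1', '31')   # raw user id, raw item id
print(prediction.est)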
Example #2
def main():
    # Initialize  dataset (from old code)
    Y_train = np.loadtxt('data/train.txt').astype(int)
    Y_test = np.loadtxt('data/test.txt').astype(int)

    M = max(max(Y_train[:, 0]), max(Y_test[:, 0])).astype(int)  # users
    N = max(max(Y_train[:, 1]), max(Y_test[:, 1])).astype(int)  # movies
    print("Factorizing with M: ", M, " users, N: ", N, " movies.")

    # Load data with Surprise
    reader = Reader(line_format='user item rating', sep='\t')
    Y_train = Dataset.load_from_file('data/train.txt', reader=reader)
    Y_test = Dataset.load_from_file('data/test.txt', reader=reader)

    trainset = Y_train.build_full_trainset()
    testset = Y_test.build_full_trainset().build_testset()

    K = 20
    reg = 0.1
    lr = 0.01

    # PART 5-3: INTRODUCE MEAN AND REGULARIZED BIAS TERMS
    # (based off of Step 1c in the guide)
    # Create model and fit it
    algo = SVD(n_factors=K, lr_all=lr, reg_all=reg, n_epochs=30, biased=True)
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Evaluate error using err function from problem set
    E_out = get_err(predictions)
    print('E_out (MSE): ', E_out)

    # Try GridSearchCV
    '''
    param_grid = {'n_epochs': [10, 15, 20, 25, 30],
                 'lr_all':   [0.002, 0.005, 0.01, 0.02, 0.03],
                 'reg_all':  [0.005, 0.01, 0.05, 0.1, 0.2, 0.3]}
    gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
    gs.fit(Y_train)

    print('Grid Search:')
    print(0.5 * gs.best_score['rmse'] ** 2)
    print(gs.best_params['rmse'])
    '''
    # Results: best params were n_epochs=30, reg=0.1, lr=0.01

    # Apply SVD to V
    V = algo.qi.T
    U = algo.pu
    A, s, B = np.linalg.svd(V)
    # Use first 2 columns of A
    A2 = A[:, :2]
    U_projected = np.dot(A2.T, U.T)
    V_projected = np.dot(A2.T, V).T
    X = V_projected[:, 0]
    Y = V_projected[:, 1]

    visualize(X, Y, '5-3')
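The snippet above relies on an external get_err helper from the problem set. A minimal sketch of what it might compute is below; plain mean squared error over Surprise Prediction tuples is an assumption (the course code may include a 1/2 factor, as the commented grid-search comparison 0.5 * rmse**2 suggests).

import numpy as np

def get_err(predictions):
    """Hypothetical sketch: mean squared error over Surprise Prediction tuples
    (uid, iid, r_ui, est, details)."""
    return np.mean([(true_r - est) ** 2 for (_, _, true_r, est, _) in predictions])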
def Liked(id):
	testSubject = id
	ml = MovieLens()

	print("Loading movie ratings...")
	data = ml.loadMovieLensLatestSmall()

	userRatings = ml.getUserRatings(testSubject)
	loved = []
	hated = []
	for ratings in userRatings:
		if (float(ratings[1]) > 4.0):
			loved.append(ratings)
		if (float(ratings[1]) < 3.0):
			hated.append(ratings)

	print("\nUser ", testSubject, " loved these movies:")
	for ratings in loved:
		print(ml.getMovieName(ratings[0]))
	print("\n...and didn't like these movies:")
	for ratings in hated:
		print(ml.getMovieName(ratings[0]))

	print("\nBuilding recommendation model...")
	trainSet = data.build_full_trainset()

	algo = SVD()
	algo.fit(trainSet)

	print("Computing recommendations...")
	testSet = BuildAntiTestSetForUser(testSubject, trainSet)
	predictions = algo.test(testSet)

	recommendations = []

	print ("\nWe recommend:")
	for userID, movieID, actualRating, estimatedRating, _ in predictions:
		intMovieID = int(movieID)
		recommendations.append((intMovieID, estimatedRating))

	recommendations.sort(key=lambda x: x[1], reverse=True)
	s="\n"+str(id)
	for ratings in recommendations[:10]:
		s+=","+ml.getMovieName(ratings[0])
	file = open("E:\\Neeraj\\LikhedBase.txt", "r") 
	alld=file.readlines()
	file.close()
	file1 = open("E:\\Neeraj\\LikhedBase.txt", "w")
	for r1 in alld:
		print(r1)
		u=r1.find(",")
		if(r1[0:u]==str(id)):
			pass
		else:
			file1.write(r1)
	file1.write(s)
	file1.close()
	print ("\nDone")
Example #4
def recommender_testing(file_name):
    '''Perform testing on the recommender, main function
    
    :param file_name: dataset file name
    :return: 
    '''
    print('began testing')
    listening_data = pd.read_table(file_name)

    raw_data = listening_data.drop(listening_data.columns[1], axis=1)
    raw_data.columns = ['user', 'artist', 'plays']

    # Drop NaN columns
    data = raw_data.dropna()

    data = data.copy()

    # Create a numeric user_id and artist_id column
    data['user'] = data['user'].astype("category")
    data['artist'] = data['artist'].astype("category")
    data['user_id'] = data['user'].cat.codes
    data['artist_id'] = data['artist'].cat.codes

    # from Surprise documentation
    algo = SVD()
    reader = Reader(rating_scale=(1, 50))
    sampled_data = data.sample(500000)
    surprise_data = Dataset.load_from_df(sampled_data[['user_id', 'artist_id', 'plays']], reader)
    trainset = surprise_data.build_full_trainset()
    algo.fit(trainset=trainset)
    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)
    top_n = get_top_n(predictions, n=10)

    # Print the recommended items for each user
    for uid, user_ratings in top_n.items():
        print(uid, [iid for (iid, _) in user_ratings])

    cross_validate(algo, surprise_data, ['RMSE', 'MAE'], cv=4, verbose=True)

    # The implicit library expects data as a item-user matrix so we
    # create two matricies, one for fitting the model (item-user)
    # and one for recommendations (user-item)
    sparse_item_user = scipy.sparse.csr_matrix((data['plays'].astype(float), (data['artist_id'], data['user_id'])))
    sparse_user_item = scipy.sparse.csr_matrix((data['plays'].astype(float), (data['user_id'], data['artist_id'])))

    # Initialize the als model and fit it using the sparse item-user matrix
    model = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.1, iterations=20)

    data_conf = (sparse_item_user).astype('double')
    model.fit(data_conf)
    surprise_testing(surprise_data)
def train_user_base(movies):
	reader = Reader()
	ratings = pd.read_csv('ratings_small.csv')
	data = Dataset.load_from_df(ratings[['userId','movieId','rating']],reader)
	# data.split(n_folds=5)  # Dataset.split() was removed from Surprise; not needed here
	svd = SVD()
	trainset = data.build_full_trainset()
	svd.fit(trainset)
	return svd
Example #6
def recomendacion(usuario):
    array = []
    for rate in Calificacion.objects.all():
        array.append([rate.usuario_id, rate.asignatura_id, rate.calificacion])

    df = pd.DataFrame(data=array)
    reader = Reader(rating_scale=(0, 10))
    data = Dataset.load_from_df(df, reader)
    trainingSet = data.build_full_trainset()
    param_grid = {
        'n_factors': [50, 100, 150],
        "n_epochs": [40, 50, 60],
        "lr_all": [0.002, 0.005],
        "reg_all": [0.4, 0.6]
    }

    gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=3)
    gs.fit(data)
    # Optimal parameters found by the grid search
    params = gs.best_params["rmse"]
    SVDoptimized = SVD(n_factors=params['n_factors'],
                       n_epochs=params['n_epochs'],
                       lr_all=params['lr_all'],
                       reg_all=params['reg_all'])
    SVDoptimized.fit(trainingSet)

    asig = Asignatura.objects.all()

    asig_user = Calificacion.objects.all().filter(usuario_id=usuario.id)

    # Subjects the user has not rated yet
    asignaturas_SinC = []
    for asignatura in asig:
        encontrado = False
        for asignatura_usuario in asig_user:
            if (asignatura_usuario.asignatura_id == asignatura.codigo):
                encontrado = True
        if (not encontrado):
            asignaturas_SinC.append(asignatura)

    # Recommended subjects
    asignaturas_rec = []

    for asignatura in asignaturas_SinC:
        asignaturas_rec.append({
            'asignatura':
            asignatura,
            'svd':
            SVDoptimized.predict(usuario.id, asignatura.codigo).est
        })
    # Sort key: the predicted SVD rating
    def ordenador(e):
        return e['svd']

    asignaturas_rec.sort(reverse=True, key=ordenador)

    return asignaturas_rec
def get_svd(df_ratings):
    reader = Reader()
    #training
    data = Dataset.load_from_df(df_ratings, reader)
    # data.split(n_folds=5)  # Dataset.split() was removed from Surprise; not needed here
    svd = SVD()
    trainset = data.build_full_trainset()
    svd.fit(trainset)
    return svd
Example #8
def train(data):
    reader = Reader()
    svd = SVD()

    data_sp = Dataset.load_from_df(data[['user_id', 'movie_id', 'rating']],
                                   reader)
    train = data_sp.build_full_trainset()
    svd.fit(train)
    return svd
Example #9
def train_model(data):
    """
    Accepts dataset and returns trained model
     """
    trainsetfull = data.build_full_trainset()
    algo = SVD()
    cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    algo.fit(trainsetfull)
    return algo
Example #10
def svdalgorithm(trainset, testset):

    print("\n" + "-" * 5 + " SVD algorithm using surprise package " + "-" * 5)
    algo = SVD()
    algo.fit(trainset)
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions)
    mae = accuracy.mae(predictions)
    return rmse, mae, predictions
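A short usage sketch for the function above; the built-in ml-100k download and the 75/25 split are assumptions, not part of the original snippet.

from surprise import Dataset
from surprise.model_selection import train_test_split

data = Dataset.load_builtin('ml-100k')          # downloads the dataset on first use
trainset, testset = train_test_split(data, test_size=0.25)
rmse, mae, predictions = svdalgorithm(trainset, testset)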
Example #11
class SVDCollaborativeFiltering:

    # Based on Singular Value Decomposition (SVD) implementation built into surprise library
    # Uses a matrix factorization method to reduce a matrix into lower dimension parts simplifying the calculations

    def __init__(self, ratings):
        # Surprise library does not allow using data frames as training and test set values
        reader = Reader(rating_scale=(1, 5))
        data = Dataset.load_from_df(ratings[['user_id', 'book_id', 'rating']],
                                    reader)

        self.train, self.test = train_test_split(data, test_size=.20)
        self.model = SVD()

    def test_model(self):
        # Checks the predicted values against the test set
        # Returns Mean Absolute Error (MAE) and Root Mean Square Error (RMSE)
        predictions = self.model.test(self.test)
        return accuracy.mae(predictions,
                            verbose=False), accuracy.rmse(predictions,
                                                          verbose=False)

    def train_model(self):
        # Trains the model on the training set (80% of the total ratings data)
        self.model.fit(self.train)

    def predict(self, user_id, books, ratings, already_read=None):
        # Predicts recommended books for a given user

        # Collect the books the user has already read so they can be excluded below
        if already_read is None:
            already_read = ratings[ratings['user_id'] ==
                                   user_id]['book_id'].unique()

        prediction = books[[
            'book_id', 'title', 'authors', 'average_rating', 'image_url'
        ]].copy()
        prediction = prediction[~prediction['book_id'].isin(already_read)]

        # Predicts a rating for each book and sorts them
        prediction['predict'] = prediction['book_id'].apply(
            lambda x: self.model.predict(user_id, x).est)
        prediction = prediction.sort_values('predict', ascending=False)
        return convert(prediction)

    def save(self, location):
        # Fully saves the model
        pickle.dump(self, open(location, 'wb'))

    @staticmethod
    def load(location):
        # Loads the model
        infile = open(location, 'rb')
        obj = pickle.load(infile)
        infile.close()
        return obj
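A hypothetical end-to-end usage of the class above, assuming `ratings` and `books` DataFrames with the columns referenced in the methods; the external convert helper used by predict() is also assumed.

# ratings: columns user_id, book_id, rating; books: book_id, title, authors,
# average_rating, image_url (assumed schemas).
cf = SVDCollaborativeFiltering(ratings)
cf.train_model()
mae, rmse = cf.test_model()
print('MAE: {:.4f}, RMSE: {:.4f}'.format(mae, rmse))
recs = cf.predict(user_id=1, books=books, ratings=ratings)  # relies on the external convert()
cf.save('svd_cf.pkl')
restored = SVDCollaborativeFiltering.load('svd_cf.pkl')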
Example #12
    def create_utility_matrix(self):
        trainsetfull = self.data.build_full_trainset()
        testsetfull = trainsetfull.build_anti_testset()

        algo = SVD(n_factors=85)

        #n_factors=80,reg_all=0.05

        algo.fit(trainsetfull)
        self.predictions = algo.test(testsetfull)
Example #13
 def SVDAlgorithmPredict(self, k):
     model = SVD(n_factors=k)
     model.fit(self.Train)
     predictions = model.test(self.Test)
     df = pd.DataFrame(predictions,
                       columns=[
                           'user_id', 'song_id', 'listen_count',
                           'prediction', 'details'
                       ])
     return model, df
Example #14
class GlobalProportionAlgo(AlgoBase):
    def __init__(self, cat_products, cat_target):
        """
        This method recommends items one at a time, each time taking the item with the
        best similarity from the category that is furthest from its target proportion
        among the results obtained so far.
        """

        AlgoBase.__init__(self)
        # The model that gives us the estimated ratings \hat{r}_ij.
        self.SVD = SVD()

        # The information for the partnership (category of each item and target proportions)
        self.cat_products = cat_products
        self.cat_target = cat_target

    def fit(self, trainset):
        AlgoBase.fit(self, trainset)
        self.SVD.fit(trainset)
        return self


    def preprocess(self, test_set):
        C = len(self.cat_target)
        self.predicted = dict()
        heaps = [[] for _ in range(C)]
        n = 0
        current_prop = np.zeros(C)
        # We use C heaps to gather the similarities
        for u, i, _ in test_set:
            heapq.heappush(heaps[self.cat_products[i]],(self.SVD.estimate(u,i),u,i))
        while 1:
            if n == 0:
                selected_category = np.argmax(np.array([heap==[] for heap in heaps]))
            else:
                status = current_prop - self.cat_target*(n+1)
                status = np.abs(np.clip(status,a_min = None, a_max = 0))
                for c in range(C):
                    if heaps[c]==[]:
                        status[c]=-1
                selected_category = np.argmax(status)

            continu = True
            while heaps[selected_category]!=[] and continu:
                est, u, i = heapq.heappop(heaps[selected_category])
                if not (int(u) in self.predicted):
                    self.predicted[int(u)]=int(i)
                    current_prop[selected_category]+=1
                    n+=1
                    continu = False
            if heaps == [[] for _ in range(C)]:
                return


    def estimate(self, u, i):
        return -1
Example #15
def main():
    train_y = np.loadtxt('data/train.txt').astype(int)
    test_y = np.loadtxt('data/test.txt').astype(int)

    reader = Reader()
    Y_train = Dataset.load_from_file('data/train.txt', reader)
    Y_train = Y_train.build_full_trainset()

    # regularization factor (0.1 was the best)
    regs = [10**-4, 10**-3, 10**-2, 10**-1, 1]
    # learning rates (0.01 was the best)
    eta = [0.005, 0.01, 0.03, 0.05, 0.07]
    # number of Latent Factors (5 is the best)
    Ks = [5, 10, 15, 20, 30, 40]
    E_ins = []
    E_outs = []

    # Use to compute Ein and Eout
    for reg in regs:
        E_ins_for_lambda = []
        E_outs_for_lambda = []

        for k in Ks:

            print('MODEL')
            algo = SVD(n_factors=k,
                       n_epochs=300,
                       biased=True,
                       lr_all=0.01,
                       reg_all=reg)
            algo.fit(Y_train)
            e_in = error(train_y, algo)
            E_ins_for_lambda.append(e_in)
            eout = error(test_y, algo)
            E_outs_for_lambda.append(eout)

        E_ins.append(E_ins_for_lambda)
        E_outs.append(E_outs_for_lambda)

    for i in range(len(regs)):
        plt.plot(Ks, E_ins[i], label=r'$E_{in}, \lambda=$' + str(regs[i]))
    plt.title('$E_{in}$ vs. Number of Latent Factors (K)')
    plt.xlabel('K')
    plt.ylabel('Error')
    plt.legend()
    plt.savefig('E_in_SURPRISE(Latent Factors).png')
    plt.clf()

    for i in range(len(regs)):
        plt.plot(Ks, E_outs[i], label=r'$E_{out}, \lambda=$' + str(regs[i]))
    plt.title('$E_{out}$ vs. Number of Latent Factors (K)')
    plt.xlabel('K')
    plt.ylabel('Error')
    plt.legend()
    plt.savefig('E_out_SURPRISE(Latent Factors).png')
def SVD_surprise_only(Trainset, N=30):
    reader = Reader()
    Trainset_changetype = Dataset.load_from_df(
        Trainset[['Member_encoding', 'Game_encoding', 'score']], reader)
    Trainset_changetype_result = Trainset_changetype.build_full_trainset()
    svd = SVD(
        n_factors=20,
        n_epochs=20,
        lr_all=0.01,  #0.0001,
        random_state=1234)
    svd.fit(Trainset_changetype_result)

    games = list(Trainset.Game_encoding.unique()
                 )  # Get our unique games that were purchased

    #model SVD_New
    data = np.transpose(np.dot(svd.pu, np.transpose(svd.qi)))
    x = cosine_similarity(data, data)
    cosine_sim_x = pd.DataFrame(data=x, index=games, columns=games)
    gamesplayed = Trainset.groupby([
        'Member_encoding'
    ])['Game_encoding'].apply(list).reset_index(name='games')
    gamesmax = np.array(
        gamesplayed.games.map(lambda x:
                              ((cosine_sim_x.loc[x, :].values).max(axis=0))))
    gamelist = np.array(cosine_sim_x.columns)

    def Get_neighbor_30(x):
        # x[x>0.99] = 0.0
        return (gamelist[np.flip(np.argsort(x, axis=0))[0:N, ]])

    filtered = list(map(Get_neighbor_30, gamesmax))
    filtered_array = np.array(filtered)
    filtered_array = filtered_array.reshape(
        filtered_array.shape[0] * filtered_array.shape[1], -1)
    filtered_array = filtered_array.reshape(-1, )
    SVD_Neighbor = pd.DataFrame({
        'Member_encoding':
        np.repeat(np.array(np.unique(Trainset.Member_encoding)), N, axis=0),
        'Game_encoding':
        filtered_array
    })
    #SVD_Neighbor_result = SVD_Neighbor.groupby('member_id').head(12)
    SVD_Neighbor_result = SVD_Neighbor.merge(
        Trainset[['Member_encoding', 'Game_encoding', 'score']],
        how='left',
        on=['Member_encoding', 'Game_encoding'])
    SVD_Neighbor_result.score = np.where(SVD_Neighbor_result.score.isna(), 0,
                                         SVD_Neighbor_result.score)
    SVD_Neighbor_result = SVD_Neighbor_result.sort_values(
        by=['Member_encoding', 'score'], ascending=False)
    SVD_Neighbor_result = SVD_Neighbor_result.groupby('Member_encoding').head(
        12)

    return SVD_Neighbor, SVD_Neighbor_result
Example #17
def hybrid_rec(userid, favemovie, n):
    '''Take a user id, a favourite movie and a number of recommendations n; return the content-based candidates re-scored with SVD, sorted by predicted rating.'''
    rec_hybrid = content_recommendations(favemovie, n)
    svd = SVD(n_factors=50, reg_all=0.05, random_state=150)
    trainset = data.build_full_trainset()
    svd.fit(trainset)
    for index, row in rec_hybrid.iterrows():
        pred = svd.predict(userid, index)
        rec_hybrid.at[index, 'score'] = pred.est
    rec_hybrid = rec_hybrid.sort_values('score', ascending=False)
    return rec_hybrid
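hybrid_rec relies on a module-level `data` object and an external content_recommendations function. A plausible construction of `data`, mirroring the other MovieLens snippets on this page, is shown below (an assumption, not the original setup code):

import pandas as pd
from surprise import Dataset, Reader

# Hypothetical module-level setup assumed by hybrid_rec above.
reader = Reader()
ratings = pd.read_csv('ratings_small.csv')
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)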
def get_predictions(data):
    # First train an SVD algorithm on the movielens dataset.
    trainset = data.build_full_trainset()
    algo = SVD()
    algo.fit(trainset)

    # Then predict ratings for all the pairs (u, i) that are in the training set.
    testset = trainset.build_testset()
    predictions = algo.test(testset)

    return predictions
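A small usage sketch for get_predictions; loading the built-in ml-100k set here is an assumption.

from surprise import Dataset, accuracy

data = Dataset.load_builtin('ml-100k')
predictions = get_predictions(data)
accuracy.rmse(predictions)   # error on the training pairs themselves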
Example #19
def biassvd(dataset):
    start = time.time()
    algo = SVD(biased=True)
    kf = KFold(n_splits=5)
    for trainset, testset in kf.split(dataset):
        algo.fit(trainset)
        predictions = algo.test(testset)
        acc = accuracy.rmse(predictions, verbose=True)
    end = time.time()
    print('biassvd took (minutes):', (end - start) / 60)
    return acc
Example #20
class SVDRS(AbstractRS):
    def __init__(self, path):
        self.path = path
        self.algo = SVD()

    def train(self):
        try:
            trainset = self.data.build_full_trainset()
            self.algo.fit(trainset)
        except Exception as ex:
            Logger('error').get_log().error(ex)
Example #21
 def create_model(self):
     n = 1000000
     raw_data = self.get_ratings()[:n].fillna(0)[["userId", "id", "rating"]]
     reader = Reader()
     data = Dataset.load_from_df(raw_data, reader)
     # data.split(n_folds=5)  # Dataset.split() was removed from Surprise; not needed here
     svd = SVD()
     trainset = data.build_full_trainset()
     svd.fit(trainset)
     filename = "C:/datasets/the-movies-dataset/models/collaborative_based/coll_svd.sav"
     joblib.dump(svd, filename)
Example #22
def SVD_calculation(data, trainset, testset, cv):
    start = time.time()
    algo = SVD()
    algo.fit(trainset)
    predictions = algo.test(testset)
    #svd_accuracy = accuracy.rmse(predictions)
    cross_validate_svd_dict = cross_validate(algo, data, measures=['RMSE'], cv=cv, verbose=True)
    end = time.time()
    elapsed = end - start

    return elapsed, cross_validate_svd_dict
def fit_model(data):
    train, test = train_test_split(data, test_size=0.25)
    svd = SVD(n_epochs=25, lr_all=0.01, reg_all=0.4)
    svd.fit(train)
    pred = svd.test(test)
    print('RMSE for test set: {}'.format(accuracy.rmse(pred)))
    print('MAE for test set: {}'.format(accuracy.mae(pred)))
    # save model
    path = '../Models/Collaborative_filtering2.model'
    pickle.dump(svd, open(path, 'wb'))
    print("Model is saved to: {}".format(path))
Example #24
def func2():
    from surprise import SVD
    from surprise import Dataset
    from surprise import accuracy
    from surprise.model_selection import train_test_split

    data = Dataset.load_builtin('ml-100k')
    trainset, testset = train_test_split(data, test_size=.25)
    algo = SVD()
    algo.fit(trainset)
    predictions = algo.test(testset)
    accuracy.rmse(predictions)
Example #25
def generate_svd_recommendation_df() -> pd.DataFrame:
    # Prepare input DataFrame and algorithm
    score_df = genearte_score_df()
    svd_data = MyDataSet(score_df)
    #Try SVD
    algo = SVD()
    full_train_set = svd_data.build_full_trainset()
    test_set = full_train_set.build_anti_testset()
    # 5 fold validation
    score = cross_validate(algo, svd_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    # Fitting the SVD
    algo.fit(full_train_set)
    predictions = algo.test(test_set)
    # Then compute RMSE
    accuracy.rmse(predictions)
    # Generate recommendation DataFrame
    recommendation_df_svd = get_top_n(predictions, n=5)
    #print (recommendation_df)
    
    
    #Try the NMF
    nmf_cv = cross_validate(NMF(), svd_data, cv=5, n_jobs=5, verbose=False) 
    algo = NMF()
    full_train_set = svd_data.build_full_trainset()
    test_set = full_train_set.build_anti_testset()
    # 5 fold validation
    score = cross_validate(algo, svd_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    # Fitting the NMF
    algo.fit(full_train_set)
    predictions = algo.test(test_set)
    # Then compute RMSE
    accuracy.rmse(predictions)
    accuracy.mae(predictions)
    # Generate recommendation DataFrame
    recommendation_df_svd = get_top_n(predictions, n=5)
    #print (recommendation_df)
    
    
    
    #---------------------------------------------------
    # as per - https://bmanohar16.github.io/blog/recsys-evaluation-in-surprise
    knnbasic_cv = cross_validate(KNNBasic(), svd_data, cv=5, n_jobs=5, verbose=False)
    knnmeans_cv = cross_validate(KNNWithMeans(), svd_data, cv=5, n_jobs=5, verbose=False)
    knnz_cv = cross_validate(KNNWithZScore(), svd_data, cv=5, n_jobs=5, verbose=False)

    # Matrix Factorization Based Algorithms
    svd_cv = cross_validate(SVD(), svd_data, cv=5, n_jobs=5, verbose=False)
    svdpp_cv = cross_validate(SVDpp(),svd_data, cv=5, n_jobs=5, verbose=False)
    nmf_cv = cross_validate(NMF(), svd_data, cv=5, n_jobs=5, verbose=False) 
    
    #Other Collaborative Filtering Algorithms
    slope_cv = cross_validate(SlopeOne(), svd_data, cv=5, n_jobs=5, verbose=False)
    coclus_cv = cross_validate(CoClustering(), svd_data, cv=5, n_jobs=5, verbose=False)

    return recommendation_df_svd
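generate_svd_recommendation_df depends on two helpers that are not shown, genearte_score_df and MyDataSet. A minimal sketch of what MyDataSet might look like is below; the column names and rating scale are assumptions.

from surprise import Dataset, Reader

def MyDataSet(score_df):
    # Hypothetical wrapper: builds a Surprise Dataset from the score DataFrame.
    reader = Reader(rating_scale=(1, 5))
    return Dataset.load_from_df(score_df[['user_id', 'item_id', 'score']], reader)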
Example #26
 def setCFItemBased(self):
     reader = Reader(line_format='item user rating',
                     sep=',',
                     rating_scale=(1, 5))
     data_folds = Dataset.load_from_df(
         self.ratings[['shop_id', 'ch_id', 'rating']], reader)
     trainset = data_folds.build_full_trainset()
     algo = SVD(n_factors=50,
                n_epochs=20)  # Matrix factorization; set to the best values found by parameter tuning
     algo.fit(trainset)
     dump.dump('./model/cf_itembase_ForShop.py', algo=algo)
     """
	def get(self, algorithm, user_id):
		# SQL query
		conn = mysql.connect()
		cursor = conn.cursor()
		df = pd.read_sql_query("SELECT * FROM story_reviews", conn)

		# Data and Model
		reader = Reader(rating_scale=(1, 5))
		data = Dataset.load_from_df(df[['user_id', 'story_id', 'star']], reader)

		if algorithm=='svd':
			print('Using SVD')
			model = SVD()
		elif algorithm=='svdpp':
			print('Using SVD++')
			model = SVDpp()
		elif (algorithm=='nmf'):
			print('Using NMF')
			model = NMF()
		elif (algorithm=='slopeone'):
			print('Using Slope One')
			model = SlopeOne()
		elif (algorithm=='coclustering'):
			print('Using Co-Clustering')
			model = CoClustering()
		else:
			print('Using SVD')
			model = SVD()
		
		# Training
		training_set = data.build_full_trainset()
		model.fit(training_set)

		# Prediction
		anti_training_set = training_set.build_anti_testset()
		prediction_set = [x for x in anti_training_set if x[0]==user_id]
		predictions = model.test(prediction_set)

		# TESTING: Run 5-fold cross-validation using Root Mean Square Error and Mean Absolute Error
		# cross_validate(model, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
		
		# Return Top N Recommendations
		n = 10
		predictions.sort(key=lambda x:x.est, reverse=True)
		top_n_predictions = predictions[:n]

		story_recommendations = []
		
		for predictionItem in top_n_predictions:
			story_recommendations.append(predictionItem.iid)

		return jsonify(recommendations = story_recommendations)
Example #28
def get_top_n(teaid, score, qq, n=10):
    # A reader is still needed, but only the rating_scale param is required.
    qq = pd.concat([
        qq,
        pd.DataFrame([[score, teaid, 'user1']],
                     columns=['Score', 'Tea Name', 'User Name'])
    ],
                   ignore_index=True)
    reader = Reader(rating_scale=(0, 100))
    algo = SVD()
    # The columns must correspond to user id, item id and ratings (in that order).
    data = Dataset.load_from_df(qq[['User Name', 'Tea Name', 'Score']], reader)
    trainset = data.build_full_trainset()
    algo.fit(trainset)

    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)
    '''Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendations to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    '''
    want = []
    for i in predictions:
        if i[0] == 'user1':
            want.append(i)
    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in want:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:10]
    teadist = []
    mindist = []
    eudist = 0

    for i in top_n['user1']:
        eudist = (euclidean_distances(teadf.iloc[itemdf[itemdf['Tea Name']==teaid].index,:], \
            teadf.iloc[(itemdf[itemdf['Tea Name']==i[0]].index),:]))
        teadist.append((i[0], eudist))
    mindist = sorted(teadist, key=lambda x: x[1])
    return mindist[0], mindist[1], mindist[2]
def do_svd(trainingSet, start_time):
    svd = SVD()
    # evaluate(svd, Dataset.load_builtin("ml-100k"), measures=['RMSE', 'MAE'])
    svd.fit(trainingSet)
    testSet = trainingSet.build_anti_testset()
    print("Training complete")
    predictions = svd.test(testSet)
    print("Predictions ready")
    LOGGER.info("0;Data prediction completed in '%s' minutes",
                str((time.time() - start_time) / 60))
    print("Rmse values for doing svd based recomm on movielens data is " +
          str(accuracy.rmse(predictions)))
    return predictions
Example #30
def func6():
    from surprise import SVD
    from surprise import Dataset
    from surprise import accuracy
    from surprise.model_selection import KFold

    data = Dataset.load_builtin('ml-100k')
    kf = KFold(n_splits=3)
    algo = SVD()
    for trainset, testset in kf.split(data):
        algo.fit(trainset)
        predictions = algo.test(testset)
        accuracy.rmse(predictions, verbose=True)
    def collaborative(self,ratings,user_id):

        reader = Reader()
        #ratings.head()

        temp_ratings = ratings



        data = Dataset.load_from_df(temp_ratings[['user_id', 'book_id', 'rating']], reader)

        ## Training the data ##
        from surprise.model_selection import cross_validate
        svd = SVD()
        cross_validate(svd, data, measures=['RMSE', 'MAE'])

        trainset = data.build_full_trainset()

        algo = SVD()
        algo.fit(trainset)

        #svd.train(trainset)
        ## Testing the data ##

        from collections import defaultdict
        testset = trainset.build_anti_testset()
        predictions = algo.test(testset)

        count = 0
     
        for uid, iid, true_r, est, _ in predictions:

             if uid == user_id:
                count = count+1
                temp_ratings.loc[len(temp_ratings)+1]= [uid,iid,est]

        #print("count\n")
        #print(count)
        #print("\n--------here-------\n")	
        #print(temp_ratings)

        cb = temp_ratings[(temp_ratings['user_id'] == user_id)][['book_id', 'rating']]
        #print("\n--------here-------\n")
        #print(cb)
        

        return(cb)
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

from surprise import Dataset
from surprise import SVD
from surprise import accuracy
from surprise.model_selection import KFold


data = Dataset.load_builtin('ml-100k')

algo = SVD()

trainset = data.build_full_trainset()
algo.fit(trainset)

testset = trainset.build_testset()
predictions = algo.test(testset)
# RMSE should be low as we are biased
accuracy.rmse(predictions, verbose=True)  # ~ 0.68 (which is low)

# We can also do this during a cross-validation procedure!
print('CV procedure:')

kf = KFold(n_splits=3)
for i, (trainset_cv, testset_cv) in enumerate(kf.split(data)):
    print('fold number', i + 1)
    algo.fit(trainset_cv)

    print('On testset,', end='  ')
    predictions = algo.test(testset_cv)
    accuracy.rmse(predictions, verbose=True)
"""
This module illustrates how to persist an algorithm with the dump module: the
SVD algorithm is trained on a dataset and serialized to disk. It is
then reloaded and can be used again for making predictions.
"""

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)
import os

from surprise import SVD
from surprise import Dataset
from surprise import dump


data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()

algo = SVD()
algo.fit(trainset)

# Compute predictions of the 'original' algorithm.
predictions = algo.test(trainset.build_testset())

# Dump algorithm and reload it.
file_name = os.path.expanduser('~/dump_file')
dump.dump(file_name, algo=algo)
_, loaded_algo = dump.load(file_name)

# We now ensure that the algo is still the same by checking the predictions.
predictions_loaded_algo = loaded_algo.test(trainset.build_testset())
assert predictions == predictions_loaded_algo
print('Predictions are the same')
def hybrid(userId,train_rd):
    #get_ipython().magic('matplotlib inline')
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns
    from scipy import stats
    from ast import literal_eval
    from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
    from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
    from nltk.stem.snowball import SnowballStemmer
    from nltk.stem.wordnet import WordNetLemmatizer
    from nltk.corpus import wordnet
    from surprise import Reader, Dataset, SVD
    from surprise.model_selection import cross_validate

    import warnings; warnings.simplefilter('ignore')


    # In[2]:


    #Popularity#

    md = pd.read_csv('CustomData/FinalData.csv')

    fd = pd.read_csv('avg_ratings1.csv')



    fd.loc[fd['rating'].notnull(), 'rating'] = fd.loc[fd['rating'].notnull(), 'rating'].astype('float')
    vote_averages= fd[fd['rating'].notnull()]['rating']
    C = vote_averages.mean()


    fd1 = pd.read_csv('ratings_count.csv')


    fd1.loc[fd1['rating'].notnull(), 'rating'] = fd1.loc[fd1['rating'].notnull(), 'rating'].astype('float')
    vote_counts = fd1[fd1['rating'].notnull()]['rating']


    # In[3]:


    m = vote_counts.quantile(0.75)



    # In[4]:


    md['ratings_count'] = fd1['rating']
    md['average_rating'] = fd['rating']


    # In[28]:


    #print(md.shape)
    qualified = md[(md['ratings_count'].notnull())][['book_id','title', 'authors', 'ratings_count', 'average_rating']]

    qualified['ratings_count'] = qualified['ratings_count'].astype('float')

    qualified['average_rating'] = qualified['average_rating'].astype('float')

    #qualified.shape


    # In[29]:


    def weighted_rating(x):
        v = x['ratings_count']
        R = x['average_rating']
        return (v/(v+m) * R) + (m/(m+v) * C)


    # In[30]:


    qualified['popularity_rating'] = qualified.apply(weighted_rating, axis=1)
    #qualified['wr']
    #qualified = qualified.sort_values('popularity_rating', ascending=False).head(250)
    pop = qualified[['book_id','popularity_rating']]
    #print(qualified.shape)
    #print(pop.shape)


    # In[11]:


    ### Collaborative ##

    reader = Reader()
    ratings=train_rd
    #ratings = pd.read_csv('ratings.csv')
    #ratings.head()

    temp_ratings = ratings[0:1000]

    #print(temp_ratings)
    data = Dataset.load_from_df(temp_ratings[['user_id', 'book_id', 'rating']], reader)


    # In[12]:


    svd = SVD()
    cross_validate(svd, data, measures=['RMSE', 'MAE'])


    # In[13]:


    trainset = data.build_full_trainset()
    #svd.train(trainset)
    algo = SVD()
    algo.fit(trainset)

    ## usefule = temp_rating[rating]


    # In[14]:


#print(len(temp_ratings[temp_ratings['user_id']==userId]))


    # In[ ]:


    def get_top_n(predictions, n=10):
        '''Return the top-N recommendation for each user from a set of predictions.

        Args:
            predictions(list of Prediction objects): The list of predictions, as
                returned by the test method of an algorithm.
            n(int): The number of recommendations to output for each user. Default
                is 10.

        Returns:
        A dict where keys are user (raw) ids and values are lists of tuples:
            [(raw item id, rating estimation), ...] of size n.
        '''

        # First map the predictions to each user.
        top_n = defaultdict(list)
        for uid, iid, true_r, est, _ in predictions:
            top_n[uid].append((iid, est))

        # Then sort the predictions for each user and retrieve the k highest ones.
        for uid, user_ratings in top_n.items():
            #user_ratings.sort(key=lambda x: x[1], reverse=True)
            top_n[uid] = user_ratings[:n]

        return top_n


    # In[15]:


    from collections import defaultdict
    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)
    '''
    top_n = get_top_n(predictions, n=10000)

    #print(top_n)
    #result = pd.DataFrame(top_n)
    #print(result)
    for uid, user_ratings in top_n.items():
    
        #print(uid, [iid for (iid  , _) in user_ratings])
        for uid, iid, true_r, est, _ in predictions:
        
            temp_ratings.loc[uid]= [uid,iid,est]
        #temp_ratings[i]['cf'] = temp_ratings[(temp_ratings['user_id'] == uid)][['book_id']]
        
    '''
    count = 0
    for uid, iid, true_r, est, _ in predictions:
        
         if uid == userId:
            count = count+1
            temp_ratings.loc[len(temp_ratings)+1]= [uid,iid,est]
            #print('here')

            #print(uid)
            #temp_ratings.append([uid,iid,est],ignore_index=True)

    #print(count)
    #print(temp_ratings)



    # In[16]:


    #print(len(temp_ratings[temp_ratings['user_id']==2]))


    # In[ ]:





    # In[46]:


    ##### CONTENT ######

    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns
    from scipy import stats
    from ast import literal_eval
    from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
    from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
    from nltk.stem.snowball import SnowballStemmer
    from nltk.stem.wordnet import WordNetLemmatizer
    from nltk.corpus import wordnet
    from surprise import Reader, Dataset, SVD
    import csv
    import warnings; warnings.simplefilter('ignore')


    # In[48]:



    md=pd.read_csv('CustomData/FinalData.csv')
    rd=train_rd
    #rd=pd.read_csv('ratings.csv')
    md['book_id'] = md['book_id'].astype('int')
    rd['book_id'] = rd['book_id'].astype('int')
    rd['user_id'] = rd['user_id'].astype('int')
    rd['rating'] = rd['rating'].astype('int')

    #print(md.head())


    md['authors'] = md['authors'].str.replace(' ','')
    md['authors'] = md['authors'].str.lower()
    md['authors'] = md['authors'].str.replace(',',' ')

    #print(md.head())

    md['authors'] = md['authors'].apply(lambda x: [x,x])
    #print(md['authors'])

    md['Genres']=md['Genres'].str.split(';')
    #print(md['Genres'])

    md['soup'] = md['authors'] + md['Genres']
    #print(md['soup'])

    md['soup'] = md['soup'].str.join(' ')

    #md['soup'].fillna({})
    #print(md['soup'])

    count = CountVectorizer(analyzer='word', ngram_range=(1, 1), min_df=1, stop_words='english')
    count_matrix = count.fit_transform(md['soup'])
    #print (count_matrix.shape)
    #print np.array(count.get_feature_names())
    #print(count_matrix.shape)

    cosine_sim = cosine_similarity(count_matrix, count_matrix)


    # In[91]:


    def build_user_profiles():
        user_profiles=np.zeros((53421,999))
        #print(rd.iloc[0]['user_id'])
        #len(rd['book_id'])
        for i in range(0,1000):
            u=rd.iloc[i]['user_id']
            b=rd.iloc[i]['book_id']
            #print(u,b)
            #print(i)
            #if b<999:
                #print("match at "+str(b))
            user_profiles[u][b-1]=rd.iloc[i]['rating']
        #print(user_profiles)
        return user_profiles

    user_profiles=build_user_profiles()
    def _get_similar_items_to_user_profile(person_id):
            #Computes the cosine similarity between the user profile and all item profiles
            #print(user_profiles[person_id])
        #print("\n---------\n")
        #print(cosine_sim[0])
        user_ratings = np.empty((999,1))
        cnt=0
        for i in range(0,998):
            book_sim=cosine_sim[i]
            user_sim=user_profiles[person_id]
            user_ratings[i]=(book_sim.dot(user_sim))/sum(cosine_sim[i])
        maxval = max(user_ratings)
    #print(maxval)

        for i in range(0,998):
            user_ratings[i]=((user_ratings[i]*5.0)/(maxval))
            #print(user_ratings[i])
            if(user_ratings[i]>3):
                #print("MILA KUCCHHH")
                cnt+=1
        #print(max(user_ratings))
        #print (cnt)
       
            #print(cosine_similarities)
            
            #return similar_items
        return user_ratings
    content_ratings = _get_similar_items_to_user_profile(userId)



    # In[100]:


    num = md[['book_id']]
    #print(num)

    num1 = pd.DataFrame(data=content_ratings[0:,0:])


    frames = [num, num1]
    #result = pd.concat([df1, df4], axis=1, join_axes=[df1.index])

    mer = pd.concat(frames, axis=1).reindex(num.index)
    mer.columns=['book_id', 'content_rating']
    #print(mer.shape)
    #print('here')
    #print(mer)





    # In[102]:


    ## for user 2 #

#print(temp_ratings.shape)
    cb = temp_ratings[(temp_ratings['user_id'] == userId)][['book_id', 'rating']]
#   print(cb.shape)
#   print(pop.shape)
    hyb = md[['book_id']]
    hyb = hyb.merge(cb,on = 'book_id')
    hyb = hyb.merge(pop, on='book_id')
    hyb = hyb.merge(mer, on='book_id')
    #hyb.shape


    # In[106]:


    def weighted_rating(x):
        v = x['rating']
        R = x['popularity_rating']
        c = x['content_rating']
        return 0.4*v + 0.2*R + 0.4 * c


    # In[107]:


    print(hyb)
    hyb['final'] = hyb.apply(weighted_rating, axis=1)
    hyb = hyb.sort_values('final', ascending=False).head(999)
    #print(hyb['final'])

    print(hyb)
    return hyb