Example #1
def get_recommendation(user):

    conn = pymysql.connect(Account.link,
                           Account.user,
                           Account.password,
                           Account.db,
                           charset="utf8mb4")

    df = pd.read_sql_query('SELECT * FROM USERS', conn)

    conn.close()

    if df.empty:
        return "Error - empty DF"

    # Anime can be rated from 1 - 10
    data = Dataset.load_from_df(df, Reader(rating_scale=(1, 10)))

    data.split(n_folds=10)
    algo = SVD()

    trainset = data.build_full_trainset()
    algo.train(trainset)

    # predict ratings for all pairs (user, score) that are NOT in the train set
    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)

    # Get top 15 predictions
    top_n = get_top_n(predictions, n=15)

    if top_n.get(user) is None:
        return "Error - cannot find User"

    return [iid for (iid, _) in top_n.get(user)]
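This and several later examples call a get_top_n helper that is not shown here. A minimal sketch of such a helper, assuming the usual convention of grouping Surprise predictions per user:

from collections import defaultdict

def get_top_n(predictions, n=10):
    # map each user id to a list of (item id, estimated rating) pairs
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    # keep only the n highest-rated items per user
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n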
Example #2
def svd(data, training, testing):
    '''
        Tune SVD parameters, then calculate RMSE, coverage and running time of SVD

        Args:
            data(Dataset): the whole dataset divided into 5 folds
            training(Dataset): training dataset
            testing(Dataset): test dataset

        Returns:
            rmse: RMSE of SVD with optimized parameters
            top_n: top-n predictions for each user
    '''

    # candidate parameters
    param_grid = {'n_factors': [25, 50, 100, 250], 'n_epochs': [10, 20, 30, 40, 50]}

    # optimize parameters
    grid_search = GridSearch(SVD, param_grid, measures=['RMSE'], verbose=False)
    grid_search.evaluate(data)
    param = grid_search.best_params['RMSE']
    print('SVD:', param)

    # fit model using the optimized parameters
    svd = SVD(n_factors=param['n_factors'], n_epochs=param['n_epochs'])
    svd.train(training)

    # evaluate the model using test data
    predictions = svd.test(testing)
    top_n = get_top_n(predictions, n=5)
    rmse = accuracy.rmse(predictions, verbose=True)
    return rmse, top_n
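GridSearch and evaluate used here belong to the old Surprise API and were removed in later releases. A rough equivalent with the current model_selection module, shown only as a sketch:

from surprise import SVD
from surprise.model_selection import GridSearchCV

param_grid = {'n_factors': [25, 50, 100, 250], 'n_epochs': [10, 20, 30, 40, 50]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=5)
gs.fit(data)                    # data is a surprise Dataset
param = gs.best_params['rmse']

svd = SVD(n_factors=param['n_factors'], n_epochs=param['n_epochs'])
svd.fit(training)               # fit() replaces the old train()
predictions = svd.test(testing)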
Example #3
    def train_from_dataset(self, filepath):
        """
        Train the algorithm from a ratings dataset.

        Use this to rebuild a dump of the trained algorithm if it is ever lost
        """
        print("start training")
        # path to dataset file
        file_path = os.path.expanduser(filepath)

        reader = Reader(
            line_format='user item rating timestamp',
            sep=',',
            rating_scale=(1, 10))

        data = Dataset.load_from_file(file_path, reader=reader)

        trainset = data.build_full_trainset()
        # SVD style
        algo = SVD()

        # KNN style
        # sim_options = {'name': 'pearson_baseline', 'user_based': True}
        # algo = KNNBaseline(k=1, min_k=1, sim_options=sim_options)

        algo.train(trainset)

        print("end training")
        self.data = data
        self.algorithm = algo
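The docstring mentions rebuilding a dump of the trained algorithm. Surprise ships a small dump module for exactly that; a sketch of persisting and restoring the model trained above (the file name is only an example):

from surprise import dump

# save the trained algorithm to disk
dump.dump('svd_algo.dump', algo=algo)

# ...later, restore it without retraining
_, algo = dump.load('svd_algo.dump')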
Example #4
    def model(self, alg_key):

        reader = Reader(rating_scale = (1, 5))

        data_result = Dataset.load_from_df(self.make_df()[['user_id', 'place_id', 'score']], reader)

        # split data into 10 folds

        data_result.split(n_folds=10)

        # evaluation

        if alg_key.lower() == "svd":
            alg = SVD()
        elif alg_key.lower() == "knn":
            alg = KNNBasic()
        elif alg_key.lower() == "nmf":
            alg = NMF()

        evaluate(alg, data_result, measures=['RMSE', 'MAE'])

        # prediction
        # user_0	smallShop_5645	2
        test_user = '******'
        test_id = 'smallShop_7089'
        real_score = 4

        trainset = data_result.build_full_trainset()

        alg.train(trainset)
        print(alg.predict(test_user, test_id, real_score))
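As in the other examples, data.split(n_folds=...) plus evaluate(...) is the pre-1.0.5 Surprise API. With recent versions the same evaluation would look roughly like this sketch:

from surprise.model_selection import cross_validate

# 5-fold cross-validation on RMSE and MAE
results = cross_validate(alg, data_result, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print(results['test_rmse'].mean(), results['test_mae'].mean())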
Example #5
def boost(examples, rounds=10):
    l = len(examples)               # number of training examples
    distr = normalize([1.] * l)
    abserr = [0.] * l               # absolute prediction error per example
    hypotheses = [None] * rounds
    alpha = [0] * rounds

    for t in range(rounds):

        #create a training set based on the weight distribution
        for i in range(l):
            examples[i] = examples[draw(distr)]

        # create a trainset object
        reader = Reader()
        data = Dataset.load_from_df(examples, reader)
        trainset = data.build_full_trainset()

        # Use SVD with surprise
        algo = SVD()
        algo.train(trainset)
        hypotheses[t] = algo

        for i in range(l):
            abserr[i] = abs(examples.at[i, 'rating'] - algo.predict(examples.at[i, 'user_id'], examples.at[i, 'business_id']).est)

        # update weights: delta is the distribution-weighted absolute error
        delta = sum(x * y for x, y in zip(distr, abserr))
        hypRes = np.where(abserr > delta,-1,1)
        alpha[t] = 0.5 * math.log((1 - delta) / (.0001 + delta))

        distr = normalize([d * math.exp(-alpha[t] * h) for (d,h) in zip(distr, hypRes)]) 

       
    def finalHypothesis(x):
        return np.sign(sum(a * h(x) for (a, h) in zip(alpha, hypotheses)))

    return finalHypothesis
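boost relies on normalize and draw helpers that are not shown; presumably a distribution normalizer and a weighted index sampler. One possible minimal version, purely illustrative:

import random

def normalize(weights):
    # scale the weights so they sum to 1 and form a probability distribution
    total = float(sum(weights))
    return [w / total for w in weights]

def draw(distr):
    # sample an index according to the probability distribution distr
    r = random.random()
    cumulative = 0.0
    for i, p in enumerate(distr):
        cumulative += p
        if r <= cumulative:
            return i
    return len(distr) - 1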
def svd(trainset, testset, predset):

    modelname = 'svd'
    # Check if predictions already exist
    if is_already_predicted(modelname):
        return

    algo = SVD(n_factors=100, n_epochs=40, lr_bu=0.01, lr_bi=0.01, lr_pu=0.1, lr_qi=0.1, reg_bu=0.05, reg_bi=0.05, reg_pu=0.09, reg_qi=0.1)
    print('SVD Model')
    algo.train(trainset)
    
    predictions = algo.test(trainset.build_testset())
    print('   RMSE on Train: ', accuracy.rmse(predictions, verbose=False))
    
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions, verbose=False)
    print('   RMSE on Test: ', rmse)
    preds = np.zeros(len(predictions))
    for j, pred in enumerate(predictions):
        preds[j] = pred.est
    save_predictions(modelname, rmse, preds, 'test')

    print('   Evaluate predicted ratings...')
    predictions = algo.test(predset)
    preds = np.zeros(len(predictions))
    for j, pred in enumerate(predictions):
        preds[j] = pred.est
    save_predictions(modelname, rmse, preds)
Example #7
def train_cf_algo(model_data):
    print(">>> training cf model...")
    reader = Reader(rating_scale=(0, 1))
    data = Dataset.load_from_df(
        model_data[['msno', 'song_id', 'target']], reader)
    algo = SVD()
    trainset = data.build_full_trainset()
    algo.train(trainset)
    return algo
def grid_search_svd(data_train, data_test, n_epochs, lr_alls, reg_alls,
                    init_mean, n_factors, file_name):

    print('SVD Surprise manual grid search')

    result_train = pd.DataFrame()
    result_test = pd.DataFrame()
    # loops on the parameters
    for n_epoch in n_epochs:
        for lr_all in lr_alls:
            for reg_all in reg_alls:
                for n_factor in n_factors:

                    algo = SVD(reg_all=reg_all,
                               init_mean=init_mean,
                               n_epochs=n_epoch,
                               lr_all=lr_all,
                               n_factors=n_factor)

                    # Retrieve the trainset.
                    trainset = data_train.build_full_trainset()

                    # Build an algorithm, and train it.
                    algo.train(trainset)
                    # Evaluate the performance
                    perf_train = evaluate(algo, data_train, measures=['RMSE'])
                    perf_test = evaluate(algo, data_test, measures=['RMSE'])

                    perf_train["n_epoch"] = n_epoch
                    perf_train["lr_all"] = lr_all
                    perf_train["reg_all"] = reg_all
                    perf_train["init_mean"] = init_mean
                    perf_train["n_factor"] = n_factor
                    # Store the mean performance RMSE on train
                    perf_train["rmse"] = np.mean(perf_train['rmse'])

                    perf_test["n_epoch"] = n_epoch
                    perf_test["lr_all"] = lr_all
                    perf_test["reg_all"] = reg_all
                    perf_test["init_mean"] = init_mean
                    perf_test["n_factor"] = n_factor
                    # Store the mean performance RMSE on test
                    perf_test["rmse"] = np.mean(perf_test['rmse'])

                    # Store on a dataframe
                    result_train = result_train.append(perf_train,
                                                       ignore_index=True)
                    result_test = result_test.append(perf_test,
                                                     ignore_index=True)

    # Save the dataframes so we can inspect or plot the differences later if it's interesting
    writer = pd.ExcelWriter(file_name, engine='xlsxwriter')
    result_train.to_excel(writer, 'Sheet1')
    result_test.to_excel(writer, 'Sheet2')
    writer.save()
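The four nested parameter loops can be flattened with itertools.product; a small restructuring sketch that keeps the same iteration order:

from itertools import product

for n_epoch, lr_all, reg_all, n_factor in product(n_epochs, lr_alls, reg_alls, n_factors):
    algo = SVD(reg_all=reg_all, init_mean=init_mean,
               n_epochs=n_epoch, lr_all=lr_all, n_factors=n_factor)
    # ...same training and evaluation body as in grid_search_svd above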
Example #9
    def _train_predict(self, node):
        file_name = 'file/%s.dat' % time.time()
        with open(file_name, 'w') as f:
            f.writelines(
                ['%s\t%s\t%s\t%s\n' % (line[0], line[1], line[2], line[3])
                 for line in node.data]
            )
        reader = Reader(line_format='user item rating timestamp', sep='\t')
        surprise_data = Dataset.load_from_file(file_name, reader=reader)
        train_set = surprise_data.build_full_trainset()
        algo = SVD()
        algo.train(train_set)
        node.algo = algo
Example #10
def startPredModel(ratings, fileOutput):

    reader = Reader()

    data = Dataset.load_from_df(ratings[['userId', 'imdbId', 'rating']],
                                reader)
    data.split(n_folds=5)  # 5

    svd = SVD()
    evaluate(svd, data, measures=['RMSE', 'MAE'])

    trainset = data.build_full_trainset()
    svd.train(trainset)

    dump.dump(fileOutput, None, svd, 1)
def svd_surprise(data_train, reg_all, init_mean, n_epochs, lr_all, n_factors, name_file):
    print('SVD Surprise')

    # We construct our SVD algo with surprise and the best parameters
    algo = SVD(reg_all=reg_all, init_mean=init_mean, n_epochs=n_epochs, lr_all=lr_all, n_factors=n_factors)

    # Retrieve the trainset.
    trainset = data_train.build_full_trainset()

    # Build an algorithm, and train it.
    algo.train(trainset)
    # Evaluate the RMSE of the algo
    evaluate(algo, data_train, measures=['RMSE'])
    # Make the prediction
    make_prediction_surprise(algo, name_file)
Example #12
    def latentFeatures(self):
        # Load the ratings dataset and build the full trainset.
        reader = Reader(line_format='user item rating timestamp', sep=',')
        data = Dataset.load_from_file("../data/ml20m_train.csv", reader=reader)
        trainset = data.build_full_trainset()
        algo = SVD(n_factors=10)
        algo.train(trainset)

        userLatentFeatures = pd.DataFrame(
            algo.pu,
            columns=["SVD_user_feature_" + str(i) for i in range(0, 10)])
        userLatentFeatures["userId"] = self.r.userId.unique()

        return userLatentFeatures.set_index("userId")
Example #13
def inicializar_algoritmo():
    csv_df = pd.read_csv('Data/ratings.csv')
    users_mongo = list(users.get_users_ratings().find())
    mongo_df = pd.DataFrame(users_mongo)
    mongo_df_new = mongo_df[['userId', 'movieId', 'rating']]
    csv_df_new = csv_df[['userId', 'movieId', 'rating']]
    final_df = csv_df_new.append(mongo_df_new)
    final_df.columns = ['userID', 'itemID', 'rating']
    reader = Reader()
    data = Dataset.load_from_df(final_df, reader)
    data.split(2)  # data can now be used normally
    trainset = data.build_full_trainset()
    algo = SVD()
    algo.train(trainset)
    testset = trainset.build_anti_testset()
    return (algo, testset)
Example #14
def model_train(rating_dataset=None):
    if rating_dataset is None:
        data = Dataset.load_builtin('ml-100k')
    else:
        # path to dataset file
        file_path = os.path.expanduser(rating_dataset)

        # As we're loading a custom dataset, we need to define a reader. In the
        # movielens-100k dataset, each line has the following format:
        # 'user item rating timestamp', separated by '\t' characters.
        reader = Reader(line_format='user item rating timestamp', sep='\t')

        data = Dataset.load_from_file(file_path, reader=reader)
    # Retrieve the trainset.
    trainset = data.build_full_trainset()
    # Build an algorithm, and train it.
    algo = SVD()
    algo.train(trainset)
    return algo
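A short usage sketch for model_train with the built-in ml-100k data, where raw user and item ids are strings:

algo = model_train()
# estimate the rating user '196' would give item '302'
pred = algo.predict('196', '302')
print(pred.est)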
Example #15
def restore(x):
    from scipy.sparse import coo_matrix
    sparse_mat = coo_matrix(x)

    data = np.stack([sparse_mat.col,
                     sparse_mat.row,
                     sparse_mat.data
                     ], axis=1).astype('int')
    np.savetxt('tmp.txt', data, fmt='%d')
    reader = Reader(line_format='user item rating', sep=' ', rating_scale=(0, 255))
    dataset = Dataset.load_from_file('tmp.txt', reader=reader)
    trainset = dataset.build_full_trainset()

    algo = SVD()
    algo.train(trainset)

    xx = np.arange(0, x.shape[0])
    yy = np.arange(0, x.shape[1])

    y3, x3 = np.meshgrid(yy, xx)
    testset = zip(x3.ravel().tolist(), y3.ravel().tolist())
    testset = [str(a) + ' ' + str(b) for (a, b) in testset]
    print(testset[:10])

    # pool = mp.Pool(mp.cpu_count() * 2)

    def my_predict(test):
        a, b = test.split()
        return algo.predict(uid=a, iid=b)

    predictions = []
    for test in testset:
        predictions.append(int(my_predict(test).est))
    # predictions=pool.map(my_predict, testset)

    # pool.close()
    # pool.join()
    # print predictions[:10]

    return np.array(predictions).reshape(x.shape)
Example #16
def svd_running_time(data):
    '''
        Calculates the running times for training and predictions for SVD

        Args:
            data(Dataset): a list of datasets with different numbers of users

        Returns:
            elapsedtime_SVDtrain: running time for training
            elapsedtime_SVDtest: running time for predictions on testset
    '''
    elapsedtime_SVDtrain = []
    elapsedtime_SVDtest = []

    # tune the parameters on the entire data
    param_grid = {
        'n_factors': [25, 50, 100, 250],
        'n_epochs': [10, 20, 30, 40, 50]
    }
    grid_search = GridSearch(SVD, param_grid, measures=['RMSE'], verbose=False)
    grid_search.evaluate(data[3])
    param = grid_search.best_params['RMSE']
    n_factors = param['n_factors']
    n_epochs = param['n_epochs']

    # using the tuned parameters calculate running times
    for i in range(len(data)):
        # training running time
        training_start = time.time()
        training = data[i].build_full_trainset()
        testing = training.build_anti_testset()
        svd = SVD(n_factors=n_factors, n_epochs=n_epochs)
        svd.train(training)
        elapsedtime_SVDtrain.append(time.time() - training_start)

        # prediction running time
        test_start = time.time()
        svd.test(testing)
        elapsedtime_SVDtest.append(time.time() - test_start)
    return elapsedtime_SVDtrain, elapsedtime_SVDtest
def collab_filter(md, ratings, links_small, credits, keywords, smd):
    # data pre-processing
    id_map = links_small[['movieId', 'tmdbId']]
    links_small = links_small[
        links_small['tmdbId'].notnull()]['tmdbId'].astype('int')

    reader = Reader()

    data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']],
                                reader)
    data.split(n_folds=5)

    svd = SVD()
    #evaluate(svd, data, measures=['RMSE', 'MAE'])

    trainset = data.build_full_trainset()
    svd.train(trainset)

    id_map['tmdbId'] = id_map['tmdbId'].apply(convert_int)
    id_map.columns = ['movieId', 'id']
    id_map = id_map.merge(smd[['title', 'id']], on='id').set_index('title')

    return svd, id_map
Example #18
# Evaluate performances of our algorithm on the dataset.
grid_search.evaluate(data)

# best RMSE score
print('best: ' + str(grid_search.best_score['RMSE']))

# combination of parameters that gave the best RMSE score
print('best params: ' + str(grid_search.best_params['RMSE']))

params = grid_search.best_params['RMSE']
algo_SVD = SVD(verbose=True,
               n_factors=params['n_factors'],
               n_epochs=params['n_epochs'],
               lr_all=params['lr_all'],
               reg_all=params['reg_all'])
algo_SVD.train(data_full)

#%%

datamat_filled_SVD = datamat_missing.copy().astype(np.float)
datamat_filled_NMF = datamat_missing.copy().astype(np.float)
for i in range(0, datamat_full.shape[0]):  # movie
    for j in range(0, datamat_full.shape[1]):  # user

        val = algo_SVD.predict('u%i' % (j + 1), 'i%i' % (i + 1)).est
        datamat_filled_SVD[i, j] = val

        val = algo_NMF.predict('u%i' % (j + 1), 'i%i' % (i + 1)).est
        datamat_filled_NMF[i, j] = val

#%% compute correlations between real and recovered ratings
Example #19
    def make_reccomendation(self, user_pref):

        #the filepath to the dataset
        file_path = 'ml-100k/u.data'
        #setting the Reader obj
        reader = Reader(line_format='user item rating timestamp', sep='\t')
        data = Dataset.load_from_file(file_path, reader=reader)

        # Retrieve the trainset.
        trainset = data.build_full_trainset()

        # pick an algorithm

        #we're using a K nearest-neighbors algorithm
        #algo = KNNBasic()

        #we're using a singular value decomposition algorithm
        algo = SVD()

        #train the algo on our data
        algo.train(trainset)

        #read the ratings file
        r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
        ratings = pd.read_csv('ml-100k/u.data',
                              sep='\t',
                              names=r_cols,
                              encoding='latin-1')

        #the as of now empty dict that will contain the raters and their likenesses
        rater_likeness = {}

        #loop through the ratings table; if our user and the rater agree, give '1 pt' to
        #the rater's likeness to our user. In the end the rater with the most
        #'likeness' wins out and their recommendation will be queried
        for index, row in ratings.iterrows():
            if row['movie_id'] in user_pref and user_pref[
                    row['movie_id']] == row['rating']:
                if not row['user_id'] in rater_likeness:
                    rater_likeness[row['user_id']] = 1
                else:
                    rater_likeness[row['user_id']] += 1

        #determine which rater in the dict has the highest likeness
        thing = 0
        for key, value in rater_likeness.items():
            if value >= thing:
                thing = value
                best_rater = key

        user_id = str(
            best_rater)  #we need this to be a string for the predict func

        #loop through the list of movies until we find one that our rater would
        #give more than a 4.6
        for i in range(1, 1683):
            item_id = str(i)
            if not i in user_pref:
                pred = algo.predict(user_id, item_id, r_ui=3, verbose=False)
                if pred.est >= 4.6:
                    break

        return i
def compute_recommendations(user_id, prediction_table,
                            numeric_prediction_table):

    algo = 'SVD'

    algorithm = SVD()

    # add_pageview(user_id=user_id, item_id=None, page="Model Predictions", activity_type="Initialize Predictions - " + algo, rating=None) #pageview

    engine = create_engine(config.DB_URI, echo=True)
    session = scoped_session(
        sessionmaker(bind=engine, autocommit=False, autoflush=False))

    #reading in the database

    df_ratings = pd.read_sql('SELECT * FROM ratings;', con=engine)
    df_ratings = df_ratings[['user_id', 'item_id', 'rating']]
    df_ratings = df_ratings.dropna()
    df_ratings = df_ratings.drop_duplicates()

    df_ratings2 = pd.read_csv('data/ratings.csv', low_memory=False)
    df_ratings2 = df_ratings2.rename(columns={'movie_id': 'item_id'})
    df_ratings2 = df_ratings2[['user_id', 'item_id', 'rating']]
    df_ratings2 = df_ratings2.dropna()
    df_ratings2 = df_ratings2.drop_duplicates()

    df_ratings = pd.concat([df_ratings, df_ratings2], axis=0)

    reader = Reader(line_format='user item rating',
                    sep=',',
                    rating_scale=(1, 10))
    data = Dataset.load_from_df(df_ratings, reader=reader)

    trainset = data.build_full_trainset()

    #     algorithm = eval(algo + "()")# set the algorithm...............................................

    algorithm.train(trainset)

    items = pd.read_sql('SELECT distinct id FROM items;', con=engine)
    df_user_items = df_ratings.loc[df_ratings['user_id'] == user_id]
    total_items = items.id.unique()
    user_items = df_user_items.item_id.unique()
    # user_id = str(user_id)
    prediction_items = [x for x in total_items if x not in user_items]

    predictions = pd.DataFrame(columns=['user_id', 'item_id', 'prediction'])

    predicted_ratings = []

    for i in prediction_items:
        a = user_id
        b = i
        est = algorithm.predict(a, b)
        predicted_ratings.append(est[3])

    predictions['item_id'] = prediction_items
    predictions['user_id'] = pd.Series(
        [user_id for x in range(len(predictions.index))],
        index=predictions.index)

    predictions['prediction'] = predicted_ratings

    predictions = predictions.sort_values('prediction', ascending=False)
    test_prediction = predictions
    predictions = predictions.head(n=10)

    cols = [
        'pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5', 'pred_6', 'pred_7',
        'pred_8', 'pred_9', 'pred_10'
    ]

    df_pred = predictions[['item_id']].T

    df_pred.columns = cols

    df_pred['id'] = user_id

    df_pred = df_pred[[
        'id', 'pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5', 'pred_6',
        'pred_7', 'pred_8', 'pred_9', 'pred_10'
    ]]

    df_pred['id'] = df_pred['id'].astype(int)

    df_pred.to_sql(prediction_table, engine, if_exists='append',
                   index=False)  #if_exists='append'
    session.commit()

    df_num_ratings = test_prediction

    df_num_ratings = df_num_ratings.head(n=20)

    df_num_ratings['algorithm'] = algo
    df_num_ratings.rename(columns={'prediction': 'predicted_rating'},
                          inplace=True)

    df_num_ratings.to_sql('numeric_predictions',
                          engine,
                          if_exists='append',
                          index=False)  #if_exists='append'
    session.commit()

    predcols = [
        'num_1', 'num_2', 'num_3', 'num_4', 'num_5', 'num_6', 'num_7', 'num_8',
        'num_9', 'num_10'
    ]

    df_num_ratings_transpose = predictions[['prediction']].T
    df_num_ratings_transpose.columns = predcols

    df_num_ratings_transpose['id'] = user_id

    df_num_ratings_transpose = df_num_ratings_transpose[[
        'id', 'num_1', 'num_2', 'num_3', 'num_4', 'num_5', 'num_6', 'num_7',
        'num_8', 'num_9', 'num_10'
    ]]

    df_num_ratings_transpose['id'] = df_num_ratings_transpose['id'].astype(int)

    df_num_ratings_transpose.to_sql(numeric_prediction_table,
                                    engine,
                                    if_exists='append',
                                    index=False)  #if_exists='append'
    session.commit()
Example #21
class SurpriseFeatureBuilder():
    def __init__(self,
                 item_identifier='media_id',
                 train_file_path=TRAIN_FILE_PATH,
                 surprise_file_path=SURPRISE_FILE_PATH,
                 user_min_occurrence=20,
                 item_min_occurrence=20):
        """SupriseFeatureBuilder formats data for ingesting and uses SVD to build a feature for a given item_identifier.

        Arguments:     
            item_identifier: String
                colname of the item 
                       
            train_file_path: string
                train file

            surprise_file_path: string
                output filtered data to this location. to be read by Surprise
                
            user_min_occurrence: int
                user must appear at least this number of times to be included

            item_min_occurrence: int
                item must appear at least this number of times to be included
        """
        self.train_file_path = train_file_path
        self.surprise_file_path = surprise_file_path
        self.item_identifier = item_identifier
        self.user_min_occurrence = user_min_occurrence
        self.item_min_occurrence = item_min_occurrence
        self.svd = SVD()

    def make_surprise_file(self,
                           user_min_occurrence=None,
                           item_min_occurrence=None):
        """Generates file to be ingested by Surprise.

        Arguments:                     
            user_min_occurrence: int
                user must appear at least this number of times to be included

            item_min_occurrence: int
                item must appear at least this number of times to be included
        """
        if user_min_occurrence is None:
            user_min_occurrence = self.user_min_occurrence
        if item_min_occurrence is None:
            item_min_occurrence = self.item_min_occurrence
        data = pd.read_csv(self.train_file_path)
        filtered_data = (data.groupby('user_id').filter(
            lambda x: len(x) >= user_min_occurrence).groupby(
                self.item_identifier).filter(
                    lambda x: len(x) >= item_min_occurrence).groupby(
                        ['user_id', self.item_identifier]).mean())
        print(filtered_data.shape)
        filtered_data.to_csv(path_or_buf=self.surprise_file_path,
                             columns=['is_listened'],
                             header=False,
                             index=True)

    def make_file_if_missing(self):
        if not Path(self.surprise_file_path).is_file():
            print('File not found. Generating new input file')
            start_time = time.perf_counter()
            self.make_surprise_file()
            print('File generated in {}s'.format(time.perf_counter() -
                                                 start_time))

    def delete_surprise_file(self):
        if Path(self.surprise_file_path).is_file():
            os.remove(self.surprise_file_path)

    def read_data(self):
        reader = dataset.Reader(line_format="user item rating",
                                sep=',',
                                rating_scale=(0, 1),
                                skip_lines=0)
        self.data = dataset.Dataset.load_from_file(self.surprise_file_path,
                                                   reader=reader)
        self.data.split(n_folds=5)

    def eval(self):
        # Evaluate performances of our algorithm on the dataset.
        perf = evaluate(self.svd, self.data, measures=['RMSE'])
        print_perf(perf)

    def parameter_tuning(self):
        param_grid = {
            'n_epochs': [20, 40],
            'lr_all': [0.002, 0.005],
            'reg_all': [0.01, 0.02, 0.04],
            'n_factors': [20, 50, 100]
        }

        print("Starting grid search...")
        start_time = time.perf_counter()
        self.grid_search = GridSearch(SVD, param_grid, measures=['RMSE'])
        self.grid_search.evaluate(self.data)
        print('Grid search took {}s'.format(time.perf_counter() - start_time))

        self.svd = self.grid_search.best_estimator['RMSE']

        print(self.grid_search.best_score['RMSE'])
        print(self.grid_search.best_params['RMSE'])

    def train(self):
        trainset = self.data.build_full_trainset()
        self.svd.train(trainset)

    def _predict(self, user_lst, item_lst):
        assert len(user_lst) == len(item_lst)

        # gets predictions
        lst_length = len(user_lst)

        pred = [
            self.svd.predict(str(user_lst[idx]), str(item_lst[idx]))
            for idx in range(lst_length)
        ]
        prediction, unseen = zip(*([(est, details['was_impossible'])
                                    for (_, _, _, est, details) in pred]))

        # Replace unseen with 0, 1 or 2 based on how many of the user and item are known to the trainset
        unseen = [
            sum([
                self.svd.trainset.knows_user(user_lst[i]),
                self.svd.trainset.knows_item(item_lst[i])
            ]) for i in range(len(user_lst))
        ]
        return prediction, unseen

    def get_predictions(self, test_file_path):
        """Use trained model on test file

        Arguments:
            test_file_path: String 
                location of test file
        """
        data = pd.read_csv(test_file_path)
        user_lst, item_lst = data['user_id'].tolist(), data[
            self.item_identifier].tolist()
        predictions, unseen = self._predict(user_lst, item_lst)
        return {
            "{}_svd".format(self.item_identifier): predictions,
            "{}_unseen".format(self.item_identifier): unseen
        }
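A possible usage sequence for the class above; 'test.csv' is only an illustrative file name, and the other paths come from the module constants already referenced in __init__:

fb = SurpriseFeatureBuilder(item_identifier='media_id')
fb.make_file_if_missing()   # build the filtered Surprise input file if it is absent
fb.read_data()              # load it into a Surprise Dataset and split it into folds
fb.parameter_tuning()       # optional: grid-search SVD hyperparameters
fb.train()                  # fit the (possibly tuned) SVD on the full trainset
features = fb.get_predictions('test.csv')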
Example #22
                              
#ratings_dict = {'item': items,'rating': ratings,'user': users}
#df = pd.DataFrame(ratings_dict)
#reader = Reader(rating_scale=(1,10))
#obj = Dataset.load_from_df(df[['item','rating','user']], reader)

# As we're loading a custom dataset, we need to define a reader. Each line
# of the file has the format 'user item rating', separated by spaces.
reader = Reader(line_format='user item rating',sep=' ',rating_scale=(1,10))
dataobj = Dataset.load_from_file('D:/GoogleDrive/mydata.csv', reader=reader)

traindata = dataobj.build_full_trainset()
                                               
algo = SVD(verbose=True, n_factors=5, n_epochs=100)

algo.train(traindata)

data_fill=data.copy()
for col in range(0,siz[1]):
    for row in range(0,siz[0]):
        #data_fill[row,col]=algo.predict('user%i' % (col+1),'item%i' % (row+1)).est
        data_fill[row,col]=algo.predict((col+1),(row+1)).est

print((np.round(data_fill)).astype(np.int))

print(data_full)
dataset=Dataset.load_from_df(ratings_dataset[['userId','movieId','rating']],reader)

#Using the split function to perform cross validation 
dataset.split(n_folds=6)

#Initialising the SVD model and specifying the number of latent features
#we can tune these parameters according to our requirements
svd=SVD(n_factors=25)

#evaluating the model based on the root mean square error and mean absolute error
evaluate(svd,dataset,measures=['rmse','mae'])

#making the dataset to train our model
train=dataset.build_full_trainset()
#training our model
svd.train(train)



#Making a new series which has two columns in it:
#movie name and movie id
movies_dataset = movies_dataset.reset_index()
titles = movies_dataset['movie_name']
indices = pd.Series(movies_dataset.index, index=movies_dataset['movie_name'])
#Function to make recommendation to the user
def recommendataion(user_id,movie):
    result=[]
    #Getting the id of the movie for which the user wants a recommendation
    ind=indices[movie].iloc[0]
    #Getting all the cosine similarity scores for that movie
    sim_scores=list(enumerate(cosine_sim[ind]))
Example #24
# print(improved_recommendations('Mean Girls', smd).head(10))

##############################################################################
#3

reader = Reader()
ratings = pd.read_csv('./data/ratings_small.csv')
#print(ratings.head())

data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
data.split(n_folds=5)

svd = SVD()
#evaluate(svd, data, measures=['RMSE','MAE'])
trainset = data.build_full_trainset()
svd.train(trainset)  # train on the full trainset
# uid is the user id, iid is the movie id
a = svd.predict(uid=1, iid=302)
# print(a.est)

m_list = list(set(ratings['movieId']))


def CF_recsys(id):
    est_list = []
    for mv in m_list:
        est_list += [svd.predict(id, mv).est]

    df = pd.DataFrame({
        'id': m_list,
        'est': est_list
Example #25
                sep=',',
                skip_lines=1)

data = Dataset.load_from_file(file_path, reader=reader)
data.split(n_folds=5)
svd_ambiente = SVD(n_epochs=100, lr_all=0.002, reg_all=0.2)

# Evaluate performances of our algorithm on the dataset.
perf = evaluate(svd_ambiente, data, measures=['RMSE', 'MAE'])

print_perf(perf)

# Retrieve the trainset.
trainset = data.build_full_trainset()

svd_ambiente.train(trainset)

from sklearn.externals import joblib
joblib.dump(svd_ambiente, 'svd_ambiente.pkl')

#this is how the model is loaded back

svd_ambiente = joblib.load('svd_ambiente.pkl')

test = pd.read_csv("/Volumes/Disco_SD/Set de datos/guia_oleo/ratings_test.csv",
                   sep=',',
                   encoding="ISO-8859-1")

test_ambiente = pd.DataFrame()

for i in range(0, len(test.index)):
class SurSVD:
    def __init__(self, k=5):
        if not isinstance(k, int) or k <= 0:
            raise ValueError("Parameter k should be a positive integer.")
        self.data = None
        self.k = k
        self.algo = SVD(n_factors=self.k, biased=False, reg_all=0)
        self.predictions = pd.DataFrame()

    def fit_directly(self, data_long):
        """
        This function directly computes the predictions
        of the algorithm for the data provided. The
        data needs to be in the long shape format. It then
        adds to the class attributes the predictions made
        by the algorithm (maintaining the long format)
        :param data_long: pd.DataFrame | DataFrame in the long
                                    shape format
        :return void:
        """
        # Run SVD
        reader = Reader(rating_scale=(0, 1))
        data = Dataset.load_from_df(data_long, reader)
        trainset = data.build_full_trainset()
        self.algo.train(trainset)
        testset = trainset.build_anti_testset()
        predictions = self.algo.test(testset)

        # Reconstruct predictions
        users = []
        items = []
        ratings = []
        dataframe = pd.DataFrame()
        for uid, iid, _, est, _ in predictions:
            users.append(uid)
            items.append(iid)
            # store the estimated rating, not the r_ui fill value
            ratings.append(est)

        dataframe["userID"] = users
        dataframe["itemID"] = items
        dataframe["ratings"] = ratings

        self.predictions = dataframe

    def fit(self, rating_matrix):
        """
        Fits the instance to the rating matrix. The index must be
        the users and the columns the items.
        :param rating_matrix: pd.DataFrame | rating matrix
        :return: void
        """
        data_long = rating_matrix.stack().reset_index()
        data_long.columns = ["user_id", "item_id", "ratings"]

        # Run SVD
        reader = Reader(rating_scale=(0, 1))
        data = Dataset.load_from_df(data_long, reader)
        trainset = data.build_full_trainset()
        self.algo.train(trainset)
        testset = trainset.build_anti_testset()
        predictions = self.algo.test(testset)

        # Reconstruct predictions
        users = []
        items = []
        ratings = []
        dataframe = pd.DataFrame()
        for uid, iid, _, est, _ in predictions:
            users.append(uid)
            items.append(iid)
            # store the estimated rating, not the r_ui fill value
            ratings.append(est)

        dataframe["itemID"] = items
        dataframe["ratings"] = ratings
        dataframe["userID"] = users
        self.predictions = dataframe

    def predict(self, user, item):
        """
        Predict the probability that input user will like input item
        :param user: int | user ID
        :param item: int | item ID
        :return: float | probability that user likes item
        """
        cond1 = self.predictions["userID"] == user
        cond2 = self.predictions["itemID"] == item
        mask = cond1 & cond2
        temp = np.array(self.predictions.loc[mask, "ratings"])
        proba = np.sum(temp)
        return proba
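A short usage sketch for SurSVD, assuming a small binary rating matrix with users as the index, items as the columns, and NaN marking unobserved pairs (only those pairs receive predictions):

import numpy as np
import pandas as pd

rating_matrix = pd.DataFrame(
    [[1.0, np.nan, 0.0],
     [np.nan, 1.0, 1.0]],
    index=[1, 2],            # user ids
    columns=[10, 20, 30])    # item ids

model = SurSVD(k=2)
model.fit(rating_matrix)
print(model.predict(1, 20))  # score for the unobserved pair (user 1, item 20)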
Example #27
################################### Collaborative Filtering ################################

reader= Reader()

ratings= pd.read_csv("./Movies/ratings_small.csv")
print("\n\nRatings:\n", ratings.head())

data= Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
data.split(n_folds=5)

# Using Singular Value Decomposition (SVD) from Surprise package
svd= SVD()
evaluate(svd, data, measures=['RMSE', 'MAE'])

trainset= data.build_full_trainset()
svd.train(trainset)

print(ratings[ratings['userId']==1])

print(svd.predict(1,302, 3))

def convert_int(x):
	try:
		return int(x)
	except:
		return np.nan
		
id_map= pd.read_csv('./Movies/links_small.csv')[['movieId', 'tmdbId']]
id_map['tmdbId']= id_map['tmdbId'].apply(convert_int)
id_map.columns= ['movieId', 'id']
id_map= id_map.merge(smd[['title', 'id']], on='id').set_index('title')
Example #28
    # Delete unused columns
    del dfRatings['date']
    del dfRatings['train_id']
    del dfTest['date']
    del dfTest['test_id']

    # Set the rating scale and create the data for Surprise to use
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(
        dfRatings[['user_id', 'business_id', 'rating']], reader)

    factors = 50

    train_set = data.build_full_trainset()

    # Use SVD with surprise
    algo = SVD(n_factors=factors)
    algo.train(train_set)

    f = open('SVDOutput.csv', 'w')
    f.write("test_id,rating\n")
    for i in range(len(dfTest)):
        prediction = algo.predict(dfTest.at[i, 'user_id'],
                                  dfTest.at[i, 'business_id'],
                                  r_ui=4,
                                  verbose=True)
        predRating = prediction.est
        f.write(str(i) + "," + str(predRating) + '\n')

    f.close()
from surprise import Reader, Dataset
import surprise
# Define the format
reader = Reader(line_format='user item rating', sep=',')
# Load the data from the file using the reader format
data = Dataset.load_from_file('recomm.csv', reader=reader)

# Split data into 5 folds
data.split(n_folds=5)

from surprise import SVD, evaluate
algo = SVD()
evaluate(algo, data, measures=['RMSE', 'MAE'])

# Retrieve the trainset.
trainset = data.build_full_trainset()
algo.train(trainset)

userid = str(10)
itemid = str(20)
actual_rating = 3
print(algo.predict(userid, 40))

a = algo.predict(userid, 20)
t = a.est / 3
print(t)

from sklearn.externals import joblib
joblib.dump(algo, 'reccc.pkl')
"""

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

from surprise import Dataset
from surprise import SVD
from surprise import accuracy


data = Dataset.load_builtin('ml-100k')

algo = SVD()

trainset = data.build_full_trainset()
algo.train(trainset)

testset = trainset.build_testset()
predictions = algo.test(testset)
# RMSE should be low as we are biased
accuracy.rmse(predictions, verbose=True)  # ~ 0.68 (which is low)

# We can also do this during a cross-validation procedure!
print('CV procedure:')

data.split(3)
for i, (trainset_cv, testset_cv) in enumerate(data.folds()):
    print('fold number', i + 1)
    algo.train(trainset_cv)

    print('On testset,', end='  ')