Example #1
def executeTraining(modelFileName, simOptions):
    # trainingSet is expected to be a Surprise Trainset built elsewhere (module level).
    knn = KNNBasic(sim_options=simOptions, k=3)
    knn.train(trainingSet)
    testSet = trainingSet.build_anti_testset()
    predictions = knn.test(testSet)

    # persist the fitted model; joblib.dump opens the target path itself
    os.makedirs('./outputs', exist_ok=True)
    joblib.dump(knn, os.path.join('./outputs', modelFileName))
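For completeness, a minimal sketch (not part of the original snippet) of loading the persisted model back and making a single prediction; the file name here is an assumption, use whatever was passed as modelFileName:

import os
import joblib

# Reload the model saved by executeTraining() and predict one (user, item) pair.
# 'knn_basic.model' is a placeholder name.
model_path = os.path.join('./outputs', 'knn_basic.model')
knn = joblib.load(model_path)

# Surprise accepts raw (string) user and item ids in predict().
prediction = knn.predict(uid='196', iid='302')
print(prediction.est)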
Example #2
def run_train(trainingSet):
    # item-based KNN model with cosine similarity
    sim_options = {'name': 'cosine', 'user_based': False}
    knn = KNNBasic(sim_options=sim_options)
    knn.train(trainingSet)
    return knn
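A minimal usage sketch (the built-in ml-100k dataset is an assumption, mirroring the other examples; run_train uses the older algo.train API shown above):

from surprise import Dataset

data = Dataset.load_builtin('ml-100k')
trainingSet = data.build_full_trainset()

knn = run_train(trainingSet)
# estimate user 196's rating for item 302
print(knn.predict('196', '302').est)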
Example #3
def knn_running_time(data):
    '''
        Calculates the running times for training and predictions for Basic KNN

        Args:
            data(Dataset): a list of datasets with different numbers of users

        Returns:
            elapsedtime_KnnBasictrain: running time for training
            elapsedtime_KnnBasictest: running time for predictions on testset
    '''
    elapsedtime_KnnBasictrain = []
    elapsedtime_KnnBasictest = []

    # tune the parameters on the entire data
    param_grid = {
        'k': [5, 10, 20],
        'sim_options': {
            'name': ['msd', 'cosine', 'pearson'],
            'min_support': [1, 5],
            'user_based': [False]
        }
    }
    grid_search = GridSearch(KNNBasic,
                             param_grid,
                             measures=['RMSE'],
                             verbose=False)
    grid_search.evaluate(data[3])
    param = grid_search.best_params['RMSE']
    k = param['k']
    sim = param['sim_options']['name']
    min_support = param['sim_options']['min_support']
    user_based = param['sim_options']['user_based']

    # using the tuned parameters calculate running times
    for i in range(len(data)):
        training = data[i].build_full_trainset()
        testing = training.build_anti_testset()

        # training running time (timer covers only the fit itself)
        training_start = time.time()
        knn = KNNBasic(k=k,
                       sim_options={'name': sim,
                                    'min_support': min_support,
                                    'user_based': user_based})
        knn.train(training)
        elapsedtime_KnnBasictrain.append(time.time() - training_start)

        # prediction running time
        test_start = time.time()
        knn.test(testing)
        elapsedtime_KnnBasictest.append(time.time() - test_start)
    return elapsedtime_KnnBasictrain, elapsedtime_KnnBasictest
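A minimal calling sketch (the list of datasets is an assumption; the function only expects several Surprise Dataset objects, and it tunes on the fourth one):

from surprise import Dataset

# In practice each entry would hold a subset with a different number of users;
# reusing ml-100k four times only illustrates the expected shape of the input.
datasets = []
for _ in range(4):
    d = Dataset.load_builtin('ml-100k')
    d.split(n_folds=5)  # folds needed by the grid search inside knn_running_time
    datasets.append(d)

train_times, test_times = knn_running_time(datasets)
for n, (tr, te) in enumerate(zip(train_times, test_times)):
    print('dataset %d: train %.2fs, test %.2fs' % (n, tr, te))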
Example #4
def user_based_cf(co_pe):
    # INITIALIZE REQUIRED PARAMETERS
    # path = 'ml-100k/u.user'
    prnt = "USER"
    sim_op = {'name': co_pe, 'user_based': True}
    algo = KNNBasic(sim_options=sim_op)

    reader = Reader(line_format="user item rating",
                    sep='\t',
                    rating_scale=(1, 5))
    df = Dataset.load_from_file('ml-100k/u.data', reader=reader)

    # START TRAINING
    trainset = df.build_full_trainset()

    # APPLYING ALGORITHM KNN Basic
    algo.train(trainset)
    print("ALGORITHM USED", co_pe)

    # --------------------------------------------- MARKERS

    with open("_AlgoHist_ub.txt", "w") as f:
        f.write(repr(co_pe))

    # --------------------------------------------- MARKERS END

    print("CF Type:", prnt, "BASED")

    # PEEKING PREDICTED VALUES
    search_key = input("Enter User ID:")
    item_id = input("Enter Item ID:")
    actual_rating = float(input("Enter actual Rating:"))

    print(algo.predict(str(search_key), item_id, actual_rating))

    testset = trainset.build_anti_testset()
    predictions = algo.test(testset=testset)

    top_n = get_top_n(predictions, 5)
    result_u = True

    k = int(input("Enter size of Neighborhood (Min:1, Max:40)"))

    # user-based similarity: convert the raw *user* id to its inner id
    inner_id = algo.trainset.to_inner_uid(search_key)
    neighbors = algo.get_neighbors(inner_id, k=k)
    print("Nearest Matching users are:")
    for i in neighbors:
        print("\t " * 6, i)
    return top_n, result_u
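get_top_n is called above but not defined in this snippet; a minimal sketch of the helper as it appears elsewhere in these examples (top-N lists per user from a list of predictions):

from collections import defaultdict

def get_top_n(predictions, n=10):
    # Map each prediction to its user, then keep the n highest estimates per user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n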
Example #5
def test_nearest_neighbors():
    """Ensure the nearest neighbors are different when using user-user
    similarity vs item-item."""

    reader = Reader(line_format='user item rating',
                    sep=' ',
                    skip_lines=3,
                    rating_scale=(1, 5))

    data_file = os.path.dirname(os.path.realpath(__file__)) + '/custom_train'
    data = Dataset.load_from_file(data_file, reader)
    trainset = data.build_full_trainset()

    algo_ub = KNNBasic(sim_options={'user_based': True})
    algo_ub.train(trainset)
    algo_ib = KNNBasic(sim_options={'user_based': False})
    algo_ib.train(trainset)
    assert algo_ub.get_neighbors(0, k=10) != algo_ib.get_neighbors(0, k=10)
Example #6
class FactPrediction:
    """FactPrediction definition."""
    def train(self):
        """Trains the model."""
        from os import path
        from pandas import read_csv
        from surprise import Reader, Dataset, KNNBasic

        directory = path.dirname(path.realpath(__file__))

        ratings = read_csv(path.join(directory, 'fact_ratings.csv'))
        ratings = Dataset.load_from_df(ratings[['userId', 'factId', 'rating']],
                                       Reader())

        trainset = ratings.build_full_trainset()
        self.model = KNNBasic()
        self.model.train(trainset)

    def predict(self, u_id, f_id):
        """Performs a prediction."""
        return self.model.predict(u_id, f_id)
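A minimal usage sketch (the user and fact ids are placeholders; predict() works on the raw ids as loaded from fact_ratings.csv):

predictor = FactPrediction()
predictor.train()

# predict() returns a Surprise Prediction object; .est is the estimated rating
prediction = predictor.predict(u_id=1, f_id=42)
print(prediction.est)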
Example #7
    def item_based_cf(self, co_pe, df_path):
        # INITIALIZE REQUIRED PARAMETERS
        path = '/home/mister-t/Projects/PycharmProjects/RecommendationSys/ml-100k/u.item'
        prnt = "ITEM"
        sim_op = {'name': co_pe, 'user_based': False}
        algo = KNNBasic(sim_options=sim_op)

        reader = Reader(line_format="user item rating",
                        sep=',',
                        rating_scale=(1, 5))
        df = Dataset.load_from_file(df_path, reader=reader)

        # START TRAINING
        trainset = df.build_full_trainset()

        # APPLYING ALGORITHM KNN Basic
        algo.train(trainset)
        print("\t\t >>> TRAINED SET <<<\n")

        # Read the mappings raw id <-> movie name
        rid_to_name, name_to_rid = self.read_item_names(path)
        print("CF Type:", prnt, "BASED")

        search_key = input(
            "Enter a Movie Name, \n ex. Toy Story (1995) or Seven (Se7en) (1995)\n Movie name:"
        )
        print("ALGORITHM USED : ", co_pe)
        raw_id = name_to_rid[search_key]

        # --------------------------------------------- MARKERS

        with open("cluster/AlgoHist_ib.txt", "w") as f:
            f.write(repr(co_pe))

        # --------------------------------------------- MARKERS END

        print("\t\t RAW ID >>>>>>>", raw_id, "<<<<<<<")
        inner_id = algo.trainset.to_inner_iid(raw_id)

        print("INNER ID >>>>>", inner_id)

        # Retrieve inner ids of the nearest neighbors of the chosen movie.
        k = int(input("Enter size of Neighborhood (Min:1, Max:40)"))
        neighbors = algo.get_neighbors(inner_id, k=k)

        neighbors = (algo.trainset.to_raw_iid(inner_id)
                     for inner_id in neighbors)
        neighbors = (rid_to_name[rid] for rid in neighbors)

        print("Nearest", k, "Matching Items are:")
        for i in neighbors:
            print("\t " * 6, i)
Example #8
def gen_pred_matrix_ubcf(co_pe):

    # ---------------------------------------------------- UBCF as is

    # INITIALIZE REQUIRED PARAMETERS
    # path = 'ml-100k/u.user'
    prnt = "USER"
    sim_op = {'name': co_pe, 'user_based': True}
    algo = KNNBasic(sim_options=sim_op)

    reader = Reader(line_format="user item rating",
                    sep='\t',
                    rating_scale=(1, 5))
    df = Dataset.load_from_file('ml-100k/u.data', reader=reader)

    # START TRAINING
    trainset = df.build_full_trainset()

    # APPLYING ALGORITHM KNN Basic
    algo.train(trainset)
    print "ALGORITHM USED", co_pe

    print "CF Type:", prnt, "BASED"

    testset = trainset.build_anti_testset()
    predictions = algo.test(testset=testset)

    top_n = get_top_n(predictions, 5)

    # ---------------------------------------------------- UBCF as is

    csvfile = 'pred_matrix-full_ubcf.csv'
    with open(csvfile, "w") as output:
        writer = csv.writer(output, delimiter=',', lineterminator='\n')
        writer.writerow(['uid', 'iid', 'rat'])
        for uid, user_ratings in top_n.items():
            for (iid, r) in user_ratings:
                value = uid, iid, r
                writer.writerow(value)
    print "Done! You may now check the file in same Dir. as of Program"
Example #9
def knn(data, training, testing):
    '''
        Tune Basic KNN parameters then calculates RMSE, coverage and running time of Basic KNN

        Args:
            data(Dataset): the whole dataset divided into 5 folds
            training(Dataset): training dataset
            testing(Dataset): test dataset

        Returns:
            rmse: RMSE of Basic KNN with optimized parameters
            top_n: number of unique predictions for top n items
    '''

    # candidate parameters
    knn_param_grid = {'k': [5, 10, 20],
                      'sim_options': {'name': ['msd', 'cosine', 'pearson'],
                                      'min_support': [1, 5],
                                      'user_based': [False]}}

    # optimize parameters
    knn_grid_search = GridSearch(KNNBasic, knn_param_grid, measures=['RMSE'], verbose=False)
    knn_grid_search.evaluate(data)
    param = knn_grid_search.best_params['RMSE']
    print('KNNBasic:', param)
    # RMSE against parameters
    result_df = pd.DataFrame.from_dict(knn_grid_search.cv_results)
    result_df.to_csv('data/knn_rmse_against_param.csv')


    # fit model using the optimized parameters
    knn = KNNBasic(k=param['k'],
                   sim_options={'name': param['sim_options']['name'],
                                'min_support': param['sim_options']['min_support'],
                                'user_based': param['sim_options']['user_based']})
    knn.train(training)

    # evaluate the model using test data
    predictions = knn.test(testing)
    top_n = get_top_n(predictions, n=5)

    rmse = accuracy.rmse(predictions, verbose=True)
    return rmse, top_n
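A minimal calling sketch for this function (how data, training and testing are built here is an assumption, mirroring the older Surprise API used in the other examples; the function also expects a writable data/ directory for its CSV output):

from surprise import Dataset

data = Dataset.load_builtin('ml-100k')
data.split(n_folds=5)                      # folds used by the grid search

training = data.build_full_trainset()
testing = training.build_anti_testset()

rmse, top_n = knn(data, training, testing)
print('RMSE:', rmse)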
Example #10
def gen_pred_matrix_ibcf(co_pe):
    # ---------------------------------------------------- IBCF as is

    # INITIALIZE REQUIRED PARAMETERS
    path = '/home/mister-t/Projects/PycharmProjects/RecommendationSys/ml-100k/u.item'
    prnt = "ITEM"
    sim_op = {'name': co_pe, 'user_based': False}
    algo = KNNBasic(sim_options=sim_op)

    reader = Reader(line_format="user item rating",
                    sep='\t',
                    rating_scale=(1, 5))
    df = Dataset.load_from_file('ml-100k/u.data', reader=reader)

    # START TRAINING
    trainset = df.build_full_trainset()

    # APPLYING ALGORITHM KNN Basic
    algo.train(trainset)
    print("\t\t >>> TRAINED SET <<<\n")

    # Read the mappings raw id <-> movie name
    # rid_to_name, name_to_rid = read_item_names(path)
    print "CF Type:", prnt, "BASED"
    print "Please be Patient while 'pred_matrix-full_ibcf.csv' is being Generated"
    for i in range(5):
        print "."
        time.sleep(0.5)
    # --------------------------------------------------------- EXPERIMENTAL

    testset = trainset.build_anti_testset()
    predictions = algo.test(testset=testset)

    top_n = get_top_n(predictions, 5)

    # --------------------------------------------------------- EXPERIMENTAL

    # ---------------------------------------------------- IBCF as is

    csvfile = 'pred_matrix-full_ibcf.csv'
    with open(csvfile, "w") as output:
        writer = csv.writer(output, delimiter=',', lineterminator='\n')
        writer.writerow(['uid', 'iid', 'rat'])
        for uid, user_ratings in top_n.items():
            for (iid, r) in user_ratings:
                value = uid, iid, r
                writer.writerow(value)
    print "Done! You may now check the file in same Dir. as of Program"
Example #11
def compute_recommendations():
    #connecting to the database
    # engine = create_engine("mysql://*****:*****@localhost/ratingsx?charset=utf8", echo=True)
    engine = create_engine(config.DB_URI, echo=True)
    session = scoped_session(
        sessionmaker(bind=engine, autocommit=False, autoflush=False))
    # disable print

    blockPrint()

    #reading in the database
    df_ratings = pd.read_sql('SELECT * FROM ratings;', con=engine)
    df_ratings = df_ratings[['user_id', 'item_id', 'rating']]
    df_ratings = df_ratings.dropna()
    df_ratings = df_ratings.drop_duplicates()

    #formatting the dataset using the surprise library
    reader = Reader(line_format='user item rating',
                    sep=',',
                    rating_scale=(1, 5))
    data = Dataset.load_from_df(df_ratings, reader=reader)
    training_set = data.build_full_trainset()

    algorithm = KNNBasic()  # basic collaborative-filtering KNN

    algorithm.train(training_set)  # fit the data to the model
    testing_set = training_set.build_anti_testset()
    predictions = algorithm.test(testing_set)  # make prediction

    #writing the function for top predictions
    def get_top_n(predictions, n=10):
        """Return the top-N recommendations for each user from a set of predictions.

        Args:
            predictions(list of Prediction objects): The list of predictions, as
                returned by the test method of an algorithm.
            n(int): The number of recommendations to output for each user.
                Default is 10.

        Returns:
            A dict where keys are user (raw) ids and values are lists of tuples:
            [(raw item id, rating estimation), ...] of size n.
        """
        # First map the predictions to each user.
        top_n = defaultdict(list)
        for uid, iid, true_r, est, _ in predictions:
            top_n[uid].append((iid, est))

        # Then sort the predictions for each user and retrieve the n highest ones.
        for uid, user_ratings in top_n.items():
            user_ratings.sort(key=lambda x: x[1], reverse=True)
            top_n[uid] = user_ratings[:n]

        return top_n
    # getting the top 10 predictions
    top_n = get_top_n(predictions, n=10)

    # Print the recommended items for each user
    a = []
    for uid, user_ratings in top_n.items():
        a.append([uid, [iid for (iid, _) in user_ratings]])
    df_list_pred = pd.DataFrame.from_records(a, columns=['A', 'B'])

    df_user = pd.DataFrame(df_list_pred.A.values.tolist())
    df_pred = pd.DataFrame(df_list_pred.B.values.tolist())

    df_pred.columns = [
        'pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5', 'pred_6', 'pred_7',
        'pred_8', 'pred_9', 'pred_10'
    ]

    df_items = pd.read_sql('SELECT * FROM items;', con=engine)

    # df_pred = df_pred.applymap(lambda x: df_items.loc[x, 'title'])
    df_pred[['id']] = df_user
    df_pred = df_pred[[
        'id', 'pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5', 'pred_6',
        'pred_7', 'pred_8', 'pred_9', 'pred_10'
    ]]

    df_pred['id'] = df_pred['id'].astype(int)

    # Append recommendations
    df_pred.to_sql('recommendations', engine, if_exists='append',
                   index=False)  #if_exists='append'
    session.commit()

    #logging the predictions
    df_log = df_pred
    df_log['algorithm'] = 'KNNBasic'
    df_log = df_log.rename(columns={'id': 'user_id'})
    df_log = df_log[[
        'user_id', 'pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5', 'pred_6',
        'pred_7', 'pred_8', 'pred_9', 'pred_10', 'algorithm'
    ]]

    df_log.to_sql('predictionlogs', engine, if_exists='append',
                  index=False)  #if_exists='append'
    session.commit()

    global mae1
    global rmse1
    mae1 = accuracy.mae(predictions)
    rmse1 = accuracy.rmse(predictions)
    mae1 = float(mae1)
    rmse1 = float(rmse1)
Example #12
    surprise_cross_validate(algo, data, sim_options)

    # Grid search k for KNNBasic
    param_grid = {'k': [18, 19, 20, 21, 22]}
    print(surprise_gridsearch(param_grid, KNNBasic, data))

    # Cross-Validate KNNBaseline
    sim_options = {'name': 'pearson_baseline', 'user_based': False}
    algo = KNNBaseline(k=19, sim_options=sim_options)
    surprise_cross_validate(algo, data, sim_options)

    # Predictions
    trainset = data.build_full_trainset()
    sim_options = {'name': 'pearson_baseline', 'user_based': False}
    algo = KNNBaseline(k=19, sim_options=sim_options)
    algo.train(trainset)
    predictions = algo.test(trainset.build_testset())

    # Build Pandas DF of Ratings and Predictions
    df = pd.DataFrame(predictions,
                      columns=['uid', 'iid', 'rui', 'est', 'details'])
    df['#_of_Movies_Rated_By_User'] = df.uid.apply(get_Iu)
    df['#_of_Users_That_Rated_This_Movie'] = df.iid.apply(get_Ui)
    df['Error_in_Rating_Prediction'] = abs(df.est - df.rui)
    df.rename(columns={'uid': 'User_ID',
                       'iid': 'Movie_ID',
                       'rui': 'User_Rating',
                       'est': 'Predicted_Rating'},
              inplace=True)
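get_Iu and get_Ui are used above but not shown; a minimal sketch of the usual helpers (assuming a trainset variable like the one built above), counting how many items a user rated and how many users rated an item:

def get_Iu(uid):
    """Number of items rated by user uid (0 if the user is unknown to the trainset)."""
    try:
        return len(trainset.ur[trainset.to_inner_uid(uid)])
    except ValueError:
        return 0

def get_Ui(iid):
    """Number of users that have rated item iid (0 if the item is unknown to the trainset)."""
    try:
        return len(trainset.ir[trainset.to_inner_iid(iid)])
    except ValueError:
        return 0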
Example #13
def compute_recommendations(user_id, prediction_table, numeric_prediction_table):


    algo = 'Item-based KNN'
    sim_options = {'user_based': False}
    algorithm = KNNBasic(sim_options=sim_options)

    # add_pageview(user_id=user_id, item_id=None, page="Model Predictions", activity_type="Initialize Predictions - " + algo, rating=None) #pageview

    engine = create_engine(config.DB_URI, echo=True)
    session = scoped_session(sessionmaker(bind=engine,
                                          autocommit=False,
                                          autoflush=False))



    # reading in the database
    df_ratings = pd.read_sql('SELECT * FROM ratings;', con=engine)
    df_ratings = df_ratings[['user_id', 'item_id', 'rating']]
    df_ratings = df_ratings.dropna()
    df_ratings = df_ratings.drop_duplicates()

    df_ratings2 = pd.read_csv('data/ratings.csv', low_memory=False)
    df_ratings2 = df_ratings2.rename(columns={'movie_id': 'item_id'})
    df_ratings2 = df_ratings2[['user_id', 'item_id', 'rating']]
    df_ratings2 = df_ratings2.dropna()
    df_ratings2 = df_ratings2.drop_duplicates()

    df_ratings = pd.concat([df_ratings, df_ratings2], axis=0)




    reader = Reader(line_format='user item rating', sep=',', rating_scale=(1, 10))
    data = Dataset.load_from_df(df_ratings, reader=reader)

    trainset = data.build_full_trainset()


#     algorithm = eval(algo + "()")# set the algorithm...............................................


    algorithm.train(trainset)

    items = pd.read_sql('SELECT distinct id FROM items;', con = engine)
    df_user_items = df_ratings.loc[df_ratings['user_id'] == user_id]
    total_items = items.id.unique()
    user_items = df_user_items.item_id.unique()
    # user_id = str(user_id)
    prediction_items = [x for x in total_items if x not in user_items]

    predictions = pd.DataFrame(columns=['user_id', 'item_id', 'prediction'])


    predicted_ratings = []

    for item in prediction_items:
        # .est (index 3 of the Prediction tuple) is the estimated rating
        prediction = algorithm.predict(user_id, item)
        predicted_ratings.append(prediction.est)

    predictions['item_id'] = prediction_items
    predictions['user_id'] = pd.Series([user_id for x in range(len(predictions.index))], index=predictions.index)


    predictions['prediction'] = predicted_ratings


    predictions = predictions.sort_values('prediction', ascending=False)
    test_prediction = predictions
    predictions = predictions.head(n=10)


    cols = ['pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5',
            'pred_6', 'pred_7', 'pred_8', 'pred_9', 'pred_10']

    df_pred = predictions[['item_id']].T
    df_pred.columns = cols
    df_pred['id'] = user_id

    df_pred = df_pred[['id'] + cols]

    df_pred['id'] = df_pred['id'].astype(int)



    df_pred.to_sql(prediction_table, engine, if_exists='append', index=False)
    session.commit()


    df_num_ratings = test_prediction

    df_num_ratings = df_num_ratings.head(n=20)

    df_num_ratings['algorithm'] = algo
    df_num_ratings.rename(columns={'prediction':'predicted_rating'}, inplace=True)


    df_num_ratings.to_sql('numeric_predictions', engine, if_exists='append', index=False)
    session.commit()


    predcols = ['num_1', 'num_2', 'num_3', 'num_4', 'num_5',
                'num_6', 'num_7', 'num_8', 'num_9', 'num_10']

    df_num_ratings_transpose = predictions[['prediction']].T
    df_num_ratings_transpose.columns = predcols

    df_num_ratings_transpose['id'] = user_id
    df_num_ratings_transpose = df_num_ratings_transpose[['id'] + predcols]
    df_num_ratings_transpose['id'] = df_num_ratings_transpose['id'].astype(int)








    df_num_ratings_transpose.to_sql(numeric_prediction_table, engine, if_exists='append', index=False)
    session.commit()
Example #14
#########################
# http://surprise.readthedocs.io/en/stable/prediction_algorithms.html
# change the prediction algorithm to knn
sim_options = {'name': 'cosine',
               'user_based': False  # compute  similarities between items
               }
# http://surprise.readthedocs.io/en/stable/similarities.html
#sim_options = {'name': 'pearson',
#               'user_based': True
#               }

algo_1 = KNNBasic(sim_options=sim_options)
trainset = data.build_full_trainset()
algo_1.train(trainset)

pred = algo_1.predict('374', '500')

print("Prediction Object:")
pred

print("Predicted Rating:")
pred[3]


# print('Using ALS')
bsl_options = {'method': 'als',
               'n_epochs': 5,
               'reg_u': 12,
               'reg_i': 5
               }
Example #15
from surprise import Reader, Dataset, KNNBasic

# break data file down into an array full of strings
with open('./data.txt') as f:
    all_lines = f.readlines()
# load information from file into dataset using reader
reader = Reader(line_format='item user rating', sep=',', rating_scale=(1, 5))
data = Dataset.load_from_file('./data.txt', reader=reader)
# split dataset into n folds, can be changed
data.split(n_folds=5)
# using mean squared difference similarity measure here, with min_support set to 1 to consider only users who have at least 1 movie in common
sim_options = {'name': 'msd', 'user_based': False, 'min_support': 1}
trainingset = data.build_full_trainset()
# instantiate the basic KNN algorithm with the chosen similarity options
algorithm = KNNBasic(sim_options=sim_options)
algorithm.train(trainingset)

# predict rating using item and user ID as input
userid = str(input("Please enter user ID: "))
itemid = str(input("Please enter movie ID: "))
print(algorithm.predict(userid, itemid))
Example #16
class BaselineMF:
    def __init__(self, cf_algo=None, logit=False):
        """
        fit method takes a ContentDataset and fits it for num_epochs (passed at initialisation)

        Parameters
        ----------

        batch_size (int): the size of each training batch

        network (ContentMF): a network that fits using user_ids and item_texts

        num_epochs (int): the number of training epochs

        optim_params (dict): parameters passed to the Stochastic Gradient Descent (SGD) class

        use_cuda (bool): set to True to use the GPU

        """
        self.logit = logit
        self.question_truth_dict = {}
        self.average_true_rating = 0.5
        self.average_false_rating = 0.5
        self.loss_fn = nn.MSELoss(size_average=True)

        if cf_algo is None:
            self.cf_algo = KNNBasic(k=2)
        else:
            self.cf_algo = cf_algo

        #self.svd = SVD(n_epochs=500, verbose=True, lr_all=0.001, n_factors=50)

    def dataloader_extract(self, sample):
        ratings = pd.Series(np.array(list(sample['rating'])))
        user_ids = pd.Series(sample['user_id']).astype(str)
        item_ids = pd.Series(sample['item_id']).astype(str)

        return ratings, user_ids, item_ids

    def logit_fn(self, p, epsilon=1e-3):
        # clip probabilities away from 0 and 1 so the logit stays finite
        p = p.clip(epsilon, 1 - epsilon)
        return np.log(p / (1 - p))

    def sigmoid_fn(self, x):
        return 1 / (1 + np.exp(-x))

    def fit(self, dataset, train_sampler):
        """Runs the fit method which simply works out the average response
        for 'true' and 'false' questions, where 'true' questions are those
        where the average rating is greater than 0.5"""
        t0 = time.time()
        data_loader = DataLoader(dataset,
                                 batch_size=len(train_sampler),
                                 sampler=train_sampler)
        sample = next(iter(data_loader))
        ratings, user_ids, item_ids = self.dataloader_extract(sample)
        if self.logit:
            ratings = self.logit_fn(ratings)
        possible_ratings = ratings.unique()

        ratings_dict = {
            'itemID': item_ids,
            'userID': user_ids,
            'rating': ratings
        }
        df = pd.DataFrame(ratings_dict)
        reader = Reader(rating_scale=(0, 1))
        data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)
        trainset = data.build_full_trainset()
        self.cf_algo.train(trainset)

    def predict(self, dataset, sampler, batch_size=64):
        # I'm not entirely sure that the build_full_testset
        # function works as I'd expect, so instead we loop
        # through all the test ids and predict one-at-a-time
        preds = []
        data_loader = DataLoader(dataset,
                                 batch_size=len(dataset),
                                 sampler=sampler)
        sample = next(iter(data_loader))
        ratings, user_ids, item_ids = self.dataloader_extract(sample)
        for user_id, item_id in zip(user_ids, item_ids):
            pred = self.cf_algo.predict(str(user_id), str(item_id))[3]
            if self.logit:
                pred = self.sigmoid_fn(pred)
            preds.append(pred)

        return (preds)

    def score(self, dataset, sampler, batch_size=64, only_slow=True):
        """Scores the baseline on predictions made on the dataset provided,
        sampled with the given sampler. If `only_slow` is true, then only
        the slow judgments in the sampled part of the dataset are scored"""
        predictions = self.predict(dataset, sampler, batch_size)
        data_loader = DataLoader(dataset,
                                 batch_size=len(dataset),
                                 sampler=sampler)
        testset = next(iter(data_loader))
        ratings, user_ids, item_ids = self.dataloader_extract(testset)
        user_ids = user_ids.astype(int)
        ratings = torch.Tensor(ratings)
        predictions = torch.Tensor(predictions)

        #Note that all baselines are passed flattened datasets, so we
        # have to work out which of the users correspond to the latest
        # times
        if only_slow:
            long_time_uids = [i for i in np.unique(user_ids) if i % 3 == 2]
            new_ratings = []
            new_preds = []
            for index, rating in enumerate(ratings):
                if user_ids[index] in long_time_uids:
                    new_ratings.append(rating)
            for index, pred in enumerate(predictions):
                if user_ids[index] in long_time_uids: new_preds.append(pred)
            loss = self.loss_fn(torch.Tensor(new_preds),
                                torch.Tensor(new_ratings).cpu())
            return loss.cpu().data.item()

        else:
            loss = self.loss_fn(predictions, ratings.cpu())
            return loss.cpu().data.item()
Example #17
from surprise import KNNBasic, Reader, Prediction
from surprise import Dataset
from surprise.model_selection import KFold
from surprise import accuracy
from surprise.prediction_algorithms.algo_base import AlgoBase

reader = Reader(line_format='user item rating',
                sep='\t',
                skip_lines=1,
                rating_scale=(1, 40000))

data = Dataset.load_from_file('collaborative.csv', reader=reader)

sim_options = {'name': 'cosine', 'user_based': True}

algo = KNNBasic(sim_options=sim_options)
# the model is fitted fold-by-fold inside the cross-validation loop below
kf = KFold(n_splits=10)

for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset, verbose=True)
    rmse = accuracy.rmse(predictions, verbose=True)
    mae = accuracy.mae(predictions, verbose=True)
Example #18
for i in range(0, len(k_neig)):
    knnbasic_ambiente = KNNBasic(k=k_neig[i])
    perf = evaluate(knnbasic_ambiente,
                    data,
                    measures=['RMSE', 'MAE'],
                    verbose=0)
    print('K is', k_neig[i], 'mean RMSE', np.array(perf['rmse']).mean())

# best k for ambiente is 40

knnbasic_ambiente = KNNBasic(k=40)
# Retrieve the trainset.
trainset = data.build_full_trainset()

knnbasic_ambiente.train(trainset)

import joblib  # sklearn.externals.joblib is deprecated; use the standalone joblib package
joblib.dump(knnbasic_ambiente, 'knnbasic_ambiente.pkl')

####comida knn######

train_reducido[['id_usuario', 'id_restaurante', 'rating_comida',
                'fecha']].to_csv('knn_comida.csv', index=False)

file_path = 'knn_comida.csv'

reader = Reader(line_format='user item rating timestamp',
                sep=',',
                skip_lines=1)
Example #19
#importing surprise package and builtin data
from surprise import Dataset, evaluate
from surprise import KNNBasic
from collections import defaultdict

# loading data
dataset = Dataset.load_builtin("ml-100k")
trainingSet = dataset.build_full_trainset()
trainingSet

# cosine similarity between 2 vectors
sim_options = {'name': 'cosine', 'user_based': False}
knn = KNNBasic(sim_options=sim_options)

# training the model
knn.train(trainingSet)

# movie recommendations for users
testSet = trainingSet.build_anti_testset()
predictions = knn.test(testSet)

# top five movie recommendations for each user.


def get_top5_recommendations(predictions, topN=5):

    top_recs = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_recs[uid].append((iid, est))

    for uid, user_ratings in top_recs.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_recs[uid] = user_ratings[:topN]

    return top_recs