def __init__(self, data, popularityRankings):
    """Prepare every dataset split used to evaluate a recommender.

    Args:
        data: Dataset object exposing ``build_full_trainset()``; it is also
            handed to ``train_test_split`` and ``LeaveOneOut.split``.
        popularityRankings: Popularity ranking data, stored unchanged as
            ``self.rankings``.
    """
    self.rankings = popularityRankings

    # Full training set for evaluating overall properties, plus its
    # anti-test-set: (uid, iid, fill) tuples with raw ids covering the
    # ratings that are NOT in the training set.
    full_trainset = data.build_full_trainset()
    self.fullTrainSet = full_trainset
    self.fullAntiTestSet = full_trainset.build_anti_testset()

    # 75/25 train/test split for measuring accuracy.
    # TODO: k-fold cross validation.
    accuracy_split = train_test_split(data, test_size=.25, random_state=1)
    self.trainSet, self.testSet = accuracy_split

    # "Leave one out" split for evaluating top-N recommenders: one rated
    # item is held out per user.
    loo_splitter = LeaveOneOut(n_splits=1, random_state=1)
    for loo_train, loo_test in loo_splitter.split(data):
        self.LOOCVTrain = loo_train
        self.LOOCVTest = loo_test
    self.LOOCVAntiTestSet = self.LOOCVTrain.build_anti_testset()

    # Item-item cosine similarities, kept so diversity can be measured.
    self.simsAlgo = KNNBaseline(
        sim_options={'name': 'cosine', 'user_based': False})
    self.simsAlgo.fit(self.fullTrainSet)
def __init__(self):
    """Load the ratings and build every dataset needed to fit and evaluate.

    Populates the full train/anti-test/test sets, an 80/20 train/test
    split, a leave-one-out split with its anti-test-set, the popularity
    rankings and a fitted item-item cosine similarity model.
    """
    # Full dataset and popularity rankings come from the class's loaders.
    self.fulldata = self.LoadRating()
    self.popularitydata = self.loadPopularityData()

    # Training set over all ratings, its anti-test-set, and the test set
    # rebuilt from the same training set.
    self.fullTrainData = self.fulldata.build_full_trainset()
    self.fullAntiTestData = self.fullTrainData.build_anti_testset()
    self.fullTestData = self.fullTrainData.build_testset()

    # 80% train / 20% test. No random_state is given, so the split is
    # not reproducible between runs.
    self.traindata, self.testdata = train_test_split(self.fulldata,
                                                     test_size=0.2)

    # Leave-one-out split. Only one split is ever kept, so request a
    # single split instead of the library default of five (the other
    # four were built and immediately discarded).
    self.LOO_Data = LeaveOneOut(n_splits=1)
    for train, test in self.LOO_Data.split(self.fulldata):
        self.LOO_Train = train
        self.LOO_Test = test
    self.LOOAntiTest = self.LOO_Train.build_anti_testset()

    # Alias of the popularity data, kept under the name other code reads.
    self.rank = self.popularitydata

    # Item-item cosine similarities, used for diversity.
    sim_options = {'name': 'cosine', 'user_based': False}
    self.sim_matrix = KNNBaseline(sim_options=sim_options)
    self.sim_matrix.fit(self.fullTrainData)
def __init__(self, data, popularityRankings):
    """Build the dataset splits and similarity model used for evaluation.

    Args:
        data: Dataset object exposing ``build_full_trainset()``; it is also
            handed to ``train_test_split`` and ``LeaveOneOut.split``.
        popularityRankings: Popularity ranking data, stored unchanged as
            ``self.rankings``.
    """
    self.rankings = popularityRankings

    # Build the training set over all of the data, and its anti-test-set.
    self.fullTrainSet = data.build_full_trainset()
    self.fullAntiTestSet = self.fullTrainSet.build_anti_testset()

    # Split 75/25 into training and test data to measure accuracy.
    (self.trainSet,
     self.testSet) = train_test_split(data, test_size=.25, random_state=1)

    # Split with the "leave one out" method for top-N evaluation, then
    # build the anti-test-set of that training set for predictions.
    splitter = LeaveOneOut(n_splits=1, random_state=1)
    for fold in splitter.split(data):
        self.LOOCVTrain, self.LOOCVTest = fold
    self.LOOCVAntiTestSet = self.LOOCVTrain.build_anti_testset()

    # Compute the item similarity matrix so diversity can be measured.
    item_cosine = dict(name='cosine', user_based=False)
    self.simsAlgo = KNNBaseline(sim_options=item_cosine)
    self.simsAlgo.fit(self.fullTrainSet)
def __init__(self, data, popularity_rankings):
    """
    Initialise the data-related attributes used during evaluation.

    Parameters
    ----------
    data: DatasetAutoFolds
        Data the model is built from. Should be derived from the
        Surprise ``Dataset`` class.
    popularity_rankings: defaultdict
        A dict containing the ranking of items.
    """
    # Full training set for evaluating overall properties, and the
    # anti-test-set derived from it.
    everything = data.build_full_trainset()
    self.full_trainset = everything
    self.full_antiset = everything.build_anti_testset()

    # 75/25 train/test split for measuring accuracy.
    accuracy_split = train_test_split(data, test_size=.25, random_state=1)
    self.trainset, self.testset = accuracy_split

    # "Leave one out" train/test split for evaluating top-N recommenders,
    # with an anti-test-set for building predictions.
    splitter = LeaveOneOut(n_splits=1, random_state=1)
    for fold in splitter.split(data):
        self.loocv_train, self.loocv_test = fold
    self.loocv_anti_testset = self.loocv_train.build_anti_testset()

    self.rankings = popularity_rankings

    # Item-item cosine similarity model so diversity can be measured.
    self.sims_algo = KNNBaseline(
        sim_options={'name': 'cosine', 'user_based': False})
    self.sims_algo.fit(self.full_trainset)
def __init__(self, data, popularity_ranks, diversity=False,
             leave_one_out=False, anti_test=False):
    """Build the evaluation datasets, with the costly ones made optional.

    Args:
        data: Dataset object exposing ``build_full_trainset()``; it is also
            handed to ``train_test_split`` and ``LeaveOneOut.split``.
        popularity_ranks: Popularity ranking data, stored as
            ``self.rankings``.
        diversity: When true, fit the item-item cosine similarity model
            (``self.similarites``).
        leave_one_out: When true, build the leave-one-out train/test
            split and its anti-test-set.
        anti_test: When true, build the anti-test-set of the full
            training set (``self.full_test``).

    Attributes guarded by a flag are only set when that flag is true.
    """
    seed = 100
    self.rankings = popularity_ranks
    self.random_state = seed

    # Full training set for evaluating overall properties.
    self.full_train = data.build_full_trainset()
    if anti_test:
        self.full_test = self.full_train.build_anti_testset()

    # 75/25 train/test split for measuring accuracy.
    accuracy_split = train_test_split(data, test_size=0.25,
                                      random_state=self.random_state)
    self.train, self.test = accuracy_split

    if leave_one_out:
        # "Leave one out" split for evaluating top-N recommenders, plus
        # the anti-test-set used for building predictions.
        splitter = LeaveOneOut(n_splits=1, random_state=self.random_state)
        for fold in splitter.split(data):
            self.LOOCV_train, self.LOOCV_test = fold
        self.LOOCV_anti_test = self.LOOCV_train.build_anti_testset()

    if diversity:
        # Item-item cosine similarity model for diversity.
        self.similarites = KNNBaseline(
            sim_options={'name': 'cosine', 'user_based': False})
        self.similarites.fit(self.full_train)
def __init__(self, data, popularityRankings):
    """Create the splits and similarity model an evaluation run needs.

    Args:
        data: Dataset object exposing ``build_full_trainset()``; it is also
            handed to ``train_test_split`` and ``LeaveOneOut.split``.
        popularityRankings: Popularity ranking data, stored unchanged as
            ``self.rankings``.
    """
    self.rankings = popularityRankings

    # Full training set for evaluating overall properties.
    trainset_all = data.build_full_trainset()
    self.fullTrainSet = trainset_all
    self.fullAntiTestSet = trainset_all.build_anti_testset()

    # 75/25 train/test split for measuring accuracy.
    (self.trainSet,
     self.testSet) = train_test_split(data, test_size=.25, random_state=1)

    # "Leave one out" train/test split for evaluating top-N recommenders,
    # followed by the anti-test-set used for building predictions.
    leave_one_out = LeaveOneOut(n_splits=1, random_state=1)
    for loo_train, loo_test in leave_one_out.split(data):
        self.LOOCVTrain, self.LOOCVTest = loo_train, loo_test
    self.LOOCVAntiTestSet = self.LOOCVTrain.build_anti_testset()

    # Similarity model between items so diversity can be measured.
    similarity_model = KNNBaseline(
        sim_options={'name': 'cosine', 'user_based': False})
    self.simsAlgo = similarity_model
    similarity_model.fit(trainset_all)
def __init__(self, data):
    """Build the train/test, leave-one-out and full dataset splits.

    Args:
        data: Dataset object exposing ``build_full_trainset()``; it is also
            handed to ``train_test_split`` and ``LeaveOneOut.split``.
    """
    # 75/25 train/test split.
    self.train_set, self.test_set = train_test_split(data, test_size=0.25,
                                                     random_state=1)

    # Single leave-one-out split and the anti-test-set of its train side.
    LOOX = LeaveOneOut(1, random_state=1)
    for x_train, x_test in LOOX.split(data):
        self.LOOX_trainSet = x_train
        self.LOOX_testSet = x_test
    del x_test, x_train
    self.LOOX_anti_testSet = self.LOOX_trainSet.build_anti_testset()

    # Training set over all ratings. The original called the misspelled
    # ``buid_full_trainset`` and raised AttributeError on every
    # construction.
    self.full_trainSet = data.build_full_trainset()
    self.full_anti_testSet = self.full_trainSet.build_anti_testset()
def __init__(self, data):
    """Build the dataset splits and similarity model used for evaluation.

    Args:
        data: Dataset object exposing ``build_full_trainset()``; it is also
            handed to ``train_test_split`` and ``LeaveOneOut.split``.
    """
    # Training set over all ratings and its anti-test-set.
    complete = data.build_full_trainset()
    self.fullTrainSet = complete
    self.fullAntiTestSet = complete.build_anti_testset()

    # 75/25 train/test split.
    accuracy_split = train_test_split(data, test_size=0.25, random_state=1)
    self.trainSet, self.testSet = accuracy_split

    # Single leave-one-out split and the anti-test-set of its train side.
    for fold in LeaveOneOut(n_splits=1, random_state=1).split(data):
        self.LOOCVTrain, self.LOOCVTest = fold
    self.LOOCVAntiTestSet = self.LOOCVTrain.build_anti_testset()

    # Item-item cosine similarity model fitted on the full training set.
    self.simsAlgo = KNNBaseline(
        sim_options=dict(name='cosine', user_based=False))
    self.simsAlgo.fit(self.fullTrainSet)
def __init__(self, data, withSim=False):
    """Build the dataset splits and, optionally, an item similarity model.

    Args:
        data: Dataset object exposing ``build_full_trainset()``; it is also
            handed to ``train_test_split`` and ``LeaveOneOut.split``.
        withSim: When true, also fit an item-item cosine ``KNNBaseline``
            on the full training set and keep it as ``self.simAlgo``.
    """
    # 75/25 train/test split.
    accuracy_split = train_test_split(data, test_size=0.25, random_state=0)
    self.trainSet, self.testSet = accuracy_split

    # Single leave-one-out split and the anti-test-set of its train side.
    splitter = LeaveOneOut(1, random_state=1)
    for fold in splitter.split(data):
        self.LOOX_trainSet, self.LOOX_testSet = fold
    self.LOOX_antitestSet = self.LOOX_trainSet.build_anti_testset()

    # Training set over all ratings and its anti-test-set.
    complete = data.build_full_trainset()
    self.full_trainSet = complete
    self.full_antitestSet = complete.build_anti_testset()

    if not withSim:
        return

    self.simAlgo = KNNBaseline(
        sim_options={'name': 'cosine', 'user_based': False})
    self.simAlgo.fit(self.full_trainSet)
def __init__(self, ratingsFilePath, moviesFilePath, verbose=True):
    """Load the MovieLens data and derive every evaluation dataset.

    Args:
        ratingsFilePath: Path passed to ``MovieLens`` as the ratings file.
        moviesFilePath: Path passed to ``MovieLens`` as the movies file.
        verbose: When true, print a message before and after loading.
    """
    self.ratingsPath = ratingsFilePath
    self.moviesPath = moviesFilePath

    if verbose:
        print("\nLoading Movies and Ratings...")

    # Load the ratings and the popularity ranks.
    loader = MovieLens(self.ratingsPath, self.moviesPath)
    self.movielens = loader
    self.ratings = loader.loadMovieLensLatestSmall()
    self.popularity_rankings = loader.getPopularityRanks()

    # Full-input dataset: training set over everything, plus the
    # anti-test-set derived from it.
    self.trainset_full = self.ratings.build_full_trainset()
    self.antitestset_full = self.trainset_full.build_anti_testset()

    # 75/25 train/test split.
    percent_split = train_test_split(self.ratings, test_size=0.25,
                                     random_state=1, shuffle=True)
    self.trainset_percent_split, self.testset_percent_split = percent_split

    # "Leave one out" train/test split for evaluating top-N recommenders.
    splitter = LeaveOneOut(n_splits=1, random_state=1)
    for fold in splitter.split(self.ratings):
        self.trainset_loocv, self.testset_loocv = fold
    self.antitestset_loocv = self.trainset_loocv.build_anti_testset()

    # Item-item cosine similarity model so diversity can be measured.
    self.similarity_algorithm = KNNBaseline(
        sim_options={'name': 'cosine', 'user_based': False})
    self.similarity_algorithm.fit(self.trainset_full)

    if verbose:
        print("\nMovies and Ratings loaded\n")
def build_train_test(self, test_size=.25):
    """Build the train/test, anti-test and leave-one-out sets.

    Reads ``self.dataset`` and ``self.full_dataset``, which must already
    be set on the instance.

    Args:
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
    """
    # Train/test split used to test results.
    accuracy_split = train_test_split(self.dataset, test_size=test_size,
                                      random_state=1)
    self.train_set, self.test_set = accuracy_split

    # Anti-test-set: user and item are known, but the rating is not in
    # the trainset.
    # https://surprise.readthedocs.io/en/stable/trainset.html#surprise.Trainset.build_anti_testset
    self.anti_test_set = self.full_dataset.build_anti_testset()

    # Cross-validation iterator where each user has exactly one rating in
    # the testset. Exactly one split is requested, so take it directly.
    splitter = LeaveOneOut(n_splits=1, random_state=1)
    first_fold = next(iter(splitter.split(self.dataset)))
    self.leave_one_out_train_set, self.leave_one_out_test_set = first_fold
    self.leave_one_out_anti_test_set = (
        self.leave_one_out_train_set.build_anti_testset())

    # Item-item cosine similarity model so diversity can be measured.
    self.similarity_algorithm = KNNBaseline(
        sim_options={'name': 'cosine', 'user_based': False})
    self.similarity_algorithm.fit(self.full_dataset)
def __init__(self, data, popularityRanking):
    """Build the dataset splits and similarity model used for evaluation.

    Args:
        data: Dataset object exposing ``build_full_trainset()``; it is also
            handed to ``train_test_split`` and ``LeaveOneOut.split``.
        popularityRanking: Popularity ranking data, stored unchanged as
            ``self.ranking``.
    """
    self.ranking = popularityRanking

    # Train and anti-test sets used for prediction with the KNNBaseline
    # algorithm.
    complete = data.build_full_trainset()
    self.fullTrainingSet = complete
    self.fullAntiTestSet = complete.build_anti_testset()

    # Training (75%) / test (25%) split; random_state seeds the random
    # number generator.
    accuracy_split = train_test_split(data, test_size=0.25, random_state=1)
    self.trainset, self.testset = accuracy_split

    # Leave-one-out cross validation split for top-N recommenders.
    splitter = LeaveOneOut(n_splits=1, random_state=1)
    for fold in splitter.split(data):
        self.LOOCVTrain, self.LOOCVTest = fold

    # Anti-test-set used for predictions.
    self.LOOCVAntiTestSet = self.LOOCVTrain.build_anti_testset()

    # Item-item cosine similarity, used to measure diversity.
    self.simsAlgo = KNNBaseline(
        sim_options=dict(name='cosine', user_based=False))
    self.simsAlgo.fit(self.fullTrainingSet)
def __init__(self, df, popRankings):
    """Convert the input frame and build the evaluation dataset splits.

    Args:
        df: Input data, stored as ``self.df``; ``self._convertToSurprise()``
            is then called to produce ``self.data``.
        popRankings: Popularity ranking data, stored as ``self.rankings``.
    """
    self.df = df
    # Presumably wraps ``self.df`` in a Surprise dataset - confirm against
    # ``_convertToSurprise``.
    self.data = self._convertToSurprise()
    self.rankings = popRankings

    # Training set for the entire data, and the anti-test set for it.
    complete = self.data.build_full_trainset()
    self.fullTrainSet = complete
    self.fullAntiTestSet = complete.build_anti_testset()

    # 75/25 train/test split for measuring accuracy.
    accuracy_split = train_test_split(self.data, test_size=.25,
                                      random_state=1)
    self.trainSet, self.testSet = accuracy_split

    # "Leave one out" train/test split for evaluating top-N recommenders,
    # plus an anti-test-set for building predictions.
    splitter = LeaveOneOut(n_splits=1, random_state=1)
    for fold in splitter.split(self.data):
        self.LOOCVTrain, self.LOOCVTest = fold
    self.LOOCVAntiTestSet = self.LOOCVTrain.build_anti_testset()
# Hold out 25% of the ratings, fit SVD on the rest and score the hold-out.
trainSet, testSet = train_test_split(data, test_size=.25, random_state=1)

algo = SVD(random_state=10)
algo.fit(trainSet)

print("\nComputing recommendations...")
predictions = algo.test(testSet)

print("\nEvaluating accuracy of model...")
for metricLabel, metricName in (("RMSE: ", "RMSE"), ("MAE: ", "MAE")):
    print(metricLabel, getattr(RecommenderMetrics, metricName)(predictions))

print("\nEvaluating top-10 recommendations...")

# Set aside one rating per user for testing.
LOOCV = LeaveOneOut(n_splits=1, random_state=1)

for looFold in LOOCV.split(data):
    # Rebinds trainSet/testSet to the leave-one-out fold.
    trainSet, testSet = looFold

    print("Computing recommendations with leave-one-out...")

    # Train the model without the left-out ratings.
    algo.fit(trainSet)

    # Predict ratings for the left-out ratings only.
    print("Predict ratings for left-out set...")
    leftOutPredictions = algo.test(testSet)

    # Build predictions for all ratings not in the training set.
    print("Predict all missing ratings...")
    bigTestSet = trainSet.build_anti_testset()
    allPredictions = algo.test(bigTestSet)
# Cross validation and then prediction # define a cross-validation iterator kf = KFold(n_splits=5, random_state=22) print("\nKFold Cross Validation") for trainset, testset in kf.split(data): # train and test algorithm. algo.fit(trainset) predictions = algo.test(testset) # Compute and print Root Mean Squared Error accuracy.rmse( predictions, verbose=True) # cross validation also gives around 87% accuracy loo = LeaveOneOut(n_splits=5, random_state=22) print("\nLeave One Out Cross Validation") for trainset, testset in loo.split(data): # train and test algorithm. algo.fit(trainset) predictions = algo.test(testset) # Compute and print Root Mean Squared Error accuracy.rmse(predictions, verbose=True) # to know which parameter combination yields the best results, the GridSearchCV # use GridSearchCV scheme param_grid = { 'n_epochs': [5, 10], 'lr_all': [0.002, 0.005], 'reg_all': [0.4, 0.6]
def test_LeaveOneOut(toy_data):
    """Check LeaveOneOut's failure modes, seeding and per-user guarantees."""

    # Each user only has 1 item in the toy data, so trainsets fail.
    with pytest.raises(ValueError):
        next(LeaveOneOut().split(toy_data))

    reader = Reader('ml-100k')
    here = os.path.dirname(os.path.realpath(__file__))
    data_path = here + '/u1_ml100k_test'
    data = Dataset.load_from_file(file_path=data_path, reader=reader,
                                  rating_scale=(1, 5))

    def collect_testsets(cv):
        """Return the testset of every fold produced by ``cv`` on data."""
        return [testset for (_, testset) in cv.split(data)]

    # Test the random_state parameter.
    # With random_state=None, each call gives a different split
    # (conditioned by the rng, of course).
    unseeded = LeaveOneOut(random_state=None)
    first_run = collect_testsets(unseeded)
    second_run = collect_testsets(unseeded)
    assert first_run != second_run

    # With random_state set, repeated calls to split give the same folds.
    seeded = LeaveOneOut(random_state=1)
    first_run = collect_testsets(seeded)
    second_run = collect_testsets(seeded)
    assert first_run == second_run

    # Only one rating per user may be present in the testset.
    for _, testset in LeaveOneOut().split(data):
        per_user = Counter([uid for (uid, _, _) in testset])
        assert all(count == 1 for count in itervalues(per_user))

    # Test the min_n_ratings parameter.
    for minimum in (5, 10):
        for trainset, _ in LeaveOneOut(min_n_ratings=minimum).split(data):
            assert all(len(ratings) >= minimum
                       for ratings in itervalues(trainset.ur))

    # A threshold that is too high leaves nothing to train on.
    with pytest.raises(ValueError):
        next(LeaveOneOut(min_n_ratings=10000).split(data))
def test_LeaveOneOut(toy_data):
    """Exercise LeaveOneOut: errors, random_state and min_n_ratings."""

    # Each user only has 1 item in the toy data, so trainsets fail.
    toy_splitter = LeaveOneOut()
    with pytest.raises(ValueError):
        next(toy_splitter.split(toy_data))

    reader = Reader('ml-100k')
    test_dir = os.path.dirname(os.path.realpath(__file__))
    data = Dataset.load_from_file(file_path=test_dir + '/u1_ml100k_test',
                                  reader=reader, rating_scale=(1, 5))

    # Test the random_state parameter.
    # With random_state=None, each call gives a different split
    # (conditioned by the rng, of course).
    splitter = LeaveOneOut(random_state=None)
    run_a = [fold[1] for fold in splitter.split(data)]
    run_b = [fold[1] for fold in splitter.split(data)]
    assert run_a != run_b

    # With random_state set, repeated calls to split give the same folds.
    splitter = LeaveOneOut(random_state=1)
    run_a = [fold[1] for fold in splitter.split(data)]
    run_b = [fold[1] for fold in splitter.split(data)]
    assert run_a == run_b

    # Only one rating per user may be present in the testset.
    splitter = LeaveOneOut()
    for fold in splitter.split(data):
        user_counts = Counter([entry[0] for entry in fold[1]])
        for occurrences in itervalues(user_counts):
            assert occurrences == 1

    # Test the min_n_ratings parameter.
    splitter = LeaveOneOut(min_n_ratings=5)
    for fold in splitter.split(data):
        for ratings in itervalues(fold[0].ur):
            assert len(ratings) >= 5

    splitter = LeaveOneOut(min_n_ratings=10)
    for fold in splitter.split(data):
        for ratings in itervalues(fold[0].ur):
            assert len(ratings) >= 10

    # Too high a threshold leaves nothing to train on.
    splitter = LeaveOneOut(min_n_ratings=10000)
    with pytest.raises(ValueError):
        next(splitter.split(data))
class DataHandler:
    """Load the MovieLens ratings and expose the datasets built from them."""

    rating = './ml-latest-small/ratings.csv'
    movies = './ml-latest-small/movies.csv'
    # for testing purpose
    # rating = './test-data/ratings.csv'
    # movies = './test-data/movies.csv'

    def LoadRating(self):
        """Load the rating data -- the main dataset.

        Return: the dataset read from ``self.rating``.
        """
        reader = Reader(line_format='user item rating timestamp', sep=',',
                        skip_lines=1)
        return Dataset.load_from_file(self.rating, reader=reader)

    def loadPopularityData(self):
        """Load the popularity data.

        Counts how many rating rows each movie id (second CSV column) has
        in ``self.rating`` and ranks the movies by that count, most rated
        first. Ties keep first-seen order because the sort is stable.

        Return: a ``defaultdict(int)`` mapping movie id -> rank, where
        rank 1 is the most rated movie.
        """
        # defaultdict(int) is similar to getOrDefault in Java.
        ratingTimes = defaultdict(int)
        rankings = defaultdict(int)
        with open(self.rating, newline='') as csvfile:
            reader = csv.reader(csvfile)
            next(reader)  # skip the header row
            for row in reader:
                ratingTimes[int(row[1])] += 1
        by_count = sorted(ratingTimes.items(), key=lambda x: x[1],
                          reverse=True)
        for rank, (movieID, _count) in enumerate(by_count, start=1):
            rankings[movieID] = rank
        return rankings

    def getEvaluation(self):
        """Getter for evaluation data.

        Return: the full dataset
        """
        return self.fulldata

    def getRank(self):
        """Getter for the rank data.

        Return: the popularity data set
        """
        return self.popularitydata

    def __init__(self):
        """Build the different datasets prepared for fitting the model."""
        # Full dataset and popularity rankings.
        self.fulldata = self.LoadRating()
        self.popularitydata = self.loadPopularityData()

        # Training set over all ratings, its anti-test-set, and the test
        # set rebuilt from the same training set.
        self.fullTrainData = self.fulldata.build_full_trainset()
        self.fullAntiTestData = self.fullTrainData.build_anti_testset()
        self.fullTestData = self.fullTrainData.build_testset()

        # 80% train data and 20% test data. No random_state is given, so
        # the split is not reproducible between runs.
        self.traindata, self.testdata = train_test_split(self.fulldata,
                                                         test_size=0.2)

        # Leave-one-out split. Only one split is ever kept, so request a
        # single split instead of the library default of five (the other
        # four were built and immediately discarded).
        self.LOO_Data = LeaveOneOut(n_splits=1)
        for train, test in self.LOO_Data.split(self.fulldata):
            self.LOO_Train = train
            self.LOO_Test = test
        self.LOOAntiTest = self.LOO_Train.build_anti_testset()

        # Alias of the popularity data.
        self.rank = self.popularitydata

        # Item-item cosine similarities, used for diversity.
        sim_options = {'name': 'cosine', 'user_based': False}
        self.sim_matrix = KNNBaseline(sim_options=sim_options)
        self.sim_matrix.fit(self.fullTrainData)

    # ---- Getters for the different datasets ----

    def GetFullTrainData(self):
        """Return the training set built from all ratings."""
        return self.fullTrainData

    def GetAntiTestData(self):
        """Return the anti-test-set of the full training set."""
        return self.fullAntiTestData

    def GetAntiUserTestData(self, userId):
        """Build the anti-test-set of the full training set for one user.

        Same logic as ``build_anti_testset`` but restricted to the given
        user: one ``(raw user id, raw item id, global mean)`` tuple for
        every item the user has no rating for in the full training set.
        Raw ids are recorded because the data is looked up in pandas
        later.
        """
        trainset = self.fullTrainData
        fill = trainset.global_mean
        # Inside the trainset everything is keyed by inner ids.
        inner_uid = trainset.to_inner_uid(str(userId))
        rated_items = {item for (item, _rating) in trainset.ur[inner_uid]}
        raw_uid = trainset.to_raw_uid(inner_uid)
        return [(raw_uid, trainset.to_raw_iid(item), fill)
                for item in trainset.all_items()
                if item not in rated_items]

    def GetFullTestData(self):
        """Return the test set built from the full training set."""
        return self.fullTestData

    def GetTrainData(self):
        """Return the train part of the 80/20 split."""
        return self.traindata

    def GetTestData(self):
        """Return the test part of the 80/20 split."""
        return self.testdata

    def GetLOOTrain(self):
        """Return the leave-one-out training set."""
        return self.LOO_Train

    def GetLOOTest(self):
        """Return the leave-one-out test set."""
        return self.LOO_Test

    def GetLOOAntiTestSet(self):
        """Return the anti-test-set of the leave-one-out training set."""
        return self.LOOAntiTest

    def GetPopularRankings(self):
        """Return the popularity rankings."""
        return self.rank

    def GetSimilarities(self):
        """Return the fitted item-item similarity model."""
        return self.sim_matrix
def test_LeaveOneOut():
    """Check LeaveOneOut's failure mode, seeding and per-user guarantee."""
    test_dir = os.path.dirname(os.path.realpath(__file__))

    reader = Reader(line_format='user item rating', sep=' ', skip_lines=3,
                    rating_scale=(1, 5))
    custom_dataset_path = test_dir + '/custom_dataset'
    data = Dataset.load_from_file(file_path=custom_dataset_path,
                                  reader=reader)

    # Each user only has 1 item so trainsets fail.
    with pytest.raises(ValueError):
        next(LeaveOneOut().split(data))

    reader = Reader('ml-100k')
    custom_dataset_path = test_dir + '/u1_ml100k_test'
    data = Dataset.load_from_file(file_path=custom_dataset_path,
                                  reader=reader)

    def collect_testsets(cv):
        """Return the testset of every fold produced by ``cv`` on data."""
        return [testset for (_, testset) in cv.split(data)]

    # Test the random_state parameter.
    # With random_state=None, each call gives a different split
    # (conditioned by the rng, of course).
    unseeded = LeaveOneOut(random_state=None)
    first_run = collect_testsets(unseeded)
    second_run = collect_testsets(unseeded)
    assert first_run != second_run

    # With random_state set, repeated calls to split give the same folds.
    seeded = LeaveOneOut(random_state=1)
    first_run = collect_testsets(seeded)
    second_run = collect_testsets(seeded)
    assert first_run == second_run

    # Only one rating per user may be present in the testset.
    for _, testset in LeaveOneOut().split(data):
        per_user = Counter([uid for (uid, _, _) in testset])
        assert all(count == 1 for count in itervalues(per_user))