def main():
    """Run the hybrid recommender and write blended predictions to disk.

    Pipeline, in order:

    1. Load ``./data/train.csv`` and ``./data/test.csv`` through ``readFile``,
       fit a Surprise ``SVD`` model and obtain collaborative-filtering (cf)
       predictions from ``cf_predict``.
    2. Fill the lookup tables used by the content-based predictor:
       ``user_movie_rating`` (built here from the train file) and
       ``movie_genres`` / ``movie_directors`` / ``movie_actors`` /
       ``movie_tags`` (filled by ``fill_content_dict``). These tables are
       defined outside this function.
    3. Call ``contentBased_predict`` once for every row of the test file.
    4. Write the arithmetic mean of the two predictions, one value per line,
       to ``./data/res.data``.

    Progress and timings are printed to stdout. Returns ``None``.
    """
    timeStart = time()

    # ---- collaborative filtering -------------------------------------
    print('Pre-processing cf data...')
    train_frame = readFile(
        './data/train.csv',
        separator=' ',
        columns=['userID', 'movieID', 'rating'],
        types={'userID': np.int32, 'movieID': np.int32, 'rating': np.float32},
    )
    # Drawing len(train_frame) rows reorders the training data (a shuffle,
    # assuming the frame's sample() draws without replacement by default).
    train_frame = train_frame.sample(n=len(train_frame))
    test_data = readFile(
        './data/test.csv',
        separator=' ',
        columns=['userID', 'movieID'],
        types={'userID': np.int32, 'movieID': np.int32},
    )

    reader = Reader(rating_scale=(0, 5))
    dataset = Dataset.load_from_df(train_frame, reader)
    model = SVD(n_factors=5, n_epochs=50)
    trainset = dataset.build_full_trainset()
    cf_predictions = cf_predict(trainset, test_data, model)
    print('\ncf train-predict costs (%d seconds)' % (time() - timeStart))

    # ---- content-based lookup tables ---------------------------------
    # user_movie_rating maps userID -> [(movieID, rating), ...].
    with open("./data/train.csv", "r") as train_file:
        next(train_file, None)  # the first line is skipped (presumably a header)
        for row in train_file:
            tokens = word_tokenize(row)
            user_movie_rating.setdefault(int(tokens[0]), []).append(
                (int(tokens[1]), float(tokens[2])))

    fill_content_dict("./data/additional_files/movie_genres.data", movie_genres)
    fill_content_dict("./data/additional_files/movie_directors.data", movie_directors)
    fill_content_dict("./data/additional_files/movie_actors.data", movie_actors)
    fill_content_dict("./data/additional_files/movie_tags.data", movie_tags)
    print("Done! (%d seconds)\n" % (time() - timeStart))

    # ---- content-based predictions -----------------------------------
    print("Predicting...")
    timePred = time()
    content_predictions = []
    with open("./data/test.csv", "r") as testFile:
        next(testFile, None)  # the first line is skipped (presumably a header)
        for row in testFile:
            tokens = word_tokenize(row)
            # Predict exactly once per row; the earlier version computed each
            # prediction twice and discarded the first result.
            content_predictions.append(
                contentBased_predict(int(tokens[0]), int(tokens[1])))
    print("Done! (%d seconds)\n" % (time() - timePred))

    # ---- blend and write ---------------------------------------------
    print("Writing to file...")
    with open("./data/res.data", 'w') as predFile:
        # zip() stops at the shorter sequence, so both prediction lists are
        # expected to be aligned row-for-row with the test file.
        for p1, p2 in zip(content_predictions, cf_predictions):
            predFile.write("%f\n" % ((float(p1) + float(p2)) / 2))
def main():
    """Train the cf model, build content tables, predict, and write results.

    Reads ``./data/train.csv`` and ``./data/test.csv``, obtains one
    collaborative-filtering prediction (via ``cf_predict``) and one
    content-based prediction (via ``contentBased_predict``) per test row,
    and writes their average, one value per line, to ``./data/res.data``.

    The dictionaries ``user_movie_rating``, ``movie_genres``,
    ``movie_directors``, ``movie_actors`` and ``movie_tags`` are defined
    outside this function and are filled in as a side effect. Timing
    information is printed to stdout. Returns ``None``.
    """
    timeStart = time()

    # preprocess data
    print('Pre-processing cf data...')

    # Read train (shuffled) and test data as DataFrames
    train_data = readFile('./data/train.csv',
                          separator=' ',
                          columns=['userID', 'movieID', 'rating'],
                          types={
                              'userID': np.int32,
                              'movieID': np.int32,
                              'rating': np.float32
                          })
    train_data = train_data.sample(n=len(train_data))
    test_data = readFile('./data/test.csv',
                         separator=' ',
                         columns=['userID', 'movieID'],
                         types={
                             'userID': np.int32,
                             'movieID': np.int32
                         })

    # Build the train data as a Surprise Dataset object
    reader = Reader(rating_scale=(0, 5))  # ratings are declared to lie in [0, 5]
    train_data = Dataset.load_from_df(train_data, reader)
    model = SVD(n_factors=5, n_epochs=50)

    # Build a Trainset object to feed into the prediction algorithm.
    train_data = train_data.build_full_trainset()

    # Predict ratings for each user and associated movie
    cf_predictions = cf_predict(train_data, test_data, model)
    print('\ncf train-predict costs (%d seconds)' % (time() - timeStart))

    # create dict {userID: [(movieID1, rating1), (movieID2, rating2), ...]}
    # only includes ratings for movies that the user has seen
    with open("./data/train.csv", "r") as train_file:
        rows = iter(train_file)
        next(rows, None)  # skip the first line, as the original slice [1:] did
        for row in rows:
            tokens = word_tokenize(row)
            user_id = int(tokens[0])  # parse once instead of up to three times
            if user_movie_rating.get(user_id) is None:
                user_movie_rating[user_id] = []
            user_movie_rating[user_id].append(
                (int(tokens[1]), float(tokens[2])))

    # create dict {movieID: [genre1, genre2, ...]}
    fill_content_dict("./data/additional_files/movie_genres.data",
                      movie_genres)
    # create dict {movieID: director}
    fill_content_dict("./data/additional_files/movie_directors.data",
                      movie_directors)
    # create dict {movieID: [actor1, actor2, ...]}
    fill_content_dict("./data/additional_files/movie_actors.data",
                      movie_actors)
    # create dict {movieID: [tag1, tag2, ...]}
    fill_content_dict("./data/additional_files/movie_tags.data", movie_tags)
    print("Done! (%d seconds)\n" % (time() - timeStart))

    # predict ratings
    print("Predicting...")
    timePred = time()
    content_predictions = []
    with open("./data/test.csv", "r") as testFile:
        rows = iter(testFile)
        next(rows, None)  # skip the first line, as the original slice [1:] did
        for row in rows:
            tokens = word_tokenize(row)
            content_predictions.append(
                contentBased_predict(int(tokens[0]), int(tokens[1])))
    print("Done! (%d seconds)\n" % (time() - timePred))

    # write results to file: each line is the mean of the content-based and
    # cf predictions for the same test row (zip truncates to the shorter list)
    print("Writing to file...")
    with open("./data/res.data", 'w') as predFile:
        for p1, p2 in zip(content_predictions, cf_predictions):
            predFile.write("%f\n" % ((float(p1) + float(p2)) / 2))