def example():
    """Train a Bayesian Matrix Factorization model on MovieLens 100k and time it.

    Loads the ratings, splits them 90/10 into training and validation sets,
    fits the model, prints the elapsed wall-clock time, and returns the
    trained model.

    Returns:
        BayesianMatrixFactorization: the model after ``load`` has run.
    """
    # reviews = movielens_extractor.get_ml_1m_dataset()
    reviews = movielens_extractor.get_ml_100K_dataset()
    ratings = movielens_extractor.reviews_to_numpy_matrix(reviews)

    # Fix the RNG seed so repeated runs are reproducible.
    np.random.seed(0)
    np.set_printoptions(precision=16)
    # np.random.shuffle(ratings)

    # Split the data into training and validation sets (90% / 10%).
    train_pct = 0.9
    train_size = int(train_pct * len(ratings))
    train = ratings[:train_size]
    validation = ratings[train_size:]

    bmf_model = BayesianMatrixFactorization()

    # time.clock() was deprecated since Python 3.3 and removed in 3.8;
    # time.time() matches the rest of this module's timing code.
    start_time = time.time()
    bmf_model.load(ratings, train, validation)
    end_time = time.time()
    print("time spent = %.3f" % (end_time - start_time))

    return bmf_model
def generate_report_ml100k():
    """Build the dataset-analysis notebook report for MovieLens 100k."""
    dataset_name = 'MovieLens 100k'
    file_name = (
        '/Users/fpena/UCC/Thesis/projects/yelp/notebooks/'
        'dataset_analysis_report_ml100k.ipynb'
    )
    # Code snippet embedded in the generated notebook so it can reload
    # the same dataset on its own.
    load_reviews_code = (
        'from tripadvisor.fourcity import movielens_extractor\n'
        'reviews = movielens_extractor.get_ml_100K_dataset()'
    )
    reviews = movielens_extractor.get_ml_100K_dataset()
    ReviewsDatasetAnalyzerReport.generate_report(
        reviews, dataset_name, file_name, load_reviews_code)
def test():
    """Smoke-test matrix factorization on the MovieLens 100k rating matrix.

    Builds the dense user-item rating matrix, factorizes it into two
    rank-K latent matrices, and computes one example prediction.
    """
    # NOTE: the original version first assigned a small hard-coded matrix
    # to R and then immediately overwrote it with the real data; the dead
    # literal has been removed.
    my_reviews = movielens_extractor.clean_reviews(
        movielens_extractor.get_ml_100K_dataset())
    R = np.array(create_matrix(my_reviews).todense())

    N = len(R)     # number of users (rows)
    M = len(R[0])  # number of items (columns)
    K = 2          # number of latent features

    # Random initialization of the latent factor matrices.
    P = np.random.rand(N, K)
    Q = np.random.rand(M, K)

    nP, nQ = matrix_factorization(R, P, Q, K)

    # Example prediction: user 0's estimated rating for item 4.
    predicted = np.dot(nP[0], nQ[4])
    # print('Predicted', predicted)
__author__ = 'fpena' start_time = time.time() # main() # file_path = '/Users/fpena/tmp/filtered_reviews_multi.json' # file_path = '/Users/fpena/tmp/filtered_reviews_multi_new.json' # file_path = '/Users/fpena/tmp/filtered_reviews_multi_non_sparse_shuffled.json' # file_path = '/Users/fpena/UCC/Thesis/datasets/TripHotelReviewXml/all_reviews.json' file_path = '/Users/fpena/UCC/Thesis/datasets/TripHotelReviewXml/cleaned_reviews.json' # reviews = ETLUtils.load_json_file(file_path) # reviews = movielens_extractor.clean_reviews(movielens_extractor.get_ml_100K_dataset()) # reviews = extractor.pre_process_reviews() reviews = movielens_extractor.get_ml_100K_dataset() # shuffle(reviews) # ETLUtils.save_json_file('/Users/fpena/tmp/filtered_reviews_multi_non_sparse_shuffled.json', reviews) # print(reviews[0]) # print(reviews[1]) # print(reviews[2]) # print(reviews[10]) # print(reviews[100]) # # for review in reviews: # print(review) my_recommender_list = [ # SingleCF(),
from etl.reviews_dataset_analyzer import ReviewsDatasetAnalyzer from tripadvisor.fourcity.recommender_evaluator import evaluate_recommenders __author__ = 'fpena' start_time = time.time() # main() # file_path = '/Users/fpena/tmp/filtered_reviews_multi.json' # file_path = '/Users/fpena/tmp/filtered_reviews_multi_new.json' # file_path = '/Users/fpena/tmp/filtered_reviews_multi_non_sparse_shuffled.json' # file_path = '/Users/fpena/UCC/Thesis/datasets/TripHotelReviewXml/all_reviews.json' file_path = '/Users/fpena/UCC/Thesis/datasets/TripHotelReviewXml/cleaned_reviews.json' # reviews = ETLUtils.load_json_file(file_path) # reviews = movielens_extractor.clean_reviews(movielens_extractor.get_ml_100K_dataset()) # reviews = extractor.pre_process_reviews() reviews = movielens_extractor.get_ml_100K_dataset() # shuffle(reviews) # ETLUtils.save_json_file('/Users/fpena/tmp/filtered_reviews_multi_non_sparse_shuffled.json', reviews) # print(reviews[0]) # print(reviews[1]) # print(reviews[2]) # print(reviews[10]) # print(reviews[100]) # # for review in reviews: # print(review) my_recommender_list = [ # SingleCF(), # AdjustedWeightedSumRecommender(SingleSimilarityMatrixBuilder('euclidean')),