def run_top_n_test(records_file, recommenders, binary_reviews_file, reviews_type=None):
    """Evaluate each recommender's top-N recall and save the results to CSV.

    Loads the review records from ``records_file`` and the cached binary
    reviews from the pickle at ``binary_reviews_file``, runs
    ``precision_in_top_n.calculate_recall_in_top_n`` once per recommender,
    and writes one tab-separated CSV row per recommender.

    :param records_file: path to the file with the review records
    :param recommenders: list of recommender objects to evaluate
    :param binary_reviews_file: path to a pickle of cached reviews; must
        contain exactly one entry per record
    :param reviews_type: optional review-type filter forwarded to the
        evaluation routine
    :raises ValueError: if the records and cached reviews differ in length
    """
    records = load_records(records_file)
    # records = extractor.remove_users_with_low_reviews(records, 2)
    with open(binary_reviews_file, 'rb') as read_file:
        binary_reviews = pickle.load(read_file)

    if len(records) != len(binary_reviews):
        raise ValueError("The records and reviews should have the same length")

    # Evaluation hyper-parameters.
    num_folds = 5
    split = 0.986
    min_like_score = 5.0
    top_n = 10

    # Metadata recorded alongside every result row.
    dataset_info_map = {
        'dataset': records_file.split('/')[-1],
        'cache_reviews': binary_reviews_file.split('/')[-1],
        'num_records': len(records),
        'reviews_type': reviews_type,
        'cross_validation_folds': num_folds,
        'min_like_score': min_like_score,
        'top_n': top_n,
    }

    results_list = []
    results_log_list = []
    print('Total recommenders: %d' % len(recommenders))

    for count, recommender in enumerate(recommenders):
        print('\n**************\nProgress: %d/%d\n**************'
              % (count, len(recommenders)))
        print(get_knn_recommender_info(recommender))
        results = precision_in_top_n.calculate_recall_in_top_n(
            records, recommender, top_n, num_folds, split, min_like_score,
            binary_reviews, reviews_type)
        results_list.append(results)
        # Rough ETA: assume every remaining recommender takes as long as
        # this one did.
        remaining_time = results['Execution time'] * (len(recommenders) - count)
        remaining_time /= 3600
        print('Estimated remaining time: %.2f hours' % remaining_time)

    for recommender, results in zip(recommenders, results_list):
        results_log_list.append(
            process_topn_results(recommender, results, dataset_info_map))

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    file_name = 'recommender-topn-results' + timestamp
    ETLUtils.save_csv_file(
        file_name + '.csv', results_log_list, TOPN_HEADERS, '\t')
def test_calculate_recall_in_top_n(self):
    """Recall in the top 2, over 2 folds of reviews_matrix_5, is 0.875."""
    item_average = ItemAverageRecommender()
    item_average.load(reviews_matrix_5)
    results = precision_in_top_n.calculate_recall_in_top_n(
        reviews_matrix_5, item_average, 2, 2, None, 4.0)
    self.assertEqual(0.875, results['Top N'])
def test_calculate_recall_in_top_n(self):
    """The item-average recommender reaches 0.875 recall in the top 2."""
    recommender_under_test = ItemAverageRecommender()
    recommender_under_test.load(reviews_matrix_5)
    top_n_results = precision_in_top_n.calculate_recall_in_top_n(
        reviews_matrix_5, recommender_under_test, 2, 2, None, 4.0)
    actual_value = top_n_results['Top N']
    self.assertEqual(0.875, actual_value)
def run_top_n_test(
        records_file, recommenders, binary_reviews_file, reviews_type=None):
    """Run a top-N recall evaluation for every recommender and log to CSV.

    Records come from ``records_file``; the cached binary reviews are
    unpickled from ``binary_reviews_file``. Each recommender is scored with
    ``precision_in_top_n.calculate_recall_in_top_n`` and one result row per
    recommender is written to a timestamped CSV file.

    :param records_file: path to the review-records file
    :param recommenders: list of recommender objects to evaluate
    :param binary_reviews_file: path to a pickle of cached reviews; one
        entry per record is required
    :param reviews_type: optional review-type filter passed through to the
        evaluation
    :raises ValueError: when records and cached reviews have different lengths
    """
    records = load_records(records_file)
    # records = extractor.remove_users_with_low_reviews(records, 2)
    with open(binary_reviews_file, 'rb') as read_file:
        binary_reviews = pickle.load(read_file)

    if len(records) != len(binary_reviews):
        raise ValueError("The records and reviews should have the same length")

    # Evaluation hyper-parameters.
    num_folds = 5
    split = 0.986
    min_like_score = 5.0
    top_n = 10

    # Metadata stored with every result row.
    dataset_info_map = {
        'dataset': records_file.split('/')[-1],
        'cache_reviews': binary_reviews_file.split('/')[-1],
        'num_records': len(records),
        'reviews_type': reviews_type,
        'cross_validation_folds': num_folds,
        'min_like_score': min_like_score,
        'top_n': top_n,
    }

    results_list = []
    results_log_list = []
    print('Total recommenders: %d' % len(recommenders))

    for count, recommender in enumerate(recommenders):
        print('\n**************\nProgress: %d/%d\n**************'
              % (count, len(recommenders)))
        print(get_knn_recommender_info(recommender))
        results = precision_in_top_n.calculate_recall_in_top_n(
            records, recommender, top_n, num_folds, split, min_like_score,
            binary_reviews, reviews_type)
        results_list.append(results)
        # Estimate remaining time assuming each remaining recommender costs
        # as much as the one just evaluated.
        remaining_time = results['Execution time'] * (len(recommenders) - count)
        remaining_time /= 3600
        print('Estimated remaining time: %.2f hours' % remaining_time)

    for recommender, results in zip(recommenders, results_list):
        results_log_list.append(
            process_topn_results(recommender, results, dataset_info_map))

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    file_name = 'recommender-topn-results' + timestamp
    ETLUtils.save_csv_file(
        file_name + '.csv', results_log_list, TOPN_HEADERS, '\t')
def run_topn_test_wrapper(args):
    """Unpack ``args`` and run the top-N recall calculation.

    Intended as a worker-pool target: the traceback of any exception is
    printed here (worker exceptions can otherwise surface without a useful
    stack trace) before re-raising to the caller.

    :param args: tuple of positional arguments for
        ``precision_in_top_n.calculate_recall_in_top_n``
    :return: the result of ``calculate_recall_in_top_n``
    """
    try:
        return precision_in_top_n.calculate_recall_in_top_n(*args)
    except Exception:
        print('Caught exception in worker thread')
        # This prints the type, value, and stack trace of the
        # current exception being handled.
        traceback.print_exc()
        print()
        # Bare 'raise' re-raises with the original traceback intact;
        # 'raise e' would replace/truncate it.
        raise
def run_topn_test_wrapper(args):
    """Call ``calculate_recall_in_top_n`` with the unpacked ``args`` tuple.

    Designed for use as a multiprocessing/thread-pool worker: exceptions are
    printed with their full stack trace before being propagated, so failures
    inside the pool are still diagnosable.

    :param args: positional arguments for
        ``precision_in_top_n.calculate_recall_in_top_n``, as a tuple
    :return: the evaluation result dictionary
    """
    try:
        return precision_in_top_n.calculate_recall_in_top_n(*args)
    except Exception:
        print('Caught exception in worker thread')
        # This prints the type, value, and stack trace of the
        # current exception being handled.
        traceback.print_exc()
        print()
        # Re-raise with a bare 'raise' so the original traceback is
        # preserved ('raise e' would mangle it).
        raise
def main():
    """Train a BasicKNN recommender on 80% of the hotel reviews and report
    its recall in the top 10.

    Loads the Yelp hotel reviews, filters low-activity users, shuffles,
    takes an 80/20 train split, fits ``BasicKNN`` on the training portion
    and runs the top-N recall evaluation.
    """
    reviews_file = \
        "/Users/fpena/tmp/yelp_training_set/yelp_training_set_review_hotels.json"
    my_records = load_data(reviews_file)
    # NOTE(review): the ratings matrix is built but never used below —
    # kept in case create_ratings_matrix has side effects; confirm and
    # remove if it is pure.
    my_ratings_matrix = create_ratings_matrix(my_records)
    my_records = extractor.remove_users_with_low_reviews(my_records, 1)
    print(len(my_records))
    # print(len(my_ratings_matrix))
    # basic_knn = BasicKNN(1)
    # basic_knn.load(my_records)
    # for record in my_records:
    #     print(basic_knn.predict_rating(record['user_id'], record['offering_id']))
    # print(basic_knn.predict_rating('qLCpuCWCyPb4G2vN-WZz-Q', '8ZwO9VuLDWJOXmtAdc7LXQ'))  # 4
    # print(basic_knn.predict_rating('rVlgz-MGYRPa8UzTYO0RGQ', 'c0iszTWZwYtO3TgBx0Z0fQ'))  # 2
    # print(basic_knn.predict_rating('4o7r-QSYhOkxpxRMqpXcCg', 'EcHuaHD9IcoPEWNsU8vDTw'))  # 4
    # print(basic_knn.predict_rating('msgAEWFbD4df0EvyOR3TnQ', 'EcHuaHD9IcoPEWNsU8vDTw'))  # 5

    shuffle(my_records)

    # Split 80-20 and see the results
    num_records = len(my_records)
    training_size = int(num_records * 0.8)
    my_train_data = my_records[:training_size]
    basic_knn = BasicKNN(None)
    basic_knn.load(my_train_data)
    # basic_knn.load(my_records)

    # recommender_evaluator.perform_cross_validation(my_records, basic_knn, 3)
    # precision_in_top_n.calculate_top_n_precision(my_records, basic_knn, 10000, 5.0, 5)
    # NOTE(review): evaluation runs over the FULL dataset even though the
    # model was trained on the 80% split — presumably intentional for this
    # experiment, but verify it is not leaking training data.
    precision_in_top_n.calculate_recall_in_top_n(my_records, basic_knn, 10, 5)
def main():
    """Compare several contextual KNN recommender variants on the shuffled
    Yelp hotel reviews.

    Loads the records and a pickle of pre-processed (cached) reviews, builds
    a set of ContextualKNN variants from different neighbourhood / baseline /
    similarity calculators, then runs cross-validation and top-N recall for
    each variant, printing a header before each run.
    """
    # Alternative datasets, kept for quick switching between experiments.
    # reviews_file = "/Users/fpena/tmp/yelp_training_set/yelp_training_set_review_hotels.json"
    reviews_file = "/Users/fpena/UCC/Thesis/datasets/context/yelp_training_set_review_hotels_shuffled.json"
    # reviews_file = "/Users/fpena/UCC/Thesis/datasets/context/yelp_training_set_review_restaurants_shuffled.json"
    # my_records = context_utils.load_reviews(reviews_file)
    my_records = load_data(reviews_file)
    print("records:", len(my_records))

    # Number of topics used by the context-aware recommenders.
    my_num_topics = 150

    print("\n***************************\n")
    # Disabled preprocessing: filtering low-activity users, shuffling, and
    # (re)building + pickling the Review cache.
    # my_records = load_data(reviews_file)
    # my_records = extractor.remove_users_with_low_reviews(my_records, 200)
    # my_records = extractor.remove_users_with_low_reviews(my_records, 2)
    # shuffle(my_records)
    # my_index = 0
    # my_reviews = []
    # for record in my_records:
    #     my_index += 1
    #     my_reviews.append(Review(record))
    #     print('index', my_index)

    # Pickle with the cached pre-processed reviews (alternatives commented).
    # my_file = '/Users/fpena/UCC/Thesis/datasets/context/reviews_context_restaurants_200.pkl'
    # my_file = '/Users/fpena/UCC/Thesis/datasets/context/reviews_context_hotel_2.pkl'
    my_file = '/Users/fpena/tmp/reviews_hotel_shuffled.pkl'
    # my_file = '/Users/fpena/tmp/reviews_restaurant_shuffled.pkl'
    # with open(my_file, 'wb') as write_file:
    #     pickle.dump(my_reviews, write_file, pickle.HIGHEST_PROTOCOL)
    with open(my_file, 'rb') as read_file:
        my_cache_reviews = pickle.load(read_file)
    print("reviews:", len(my_cache_reviews))

    context_knn = ContextKnn(my_num_topics)

    # Context-aware calculator components.
    # NOTE(review): tknc, snc and chnc are constructed but not used below —
    # presumably left over from other experiment configurations.
    tknc = TopKNeighbourhoodCalculator()
    nc = ContextNeighbourhoodCalculator()
    ncc = NeighbourContributionCalculator()
    ubc = UserBaselineCalculator()
    usc = PBCSimilarityCalculator()
    # contextual_knn2 = ContextualKNN(my_num_topics, tknc, ncc, ubc, usc, my_reviews)

    # Context-free ("basic") calculator components.
    bnc = BasicNeighbourhoodCalculator()
    bncc = BasicNeighbourContributionCalculator()
    bubc = BasicUserBaselineCalculator()
    busc = BasicUserSimilarityCalculator()
    snc = SimpleNeighbourhoodCalculator()
    chnc = ContextHybridNeighbourhoodCalculator()

    # Variant 1: fully contextual pipeline.
    contextual_knn = ContextualKNN(my_num_topics, nc, ncc, ubc, usc, has_context=True)
    # Variant 2: contextual pipeline with the basic similarity calculator.
    contextual_knn2 = ContextualKNN(my_num_topics, nc, ncc, ubc, busc, has_context=True)
    # Variant 3: all-basic calculators (no context).
    contextual_knn3 = ContextualKNN(my_num_topics, bnc, bncc, bubc, busc)
    # basic_contextual_knn = BasicContextualKNN(my_num_topics, bnc, bncc, bubc, busc)
    # contextual_knn.threshold1 = 0.8
    # contextual_knn.threshold2 = 0.8
    # contextual_knn.threshold3 = 0.8
    # contextual_knn.threshold4 = 0.8
    # print('Context KNN')
    # context_knn.load(my_records)
    # recommender_evaluator.perform_cross_validation(my_records, context_knn, 5, True)

    basic_knn_rec = BasicKNN(None)
    # print('Basic KNN')
    # recommender_evaluator.perform_cross_validation(my_records, basic_knn_rec, 5)
    # print('Context KNN')
    # recommender_evaluator.perform_cross_validation(my_records, context_knn, 5, my_cache_reviews)

    # 5-fold cross-validation for the contextual variants.
    print('Contextual KNN')
    recommender_evaluator.perform_cross_validation(my_records, contextual_knn, 5, my_cache_reviews)
    print('Contextual KNN2')
    recommender_evaluator.perform_cross_validation(my_records, contextual_knn2, 5, my_cache_reviews)
    print('Contextual KNN3')
    # NOTE(review): the KNN3 cross-validation calls are commented out, so
    # only the header above is printed for it at this stage.
    # recommender_evaluator.perform_cross_validation(my_records, contextual_knn3, 5)
    # recommender_evaluator.perform_cross_validation(my_records, contextual_knn3, 5)
    # recommender_evaluator.perform_cross_validation(my_records, contextual_knn3, 5, True, my_cache_reviews)
    # precision_in_top_n.calculate_recall_in_top_n(my_records, basic_contextual_knn, 10, 65)
    # precision_in_top_n.calculate_recall_in_top_n(my_records, basic_knn_rec, 10, 65)

    # Top-10 recall for every recommender (5 folds, min like score 5.0).
    print('Basic KNN')
    precision_in_top_n.calculate_recall_in_top_n(my_records, basic_knn_rec, 10, 5, 5.0, my_cache_reviews)
    # precision_in_top_n.calculate_recall_in_top_n(my_records, basic_knn_rec, 10, 5, 5.0, False, my_cache_reviews)
    print('Context KNN')
    precision_in_top_n.calculate_recall_in_top_n(my_records, context_knn, 10, 5, 5.0, my_cache_reviews)
    print('Contextual KNN')
    precision_in_top_n.calculate_recall_in_top_n(my_records, contextual_knn, 10, 5, 5.0, my_cache_reviews)
    print('Contextual KNN 2')
    precision_in_top_n.calculate_recall_in_top_n(my_records, contextual_knn2, 10, 5, 5.0, my_cache_reviews)
    print('Contextual KNN 3')
    precision_in_top_n.calculate_recall_in_top_n(my_records, contextual_knn3, 10, 5, 5.0, my_cache_reviews)