def run_top_n_test(records_file,
                   recommenders,
                   binary_reviews_file,
                   reviews_type=None):
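    """
    Evaluate every recommender in ``recommenders`` with a top-N recall test.

    Loads the records from ``records_file`` and the cached binary reviews
    (a pickle file) from ``binary_reviews_file``, verifies both have the same
    length, runs precision_in_top_n.calculate_recall_in_top_n for each
    recommender using 5-fold cross-validation, and writes the aggregated
    results to a timestamped, tab-separated CSV file.
    """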

    records = load_records(records_file)
    # records = extractor.remove_users_with_low_reviews(records, 2)
    with open(binary_reviews_file, 'rb') as read_file:
        binary_reviews = pickle.load(read_file)

    if len(records) != len(binary_reviews):
        raise ValueError("The records and reviews should have the same length")

    num_folds = 5
    split = 0.986
    min_like_score = 5.0
    top_n = 10

    dataset_info_map = {
        'dataset': records_file.split('/')[-1],
        'cache_reviews': binary_reviews_file.split('/')[-1],
        'num_records': len(records),
        'reviews_type': reviews_type,
        'cross_validation_folds': num_folds,
        'min_like_score': min_like_score,
        'top_n': top_n,
    }

    results_list = []
    results_log_list = []
    print('Total recommenders: %d' % len(recommenders))

    for count, recommender in enumerate(recommenders):

        print('\n**************\nProgress: %d/%d\n**************' %
              (count + 1, len(recommenders)))
        print(get_knn_recommender_info(recommender))

        results = precision_in_top_n.calculate_recall_in_top_n(
            records, recommender, top_n, num_folds, split, min_like_score,
            binary_reviews, reviews_type)

        results_list.append(results)

        # Estimate the remaining time from the execution time of the last run
        remaining_time = results['Execution time'] * (
            len(recommenders) - count - 1)
        remaining_time /= 3600
        print('Estimated remaining time: %.2f hours' % remaining_time)

    for recommender, results in zip(recommenders, results_list):
        results_log_list.append(
            process_topn_results(recommender, results, dataset_info_map))

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    file_name = 'recommender-topn-results' + timestamp

    ETLUtils.save_csv_file(file_name + '.csv', results_log_list, TOPN_HEADERS,
                           '\t')
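
# A minimal usage sketch for run_top_n_test. The paths and the
# build_recommenders() helper are hypothetical placeholders; the real
# recommender list is constructed elsewhere in this project.
#
# my_recommenders = build_recommenders()
# run_top_n_test(
#     '/path/to/records.json',
#     my_recommenders,
#     '/path/to/binary_reviews.pkl',
#     reviews_type=None)
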
    def test_calculate_recall_in_top_n(self):
        recommender = ItemAverageRecommender()
        recommender.load(reviews_matrix_5)

        actual_value = precision_in_top_n.calculate_recall_in_top_n(
            reviews_matrix_5, recommender, 2, 2, None, 4.0)['Top N']
        expected_value = 0.875
        self.assertEqual(expected_value, actual_value)
def run_topn_test_wrapper(args):
    try:
        return precision_in_top_n.calculate_recall_in_top_n(*args)
    except Exception:
        print('Caught exception in worker thread')

        # This prints the type, value, and stack trace of the
        # current exception being handled.
        traceback.print_exc()

        print()
        # Re-raise with the original traceback so the caller still sees the
        # failure.
        raise
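
# run_topn_test_wrapper appears intended for use with a worker pool:
# exceptions raised inside a worker can lose their original traceback by the
# time they reach the parent, so the wrapper prints the full traceback in the
# worker before re-raising. A minimal sketch of how it might be driven,
# assuming args_list holds tuples matching calculate_recall_in_top_n's
# positional arguments:
#
# from multiprocessing import Pool
#
# args_list = [
#     (records, recommender, top_n, num_folds, split, min_like_score,
#      binary_reviews, reviews_type)
#     for recommender in recommenders]
# pool = Pool()
# all_results = pool.map(run_topn_test_wrapper, args_list)
# pool.close()
# pool.join()
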
def main():
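    """
    Load the Yelp hotel reviews, shuffle them, split them 80/20 into training
    and test data, train a BasicKNN recommender on the training split and
    evaluate its recall in the top 10.
    """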

    reviews_file = (
        "/Users/fpena/tmp/yelp_training_set/yelp_training_set_review_hotels.json")
    my_records = load_data(reviews_file)
    my_ratings_matrix = create_ratings_matrix(my_records)

    my_records = extractor.remove_users_with_low_reviews(my_records, 1)

    print(len(my_records))
    # print(len(my_ratings_matrix))

    # basic_knn = BasicKNN(1)
    # basic_knn.load(my_records)

    # for record in my_records:
    #     print(basic_knn.predict_rating(record['user_id'], record['offering_id']))

    # print(basic_knn.predict_rating('qLCpuCWCyPb4G2vN-WZz-Q', '8ZwO9VuLDWJOXmtAdc7LXQ')) # 4
    # print(basic_knn.predict_rating('rVlgz-MGYRPa8UzTYO0RGQ', 'c0iszTWZwYtO3TgBx0Z0fQ')) # 2
    # print(basic_knn.predict_rating('4o7r-QSYhOkxpxRMqpXcCg', 'EcHuaHD9IcoPEWNsU8vDTw')) # 4
    # print(basic_knn.predict_rating('msgAEWFbD4df0EvyOR3TnQ', 'EcHuaHD9IcoPEWNsU8vDTw')) # 5

    shuffle(my_records)

    # Split 80-20 and see the results

    num_records = len(my_records)
    num_unknown_records = 0
    training_size = int(num_records*0.8)
    my_train_data = my_records[:training_size]
    my_test_data = my_records[training_size:]

    basic_knn = BasicKNN(None)
    basic_knn.load(my_train_data)
    # basic_knn.load(my_records)
    # recommender_evaluator.perform_cross_validation(my_records, basic_knn, 3)
    # precision_in_top_n.calculate_top_n_precision(my_records, basic_knn, 10000, 5.0, 5)
    precision_in_top_n.calculate_recall_in_top_n(my_records, basic_knn, 10, 5)
def main():
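    """
    Load the shuffled Yelp hotel reviews and their cached Review objects, then
    compare several ContextualKNN configurations against BasicKNN and
    ContextKnn baselines using cross-validation and top-10 recall.
    """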
    # reviews_file = "/Users/fpena/tmp/yelp_training_set/yelp_training_set_review_hotels.json"
    reviews_file = "/Users/fpena/UCC/Thesis/datasets/context/yelp_training_set_review_hotels_shuffled.json"
    # reviews_file = "/Users/fpena/UCC/Thesis/datasets/context/yelp_training_set_review_restaurants_shuffled.json"
    # my_records = context_utils.load_reviews(reviews_file)
    my_records = load_data(reviews_file)
    print("records:", len(my_records))
    my_num_topics = 150

    print("\n***************************\n")

    # my_records = load_data(reviews_file)
    # my_records = extractor.remove_users_with_low_reviews(my_records, 200)
    # my_records = extractor.remove_users_with_low_reviews(my_records, 2)
    # shuffle(my_records)

    # my_index = 0
    # my_reviews = []
    # for record in my_records:
    #     my_index += 1
    #     my_reviews.append(Review(record))
    #     print('index', my_index)
    # my_file = '/Users/fpena/UCC/Thesis/datasets/context/reviews_context_restaurants_200.pkl'
    # my_file = '/Users/fpena/UCC/Thesis/datasets/context/reviews_context_hotel_2.pkl'
    my_file = '/Users/fpena/tmp/reviews_hotel_shuffled.pkl'
    # my_file = '/Users/fpena/tmp/reviews_restaurant_shuffled.pkl'
    # with open(my_file, 'wb') as write_file:
    #     pickle.dump(my_reviews, write_file, pickle.HIGHEST_PROTOCOL)

    with open(my_file, 'rb') as read_file:
        my_cache_reviews = pickle.load(read_file)

    print("reviews:", len(my_cache_reviews))
    context_knn = ContextKnn(my_num_topics)

    tknc = TopKNeighbourhoodCalculator()
    nc = ContextNeighbourhoodCalculator()
    ncc = NeighbourContributionCalculator()
    ubc = UserBaselineCalculator()
    usc = PBCSimilarityCalculator()

    # contextual_knn2 = ContextualKNN(my_num_topics, tknc, ncc, ubc, usc, my_reviews)

    bnc = BasicNeighbourhoodCalculator()
    bncc = BasicNeighbourContributionCalculator()
    bubc = BasicUserBaselineCalculator()
    busc = BasicUserSimilarityCalculator()

    snc = SimpleNeighbourhoodCalculator()
    chnc = ContextHybridNeighbourhoodCalculator()

    contextual_knn = ContextualKNN(my_num_topics, nc, ncc, ubc, usc, has_context=True)
    contextual_knn2 = ContextualKNN(my_num_topics, nc, ncc, ubc, busc, has_context=True)
    contextual_knn3 = ContextualKNN(my_num_topics, bnc, bncc, bubc, busc)
    # basic_contextual_knn = BasicContextualKNN(my_num_topics, bnc, bncc, bubc, busc)

    # contextual_knn.threshold1 = 0.8
    # contextual_knn.threshold2 = 0.8
    # contextual_knn.threshold3 = 0.8
    # contextual_knn.threshold4 = 0.8

    # print('Context KNN')
    # context_knn.load(my_records)
    # recommender_evaluator.perform_cross_validation(my_records, context_knn, 5, True)
    basic_knn_rec = BasicKNN(None)
    # print('Basic KNN')
    # recommender_evaluator.perform_cross_validation(my_records, basic_knn_rec, 5)
    # print('Context KNN')
    # recommender_evaluator.perform_cross_validation(my_records, context_knn, 5, my_cache_reviews)
    print('Contextual KNN')
    recommender_evaluator.perform_cross_validation(my_records, contextual_knn, 5, my_cache_reviews)
    print('Contextual KNN2')
    recommender_evaluator.perform_cross_validation(my_records, contextual_knn2, 5, my_cache_reviews)
    print('Contextual KNN3')
    # recommender_evaluator.perform_cross_validation(my_records, contextual_knn3, 5)
    # recommender_evaluator.perform_cross_validation(my_records, contextual_knn3, 5)
    # recommender_evaluator.perform_cross_validation(my_records, contextual_knn3, 5, True, my_cache_reviews)
    # precision_in_top_n.calculate_recall_in_top_n(my_records, basic_contextual_knn, 10, 65)
    # precision_in_top_n.calculate_recall_in_top_n(my_records, basic_knn_rec, 10, 65)
    print('Basic KNN')
    precision_in_top_n.calculate_recall_in_top_n(my_records, basic_knn_rec, 10, 5, 5.0, my_cache_reviews)
    # precision_in_top_n.calculate_recall_in_top_n(my_records, basic_knn_rec, 10, 5, 5.0, False, my_cache_reviews)
    print('Context KNN')
    precision_in_top_n.calculate_recall_in_top_n(my_records, context_knn, 10, 5, 5.0, my_cache_reviews)
    print('Contextual KNN')
    precision_in_top_n.calculate_recall_in_top_n(my_records, contextual_knn, 10, 5, 5.0, my_cache_reviews)
    print('Contextual KNN 2')
    precision_in_top_n.calculate_recall_in_top_n(my_records, contextual_knn2, 10, 5, 5.0, my_cache_reviews)
    print('Contextual KNN 3')
    precision_in_top_n.calculate_recall_in_top_n(my_records, contextual_knn3, 10, 5, 5.0, my_cache_reviews)