def clean_reviews(reviews):
    """
    Returns a copy of the original reviews list with only that are useful for
    recommendation purposes

    :param reviews: a list of reviews
    :return: a copy of the original reviews list with only that are useful for
    recommendation purposes
    """
    # filtered_reviews = remove_empty_user_reviews(reviews)
    # filtered_reviews = remove_missing_ratings_reviews(filtered_reviews)
    # print('Finished remove_missing_ratings_reviews')

    # Alternate the user and item filters several times: dropping items
    # with few reviews can push some users back below the minimum review
    # count, and vice versa, so a single pass of each filter is not enough
    filtered_reviews = reviews
    for _ in range(5):
        filtered_reviews = extractor.remove_users_with_low_reviews(
            filtered_reviews, 10)
        filtered_reviews = extractor.remove_items_with_low_reviews(
            filtered_reviews, 20)
    print('Finished removing users and items with low reviews')
    print('Number of reviews', len(filtered_reviews))
    return filtered_reviews
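A hedged usage sketch (load_data and the reviews file path are borrowed from the main() examples below; the exact pipeline around clean_reviews is an assumption):

# Hypothetical usage: load raw review records, then keep only the ones
# that survive the user/item filters above
raw_reviews = load_data(reviews_file)
useful_reviews = clean_reviews(raw_reviews)
print('Kept %d of %d reviews' % (len(useful_reviews), len(raw_reviews)))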
Example #2
def main():
    # reviews_file = "/Users/fpena/tmp/yelp_training_set/yelp_training_set_review_hotels.json"
    # reviews_file = "/Users/fpena/UCC/Thesis/datasets/context/yelp_training_set_review_hotels_shuffled.json"
    reviews_file = "/Users/fpena/UCC/Thesis/datasets/context/yelp_training_set_review_restaurants_shuffled.json"
    # reviews_file = "/Users/fpena/UCC/Thesis/datasets/context/yelp_training_set_review_spas_shuffled.json"
    # my_records = context_utils.load_reviews(reviews_file)
    my_records = load_data(reviews_file)
    print("records:", len(my_records))
    my_num_topics = 150

    print("\n***************************\n")

    # my_records = load_data(reviews_file)
    # my_records = extractor.remove_users_with_low_reviews(my_records, 200)
    my_records = extractor.remove_users_with_low_reviews(my_records, 200)
    # shuffle(my_records)

    my_reviews = []
    for my_index, record in enumerate(my_records, start=1):
        review = Review(record['text'])
        review.id = record['review_id']
        my_reviews.append(review)
        print('index', my_index)
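The Review class is not included in this listing. A minimal stand-in, assuming it simply wraps the review text and exposes an id attribute (the real class in this codebase presumably also preprocesses the text):

class Review:
    # Hypothetical stand-in for the Review class used above; the actual
    # implementation is not part of this listing
    def __init__(self, text):
        self.text = text
        self.id = None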
Example #4
def parallel_run_topn_test(
        records_file, recommenders, binary_reviews_file, reviews_type=None):

    records = context_recommender_tests.load_records(records_file)
    records = extractor.remove_users_with_low_reviews(records, 20)
    with open(binary_reviews_file, 'rb') as read_file:
        binary_reviews = pickle.load(read_file)

    if len(records) != len(binary_reviews):
        raise ValueError("The records and reviews should have the same length")

    num_folds = 5
    split = 0.986
    top_n = 10
    min_like_score = 5.0

    # Every parameter except the recommenders is wrapped in a single-element
    # list, so itertools.product yields exactly one argument tuple per
    # recommender
    args = itertools.product(
        [records],
        recommenders,
        [top_n],
        [num_folds],
        [split],
        [min_like_score],
        [binary_reviews],
        [reviews_type]
    )

    print('Total recommenders: %d' % (len(recommenders)))

    pool = Pool()

    # Note: _processes is a private attribute of Pool; it defaults to the
    # machine's CPU count
    print('Total CPUs: %d' % pool._processes)

    results_list = pool.map(run_topn_test_wrapper, args)
    pool.close()
    pool.join()

    # After we have finished executing, we process the results
    dataset_info_map = {
        'dataset': records_file.split('/')[-1],
        'cache_reviews': binary_reviews_file.split('/')[-1],
        'num_records': len(records),
        'reviews_type': reviews_type,
        'cross_validation_folds': num_folds,
        'min_like_score': min_like_score,
        'top_n': top_n,
    }

    results_log_list = []
    for recommender, results in zip(recommenders, results_list):
        results_log_list.append(context_recommender_tests.process_topn_results(
            recommender, results, dataset_info_map))

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    file_name = 'recommender-topn-results-parallel-' + timestamp

    ETLUtils.save_csv_file(file_name + '.csv', results_log_list, TOPN_HEADERS, '\t')

    return results_list
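run_topn_test_wrapper is not shown in this listing. Since pool.map delivers each itertools.product tuple as a single argument, a minimal sketch would simply unpack it (run_topn_test is an assumed name for the underlying single-recommender test function):

def run_topn_test_wrapper(args):
    # Hypothetical sketch: unpack the argument tuple built with
    # itertools.product and forward it to the actual test function
    return run_topn_test(*args)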
Example #5
    def remove_users_with_low_reviews(self):
        print('%s: remove users with low reviews' %
              time.strftime("%Y/%m/%d-%H:%M:%S"))

        # Remove from the dataset users with a low number of reviews
        min_reviews_per_user = Constants.MIN_REVIEWS_PER_USER
        if min_reviews_per_user is None or min_reviews_per_user < 2:
            return
        self.records = extractor.remove_users_with_low_reviews(
            self.records, min_reviews_per_user)
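The extractor module itself is not part of this listing. A plausible sketch of remove_users_with_low_reviews, assuming each record is a dict with a 'user_id' key (the field name is taken from the commented-out code in the last example below):

from collections import Counter

def remove_users_with_low_reviews(records, min_reviews):
    # Hypothetical sketch: count the reviews written by each user and
    # keep only the records whose author meets the minimum count
    user_counts = Counter(record['user_id'] for record in records)
    return [record for record in records
            if user_counts[record['user_id']] >= min_reviews]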
Example #8
def main():

    reviews_file = (
        "/Users/fpena/tmp/yelp_training_set/"
        "yelp_training_set_review_hotels.json")
    my_records = load_data(reviews_file)
    my_ratings_matrix = create_ratings_matrix(my_records)

    my_records = extractor.remove_users_with_low_reviews(my_records, 1)

    print(len(my_records))
    # print(len(my_ratings_matrix))

    # basic_knn = BasicKNN(1)
    # basic_knn.load(my_records)

    # for record in my_records:
    #     print(basic_knn.predict_rating(record['user_id'], record['offering_id']))

    # print(basic_knn.predict_rating('qLCpuCWCyPb4G2vN-WZz-Q', '8ZwO9VuLDWJOXmtAdc7LXQ')) # 4
    # print(basic_knn.predict_rating('rVlgz-MGYRPa8UzTYO0RGQ', 'c0iszTWZwYtO3TgBx0Z0fQ')) # 2
    # print(basic_knn.predict_rating('4o7r-QSYhOkxpxRMqpXcCg', 'EcHuaHD9IcoPEWNsU8vDTw')) # 4
    # print(basic_knn.predict_rating('msgAEWFbD4df0EvyOR3TnQ', 'EcHuaHD9IcoPEWNsU8vDTw')) # 5

    shuffle(my_records)

    # Split 80-20 and see the results

    num_records = len(my_records)
    num_unknown_records = 0
    training_size = int(num_records*0.8)
    my_train_data = my_records[:training_size]
    my_test_data = my_records[training_size:]

    basic_knn = BasicKNN(None)
    basic_knn.load(my_train_data)
    # basic_knn.load(my_records)
    # recommender_evaluator.perform_cross_validation(my_records, basic_knn, 3)
    # precision_in_top_n.calculate_top_n_precision(my_records, basic_knn, 10000, 5.0, 5)
    precision_in_top_n.calculate_recall_in_top_n(my_records, basic_knn, 10, 5)
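create_ratings_matrix is also not defined in this listing. A plausible sketch, assuming a nested user -> item -> rating dictionary built from the 'user_id' and 'offering_id' fields used above ('stars', the raw Yelp rating field, is an assumption):

def create_ratings_matrix(records):
    # Hypothetical sketch: map each user to a dict of item -> rating.
    # 'stars' as the rating key is an assumption based on the raw Yelp
    # dataset schema
    ratings_matrix = {}
    for record in records:
        user_ratings = ratings_matrix.setdefault(record['user_id'], {})
        user_ratings[record['offering_id']] = record['stars']
    return ratings_matrix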