def init_reviews(self):
    """Load pre-built Review objects from a pickle cache, split them into
    specific and generic groups via clustering, and compute corpus stats.

    Side effects: populates ``self.reviews``, ``self.specific_reviews``,
    ``self.generic_reviews`` and ``self.all_nouns``, and prints statistics.
    """
    print('init_reviews', time.strftime("%H:%M:%S"))

    self.reviews = []
    self.specific_reviews = []
    self.generic_reviews = []

    # Hard-coded cache of pickled Review objects for the hotel dataset.
    pickle_path = '/Users/fpena/tmp/reviews_hotel.pkl'
    with open(pickle_path, 'rb') as read_file:
        self.reviews = pickle.load(read_file)

    # Cluster the reviews into two groups; by convention index 0 holds the
    # specific reviews and index 1 the generic ones.
    labels = reviews_clusterer.cluster_reviews(self.reviews)
    clusters = \
        reviews_clusterer.split_list_by_labels(self.reviews, labels)
    self.specific_reviews = clusters[0]
    self.generic_reviews = clusters[1]

    self.all_nouns = context_utils.get_all_nouns(self.reviews)

    context_utils.generate_stats(self.specific_reviews, self.generic_reviews)
def separate_reviews(self):
    """
    Separates the reviews into specific and generic.

    The separation is done by clustering: cluster 0 is taken as the
    specific reviews and cluster 1 as the generic ones.
    """
    labels = reviews_clusterer.cluster_reviews(self.reviews)
    groups = reviews_clusterer.split_list_by_labels(self.reviews, labels)
    self.specific_reviews = groups[0]
    self.generic_reviews = groups[1]
def init_reviews(self):
    """Load cached records and reviews from disk, cluster the reviews into
    specific and generic groups, and compute corpus statistics.

    Bug fix: in the original, every statement that assigned
    ``self.reviews`` (and ``self.records``) was commented out, so the
    later ``cluster_reviews(self.reviews)`` call raised AttributeError.
    The commented-out load is restored here.

    Side effects: populates ``self.records``, ``self.reviews``,
    ``self.specific_reviews``, ``self.generic_reviews`` and
    ``self.all_nouns``, and prints statistics.
    """
    print('init_reviews', time.strftime("%H:%M:%S"))

    self.specific_reviews = []
    self.generic_reviews = []

    records_file = '/Users/fpena/UCC/Thesis/datasets/context/stuff/reviews_hotel_shuffled.json'
    reviews_file = '/Users/fpena/UCC/Thesis/datasets/context/stuff/reviews_hotel_shuffled.pkl'

    self.records = ETLUtils.load_json_file(records_file)
    with open(reviews_file, 'rb') as read_file:
        # The commented-out original capped the set at 100 reviews;
        # that cap is preserved. Remove the slice to use the full set.
        self.reviews = pickle.load(read_file)[:100]

    # Cluster the reviews into two groups: index 0 = specific,
    # index 1 = generic.
    cluster_labels = reviews_clusterer.cluster_reviews(self.reviews)
    review_clusters = \
        reviews_clusterer.split_list_by_labels(self.reviews, cluster_labels)
    self.specific_reviews = review_clusters[0]
    self.generic_reviews = review_clusters[1]

    self.all_nouns = context_utils.get_all_nouns(self.reviews)

    context_utils.generate_stats(self.specific_reviews, self.generic_reviews)
def calculate_recall_in_top_n(records, recommender, n, num_folds, split=None,
                              min_score=5.0, cache_reviews=None,
                              reviews_type=None):
    """Cross-validate the recall@n of ``recommender`` over ``records``.

    For each of ``num_folds`` folds the recommender is trained on the
    training split and evaluated on the reviews whose 'overall_rating' is
    at least ``min_score``; a hit is counted when ``calculate_is_a_hit``
    succeeds. Returns a dict with keys 'Top N' (mean recall),
    'Coverage' (mean fraction of positive reviews that produced a
    prediction) and 'Execution time' (seconds).

    :param records: list of review dicts with 'user_id', 'offering_id',
        'overall_rating' and 'text' keys
    :param recommender: object exposing reviews/load/clear/has_context
    :param n: size of the top-N list used for the hit test
    :param num_folds: number of cross-validation folds
    :param split: train fraction; defaults to 1 - 1/num_folds
    :param min_score: minimum rating for a review to count as positive
    :param cache_reviews: optional pre-built Review objects, split in
        lockstep with ``records``
    :param reviews_type: 'specific' or 'generic' to evaluate only one
        cluster of the test records, or None for all

    Bug fix: the two early ``continue`` paths (no positive reviews / no
    predictions) previously skipped ``recommender.clear()``, so the next
    fold's ``load()`` ran on top of stale state. ``clear()`` now runs
    before every fold exit.
    """
    start_time = time.time()
    if split is None:
        split = 1 - (1 / float(num_folds))

    total_recall = 0.
    total_coverage = 0.
    num_cycles = 0.0

    for i in xrange(0, num_folds):
        print('Fold', i)
        print('started training', time.strftime("%Y/%d/%m-%H:%M:%S"))
        start = float(i) / num_folds
        cluster_labels = None
        train_records, test_records = ETLUtils.split_train_test(
            records, split=split, start=start)

        if cache_reviews:
            train_reviews, test_reviews = ETLUtils.split_train_test(
                cache_reviews, split=split, start=start)
            if reviews_type is not None:
                cluster_labels = reviews_clusterer.cluster_reviews(
                    test_reviews)
            recommender.reviews = train_reviews
        recommender.load(train_records)
        print('finished training', time.strftime("%Y/%d/%m-%H:%M:%S"))

        if cluster_labels is not None:
            # Keep only the requested cluster of test records.
            separated_records = reviews_clusterer.split_list_by_labels(
                test_records, cluster_labels)
            if reviews_type == 'specific':
                test_records = separated_records[0]
            if reviews_type == 'generic':
                test_records = separated_records[1]

        positive_reviews = \
            [review for review in test_records
             if review['overall_rating'] >= min_score]

        if len(positive_reviews) == 0:
            # FIX: clear before skipping so the next fold starts clean.
            recommender.clear()
            continue

        num_hits = 0.0
        num_predictions = 0.0
        for review in positive_reviews:
            user_id = review['user_id']
            item_id = review['offering_id']
            if not recommender.has_context:
                hit = calculate_is_a_hit(
                    test_records, recommender, user_id, item_id, n)
            else:
                text_review = review['text']
                hit = calculate_is_a_hit(
                    test_records, recommender, user_id, item_id, n,
                    text_review)
            if hit is None:
                # The recommender could not score this pair; it does not
                # count as a prediction.
                continue
            if hit:
                num_hits += 1
            num_predictions += 1

        # FIX: clear unconditionally (was previously skipped when
        # num_predictions == 0).
        recommender.clear()

        if num_predictions == 0:
            continue

        recall = num_hits / num_predictions
        coverage = num_predictions / len(positive_reviews)
        print('recall', recall, time.strftime("%Y/%d/%m-%H:%M:%S"))
        print('coverage', coverage, time.strftime("%Y/%d/%m-%H:%M:%S"))

        total_recall += recall
        total_coverage += coverage
        num_cycles += 1

    # NOTE(review): if every fold is skipped this still divides by zero,
    # matching the original behavior.
    final_recall = total_recall / num_cycles
    final_coverage = total_coverage / num_cycles
    execution_time = time.time() - start_time

    print('Final Top N Precision: %f' % final_recall)
    print('Final Coverage: %f' % final_coverage)
    print("--- %s seconds ---" % execution_time)

    result = {
        'Top N': final_recall,
        'Coverage': final_coverage,
        'Execution time': execution_time
    }
    return result
def perform_cross_validation(
        records, recommender, num_folds, cache_reviews=None,
        reviews_type=None):
    """Cross-validate rating-prediction accuracy of ``recommender``.

    Runs ``num_folds`` folds, computing MAE, RMSE and coverage per fold,
    and returns a dict with keys 'MAE', 'RMSE', 'Coverage' and
    'Execution time' holding the per-fold averages.

    :param records: list of review dicts to split into train/test folds
    :param recommender: object exposing reviews/load/clear
    :param cache_reviews: optional pre-built Review objects, split in
        lockstep with ``records``
    :param reviews_type: 'specific' or 'generic' to evaluate only one
        cluster of the test records, or None for all

    Bug fix: coverage was computed as
    ``float((num_samples - num_unknown_ratings) / num_samples)`` — under
    Python 2 integer division (this module uses ``xrange``) the inner
    quotient truncates to 0 or 1 before the float conversion, so coverage
    was always 0.0 (or 1.0). The conversion now happens first.
    """
    start_time = time.time()
    split = 1 - (1/float(num_folds))
    total_mean_absolute_error = 0.
    total_mean_square_error = 0.
    total_coverage = 0.
    num_cycles = 0

    for i in range(0, num_folds):
        print('Num cycles: %d' % i)
        start = float(i) / num_folds
        cluster_labels = None
        train_records, test_records = ETLUtils.split_train_test(
            records, split=split, start=start)

        if cache_reviews:
            train_reviews, test_reviews = ETLUtils.split_train_test(
                cache_reviews, split=split, start=start)
            if reviews_type is not None:
                cluster_labels = reviews_clusterer.cluster_reviews(
                    test_reviews)
            recommender.reviews = train_reviews
        recommender.load(train_records)

        if cluster_labels is not None:
            # Keep only the requested cluster of test records.
            separated_records = reviews_clusterer.split_list_by_labels(
                test_records, cluster_labels)
            if reviews_type == 'specific':
                test_records = separated_records[0]
            if reviews_type == 'generic':
                test_records = separated_records[1]

        _, errors, num_unknown_ratings = \
            predict_rating_list(recommender, test_records)
        recommender.clear()

        mean_absolute_error = MeanAbsoluteError.compute_list(errors)
        root_mean_square_error = RootMeanSquareError.compute_list(errors)
        num_samples = len(test_records)
        # FIX: convert before dividing to avoid integer truncation.
        coverage = float(num_samples - num_unknown_ratings) / num_samples

        if mean_absolute_error is not None:
            total_mean_absolute_error += mean_absolute_error
            total_mean_square_error += root_mean_square_error
            total_coverage += coverage
            num_cycles += 1
        else:
            print('Mean absolute error is None!!!')

    final_mean_absolute_error = total_mean_absolute_error / num_cycles
    final_root_squared_error = total_mean_square_error / num_cycles
    final_coverage = total_coverage / num_cycles
    execution_time = time.time() - start_time

    print('Final mean absolute error: %f' % final_mean_absolute_error)
    print('Final root mean square error: %f' % final_root_squared_error)
    print('Final coverage: %f' % final_coverage)
    print("--- %s seconds ---" % execution_time)

    result = {
        'MAE': final_mean_absolute_error,
        'RMSE': final_root_squared_error,
        'Coverage': final_coverage,
        'Execution time': execution_time
    }
    return result
def calculate_recall_in_top_n(
        records, recommender, n, num_folds, split=None, min_score=5.0,
        cache_reviews=None, reviews_type=None):
    """Cross-validate the recall@n of ``recommender`` over ``records``.

    This variant splits the folds with ``shuffle_data=False`` so fold
    boundaries are deterministic. Returns a dict with keys 'Top N'
    (mean recall), 'Coverage' (mean fraction of positive reviews that
    produced a prediction) and 'Execution time' (seconds).

    NOTE(review): this re-definition shadows the earlier
    ``calculate_recall_in_top_n`` in the same module — confirm which one
    is intended and delete the other.

    :param records: list of review dicts with 'user_id', 'offering_id',
        'overall_rating' and 'text' keys
    :param recommender: object exposing reviews/load/clear/has_context
    :param n: size of the top-N list used for the hit test
    :param num_folds: number of cross-validation folds
    :param split: train fraction; defaults to 1 - 1/num_folds
    :param min_score: minimum rating for a review to count as positive
    :param cache_reviews: optional pre-built Review objects, split in
        lockstep with ``records``
    :param reviews_type: 'specific' or 'generic' to evaluate only one
        cluster of the test records, or None for all

    Bug fix: the two early ``continue`` paths (no positive reviews / no
    predictions) previously skipped ``recommender.clear()``, so the next
    fold's ``load()`` ran on top of stale state. ``clear()`` now runs
    before every fold exit.
    """
    start_time = time.time()
    if split is None:
        split = 1 - (1/float(num_folds))

    total_recall = 0.
    total_coverage = 0.
    num_cycles = 0.0

    for i in xrange(0, num_folds):
        print('Fold', i)
        print('started training', time.strftime("%Y/%d/%m-%H:%M:%S"))
        start = float(i) / num_folds
        cluster_labels = None
        train_records, test_records = ETLUtils.split_train_test(
            records, split=split, shuffle_data=False, start=start)

        if cache_reviews:
            train_reviews, test_reviews = ETLUtils.split_train_test(
                cache_reviews, split=split, shuffle_data=False, start=start)
            if reviews_type is not None:
                cluster_labels = reviews_clusterer.cluster_reviews(
                    test_reviews)
            recommender.reviews = train_reviews
        recommender.load(train_records)
        print('finished training', time.strftime("%Y/%d/%m-%H:%M:%S"))

        if cluster_labels is not None:
            # Keep only the requested cluster of test records.
            separated_records = reviews_clusterer.split_list_by_labels(
                test_records, cluster_labels)
            if reviews_type == 'specific':
                test_records = separated_records[0]
            if reviews_type == 'generic':
                test_records = separated_records[1]

        positive_reviews = \
            [review for review in test_records
             if review['overall_rating'] >= min_score]

        if len(positive_reviews) == 0:
            # FIX: clear before skipping so the next fold starts clean.
            recommender.clear()
            continue

        num_hits = 0.0
        num_predictions = 0.0
        for review in positive_reviews:
            user_id = review['user_id']
            item_id = review['offering_id']
            if not recommender.has_context:
                hit = calculate_is_a_hit(
                    test_records, recommender, user_id, item_id, n)
            else:
                text_review = review['text']
                hit = calculate_is_a_hit(
                    test_records, recommender, user_id, item_id, n,
                    text_review)
            if hit is None:
                # The recommender could not score this pair; it does not
                # count as a prediction.
                continue
            if hit:
                num_hits += 1
            num_predictions += 1

        # FIX: clear unconditionally (was previously skipped when
        # num_predictions == 0).
        recommender.clear()

        if num_predictions == 0:
            continue

        recall = num_hits / num_predictions
        coverage = num_predictions / len(positive_reviews)
        print('recall', recall, time.strftime("%Y/%d/%m-%H:%M:%S"))
        print('coverage', coverage, time.strftime("%Y/%d/%m-%H:%M:%S"))

        total_recall += recall
        total_coverage += coverage
        num_cycles += 1

    # NOTE(review): if every fold is skipped this still divides by zero,
    # matching the original behavior.
    final_recall = total_recall / num_cycles
    final_coverage = total_coverage / num_cycles
    execution_time = time.time() - start_time

    print('Final Top N Precision: %f' % final_recall)
    print('Final Coverage: %f' % final_coverage)
    print("--- %s seconds ---" % execution_time)

    result = {
        'Top N': final_recall,
        'Coverage': final_coverage,
        'Execution time': execution_time
    }
    return result