def update_labeled_reviews_records():
    """
    Restricts the classifier records to the reviews that have an agreed label
    and overwrites their label with the agreed one.

    The agreed labels come from compare_records() ('s' is stored as 'yes' and
    'g' as 'no'). The updated records are only counted and printed here; this
    function does not return or save them.
    """
    agreed_labels = compare_records()
    agreed_ids = set(agreed_labels.keys())

    classifier_records = ETLUtils.load_json_file(
        Constants.CLASSIFIED_RECORDS_FILE)
    all_ids = set()
    for record in classifier_records:
        all_ids.add(record[Constants.REVIEW_ID_FIELD])
    disputed_ids = all_ids - agreed_ids

    print('number of records before: %d' % len(classifier_records))
    print(agreed_labels)
    print(disputed_ids)

    label_translation = {'s': 'yes', 'g': 'no'}

    # Records without an agreed label are discarded
    classifier_records = ETLUtils.filter_out_records(
        classifier_records, Constants.REVIEW_ID_FIELD, disputed_ids)

    # The remaining records receive the agreed label
    for record in classifier_records:
        agreed_label = agreed_labels[record[Constants.REVIEW_ID_FIELD]]
        record[Constants.SPECIFIC] = label_translation[agreed_label]

    print('number of records after: %d' % len(classifier_records))
def update_labeled_reviews_records():
    """
    Drops the classifier records that have no agreed label and sets the label
    of the remaining ones to the agreed value.

    compare_records() supplies a map from review ID to 's' or 'g', which is
    written into the records as 'yes' or 'no' respectively. Nothing is
    returned or saved by this function; it only prints counts and ID sets.
    """
    label_by_review = compare_records()
    ids_with_agreement = set(label_by_review.keys())
    classifier_records = ETLUtils.load_json_file(
        Constants.CLASSIFIED_RECORDS_FILE)
    ids_in_classifier = {
        item[Constants.REVIEW_ID_FIELD] for item in classifier_records}
    ids_without_agreement = ids_in_classifier.difference(ids_with_agreement)

    print('number of records before: %d' % len(classifier_records))
    print(label_by_review)
    print(ids_without_agreement)

    yes_no_by_code = {'s': 'yes', 'g': 'no'}

    # Only records whose label was agreed on are kept
    classifier_records = ETLUtils.filter_out_records(
        classifier_records, Constants.REVIEW_ID_FIELD, ids_without_agreement)

    # Write the agreed label into every remaining record
    for item in classifier_records:
        code = label_by_review[item[Constants.REVIEW_ID_FIELD]]
        item[Constants.SPECIFIC] = yes_no_by_code[code]

    print('number of records after: %d' % len(classifier_records))
def main():
    """
    Trains a ReviewsClassifier with the classified reviews of one dataset and
    uses it to label the shuffled Yelp reviews of that dataset.

    The labelled records are saved to the '..._shuffled_tagged.json' file in
    the same folder. The folder and the dataset name are hard-coded.
    """
    # dataset = 'hotel'
    dataset = 'restaurant'
    my_folder = '/Users/fpena/UCC/Thesis/datasets/context/'

    # Training data: JSON records plus the pickled reviews
    training_base = my_folder + 'classified_' + dataset + '_reviews'
    my_training_records = ETLUtils.load_json_file(training_base + '.json')
    with open(training_base + '.pkl', 'rb') as training_pickle:
        my_training_reviews = pickle.load(training_pickle)

    classifier = ReviewsClassifier()
    classifier.train(my_training_records, my_training_reviews)

    # Data to label: pickled reviews plus the JSON records
    records_base = \
        my_folder + 'yelp_training_set_review_' + dataset + 's_shuffled'
    my_input_reviews_file = my_folder + 'reviews_' + dataset + '_shuffled.pkl'

    with open(my_input_reviews_file, 'rb') as input_pickle:
        my_input_reviews = pickle.load(input_pickle)
    my_input_records = ETLUtils.load_json_file(records_base + '.json')

    my_output_records = classifier.label_json_reviews(
        my_input_records, my_input_reviews)
    ETLUtils.save_json_file(records_base + '_tagged.json', my_output_records)
def run(self, dataset, output_folder, train_records, test_records,
        train_reviews=None, test_reviews=None):
    """
    Runs the full cycle over the train and test records and exports the
    resulting contextual sets to CSV and then to LibFM files.

    :param dataset: name of the dataset, used to build the file names
    :param output_folder: folder prefix for the exported files (it is
    concatenated as-is, so it should end with a path separator)
    :param train_records: the records used for training
    :param test_records: the records used for testing
    :param train_reviews: optional reviews passed to self.full_cycle
    :param test_reviews: optional reviews passed to self.full_cycle
    """
    # The original format was "%Y/%d/%m", which printed day and month swapped
    timestamp_format = "%Y/%m/%d-%H:%M:%S"

    contextual_train_set, contextual_test_set = self.full_cycle(
        train_records, test_records, train_reviews, test_reviews)

    print("Prepared data: %s" % time.strftime(timestamp_format))

    file_prefix = output_folder + "yelp_" + dataset + "_context_shuffled_"
    csv_train_file = file_prefix + "train5.csv"
    csv_test_file = file_prefix + "test5.csv"

    ETLUtils.save_csv_file(csv_train_file, contextual_train_set, self.headers)
    ETLUtils.save_csv_file(csv_test_file, contextual_test_set, self.headers)

    # NOTE(review): the message mentions JSON, but only CSV files are written
    print("Exported CSV and JSON files: %s" %
          time.strftime(timestamp_format))

    csv_files = [csv_train_file, csv_test_file]

    num_cols = len(self.headers)
    context_cols = num_cols
    print("num_cols", num_cols)

    # Two LibFM exports of the same CSV files; they differ only in the
    # fourth argument (columns 3 onwards versus none) and in the suffix
    libfm_converter.csv_to_libfm(
        csv_files, 0, [1, 2], range(3, context_cols), ",", has_header=True,
        suffix=".no_context.libfm")
    libfm_converter.csv_to_libfm(
        csv_files, 0, [1, 2], [], ",", has_header=True,
        suffix=".context.libfm")

    print("Exported LibFM files: %s" % time.strftime(timestamp_format))
def export_records_to_predict(self, records_file):
    """
    Saves the records to predict as JSON and the items to predict as a pickle.

    The records are obtained with self.get_records_to_predict() when they
    have not been computed yet.

    :param records_file: path of the JSON file; the pickle is written to the
    same path with '.pkl' appended
    """
    if self.records_to_predict is None:
        self.records_to_predict = self.get_records_to_predict()
    ETLUtils.save_json_file(records_file, self.records_to_predict)

    pickle_path = records_file + '.pkl'
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(
            self.items_to_predict, pickle_file, pickle.HIGHEST_PROTOCOL)
def multiple_lineal_regression(file_path):
    """
    Fits a linear regression that predicts the 'stars' field from the other
    fields of each record and prints the RMSE obtained on the training data
    and with 10-fold cross validation.

    :param file_path: the path of the file loaded with ReviewETL.load_file
    """
    records = ReviewETL.load_file(file_path)
    ratings = np.array([record['stars'] for record in records])
    ETLUtils.drop_fields(['stars'], records)
    # list() is needed because dict.values() is a view on Python 3.
    # NOTE(review): the feature order relies on every record listing its
    # fields in the same order -- confirm against ReviewETL.load_file
    data = np.array([list(record.values()) for record in records])

    # A single model is enough; the original also fitted a second, unused one
    model = linear_model.LinearRegression(fit_intercept=True)
    model.fit(data, ratings)
    # Predict the whole matrix at once instead of one 1-D sample at a time
    errors = model.predict(data) - ratings
    rmse_train = np.sqrt(np.dot(errors, errors) / len(ratings))

    kf = KFold(len(data), n_folds=10)
    err = 0
    for train, test in kf:
        model.fit(data[train], ratings[train])
        fold_errors = model.predict(data[test]) - ratings[test]
        err += np.dot(fold_errors, fold_errors)

    rmse_10cv = np.sqrt(err / len(data))
    print('RMSE on training: {}'.format(rmse_train))
    print('RMSE on 10-fold CV: {}'.format(rmse_10cv))
def main_evaluate():
    """
    Evaluates the predictions stored in the predictions file with a
    TopNEvaluator and prints the recall.

    :return: the recall calculated by the evaluator
    """
    evaluator_i = my_i
    all_records = ETLUtils.load_json_file(RECORDS_FILE)
    held_out_records = ETLUtils.load_json_file(RECORDS_FILE + '_test')

    evaluator = TopNEvaluator(
        all_records, held_out_records, DATASET, 10, evaluator_i)
    evaluator.find_important_records()
    evaluator.load_records_to_predict(RECORDS_TO_PREDICT_FILE)

    predictions = rmse_calculator.read_targets_from_txt(
        GENERATED_FOLDER + 'predictions_' + DATASET + '.txt')
    evaluator.evaluate(predictions)

    print('recall', evaluator.recall)
    return evaluator.recall
def export_without_context(self):
    """
    Exports the records to CSV_FILE with a single 'context:na' column that is
    always 1, and copies the file to the workspace.

    When CSV_FILE already exists it is not rebuilt, only copied.
    """
    print('%s: exporting to CARSKit binary ratings format without context' %
          time.strftime("%Y/%m/%d-%H:%M:%S"))

    if os.path.exists(CSV_FILE):
        print('Binary ratings file already exists')
        copy_to_workspace(CSV_FILE)
        return

    numpy.random.seed(0)

    new_records = [
        {
            Constants.USER_ID_FIELD: record[Constants.USER_INTEGER_ID_FIELD],
            Constants.ITEM_ID_FIELD: record[Constants.ITEM_INTEGER_ID_FIELD],
            Constants.RATING_FIELD: record[Constants.RATING_FIELD],
            'context:na': 1,
        }
        for record in self.records
    ]

    headers = [
        Constants.USER_ID_FIELD,
        Constants.ITEM_ID_FIELD,
        Constants.RATING_FIELD,
        'context:na',
    ]

    ETLUtils.save_csv_file(CSV_FILE, new_records, headers)
    copy_to_workspace(CSV_FILE)
def export_records(self):
    """
    Saves a JSON file with the user, item, rating and context fields of
    every record in self.records.
    """
    print('%s: exporting transformed records' %
          time.strftime("%Y/%m/%d-%H:%M:%S"))

    desired_fields = [
        Constants.USER_INTEGER_ID_FIELD,
        Constants.ITEM_INTEGER_ID_FIELD,
        Constants.RATING_FIELD,
        Constants.CONTEXT_FIELD,
    ]

    # Each exported record is a projection onto the desired fields
    records_to_export = [
        {field: record[field] for field in desired_fields}
        for record in self.records
    ]

    file_name = Constants.generate_file_name(
        'recsys_formatted_context_records', 'json', Constants.CACHE_FOLDER,
        None, None, True, True, uses_carskit=False, normalize_topics=True,
        format_context=True)
    ETLUtils.save_json_file(file_name, records_to_export)
def export_records(self):
    """
    Saves the dictionary and two versions of the records: the full records
    first, and the records again after self.drop_unnecessary_fields() has
    been applied.
    """
    timestamp = time.strftime("%Y/%m/%d-%H:%M:%S")
    print('%s: get_records_to_predict_topn records' % timestamp)

    self.dictionary.save(Constants.DICTIONARY_FILE)

    # The full version has to be written before the fields are dropped
    ETLUtils.save_json_file(
        Constants.FULL_PROCESSED_RECORDS_FILE, self.records)

    self.drop_unnecessary_fields()
    ETLUtils.save_json_file(Constants.PROCESSED_RECORDS_FILE, self.records)
def export_records_to_predict(self, records_file):
    """
    Writes the records to predict to a JSON file and the items to predict to
    a pickle file next to it.

    If the records to predict are still None they are first obtained from
    self.get_records_to_predict().

    :param records_file: path of the JSON file; '.pkl' is appended to it to
    build the path of the pickle file
    """
    if self.records_to_predict is None:
        self.records_to_predict = self.get_records_to_predict()

    ETLUtils.save_json_file(records_file, self.records_to_predict)

    items_file = records_file + '.pkl'
    with open(items_file, 'wb') as items_output:
        pickle.dump(
            self.items_to_predict, items_output, pickle.HIGHEST_PROTOCOL)
def create_topic_models():
    """
    For every cycle, splits the records into the cross validation folds and
    hands the training part of each fold to parallel_context_top_n together
    with the cycle and fold indices.

    The records are shuffled at the start of each cycle when
    Constants.SHUFFLE_DATA is set.
    """
    print(Constants._properties)
    print('%s: Start' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    records = ETLUtils.load_json_file(Constants.RECORDS_FILE)

    plant_seeds()
    num_cycles = Constants.NUM_CYCLES
    num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
    split = 1 - (1 / float(num_folds))

    for cycle in range(num_cycles):

        print('\n\nCycle: %d/%d' % ((cycle + 1), num_cycles))

        if Constants.SHUFFLE_DATA:
            random.shuffle(records)

        # Only the training part of every fold is needed here
        train_records_list = []
        for fold in range(num_folds):
            fold_train, _ = ETLUtils.split_train_test(
                records, split=split, start=float(fold) / num_folds)
            train_records_list.append(fold_train)

        fold_count = Constants.CROSS_VALIDATION_NUM_FOLDS
        args = zip(
            train_records_list, [cycle] * fold_count, range(fold_count))

        parallel_context_top_n(args)
def drop_unwanted_fields(dictionary_list):
    """
    Removes from the business records the fields that are not used for data
    analysis, by passing them to ETLUtils.drop_fields.

    :param dictionary_list: the list of dictionaries containing the data
    """
    fields_to_drop = [
        'attributes',
        'business_id',
        'categories',
        'city',
        'full_address',
        'latitude',
        'longitude',
        'hours',
        'name',
        'neighborhoods',
        'open',
        'review_count',
        'stars',
        'state',
        'type',
    ]

    ETLUtils.drop_fields(fields_to_drop, dictionary_list)
def run_top_n_test(records_file, recommenders, binary_reviews_file,
                   reviews_type=None):
    """
    Calculates the recall in top N of each recommender, one after the other,
    and saves a tab-separated CSV file with the results.

    :param records_file: the file with the records, read with load_records
    :param recommenders: the recommenders to evaluate
    :param binary_reviews_file: pickle file with the reviews; it must contain
    as many reviews as there are records
    :param reviews_type: passed through to the evaluation and stored in the
    results
    :raise ValueError: if the number of records and reviews differ
    """
    records = load_records(records_file)
    # records = extractor.remove_users_with_low_reviews(records, 2)
    with open(binary_reviews_file, 'rb') as read_file:
        binary_reviews = pickle.load(read_file)

    if len(records) != len(binary_reviews):
        raise ValueError("The records and reviews should have the same length")

    num_folds = 5
    split = 0.986
    min_like_score = 5.0
    top_n = 10

    dataset_info_map = {}
    dataset_info_map['dataset'] = records_file.split('/')[-1]
    dataset_info_map['cache_reviews'] = binary_reviews_file.split('/')[-1]
    dataset_info_map['num_records'] = len(records)
    dataset_info_map['reviews_type'] = reviews_type
    dataset_info_map['cross_validation_folds'] = num_folds
    dataset_info_map['min_like_score'] = min_like_score
    dataset_info_map['top_n'] = top_n

    results_list = []
    num_recommenders = len(recommenders)

    print('Total recommenders: %d' % num_recommenders)

    for count, recommender in enumerate(recommenders):

        print('\n**************\nProgress: %d/%d\n**************' %
              (count, num_recommenders))
        print(get_knn_recommender_info(recommender))

        results = precision_in_top_n.calculate_recall_in_top_n(
            records, recommender, top_n, num_folds, split, min_like_score,
            binary_reviews, reviews_type)
        results_list.append(results)

        # The recommender that just finished is not pending any more. The
        # original estimate counted it, so it was one recommender too high
        pending = num_recommenders - (count + 1)
        remaining_time = results['Execution time'] * pending / 3600.0
        print('Estimated remaining time: %.2f hours' % remaining_time)

    results_log_list = [
        process_topn_results(recommender, results, dataset_info_map)
        for recommender, results in zip(recommenders, results_list)
    ]

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    file_name = 'recommender-topn-results' + timestamp

    ETLUtils.save_csv_file(
        file_name + '.csv', results_log_list, TOPN_HEADERS, '\t')
def analyze_context_records():
    """
    Prints how many classified records have 'context' as their
    'context_type', followed by the text of each of them.
    """
    all_records = ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE)
    context_records = ETLUtils.filter_records(
        all_records, 'context_type', ['context'])

    print('num records: %d' % len(context_records))

    for context_record in context_records:
        print(context_record[Constants.TEXT_FIELD])
def get_categories(file_path):
    """
    Loads the business records, transposes their 'categories' list column,
    drops the unwanted fields and returns the keys that remain in the first
    record.

    :param file_path: the path of the JSON file with the business records
    :return: the keys of the first transformed record
    """
    businesses = ETLUtils.load_json_file(file_path)

    # Now we obtain the categories for all the businesses
    businesses = ETLUtils.add_transpose_list_column('categories', businesses)
    BusinessETL.drop_unwanted_fields(businesses)

    first_business = businesses[0]
    return first_business.keys()
def parallel_run_topn_test(
        records_file, recommenders, binary_reviews_file, reviews_type=None):
    """
    Evaluates every recommender in a pool of worker processes and saves a
    tab-separated CSV file with the results.

    :param records_file: the file with the records
    :param recommenders: the recommenders to evaluate
    :param binary_reviews_file: pickle file with the reviews; after the
    users with few reviews are removed, it must contain as many reviews as
    there are records
    :param reviews_type: passed through to the evaluation and stored in the
    results
    :return: the list with the results of each recommender
    :raise ValueError: if the number of records and reviews differ
    """
    records = context_recommender_tests.load_records(records_file)
    records = extractor.remove_users_with_low_reviews(records, 20)
    with open(binary_reviews_file, 'rb') as read_file:
        binary_reviews = pickle.load(read_file)

    if len(records) != len(binary_reviews):
        raise ValueError("The records and reviews should have the same length")

    num_folds = 5
    split = 0.986
    top_n = 10
    min_like_score = 5.0

    # One argument tuple per recommender; the other values are constant
    args = itertools.product(
        [records],
        recommenders,
        [top_n],
        [num_folds],
        [split],
        [min_like_score],
        [binary_reviews],
        [reviews_type]
    )

    print('Total recommenders: %d' % (len(recommenders)))

    pool = Pool()

    print('Total CPUs: %d' % pool._processes)

    try:
        results_list = pool.map(run_topn_test_wrapper, args)
    except BaseException:
        # Do not leave worker processes running when a task fails or the
        # run is interrupted
        pool.terminate()
        pool.join()
        raise
    pool.close()
    pool.join()

    # After we have finished executing, we process the results
    dataset_info_map = {}
    dataset_info_map['dataset'] = records_file.split('/')[-1]
    dataset_info_map['cache_reviews'] = binary_reviews_file.split('/')[-1]
    dataset_info_map['num_records'] = len(records)
    dataset_info_map['reviews_type'] = reviews_type
    dataset_info_map['cross_validation_folds'] = num_folds
    dataset_info_map['min_like_score'] = min_like_score
    dataset_info_map['top_n'] = top_n

    results_log_list = [
        context_recommender_tests.process_topn_results(
            recommender, results, dataset_info_map)
        for recommender, results in zip(recommenders, results_list)
    ]

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    file_name = 'recommender-topn-results-parallel' + timestamp

    ETLUtils.save_csv_file(
        file_name + '.csv', results_log_list, TOPN_HEADERS, '\t')

    return results_list
def main():
    """
    Tests every combination of resampler and classifier on each document
    level, prints the results and saves them to a CSV file.
    """
    topic_model_creator.plant_seeds()

    my_resamplers = [
        None,
        'random_over_sampler',
        'smote_regular',
        'smote_bl1',
        'smote_bl2',
        'smote_tomek',
        'smoteenn'
    ]

    my_classifiers = [
        DummyClassifier(strategy='most_frequent', random_state=0),
        DummyClassifier(strategy='stratified', random_state=0),
        DummyClassifier(strategy='uniform', random_state=0),
        DummyClassifier(strategy='constant', random_state=0, constant=True),
        LogisticRegression(C=100),
        SVC(C=1.0, kernel='rbf', probability=True),
        SVC(C=1.0, kernel='linear', probability=True),
        KNeighborsClassifier(n_neighbors=10),
        tree.DecisionTreeClassifier(),
        NuSVC(probability=True),
        RandomForestClassifier(n_estimators=100)
    ]

    document_levels = ['review', 'sentence', 1]

    total_cycles = \
        len(my_resamplers) * len(my_classifiers) * len(document_levels)
    current_cycle = 1

    results_list = []

    for level in document_levels:

        # The records depend on the document level, so they are reloaded
        # and transformed once per level
        Constants.DOCUMENT_LEVEL = level
        my_records = load_records()
        preprocess_records(my_records)
        x_matrix, y_vector = transform(my_records)

        count_specific_generic(my_records)

        combinations = itertools.product(my_resamplers, my_classifiers)
        for resampler, classifier in combinations:

            print('Cycle %d/%d' % (current_cycle, total_cycles))

            results_list.append(
                test_classifier(x_matrix, y_vector, resampler, classifier))
            current_cycle += 1

    for classification_results in results_list:
        print(classification_results)

    csv_file = Constants.DATASET_FOLDER + Constants.ITEM_TYPE +\
        '_sentence_classifier_results.csv'
    ETLUtils.save_csv_file(csv_file, results_list, results_list[0].keys())
def test_select_fields(self):
    """
    Checks ETLUtils.select_fields with three fields and with a single field.
    """
    # Each case is (fields to select, input records, expected records)
    cases = [
        (['user_id', 'offering_id', 'overall_rating'],
         reviews_matrix_5, reviews_matrix_5_short),
        (['user_id'], reviews_matrix_5_short, reviews_matrix_5_users),
    ]

    for select_fields, source_records, expected in cases:
        result = ETLUtils.select_fields(select_fields, source_records)
        self.assertEqual(result, expected)
def parallel_run_topn_test(records_file, recommenders, binary_reviews_file,
                           reviews_type=None):
    """
    Runs the top N evaluation of every recommender in a pool of worker
    processes and saves a tab-separated CSV file with the results.

    :param records_file: the file with the records
    :param recommenders: the recommenders to evaluate
    :param binary_reviews_file: pickle file with the reviews; after the
    users with few reviews are removed, it must contain as many reviews as
    there are records
    :param reviews_type: passed through to the evaluation and stored in the
    results
    :return: the list with the results of each recommender
    :raise ValueError: if the number of records and reviews differ
    """
    records = context_recommender_tests.load_records(records_file)
    records = extractor.remove_users_with_low_reviews(records, 20)
    with open(binary_reviews_file, 'rb') as read_file:
        binary_reviews = pickle.load(read_file)

    if len(records) != len(binary_reviews):
        raise ValueError("The records and reviews should have the same length")

    num_folds = 5
    split = 0.986
    top_n = 10
    min_like_score = 5.0

    # Only the recommender changes between the argument tuples
    args = itertools.product(
        [records], recommenders, [top_n], [num_folds], [split],
        [min_like_score], [binary_reviews], [reviews_type])

    print('Total recommenders: %d' % (len(recommenders)))

    pool = Pool()

    print('Total CPUs: %d' % pool._processes)

    try:
        results_list = pool.map(run_topn_test_wrapper, args)
    except BaseException:
        # A failed or interrupted run must not leave worker processes alive
        pool.terminate()
        pool.join()
        raise
    pool.close()
    pool.join()

    # After we have finished executing, we process the results
    dataset_info_map = {}
    dataset_info_map['dataset'] = records_file.split('/')[-1]
    dataset_info_map['cache_reviews'] = binary_reviews_file.split('/')[-1]
    dataset_info_map['num_records'] = len(records)
    dataset_info_map['reviews_type'] = reviews_type
    dataset_info_map['cross_validation_folds'] = num_folds
    dataset_info_map['min_like_score'] = min_like_score
    dataset_info_map['top_n'] = top_n

    results_log_list = []
    for recommender, results in zip(recommenders, results_list):
        results_log_list.append(
            context_recommender_tests.process_topn_results(
                recommender, results, dataset_info_map))

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    file_name = 'recommender-topn-results-parallel' + timestamp

    ETLUtils.save_csv_file(
        file_name + '.csv', results_log_list, TOPN_HEADERS, '\t')

    return results_list
def main():
    """
    Tests every combination of resampler and classifier for each value of
    the maximum number of sentences, prints the results and saves them to a
    CSV file.
    """
    topic_model_creator.plant_seeds()

    my_resamplers = [
        None,
        'random_over_sampler',
        'smote_regular',
        'smote_bl1',
        'smote_bl2',
        'smote_tomek',
        'smoteenn'
    ]

    my_classifiers = [
        DummyClassifier(strategy='most_frequent', random_state=0),
        DummyClassifier(strategy='stratified', random_state=0),
        DummyClassifier(strategy='uniform', random_state=0),
        DummyClassifier(strategy='constant', random_state=0, constant=True),
        LogisticRegression(C=100),
        SVC(C=1.0, kernel='rbf', probability=True),
        SVC(C=1.0, kernel='linear', probability=True),
        KNeighborsClassifier(n_neighbors=10),
        tree.DecisionTreeClassifier(),
        NuSVC(probability=True),
        RandomForestClassifier(n_estimators=100)
    ]

    max_sentences_list = [None, 1]

    total_cycles = \
        len(my_resamplers) * len(my_classifiers) * len(max_sentences_list)
    current_cycle = 1

    results_list = []

    for sentence_limit in max_sentences_list:

        # The records are reloaded and transformed once per sentence limit
        Constants.MAX_SENTENCES = sentence_limit
        my_records = load_records()
        preprocess_records(my_records)
        x_matrix, y_vector = transform(my_records)

        count_specific_generic(my_records)

        combinations = itertools.product(my_resamplers, my_classifiers)
        for resampler, classifier in combinations:

            print('Cycle %d/%d' % (current_cycle, total_cycles))

            results_list.append(
                test_classifier(x_matrix, y_vector, resampler, classifier))
            current_cycle += 1

    for classification_results in results_list:
        print(classification_results)

    csv_file = Constants.DATASET_FOLDER + Constants.ITEM_TYPE +\
        '_sentence_classifier_results.csv'
    ETLUtils.save_csv_file(csv_file, results_list, results_list[0].keys())
def drop_unnecessary_fields(self):
    """
    Drops the text and the POS tags fields from self.records.
    """
    timestamp = time.strftime("%Y/%m/%d-%H:%M:%S")
    print('%s: drop unnecessary fields' % timestamp)

    fields_to_drop = [
        Constants.TEXT_FIELD,
        Constants.POS_TAGS_FIELD,
        # Constants.BOW_FIELD
    ]

    ETLUtils.drop_fields(fields_to_drop, self.records)
def run_top_n_test(
        records_file, recommenders, binary_reviews_file, reviews_type=None):
    """
    Evaluates the recall in top N of every recommender sequentially and
    saves a tab-separated CSV file with the results.

    :param records_file: the file with the records, read with load_records
    :param recommenders: the recommenders to evaluate
    :param binary_reviews_file: pickle file with the reviews; it must contain
    as many reviews as there are records
    :param reviews_type: passed through to the evaluation and stored in the
    results
    :raise ValueError: if the number of records and reviews differ
    """
    records = load_records(records_file)
    # records = extractor.remove_users_with_low_reviews(records, 2)
    with open(binary_reviews_file, 'rb') as read_file:
        binary_reviews = pickle.load(read_file)

    if len(records) != len(binary_reviews):
        raise ValueError("The records and reviews should have the same length")

    num_folds = 5
    split = 0.986
    min_like_score = 5.0
    top_n = 10

    dataset_info_map = {}
    dataset_info_map['dataset'] = records_file.split('/')[-1]
    dataset_info_map['cache_reviews'] = binary_reviews_file.split('/')[-1]
    dataset_info_map['num_records'] = len(records)
    dataset_info_map['reviews_type'] = reviews_type
    dataset_info_map['cross_validation_folds'] = num_folds
    dataset_info_map['min_like_score'] = min_like_score
    dataset_info_map['top_n'] = top_n

    results_list = []
    results_log_list = []
    total = len(recommenders)

    print('Total recommenders: %d' % total)

    for count, recommender in enumerate(recommenders):

        print('\n**************\nProgress: %d/%d\n**************' %
              (count, total))
        print(get_knn_recommender_info(recommender))

        results = precision_in_top_n.calculate_recall_in_top_n(
            records, recommender, top_n, num_folds, split, min_like_score,
            binary_reviews, reviews_type)
        results_list.append(results)

        # count + 1 recommenders are done at this point. The original used
        # count, which overestimated the remaining work by one recommender
        recommenders_left = total - (count + 1)
        remaining_time = \
            results['Execution time'] * recommenders_left / 3600.0
        print('Estimated remaining time: %.2f hours' % remaining_time)

    for recommender, results in zip(recommenders, results_list):
        results_log_list.append(
            process_topn_results(recommender, results, dataset_info_map))

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    file_name = 'recommender-topn-results' + timestamp

    ETLUtils.save_csv_file(
        file_name + '.csv', results_log_list, TOPN_HEADERS, '\t')
def drop_unnecessary_fields(self):
    """
    Drops the text, POS tags, votes and bag-of-words fields from
    self.records.
    """
    timestamp = time.strftime("%Y/%m/%d-%H:%M:%S")
    print('%s: drop unnecessary fields' % timestamp)

    fields_to_drop = [
        Constants.TEXT_FIELD,
        Constants.POS_TAGS_FIELD,
        Constants.VOTES_FIELD,
        Constants.BOW_FIELD,
    ]

    ETLUtils.drop_fields(fields_to_drop, self.records)
def load_data(json_file):
    """
    Loads the records of a JSON file keeping only the user, business and
    stars fields, and renames the last two.

    :param json_file: the path of the JSON file with the records
    :return: the records with the fields 'user_id', 'overall_rating' and
    'offering_id'
    """
    loaded_records = ETLUtils.select_fields(
        ['user_id', 'business_id', 'stars'],
        ETLUtils.load_json_file(json_file))

    # The fields are renamed to take advantage of the function
    # extractor.get_user_average_overall_rating
    renamed_fields = (
        ('stars', 'overall_rating'),
        ('business_id', 'offering_id'),
    )
    for loaded_record in loaded_records:
        for old_name, new_name in renamed_fields:
            loaded_record[new_name] = loaded_record.pop(old_name)

    return loaded_records
def load(self):
    """
    Loads the processed records into self.original_records and creates the
    user-item map file when it does not exist yet.
    """
    print('load: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    self.original_records = ETLUtils.load_json_file(
        Constants.PROCESSED_RECORDS_FILE)
    print('num_records: %d' % len(self.original_records))

    if os.path.exists(Constants.USER_ITEM_MAP_FILE):
        return

    # The map is built from the records file, not from the processed records
    source_records = ETLUtils.load_json_file(Constants.RECORDS_FILE)
    user_item_map = create_user_item_map(source_records)
    with open(Constants.USER_ITEM_MAP_FILE, 'wb') as map_file:
        pickle.dump(user_item_map, map_file, pickle.HIGHEST_PROTOCOL)
def load_data(json_file):
    """
    Loads the records of a JSON file keeping only the user, business, stars,
    text and review ID fields, and renames the business and stars fields.

    :param json_file: the path of the JSON file with the records
    :return: the records, where 'stars' has become 'overall_rating' and
    'business_id' has become 'offering_id'
    """
    wanted_fields = ['user_id', 'business_id', 'stars', 'text', 'review_id']
    loaded_records = ETLUtils.select_fields(
        wanted_fields, ETLUtils.load_json_file(json_file))

    # The fields are renamed to take advantage of the function
    # extractor.get_user_average_overall_rating
    renamed_fields = (
        ('stars', 'overall_rating'),
        ('business_id', 'offering_id'),
    )
    for loaded_record in loaded_records:
        for old_name, new_name in renamed_fields:
            loaded_record[new_name] = loaded_record.pop(old_name)

    return loaded_records
def remove_reviews_from_classifier_training_set(self):
    """
    Filters out of self.records every record whose review ID appears in the
    classified records file, i.e. the training set of the reviews classifier.
    """
    training_review_ids = set()
    for training_record in ETLUtils.load_json_file(
            Constants.CLASSIFIED_RECORDS_FILE):
        training_review_ids.add(training_record[Constants.REVIEW_ID_FIELD])

    self.records = ETLUtils.filter_out_records(
        self.records, Constants.REVIEW_ID_FIELD, training_review_ids)
def export_as_predefined_context(self):
    """
    Exports the records to CSV_FILE with one binary column per predefined
    context category plus a 'context:na' column, and copies the file to the
    workspace.

    A category column is 1 when find_predefined_context reports the category
    for the bag of words of the record. 'context:na' is 1 only when no
    category was found. When CSV_FILE already exists it is not rebuilt, only
    copied.
    """
    print('%s: exporting to CARSKit ratings binary format with context as '
          'predefined context' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    if os.path.exists(CSV_FILE):
        print('Binary ratings file already exists')
        copy_to_workspace(CSV_FILE)
        return

    context_categories = utilities.context_words[Constants.ITEM_TYPE].keys()
    context_headers = [
        'context:%s' % category for category in context_categories]

    new_records = []

    for record in self.records:

        exported = {
            Constants.USER_ID_FIELD: record[Constants.USER_INTEGER_ID_FIELD],
            Constants.ITEM_ID_FIELD: record[Constants.ITEM_INTEGER_ID_FIELD],
            Constants.RATING_FIELD: record[Constants.RATING_FIELD],
        }

        categories_in_review = \
            find_predefined_context(record[Constants.BOW_FIELD])

        any_category = False
        for category in context_categories:
            is_present = category in categories_in_review
            exported['context:' + category] = 1 if is_present else 0
            any_category = any_category or is_present

        exported['context:na'] = 0 if any_category else 1

        new_records.append(exported)

    headers = [
        Constants.USER_ID_FIELD,
        Constants.ITEM_ID_FIELD,
        Constants.RATING_FIELD,
        'context:na'
    ]
    headers.extend(context_headers)

    ETLUtils.save_csv_file(CSV_FILE, new_records, headers)
    copy_to_workspace(CSV_FILE)
def run_rmse_test(records_file, recommenders, binary_reviews_file,
                  reviews_type=None):
    """
    Performs cross validation with every recommender, one after the other,
    and saves a tab-separated CSV file with the results.

    :param records_file: the file with the records, read with load_records
    :param recommenders: the recommenders to evaluate
    :param binary_reviews_file: pickle file with the reviews; it must contain
    as many reviews as there are records
    :param reviews_type: passed through to the evaluation and stored in the
    results
    :raise ValueError: if the number of records and reviews differ
    """
    records = load_records(records_file)
    # records = extractor.remove_users_with_low_reviews(records, 2)
    with open(binary_reviews_file, 'rb') as read_file:
        binary_reviews = pickle.load(read_file)

    if len(records) != len(binary_reviews):
        raise ValueError("The records and reviews should have the same length")

    num_folds = 5

    dataset_info_map = {}
    dataset_info_map['dataset'] = records_file.split('/')[-1]
    dataset_info_map['cache_reviews'] = binary_reviews_file.split('/')[-1]
    dataset_info_map['num_records'] = len(records)
    dataset_info_map['reviews_type'] = reviews_type
    dataset_info_map['cross_validation_folds'] = num_folds

    results_list = []
    num_recommenders = len(recommenders)

    print('Total recommenders: %d' % num_recommenders)

    for count, recommender in enumerate(recommenders):

        print('\n**************\n%d/%d\n**************' %
              (count, num_recommenders))
        results = recommender_evaluator.perform_cross_validation(
            records, recommender, num_folds, binary_reviews, reviews_type)
        results_list.append(results)

        # The recommender that just finished is not pending any more. The
        # original estimate counted it, so it was one recommender too high
        pending = num_recommenders - (count + 1)
        remaining_time = results['Execution time'] * pending / 3600.0
        print('Estimated remaining time: %.2f hours' % remaining_time)

    results_log_list = [
        process_rmse_results(recommender, results, dataset_info_map)
        for recommender, results in zip(recommenders, results_list)
    ]

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    file_name = 'recommender-rmse-results' + timestamp

    ETLUtils.save_csv_file(
        file_name + '.csv', results_log_list, RMSE_HEADERS, '\t')
def test_drop_fields(self):
    """
    Checks that ETLUtils.drop_fields removes the individual rating fields,
    leaving records equal to reviews_matrix_5_short.
    """
    import copy

    drop_fields = [
        'cleanliness_rating', 'location_rating', 'rooms_rating',
        'service_rating', 'value_rating'
    ]

    # drop_fields returns nothing here, so it presumably edits the records
    # in place. A deep copy keeps the shared fixture intact for the other
    # tests; list() alone would still share the dictionaries.
    test_list = copy.deepcopy(list(reviews_matrix_5))
    ETLUtils.drop_fields(drop_fields, test_list)
    self.assertEqual(reviews_matrix_5_short, test_list)

    # NOTE(review): this second check compares a copy with its source, so it
    # can not fail -- a call to ETLUtils.drop_fields may be missing here
    test_list = list(reviews_matrix_5_short)
    self.assertEqual(reviews_matrix_5_short, test_list)
def lemmatize_records(self):
    """
    Lemmatizes self.records according to Constants.DOCUMENT_LEVEL and caches
    the result in the lemmatized records file.

    When the cache file already exists the records are loaded from it
    instead. The document level can be 'review', 'sentence' or an integer;
    the last two are lemmatized sentence by sentence. With any other value
    the records are saved without being lemmatized.
    """
    # Imported here so that the block does not depend on the imports of the
    # file. numbers.Integral covers int, and long on Python 2, while the
    # name 'long' used before does not exist on Python 3.
    import numbers

    if os.path.exists(Constants.LEMMATIZED_RECORDS_FILE):
        print('Records were already lemmatized')
        self.records = \
            ETLUtils.load_json_file(Constants.LEMMATIZED_RECORDS_FILE)
        return

    document_level = Constants.DOCUMENT_LEVEL
    if document_level == 'review':
        self.records = self.lemmatize_reviews(self.records)
    elif document_level == 'sentence' or\
            isinstance(document_level, numbers.Integral):
        self.records = self.lemmatize_sentences(self.records)

    ETLUtils.save_json_file(Constants.LEMMATIZED_RECORDS_FILE, self.records)
def train_topic_model(self, cycle_index, fold_index):
    """
    Creates the topic model for the training records, stores its context
    rich topics in self.context_rich_topics and saves them to a JSON file.

    :param cycle_index: the index of the running cycle
    :param fold_index: the index of the cross validation fold
    :return: the context extractor returned by
    topic_model_creator.create_topic_model
    """
    context_extractor = topic_model_creator.create_topic_model(
        self.train_records, cycle_index, fold_index)
    self.context_rich_topics = context_extractor.context_rich_topics

    topics_file_path = Constants.generate_file_name(
        'context_topics', 'json', Constants.CACHE_FOLDER,
        cycle_index, fold_index, True)
    # The topics are wrapped in a one-element list holding a dictionary
    topics_as_records = [dict(self.context_rich_topics)]
    ETLUtils.save_json_file(topics_file_path, topics_as_records)

    finished_at = time.strftime("%Y/%m/%d-%H:%M:%S")
    print('Trained Context Extractor: %s' % finished_at)

    return context_extractor
def export_as_top_word(self):
    """
    Exports the records to CSV_FILE using the top term of each topic as a
    binary context column, plus a 'context:na' column, and copies the file
    to the workspace.

    A context column is 1 when the weight of its topic in the record is
    greater than zero. 'context:na' is 1 only when none of the topics of the
    record has a weight greater than zero. When CSV_FILE already exists it
    is not rebuilt, only copied.
    """
    print('%s: exporting to CARSKit ratings binary format with context as '
          'top words' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    if os.path.exists(CSV_FILE):
        print('Binary ratings file already exists')
        copy_to_workspace(CSV_FILE)
        return

    new_records = []
    topic_model_string = self.topic_extractor.print_topic_model()
    top_terms = [get_topic_terms(topic) for topic in topic_model_string]
    # Only the first term of every topic is used as the name of its column
    context_headers = ['context:%s' % term[0] for term in top_terms]

    for record in self.records:

        new_record = {
            Constants.USER_ID_FIELD: record[Constants.USER_INTEGER_ID_FIELD],
            Constants.ITEM_ID_FIELD: record[Constants.ITEM_INTEGER_ID_FIELD],
            Constants.RATING_FIELD: record[Constants.RATING_FIELD],
        }

        topics = record[self.topics_field]
        context_found = False

        for topic in topics:
            topic_index = topic[0]
            topic_weight = topic[1]

            context_key = context_headers[topic_index]
            context_value = 1 if topic_weight > 0.0 else 0

            new_record[context_key] = context_value
            # The flag was never updated before, which made 'context:na'
            # equal to 1 even for the records that do have context
            if context_value:
                context_found = True

        context_na_value = 0 if context_found else 1
        new_record['context:na'] = context_na_value

        new_records.append(new_record)

    headers = [
        Constants.USER_ID_FIELD,
        Constants.ITEM_ID_FIELD,
        Constants.RATING_FIELD,
        'context:na'
    ]
    headers.extend(context_headers)
    ETLUtils.save_csv_file(CSV_FILE, new_records, headers)
    copy_to_workspace(CSV_FILE)
def classify_reviews(self):
    """
    Trains a ReviewsClassifier with the classified records and uses it to
    label self.records.

    When the document level is not 'review', the training records are
    limited to the sentences whose index is below the document level (no
    limit for 'sentence') and their 'specific' field is derived from their
    'sentence_type' field.
    """
    print('%s: classify reviews' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    print(Constants.CLASSIFIED_RECORDS_FILE)
    training_records = ETLUtils.load_json_file(
        Constants.CLASSIFIED_RECORDS_FILE)

    # The document level can be 'review', 'sentence' or an int
    max_sentence_index = Constants.DOCUMENT_LEVEL
    if max_sentence_index != 'review':

        # 'sentence' means that every sentence is used
        if max_sentence_index == 'sentence':
            max_sentence_index = float("inf")

        training_records = [
            sentence for sentence in training_records
            if sentence['sentence_index'] < max_sentence_index
        ]
        for sentence in training_records:
            is_specific = sentence['sentence_type'] == 'specific'
            sentence['specific'] = 'yes' if is_specific else 'no'
        print('num training records', len(training_records))

    training_records = self.lemmatize_reviews(training_records)

    classifier = ReviewsClassifier(self.classifier, self.resampler)
    classifier.train(training_records)
    classifier.label_json_reviews(self.records)
def calculate_top_n_precision(reviews, recommender, n, min_score, num_folds):
    """
    Calculates the average top N precision of a recommender with cross
    validation.

    In every fold the recommender is loaded with the training reviews and
    the precision is calculated for each of its users; the users for which
    the precision is None are skipped.

    :param reviews: the reviews that are split into train and test sets
    :param recommender: the recommender to evaluate
    :param n: passed to calculate_recommender_precision
    :param min_score: passed to calculate_recommender_precision
    :param num_folds: the number of cross validation folds
    :return: a dictionary with the precision ('Top N') and the time that the
    evaluation took in seconds ('Execution time')
    :raise ZeroDivisionError: if the precision could not be calculated for
    any user
    """
    start_time = time.time()
    split = 1 - (1 / float(num_folds))
    total_precision = 0.
    num_cycles = 0

    # range instead of xrange, so that this also runs on Python 3
    for i in range(num_folds):
        print('Fold', i)
        start = float(i) / num_folds
        train, test = ETLUtils.split_train_test(
            reviews, split=split, start=start)
        recommender.load(train)

        for user_id in recommender.user_ids:
            precision = calculate_recommender_precision(
                test, user_id, recommender, n, min_score)

            if precision is not None:
                total_precision += precision
                num_cycles += 1

    final_precision = total_precision / num_cycles
    execution_time = time.time() - start_time

    print('Final Top N Precision: %f' % final_precision)
    print("--- %s seconds ---" % execution_time)

    result = {
        'Top N': final_precision,
        'Execution time': execution_time
    }

    return result
def main():
    """
    Clusters the pickled restaurant reviews and splits the classified
    records by cluster label into specific and generic records.

    The file paths are hard-coded and the two resulting lists are not used
    any further in this function.
    """
    # my_file = '/Users/fpena/UCC/Thesis/datasets/context/classified_hotel_reviews.json'
    my_file = '/Users/fpena/UCC/Thesis/datasets/context/classified_restaurant_reviews.json'
    my_records = ETLUtils.load_json_file(my_file)

    # binary_reviews_file = '/Users/fpena/UCC/Thesis/datasets/context/classified_hotel_reviews.pkl'
    binary_reviews_file = '/Users/fpena/UCC/Thesis/datasets/context/classified_restaurant_reviews.pkl'

    with open(binary_reviews_file, 'rb') as reviews_input:
        my_reviews = pickle.load(reviews_input)

    cluster_labels = cluster_reviews(my_reviews)
    # Index 0 is taken as the specific records and index 1 as the generic
    specific_records = split_list_by_labels(my_records, cluster_labels)[0]
    generic_records = split_list_by_labels(my_records, cluster_labels)[1]
def main():
    """
    Creates a topic model according to the command line arguments.

    Without the cycle and fold arguments a topic model is created with the
    processed records (only with their first half when
    Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS is set). Otherwise the
    topic model of the given cycle and fold is created. The optional number
    of topics overrides the value in the properties.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-c', '--cycle', metavar='int', type=int,
        nargs=1, help='The index of the running cycle')
    parser.add_argument(
        '-f', '--fold', metavar='int', type=int,
        nargs=1, help='The index of the cross validation fold')
    parser.add_argument(
        '-t', '--numtopics', metavar='int', type=int,
        nargs=1, help='The number of topics of the topic model')

    args = parser.parse_args()
    # nargs=1 yields one-element lists, or None when the option is absent
    fold = args.fold[0] if args.fold is not None else None
    cycle = args.cycle[0] if args.cycle is not None else None
    num_topics = args.numtopics[0] if args.numtopics is not None else None

    if num_topics is not None:
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics})

    if fold is None and cycle is None:
        records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

        if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
            num_records = len(records)
            # Floor division: '/' gives a float on Python 3, which can not
            # be used as a slice index
            records = records[:num_records // 2]
        print('num_reviews', len(records))

        create_topic_model(records, None, None)
    else:
        create_single_topic_model(cycle, fold)
def initialize_cluster_users(reviews, significant_criteria_ranges=None):
    """
    Creates one User object for every user ID found in the reviews and fills
    it with the user's average overall rating, criteria weights, cluster,
    item ratings and item multi-ratings. All of these values are calculated
    using only the reviews written by that user.

    :param reviews: the list of reviews
    :param significant_criteria_ranges: value that is passed unchanged to
    get_significant_criteria when the cluster of each user is determined
    :return: a dictionary in which the keys are the users' IDs and the values
    are the initialized User objects
    """
    users_by_id = {}

    for current_id in get_groupby_list(reviews, 'user_id'):
        user = User(current_id)
        # The reviews are filtered here once, which is why every helper below
        # is called without asking it to filter again
        own_reviews = ETLUtils.filter_records(reviews, 'user_id', [current_id])

        user.average_overall_rating = get_user_average_overall_rating(
            own_reviews, current_id, apply_filter=False)
        user.criteria_weights = get_criteria_weights(
            own_reviews, current_id, apply_filter=False)

        # Only the second element of the returned pair is kept
        _, user.cluster = get_significant_criteria(
            user.criteria_weights, significant_criteria_ranges)

        user.item_ratings = get_user_item_ratings(own_reviews, current_id)
        user.item_multi_ratings = get_user_item_multi_ratings(
            own_reviews, current_id)

        users_by_id[current_id] = user

    # print('Total users: %i' % len(user_ids))

    return users_by_id
def calculate_sparsity(self):
    """
    Returns the rate of missing ratings in the list of reviews of this
    ReviewsDatasetAnalyzer. The result is a fraction (not a percentage
    multiplied by 100).

    :return: the rate of missing ratings
    (i.e. number of missing ratings / (number of items * number of users))
    :raise ValueError: in case an empty list is given
    """
    if not self.reviews:
        raise ValueError("Can not determine the sparsity for an empty list")

    user_ids = extractor.get_groupby_list(self.reviews, "user_id")
    item_ids = extractor.get_groupby_list(self.reviews, "offering_id")

    # The set of all the items is the same for every user, so it is built
    # only once instead of once per iteration
    all_items = set(item_ids)

    # Kept as a float so that the final division is never an integer division
    non_missing_reviews = 0.0
    total_expected_reviews = len(user_ids) * len(item_ids)

    for user in user_ids:
        user_reviews = ETLUtils.filter_records(self.reviews, "user_id", [user])
        user_items = extractor.get_groupby_list(user_reviews, "offering_id")
        non_missing_reviews += len(all_items.intersection(user_items))

    return 1 - non_missing_reviews / total_expected_reviews
def get_user_item_ratings(reviews, user_id, apply_filter=False):
    """
    Returns a dictionary that contains the items that the given user has rated,
    where the key of the dictionary is the ID of the item and the value is the
    rating that user_id has given to that item. If the user has rated the same
    item several times, the value is the mean of those ratings.

    :param reviews: a list of reviews
    :param user_id: the ID of the user
    :param apply_filter: a boolean that indicates if the reviews have to be
    filtered by user_id or not. In other words, this boolean indicates if the
    list contains reviews from several users or not. If it does contain
    reviews from other users, those have to be removed
    :return: a dictionary with the items that the given user has rated, or an
    empty dictionary if there are no reviews for the user
    """
    if apply_filter:
        user_reviews = ETLUtils.filter_records(reviews, 'user_id', [user_id])
    else:
        user_reviews = reviews

    if not user_reviews:
        return {}

    # Only the rating column is averaged. Taking the mean of the whole data
    # frame raises a TypeError in pandas >= 2.0 when the reviews contain
    # non-numeric columns (such as the user ID or the text of the review)
    mean_ratings = \
        DataFrame(user_reviews).groupby('offering_id')['overall_rating'].mean()

    return dict(zip(mean_ratings.index.tolist(), mean_ratings.tolist()))
def dataset_bucket_analysis_by_field(field):
    """
    Prints and plots how the processed records of the 'fourcity_hotel' dataset
    are distributed over the values of the given field.

    The function prints the number of records, the number of distinct values
    of the field, the (up to) three most frequent values and the summary
    returned by ReviewsDatasetAnalyzer.summarize_reviews_by_field, which is
    also plotted.

    :param field: the key of the records by which the analysis is done
    """
    # Set the dataset
    hotel_dataset_properties = {Constants.BUSINESS_TYPE_FIELD: 'fourcity_hotel'}
    Constants.update_properties(hotel_dataset_properties)

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
    print('Loaded %d records' % len(records))

    # Count how many records there are for each value of the field
    frequency_map = {}
    for record in records:
        value = record[field]
        frequency_map[value] = frequency_map.get(value, 0) + 1

    print('There is a total of %d %ss' % (len(frequency_map), field))
    sorted_x = sorted(
        frequency_map.items(), key=operator.itemgetter(1), reverse=True)
    # A slice is used so that datasets with fewer than three distinct values
    # don't raise an IndexError
    for entry in sorted_x[:3]:
        print(entry)
    # print(user_frequency_map)

    # Number of reviews per user
    rda = ReviewsDatasetAnalyzer(records)
    users_summary = rda.summarize_reviews_by_field(field)
    # NOTE(review): the divisor is rda.num_users regardless of the field that
    # was given, presumably this is only right when field is the user ID --
    # confirm against ReviewsDatasetAnalyzer
    print('Average number of reviews per %s: %f' %
          (field, float(rda.num_reviews) / rda.num_users))
    users_summary.plot(kind='line', rot=0)

    # Temporarily lift the row limit so that the whole summary is printed
    pandas.set_option('display.max_rows', len(users_summary))
    print(users_summary)
    pandas.reset_option('display.max_rows')
def classify_reviews(self):
    """
    Trains a ReviewsClassifier with the manually classified records of the
    current dataset and uses it to label the records of this object.

    The training records are read from the 'classified_<item type>_reviews'
    JSON file in the dataset folder (with the '_sentences' suffix when
    Constants.MAX_SENTENCES is set, in which case only the records whose
    sentence index is below that limit are kept).
    """
    print('%s: classify reviews' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    sentence_limit = Constants.MAX_SENTENCES
    if sentence_limit is None:
        suffix = ''
    else:
        suffix = '_sentences'

    labeled_records_file = ''.join([
        Constants.DATASET_FOLDER, 'classified_', Constants.ITEM_TYPE,
        '_reviews', suffix, '.json'])
    labeled_records = ETLUtils.load_json_file(labeled_records_file)

    if sentence_limit is not None:
        kept_records = []
        for labeled_record in labeled_records:
            if labeled_record['sentence_index'] < sentence_limit:
                kept_records.append(labeled_record)
        labeled_records = kept_records

    # The classifier is trained with a yes/no 'specific' field, which is
    # derived here from the sentence type of each record
    for labeled_record in labeled_records:
        if labeled_record['sentence_type'] == 'specific':
            labeled_record['specific'] = 'yes'
        else:
            labeled_record['specific'] = 'no'
    print('num training records', len(labeled_records))

    self.lemmatize_reviews(labeled_records)

    reviews_classifier = ReviewsClassifier(self.classifier, self.resampler)
    reviews_classifier.train(labeled_records)
    reviews_classifier.label_json_reviews(self.records)
def calculate_sparsity(self):
    """
    Returns the rate of missing ratings in the list of reviews of this
    ReviewsDatasetAnalyzer. The result is a fraction (not a percentage
    multiplied by 100).

    :return: the rate of missing ratings
    (i.e. number of missing ratings / (number of items * number of users))
    :raise ValueError: in case an empty list is given
    """
    if not self.reviews:
        raise ValueError(
            'Can not determine the sparsity for an empty list')

    user_ids = extractor.get_groupby_list(
        self.reviews, Constants.USER_ID_FIELD)
    item_ids = extractor.get_groupby_list(
        self.reviews, Constants.ITEM_ID_FIELD)

    # The set of all the items is the same for every user, so it is built
    # only once instead of once per iteration
    all_items = set(item_ids)

    # Kept as a float so that the final division is never an integer division
    non_missing_reviews = 0.
    total_expected_reviews = len(user_ids) * len(item_ids)

    for user in user_ids:
        user_reviews = ETLUtils.filter_records(
            self.reviews, Constants.USER_ID_FIELD, [user])
        user_items = extractor.get_groupby_list(
            user_reviews, Constants.ITEM_ID_FIELD)
        non_missing_reviews += len(all_items.intersection(user_items))

    return 1 - non_missing_reviews / total_expected_reviews
def load(self):
    """
    Loads the original records (JSON) and the original reviews (pickle) into
    this object, copies the ID and the rating of every record into its
    matching review and, in case the user-item map file doesn't exist yet,
    creates it.
    """
    print('load: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    self.original_records = ETLUtils.load_json_file(Constants.RECORDS_FILE)
    with open(Constants.REVIEWS_FILE, 'rb') as reviews_file:
        self.original_reviews = pickle.load(reviews_file)
    print('num_records: %d' % len(self.original_records))

    # Records and reviews are matched by position; zip stops at the end of
    # the shorter list
    record_review_pairs = zip(self.original_records, self.original_reviews)
    for json_record, review_object in record_review_pairs:
        review_object.id = json_record[Constants.REVIEW_ID_FIELD]
        review_object.rating = json_record[Constants.RATING_FIELD]

    map_file_missing = not os.path.exists(Constants.USER_ITEM_MAP_FILE)
    if map_file_missing:
        # NOTE(review): the records file is read a second time here instead of
        # reusing self.original_records -- confirm whether
        # create_user_item_map needs its own copy of the records
        fresh_records = ETLUtils.load_json_file(Constants.RECORDS_FILE)
        user_item_map = create_user_item_map(fresh_records)
        with open(Constants.USER_ITEM_MAP_FILE, 'wb') as map_file:
            pickle.dump(user_item_map, map_file, pickle.HIGHEST_PROTOCOL)
def get_ml_100K_dataset(
        records_file='/Users/fpena/tmp/bpmf/ml-100k.csv', delimiter='\t'):
    """
    Loads the records of a delimited text file and converts the
    'overall_rating' field of every record into a float.

    The defaults load the same file as before; other files (for instance
    '/Users/fpena/tmp/bpmf/ml-1k.csv' or
    '/Users/fpena/UCC/Thesis/datasets/uncompressed/ml-100k.csv') can be loaded
    by passing their path instead of editing this function.

    :param records_file: the path of the file that contains the records
    :param delimiter: the delimiter that is passed to ETLUtils.load_csv_file
    :return: the records returned by ETLUtils.load_csv_file, with their
    'overall_rating' converted to float
    """
    records = ETLUtils.load_csv_file(records_file, delimiter)

    for record in records:
        record['overall_rating'] = float(record['overall_rating'])

    return records