def main():
    """
    Trains a ReviewsClassifier with the manually classified reviews of the
    selected dataset, uses it to label the shuffled Yelp reviews of that
    dataset and saves the labeled records to a JSON file.

    NOTE(review): the review objects are read with pickle.load, so the
    pickle files must come from a trusted source.
    """
    # Use 'hotel' here to process the hotel dataset instead
    dataset = 'restaurant'
    base_folder = '/Users/fpena/UCC/Thesis/datasets/context/'

    # Training data: JSON records plus the matching pickled reviews
    training_records = ETLUtils.load_json_file(
        base_folder + 'classified_' + dataset + '_reviews.json')
    training_reviews_path = \
        base_folder + 'classified_' + dataset + '_reviews.pkl'
    with open(training_reviews_path, 'rb') as pickle_file:
        training_reviews = pickle.load(pickle_file)

    reviews_classifier = ReviewsClassifier()
    reviews_classifier.train(training_records, training_reviews)

    # Data to label: the pickled reviews are read first, then the records
    input_reviews_path = \
        base_folder + 'reviews_' + dataset + '_shuffled.pkl'
    with open(input_reviews_path, 'rb') as pickle_file:
        input_reviews = pickle.load(pickle_file)
    input_records = ETLUtils.load_json_file(
        base_folder + 'yelp_training_set_review_' + dataset +
        's_shuffled.json')

    labeled_records = reviews_classifier.label_json_reviews(
        input_records, input_reviews)
    output_records_path = \
        base_folder + 'yelp_training_set_review_' + dataset + \
        's_shuffled_tagged.json'
    ETLUtils.save_json_file(output_records_path, labeled_records)
def main_evaluate():
    """
    Evaluates the stored predictions with a TopNEvaluator and prints the
    resulting recall

    :return: the recall reported by the TopNEvaluator
    """
    evaluator_i = my_i
    all_records = ETLUtils.load_json_file(RECORDS_FILE)
    held_out_records = ETLUtils.load_json_file(RECORDS_FILE + '_test')

    evaluator = TopNEvaluator(
        all_records, held_out_records, DATASET, 10, evaluator_i)
    evaluator.find_important_records()
    evaluator.load_records_to_predict(RECORDS_TO_PREDICT_FILE)

    # The predictions are read from a text file in the generated folder
    predictions = rmse_calculator.read_targets_from_txt(
        GENERATED_FOLDER + 'predictions_' + DATASET + '.txt')
    evaluator.evaluate(predictions)

    print('recall', evaluator.recall)
    return evaluator.recall
def load(self):
    """
    Loads the processed records into self.original_records and makes sure
    that the user-item map file exists, creating it from the records file
    when it is missing
    """
    print('load: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
    processed_records = \
        ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
    self.original_records = processed_records
    print('num_records: %d' % len(self.original_records))

    # Nothing else to do when the user-item map has already been created
    if os.path.exists(Constants.USER_ITEM_MAP_FILE):
        return

    map_source_records = ETLUtils.load_json_file(Constants.RECORDS_FILE)
    user_item_map = create_user_item_map(map_source_records)
    with open(Constants.USER_ITEM_MAP_FILE, 'wb') as map_file:
        pickle.dump(user_item_map, map_file, pickle.HIGHEST_PROTOCOL)
def dataset_bucket_analysis_by_field(field):
    """
    Prints how the records of the 'fourcity_hotel' dataset are distributed
    across the values of the given field: the number of distinct values, the
    most frequent values, the average number of reviews and the summary
    produced by ReviewsDatasetAnalyzer (which is also plotted)

    :param field: the name of the record field used to group the records
    """
    # Set the dataset
    hotel_dataset_properties = {Constants.BUSINESS_TYPE_FIELD: 'fourcity_hotel'}
    Constants.update_properties(hotel_dataset_properties)
    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
    print('Loaded %d records' % len(records))

    # Count how many records each value of the field has
    user_frequency_map = {}
    for record in records:
        user_id = record[field]
        user_frequency_map[user_id] = user_frequency_map.get(user_id, 0) + 1

    print('There is a total of %d %ss' % (len(user_frequency_map), field))
    sorted_x = sorted(
        user_frequency_map.items(), key=operator.itemgetter(1), reverse=True)
    # Show up to three of the most frequent values. Slicing keeps this safe
    # when the dataset has fewer than three distinct values
    for frequency_entry in sorted_x[:3]:
        print(frequency_entry)

    # Number of reviews per user
    rda = ReviewsDatasetAnalyzer(records)
    users_summary = rda.summarize_reviews_by_field(field)
    # NOTE(review): the average is always divided by rda.num_users, even when
    # `field` is not the user field -- confirm this is intended
    print('Average number of reviews per %s: %f' %
          (field, float(rda.num_reviews) / rda.num_users))
    users_summary.plot(kind='line', rot=0)

    # Temporarily lift the row limit so the whole summary is printed, and
    # restore the option even if printing fails
    pandas.set_option('display.max_rows', len(users_summary))
    try:
        print(users_summary)
    finally:
        pandas.reset_option('display.max_rows')
def main():
    """
    Command line entry point to create topic models.

    When neither --cycle nor --fold is given, a topic model is created with
    the processed records (only the first half of them when
    Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS is set). Otherwise a topic
    model is created for the given cycle and fold. The optional --numtopics
    argument overrides the number of topics in the Constants properties.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-c', '--cycle', metavar='int', type=int,
        nargs=1, help='The index of the running cycle')
    parser.add_argument(
        '-f', '--fold', metavar='int', type=int,
        nargs=1, help='The index of the cross validation fold')
    parser.add_argument(
        '-t', '--numtopics', metavar='int', type=int,
        nargs=1, help='The number of topics of the topic model')

    args = parser.parse_args()
    # nargs=1 stores every provided value inside a one-element list
    fold = args.fold[0] if args.fold is not None else None
    cycle = args.cycle[0] if args.cycle is not None else None
    num_topics = args.numtopics[0] if args.numtopics is not None else None

    # The per-fold model needs both indices; fail early with a usage message
    # instead of crashing later on arithmetic with None
    if (fold is None) != (cycle is None):
        parser.error('--cycle and --fold must be provided together')

    if num_topics is not None:
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics})

    if fold is None and cycle is None:
        records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

        if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
            num_records = len(records)
            # Floor division keeps the slice index an integer on both
            # Python 2 and Python 3
            records = records[:num_records // 2]
        print('num_reviews', len(records))

        create_topic_model(records, None, None)
    else:
        create_single_topic_model(cycle, fold)
def load(self):
    """
    Loads the records and their pickled reviews into self.original_records
    and self.original_reviews, copies the review ID and the rating of every
    record into its paired review, and creates the user-item map file when
    it does not exist yet.

    NOTE(review): the reviews are read with pickle.load, so the reviews file
    must come from a trusted source.
    """
    print('load: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))
    self.original_records = ETLUtils.load_json_file(Constants.RECORDS_FILE)
    with open(Constants.REVIEWS_FILE, 'rb') as reviews_file:
        self.original_reviews = pickle.load(reviews_file)
    print('num_records: %d' % len(self.original_records))

    # Records and reviews are paired by position
    paired_items = zip(self.original_records, self.original_reviews)
    for source_record, target_review in paired_items:
        target_review.id = source_record[Constants.REVIEW_ID_FIELD]
        target_review.rating = source_record[Constants.RATING_FIELD]

    # Nothing else to do when the user-item map has already been created
    if os.path.exists(Constants.USER_ITEM_MAP_FILE):
        return

    map_source_records = ETLUtils.load_json_file(Constants.RECORDS_FILE)
    user_item_map = create_user_item_map(map_source_records)
    with open(Constants.USER_ITEM_MAP_FILE, 'wb') as map_file:
        pickle.dump(user_item_map, map_file, pickle.HIGHEST_PROTOCOL)
def classify_reviews(self):
    """
    Trains a ReviewsClassifier with the manually classified records of the
    current dataset and uses it to label self.records.

    When Constants.MAX_SENTENCES is set, the sentence-level training file is
    used: only the sentences whose index is below that limit are kept and
    their 'specific' field is derived from their 'sentence_type' field.
    """
    print('%s: classify reviews' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    max_sentences = Constants.MAX_SENTENCES
    if max_sentences is None:
        file_name_suffix = ''
    else:
        file_name_suffix = '_sentences'
    training_records = ETLUtils.load_json_file(
        Constants.DATASET_FOLDER + 'classified_' + Constants.ITEM_TYPE +
        '_reviews' + file_name_suffix + '.json')

    if max_sentences is not None:
        kept_records = []
        for entry in training_records:
            if entry['sentence_index'] < max_sentences:
                kept_records.append(entry)
        training_records = kept_records

        for entry in training_records:
            if entry['sentence_type'] == 'specific':
                entry['specific'] = 'yes'
            else:
                entry['specific'] = 'no'
    print('num training records', len(training_records))

    # NOTE(review): the value returned by lemmatize_reviews is discarded
    # here -- confirm that it updates the records in place
    self.lemmatize_reviews(training_records)

    reviews_classifier = ReviewsClassifier(self.classifier, self.resampler)
    reviews_classifier.train(training_records)
    reviews_classifier.label_json_reviews(self.records)
def main():
    """
    Loads the classified restaurant records and their pickled reviews,
    clusters the reviews and splits the records by cluster label into
    specific and generic records.

    NOTE(review): the reviews are read with pickle.load, so the pickle file
    must come from a trusted source. The pickle is expected to exist
    already; it holds the reviews built from the text of the records.
    """
    # Use the 'classified_hotel_reviews' files to process the hotel dataset
    my_file = '/Users/fpena/UCC/Thesis/datasets/context/classified_restaurant_reviews.json'
    my_records = ETLUtils.load_json_file(my_file)

    binary_reviews_file = '/Users/fpena/UCC/Thesis/datasets/context/classified_restaurant_reviews.pkl'
    with open(binary_reviews_file, 'rb') as read_file:
        my_reviews = pickle.load(read_file)

    cluster_labels = cluster_reviews(my_reviews)
    # Split once and reuse the result instead of repeating the same call
    # for every group
    split_records = split_list_by_labels(my_records, cluster_labels)
    specific_records = split_records[0]
    generic_records = split_records[1]
def update_labeled_reviews_records():
    """
    Updates the 'specific' label of the classified records with the labels
    returned by compare_records(). The records whose review ID is not in
    that map are filtered out before the update.

    NOTE(review): the updated records are neither returned nor saved by this
    function.
    """
    reviews_label_map = compare_records()
    agreed_ids = set(reviews_label_map)

    classifier_records = \
        ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE)
    all_ids = set()
    for entry in classifier_records:
        all_ids.add(entry[Constants.REVIEW_ID_FIELD])
    non_agreed_ids = all_ids - agreed_ids

    print('number of records before: %d' % len(classifier_records))
    print(reviews_label_map)
    print(non_agreed_ids)

    # We remove from the classifier records the ones that don't have an
    # agreed label
    classifier_records = ETLUtils.filter_out_records(
        classifier_records, Constants.REVIEW_ID_FIELD, non_agreed_ids)

    # Finally we translate the agreed labels ('s' or 'g') into the values
    # of the 'specific' field and update the records
    label_to_specific = {'s': 'yes', 'g': 'no'}
    for entry in classifier_records:
        agreed_label = reviews_label_map[entry[Constants.REVIEW_ID_FIELD]]
        entry[Constants.SPECIFIC] = label_to_specific[agreed_label]

    print('number of records after: %d' % len(classifier_records))
def classify_reviews(self):
    """
    Trains a ReviewsClassifier with the manually classified records and uses
    it to label self.records.

    When Constants.DOCUMENT_LEVEL is not 'review' the training records are
    treated as sentences: only the ones whose index is below the document
    level are kept ('sentence' keeps all of them) and their 'specific' field
    is derived from their 'sentence_type' field.
    """
    print('%s: classify reviews' % time.strftime("%Y/%m/%d-%H:%M:%S"))
    print(Constants.CLASSIFIED_RECORDS_FILE)
    training_records = \
        ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE)

    # The document level can be 'review', 'sentence' or an int
    max_sentence_index = Constants.DOCUMENT_LEVEL
    if max_sentence_index != 'review':
        if max_sentence_index == 'sentence':
            # No limit: every sentence index is below infinity
            max_sentence_index = float("inf")

        kept_records = []
        for entry in training_records:
            if entry['sentence_index'] < max_sentence_index:
                kept_records.append(entry)
        training_records = kept_records

        for entry in training_records:
            if entry['sentence_type'] == 'specific':
                entry['specific'] = 'yes'
            else:
                entry['specific'] = 'no'
    print('num training records', len(training_records))

    training_records = self.lemmatize_reviews(training_records)

    reviews_classifier = ReviewsClassifier(self.classifier, self.resampler)
    reviews_classifier.train(training_records)
    reviews_classifier.label_json_reviews(self.records)
def get_categories(file_path):
    """
    Returns the keys that remain in the business records after the
    'categories' list has been transposed into columns and the unwanted
    fields have been dropped

    :param file_path: the path for the file that contains the businesses
    data
    :return: the keys of the first transformed record, or an empty list when
    the file contains no records
    """
    records = ETLUtils.load_json_file(file_path)

    # There is no first record to inspect in an empty file
    if not records:
        return []

    # Now we obtain the categories for all the businesses
    records = ETLUtils.add_transpose_list_column('categories', records)
    BusinessETL.drop_unwanted_fields(records)

    return records[0].keys()
def analyze_context_records():
    """
    Prints the number of classified records whose 'context_type' field is
    'context', followed by the text of each one of them
    """
    context_records = ETLUtils.filter_records(
        ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE),
        'context_type', ['context'])

    print('num records: %d' % len(context_records))

    for context_record in context_records:
        print(context_record[Constants.TEXT_FIELD])
def get_business_ids(file_path, business_type=None):
    """
    Returns the IDs of the businesses stored in the given file

    :param file_path: the path for the file that contains the businesses
    data
    :param business_type: when given, only the businesses that have this
    value among their 'categories' are included
    :return: a list with the 'business_id' of the selected businesses
    """
    business_ids = []

    for business in ETLUtils.load_json_file(file_path):
        # The categories are only inspected when a business type is given
        if business_type and business_type not in business['categories']:
            continue
        business_ids.append(business['business_id'])

    return business_ids
def generate_report_yelp_phoenix():
    """
    Generates the dataset analysis notebook for the filtered reviews of the
    Yelp Phoenix dataset
    """
    source_path = '/Users/fpena/UCC/Thesis/datasets/yelp_phoenix_academic_dataset/filtered_reviews.json'
    notebook_path = '/Users/fpena/UCC/Thesis/projects/yelp/notebooks/dataset_analysis_yelp_phoenix.ipynb'

    # Code snippet passed to the report generator: it loads the same reviews
    # file that is analyzed here
    loader_lines = [
        "file_path = '" + source_path + "'\n",
        "reviews = ETLUtils.load_json_file(file_path)\n",
    ]
    load_reviews_code = ''.join(loader_lines)

    ReviewsDatasetAnalyzerReport.generate_report(
        ETLUtils.load_json_file(source_path), 'Yelp Phoenix', notebook_path,
        load_reviews_code)
def generate_report_ruihai():
    """
    Generates the dataset analysis notebook for the cleaned reviews of the
    Ruihai TripAdvisor dataset
    """
    file_path = '/Users/fpena/UCC/Thesis/datasets/TripHotelReviewXml/cleaned_reviews.json'
    file_name = '/Users/fpena/UCC/Thesis/projects/yelp/notebooks/dataset_analysis_report_ruihai.ipynb'
    reviews = ETLUtils.load_json_file(file_path)

    # The snippet is built from file_path (as generate_report_yelp_phoenix
    # does) so the generated code always loads the same file analyzed here
    load_reviews_code =\
        'file_path = \'' + file_path + '\'\n' +\
        'reviews = ETLUtils.load_json_file(file_path)\n'

    ReviewsDatasetAnalyzerReport.generate_report(
        reviews, 'Ruihai TripAdvisor', file_name, load_reviews_code)
def generate_report_fourcity_filtered():
    """
    Generates the dataset analysis notebook for the filtered, non-sparse and
    shuffled reviews of the Fourcity TripAdvisor dataset
    """
    file_path = '/Users/fpena/tmp/filtered_reviews_multi_non_sparse_shuffled.json'
    file_name = '/Users/fpena/UCC/Thesis/projects/yelp/notebooks/dataset_analysis_report_fourcity.ipynb'
    reviews = ETLUtils.load_json_file(file_path)

    # The snippet is built from file_path (as generate_report_yelp_phoenix
    # does) so the generated code always loads the same file analyzed here
    load_reviews_code =\
        'file_path = \'' + file_path + '\'\n' +\
        'reviews = ETLUtils.load_json_file(file_path)\n'

    ReviewsDatasetAnalyzerReport.generate_report(
        reviews, 'Fourcity TripAdvisor', file_name, load_reviews_code)
def main():
    """
    Loads the contextual processed records and runs the ContextTransformer
    pipeline on them: load the data, transform the records and export them
    """
    contextual_records = ETLUtils.load_json_file(
        Constants.RECSYS_CONTEXTUAL_PROCESSED_RECORDS_FILE)

    transformer = ContextTransformer(contextual_records)
    transformer.load_data()
    transformer.transform_records()
    transformer.export_records()
def load_data(self):
    """
    Reads the records used for the topics from their JSON file and loads
    the trained data of an NmfTopicExtractor, storing both in this object
    """
    topic_extractor = NmfTopicExtractor()
    self.records = ETLUtils.load_json_file(
        Constants.RECSYS_TOPICS_PROCESSED_RECORDS_FILE)
    self.topic_extractor = topic_extractor
    self.topic_extractor.load_trained_data()
def main():
    """
    Script entry point: loads the classified records and runs
    cohens_kappa()
    """
    # NOTE(review): the loaded records are not used by the active code, only
    # by analysis calls that are currently disabled (for instance
    # count_specific_generic_ratio, compare_records and
    # update_labeled_reviews_records)
    loaded_records = \
        ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE)

    cohens_kappa()
def main():
    """
    Loads the records used for the topics, separates the reviews with a
    ContextExtractor and obtains the context-rich topics
    """
    # Constants.PROCESSED_RECORDS_FILE can be used here as an alternative
    # source of records
    topic_records = ETLUtils.load_json_file(
        Constants.RECSYS_TOPICS_PROCESSED_RECORDS_FILE)
    print('num_reviews', len(topic_records))

    extractor = ContextExtractor(topic_records)
    extractor.separate_reviews()
    extractor.get_context_rich_topics()
def load_data(json_file):
    """
    Loads the records from a JSON file keeping only the user, the business
    and the stars of each one, with the last two fields renamed

    :param json_file: the path of the JSON file that contains the records
    :return: a list of records with the fields 'user_id', 'offering_id' and
    'overall_rating'
    """
    wanted_fields = ['user_id', 'business_id', 'stars']
    selected_records = ETLUtils.select_fields(
        wanted_fields, ETLUtils.load_json_file(json_file))

    # 'stars' is renamed to 'overall_rating' to take advantage of the
    # function extractor.get_user_average_overall_rating
    for entry in selected_records:
        stars = entry.pop('stars')
        entry['overall_rating'] = stars
        business_id = entry.pop('business_id')
        entry['offering_id'] = business_id

    return selected_records
def analyze_topics():
    """
    Analyzes the topics of the trained topic model, combining the
    description of each topic with the ratio and the weighted frequency
    calculated by a ContextExtractor

    :return: a dictionary with the keys 'num_topics', 'probability_score'
    (the mean of that column across all the topics) and 'cycle_time' (the
    seconds that the analysis took)
    :raise ValueError: if Constants.TOPIC_MODEL_TYPE is neither 'ensemble'
    nor 'lda'
    """
    start_time = time.time()

    utilities.plant_seeds()
    records = \
        ETLUtils.load_json_file(Constants.RECSYS_TOPICS_PROCESSED_RECORDS_FILE)
    print('num_reviews', len(records))

    num_topics = Constants.TOPIC_MODEL_NUM_TOPICS
    num_terms = Constants.TOPIC_MODEL_STABILITY_NUM_TERMS

    if Constants.TOPIC_MODEL_TYPE == 'ensemble':
        topic_model = NmfTopicExtractor()
        topic_model.load_trained_data()
        topic_model_string = topic_model.print_topic_model('max')
    elif Constants.TOPIC_MODEL_TYPE == 'lda':
        topic_model = topic_model_creator.load_topic_model(None, None)
        topic_model_string = [
            topic_model.print_topic(topic_id, num_terms)
            for topic_id in range(num_topics)
        ]
    else:
        # Fail here with a clear message instead of failing later when the
        # missing topic descriptions are indexed
        raise ValueError(
            'Unsupported topic model type: %s' % Constants.TOPIC_MODEL_TYPE)

    context_extractor = ContextExtractor(records)
    context_extractor.separate_reviews()
    context_extractor.get_context_rich_topics()

    topic_data = []
    for topic in range(num_topics):
        result = {}
        result['topic_id'] = topic
        # Presumably split_topic provides the 'probability_score' column
        # that is averaged below -- verify against split_topic
        result.update(split_topic(topic_model_string[topic]))
        result['ratio'] = context_extractor.topic_ratio_map[topic]
        result['weighted_frequency'] = \
            context_extractor.topic_weighted_frequency_map[topic]
        topic_data.append(result)

    data_frame = DataFrame.from_dict(topic_data)

    scores = {}
    scores['num_topics'] = Constants.TOPIC_MODEL_NUM_TOPICS
    probability_score = data_frame['probability_score'].mean()
    scores['probability_score'] = probability_score
    print('probability score: %f' % scores['probability_score'])

    end_time = time.time()
    cycle_time = end_time - start_time
    scores['cycle_time'] = cycle_time
    print("Cycle time = %f seconds" % cycle_time)

    return scores
def main_export():
    """
    Initializes a TopNEvaluator with the stored user-item map and exports
    the records to predict.

    NOTE(review): the user-item map is read with pickle.load, so the file
    must come from a trusted source.
    """
    evaluator_i = my_i

    all_records = ETLUtils.load_json_file(RECORDS_FILE)
    print('num_records', len(all_records))
    held_out_records = ETLUtils.load_json_file(TEST_RECORDS_FILE)

    with open(USER_ITEM_MAP_FILE, 'rb') as map_file:
        user_item_map = pickle.load(map_file)

    evaluator = TopNEvaluator(
        all_records, held_out_records, DATASET, 10, evaluator_i)
    evaluator.initialize(user_item_map)
    evaluator.export_records_to_predict(RECORDS_TO_PREDICT_FILE)
def main_converter():
    """
    Exports the train records and the records to predict to CSV files with
    the columns 'stars', 'user_id' and 'business_id', and then converts
    both CSV files with csv_to_libfm
    """
    columns = ['stars', 'user_id', 'business_id']
    train_csv_path = GENERATED_FOLDER + 'yelp_training_set_review_' + DATASET + 's_shuffled_train.csv'
    predict_csv_path = GENERATED_FOLDER + 'records_to_predict_' + DATASET + '.csv'

    # Both JSON files are loaded before any CSV file is written
    train_rows = ETLUtils.load_json_file(TRAIN_RECORDS_FILE)
    predict_rows = ETLUtils.load_json_file(RECORDS_TO_PREDICT_FILE)

    train_rows = ETLUtils.select_fields(columns, train_rows)
    predict_rows = ETLUtils.select_fields(columns, predict_rows)

    ETLUtils.save_csv_file(train_csv_path, train_rows, columns)
    ETLUtils.save_csv_file(predict_csv_path, predict_rows, columns)

    csv_to_libfm(
        [train_csv_path, predict_csv_path], 0, [1, 2], [], ',',
        has_header=True)
def main():
    """
    Runs the NmfContextExtractor pipeline on the processed records: builds
    the bags of words and the document-term matrix, builds and prints the
    stable topic model, updates the reviews with their topics and obtains
    the context-rich topics
    """
    processed_records = \
        ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
    print('num_reviews', len(processed_records))

    extractor = NmfContextExtractor(processed_records)
    extractor.generate_review_bows()
    extractor.build_document_term_matrix()
    # The stable topic model is used here; build_topic_model() is the
    # alternative
    extractor.build_stable_topic_model()
    extractor.print_topic_model()
    extractor.update_reviews_with_topics()
    extractor.get_context_rich_topics()
def plot_overall_rating():
    """
    Prints the filtered reviews as a data frame and shows a bar plot of
    their 'overall_rating' field
    """
    ratings_frame = DataFrame(
        ETLUtils.load_json_file('/Users/fpena/tmp/filtered_reviews.json'))
    print(ratings_frame)

    # Only the overall rating is plotted; the reviews also have other
    # rating fields (cleanliness, location, rooms, service and value)
    DataPlotter.plot_data(
        ratings_frame, 'overall_rating', plot_type='bar',
        title='Overall Rating')
    plt.show()
def full_cycle(self):
    """
    Runs the complete preprocessing cycle. The processed records are loaded
    from their file when the cache is enabled and the file exists, otherwise
    they are preprocessed. After that, the records are separated when
    Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS is set.
    """
    Constants.print_properties()
    print('%s: full cycle' % time.strftime("%Y/%m/%d-%H:%M:%S"))
    utilities.plant_seeds()

    # The file is only looked up when the cache is enabled
    cache_available = self.use_cache and \
        os.path.exists(Constants.PROCESSED_RECORDS_FILE)

    if not cache_available:
        self.preprocess()
    else:
        print('Records have already been processed')
        self.records = \
            ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

    if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
        self.separate_recsys_topic_model_records()
def lemmatize_records(self):
    """
    Lemmatizes self.records and saves them to the lemmatized records file.
    When that file already exists the records are loaded from it instead.

    The records are lemmatized as reviews when Constants.DOCUMENT_LEVEL is
    'review', and as sentences when it is 'sentence' or an integer. For any
    other value the records are saved without being lemmatized.
    """
    # Imported here to keep this method self-contained
    import numbers

    if os.path.exists(Constants.LEMMATIZED_RECORDS_FILE):
        print('Records were already lemmatized')
        self.records = \
            ETLUtils.load_json_file(Constants.LEMMATIZED_RECORDS_FILE)
        return

    # numbers.Integral covers int (and long on Python 2), so the check
    # works on both Python 2 and Python 3
    if Constants.DOCUMENT_LEVEL == 'review':
        self.records = self.lemmatize_reviews(self.records)
    elif Constants.DOCUMENT_LEVEL == 'sentence' or\
            isinstance(Constants.DOCUMENT_LEVEL, numbers.Integral):
        self.records = self.lemmatize_sentences(self.records)

    ETLUtils.save_json_file(Constants.LEMMATIZED_RECORDS_FILE, self.records)
def load_context_reviews(self, cycle_index, fold_index):
    """
    Loads from the cache folder the train records, the important records
    and the context topics of the given cycle and fold, and builds the map
    from review ID to context topics. The important records are released
    once the map has been built.

    :param cycle_index: the index of the running cycle
    :param fold_index: the index of the cross validation fold
    """
    train_path = Constants.generate_file_name(
        'context_train_records', 'json', Constants.CACHE_FOLDER,
        cycle_index, fold_index, True)
    important_path = Constants.generate_file_name(
        'context_important_records', 'json', Constants.CACHE_FOLDER,
        cycle_index, fold_index, True)

    self.train_records = ETLUtils.load_json_file(train_path)
    self.important_records = ETLUtils.load_json_file(important_path)
    self.load_cache_context_topics(cycle_index, fold_index)

    # Map every important review to its context topics
    self.context_topics_map = {}
    for important_record in self.important_records:
        review_id = important_record[Constants.REVIEW_ID_FIELD]
        self.context_topics_map[review_id] = \
            important_record[Constants.CONTEXT_TOPICS_FIELD]

    # The important records are not needed anymore, so we free their memory
    self.important_records = None
    gc.collect()
def main():
    """
    Creates a topic model with the processed records (only the first half
    of them when Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS is set) and
    generates a PDF with its topics using TopicLatexGenerator
    """
    utilities.plant_seeds()

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

    if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
        num_records = len(records)
        # Floor division keeps the slice index an integer on both Python 2
        # and Python 3
        records = records[:num_records // 2]
    print('num_reviews', len(records))

    context_extractor = \
        topic_model_creator.create_topic_model(records, None, None)

    topic_latex_generator = TopicLatexGenerator(context_extractor)
    topic_latex_generator.generate_pdf()
def export_records_to_text():
    """
    Writes the bag of words of every processed record to its own text file.
    The files are named after the item type and the review ID, and each one
    contains the words separated by spaces
    """
    print('%s: Exporting bag-of-words to text files' %
          time.strftime("%Y/%m/%d-%H:%M:%S"))

    processed_records = \
        ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
    print('Total records: %d' % len(processed_records))

    output_folder = \
        '/Users/fpena/tmp/topic-ensemble/data/' + Constants.ITEM_TYPE + '/'

    for entry in processed_records:
        output_path = output_folder + Constants.ITEM_TYPE + '_' + \
            entry[Constants.REVIEW_ID_FIELD] + '.txt'
        words = " ".join(entry[Constants.BOW_FIELD])

        with codecs.open(output_path, "w", encoding="utf-8-sig") as out_file:
            out_file.write(words)
def analyze_results():
    """
    Loads the CARSKit results, prints the names of all their columns, and
    then prints and exports to a CSV file a selection of the columns sorted
    by 'ck_rec10'
    """
    results_path = Constants.generate_file_name(
        'carskit_results', 'json', OUTPUT_FOLDER, None, None, False)
    results_frame = pandas.DataFrame(ETLUtils.load_json_file(results_path))

    print(sorted(list(results_frame.columns.values)))

    selected_columns = [
        'ck_rec10', 'ck_pre10', 'ck_algorithm', 'carskit_nominal_format',
        'topic_model_num_topics', 'topic_model_normalize']
    sorted_frame = results_frame[selected_columns].sort_values(['ck_rec10'])
    print(sorted_frame)

    csv_path = '/Users/fpena/tmp/' + Constants.ITEM_TYPE + '_carskit.csv'
    sorted_frame.to_csv(csv_path)
def main2():
    """
    Experiment that factorizes a small user-item rating matrix, built from
    the first 10 processed records, with NMF and with a truncated SVD, and
    prints the mean squared error of both reconstructions
    """
    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)[:10]

    cols = [
        Constants.USER_ID_FIELD,
        Constants.ITEM_ID_FIELD,
        Constants.RATING_FIELD
    ]
    data_frame = pandas.DataFrame(records, columns=cols)

    # Users and items are turned into categories so that their integer codes
    # can be used as the row and column indices of the matrix
    data_frame[Constants.USER_ID_FIELD] = \
        data_frame[Constants.USER_ID_FIELD].astype('category')
    data_frame[Constants.ITEM_ID_FIELD] = \
        data_frame[Constants.ITEM_ID_FIELD].astype('category')
    print(data_frame[Constants.USER_ID_FIELD].cat.codes)
    print(data_frame[Constants.ITEM_ID_FIELD].cat.codes)

    plays = coo_matrix((
        data_frame[Constants.RATING_FIELD].astype(float),
        (data_frame[Constants.USER_ID_FIELD].cat.codes,
         data_frame[Constants.ITEM_ID_FIELD].cat.codes)))
    print(plays)

    model = NMF(n_components=2, init='random', random_state=0)
    W = model.fit_transform(plays)
    H = model.components_
    nR = numpy.dot(W, H)
    # print() is called with a single argument so the output is the same on
    # Python 2 and Python 3
    print('User-based CF MSE: ' + str(
        mean_squared_error(nR, plays.toarray())))

    # get SVD components from train matrix. Choose k.
    # NOTE(review): with only 10 records the matrix may be too small for
    # k=5 -- confirm against the svds requirements
    u, s, vt = svds(plays, k=5)
    s_diag_matrix = numpy.diag(s)
    X_pred = numpy.dot(numpy.dot(u, s_diag_matrix), vt)
    print('User-based CF MSE: ' + str(
        mean_squared_error(X_pred, plays.toarray())))
def create_category_sets(file_path):
    """
    Builds, for every business stored in the given file, the set of
    categories that the business has

    :rtype : numpy array
    :param file_path: the path for the file that contains the businesses
    data
    :return: a numpy array with one set of categories per business, for
    example the sets {'Restaurant', 'Mexican', 'Bar'} and {'Bar', 'Disco'}
    """
    category_sets = []
    for business in ETLUtils.load_json_file(file_path):
        category_sets.append(set(business['categories']))

    return numpy.array(category_sets)
def analyze_results():
    """
    Loads the CARSKit results, prints the names of all their columns, and
    then prints and exports to a CSV file a selection of the columns sorted
    by 'ck_rec10'
    """
    results_path = Constants.generate_file_name(
        'carskit_results', 'json', OUTPUT_FOLDER, None, None, False)
    results_frame = pandas.DataFrame(ETLUtils.load_json_file(results_path))

    print(sorted(list(results_frame.columns.values)))

    selected_columns = [
        'ck_rec10',
        'ck_pre10',
        'ck_algorithm',
        'carskit_nominal_format',
        'topic_model_num_topics',
        'topic_model_normalize',
    ]
    sorted_frame = results_frame[selected_columns].sort_values(['ck_rec10'])
    print(sorted_frame)

    csv_path = '/Users/fpena/tmp/' + Constants.ITEM_TYPE + '_carskit.csv'
    sorted_frame.to_csv(csv_path)
def load_data(file_name):
    """
    Loads the records from a JSON file and prints the values of their
    'review_type' field encoded as ASCII (the characters that can't be
    encoded are ignored)

    :param file_name: the path of the JSON file that contains the records
    :return: the records just as they were loaded from the file
    """
    records = ETLUtils.load_json_file(file_name)
    records_frame = pandas.DataFrame.from_records(records)

    # The 'specific' column can be inspected in the same way
    ascii_values = []
    for raw_value in list(records_frame['review_type']):
        ascii_values.append(raw_value.encode('ascii', 'ignore'))
    print(ascii_values)

    return records
def see_topic_analysis_results():
    """
    Prints a summary line for every result stored in the topic model
    analysis file of the current item type: its position, the ratio between
    its high and low mean scores, the ratio between its high and low
    weighted counts and the settings that produced it
    """
    analysis_path = Constants.DATASET_FOLDER + 'topic_model_analysis_' + \
        Constants.ITEM_TYPE + '.json'

    for position, entry in enumerate(ETLUtils.load_json_file(analysis_path)):
        high_score = entry['high_ratio_mean_score']
        low_score = entry['low_ratio_mean_score']
        score_ratio = high_score / low_score
        count_ratio = entry['weighted_high_ratio_count'] / \
            entry['weighted_low_ratio_count']

        print(position, score_ratio, count_ratio,
              entry['high_ratio_mean_score'],
              entry['low_ratio_mean_score'], entry['lda_epsilon'],
              entry['topic_weighting_method'], entry['num_context_topics'],
              entry['lda_num_topics'])
def plot_overall_rating():
    """
    Prints the filtered reviews as a data frame and shows a bar plot of
    their 'overall_rating' field
    """
    filtered_reviews = \
        ETLUtils.load_json_file('/Users/fpena/tmp/filtered_reviews.json')
    reviews_frame = DataFrame(filtered_reviews)
    print(reviews_frame)

    # Only the overall rating is plotted; the reviews also have other
    # rating fields (cleanliness, location, rooms, service and value)
    plot_options = {'plot_type': 'bar', 'title': 'Overall Rating'}
    DataPlotter.plot_data(reviews_frame, 'overall_rating', **plot_options)
    plt.show()
def load_cache_context_topics(self, cycle_index, fold_index):
    """
    Loads the cached context topics of the given cycle and fold, sorted by
    their value in descending order, and builds the map from review ID to
    context topics. self.important_records has to be loaded before calling
    this method.

    :param cycle_index: the index of the running cycle
    :param fold_index: the index of the cross validation fold
    """
    print('load cache context topics: %s' %
          time.strftime("%Y/%m/%d-%H:%M:%S"))

    topics_path = Constants.generate_file_name(
        'context_topics', 'json', Constants.CACHE_FOLDER,
        cycle_index, fold_index, True)

    # The topics are stored in the first record of the file
    cached_topics = ETLUtils.load_json_file(topics_path)[0]
    topic_pairs = list(cached_topics.items())
    topic_pairs.sort(key=operator.itemgetter(1), reverse=True)
    self.context_rich_topics = topic_pairs

    self.context_topics_map = {}
    for important_record in self.important_records:
        review_id = important_record[Constants.REVIEW_ID_FIELD]
        self.context_topics_map[review_id] = \
            important_record[Constants.CONTEXT_TOPICS_FIELD]
def topic_stability_main():
    """
    Calculates the topic stability of the processed records for several
    numbers of topics and prints, for each number of topics, the mean, the
    minimum and the maximum of its scores (ignoring the NaN values)
    """
    processed_records = \
        ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

    # range(2, 101) can be used here for a complete run
    topic_counts = [2, 5]

    scores_by_count = {}
    for topic_count in topic_counts:
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: topic_count})
        scores_by_count[topic_count] = \
            calculate_topic_stability(processed_records)

    print('Results:')
    for topic_count in topic_counts:
        stability_scores = scores_by_count[topic_count]
        summary = (
            topic_count, numpy.nanmean(stability_scores),
            numpy.nanmin(stability_scores), numpy.nanmax(stability_scores))
        print('%d: %.4f [%.4f,%.4f]' % summary)
def pre_process_reviews():
    """
    Loads the reviews of the Yelp Phoenix academic dataset and preprocesses
    them: only the fields 'user_id', 'business_id' and 'stars' are selected,
    extract_fields is applied to the reviews, the fields 'business_id' and
    'stars' are dropped and finally the reviews are cleaned

    :return: a list of preprocessed reviews
    """
    source_path = '/Users/fpena/UCC/Thesis/datasets/yelp_phoenix_academic_dataset/yelp_academic_dataset_review.json'

    selected_reviews = ETLUtils.select_fields(
        ['user_id', 'business_id', 'stars'],
        ETLUtils.load_json_file(source_path))

    # extract_fields runs before the original fields are dropped
    extract_fields(selected_reviews)
    ETLUtils.drop_fields(['business_id', 'stars'], selected_reviews)

    return clean_reviews(selected_reviews)
def main():
    """
    Prints, for every item of the filtered reviews, its 'offering_id'
    together with the mean of its 'overall_rating'
    """
    # The filtered reviews file is expected to exist already; it holds the
    # output of pre_process_reviews()
    filtered_reviews = \
        ETLUtils.load_json_file('/Users/fpena/tmp/filtered_reviews.json')

    means_by_item = DataFrame(filtered_reviews).groupby('offering_id').mean()
    item_ids = means_by_item.index.get_level_values(0).tolist()

    for item_id, mean_rating in zip(item_ids, means_by_item['overall_rating']):
        print(item_id, mean_rating)
def tag_reviews_language(self):
    """
    Adds to every record in self.records the language detected in its text
    ('unknown' when the detection fails) and saves the records to the
    language records file. When that file already exists the records are
    loaded from it instead.
    """
    print('%s: tag reviews language' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    already_tagged = os.path.exists(Constants.LANGUAGE_RECORDS_FILE)
    if already_tagged:
        print('Records have already been tagged with language field')
        self.records = \
            ETLUtils.load_json_file(Constants.LANGUAGE_RECORDS_FILE)
        return

    # The seed is fixed before running the language detection
    DetectorFactory.seed = 0

    for entry in self.records:
        try:
            detected_language = langdetect.detect(entry[Constants.TEXT_FIELD])
        except LangDetectException:
            detected_language = 'unknown'
        entry[Constants.LANGUAGE_FIELD] = detected_language

    ETLUtils.save_json_file(Constants.LANGUAGE_RECORDS_FILE, self.records)
def create_single_topic_model(cycle_index, fold_index):
    """
    Creates the topic model for the train records of the given cycle and
    fold of the cross validation

    :param cycle_index: the index of the running cycle
    :param fold_index: the index of the cross validation fold
    """
    print(Constants._properties)
    print('%s: Start' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    all_records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

    plant_seeds()
    fold_count = Constants.CROSS_VALIDATION_NUM_FOLDS
    train_fraction = 1 - (1 / float(fold_count))

    # The records are shuffled once per cycle, up to and including the
    # requested cycle
    for _ in range(cycle_index + 1):
        if Constants.SHUFFLE_DATA:
            random.shuffle(all_records)

    fold_start = float(fold_index) / fold_count
    train_records, _ = ETLUtils.split_train_test(
        all_records, split=train_fraction, start=fold_start)

    create_topic_model(train_records, cycle_index, fold_index)
def analyze_fourcity():
    """
    Prints a sanity check of the full processed records: the description of
    the user, item and rating columns, the records whose rating is below 1.0
    or above 5.0, and the records that share the same user-item key
    """
    records = ETLUtils.load_json_file(Constants.FULL_PROCESSED_RECORDS_FILE)

    cols = [
        Constants.USER_ID_FIELD,
        Constants.ITEM_ID_FIELD,
        Constants.RATING_FIELD
    ]
    data_frame = pandas.DataFrame(records, columns=cols)
    print(data_frame.describe())

    # Despite its name, this counter includes every record whose rating is
    # out of range, both below 1.0 and above 5.0. The low ratings are
    # printed first, then the high ones
    zero_records = 0
    for record in records:
        if record[Constants.RATING_FIELD] < 1.0:
            print(record)
            zero_records += 1

    for record in records:
        if record[Constants.RATING_FIELD] > 5.0:
            print(record)
            zero_records += 1

    print('zero records: %d' % zero_records)

    # Look for duplicates: every record is compared against the last record
    # seen with the same user-item key
    num_duplicates = 0
    print('Looking for duplicates')
    records_map = {}
    for record in records:
        record_key = record[Constants.USER_ITEM_KEY_FIELD]
        if record_key in records_map:
            print('old record', records_map[record_key][Constants.TEXT_FIELD])
            print('new record', record[Constants.TEXT_FIELD])
            num_duplicates += 1
        records_map[record_key] = record

    print('duplicate records: %d' % num_duplicates)
def create_reviews(self):
    """
    Adds to the document a 'Reviews' section with one subsection per
    classified review of the current item type. The title of each
    subsection shows the number of the review and whether it is specific or
    generic, and its body contains the parts built by build_text_manual
    """
    with self.doc.create(Section('Reviews')):
        with self.doc.create(Subsection('A subsection')):
            type_names = {'yes': 'specific', 'no': 'generic'}

            # The manually classified reviews are used here;
            # Constants.FULL_PROCESSED_RECORDS_FILE is an alternative source
            classified_path = Constants.DATASET_FOLDER +\
                'classified_' + Constants.ITEM_TYPE + '_reviews.json'
            classified_records = ETLUtils.load_json_file(classified_path)

            # Reviews are numbered starting from 1
            for review_number, entry in enumerate(classified_records, 1):
                title = 'Review %d (%s)' % (
                    review_number, type_names[entry['specific']])

                with self.doc.create(Subsection(title)):
                    for doc_part in self.build_text_manual(entry):
                        self.doc.append(doc_part)
def pre_process_reviews():
    """
    Loads the reviews of the TripAdvisor Four-City dataset and preprocesses
    them: only the fields 'ratings', 'author' and 'offering_id' are
    selected, extract_fields is applied to the reviews, the fields 'author'
    and 'ratings' are dropped and finally the reviews are cleaned

    :return: a list of preprocessed reviews
    """
    dataset_folder = '/Users/fpena/UCC/Thesis/datasets/TripAdvisor/Four-City/'
    # 'review-short.json' is an alternative input file in the same folder
    source_path = dataset_folder + 'review.txt'

    selected_reviews = ETLUtils.select_fields(
        ['ratings', 'author', 'offering_id'],
        ETLUtils.load_json_file(source_path))

    # extract_fields runs before the original fields are dropped
    extract_fields(selected_reviews)
    ETLUtils.drop_fields(['author', 'ratings'], selected_reviews)

    return clean_reviews(selected_reviews)
def build_manual_topic_model():
    """
    Prints the total number of records found in the first-sentences
    classified file, then the 1-based position and the text of every record
    that is the first sentence of its review (sentence_index equal to 0) and
    has been typed as 'specific', and finally how many of those were found.
    """
    records_file = Constants.DATASET_FOLDER + 'classified_' + \
        Constants.ITEM_TYPE + '_reviews_first_sentences.json'
    sentences = ETLUtils.load_json_file(records_file)
    print('total records: %d' % len(sentences))

    num_matches = 0
    for position, sentence in enumerate(sentences, 1):
        is_first_sentence = sentence['sentence_index'] == 0.0
        if is_first_sentence and sentence['sentence_type'] == 'specific':
            print('%d:\t%s' % (position, sentence['text']))
            num_matches += 1
    print('count: %d' % num_matches)
def create_category_matrix(file_path):
    """
    Creates a matrix with all the categories for businesses that are
    contained in the Yelp Phoenix Business data set. Each column of the
    matrix represents a category, and each row a business. This is a binary
    matrix that contains a 1 at the position i,j if the business i contains
    the category j, and a 0 otherwise.

    :rtype : numpy array matrix
    :param file_path: the path for the file that contains the businesses data
    :return: a numpy array binary matrix
    """
    records = ETLUtils.load_json_file(file_path)

    # Now we obtain the categories for all the businesses
    records = ETLUtils.add_transpose_list_column('categories', records)
    BusinessETL.drop_unwanted_fields(records)

    # dict.values() returns a view object in Python 3 and numpy would wrap the
    # view as a single opaque object instead of a row of values, so every view
    # is turned into a list first (a no-op copy in Python 2).
    # NOTE(review): the column order is the iteration order of each record's
    # dictionary; this assumes all the records share the same key order --
    # confirm against add_transpose_list_column
    matrix = numpy.array(
        [numpy.array(list(record.values())) for record in records])

    return matrix
def load_records():
    """
    Loads the reviews that have been manually tagged at a sentence level,
    these are the reviews that we will use to train our classifier. When
    Constants.DOCUMENT_LEVEL is an integer, only the sentences whose index is
    lower than that value are kept (i.e. a value of 1 keeps just the first
    sentence of each review); for any other kind of value all the sentences
    are returned

    :return: the list of (possibly filtered) classified records
    """
    # Imported locally so this function doesn't depend on the imports at the
    # top of the file
    import numbers

    print('%s: load records' % time.strftime("%Y/%m/%d-%H:%M:%S"))
    records = ETLUtils.load_json_file(Constants.CLASSIFIED_RECORDS_FILE)

    # numbers.Integral matches int (and long in Python 2). The bare name
    # `long` doesn't exist in Python 3 and raised a NameError there
    if isinstance(Constants.DOCUMENT_LEVEL, numbers.Integral):
        records = [
            record for record in records
            if record['sentence_index'] < Constants.DOCUMENT_LEVEL
        ]

    return records
def create_topic_model_with_context_records():
    """
    Loads the classified processed reviews, prints how many of them are
    contextual and how many of the contextual ones were predicted as specific
    (along with the bag of words of the latter), and then creates one topic
    model for every number of topics between 1 and the number of contextual
    records, exporting the data of the topics of each model to an Excel file.

    NOTE(review): the nesting of the loops was reconstructed from a source in
    which the indentation was lost; it assumes that the topic data is collected
    and exported once per trained model -- confirm against the original file
    """
    processed_records_file = Constants.generate_file_name(
        'classified_processed_reviews', 'json', Constants.CACHE_FOLDER, None,
        None, False, True)
    all_records = ETLUtils.load_json_file(processed_records_file)
    print('records length: %d' % len(all_records))

    context_records = ETLUtils.filter_records(
        all_records, 'context_type', ['context'])
    print('context records length: %d' % len(context_records))

    specific_records = ETLUtils.filter_records(
        context_records, 'predicted_class', ['specific'])
    print('context specific records length: %d' % len(specific_records))

    for position, specific_record in enumerate(specific_records):
        print('%d:\t%s' % (position, specific_record['bow']))

    for num_topics in range(1, len(context_records) + 1):
        Constants.update_properties(
            {Constants.TOPIC_MODEL_NUM_TOPICS_FIELD: num_topics})
        # The model is trained with all the records, not just the contextual
        # ones
        context_extractor = \
            topic_model_creator.create_topic_model(all_records, None, None)

        topic_data = []
        for topic_id in range(Constants.TOPIC_MODEL_NUM_TOPICS):
            topic_terms = context_extractor.print_topic_model(
                num_terms=Constants.TOPIC_MODEL_STABILITY_NUM_TERMS)[topic_id]
            row = {'topic_id': topic_id}
            row.update(split_topic(topic_terms))
            row['ratio'] = context_extractor.topic_ratio_map[topic_id]
            row['weighted_frequency'] = \
                context_extractor.topic_weighted_frequency_map[topic_id]
            topic_data.append(row)

        file_name = Constants.generate_file_name(
            'manual_topic_model', 'xlsx', Constants.DATASET_FOLDER, None, None,
            True)
        generate_excel_file(topic_data, file_name)
def create_single_topic_model(cycle_index, fold_index, check_exists=True):
    """
    Creates the topic model that corresponds to one cycle and one fold of the
    cross-validation, using the training portion of the processed records.

    :param cycle_index: the index of the cycle; the records are shuffled
    cycle_index + 1 times (when shuffling is enabled) before being split
    :param fold_index: the index of the fold, it determines where the test
    portion of the split starts
    :param check_exists: passed as is to create_topic_model
    :return: whatever create_topic_model returns for the training records
    :raise ValueError: if the separate_topic_model_recsys_reviews property is
    set, or if the cross-validation strategy is neither 'nested_test' nor
    'nested_validate'
    """
    Constants.print_properties()
    print('%s: Start' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
        raise ValueError(
            "This function shouldn't be used when the "
            "separate_topic_model_recsys_reviews property is set to True")

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

    strategy = Constants.CROSS_VALIDATION_STRATEGY
    if strategy == 'nested_validate':
        # Only the training part of the outer split is used from here on
        outer_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
        outer_split = 1 - (1 / float(outer_folds))
        cv_start = float(Constants.NESTED_CROSS_VALIDATION_CYCLE) / outer_folds
        print('cv_start', cv_start)
        records, _ = ETLUtils.split_train_test(records, outer_split, cv_start)
    elif strategy != 'nested_test':
        raise ValueError('Unknown cross-validation strategy')

    utilities.plant_seeds()
    num_folds = Constants.CROSS_VALIDATION_NUM_FOLDS
    split = 1 - (1 / float(num_folds))

    # Shuffling once per cycle up to cycle_index, right after planting the
    # seeds, presumably reproduces the order the records had in that cycle
    for _ in range(cycle_index + 1):
        if Constants.SHUFFLE_DATA:
            random.shuffle(records)

    fold_start = float(fold_index) / num_folds
    train_records, _ = ETLUtils.split_train_test(
        records, split=split, start=fold_start)

    return create_topic_model(
        train_records, cycle_index, fold_index, check_exists)
def load_records():
    """
    Loads the reviews that have been manually tagged at a sentence level;
    these are the reviews used to train the classifier. When
    Constants.MAX_SENTENCES is not None, only the sentences whose index is
    lower than that value are kept.

    :return: the list of loaded (and possibly filtered) records
    """
    print('%s: load records' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    records_file = Constants.DATASET_FOLDER + \
        'classified_' + Constants.ITEM_TYPE + '_reviews_sentences.json'
    sentence_records = ETLUtils.load_json_file(records_file)

    if Constants.MAX_SENTENCES is None:
        return sentence_records

    return [
        sentence for sentence in sentence_records
        if sentence['sentence_index'] < Constants.MAX_SENTENCES
    ]
def evaluate_topic_model(metric):
    """
    Evaluates the topic model that is configured in the properties using the
    given metric. Before the evaluation both random seeds are increased by 10
    and planted again.

    :param metric: the evaluation metric, one of TERM_STABILITY_REFERENCE,
    TERM_STABILITY_PAIRWISE or TERM_DIFFERENCE
    :return: the value returned by the evaluation function of the metric
    :raise ValueError: if the topic model type is not 'lda', 'nmf' or
    'ensemble', or if the metric is unknown
    """
    print('%s: evaluating topic model' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    Constants.update_properties({
        Constants.NUMPY_RANDOM_SEED_FIELD: Constants.NUMPY_RANDOM_SEED + 10,
        Constants.RANDOM_SEED_FIELD: Constants.RANDOM_SEED + 10
    })
    utilities.plant_seeds()
    Constants.print_properties()

    records = ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

    if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
        num_records = len(records)
        # Floor division keeps the slice index an integer; with `/` the index
        # is a float in Python 3 and slicing raises a TypeError
        records = records[:num_records // 2]
    print('num_reviews', len(records))

    topic_model_type = Constants.TOPIC_MODEL_TYPE
    if topic_model_type in ['lda', 'nmf']:
        all_term_rankings = create_all_term_rankings(records, metric)
    elif topic_model_type == 'ensemble':
        all_term_rankings = create_all_term_rankings_from_ensemble()
    else:
        raise ValueError(
            'Unrecognized topic modeling algorithm: \'%s\'' % topic_model_type)
    print('Total iterations: %d' % len(all_term_rankings))

    if metric == TERM_STABILITY_REFERENCE:
        return eval_term_stability_reference(all_term_rankings)
    elif metric == TERM_STABILITY_PAIRWISE:
        return eval_term_stability_pairwise(all_term_rankings)
    elif metric == TERM_DIFFERENCE:
        return eval_term_difference(all_term_rankings)
    else:
        raise ValueError('Unknown evaluation metric: \'%s\'' % metric)
def find_reviews_topics(self, context_extractor, cycle_index, fold_index):
    """
    Finds the contextual topics of the train records and of the important
    records. The train records with their topics are cached in a JSON file
    (one per cycle and fold): if the file exists the train records are loaded
    from it, otherwise the topics are found and the file is written. The
    topics of the important records are stored in self.context_topics_map,
    indexed by review ID, and the important records are released afterwards.

    :param context_extractor: the object used to find the contextual topics
    :param cycle_index: the cycle index used to name the cache file
    :param fold_index: the fold index used to name the cache file
    """
    print('find topics: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

    cache_file = Constants.generate_file_name(
        'context_train_records', 'json', Constants.CACHE_FOLDER,
        cycle_index, fold_index, Constants.USE_CONTEXT)

    if not os.path.exists(cache_file):
        context_extractor.find_contextual_topics(self.train_records)
        ETLUtils.save_json_file(cache_file, self.train_records)
    else:
        self.train_records = ETLUtils.load_json_file(cache_file)

    # The important records are never cached, their topics are always found
    context_extractor.find_contextual_topics(
        self.important_records, Constants.TEXT_SAMPLING_PROPORTION)

    self.context_topics_map = {
        important_record[Constants.REVIEW_ID_FIELD]:
            important_record[Constants.CONTEXT_TOPICS_FIELD]
        for important_record in self.important_records
    }

    # Only the topics map is needed from now on, so the records are dropped
    # and a garbage collection is requested
    self.important_records = None
    gc.collect()
def load_records_to_predict(self, records_file):
    """
    Loads the records to predict from the given JSON file and the items to
    predict from the pickle file that has the same path plus the '.pkl'
    suffix.

    :param records_file: the path of the JSON file with the records to predict
    """
    self.records_to_predict = ETLUtils.load_json_file(records_file)

    items_file = records_file + '.pkl'
    with open(items_file, 'rb') as pickle_file:
        self.items_to_predict = pickle.load(pickle_file)
def transform_manually_labeled_reviews():
    """
    Takes the first sentence of every manually classified review, copies its
    sentence type into the 'predicted_class' field, completes it with the
    user, item, review and rating data of the full review it belongs to,
    prints every sentence next to its full review and saves the result in the
    'classified_<item type>_reviews_first_sentences.json' file of the dataset
    folder.

    The full review of a sentence is the next one, in file order, whose text
    starts with the text of the sentence, so both files are expected to have
    the reviews in the same order. An IndexError is raised when the full
    reviews run out before a match is found.

    NOTE(review): the position of the 'index' print (after the matching loop)
    was reconstructed from a source in which the indentation was lost --
    confirm against the original file
    """
    full_records = ETLUtils.load_json_file(
        Constants.DATASET_FOLDER +
        'yelp_training_set_review_restaurants_shuffled_tagged.json')
    sentence_records = ETLUtils.load_json_file(
        Constants.CLASSIFIED_RECORDS_FILE)
    print('total records: %d' % len(sentence_records))

    # Only the first sentence of each review is kept
    first_sentences = []
    for sentence in sentence_records:
        if sentence['sentence_index'] > 0:
            continue
        sentence['predicted_class'] = sentence['sentence_type']
        first_sentences.append(sentence)

    # Single forward pass over the full reviews: the cursor never goes back,
    # so a full review is matched with one sentence at most
    cursor = 0
    for sentence in first_sentences:
        while not full_records[cursor]['text'].startswith(sentence['text']):
            cursor += 1
        matched_review = full_records[cursor]
        sentence[Constants.USER_ID_FIELD] = matched_review['user_id']
        sentence[Constants.ITEM_ID_FIELD] = matched_review['business_id']
        sentence[Constants.REVIEW_ID_FIELD] = matched_review['review_id']
        sentence[Constants.RATING_FIELD] = matched_review['stars']
        cursor += 1
    print('index: %d' % cursor)

    # Visual check of the matches.
    # NOTE(review): the sentence is read with the literal 'review_id' key while
    # it was written with Constants.REVIEW_ID_FIELD -- confirm they are equal
    for sentence in first_sentences:
        for candidate in full_records:
            if sentence['review_id'] != candidate['review_id']:
                continue
            print('%s ====' % sentence['text'])
            print(candidate['text'])
            print('******************\n******************\n******************\n******************')
            break

    new_classified_records_file = Constants.DATASET_FOLDER + 'classified_' + \
        Constants.ITEM_TYPE + '_reviews_first_sentences.json'
    print(first_sentences[0])
    ETLUtils.save_json_file(new_classified_records_file, first_sentences)
# NOTE(review): this line starts with the tail of a filtering method whose
# header is not visible here (it appends to filtered_reviews the reviews whose
# 'business_id' is in business_ids and returns that list).
# It is followed by the static method sort_records(records, field, reverse),
# which returns a new list with the records sorted by the value of `field`
# (in descending order when reverse is True), and by a top-level script that:
#   1. obtains the IDs of the businesses in the 'Hotels' category,
#   2. loads all the reviews of the Yelp training set,
#   3. keeps the reviews of those businesses using
#      ReviewETL.filter_reviews_by_business_slow,
#   4. saves them in yelp_training_set_review_hotels.json, and
#   5. prints the total running time in seconds.
# The my_restaurant_reviews/my_restaurants_file variables hold hotel data
# despite their names, and the review_etl instance is never used.
if review['business_id'] in business_ids: filtered_reviews.append(review) return filtered_reviews @staticmethod def sort_records(records, field, reverse=False): return sorted(records, key=itemgetter(field), reverse=reverse) start = time.time() review_etl = ReviewETL() my_business_file = "/Users/fpena/tmp/yelp_training_set/yelp_training_set_business.json" my_reviews_file = "/Users/fpena/tmp/yelp_training_set/yelp_training_set_review.json" my_business_ids = BusinessETL.get_business_ids(my_business_file, 'Hotels') my_reviews = ETLUtils.load_json_file(my_reviews_file) # print(len(ReviewETL.filter_reviews_by_business(my_reviews, my_business_ids, 'text'))) my_restaurant_reviews = ReviewETL.filter_reviews_by_business_slow(my_reviews, my_business_ids) my_restaurants_file = "/Users/fpena/tmp/yelp_training_set/yelp_training_set_review_hotels.json" ETLUtils.save_json_file(my_restaurants_file, my_restaurant_reviews) # my_sorted_reviews = ReviewETL.sort_records(my_reviews, 'business_id') # print(len(my_sorted_reviews)) # main() end = time.time() total_time = end - start print("Total time = %f seconds" % total_time)
def main():
    """
    Compares several classifiers on the task of telling specific reviews from
    generic ones, using the metrics extracted from the manually classified
    reviews as features.

    Prints the number and the percentage of specific and generic reviews, and
    then, for every classifier (in the order in which they are declared), the
    mean and the standard deviation of its accuracy over a 5-fold
    cross-validation. Finally the metrics are plotted against the labels.
    """
    item_type = 'hotel'
    # item_type = 'restaurant'
    my_folder = '/Users/fpena/UCC/Thesis/datasets/context/'
    my_file = my_folder + 'classified_' + item_type + '_reviews.json'
    binary_reviews_file = my_folder + 'classified_' + item_type + '_reviews.pkl'
    my_records = ETLUtils.load_json_file(my_file)

    # NOTE: pickle must only be used with trusted files, loading a pickle can
    # execute arbitrary code
    with open(binary_reviews_file, 'rb') as read_file:
        my_reviews = pickle.load(read_file)

    # One row of metrics per review
    num_features = 2
    my_metrics = numpy.zeros((len(my_reviews), num_features))
    for index, review in enumerate(my_reviews):
        my_metrics[index] =\
            review_metrics_extractor.get_review_metrics(review)

    # The return value is not used, so the matrix is presumably normalized in
    # place -- TODO confirm
    review_metrics_extractor.normalize_matrix_by_columns(my_metrics)

    count_specific = sum(
        1 for record in my_records if record['specific'] == 'yes')
    count_generic = sum(
        1 for record in my_records if record['specific'] == 'no')
    num_records = len(my_records)

    print('count_specific: %d' % count_specific)
    print('count_generic: %d' % count_generic)
    # The ratios are scaled by 100 so the numbers really are the percentages
    # that the labels and the '%' sign announce (a 0-1 fraction was printed)
    print('specific percentage: %f%%' % (100.0 * count_specific / num_records))
    print('generic percentage: %f%%' % (100.0 * count_generic / num_records))

    my_labels = numpy.array(
        [record['specific'] == 'yes' for record in my_records])

    classifiers = [
        DummyClassifier(strategy='most_frequent', random_state=0),
        DummyClassifier(strategy='stratified', random_state=0),
        DummyClassifier(strategy='uniform', random_state=0),
        # DummyClassifier(strategy='constant', random_state=0, constant=True),
        LogisticRegression(C=100),
        SVC(C=1.0, kernel='rbf'),
        SVC(C=1.0, kernel='linear'),
        KNeighborsClassifier(n_neighbors=10),
        tree.DecisionTreeClassifier(),
        NuSVC(),
        LinearSVC()
    ]
    scores = [[] for _ in range(len(classifiers))]

    Xtrans = my_metrics

    cv = KFold(n=len(my_metrics), n_folds=5)

    # Every classifier is trained and scored with the same folds
    for i, clf in enumerate(classifiers):
        for train, test in cv:
            x_train, y_train = Xtrans[train], my_labels[train]
            x_test, y_test = Xtrans[test], my_labels[test]

            clf.fit(x_train, y_train)
            scores[i].append(clf.score(x_test, y_test))

    for score in scores:
        print("Mean(scores)=%.5f\tStddev(scores)=%.5f" %
              (numpy.mean(score), numpy.std(score)))

    plot(my_metrics, my_labels)