def prepare(self):
    """
    Exports the train and test records to CSV and then converts them to the
    LibFM input format, once with the context columns and once without them.
    """
    print('prepare: %s' % time.strftime("%Y/%d/%m-%H:%M:%S"))

    contextual_train_set = \
        ETLUtils.select_fields(self.headers, self.train_records)
    contextual_test_set = \
        ETLUtils.select_fields(self.headers, self.records_to_predict)

    ETLUtils.save_csv_file(
        self.csv_train_file, contextual_train_set, self.headers)
    ETLUtils.save_csv_file(
        self.csv_test_file, contextual_test_set, self.headers)

    print('Exported CSV and JSON files: %s' %
          time.strftime("%Y/%d/%m-%H:%M:%S"))

    csv_files = [
        self.csv_train_file,
        self.csv_test_file
    ]

    num_cols = len(self.headers)
    context_cols = num_cols
    print('num_cols', num_cols)
    # print('context_cols', context_cols)

    libfm_converter.csv_to_libfm(
        csv_files, 0, [1, 2], range(3, context_cols), ',', has_header=True,
        suffix='.no_context.libfm')
    libfm_converter.csv_to_libfm(
        csv_files, 0, [1, 2], [], ',', has_header=True,
        suffix='.context.libfm')

    print('Exported LibFM files: %s' % time.strftime("%Y/%d/%m-%H:%M:%S"))
def test_select_fields(self):
    select_fields = ['user_id', 'offering_id', 'overall_rating']
    result = ETLUtils.select_fields(select_fields, reviews_matrix_5)
    self.assertEqual(result, reviews_matrix_5_short)

    select_fields = ['user_id']
    result = ETLUtils.select_fields(select_fields, reviews_matrix_5_short)
    self.assertEqual(result, reviews_matrix_5_users)
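# The test above pins down the expected behaviour of ETLUtils.select_fields:
# given a list of field names and a list of dict-like records, it returns
# records that contain only the requested fields. A minimal sketch of an
# equivalent helper follows; it is an illustrative reimplementation written
# for this document, not the actual ETLUtils code.
def select_fields_sketch(select_fields, records):
    """Return copies of the records that keep only the requested keys."""
    return [{field: record[field] for field in select_fields}
            for record in records]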
def full_cycle(self, train_records, test_records, train_reviews, test_reviews):
    """
    Trains the LDA-based context model on the train records and reviews,
    attaches the contextual topics to both record sets and keeps only the
    fields listed in the headers.
    """
    self.lda_based_context = LdaBasedContext(train_records, train_reviews)
    self.lda_based_context.get_context_rich_topics()
    print("Trained LDA Model: %s" % time.strftime("%Y/%d/%m-%H:%M:%S"))

    contextual_train_set = \
        self.lda_based_context.find_contextual_topics(train_records)
    contextual_test_set = \
        self.lda_based_context.find_contextual_topics(test_records)
    print("contextual test set size: %d" % len(contextual_test_set))

    self.build_headers()

    contextual_train_set = \
        ETLUtils.select_fields(self.headers, contextual_train_set)
    contextual_test_set = \
        ETLUtils.select_fields(self.headers, contextual_test_set)

    print("Exported contextual topics: %s" % time.strftime("%Y/%d/%m-%H:%M:%S"))

    return contextual_train_set, contextual_test_set
def load_data(json_file):
    records = ETLUtils.load_json_file(json_file)
    fields = ['user_id', 'business_id', 'stars', 'text', 'review_id']
    records = ETLUtils.select_fields(fields, records)

    # We rename the 'stars' field to 'overall_rating' to take advantage of
    # the function extractor.get_user_average_overall_rating
    for record in records:
        record['overall_rating'] = record.pop('stars')
        record['offering_id'] = record.pop('business_id')

    return records
def load_data(json_file):
    records = ETLUtils.load_json_file(json_file)
    fields = ['user_id', 'business_id', 'stars']
    records = ETLUtils.select_fields(fields, records)

    # We rename the 'stars' field to 'overall_rating' to take advantage of
    # the function extractor.get_user_average_overall_rating
    for record in records:
        record['overall_rating'] = record.pop('stars')
        record['offering_id'] = record.pop('business_id')

    return records
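# A hypothetical usage sketch for load_data; the file name below is a
# placeholder, not one of the project's dataset files.
if __name__ == '__main__':
    sample_records = load_data('yelp_academic_dataset_review.json')
    # After loading, each record exposes 'user_id', 'offering_id' and
    # 'overall_rating' instead of the raw Yelp field names.
    print(len(sample_records), sample_records[0]['overall_rating'])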
def main_converter():
    """
    Loads the train records and the records to predict from JSON, keeps only
    the rating, user and business fields, exports them to CSV and converts
    the CSV files to the LibFM input format.
    """
    csv_train_file = GENERATED_FOLDER + 'yelp_training_set_review_' + \
        DATASET + 's_shuffled_train.csv'
    csv_test_file = GENERATED_FOLDER + 'records_to_predict_' + DATASET + '.csv'

    # ETLUtils.json_to_csv(TRAIN_RECORDS_FILE, csv_train_file, 'user_id',
    #                      'business_id', 'stars', False, True)
    # ETLUtils.json_to_csv(RECORDS_TO_PREDICT_FILE, csv_test_file, 'user_id',
    #                      'business_id', 'stars', False, True)

    headers = ['stars', 'user_id', 'business_id']

    train_records = ETLUtils.load_json_file(TRAIN_RECORDS_FILE)
    records_to_predict = ETLUtils.load_json_file(RECORDS_TO_PREDICT_FILE)

    train_records = ETLUtils.select_fields(headers, train_records)
    records_to_predict = ETLUtils.select_fields(headers, records_to_predict)

    ETLUtils.save_csv_file(csv_train_file, train_records, headers)
    ETLUtils.save_csv_file(csv_test_file, records_to_predict, headers)

    csv_files = [
        csv_train_file,
        csv_test_file
    ]

    csv_to_libfm(csv_files, 0, [1, 2], [], ',', has_header=True)
def pre_process_reviews():
    """
    Returns a list of preprocessed reviews: the reviews are filtered to keep
    only the relevant data, fields that are not useful are dropped, and
    additional fields that are handy for later calculations are added.

    :return: a list of preprocessed reviews
    """
    reviews_file = '/Users/fpena/UCC/Thesis/datasets/yelp_phoenix_academic_dataset/yelp_academic_dataset_review.json'
    reviews = ETLUtils.load_json_file(reviews_file)

    select_fields = ['user_id', 'business_id', 'stars']
    reviews = ETLUtils.select_fields(select_fields, reviews)
    extract_fields(reviews)
    ETLUtils.drop_fields(['business_id', 'stars'], reviews)
    # reviews = load_json_file('/Users/fpena/tmp/filtered_reviews.json')
    reviews = clean_reviews(reviews)

    return reviews
def pre_process_reviews():
    """
    Returns a list of preprocessed reviews: the reviews are filtered to keep
    only the relevant data, fields that are not useful are dropped, and
    additional fields that are handy for later calculations are added.

    :return: a list of preprocessed reviews
    """
    data_folder = '/Users/fpena/UCC/Thesis/datasets/TripAdvisor/Four-City/'
    review_file_path = data_folder + 'review.txt'
    # review_file_path = data_folder + 'review-short.json'
    reviews = ETLUtils.load_json_file(review_file_path)

    select_fields = ['ratings', 'author', 'offering_id']
    reviews = ETLUtils.select_fields(select_fields, reviews)
    extract_fields(reviews)
    ETLUtils.drop_fields(['author', 'ratings'], reviews)
    # reviews = load_json_file('/Users/fpena/tmp/filtered_reviews.json')
    # reviews = preflib_extractor.load_csv_file('/Users/fpena/UCC/Thesis/datasets/TripAdvisor/PrefLib/trip/CD-00001-00000001-copy.dat')
    reviews = clean_reviews(reviews)

    return reviews