def multiple_lineal_regression(file_path):
    """
    Fits a linear regression that predicts the 'stars' value of each record
    from the remaining fields of that record, and prints the root mean
    squared error (RMSE) measured on the training data and the RMSE obtained
    with a 10-fold cross validation.

    :param file_path: the path of the file that is handed to
    ReviewETL.load_file in order to obtain the records
    """
    records = ReviewETL.load_file(file_path)
    ratings = np.array([record['stars'] for record in records])
    ETLUtils.drop_fields(['stars'], records)

    # list() turns the Python 3 dict view into a real row, so that np.array
    # builds a 2-D matrix instead of an array of view objects.
    # NOTE(review): the column order follows each record's own key order, and
    # the values are assumed to be numeric -- confirm against the loader.
    data = np.array([list(record.values()) for record in records])

    model = linear_model.LinearRegression(fit_intercept=True)

    # Error of the model when it is evaluated on the data it was trained with
    model.fit(data, ratings)
    train_errors = model.predict(data) - ratings
    rmse_train = np.sqrt(np.dot(train_errors, train_errors) / len(data))

    # NOTE(review): this call shape (number of samples plus n_folds, iterated
    # directly) looks like the legacy scikit-learn cross_validation.KFold API;
    # confirm against the KFold that this module imports.
    kf = KFold(len(data), n_folds=10)
    squared_error = 0
    for train, test in kf:
        model.fit(data[train], ratings[train])
        fold_errors = model.predict(data[test]) - ratings[test]
        squared_error += np.dot(fold_errors, fold_errors)

    # The squared errors of all the folds are accumulated before dividing by
    # the total number of samples
    rmse_10cv = np.sqrt(squared_error / len(data))

    print('RMSE on training: {}'.format(rmse_train))
    print('RMSE on 10-fold CV: {}'.format(rmse_10cv))
def drop_unwanted_fields(dictionary_list):
    """
    Hands the names of the business data set fields that are not useful for
    data analysis to ETLUtils.drop_fields, together with the given records.
    Nothing is returned.

    :param dictionary_list: the list of dictionaries containing the data
    """
    ETLUtils.drop_fields(
        [
            'attributes',
            'business_id',
            'categories',
            'city',
            'full_address',
            'latitude',
            'longitude',
            'hours',
            'name',
            'neighborhoods',
            'open',
            'review_count',
            'stars',
            'state',
            'type',
        ],
        dictionary_list
    )
def drop_unnecessary_fields(self):
    """
    Prints a timestamped progress message and then passes the text, POS
    tags, votes and bag-of-words field names to ETLUtils.drop_fields along
    with self.records.
    """
    timestamp = time.strftime("%Y/%m/%d-%H:%M:%S")
    print('%s: drop unnecessary fields' % timestamp)

    ETLUtils.drop_fields(
        [
            Constants.TEXT_FIELD,
            Constants.POS_TAGS_FIELD,
            Constants.VOTES_FIELD,
            Constants.BOW_FIELD,
        ],
        self.records
    )
def drop_unnecessary_fields(self):
    """
    Prints a timestamped progress message and then passes the text and POS
    tags field names to ETLUtils.drop_fields along with self.records.
    """
    timestamp = time.strftime("%Y/%m/%d-%H:%M:%S")
    print('%s: drop unnecessary fields' % timestamp)

    ETLUtils.drop_fields(
        [
            Constants.TEXT_FIELD,
            Constants.POS_TAGS_FIELD,
            # Constants.BOW_FIELD  (disabled: this field is currently kept)
        ],
        self.records
    )
def test_drop_fields(self):
    """
    Checks that, after ETLUtils.drop_fields is called with the five rating
    field names, the records taken from reviews_matrix_5 are equal to
    reviews_matrix_5_short.
    """
    drop_fields = [
        'cleanliness_rating',
        'location_rating',
        'rooms_rating',
        'service_rating',
        'value_rating'
    ]

    # Every record is copied, not just the outer list: the return value of
    # drop_fields is not used, so it presumably edits the records in place,
    # and a shallow list copy would let it alter the shared fixture that
    # other tests may rely on.
    test_list = [dict(record) for record in reviews_matrix_5]
    ETLUtils.drop_fields(drop_fields, test_list)
    self.assertEqual(reviews_matrix_5_short, test_list)

    # NOTE(review): this assertion compares a fresh copy of the fixture with
    # the fixture itself, so it cannot fail. It looks like a drop_fields call
    # on the already-short list was intended here -- confirm how drop_fields
    # treats absent fields before adding it.
    test_list = list(reviews_matrix_5_short)
    self.assertEqual(reviews_matrix_5_short, test_list)
def pre_process_reviews(reviews_file=None):
    """
    Returns a list of preprocessed reviews, where the reviews have been
    filtered to obtain only relevant data, have dropped any fields that are
    not useful, and also have additional fields that are handy to make
    calculations

    The steps are: load the JSON file, keep only the 'user_id', 'business_id'
    and 'stars' fields, call extract_fields on the records, drop
    'business_id' and 'stars', and finally pass the records through
    clean_reviews.

    :param reviews_file: the path of the JSON file with the reviews. When it
    is omitted, the original hard-coded location of the Yelp Phoenix academic
    dataset is used, so existing callers keep working
    :return: a list of preprocessed reviews
    """
    if reviews_file is None:
        reviews_file = '/Users/fpena/UCC/Thesis/datasets/yelp_phoenix_academic_dataset/yelp_academic_dataset_review.json'

    reviews = ETLUtils.load_json_file(reviews_file)
    select_fields = ['user_id', 'business_id', 'stars']
    reviews = ETLUtils.select_fields(select_fields, reviews)

    # extract_fields is called before the drop because 'business_id' and
    # 'stars' are still present at this point.
    # NOTE(review): presumably it derives the additional fields from them --
    # confirm against its definition.
    extract_fields(reviews)
    ETLUtils.drop_fields(['business_id', 'stars'], reviews)

    return clean_reviews(reviews)
def pre_process_reviews(review_file_path=None):
    """
    Returns a list of preprocessed reviews, where the reviews have been
    filtered to obtain only relevant data, have dropped any fields that are
    not useful, and also have additional fields that are handy to make
    calculations

    The steps are: load the JSON file, keep only the 'ratings', 'author' and
    'offering_id' fields, call extract_fields on the records, drop 'author'
    and 'ratings', and finally pass the records through clean_reviews.

    :param review_file_path: the path of the JSON file with the reviews. When
    it is omitted, the original hard-coded location of the TripAdvisor
    Four-City 'review.txt' file is used, so existing callers keep working
    :return: a list of preprocessed reviews
    """
    if review_file_path is None:
        data_folder = '/Users/fpena/UCC/Thesis/datasets/TripAdvisor/Four-City/'
        review_file_path = data_folder + 'review.txt'

    reviews = ETLUtils.load_json_file(review_file_path)
    select_fields = ['ratings', 'author', 'offering_id']
    reviews = ETLUtils.select_fields(select_fields, reviews)

    # extract_fields is called before the drop because 'author' and 'ratings'
    # are still present at this point.
    # NOTE(review): presumably it derives the additional fields from them --
    # confirm against its definition.
    extract_fields(reviews)
    ETLUtils.drop_fields(['author', 'ratings'], reviews)

    return clean_reviews(reviews)