# Tools
from utils import disp, data
# ML
from collections import Counter
from math import log

# NOTE: print is always called with one parenthesised argument so the output
# is identical under the Python 2 print statement and the Python 3 function.
print("> Loading data")
# Both pickles are looked up under '<root>/computed/'.
root = data.getParent(__file__)
alltoken = data.loadFile(root + '/computed/alltoken.pkl')
reviews_feature = data.loadFile(root + '/computed/reviews_feature.pkl')
n = len(reviews_feature)
print("Total reviews: %d" % n)

# TF-IDF
# TF[review][token] = count of the token in the review, divided by the
# largest token count found in that same review.
print("> Computing TF")
TF = dict()
i = 0
for review in reviews_feature:
    i += 1
    disp.tempPrint(str(i))  # progress display: index of the current review
    counts = reviews_feature[review]
    TF[review] = Counter()
    if not counts:
        # Nothing to normalise; also avoids calling max() on an empty sequence.
        continue
    # The per-review maximum does not depend on the token, so compute it once
    # here instead of once per token.
    peak = float(max(counts.values()))
    for token in counts:
        TF[review][token] = float(counts[token]) / peak

print("> Computing IDF")
IDF = dict()
""" In this script we look at the distribution of the number of reviews by category to see which one to choose for the per category sLDA. """ from __future__ import print_function import json from utils import tokenizer, disp, data from collections import Counter import numpy as np """ Files & Folders Parameters """ root = data.getParent("") def categories_info(): filepath = root + "/dataset/yelp_academic_dataset_business.json" """ Generate the count of reviews per category """ business_file = open(filepath); lines_file = business_file.readlines(); business_file.close(); business_by_category = dict(); categories_business_counts = Counter(); categories_reviews_counts = Counter(); for line_json in lines_file: business_dict = json.loads(line_json); business_id = business_dict["business_id"]; categories_list = business_dict["categories"]; for category in categories_list: