def filter_reviews():
    """Copy reviews of Las Vegas food businesses into ``vegas_reviews.json``.

    Loads ``food_businesses.json`` via ``load_data``, keeps the businesses
    that ``food_lib.filter_by_city`` returns for "Las Vegas", and collects
    their ``business_id`` values into a set. The review dataset at
    ``env.DATASET_PATH + 'yelp_academic_dataset_review.json'`` is then read
    one line at a time (each line is parsed as a JSON object), and every
    review whose ``business_id`` is in that set is re-serialised and written
    as one line of ``vegas_reviews.json`` in the current working directory.

    Returns:
        None. The only result is the output file.
    """
    businesses = load_data("food_businesses.json")
    vegas_businesses = food_lib.filter_by_city(businesses, "Las Vegas")
    # A set gives O(1) membership tests inside the per-review loop below.
    business_ids = set(food_lib.map_to_arg(vegas_businesses, "business_id"))

    reviews_path = env.DATASET_PATH + 'yelp_academic_dataset_review.json'
    # Both handles are owned by the ``with`` statement so the output file is
    # flushed and closed even if parsing a line raises part-way through.
    # The output is opened first, matching the original open order.
    with open('vegas_reviews.json', 'w+') as out_file, \
            open(reviews_path, 'r') as f:
        # Stream line by line instead of loading the whole dataset at once.
        for line in f:
            line_json = json.loads(line)
            if line_json['business_id'] in business_ids:
                out_file.write(json.dumps(line_json) + '\n')
def process_categories():
    """Print the sorted dates of all reviews written for pizza businesses.

    Reads the reviews from ``vegas_reviews.json`` and the businesses from
    ``food_businesses.json`` (both through ``load_data``), keeps the reviews
    whose ``business_id`` belongs to a business with ``'Pizza'`` in its
    ``categories`` field, and prints the list of their ``date`` values in
    ascending order.

    Returns:
        None. The sorted list is written to standard output.
    """
    reviews = load_data("vegas_reviews.json")

    def is_pizza_business(business):
        # Membership test on the record's 'categories' field.
        return 'Pizza' in business['categories']

    # NOTE(review): the second argument to load_data is presumably a
    # per-record filter predicate -- confirm against load_data's definition.
    pizza_businesses = load_data("food_businesses.json", is_pizza_business)
    pizza_ids = set(food_lib.map_to_arg(pizza_businesses, 'business_id'))

    review_dates = [
        review['date']
        for review in reviews
        if review['business_id'] in pizza_ids
    ]
    review_dates.sort()
    print(review_dates)
text = r['text'] date = r['date'] converted_date = datetime.datetime.strptime(date, "%Y-%m-%d") time_constraint_valid = in_time_range(date, (time_start, time_end)) location_constraint_valid = not business_ids or (r['business_id'] in business_ids) if time_constraint_valid and location_constraint_valid: for q in query: if q in text: key = (converted_date.month, converted_date.year) if key in date_counts: date_counts[key] += 1 else: date_counts[key] = 1 return date_counts if len(sys.argv) > 1: businesses = load_data("food_businesses.json") business_ids = set(food_lib.map_to_arg(food_lib.filter_by_city(businesses, "Las Vegas"), "business_id")) reviews = load_data("review_test.json") query = sys.argv[1].strip().split(',') count_occurrences(query, reviews) date_counts = count_occurrences(query, reviews, business_ids=business_ids) print(sorted(date_counts.items(), key=lambda x: x[1], reverse=True)[0:10])