# Save a map of biz ID to categories.
#
# Every file in this section is opened in a ``with`` block so that it is
# flushed and closed before the matching "Saved ..." message is printed,
# rather than relying on garbage collection to close the handle.
# NOTE(review): the "c" in mode "wbc" is a platform-specific fopen flag; it is
# kept as-is -- confirm it is still wanted.
biz_id_to_categories = {}
with open(business_in_path, "rb") as business_in_file:
    businesses = simplejson.load(business_in_file)
for business in businesses:
    # Merge whatever categories.get_yelp_roots_for_cat() yields for each of
    # the business's categories into one de-duplicated collection.
    roots = set()
    for cat in business["categories"]:
        roots.update(categories.get_yelp_roots_for_cat(cat))
    # Stored as a list because a set is not JSON-serializable.
    biz_id_to_categories[business["id"]] = list(roots)
with open(biz_to_cats_out_path, "wbc") as biz_to_cats_out_file:
    simplejson.dump(biz_id_to_categories, biz_to_cats_out_file, indent=2)
print >>sys.stderr, \
    "Saved biz to category map to %s" % biz_to_cats_out_path

# Save the map of categories to biz counts.
# Only the category lists are needed here, so iterate the values directly.
category_to_biz_count = defaultdict(int)
for cats in biz_id_to_categories.itervalues():
    if not cats:
        # Businesses with no categories are tallied under a sentinel key.
        category_to_biz_count["__uncategorized"] += 1
    else:
        for cat in cats:
            category_to_biz_count[cat] += 1
with open(cat_to_biz_count_out_path, "wbc") as cat_to_biz_count_out_file:
    simplejson.dump(category_to_biz_count, cat_to_biz_count_out_file,
                    indent=2)
print >>sys.stderr, \
    "Saved category to biz count to %s" % cat_to_biz_count_out_path

# Now build the review dump: one encode_document() result per review, one
# per line.  The output file is opened before the input is read, as before.
with open(review_dump_out_path, "wbc") as reviews_out_file:
    with open(reviews_in_path, "rb") as reviews_in_file:
        reviews = simplejson.load(reviews_in_file)
    for review in reviews:
        print >> reviews_out_file, \
            encode_document(review["biz_id"], review["text"])
print >>sys.stderr, "Saved review dump to %s" % review_dump_out_path
open(biz_to_cats_out_path, 'wbc'), indent=2) print >>sys.stderr, "Saved biz to category map to %s" % biz_to_cats_out_path # Save the map of categories to biz counts category_to_biz_count = defaultdict(int) for biz_id, cats in biz_id_to_categories.iteritems(): if not cats: category_to_biz_count['__uncategorized'] += 1 else: for cat in cats: category_to_biz_count[cat] += 1 simplejson.dump(category_to_biz_count, open(cat_to_biz_count_out_path, 'wbc'), indent=2) print >>sys.stderr, "Saved category to biz count to %s" % cat_to_biz_count_out_path # Now build the review dump with open(review_dump_out_path, 'wbc') as reviews_out_file: for review in simplejson.load(open(reviews_in_path, 'rb')): print >>reviews_out_file, \ encode_document(review['biz_id'], review['text']) print >>sys.stderr, "Saved review dump to %s" % review_dump_out_path
# Save the map of product ID to categories.
#
# Every file in this section is opened in a ``with`` block so that it is
# flushed and closed before the matching "Saved ..." message is printed,
# rather than relying on garbage collection to close the handle.
# NOTE(review): the 'c' in mode 'wbc' is a platform-specific fopen flag; it is
# kept as-is -- confirm it is still wanted.
with open(product_to_cats_out_path, 'wbc') as product_to_cats_out_file:
    simplejson.dump(product_id_to_categories, product_to_cats_out_file,
                    indent=2)
print >>sys.stderr, \
    "Saved product to category map to %s" % product_to_cats_out_path

# Save the map of categories to product counts.
# Only the category lists are needed here, so iterate the values directly.
category_to_product_count = defaultdict(int)
for cats in product_id_to_categories.itervalues():
    if not cats:
        # Products with no categories are tallied under a sentinel key.
        category_to_product_count['__uncategorized'] += 1
    else:
        for cat in cats:
            category_to_product_count[cat] += 1
with open(cat_to_doc_count_out_path, 'wbc') as cat_to_doc_count_out_file:
    simplejson.dump(category_to_product_count, cat_to_doc_count_out_file,
                    indent=2)
print >>sys.stderr, \
    "Saved category to product count to %s" % cat_to_doc_count_out_path

# Now build the review dump: one encode_document() result per CSV row, one
# per line.  csv.DictReader reads rows lazily, so the input file has to stay
# open for the whole loop.
with open(review_dump_out_path, 'wbc') as reviews_out_file:
    with open(reviews_in_path, 'rb') as reviews_in_file:
        for review in csv.DictReader(reviews_in_file):
            print >>reviews_out_file, \
                encode_document(review['product_id'], review['text'])
print >>sys.stderr, "Saved review dump to %s" % review_dump_out_path
categories.get_amazon_roots_for_cat_id(product['category_id'])
# NOTE(review): the line above looks like the tail of a statement that begins
# before this chunk; it is left untouched.

# Save the map of product ID to categories.
# Every file below is opened in a ``with`` block so that it is flushed and
# closed before the matching "Saved ..." message is printed, rather than
# relying on garbage collection to close the handle.
# NOTE(review): the 'c' in mode 'wbc' is a platform-specific fopen flag; it is
# kept as-is -- confirm it is still wanted.
with open(product_to_cats_out_path, 'wbc') as product_to_cats_out_file:
    simplejson.dump(product_id_to_categories, product_to_cats_out_file,
                    indent=2)
print >> sys.stderr, \
    "Saved product to category map to %s" % product_to_cats_out_path

# Save the map of categories to product counts.
# Only the category lists are needed here, so iterate the values directly.
category_to_product_count = defaultdict(int)
for cats in product_id_to_categories.itervalues():
    if not cats:
        # Products with no categories are tallied under a sentinel key.
        category_to_product_count['__uncategorized'] += 1
    else:
        for cat in cats:
            category_to_product_count[cat] += 1
with open(cat_to_doc_count_out_path, 'wbc') as cat_to_doc_count_out_file:
    simplejson.dump(category_to_product_count, cat_to_doc_count_out_file,
                    indent=2)
print >> sys.stderr, \
    "Saved category to product count to %s" % cat_to_doc_count_out_path

# Now build the review dump: one encode_document() result per CSV row, one
# per line.  csv.DictReader reads rows lazily, so the input file has to stay
# open for the whole loop.
with open(review_dump_out_path, 'wbc') as reviews_out_file:
    with open(reviews_in_path, 'rb') as reviews_in_file:
        for review in csv.DictReader(reviews_in_file):
            print >>reviews_out_file, \
                encode_document(review['product_id'], review['text'])
print >> sys.stderr, "Saved review dump to %s" % review_dump_out_path